From b1893caeb6b6079a66ba146e73d68d6255d255ce Mon Sep 17 00:00:00 2001 From: Lakshmi-Surekha Date: Fri, 10 Jan 2025 07:21:10 +0530 Subject: [PATCH 001/408] [lldb][AIX] Added support for AIX in HostInfo section (#122301) This PR is in reference to porting LLDB on AIX. Link to discussions on llvm discourse and github: 1. https://discourse.llvm.org/t/port-lldb-to-ibm-aix/80640 https://github.com/llvm/llvm-project/issues/101657 2. The complete changes for porting are present in this draft PR: https://github.com/llvm/llvm-project/pull/102601 Added support for AIX in HostInfo section Review Request : @DavidSpickett @labath @DhruvSrivastavaX --- lldb/include/lldb/Host/HostInfo.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lldb/include/lldb/Host/HostInfo.h b/lldb/include/lldb/Host/HostInfo.h index b7010d69d88e7..0f7ec0e0aa0d2 100644 --- a/lldb/include/lldb/Host/HostInfo.h +++ b/lldb/include/lldb/Host/HostInfo.h @@ -55,6 +55,9 @@ #elif defined(__APPLE__) #include "lldb/Host/macosx/HostInfoMacOSX.h" #define HOST_INFO_TYPE HostInfoMacOSX +#elif defined(_AIX) +#include "lldb/Host/aix/HostInfoAIX.h" +#define HOST_INFO_TYPE HostInfoAIX #else #include "lldb/Host/posix/HostInfoPosix.h" #define HOST_INFO_TYPE HostInfoPosix From b11fe33aea82444387422e550e10f1fba5bcfaa3 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 9 Jan 2025 18:05:22 -0800 Subject: [PATCH 002/408] [RISCV] Correct the cost model for the i1 reduce.add and reduce.or. (#122349) reduce.add uses the same sequence as reduce.xor. reduce.or should use vmor not vmxor. --- llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 850d6244affa5..66be30c4b42ba 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1548,7 +1548,7 @@ RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) + getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy, CmpInst::ICMP_EQ, CostKind); - } else if (ISD == ISD::XOR) { + } else if (ISD == ISD::XOR || ISD == ISD::ADD) { // Example sequences: // vsetvli a0, zero, e8, mf8, ta, ma // vmxor.mm v8, v0, v8 ; needed every time type is split @@ -1558,13 +1558,14 @@ RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind) + getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) + 1; } else { + assert(ISD == ISD::OR); // Example sequences: // vsetvli a0, zero, e8, mf8, ta, ma - // vmxor.mm v8, v9, v8 ; needed every time type is split + // vmor.mm v8, v9, v8 ; needed every time type is split // vcpop.m a0, v0 // snez a0, a0 return (LT.first - 1) * - getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind) + + getRISCVInstructionCost(RISCV::VMOR_MM, LT.second, CostKind) + getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) + getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy, CmpInst::ICMP_NE, CostKind); From 41e4018f9c858af15c4fe0ea0d1de8ff4602071e Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 9 Jan 2025 18:05:41 -0800 Subject: [PATCH 003/408] [RISCV][VLOPT] Simplify code by removing extra temporary variables. NFC (#122333) Just do the conditional operator in the return statement. 
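As an illustrative sketch (not part of the patch), this is the pattern removed
throughout `getOperandLog2EEW`; the variable names follow the patch, but the
free-function wrappers are hypothetical, added only to make the before/after
self-contained:

```cpp
// Minimal sketch of the NFC change, assuming the inputs of the real code.
static unsigned log2EEWBefore(bool TwoTimes, unsigned MILog2SEW) {
  unsigned Log2EEW = TwoTimes ? MILog2SEW + 1 : MILog2SEW; // extra temporary
  return Log2EEW;
}

static unsigned log2EEWAfter(bool TwoTimes, unsigned MILog2SEW) {
  return TwoTimes ? MILog2SEW + 1 : MILog2SEW; // conditional in the return
}
```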
--- llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index 9338e0a1c8741..69ee210071286 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -550,10 +550,8 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) { case RISCV::VFWCVT_RTZ_X_F_V: case RISCV::VFWCVT_F_XU_V: case RISCV::VFWCVT_F_X_V: - case RISCV::VFWCVT_F_F_V: { - unsigned Log2EEW = IsMODef ? MILog2SEW + 1 : MILog2SEW; - return Log2EEW; - } + case RISCV::VFWCVT_F_F_V: + return IsMODef ? MILog2SEW + 1 : MILog2SEW; // Def and Op1 uses EEW=2*SEW. Op2 uses EEW=SEW. case RISCV::VWADDU_WV: @@ -571,8 +569,7 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) { case RISCV::VFWSUB_WV: { bool IsOp1 = HasPassthru ? MO.getOperandNo() == 2 : MO.getOperandNo() == 1; bool TwoTimes = IsMODef || IsOp1; - unsigned Log2EEW = TwoTimes ? MILog2SEW + 1 : MILog2SEW; - return Log2EEW; + return TwoTimes ? MILog2SEW + 1 : MILog2SEW; } // Vector Integer Extension @@ -613,8 +610,7 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) { case RISCV::VFNCVT_ROD_F_F_W: { bool IsOp1 = HasPassthru ? MO.getOperandNo() == 2 : MO.getOperandNo() == 1; bool TwoTimes = IsOp1; - unsigned Log2EEW = TwoTimes ? MILog2SEW + 1 : MILog2SEW; - return Log2EEW; + return TwoTimes ? MILog2SEW + 1 : MILog2SEW; } // Vector Mask Instructions @@ -728,8 +724,7 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) { case RISCV::VFWREDOSUM_VS: case RISCV::VFWREDUSUM_VS: { bool TwoTimes = IsMODef || MO.getOperandNo() == 3; - unsigned Log2EEW = TwoTimes ? MILog2SEW + 1 : MILog2SEW; - return Log2EEW; + return TwoTimes ? MILog2SEW + 1 : MILog2SEW; } default: From 369c61744a435c52e3564398d9972fa556db022b Mon Sep 17 00:00:00 2001 From: Shao-Ce SUN Date: Fri, 10 Jan 2025 10:10:42 +0800 Subject: [PATCH 004/408] [RISCV] Fix the cost of `llvm.vector.reduce.and` (#119160) I added some CodeGen test cases related to reduce. To maintain consistency, I also added cases for instructions like `vector.reduce.or`. For cases where `v1i1` type generates `VFIRST`, please refer to: https://reviews.llvm.org/D139512. 
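A hedged sketch of the resulting cost computation (the standalone helper and
its parameter names are illustrative, not the actual TTI interface): after
legalization splits an i1 mask into `LT.first` parts, the `reduce.and` cost is
`(LT.first - 2)` vmand.mm plus one vmnand/vmnot, one vcpop.m, and one
seqz-style compare, while the `v1i1` case is just `VFIRST` plus the compare.

```cpp
// Illustrative model of the new i1 reduce.and cost (all names assumed).
// Worked example with +zvl128b and <1024 x i1>: LT.first == 8, so the
// cost is (8 - 2) * 1 + 1 + 1 + 1 == 9, matching the CHECK lines in the
// new reduce-and-i1.ll test below.
unsigned i1ReduceAndCost(unsigned LTFirst, unsigned VMAndCost,
                         unsigned VMNandCost, unsigned VCPopCost,
                         unsigned CmpCost) {
  unsigned SplitCombines = LTFirst > 2 ? LTFirst - 2 : 0;
  return SplitCombines * VMAndCost + VMNandCost + VCPopCost + CmpCost;
}
```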
--- .../Target/RISCV/RISCVTargetTransformInfo.cpp | 24 +- .../Analysis/CostModel/RISCV/reduce-and-i1.ll | 239 ++++++++++++++++++ .../Analysis/CostModel/RISCV/reduce-and.ll | 32 +-- .../Analysis/CostModel/RISCV/reduce-max.ll | 4 +- .../Analysis/CostModel/RISCV/reduce-min.ll | 4 +- 5 files changed, 279 insertions(+), 24 deletions(-) create mode 100644 llvm/test/Analysis/CostModel/RISCV/reduce-and-i1.ll diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 66be30c4b42ba..a8f04f038f810 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1536,15 +1536,31 @@ RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::pair LT = getTypeLegalizationCost(Ty); Type *ElementTy = Ty->getElementType(); if (ElementTy->isIntegerTy(1)) { + // Example sequences: + // vfirst.m a0, v0 + // seqz a0, a0 + if (LT.second == MVT::v1i1) + return getRISCVInstructionCost(RISCV::VFIRST_M, LT.second, CostKind) + + getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy, + CmpInst::ICMP_EQ, CostKind); + if (ISD == ISD::AND) { // Example sequences: - // vsetvli a0, zero, e8, mf8, ta, ma // vmand.mm v8, v9, v8 ; needed every time type is split - // vmnot.m v8, v0 + // vmnot.m v8, v0 ; alias for vmnand // vcpop.m a0, v8 // seqz a0, a0 - return LT.first * getRISCVInstructionCost(RISCV::VMNAND_MM, LT.second, - CostKind) + + + // See the discussion: https://github.com/llvm/llvm-project/pull/119160 + // For LMUL <= 8, there is no splitting, + // the sequences are vmnot, vcpop and seqz. + // When LMUL > 8 and split = 1, + // the sequences are vmnand, vcpop and seqz. + // When LMUL > 8 and split > 1, + // the sequences are (LT.first-2) * vmand, vmnand, vcpop and seqz. + return ((LT.first > 2) ? 
(LT.first - 2) : 0) * + getRISCVInstructionCost(RISCV::VMAND_MM, LT.second, CostKind) + + getRISCVInstructionCost(RISCV::VMNAND_MM, LT.second, CostKind) + getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) + getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy, CmpInst::ICMP_EQ, CostKind); diff --git a/llvm/test/Analysis/CostModel/RISCV/reduce-and-i1.ll b/llvm/test/Analysis/CostModel/RISCV/reduce-and-i1.ll new file mode 100644 index 0000000000000..cc88d907187d1 --- /dev/null +++ b/llvm/test/Analysis/CostModel/RISCV/reduce-and-i1.ll @@ -0,0 +1,239 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mtriple=riscv32 -mattr=+v,+zvl128b -passes="print" -cost-kind=throughput 2>&1 -disable-output \ +; RUN: | FileCheck %s --check-prefixes=THROUGHPUT,THROUGHPUT-VL128B +; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zvl128b -passes="print" -cost-kind=throughput 2>&1 -disable-output \ +; RUN: | FileCheck %s --check-prefixes=THROUGHPUT,THROUGHPUT-VL128B +; RUN: opt < %s -mtriple=riscv32 -mattr=+v,+zvl256b -passes="print" -cost-kind=throughput 2>&1 -disable-output \ +; RUN: | FileCheck %s --check-prefixes=THROUGHPUT,THROUGHPUT-VL256B +; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zvl256b -passes="print" -cost-kind=throughput 2>&1 -disable-output \ +; RUN: | FileCheck %s --check-prefixes=THROUGHPUT,THROUGHPUT-VL256B +; RUN: opt < %s -mtriple=riscv32 -mattr=+v,+zvl512b -passes="print" -cost-kind=throughput 2>&1 -disable-output \ +; RUN: | FileCheck %s --check-prefixes=THROUGHPUT,THROUGHPUT-VL512B +; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zvl512b -passes="print" -cost-kind=throughput 2>&1 -disable-output \ +; RUN: | FileCheck %s --check-prefixes=THROUGHPUT,THROUGHPUT-VL512B +; RUN: opt < %s -mtriple=riscv32 -mattr=+v,+zvl1024b -passes="print" -cost-kind=throughput 2>&1 -disable-output \ +; RUN: | FileCheck %s --check-prefixes=THROUGHPUT,THROUGHPUT-VL1024B +; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zvl1024b -passes="print" -cost-kind=throughput 2>&1 -disable-output \ +; RUN: | FileCheck %s --check-prefixes=THROUGHPUT,THROUGHPUT-VL1024B + +define zeroext i1 @vreduce_and_v1i1(<1 x i1> %v) { +; THROUGHPUT-LABEL: 'vreduce_and_v1i1' +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %red = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %v) +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i1 %red +; + %red = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %v) + ret i1 %red +} + +define zeroext i1 @vreduce_and_v2i1(<2 x i1> %v) { +; THROUGHPUT-LABEL: 'vreduce_and_v2i1' +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %red = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %v) +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i1 %red +; + %red = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %v) + ret i1 %red +} + +define zeroext i1 @vreduce_and_v4i1(<4 x i1> %v) { +; THROUGHPUT-LABEL: 'vreduce_and_v4i1' +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %red = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %v) +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i1 %red +; + %red = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %v) + ret i1 %red +} + +define zeroext i1 @vreduce_and_v8i1(<8 x i1> %v) { +; THROUGHPUT-LABEL: 'vreduce_and_v8i1' +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %red = call i1 
@llvm.vector.reduce.and.v8i1(<8 x i1> %v) +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i1 %red +; + %red = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %v) + ret i1 %red +} + +define zeroext i1 @vreduce_and_v16i1(<16 x i1> %v) { +; THROUGHPUT-LABEL: 'vreduce_and_v16i1' +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %red = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %v) +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i1 %red +; + %red = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %v) + ret i1 %red +} + +define zeroext i1 @vreduce_and_v32i1(<32 x i1> %v) { +; THROUGHPUT-LABEL: 'vreduce_and_v32i1' +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %red = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> %v) +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i1 %red +; + %red = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> %v) + ret i1 %red +} + +define zeroext i1 @vreduce_and_v64i1(<64 x i1> %v) { +; THROUGHPUT-LABEL: 'vreduce_and_v64i1' +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %red = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> %v) +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i1 %red +; + %red = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> %v) + ret i1 %red +} + +define zeroext i1 @vreduce_and_v128i1(<128 x i1> %v) { +; THROUGHPUT-LABEL: 'vreduce_and_v128i1' +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %red = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> %v) +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i1 %red +; + %red = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> %v) + ret i1 %red +} + +define zeroext i1 @vreduce_and_v256i1(<256 x i1> %v) { +; THROUGHPUT-LABEL: 'vreduce_and_v256i1' +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %red = call i1 @llvm.vector.reduce.and.v256i1(<256 x i1> %v) +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i1 %red +; + %red = call i1 @llvm.vector.reduce.and.v256i1(<256 x i1> %v) + ret i1 %red +} + +define zeroext i1 @vreduce_and_v512i1(<512 x i1> %v) { +; THROUGHPUT-VL128B-LABEL: 'vreduce_and_v512i1' +; THROUGHPUT-VL128B-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %red = call i1 @llvm.vector.reduce.and.v512i1(<512 x i1> %v) +; THROUGHPUT-VL128B-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i1 %red +; +; THROUGHPUT-VL256B-LABEL: 'vreduce_and_v512i1' +; THROUGHPUT-VL256B-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %red = call i1 @llvm.vector.reduce.and.v512i1(<512 x i1> %v) +; THROUGHPUT-VL256B-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i1 %red +; +; THROUGHPUT-VL512B-LABEL: 'vreduce_and_v512i1' +; THROUGHPUT-VL512B-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %red = call i1 @llvm.vector.reduce.and.v512i1(<512 x i1> %v) +; THROUGHPUT-VL512B-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i1 %red +; +; THROUGHPUT-VL1024B-LABEL: 'vreduce_and_v512i1' +; THROUGHPUT-VL1024B-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %red = call i1 @llvm.vector.reduce.and.v512i1(<512 x i1> %v) +; THROUGHPUT-VL1024B-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i1 %red +; + %red = call i1 @llvm.vector.reduce.and.v512i1(<512 x i1> %v) 
+ ret i1 %red +} + +define zeroext i1 @vreduce_and_v1024i1(<1024 x i1> %v) { +; THROUGHPUT-VL128B-LABEL: 'vreduce_and_v1024i1' +; THROUGHPUT-VL128B-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %red = call i1 @llvm.vector.reduce.and.v1024i1(<1024 x i1> %v) +; THROUGHPUT-VL128B-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i1 %red +; +; THROUGHPUT-VL256B-LABEL: 'vreduce_and_v1024i1' +; THROUGHPUT-VL256B-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %red = call i1 @llvm.vector.reduce.and.v1024i1(<1024 x i1> %v) +; THROUGHPUT-VL256B-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i1 %red +; +; THROUGHPUT-VL512B-LABEL: 'vreduce_and_v1024i1' +; THROUGHPUT-VL512B-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %red = call i1 @llvm.vector.reduce.and.v1024i1(<1024 x i1> %v) +; THROUGHPUT-VL512B-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i1 %red +; +; THROUGHPUT-VL1024B-LABEL: 'vreduce_and_v1024i1' +; THROUGHPUT-VL1024B-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %red = call i1 @llvm.vector.reduce.and.v1024i1(<1024 x i1> %v) +; THROUGHPUT-VL1024B-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i1 %red +; + %red = call i1 @llvm.vector.reduce.and.v1024i1(<1024 x i1> %v) + ret i1 %red +} + +define zeroext i1 @vreduce_and_nxv1i1( %v) { +; THROUGHPUT-LABEL: 'vreduce_and_nxv1i1' +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %red = call i1 @llvm.vector.reduce.and.nxv1i1( %v) +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i1 %red +; + %red = call i1 @llvm.vector.reduce.and.nxv1i1( %v) + ret i1 %red +} + +define zeroext i1 @vreduce_and_nxv2i1( %v) { +; THROUGHPUT-LABEL: 'vreduce_and_nxv2i1' +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %red = call i1 @llvm.vector.reduce.and.nxv2i1( %v) +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i1 %red +; + %red = call i1 @llvm.vector.reduce.and.nxv2i1( %v) + ret i1 %red +} + +define zeroext i1 @vreduce_and_nxv4i1( %v) { +; THROUGHPUT-LABEL: 'vreduce_and_nxv4i1' +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %red = call i1 @llvm.vector.reduce.and.nxv4i1( %v) +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i1 %red +; + %red = call i1 @llvm.vector.reduce.and.nxv4i1( %v) + ret i1 %red +} + +define zeroext i1 @vreduce_and_nxv8i1( %v) { +; THROUGHPUT-LABEL: 'vreduce_and_nxv8i1' +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %red = call i1 @llvm.vector.reduce.and.nxv8i1( %v) +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i1 %red +; + %red = call i1 @llvm.vector.reduce.and.nxv8i1( %v) + ret i1 %red +} + +define zeroext i1 @vreduce_and_nxv16i1( %v) { +; THROUGHPUT-LABEL: 'vreduce_and_nxv16i1' +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %red = call i1 @llvm.vector.reduce.and.nxv16i1( %v) +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i1 %red +; + %red = call i1 @llvm.vector.reduce.and.nxv16i1( %v) + ret i1 %red +} + +define zeroext i1 @vreduce_and_nxv32i1( %v) { +; THROUGHPUT-LABEL: 'vreduce_and_nxv32i1' +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %red = call i1 @llvm.vector.reduce.and.nxv32i1( %v) +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for 
instruction: ret i1 %red +; + %red = call i1 @llvm.vector.reduce.and.nxv32i1( %v) + ret i1 %red +} + +define zeroext i1 @vreduce_and_nxv64i1( %v) { +; THROUGHPUT-LABEL: 'vreduce_and_nxv64i1' +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %red = call i1 @llvm.vector.reduce.and.nxv64i1( %v) +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i1 %red +; + %red = call i1 @llvm.vector.reduce.and.nxv64i1( %v) + ret i1 %red +} + +define zeroext i1 @vreduce_and_nxv128i1( %v) { +; THROUGHPUT-LABEL: 'vreduce_and_nxv128i1' +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %red = call i1 @llvm.vector.reduce.and.nxv128i1( %v) +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i1 %red +; + %red = call i1 @llvm.vector.reduce.and.nxv128i1( %v) + ret i1 %red +} + +define zeroext i1 @vreduce_and_nxv256i1( %v) { +; THROUGHPUT-LABEL: 'vreduce_and_nxv256i1' +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %red = call i1 @llvm.vector.reduce.and.nxv256i1( %v) +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i1 %red +; + %red = call i1 @llvm.vector.reduce.and.nxv256i1( %v) + ret i1 %red +} + +define zeroext i1 @vreduce_and_nxv512i1( %v) { +; THROUGHPUT-LABEL: 'vreduce_and_nxv512i1' +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %red = call i1 @llvm.vector.reduce.and.nxv512i1( %v) +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i1 %red +; + %red = call i1 @llvm.vector.reduce.and.nxv512i1( %v) + ret i1 %red +} + +define zeroext i1 @vreduce_and_nxv1024i1( %v) { +; THROUGHPUT-LABEL: 'vreduce_and_nxv1024i1' +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %red = call i1 @llvm.vector.reduce.and.nxv1024i1( %v) +; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i1 %red +; + %red = call i1 @llvm.vector.reduce.and.nxv1024i1( %v) + ret i1 %red +} diff --git a/llvm/test/Analysis/CostModel/RISCV/reduce-and.ll b/llvm/test/Analysis/CostModel/RISCV/reduce-and.ll index 463232f082f40..dc6a582df133b 100644 --- a/llvm/test/Analysis/CostModel/RISCV/reduce-and.ll +++ b/llvm/test/Analysis/CostModel/RISCV/reduce-and.ll @@ -6,7 +6,7 @@ define i32 @reduce_i1(i32 %arg) { ; CHECK-LABEL: 'reduce_i1' -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) @@ -14,9 +14,9 @@ define i32 @reduce_i1(i32 %arg) { ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated 
cost of 4 for instruction: %V256 = call i1 @llvm.vector.reduce.and.v256i1(<256 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512 = call i1 @llvm.vector.reduce.and.v512i1(<512 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V1024 = call i1 @llvm.vector.reduce.and.v1024i1(<1024 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V256 = call i1 @llvm.vector.reduce.and.v256i1(<256 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V512 = call i1 @llvm.vector.reduce.and.v512i1(<512 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V1024 = call i1 @llvm.vector.reduce.and.v1024i1(<1024 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV1 = call i1 @llvm.vector.reduce.and.nxv1i1( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV2 = call i1 @llvm.vector.reduce.and.nxv2i1( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV4 = call i1 @llvm.vector.reduce.and.nxv4i1( undef) @@ -24,14 +24,14 @@ define i32 @reduce_i1(i32 %arg) { ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV16 = call i1 @llvm.vector.reduce.and.nxv16i1( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV32 = call i1 @llvm.vector.reduce.and.nxv32i1( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV64 = call i1 @llvm.vector.reduce.and.nxv64i1( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV128 = call i1 @llvm.vector.reduce.and.nxv128i1( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %NXV256 = call i1 @llvm.vector.reduce.and.nxv256i1( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NXV512 = call i1 @llvm.vector.reduce.and.nxv512i1( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %NXV1024 = call i1 @llvm.vector.reduce.and.nxv1024i1( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV128 = call i1 @llvm.vector.reduce.and.nxv128i1( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV256 = call i1 @llvm.vector.reduce.and.nxv256i1( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %NXV512 = call i1 @llvm.vector.reduce.and.nxv512i1( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %NXV1024 = call i1 @llvm.vector.reduce.and.nxv1024i1( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SIZE-LABEL: 'reduce_i1' -; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) @@ -39,9 +39,9 @@ define i32 @reduce_i1(i32 %arg) { ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) -; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = call i1 @llvm.vector.reduce.and.v256i1(<256 x i1> undef) -; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512 = call i1 @llvm.vector.reduce.and.v512i1(<512 x i1> undef) -; SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V1024 = call i1 @llvm.vector.reduce.and.v1024i1(<1024 x i1> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V256 = call i1 @llvm.vector.reduce.and.v256i1(<256 x i1> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V512 = call i1 @llvm.vector.reduce.and.v512i1(<512 x i1> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V1024 = call i1 @llvm.vector.reduce.and.v1024i1(<1024 x i1> undef) ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV1 = call i1 @llvm.vector.reduce.and.nxv1i1( undef) ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV2 = call i1 @llvm.vector.reduce.and.nxv2i1( undef) ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV4 = call i1 @llvm.vector.reduce.and.nxv4i1( undef) @@ -49,10 +49,10 @@ define i32 @reduce_i1(i32 %arg) { ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV16 = call i1 @llvm.vector.reduce.and.nxv16i1( undef) ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV32 = call i1 @llvm.vector.reduce.and.nxv32i1( undef) ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV64 = call i1 @llvm.vector.reduce.and.nxv64i1( undef) -; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV128 = call i1 @llvm.vector.reduce.and.nxv128i1( undef) -; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %NXV256 = call i1 @llvm.vector.reduce.and.nxv256i1( undef) -; SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NXV512 = call i1 @llvm.vector.reduce.and.nxv512i1( undef) -; SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %NXV1024 = call i1 @llvm.vector.reduce.and.nxv1024i1( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV128 = call i1 @llvm.vector.reduce.and.nxv128i1( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV256 = call i1 @llvm.vector.reduce.and.nxv256i1( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %NXV512 = call i1 @llvm.vector.reduce.and.nxv512i1( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %NXV1024 = call i1 @llvm.vector.reduce.and.nxv1024i1( undef) ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) diff --git a/llvm/test/Analysis/CostModel/RISCV/reduce-max.ll b/llvm/test/Analysis/CostModel/RISCV/reduce-max.ll index f11e9f2b5ae83..5c9303af31747 100644 --- a/llvm/test/Analysis/CostModel/RISCV/reduce-max.ll +++ b/llvm/test/Analysis/CostModel/RISCV/reduce-max.ll @@ -176,7 +176,7 @@ define i32 @reduce_umax_i64(i32 %arg) { define i32 @reduce_smin_i1(i32 %arg) { 
; CHECK-LABEL: 'reduce_smin_i1' -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i1 @llvm.vector.reduce.smax.v1i1(<1 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i1 @llvm.vector.reduce.smax.v1i1(<1 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.smax.v2i1(<2 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i1 @llvm.vector.reduce.smax.v4i1(<4 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8 = call i1 @llvm.vector.reduce.smax.v8i1(<8 x i1> undef) @@ -187,7 +187,7 @@ define i32 @reduce_smin_i1(i32 %arg) { ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SIZE-LABEL: 'reduce_smin_i1' -; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i1 @llvm.vector.reduce.smax.v1i1(<1 x i1> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i1 @llvm.vector.reduce.smax.v1i1(<1 x i1> undef) ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.smax.v2i1(<2 x i1> undef) ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i1 @llvm.vector.reduce.smax.v4i1(<4 x i1> undef) ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8 = call i1 @llvm.vector.reduce.smax.v8i1(<8 x i1> undef) diff --git a/llvm/test/Analysis/CostModel/RISCV/reduce-min.ll b/llvm/test/Analysis/CostModel/RISCV/reduce-min.ll index 457fdbe46f73b..9875d3e585811 100644 --- a/llvm/test/Analysis/CostModel/RISCV/reduce-min.ll +++ b/llvm/test/Analysis/CostModel/RISCV/reduce-min.ll @@ -6,7 +6,7 @@ define i32 @reduce_umin_i1(i32 %arg) { ; CHECK-LABEL: 'reduce_umin_i1' -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i1 @llvm.vector.reduce.umin.v1i1(<1 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i1 @llvm.vector.reduce.umin.v1i1(<1 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.umin.v2i1(<2 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i1 @llvm.vector.reduce.umin.v4i1(<4 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8 = call i1 @llvm.vector.reduce.umin.v8i1(<8 x i1> undef) @@ -17,7 +17,7 @@ define i32 @reduce_umin_i1(i32 %arg) { ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SIZE-LABEL: 'reduce_umin_i1' -; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i1 @llvm.vector.reduce.umin.v1i1(<1 x i1> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i1 @llvm.vector.reduce.umin.v1i1(<1 x i1> undef) ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.umin.v2i1(<2 x i1> undef) ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i1 @llvm.vector.reduce.umin.v4i1(<4 x i1> undef) ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8 = call i1 @llvm.vector.reduce.umin.v8i1(<8 x i1> undef) From 5454ac28b302240ccfd164dfd1e2bce526aee7ca Mon Sep 17 00:00:00 2001 From: Han-Kuan Chen Date: Thu, 9 Jan 2025 18:41:47 -0800 Subject: [PATCH 005/408] Revert "[SLP] NFC. 
Replace MainOp and AltOp in TreeEntry with InstructionsState. (#120198)" This reverts commit 760f550de25792db83cd39c88ef57ab6d80a41a0. --- .../Transforms/Vectorize/SLPVectorizer.cpp | 208 +++++++++--------- 1 file changed, 102 insertions(+), 106 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 8ff70fdb1180b..fd897b3f720be 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2414,16 +2414,15 @@ class BoUpSLP { } /// Go through the instructions in VL and append their operands. - void appendOperandsOfVL(ArrayRef VL, const InstructionsState &S) { + void appendOperandsOfVL(ArrayRef VL, Instruction *VL0) { assert(!VL.empty() && "Bad VL"); assert((empty() || VL.size() == getNumLanes()) && "Expected same number of lanes"); // IntrinsicInst::isCommutative returns true if swapping the first "two" // arguments to the intrinsic produces the same result. constexpr unsigned IntrinsicNumOperands = 2; - unsigned NumOperands = S.getMainOp()->getNumOperands(); - ArgSize = isa(S.getMainOp()) ? IntrinsicNumOperands - : NumOperands; + unsigned NumOperands = VL0->getNumOperands(); + ArgSize = isa(VL0) ? IntrinsicNumOperands : NumOperands; OpsVec.resize(NumOperands); unsigned NumLanes = VL.size(); for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { @@ -2443,8 +2442,8 @@ class BoUpSLP { // tell the inverse operations by checking commutativity. if (isa(VL[Lane])) { OpsVec[OpIdx][Lane] = { - PoisonValue::get(S.getMainOp()->getOperand(OpIdx)->getType()), - true, false}; + PoisonValue::get(VL0->getOperand(OpIdx)->getType()), true, + false}; continue; } bool IsInverseOperation = !isCommutative(cast(VL[Lane])); @@ -2556,12 +2555,11 @@ class BoUpSLP { public: /// Initialize with all the operands of the instruction vector \p RootVL. - VLOperands(ArrayRef RootVL, const InstructionsState &S, - const BoUpSLP &R) + VLOperands(ArrayRef RootVL, Instruction *VL0, const BoUpSLP &R) : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R), - L(R.LI->getLoopFor(S.getMainOp()->getParent())) { + L(R.LI->getLoopFor((VL0->getParent()))) { // Append all the operands of RootVL. - appendOperandsOfVL(RootVL, S); + appendOperandsOfVL(RootVL, VL0); } /// \Returns a value vector with the operands across all lanes for the @@ -3034,7 +3032,7 @@ class BoUpSLP { /// non-identity permutation that allows to reuse extract instructions. /// \param ResizeAllowed indicates whether it is allowed to handle subvector /// extract order. - bool canReuseExtract(ArrayRef VL, + bool canReuseExtract(ArrayRef VL, Value *OpValue, SmallVectorImpl &CurrentOrder, bool ResizeAllowed = false) const; @@ -3261,7 +3259,7 @@ class BoUpSLP { }; /// Checks if the current node is a gather node. - bool isGather() const { return State == NeedToGather; } + bool isGather() const {return State == NeedToGather; } /// A vector of scalars. ValueList Scalars; @@ -3325,9 +3323,9 @@ class BoUpSLP { /// reordering of operands during buildTree_rec() and vectorizeTree(). SmallVector Operands; - /// MainOp and AltOp are recorded inside. S should be obtained from - /// newTreeEntry. - InstructionsState S = InstructionsState::invalid(); + /// The main/alternate instruction. + Instruction *MainOp = nullptr; + Instruction *AltOp = nullptr; /// Interleaving factor for interleaved loads Vectorize nodes. unsigned InterleaveFactor = 0; @@ -3351,10 +3349,10 @@ class BoUpSLP { /// Set this bundle's operand from Scalars. 
void setOperand(const BoUpSLP &R, bool RequireReorder = false) { - VLOperands Ops(Scalars, S, R); + VLOperands Ops(Scalars, MainOp, R); if (RequireReorder) Ops.reorder(); - for (unsigned I : seq(S.getMainOp()->getNumOperands())) + for (unsigned I : seq(MainOp->getNumOperands())) setOperand(I, Ops.getVL(I)); } @@ -3387,9 +3385,13 @@ class BoUpSLP { } /// Some of the instructions in the list have alternate opcodes. - bool isAltShuffle() const { return S.isAltShuffle(); } + bool isAltShuffle() const { return MainOp != AltOp; } - bool isOpcodeOrAlt(Instruction *I) const { return S.isOpcodeOrAlt(I); } + bool isOpcodeOrAlt(Instruction *I) const { + unsigned CheckedOpcode = I->getOpcode(); + return (getOpcode() == CheckedOpcode || + getAltOpcode() == CheckedOpcode); + } /// Chooses the correct key for scheduling data. If \p Op has the same (or /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is @@ -3398,24 +3400,31 @@ class BoUpSLP { auto *I = dyn_cast(Op); if (I && isOpcodeOrAlt(I)) return Op; - return S.getMainOp(); + return MainOp; } void setOperations(const InstructionsState &S) { assert(S && "InstructionsState is invalid."); - this->S = S; + MainOp = S.getMainOp(); + AltOp = S.getAltOp(); } - Instruction *getMainOp() const { return S.getMainOp(); } + Instruction *getMainOp() const { + return MainOp; + } - Instruction *getAltOp() const { return S.getAltOp(); } + Instruction *getAltOp() const { + return AltOp; + } /// The main/alternate opcodes for the list of instructions. - unsigned getOpcode() const { return S.getOpcode(); } - - unsigned getAltOpcode() const { return S.getAltOpcode(); } + unsigned getOpcode() const { + return MainOp ? MainOp->getOpcode() : 0; + } - bool hasState() const { return S.valid(); } + unsigned getAltOpcode() const { + return AltOp ? AltOp->getOpcode() : 0; + } /// When ReuseReorderShuffleIndices is empty it just returns position of \p /// V within vector of Scalars. Otherwise, try to remap on its reuse index. @@ -3511,13 +3520,16 @@ class BoUpSLP { dbgs() << "CombinedVectorize\n"; break; } - if (S) { - dbgs() << "MainOp: " << *S.getMainOp() << "\n"; - dbgs() << "AltOp: " << *S.getAltOp() << "\n"; - } else { - dbgs() << "MainOp: NULL\n"; - dbgs() << "AltOp: NULL\n"; - } + dbgs() << "MainOp: "; + if (MainOp) + dbgs() << *MainOp << "\n"; + else + dbgs() << "NULL\n"; + dbgs() << "AltOp: "; + if (AltOp) + dbgs() << *AltOp << "\n"; + else + dbgs() << "NULL\n"; dbgs() << "VectorizedValue: "; if (VectorizedValue) dbgs() << *VectorizedValue << "\n"; @@ -3692,13 +3704,9 @@ class BoUpSLP { } #endif - TreeEntry *getTreeEntry(Value *V) { - assert(V && "V cannot be nullptr."); - return ScalarToTreeEntry.lookup(V); - } + TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); } const TreeEntry *getTreeEntry(Value *V) const { - assert(V && "V cannot be nullptr."); return ScalarToTreeEntry.lookup(V); } @@ -5579,7 +5587,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { // Try build correct order for extractelement instructions. SmallVector ReusedMask(TE.ReuseShuffleIndices.begin(), TE.ReuseShuffleIndices.end()); - if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement && + if (TE.getOpcode() == Instruction::ExtractElement && all_of(TE.Scalars, [Sz](Value *V) { if (isa(V)) return true; @@ -5741,11 +5749,10 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { return std::nullopt; // No need to reorder. 
return std::move(Phis); } - if (TE.isGather() && (!TE.hasState() || !TE.isAltShuffle()) && - allSameType(TE.Scalars)) { + if (TE.isGather() && !TE.isAltShuffle() && allSameType(TE.Scalars)) { // TODO: add analysis of other gather nodes with extractelement // instructions and other values/instructions, not only undefs. - if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) || + if ((TE.getOpcode() == Instruction::ExtractElement || (all_of(TE.Scalars, IsaPred) && any_of(TE.Scalars, IsaPred))) && all_of(TE.Scalars, [](Value *V) { @@ -5755,8 +5762,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { // Check that gather of extractelements can be represented as // just a shuffle of a single vector. OrdersType CurrentOrder; - bool Reuse = - canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true); + bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder, + /*ResizeAllowed=*/true); if (Reuse || !CurrentOrder.empty()) return std::move(CurrentOrder); } @@ -5805,7 +5812,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { return Order; // Check if can include the order of vectorized loads. For masked gathers do // extra analysis later, so include such nodes into a special list. - if (TE.hasState() && TE.getOpcode() == Instruction::Load) { + if (TE.isGather() && TE.getOpcode() == Instruction::Load) { SmallVector PointerOps; OrdersType CurrentOrder; LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(), @@ -5920,7 +5927,7 @@ void BoUpSLP::reorderTopToBottom() { // Patterns like [fadd,fsub] can be combined into a single instruction in // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need // to take into account their order when looking for the most used order. - if (TE->hasState() && TE->isAltShuffle()) { + if (TE->isAltShuffle()) { VectorType *VecTy = getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size()); unsigned Opcode0 = TE->getOpcode(); @@ -5999,7 +6006,7 @@ void BoUpSLP::reorderTopToBottom() { if (It != GathersToOrders.end()) return It->second; } - if (OpTE->hasState() && OpTE->isAltShuffle()) { + if (OpTE->isAltShuffle()) { auto It = AltShufflesToOrders.find(OpTE); if (It != AltShufflesToOrders.end()) return It->second; @@ -7600,7 +7607,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( } case Instruction::ExtractValue: case Instruction::ExtractElement: { - bool Reuse = canReuseExtract(VL, CurrentOrder); + bool Reuse = canReuseExtract(VL, VL0, CurrentOrder); // FIXME: Vectorizing is not supported yet for non-power-of-2 ops. if (!has_single_bit(VL.size())) return TreeEntry::NeedToGather; @@ -8613,7 +8620,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, TE->dump()); ValueList Left, Right; - VLOperands Ops(VL, S, *this); + VLOperands Ops(VL, VL0, *this); if (cast(VL0)->isCommutative()) { // Commutative predicate - collect + sort operands of the instructions // so that each side is more likely to have the same opcode. @@ -8881,7 +8888,7 @@ unsigned BoUpSLP::canMapToVector(Type *T) const { return N; } -bool BoUpSLP::canReuseExtract(ArrayRef VL, +bool BoUpSLP::canReuseExtract(ArrayRef VL, Value *OpValue, SmallVectorImpl &CurrentOrder, bool ResizeAllowed) const { const auto *It = find_if(VL, IsaPred); @@ -9535,7 +9542,7 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) { // Do not reorder nodes if it small (just 2 elements), all-constant or all // instructions have same opcode already. 
- if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) || + if (TE.Scalars.size() == 2 || (TE.getOpcode() && !TE.isAltShuffle()) || all_of(TE.Scalars, isConstant)) return; @@ -9754,7 +9761,7 @@ void BoUpSLP::transformNodes() { // Do not try partial vectorization for small nodes (<= 2), nodes with the // same opcode and same parent block or all constants. if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) || - !(!E.hasState() || E.getOpcode() == Instruction::Load || + !(!E.getOpcode() || E.getOpcode() == Instruction::Load || E.isAltShuffle() || !allSameBlock(VL)) || allConstant(VL) || isSplat(VL)) continue; @@ -9897,8 +9904,6 @@ void BoUpSLP::transformNodes() { E.ReorderIndices.clear(); } } - if (!E.hasState()) - continue; switch (E.getOpcode()) { case Instruction::Load: { // No need to reorder masked gather loads, just reorder the scalar @@ -10018,7 +10023,7 @@ void BoUpSLP::transformNodes() { getCanonicalGraphSize() <= SmallTree && count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()), [](const std::unique_ptr &TE) { - return TE->isGather() && TE->hasState() && + return TE->isGather() && TE->getOpcode() == Instruction::Load && !allSameBlock(TE->Scalars); }) == 1) @@ -10034,13 +10039,13 @@ void BoUpSLP::transformNodes() { for (std::unique_ptr &TE : VectorizableTree) { TreeEntry &E = *TE; if (E.isGather() && - ((E.hasState() && E.getOpcode() == Instruction::Load) || - (!E.hasState() && any_of(E.Scalars, - [&](Value *V) { - return isa(V) && - !isVectorized(V) && - !isDeleted(cast(V)); - }))) && + (E.getOpcode() == Instruction::Load || + (!E.getOpcode() && any_of(E.Scalars, + [&](Value *V) { + return isa(V) && + !isVectorized(V) && + !isDeleted(cast(V)); + }))) && !isSplat(E.Scalars)) { for (Value *V : E.Scalars) { auto *LI = dyn_cast(V); @@ -10634,7 +10639,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { bool PrevNodeFound = any_of( ArrayRef(R.VectorizableTree).take_front(E->Idx), [&](const std::unique_ptr &TE) { - return ((TE->hasState() && !TE->isAltShuffle() && + return ((!TE->isAltShuffle() && TE->getOpcode() == Instruction::ExtractElement) || TE->isGather()) && all_of(enumerate(TE->Scalars), [&](auto &&Data) { @@ -11760,7 +11765,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, for (const std::unique_ptr &TE : VectorizableTree) { if (TE.get() == E) break; - if (TE->hasState() && TE->isAltShuffle() && + if (TE->isAltShuffle() && ((TE->getOpcode() == E->getOpcode() && TE->getAltOpcode() == E->getAltOpcode()) || (TE->getOpcode() == E->getAltOpcode() && @@ -11922,12 +11927,10 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const { [this](Value *V) { return EphValues.contains(V); }) && (allConstant(TE->Scalars) || isSplat(TE->Scalars) || TE->Scalars.size() < Limit || - (((TE->hasState() && - TE->getOpcode() == Instruction::ExtractElement) || + ((TE->getOpcode() == Instruction::ExtractElement || all_of(TE->Scalars, IsaPred)) && isFixedVectorShuffle(TE->Scalars, Mask, AC)) || - ((TE->hasState() && TE->getOpcode() == Instruction::Load) && - (!TE->hasState() || !TE->isAltShuffle())) || + (TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()) || any_of(TE->Scalars, IsaPred)); }; @@ -12056,10 +12059,9 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { !VectorizableTree.empty() && all_of(VectorizableTree, [&](const std::unique_ptr &TE) { return (TE->isGather() && - (!TE->hasState() || - TE->getOpcode() != Instruction::ExtractElement) && + TE->getOpcode() != 
Instruction::ExtractElement && count_if(TE->Scalars, IsaPred) <= Limit) || - (TE->hasState() && TE->getOpcode() == Instruction::PHI); + TE->getOpcode() == Instruction::PHI; })) return true; @@ -12093,7 +12095,6 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { return false; if (VectorizableTree.back()->isGather() && - VectorizableTree.back()->hasState() && VectorizableTree.back()->isAltShuffle() && VectorizableTree.back()->getVectorFactor() > 2 && allSameBlock(VectorizableTree.back()->Scalars) && @@ -12118,7 +12119,7 @@ bool BoUpSLP::isTreeNotExtendable() const { getCanonicalGraphSize() <= SmallTree && count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()), [](const std::unique_ptr &TE) { - return TE->isGather() && TE->hasState() && + return TE->isGather() && TE->getOpcode() == Instruction::Load && !allSameBlock(TE->Scalars); }) == 1) @@ -12130,7 +12131,7 @@ bool BoUpSLP::isTreeNotExtendable() const { TreeEntry &E = *VectorizableTree[Idx]; if (!E.isGather()) continue; - if (E.hasState() && E.getOpcode() != Instruction::Load) + if (E.getOpcode() && E.getOpcode() != Instruction::Load) return false; if (isSplat(E.Scalars) || allConstant(E.Scalars)) continue; @@ -12440,7 +12441,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n"); continue; } - if (TE.isGather() && TE.hasState()) { + if (TE.isGather()) { if (const TreeEntry *E = getTreeEntry(TE.getMainOp()); E && E->getVectorFactor() == TE.getVectorFactor() && E->isSame(TE.Scalars)) { @@ -14871,15 +14872,14 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, } } // Gather extracts after we check for full matched gathers only. - if (!ExtractShuffles.empty() || !E->hasState() || - E->getOpcode() != Instruction::Load || - (((E->hasState() && E->getOpcode() == Instruction::Load) || + if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load || + ((E->getOpcode() == Instruction::Load || any_of(E->Scalars, IsaPred)) && any_of(E->Scalars, [this](Value *V) { return isa(V) && getTreeEntry(V); })) || - (E->hasState() && E->isAltShuffle()) || + E->isAltShuffle() || all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || isSplat(E->Scalars) || (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) { @@ -15259,7 +15259,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size()); if (E->isGather()) { // Set insert point for non-reduction initial nodes. - if (E->hasState() && E->Idx == 0 && !UserIgnoreList) + if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList) setInsertPointAfterBundle(E); Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs); E->VectorizedValue = Vec; @@ -18147,9 +18147,10 @@ void BoUpSLP::computeMinimumValueSizes() { return; SmallVector ToDemote; - auto ComputeMaxBitWidth = - [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot, - unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned { + auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot, + bool IsProfitableToDemoteRoot, unsigned Opcode, + unsigned Limit, bool IsTruncRoot, + bool IsSignedCmp) -> unsigned { ToDemote.clear(); // Check if the root is trunc and the next node is gather/buildvector, then // keep trunc in scalars, which is free in most cases. 
@@ -18190,14 +18191,11 @@ void BoUpSLP::computeMinimumValueSizes() { return MaxBitWidth; } - if (!E.hasState()) - return 0u; - unsigned VF = E.getVectorFactor(); Type *ScalarTy = E.Scalars.front()->getType(); unsigned ScalarTyNumElements = getNumElements(ScalarTy); auto *TreeRootIT = dyn_cast(ScalarTy->getScalarType()); - if (!TreeRootIT) + if (!TreeRootIT || !Opcode) return 0u; if (any_of(E.Scalars, @@ -18269,7 +18267,6 @@ void BoUpSLP::computeMinimumValueSizes() { IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF))) return 0u; - unsigned Opcode = E.getOpcode(); bool IsProfitableToDemote = Opcode == Instruction::Trunc || Opcode == Instruction::SExt || Opcode == Instruction::ZExt || NumParts > 1; @@ -18350,14 +18347,15 @@ void BoUpSLP::computeMinimumValueSizes() { while (NodeIdx < VectorizableTree.size()) { ArrayRef TreeRoot = VectorizableTree[NodeIdx]->Scalars; unsigned Limit = 2; + unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode(); if (IsTopRoot && ReductionBitWidth == DL->getTypeSizeInBits( VectorizableTree.front()->Scalars.front()->getType())) Limit = 3; unsigned MaxBitWidth = ComputeMaxBitWidth( - *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit, - IsTruncRoot, IsSignedCmp); + *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Opcode, + Limit, IsTruncRoot, IsSignedCmp); if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) { if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth) ReductionBitWidth = bit_ceil(MaxBitWidth); @@ -18400,21 +18398,19 @@ void BoUpSLP::computeMinimumValueSizes() { }); IsSignedCmp = NodeIdx < VectorizableTree.size() && - any_of( - VectorizableTree[NodeIdx]->UserTreeIndices, - [&](const EdgeInfo &EI) { - return (EI.UserTE->hasState() && - EI.UserTE->getOpcode() == Instruction::ICmp) && - any_of(EI.UserTE->Scalars, [&](Value *V) { - auto *IC = dyn_cast(V); - return IC && - (IC->isSigned() || - !isKnownNonNegative(IC->getOperand(0), - SimplifyQuery(*DL)) || - !isKnownNonNegative(IC->getOperand(1), - SimplifyQuery(*DL))); - }); - }); + any_of(VectorizableTree[NodeIdx]->UserTreeIndices, + [&](const EdgeInfo &EI) { + return EI.UserTE->getOpcode() == Instruction::ICmp && + any_of(EI.UserTE->Scalars, [&](Value *V) { + auto *IC = dyn_cast(V); + return IC && + (IC->isSigned() || + !isKnownNonNegative(IC->getOperand(0), + SimplifyQuery(*DL)) || + !isKnownNonNegative(IC->getOperand(1), + SimplifyQuery(*DL))); + }); + }); } // If the maximum bit width we compute is less than the width of the roots' From f926bcf9068c808b643a56322b7ef6910eb36599 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Thu, 9 Jan 2025 18:52:56 -0800 Subject: [PATCH 006/408] [clang-format][doc] Fix the description of BreakBinaryOperations --- clang/docs/ClangFormatStyleOptions.rst | 3 ++- clang/include/clang/Format/Format.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index 637ec23e0abaf..0edf7af72c24e 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -3441,7 +3441,7 @@ the configuration (without a prefix: ``Auto``). .. _BreakBinaryOperations: **BreakBinaryOperations** (``BreakBinaryOperationsStyle``) :versionbadge:`clang-format 20` :ref:`ΒΆ ` - The break constructor initializers style to use. + The break binary operations style to use. Possible values: @@ -3764,6 +3764,7 @@ the configuration (without a prefix: ``Auto``). lists. 
Important differences: + * No spaces inside the braced list. * No line break before the closing brace. * Indentation with the continuation indent, not with the block indent. diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index 8d41077549690..7c2afd4d94ab0 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -2298,7 +2298,7 @@ struct FormatStyle { BBO_RespectPrecedence }; - /// The break constructor initializers style to use. + /// The break binary operations style to use. /// \version 20 BreakBinaryOperationsStyle BreakBinaryOperations; @@ -2510,6 +2510,7 @@ struct FormatStyle { /// lists. /// /// Important differences: + /// /// * No spaces inside the braced list. /// * No line break before the closing brace. /// * Indentation with the continuation indent, not with the block indent. From 2ea34cdf2ba86cd129633f2a01fb695c79c0fe11 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Thu, 9 Jan 2025 19:10:15 -0800 Subject: [PATCH 007/408] [clang-format] Stop fixing indentation on namespace closing brace (#122234) Fixes #119790. --- clang/lib/Format/UnwrappedLineFormatter.cpp | 2 +- clang/unittests/Format/FormatTestSelective.cpp | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/clang/lib/Format/UnwrappedLineFormatter.cpp b/clang/lib/Format/UnwrappedLineFormatter.cpp index ec65fea6ec3df..cee84fb1191ab 100644 --- a/clang/lib/Format/UnwrappedLineFormatter.cpp +++ b/clang/lib/Format/UnwrappedLineFormatter.cpp @@ -1460,7 +1460,7 @@ unsigned UnwrappedLineFormatter::format( bool ContinueFormatting = TheLine.Level > RangeMinLevel || (TheLine.Level == RangeMinLevel && !PreviousRBrace && - !TheLine.startsWith(tok::r_brace)); + !TheLine.startsWith(TT_NamespaceRBrace)); bool FixIndentation = (FixBadIndentation || ContinueFormatting) && Indent != TheLine.First->OriginalColumn; diff --git a/clang/unittests/Format/FormatTestSelective.cpp b/clang/unittests/Format/FormatTestSelective.cpp index 3ae70a15d359b..624684c7a079b 100644 --- a/clang/unittests/Format/FormatTestSelective.cpp +++ b/clang/unittests/Format/FormatTestSelective.cpp @@ -388,6 +388,17 @@ TEST_F(FormatTestSelective, WrongIndent) { " int j;\n" // Format here. "}", 24, 0)); + EXPECT_EQ("namespace {\n" + "class C {\n" + " int i;\n" + "};\n" + "} // namespace", + format("namespace {\n" // Format here. + " class C {\n" + " int i;\n" + " };\n" + "}", + 1, 0)); } TEST_F(FormatTestSelective, AlwaysFormatsEntireMacroDefinitions) { From 211bcf67aadb1175af382f55403ae759177281c7 Mon Sep 17 00:00:00 2001 From: Chinmay Deshpande Date: Fri, 10 Jan 2025 09:05:41 +0530 Subject: [PATCH 008/408] [AMDGPU] Implement IR variant of isFMAFasterThanFMulAndFAdd (#121465) --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 56 +++++++++ llvm/lib/Target/AMDGPU/SIISelLowering.h | 3 + .../CodeGen/AMDGPU/prevent-fmul-hoist-ir.ll | 108 ++++++++++++------ 3 files changed, 132 insertions(+), 35 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 992f7ed99d3bb..e057c665e39da 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -5732,6 +5732,35 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, return false; } +// Refer to comments added to the MIR variant of isFMAFasterThanFMulAndFAdd for +// specific details. 
+bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F, + Type *Ty) const { + switch (Ty->getScalarSizeInBits()) { + case 16: { + SIModeRegisterDefaults Mode = SIModeRegisterDefaults(F, *Subtarget); + return Subtarget->has16BitInsts() && + Mode.FP64FP16Denormals != DenormalMode::getPreserveSign(); + } + case 32: { + if (!Subtarget->hasMadMacF32Insts()) + return Subtarget->hasFastFMAF32(); + + SIModeRegisterDefaults Mode = SIModeRegisterDefaults(F, *Subtarget); + if (Mode.FP32Denormals != DenormalMode::getPreserveSign()) + return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts(); + + return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts(); + } + case 64: + return true; + default: + break; + } + + return false; +} + bool SITargetLowering::isFMADLegal(const MachineInstr &MI, LLT Ty) const { if (!Ty.isScalar()) return false; @@ -16992,6 +17021,33 @@ bool SITargetLowering::checkForPhysRegDependency( return false; } +/// Check if it is profitable to hoist instruction in then/else to if. +bool SITargetLowering::isProfitableToHoist(Instruction *I) const { + if (!I->hasOneUse()) + return true; + + Instruction *User = I->user_back(); + // TODO: Add more patterns that are not profitable to hoist and + // handle modifiers such as fabs and fneg + switch (I->getOpcode()) { + case Instruction::FMul: { + if (User->getOpcode() != Instruction::FSub && + User->getOpcode() != Instruction::FAdd) + return true; + + const TargetOptions &Options = getTargetMachine().Options; + + return ((!I->hasAllowContract() || !User->hasAllowContract()) && + Options.AllowFPOpFusion != FPOpFusion::Fast && + !Options.UnsafeFPMath) || + !isFMAFasterThanFMulAndFAdd(*I->getFunction(), User->getType()); + } + default: + return true; + } + return true; +} + void SITargetLowering::emitExpandAtomicAddrSpacePredicate( Instruction *AI) const { // Given: atomicrmw fadd ptr %addr, float %val ordering diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 299c8f5f73923..27960a0940923 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -459,6 +459,7 @@ class SITargetLowering final : public AMDGPUTargetLowering { EVT VT) const override; bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, const LLT Ty) const override; + bool isFMAFasterThanFMulAndFAdd(const Function &F, Type *Ty) const override; bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override; bool isFMADLegal(const MachineInstr &MI, const LLT Ty) const override; @@ -538,6 +539,8 @@ class SITargetLowering final : public AMDGPUTargetLowering { const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override; + bool isProfitableToHoist(Instruction *I) const override; + bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN = false, diff --git a/llvm/test/CodeGen/AMDGPU/prevent-fmul-hoist-ir.ll b/llvm/test/CodeGen/AMDGPU/prevent-fmul-hoist-ir.ll index ef3e04c0e9968..c68cd82540911 100644 --- a/llvm/test/CodeGen/AMDGPU/prevent-fmul-hoist-ir.ll +++ b/llvm/test/CodeGen/AMDGPU/prevent-fmul-hoist-ir.ll @@ -11,16 +11,17 @@ define double @is_profitable_f64_contract(ptr dereferenceable(8) %ptr_x, ptr der ; GFX-NEXT: [[CMP:%.*]] = fcmp oeq double [[Y]], 0.000000e+00 ; GFX-NEXT: [[X:%.*]] = load double, ptr [[PTR_X]], align 8 ; GFX-NEXT: [[A_1:%.*]] = load double, ptr [[PTR_A]], align 8 -; GFX-NEXT: [[MUL:%.*]] = fmul contract double [[X]], [[A_1]] ; GFX-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]] ; 
GFX: [[COMMON_RET:.*]]: ; GFX-NEXT: [[COMMON_RET_OP:%.*]] = phi double [ [[ADD:%.*]], %[[IF_THEN]] ], [ [[SUB:%.*]], %[[IF_ELSE]] ] ; GFX-NEXT: ret double [[COMMON_RET_OP]] ; GFX: [[IF_THEN]]: +; GFX-NEXT: [[MUL:%.*]] = fmul contract double [[X]], [[A_1]] ; GFX-NEXT: [[ADD]] = fadd contract double 1.000000e+00, [[MUL]] ; GFX-NEXT: br label %[[COMMON_RET]] ; GFX: [[IF_ELSE]]: -; GFX-NEXT: [[SUB]] = fsub contract double [[MUL]], [[Y]] +; GFX-NEXT: [[MUL1:%.*]] = fmul contract double [[X]], [[A_1]] +; GFX-NEXT: [[SUB]] = fsub contract double [[MUL1]], [[Y]] ; GFX-NEXT: br label %[[COMMON_RET]] ; entry: @@ -93,16 +94,17 @@ define float @is_profitable_f32(ptr dereferenceable(8) %ptr_x, ptr dereferenceab ; GFX-NEXT: [[CMP:%.*]] = fcmp oeq float [[Y]], 0.000000e+00 ; GFX-NEXT: [[X:%.*]] = load float, ptr [[PTR_X]], align 8 ; GFX-NEXT: [[A_1:%.*]] = load float, ptr [[PTR_A]], align 8 -; GFX-NEXT: [[MUL:%.*]] = fmul contract float [[X]], [[A_1]] ; GFX-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]] ; GFX: [[COMMON_RET:.*]]: -; GFX-NEXT: [[COMMON_RET_OP:%.*]] = phi float [ [[MUL]], %[[IF_THEN]] ], [ [[SUB:%.*]], %[[IF_ELSE]] ] +; GFX-NEXT: [[COMMON_RET_OP:%.*]] = phi float [ [[MUL:%.*]], %[[IF_THEN]] ], [ [[SUB:%.*]], %[[IF_ELSE]] ] ; GFX-NEXT: ret float [[COMMON_RET_OP]] ; GFX: [[IF_THEN]]: +; GFX-NEXT: [[MUL]] = fmul contract float [[X]], [[A_1]] ; GFX-NEXT: [[ADD:%.*]] = fadd contract float 1.000000e+00, [[MUL]] ; GFX-NEXT: br label %[[COMMON_RET]] ; GFX: [[IF_ELSE]]: -; GFX-NEXT: [[SUB]] = fsub contract float [[MUL]], [[Y]] +; GFX-NEXT: [[MUL1:%.*]] = fmul contract float [[X]], [[A_1]] +; GFX-NEXT: [[SUB]] = fsub contract float [[MUL1]], [[Y]] ; GFX-NEXT: br label %[[COMMON_RET]] ; entry: @@ -111,7 +113,6 @@ entry: %x = load float, ptr %ptr_x, align 8 br i1 %cmp, label %if.then, label %if.else - if.then: ; preds = %entry %a_1 = load float, ptr %ptr_a, align 8 %mul = fmul contract float %x, %a_1 @@ -172,16 +173,17 @@ define half @is_profitable_f16_ieee(ptr dereferenceable(8) %ptr_x, ptr dereferen ; GFX-NEXT: [[CMP:%.*]] = fcmp oeq half [[Y]], 0xH0000 ; GFX-NEXT: [[X:%.*]] = load half, ptr [[PTR_X]], align 8 ; GFX-NEXT: [[A_1:%.*]] = load half, ptr [[PTR_A]], align 8 -; GFX-NEXT: [[MUL:%.*]] = fmul contract half [[X]], [[A_1]] ; GFX-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]] ; GFX: [[COMMON_RET:.*]]: -; GFX-NEXT: [[COMMON_RET_OP:%.*]] = phi half [ [[MUL]], %[[IF_THEN]] ], [ [[SUB:%.*]], %[[IF_ELSE]] ] +; GFX-NEXT: [[COMMON_RET_OP:%.*]] = phi half [ [[MUL:%.*]], %[[IF_THEN]] ], [ [[SUB:%.*]], %[[IF_ELSE]] ] ; GFX-NEXT: ret half [[COMMON_RET_OP]] ; GFX: [[IF_THEN]]: +; GFX-NEXT: [[MUL]] = fmul contract half [[X]], [[A_1]] ; GFX-NEXT: [[ADD:%.*]] = fadd contract half [[Y]], [[MUL]] ; GFX-NEXT: br label %[[COMMON_RET]] ; GFX: [[IF_ELSE]]: -; GFX-NEXT: [[SUB]] = fsub contract half [[MUL]], [[Y]] +; GFX-NEXT: [[MUL1:%.*]] = fmul contract half [[X]], [[A_1]] +; GFX-NEXT: [[SUB]] = fsub contract half [[MUL1]], [[Y]] ; GFX-NEXT: br label %[[COMMON_RET]] ; entry: @@ -250,16 +252,17 @@ define bfloat @is_profitable_bfloat_ieee(ptr dereferenceable(8) %ptr_x, ptr dere ; GFX-NEXT: [[CMP:%.*]] = fcmp oeq bfloat [[Y]], 0xR0000 ; GFX-NEXT: [[X:%.*]] = load bfloat, ptr [[PTR_X]], align 8 ; GFX-NEXT: [[A_1:%.*]] = load bfloat, ptr [[PTR_A]], align 8 -; GFX-NEXT: [[MUL:%.*]] = fmul contract bfloat [[X]], [[A_1]] ; GFX-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]] ; GFX: [[COMMON_RET:.*]]: -; GFX-NEXT: [[COMMON_RET_OP:%.*]] = phi bfloat [ [[MUL]], 
%[[IF_THEN]] ], [ [[SUB:%.*]], %[[IF_ELSE]] ] +; GFX-NEXT: [[COMMON_RET_OP:%.*]] = phi bfloat [ [[MUL:%.*]], %[[IF_THEN]] ], [ [[SUB:%.*]], %[[IF_ELSE]] ] ; GFX-NEXT: ret bfloat [[COMMON_RET_OP]] ; GFX: [[IF_THEN]]: +; GFX-NEXT: [[MUL]] = fmul contract bfloat [[X]], [[A_1]] ; GFX-NEXT: [[ADD:%.*]] = fadd contract bfloat 0xR3F80, [[MUL]] ; GFX-NEXT: br label %[[COMMON_RET]] ; GFX: [[IF_ELSE]]: -; GFX-NEXT: [[SUB]] = fsub contract bfloat [[MUL]], [[Y]] +; GFX-NEXT: [[MUL1:%.*]] = fmul contract bfloat [[X]], [[A_1]] +; GFX-NEXT: [[SUB]] = fsub contract bfloat [[MUL1]], [[Y]] ; GFX-NEXT: br label %[[COMMON_RET]] ; entry: @@ -330,16 +333,17 @@ define <8 x half> @is_profitable_vector(ptr dereferenceable(8) %ptr_x, ptr deref ; GFX-NEXT: [[V1:%.*]] = load <8 x half>, ptr addrspace(3) @v1_ptr, align 16 ; GFX-NEXT: [[V2:%.*]] = load <8 x half>, ptr addrspace(3) @v2_ptr, align 16 ; GFX-NEXT: [[CMP:%.*]] = fcmp oeq double [[Y]], 0.000000e+00 -; GFX-NEXT: [[MUL:%.*]] = fmul contract <8 x half> [[V1]], [[X]] ; GFX-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]] ; GFX: [[COMMON_RET:.*]]: ; GFX-NEXT: [[COMMON_RET_OP:%.*]] = phi <8 x half> [ [[ADD:%.*]], %[[IF_THEN]] ], [ [[SUB:%.*]], %[[IF_ELSE]] ] ; GFX-NEXT: ret <8 x half> [[COMMON_RET_OP]] ; GFX: [[IF_THEN]]: +; GFX-NEXT: [[MUL:%.*]] = fmul contract <8 x half> [[V1]], [[X]] ; GFX-NEXT: [[ADD]] = fadd contract <8 x half> [[V2]], [[MUL]] ; GFX-NEXT: br label %[[COMMON_RET]] ; GFX: [[IF_ELSE]]: -; GFX-NEXT: [[SUB]] = fsub contract <8 x half> [[MUL]], [[V2]] +; GFX-NEXT: [[MUL1:%.*]] = fmul contract <8 x half> [[V1]], [[X]] +; GFX-NEXT: [[SUB]] = fsub contract <8 x half> [[MUL1]], [[V2]] ; GFX-NEXT: br label %[[COMMON_RET]] ; entry: @@ -362,23 +366,61 @@ if.else: ; preds = %entry } define double @is_profitable_f64_nocontract(ptr dereferenceable(8) %ptr_x, ptr dereferenceable(8) %ptr_y, ptr dereferenceable(8) %ptr_a) #0 { -; GFX-LABEL: define double @is_profitable_f64_nocontract( -; GFX-SAME: ptr dereferenceable(8) [[PTR_X:%.*]], ptr dereferenceable(8) [[PTR_Y:%.*]], ptr dereferenceable(8) [[PTR_A:%.*]]) #[[ATTR0]] { -; GFX-NEXT: [[Y:%.*]] = load double, ptr [[PTR_Y]], align 8 -; GFX-NEXT: [[CMP:%.*]] = fcmp oeq double [[Y]], 0.000000e+00 -; GFX-NEXT: [[X:%.*]] = load double, ptr [[PTR_X]], align 8 -; GFX-NEXT: [[A_1:%.*]] = load double, ptr [[PTR_A]], align 8 -; GFX-NEXT: [[MUL:%.*]] = fmul double [[X]], [[A_1]] -; GFX-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]] -; GFX: [[COMMON_RET:.*]]: -; GFX-NEXT: [[COMMON_RET_OP:%.*]] = phi double [ [[PTR_ADD:%.*]], %[[IF_THEN]] ], [ [[SUB:%.*]], %[[IF_ELSE]] ] -; GFX-NEXT: ret double [[COMMON_RET_OP]] -; GFX: [[IF_THEN]]: -; GFX-NEXT: [[PTR_ADD]] = fadd double 1.000000e+00, [[MUL]] -; GFX-NEXT: br label %[[COMMON_RET]] -; GFX: [[IF_ELSE]]: -; GFX-NEXT: [[SUB]] = fsub double [[MUL]], [[Y]] -; GFX-NEXT: br label %[[COMMON_RET]] +; FP-CONTRACT-FAST-LABEL: define double @is_profitable_f64_nocontract( +; FP-CONTRACT-FAST-SAME: ptr dereferenceable(8) [[PTR_X:%.*]], ptr dereferenceable(8) [[PTR_Y:%.*]], ptr dereferenceable(8) [[PTR_A:%.*]]) #[[ATTR0]] { +; FP-CONTRACT-FAST-NEXT: [[Y:%.*]] = load double, ptr [[PTR_Y]], align 8 +; FP-CONTRACT-FAST-NEXT: [[CMP:%.*]] = fcmp oeq double [[Y]], 0.000000e+00 +; FP-CONTRACT-FAST-NEXT: [[X:%.*]] = load double, ptr [[PTR_X]], align 8 +; FP-CONTRACT-FAST-NEXT: [[A_1:%.*]] = load double, ptr [[PTR_A]], align 8 +; FP-CONTRACT-FAST-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]] +; FP-CONTRACT-FAST: [[COMMON_RET:.*]]: +; 
FP-CONTRACT-FAST-NEXT: [[COMMON_RET_OP:%.*]] = phi double [ [[PTR_ADD:%.*]], %[[IF_THEN]] ], [ [[SUB:%.*]], %[[IF_ELSE]] ] +; FP-CONTRACT-FAST-NEXT: ret double [[COMMON_RET_OP]] +; FP-CONTRACT-FAST: [[IF_THEN]]: +; FP-CONTRACT-FAST-NEXT: [[MUL:%.*]] = fmul double [[X]], [[A_1]] +; FP-CONTRACT-FAST-NEXT: [[PTR_ADD]] = fadd double 1.000000e+00, [[MUL]] +; FP-CONTRACT-FAST-NEXT: br label %[[COMMON_RET]] +; FP-CONTRACT-FAST: [[IF_ELSE]]: +; FP-CONTRACT-FAST-NEXT: [[MUL1:%.*]] = fmul double [[X]], [[A_1]] +; FP-CONTRACT-FAST-NEXT: [[SUB]] = fsub double [[MUL1]], [[Y]] +; FP-CONTRACT-FAST-NEXT: br label %[[COMMON_RET]] +; +; UNSAFE-FP-MATH-LABEL: define double @is_profitable_f64_nocontract( +; UNSAFE-FP-MATH-SAME: ptr dereferenceable(8) [[PTR_X:%.*]], ptr dereferenceable(8) [[PTR_Y:%.*]], ptr dereferenceable(8) [[PTR_A:%.*]]) #[[ATTR0]] { +; UNSAFE-FP-MATH-NEXT: [[Y:%.*]] = load double, ptr [[PTR_Y]], align 8 +; UNSAFE-FP-MATH-NEXT: [[CMP:%.*]] = fcmp oeq double [[Y]], 0.000000e+00 +; UNSAFE-FP-MATH-NEXT: [[X:%.*]] = load double, ptr [[PTR_X]], align 8 +; UNSAFE-FP-MATH-NEXT: [[A_1:%.*]] = load double, ptr [[PTR_A]], align 8 +; UNSAFE-FP-MATH-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]] +; UNSAFE-FP-MATH: [[COMMON_RET:.*]]: +; UNSAFE-FP-MATH-NEXT: [[COMMON_RET_OP:%.*]] = phi double [ [[PTR_ADD:%.*]], %[[IF_THEN]] ], [ [[SUB:%.*]], %[[IF_ELSE]] ] +; UNSAFE-FP-MATH-NEXT: ret double [[COMMON_RET_OP]] +; UNSAFE-FP-MATH: [[IF_THEN]]: +; UNSAFE-FP-MATH-NEXT: [[MUL:%.*]] = fmul double [[X]], [[A_1]] +; UNSAFE-FP-MATH-NEXT: [[PTR_ADD]] = fadd double 1.000000e+00, [[MUL]] +; UNSAFE-FP-MATH-NEXT: br label %[[COMMON_RET]] +; UNSAFE-FP-MATH: [[IF_ELSE]]: +; UNSAFE-FP-MATH-NEXT: [[MUL1:%.*]] = fmul double [[X]], [[A_1]] +; UNSAFE-FP-MATH-NEXT: [[SUB]] = fsub double [[MUL1]], [[Y]] +; UNSAFE-FP-MATH-NEXT: br label %[[COMMON_RET]] +; +; NO-UNSAFE-FP-MATH-LABEL: define double @is_profitable_f64_nocontract( +; NO-UNSAFE-FP-MATH-SAME: ptr dereferenceable(8) [[PTR_X:%.*]], ptr dereferenceable(8) [[PTR_Y:%.*]], ptr dereferenceable(8) [[PTR_A:%.*]]) #[[ATTR0]] { +; NO-UNSAFE-FP-MATH-NEXT: [[Y:%.*]] = load double, ptr [[PTR_Y]], align 8 +; NO-UNSAFE-FP-MATH-NEXT: [[CMP:%.*]] = fcmp oeq double [[Y]], 0.000000e+00 +; NO-UNSAFE-FP-MATH-NEXT: [[X:%.*]] = load double, ptr [[PTR_X]], align 8 +; NO-UNSAFE-FP-MATH-NEXT: [[A_1:%.*]] = load double, ptr [[PTR_A]], align 8 +; NO-UNSAFE-FP-MATH-NEXT: [[MUL:%.*]] = fmul double [[X]], [[A_1]] +; NO-UNSAFE-FP-MATH-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]] +; NO-UNSAFE-FP-MATH: [[COMMON_RET:.*]]: +; NO-UNSAFE-FP-MATH-NEXT: [[COMMON_RET_OP:%.*]] = phi double [ [[PTR_ADD:%.*]], %[[IF_THEN]] ], [ [[SUB:%.*]], %[[IF_ELSE]] ] +; NO-UNSAFE-FP-MATH-NEXT: ret double [[COMMON_RET_OP]] +; NO-UNSAFE-FP-MATH: [[IF_THEN]]: +; NO-UNSAFE-FP-MATH-NEXT: [[PTR_ADD]] = fadd double 1.000000e+00, [[MUL]] +; NO-UNSAFE-FP-MATH-NEXT: br label %[[COMMON_RET]] +; NO-UNSAFE-FP-MATH: [[IF_ELSE]]: +; NO-UNSAFE-FP-MATH-NEXT: [[SUB]] = fsub double [[MUL]], [[Y]] +; NO-UNSAFE-FP-MATH-NEXT: br label %[[COMMON_RET]] ; %y = load double, ptr %ptr_y, align 8 %cmp = fcmp oeq double %y, 0.000000e+00 @@ -400,7 +442,3 @@ if.else: ; preds = %entry attributes #0 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" } attributes #1 = { nounwind "denormal-fp-math"="ieee,ieee" } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; FP-CONTRACT-FAST: {{.*}} -; NO-UNSAFE-FP-MATH: {{.*}} -; UNSAFE-FP-MATH: {{.*}} From 9c2de994a18b8eb50634e620af762ed28ec5dcc2 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 9 Jan 2025 20:19:01 -0800 Subject: [PATCH 009/408] [nfc][BoundsChecking] Refactor BoundsCheckingOptions (#122346) Remove ReportingMode and ReportingOpts. --- clang/lib/CodeGen/BackendUtil.cpp | 35 +++---- .../Instrumentation/BoundsChecking.h | 21 ++-- llvm/lib/Passes/PassBuilder.cpp | 25 +++-- .../Instrumentation/BoundsChecking.cpp | 99 +++++++------------ 4 files changed, 77 insertions(+), 103 deletions(-) diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 2dbab785658aa..f350accfc530b 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -1025,26 +1025,21 @@ void EmitAssemblyHelper::RunOptimizationPipeline( // Register callbacks to schedule sanitizer passes at the appropriate part // of the pipeline. if (LangOpts.Sanitize.has(SanitizerKind::LocalBounds)) - PB.registerScalarOptimizerLateEPCallback( - [this](FunctionPassManager &FPM, OptimizationLevel Level) { - BoundsCheckingPass::ReportingMode Mode; - bool Merge = CodeGenOpts.SanitizeMergeHandlers.has( - SanitizerKind::LocalBounds); - - if (CodeGenOpts.SanitizeTrap.has(SanitizerKind::LocalBounds)) { - Mode = BoundsCheckingPass::ReportingMode::Trap; - } else if (CodeGenOpts.SanitizeMinimalRuntime) { - Mode = CodeGenOpts.SanitizeRecover.has(SanitizerKind::LocalBounds) - ? BoundsCheckingPass::ReportingMode::MinRuntime - : BoundsCheckingPass::ReportingMode::MinRuntimeAbort; - } else { - Mode = CodeGenOpts.SanitizeRecover.has(SanitizerKind::LocalBounds) - ? BoundsCheckingPass::ReportingMode::FullRuntime - : BoundsCheckingPass::ReportingMode::FullRuntimeAbort; - } - BoundsCheckingPass::BoundsCheckingOptions Options(Mode, Merge); - FPM.addPass(BoundsCheckingPass(Options)); - }); + PB.registerScalarOptimizerLateEPCallback([this](FunctionPassManager &FPM, + OptimizationLevel Level) { + BoundsCheckingPass::BoundsCheckingOptions Options; + Options.Merge = + CodeGenOpts.SanitizeMergeHandlers.has(SanitizerKind::LocalBounds); + if (!CodeGenOpts.SanitizeTrap.has(SanitizerKind::LocalBounds)) { + Options.Rt = { + /*MinRuntime=*/static_cast( + CodeGenOpts.SanitizeMinimalRuntime), + /*MayReturn=*/ + CodeGenOpts.SanitizeRecover.has(SanitizerKind::LocalBounds), + }; + } + FPM.addPass(BoundsCheckingPass(Options)); + }); // Don't add sanitizers if we are here from ThinLTO PostLink. That already // done on PreLink stage. 
diff --git a/llvm/include/llvm/Transforms/Instrumentation/BoundsChecking.h b/llvm/include/llvm/Transforms/Instrumentation/BoundsChecking.h index ee71aa64f85ee..9c0506428bd62 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/BoundsChecking.h +++ b/llvm/include/llvm/Transforms/Instrumentation/BoundsChecking.h @@ -10,6 +10,7 @@ #define LLVM_TRANSFORMS_INSTRUMENTATION_BOUNDSCHECKING_H #include "llvm/IR/PassManager.h" +#include namespace llvm { class Function; @@ -19,19 +20,15 @@ class Function; class BoundsCheckingPass : public PassInfoMixin { public: - enum class ReportingMode { - Trap, - MinRuntime, - MinRuntimeAbort, - FullRuntime, - FullRuntimeAbort, - }; - struct BoundsCheckingOptions { - BoundsCheckingOptions(ReportingMode Mode, bool Merge); - - ReportingMode Mode; - bool Merge; + struct Runtime { + Runtime(bool MinRuntime, bool MayReturn) + : MinRuntime(MinRuntime), MayReturn(MayReturn) {} + bool MinRuntime; + bool MayReturn; + }; + std::optional Rt; // Trap if empty. + bool Merge = false; }; BoundsCheckingPass(BoundsCheckingOptions Options) : Options(Options) {} diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 30b8d7c949948..b75387ac556e3 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -1286,21 +1286,32 @@ parseRegAllocFastPassOptions(PassBuilder &PB, StringRef Params) { Expected parseBoundsCheckingOptions(StringRef Params) { - BoundsCheckingPass::BoundsCheckingOptions Options( - BoundsCheckingPass::ReportingMode::Trap, false); + BoundsCheckingPass::BoundsCheckingOptions Options; while (!Params.empty()) { StringRef ParamName; std::tie(ParamName, Params) = Params.split(';'); if (ParamName == "trap") { - Options.Mode = BoundsCheckingPass::ReportingMode::Trap; + Options.Rt = std::nullopt; } else if (ParamName == "rt") { - Options.Mode = BoundsCheckingPass::ReportingMode::FullRuntime; + Options.Rt = { + /*MinRuntime=*/false, + /*MayReturn=*/true, + }; } else if (ParamName == "rt-abort") { - Options.Mode = BoundsCheckingPass::ReportingMode::FullRuntimeAbort; + Options.Rt = { + /*MinRuntime=*/false, + /*MayReturn=*/false, + }; } else if (ParamName == "min-rt") { - Options.Mode = BoundsCheckingPass::ReportingMode::MinRuntime; + Options.Rt = { + /*MinRuntime=*/true, + /*MayReturn=*/true, + }; } else if (ParamName == "min-rt-abort") { - Options.Mode = BoundsCheckingPass::ReportingMode::MinRuntimeAbort; + Options.Rt = { + /*MinRuntime=*/true, + /*MayReturn=*/false, + }; } else if (ParamName == "merge") { Options.Merge = true; } else { diff --git a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp index 41e5038581246..3d2a2663230c0 100644 --- a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp +++ b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp @@ -43,10 +43,6 @@ STATISTIC(ChecksUnable, "Bounds checks unable to add"); using BuilderTy = IRBuilder; -BoundsCheckingPass::BoundsCheckingOptions::BoundsCheckingOptions( - ReportingMode Mode, bool Merge) - : Mode(Mode), Merge(Merge) {} - /// Gets the conditions under which memory accessing instructions will overflow. 
/// /// \p Ptr is the pointer that will be read/written, and \p InstVal is either @@ -166,42 +162,19 @@ static void insertBoundsCheck(Value *Or, BuilderTy &IRB, GetTrapBBT GetTrapBB) { BranchInst::Create(TrapBB, Cont, Or, OldBB); } -struct ReportingOpts { - bool MayReturn = false; - bool UseTrap = false; - bool MinRuntime = false; - bool MayMerge = true; - StringRef Name; - - ReportingOpts(BoundsCheckingPass::ReportingMode Mode, bool Merge) { - switch (Mode) { - case BoundsCheckingPass::ReportingMode::Trap: - UseTrap = true; - break; - case BoundsCheckingPass::ReportingMode::MinRuntime: - Name = "__ubsan_handle_local_out_of_bounds_minimal"; - MinRuntime = true; - MayReturn = true; - break; - case BoundsCheckingPass::ReportingMode::MinRuntimeAbort: - Name = "__ubsan_handle_local_out_of_bounds_minimal_abort"; - MinRuntime = true; - break; - case BoundsCheckingPass::ReportingMode::FullRuntime: - Name = "__ubsan_handle_local_out_of_bounds"; - MayReturn = true; - break; - case BoundsCheckingPass::ReportingMode::FullRuntimeAbort: - Name = "__ubsan_handle_local_out_of_bounds_abort"; - break; - } - - MayMerge = Merge; - } -}; +static std::string getRuntimeCallName( + const BoundsCheckingPass::BoundsCheckingOptions::Runtime &Opts) { + std::string Name = "__ubsan_handle_local_out_of_bounds"; + if (Opts.MinRuntime) + Name += "_minimal"; + if (!Opts.MayReturn) + Name += "_abort"; + return Name; +} -static bool addBoundsChecking(Function &F, TargetLibraryInfo &TLI, - ScalarEvolution &SE, const ReportingOpts &Opts) { +static bool +addBoundsChecking(Function &F, TargetLibraryInfo &TLI, ScalarEvolution &SE, + const BoundsCheckingPass::BoundsCheckingOptions &Opts) { if (F.hasFnAttribute(Attribute::NoSanitizeBounds)) return false; @@ -239,11 +212,16 @@ static bool addBoundsChecking(Function &F, TargetLibraryInfo &TLI, TrapInfo.push_back(std::make_pair(&I, Or)); } + std::string Name; + if (Opts.Rt) + Name = getRuntimeCallName(*Opts.Rt); + // Create a trapping basic block on demand using a callback. Depending on // flags, this will either create a single block for the entire function or // will create a fresh block every time it is called. BasicBlock *ReuseTrapBB = nullptr; - auto GetTrapBB = [&ReuseTrapBB, &Opts](BuilderTy &IRB, BasicBlock *Cont) { + auto GetTrapBB = [&ReuseTrapBB, &Opts, &Name](BuilderTy &IRB, + BasicBlock *Cont) { Function *Fn = IRB.GetInsertBlock()->getParent(); auto DebugLoc = IRB.getCurrentDebugLocation(); IRBuilder<>::InsertPointGuard Guard(IRB); @@ -257,23 +235,24 @@ static bool addBoundsChecking(Function &F, TargetLibraryInfo &TLI, BasicBlock *TrapBB = BasicBlock::Create(Fn->getContext(), "trap", Fn); IRB.SetInsertPoint(TrapBB); - bool DebugTrapBB = !Opts.MayMerge; - CallInst *TrapCall = Opts.UseTrap - ? InsertTrap(IRB, DebugTrapBB) - : InsertCall(IRB, Opts.MayReturn, Opts.Name); + bool DebugTrapBB = !Opts.Merge; + CallInst *TrapCall = Opts.Rt ? 
InsertCall(IRB, Opts.Rt->MayReturn, Name) + : InsertTrap(IRB, DebugTrapBB); if (DebugTrapBB) TrapCall->addFnAttr(llvm::Attribute::NoMerge); TrapCall->setDoesNotThrow(); TrapCall->setDebugLoc(DebugLoc); - if (Opts.MayReturn) { + + bool MayReturn = Opts.Rt && Opts.Rt->MayReturn; + if (MayReturn) { IRB.CreateBr(Cont); } else { TrapCall->setDoesNotReturn(); IRB.CreateUnreachable(); } - if (!Opts.MayReturn && SingleTrapBB && !DebugTrapBB) + if (!MayReturn && SingleTrapBB && !DebugTrapBB) ReuseTrapBB = TrapBB; return TrapBB; @@ -292,8 +271,7 @@ PreservedAnalyses BoundsCheckingPass::run(Function &F, FunctionAnalysisManager & auto &TLI = AM.getResult(F); auto &SE = AM.getResult(F); - if (!addBoundsChecking(F, TLI, SE, - ReportingOpts(Options.Mode, Options.Merge))) + if (!addBoundsChecking(F, TLI, SE, Options)) return PreservedAnalyses::all(); return PreservedAnalyses::none(); @@ -303,22 +281,15 @@ void BoundsCheckingPass::printPipeline( raw_ostream &OS, function_ref MapClassName2PassName) { static_cast *>(this)->printPipeline( OS, MapClassName2PassName); - switch (Options.Mode) { - case ReportingMode::Trap: - OS << "MinRuntime) + OS << "min-"; + OS << "rt"; + if (!Options.Rt->MayReturn) + OS << "-abort"; + } else { + OS << "trap"; } if (Options.Merge) OS << ";merge"; From 4c8fdc29549f9c7c3c710e3ada2cb00a70af609e Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 9 Jan 2025 20:38:13 -0800 Subject: [PATCH 010/408] [nfc][BoundsChecking] Rename BoundsCheckingOptions into Options (#122359) --- clang/lib/CodeGen/BackendUtil.cpp | 2 +- .../Instrumentation/BoundsChecking.h | 6 +++--- llvm/lib/Passes/PassBuilder.cpp | 4 ++-- llvm/lib/Passes/PassRegistry.def | 2 +- .../Instrumentation/BoundsChecking.cpp | 20 +++++++++---------- 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index f350accfc530b..37ef2bd203fdf 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -1027,7 +1027,7 @@ void EmitAssemblyHelper::RunOptimizationPipeline( if (LangOpts.Sanitize.has(SanitizerKind::LocalBounds)) PB.registerScalarOptimizerLateEPCallback([this](FunctionPassManager &FPM, OptimizationLevel Level) { - BoundsCheckingPass::BoundsCheckingOptions Options; + BoundsCheckingPass::Options Options; Options.Merge = CodeGenOpts.SanitizeMergeHandlers.has(SanitizerKind::LocalBounds); if (!CodeGenOpts.SanitizeTrap.has(SanitizerKind::LocalBounds)) { diff --git a/llvm/include/llvm/Transforms/Instrumentation/BoundsChecking.h b/llvm/include/llvm/Transforms/Instrumentation/BoundsChecking.h index 9c0506428bd62..836fc907375d3 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/BoundsChecking.h +++ b/llvm/include/llvm/Transforms/Instrumentation/BoundsChecking.h @@ -20,7 +20,7 @@ class Function; class BoundsCheckingPass : public PassInfoMixin { public: - struct BoundsCheckingOptions { + struct Options { struct Runtime { Runtime(bool MinRuntime, bool MayReturn) : MinRuntime(MinRuntime), MayReturn(MayReturn) {} @@ -31,14 +31,14 @@ class BoundsCheckingPass : public PassInfoMixin { bool Merge = false; }; - BoundsCheckingPass(BoundsCheckingOptions Options) : Options(Options) {} + BoundsCheckingPass(Options Opts) : Opts(Opts) {} PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); static bool isRequired() { return true; } void printPipeline(raw_ostream &OS, function_ref MapClassName2PassName); private: - BoundsCheckingOptions Options; + Options Opts; }; } // end namespace llvm diff --git 
a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index b75387ac556e3..aac4407740055 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -1284,9 +1284,9 @@ parseRegAllocFastPassOptions(PassBuilder &PB, StringRef Params) {
   return Opts;
 }
 
-Expected<BoundsCheckingPass::BoundsCheckingOptions>
+Expected<BoundsCheckingPass::Options>
 parseBoundsCheckingOptions(StringRef Params) {
-  BoundsCheckingPass::BoundsCheckingOptions Options;
+  BoundsCheckingPass::Options Options;
   while (!Params.empty()) {
     StringRef ParamName;
     std::tie(ParamName, Params) = Params.split(';');
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 13e192fffbdd9..1021d7fcd9247 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -623,7 +623,7 @@ FUNCTION_PASS_WITH_PARAMS(
     parseWinEHPrepareOptions, "demote-catchswitch-only")
 FUNCTION_PASS_WITH_PARAMS(
     "bounds-checking", "BoundsCheckingPass",
-    [](BoundsCheckingPass::BoundsCheckingOptions Options) {
+    [](BoundsCheckingPass::Options Options) {
       return BoundsCheckingPass(Options);
     },
     parseBoundsCheckingOptions, "trap")
diff --git a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
index 3d2a2663230c0..10596f87fbcab 100644
--- a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
+++ b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
@@ -162,8 +162,8 @@ static void insertBoundsCheck(Value *Or, BuilderTy &IRB, GetTrapBBT GetTrapBB) {
   BranchInst::Create(TrapBB, Cont, Or, OldBB);
 }
 
-static std::string getRuntimeCallName(
-    const BoundsCheckingPass::BoundsCheckingOptions::Runtime &Opts) {
+static std::string
+getRuntimeCallName(const BoundsCheckingPass::Options::Runtime &Opts) {
   std::string Name = "__ubsan_handle_local_out_of_bounds";
   if (Opts.MinRuntime)
     Name += "_minimal";
@@ -172,9 +172,9 @@ static std::string getRuntimeCallName(
   return Name;
 }
 
-static bool
-addBoundsChecking(Function &F, TargetLibraryInfo &TLI, ScalarEvolution &SE,
-                  const BoundsCheckingPass::BoundsCheckingOptions &Opts) {
+static bool addBoundsChecking(Function &F, TargetLibraryInfo &TLI,
+                              ScalarEvolution &SE,
+                              const BoundsCheckingPass::Options &Opts) {
   if (F.hasFnAttribute(Attribute::NoSanitizeBounds))
     return false;
 
@@ -271,7 +271,7 @@ PreservedAnalyses BoundsCheckingPass::run(Function &F, FunctionAnalysisManager &
   auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
   auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
 
-  if (!addBoundsChecking(F, TLI, SE, Options))
+  if (!addBoundsChecking(F, TLI, SE, Opts))
     return PreservedAnalyses::all();
 
   return PreservedAnalyses::none();
@@ -282,16 +282,16 @@ void BoundsCheckingPass::printPipeline(
   static_cast<PassInfoMixin<BoundsCheckingPass> *>(this)->printPipeline(
       OS, MapClassName2PassName);
   OS << "<";
-  if (Options.Rt) {
-    if (Options.Rt->MinRuntime)
+  if (Opts.Rt) {
+    if (Opts.Rt->MinRuntime)
       OS << "min-";
     OS << "rt";
-    if (!Options.Rt->MayReturn)
+    if (!Opts.Rt->MayReturn)
       OS << "-abort";
   } else {
     OS << "trap";
   }
-  if (Options.Merge)
+  if (Opts.Merge)
     OS << ";merge";
   OS << ">";
 }

From 73dd730fb9403ca648a46b489bf04e27b2a93840 Mon Sep 17 00:00:00 2001
From: Schrodinger ZHU Yifan
Date: Fri, 10 Jan 2025 12:49:00 +0800
Subject: [PATCH 011/408] [libc] implement sys/uio/writev (#122233)

Implement sys/uio/writev according to the POSIX standard. This
vectorized I/O API is needed by many logging libraries to write
multiple strings atomically.
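For readers who have not used the interface before, here is a minimal
caller-side sketch of plain POSIX writev (illustrative only, not code from
this patch; the file descriptor and buffers are made up for the example):

  // Gather two buffers into a single writev() call, the way a logging
  // library might keep one log record contiguous in the output.
  #include <sys/uio.h>   // struct iovec, writev (POSIX)
  #include <unistd.h>    // STDOUT_FILENO, ssize_t

  int main(void) {
    char prefix[] = "[log] ";
    char message[] = "hello from writev\n";
    struct iovec iov[2];
    iov[0].iov_base = prefix;
    iov[0].iov_len = sizeof(prefix) - 1;   // exclude the trailing NUL
    iov[1].iov_base = message;
    iov[1].iov_len = sizeof(message) - 1;
    // Both buffers go out in one system call, in order, so concurrent
    // writers cannot interleave their output between the two pieces.
    ssize_t n = writev(STDOUT_FILENO, iov, 2);
    return n < 0 ? 1 : 0;
  }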
--- libc/config/linux/aarch64/entrypoints.txt | 3 +++ libc/config/linux/x86_64/entrypoints.txt | 3 +++ libc/hdr/types/CMakeLists.txt | 9 +++++++ libc/hdr/types/struct_iovec.h | 21 ++++++++++++++++ libc/include/CMakeLists.txt | 10 ++++++++ libc/include/sys/uio.h.def | 16 +++++++++++++ libc/include/sys/uio.yaml | 17 +++++++++++++ libc/src/sys/CMakeLists.txt | 1 + libc/src/sys/uio/CMakeLists.txt | 10 ++++++++ libc/src/sys/uio/linux/CMakeLists.txt | 14 +++++++++++ libc/src/sys/uio/linux/writev.cpp | 27 +++++++++++++++++++++ libc/src/sys/uio/writev.h | 22 +++++++++++++++++ libc/test/src/sys/CMakeLists.txt | 1 + libc/test/src/sys/uio/CMakeLists.txt | 15 ++++++++++++ libc/test/src/sys/uio/writev_test.cpp | 29 +++++++++++++++++++++++ 15 files changed, 198 insertions(+) create mode 100644 libc/hdr/types/struct_iovec.h create mode 100644 libc/include/sys/uio.h.def create mode 100644 libc/include/sys/uio.yaml create mode 100644 libc/src/sys/uio/CMakeLists.txt create mode 100644 libc/src/sys/uio/linux/CMakeLists.txt create mode 100644 libc/src/sys/uio/linux/writev.cpp create mode 100644 libc/src/sys/uio/writev.h create mode 100644 libc/test/src/sys/uio/CMakeLists.txt create mode 100644 libc/test/src/sys/uio/writev_test.cpp diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index 00f0c6a8bfb8e..fc2b0e91c1286 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -350,6 +350,9 @@ set(TARGET_LIBC_ENTRYPOINTS # wchar.h entrypoints libc.src.wchar.wctob + + # sys/uio.h entrypoints + libc.src.sys.uio.writev ) if(LLVM_LIBC_INCLUDE_SCUDO) diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 7e549607716c0..e7b049c0a6638 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -350,6 +350,9 @@ set(TARGET_LIBC_ENTRYPOINTS # wchar.h entrypoints libc.src.wchar.wctob libc.src.wchar.btowc + + # sys/uio.h entrypoints + libc.src.sys.uio.writev ) if(LLVM_LIBC_INCLUDE_SCUDO) diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt index 1674de1420152..3dfa38a020fad 100644 --- a/libc/hdr/types/CMakeLists.txt +++ b/libc/hdr/types/CMakeLists.txt @@ -333,3 +333,12 @@ add_proxy_header_library( FULL_BUILD_DEPENDS libc.include.llvm-libc-types.uid_t ) + +add_proxy_header_library( + struct_iovec + HDRS + struct_iovec.h + FULL_BUILD_DEPENDS + libc.include.llvm-libc-types.struct_iovec + libc.include.sys_uio +) diff --git a/libc/hdr/types/struct_iovec.h b/libc/hdr/types/struct_iovec.h new file mode 100644 index 0000000000000..fc6174c6d4871 --- /dev/null +++ b/libc/hdr/types/struct_iovec.h @@ -0,0 +1,21 @@ +//===-- Proxy for struct iovec -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIBC_HDR_TYPES_STRUCT_IOVEC_H +#define LLVM_LIBC_HDR_TYPES_STRUCT_IOVEC_H + +#ifdef LIBC_FULL_BUILD + +#include "include/llvm-libc-types/struct_iovec.h" + +#else + +#include + +#endif // LIBC_FULL_BUILD + +#endif // LLVM_LIBC_HDR_TYPES_STRUCT_IOVEC_H diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index 568bb05d92302..e5ceea360d396 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -632,6 +632,16 @@ add_header_macro( .llvm-libc-types.struct_utsname ) +add_header_macro( + sys_uio + ../libc/include/sys/uio.yaml + sys/uio.h + DEPENDS + .llvm_libc_common_h + .llvm-libc-types.struct_iovec + .llvm-libc-types.ssize_t +) + add_header_macro( sys_wait ../libc/include/sys/wait.yaml diff --git a/libc/include/sys/uio.h.def b/libc/include/sys/uio.h.def new file mode 100644 index 0000000000000..76496cb2310f7 --- /dev/null +++ b/libc/include/sys/uio.h.def @@ -0,0 +1,16 @@ +//===-- POSIX header uio.h ------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SYS_UIO_H +#define LLVM_LIBC_SYS_UIO_H + +#include "__llvm-libc-common.h" + +%%public_api() + +#endif // LLVM_LIBC_SYS_UIO_H diff --git a/libc/include/sys/uio.yaml b/libc/include/sys/uio.yaml new file mode 100644 index 0000000000000..808d8ec790198 --- /dev/null +++ b/libc/include/sys/uio.yaml @@ -0,0 +1,17 @@ +header: sys/uio.h +header_template: uio.h.def +macros: [] +types: + - type_name: struct_iovec + - type_name: ssize_t +enums: [] +objects: [] +functions: + - name: writev + standards: + - POSIX + return_type: ssize_t + arguments: + - type: int + - type: const struct iovec * + - type: int diff --git a/libc/src/sys/CMakeLists.txt b/libc/src/sys/CMakeLists.txt index adc666b94202f..bb177f11c6d62 100644 --- a/libc/src/sys/CMakeLists.txt +++ b/libc/src/sys/CMakeLists.txt @@ -11,3 +11,4 @@ add_subdirectory(statvfs) add_subdirectory(utsname) add_subdirectory(wait) add_subdirectory(prctl) +add_subdirectory(uio) diff --git a/libc/src/sys/uio/CMakeLists.txt b/libc/src/sys/uio/CMakeLists.txt new file mode 100644 index 0000000000000..6298f86cd937d --- /dev/null +++ b/libc/src/sys/uio/CMakeLists.txt @@ -0,0 +1,10 @@ +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS}) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS}) +endif() + +add_entrypoint_object( + writev + ALIAS + DEPENDS + .${LIBC_TARGET_OS}.writev +) diff --git a/libc/src/sys/uio/linux/CMakeLists.txt b/libc/src/sys/uio/linux/CMakeLists.txt new file mode 100644 index 0000000000000..85a7a3ae4d5c2 --- /dev/null +++ b/libc/src/sys/uio/linux/CMakeLists.txt @@ -0,0 +1,14 @@ +add_entrypoint_object( + writev + SRCS + writev.cpp + HDRS + ../writev.h + DEPENDS + libc.include.sys_syscall + libc.src.__support.OSUtil.osutil + libc.src.__support.common + libc.src.errno.errno + libc.hdr.types.ssize_t + libc.hdr.types.struct_iovec +) diff --git a/libc/src/sys/uio/linux/writev.cpp b/libc/src/sys/uio/linux/writev.cpp new file mode 100644 index 0000000000000..a3bb8986d522e --- /dev/null +++ b/libc/src/sys/uio/linux/writev.cpp @@ -0,0 +1,27 @@ +//===-- Implementation file for writev 
------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "src/sys/uio/writev.h" +#include "src/__support/OSUtil/syscall.h" +#include "src/__support/common.h" +#include "src/errno/libc_errno.h" +#include + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(ssize_t, writev, (int fd, const iovec *iov, int iovcnt)) { + long ret = LIBC_NAMESPACE::syscall_impl(SYS_writev, fd, iov, iovcnt); + // On failure, return -1 and set errno. + if (ret < 0) { + libc_errno = static_cast(-ret); + return -1; + } + // On success, return number of bytes written. + return static_cast(ret); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/sys/uio/writev.h b/libc/src/sys/uio/writev.h new file mode 100644 index 0000000000000..787bc4b3044b0 --- /dev/null +++ b/libc/src/sys/uio/writev.h @@ -0,0 +1,22 @@ +//===-- Implementation header for writev ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_SYS_UIO_WRITEV_H +#define LLVM_LIBC_SRC_SYS_UIO_WRITEV_H + +#include "hdr/types/ssize_t.h" +#include "hdr/types/struct_iovec.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +ssize_t writev(int fd, const iovec *iov, int iovcnt); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_SYS_UIO_WRITEV_H diff --git a/libc/test/src/sys/CMakeLists.txt b/libc/test/src/sys/CMakeLists.txt index dc0aa8bf7b75d..9e9293aab628f 100644 --- a/libc/test/src/sys/CMakeLists.txt +++ b/libc/test/src/sys/CMakeLists.txt @@ -11,3 +11,4 @@ add_subdirectory(wait) add_subdirectory(prctl) add_subdirectory(auxv) add_subdirectory(epoll) +add_subdirectory(uio) diff --git a/libc/test/src/sys/uio/CMakeLists.txt b/libc/test/src/sys/uio/CMakeLists.txt new file mode 100644 index 0000000000000..45f8d14c16179 --- /dev/null +++ b/libc/test/src/sys/uio/CMakeLists.txt @@ -0,0 +1,15 @@ +add_custom_target(libc_sys_uio_unittests) +add_libc_unittest( + writev_test + SUITE + libc_sys_uio_unittests + SRCS + writev_test.cpp + DEPENDS + libc.src.errno.errno + libc.src.__support.common + libc.src.sys.uio.writev + libc.src.unistd.close + libc.src.fcntl.open + libc.test.UnitTest.ErrnoSetterMatcher +) diff --git a/libc/test/src/sys/uio/writev_test.cpp b/libc/test/src/sys/uio/writev_test.cpp new file mode 100644 index 0000000000000..a9a314813182d --- /dev/null +++ b/libc/test/src/sys/uio/writev_test.cpp @@ -0,0 +1,29 @@ +//===-- Unittests for writev ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/fcntl/open.h" +#include "src/sys/uio/writev.h" +#include "src/unistd/close.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" +#include "test/UnitTest/Test.h" + +using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher; + +TEST(LlvmLibcSysUioWritevTest, SmokeTest) { + int fd = LIBC_NAMESPACE::open("/dev/null", O_WRONLY); + ASSERT_THAT(fd, returns(GT(0)).with_errno(EQ(0))); + const char *data = "Hello, World!\n"; + struct iovec iov[2]; + iov[0].iov_base = const_cast(data); + iov[0].iov_len = 7; + iov[1].iov_base = const_cast(data + 7); + iov[1].iov_len = 8; + ASSERT_THAT(LIBC_NAMESPACE::writev(fd, iov, 2), + returns(EQ(15)).with_errno(EQ(0))); + ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds()); +} From a4394d9d42fb6e60e3702588fb56bec243038c49 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 9 Jan 2025 21:10:06 -0800 Subject: [PATCH 012/408] [NFC][ubsan] Rename prefixes in test Looks like update_cc_test_checks is being confused if it creates vars with the name matching prefix. Issue triggered with #122415 --- clang/test/CodeGen/allow-ubsan-check.c | 272 ++++++++++++------------- 1 file changed, 136 insertions(+), 136 deletions(-) diff --git a/clang/test/CodeGen/allow-ubsan-check.c b/clang/test/CodeGen/allow-ubsan-check.c index 5232d24085466..e3860784e716f 100644 --- a/clang/test/CodeGen/allow-ubsan-check.c +++ b/clang/test/CodeGen/allow-ubsan-check.c @@ -1,7 +1,7 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 // RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -emit-llvm -o - %s -fsanitize=signed-integer-overflow,integer-divide-by-zero,null -mllvm -ubsan-guard-checks | FileCheck %s -// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -emit-llvm -o - %s -fsanitize=signed-integer-overflow,integer-divide-by-zero,null -mllvm -ubsan-guard-checks -fsanitize-trap=signed-integer-overflow,integer-divide-by-zero,null | FileCheck %s --check-prefixes=TRAP -// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -emit-llvm -o - %s -fsanitize=signed-integer-overflow,integer-divide-by-zero,null -mllvm -ubsan-guard-checks -fsanitize-recover=signed-integer-overflow,integer-divide-by-zero,null | FileCheck %s --check-prefixes=RECOVER +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -emit-llvm -o - %s -fsanitize=signed-integer-overflow,integer-divide-by-zero,null -mllvm -ubsan-guard-checks -fsanitize-trap=signed-integer-overflow,integer-divide-by-zero,null | FileCheck %s --check-prefixes=TR +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -emit-llvm -o - %s -fsanitize=signed-integer-overflow,integer-divide-by-zero,null -mllvm -ubsan-guard-checks -fsanitize-recover=signed-integer-overflow,integer-divide-by-zero,null | FileCheck %s --check-prefixes=REC // CHECK-LABEL: define dso_local i32 @div( @@ -31,57 +31,57 @@ // CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP0]], [[TMP1]] // CHECK-NEXT: ret i32 [[DIV]] // -// TRAP-LABEL: define dso_local i32 @div( -// TRAP-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR0:[0-9]+]] { -// TRAP-NEXT: entry: -// TRAP-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4 -// TRAP-NEXT: [[Y_ADDR:%.*]] = alloca i32, align 4 -// TRAP-NEXT: store i32 [[X]], ptr [[X_ADDR]], align 4 -// TRAP-NEXT: store i32 [[Y]], ptr [[Y_ADDR]], align 4 -// TRAP-NEXT: [[TMP0:%.*]] = load i32, ptr [[X_ADDR]], align 4 -// TRAP-NEXT: [[TMP1:%.*]] = load i32, ptr [[Y_ADDR]], align 4 -// TRAP-NEXT: 
[[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0, !nosanitize [[META2:![0-9]+]] -// TRAP-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP0]], -2147483648, !nosanitize [[META2]] -// TRAP-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP1]], -1, !nosanitize [[META2]] -// TRAP-NEXT: [[OR:%.*]] = or i1 [[TMP3]], [[TMP4]], !nosanitize [[META2]] -// TRAP-NEXT: [[TMP5:%.*]] = and i1 [[TMP2]], [[OR]], !nosanitize [[META2]] -// TRAP-NEXT: [[TMP6:%.*]] = call i1 @llvm.allow.ubsan.check(i8 3), !nosanitize [[META2]] -// TRAP-NEXT: [[TMP7:%.*]] = xor i1 [[TMP6]], true, !nosanitize [[META2]] -// TRAP-NEXT: [[TMP8:%.*]] = or i1 [[TMP5]], [[TMP7]], !nosanitize [[META2]] -// TRAP-NEXT: br i1 [[TMP8]], label [[CONT:%.*]], label [[TRAP:%.*]], !nosanitize [[META2]] -// TRAP: trap: -// TRAP-NEXT: call void @llvm.ubsantrap(i8 3) #[[ATTR4:[0-9]+]], !nosanitize [[META2]] -// TRAP-NEXT: unreachable, !nosanitize [[META2]] -// TRAP: cont: -// TRAP-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP0]], [[TMP1]] -// TRAP-NEXT: ret i32 [[DIV]] +// TR-LABEL: define dso_local i32 @div( +// TR-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR0:[0-9]+]] { +// TR-NEXT: entry: +// TR-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4 +// TR-NEXT: [[Y_ADDR:%.*]] = alloca i32, align 4 +// TR-NEXT: store i32 [[X]], ptr [[X_ADDR]], align 4 +// TR-NEXT: store i32 [[Y]], ptr [[Y_ADDR]], align 4 +// TR-NEXT: [[TMP0:%.*]] = load i32, ptr [[X_ADDR]], align 4 +// TR-NEXT: [[TMP1:%.*]] = load i32, ptr [[Y_ADDR]], align 4 +// TR-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0, !nosanitize [[META2:![0-9]+]] +// TR-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP0]], -2147483648, !nosanitize [[META2]] +// TR-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP1]], -1, !nosanitize [[META2]] +// TR-NEXT: [[OR:%.*]] = or i1 [[TMP3]], [[TMP4]], !nosanitize [[META2]] +// TR-NEXT: [[TMP5:%.*]] = and i1 [[TMP2]], [[OR]], !nosanitize [[META2]] +// TR-NEXT: [[TMP6:%.*]] = call i1 @llvm.allow.ubsan.check(i8 3), !nosanitize [[META2]] +// TR-NEXT: [[TMP7:%.*]] = xor i1 [[TMP6]], true, !nosanitize [[META2]] +// TR-NEXT: [[TMP8:%.*]] = or i1 [[TMP5]], [[TMP7]], !nosanitize [[META2]] +// TR-NEXT: br i1 [[TMP8]], label [[CONT:%.*]], label [[TRAP:%.*]], !nosanitize [[META2]] +// TR: trap: +// TR-NEXT: call void @llvm.ubsantrap(i8 3) #[[ATTR4:[0-9]+]], !nosanitize [[META2]] +// TR-NEXT: unreachable, !nosanitize [[META2]] +// TR: cont: +// TR-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP0]], [[TMP1]] +// TR-NEXT: ret i32 [[DIV]] // -// RECOVER-LABEL: define dso_local i32 @div( -// RECOVER-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR0:[0-9]+]] { -// RECOVER-NEXT: entry: -// RECOVER-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4 -// RECOVER-NEXT: [[Y_ADDR:%.*]] = alloca i32, align 4 -// RECOVER-NEXT: store i32 [[X]], ptr [[X_ADDR]], align 4 -// RECOVER-NEXT: store i32 [[Y]], ptr [[Y_ADDR]], align 4 -// RECOVER-NEXT: [[TMP0:%.*]] = load i32, ptr [[X_ADDR]], align 4 -// RECOVER-NEXT: [[TMP1:%.*]] = load i32, ptr [[Y_ADDR]], align 4 -// RECOVER-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0, !nosanitize [[META2:![0-9]+]] -// RECOVER-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP0]], -2147483648, !nosanitize [[META2]] -// RECOVER-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP1]], -1, !nosanitize [[META2]] -// RECOVER-NEXT: [[OR:%.*]] = or i1 [[TMP3]], [[TMP4]], !nosanitize [[META2]] -// RECOVER-NEXT: [[TMP5:%.*]] = and i1 [[TMP2]], [[OR]], !nosanitize [[META2]] -// RECOVER-NEXT: [[TMP6:%.*]] = call i1 @llvm.allow.ubsan.check(i8 3), !nosanitize [[META2]] -// RECOVER-NEXT: [[TMP7:%.*]] = xor i1 [[TMP6]], true, !nosanitize [[META2]] -// RECOVER-NEXT: 
[[TMP8:%.*]] = or i1 [[TMP5]], [[TMP7]], !nosanitize [[META2]] -// RECOVER-NEXT: br i1 [[TMP8]], label [[CONT:%.*]], label [[HANDLER_DIVREM_OVERFLOW:%.*]], !prof [[PROF3:![0-9]+]], !nosanitize [[META2]] -// RECOVER: handler.divrem_overflow: -// RECOVER-NEXT: [[TMP9:%.*]] = zext i32 [[TMP0]] to i64, !nosanitize [[META2]] -// RECOVER-NEXT: [[TMP10:%.*]] = zext i32 [[TMP1]] to i64, !nosanitize [[META2]] -// RECOVER-NEXT: call void @__ubsan_handle_divrem_overflow(ptr @[[GLOB1:[0-9]+]], i64 [[TMP9]], i64 [[TMP10]]) #[[ATTR4:[0-9]+]], !nosanitize [[META2]] -// RECOVER-NEXT: br label [[CONT]], !nosanitize [[META2]] -// RECOVER: cont: -// RECOVER-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP0]], [[TMP1]] -// RECOVER-NEXT: ret i32 [[DIV]] +// REC-LABEL: define dso_local i32 @div( +// REC-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR0:[0-9]+]] { +// REC-NEXT: entry: +// REC-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4 +// REC-NEXT: [[Y_ADDR:%.*]] = alloca i32, align 4 +// REC-NEXT: store i32 [[X]], ptr [[X_ADDR]], align 4 +// REC-NEXT: store i32 [[Y]], ptr [[Y_ADDR]], align 4 +// REC-NEXT: [[TMP0:%.*]] = load i32, ptr [[X_ADDR]], align 4 +// REC-NEXT: [[TMP1:%.*]] = load i32, ptr [[Y_ADDR]], align 4 +// REC-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0, !nosanitize [[META2:![0-9]+]] +// REC-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP0]], -2147483648, !nosanitize [[META2]] +// REC-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP1]], -1, !nosanitize [[META2]] +// REC-NEXT: [[OR:%.*]] = or i1 [[TMP3]], [[TMP4]], !nosanitize [[META2]] +// REC-NEXT: [[TMP5:%.*]] = and i1 [[TMP2]], [[OR]], !nosanitize [[META2]] +// REC-NEXT: [[TMP6:%.*]] = call i1 @llvm.allow.ubsan.check(i8 3), !nosanitize [[META2]] +// REC-NEXT: [[TMP7:%.*]] = xor i1 [[TMP6]], true, !nosanitize [[META2]] +// REC-NEXT: [[TMP8:%.*]] = or i1 [[TMP5]], [[TMP7]], !nosanitize [[META2]] +// REC-NEXT: br i1 [[TMP8]], label [[CONT:%.*]], label [[HANDLER_DIVREM_OVERFLOW:%.*]], !prof [[PROF3:![0-9]+]], !nosanitize [[META2]] +// REC: handler.divrem_overflow: +// REC-NEXT: [[TMP9:%.*]] = zext i32 [[TMP0]] to i64, !nosanitize [[META2]] +// REC-NEXT: [[TMP10:%.*]] = zext i32 [[TMP1]] to i64, !nosanitize [[META2]] +// REC-NEXT: call void @__ubsan_handle_divrem_overflow(ptr @[[GLOB1:[0-9]+]], i64 [[TMP9]], i64 [[TMP10]]) #[[ATTR4:[0-9]+]], !nosanitize [[META2]] +// REC-NEXT: br label [[CONT]], !nosanitize [[META2]] +// REC: cont: +// REC-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP0]], [[TMP1]] +// REC-NEXT: ret i32 [[DIV]] // int div(int x, int y) { return x / y; @@ -106,42 +106,42 @@ int div(int x, int y) { // CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK-NEXT: ret i32 [[TMP6]] // -// TRAP-LABEL: define dso_local i32 @null( -// TRAP-SAME: ptr noundef [[X:%.*]]) #[[ATTR0]] { -// TRAP-NEXT: entry: -// TRAP-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8 -// TRAP-NEXT: store ptr [[X]], ptr [[X_ADDR]], align 8 -// TRAP-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR]], align 8 -// TRAP-NEXT: [[TMP1:%.*]] = icmp ne ptr [[TMP0]], null, !nosanitize [[META2]] -// TRAP-NEXT: [[TMP2:%.*]] = call i1 @llvm.allow.ubsan.check(i8 22), !nosanitize [[META2]] -// TRAP-NEXT: [[TMP3:%.*]] = xor i1 [[TMP2]], true, !nosanitize [[META2]] -// TRAP-NEXT: [[TMP4:%.*]] = or i1 [[TMP1]], [[TMP3]], !nosanitize [[META2]] -// TRAP-NEXT: br i1 [[TMP4]], label [[CONT:%.*]], label [[TRAP:%.*]], !nosanitize [[META2]] -// TRAP: trap: -// TRAP-NEXT: call void @llvm.ubsantrap(i8 22) #[[ATTR4]], !nosanitize [[META2]] -// TRAP-NEXT: unreachable, !nosanitize [[META2]] -// TRAP: cont: -// TRAP-NEXT: 
[[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4 -// TRAP-NEXT: ret i32 [[TMP5]] +// TR-LABEL: define dso_local i32 @null( +// TR-SAME: ptr noundef [[X:%.*]]) #[[ATTR0]] { +// TR-NEXT: entry: +// TR-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8 +// TR-NEXT: store ptr [[X]], ptr [[X_ADDR]], align 8 +// TR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR]], align 8 +// TR-NEXT: [[TMP1:%.*]] = icmp ne ptr [[TMP0]], null, !nosanitize [[META2]] +// TR-NEXT: [[TMP2:%.*]] = call i1 @llvm.allow.ubsan.check(i8 22), !nosanitize [[META2]] +// TR-NEXT: [[TMP3:%.*]] = xor i1 [[TMP2]], true, !nosanitize [[META2]] +// TR-NEXT: [[TMP4:%.*]] = or i1 [[TMP1]], [[TMP3]], !nosanitize [[META2]] +// TR-NEXT: br i1 [[TMP4]], label [[CONT:%.*]], label [[TRAP:%.*]], !nosanitize [[META2]] +// TR: trap: +// TR-NEXT: call void @llvm.ubsantrap(i8 22) #[[ATTR4]], !nosanitize [[META2]] +// TR-NEXT: unreachable, !nosanitize [[META2]] +// TR: cont: +// TR-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4 +// TR-NEXT: ret i32 [[TMP5]] // -// RECOVER-LABEL: define dso_local i32 @null( -// RECOVER-SAME: ptr noundef [[X:%.*]]) #[[ATTR0]] { -// RECOVER-NEXT: entry: -// RECOVER-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8 -// RECOVER-NEXT: store ptr [[X]], ptr [[X_ADDR]], align 8 -// RECOVER-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR]], align 8 -// RECOVER-NEXT: [[TMP1:%.*]] = icmp ne ptr [[TMP0]], null, !nosanitize [[META2]] -// RECOVER-NEXT: [[TMP2:%.*]] = call i1 @llvm.allow.ubsan.check(i8 22), !nosanitize [[META2]] -// RECOVER-NEXT: [[TMP3:%.*]] = xor i1 [[TMP2]], true, !nosanitize [[META2]] -// RECOVER-NEXT: [[TMP4:%.*]] = or i1 [[TMP1]], [[TMP3]], !nosanitize [[META2]] -// RECOVER-NEXT: br i1 [[TMP4]], label [[CONT:%.*]], label [[HANDLER_TYPE_MISMATCH:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// RECOVER: handler.type_mismatch: -// RECOVER-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP0]] to i64, !nosanitize [[META2]] -// RECOVER-NEXT: call void @__ubsan_handle_type_mismatch_v1(ptr @[[GLOB2:[0-9]+]], i64 [[TMP5]]) #[[ATTR4]], !nosanitize [[META2]] -// RECOVER-NEXT: br label [[CONT]], !nosanitize [[META2]] -// RECOVER: cont: -// RECOVER-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP0]], align 4 -// RECOVER-NEXT: ret i32 [[TMP6]] +// REC-LABEL: define dso_local i32 @null( +// REC-SAME: ptr noundef [[X:%.*]]) #[[ATTR0]] { +// REC-NEXT: entry: +// REC-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8 +// REC-NEXT: store ptr [[X]], ptr [[X_ADDR]], align 8 +// REC-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR]], align 8 +// REC-NEXT: [[TMP1:%.*]] = icmp ne ptr [[TMP0]], null, !nosanitize [[META2]] +// REC-NEXT: [[TMP2:%.*]] = call i1 @llvm.allow.ubsan.check(i8 22), !nosanitize [[META2]] +// REC-NEXT: [[TMP3:%.*]] = xor i1 [[TMP2]], true, !nosanitize [[META2]] +// REC-NEXT: [[TMP4:%.*]] = or i1 [[TMP1]], [[TMP3]], !nosanitize [[META2]] +// REC-NEXT: br i1 [[TMP4]], label [[CONT:%.*]], label [[HANDLER_TYPE_MISMATCH:%.*]], !prof [[PROF3]], !nosanitize [[META2]] +// REC: handler.type_mismatch: +// REC-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP0]] to i64, !nosanitize [[META2]] +// REC-NEXT: call void @__ubsan_handle_type_mismatch_v1(ptr @[[GLOB2:[0-9]+]], i64 [[TMP5]]) #[[ATTR4]], !nosanitize [[META2]] +// REC-NEXT: br label [[CONT]], !nosanitize [[META2]] +// REC: cont: +// REC-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP0]], align 4 +// REC-NEXT: ret i32 [[TMP6]] // int null(int* x) { return *x; @@ -172,53 +172,53 @@ int null(int* x) { // CHECK: cont: // CHECK-NEXT: ret i32 [[TMP3]] // -// TRAP-LABEL: define dso_local i32 @overflow( -// TRAP-SAME: i32 noundef 
[[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR0]] { -// TRAP-NEXT: entry: -// TRAP-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4 -// TRAP-NEXT: [[Y_ADDR:%.*]] = alloca i32, align 4 -// TRAP-NEXT: store i32 [[X]], ptr [[X_ADDR]], align 4 -// TRAP-NEXT: store i32 [[Y]], ptr [[Y_ADDR]], align 4 -// TRAP-NEXT: [[TMP0:%.*]] = load i32, ptr [[X_ADDR]], align 4 -// TRAP-NEXT: [[TMP1:%.*]] = load i32, ptr [[Y_ADDR]], align 4 -// TRAP-NEXT: [[TMP2:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[TMP0]], i32 [[TMP1]]), !nosanitize [[META2]] -// TRAP-NEXT: [[TMP3:%.*]] = extractvalue { i32, i1 } [[TMP2]], 0, !nosanitize [[META2]] -// TRAP-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1, !nosanitize [[META2]] -// TRAP-NEXT: [[TMP5:%.*]] = xor i1 [[TMP4]], true, !nosanitize [[META2]] -// TRAP-NEXT: [[TMP6:%.*]] = call i1 @llvm.allow.ubsan.check(i8 0), !nosanitize [[META2]] -// TRAP-NEXT: [[TMP7:%.*]] = xor i1 [[TMP6]], true, !nosanitize [[META2]] -// TRAP-NEXT: [[TMP8:%.*]] = or i1 [[TMP5]], [[TMP7]], !nosanitize [[META2]] -// TRAP-NEXT: br i1 [[TMP8]], label [[CONT:%.*]], label [[TRAP:%.*]], !nosanitize [[META2]] -// TRAP: trap: -// TRAP-NEXT: call void @llvm.ubsantrap(i8 0) #[[ATTR4]], !nosanitize [[META2]] -// TRAP-NEXT: unreachable, !nosanitize [[META2]] -// TRAP: cont: -// TRAP-NEXT: ret i32 [[TMP3]] +// TR-LABEL: define dso_local i32 @overflow( +// TR-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR0]] { +// TR-NEXT: entry: +// TR-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4 +// TR-NEXT: [[Y_ADDR:%.*]] = alloca i32, align 4 +// TR-NEXT: store i32 [[X]], ptr [[X_ADDR]], align 4 +// TR-NEXT: store i32 [[Y]], ptr [[Y_ADDR]], align 4 +// TR-NEXT: [[TMP0:%.*]] = load i32, ptr [[X_ADDR]], align 4 +// TR-NEXT: [[TMP1:%.*]] = load i32, ptr [[Y_ADDR]], align 4 +// TR-NEXT: [[TMP2:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[TMP0]], i32 [[TMP1]]), !nosanitize [[META2]] +// TR-NEXT: [[TMP3:%.*]] = extractvalue { i32, i1 } [[TMP2]], 0, !nosanitize [[META2]] +// TR-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1, !nosanitize [[META2]] +// TR-NEXT: [[TMP5:%.*]] = xor i1 [[TMP4]], true, !nosanitize [[META2]] +// TR-NEXT: [[TMP6:%.*]] = call i1 @llvm.allow.ubsan.check(i8 0), !nosanitize [[META2]] +// TR-NEXT: [[TMP7:%.*]] = xor i1 [[TMP6]], true, !nosanitize [[META2]] +// TR-NEXT: [[TMP8:%.*]] = or i1 [[TMP5]], [[TMP7]], !nosanitize [[META2]] +// TR-NEXT: br i1 [[TMP8]], label [[CONT:%.*]], label [[TRAP:%.*]], !nosanitize [[META2]] +// TR: trap: +// TR-NEXT: call void @llvm.ubsantrap(i8 0) #[[ATTR4]], !nosanitize [[META2]] +// TR-NEXT: unreachable, !nosanitize [[META2]] +// TR: cont: +// TR-NEXT: ret i32 [[TMP3]] // -// RECOVER-LABEL: define dso_local i32 @overflow( -// RECOVER-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR0]] { -// RECOVER-NEXT: entry: -// RECOVER-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4 -// RECOVER-NEXT: [[Y_ADDR:%.*]] = alloca i32, align 4 -// RECOVER-NEXT: store i32 [[X]], ptr [[X_ADDR]], align 4 -// RECOVER-NEXT: store i32 [[Y]], ptr [[Y_ADDR]], align 4 -// RECOVER-NEXT: [[TMP0:%.*]] = load i32, ptr [[X_ADDR]], align 4 -// RECOVER-NEXT: [[TMP1:%.*]] = load i32, ptr [[Y_ADDR]], align 4 -// RECOVER-NEXT: [[TMP2:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[TMP0]], i32 [[TMP1]]), !nosanitize [[META2]] -// RECOVER-NEXT: [[TMP3:%.*]] = extractvalue { i32, i1 } [[TMP2]], 0, !nosanitize [[META2]] -// RECOVER-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1, !nosanitize [[META2]] -// RECOVER-NEXT: [[TMP5:%.*]] = 
xor i1 [[TMP4]], true, !nosanitize [[META2]] -// RECOVER-NEXT: [[TMP6:%.*]] = call i1 @llvm.allow.ubsan.check(i8 0), !nosanitize [[META2]] -// RECOVER-NEXT: [[TMP7:%.*]] = xor i1 [[TMP6]], true, !nosanitize [[META2]] -// RECOVER-NEXT: [[TMP8:%.*]] = or i1 [[TMP5]], [[TMP7]], !nosanitize [[META2]] -// RECOVER-NEXT: br i1 [[TMP8]], label [[CONT:%.*]], label [[HANDLER_ADD_OVERFLOW:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// RECOVER: handler.add_overflow: -// RECOVER-NEXT: [[TMP9:%.*]] = zext i32 [[TMP0]] to i64, !nosanitize [[META2]] -// RECOVER-NEXT: [[TMP10:%.*]] = zext i32 [[TMP1]] to i64, !nosanitize [[META2]] -// RECOVER-NEXT: call void @__ubsan_handle_add_overflow(ptr @[[GLOB3:[0-9]+]], i64 [[TMP9]], i64 [[TMP10]]) #[[ATTR4]], !nosanitize [[META2]] -// RECOVER-NEXT: br label [[CONT]], !nosanitize [[META2]] -// RECOVER: cont: -// RECOVER-NEXT: ret i32 [[TMP3]] +// REC-LABEL: define dso_local i32 @overflow( +// REC-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR0]] { +// REC-NEXT: entry: +// REC-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4 +// REC-NEXT: [[Y_ADDR:%.*]] = alloca i32, align 4 +// REC-NEXT: store i32 [[X]], ptr [[X_ADDR]], align 4 +// REC-NEXT: store i32 [[Y]], ptr [[Y_ADDR]], align 4 +// REC-NEXT: [[TMP0:%.*]] = load i32, ptr [[X_ADDR]], align 4 +// REC-NEXT: [[TMP1:%.*]] = load i32, ptr [[Y_ADDR]], align 4 +// REC-NEXT: [[TMP2:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[TMP0]], i32 [[TMP1]]), !nosanitize [[META2]] +// REC-NEXT: [[TMP3:%.*]] = extractvalue { i32, i1 } [[TMP2]], 0, !nosanitize [[META2]] +// REC-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1, !nosanitize [[META2]] +// REC-NEXT: [[TMP5:%.*]] = xor i1 [[TMP4]], true, !nosanitize [[META2]] +// REC-NEXT: [[TMP6:%.*]] = call i1 @llvm.allow.ubsan.check(i8 0), !nosanitize [[META2]] +// REC-NEXT: [[TMP7:%.*]] = xor i1 [[TMP6]], true, !nosanitize [[META2]] +// REC-NEXT: [[TMP8:%.*]] = or i1 [[TMP5]], [[TMP7]], !nosanitize [[META2]] +// REC-NEXT: br i1 [[TMP8]], label [[CONT:%.*]], label [[HANDLER_ADD_OVERFLOW:%.*]], !prof [[PROF3]], !nosanitize [[META2]] +// REC: handler.add_overflow: +// REC-NEXT: [[TMP9:%.*]] = zext i32 [[TMP0]] to i64, !nosanitize [[META2]] +// REC-NEXT: [[TMP10:%.*]] = zext i32 [[TMP1]] to i64, !nosanitize [[META2]] +// REC-NEXT: call void @__ubsan_handle_add_overflow(ptr @[[GLOB3:[0-9]+]], i64 [[TMP9]], i64 [[TMP10]]) #[[ATTR4]], !nosanitize [[META2]] +// REC-NEXT: br label [[CONT]], !nosanitize [[META2]] +// REC: cont: +// REC-NEXT: ret i32 [[TMP3]] // int overflow(int x, int y) { return x + y; @@ -227,8 +227,8 @@ int overflow(int x, int y) { // CHECK: [[META2]] = !{} // CHECK: [[PROF3]] = !{!"branch_weights", i32 1048575, i32 1} //. -// TRAP: [[META2]] = !{} +// TR: [[META2]] = !{} //. -// RECOVER: [[META2]] = !{} -// RECOVER: [[PROF3]] = !{!"branch_weights", i32 1048575, i32 1} +// REC: [[META2]] = !{} +// REC: [[PROF3]] = !{!"branch_weights", i32 1048575, i32 1} //. From 48d0eb5181065a3d086de2e30f5c619fe407e4ce Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 9 Jan 2025 21:23:52 -0800 Subject: [PATCH 013/408] [CodeGen] Simplify EmitAssemblyHelper and emitBackendOutput Prepare for -ftime-report change (#122225). 
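The simplification is the parameter-object pattern: rather than threading a
DiagnosticsEngine and several option structs through every signature, the
code passes the owning CompilerInstance once and lets callees pull what they
need. A self-contained toy sketch of the shape of the change (hypothetical
types, not clang's actual classes):

  #include <string>

  struct CodeGenOpts { bool TimePasses = false; };
  struct TargetOpts { std::string CPU = "generic"; };

  // Stand-in for CompilerInstance: owns the option objects and hands out
  // const references on demand.
  struct CompilerContext {
    CodeGenOpts CG;
    TargetOpts T;
    const CodeGenOpts &getCodeGenOpts() const { return CG; }
    const TargetOpts &getTargetOpts() const { return T; }
  };

  // Before: emit(Diags, HSOpts, CGOpts, TOpts, LOpts, ...). After: one
  // context parameter, and each helper derives the options it uses.
  void emitBackend(const CompilerContext &Ctx) {
    const CodeGenOpts &CG = Ctx.getCodeGenOpts();
    (void)CG; // a real backend would drive the pipeline from these options
  }

  int main() {
    CompilerContext Ctx;
    emitBackend(Ctx);
  }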
--- clang/include/clang/CodeGen/BackendUtil.h | 10 +-- clang/lib/CodeGen/BackendConsumer.h | 4 +- clang/lib/CodeGen/BackendUtil.cpp | 75 +++++++++---------- clang/lib/CodeGen/CodeGenAction.cpp | 30 ++++---- .../CodeGen/ObjectFilePCHContainerWriter.cpp | 15 ++-- 5 files changed, 63 insertions(+), 71 deletions(-) diff --git a/clang/include/clang/CodeGen/BackendUtil.h b/clang/include/clang/CodeGen/BackendUtil.h index 7aa4f9db6c2e4..78d1e5ee8e6d5 100644 --- a/clang/include/clang/CodeGen/BackendUtil.h +++ b/clang/include/clang/CodeGen/BackendUtil.h @@ -25,11 +25,9 @@ class FileSystem; } // namespace llvm namespace clang { +class CompilerInstance; class DiagnosticsEngine; -class HeaderSearchOptions; class CodeGenOptions; -class TargetOptions; -class LangOptions; class BackendConsumer; enum BackendAction { @@ -41,10 +39,8 @@ enum BackendAction { Backend_EmitObj ///< Emit native object files }; -void EmitBackendOutput(DiagnosticsEngine &Diags, const HeaderSearchOptions &, - const CodeGenOptions &CGOpts, const TargetOptions &TOpts, - const LangOptions &LOpts, StringRef TDesc, - llvm::Module *M, BackendAction Action, +void emitBackendOutput(CompilerInstance &CI, StringRef TDesc, llvm::Module *M, + BackendAction Action, llvm::IntrusiveRefCntPtr VFS, std::unique_ptr OS, BackendConsumer *BC = nullptr); diff --git a/clang/lib/CodeGen/BackendConsumer.h b/clang/lib/CodeGen/BackendConsumer.h index d932a78f469b9..ad3adfca36785 100644 --- a/clang/lib/CodeGen/BackendConsumer.h +++ b/clang/lib/CodeGen/BackendConsumer.h @@ -28,8 +28,8 @@ class BackendConsumer : public ASTConsumer { using LinkModule = CodeGenAction::LinkModule; virtual void anchor(); + CompilerInstance &CI; DiagnosticsEngine &Diags; - const HeaderSearchOptions &HeaderSearchOpts; const CodeGenOptions &CodeGenOpts; const TargetOptions &TargetOpts; const LangOptions &LangOpts; @@ -70,7 +70,7 @@ class BackendConsumer : public ASTConsumer { llvm::Module *CurLinkModule = nullptr; public: - BackendConsumer(const CompilerInstance &CI, BackendAction Action, + BackendConsumer(CompilerInstance &CI, BackendAction Action, IntrusiveRefCntPtr VFS, llvm::LLVMContext &C, SmallVector LinkModules, StringRef InFile, std::unique_ptr OS, diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 37ef2bd203fdf..29478f6829e39 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -129,8 +129,8 @@ std::string getDefaultProfileGenName() { } class EmitAssemblyHelper { + CompilerInstance &CI; DiagnosticsEngine &Diags; - const HeaderSearchOptions &HSOpts; const CodeGenOptions &CodeGenOpts; const clang::TargetOptions &TargetOpts; const LangOptions &LangOpts; @@ -203,15 +203,11 @@ class EmitAssemblyHelper { } public: - EmitAssemblyHelper(DiagnosticsEngine &_Diags, - const HeaderSearchOptions &HeaderSearchOpts, - const CodeGenOptions &CGOpts, - const clang::TargetOptions &TOpts, - const LangOptions &LOpts, llvm::Module *M, + EmitAssemblyHelper(CompilerInstance &CI, llvm::Module *M, IntrusiveRefCntPtr VFS) - : Diags(_Diags), HSOpts(HeaderSearchOpts), CodeGenOpts(CGOpts), - TargetOpts(TOpts), LangOpts(LOpts), TheModule(M), VFS(std::move(VFS)), - CodeGenerationTime("codegen", "Code Generation Time"), + : CI(CI), Diags(CI.getDiagnostics()), CodeGenOpts(CI.getCodeGenOpts()), + TargetOpts(CI.getTargetOpts()), LangOpts(CI.getLangOpts()), + TheModule(M), VFS(std::move(VFS)), TargetTriple(TheModule->getTargetTriple()) {} ~EmitAssemblyHelper() { @@ -222,7 +218,7 @@ class EmitAssemblyHelper { std::unique_ptr TM; // Emit 
output using the new pass manager for the optimization pipeline. - void EmitAssembly(BackendAction Action, std::unique_ptr OS, + void emitAssembly(BackendAction Action, std::unique_ptr OS, BackendConsumer *BC); }; } // namespace @@ -351,12 +347,13 @@ static std::string flattenClangCommandLine(ArrayRef Args, return FlatCmdLine; } -static bool initTargetOptions(DiagnosticsEngine &Diags, - llvm::TargetOptions &Options, - const CodeGenOptions &CodeGenOpts, - const clang::TargetOptions &TargetOpts, - const LangOptions &LangOpts, - const HeaderSearchOptions &HSOpts) { +static bool initTargetOptions(const CompilerInstance &CI, + DiagnosticsEngine &Diags, + llvm::TargetOptions &Options) { + const auto &CodeGenOpts = CI.getCodeGenOpts(); + const auto &TargetOpts = CI.getTargetOpts(); + const auto &LangOpts = CI.getLangOpts(); + const auto &HSOpts = CI.getHeaderSearchOpts(); switch (LangOpts.getThreadModel()) { case LangOptions::ThreadModelKind::POSIX: Options.ThreadModel = llvm::ThreadModel::POSIX; @@ -600,8 +597,7 @@ void EmitAssemblyHelper::CreateTargetMachine(bool MustCreateTM) { CodeGenOptLevel OptLevel = *OptLevelOrNone; llvm::TargetOptions Options; - if (!initTargetOptions(Diags, Options, CodeGenOpts, TargetOpts, LangOpts, - HSOpts)) + if (!initTargetOptions(CI, Diags, Options)) return; TM.reset(TheTarget->createTargetMachine(Triple, TargetOpts.CPU, FeaturesStr, Options, RM, CM, OptLevel)); @@ -1207,7 +1203,7 @@ void EmitAssemblyHelper::RunCodegenPipeline( } } -void EmitAssemblyHelper::EmitAssembly(BackendAction Action, +void EmitAssemblyHelper::emitAssembly(BackendAction Action, std::unique_ptr OS, BackendConsumer *BC) { TimeRegion Region(CodeGenOpts.TimePasses ? &CodeGenerationTime : nullptr); @@ -1234,13 +1230,14 @@ void EmitAssemblyHelper::EmitAssembly(BackendAction Action, DwoOS->keep(); } -static void runThinLTOBackend( - DiagnosticsEngine &Diags, ModuleSummaryIndex *CombinedIndex, - llvm::Module *M, const HeaderSearchOptions &HeaderOpts, - const CodeGenOptions &CGOpts, const clang::TargetOptions &TOpts, - const LangOptions &LOpts, std::unique_ptr OS, - std::string SampleProfile, std::string ProfileRemapping, - BackendAction Action) { +static void +runThinLTOBackend(CompilerInstance &CI, ModuleSummaryIndex *CombinedIndex, + llvm::Module *M, std::unique_ptr OS, + std::string SampleProfile, std::string ProfileRemapping, + BackendAction Action) { + DiagnosticsEngine &Diags = CI.getDiagnostics(); + const auto &CGOpts = CI.getCodeGenOpts(); + const auto &TOpts = CI.getTargetOpts(); DenseMap> ModuleToDefinedGVSummaries; CombinedIndex->collectDefinedGVSummariesPerModule(ModuleToDefinedGVSummaries); @@ -1278,7 +1275,7 @@ static void runThinLTOBackend( assert(OptLevelOrNone && "Invalid optimization level!"); Conf.CGOptLevel = *OptLevelOrNone; Conf.OptLevel = CGOpts.OptimizationLevel; - initTargetOptions(Diags, Conf.Options, CGOpts, TOpts, LOpts, HeaderOpts); + initTargetOptions(CI, Diags, Conf.Options); Conf.SampleProfile = std::move(SampleProfile); Conf.PTO.LoopUnrolling = CGOpts.UnrollLoops; // For historical reasons, loop interleaving is set to mirror setting for loop @@ -1341,14 +1338,14 @@ static void runThinLTOBackend( } } -void clang::EmitBackendOutput( - DiagnosticsEngine &Diags, const HeaderSearchOptions &HeaderOpts, - const CodeGenOptions &CGOpts, const clang::TargetOptions &TOpts, - const LangOptions &LOpts, StringRef TDesc, llvm::Module *M, - BackendAction Action, IntrusiveRefCntPtr VFS, - std::unique_ptr OS, BackendConsumer *BC) { - +void clang::emitBackendOutput(CompilerInstance 
&CI, StringRef TDesc, + llvm::Module *M, BackendAction Action, + IntrusiveRefCntPtr VFS, + std::unique_ptr OS, + BackendConsumer *BC) { llvm::TimeTraceScope TimeScope("Backend"); + DiagnosticsEngine &Diags = CI.getDiagnostics(); + const auto &CGOpts = CI.getCodeGenOpts(); std::unique_ptr EmptyModule; if (!CGOpts.ThinLTOIndexFile.empty()) { @@ -1371,9 +1368,9 @@ void clang::EmitBackendOutput( // of an error). if (CombinedIndex) { if (!CombinedIndex->skipModuleByDistributedBackend()) { - runThinLTOBackend(Diags, CombinedIndex.get(), M, HeaderOpts, CGOpts, - TOpts, LOpts, std::move(OS), CGOpts.SampleProfileFile, - CGOpts.ProfileRemappingFile, Action); + runThinLTOBackend(CI, CombinedIndex.get(), M, std::move(OS), + CGOpts.SampleProfileFile, CGOpts.ProfileRemappingFile, + Action); return; } // Distributed indexing detected that nothing from the module is needed @@ -1388,8 +1385,8 @@ void clang::EmitBackendOutput( } } - EmitAssemblyHelper AsmHelper(Diags, HeaderOpts, CGOpts, TOpts, LOpts, M, VFS); - AsmHelper.EmitAssembly(Action, std::move(OS), BC); + EmitAssemblyHelper AsmHelper(CI, M, VFS); + AsmHelper.emitAssembly(Action, std::move(OS), BC); // Verify clang's TargetInfo DataLayout against the LLVM TargetMachine's // DataLayout. diff --git a/clang/lib/CodeGen/CodeGenAction.cpp b/clang/lib/CodeGen/CodeGenAction.cpp index f63cb9b082d5b..7446bddc11345 100644 --- a/clang/lib/CodeGen/CodeGenAction.cpp +++ b/clang/lib/CodeGen/CodeGenAction.cpp @@ -105,15 +105,17 @@ static void reportOptRecordError(Error E, DiagnosticsEngine &Diags, }); } -BackendConsumer::BackendConsumer( - const CompilerInstance &CI, BackendAction Action, - IntrusiveRefCntPtr VFS, LLVMContext &C, - SmallVector LinkModules, StringRef InFile, - std::unique_ptr OS, CoverageSourceInfo *CoverageInfo, - llvm::Module *CurLinkModule) - : Diags(CI.getDiagnostics()), HeaderSearchOpts(CI.getHeaderSearchOpts()), - CodeGenOpts(CI.getCodeGenOpts()), TargetOpts(CI.getTargetOpts()), - LangOpts(CI.getLangOpts()), AsmOutStream(std::move(OS)), FS(VFS), +BackendConsumer::BackendConsumer(CompilerInstance &CI, BackendAction Action, + IntrusiveRefCntPtr VFS, + LLVMContext &C, + SmallVector LinkModules, + StringRef InFile, + std::unique_ptr OS, + CoverageSourceInfo *CoverageInfo, + llvm::Module *CurLinkModule) + : CI(CI), Diags(CI.getDiagnostics()), CodeGenOpts(CI.getCodeGenOpts()), + TargetOpts(CI.getTargetOpts()), LangOpts(CI.getLangOpts()), + AsmOutStream(std::move(OS)), FS(VFS), LLVMIRGeneration("irgen", "LLVM IR Generation Time"), Action(Action), Gen(CreateLLVMCodeGen(Diags, InFile, std::move(VFS), CI.getHeaderSearchOpts(), CI.getPreprocessorOpts(), @@ -321,8 +323,7 @@ void BackendConsumer::HandleTranslationUnit(ASTContext &C) { EmbedBitcode(getModule(), CodeGenOpts, llvm::MemoryBufferRef()); - EmitBackendOutput(Diags, HeaderSearchOpts, CodeGenOpts, TargetOpts, LangOpts, - C.getTargetInfo().getDataLayoutString(), getModule(), + emitBackendOutput(CI, C.getTargetInfo().getDataLayoutString(), getModule(), Action, FS, std::move(AsmOutStream), this); Ctx.setDiagnosticHandler(std::move(OldDiagnosticHandler)); @@ -1183,10 +1184,9 @@ void CodeGenAction::ExecuteAction() { std::unique_ptr OptRecordFile = std::move(*OptRecordFileOrErr); - EmitBackendOutput( - Diagnostics, CI.getHeaderSearchOpts(), CodeGenOpts, TargetOpts, - CI.getLangOpts(), CI.getTarget().getDataLayoutString(), TheModule.get(), - BA, CI.getFileManager().getVirtualFileSystemPtr(), std::move(OS)); + emitBackendOutput(CI, CI.getTarget().getDataLayoutString(), TheModule.get(), + BA, 
CI.getFileManager().getVirtualFileSystemPtr(), + std::move(OS)); if (OptRecordFile) OptRecordFile->keep(); } diff --git a/clang/lib/CodeGen/ObjectFilePCHContainerWriter.cpp b/clang/lib/CodeGen/ObjectFilePCHContainerWriter.cpp index 71745480706ed..5447b98d7105e 100644 --- a/clang/lib/CodeGen/ObjectFilePCHContainerWriter.cpp +++ b/clang/lib/CodeGen/ObjectFilePCHContainerWriter.cpp @@ -37,6 +37,7 @@ using namespace clang; namespace { class PCHContainerGenerator : public ASTConsumer { + CompilerInstance &CI; DiagnosticsEngine &Diags; const std::string MainFileName; const std::string OutputFileName; @@ -139,7 +140,7 @@ class PCHContainerGenerator : public ASTConsumer { const std::string &OutputFileName, std::unique_ptr OS, std::shared_ptr Buffer) - : Diags(CI.getDiagnostics()), MainFileName(MainFileName), + : CI(CI), Diags(CI.getDiagnostics()), MainFileName(MainFileName), OutputFileName(OutputFileName), Ctx(nullptr), MMap(CI.getPreprocessor().getHeaderSearchInfo().getModuleMap()), FS(&CI.getVirtualFileSystem()), @@ -317,19 +318,17 @@ class PCHContainerGenerator : public ASTConsumer { LLVM_DEBUG({ // Print the IR for the PCH container to the debug output. llvm::SmallString<0> Buffer; - clang::EmitBackendOutput( - Diags, HeaderSearchOpts, CodeGenOpts, TargetOpts, LangOpts, - Ctx.getTargetInfo().getDataLayoutString(), M.get(), + clang::emitBackendOutput( + CI, Ctx.getTargetInfo().getDataLayoutString(), M.get(), BackendAction::Backend_EmitLL, FS, std::make_unique(Buffer)); llvm::dbgs() << Buffer; }); // Use the LLVM backend to emit the pch container. - clang::EmitBackendOutput(Diags, HeaderSearchOpts, CodeGenOpts, TargetOpts, - LangOpts, - Ctx.getTargetInfo().getDataLayoutString(), M.get(), - BackendAction::Backend_EmitObj, FS, std::move(OS)); + clang::emitBackendOutput(CI, Ctx.getTargetInfo().getDataLayoutString(), + M.get(), BackendAction::Backend_EmitObj, FS, + std::move(OS)); // Free the memory for the temporary buffer. llvm::SmallVector Empty; From 186bd8e4cd8d239be67172448c53e92be396359a Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 9 Jan 2025 21:47:20 -0800 Subject: [PATCH 014/408] [CodeGen] Restore CodeGenerationTime Fixes 48d0eb5181065a3d086de2e30f5c619fe407e4ce --- clang/lib/CodeGen/BackendUtil.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 29478f6829e39..2863887fd4d2f 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -208,6 +208,7 @@ class EmitAssemblyHelper { : CI(CI), Diags(CI.getDiagnostics()), CodeGenOpts(CI.getCodeGenOpts()), TargetOpts(CI.getTargetOpts()), LangOpts(CI.getLangOpts()), TheModule(M), VFS(std::move(VFS)), + CodeGenerationTime("codegen", "Code Generation Time"), TargetTriple(TheModule->getTargetTriple()) {} ~EmitAssemblyHelper() { From 76fac9c01736b1254e42427f8e0910c0f1d01fba Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Thu, 9 Jan 2025 21:52:30 -0800 Subject: [PATCH 015/408] [sanitizer] Parse weighted sanitizer args and -fsanitize-skip-hot-cutoff (#121619) This adds a function to parse weighted sanitizer flags (e.g., `-fsanitize-blah=undefined=0.5,null=0.3`) and adds the plumbing to apply that to a new flag, `-fsanitize-skip-hot-cutoff`. `-fsanitize-skip-hot-cutoff` currently has no effect; future work will use it to generalize ubsan-guard-checks (originally introduced in 5f9ed2ff8364ff3e4fac410472f421299dafa793). 
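A minimal usage sketch of the new parsing API (function and type names are those introduced in the diff below; the SO_* ordinals come from Sanitizers.def, and the cutoff values here just echo the commit message):

SanitizerMaskCutoffs Cutoffs;
// 'undefined' is a group; with AllowGroups it is expanded so that every
// member sanitizer receives a cutoff of 0.5.
parseSanitizerWeightedValue("undefined=0.5", /*AllowGroups=*/true, Cutoffs);
// A later value overrides a single member of the group.
parseSanitizerWeightedValue("null=0.3", /*AllowGroups=*/true, Cutoffs);
// Entries that are unset (or explicitly zero) read back as std::nullopt.
std::optional<double> C = Cutoffs[SanitizerKind::SO_Null]; // holds 0.3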
--------- Co-authored-by: Vitaly Buka --- clang/include/clang/Basic/CodeGenOptions.h | 5 ++ clang/include/clang/Basic/Sanitizers.h | 24 +++++++++ clang/include/clang/Driver/Options.td | 8 +++ clang/include/clang/Driver/SanitizerArgs.h | 1 + clang/lib/Basic/Sanitizers.cpp | 60 ++++++++++++++++++++++ clang/lib/Driver/SanitizerArgs.cpp | 53 +++++++++++++++++++ clang/lib/Frontend/CompilerInvocation.cpp | 22 ++++++++ clang/test/Driver/fsanitize.c | 53 +++++++++++++++++++ 8 files changed, 226 insertions(+)

diff --git a/clang/include/clang/Basic/CodeGenOptions.h b/clang/include/clang/Basic/CodeGenOptions.h
index c555fb3b72d64..b64ad74d711c6 100644
--- a/clang/include/clang/Basic/CodeGenOptions.h
+++ b/clang/include/clang/Basic/CodeGenOptions.h
@@ -384,6 +384,11 @@ class CodeGenOptions : public CodeGenOptionsBase {
  /// the expense of debuggability).
  SanitizerSet SanitizeMergeHandlers;

+ /// Set of thresholds in a range [0.0, 1.0]: the top hottest code responsible
+ /// for the given fraction of PGO counters will be excluded from sanitization
+ /// (0.0 [default] to skip none, 1.0 to skip all).
+ SanitizerMaskCutoffs SanitizeSkipHotCutoffs;
+
  /// List of backend command-line options for -fembed-bitcode.
  std::vector<std::string> CmdArgs;

diff --git a/clang/include/clang/Basic/Sanitizers.h b/clang/include/clang/Basic/Sanitizers.h
index c890242269b33..2ff1acb772094 100644
--- a/clang/include/clang/Basic/Sanitizers.h
+++ b/clang/include/clang/Basic/Sanitizers.h
@@ -154,6 +154,16 @@ struct SanitizerKind {
#include "clang/Basic/Sanitizers.def"
}; // SanitizerKind

+class SanitizerMaskCutoffs {
+ std::vector<double> Cutoffs;
+
+public:
+ std::optional<double> operator[](unsigned Kind) const;
+
+ void set(SanitizerMask K, double V);
+ void clear(SanitizerMask K = SanitizerKind::All);
+};
+
struct SanitizerSet {
  /// Check if a certain (single) sanitizer is enabled.
  bool has(SanitizerMask K) const {
@@ -186,10 +196,24 @@ struct SanitizerSet {
/// Returns a non-zero SanitizerMask, or \c 0 if \p Value is not known.
SanitizerMask parseSanitizerValue(StringRef Value, bool AllowGroups);

+/// Parse a single weighted value (e.g., 'undefined=0.05') from a -fsanitize= or
+/// -fno-sanitize= value list.
+/// The relevant weight(s) are updated in the passed Cutoffs parameter.
+/// Individual Cutoffs are never reset to zero unless explicitly set
+/// (e.g., 'null=0.0').
+/// Returns \c false if \p Value is not known or the weight is not valid.
+bool parseSanitizerWeightedValue(StringRef Value, bool AllowGroups,
+ SanitizerMaskCutoffs &Cutoffs);
+
/// Serialize a SanitizerSet into values for -fsanitize= or -fno-sanitize=.
void serializeSanitizerSet(SanitizerSet Set, SmallVectorImpl<StringRef> &Values);

+/// Serialize a SanitizerMaskCutoffs into values for -fsanitize= or
+/// -fno-sanitize=.
+void serializeSanitizerMaskCutoffs(const SanitizerMaskCutoffs &Cutoffs,
+ SmallVectorImpl<std::string> &Values);
+
/// For each sanitizer group bit set in \p Kinds, set the bits for sanitizers
/// this group enables.
SanitizerMask expandSanitizerGroups(SanitizerMask Kinds);

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index eb860f73121fd..41a7e8c372806 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -2652,6 +2652,14 @@ def fsanitize_undefined_strip_path_components_EQ : Joined<["-"], "fsanitize-unde
  HelpText<"Strip (or keep only, if negative) a given number of path components "
  "when emitting check metadata.">,
  MarshallingInfoInt<CodeGenOpts<"EmitCheckPathComponentsToStrip">, "0", "int">;
+def fsanitize_skip_hot_cutoff_EQ
+ : CommaJoined<["-"], "fsanitize-skip-hot-cutoff=">,
+ Group<f_clang_Group>,
+ HelpText<
+ "Exclude sanitization for the top hottest code responsible for "
+ "the given fraction of PGO counters "
+ "(0.0 [default] = skip none; 1.0 = skip all). "
+ "Argument format: <sanitizer>=<value>,<sanitizer>=<value>,...">;

} // end -f[no-]sanitize* flags

diff --git a/clang/include/clang/Driver/SanitizerArgs.h b/clang/include/clang/Driver/SanitizerArgs.h
index 3b275092bbbe8..a54995e2b153b 100644
--- a/clang/include/clang/Driver/SanitizerArgs.h
+++ b/clang/include/clang/Driver/SanitizerArgs.h
@@ -26,6 +26,7 @@ class SanitizerArgs {
  SanitizerSet RecoverableSanitizers;
  SanitizerSet TrapSanitizers;
  SanitizerSet MergeHandlers;
+ SanitizerMaskCutoffs SkipHotCutoffs;

  std::vector<std::string> UserIgnorelistFiles;
  std::vector<std::string> SystemIgnorelistFiles;

diff --git a/clang/lib/Basic/Sanitizers.cpp b/clang/lib/Basic/Sanitizers.cpp
index 62ccdf8e9bbf2..5b9b88d032702 100644
--- a/clang/lib/Basic/Sanitizers.cpp
+++ b/clang/lib/Basic/Sanitizers.cpp
@@ -14,10 +14,35 @@
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/Format.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <optional>

using namespace clang;

+static const double SanitizerMaskCutoffsEps = 0.000000001f;
+
+void SanitizerMaskCutoffs::set(SanitizerMask K, double V) {
+ if (V < SanitizerMaskCutoffsEps && Cutoffs.empty())
+ return;
+ for (unsigned int i = 0; i < SanitizerKind::SO_Count; ++i)
+ if (K & SanitizerMask::bitPosToMask(i)) {
+ Cutoffs.resize(SanitizerKind::SO_Count);
+ Cutoffs[i] = V;
+ }
+}
+
+std::optional<double> SanitizerMaskCutoffs::operator[](unsigned Kind) const {
+ if (Cutoffs.empty() || Cutoffs[Kind] < SanitizerMaskCutoffsEps)
+ return std::nullopt;
+
+ return Cutoffs[Kind];
+}
+
+void SanitizerMaskCutoffs::clear(SanitizerMask K) { set(K, 0); }
+
// Once LLVM switches to C++17, the constexpr variables can be inline and we
// won't need this.
#define SANITIZER(NAME, ID) constexpr SanitizerMask SanitizerKind::ID;
@@ -36,6 +61,29 @@ SanitizerMask clang::parseSanitizerValue(StringRef Value, bool AllowGroups) {
  return ParsedKind;
}

+bool clang::parseSanitizerWeightedValue(StringRef Value, bool AllowGroups,
+ SanitizerMaskCutoffs &Cutoffs) {
+ SanitizerMask ParsedKind = llvm::StringSwitch<SanitizerMask>(Value)
+#define SANITIZER(NAME, ID) .StartsWith(NAME "=", SanitizerKind::ID)
+#define SANITIZER_GROUP(NAME, ID, ALIAS) \
+ .StartsWith(NAME "=", \
+ AllowGroups ? SanitizerKind::ID##Group : SanitizerMask())
+#include "clang/Basic/Sanitizers.def"
+ .Default(SanitizerMask());
+
+ if (!ParsedKind)
+ return false;
+ auto [N, W] = Value.split('=');
+ double A;
+ if (W.getAsDouble(A))
+ return false;
+ A = std::clamp(A, 0.0, 1.0);
+ // AllowGroups is already taken into account for ParsedKind,
+ // hence we unconditionally expandSanitizerGroups.
+ Cutoffs.set(expandSanitizerGroups(ParsedKind), A);
+ return true;
+}
+
void clang::serializeSanitizerSet(SanitizerSet Set,
 SmallVectorImpl<StringRef> &Values) {
#define SANITIZER(NAME, ID) \
@@ -44,6 +92,18 @@ void clang::serializeSanitizerSet(SanitizerSet Set,
#include "clang/Basic/Sanitizers.def"
}

+void clang::serializeSanitizerMaskCutoffs(
+ const SanitizerMaskCutoffs &Cutoffs, SmallVectorImpl<std::string> &Values) {
+#define SANITIZER(NAME, ID) \
+ if (auto C = Cutoffs[SanitizerKind::SO_##ID]) { \
+ std::string Str; \
+ llvm::raw_string_ostream OS(Str); \
+ OS << NAME "=" << llvm::format("%.8f", *C); \
+ Values.emplace_back(StringRef(Str).rtrim('0')); \
+ }
+#include "clang/Basic/Sanitizers.def"
+}
+
SanitizerMask clang::expandSanitizerGroups(SanitizerMask Kinds) {
#define SANITIZER(NAME, ID)
#define SANITIZER_GROUP(NAME, ID, ALIAS) \

diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp
index 98116e2c8336b..a0d6919c6dc8d 100644
--- a/clang/lib/Driver/SanitizerArgs.cpp
+++ b/clang/lib/Driver/SanitizerArgs.cpp
@@ -10,6 +10,7 @@
#include "clang/Driver/Driver.h"
#include "clang/Driver/Options.h"
#include "clang/Driver/ToolChain.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/Path.h"
@@ -113,6 +114,12 @@ enum BinaryMetadataFeature {
static SanitizerMask parseArgValues(const Driver &D, const llvm::opt::Arg *A,
 bool DiagnoseErrors);

+/// Parse a -fsanitize-skip-hot-cutoff=<sanitizer>=<value>... argument's
+/// values, diagnosing any invalid components.
+/// Cutoffs are stored in the passed parameter.
+static void parseArgCutoffs(const Driver &D, const llvm::opt::Arg *A,
+ bool DiagnoseErrors, SanitizerMaskCutoffs &Cutoffs);
+
/// Parse -f(no-)?sanitize-coverage= flag values, diagnosing any invalid
/// components. Returns OR of members of \c CoverageFeature enumeration.
static int parseCoverageFeatures(const Driver &D, const llvm::opt::Arg *A,
@@ -323,6 +330,19 @@ static SanitizerMask parseSanitizeTrapArgs(const Driver &D,
 options::OPT_fno_sanitize_trap_EQ);
}

+static SanitizerMaskCutoffs
+parseSanitizeSkipHotCutoffArgs(const Driver &D, const llvm::opt::ArgList &Args,
+ bool DiagnoseErrors) {
+ SanitizerMaskCutoffs Cutoffs;
+ for (const auto *Arg : Args)
+ if (Arg->getOption().matches(options::OPT_fsanitize_skip_hot_cutoff_EQ)) {
+ Arg->claim();
+ parseArgCutoffs(D, Arg, DiagnoseErrors, Cutoffs);
+ }
+
+ return Cutoffs;
+}
+
bool SanitizerArgs::needsFuzzerInterceptors() const {
  return needsFuzzer() && !needsAsanRt() && !needsTsanRt() && !needsMsanRt();
}
@@ -713,6 +733,9 @@ SanitizerArgs::SanitizerArgs(const ToolChain &TC,
 options::OPT_fno_sanitize_merge_handlers_EQ);
  MergeKinds &= Kinds;

+ // Parse -fsanitize-skip-hot-cutoff= flags
+ SkipHotCutoffs = parseSanitizeSkipHotCutoffArgs(D, Args, DiagnoseErrors);
+
  // Setup ignorelist files.
  // Add default ignorelist from resource directory for activated sanitizers,
  // and validate special case lists format.
@@ -1132,6 +1155,9 @@ SanitizerArgs::SanitizerArgs(const ToolChain &TC, "Overlap between recoverable and trapping sanitizers"); MergeHandlers.Mask |= MergeKinds; + + // Zero out SkipHotCutoffs for unused sanitizers + SkipHotCutoffs.clear(~Sanitizers.Mask); } static std::string toString(const clang::SanitizerSet &Sanitizers) { @@ -1146,6 +1172,12 @@ static std::string toString(const clang::SanitizerSet &Sanitizers) { return Res; } +static std::string toString(const clang::SanitizerMaskCutoffs &Cutoffs) { + llvm::SmallVector Res; + serializeSanitizerMaskCutoffs(Cutoffs, Res); + return llvm::join(Res, ","); +} + static void addSpecialCaseListOpt(const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs, const char *SCLOptFlag, @@ -1297,6 +1329,11 @@ void SanitizerArgs::addArgs(const ToolChain &TC, const llvm::opt::ArgList &Args, CmdArgs.push_back( Args.MakeArgString("-fsanitize-merge=" + toString(MergeHandlers))); + std::string SkipHotCutoffsStr = toString(SkipHotCutoffs); + if (!SkipHotCutoffsStr.empty()) + CmdArgs.push_back( + Args.MakeArgString("-fsanitize-skip-hot-cutoff=" + SkipHotCutoffsStr)); + addSpecialCaseListOpt(Args, CmdArgs, "-fsanitize-ignorelist=", UserIgnorelistFiles); addSpecialCaseListOpt(Args, CmdArgs, @@ -1494,6 +1531,22 @@ SanitizerMask parseArgValues(const Driver &D, const llvm::opt::Arg *A, return Kinds; } +void parseArgCutoffs(const Driver &D, const llvm::opt::Arg *A, + bool DiagnoseErrors, SanitizerMaskCutoffs &Cutoffs) { + assert(A->getOption().matches(options::OPT_fsanitize_skip_hot_cutoff_EQ) && + "Invalid argument in parseArgCutoffs!"); + for (int i = 0, n = A->getNumValues(); i != n; ++i) { + const char *Value = A->getValue(i); + + // We don't check the value of Cutoffs[i]: it's legal to specify + // a cutoff of 0. 
+ if (!parseSanitizerWeightedValue(Value, /*AllowGroups=*/true, Cutoffs) && + DiagnoseErrors) + D.Diag(clang::diag::err_drv_unsupported_option_argument) + << A->getSpelling() << Value; + } +} + static int parseOverflowPatternExclusionValues(const Driver &D, const llvm::opt::Arg *A, bool DiagnoseErrors) { diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index d711df02ce950..39bed84536c6a 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -1453,6 +1453,18 @@ static SmallVector serializeSanitizerKinds(SanitizerSet S) { return Values; } +static SanitizerMaskCutoffs +parseSanitizerWeightedKinds(StringRef FlagName, + const std::vector &Sanitizers, + DiagnosticsEngine &Diags) { + SanitizerMaskCutoffs Cutoffs; + for (const auto &Sanitizer : Sanitizers) { + if (!parseSanitizerWeightedValue(Sanitizer, /*AllowGroups=*/false, Cutoffs)) + Diags.Report(diag::err_drv_invalid_value) << FlagName << Sanitizer; + } + return Cutoffs; +} + static void parseXRayInstrumentationBundle(StringRef FlagName, StringRef Bundle, ArgList &Args, DiagnosticsEngine &D, XRayInstrSet &S) { @@ -1813,6 +1825,11 @@ void CompilerInvocationBase::GenerateCodeGenArgs(const CodeGenOptions &Opts, serializeSanitizerKinds(Opts.SanitizeMergeHandlers)) GenerateArg(Consumer, OPT_fsanitize_merge_handlers_EQ, Sanitizer); + SmallVector Values; + serializeSanitizerMaskCutoffs(Opts.SanitizeSkipHotCutoffs, Values); + for (std::string Sanitizer : Values) + GenerateArg(Consumer, OPT_fsanitize_skip_hot_cutoff_EQ, Sanitizer); + if (!Opts.EmitVersionIdentMetadata) GenerateArg(Consumer, OPT_Qn); @@ -2293,6 +2310,11 @@ bool CompilerInvocation::ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, Args.getAllArgValues(OPT_fsanitize_merge_handlers_EQ), Diags, Opts.SanitizeMergeHandlers); + // Parse -fsanitize-skip-hot-cutoff= arguments. 
+ Opts.SanitizeSkipHotCutoffs = parseSanitizerWeightedKinds( + "-fsanitize-skip-hot-cutoff=", + Args.getAllArgValues(OPT_fsanitize_skip_hot_cutoff_EQ), Diags); + Opts.EmitVersionIdentMetadata = Args.hasFlag(OPT_Qy, OPT_Qn, true); if (!LangOpts->CUDAIsDevice) diff --git a/clang/test/Driver/fsanitize.c b/clang/test/Driver/fsanitize.c index aeae15aada70c..1d3caec748d77 100644 --- a/clang/test/Driver/fsanitize.c +++ b/clang/test/Driver/fsanitize.c @@ -1154,3 +1154,56 @@ // RUN: not %clang --target=x86_64-linux-gnu -fsanitize=realtime,undefined %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-REALTIME-UBSAN // CHECK-REALTIME-UBSAN: error: invalid argument '-fsanitize=realtime' not allowed with '-fsanitize=undefined' + + +// * Test -fsanitize-skip-hot-cutoff * + +// -fsanitize-skip-hot-cutoff=undefined=0.5 +// RUN: %clang -Werror --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-skip-hot-cutoff=undefined=0.5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SKIP-HOT-CUTOFF1 +// CHECK-SKIP-HOT-CUTOFF1: "-fsanitize-skip-hot-cutoff={{((signed-integer-overflow|integer-divide-by-zero|shift-base|shift-exponent|unreachable|return|vla-bound|alignment|null|pointer-overflow|float-cast-overflow|array-bounds|enum|bool|builtin|returns-nonnull-attribute|nonnull-attribute|function|vptr)=0.5(0*),?){19}"}} + +// No-op: no sanitizers are specified +// RUN: %clang -Werror --target=x86_64-linux-gnu -fsanitize-skip-hot-cutoff=undefined=0.5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SKIP-HOT-CUTOFF2 +// CHECK-SKIP-HOT-CUTOFF2-NOT: "-fsanitize" +// CHECK-SKIP-HOT-CUTOFF2-NOT: "-fsanitize-skip-hot-cutoff" + +// Enable undefined, then cancel out integer using a cutoff of 0.0 +// RUN: %clang -Werror --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-skip-hot-cutoff=undefined=0.5,integer=0.0 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SKIP-HOT-CUTOFF3 +// CHECK-SKIP-HOT-CUTOFF3: "-fsanitize-skip-hot-cutoff={{((unreachable|return|vla-bound|alignment|null|pointer-overflow|float-cast-overflow|array-bounds|enum|bool|builtin|returns-nonnull-attribute|nonnull-attribute|function|vptr)=0.5(0*),?){15}"}} + +// Enable undefined, then cancel out integer using a cutoff of 0.0, then re-enable signed-integer-overflow +// RUN: %clang -Werror --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-skip-hot-cutoff=undefined=0.5,integer=0.0,signed-integer-overflow=0.7 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SKIP-HOT-CUTOFF4 +// CHECK-SKIP-HOT-CUTOFF4: "-fsanitize-skip-hot-cutoff={{((signed-integer-overflow|unreachable|return|vla-bound|alignment|null|pointer-overflow|float-cast-overflow|array-bounds|enum|bool|builtin|returns-nonnull-attribute|nonnull-attribute|function|vptr)=0.[57]0*,?){16}"}} + +// Check that -fsanitize-skip-hot-cutoff=undefined=0.4 does not widen the set of -fsanitize=integer checks. +// RUN: %clang -Werror --target=x86_64-linux-gnu -fsanitize=integer -fsanitize-skip-hot-cutoff=undefined=0.4 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SKIP-HOT-CUTOFF5 +// CHECK-SKIP-HOT-CUTOFF5: "-fsanitize-skip-hot-cutoff={{((integer-divide-by-zero|shift-base|shift-exponent|signed-integer-overflow)=0.40*,?){4}"}} + +// No-op: it's allowed for the user to specify a cutoff of 0.0, though the argument is not passed along by the driver. 
+// RUN: %clang -Werror --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-skip-hot-cutoff=undefined=0.0 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SKIP-HOT-CUTOFF6
+// CHECK-SKIP-HOT-CUTOFF6: "-fsanitize={{((signed-integer-overflow|integer-divide-by-zero|shift-base|shift-exponent|unreachable|return|vla-bound|alignment|null|pointer-overflow|float-cast-overflow|array-bounds|enum|bool|builtin|returns-nonnull-attribute|nonnull-attribute|function|vptr),?){19}"}}
+// CHECK-SKIP-HOT-CUTOFF6-NOT: "-fsanitize-skip-hot-cutoff"
+
+// Invalid: bad sanitizer
+// RUN: not %clang --target=x86_64-linux-gnu -fsanitize-skip-hot-cutoff=pot=0.0 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SKIP-HOT-CUTOFF7
+// CHECK-SKIP-HOT-CUTOFF7: unsupported argument 'pot=0.0' to option '-fsanitize-skip-hot-cutoff='
+
+// Invalid: bad cutoff
+// RUN: not %clang --target=x86_64-linux-gnu -fsanitize-skip-hot-cutoff=undefined=xyzzy %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SKIP-HOT-CUTOFF8
+// CHECK-SKIP-HOT-CUTOFF8: unsupported argument 'undefined=xyzzy' to option '-fsanitize-skip-hot-cutoff='
+
+// Invalid: -fsanitize-skip-hot-cutoff without parameters
+// RUN: not %clang --target=x86_64-linux-gnu -fsanitize-skip-hot-cutoff %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SKIP-HOT-CUTOFF9
+// CHECK-SKIP-HOT-CUTOFF9: unknown argument: '-fsanitize-skip-hot-cutoff'
+
+// Invalid: -fsanitize-skip-hot-cutoff=undefined without cutoff
+// RUN: not %clang --target=x86_64-linux-gnu -fsanitize-skip-hot-cutoff=undefined %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SKIP-HOT-CUTOFF10
+// CHECK-SKIP-HOT-CUTOFF10: unsupported argument 'undefined' to option '-fsanitize-skip-hot-cutoff='
+
+// Invalid: -fsanitize-skip-hot-cutoff=undefined= without cutoff
+// RUN: not %clang --target=x86_64-linux-gnu -fsanitize-skip-hot-cutoff=undefined= %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SKIP-HOT-CUTOFF11
+// CHECK-SKIP-HOT-CUTOFF11: unsupported argument 'undefined=' to option '-fsanitize-skip-hot-cutoff='
+
+// No-op: -fsanitize-skip-hot-cutoff= without parameters is unusual but valid
+// RUN: %clang -Werror --target=x86_64-linux-gnu -fsanitize-skip-hot-cutoff= %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SKIP-HOT-CUTOFF12
+// CHECK-SKIP-HOT-CUTOFF12-NOT: "-fsanitize-skip-hot-cutoff"

From a4472c7dac7dc69ef6e76ad7f92a1865f199e046 Mon Sep 17 00:00:00 2001
From: ssijaric-nv
Date: Thu, 9 Jan 2025 22:09:08 -0800
Subject: [PATCH 016/408] [AArch64] Fix the size passed to __trampoline_setup (#118234)

The trampoline size is 36 bytes on AArch64. The runtime function __trampoline_setup aborts because it expects a trampoline size of at least 36 bytes, but the size passed is only 20 bytes. Fix the inconsistency in AArch64TargetLowering::LowerINIT_TRAMPOLINE.
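A condensed sketch of the fix (mirroring the AArch64ISelLowering.cpp hunk below, with the template argument spelled out; the 36-byte fallback is the AArch64 trampoline size):

// Pass the actual frame-object size when the trampoline is a local alloca;
// otherwise fall back to 36 bytes (previously a hardcoded 20, which made
// __trampoline_setup abort).
if (auto *FI = dyn_cast<FrameIndexSDNode>(Trmp.getNode())) {
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  Entry.Node = DAG.getConstant(MFI.getObjectSize(FI->getIndex()), dl, MVT::i64);
} else
  Entry.Node = DAG.getConstant(36, dl, MVT::i64);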
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 11 +++++++++-- llvm/test/CodeGen/AArch64/trampoline.ll | 15 ++++++++++++++- 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 23671c9ffcf19..443f5f71b1084 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7281,9 +7281,16 @@ SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
  Entry.Ty = IntPtrTy;
  Entry.Node = Trmp;
  Args.push_back(Entry);
- Entry.Node = DAG.getConstant(20, dl, MVT::i64);
- Args.push_back(Entry);
+ if (auto *FI = dyn_cast<FrameIndexSDNode>(Trmp.getNode())) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ Entry.Node =
+ DAG.getConstant(MFI.getObjectSize(FI->getIndex()), dl, MVT::i64);
+ } else
+ Entry.Node = DAG.getConstant(36, dl, MVT::i64);
+
+ Args.push_back(Entry);
  Entry.Node = FPtr;
  Args.push_back(Entry);
  Entry.Node = Nest;

diff --git a/llvm/test/CodeGen/AArch64/trampoline.ll b/llvm/test/CodeGen/AArch64/trampoline.ll
index 293e538a7459d..30ac2aa283b3e 100644
--- a/llvm/test/CodeGen/AArch64/trampoline.ll
+++ b/llvm/test/CodeGen/AArch64/trampoline.ll
@@ -1,5 +1,7 @@
; RUN: llc -mtriple=aarch64-- < %s | FileCheck %s

+@trampg = internal global [36 x i8] zeroinitializer, align 8
+
declare void @llvm.init.trampoline(ptr, ptr, ptr);
declare ptr @llvm.adjust.trampoline(ptr);

@@ -8,12 +10,23 @@ define i64 @f(ptr nest %c, i64 %x, i64 %y) {
  ret i64 %sum
}

-define i64 @main() {
+define i64 @func1() {
  %val = alloca i64
  %nval = bitcast ptr %val to ptr
  %tramp = alloca [36 x i8], align 8
+ ; CHECK: mov w1, #36
  ; CHECK: bl __trampoline_setup
  call void @llvm.init.trampoline(ptr %tramp, ptr @f, ptr %nval)
  %fp = call ptr @llvm.adjust.trampoline(ptr %tramp)
  ret i64 0
}
+
+define i64 @func2() {
+ %val = alloca i64
+ %nval = bitcast ptr %val to ptr
+ ; CHECK: mov w1, #36
+ ; CHECK: bl __trampoline_setup
+ call void @llvm.init.trampoline(ptr @trampg, ptr @f, ptr %nval)
+ %fp = call ptr @llvm.adjust.trampoline(ptr @trampg)
+ ret i64 0
+}

From 6829f30883fa7e71e3b7af022916003a82f0216d Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Thu, 9 Jan 2025 22:11:04 -0800
Subject: [PATCH 017/408] [RISCV] Add a default common assignment of Inst{6-2} to the RVInst16CI base class. NFC (#122377)

Many instructions assign all or a subset of Inst{6-2} to Imm{4-0}. Make this the default. Subsets of Inst{6-2} can be overridden as needed by derived classes/records, as we already do with Inst{12} in a few places.

--- llvm/lib/Target/RISCV/RISCVInstrFormatsC.td | 4 ++- llvm/lib/Target/RISCV/RISCVInstrInfoC.td | 27 +++------------------ 2 files changed, 7 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormatsC.td b/llvm/lib/Target/RISCV/RISCVInstrFormatsC.td
index e14be7dac08ea..198d1466f022e 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrFormatsC.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrFormatsC.td
@@ -36,7 +36,8 @@ class RVInst16CR<bits<4> funct4, bits<2> opcode, dag outs, dag ins,
// The immediate value encoding differs for each instruction, so each subclass
// is responsible for setting the appropriate bits in the Inst field.
-// The bits Inst{6-2} must be set for each instruction.
+// The bits Inst{12} and Inst{6-2} may need to be set differently for some
+// instructions.
class RVInst16CI funct3, bits<2> opcode, dag outs, dag ins, string opcodestr, string argstr> : RVInst16 { @@ -46,6 +47,7 @@ class RVInst16CI funct3, bits<2> opcode, dag outs, dag ins, let Inst{15-13} = funct3; let Inst{12} = imm{5}; let Inst{11-7} = rd; + let Inst{6-2} = imm{4-0}; let Inst{1-0} = opcode; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td index 4438957b0dc90..1fab1fe1f3a15 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td @@ -400,7 +400,6 @@ def C_NOP : RVInst16CI<0b000, 0b01, (outs), (ins), "c.nop", "">, Sched<[WriteNop]> { let rd = 0; let imm = 0; - let Inst{6-2} = 0; } let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in @@ -409,7 +408,6 @@ def C_ADDI : RVInst16CI<0b000, 0b01, (outs GPRNoX0:$rd_wb), "c.addi", "$rd, $imm">, Sched<[WriteIALU, ReadIALU]> { let Constraints = "$rd = $rd_wb"; - let Inst{6-2} = imm{4-0}; } // Alternate syntax for c.nop. Converted to C_NOP by the assembler. @@ -431,15 +429,12 @@ def C_ADDIW : RVInst16CI<0b001, 0b01, (outs GPRNoX0:$rd_wb), "c.addiw", "$rd, $imm">, Sched<[WriteIALU32, ReadIALU32]> { let Constraints = "$rd = $rd_wb"; - let Inst{6-2} = imm{4-0}; } let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in def C_LI : RVInst16CI<0b010, 0b01, (outs GPRNoX0:$rd), (ins simm6:$imm), "c.li", "$rd, $imm">, - Sched<[WriteIALU]> { - let Inst{6-2} = imm{4-0}; -} + Sched<[WriteIALU]>; let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in def C_ADDI16SP : RVInst16CI<0b011, 0b01, (outs SP:$rd_wb), @@ -459,9 +454,7 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in def C_LUI : RVInst16CI<0b011, 0b01, (outs GPRNoX0X2:$rd), (ins c_lui_imm:$imm), "c.lui", "$rd, $imm">, - Sched<[WriteIALU]> { - let Inst{6-2} = imm{4-0}; -} + Sched<[WriteIALU]>; def C_SRLI : Shift_right<0b00, "c.srli">, Sched<[WriteShiftImm, ReadShiftImm]>; @@ -511,26 +504,22 @@ def C_SLLI : RVInst16CI<0b000, 0b10, (outs GPRNoX0:$rd_wb), "c.slli", "$rd, $imm">, Sched<[WriteShiftImm, ReadShiftImm]> { let Constraints = "$rd = $rd_wb"; - let Inst{6-2} = imm{4-0}; } let Predicates = [HasStdExtCOrZcd, HasStdExtD] in def C_FLDSP : CStackLoad<0b001, "c.fldsp", FPR64, uimm9_lsb000>, Sched<[WriteFLD64, ReadFMemBase]> { - let Inst{6-5} = imm{4-3}; let Inst{4-2} = imm{8-6}; } def C_LWSP : CStackLoad<0b010, "c.lwsp", GPRNoX0, uimm8_lsb00>, Sched<[WriteLDW, ReadMemBase]> { - let Inst{6-4} = imm{4-2}; let Inst{3-2} = imm{7-6}; } let isCodeGenOnly = 1 in def C_LWSP_INX : CStackLoad<0b010, "c.lwsp", GPRF32NoX0, uimm8_lsb00>, Sched<[WriteLDW, ReadMemBase]> { - let Inst{6-4} = imm{4-2}; let Inst{3-2} = imm{7-6}; } @@ -538,14 +527,12 @@ let DecoderNamespace = "RISCV32Only_", Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in def C_FLWSP : CStackLoad<0b011, "c.flwsp", FPR32, uimm8_lsb00>, Sched<[WriteFLD32, ReadFMemBase]> { - let Inst{6-4} = imm{4-2}; let Inst{3-2} = imm{7-6}; } let Predicates = [HasStdExtCOrZca, IsRV64] in def C_LDSP : CStackLoad<0b011, "c.ldsp", GPRNoX0, uimm9_lsb000>, Sched<[WriteLDD, ReadMemBase]> { - let Inst{6-5} = imm{4-3}; let Inst{4-2} = imm{8-6}; } @@ -634,7 +621,6 @@ let Predicates = [HasStdExtCOrZca, HasRVCHints], hasSideEffects = 0, mayLoad = 0 def C_NOP_HINT : RVInst16CI<0b000, 0b01, (outs), (ins simm6nonzero:$imm), "c.nop", "$imm">, Sched<[WriteNop]> { let rd = 0; - let Inst{6-2} = imm{4-0}; } def C_ADDI_HINT_IMM_ZERO : RVInst16CI<0b000, 0b01, (outs GPRNoX0:$rd_wb), @@ -642,15 +628,13 @@ def C_ADDI_HINT_IMM_ZERO : RVInst16CI<0b000, 0b01, (outs GPRNoX0:$rd_wb), "c.addi", 
"$rd, $imm">, Sched<[WriteIALU, ReadIALU]> { let Constraints = "$rd = $rd_wb"; - let Inst{12} = 0; - let Inst{6-2} = 0; + let imm = 0; let DecoderMethod = "decodeRVCInstrRdRs1ImmZero"; } def C_LI_HINT : RVInst16CI<0b010, 0b01, (outs GPRX0:$rd), (ins simm6:$imm), "c.li", "$rd, $imm">, Sched<[WriteIALU]> { - let Inst{6-2} = imm{4-0}; let Inst{11-7} = 0; let DecoderMethod = "decodeRVCInstrRdSImm"; } @@ -659,7 +643,6 @@ def C_LUI_HINT : RVInst16CI<0b011, 0b01, (outs GPRX0:$rd), (ins c_lui_imm:$imm), "c.lui", "$rd, $imm">, Sched<[WriteIALU]> { - let Inst{6-2} = imm{4-0}; let Inst{11-7} = 0; let DecoderMethod = "decodeRVCInstrRdSImm"; } @@ -684,7 +667,6 @@ def C_SLLI_HINT : RVInst16CI<0b000, 0b10, (outs GPRX0:$rd_wb), "c.slli", "$rd, $imm">, Sched<[WriteShiftImm, ReadShiftImm]> { let Constraints = "$rd = $rd_wb"; - let Inst{6-2} = imm{4-0}; let Inst{11-7} = 0; let DecoderMethod = "decodeRVCInstrRdRs1UImm"; } @@ -693,8 +675,7 @@ def C_SLLI64_HINT : RVInst16CI<0b000, 0b10, (outs GPR:$rd_wb), (ins GPR:$rd), "c.slli64", "$rd">, Sched<[WriteShiftImm, ReadShiftImm]> { let Constraints = "$rd = $rd_wb"; - let Inst{6-2} = 0; - let Inst{12} = 0; + let imm = 0; } def C_SRLI64_HINT : RVInst16CB<0b100, 0b01, (outs GPRC:$rd), From a531800344dc54e9c197a13b22e013f919f3f5e1 Mon Sep 17 00:00:00 2001 From: Akshat Oke Date: Fri, 10 Jan 2025 11:46:56 +0530 Subject: [PATCH 018/408] Spiller: Detach legacy pass and supply analyses instead (#119181) Makes Inline Spiller amenable to the new PM. --- llvm/include/llvm/CodeGen/Spiller.h | 16 +++++++++++-- llvm/lib/CodeGen/InlineSpiller.cpp | 36 +++++++++++------------------ llvm/lib/CodeGen/RegAllocBasic.cpp | 16 +++++++++---- llvm/lib/CodeGen/RegAllocGreedy.cpp | 4 +++- llvm/lib/CodeGen/RegAllocPBQP.cpp | 5 +++- 5 files changed, 46 insertions(+), 31 deletions(-) diff --git a/llvm/include/llvm/CodeGen/Spiller.h b/llvm/include/llvm/CodeGen/Spiller.h index 51ad36bc6b1f8..3132cefeb6c68 100644 --- a/llvm/include/llvm/CodeGen/Spiller.h +++ b/llvm/include/llvm/CodeGen/Spiller.h @@ -19,6 +19,10 @@ class MachineFunction; class MachineFunctionPass; class VirtRegMap; class VirtRegAuxInfo; +class LiveIntervals; +class LiveStacks; +class MachineDominatorTree; +class MachineBlockFrequencyInfo; /// Spiller interface. /// @@ -41,12 +45,20 @@ class Spiller { virtual ArrayRef getReplacedRegs() = 0; virtual void postOptimization() {} + + struct RequiredAnalyses { + LiveIntervals &LIS; + LiveStacks &LSS; + MachineDominatorTree &MDT; + const MachineBlockFrequencyInfo &MBFI; + }; }; /// Create and return a spiller that will insert spill code directly instead /// of deferring though VirtRegMap. 
-Spiller *createInlineSpiller(MachineFunctionPass &Pass, MachineFunction &MF, - VirtRegMap &VRM, VirtRegAuxInfo &VRAI); +Spiller *createInlineSpiller(const Spiller::RequiredAnalyses &Analyses, + MachineFunction &MF, VirtRegMap &VRM, + VirtRegAuxInfo &VRAI); } // end namespace llvm diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp index 64f290f5930a1..b9768d5c63a5d 100644 --- a/llvm/lib/CodeGen/InlineSpiller.cpp +++ b/llvm/lib/CodeGen/InlineSpiller.cpp @@ -75,7 +75,6 @@ RestrictStatepointRemat("restrict-statepoint-remat", cl::desc("Restrict remat for statepoint operands")); namespace { - class HoistSpillHelper : private LiveRangeEdit::Delegate { MachineFunction &MF; LiveIntervals &LIS; @@ -128,15 +127,11 @@ class HoistSpillHelper : private LiveRangeEdit::Delegate { DenseMap &SpillsToIns); public: - HoistSpillHelper(MachineFunctionPass &pass, MachineFunction &mf, - VirtRegMap &vrm) - : MF(mf), LIS(pass.getAnalysis().getLIS()), - LSS(pass.getAnalysis().getLS()), - MDT(pass.getAnalysis().getDomTree()), + HoistSpillHelper(const Spiller::RequiredAnalyses &Analyses, + MachineFunction &mf, VirtRegMap &vrm) + : MF(mf), LIS(Analyses.LIS), LSS(Analyses.LSS), MDT(Analyses.MDT), VRM(vrm), MRI(mf.getRegInfo()), TII(*mf.getSubtarget().getInstrInfo()), - TRI(*mf.getSubtarget().getRegisterInfo()), - MBFI( - pass.getAnalysis().getMBFI()), + TRI(*mf.getSubtarget().getRegisterInfo()), MBFI(Analyses.MBFI), IPA(LIS, mf.getNumBlockIDs()) {} void addToMergeableSpills(MachineInstr &Spill, int StackSlot, @@ -190,16 +185,12 @@ class InlineSpiller : public Spiller { ~InlineSpiller() override = default; public: - InlineSpiller(MachineFunctionPass &Pass, MachineFunction &MF, VirtRegMap &VRM, - VirtRegAuxInfo &VRAI) - : MF(MF), LIS(Pass.getAnalysis().getLIS()), - LSS(Pass.getAnalysis().getLS()), - MDT(Pass.getAnalysis().getDomTree()), + InlineSpiller(const Spiller::RequiredAnalyses &Analyses, MachineFunction &MF, + VirtRegMap &VRM, VirtRegAuxInfo &VRAI) + : MF(MF), LIS(Analyses.LIS), LSS(Analyses.LSS), MDT(Analyses.MDT), VRM(VRM), MRI(MF.getRegInfo()), TII(*MF.getSubtarget().getInstrInfo()), - TRI(*MF.getSubtarget().getRegisterInfo()), - MBFI( - Pass.getAnalysis().getMBFI()), - HSpiller(Pass, MF, VRM), VRAI(VRAI) {} + TRI(*MF.getSubtarget().getRegisterInfo()), MBFI(Analyses.MBFI), + HSpiller(Analyses, MF, VRM), VRAI(VRAI) {} void spill(LiveRangeEdit &) override; ArrayRef getSpilledRegs() override { return RegsToSpill; } @@ -237,10 +228,11 @@ Spiller::~Spiller() = default; void Spiller::anchor() {} -Spiller *llvm::createInlineSpiller(MachineFunctionPass &Pass, - MachineFunction &MF, VirtRegMap &VRM, - VirtRegAuxInfo &VRAI) { - return new InlineSpiller(Pass, MF, VRM, VRAI); +Spiller * +llvm::createInlineSpiller(const InlineSpiller::RequiredAnalyses &Analyses, + MachineFunction &MF, VirtRegMap &VRM, + VirtRegAuxInfo &VRAI) { + return new InlineSpiller(Analyses, MF, VRM, VRAI); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/CodeGen/RegAllocBasic.cpp b/llvm/lib/CodeGen/RegAllocBasic.cpp index c05aa1e40e477..f3f34f890be11 100644 --- a/llvm/lib/CodeGen/RegAllocBasic.cpp +++ b/llvm/lib/CodeGen/RegAllocBasic.cpp @@ -22,6 +22,7 @@ #include "llvm/CodeGen/LiveRegMatrix.h" #include "llvm/CodeGen/LiveStacks.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/Passes.h" @@ -187,6 +188,7 
@@ void RABasic::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired();
  AU.addRequired();
  AU.addPreserved();
+ AU.addRequired();
  AU.addRequiredID(MachineDominatorsID);
  AU.addPreservedID(MachineDominatorsID);
  AU.addRequired();
@@ -310,16 +312,20 @@ bool RABasic::runOnMachineFunction(MachineFunction &mf) {
 << "********** Function: " << mf.getName() << '\n');

  MF = &mf;
+ auto &MBFI = getAnalysis().getMBFI();
+ auto &LiveStks = getAnalysis().getLS();
+ auto &MDT = getAnalysis().getDomTree();
+
  RegAllocBase::init(getAnalysis().getVRM(),
 getAnalysis().getLIS(),
 getAnalysis().getLRM());
- VirtRegAuxInfo VRAI(
- *MF, *LIS, *VRM, getAnalysis().getLI(),
- getAnalysis().getMBFI(),
- &getAnalysis().getPSI());
+ VirtRegAuxInfo VRAI(*MF, *LIS, *VRM,
+ getAnalysis().getLI(), MBFI,
+ &getAnalysis().getPSI());
  VRAI.calculateSpillWeightsAndHints();

- SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM, VRAI));
+ SpillerInstance.reset(
+ createInlineSpiller({*LIS, LiveStks, MDT, MBFI}, *MF, *VRM, VRAI));

  allocatePhysRegs();
  postOptimization();

diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index b94992c20b119..66e9cf546b837 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -2750,6 +2750,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
  Bundles = &getAnalysis().getEdgeBundles();
  SpillPlacer = &getAnalysis().getResult();
  DebugVars = &getAnalysis().getLDV();
+ auto &LSS = getAnalysis().getLS();

  initializeCSRCost();

@@ -2770,7 +2771,8 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
 getAnalysis().getAdvisor(*MF, *this);

  VRAI = std::make_unique(*MF, *LIS, *VRM, *Loops, *MBFI);
- SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM, *VRAI));
+ SpillerInstance.reset(
+ createInlineSpiller({*LIS, LSS, *DomTree, *MBFI}, *MF, *VRM, *VRAI));

  VRAI->calculateSpillWeightsAndHints();

diff --git a/llvm/lib/CodeGen/RegAllocPBQP.cpp b/llvm/lib/CodeGen/RegAllocPBQP.cpp
index 696c312e4ba00..e230a1be95c9f 100644
--- a/llvm/lib/CodeGen/RegAllocPBQP.cpp
+++ b/llvm/lib/CodeGen/RegAllocPBQP.cpp
@@ -794,6 +794,9 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) {
  MachineBlockFrequencyInfo &MBFI =
 getAnalysis().getMBFI();

+ auto &LiveStks = getAnalysis().getLS();
+ auto &MDT = getAnalysis().getDomTree();
+
  VirtRegMap &VRM = getAnalysis().getVRM();

  PBQPVirtRegAuxInfo VRAI(
@@ -807,7 +810,7 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) {
  VirtRegAuxInfo DefaultVRAI(
 MF, LIS, VRM, getAnalysis().getLI(), MBFI);
  std::unique_ptr VRegSpiller(
- createInlineSpiller(*this, MF, VRM, DefaultVRAI));
+ createInlineSpiller({LIS, LiveStks, MDT, MBFI}, MF, VRM, DefaultVRAI));

  MF.getRegInfo().freezeReservedRegs();

From a8e1135baa9074f7c088c8e1999561f88699b56e Mon Sep 17 00:00:00 2001
From: Heejin Ahn
Date: Thu, 9 Jan 2025 22:36:10 -0800
Subject: [PATCH 019/408] [WebAssembly] Add -wasm-use-legacy-eh option (#122158)

This replaces the existing `-wasm-enable-exnref` option with `-wasm-use-legacy-eh`, in an effort to make the new standardized exnref proposal the 'default' state, so that the legacy proposal needs to be separately enabled via an option. But given that most users haven't switched to the new proposal and major web browsers haven't turned it on by default, this `-wasm-use-legacy-eh` option is turned on by default, so nothing will change for now from a functionality perspective.
This also removes the restriction that `-wasm-enable-exnref` be only used with `-wasm-enable-eh` because this option is enabled by default. This option does not have any effect when `-wasm-enable-eh` is not used. --- llvm/docs/ReleaseNotes.md | 8 +++++++ .../MCTargetDesc/WebAssemblyMCTargetDesc.cpp | 16 +++++++------ .../MCTargetDesc/WebAssemblyMCTargetDesc.h | 2 +- .../WebAssembly/WebAssemblyCFGStackify.cpp | 24 +++++++++---------- .../WebAssembly/WebAssemblyISelDAGToDAG.cpp | 6 ++--- .../WebAssembly/WebAssemblyLateEHPrepare.cpp | 8 +++---- .../WebAssembly/WebAssemblyTargetMachine.cpp | 5 +--- .../WebAssembly/cfg-stackify-eh-legacy.ll | 10 ++++---- .../WebAssembly/cfg-stackify-eh-legacy.mir | 2 +- .../CodeGen/WebAssembly/cfg-stackify-eh.ll | 10 ++++---- .../CodeGen/WebAssembly/eh-option-errors.ll | 3 --- .../CodeGen/WebAssembly/exception-legacy.ll | 6 ++--- .../CodeGen/WebAssembly/exception-legacy.mir | 2 +- llvm/test/CodeGen/WebAssembly/exception.ll | 8 +++---- 14 files changed, 57 insertions(+), 53 deletions(-) diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index 4d5f5fb0384b8..a3febf27ae833 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -250,8 +250,16 @@ Changes to the WebAssembly Backend `-mmutable-globals`, `-mcall-indirect-overlong`, `-msign-ext`, `-mbulk-memory-opt`, `-mnontrapping-fptoint`, and `-mextended-const`. +* Support for the new standardized [Exception Handling] proposal is added. + The [legacy Exception Handling] proposal is still supported, and turned on by + the newly added `-wasm-use-legacy-eh` option. Given that major web browsers + still default to the legacy EH proposal, this option is turned on by default + for the moment. + [Bulk Memory Operations]: https://github.com/WebAssembly/bulk-memory-operations/blob/master/proposals/bulk-memory-operations/Overview.md [Non-trapping float-to-int Conversions]: https://github.com/WebAssembly/spec/blob/master/proposals/nontrapping-float-to-int-conversion/Overview.md +[Exception Handling]: https://github.com/WebAssembly/exception-handling/blob/main/proposals/exception-handling/Exceptions.md +[legacy Exception Handling]: https://github.com/WebAssembly/exception-handling/blob/main/proposals/exception-handling/legacy/Exceptions.md [widely implemented in engines]: https://webassembly.org/features/ [here]: https://github.com/WebAssembly/tool-conventions/blob/main/Lime.md#lime1 diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp index cea282f62fe15..a4162a07ee33f 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp @@ -54,13 +54,15 @@ cl::opt // setjmp/longjmp handling using wasm EH instrutions cl::opt WebAssembly::WasmEnableSjLj( "wasm-enable-sjlj", cl::desc("WebAssembly setjmp/longjmp handling")); -// Whether we use the new exnref Wasm EH proposal adopted on Oct 2023. -// Should be used with -wasm-enable-eh. -// Currently set to false by default, but will later change to true and then -// later can be removed after the legacy WAsm EH instructions are removed. 
-cl::opt WebAssembly::WasmEnableExnref( - "wasm-enable-exnref", cl::desc("WebAssembly exception handling (exnref)"), - cl::init(false)); +// If true, use the legacy Wasm EH proposal: +// https://github.com/WebAssembly/exception-handling/blob/main/proposals/exception-handling/legacy/Exceptions.md +// And if false, use the standardized Wasm EH proposal: +// https://github.com/WebAssembly/exception-handling/blob/main/proposals/exception-handling/Exceptions.md +// Currently set to true by default because not all major web browsers turn on +// the new standard proposal by default, but will later change to false. +cl::opt WebAssembly::WasmUseLegacyEH( + "wasm-use-legacy-eh", cl::desc("WebAssembly exception handling (legacy)"), + cl::init(true)); static MCAsmInfo *createMCAsmInfo(const MCRegisterInfo & /*MRI*/, const Triple &TT, diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index 3900d4a0aa704..d6a2fe4c7839d 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -44,7 +44,7 @@ extern cl::opt WasmEnableEmEH; // asm.js-style EH extern cl::opt WasmEnableEmSjLj; // asm.js-style SjLJ extern cl::opt WasmEnableEH; // EH using Wasm EH instructions extern cl::opt WasmEnableSjLj; // SjLj using Wasm EH instructions -extern cl::opt WasmEnableExnref; // EH using new Wasm EH (exnref) +extern cl::opt WasmUseLegacyEH; // Legacy Wasm EH enum OperandType { /// Basic block label in a branch construct. diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp index 90b5945541dd7..6cae0e766dbc0 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp @@ -1942,7 +1942,7 @@ bool WebAssemblyCFGStackify::fixCallUnwindMismatches(MachineFunction &MF) { // When end_loop is before end_try_table within the same BB in unwind // destinations, we should split the end_loop into another BB. - if (WebAssembly::WasmEnableExnref) + if (!WebAssembly::WasmUseLegacyEH) for (auto &[UnwindDest, _] : UnwindDestToTryRanges) splitEndLoopBB(UnwindDest); @@ -1975,10 +1975,10 @@ bool WebAssemblyCFGStackify::fixCallUnwindMismatches(MachineFunction &MF) { MBB->removeSuccessor(EHPad); } - if (WebAssembly::WasmEnableExnref) - addNestedTryTable(RangeBegin, RangeEnd, UnwindDest); - else + if (WebAssembly::WasmUseLegacyEH) addNestedTryDelegate(RangeBegin, RangeEnd, UnwindDest); + else + addNestedTryTable(RangeBegin, RangeEnd, UnwindDest); } } @@ -2188,15 +2188,15 @@ bool WebAssemblyCFGStackify::fixCatchUnwindMismatches(MachineFunction &MF) { for (auto &[EHPad, UnwindDest] : EHPadToUnwindDest) { MachineInstr *Try = EHPadToTry[EHPad]; MachineInstr *EndTry = BeginToEnd[Try]; - if (WebAssembly::WasmEnableExnref) { - addNestedTryTable(Try, EndTry, UnwindDest); - } else { + if (WebAssembly::WasmUseLegacyEH) { addNestedTryDelegate(Try, EndTry, UnwindDest); NewEndTryBBs.insert(EndTry->getParent()); + } else { + addNestedTryTable(Try, EndTry, UnwindDest); } } - if (WebAssembly::WasmEnableExnref) + if (!WebAssembly::WasmUseLegacyEH) return true; // Adding a try-delegate wrapping an existing try-catch-end can make existing @@ -2387,10 +2387,10 @@ void WebAssemblyCFGStackify::placeMarkers(MachineFunction &MF) { // Place the TRY/TRY_TABLE for MBB if MBB is the EH pad of an exception. 
if (MCAI->getExceptionHandlingType() == ExceptionHandling::Wasm && MF.getFunction().hasPersonalityFn()) { - if (WebAssembly::WasmEnableExnref) - placeTryTableMarker(MBB); - else + if (WebAssembly::WasmUseLegacyEH) placeTryMarker(MBB); + else + placeTryTableMarker(MBB); } } else { // Place the BLOCK for MBB if MBB is branched to from above. @@ -2576,7 +2576,7 @@ bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) { // Remove unnecessary instructions possibly introduced by try/end_trys. if (MCAI->getExceptionHandlingType() == ExceptionHandling::Wasm && - MF.getFunction().hasPersonalityFn() && !WebAssembly::WasmEnableExnref) + MF.getFunction().hasPersonalityFn() && WebAssembly::WasmUseLegacyEH) removeUnnecessaryInstrs(MF); // Convert MBB operands in terminators to relative depth immediates. diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp index 0b38c5f69e8e6..48c0b7e50f080 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp @@ -210,9 +210,9 @@ void WebAssemblyDAGToDAGISel::Select(SDNode *Node) { case Intrinsic::wasm_catch: { int Tag = Node->getConstantOperandVal(2); SDValue SymNode = getTagSymNode(Tag, CurDAG); - unsigned CatchOpcode = WebAssembly::WasmEnableExnref - ? WebAssembly::CATCH - : WebAssembly::CATCH_LEGACY; + unsigned CatchOpcode = WebAssembly::WasmUseLegacyEH + ? WebAssembly::CATCH_LEGACY + : WebAssembly::CATCH; MachineSDNode *Catch = CurDAG->getMachineNode(CatchOpcode, DL, { diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp index e9b2df11b4915..254ad5c4f2beb 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp @@ -128,7 +128,7 @@ bool WebAssemblyLateEHPrepare::runOnMachineFunction(MachineFunction &MF) { Changed |= hoistCatches(MF); Changed |= addCatchAlls(MF); Changed |= replaceFuncletReturns(MF); - if (WebAssembly::WasmEnableExnref) + if (!WebAssembly::WasmUseLegacyEH) Changed |= addCatchRefsAndThrowRefs(MF); } Changed |= removeUnnecessaryUnreachables(MF); @@ -217,9 +217,9 @@ bool WebAssemblyLateEHPrepare::addCatchAlls(MachineFunction &MF) { if (InsertPos == MBB.end() || !WebAssembly::isCatch(InsertPos->getOpcode())) { Changed = true; - unsigned CatchAllOpcode = WebAssembly::WasmEnableExnref - ? WebAssembly::CATCH_ALL - : WebAssembly::CATCH_ALL_LEGACY; + unsigned CatchAllOpcode = WebAssembly::WasmUseLegacyEH + ? WebAssembly::CATCH_ALL_LEGACY + : WebAssembly::CATCH_ALL; BuildMI(MBB, InsertPos, InsertPos == MBB.end() ? 
DebugLoc() : InsertPos->getDebugLoc(), TII.get(CatchAllOpcode)); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index 9c95d730480f9..ba8c479a658fc 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -400,8 +400,8 @@ FunctionPass *WebAssemblyPassConfig::createTargetRegisterAllocator(bool) { using WebAssembly::WasmEnableEH; using WebAssembly::WasmEnableEmEH; using WebAssembly::WasmEnableEmSjLj; -using WebAssembly::WasmEnableExnref; using WebAssembly::WasmEnableSjLj; +using WebAssembly::WasmUseLegacyEH; static void basicCheckForEHAndSjLj(TargetMachine *TM) { @@ -417,9 +417,6 @@ static void basicCheckForEHAndSjLj(TargetMachine *TM) { if (WasmEnableEmEH && WasmEnableSjLj) report_fatal_error( "-enable-emscripten-cxx-exceptions not allowed with -wasm-enable-sjlj"); - if (WasmEnableExnref && !WasmEnableEH) - report_fatal_error( - "-wasm-enable-exnref should be used with -wasm-enable-eh"); // Here we make sure TargetOptions.ExceptionModel is the same as // MCAsmInfo.ExceptionsType. Normally these have to be the same, because clang diff --git a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.ll b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.ll index ab9023cbac604..f0a1d9805c806 100644 --- a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.ll +++ b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.ll @@ -1,9 +1,9 @@ ; REQUIRES: asserts -; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling,bulk-memory | FileCheck %s -; RUN: llc < %s -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling,bulk-memory -; RUN: llc < %s -O0 -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -verify-machineinstrs -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling,-bulk-memory,-bulk-memory-opt | FileCheck %s --check-prefix=NOOPT -; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling,-bulk-memory,-bulk-memory-opt -wasm-disable-ehpad-sort -stats 2>&1 | FileCheck %s --check-prefix=NOSORT -; RUN: llc < %s -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling,-bulk-memory,-bulk-memory-opt -wasm-disable-ehpad-sort | FileCheck %s --check-prefix=NOSORT-LOCALS +; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -wasm-enable-eh -wasm-use-legacy-eh -exception-model=wasm -mattr=+exception-handling,bulk-memory | FileCheck %s +; RUN: 
llc < %s -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -wasm-enable-eh -wasm-use-legacy-eh -exception-model=wasm -mattr=+exception-handling,bulk-memory +; RUN: llc < %s -O0 -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -verify-machineinstrs -wasm-enable-eh -wasm-use-legacy-eh -exception-model=wasm -mattr=+exception-handling,-bulk-memory,-bulk-memory-opt | FileCheck %s --check-prefix=NOOPT +; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -wasm-enable-eh -wasm-use-legacy-eh -exception-model=wasm -mattr=+exception-handling,-bulk-memory,-bulk-memory-opt -wasm-disable-ehpad-sort -stats 2>&1 | FileCheck %s --check-prefix=NOSORT +; RUN: llc < %s -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -wasm-enable-eh -wasm-use-legacy-eh -exception-model=wasm -mattr=+exception-handling,-bulk-memory,-bulk-memory-opt -wasm-disable-ehpad-sort | FileCheck %s --check-prefix=NOSORT-LOCALS target triple = "wasm32-unknown-unknown" diff --git a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.mir b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.mir index 2494ad1ad581d..5a4dc11cd328b 100644 --- a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.mir +++ b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=wasm32-unknown-unknown -exception-model=wasm -mattr=+exception-handling -run-pass wasm-cfg-stackify %s -o - | FileCheck %s +# RUN: llc -mtriple=wasm32-unknown-unknown -wasm-use-legacy-eh -exception-model=wasm -mattr=+exception-handling -run-pass wasm-cfg-stackify %s -o - | FileCheck %s --- | target triple = "wasm32-unknown-unknown" diff --git a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll index 22fda36c25bfd..683b03d16d57b 100644 --- a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll +++ b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll @@ -1,9 +1,9 @@ ; REQUIRES: asserts -; RUN: llc < %s -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -wasm-enable-eh -wasm-enable-exnref -exception-model=wasm -mattr=+exception-handling,bulk-memory | FileCheck %s -; RUN: llc < %s -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -wasm-enable-eh -wasm-enable-exnref -exception-model=wasm -mattr=+exception-handling,bulk-memory -; RUN: llc < %s -O0 -disable-wasm-fallthrough-return-opt -verify-machineinstrs -wasm-enable-eh -wasm-enable-exnref -exception-model=wasm -mattr=+exception-handling,-bulk-memory,-bulk-memory-opt | FileCheck %s --check-prefix=NOOPT -; RUN: llc < %s -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -wasm-enable-eh -wasm-enable-exnref -exception-model=wasm -mattr=+exception-handling,-bulk-memory,-bulk-memory-opt 
-wasm-disable-ehpad-sort -stats 2>&1 | FileCheck %s --check-prefix=NOSORT -; RUN: llc < %s -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -wasm-enable-eh -wasm-enable-exnref -exception-model=wasm -mattr=+exception-handling,-bulk-memory,-bulk-memory-opt -wasm-disable-ehpad-sort | FileCheck %s --check-prefix=NOSORT-LOCALS +; RUN: llc < %s -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -wasm-enable-eh -wasm-use-legacy-eh=false -exception-model=wasm -mattr=+exception-handling,bulk-memory | FileCheck %s +; RUN: llc < %s -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -wasm-enable-eh -wasm-use-legacy-eh=false -exception-model=wasm -mattr=+exception-handling,bulk-memory +; RUN: llc < %s -O0 -disable-wasm-fallthrough-return-opt -verify-machineinstrs -wasm-enable-eh -wasm-use-legacy-eh=false -exception-model=wasm -mattr=+exception-handling,-bulk-memory,-bulk-memory-opt | FileCheck %s --check-prefix=NOOPT +; RUN: llc < %s -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -wasm-enable-eh -wasm-use-legacy-eh=false -exception-model=wasm -mattr=+exception-handling,-bulk-memory,-bulk-memory-opt -wasm-disable-ehpad-sort -stats 2>&1 | FileCheck %s --check-prefix=NOSORT +; RUN: llc < %s -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -wasm-enable-eh -wasm-use-legacy-eh=false -exception-model=wasm -mattr=+exception-handling,-bulk-memory,-bulk-memory-opt -wasm-disable-ehpad-sort | FileCheck %s --check-prefix=NOSORT-LOCALS target triple = "wasm32-unknown-unknown" diff --git a/llvm/test/CodeGen/WebAssembly/eh-option-errors.ll b/llvm/test/CodeGen/WebAssembly/eh-option-errors.ll index 52a6364e12258..74d02ddc405d3 100644 --- a/llvm/test/CodeGen/WebAssembly/eh-option-errors.ll +++ b/llvm/test/CodeGen/WebAssembly/eh-option-errors.ll @@ -9,9 +9,6 @@ target triple = "wasm32-unknown-unknown" ; RUN: not --crash llc < %s -enable-emscripten-cxx-exceptions -wasm-enable-sjlj 2>&1 | FileCheck %s --check-prefix=EM_EH_W_WASM_SJLJ ; EM_EH_W_WASM_SJLJ: LLVM ERROR: -enable-emscripten-cxx-exceptions not allowed with -wasm-enable-sjlj -; RUN: not --crash llc < %s -wasm-enable-exnref 2>&1 | FileCheck %s --check-prefix=WASM_EXNREF_ONLY -; WASM_EXNREF_ONLY: LLVM ERROR: -wasm-enable-exnref should be used with -wasm-enable-eh - ; RUN: not --crash llc < %s -wasm-enable-eh -exception-model=dwarf 2>&1 | FileCheck %s --check-prefix=EH_MODEL_DWARF ; EH_MODEL_DWARF: LLVM ERROR: -exception-model should be either 'none' or 'wasm' diff --git a/llvm/test/CodeGen/WebAssembly/exception-legacy.ll b/llvm/test/CodeGen/WebAssembly/exception-legacy.ll index 9ebdc06d9666d..3fe45bcc4cd29 100644 --- a/llvm/test/CodeGen/WebAssembly/exception-legacy.ll +++ b/llvm/test/CodeGen/WebAssembly/exception-legacy.ll @@ -1,6 +1,6 @@ -; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling 
-verify-machineinstrs | FileCheck --implicit-check-not=ehgcr -allow-deprecated-dag-overlap %s -; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling -verify-machineinstrs -O0 -; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-keep-registers -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling +; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -wasm-enable-eh -wasm-use-legacy-eh -exception-model=wasm -mattr=+exception-handling -verify-machineinstrs | FileCheck --implicit-check-not=ehgcr -allow-deprecated-dag-overlap %s +; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -wasm-enable-eh -wasm-use-legacy-eh -exception-model=wasm -mattr=+exception-handling -verify-machineinstrs -O0 +; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-keep-registers -wasm-enable-eh -wasm-use-legacy-eh -exception-model=wasm -mattr=+exception-handling target triple = "wasm32-unknown-unknown" diff --git a/llvm/test/CodeGen/WebAssembly/exception-legacy.mir b/llvm/test/CodeGen/WebAssembly/exception-legacy.mir index fbed7db1dcb11..d6f734c64acd6 100644 --- a/llvm/test/CodeGen/WebAssembly/exception-legacy.mir +++ b/llvm/test/CodeGen/WebAssembly/exception-legacy.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=wasm32-unknown-unknown -exception-model=wasm -mattr=+exception-handling -run-pass wasm-late-eh-prepare -run-pass wasm-cfg-stackify %s -o - | FileCheck %s +# RUN: llc -mtriple=wasm32-unknown-unknown -wasm-use-legacy-eh -exception-model=wasm -mattr=+exception-handling -run-pass wasm-late-eh-prepare -run-pass wasm-cfg-stackify %s -o - | FileCheck %s --- | target triple = "wasm32-unknown-unknown" diff --git a/llvm/test/CodeGen/WebAssembly/exception.ll b/llvm/test/CodeGen/WebAssembly/exception.ll index 779775b676071..d6f3ffc8c33cb 100644 --- a/llvm/test/CodeGen/WebAssembly/exception.ll +++ b/llvm/test/CodeGen/WebAssembly/exception.ll @@ -1,7 +1,7 @@ -; RUN: llc < %s -asm-verbose=false -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling -wasm-enable-exnref -verify-machineinstrs | FileCheck --implicit-check-not=ehgcr -allow-deprecated-dag-overlap %s -; RUN: llc < %s -asm-verbose=false -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling -wasm-enable-exnref -verify-machineinstrs -O0 -; RUN: llc < %s -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling -wasm-enable-exnref -; RUN: llc < %s -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling -wasm-enable-exnref -filetype=obj +; RUN: llc < %s -asm-verbose=false -wasm-enable-eh -wasm-use-legacy-eh=false -exception-model=wasm -mattr=+exception-handling -verify-machineinstrs | FileCheck --implicit-check-not=ehgcr -allow-deprecated-dag-overlap %s +; RUN: llc < %s -asm-verbose=false -wasm-enable-eh -wasm-use-legacy-eh=false -exception-model=wasm -mattr=+exception-handling -verify-machineinstrs -O0 +; RUN: llc < %s -wasm-enable-eh -wasm-use-legacy-eh=false -exception-model=wasm -mattr=+exception-handling +; RUN: llc < %s -wasm-enable-eh -wasm-use-legacy-eh=false -exception-model=wasm -mattr=+exception-handling -filetype=obj target triple = "wasm32-unknown-unknown" From 4c0a0f72418b21161b5c1fb9225462bd039121e3 Mon Sep 17 00:00:00 2001 From: Tyler Lanphear Date: Thu, 9 Jan 2025 22:43:30 -0800 Subject: [PATCH 020/408] 
[SandboxVectorizer][NFCI] Fix use of possibly-uninitialized scalar.
 (#122201)

The `EraseCallbackID` field is not always initialized in the ctor for
SeedCollector; if not, it will be used uninitialized by its dtor. This
could potentially lead to the erasure of a random callback, leading to a
bug.

Fixed by making `CallbackID` an opaque type, which is always
default-initialized to an invalid ID.
---
 llvm/include/llvm/SandboxIR/Context.h | 54 +++++++++++++++++++++++----
 llvm/lib/SandboxIR/Context.cpp        |  6 +--
 2 files changed, 50 insertions(+), 10 deletions(-)

diff --git a/llvm/include/llvm/SandboxIR/Context.h b/llvm/include/llvm/SandboxIR/Context.h
index b0d6f8335d9e0..7fe97d984b958 100644
--- a/llvm/include/llvm/SandboxIR/Context.h
+++ b/llvm/include/llvm/SandboxIR/Context.h
@@ -18,7 +18,8 @@

 #include <cstdint>

-namespace llvm::sandboxir {
+namespace llvm {
+namespace sandboxir {

 class Argument;
 class BBIterator;
@@ -37,10 +38,28 @@ class Context {
   using MoveInstrCallback =
       std::function<void(Instruction *, const BBIterator &)>;

-  /// An ID for a registered callback. Used for deregistration. Using a 64-bit
-  /// integer so we don't have to worry about the unlikely case of overflowing
-  /// a 32-bit counter.
-  using CallbackID = uint64_t;
+  /// An ID for a registered callback. Used for deregistration. A dedicated type
+  /// is employed so as to keep IDs opaque to the end user; only Context should
+  /// deal with its underlying representation.
+  class CallbackID {
+  public:
+    // Uses a 64-bit integer so we don't have to worry about the unlikely case
+    // of overflowing a 32-bit counter.
+    using ValTy = uint64_t;
+    static constexpr const ValTy InvalidVal = 0;
+
+  private:
+    // Default initialization results in an invalid ID.
+    ValTy Val = InvalidVal;
+    explicit CallbackID(ValTy Val) : Val{Val} {
+      assert(Val != InvalidVal && "newly-created ID is invalid!");
+    }
+
+  public:
+    CallbackID() = default;
+    friend class Context;
+    friend struct DenseMapInfo<CallbackID>;
+  };

 protected:
   LLVMContext &LLVMCtx;
@@ -83,7 +102,7 @@ class Context {
   /// A counter used for assigning callback IDs during registration. The same
   /// counter is used for all kinds of callbacks so we can detect mismatched
   /// registration/deregistration.
-  CallbackID NextCallbackID = 0;
+  CallbackID::ValTy NextCallbackID = 1;

   /// Remove \p V from the maps and returns the unique_ptr.
   std::unique_ptr<Value> detachLLVMValue(llvm::Value *V);
@@ -263,6 +282,27 @@ class Context {
   // TODO: Add callbacks for instructions inserted/removed if needed.
 };

-} // namespace llvm::sandboxir
+} // namespace sandboxir
+
+// DenseMap info for CallbackIDs
+template <> struct DenseMapInfo<sandboxir::Context::CallbackID> {
+  using CallbackID = sandboxir::Context::CallbackID;
+  using ReprInfo = DenseMapInfo<CallbackID::ValTy>;
+
+  static CallbackID getEmptyKey() {
+    return CallbackID{ReprInfo::getEmptyKey()};
+  }
+  static CallbackID getTombstoneKey() {
+    return CallbackID{ReprInfo::getTombstoneKey()};
+  }
+  static unsigned getHashValue(const CallbackID &ID) {
+    return ReprInfo::getHashValue(ID.Val);
+  }
+  static bool isEqual(const CallbackID &LHS, const CallbackID &RHS) {
+    return ReprInfo::isEqual(LHS.Val, RHS.Val);
+  }
+};
+
+} // namespace llvm

 #endif // LLVM_SANDBOXIR_CONTEXT_H
diff --git a/llvm/lib/SandboxIR/Context.cpp b/llvm/lib/SandboxIR/Context.cpp
index b86ed5864c1ac..42ca456881fd0 100644
--- a/llvm/lib/SandboxIR/Context.cpp
+++ b/llvm/lib/SandboxIR/Context.cpp
@@ -686,7 +686,7 @@ void Context::runMoveInstrCallbacks(Instruction *I, const BBIterator &WhereIt) {
 Context::CallbackID Context::registerEraseInstrCallback(EraseInstrCallback CB) {
   assert(EraseInstrCallbacks.size() <= MaxRegisteredCallbacks &&
          "EraseInstrCallbacks size limit exceeded");
-  CallbackID ID = NextCallbackID++;
+  CallbackID ID{NextCallbackID++};
   EraseInstrCallbacks[ID] = CB;
   return ID;
 }
@@ -700,7 +700,7 @@ Context::CallbackID
 Context::registerCreateInstrCallback(CreateInstrCallback CB) {
   assert(CreateInstrCallbacks.size() <= MaxRegisteredCallbacks &&
          "CreateInstrCallbacks size limit exceeded");
-  CallbackID ID = NextCallbackID++;
+  CallbackID ID{NextCallbackID++};
   CreateInstrCallbacks[ID] = CB;
   return ID;
 }
@@ -713,7 +713,7 @@ void Context::unregisterCreateInstrCallback(CallbackID ID) {
 Context::CallbackID Context::registerMoveInstrCallback(MoveInstrCallback CB) {
   assert(MoveInstrCallbacks.size() <= MaxRegisteredCallbacks &&
          "MoveInstrCallbacks size limit exceeded");
-  CallbackID ID = NextCallbackID++;
+  CallbackID ID{NextCallbackID++};
   MoveInstrCallbacks[ID] = CB;
   return ID;
 }

From 01a7d4e26b9bac27e282b113209f53c4c1d290b2 Mon Sep 17 00:00:00 2001
From: Jakub Chlanda
Date: Fri, 10 Jan 2025 07:49:11 +0100
Subject: [PATCH 021/408] [AMDGPU] Allow selection of BITOP3 for some 2
 opcodes and B32 cases (#122267)

This came up in downstream static analysis as dead code. Admittedly, it
depends on what the intention was when checking for
[`if (NumOpcodes == 2 && IsB32)`](https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp#L3792C3-L3792C32)
and I took a guess that for certain cases the selection should take
place. If that's incorrect, that whole if statement can be removed, as it
is after a check for:
[`if (NumOpcodes < 4)`](https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp#L3788)
---
 .../AMDGPU/AMDGPUInstructionSelector.cpp      | 13 +++++-----
 llvm/test/CodeGen/AMDGPU/bitop3.ll            | 26 ++++++++-----------
 2 files changed, 17 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 041b9b4d66f63..1e654b260cbfa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3782,13 +3782,7 @@ bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
   if (NumOpcodes < 2 || Src.empty())
     return false;

-  // For a uniform case threshold should be higher to account for moves between
-  // VGPRs and SGPRs.
It needs one operand in a VGPR, rest two can be in SGPRs - // and a readtfirstlane after. - if (NumOpcodes < 4) - return false; - - bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32); + const bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32); if (NumOpcodes == 2 && IsB32) { // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes // asm more readable. This cannot be modeled with AddedComplexity because @@ -3797,6 +3791,11 @@ bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const { mi_match(MI, *MRI, m_GOr(m_GOr(m_Reg(), m_Reg()), m_Reg())) || mi_match(MI, *MRI, m_GOr(m_GAnd(m_Reg(), m_Reg()), m_Reg()))) return false; + } else if (NumOpcodes < 4) { + // For a uniform case threshold should be higher to account for moves + // between VGPRs and SGPRs. It needs one operand in a VGPR, rest two can be + // in SGPRs and a readtfirstlane after. + return false; } unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64; diff --git a/llvm/test/CodeGen/AMDGPU/bitop3.ll b/llvm/test/CodeGen/AMDGPU/bitop3.ll index b08ab5a2dc422..eb149a93ee328 100644 --- a/llvm/test/CodeGen/AMDGPU/bitop3.ll +++ b/llvm/test/CodeGen/AMDGPU/bitop3.ll @@ -52,8 +52,7 @@ define amdgpu_ps float @not_and_and_and(i32 %a, i32 %b, i32 %c) { ; ; GFX950-GISEL-LABEL: not_and_and_and: ; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: v_not_b32_e32 v0, v0 -; GFX950-GISEL-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX950-GISEL-NEXT: v_bitop3_b32 v0, v0, v2, v0 bitop3:0xc ; GFX950-GISEL-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX950-GISEL-NEXT: ; return to shader part epilog %nota = xor i32 %a, -1 @@ -103,8 +102,7 @@ define amdgpu_ps float @and_and_not_and(i32 %a, i32 %b, i32 %c) { ; ; GFX950-GISEL-LABEL: and_and_not_and: ; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: v_not_b32_e32 v2, v2 -; GFX950-GISEL-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX950-GISEL-NEXT: v_bitop3_b32 v0, v0, v2, v0 bitop3:0x30 ; GFX950-GISEL-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX950-GISEL-NEXT: ; return to shader part epilog %notc = xor i32 %c, -1 @@ -122,8 +120,7 @@ define amdgpu_ps float @and_and_and(i32 %a, i32 %b, i32 %c) { ; ; GFX950-GISEL-LABEL: and_and_and: ; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX950-GISEL-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX950-GISEL-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x80 ; GFX950-GISEL-NEXT: ; return to shader part epilog %and1 = and i32 %a, %c %and2 = and i32 %and1, %b @@ -141,8 +138,7 @@ define amdgpu_ps float @test_12(i32 %a, i32 %b) { ; ; GFX950-GISEL-LABEL: test_12: ; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: v_not_b32_e32 v0, v0 -; GFX950-GISEL-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX950-GISEL-NEXT: v_bitop3_b32 v0, v0, v1, v0 bitop3:0xc ; GFX950-GISEL-NEXT: ; return to shader part epilog %nota = xor i32 %a, -1 %and1 = and i32 %nota, %b @@ -214,9 +210,11 @@ define amdgpu_ps float @test_12_src_overflow(i32 %a, i32 %b, i32 %c) { ; ; GFX950-GISEL-LABEL: test_12_src_overflow: ; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: v_not_b32_e32 v0, v0 -; GFX950-GISEL-NEXT: v_bfi_b32 v0, v2, v0, v0 -; GFX950-GISEL-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX950-GISEL-NEXT: v_not_b32_e32 v3, v0 +; GFX950-GISEL-NEXT: v_not_b32_e32 v4, v2 +; GFX950-GISEL-NEXT: v_bitop3_b32 v0, v0, v2, v0 bitop3:0xc +; GFX950-GISEL-NEXT: v_and_b32_e32 v2, v3, v4 +; GFX950-GISEL-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0xc8 ; GFX950-GISEL-NEXT: ; return to shader part epilog %nota = xor i32 %a, -1 %notc = xor i32 %c, -1 @@ -242,11 +240,9 @@ define amdgpu_ps float @test_100_src_overflow(i32 %a, i32 %b, 
i32 %c) { ; ; GFX950-GISEL-LABEL: test_100_src_overflow: ; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: v_or_b32_e32 v3, v2, v0 -; GFX950-GISEL-NEXT: v_not_b32_e32 v3, v3 -; GFX950-GISEL-NEXT: v_not_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: v_bitop3_b32 v3, v2, v0, v2 bitop3:3 ; GFX950-GISEL-NEXT: v_and_b32_e32 v3, v1, v3 -; GFX950-GISEL-NEXT: v_and_b32_e32 v4, v0, v4 +; GFX950-GISEL-NEXT: v_bitop3_b32 v4, v0, v1, v0 bitop3:0x30 ; GFX950-GISEL-NEXT: v_and_b32_e32 v0, v1, v0 ; GFX950-GISEL-NEXT: v_not_b32_e32 v1, v2 ; GFX950-GISEL-NEXT: v_and_b32_e32 v4, v4, v2 From 089555095b91d693ab68d039cb5fda4b7b8e45bc Mon Sep 17 00:00:00 2001 From: Akshat Oke Date: Fri, 10 Jan 2025 12:23:07 +0530 Subject: [PATCH 022/408] =?UTF-8?q?Revert=20"Spiller:=20Detach=20legacy=20?= =?UTF-8?q?pass=20and=20supply=20analyses=20instead=20(#119=E2=80=A6=20(#1?= =?UTF-8?q?22426)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …181)" This reverts commit a531800344dc54e9c197a13b22e013f919f3f5e1. --- llvm/include/llvm/CodeGen/Spiller.h | 16 ++----------- llvm/lib/CodeGen/InlineSpiller.cpp | 36 ++++++++++++++++++----------- llvm/lib/CodeGen/RegAllocBasic.cpp | 16 ++++--------- llvm/lib/CodeGen/RegAllocGreedy.cpp | 4 +--- llvm/lib/CodeGen/RegAllocPBQP.cpp | 5 +--- 5 files changed, 31 insertions(+), 46 deletions(-) diff --git a/llvm/include/llvm/CodeGen/Spiller.h b/llvm/include/llvm/CodeGen/Spiller.h index 3132cefeb6c68..51ad36bc6b1f8 100644 --- a/llvm/include/llvm/CodeGen/Spiller.h +++ b/llvm/include/llvm/CodeGen/Spiller.h @@ -19,10 +19,6 @@ class MachineFunction; class MachineFunctionPass; class VirtRegMap; class VirtRegAuxInfo; -class LiveIntervals; -class LiveStacks; -class MachineDominatorTree; -class MachineBlockFrequencyInfo; /// Spiller interface. /// @@ -45,20 +41,12 @@ class Spiller { virtual ArrayRef getReplacedRegs() = 0; virtual void postOptimization() {} - - struct RequiredAnalyses { - LiveIntervals &LIS; - LiveStacks &LSS; - MachineDominatorTree &MDT; - const MachineBlockFrequencyInfo &MBFI; - }; }; /// Create and return a spiller that will insert spill code directly instead /// of deferring though VirtRegMap. 
-Spiller *createInlineSpiller(const Spiller::RequiredAnalyses &Analyses, - MachineFunction &MF, VirtRegMap &VRM, - VirtRegAuxInfo &VRAI); +Spiller *createInlineSpiller(MachineFunctionPass &Pass, MachineFunction &MF, + VirtRegMap &VRM, VirtRegAuxInfo &VRAI); } // end namespace llvm diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp index b9768d5c63a5d..64f290f5930a1 100644 --- a/llvm/lib/CodeGen/InlineSpiller.cpp +++ b/llvm/lib/CodeGen/InlineSpiller.cpp @@ -75,6 +75,7 @@ RestrictStatepointRemat("restrict-statepoint-remat", cl::desc("Restrict remat for statepoint operands")); namespace { + class HoistSpillHelper : private LiveRangeEdit::Delegate { MachineFunction &MF; LiveIntervals &LIS; @@ -127,11 +128,15 @@ class HoistSpillHelper : private LiveRangeEdit::Delegate { DenseMap &SpillsToIns); public: - HoistSpillHelper(const Spiller::RequiredAnalyses &Analyses, - MachineFunction &mf, VirtRegMap &vrm) - : MF(mf), LIS(Analyses.LIS), LSS(Analyses.LSS), MDT(Analyses.MDT), + HoistSpillHelper(MachineFunctionPass &pass, MachineFunction &mf, + VirtRegMap &vrm) + : MF(mf), LIS(pass.getAnalysis().getLIS()), + LSS(pass.getAnalysis().getLS()), + MDT(pass.getAnalysis().getDomTree()), VRM(vrm), MRI(mf.getRegInfo()), TII(*mf.getSubtarget().getInstrInfo()), - TRI(*mf.getSubtarget().getRegisterInfo()), MBFI(Analyses.MBFI), + TRI(*mf.getSubtarget().getRegisterInfo()), + MBFI( + pass.getAnalysis().getMBFI()), IPA(LIS, mf.getNumBlockIDs()) {} void addToMergeableSpills(MachineInstr &Spill, int StackSlot, @@ -185,12 +190,16 @@ class InlineSpiller : public Spiller { ~InlineSpiller() override = default; public: - InlineSpiller(const Spiller::RequiredAnalyses &Analyses, MachineFunction &MF, - VirtRegMap &VRM, VirtRegAuxInfo &VRAI) - : MF(MF), LIS(Analyses.LIS), LSS(Analyses.LSS), MDT(Analyses.MDT), + InlineSpiller(MachineFunctionPass &Pass, MachineFunction &MF, VirtRegMap &VRM, + VirtRegAuxInfo &VRAI) + : MF(MF), LIS(Pass.getAnalysis().getLIS()), + LSS(Pass.getAnalysis().getLS()), + MDT(Pass.getAnalysis().getDomTree()), VRM(VRM), MRI(MF.getRegInfo()), TII(*MF.getSubtarget().getInstrInfo()), - TRI(*MF.getSubtarget().getRegisterInfo()), MBFI(Analyses.MBFI), - HSpiller(Analyses, MF, VRM), VRAI(VRAI) {} + TRI(*MF.getSubtarget().getRegisterInfo()), + MBFI( + Pass.getAnalysis().getMBFI()), + HSpiller(Pass, MF, VRM), VRAI(VRAI) {} void spill(LiveRangeEdit &) override; ArrayRef getSpilledRegs() override { return RegsToSpill; } @@ -228,11 +237,10 @@ Spiller::~Spiller() = default; void Spiller::anchor() {} -Spiller * -llvm::createInlineSpiller(const InlineSpiller::RequiredAnalyses &Analyses, - MachineFunction &MF, VirtRegMap &VRM, - VirtRegAuxInfo &VRAI) { - return new InlineSpiller(Analyses, MF, VRM, VRAI); +Spiller *llvm::createInlineSpiller(MachineFunctionPass &Pass, + MachineFunction &MF, VirtRegMap &VRM, + VirtRegAuxInfo &VRAI) { + return new InlineSpiller(Pass, MF, VRM, VRAI); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/CodeGen/RegAllocBasic.cpp b/llvm/lib/CodeGen/RegAllocBasic.cpp index f3f34f890be11..c05aa1e40e477 100644 --- a/llvm/lib/CodeGen/RegAllocBasic.cpp +++ b/llvm/lib/CodeGen/RegAllocBasic.cpp @@ -22,7 +22,6 @@ #include "llvm/CodeGen/LiveRegMatrix.h" #include "llvm/CodeGen/LiveStacks.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" -#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/Passes.h" @@ -188,7 +187,6 
@@ void RABasic::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.addRequired(); AU.addPreserved(); - AU.addRequired(); AU.addRequiredID(MachineDominatorsID); AU.addPreservedID(MachineDominatorsID); AU.addRequired(); @@ -312,20 +310,16 @@ bool RABasic::runOnMachineFunction(MachineFunction &mf) { << "********** Function: " << mf.getName() << '\n'); MF = &mf; - auto &MBFI = getAnalysis().getMBFI(); - auto &LiveStks = getAnalysis().getLS(); - auto &MDT = getAnalysis().getDomTree(); - RegAllocBase::init(getAnalysis().getVRM(), getAnalysis().getLIS(), getAnalysis().getLRM()); - VirtRegAuxInfo VRAI(*MF, *LIS, *VRM, - getAnalysis().getLI(), MBFI, - &getAnalysis().getPSI()); + VirtRegAuxInfo VRAI( + *MF, *LIS, *VRM, getAnalysis().getLI(), + getAnalysis().getMBFI(), + &getAnalysis().getPSI()); VRAI.calculateSpillWeightsAndHints(); - SpillerInstance.reset( - createInlineSpiller({*LIS, LiveStks, MDT, MBFI}, *MF, *VRM, VRAI)); + SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM, VRAI)); allocatePhysRegs(); postOptimization(); diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index 66e9cf546b837..b94992c20b119 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -2750,7 +2750,6 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { Bundles = &getAnalysis().getEdgeBundles(); SpillPlacer = &getAnalysis().getResult(); DebugVars = &getAnalysis().getLDV(); - auto &LSS = getAnalysis().getLS(); initializeCSRCost(); @@ -2771,8 +2770,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { getAnalysis().getAdvisor(*MF, *this); VRAI = std::make_unique(*MF, *LIS, *VRM, *Loops, *MBFI); - SpillerInstance.reset( - createInlineSpiller({*LIS, LSS, *DomTree, *MBFI}, *MF, *VRM, *VRAI)); + SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM, *VRAI)); VRAI->calculateSpillWeightsAndHints(); diff --git a/llvm/lib/CodeGen/RegAllocPBQP.cpp b/llvm/lib/CodeGen/RegAllocPBQP.cpp index e230a1be95c9f..696c312e4ba00 100644 --- a/llvm/lib/CodeGen/RegAllocPBQP.cpp +++ b/llvm/lib/CodeGen/RegAllocPBQP.cpp @@ -794,9 +794,6 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) { MachineBlockFrequencyInfo &MBFI = getAnalysis().getMBFI(); - auto &LiveStks = getAnalysis().getLS(); - auto &MDT = getAnalysis().getDomTree(); - VirtRegMap &VRM = getAnalysis().getVRM(); PBQPVirtRegAuxInfo VRAI( @@ -810,7 +807,7 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) { VirtRegAuxInfo DefaultVRAI( MF, LIS, VRM, getAnalysis().getLI(), MBFI); std::unique_ptr VRegSpiller( - createInlineSpiller({LIS, LiveStks, MDT, MBFI}, MF, VRM, DefaultVRAI)); + createInlineSpiller(*this, MF, VRM, DefaultVRAI)); MF.getRegInfo().freezeReservedRegs(); From 99d2ff54abb89b0aabe085c87c8064a7ab0f2872 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Fri, 10 Jan 2025 17:51:31 +1100 Subject: [PATCH 023/408] [ORC-RT] Use llvm-jitlink -num-threads=0 for objc-imageinfo.S tests. These testcases depend on debugging output, which isn't stable under concurrent linking. 
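As an illustrative sketch of the pattern applied below (the extra object
file name here is hypothetical, not taken from these tests), pinning the
link to a single thread keeps the -debug-only=orc output in a
deterministic order:

  // RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o \
  // RUN:   %t/extra.o 2>&1 \
  // RUN:   | FileCheck %s

With multiple linker threads, debug lines from concurrently linked objects
can interleave, which breaks FileCheck's ordering assumptions.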
--- .../TestCases/Darwin/arm64/objc-imageinfo.S | 40 ++++++++++++++----- .../TestCases/Darwin/x86-64/objc-imageinfo.S | 40 ++++++++++++++----- 2 files changed, 60 insertions(+), 20 deletions(-) diff --git a/compiler-rt/test/orc/TestCases/Darwin/arm64/objc-imageinfo.S b/compiler-rt/test/orc/TestCases/Darwin/arm64/objc-imageinfo.S index 2ee7d3f5eac17..78454e33f7356 100644 --- a/compiler-rt/test/orc/TestCases/Darwin/arm64/objc-imageinfo.S +++ b/compiler-rt/test/orc/TestCases/Darwin/arm64/objc-imageinfo.S @@ -9,19 +9,29 @@ // Check individual versions are loadable. -// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/objc_old.o 2>&1 | FileCheck %s -check-prefix=OLD +// RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o \ +// RUN: %t/objc_old.o 2>&1 \ +// RUN: | FileCheck %s -check-prefix=OLD // OLD: MachOPlatform: Registered __objc_imageinfo for main // OLD-SAME: flags = 0x0000 -// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/objc_new.o 2>&1 | FileCheck %s -check-prefix=NEW +// RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o \ +// RUN: %t/objc_new.o 2>&1 \ +// RUN: | FileCheck %s -check-prefix=NEW // NEW: MachOPlatform: Registered __objc_imageinfo for main // NEW-SAME: flags = 0x0040 -// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/swift_4.o 2>&1 | FileCheck %s -check-prefix=SWIFT_4 +// RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o \ +// RUN: %t/swift_4.o 2>&1 \ +// RUN: | FileCheck %s -check-prefix=SWIFT_4 // SWIFT_4: MachOPlatform: Registered __objc_imageinfo for main // SWIFT_4-SAME: flags = 0x0640 -// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/swift_5.o 2>&1 | FileCheck %s -check-prefix=SWIFT_5 +// RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o \ +// RUN: %t/swift_5.o 2>&1 \ +// RUN: | FileCheck %s -check-prefix=SWIFT_5 // SWIFT_5: MachOPlatform: Registered __objc_imageinfo for main // SWIFT_5-SAME: flags = 0x5000740 -// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/swift_59.o 2>&1 | FileCheck %s -check-prefix=SWIFT_59 +// RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o \ +// RUN: %t/swift_59.o 2>&1 \ +// RUN: | FileCheck %s -check-prefix=SWIFT_59 // SWIFT_59: MachOPlatform: Registered __objc_imageinfo for main // SWIFT_59-SAME: flags = 0x5090740 @@ -33,23 +43,33 @@ // Check merging. // Take the lowest swift version. -// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/swift_59.o %t/swift_5.o 2>&1 | FileCheck %s -check-prefix=SWIFT_MIX1 +// RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o %t/swift_59.o \ +// RUN: %t/swift_5.o 2>&1 \ +// RUN: | FileCheck %s -check-prefix=SWIFT_MIX1 // SWIFT_MIX1: MachOPlatform: Merging __objc_imageinfo flags for main {{.*}} -> 0x5000740 // Add swift to objc. -// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/swift_59.o %t/objc_new.o 2>&1 | FileCheck %s -check-prefix=SWIFT_MIX2 +// RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o %t/swift_59.o \ +// RUN: %t/objc_new.o 2>&1 \ +// RUN: | FileCheck %s -check-prefix=SWIFT_MIX2 // SWIFT_MIX2: MachOPlatform: Merging __objc_imageinfo flags for main {{.*}} -> 0x5090740 // Add multiple swift to objc. -// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/swift_59.o %t/swift_5.o %t/objc_new.o 2>&1 | FileCheck %s -check-prefix=SWIFT_MIX3 +// RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o %t/swift_59.o \ +// RUN: %t/swift_5.o %t/objc_new.o 2>&1 \ +// RUN: | FileCheck %s -check-prefix=SWIFT_MIX3 // SWIFT_MIX3: MachOPlatform: Merging __objc_imageinfo flags for main {{.*}} -> 0x5000740 // Disable categories. 
-// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/objc_old.o %t/objc_new.o 2>&1 | FileCheck %s -check-prefix=SWIFT_MIX4
+// RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o %t/objc_old.o \
+// RUN: %t/objc_new.o 2>&1 \
+// RUN: | FileCheck %s -check-prefix=SWIFT_MIX4
 // SWIFT_MIX4: MachOPlatform: Merging __objc_imageinfo flags for main {{.*}} -> 0x0000

 // Disable signed class_ro.
-// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/objc_new.o %t/objc_new_signed_ro.o 2>&1 | FileCheck %s -check-prefix=SWIFT_MIX5
+// RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o %t/objc_new.o \
+// RUN: %t/objc_new_signed_ro.o 2>&1 \
+// RUN: | FileCheck %s -check-prefix=SWIFT_MIX5
 // SWIFT_MIX5: MachOPlatform: Merging __objc_imageinfo flags for main {{.*}} -> 0x0040

 //--- main.S
diff --git a/compiler-rt/test/orc/TestCases/Darwin/x86-64/objc-imageinfo.S b/compiler-rt/test/orc/TestCases/Darwin/x86-64/objc-imageinfo.S
index d4e9b4b05fb88..2d0d8d8c19af4 100644
--- a/compiler-rt/test/orc/TestCases/Darwin/x86-64/objc-imageinfo.S
+++ b/compiler-rt/test/orc/TestCases/Darwin/x86-64/objc-imageinfo.S
@@ -9,19 +9,29 @@

 // Check individual versions are loadable.

-// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/objc_old.o 2>&1 | FileCheck %s -check-prefix=OLD
+// RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o \
+// RUN: %t/objc_old.o 2>&1 \
+// RUN: | FileCheck %s -check-prefix=OLD
 // OLD: MachOPlatform: Registered __objc_imageinfo for main
 // OLD-SAME: flags = 0x0000
-// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/objc_new.o 2>&1 | FileCheck %s -check-prefix=NEW
+// RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o \
+// RUN: %t/objc_new.o 2>&1 \
+// RUN: | FileCheck %s -check-prefix=NEW
 // NEW: MachOPlatform: Registered __objc_imageinfo for main
 // NEW-SAME: flags = 0x0040
-// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/swift_4.o 2>&1 | FileCheck %s -check-prefix=SWIFT_4
+// RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o \
+// RUN: %t/swift_4.o 2>&1 \
+// RUN: | FileCheck %s -check-prefix=SWIFT_4
 // SWIFT_4: MachOPlatform: Registered __objc_imageinfo for main
 // SWIFT_4-SAME: flags = 0x0640
-// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/swift_5.o 2>&1 | FileCheck %s -check-prefix=SWIFT_5
+// RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o \
+// RUN: %t/swift_5.o 2>&1 \
+// RUN: | FileCheck %s -check-prefix=SWIFT_5
 // SWIFT_5: MachOPlatform: Registered __objc_imageinfo for main
 // SWIFT_5-SAME: flags = 0x5000740
-// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/swift_59.o 2>&1 | FileCheck %s -check-prefix=SWIFT_59
+// RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o \
+// RUN: %t/swift_59.o 2>&1 \
+// RUN: | FileCheck %s -check-prefix=SWIFT_59
 // SWIFT_59: MachOPlatform: Registered __objc_imageinfo for main
 // SWIFT_59-SAME: flags = 0x5090740

@@ -33,23 +43,33 @@

 // Check merging.

 // Take the lowest swift version.
-// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/swift_59.o %t/swift_5.o 2>&1 | FileCheck %s -check-prefix=SWIFT_MIX1
+// RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o %t/swift_59.o \
+// RUN: %t/swift_5.o 2>&1 \
+// RUN: | FileCheck %s -check-prefix=SWIFT_MIX1
 // SWIFT_MIX1: MachOPlatform: Merging __objc_imageinfo flags for main {{.*}} -> 0x5000740

 // Add swift to objc.
-// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/swift_59.o %t/objc_new.o 2>&1 | FileCheck %s -check-prefix=SWIFT_MIX2
+// RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o %t/swift_59.o \
+// RUN: %t/objc_new.o 2>&1 \
+// RUN: | FileCheck %s -check-prefix=SWIFT_MIX2
 // SWIFT_MIX2: MachOPlatform: Merging __objc_imageinfo flags for main {{.*}} -> 0x5090740

 // Add multiple swift to objc.
-// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/swift_59.o %t/swift_5.o %t/objc_new.o 2>&1 | FileCheck %s -check-prefix=SWIFT_MIX3
+// RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o %t/swift_59.o \
+// RUN: %t/swift_5.o %t/objc_new.o 2>&1 \
+// RUN: | FileCheck %s -check-prefix=SWIFT_MIX3
 // SWIFT_MIX3: MachOPlatform: Merging __objc_imageinfo flags for main {{.*}} -> 0x5000740

 // Disable categories.
-// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/objc_old.o %t/objc_new.o 2>&1 | FileCheck %s -check-prefix=SWIFT_MIX4
+// RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o %t/objc_old.o \
+// RUN: %t/objc_new.o 2>&1 \
+// RUN: | FileCheck %s -check-prefix=SWIFT_MIX4
 // SWIFT_MIX4: MachOPlatform: Merging __objc_imageinfo flags for main {{.*}} -> 0x0000

 // Disable signed class_ro.
-// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/objc_new.o %t/objc_new_signed_ro.o 2>&1 | FileCheck %s -check-prefix=SWIFT_MIX5
+// RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o %t/objc_new.o \
+// RUN: %t/objc_new_signed_ro.o 2>&1 \
+// RUN: | FileCheck %s -check-prefix=SWIFT_MIX5
 // SWIFT_MIX5: MachOPlatform: Merging __objc_imageinfo flags for main {{.*}} -> 0x0040

 //--- main.S

From e8cc4d24bce8e12023c460ff7f11495cb42d5315 Mon Sep 17 00:00:00 2001
From: Lang Hames
Date: Fri, 10 Jan 2025 17:56:31 +1100
Subject: [PATCH 024/408] [ORC][MachO] Fix deferred action handling during
 MachOPlatform bootstrap.

DeferredAAs should only capture bootstrap actions, but after 30b73ed7bd
it was capturing all actions, including those from other plugins. This
is problematic as other plugins may introduce actions that need to run
before the platform actions (e.g. on arm64e we need pointer signing to
run before we access any global pointers in the graph).

Note that this effectively undoes 30b73ed7bd, which was a buggy attempt
to synchronize writes to the DeferredAAs vector. This patch fixes that
issue the obvious way by locking the bootstrap mutex while accessing the
DeferredAAs vector.

No testcase yet: So far I've only seen this fail during bootstrap of
arm64e JIT'd programs.
---
 .../llvm/ExecutionEngine/Orc/MachOPlatform.h  |  4 +-
 .../lib/ExecutionEngine/Orc/MachOPlatform.cpp | 76 +++++++++----------
 2 files changed, 37 insertions(+), 43 deletions(-)

diff --git a/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h b/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h
index 1f11d9f61f6a1..c9f7178ebcadb 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h
@@ -240,7 +240,6 @@ class MachOPlatform : public Platform {
   };
   using JITSymTabVector = SmallVector;

-  Error bootstrapPipelineStart(jitlink::LinkGraph &G);
   Error bootstrapPipelineRecordRuntimeFunctions(jitlink::LinkGraph &G);
   Error bootstrapPipelineEnd(jitlink::LinkGraph &G);

@@ -368,11 +367,10 @@ class MachOPlatform : public Platform {
   DenseMap RegisteredInitSymbols;

   std::mutex PlatformMutex;
+  BootstrapInfo *Bootstrap = nullptr;
   DenseMap<JITDylib *, ExecutorAddr> JITDylibToHeaderAddr;
   DenseMap<ExecutorAddr, JITDylib *> HeaderAddrToJITDylib;
   DenseMap<JITDylib *, uint64_t> JITDylibToPThreadKey;
-
-  std::atomic<BootstrapInfo *> Bootstrap;
 };

 // Generates a MachO header.
diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp index 9f324c7048c63..0013eddb1f2c9 100644 --- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp @@ -799,17 +799,21 @@ void MachOPlatform::MachOPlatformPlugin::modifyPassConfig( using namespace jitlink; - bool InBootstrapPhase = - &MR.getTargetJITDylib() == &MP.PlatformJD && MP.Bootstrap; + bool InBootstrapPhase = false; + + if (LLVM_UNLIKELY(&MR.getTargetJITDylib() == &MP.PlatformJD)) { + std::lock_guard Lock(MP.PlatformMutex); + if (MP.Bootstrap) { + InBootstrapPhase = true; + ++MP.Bootstrap->ActiveGraphs; + } + } // If we're in the bootstrap phase then increment the active graphs. - if (InBootstrapPhase) { - Config.PrePrunePasses.push_back( - [this](LinkGraph &G) { return bootstrapPipelineStart(G); }); + if (LLVM_UNLIKELY(InBootstrapPhase)) Config.PostAllocationPasses.push_back([this](LinkGraph &G) { return bootstrapPipelineRecordRuntimeFunctions(G); }); - } // --- Handle Initializers --- if (auto InitSymbol = MR.getInitializerSymbol()) { @@ -872,19 +876,11 @@ void MachOPlatform::MachOPlatformPlugin::modifyPassConfig( [this](LinkGraph &G) { return bootstrapPipelineEnd(G); }); } -Error MachOPlatform::MachOPlatformPlugin::bootstrapPipelineStart( - jitlink::LinkGraph &G) { - // Increment the active graphs count in BootstrapInfo. - std::lock_guard Lock(MP.Bootstrap.load()->Mutex); - ++MP.Bootstrap.load()->ActiveGraphs; - return Error::success(); -} - Error MachOPlatform::MachOPlatformPlugin:: bootstrapPipelineRecordRuntimeFunctions(jitlink::LinkGraph &G) { // Record bootstrap function names. std::pair RuntimeSymbols[] = { - {*MP.MachOHeaderStartSymbol, &MP.Bootstrap.load()->MachOHeaderAddr}, + {*MP.MachOHeaderStartSymbol, &MP.Bootstrap->MachOHeaderAddr}, {*MP.PlatformBootstrap.Name, &MP.PlatformBootstrap.Addr}, {*MP.PlatformShutdown.Name, &MP.PlatformShutdown.Addr}, {*MP.RegisterJITDylib.Name, &MP.RegisterJITDylib.Addr}, @@ -924,10 +920,8 @@ Error MachOPlatform::MachOPlatformPlugin:: // If this graph defines the macho header symbol then create the internal // mapping between it and PlatformJD. std::lock_guard Lock(MP.PlatformMutex); - MP.JITDylibToHeaderAddr[&MP.PlatformJD] = - MP.Bootstrap.load()->MachOHeaderAddr; - MP.HeaderAddrToJITDylib[MP.Bootstrap.load()->MachOHeaderAddr] = - &MP.PlatformJD; + MP.JITDylibToHeaderAddr[&MP.PlatformJD] = MP.Bootstrap->MachOHeaderAddr; + MP.HeaderAddrToJITDylib[MP.Bootstrap->MachOHeaderAddr] = &MP.PlatformJD; } return Error::success(); @@ -935,19 +929,13 @@ Error MachOPlatform::MachOPlatformPlugin:: Error MachOPlatform::MachOPlatformPlugin::bootstrapPipelineEnd( jitlink::LinkGraph &G) { - std::lock_guard Lock(MP.Bootstrap.load()->Mutex); - assert(MP.Bootstrap && "DeferredAAs reset before bootstrap completed"); - - // Transfer any allocation actions to DeferredAAs. - std::move(G.allocActions().begin(), G.allocActions().end(), - std::back_inserter(MP.Bootstrap.load()->DeferredAAs)); - G.allocActions().clear(); + std::lock_guard Lock(MP.Bootstrap->Mutex); - --MP.Bootstrap.load()->ActiveGraphs; + --MP.Bootstrap->ActiveGraphs; // Notify Bootstrap->CV while holding the mutex because the mutex is // also keeping Bootstrap->CV alive. 
- if (MP.Bootstrap.load()->ActiveGraphs == 0) - MP.Bootstrap.load()->CV.notify_all(); + if (MP.Bootstrap->ActiveGraphs == 0) + MP.Bootstrap->CV.notify_all(); return Error::success(); } @@ -1412,15 +1400,23 @@ Error MachOPlatform::MachOPlatformPlugin::registerObjectPlatformSections( assert(I->second && "Null header registered for JD"); HeaderAddr = I->second; } - G.allocActions().push_back( - {cantFail( - WrapperFunctionCall::Create( - MP.RegisterObjectPlatformSections.Addr, HeaderAddr, UnwindInfo, - MachOPlatformSecs)), - cantFail( - WrapperFunctionCall::Create( - MP.DeregisterObjectPlatformSections.Addr, HeaderAddr, - UnwindInfo, MachOPlatformSecs))}); + + AllocActionCallPair AllocActions = { + cantFail( + WrapperFunctionCall::Create( + MP.RegisterObjectPlatformSections.Addr, HeaderAddr, UnwindInfo, + MachOPlatformSecs)), + cantFail( + WrapperFunctionCall::Create( + MP.DeregisterObjectPlatformSections.Addr, HeaderAddr, + UnwindInfo, MachOPlatformSecs))}; + + if (LLVM_LIKELY(!InBootstrapPhase)) + G.allocActions().push_back(std::move(AllocActions)); + else { + std::lock_guard Lock(MP.Bootstrap->Mutex); + MP.Bootstrap->DeferredAAs.push_back(std::move(AllocActions)); + } } return Error::success(); @@ -1701,8 +1697,8 @@ Error MachOPlatform::MachOPlatformPlugin::addSymbolTableRegistration( // If we're in the bootstrap phase then just record these symbols in the // bootstrap object and then bail out -- registration will be attached to // the bootstrap graph. - std::lock_guard Lock(MP.Bootstrap.load()->Mutex); - auto &SymTab = MP.Bootstrap.load()->SymTab; + std::lock_guard Lock(MP.Bootstrap->Mutex); + auto &SymTab = MP.Bootstrap->SymTab; for (auto &[OriginalSymbol, NameSym] : JITSymTabInfo) SymTab.push_back({NameSym->getAddress(), OriginalSymbol->getAddress(), flagsForSymbol(*OriginalSymbol)}); From dd331082e706d833ec3cc897176cd2d3a622ce76 Mon Sep 17 00:00:00 2001 From: Arseniy Zaostrovnykh Date: Fri, 10 Jan 2025 08:21:00 +0100 Subject: [PATCH 025/408] [analyzer][NFC] Factor out SymbolManager::get<*> (#121781) Replace the family of `SymbolManager::get*Symbol(...)` member functions with a single generic `SymbolManager::get<*>` member function. 
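A sketch of the resulting call-site change (symbol class names as in the
diff below; the surrounding variables are illustrative only):

  // Before: one bespoke factory member function per symbol kind.
  const SymbolExtent *ES = SymMgr.getExtentSymbol(SR);
  const SymSymExpr *SS = SymMgr.getSymSymExpr(LHS, BO_Sub, RHS, Ty);

  // After: a single generic factory; the symbol class is the template
  // argument and the remaining arguments are forwarded to its ctor/Profile.
  const SymbolExtent *ES = SymMgr.acquire<SymbolExtent>(SR);
  const SymSymExpr *SS = SymMgr.acquire<SymSymExpr>(LHS, BO_Sub, RHS, Ty);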
--- .../Core/PathSensitive/SymbolManager.h | 74 ++++----- clang/lib/StaticAnalyzer/Core/MemRegion.cpp | 6 +- .../Core/RangeConstraintManager.cpp | 13 +- .../Core/RangedConstraintManager.cpp | 8 +- clang/lib/StaticAnalyzer/Core/SValBuilder.cpp | 20 +-- .../StaticAnalyzer/Core/SimpleSValBuilder.cpp | 10 +- .../lib/StaticAnalyzer/Core/SymbolManager.cpp | 155 ------------------ 7 files changed, 58 insertions(+), 228 deletions(-) diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h index c530dff495238..cbbea1b56bb40 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h @@ -113,10 +113,10 @@ class SymbolConjured : public SymbolData { void dumpToStream(raw_ostream &os) const override; - static void Profile(llvm::FoldingSetNodeID& profile, const Stmt *S, - QualType T, unsigned Count, const LocationContext *LCtx, + static void Profile(llvm::FoldingSetNodeID &profile, const Stmt *S, + const LocationContext *LCtx, QualType T, unsigned Count, const void *SymbolTag) { - profile.AddInteger((unsigned) SymbolConjuredKind); + profile.AddInteger((unsigned)SymbolConjuredKind); profile.AddPointer(S); profile.AddPointer(LCtx); profile.Add(T); @@ -125,7 +125,7 @@ class SymbolConjured : public SymbolData { } void Profile(llvm::FoldingSetNodeID& profile) override { - Profile(profile, S, T, Count, LCtx, SymbolTag); + Profile(profile, S, LCtx, T, Count, SymbolTag); } // Implement isa support. @@ -224,6 +224,8 @@ class SymbolMetadata : public SymbolData { const Stmt *S; QualType T; const LocationContext *LCtx; + /// Count can be used to differentiate regions corresponding to + /// different loop iterations, thus, making the symbol path-dependent. unsigned Count; const void *Tag; @@ -525,14 +527,18 @@ class SymbolManager { static bool canSymbolicate(QualType T); - /// Make a unique symbol for MemRegion R according to its kind. - const SymbolRegionValue* getRegionValueSymbol(const TypedValueRegion* R); + /// Create or retrieve a SymExpr of type \p SymExprT for the given arguments. + /// Use the arguments to check for an existing SymExpr and return it, + /// otherwise, create a new one and keep a pointer to it to avoid duplicates. + template + const SymExprT *acquire(Args &&...args); - const SymbolConjured* conjureSymbol(const Stmt *E, - const LocationContext *LCtx, - QualType T, + const SymbolConjured *conjureSymbol(const Stmt *E, + const LocationContext *LCtx, QualType T, unsigned VisitCount, - const void *SymbolTag = nullptr); + const void *SymbolTag = nullptr) { + return acquire(E, LCtx, T, VisitCount, SymbolTag); + } const SymbolConjured* conjureSymbol(const Expr *E, const LocationContext *LCtx, @@ -541,41 +547,6 @@ class SymbolManager { return conjureSymbol(E, LCtx, E->getType(), VisitCount, SymbolTag); } - const SymbolDerived *getDerivedSymbol(SymbolRef parentSymbol, - const TypedValueRegion *R); - - const SymbolExtent *getExtentSymbol(const SubRegion *R); - - /// Creates a metadata symbol associated with a specific region. - /// - /// VisitCount can be used to differentiate regions corresponding to - /// different loop iterations, thus, making the symbol path-dependent. 
- const SymbolMetadata *getMetadataSymbol(const MemRegion *R, const Stmt *S, - QualType T, - const LocationContext *LCtx, - unsigned VisitCount, - const void *SymbolTag = nullptr); - - const SymbolCast* getCastSymbol(const SymExpr *Operand, - QualType From, QualType To); - - const SymIntExpr *getSymIntExpr(const SymExpr *lhs, BinaryOperator::Opcode op, - APSIntPtr rhs, QualType t); - - const SymIntExpr *getSymIntExpr(const SymExpr &lhs, BinaryOperator::Opcode op, - APSIntPtr rhs, QualType t) { - return getSymIntExpr(&lhs, op, rhs, t); - } - - const IntSymExpr *getIntSymExpr(APSIntPtr lhs, BinaryOperator::Opcode op, - const SymExpr *rhs, QualType t); - - const SymSymExpr *getSymSymExpr(const SymExpr *lhs, BinaryOperator::Opcode op, - const SymExpr *rhs, QualType t); - - const UnarySymExpr *getUnarySymExpr(const SymExpr *operand, - UnaryOperator::Opcode op, QualType t); - QualType getType(const SymExpr *SE) const { return SE->getType(); } @@ -707,6 +678,19 @@ class SymbolVisitor { virtual bool VisitMemRegion(const MemRegion *) { return true; } }; +template +const T *SymbolManager::acquire(Args &&...args) { + llvm::FoldingSetNodeID profile; + T::Profile(profile, args...); + void *InsertPos; + SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos); + if (!SD) { + SD = Alloc.make(std::forward(args)...); + DataSet.InsertNode(SD, InsertPos); + } + return cast(SD); +} + } // namespace ento } // namespace clang diff --git a/clang/lib/StaticAnalyzer/Core/MemRegion.cpp b/clang/lib/StaticAnalyzer/Core/MemRegion.cpp index 559c80634c12e..2c5cd2cf7630f 100644 --- a/clang/lib/StaticAnalyzer/Core/MemRegion.cpp +++ b/clang/lib/StaticAnalyzer/Core/MemRegion.cpp @@ -811,7 +811,7 @@ DefinedOrUnknownSVal MemRegionManager::getStaticSize(const MemRegion *MR, switch (SR->getKind()) { case MemRegion::AllocaRegionKind: case MemRegion::SymbolicRegionKind: - return nonloc::SymbolVal(SymMgr.getExtentSymbol(SR)); + return nonloc::SymbolVal(SymMgr.acquire(SR)); case MemRegion::StringRegionKind: return SVB.makeIntVal( cast(SR)->getStringLiteral()->getByteLength() + 1, @@ -829,7 +829,7 @@ DefinedOrUnknownSVal MemRegionManager::getStaticSize(const MemRegion *MR, case MemRegion::ObjCStringRegionKind: { QualType Ty = cast(SR)->getDesugaredValueType(Ctx); if (isa(Ty)) - return nonloc::SymbolVal(SymMgr.getExtentSymbol(SR)); + return nonloc::SymbolVal(SymMgr.acquire(SR)); if (Ty->isIncompleteType()) return UnknownVal(); @@ -897,7 +897,7 @@ DefinedOrUnknownSVal MemRegionManager::getStaticSize(const MemRegion *MR, case MemRegion::BlockDataRegionKind: case MemRegion::BlockCodeRegionKind: case MemRegion::FunctionCodeRegionKind: - return nonloc::SymbolVal(SymMgr.getExtentSymbol(SR)); + return nonloc::SymbolVal(SymMgr.acquire(SR)); default: llvm_unreachable("Unhandled region"); } diff --git a/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp b/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp index c39fa81109c85..ab45e678bafd5 100644 --- a/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp +++ b/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp @@ -1471,7 +1471,7 @@ class SymbolicRangeInferrer return getRangeForNegatedExpr( [SSE, State = this->State]() -> SymbolRef { if (SSE->getOpcode() == BO_Sub) - return State->getSymbolManager().getSymSymExpr( + return State->getSymbolManager().acquire( SSE->getRHS(), BO_Sub, SSE->getLHS(), SSE->getType()); return nullptr; }, @@ -1481,8 +1481,8 @@ class SymbolicRangeInferrer std::optional getRangeForNegatedSym(SymbolRef Sym) { return getRangeForNegatedExpr( [Sym, 
                                 State = this->State]() {
-                                  return State->getSymbolManager().getUnarySymExpr(Sym, UO_Minus,
-                                                                                   Sym->getType());
+                                  return State->getSymbolManager().acquire<UnarySymExpr>(
+                                      Sym, UO_Minus, Sym->getType());
                                },
                                Sym->getType());
   }
@@ -1495,7 +1495,7 @@ class SymbolicRangeInferrer
     if (!IsCommutative)
       return std::nullopt;
 
-    SymbolRef Commuted = State->getSymbolManager().getSymSymExpr(
+    SymbolRef Commuted = State->getSymbolManager().acquire<SymSymExpr>(
         SSE->getRHS(), Op, SSE->getLHS(), SSE->getType());
     if (const RangeSet *Range = getConstraint(State, Commuted))
       return *Range;
@@ -1540,7 +1540,8 @@ class SymbolicRangeInferrer
       // Let's find an expression e.g. (x < y).
       BinaryOperatorKind QueriedOP = OperatorRelationsTable::getOpFromIndex(i);
-      const SymSymExpr *SymSym = SymMgr.getSymSymExpr(LHS, QueriedOP, RHS, T);
+      const SymSymExpr *SymSym =
+          SymMgr.acquire<SymSymExpr>(LHS, QueriedOP, RHS, T);
       const RangeSet *QueriedRangeSet = getConstraint(State, SymSym);
 
       // If ranges were not previously found,
@@ -1548,7 +1549,7 @@ class SymbolicRangeInferrer
       if (!QueriedRangeSet) {
         const BinaryOperatorKind ROP =
             BinaryOperator::reverseComparisonOp(QueriedOP);
-        SymSym = SymMgr.getSymSymExpr(RHS, ROP, LHS, T);
+        SymSym = SymMgr.acquire<SymSymExpr>(RHS, ROP, LHS, T);
         QueriedRangeSet = getConstraint(State, SymSym);
       }
 
diff --git a/clang/lib/StaticAnalyzer/Core/RangedConstraintManager.cpp b/clang/lib/StaticAnalyzer/Core/RangedConstraintManager.cpp
index 4bbe933be2129..94dcdaf327689 100644
--- a/clang/lib/StaticAnalyzer/Core/RangedConstraintManager.cpp
+++ b/clang/lib/StaticAnalyzer/Core/RangedConstraintManager.cpp
@@ -62,8 +62,8 @@ ProgramStateRef RangedConstraintManager::assumeSym(ProgramStateRef State,
       SymbolManager &SymMgr = getSymbolManager();
       QualType DiffTy = SymMgr.getContext().getPointerDiffType();
-      SymbolRef Subtraction =
-          SymMgr.getSymSymExpr(SSE->getRHS(), BO_Sub, SSE->getLHS(), DiffTy);
+      SymbolRef Subtraction = SymMgr.acquire<SymSymExpr>(
+          SSE->getRHS(), BO_Sub, SSE->getLHS(), DiffTy);
 
       const llvm::APSInt &Zero = getBasicVals().getValue(0, DiffTy);
       Op = BinaryOperator::reverseComparisonOp(Op);
@@ -76,8 +76,8 @@ ProgramStateRef RangedConstraintManager::assumeSym(ProgramStateRef State,
       SymbolManager &SymMgr = getSymbolManager();
       QualType ExprType = SSE->getType();
-      SymbolRef CanonicalEquality =
-          SymMgr.getSymSymExpr(SSE->getLHS(), BO_EQ, SSE->getRHS(), ExprType);
+      SymbolRef CanonicalEquality = SymMgr.acquire<SymSymExpr>(
+          SSE->getLHS(), BO_EQ, SSE->getRHS(), ExprType);
 
       bool WasEqual = SSE->getOpcode() == BO_EQ;
       bool IsExpectedEqual = WasEqual == Assumption;
diff --git a/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp b/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp
index 2b85580186381..4f45b24be86c1 100644
--- a/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp
+++ b/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp
@@ -79,7 +79,7 @@ nonloc::SymbolVal SValBuilder::makeNonLoc(const SymExpr *lhs,
                                           APSIntPtr rhs, QualType type) {
   assert(lhs);
   assert(!Loc::isLocType(type));
-  return nonloc::SymbolVal(SymMgr.getSymIntExpr(lhs, op, rhs, type));
+  return nonloc::SymbolVal(SymMgr.acquire<SymIntExpr>(lhs, op, rhs, type));
 }
 
 nonloc::SymbolVal SValBuilder::makeNonLoc(APSIntPtr lhs,
@@ -87,7 +87,7 @@ nonloc::SymbolVal SValBuilder::makeNonLoc(APSIntPtr lhs,
                                           const SymExpr *rhs, QualType type) {
   assert(rhs);
   assert(!Loc::isLocType(type));
-  return nonloc::SymbolVal(SymMgr.getIntSymExpr(lhs, op, rhs, type));
+  return nonloc::SymbolVal(SymMgr.acquire<IntSymExpr>(lhs, op, rhs, type));
 }
 
 nonloc::SymbolVal SValBuilder::makeNonLoc(const SymExpr *lhs,
@@ -95,14 +95,14 @@ nonloc::SymbolVal SValBuilder::makeNonLoc(const SymExpr *lhs,
                                           const SymExpr *rhs, QualType type) {
   assert(lhs && rhs);
   assert(!Loc::isLocType(type));
-  return nonloc::SymbolVal(SymMgr.getSymSymExpr(lhs, op, rhs, type));
+  return nonloc::SymbolVal(SymMgr.acquire<SymSymExpr>(lhs, op, rhs, type));
 }
 
 NonLoc SValBuilder::makeNonLoc(const SymExpr *operand, UnaryOperator::Opcode op,
                                QualType type) {
   assert(operand);
   assert(!Loc::isLocType(type));
-  return nonloc::SymbolVal(SymMgr.getUnarySymExpr(operand, op, type));
+  return nonloc::SymbolVal(SymMgr.acquire<UnarySymExpr>(operand, op, type));
 }
 
 nonloc::SymbolVal SValBuilder::makeNonLoc(const SymExpr *operand,
@@ -111,7 +111,7 @@ nonloc::SymbolVal SValBuilder::makeNonLoc(const SymExpr *operand,
   assert(!Loc::isLocType(toTy));
   if (fromTy == toTy)
     return nonloc::SymbolVal(operand);
-  return nonloc::SymbolVal(SymMgr.getCastSymbol(operand, fromTy, toTy));
+  return nonloc::SymbolVal(SymMgr.acquire<SymbolCast>(operand, fromTy, toTy));
 }
 
 SVal SValBuilder::convertToArrayIndex(SVal val) {
@@ -143,7 +143,7 @@ SValBuilder::getRegionValueSymbolVal(const TypedValueRegion *region) {
   if (!SymbolManager::canSymbolicate(T))
     return UnknownVal();
 
-  SymbolRef sym = SymMgr.getRegionValueSymbol(region);
+  SymbolRef sym = SymMgr.acquire<SymbolRegionValue>(region);
 
   if (Loc::isLocType(T))
     return loc::MemRegionVal(MemMgr.getSymbolicRegion(sym));
@@ -244,8 +244,8 @@ DefinedSVal SValBuilder::getMetadataSymbolVal(const void *symbolTag,
                                               unsigned count) {
   assert(SymbolManager::canSymbolicate(type) && "Invalid metadata symbol type");
 
-  SymbolRef sym =
-      SymMgr.getMetadataSymbol(region, expr, type, LCtx, count, symbolTag);
+  SymbolRef sym = SymMgr.acquire<SymbolMetadata>(region, expr, type, LCtx,
+                                                 count, symbolTag);
 
   if (Loc::isLocType(type))
     return loc::MemRegionVal(MemMgr.getSymbolicRegion(sym));
@@ -264,7 +264,7 @@ SValBuilder::getDerivedRegionValueSymbolVal(SymbolRef parentSymbol,
   if (!SymbolManager::canSymbolicate(T))
     return UnknownVal();
 
-  SymbolRef sym = SymMgr.getDerivedSymbol(parentSymbol, region);
+  SymbolRef sym = SymMgr.acquire<SymbolDerived>(parentSymbol, region);
 
   if (Loc::isLocType(T))
     return loc::MemRegionVal(MemMgr.getSymbolicRegion(sym));
@@ -724,7 +724,7 @@ class EvalCastVisitor : public SValVisitor<EvalCastVisitor, SVal> {
         // because there are no generic region address metadata
         // symbols to use, only content metadata.
         return nonloc::SymbolVal(
-            VB.getSymbolManager().getExtentSymbol(FTR));
+            VB.getSymbolManager().acquire<SymbolExtent>(FTR));
 
       if (const SymbolicRegion *SymR = R->getSymbolicBase()) {
         SymbolRef Sym = SymR->getSymbol();
diff --git a/clang/lib/StaticAnalyzer/Core/SimpleSValBuilder.cpp b/clang/lib/StaticAnalyzer/Core/SimpleSValBuilder.cpp
index 455621739f693..afb0273d23bd4 100644
--- a/clang/lib/StaticAnalyzer/Core/SimpleSValBuilder.cpp
+++ b/clang/lib/StaticAnalyzer/Core/SimpleSValBuilder.cpp
@@ -328,16 +328,16 @@ static NonLoc doRearrangeUnchecked(ProgramStateRef State,
       // FIXME: Maybe it'd be better to have consistency in
       // "$x - $y" vs. "$y - $x" because those are solver's keys.
       if (LInt > RInt) {
-        ResultSym = SymMgr.getSymSymExpr(RSym, BO_Sub, LSym, SymTy);
+        ResultSym = SymMgr.acquire<SymSymExpr>(RSym, BO_Sub, LSym, SymTy);
         ResultOp = BinaryOperator::reverseComparisonOp(Op);
         ResultInt = LInt - RInt; // Opposite order!
       } else {
-        ResultSym = SymMgr.getSymSymExpr(LSym, BO_Sub, RSym, SymTy);
+        ResultSym = SymMgr.acquire<SymSymExpr>(LSym, BO_Sub, RSym, SymTy);
         ResultOp = Op;
         ResultInt = RInt - LInt; // Opposite order!
       }
     } else {
-      ResultSym = SymMgr.getSymSymExpr(LSym, Op, RSym, SymTy);
+      ResultSym = SymMgr.acquire<SymSymExpr>(LSym, Op, RSym, SymTy);
       ResultInt = (Op == BO_Add) ? (LInt + RInt) : (LInt - RInt);
       ResultOp = BO_Add;
       // Bring back the cosmetic difference.
@@ -350,8 +350,8 @@ static NonLoc doRearrangeUnchecked(ProgramStateRef State,
     }
   }
   APSIntPtr PersistentResultInt = BV.getValue(ResultInt);
-  return nonloc::SymbolVal(
-      SymMgr.getSymIntExpr(ResultSym, ResultOp, PersistentResultInt, ResultTy));
+  return nonloc::SymbolVal(SymMgr.acquire<SymIntExpr>(
+      ResultSym, ResultOp, PersistentResultInt, ResultTy));
 }
 
 // Rearrange if symbol type matches the result type and if the operator is a
diff --git a/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp b/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp
index 738b6a175ce6d..a4648f5922ef1 100644
--- a/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp
+++ b/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp
@@ -163,161 +163,6 @@ void SymExpr::symbol_iterator::expand() {
   llvm_unreachable("unhandled expansion case");
 }
 
-const SymbolRegionValue*
-SymbolManager::getRegionValueSymbol(const TypedValueRegion* R) {
-  llvm::FoldingSetNodeID profile;
-  SymbolRegionValue::Profile(profile, R);
-  void *InsertPos;
-  SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos);
-  if (!SD) {
-    SD = Alloc.make<SymbolRegionValue>(R);
-    DataSet.InsertNode(SD, InsertPos);
-  }
-
-  return cast<SymbolRegionValue>(SD);
-}
-
-const SymbolConjured* SymbolManager::conjureSymbol(const Stmt *E,
-                                                   const LocationContext *LCtx,
-                                                   QualType T,
-                                                   unsigned Count,
-                                                   const void *SymbolTag) {
-  llvm::FoldingSetNodeID profile;
-  SymbolConjured::Profile(profile, E, T, Count, LCtx, SymbolTag);
-  void *InsertPos;
-  SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos);
-  if (!SD) {
-    SD = Alloc.make<SymbolConjured>(E, LCtx, T, Count, SymbolTag);
-    DataSet.InsertNode(SD, InsertPos);
-  }
-
-  return cast<SymbolConjured>(SD);
-}
-
-const SymbolDerived*
-SymbolManager::getDerivedSymbol(SymbolRef parentSymbol,
-                                const TypedValueRegion *R) {
-  llvm::FoldingSetNodeID profile;
-  SymbolDerived::Profile(profile, parentSymbol, R);
-  void *InsertPos;
-  SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos);
-  if (!SD) {
-    SD = Alloc.make<SymbolDerived>(parentSymbol, R);
-    DataSet.InsertNode(SD, InsertPos);
-  }
-
-  return cast<SymbolDerived>(SD);
-}
-
-const SymbolExtent*
-SymbolManager::getExtentSymbol(const SubRegion *R) {
-  llvm::FoldingSetNodeID profile;
-  SymbolExtent::Profile(profile, R);
-  void *InsertPos;
-  SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos);
-  if (!SD) {
-    SD = Alloc.make<SymbolExtent>(R);
-    DataSet.InsertNode(SD, InsertPos);
-  }
-
-  return cast<SymbolExtent>(SD);
-}
-
-const SymbolMetadata *
-SymbolManager::getMetadataSymbol(const MemRegion* R, const Stmt *S, QualType T,
-                                 const LocationContext *LCtx,
-                                 unsigned Count, const void *SymbolTag) {
-  llvm::FoldingSetNodeID profile;
-  SymbolMetadata::Profile(profile, R, S, T, LCtx, Count, SymbolTag);
-  void *InsertPos;
-  SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos);
-  if (!SD) {
-    SD = Alloc.make<SymbolMetadata>(R, S, T, LCtx, Count, SymbolTag);
-    DataSet.InsertNode(SD, InsertPos);
-  }
-
-  return cast<SymbolMetadata>(SD);
-}
-
-const SymbolCast*
-SymbolManager::getCastSymbol(const SymExpr *Op,
-                             QualType From, QualType To) {
-  llvm::FoldingSetNodeID ID;
-  SymbolCast::Profile(ID, Op, From, To);
-  void *InsertPos;
-  SymExpr *data = DataSet.FindNodeOrInsertPos(ID, InsertPos);
-  if (!data) {
-    data = Alloc.make<SymbolCast>(Op, From, To);
-    DataSet.InsertNode(data, InsertPos);
-  }
-
-  return cast<SymbolCast>(data);
-}
-
-const SymIntExpr *SymbolManager::getSymIntExpr(const SymExpr *lhs,
-                                               BinaryOperator::Opcode op,
-                                               APSIntPtr v, QualType t) {
-  llvm::FoldingSetNodeID ID;
-  SymIntExpr::Profile(ID, lhs, op, v, t);
-  void *InsertPos;
-  SymExpr *data = DataSet.FindNodeOrInsertPos(ID, InsertPos);
-
-  if (!data) {
-    data = Alloc.make<SymIntExpr>(lhs, op, v, t);
-    DataSet.InsertNode(data, InsertPos);
-  }
-
-  return cast<SymIntExpr>(data);
-}
-
-const IntSymExpr *SymbolManager::getIntSymExpr(APSIntPtr lhs,
-                                               BinaryOperator::Opcode op,
-                                               const SymExpr *rhs, QualType t) {
-  llvm::FoldingSetNodeID ID;
-  IntSymExpr::Profile(ID, lhs, op, rhs, t);
-  void *InsertPos;
-  SymExpr *data = DataSet.FindNodeOrInsertPos(ID, InsertPos);
-
-  if (!data) {
-    data = Alloc.make<IntSymExpr>(lhs, op, rhs, t);
-    DataSet.InsertNode(data, InsertPos);
-  }
-
-  return cast<IntSymExpr>(data);
-}
-
-const SymSymExpr *SymbolManager::getSymSymExpr(const SymExpr *lhs,
-                                               BinaryOperator::Opcode op,
-                                               const SymExpr *rhs,
-                                               QualType t) {
-  llvm::FoldingSetNodeID ID;
-  SymSymExpr::Profile(ID, lhs, op, rhs, t);
-  void *InsertPos;
-  SymExpr *data = DataSet.FindNodeOrInsertPos(ID, InsertPos);
-
-  if (!data) {
-    data = Alloc.make<SymSymExpr>(lhs, op, rhs, t);
-    DataSet.InsertNode(data, InsertPos);
-  }
-
-  return cast<SymSymExpr>(data);
-}
-
-const UnarySymExpr *SymbolManager::getUnarySymExpr(const SymExpr *Operand,
-                                                   UnaryOperator::Opcode Opc,
-                                                   QualType T) {
-  llvm::FoldingSetNodeID ID;
-  UnarySymExpr::Profile(ID, Operand, Opc, T);
-  void *InsertPos;
-  SymExpr *data = DataSet.FindNodeOrInsertPos(ID, InsertPos);
-  if (!data) {
-    data = Alloc.make<UnarySymExpr>(Operand, Opc, T);
-    DataSet.InsertNode(data, InsertPos);
-  }
-
-  return cast<UnarySymExpr>(data);
-}
-
 QualType SymbolConjured::getType() const {
   return T;
 }

From f3d6cdc5aebafac3961d4fccbd2ca0e302c6082c Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen
Date: Fri, 10 Jan 2025 09:05:39 +0800
Subject: [PATCH 026/408] [SLP] NFC. Replace MainOp and AltOp in TreeEntry with
 InstructionsState. (#120198)

Add TreeEntry::hasState.
Add assert for getTreeEntry.
Remove the OpValue parameter from the canReuseExtract function.
Remove the Opcode parameter from the ComputeMaxBitWidth lambda function.
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 210 +++++++++---------
 1 file changed, 107 insertions(+), 103 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index fd897b3f720be..7e5649c1db215 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2414,15 +2414,16 @@ class BoUpSLP {
   }
 
   /// Go through the instructions in VL and append their operands.
-  void appendOperandsOfVL(ArrayRef<Value *> VL, Instruction *VL0) {
+  void appendOperandsOfVL(ArrayRef<Value *> VL, const InstructionsState &S) {
     assert(!VL.empty() && "Bad VL");
     assert((empty() || VL.size() == getNumLanes()) &&
            "Expected same number of lanes");
     // IntrinsicInst::isCommutative returns true if swapping the first "two"
     // arguments to the intrinsic produces the same result.
     constexpr unsigned IntrinsicNumOperands = 2;
-    unsigned NumOperands = VL0->getNumOperands();
-    ArgSize = isa<IntrinsicInst>(VL0) ? IntrinsicNumOperands : NumOperands;
+    unsigned NumOperands = S.getMainOp()->getNumOperands();
+    ArgSize = isa<IntrinsicInst>(S.getMainOp()) ? IntrinsicNumOperands
                                                 : NumOperands;
     OpsVec.resize(NumOperands);
     unsigned NumLanes = VL.size();
     for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
@@ -2442,8 +2443,8 @@ class BoUpSLP {
       // tell the inverse operations by checking commutativity.
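[Editor's note] The SymbolManager change above collapses ten nearly identical
factory methods into a single variadic entry point. As a reading aid, here is a
minimal sketch of the interning pattern the removed getters shared -- member
names (DataSet, Alloc) as in the deleted code, everything else simplified:

    template <typename SymT, typename... Args>
    const SymT *SymbolManager::acquire(Args &&...args) {
      llvm::FoldingSetNodeID ID;
      SymT::Profile(ID, args...);        // hash the node's identity
      void *InsertPos;
      SymExpr *SD = DataSet.FindNodeOrInsertPos(ID, InsertPos);
      if (!SD) {                         // not interned yet: allocate once
        SD = Alloc.make<SymT>(std::forward<Args>(args)...);
        DataSet.InsertNode(SD, InsertPos);
      }
      return cast<SymT>(SD);
    }

Call sites then name the node type explicitly, e.g.
SymMgr.acquire<SymSymExpr>(LHS, Op, RHS, T), as the hunks above show.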
if (isa(VL[Lane])) { OpsVec[OpIdx][Lane] = { - PoisonValue::get(VL0->getOperand(OpIdx)->getType()), true, - false}; + PoisonValue::get(S.getMainOp()->getOperand(OpIdx)->getType()), + true, false}; continue; } bool IsInverseOperation = !isCommutative(cast(VL[Lane])); @@ -2555,11 +2556,12 @@ class BoUpSLP { public: /// Initialize with all the operands of the instruction vector \p RootVL. - VLOperands(ArrayRef RootVL, Instruction *VL0, const BoUpSLP &R) + VLOperands(ArrayRef RootVL, const InstructionsState &S, + const BoUpSLP &R) : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R), - L(R.LI->getLoopFor((VL0->getParent()))) { + L(R.LI->getLoopFor(S.getMainOp()->getParent())) { // Append all the operands of RootVL. - appendOperandsOfVL(RootVL, VL0); + appendOperandsOfVL(RootVL, S); } /// \Returns a value vector with the operands across all lanes for the @@ -3032,7 +3034,7 @@ class BoUpSLP { /// non-identity permutation that allows to reuse extract instructions. /// \param ResizeAllowed indicates whether it is allowed to handle subvector /// extract order. - bool canReuseExtract(ArrayRef VL, Value *OpValue, + bool canReuseExtract(ArrayRef VL, SmallVectorImpl &CurrentOrder, bool ResizeAllowed = false) const; @@ -3259,7 +3261,7 @@ class BoUpSLP { }; /// Checks if the current node is a gather node. - bool isGather() const {return State == NeedToGather; } + bool isGather() const { return State == NeedToGather; } /// A vector of scalars. ValueList Scalars; @@ -3323,9 +3325,9 @@ class BoUpSLP { /// reordering of operands during buildTree_rec() and vectorizeTree(). SmallVector Operands; - /// The main/alternate instruction. - Instruction *MainOp = nullptr; - Instruction *AltOp = nullptr; + /// MainOp and AltOp are recorded inside. S should be obtained from + /// newTreeEntry. + InstructionsState S = InstructionsState::invalid(); /// Interleaving factor for interleaved loads Vectorize nodes. unsigned InterleaveFactor = 0; @@ -3349,10 +3351,10 @@ class BoUpSLP { /// Set this bundle's operand from Scalars. void setOperand(const BoUpSLP &R, bool RequireReorder = false) { - VLOperands Ops(Scalars, MainOp, R); + VLOperands Ops(Scalars, S, R); if (RequireReorder) Ops.reorder(); - for (unsigned I : seq(MainOp->getNumOperands())) + for (unsigned I : seq(S.getMainOp()->getNumOperands())) setOperand(I, Ops.getVL(I)); } @@ -3385,13 +3387,9 @@ class BoUpSLP { } /// Some of the instructions in the list have alternate opcodes. - bool isAltShuffle() const { return MainOp != AltOp; } + bool isAltShuffle() const { return S.isAltShuffle(); } - bool isOpcodeOrAlt(Instruction *I) const { - unsigned CheckedOpcode = I->getOpcode(); - return (getOpcode() == CheckedOpcode || - getAltOpcode() == CheckedOpcode); - } + bool isOpcodeOrAlt(Instruction *I) const { return S.isOpcodeOrAlt(I); } /// Chooses the correct key for scheduling data. If \p Op has the same (or /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is @@ -3400,31 +3398,24 @@ class BoUpSLP { auto *I = dyn_cast(Op); if (I && isOpcodeOrAlt(I)) return Op; - return MainOp; + return S.getMainOp(); } void setOperations(const InstructionsState &S) { assert(S && "InstructionsState is invalid."); - MainOp = S.getMainOp(); - AltOp = S.getAltOp(); + this->S = S; } - Instruction *getMainOp() const { - return MainOp; - } + Instruction *getMainOp() const { return S.getMainOp(); } - Instruction *getAltOp() const { - return AltOp; - } + Instruction *getAltOp() const { return S.getAltOp(); } /// The main/alternate opcodes for the list of instructions. 
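[Editor's note] The TreeEntry side of this patch replaces the raw MainOp/AltOp
pointer pair with one InstructionsState value, and the accessors become thin
forwards. A condensed sketch of the resulting shape (assuming InstructionsState
exposes valid()/getMainOp()/getAltOp()/getOpcode(), as these hunks use):

    class TreeEntry {
      // Single source of truth; an invalid state means "no common opcode".
      InstructionsState S = InstructionsState::invalid();

    public:
      bool hasState() const { return S.valid(); }
      Instruction *getMainOp() const { return S.getMainOp(); }
      Instruction *getAltOp() const { return S.getAltOp(); }
      unsigned getOpcode() const { return S.getOpcode(); }
    };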
- unsigned getOpcode() const { - return MainOp ? MainOp->getOpcode() : 0; - } + unsigned getOpcode() const { return S.getOpcode(); } - unsigned getAltOpcode() const { - return AltOp ? AltOp->getOpcode() : 0; - } + unsigned getAltOpcode() const { return S.getAltOpcode(); } + + bool hasState() const { return S.valid(); } /// When ReuseReorderShuffleIndices is empty it just returns position of \p /// V within vector of Scalars. Otherwise, try to remap on its reuse index. @@ -3520,16 +3511,13 @@ class BoUpSLP { dbgs() << "CombinedVectorize\n"; break; } - dbgs() << "MainOp: "; - if (MainOp) - dbgs() << *MainOp << "\n"; - else - dbgs() << "NULL\n"; - dbgs() << "AltOp: "; - if (AltOp) - dbgs() << *AltOp << "\n"; - else - dbgs() << "NULL\n"; + if (S) { + dbgs() << "MainOp: " << *S.getMainOp() << "\n"; + dbgs() << "AltOp: " << *S.getAltOp() << "\n"; + } else { + dbgs() << "MainOp: NULL\n"; + dbgs() << "AltOp: NULL\n"; + } dbgs() << "VectorizedValue: "; if (VectorizedValue) dbgs() << *VectorizedValue << "\n"; @@ -3704,9 +3692,13 @@ class BoUpSLP { } #endif - TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); } + TreeEntry *getTreeEntry(Value *V) { + assert(V && "V cannot be nullptr."); + return ScalarToTreeEntry.lookup(V); + } const TreeEntry *getTreeEntry(Value *V) const { + assert(V && "V cannot be nullptr."); return ScalarToTreeEntry.lookup(V); } @@ -5587,7 +5579,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { // Try build correct order for extractelement instructions. SmallVector ReusedMask(TE.ReuseShuffleIndices.begin(), TE.ReuseShuffleIndices.end()); - if (TE.getOpcode() == Instruction::ExtractElement && + if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement && all_of(TE.Scalars, [Sz](Value *V) { if (isa(V)) return true; @@ -5749,10 +5741,11 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { return std::nullopt; // No need to reorder. return std::move(Phis); } - if (TE.isGather() && !TE.isAltShuffle() && allSameType(TE.Scalars)) { + if (TE.isGather() && (!TE.hasState() || !TE.isAltShuffle()) && + allSameType(TE.Scalars)) { // TODO: add analysis of other gather nodes with extractelement // instructions and other values/instructions, not only undefs. - if ((TE.getOpcode() == Instruction::ExtractElement || + if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) || (all_of(TE.Scalars, IsaPred) && any_of(TE.Scalars, IsaPred))) && all_of(TE.Scalars, [](Value *V) { @@ -5762,8 +5755,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { // Check that gather of extractelements can be represented as // just a shuffle of a single vector. OrdersType CurrentOrder; - bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder, - /*ResizeAllowed=*/true); + bool Reuse = + canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true); if (Reuse || !CurrentOrder.empty()) return std::move(CurrentOrder); } @@ -5812,7 +5805,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { return Order; // Check if can include the order of vectorized loads. For masked gathers do // extra analysis later, so include such nodes into a special list. 
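[Editor's note] Most of the remaining hunks in this patch are one mechanical
rewrite: a gather node may now carry no InstructionsState at all, so code that
previously leaned on opcode 0 as a "no state" sentinel gains an explicit
validity check first. Illustrative before/after, not a literal hunk:

    // before: opcode 0 doubled as "no common opcode"
    if (TE.getOpcode() == Instruction::Load) { /* ... */ }

    // after: validity is queried explicitly, then the opcode
    if (TE.hasState() && TE.getOpcode() == Instruction::Load) { /* ... */ }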
- if (TE.isGather() && TE.getOpcode() == Instruction::Load) { + if (TE.hasState() && TE.getOpcode() == Instruction::Load) { SmallVector PointerOps; OrdersType CurrentOrder; LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(), @@ -5927,7 +5920,7 @@ void BoUpSLP::reorderTopToBottom() { // Patterns like [fadd,fsub] can be combined into a single instruction in // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need // to take into account their order when looking for the most used order. - if (TE->isAltShuffle()) { + if (TE->hasState() && TE->isAltShuffle()) { VectorType *VecTy = getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size()); unsigned Opcode0 = TE->getOpcode(); @@ -6006,7 +5999,7 @@ void BoUpSLP::reorderTopToBottom() { if (It != GathersToOrders.end()) return It->second; } - if (OpTE->isAltShuffle()) { + if (OpTE->hasState() && OpTE->isAltShuffle()) { auto It = AltShufflesToOrders.find(OpTE); if (It != AltShufflesToOrders.end()) return It->second; @@ -7607,7 +7600,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( } case Instruction::ExtractValue: case Instruction::ExtractElement: { - bool Reuse = canReuseExtract(VL, VL0, CurrentOrder); + bool Reuse = canReuseExtract(VL, CurrentOrder); // FIXME: Vectorizing is not supported yet for non-power-of-2 ops. if (!has_single_bit(VL.size())) return TreeEntry::NeedToGather; @@ -8620,7 +8613,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, TE->dump()); ValueList Left, Right; - VLOperands Ops(VL, VL0, *this); + VLOperands Ops(VL, S, *this); if (cast(VL0)->isCommutative()) { // Commutative predicate - collect + sort operands of the instructions // so that each side is more likely to have the same opcode. @@ -8888,7 +8881,7 @@ unsigned BoUpSLP::canMapToVector(Type *T) const { return N; } -bool BoUpSLP::canReuseExtract(ArrayRef VL, Value *OpValue, +bool BoUpSLP::canReuseExtract(ArrayRef VL, SmallVectorImpl &CurrentOrder, bool ResizeAllowed) const { const auto *It = find_if(VL, IsaPred); @@ -9542,7 +9535,7 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) { // Do not reorder nodes if it small (just 2 elements), all-constant or all // instructions have same opcode already. - if (TE.Scalars.size() == 2 || (TE.getOpcode() && !TE.isAltShuffle()) || + if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) || all_of(TE.Scalars, isConstant)) return; @@ -9761,7 +9754,7 @@ void BoUpSLP::transformNodes() { // Do not try partial vectorization for small nodes (<= 2), nodes with the // same opcode and same parent block or all constants. 
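[Editor's note] For readers new to SLP's alternate-opcode nodes: the
[fadd,fsub] pattern mentioned in the reorderTopToBottom hunk above corresponds
to scalar code of roughly this shape (illustrative only):

    void addsub(float *a, const float *b) {
      a[0] = a[0] + b[0]; // even lane: fadd
      a[1] = a[1] - b[1]; // odd lane:  fsub
    }

On x86 the vectorized [fadd,fsub] lane pattern can lower to a single addsubps;
an order that flips it into [fsub,fadd] defeats that match, which is why the
operand order of such nodes gets special treatment there.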
if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) || - !(!E.getOpcode() || E.getOpcode() == Instruction::Load || + !(!E.hasState() || E.getOpcode() == Instruction::Load || E.isAltShuffle() || !allSameBlock(VL)) || allConstant(VL) || isSplat(VL)) continue; @@ -9904,6 +9897,8 @@ void BoUpSLP::transformNodes() { E.ReorderIndices.clear(); } } + if (!E.hasState()) + continue; switch (E.getOpcode()) { case Instruction::Load: { // No need to reorder masked gather loads, just reorder the scalar @@ -10023,7 +10018,7 @@ void BoUpSLP::transformNodes() { getCanonicalGraphSize() <= SmallTree && count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()), [](const std::unique_ptr &TE) { - return TE->isGather() && + return TE->isGather() && TE->hasState() && TE->getOpcode() == Instruction::Load && !allSameBlock(TE->Scalars); }) == 1) @@ -10039,13 +10034,13 @@ void BoUpSLP::transformNodes() { for (std::unique_ptr &TE : VectorizableTree) { TreeEntry &E = *TE; if (E.isGather() && - (E.getOpcode() == Instruction::Load || - (!E.getOpcode() && any_of(E.Scalars, - [&](Value *V) { - return isa(V) && - !isVectorized(V) && - !isDeleted(cast(V)); - }))) && + ((E.hasState() && E.getOpcode() == Instruction::Load) || + (!E.hasState() && any_of(E.Scalars, + [&](Value *V) { + return isa(V) && + !isVectorized(V) && + !isDeleted(cast(V)); + }))) && !isSplat(E.Scalars)) { for (Value *V : E.Scalars) { auto *LI = dyn_cast(V); @@ -10639,7 +10634,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { bool PrevNodeFound = any_of( ArrayRef(R.VectorizableTree).take_front(E->Idx), [&](const std::unique_ptr &TE) { - return ((!TE->isAltShuffle() && + return ((TE->hasState() && !TE->isAltShuffle() && TE->getOpcode() == Instruction::ExtractElement) || TE->isGather()) && all_of(enumerate(TE->Scalars), [&](auto &&Data) { @@ -11765,7 +11760,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, for (const std::unique_ptr &TE : VectorizableTree) { if (TE.get() == E) break; - if (TE->isAltShuffle() && + if (TE->hasState() && TE->isAltShuffle() && ((TE->getOpcode() == E->getOpcode() && TE->getAltOpcode() == E->getAltOpcode()) || (TE->getOpcode() == E->getAltOpcode() && @@ -11927,10 +11922,12 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const { [this](Value *V) { return EphValues.contains(V); }) && (allConstant(TE->Scalars) || isSplat(TE->Scalars) || TE->Scalars.size() < Limit || - ((TE->getOpcode() == Instruction::ExtractElement || + (((TE->hasState() && + TE->getOpcode() == Instruction::ExtractElement) || all_of(TE->Scalars, IsaPred)) && isFixedVectorShuffle(TE->Scalars, Mask, AC)) || - (TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()) || + ((TE->hasState() && TE->getOpcode() == Instruction::Load) && + (!TE->hasState() || !TE->isAltShuffle())) || any_of(TE->Scalars, IsaPred)); }; @@ -12059,9 +12056,10 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { !VectorizableTree.empty() && all_of(VectorizableTree, [&](const std::unique_ptr &TE) { return (TE->isGather() && - TE->getOpcode() != Instruction::ExtractElement && + (!TE->hasState() || + TE->getOpcode() != Instruction::ExtractElement) && count_if(TE->Scalars, IsaPred) <= Limit) || - TE->getOpcode() == Instruction::PHI; + (TE->hasState() && TE->getOpcode() == Instruction::PHI); })) return true; @@ -12079,7 +12077,7 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { // somewhere. 
bool IsAllowedSingleBVNode = VectorizableTree.size() > 1 || - (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() && + (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() && !VectorizableTree.front()->isAltShuffle() && VectorizableTree.front()->getOpcode() != Instruction::PHI && VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr && @@ -12095,6 +12093,7 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { return false; if (VectorizableTree.back()->isGather() && + VectorizableTree.back()->hasState() && VectorizableTree.back()->isAltShuffle() && VectorizableTree.back()->getVectorFactor() > 2 && allSameBlock(VectorizableTree.back()->Scalars) && @@ -12119,7 +12118,7 @@ bool BoUpSLP::isTreeNotExtendable() const { getCanonicalGraphSize() <= SmallTree && count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()), [](const std::unique_ptr &TE) { - return TE->isGather() && + return TE->isGather() && TE->hasState() && TE->getOpcode() == Instruction::Load && !allSameBlock(TE->Scalars); }) == 1) @@ -12131,7 +12130,7 @@ bool BoUpSLP::isTreeNotExtendable() const { TreeEntry &E = *VectorizableTree[Idx]; if (!E.isGather()) continue; - if (E.getOpcode() && E.getOpcode() != Instruction::Load) + if (E.hasState() && E.getOpcode() != Instruction::Load) return false; if (isSplat(E.Scalars) || allConstant(E.Scalars)) continue; @@ -12441,7 +12440,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n"); continue; } - if (TE.isGather()) { + if (TE.isGather() && TE.hasState()) { if (const TreeEntry *E = getTreeEntry(TE.getMainOp()); E && E->getVectorFactor() == TE.getVectorFactor() && E->isSame(TE.Scalars)) { @@ -14872,14 +14871,15 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, } } // Gather extracts after we check for full matched gathers only. - if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load || - ((E->getOpcode() == Instruction::Load || + if (!ExtractShuffles.empty() || !E->hasState() || + E->getOpcode() != Instruction::Load || + (((E->hasState() && E->getOpcode() == Instruction::Load) || any_of(E->Scalars, IsaPred)) && any_of(E->Scalars, [this](Value *V) { return isa(V) && getTreeEntry(V); })) || - E->isAltShuffle() || + (E->hasState() && E->isAltShuffle()) || all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || isSplat(E->Scalars) || (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) { @@ -15259,7 +15259,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size()); if (E->isGather()) { // Set insert point for non-reduction initial nodes. 
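[Editor's note] On the extra TE.hasState() guard added in getTreeCost above:
getTreeEntry() now asserts a non-null key, and a gather entry without a state
has no main instruction to look up, so the query must be gated. Paraphrased
shape of the invariant, not a literal excerpt:

    if (TE.isGather() && TE.hasState()) { // hasState() implies a non-null MainOp
      if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
          E && E->getVectorFactor() == TE.getVectorFactor() &&
          E->isSame(TE.Scalars)) {
        // reuse the cost/vectorized value of the matching entry
      }
    }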
- if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList) + if (E->hasState() && E->Idx == 0 && !UserIgnoreList) setInsertPointAfterBundle(E); Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs); E->VectorizedValue = Vec; @@ -18147,10 +18147,9 @@ void BoUpSLP::computeMinimumValueSizes() { return; SmallVector ToDemote; - auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot, - bool IsProfitableToDemoteRoot, unsigned Opcode, - unsigned Limit, bool IsTruncRoot, - bool IsSignedCmp) -> unsigned { + auto ComputeMaxBitWidth = + [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot, + unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned { ToDemote.clear(); // Check if the root is trunc and the next node is gather/buildvector, then // keep trunc in scalars, which is free in most cases. @@ -18191,11 +18190,14 @@ void BoUpSLP::computeMinimumValueSizes() { return MaxBitWidth; } + if (!E.hasState()) + return 0u; + unsigned VF = E.getVectorFactor(); Type *ScalarTy = E.Scalars.front()->getType(); unsigned ScalarTyNumElements = getNumElements(ScalarTy); auto *TreeRootIT = dyn_cast(ScalarTy->getScalarType()); - if (!TreeRootIT || !Opcode) + if (!TreeRootIT) return 0u; if (any_of(E.Scalars, @@ -18267,6 +18269,7 @@ void BoUpSLP::computeMinimumValueSizes() { IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF))) return 0u; + unsigned Opcode = E.getOpcode(); bool IsProfitableToDemote = Opcode == Instruction::Trunc || Opcode == Instruction::SExt || Opcode == Instruction::ZExt || NumParts > 1; @@ -18347,15 +18350,14 @@ void BoUpSLP::computeMinimumValueSizes() { while (NodeIdx < VectorizableTree.size()) { ArrayRef TreeRoot = VectorizableTree[NodeIdx]->Scalars; unsigned Limit = 2; - unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode(); if (IsTopRoot && ReductionBitWidth == DL->getTypeSizeInBits( VectorizableTree.front()->Scalars.front()->getType())) Limit = 3; unsigned MaxBitWidth = ComputeMaxBitWidth( - *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Opcode, - Limit, IsTruncRoot, IsSignedCmp); + *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit, + IsTruncRoot, IsSignedCmp); if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) { if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth) ReductionBitWidth = bit_ceil(MaxBitWidth); @@ -18398,19 +18400,21 @@ void BoUpSLP::computeMinimumValueSizes() { }); IsSignedCmp = NodeIdx < VectorizableTree.size() && - any_of(VectorizableTree[NodeIdx]->UserTreeIndices, - [&](const EdgeInfo &EI) { - return EI.UserTE->getOpcode() == Instruction::ICmp && - any_of(EI.UserTE->Scalars, [&](Value *V) { - auto *IC = dyn_cast(V); - return IC && - (IC->isSigned() || - !isKnownNonNegative(IC->getOperand(0), - SimplifyQuery(*DL)) || - !isKnownNonNegative(IC->getOperand(1), - SimplifyQuery(*DL))); - }); - }); + any_of( + VectorizableTree[NodeIdx]->UserTreeIndices, + [&](const EdgeInfo &EI) { + return (EI.UserTE->hasState() && + EI.UserTE->getOpcode() == Instruction::ICmp) && + any_of(EI.UserTE->Scalars, [&](Value *V) { + auto *IC = dyn_cast(V); + return IC && + (IC->isSigned() || + !isKnownNonNegative(IC->getOperand(0), + SimplifyQuery(*DL)) || + !isKnownNonNegative(IC->getOperand(1), + SimplifyQuery(*DL))); + }); + }); } // If the maximum bit width we compute is less than the width of the roots' From 5e92e8ca98dba21c9d8131e611f7158fe9ab3968 Mon Sep 17 00:00:00 2001 From: Phoebe Wang Date: Fri, 10 Jan 2025 15:49:16 +0800 Subject: [PATCH 027/408] [X86] Fix the implementation of 
 __readcr[4,8]/__writecr[4,8] to work in 64-bit mode (#122238)

According to MSVC, __readcr4/__writecr4 return/use `unsigned __int64`, and
are supported on both x86 and x64, while __readcr8/__writecr8 are only
supported on x64. So we use __INTPTR_TYPE__ and __int64 respectively.

Following:
https://github.com/llvm/llvm-project/commit/3cec2a17de744900401c83aedb442e2acc1f23f8

Ref.: https://learn.microsoft.com/en-us/cpp/intrinsics/readcr3?view=msvc-170
https://learn.microsoft.com/en-us/cpp/intrinsics/readcr8?view=msvc-170
---
 clang/lib/Headers/intrin.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/clang/lib/Headers/intrin.h b/clang/lib/Headers/intrin.h
index e8a01d1888026..376046aeeaf5e 100644
--- a/clang/lib/Headers/intrin.h
+++ b/clang/lib/Headers/intrin.h
@@ -94,8 +94,8 @@ void __outwordstring(unsigned short, unsigned short *, unsigned long);
 unsigned long __readcr0(void);
 unsigned long __readcr2(void);
 unsigned __LPTRINT_TYPE__ __readcr3(void);
-unsigned long __readcr4(void);
-unsigned long __readcr8(void);
+unsigned __LPTRINT_TYPE__ __readcr4(void);
+unsigned __int64 __readcr8(void);
 unsigned int __readdr(unsigned int);
 #ifdef __i386__
 unsigned char __readfsbyte(unsigned long);
@@ -124,8 +124,8 @@ void __vmx_vmptrst(unsigned __int64 *);
 void __wbinvd(void);
 void __writecr0(unsigned int);
 void __writecr3(unsigned __INTPTR_TYPE__);
-void __writecr4(unsigned int);
-void __writecr8(unsigned int);
+void __writecr4(unsigned __INTPTR_TYPE__);
+void __writecr8(unsigned __int64);
 void __writedr(unsigned int, unsigned int);
 void __writefsbyte(unsigned long, unsigned char);
 void __writefsdword(unsigned long, unsigned long);

From e0f14e11c7d1a5e82297b1dc9590d79f84c15163 Mon Sep 17 00:00:00 2001
From: Mel Chen
Date: Fri, 10 Jan 2025 16:01:36 +0800
Subject: [PATCH 028/408] [SLPVectorizer] Refine the scope of RdxOpcode in
 HorizontalReduction::createOp (NFC) (#122239)

This patch is one part of unifying IAnyOf and FAnyOf reduction. #118393
The related patch is #118777.
---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7e5649c1db215..cdfec332af37a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -19393,27 +19393,32 @@ class HorizontalReduction {
   /// Creates reduction operation with the current opcode.
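[Editor's note] Returning briefly to the intrin.h fix in the previous patch:
with the corrected prototypes, MSVC-style control-register code sees
64-bit-wide values in 64-bit mode. A small usage sketch -- it compiles under
clang-cl/MSVC, but executing it requires ring 0, and __readcr8/__writecr8
exist only on x64:

    #include <intrin.h>

    void set_cr4_bit(unsigned bit) {
      unsigned __int64 cr4 = __readcr4(); // unsigned __LPTRINT_TYPE__: 64-bit on x64
      __writecr4(cr4 | (1ULL << bit));    // takes unsigned __INTPTR_TYPE__
    }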
static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS, Value *RHS, const Twine &Name, bool UseSelect) { - unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind); switch (Kind) { - case RecurKind::Or: + case RecurKind::Or: { if (UseSelect && LHS->getType() == CmpInst::makeCmpResultType(LHS->getType())) return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name); + unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind); return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS, Name); - case RecurKind::And: + } + case RecurKind::And: { if (UseSelect && LHS->getType() == CmpInst::makeCmpResultType(LHS->getType())) return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name); + unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind); return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS, Name); + } case RecurKind::Add: case RecurKind::Mul: case RecurKind::Xor: case RecurKind::FAdd: - case RecurKind::FMul: + case RecurKind::FMul: { + unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind); return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS, Name); + } case RecurKind::FMax: return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS); case RecurKind::FMin: From eeac0ffaf46cf9f9b0f680b9940cc4b68a0286d8 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 10 Jan 2025 08:56:27 +0100 Subject: [PATCH 029/408] Revert "[MachineLICM] Use `RegisterClassInfo::getRegPressureSetLimit` (#119826)" This reverts commit b4e17d4a314ed87ff6b40b4b05397d4b25b6636a. This causes a large compile-time regression. --- llvm/lib/CodeGen/MachineLICM.cpp | 4 +- .../AMDGPU/GlobalISel/atomicrmw_fmax.ll | 670 +- .../AMDGPU/GlobalISel/atomicrmw_fmin.ll | 670 +- .../AMDGPU/agpr-copy-no-free-registers.ll | 44 +- .../buffer-fat-pointer-atomicrmw-fadd.ll | 6214 ++++++++--------- .../buffer-fat-pointer-atomicrmw-fmax.ll | 4454 ++++++------ .../buffer-fat-pointer-atomicrmw-fmin.ll | 4454 ++++++------ .../codegen-prepare-addrspacecast-non-null.ll | 8 +- ...cannot-create-empty-or-backward-segment.ll | 95 +- .../CodeGen/AMDGPU/flat-atomicrmw-fadd.ll | 2524 +++---- .../CodeGen/AMDGPU/flat-atomicrmw-fmax.ll | 5674 ++++++++------- .../CodeGen/AMDGPU/flat-atomicrmw-fmin.ll | 5674 ++++++++------- .../CodeGen/AMDGPU/flat-atomicrmw-fsub.ll | 2364 +++---- .../CodeGen/AMDGPU/flat_atomics_i32_system.ll | 1706 +++-- .../CodeGen/AMDGPU/flat_atomics_i64_system.ll | 1668 ++--- .../flat_atomics_i64_system_noprivate.ll | 2028 +++--- .../CodeGen/AMDGPU/fp64-atomics-gfx90a.ll | 12 +- .../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 3006 ++++---- .../CodeGen/AMDGPU/global-atomicrmw-fmax.ll | 4987 ++++++------- .../CodeGen/AMDGPU/global-atomicrmw-fmin.ll | 4987 ++++++------- .../CodeGen/AMDGPU/global-atomicrmw-fsub.ll | 2384 +++---- .../AMDGPU/global-load-saddr-to-vaddr.ll | 14 +- .../AMDGPU/global_atomics_i32_system.ll | 706 +- .../AMDGPU/global_atomics_i64_system.ll | 1944 +++--- .../AMDGPU/global_atomics_scan_fmax.ll | 655 +- .../AMDGPU/global_atomics_scan_fmin.ll | 655 +- .../CodeGen/AMDGPU/insert-delay-alu-bug.ll | 70 +- llvm/test/CodeGen/AMDGPU/licm-regpressure.mir | 38 +- .../llvm.amdgcn.struct.atomic.buffer.load.ll | 150 +- ...vm.amdgcn.struct.ptr.atomic.buffer.load.ll | 150 +- .../CodeGen/AMDGPU/local-atomicrmw-fadd.ll | 1034 ++- .../CodeGen/AMDGPU/local-atomicrmw-fmax.ll | 1380 ++-- .../CodeGen/AMDGPU/local-atomicrmw-fmin.ll | 1380 ++-- .../CodeGen/AMDGPU/local-atomicrmw-fsub.ll | 1120 +-- .../CodeGen/AMDGPU/memcpy-crash-issue63986.ll | 10 +- 
.../CodeGen/AMDGPU/no-fold-accvgpr-mov.ll | 54 +- .../CodeGen/AMDGPU/optimize-negated-cond.ll | 6 +- llvm/test/CodeGen/AMDGPU/sdiv64.ll | 2 +- llvm/test/CodeGen/AMDGPU/srem64.ll | 2 +- .../AMDGPU/tuple-allocation-failure.ll | 292 +- llvm/test/CodeGen/AMDGPU/udiv64.ll | 2 +- .../AMDGPU/undefined-subreg-liverange.ll | 7 +- llvm/test/CodeGen/AMDGPU/urem64.ll | 2 +- ...r-descriptor-waterfall-loop-idom-update.ll | 6 +- llvm/test/CodeGen/LoongArch/jr-without-ra.ll | 112 +- .../RISCV/rvv/vxrm-insert-out-of-loop.ll | 5 +- .../test/CodeGen/Thumb2/mve-blockplacement.ll | 128 +- .../CodeGen/Thumb2/mve-gather-increment.ll | 788 ++- .../Thumb2/mve-gather-scatter-optimisation.ll | 140 +- .../InferAddressSpaces/AMDGPU/flat_atomic.ll | 28 +- 50 files changed, 32037 insertions(+), 32470 deletions(-) diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp index 798c3461094a8..d1d5509dc482a 100644 --- a/llvm/lib/CodeGen/MachineLICM.cpp +++ b/llvm/lib/CodeGen/MachineLICM.cpp @@ -124,7 +124,6 @@ namespace { const TargetRegisterInfo *TRI = nullptr; const MachineFrameInfo *MFI = nullptr; MachineRegisterInfo *MRI = nullptr; - RegisterClassInfo RegClassInfo; TargetSchedModel SchedModel; bool PreRegAlloc = false; bool HasProfileData = false; @@ -393,7 +392,6 @@ bool MachineLICMImpl::run(MachineFunction &MF) { MFI = &MF.getFrameInfo(); MRI = &MF.getRegInfo(); SchedModel.init(&ST); - RegClassInfo.runOnMachineFunction(MF); HasProfileData = MF.getFunction().hasProfileData(); @@ -410,7 +408,7 @@ bool MachineLICMImpl::run(MachineFunction &MF) { std::fill(RegPressure.begin(), RegPressure.end(), 0); RegLimit.resize(NumRPS); for (unsigned i = 0, e = NumRPS; i != e; ++i) - RegLimit[i] = RegClassInfo.getRegPressureSetLimit(i); + RegLimit[i] = TRI->getRegPressureSetLimit(MF, i); } if (HoistConstLoads) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll index bd2bbb9798312..23f24a9dc9982 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll @@ -325,13 +325,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -370,13 +370,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: 
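[Editor's note] Context for the revert above, before the long run of
regenerated test expectations: the change being backed out had MachineLICM take
its per-set register-pressure limits from a cached RegisterClassInfo (which
adjusts for reserved registers) instead of the static target query. The two
lines at the heart of it, as shown in the MachineLICM.cpp hunk:

    RegLimit[i] = TRI->getRegPressureSetLimit(MF, i);     // restored: static per-target limit
    RegLimit[i] = RegClassInfo.getRegPressureSetLimit(i); // reverted: reserved-register aware

The more precise limit changed hoisting decisions across many targets (hence
the AMDGPU/Thumb2/LoongArch churn in the surrounding tests), and per the commit
message it also cost too much compile time, presumably from recomputing the
cached info for every function.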
global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -394,13 +394,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -469,21 +469,21 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB5_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -513,20 +513,20 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -536,20 +536,20 @@ define 
void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -602,15 +602,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -640,15 +640,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -686,15 +686,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX908: ; %bb.0: ; 
GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -712,15 +712,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -758,21 +758,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -795,22 +795,22 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; 
GFX11-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -840,21 +840,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -864,21 +864,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; 
GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -918,13 +918,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -963,13 +963,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -987,13 +987,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1058,21 +1058,21 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, 
v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB9_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1104,20 +1104,20 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1127,20 +1127,20 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1190,15 +1190,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: 
v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1228,15 +1228,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1274,15 +1274,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1303,15 +1303,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -1344,21 +1344,21 @@ define void 
@flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1381,22 +1381,22 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -1427,21 +1427,21 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], 
v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1451,24 +1451,24 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v6, v[0:1] -; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1504,21 +1504,20 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s16 -; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX940-NEXT: v_mov_b32_e32 v2, s16 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v3, v1, v1 ; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v0, v2, v2 -; GFX940-NEXT: v_max_f32_e32 v1, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v3, s16 +; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v0, v3 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 
v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1554,20 +1553,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s20 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v2, s20 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v2, v2 -; GFX90A-NEXT: v_max_f32_e32 v1, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s20 +; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v0, v3 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1581,24 +1579,23 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v2, s20 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v3, v1, v1 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v1, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v1, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s20 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX908-NEXT: v_max_f32_e32 v4, v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 @@ -1609,24 +1606,23 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, 
s20 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen +; GFX8-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v1 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s20 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX8-NEXT: v_max_f32_e32 v4, v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 @@ -1664,24 +1660,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GFX940-NEXT: v_mov_b32_e32 v2, s16 +; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v1, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX940-NEXT: v_max_f32_e32 v2, v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v6, s16 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX940-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX940-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_cbranch_execnz .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1713,23 +1708,22 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v2, s20 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v1, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 
v2, v0, v0 -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, s20 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1739,24 +1733,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v2, s20 +; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v3, v0, v0 -; GFX908-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, s20 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX908-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1766,24 +1759,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v2, s20 +; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v0 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, s20 -; GFX8-NEXT: v_mov_b32_e32 v3, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, 
s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1810,27 +1802,26 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 +; GFX12-NEXT: v_mov_b32_e32 v6, s16 +; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen +; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[0:1] -; GFX12-NEXT: v_mov_b32_e32 v10, s16 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1854,29 +1845,27 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1 -; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: v_mov_b32_e32 v6, s16 +; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen -; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], 0 offen +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: v_max_f64 v[0:1], 
v[4:5], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v10, s16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1908,28 +1897,27 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v6, s20 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v1 -; GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX908-NEXT: v_mov_b32_e32 v10, s20 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[0:1] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 -; GFX908-NEXT: v_mov_b32_e32 v1, v7 -; GFX908-NEXT: v_mov_b32_e32 v2, v8 -; GFX908-NEXT: v_mov_b32_e32 v3, v9 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 @@ -1940,28 +1928,27 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX8-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v6, s20 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX8-NEXT: v_mov_b32_e32 v10, s20 -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-NEXT: v_mov_b32_e32 v1, v7 -; GFX8-NEXT: v_mov_b32_e32 v2, v8 -; GFX8-NEXT: v_mov_b32_e32 v3, v9 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 @@ -1989,26 +1976,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, s16 +; GFX12-NEXT: v_mov_b32_e32 v6, s16 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen +; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen ; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v9, v5 -; GFX12-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-NEXT: v_mov_b32_e32 v6, v2 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 +; 
GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -2032,28 +2017,25 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s16 +; GFX11-NEXT: v_mov_b32_e32 v6, s16 +; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen -; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], 0 offen ; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v9, v5 -; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v6, v2 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], 0 offen glc +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2085,27 +2067,26 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, s20 -; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v6, s20 +; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen +; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] -; GFX908-NEXT: v_mov_b32_e32 v10, s20 -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v9, v5 -; GFX908-NEXT: v_mov_b32_e32 v8, v4 -; GFX908-NEXT: v_mov_b32_e32 v7, v3 -; GFX908-NEXT: v_mov_b32_e32 v6, v2 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v10, v3 +; GFX908-NEXT: v_mov_b32_e32 v9, v2 +; GFX908-NEXT: v_mov_b32_e32 v8, v1 +; GFX908-NEXT: 
v_mov_b32_e32 v7, v0 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v6 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v2, v7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v7 +; GFX908-NEXT: v_mov_b32_e32 v3, v8 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2115,27 +2096,26 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v6, s20 +; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen +; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v10, s20 -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v9, v5 -; GFX8-NEXT: v_mov_b32_e32 v8, v4 -; GFX8-NEXT: v_mov_b32_e32 v7, v3 -; GFX8-NEXT: v_mov_b32_e32 v6, v2 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v10, v3 +; GFX8-NEXT: v_mov_b32_e32 v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, v1 +; GFX8-NEXT: v_mov_b32_e32 v7, v0 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v6 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, v7 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v7 +; GFX8-NEXT: v_mov_b32_e32 v3, v8 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll index b646b7f301092..11024b0a88d6b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll @@ -325,13 +325,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -370,13 +370,13 @@ define float 
@global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -394,13 +394,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -469,21 +469,21 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB5_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -513,20 +513,20 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 -; 
GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -536,20 +536,20 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -602,15 +602,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -640,15 +640,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], 
v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -686,15 +686,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -712,15 +712,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -758,21 +758,21 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: 
v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -795,22 +795,22 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -840,21 +840,21 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -864,21 +864,21 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], 
v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -918,13 +918,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -963,13 +963,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -987,13 +987,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1058,21 +1058,21 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; 
%bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB9_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1104,20 +1104,20 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1127,20 +1127,20 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; 
GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1190,15 +1190,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1228,15 +1228,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1274,15 +1274,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1303,15 +1303,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; 
GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -1344,21 +1344,21 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1381,22 +1381,22 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -1427,21 
+1427,21 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1451,24 +1451,24 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v6, v[0:1] -; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1504,21 +1504,20 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s16 -; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX940-NEXT: v_mov_b32_e32 v2, s16 +; 
GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v3, v1, v1 ; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v0, v2, v2 -; GFX940-NEXT: v_max_f32_e32 v1, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v3, s16 +; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v0, v3 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1554,20 +1553,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s20 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v2, s20 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v2, v2 -; GFX90A-NEXT: v_max_f32_e32 v1, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s20 +; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v0, v3 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1581,24 +1579,23 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v2, s20 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v3, v1, v1 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v1, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v1, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s20 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX908-NEXT: v_min_f32_e32 v4, v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap 
v[0:1], v2, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 @@ -1609,24 +1606,23 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s20 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen +; GFX8-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v1 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s20 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX8-NEXT: v_min_f32_e32 v4, v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 @@ -1664,24 +1660,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GFX940-NEXT: v_mov_b32_e32 v2, s16 +; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v1, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX940-NEXT: v_min_f32_e32 v2, v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v6, s16 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX940-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX940-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_cbranch_execnz .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1713,23 +1708,22 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX90A-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v2, s20 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v1, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX90A-NEXT: v_min_f32_e32 v2, v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, s20 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1739,24 +1733,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v2, s20 +; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v3, v0, v0 -; GFX908-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, s20 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX908-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1766,24 +1759,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v2, s20 +; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX8-NEXT: s_mov_b64 
s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v0 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, s20 -; GFX8-NEXT: v_mov_b32_e32 v3, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1810,27 +1802,26 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 +; GFX12-NEXT: v_mov_b32_e32 v6, s16 +; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen +; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[0:1] -; GFX12-NEXT: v_mov_b32_e32 v10, s16 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1854,29 +1845,27 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: 
v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1 -; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: v_mov_b32_e32 v6, s16 +; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen -; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], 0 offen +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v10, s16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1908,28 +1897,27 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v6, s20 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v1 -; GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX908-NEXT: v_mov_b32_e32 v10, s20 -; GFX908-NEXT: v_min_f64 v[6:7], v[2:3], v[0:1] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 -; GFX908-NEXT: v_mov_b32_e32 v1, v7 -; GFX908-NEXT: v_mov_b32_e32 v2, v8 -; GFX908-NEXT: v_mov_b32_e32 v3, v9 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; 
GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 @@ -1940,28 +1928,27 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v6, s20 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX8-NEXT: v_mov_b32_e32 v10, s20 -; GFX8-NEXT: v_min_f64 v[6:7], v[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-NEXT: v_mov_b32_e32 v1, v7 -; GFX8-NEXT: v_mov_b32_e32 v2, v8 -; GFX8-NEXT: v_mov_b32_e32 v3, v9 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 @@ -1989,26 +1976,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, s16 +; GFX12-NEXT: v_mov_b32_e32 v6, s16 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen +; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen ; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v9, v5 -; GFX12-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) 
-; GFX12-NEXT: v_mov_b32_e32 v6, v2 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -2032,28 +2017,25 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s16 +; GFX11-NEXT: v_mov_b32_e32 v6, s16 +; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen -; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], 0 offen ; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v9, v5 -; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v6, v2 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], 0 offen glc +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2085,27 +2067,26 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, s20 -; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v6, s20 +; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen +; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; 
GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] -; GFX908-NEXT: v_mov_b32_e32 v10, s20 -; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v9, v5 -; GFX908-NEXT: v_mov_b32_e32 v8, v4 -; GFX908-NEXT: v_mov_b32_e32 v7, v3 -; GFX908-NEXT: v_mov_b32_e32 v6, v2 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v10, v3 +; GFX908-NEXT: v_mov_b32_e32 v9, v2 +; GFX908-NEXT: v_mov_b32_e32 v8, v1 +; GFX908-NEXT: v_mov_b32_e32 v7, v0 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v6 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v2, v7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v7 +; GFX908-NEXT: v_mov_b32_e32 v3, v8 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2115,27 +2096,26 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v6, s20 +; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen +; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v10, s20 -; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v9, v5 -; GFX8-NEXT: v_mov_b32_e32 v8, v4 -; GFX8-NEXT: v_mov_b32_e32 v7, v3 -; GFX8-NEXT: v_mov_b32_e32 v6, v2 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v10, v3 +; GFX8-NEXT: v_mov_b32_e32 v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, v1 +; GFX8-NEXT: v_mov_b32_e32 v7, v0 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v6 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, v7 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v7 +; GFX8-NEXT: v_mov_b32_e32 v3, v8 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index 45a9f9d47acba..823db84a053b8 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -557,11 +557,11 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_mul_hi_u32 s9, s0, s7 ; GFX908-NEXT: s_mul_i32 s0, s0, s7 ; GFX908-NEXT: s_add_i32 s1, s9, s1 -; GFX908-NEXT: s_lshl_b64 s[0:1], s[0:1], 
5 +; GFX908-NEXT: s_lshl_b64 s[14:15], s[0:1], 5 ; GFX908-NEXT: s_branch .LBB3_2 ; GFX908-NEXT: .LBB3_1: ; %Flow20 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_andn2_b64 vcc, exec, s[14:15] +; GFX908-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GFX908-NEXT: s_cbranch_vccz .LBB3_12 ; GFX908-NEXT: .LBB3_2: ; %bb9 ; GFX908-NEXT: ; =>This Loop Header: Depth=1 @@ -571,15 +571,17 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: ; %bb.3: ; %bb14 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off +; GFX908-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], -1 ; GFX908-NEXT: s_mov_b32 s7, s6 +; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] ; GFX908-NEXT: v_mov_b32_e32 v4, s6 +; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v6 ; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: v_mov_b32_e32 v9, s7 ; GFX908-NEXT: v_mov_b32_e32 v5, s7 ; GFX908-NEXT: v_mov_b32_e32 v7, s7 ; GFX908-NEXT: v_mov_b32_e32 v8, s6 -; GFX908-NEXT: v_cmp_lt_i64_e64 s[14:15], s[4:5], 0 -; GFX908-NEXT: v_cmp_gt_i64_e64 s[16:17], s[4:5], -1 +; GFX908-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v11, v5 ; GFX908-NEXT: s_mov_b64 s[18:19], s[10:11] ; GFX908-NEXT: v_mov_b32_e32 v10, v4 @@ -599,9 +601,9 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 ; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX908-NEXT: s_add_u32 s18, s18, s0 +; GFX908-NEXT: s_add_u32 s18, s18, s14 ; GFX908-NEXT: v_cmp_lt_i64_e64 s[22:23], -1, v[2:3] -; GFX908-NEXT: s_addc_u32 s19, s19, s1 +; GFX908-NEXT: s_addc_u32 s19, s19, s15 ; GFX908-NEXT: s_mov_b64 s[20:21], 0 ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[22:23] ; GFX908-NEXT: s_cbranch_vccz .LBB3_9 @@ -620,7 +622,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: ds_read_b64 v[12:13], v19 ; GFX908-NEXT: ds_read_b64 v[14:15], v0 -; GFX908-NEXT: s_andn2_b64 vcc, exec, s[16:17] +; GFX908-NEXT: s_and_b64 vcc, exec, s[0:1] ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_cbranch_vccnz .LBB3_7 ; GFX908-NEXT: ; %bb.6: ; %bb51 @@ -648,7 +650,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_mov_b64 s[20:21], -1 ; GFX908-NEXT: s_branch .LBB3_4 ; GFX908-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2 -; GFX908-NEXT: s_mov_b64 s[20:21], s[14:15] +; GFX908-NEXT: s_mov_b64 s[20:21], s[16:17] ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[20:21] ; GFX908-NEXT: s_cbranch_vccz .LBB3_4 ; GFX908-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1 @@ -659,7 +661,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_xor_b64 s[16:17], s[20:21], -1 ; GFX908-NEXT: .LBB3_10: ; %Flow19 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_mov_b64 s[14:15], -1 +; GFX908-NEXT: s_mov_b64 s[0:1], -1 ; GFX908-NEXT: s_and_b64 vcc, exec, s[16:17] ; GFX908-NEXT: s_cbranch_vccz .LBB3_1 ; GFX908-NEXT: ; %bb.11: ; %bb12 @@ -668,7 +670,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_addc_u32 s5, s5, 0 ; GFX908-NEXT: s_add_u32 s10, s10, s12 ; GFX908-NEXT: s_addc_u32 s11, s11, s13 -; GFX908-NEXT: s_mov_b64 s[14:15], 0 +; GFX908-NEXT: s_mov_b64 s[0:1], 0 ; GFX908-NEXT: s_branch .LBB3_1 
; GFX908-NEXT: .LBB3_12: ; %DummyReturnBlock ; GFX908-NEXT: s_endpgm @@ -718,11 +720,11 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_mul_hi_u32 s9, s0, s7 ; GFX90A-NEXT: s_mul_i32 s0, s0, s7 ; GFX90A-NEXT: s_add_i32 s1, s9, s1 -; GFX90A-NEXT: s_lshl_b64 s[0:1], s[0:1], 5 +; GFX90A-NEXT: s_lshl_b64 s[14:15], s[0:1], 5 ; GFX90A-NEXT: s_branch .LBB3_2 ; GFX90A-NEXT: .LBB3_1: ; %Flow20 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[14:15] +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_12 ; GFX90A-NEXT: .LBB3_2: ; %bb9 ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 @@ -732,12 +734,14 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: ; %bb.3: ; %bb14 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX90A-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], -1 ; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v8 ; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_cmp_lt_i64_e64 s[14:15], s[4:5], 0 -; GFX90A-NEXT: v_cmp_gt_i64_e64 s[16:17], s[4:5], -1 +; GFX90A-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0 ; GFX90A-NEXT: s_mov_b64 s[18:19], s[10:11] ; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -756,8 +760,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2 ; GFX90A-NEXT: v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX90A-NEXT: s_add_u32 s18, s18, s0 -; GFX90A-NEXT: s_addc_u32 s19, s19, s1 +; GFX90A-NEXT: s_add_u32 s18, s18, s14 +; GFX90A-NEXT: s_addc_u32 s19, s19, s15 ; GFX90A-NEXT: v_cmp_lt_i64_e64 s[22:23], -1, v[4:5] ; GFX90A-NEXT: s_mov_b64 s[20:21], 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[22:23] @@ -777,7 +781,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ds_read_b64 v[14:15], v19 ; GFX90A-NEXT: ds_read_b64 v[16:17], v0 -; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[16:17] +; GFX90A-NEXT: s_and_b64 vcc, exec, s[0:1] ; GFX90A-NEXT: ; kill: killed $sgpr20 killed $sgpr21 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_vccnz .LBB3_7 @@ -798,7 +802,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_mov_b64 s[20:21], -1 ; GFX90A-NEXT: s_branch .LBB3_4 ; GFX90A-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2 -; GFX90A-NEXT: s_mov_b64 s[20:21], s[14:15] +; GFX90A-NEXT: s_mov_b64 s[20:21], s[16:17] ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[20:21] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_4 ; GFX90A-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1 @@ -809,7 +813,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_xor_b64 s[16:17], s[20:21], -1 ; GFX90A-NEXT: .LBB3_10: ; %Flow19 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_mov_b64 s[14:15], -1 +; GFX90A-NEXT: s_mov_b64 s[0:1], -1 ; GFX90A-NEXT: s_and_b64 vcc, exec, s[16:17] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_1 ; GFX90A-NEXT: ; %bb.11: ; %bb12 @@ 
-818,7 +822,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_addc_u32 s5, s5, 0 ; GFX90A-NEXT: s_add_u32 s10, s10, s12 ; GFX90A-NEXT: s_addc_u32 s11, s11, s13 -; GFX90A-NEXT: s_mov_b64 s[14:15], 0 +; GFX90A-NEXT: s_mov_b64 s[0:1], 0 ; GFX90A-NEXT: s_branch .LBB3_1 ; GFX90A-NEXT: .LBB3_12: ; %DummyReturnBlock ; GFX90A-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index 48c7a9f5e2d14..e8f1619c5d418 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -54,23 +54,23 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB0_1 @@ -95,18 +95,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB0_1 @@ -122,18 +122,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 
; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB0_1 @@ -149,18 +149,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB0_1 @@ -176,19 +176,19 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB0_1 @@ -241,23 +241,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v2, v1, 
s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB1_1 @@ -290,19 +290,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -316,19 +316,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX7-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v4, v1 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v2, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB1_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -342,20 +342,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX6-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v1, v2, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v1 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v2, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB1_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -827,23 +827,23 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB3_1 @@ -859,12 +859,12 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -885,18 +885,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, 
s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 @@ -912,18 +912,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB3_1 @@ -939,18 +939,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB3_1 @@ -966,19 +966,19 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; 
GFX6-NEXT: v_mov_b32_e32 v0, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB3_1 @@ -1031,23 +1031,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB4_1 @@ -1062,11 +1062,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v2, v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1087,19 +1087,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v2, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; 
GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1113,19 +1113,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1139,19 +1139,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX7-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v4, v1 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v2, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB4_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1165,20 +1165,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX6-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v1, v2, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v1 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v2, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB4_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1218,24 +1218,25 @@ define float 
@buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1249,23 +1250,23 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB5_1 @@ -1281,12 +1282,12 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] 
op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1307,18 +1308,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB5_1 @@ -1334,18 +1335,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB5_1 @@ -1361,18 +1362,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: 
s_cbranch_execnz .LBB5_1 @@ -1388,19 +1389,19 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB5_1 @@ -1441,24 +1442,25 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1472,23 +1474,23 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 
0x0 -; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB6_1 @@ -1504,12 +1506,12 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1530,18 +1532,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB6_1 @@ -1557,18 +1559,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: 
v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB6_1 @@ -1584,18 +1586,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB6_1 @@ -1611,19 +1613,19 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB6_1 @@ -1664,24 +1666,25 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 
v1, v4 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1695,23 +1698,23 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB7_1 @@ -1727,12 +1730,12 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1753,18 +1756,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, 
s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 @@ -1780,18 +1783,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 @@ -1807,18 +1810,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB7_1 @@ -1834,19 +1837,19 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: 
v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB7_1 @@ -1873,25 +1876,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_add_f64_e32 v[6:7], v[8:9], v[4:5] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v10, s5 -; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1916,25 +1919,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x800 +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX11-NEXT: v_mov_b32_e32 v10, s5 -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: 
v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1949,26 +1952,26 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: s_add_i32 s5, s20, 0x800 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_add_i32 s4, s20, 0x800 +; GFX10-NEXT: v_mov_b32_e32 v6, s4 ; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v9, v1 -; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: v_mov_b32_e32 v10, s5 +; GFX10-NEXT: v_mov_b32_e32 v10, v1 +; GFX10-NEXT: v_mov_b32_e32 v9, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v0, v6 -; GFX10-NEXT: v_mov_b32_e32 v1, v7 -; GFX10-NEXT: v_mov_b32_e32 v2, v8 -; GFX10-NEXT: v_mov_b32_e32 v3, v9 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, v7 +; GFX10-NEXT: v_mov_b32_e32 v1, v8 +; GFX10-NEXT: v_mov_b32_e32 v2, v9 +; GFX10-NEXT: v_mov_b32_e32 v3, v10 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB8_1 @@ -1994,21 +1997,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v1 -; GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v10, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, v6 -; GFX908-NEXT: v_mov_b32_e32 v1, v7 -; GFX908-NEXT: v_mov_b32_e32 v2, v8 -; GFX908-NEXT: v_mov_b32_e32 v3, v9 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: 
s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB8_1 @@ -2025,21 +2028,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v10, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-NEXT: v_mov_b32_e32 v1, v7 -; GFX8-NEXT: v_mov_b32_e32 v2, v8 -; GFX8-NEXT: v_mov_b32_e32 v3, v9 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB8_1 @@ -2056,21 +2059,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX7-NEXT: s_add_i32 s6, s20, 0x800 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v10, s6 -; GFX7-NEXT: v_mov_b32_e32 v0, v6 -; GFX7-NEXT: v_mov_b32_e32 v1, v7 -; GFX7-NEXT: v_mov_b32_e32 v2, v8 -; GFX7-NEXT: v_mov_b32_e32 v3, v9 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v10, v1 +; GFX7-NEXT: v_mov_b32_e32 v9, v0 +; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v7 +; GFX7-NEXT: v_mov_b32_e32 v1, v8 +; GFX7-NEXT: v_mov_b32_e32 v2, v9 +; GFX7-NEXT: v_mov_b32_e32 v3, v10 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB8_1 @@ -2087,22 +2090,22 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v6, s6 ; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v9, v1 -; GFX6-NEXT: v_mov_b32_e32 v8, v0 -; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v10, s6 +; GFX6-NEXT: v_mov_b32_e32 v10, v1 +; GFX6-NEXT: v_mov_b32_e32 v9, v0 +; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] ; GFX6-NEXT: 
s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v6 -; GFX6-NEXT: v_mov_b32_e32 v1, v7 -; GFX6-NEXT: v_mov_b32_e32 v2, v8 -; GFX6-NEXT: v_mov_b32_e32 v3, v9 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v7 +; GFX6-NEXT: v_mov_b32_e32 v1, v8 +; GFX6-NEXT: v_mov_b32_e32 v2, v9 +; GFX6-NEXT: v_mov_b32_e32 v3, v10 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB8_1 @@ -2124,24 +2127,25 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, s16 -; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[2:3], v[4:5], v[0:1] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, v5 -; GFX12-NEXT: v_mov_b32_e32 v8, v4 +; GFX12-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2165,24 +2169,25 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v2, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x800 +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX11-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, v5 -; GFX11-NEXT: v_mov_b32_e32 v8, v4 +; GFX11-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; 
GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2195,26 +2200,26 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x800 +; GFX10-NEXT: s_add_i32 s4, s20, 0x800 +; GFX10-NEXT: v_mov_b32_e32 v6, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v9, v5 -; GFX10-NEXT: v_mov_b32_e32 v10, s5 -; GFX10-NEXT: v_mov_b32_e32 v8, v4 +; GFX10-NEXT: v_mov_b32_e32 v10, v5 +; GFX10-NEXT: v_mov_b32_e32 v9, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v7, v3 -; GFX10-NEXT: v_mov_b32_e32 v6, v2 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc +; GFX10-NEXT: v_mov_b32_e32 v8, v3 +; GFX10-NEXT: v_mov_b32_e32 v7, v2 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v5, v7 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v4, v7 +; GFX10-NEXT: v_mov_b32_e32 v5, v8 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB9_1 @@ -2238,22 +2243,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX908-NEXT: v_mov_b32_e32 v9, v5 -; GFX908-NEXT: v_mov_b32_e32 v10, s6 -; GFX908-NEXT: v_mov_b32_e32 v8, v4 -; GFX908-NEXT: v_mov_b32_e32 v7, v3 -; GFX908-NEXT: v_mov_b32_e32 v6, v2 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v10, v5 +; GFX908-NEXT: v_mov_b32_e32 v9, v4 +; GFX908-NEXT: v_mov_b32_e32 v8, v3 +; GFX908-NEXT: v_mov_b32_e32 v7, v2 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v6 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v7 +; GFX908-NEXT: v_mov_b32_e32 v5, v8 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; 
GFX908-NEXT: s_cbranch_execnz .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2267,22 +2272,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v9, v5 -; GFX8-NEXT: v_mov_b32_e32 v10, s6 -; GFX8-NEXT: v_mov_b32_e32 v8, v4 -; GFX8-NEXT: v_mov_b32_e32 v7, v3 -; GFX8-NEXT: v_mov_b32_e32 v6, v2 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v10, v5 +; GFX8-NEXT: v_mov_b32_e32 v9, v4 +; GFX8-NEXT: v_mov_b32_e32 v8, v3 +; GFX8-NEXT: v_mov_b32_e32 v7, v2 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v6 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v7 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v7 +; GFX8-NEXT: v_mov_b32_e32 v5, v8 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2296,22 +2301,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX7-NEXT: s_add_i32 s6, s20, 0x800 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v9, v5 -; GFX7-NEXT: v_mov_b32_e32 v10, s6 -; GFX7-NEXT: v_mov_b32_e32 v8, v4 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v6, v2 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v10, v5 +; GFX7-NEXT: v_mov_b32_e32 v9, v4 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v2 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v4, v6 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v4, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v5, v7 +; GFX7-NEXT: v_mov_b32_e32 v5, v8 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB9_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2325,23 +2330,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v6, s6 ; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v9, v5 -; GFX6-NEXT: v_mov_b32_e32 v10, s6 -; GFX6-NEXT: v_mov_b32_e32 v8, v4 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v6, v2 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 
v[6:9], v10, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v10, v5 +; GFX6-NEXT: v_mov_b32_e32 v9, v4 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v2 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v4, v6 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] +; GFX6-NEXT: v_mov_b32_e32 v4, v7 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v5, v7 +; GFX6-NEXT: v_mov_b32_e32 v5, v8 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB9_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2898,25 +2903,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_add_f64_e32 v[6:7], v[8:9], v[4:5] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v10, s5 -; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2941,25 +2946,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x800 +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX11-NEXT: v_mov_b32_e32 v10, s5 -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2974,26 +2979,26 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: s_add_i32 s5, s20, 0x800 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_add_i32 s4, s20, 0x800 +; GFX10-NEXT: v_mov_b32_e32 v6, s4 ; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v9, v1 -; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: v_mov_b32_e32 v10, s5 +; GFX10-NEXT: v_mov_b32_e32 v10, v1 +; GFX10-NEXT: v_mov_b32_e32 v9, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v0, v6 -; GFX10-NEXT: v_mov_b32_e32 v1, v7 -; GFX10-NEXT: v_mov_b32_e32 v2, v8 -; GFX10-NEXT: v_mov_b32_e32 v3, v9 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, v7 +; GFX10-NEXT: v_mov_b32_e32 v1, v8 +; GFX10-NEXT: v_mov_b32_e32 v2, v9 +; GFX10-NEXT: v_mov_b32_e32 v3, v10 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB11_1 @@ -3010,18 +3015,18 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x800 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, s6 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v10, s6 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[8:9], v[8:9] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1] +; 
GFX90A-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 @@ -3038,21 +3043,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v1 -; GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v10, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, v6 -; GFX908-NEXT: v_mov_b32_e32 v1, v7 -; GFX908-NEXT: v_mov_b32_e32 v2, v8 -; GFX908-NEXT: v_mov_b32_e32 v3, v9 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 @@ -3069,21 +3074,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v10, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-NEXT: v_mov_b32_e32 v1, v7 -; GFX8-NEXT: v_mov_b32_e32 v2, v8 -; GFX8-NEXT: v_mov_b32_e32 v3, v9 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 @@ -3100,21 +3105,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX7-NEXT: buffer_load_dwordx2 
v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX7-NEXT: s_add_i32 s6, s20, 0x800 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v10, s6 -; GFX7-NEXT: v_mov_b32_e32 v0, v6 -; GFX7-NEXT: v_mov_b32_e32 v1, v7 -; GFX7-NEXT: v_mov_b32_e32 v2, v8 -; GFX7-NEXT: v_mov_b32_e32 v3, v9 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v10, v1 +; GFX7-NEXT: v_mov_b32_e32 v9, v0 +; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v7 +; GFX7-NEXT: v_mov_b32_e32 v1, v8 +; GFX7-NEXT: v_mov_b32_e32 v2, v9 +; GFX7-NEXT: v_mov_b32_e32 v3, v10 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB11_1 @@ -3131,22 +3136,22 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v6, s6 ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v9, v1 -; GFX6-NEXT: v_mov_b32_e32 v8, v0 -; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v10, s6 +; GFX6-NEXT: v_mov_b32_e32 v10, v1 +; GFX6-NEXT: v_mov_b32_e32 v9, v0 +; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v6 -; GFX6-NEXT: v_mov_b32_e32 v1, v7 -; GFX6-NEXT: v_mov_b32_e32 v2, v8 -; GFX6-NEXT: v_mov_b32_e32 v3, v9 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v7 +; GFX6-NEXT: v_mov_b32_e32 v1, v8 +; GFX6-NEXT: v_mov_b32_e32 v2, v9 +; GFX6-NEXT: v_mov_b32_e32 v3, v10 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB11_1 @@ -3169,25 +3174,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) 
-; GFX12-NEXT: v_add_f64_e32 v[6:7], v[8:9], v[4:5] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v10, s5 -; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -3212,25 +3217,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x800 +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX11-NEXT: v_mov_b32_e32 v10, s5 -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -3245,26 +3250,26 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: s_add_i32 s5, s20, 0x800 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_add_i32 s4, s20, 0x800 +; GFX10-NEXT: v_mov_b32_e32 v6, s4 ; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: 
s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v9, v1 -; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: v_mov_b32_e32 v10, s5 +; GFX10-NEXT: v_mov_b32_e32 v10, v1 +; GFX10-NEXT: v_mov_b32_e32 v9, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v0, v6 -; GFX10-NEXT: v_mov_b32_e32 v1, v7 -; GFX10-NEXT: v_mov_b32_e32 v2, v8 -; GFX10-NEXT: v_mov_b32_e32 v3, v9 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, v7 +; GFX10-NEXT: v_mov_b32_e32 v1, v8 +; GFX10-NEXT: v_mov_b32_e32 v2, v9 +; GFX10-NEXT: v_mov_b32_e32 v3, v10 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB12_1 @@ -3290,21 +3295,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v1 -; GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v10, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, v6 -; GFX908-NEXT: v_mov_b32_e32 v1, v7 -; GFX908-NEXT: v_mov_b32_e32 v2, v8 -; GFX908-NEXT: v_mov_b32_e32 v3, v9 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 @@ -3321,21 +3326,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v10, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-NEXT: v_mov_b32_e32 v1, v7 -; GFX8-NEXT: v_mov_b32_e32 v2, v8 -; GFX8-NEXT: v_mov_b32_e32 v3, v9 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 
v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 @@ -3352,21 +3357,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX7-NEXT: s_add_i32 s6, s20, 0x800 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v10, s6 -; GFX7-NEXT: v_mov_b32_e32 v0, v6 -; GFX7-NEXT: v_mov_b32_e32 v1, v7 -; GFX7-NEXT: v_mov_b32_e32 v2, v8 -; GFX7-NEXT: v_mov_b32_e32 v3, v9 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v10, v1 +; GFX7-NEXT: v_mov_b32_e32 v9, v0 +; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v7 +; GFX7-NEXT: v_mov_b32_e32 v1, v8 +; GFX7-NEXT: v_mov_b32_e32 v2, v9 +; GFX7-NEXT: v_mov_b32_e32 v3, v10 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB12_1 @@ -3383,22 +3388,22 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v6, s6 ; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v9, v1 -; GFX6-NEXT: v_mov_b32_e32 v8, v0 -; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v10, s6 +; GFX6-NEXT: v_mov_b32_e32 v10, v1 +; GFX6-NEXT: v_mov_b32_e32 v9, v0 +; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v6 -; GFX6-NEXT: v_mov_b32_e32 v1, v7 -; GFX6-NEXT: v_mov_b32_e32 v2, v8 -; GFX6-NEXT: v_mov_b32_e32 v3, v9 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v7 +; GFX6-NEXT: v_mov_b32_e32 v1, v8 +; GFX6-NEXT: v_mov_b32_e32 v2, v9 +; GFX6-NEXT: v_mov_b32_e32 v3, v10 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB12_1 @@ -3426,43 +3431,43 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; 
GFX12-NEXT: s_lshl_b32 s5, s5, 3 +; GFX12-NEXT: v_mov_b32_e32 v5, s4 +; GFX12-NEXT: s_and_b32 s4, s16, 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s7, s6 -; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen -; GFX12-NEXT: s_mov_b32 s6, 0 +; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_f16_e32 v1, v1, v0 ; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v3 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3470,25 +3475,24 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s6, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: s_and_b32 s4, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s7, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX940-NEXT: s_not_b32 s8, s4 +; GFX940-NEXT: s_lshl_b32 s6, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v1, s7, v3 -; GFX940-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX940-NEXT: v_and_or_b32 v2, v3, s8, v1 -; GFX940-NEXT: v_mov_b32_e32 v6, s6 +; GFX940-NEXT: v_lshrrev_b32_e32 v2, s6, v3 +; GFX940-NEXT: v_add_f16_e32 v2, v2, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, s6, v2 +; GFX940-NEXT: v_and_or_b32 v2, v3, s7, v2 ; 
GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -3498,51 +3502,49 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX940-NEXT: s_cbranch_execnz .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s7, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_and_b32 s5, s16, 3 -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_lshl_b32 s5, s5, 3 -; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_not_b32 s7, s6 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen -; GFX11-NEXT: s_mov_b32 s6, 0 -; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: v_mov_b32_e32 v5, s4 +; GFX11-NEXT: s_and_b32 s4, s16, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen +; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_f16_e32 v1, v1, v0 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, s5, v1 -; GFX11-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v3 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3550,22 +3552,21 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: s_and_b32 s5, s20, 3 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: s_lshl_b32 s5, s5, 3 -; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX10-NEXT: s_not_b32 s7, s6 -; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen -; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: s_and_b32 s4, s20, 3 +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: buffer_load_dword v2, v5, s[16:19], 0 offen +; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, s5, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v2 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc @@ -3574,36 +3575,35 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s5, v3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s6, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: s_and_b32 s4, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX90A-NEXT: s_not_b32 s8, s4 +; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v1, s7, v3 -; GFX90A-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, s6 +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v3 +; GFX90A-NEXT: v_add_f16_e32 v2, v2, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s6, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 ; 
GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -3613,30 +3613,29 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s7, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s6, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v1, s6 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v5, s4 +; GFX908-NEXT: buffer_load_dword v2, v5, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s7, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX908-NEXT: s_not_b32 s8, s4 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v1, s7, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v2 ; GFX908-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX908-NEXT: v_and_or_b32 v1, v2, s8, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -3648,31 +3647,30 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s7, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s6, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: buffer_load_dword v2, v5, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s7, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX8-NEXT: s_not_b32 s8, s4 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, s7, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v2 ; GFX8-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX8-NEXT: v_and_b32_e32 v3, s8, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 
s7, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, s6, v1 ; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -3684,37 +3682,36 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s7, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s6, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s4, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s7, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v0 -; GFX7-NEXT: s_not_b32 s8, s4 +; GFX7-NEXT: s_lshl_b32 s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s8, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -3724,7 +3721,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -3732,31 +3729,30 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s6, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v1, s6 -; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s4, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s7, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v0 -; GFX6-NEXT: s_not_b32 s8, s4 +; GFX6-NEXT: s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: 
v_cvt_f32_f16_e32 v5, v0 +; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s8, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -3766,7 +3762,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3786,42 +3782,42 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_lshl_b32 s5, s5, 3 +; GFX12-NEXT: v_mov_b32_e32 v3, s4 +; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s7, s6 -; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen -; GFX12-NEXT: s_mov_b32 s6, 0 +; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_f16_e32 v1, v1, v0 ; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_mov_b32_e32 v3, v1 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; 
GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3829,25 +3825,24 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s6, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: s_and_b32 s4, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s7, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX940-NEXT: s_not_b32 s8, s4 +; GFX940-NEXT: s_lshl_b32 s6, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v1, s7, v3 -; GFX940-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX940-NEXT: v_and_or_b32 v2, v3, s8, v1 -; GFX940-NEXT: v_mov_b32_e32 v6, s6 +; GFX940-NEXT: v_lshrrev_b32_e32 v2, s6, v3 +; GFX940-NEXT: v_add_f16_e32 v2, v2, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, s6, v2 +; GFX940-NEXT: v_and_or_b32 v2, v3, s7, v2 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -3863,43 +3858,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_and_b32 s5, s16, 3 -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_lshl_b32 s5, s5, 3 -; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_not_b32 s7, s6 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen -; GFX11-NEXT: s_mov_b32 s6, 0 -; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-NEXT: s_and_b32 s4, s16, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen +; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_f16_e32 v1, v1, v0 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, s5, v1 -; GFX11-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v3, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3907,59 +3900,57 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: s_and_b32 s5, s20, 3 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: s_lshl_b32 s5, s5, 3 -; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX10-NEXT: s_not_b32 s7, s6 -; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen -; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_and_b32 s4, s20, 3 +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: buffer_load_dword v2, v3, s[16:19], 0 offen +; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, s5, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 -; GFX10-NEXT: s_andn2_b32 
exec_lo, exec_lo, s6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s6, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: s_and_b32 s4, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX90A-NEXT: s_not_b32 s8, s4 +; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v1, s7, v3 -; GFX90A-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, s6 +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v3 +; GFX90A-NEXT: v_add_f16_e32 v2, v2, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s6, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -3975,30 +3966,29 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s6, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v1, s6 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v3, s4 +; GFX908-NEXT: buffer_load_dword v2, v3, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s7, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX908-NEXT: s_not_b32 s8, s4 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v1, s7, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v2 ; GFX908-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX908-NEXT: v_and_or_b32 v1, v2, s8, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v2, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4009,31 +3999,30 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s6, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: buffer_load_dword v2, v3, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s7, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX8-NEXT: s_not_b32 s8, s4 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, s7, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v2 ; GFX8-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX8-NEXT: v_and_b32_e32 v3, s8, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_and_b32_e32 v4, s7, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4044,35 +4033,34 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s6, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s4, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s7, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX7-NEXT: s_not_b32 s8, s4 +; GFX7-NEXT: s_lshl_b32 s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s8, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX7-NEXT: v_add_f32_e32 
v0, v0, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: v_mov_b32_e32 v3, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4083,36 +4071,35 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s6, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v1, s6 -; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s4, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s7, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX6-NEXT: s_not_b32 s8, s4 +; GFX6-NEXT: s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v3, s8, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB14_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4781,27 +4768,28 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: 
s_lshl_b32 s5, s5, 3 +; GFX12-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s7, s6 -; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen -; GFX12-NEXT: s_mov_b32 s6, 0 +; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX12-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0 @@ -4811,23 +4799,23 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s5, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v2 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -4835,33 +4823,32 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s6, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v1, s6 -; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s4, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s7, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX940-NEXT: s_not_b32 s8, s4 +; GFX940-NEXT: s_lshl_b32 s6, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; 
GFX940-NEXT: s_movk_i32 s9, 0x7fff +; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff ; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: v_mov_b32_e32 v5, s6 -; GFX940-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX940-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX940-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v2, v2, v0, s9 +; GFX940-NEXT: v_add3_u32 v2, v2, v0, s8 ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: s_nop 1 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4871,32 +4858,33 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX940-NEXT: s_cbranch_execnz .LBB16_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_and_b32 s5, s16, 3 -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_lshl_b32 s5, s5, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX11-NEXT: s_not_b32 s7, s6 -; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen -; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_and_b32 s4, s16, 3 +; GFX11-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen +; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: 
v_add_f32_e32 v0, v0, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0 @@ -4906,97 +4894,95 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, s5, v0 -; GFX11-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v5, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB16_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: s_and_b32 s5, s20, 3 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: s_lshl_b32 s5, s5, 3 -; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX10-NEXT: s_not_b32 s7, s6 -; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen -; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: s_and_b32 s4, s20, 3 +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; 
GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB16_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s5, v2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s6, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX90A-NEXT: s_not_b32 s8, s4 +; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX90A-NEXT: s_movk_i32 s9, 0x7fff +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s9 +; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, s6 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -5006,39 +4992,38 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s6, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v1, s6 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s7, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX908-NEXT: s_not_b32 s8, s4 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX908-NEXT: s_movk_i32 s9, 0x7fff +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v2, v2, v0, s9 +; GFX908-NEXT: v_add3_u32 v2, v2, v0, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -5048,41 +5033,40 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: s_cbranch_execnz .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s6, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s7, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX8-NEXT: s_not_b32 s8, s4 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop 
Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v2, s8, v1 +; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -5092,37 +5076,36 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s6, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s4, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s7, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX7-NEXT: s_lshl_b32 s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s8, s4 +; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s8, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -5132,7 +5115,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX7-NEXT: 
v_lshrrev_b32_e32 v0, s6, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -5140,31 +5123,30 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s6, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v1, s6 -; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s4, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s7, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX6-NEXT: s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s8, s4 +; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s8, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -5174,7 +5156,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -5192,52 +5174,53 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_lshl_b32 s5, s5, 3 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_and_b32 s4, s16, 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s7, s6 -; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen -; GFX12-NEXT: s_mov_b32 s6, 0 +; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, 
v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX12-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s5, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v3 -; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5245,33 +5228,32 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s6, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v1, s6 -; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s4, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s7, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX940-NEXT: s_not_b32 s8, s4 +; GFX940-NEXT: s_lshl_b32 s6, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX940-NEXT: s_movk_i32 s9, 0x7fff +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff ; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; 
GFX940-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX940-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v4, v4, v0, s9 +; GFX940-NEXT: v_add3_u32 v4, v4, v0, s8 ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: s_nop 1 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -5287,123 +5269,122 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_and_b32 s5, s16, 3 -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_lshl_b32 s5, s5, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX11-NEXT: s_not_b32 s7, s6 -; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen -; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: s_and_b32 s4, s16, 3 +; GFX11-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen +; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: 
v_lshlrev_b32_e32 v0, s5, v0 -; GFX11-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v3 -; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB17_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: s_and_b32 s5, s20, 3 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: s_lshl_b32 s5, s5, 3 -; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX10-NEXT: s_not_b32 s7, s6 -; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen -; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: s_and_b32 s4, s20, 3 +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; 
GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s6, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX90A-NEXT: s_not_b32 s8, s4 +; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX90A-NEXT: s_movk_i32 s9, 0x7fff +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s9 +; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -5419,37 +5400,36 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s6, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v1, s6 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v2, s4 +; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s7, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 -; 
GFX908-NEXT: s_not_b32 s8, s4 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX908-NEXT: s_movk_i32 s9, 0x7fff +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v3, v3, v0, s9 +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX908-NEXT: v_add3_u32 v4, v4, v0, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s8, v0 -; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v3, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5460,39 +5440,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s6, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s7, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX8-NEXT: s_not_b32 s8, s4 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v3, s8, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5503,35 +5482,34 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s6, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s4, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s7, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX7-NEXT: s_lshl_b32 s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s8, s4 +; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s8, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 
-; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5542,36 +5520,35 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s6, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v1, s6 -; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s4, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s7, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX6-NEXT: s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s8, s4 +; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v3, s8, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6316,24 +6293,25 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, 
s[0:3], 0 offen glc +; GFX11-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -6347,23 +6325,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB19_1 @@ -6388,18 +6366,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB19_1 @@ -6415,20 +6393,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: 
v_add_f16_sdwa v0, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v1, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v1, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB19_1 @@ -6451,30 +6429,30 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v0 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 @@ -6497,31 +6475,31 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 
-; GFX6-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB19_1 @@ -6563,23 +6541,25 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x400 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v3, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -6592,23 +6572,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], 
v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB20_1 @@ -6641,21 +6621,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v3, v2, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_add_f16_e32 v4, v2, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6666,41 +6646,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_mov_b32_e32 v7, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v0 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_or_b32_e32 v4, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB20_1 @@ -6712,42 +6692,42 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_mov_b32_e32 v7, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v4, v4, v0 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_or_b32_e32 v4, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v3, v5, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: 
v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB20_1 @@ -7288,24 +7268,25 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -7319,23 +7300,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB22_1 @@ -7351,12 +7332,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; 
GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -7377,18 +7358,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 @@ -7404,20 +7385,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_add_f16_sdwa v0, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v1, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v1, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 @@ -7440,30 +7421,30 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, s6 -; 
GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v0 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB22_1 @@ -7486,31 +7467,31 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB22_1 @@ -7552,23 +7533,25 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX11: ; %bb.0: ; GFX11-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x400 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v3, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -7581,23 +7564,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB23_1 @@ -7612,11 +7595,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v2, v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -7637,19 +7620,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; 
GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v2, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7663,21 +7646,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v3, v2, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_add_f16_e32 v4, v2, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7688,41 +7671,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_mov_b32_e32 v7, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: 
v_cvt_f32_f16_e32 v5, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v0 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_or_b32_e32 v4, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB23_1 @@ -7734,42 +7717,42 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_mov_b32_e32 v7, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v4, v4, v0 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_or_b32_e32 v4, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v3, v5, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 +; GFX6-NEXT: v_mov_b32_e32 
v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB23_1 @@ -7810,24 +7793,25 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -7841,23 +7825,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; 
GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB24_1 @@ -7873,12 +7857,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -7899,18 +7883,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB24_1 @@ -7926,20 +7910,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_add_f16_sdwa v0, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v1, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v1, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB24_1 @@ -7962,30 +7946,30 @@ 
define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v0 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB24_1 @@ -8008,31 +7992,31 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 
v1, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB24_1 @@ -8074,23 +8058,25 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x400 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v3, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -8103,23 +8089,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB25_1 @@ -8134,11 +8120,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v2, v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 
v1, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -8159,19 +8145,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v2, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB25_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8185,21 +8171,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v3, v2, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_add_f16_e32 v4, v2, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB25_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8210,41 +8196,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: 
v_cvt_f32_f16_e32 v3, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_mov_b32_e32 v7, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v0 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_or_b32_e32 v4, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB25_1 @@ -8256,42 +8242,42 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_mov_b32_e32 v7, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v4, v4, v0 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_or_b32_e32 v4, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 
v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v3, v5, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB25_1 @@ -8326,41 +8312,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s8, s16, 0x400 +; GFX940-NEXT: s_add_i32 s4, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: s_movk_i32 s9, 0x7fff -; GFX940-NEXT: s_mov_b32 s10, 0x7060302 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX940-NEXT: s_mov_b32 s9, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 ; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX940-NEXT: v_add_f32_e32 v1, v4, v1 -; GFX940-NEXT: v_add_f32_e32 v0, v6, v0 -; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX940-NEXT: v_add3_u32 v7, v7, v0, s9 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s8 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] -; GFX940-NEXT: v_perm_b32 v4, v1, v0, s10 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; 
GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] +; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7] +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_cbranch_execnz .LBB26_1 @@ -8371,45 +8357,47 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s6, s16, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v1, 0xffff0000, v2 +; GFX11-NEXT: v_mov_b32_e32 v6, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX11-NEXT: v_add_f32_e32 v1, v5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v0, v3, v0 :: v_dual_cndmask_b32 v1, v5, v7 -; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-NEXT: v_mov_b32_e32 v5, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 +; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX11-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -8422,41 +8410,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s6, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v0, v3, v0 -; GFX10-NEXT: v_add_f32_e32 v1, v5, v1 -; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 -; GFX10-NEXT: v_mov_b32_e32 v5, s6 -; GFX10-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v0, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; 
GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1 @@ -8467,40 +8455,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 +; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: s_movk_i32 s9, 0x7fff -; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v1, v4, v1 -; GFX90A-NEXT: v_add_f32_e32 v0, v6, v0 -; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s9 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v4, v1, v0, s10 -; GFX90A-NEXT: v_mov_b32_e32 v3, s8 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 @@ -8511,41 +8499,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: 
v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s8, s20, 0x400 +; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: s_movk_i32 s9, 0x7fff -; GFX908-NEXT: s_mov_b32 s10, 0x7060302 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX908-NEXT: v_add_f32_e32 v0, v6, v0 -; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v3, v3, v1, s9 -; GFX908-NEXT: v_add3_u32 v7, v7, v0, s9 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v3, v1, v0, s10 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s8 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v6, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX908-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9 +; GFX908-NEXT: v_mov_b32_e32 v0, v5 +; GFX908-NEXT: v_mov_b32_e32 v1, v6 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 @@ -8556,42 +8544,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s8, s20, 0x400 +; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX8-NEXT: v_add_f32_e32 v0, v6, v0 -; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v3, v1, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s8 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, v5 +; GFX8-NEXT: v_mov_b32_e32 v1, v6 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 @@ -8603,38 +8591,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; 
GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX7-NEXT: v_mov_b32_e32 v6, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8645,39 +8633,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX6-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX6-NEXT: v_mov_b32_e32 v6, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; 
GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB26_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8708,40 +8696,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s8, s16, 0x400 +; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_add_i32 s4, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: s_movk_i32 s9, 0x7fff -; GFX940-NEXT: s_mov_b32 s10, 0x7060302 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX940-NEXT: s_mov_b32 s9, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 ; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX940-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX940-NEXT: v_add_f32_e32 v2, v5, v4 -; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s9 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v6, s8 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX940-NEXT: v_perm_b32 v2, v2, v1, s10 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] +; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_cbranch_execnz .LBB27_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8751,45 +8739,43 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s6, s16, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX11-NEXT: v_add_f32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v1, v1 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo -; GFX11-NEXT: v_mov_b32_e32 v5, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ 
-8803,39 +8789,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s6, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v4 -; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v5, s6 -; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1 @@ -8847,39 +8833,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: s_movk_i32 s9, 0x7fff -; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 
v3, 0xffff0000, v0 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX90A-NEXT: v_add_f32_e32 v2, v5, v4 -; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s9 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v2, v2, v1, s10 -; GFX90A-NEXT: v_mov_b32_e32 v6, s8 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8890,40 +8876,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s8, s20, 0x400 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: s_movk_i32 s9, 0x7fff -; GFX908-NEXT: s_mov_b32 s10, 0x7060302 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX908-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v4 -; GFX908-NEXT: 
v_bfe_u32 v4, v1, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s9 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX908-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v1, v3, v1, s10 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v6, s8 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX908-NEXT: v_mov_b32_e32 v6, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8934,41 +8920,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s8, s20, 0x400 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v4 -; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v1, v3, v1, 16 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v6, s8 -; GFX8-NEXT: 
v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v6, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8979,38 +8965,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v0 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 
v4, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9021,39 +9007,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_add_f32_e32 v4, v4, v0 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB27_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9106,7 +9092,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v10, 0x400, v4 +; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -9118,40 +9104,40 @@ define <2 x bfloat> 
@buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024 +; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB28_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX940-NEXT: s_movk_i32 s10, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX940-NEXT: s_mov_b32 s11, 0x7060302 ; GFX940-NEXT: .LBB28_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 ; GFX940-NEXT: ; Child Loop BB28_4 Depth 2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; GFX940-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s10 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX940-NEXT: v_add_f32_e32 v4, v4, v9 +; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX940-NEXT: s_mov_b64 s[8:9], exec ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 -; GFX940-NEXT: v_add_f32_e32 v6, v7, v6 -; GFX940-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX940-NEXT: v_add3_u32 v7, v7, v6, s10 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v6 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v10 +; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX940-NEXT: v_add3_u32 v6, v6, v5, s10 +; GFX940-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc -; GFX940-NEXT: v_perm_b32 v8, v6, v4, s11 -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[8:9] +; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc +; GFX940-NEXT: v_perm_b32 v6, v5, v4, s11 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX940-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -9164,27 +9150,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB28_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v9, v6 +; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB28_3 ; GFX940-NEXT: ; 
%bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 @@ -9198,41 +9184,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024 +; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB28_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB28_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB28_4 Depth 2 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v6, v10, v7 -; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v5, v5, v9 :: v_dual_add_f32 v4, v4, v8 +; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v4, 0x7fff -; GFX11-NEXT: v_add3_u32 v10, v10, v6, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX11-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v7, v6, v4, 0x7060302 -; GFX11-NEXT: v_mov_b32_e32 v6, v7 -; GFX11-NEXT: 
v_mov_b32_e32 v7, v8 +; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 +; GFX11-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-NEXT: v_mov_b32_e32 v5, v6 ; GFX11-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 @@ -9246,14 +9233,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB28_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 -; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -9261,14 +9248,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_cbranch_execnz .LBB28_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v6 +; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 @@ -9280,38 +9268,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB28_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX10-NEXT: .LBB28_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB28_4 Depth 2 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX10-NEXT: v_add_f32_e32 v6, v10, v7 -; GFX10-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX10-NEXT: v_add_f32_e32 v4, v4, v8 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v9 +; GFX10-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 
v12, 0x400000, v4 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v4, 0x7fff -; GFX10-NEXT: v_add3_u32 v10, v10, v6, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo -; GFX10-NEXT: v_perm_b32 v7, v6, v4, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v6, v7 -; GFX10-NEXT: v_mov_b32_e32 v7, v8 +; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX10-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX10-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v5, v6 ; GFX10-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -9323,15 +9311,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB28_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 -; GFX10-NEXT: v_mov_b32_e32 v8, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -9340,13 +9328,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_cbranch_execnz .LBB28_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, v6 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4 +; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -9358,38 +9346,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX90A-NEXT: s_mov_b32 s15, 0x7060302 ; GFX90A-NEXT: .LBB28_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB28_4 Depth 2 -; 
GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s14 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 -; GFX90A-NEXT: v_add_f32_e32 v6, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX90A-NEXT: v_add3_u32 v7, v7, v6, s14 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v8, v6, v4, s15 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX90A-NEXT: v_add_f32_e32 v4, v4, v9 +; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v10 +; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s14 +; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -9401,27 +9389,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB28_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v9, v6 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB28_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 +; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -9433,39 +9421,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: 
buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB28_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX908-NEXT: s_movk_i32 s14, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX908-NEXT: s_mov_b32 s15, 0x7060302 ; GFX908-NEXT: .LBB28_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB28_4 Depth 2 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX908-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s14 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX908-NEXT: v_add_f32_e32 v4, v4, v8 +; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v4 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 -; GFX908-NEXT: v_add_f32_e32 v6, v7, v6 -; GFX908-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX908-NEXT: v_add3_u32 v7, v7, v6, s14 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cndmask_b32_e32 v6, v7, v10, vcc -; GFX908-NEXT: v_perm_b32 v7, v6, v4, s15 -; GFX908-NEXT: v_mov_b32_e32 v6, v7 +; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v9 +; GFX908-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX908-NEXT: v_add3_u32 v10, v10, v5, s14 +; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v4, s15 +; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v7, v8 +; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -9477,27 +9465,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB28_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v8, v6 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB28_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -9509,40 +9497,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB28_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX8-NEXT: .LBB28_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB28_4 Depth 2 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX8-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX8-NEXT: v_add_f32_e32 v4, v4, v8 +; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 -; GFX8-NEXT: v_add_f32_e32 v6, v7, v6 -; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v10, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v7, v6, v4, 16 -; GFX8-NEXT: v_mov_b32_e32 v6, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v9 +; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 +; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v7, v8 +; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -9554,21 +9542,21 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, 
s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB28_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v8, v6 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB28_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -9742,41 +9730,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s8, s16, 0x400 +; GFX940-NEXT: s_add_i32 s4, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: s_movk_i32 s9, 0x7fff -; GFX940-NEXT: s_mov_b32 s10, 0x7060302 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX940-NEXT: s_mov_b32 s9, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 ; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX940-NEXT: v_add_f32_e32 v1, v4, v1 -; GFX940-NEXT: v_add_f32_e32 v0, v6, v0 -; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX940-NEXT: v_add3_u32 v7, v7, v0, s9 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s8 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] -; GFX940-NEXT: v_perm_b32 v4, v1, v0, s10 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] +; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7] +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 ; GFX940-NEXT: 
s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_cbranch_execnz .LBB29_1 @@ -9787,45 +9775,47 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s6, s16, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v1, 0xffff0000, v2 +; GFX11-NEXT: v_mov_b32_e32 v6, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX11-NEXT: v_add_f32_e32 v1, v5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v0, v3, v0 :: v_dual_cndmask_b32 v1, v5, v7 -; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-NEXT: v_mov_b32_e32 v5, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 +; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen 
glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -9838,41 +9828,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s6, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v0, v3, v0 -; GFX10-NEXT: v_add_f32_e32 v1, v5, v1 -; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 -; GFX10-NEXT: v_mov_b32_e32 v5, s6 -; GFX10-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v0, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB29_1 @@ -9883,40 +9873,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; 
GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 +; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: s_movk_i32 s9, 0x7fff -; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v1, v4, v1 -; GFX90A-NEXT: v_add_f32_e32 v0, v6, v0 -; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s9 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v4, v1, v0, s10 -; GFX90A-NEXT: v_mov_b32_e32 v3, s8 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 @@ -9927,41 +9917,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s8, s20, 0x400 +; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: s_movk_i32 s9, 0x7fff -; GFX908-NEXT: s_mov_b32 s10, 0x7060302 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: 
.LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX908-NEXT: v_add_f32_e32 v0, v6, v0 -; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v3, v3, v1, s9 -; GFX908-NEXT: v_add3_u32 v7, v7, v0, s9 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v3, v1, v0, s10 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s8 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v6, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX908-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9 +; GFX908-NEXT: v_mov_b32_e32 v0, v5 +; GFX908-NEXT: v_mov_b32_e32 v1, v6 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB29_1 @@ -9972,42 +9962,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s8, s20, 0x400 +; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX8-NEXT: v_add_f32_e32 v0, v6, v0 -; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 
0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v3, v1, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s8 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, v5 +; GFX8-NEXT: v_mov_b32_e32 v1, v6 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB29_1 @@ -10019,38 +10009,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 
0 offen glc +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX7-NEXT: v_mov_b32_e32 v6, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB29_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10061,39 +10051,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX6-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX6-NEXT: v_mov_b32_e32 v6, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB29_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10124,40 +10114,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: 
buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s8, s16, 0x400 +; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_add_i32 s4, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: s_movk_i32 s9, 0x7fff -; GFX940-NEXT: s_mov_b32 s10, 0x7060302 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX940-NEXT: s_mov_b32 s9, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 ; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX940-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX940-NEXT: v_add_f32_e32 v2, v5, v4 -; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s9 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v6, s8 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX940-NEXT: v_perm_b32 v2, v2, v1, s10 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] +; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_cbranch_execnz .LBB30_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10167,45 +10157,43 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s6, s16, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_set_inst_prefetch_distance 
0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX11-NEXT: v_add_f32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v1, v1 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo -; GFX11-NEXT: v_mov_b32_e32 v5, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -10219,39 +10207,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s6, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: 
Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v4 -; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v5, s6 -; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB30_1 @@ -10263,39 +10251,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: s_movk_i32 s9, 0x7fff -; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX90A-NEXT: v_add_f32_e32 v2, v5, v4 -; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; 
GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s9 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v2, v2, v1, s10 -; GFX90A-NEXT: v_mov_b32_e32 v6, s8 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10306,40 +10294,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s8, s20, 0x400 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: s_movk_i32 s9, 0x7fff -; GFX908-NEXT: s_mov_b32 s10, 0x7060302 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX908-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v4 -; GFX908-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s9 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX908-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v1, v3, v1, s10 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v6, s8 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc +; GFX908-NEXT: 
v_lshlrev_b32_e32 v0, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX908-NEXT: v_mov_b32_e32 v6, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10350,41 +10338,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s8, s20, 0x400 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v4 -; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v1, v3, v1, 16 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v6, s8 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 
s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v6, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10395,38 +10383,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v0 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB30_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10437,39 +10425,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 
+; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_add_f32_e32 v4, v4, v0 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB30_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10499,41 +10487,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s8, s16, 0x400 +; GFX940-NEXT: s_add_i32 s4, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: s_movk_i32 s9, 0x7fff -; GFX940-NEXT: s_mov_b32 s10, 0x7060302 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX940-NEXT: s_mov_b32 s9, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 ; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX940-NEXT: v_add_f32_e32 v1, v4, 
v1 -; GFX940-NEXT: v_add_f32_e32 v0, v6, v0 -; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX940-NEXT: v_add3_u32 v7, v7, v0, s9 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s8 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] -; GFX940-NEXT: v_perm_b32 v4, v1, v0, s10 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] +; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7] +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_cbranch_execnz .LBB31_1 @@ -10544,45 +10532,47 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s6, s16, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v1, 0xffff0000, v2 +; GFX11-NEXT: v_mov_b32_e32 v6, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX11-NEXT: v_add_f32_e32 v1, v5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; 
GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v0, v3, v0 :: v_dual_cndmask_b32 v1, v5, v7 -; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-NEXT: v_mov_b32_e32 v5, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 +; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -10595,41 +10585,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s6, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v0, v3, v0 -; GFX10-NEXT: v_add_f32_e32 v1, v5, v1 -; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 
v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 -; GFX10-NEXT: v_mov_b32_e32 v5, s6 -; GFX10-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v0, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB31_1 @@ -10640,40 +10630,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 +; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: s_movk_i32 s9, 0x7fff -; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v1, v4, v1 -; GFX90A-NEXT: v_add_f32_e32 v0, v6, v0 -; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s9 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v4, v1, v0, s10 -; GFX90A-NEXT: v_mov_b32_e32 v3, s8 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 
0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 @@ -10684,41 +10674,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s8, s20, 0x400 +; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: s_movk_i32 s9, 0x7fff -; GFX908-NEXT: s_mov_b32 s10, 0x7060302 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX908-NEXT: v_add_f32_e32 v0, v6, v0 -; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v3, v3, v1, s9 -; GFX908-NEXT: v_add3_u32 v7, v7, v0, s9 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v3, v1, v0, s10 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s8 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v6, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX908-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9 +; GFX908-NEXT: v_mov_b32_e32 v0, v5 +; GFX908-NEXT: v_mov_b32_e32 v1, v6 
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB31_1 @@ -10729,42 +10719,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s8, s20, 0x400 +; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX8-NEXT: v_add_f32_e32 v0, v6, v0 -; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v3, v1, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s8 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, v5 +; GFX8-NEXT: v_mov_b32_e32 v1, v6 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB31_1 @@ -10776,38 +10766,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX7: 
; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX7-NEXT: v_mov_b32_e32 v6, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB31_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10818,39 +10808,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: 
v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX6-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX6-NEXT: v_mov_b32_e32 v6, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB31_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10881,40 +10871,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s8, s16, 0x400 +; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_add_i32 s4, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: s_movk_i32 s9, 0x7fff -; GFX940-NEXT: s_mov_b32 s10, 0x7060302 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX940-NEXT: s_mov_b32 s9, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 ; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX940-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX940-NEXT: v_add_f32_e32 v2, v5, v4 -; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s9 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v6, s8 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX940-NEXT: v_perm_b32 v2, v2, v1, s10 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; 
GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] +; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_cbranch_execnz .LBB32_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10924,45 +10914,43 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s6, s16, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX11-NEXT: v_add_f32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v1, v1 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo -; GFX11-NEXT: 
v_mov_b32_e32 v5, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -10976,39 +10964,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s6, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v4 -; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v5, s6 -; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen 
glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB32_1 @@ -11020,39 +11008,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: s_movk_i32 s9, 0x7fff -; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX90A-NEXT: v_add_f32_e32 v2, v5, v4 -; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s9 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v2, v2, v1, s10 -; GFX90A-NEXT: v_mov_b32_e32 v6, s8 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11063,40 +11051,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX908: ; %bb.0: ; 
GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s8, s20, 0x400 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: s_movk_i32 s9, 0x7fff -; GFX908-NEXT: s_mov_b32 s10, 0x7060302 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX908-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v4 -; GFX908-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s9 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX908-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v1, v3, v1, s10 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v6, s8 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX908-NEXT: v_mov_b32_e32 v6, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB32_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11107,41 +11095,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s8, s20, 0x400 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB32_1: ; 
%atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v4 -; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v1, v3, v1, 16 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v6, s8 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v6, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB32_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11152,38 +11140,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, 
v2 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v0 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB32_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11194,39 +11182,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_add_f32_e32 v4, v4, v0 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; 
GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB32_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11257,40 +11245,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s8, s16, 0x400 +; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_add_i32 s4, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: s_movk_i32 s9, 0x7fff -; GFX940-NEXT: s_mov_b32 s10, 0x7060302 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX940-NEXT: s_mov_b32 s9, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 ; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX940-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX940-NEXT: v_add_f32_e32 v2, v5, v4 -; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s9 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v6, s8 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX940-NEXT: v_perm_b32 v2, v2, v1, s10 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] +; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_cbranch_execnz .LBB33_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11300,45 +11288,43 @@ define void 
@buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s6, s16, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX11-NEXT: v_add_f32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v1, v1 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo -; GFX11-NEXT: v_mov_b32_e32 v5, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-NEXT: s_or_b32 s5, 
vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -11352,39 +11338,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s6, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v4 -; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v5, s6 -; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB33_1 @@ -11396,39 +11382,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: s_movk_i32 s9, 0x7fff -; GFX90A-NEXT: s_mov_b32 s10, 
0x7060302 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX90A-NEXT: v_add_f32_e32 v2, v5, v4 -; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s9 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v2, v2, v1, s10 -; GFX90A-NEXT: v_mov_b32_e32 v6, s8 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11439,40 +11425,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s8, s20, 0x400 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: s_movk_i32 s9, 0x7fff -; GFX908-NEXT: s_mov_b32 s10, 0x7060302 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX908-NEXT: v_and_b32_e32 
v5, 0xffff0000, v2 -; GFX908-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v4 -; GFX908-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s9 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX908-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v1, v3, v1, s10 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v6, s8 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX908-NEXT: v_mov_b32_e32 v6, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11483,41 +11469,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s8, s20, 0x400 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v4 -; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: 
v_alignbit_b32 v1, v3, v1, 16 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v6, s8 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v6, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11528,38 +11514,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v0 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, 
s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB33_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11570,39 +11556,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_add_f32_e32 v4, v4, v0 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB33_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11660,23 +11646,23 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; 
GFX10-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB34_1 @@ -11692,12 +11678,12 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc @@ -11720,18 +11706,18 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB34_1 @@ -11747,18 +11733,18 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_add_f32_e32 v4, 
v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB34_1 @@ -11774,18 +11760,18 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB34_1 @@ -11801,19 +11787,19 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB34_1 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index b00f8aecbd2f5..c7511a2df9fe1 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -31,19 +31,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: 
s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 +; GFX940-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 @@ -82,19 +82,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -110,25 +110,25 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v0, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v0, v1 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB0_1 @@ -145,19 +145,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; 
GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB0_1 @@ -207,24 +207,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v1, v0, v0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v6, s6 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX940-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX940-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_cbranch_execnz .LBB1_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -257,23 +257,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v1, v0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, s6 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, 
v0, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -284,24 +284,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v1, v0, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v1, v3, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX908-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -316,20 +316,20 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -400,7 +400,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: 
v_add_u32_e32 v10, 0x400, v4 +; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -412,22 +412,22 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024 +; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB2_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_max_f32_e32 v9, v5, v5 ; GFX940-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 ; GFX940-NEXT: ; Child Loop BB2_4 Depth 2 -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v6, v9, v9 -; GFX940-NEXT: v_max_f32_e32 v8, v6, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v7, v7 +; GFX940-NEXT: v_max_f32_e32 v6, v4, v9 ; GFX940-NEXT: s_mov_b64 s[8:9], exec -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[8:9] +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 @@ -441,21 +441,21 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB2_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v9, v6 +; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB2_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -520,7 +520,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4 +; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -532,22 +532,22 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v7, v4, 
s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_max_f32_e32 v9, v5, v5 ; GFX90A-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB2_4 Depth 2 -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v6, v9, v9 -; GFX90A-NEXT: v_max_f32_e32 v8, v6, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v7, v7 +; GFX90A-NEXT: v_max_f32_e32 v6, v4, v9 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -559,27 +559,27 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v9, v6 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 +; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -591,23 +591,23 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_max_f32_e32 v8, v5, v5 ; GFX908-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB2_4 Depth 2 -; GFX908-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v6, v8, v8 -; GFX908-NEXT: v_max_f32_e32 v7, v6, v4 -; GFX908-NEXT: v_mov_b32_e32 v6, v7 +; GFX908-NEXT: v_max_f32_e32 v4, v6, v6 +; GFX908-NEXT: v_max_f32_e32 v5, v4, v8 +; GFX908-NEXT: v_mov_b32_e32 v4, 
v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v7, v8 +; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -619,21 +619,21 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v8, v6 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB2_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -771,19 +771,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 +; GFX940-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 @@ -800,28 +800,27 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x400 -; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v0 -; GFX11-NEXT: v_max_f32_e32 v0, v2, v2 +; GFX11-NEXT: 
v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v1, v4, v4 -; GFX11-NEXT: v_max_f32_e32 v3, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX11-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v0, v3 -; GFX11-NEXT: v_mov_b32_e32 v1, v4 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -833,27 +832,27 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_max_f32_e32 v0, v2, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v1, v4, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX10-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB3_1 @@ -864,19 +863,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 
v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -892,25 +891,25 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v0, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v0, v1 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 @@ -927,19 +926,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB3_1 @@ -956,19 +955,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; 
GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GFX7-NEXT: v_max_f32_e32 v3, v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX7-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB3_1 @@ -985,20 +984,20 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GFX6-NEXT: v_max_f32_e32 v3, v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX6-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v0, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB3_1 @@ -1029,19 +1028,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 +; GFX940-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 @@ -1080,19 +1079,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; 
GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1108,25 +1107,25 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v0, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v0, v1 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 @@ -1143,19 +1142,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; 
GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB4_1 @@ -1198,29 +1197,30 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 -; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[0:1] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v10, s5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1243,30 +1243,30 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x800 -; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: 
v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v10, s5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1298,29 +1298,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v1 -; GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX908-NEXT: v_mov_b32_e32 v10, s6 -; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 -; GFX908-NEXT: v_mov_b32_e32 v1, v7 -; GFX908-NEXT: v_mov_b32_e32 v2, v8 -; GFX908-NEXT: v_mov_b32_e32 v3, v9 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB5_1 @@ -1331,29 +1331,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX8-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_mov_b32_e32 v10, s6 -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-NEXT: v_mov_b32_e32 v1, v7 -; GFX8-NEXT: v_mov_b32_e32 v2, v8 -; GFX8-NEXT: v_mov_b32_e32 v3, v9 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB5_1 @@ -1393,27 +1393,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, s16 -; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[0:1], v[0:1] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, v5 -; GFX12-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-NEXT: v_mov_b32_e32 v6, v2 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen 
th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1437,28 +1437,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v2, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x800 +; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f64 v[2:3], v[6:7], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, v5 -; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v6, v2 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], 0 offen glc +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1491,27 +1490,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, s20 -; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v10, s6 -; GFX908-NEXT: v_max_f64 v[2:3], v[6:7], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v9, v5 -; GFX908-NEXT: v_mov_b32_e32 v8, v4 -; GFX908-NEXT: v_mov_b32_e32 v7, v3 -; GFX908-NEXT: v_mov_b32_e32 v6, v2 -; GFX908-NEXT: 
buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v10, v3 +; GFX908-NEXT: v_mov_b32_e32 v9, v2 +; GFX908-NEXT: v_mov_b32_e32 v8, v1 +; GFX908-NEXT: v_mov_b32_e32 v7, v0 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v6 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v2, v7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v7 +; GFX908-NEXT: v_mov_b32_e32 v3, v8 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1522,27 +1521,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v10, s6 -; GFX8-NEXT: v_max_f64 v[2:3], v[6:7], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v9, v5 -; GFX8-NEXT: v_mov_b32_e32 v8, v4 -; GFX8-NEXT: v_mov_b32_e32 v7, v3 -; GFX8-NEXT: v_mov_b32_e32 v6, v2 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v10, v3 +; GFX8-NEXT: v_mov_b32_e32 v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, v1 +; GFX8-NEXT: v_mov_b32_e32 v7, v0 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v6 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, v7 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v7 +; GFX8-NEXT: v_mov_b32_e32 v3, v8 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1606,17 +1605,17 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[5:6], v[5:6] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB7_4 Depth 2 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[5:6], v[5:6] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[13:14], v[13:14] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[11:12], v[2:3], v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[11:12], 
v[0:1], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 @@ -1710,17 +1709,17 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: s_cbranch_execnz .LBB7_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB7_4 Depth 2 -; GFX11-NEXT: v_max_f64 v[0:1], v[5:6], v[5:6] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[13:14], v[13:14] +; GFX11-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[11:12], v[2:3], v[0:1] +; GFX11-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] ; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX11-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 @@ -1838,15 +1837,15 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB7_4 Depth 2 -; GFX908-NEXT: v_max_f64 v[0:1], v[5:6], v[5:6] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[13:14], v[13:14] +; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_max_f64 v[11:12], v[2:3], v[0:1] +; GFX908-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v11 ; GFX908-NEXT: v_mov_b32_e32 v1, v12 ; GFX908-NEXT: v_mov_b32_e32 v2, v13 @@ -1904,15 +1903,15 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB7_4 Depth 2 -; GFX8-NEXT: v_max_f64 v[0:1], v[5:6], v[5:6] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[13:14], v[13:14] +; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_max_f64 v[11:12], v[2:3], v[0:1] +; GFX8-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v11 ; GFX8-NEXT: v_mov_b32_e32 v1, v12 ; GFX8-NEXT: v_mov_b32_e32 v2, v13 @@ -2012,29 +2011,30 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 -; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, 
s[0:3], null offen offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[0:1] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v10, s5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2057,30 +2057,30 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x800 -; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v10, s5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: 
v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2092,31 +2092,31 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: s_add_i32 s5, s20, 0x800 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: s_add_i32 s4, s20, 0x800 +; GFX10-NEXT: v_mov_b32_e32 v6, s4 ; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v9, v1 -; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v10, s5 +; GFX10-NEXT: v_mov_b32_e32 v10, v1 +; GFX10-NEXT: v_mov_b32_e32 v9, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v0, v6 -; GFX10-NEXT: v_mov_b32_e32 v1, v7 -; GFX10-NEXT: v_mov_b32_e32 v2, v8 -; GFX10-NEXT: v_mov_b32_e32 v3, v9 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX10-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX10-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, v7 +; GFX10-NEXT: v_mov_b32_e32 v1, v8 +; GFX10-NEXT: v_mov_b32_e32 v2, v9 +; GFX10-NEXT: v_mov_b32_e32 v3, v10 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB8_1 @@ -2127,26 +2127,26 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x800 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX90A-NEXT: v_mov_b32_e32 v6, s6 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 
; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX90A-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX90A-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v10, s6 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[8:9], v[8:9] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] +; GFX90A-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 @@ -2157,29 +2157,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v1 -; GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX908-NEXT: v_mov_b32_e32 v10, s6 -; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 -; GFX908-NEXT: v_mov_b32_e32 v1, v7 -; GFX908-NEXT: v_mov_b32_e32 v2, v8 -; GFX908-NEXT: v_mov_b32_e32 v3, v9 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB8_1 @@ -2190,29 +2190,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; 
GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_mov_b32_e32 v10, s6 -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-NEXT: v_mov_b32_e32 v1, v7 -; GFX8-NEXT: v_mov_b32_e32 v2, v8 -; GFX8-NEXT: v_mov_b32_e32 v3, v9 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB8_1 @@ -2223,29 +2223,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s20 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX7-NEXT: s_add_i32 s6, s20, 0x800 +; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX7-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX7-NEXT: v_mov_b32_e32 v10, s6 -; GFX7-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, v6 -; GFX7-NEXT: v_mov_b32_e32 v1, v7 -; GFX7-NEXT: v_mov_b32_e32 v2, v8 -; GFX7-NEXT: v_mov_b32_e32 v3, v9 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v10, v1 +; GFX7-NEXT: v_mov_b32_e32 v9, v0 +; GFX7-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX7-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v7 +; GFX7-NEXT: v_mov_b32_e32 v1, v8 +; GFX7-NEXT: v_mov_b32_e32 v2, v9 +; GFX7-NEXT: v_mov_b32_e32 v3, v10 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB8_1 @@ -2256,30 +2256,30 @@ define double 
@buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v0, s20 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 +; GFX6-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v6, s6 ; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v9, v1 -; GFX6-NEXT: v_mov_b32_e32 v8, v0 +; GFX6-NEXT: v_mov_b32_e32 v10, v1 +; GFX6-NEXT: v_mov_b32_e32 v9, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX6-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX6-NEXT: v_mov_b32_e32 v10, s6 -; GFX6-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX6-NEXT: v_mov_b32_e32 v0, v6 -; GFX6-NEXT: v_mov_b32_e32 v1, v7 -; GFX6-NEXT: v_mov_b32_e32 v2, v8 -; GFX6-NEXT: v_mov_b32_e32 v3, v9 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX6-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX6-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX6-NEXT: v_mov_b32_e32 v0, v7 +; GFX6-NEXT: v_mov_b32_e32 v1, v8 +; GFX6-NEXT: v_mov_b32_e32 v2, v9 +; GFX6-NEXT: v_mov_b32_e32 v3, v10 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB8_1 @@ -2300,29 +2300,30 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 -; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[0:1] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v10, s5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2345,30 +2346,30 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x800 -; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v10, s5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2400,29 +2401,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v1 -; GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX908-NEXT: v_mov_b32_e32 v10, s6 -; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 -; GFX908-NEXT: v_mov_b32_e32 v1, v7 -; GFX908-NEXT: v_mov_b32_e32 v2, v8 -; GFX908-NEXT: v_mov_b32_e32 v3, v9 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB9_1 @@ -2433,29 +2434,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_mov_b32_e32 v10, s6 -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-NEXT: v_mov_b32_e32 v1, v7 -; GFX8-NEXT: v_mov_b32_e32 v2, v8 -; GFX8-NEXT: v_mov_b32_e32 v3, v9 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 
v[0:1], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB9_1 @@ -2499,47 +2500,47 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 +; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_lshl_b32 s5, s5, 3 +; GFX12-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s7, s6 -; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen -; GFX12-NEXT: s_mov_b32 s6, 0 +; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v1, s5, v2 -; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1 -; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 -; GFX12-NEXT: v_mov_b32_e32 v3, v1 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v3 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2547,263 +2548,256 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; 
GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s6, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v1, s6 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s4, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s7, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX940-NEXT: s_not_b32 s8, s4 +; GFX940-NEXT: s_lshl_b32 s6, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v1, s7, v3 -; GFX940-NEXT: v_max_f16_e32 v2, v0, v0 -; GFX940-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX940-NEXT: v_max_f16_e32 v1, v1, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX940-NEXT: v_and_or_b32 v2, v3, s8, v1 -; GFX940-NEXT: v_mov_b32_e32 v6, s6 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX940-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX940-NEXT: v_max_f16_e32 v0, v0, v5 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_cbranch_execnz .LBB10_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s7, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_and_b32 s5, s16, 3 -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_lshl_b32 s5, s5, 3 -; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_and_b32 s4, s16, 3 +; GFX11-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_not_b32 s7, s6 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen -; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen +; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, s5, v2 -; GFX11-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX11-NEXT: v_max_f16_e32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, s5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v3, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: v_max_f16_e32 v0, v0, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB10_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v3 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 +; GFX10-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: s_and_b32 s5, s20, 3 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: s_lshl_b32 s5, s5, 3 -; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX10-NEXT: s_not_b32 s7, s6 -; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen -; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: s_and_b32 s4, s20, 3 +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, s5, v2 -; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX10-NEXT: v_max_f16_e32 v1, v1, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v5 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; 
GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s5, v3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s6, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX90A-NEXT: s_not_b32 s8, s4 +; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v1, s7, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v0, v0 -; GFX90A-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX90A-NEXT: v_max_f16_e32 v1, v1, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, s6 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX90A-NEXT: v_max_f16_e32 v0, v0, v5 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s7, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s6, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 
v1, s6 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s7, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX908-NEXT: s_not_b32 s8, s4 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v1, s7, v2 -; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX908-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX908-NEXT: v_max_f16_e32 v1, v1, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX908-NEXT: v_and_or_b32 v1, v2, s8, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX908-NEXT: v_max_f16_e32 v0, v0, v5 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s7, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s6, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s7, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX8-NEXT: s_not_b32 s8, s4 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, s7, v2 -; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v3 -; GFX8-NEXT: v_and_b32_e32 v4, s8, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v5 +; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: 
buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s7, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s6, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s4, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s7, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v0 -; GFX7-NEXT: s_not_b32 s8, s4 +; GFX7-NEXT: s_lshl_b32 s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s8, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -2813,7 +2807,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -2821,31 +2815,30 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s6, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v1, s6 -; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s4, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s7, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v0 -; GFX6-NEXT: s_not_b32 s8, s4 +; GFX6-NEXT: 
s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s8, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX6-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -2855,7 +2848,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: s_cbranch_execnz .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2873,46 +2866,46 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 +; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_lshl_b32 s5, s5, 3 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s7, s6 -; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen -; GFX12-NEXT: s_mov_b32 s6, 0 +; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v1, s5, v2 -; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1 -; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: 
v_and_or_b32 v1, v2, s7, v1 -; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 -; GFX12-NEXT: v_mov_b32_e32 v3, v1 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2920,32 +2913,31 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s6, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v1, s6 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s4, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s7, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX940-NEXT: s_not_b32 s8, s4 +; GFX940-NEXT: s_lshl_b32 s6, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v1, s7, v3 -; GFX940-NEXT: v_max_f16_e32 v2, v0, v0 -; GFX940-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX940-NEXT: v_max_f16_e32 v1, v1, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX940-NEXT: v_and_or_b32 v2, v3, s8, v1 -; GFX940-NEXT: v_mov_b32_e32 v6, s6 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX940-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX940-NEXT: v_max_f16_e32 v0, v0, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_cbranch_execnz .LBB11_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2956,114 +2948,111 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_and_b32 s5, s16, 3 -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_lshl_b32 s5, s5, 3 -; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: s_and_b32 s4, s16, 3 +; GFX11-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_not_b32 s7, s6 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen -; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen +; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, s5, v2 -; GFX11-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX11-NEXT: v_max_f16_e32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, s5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v3, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: v_max_f16_e32 v0, v0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB11_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 +; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: s_and_b32 s5, s20, 3 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: s_lshl_b32 s5, s5, 3 -; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX10-NEXT: s_not_b32 s7, s6 -; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen -; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: 
v_mov_b32_e32 v2, s4 +; GFX10-NEXT: s_and_b32 s4, s20, 3 +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, s5, v2 -; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX10-NEXT: v_max_f16_e32 v1, v1, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB11_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s6, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX90A-NEXT: s_not_b32 s8, s4 +; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v1, s7, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v0, v0 -; GFX90A-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX90A-NEXT: v_max_f16_e32 v1, v1, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, s6 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX90A-NEXT: v_max_f16_e32 v0, v0, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX90A-NEXT: 
v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3074,32 +3063,31 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s6, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v1, s6 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v2, s4 +; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s7, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX908-NEXT: s_not_b32 s8, s4 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v1, s7, v2 -; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX908-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX908-NEXT: v_max_f16_e32 v1, v1, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX908-NEXT: v_and_or_b32 v1, v2, s8, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX908-NEXT: v_max_f16_e32 v0, v0, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3110,33 +3098,32 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s6, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s7, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX8-NEXT: s_not_b32 s8, s4 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; 
GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, s7, v2 -; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v3 -; GFX8-NEXT: v_and_b32_e32 v4, s8, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v3 +; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3147,35 +3134,34 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s6, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s4, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s7, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX7-NEXT: s_not_b32 s8, s4 +; GFX7-NEXT: s_lshl_b32 s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s8, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: v_mov_b32_e32 v3, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3186,36 +3172,35 @@ define void 
@buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s6, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v1, s6 -; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s4, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s7, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX6-NEXT: s_not_b32 s8, s4 +; GFX6-NEXT: s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v3, s8, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX6-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3235,15 +3220,15 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 +; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_and_b32_e32 v4, 3, v6 -; GFX12-NEXT: v_and_b32_e32 v10, -4, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX12-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v11, v7 +; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff +; GFX12-NEXT: v_not_b32_e32 v9, v6 ; GFX12-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -3258,30 +3243,31 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen +; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen ; 
GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: v_max_num_f16_e32 v10, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX12-NEXT: v_max_num_f16_e32 v8, v5, v5 +; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v6, v6, v6 -; GFX12-NEXT: v_max_num_f16_e32 v6, v6, v8 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4 +; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v10 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX12-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v5 +; GFX12-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -3297,15 +3283,15 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 -; GFX12-NEXT: v_mov_b32_e32 v7, v8 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3313,7 +3299,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_cbranch_execnz .LBB12_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3321,12 +3307,12 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX940-NEXT: v_and_b32_e32 v10, -4, v4 +; GFX940-NEXT: v_and_b32_e32 v9, -4, v4 ; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v8, 3, v4 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v6, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v11, v6 +; GFX940-NEXT: 
v_lshlrev_b32_e64 v4, v8, s0 +; GFX940-NEXT: v_not_b32_e32 v10, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -3338,24 +3324,24 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v10, s[4:7], 0 offen +; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB12_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_max_f16_e32 v11, v5, v5 ; GFX940-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 ; GFX940-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX940-NEXT: v_max_f16_e32 v8, v5, v5 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v8 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX940-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v8, v7 +; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX940-NEXT: v_max_f16_e32 v4, v4, v11 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v8, v4 +; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX940-NEXT: s_mov_b64 s[8:9], exec -; GFX940-NEXT: v_mov_b64_e32 v[8:9], v[6:7] +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 @@ -3369,36 +3355,36 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[4:7], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB12_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v7, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB12_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v4, 3, v6 -; GFX11-NEXT: v_and_b32_e32 v10, -4, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX11-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 3, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v11, v7 +; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff +; GFX11-NEXT: v_not_b32_e32 v9, v6 ; GFX11-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -3410,29 +3396,30 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen +; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB12_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX11-NEXT: v_max_f16_e32 v8, v5, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX11-NEXT: v_max_f16_e32 v6, v6, v8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX11-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX11-NEXT: v_max_f16_e32 v4, v4, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX11-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-NEXT: v_mov_b32_e32 v5, v6 ; GFX11-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 @@ -3446,14 +3433,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB12_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 -; GFX11-NEXT: v_mov_b32_e32 v7, v8 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -3462,20 +3449,20 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_cbranch_execnz .LBB12_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: v_and_b32_e32 v4, 3, v6 -; GFX10-NEXT: v_and_b32_e32 v10, -4, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v11, v7 +; GFX10-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX10-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff +; GFX10-NEXT: v_not_b32_e32 v9, v6 ; GFX10-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -3485,26 +3472,26 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen +; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX10-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX10-NEXT: v_max_f16_e32 v8, v5, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX10-NEXT: v_max_f16_e32 v6, v6, v8 -; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX10-NEXT: v_mov_b32_e32 v9, v7 -; GFX10-NEXT: v_mov_b32_e32 v8, v6 +; GFX10-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX10-NEXT: v_max_f16_e32 v4, v4, v10 +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v5, v6 ; GFX10-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -3516,15 +3503,15 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB12_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 -; GFX10-NEXT: v_mov_b32_e32 v7, v8 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -3533,19 +3520,19 @@ define half 
@buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: s_cbranch_execnz .LBB12_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX90A-NEXT: v_and_b32_e32 v10, -4, v4 +; GFX90A-NEXT: v_and_b32_e32 v9, -4, v4 ; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v8, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v6, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v11, v6 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 +; GFX90A-NEXT: v_not_b32_e32 v10, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -3557,24 +3544,24 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen +; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_max_f16_e32 v11, v5, v5 ; GFX90A-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX90A-NEXT: v_max_f16_e32 v8, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v8 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v8, v7 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v11 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v8, v4 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -3586,33 +3573,33 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_3 ; GFX90A-NEXT: ; 
%bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX908-NEXT: v_and_b32_e32 v10, -4, v4 +; GFX908-NEXT: v_and_b32_e32 v8, -4, v4 ; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v7, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v6, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v11, v6 +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 +; GFX908-NEXT: v_not_b32_e32 v9, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -3624,25 +3611,25 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen +; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX908-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX908-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX908-NEXT: v_max_f16_e32 v8, v5, v5 -; GFX908-NEXT: v_max_f16_e32 v6, v6, v8 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX908-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX908-NEXT: v_mov_b32_e32 v9, v7 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX908-NEXT: v_max_f16_e32 v4, v4, v10 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v8, v6 +; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -3654,33 +3641,33 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v8 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB12_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, 
v4, v8 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 -; GFX8-NEXT: v_and_b32_e32 v10, -4, v4 +; GFX8-NEXT: v_and_b32_e32 v8, -4, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v6, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v11, v6 +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 +; GFX8-NEXT: v_not_b32_e32 v9, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -3692,26 +3679,26 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen +; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX8-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX8-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX8-NEXT: v_max_f16_e32 v8, v5, v5 -; GFX8-NEXT: v_max_f16_e32 v6, v6, v8 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX8-NEXT: v_and_b32_e32 v8, v7, v11 -; GFX8-NEXT: v_or_b32_e32 v6, v8, v6 -; GFX8-NEXT: v_mov_b32_e32 v9, v7 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX8-NEXT: v_max_f16_e32 v4, v4, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX8-NEXT: v_and_b32_e32 v5, v6, v9 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v8, v6 +; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -3723,21 +3710,21 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v8 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB12_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -3898,27 +3885,28 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_lshl_b32 s5, s5, 3 +; GFX12-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s7, s6 -; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen -; GFX12-NEXT: s_mov_b32 s6, 0 +; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v4 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0 @@ -3928,23 +3916,23 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s5, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v2 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3952,33 +3940,32 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s6, 
s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v1, s6 -; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s4, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s7, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX940-NEXT: s_not_b32 s8, s4 +; GFX940-NEXT: s_lshl_b32 s6, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX940-NEXT: s_movk_i32 s9, 0x7fff +; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: v_mov_b32_e32 v5, s6 -; GFX940-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX940-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX940-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v2, v2, v0, s9 +; GFX940-NEXT: v_add3_u32 v2, v2, v0, s8 ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: s_nop 1 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -3988,32 +3975,33 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX940-NEXT: s_cbranch_execnz .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_and_b32 s5, s16, 3 -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_lshl_b32 s5, s5, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX11-NEXT: s_not_b32 s7, s6 -; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen -; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_and_b32 s4, s16, 3 +; GFX11-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen +; GFX11-NEXT: 
s_not_b32 s6, s5 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0 @@ -4023,97 +4011,95 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, s5, v0 -; GFX11-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v5, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: s_and_b32 s5, s20, 3 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: s_lshl_b32 s5, s5, 3 -; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX10-NEXT: s_not_b32 s7, s6 -; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen -; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: s_and_b32 s4, s20, 3 +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_max_f32_e32 v0, v0, 
v5 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s5, v2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s6, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX90A-NEXT: s_not_b32 s8, s4 +; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX90A-NEXT: s_movk_i32 s9, 0x7fff +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s9 +; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, s6 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap 
v[2:3], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4123,39 +4109,38 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s6, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v1, s6 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s7, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX908-NEXT: s_not_b32 s8, s4 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX908-NEXT: s_movk_i32 s9, 0x7fff +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v2, v2, v0, s9 +; GFX908-NEXT: v_add3_u32 v2, v2, v0, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4165,41 +4150,40 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s6, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, -4 +; GFX8-NEXT: 
v_mov_b32_e32 v4, s4 +; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s7, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX8-NEXT: s_not_b32 s8, s4 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v2, s8, v1 +; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4209,38 +4193,37 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s6, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s4, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s7, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX7-NEXT: s_lshl_b32 s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s8, s4 +; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s8, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; 
GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4250,7 +4233,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -4258,32 +4241,31 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s6, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v1, s6 -; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s4, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s7, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX6-NEXT: s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s8, s4 +; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX6-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s8, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4293,7 +4275,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -4311,52 +4293,53 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_lshl_b32 s5, s5, 3 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe 
-; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s7, s6 -; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen -; GFX12-NEXT: s_mov_b32 s6, 0 +; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s5, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v3 -; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -4364,33 +4347,32 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s6, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v1, s6 -; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s4, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s7, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX940-NEXT: s_not_b32 s8, s4 +; 
GFX940-NEXT: s_lshl_b32 s6, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX940-NEXT: s_movk_i32 s9, 0x7fff +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff ; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX940-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v4, v4, v0, s9 +; GFX940-NEXT: v_add3_u32 v4, v4, v0, s8 ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: s_nop 1 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -4406,123 +4388,122 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_and_b32 s5, s16, 3 -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_lshl_b32 s5, s5, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX11-NEXT: s_not_b32 s7, s6 -; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen -; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: s_and_b32 s4, s16, 3 +; GFX11-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen +; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 -; 
GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, s5, v0 -; GFX11-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v3 -; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: s_and_b32 s5, s20, 3 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: s_lshl_b32 s5, s5, 3 -; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX10-NEXT: s_not_b32 s7, s6 -; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen -; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: s_and_b32 s4, s20, 3 +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s6, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX90A-NEXT: s_not_b32 s8, s4 +; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX90A-NEXT: s_movk_i32 s9, 0x7fff +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s9 +; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -4538,37 
+4519,36 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s6, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v1, s6 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v2, s4 +; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s7, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX908-NEXT: s_not_b32 s8, s4 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX908-NEXT: s_movk_i32 s9, 0x7fff +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v3, v3, v0, s9 +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX908-NEXT: v_add3_u32 v4, v4, v0, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s8, v0 -; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v3, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4579,39 +4559,38 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s6, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s7, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX8-NEXT: s_not_b32 s8, s4 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 
s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v3, s8, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4622,36 +4601,35 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s6, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s4, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s7, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX7-NEXT: s_lshl_b32 s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s8, s4 +; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s8, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 -; 
GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4662,37 +4640,36 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s6, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v1, s6 -; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s4, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s7, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX6-NEXT: s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s8, s4 +; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v3, s8, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB14_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5419,27 +5396,29 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s5, s16, 0x400 -; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 +; GFX12-NEXT: s_wait_alu 
0xfffe +; GFX12-NEXT: v_mov_b32_e32 v3, s4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 +; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 -; GFX12-NEXT: v_pk_max_num_f16 v0, v2, v2 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v1, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v0, v5, v5 +; GFX12-NEXT: v_pk_max_num_f16 v4, v0, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -5452,20 +5431,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v0, v5, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: v_pk_max_f16 v4, v0, v1 ; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: v_pk_max_f16 v4, v0, v2 +; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -5481,27 +5461,28 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x400 -; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: 
.LBB16_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 -; GFX11-NEXT: v_pk_max_f16 v0, v2, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v1, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX11-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -5513,27 +5494,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v2, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_max_f16 v1, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX10-NEXT: v_pk_max_f16 v4, v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB16_1 @@ -5544,19 +5525,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 
v2, v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v0, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -5572,25 +5553,25 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_pk_max_f16 v1, v2, v2 -; GFX908-NEXT: v_pk_max_f16 v0, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v0, v1 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX908-NEXT: v_pk_max_f16 v4, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB16_1 @@ -5601,29 +5582,29 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v3, v1, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v4, v4 -; GFX8-NEXT: v_max_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v0, v6, v0 -; GFX8-NEXT: v_or_b32_e32 v3, v0, v1 -; 
GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v1, v6, v6 +; GFX8-NEXT: v_max_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v1, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v5, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v5 +; GFX8-NEXT: v_mov_b32_e32 v1, v6 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 @@ -5646,30 +5627,30 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v0 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 @@ -5692,31 +5673,31 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_max_f32_e32 
v4, v4, v2 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 @@ -5738,25 +5719,26 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s16 -; GFX12-NEXT: s_add_co_i32 s5, s16, 0x400 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 +; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen offset:1024 +; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v1, v0, v0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_pk_max_num_f16 v1, v3, v1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, v2 -; GFX12-NEXT: v_mov_b32_e32 v3, v1 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v2 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -5770,24 +5752,25 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 
0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v0, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX940-NEXT: v_mov_b32_e32 v6, s6 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX940-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_cbranch_execnz .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5798,25 +5781,25 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x400 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: v_pk_max_f16 v2, v0, v0 +; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_pk_max_f16 v1, v3, v1 -; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v3, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -5829,25 +5812,25 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_pk_max_f16 v2, v0, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop 
Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_max_f16 v1, v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 @@ -5859,23 +5842,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, s6 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX90A-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5886,24 +5869,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v0, v0 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX908-NEXT: v_pk_max_f16 v1, v3, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX908-NEXT: v_pk_max_f16 
v0, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5914,28 +5897,28 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v1, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v4, v0, v0 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v3, v5, v4 -; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v1, v1 +; GFX8-NEXT: v_max_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: v_mov_b32_e32 v6, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5946,41 +5929,41 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v4 
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_mov_b32_e32 v7, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_max_f32_e32 v4, v4, v0 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_or_b32_e32 v4, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 @@ -5992,42 +5975,42 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_mov_b32_e32 v7, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_max_f32_e32 v4, v4, v0 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: 
v_or_b32_e32 v4, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v3, v5, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 @@ -6048,7 +6031,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 +; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -6064,26 +6047,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v8, v4, s[4:7], null offen offset:1024 +; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: v_pk_max_num_f16 v8, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX12-NEXT: v_pk_max_num_f16 v4, v5, v5 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v6, v8, v8 +; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v7, v6, v4 -; GFX12-NEXT: v_mov_b32_e32 v6, v7 -; GFX12-NEXT: v_mov_b32_e32 v7, v8 +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v8 +; GFX12-NEXT: v_mov_b32_e32 v4, v5 +; GFX12-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -6099,15 +6082,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: 
s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 -; GFX12-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6115,14 +6098,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_cbranch_execnz .LBB18_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v6 +; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v10, 0x400, v4 +; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -6134,23 +6117,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024 +; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB18_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_pk_max_f16 v9, v5, v5 ; GFX940-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 ; GFX940-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v6, v9, v9 +; GFX940-NEXT: v_pk_max_f16 v4, v7, v7 ; GFX940-NEXT: s_mov_b64 s[8:9], exec -; GFX940-NEXT: v_pk_max_f16 v8, v6, v4 +; GFX940-NEXT: v_pk_max_f16 v6, v4, v9 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[8:9] +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX940-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -6163,27 +6146,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB18_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v9, v6 +; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB18_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: 
s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 @@ -6197,25 +6180,25 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024 +; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX11-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v6, v8, v8 +; GFX11-NEXT: v_pk_max_f16 v4, v6, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v7, v6, v4 -; GFX11-NEXT: v_mov_b32_e32 v6, v7 -; GFX11-NEXT: v_mov_b32_e32 v7, v8 +; GFX11-NEXT: v_pk_max_f16 v5, v4, v8 +; GFX11-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-NEXT: v_mov_b32_e32 v5, v6 ; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 @@ -6229,14 +6212,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB18_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 -; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -6245,13 +6228,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_cbranch_execnz .LBB18_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v6 +; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: .LBB18_1: ; =>This Inner Loop 
Header: Depth=1 @@ -6263,24 +6246,24 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB18_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX10-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX10-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v6, v8, v8 +; GFX10-NEXT: v_pk_max_f16 v4, v6, v6 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_max_f16 v7, v6, v4 -; GFX10-NEXT: v_mov_b32_e32 v6, v7 -; GFX10-NEXT: v_mov_b32_e32 v7, v8 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v8 +; GFX10-NEXT: v_mov_b32_e32 v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v5, v6 ; GFX10-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -6292,15 +6275,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB18_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 -; GFX10-NEXT: v_mov_b32_e32 v8, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -6309,13 +6292,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_cbranch_execnz .LBB18_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, v6 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4 +; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -6327,22 +6310,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: 
s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_pk_max_f16 v9, v5, v5 ; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v6, v9, v9 -; GFX90A-NEXT: v_pk_max_f16 v8, v6, v4 +; GFX90A-NEXT: v_pk_max_f16 v4, v7, v7 +; GFX90A-NEXT: v_pk_max_f16 v6, v4, v9 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -6354,27 +6337,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v9, v6 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 +; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -6386,23 +6369,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB18_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX908-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX908-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v6, v8, v8 -; GFX908-NEXT: v_pk_max_f16 v7, v6, v4 -; GFX908-NEXT: v_mov_b32_e32 v6, v7 +; GFX908-NEXT: v_pk_max_f16 v4, v6, v6 +; GFX908-NEXT: v_pk_max_f16 v5, v4, v8 +; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v7, v8 +; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 
; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -6414,27 +6397,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB18_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v8, v6 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB18_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -6446,27 +6429,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_max_f16_sdwa v8, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v9, v5, v5 ; GFX8-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX8-NEXT: v_max_f16_sdwa v4, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v6, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v6, v5, v5 -; GFX8-NEXT: v_max_f16_e32 v7, v8, v8 -; GFX8-NEXT: v_max_f16_e32 v6, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v7, v6, v4 -; GFX8-NEXT: v_mov_b32_e32 v6, v7 +; GFX8-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_max_f16_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v5, v9 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v7, v8 +; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, 
v0 @@ -6478,21 +6461,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB18_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v8, v6 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB18_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -6669,43 +6652,45 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s6, s16, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 ; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v1, 0xffff0000, v2 +; GFX12-NEXT: v_mov_b32_e32 v6, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX12-NEXT: v_max_num_f32_e32 v1, v5, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX12-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX12-NEXT: v_add3_u32 v7, v7, v1, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_max_num_f32 v0, v3, v0 :: v_dual_cndmask_b32 v1, v5, v7 -; GFX12-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX12-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; 
GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v5, s6 -; GFX12-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX12-NEXT: v_add3_u32 v5, v5, v0, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 -; GFX12-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 +; GFX12-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX12-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -6718,41 +6703,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s8, s16, 0x400 +; GFX940-NEXT: s_add_i32 s4, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: s_movk_i32 s9, 0x7fff -; GFX940-NEXT: s_mov_b32 s10, 0x7060302 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX940-NEXT: s_mov_b32 s9, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 ; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX940-NEXT: v_max_f32_e32 v1, v4, v1 -; GFX940-NEXT: v_max_f32_e32 v0, v6, v0 -; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX940-NEXT: v_add3_u32 v7, v7, v0, s9 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s8 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] -; GFX940-NEXT: v_perm_b32 v4, v1, v0, s10 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GFX940-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; 
GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] +; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7] +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_cbranch_execnz .LBB19_1 @@ -6763,45 +6748,47 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s6, s16, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v1, 0xffff0000, v2 +; GFX11-NEXT: v_mov_b32_e32 v6, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX11-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX11-NEXT: v_max_f32_e32 v1, v5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_max_f32 v0, v3, v0 :: v_dual_cndmask_b32 v1, v5, v7 -; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX11-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-NEXT: v_mov_b32_e32 v5, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 +; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 -; 
GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -6814,41 +6801,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s6, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v0, v3, v0 -; GFX10-NEXT: v_max_f32_e32 v1, v5, v1 -; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 -; GFX10-NEXT: v_mov_b32_e32 v5, s6 -; GFX10-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v0, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: 
s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB19_1 @@ -6859,40 +6846,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 +; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: s_movk_i32 s9, 0x7fff -; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX90A-NEXT: v_max_f32_e32 v1, v4, v1 -; GFX90A-NEXT: v_max_f32_e32 v0, v6, v0 -; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s9 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v4, v1, v0, s10 -; GFX90A-NEXT: v_mov_b32_e32 v3, s8 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 @@ -6903,41 +6890,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 
s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s8, s20, 0x400 +; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: s_movk_i32 s9, 0x7fff -; GFX908-NEXT: s_mov_b32 s10, 0x7060302 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v1, v3, v1 -; GFX908-NEXT: v_max_f32_e32 v0, v6, v0 -; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v3, v3, v1, s9 -; GFX908-NEXT: v_add3_u32 v7, v7, v0, s9 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v3, v1, v0, s10 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s8 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v6, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX908-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX908-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9 +; GFX908-NEXT: v_mov_b32_e32 v0, v5 +; GFX908-NEXT: v_mov_b32_e32 v1, v6 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB19_1 @@ -6948,42 +6935,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s8, s20, 0x400 +; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: 
v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v1, v3, v1 -; GFX8-NEXT: v_max_f32_e32 v0, v6, v0 -; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v3, v1, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s8 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, v5 +; GFX8-NEXT: v_mov_b32_e32 v1, v6 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB19_1 @@ -6995,38 +6982,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; 
GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v3 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v6, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7037,39 +7024,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v3 ; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v6, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; 
GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7089,43 +7076,41 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s16 -; GFX12-NEXT: s_add_co_i32 s6, s16, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_max_num_f32 v5, v5, v3 :: v_dual_max_num_f32 v0, v0, v2 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v1, v3, v1 -; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s4, v1, v1 -; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 -; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v5, s6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX12-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: 
global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -7139,40 +7124,40 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s8, s16, 0x400 +; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_add_i32 s4, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: s_movk_i32 s9, 0x7fff -; GFX940-NEXT: s_mov_b32 s10, 0x7060302 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX940-NEXT: s_mov_b32 s9, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 ; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX940-NEXT: v_max_f32_e32 v1, v2, v1 -; GFX940-NEXT: v_max_f32_e32 v2, v5, v4 -; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s9 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v6, s8 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX940-NEXT: v_perm_b32 v2, v2, v1, s10 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX940-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] +; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_cbranch_execnz .LBB20_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7182,45 +7167,43 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s6, s16, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_max_f32 v5, v5, v3 :: v_dual_max_f32 v0, v0, v2 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v1, v3, v1 -; GFX11-NEXT: v_max_f32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v1, v1 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo -; GFX11-NEXT: v_mov_b32_e32 v5, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ 
-7234,39 +7217,39 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s6, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v1, v3, v1 -; GFX10-NEXT: v_max_f32_e32 v3, v5, v4 -; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v5, s6 -; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB20_1 @@ -7278,39 +7261,39 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: s_movk_i32 s9, 0x7fff -; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 
v3, 0xffff0000, v0 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v1 -; GFX90A-NEXT: v_max_f32_e32 v2, v5, v4 -; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s9 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v2, v2, v1, s10 -; GFX90A-NEXT: v_mov_b32_e32 v6, s8 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7321,40 +7304,40 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s8, s20, 0x400 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: s_movk_i32 s9, 0x7fff -; GFX908-NEXT: s_mov_b32 s10, 0x7060302 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX908-NEXT: v_max_f32_e32 v1, v3, v1 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v4 -; GFX908-NEXT: 
v_bfe_u32 v4, v1, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s9 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX908-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v1, v3, v1, s10 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v6, s8 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX908-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX908-NEXT: v_mov_b32_e32 v6, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7365,41 +7348,41 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s8, s20, 0x400 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: v_max_f32_e32 v1, v3, v1 -; GFX8-NEXT: v_max_f32_e32 v3, v5, v4 -; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v1, v3, v1, 16 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v6, s8 -; GFX8-NEXT: 
v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v6, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7411,37 +7394,37 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_max_f32_e32 v4, v4, v0 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v5, 16 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 
vcc, v4, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7453,38 +7436,38 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 ; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_max_f32_e32 v4, v4, v0 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v2, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v5, 16 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v3, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_alignbit_b32 v3, v3, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7504,7 +7487,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 +; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -7520,43 +7503,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_alu 0xfffe ; 
GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v8, v4, s[4:7], null offen offset:1024 +; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX12-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX12-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v4, v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v6, v10, v7 -; GFX12-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_max_num_f32 v5, v5, v9 :: v_dual_max_num_f32 v4, v4, v8 +; GFX12-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v12, 0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_or_b32_e32 v12, 0x400000, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v4, 0x7fff -; GFX12-NEXT: v_add3_u32 v10, v10, v6, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo +; GFX12-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX12-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX12-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v7, v6, v4, 0x7060302 -; GFX12-NEXT: v_mov_b32_e32 v6, v7 -; GFX12-NEXT: v_mov_b32_e32 v7, v8 +; GFX12-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 +; GFX12-NEXT: v_mov_b32_e32 v4, v5 +; GFX12-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -7572,15 +7554,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: 
s_cbranch_execnz .LBB21_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 -; GFX12-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7588,14 +7570,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_cbranch_execnz .LBB21_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v6 +; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v10, 0x400, v4 +; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -7607,40 +7589,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024 +; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB21_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX940-NEXT: s_movk_i32 s10, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX940-NEXT: s_mov_b32 s11, 0x7060302 ; GFX940-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 ; GFX940-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; GFX940-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s10 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX940-NEXT: v_max_f32_e32 v4, v4, v9 +; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX940-NEXT: s_mov_b64 s[8:9], exec ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 -; GFX940-NEXT: v_max_f32_e32 v6, v7, v6 -; GFX940-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX940-NEXT: v_add3_u32 v7, v7, v6, s10 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v6 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_max_f32_e32 v5, v5, v10 +; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX940-NEXT: v_add3_u32 v6, v6, v5, s10 +; GFX940-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc -; GFX940-NEXT: v_perm_b32 v8, v6, v4, s11 -; 
GFX940-NEXT: v_mov_b64_e32 v[6:7], v[8:9] +; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc +; GFX940-NEXT: v_perm_b32 v6, v5, v4, s11 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX940-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -7653,27 +7635,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB21_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v9, v6 +; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB21_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 @@ -7687,41 +7669,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024 +; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB21_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v6, v10, v7 -; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_max_f32 v5, v5, v9 :: v_dual_max_f32 v4, v4, v8 +; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v4, 0x7fff -; GFX11-NEXT: v_add3_u32 v10, v10, v6, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX11-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v7, v6, v4, 0x7060302 -; GFX11-NEXT: v_mov_b32_e32 v6, v7 -; GFX11-NEXT: v_mov_b32_e32 v7, v8 +; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 +; GFX11-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-NEXT: v_mov_b32_e32 v5, v6 ; GFX11-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 @@ -7735,14 +7718,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB21_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 -; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -7750,14 +7733,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_cbranch_execnz .LBB21_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v6 +; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 @@ -7769,38 +7753,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v6, 
v4, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB21_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX10-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX10-NEXT: v_max_f32_e32 v6, v10, v7 -; GFX10-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX10-NEXT: v_max_f32_e32 v4, v4, v8 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v9 +; GFX10-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v4 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v4, 0x7fff -; GFX10-NEXT: v_add3_u32 v10, v10, v6, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo -; GFX10-NEXT: v_perm_b32 v7, v6, v4, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v6, v7 -; GFX10-NEXT: v_mov_b32_e32 v7, v8 +; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX10-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX10-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v5, v6 ; GFX10-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -7812,15 +7796,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB21_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 -; GFX10-NEXT: v_mov_b32_e32 v8, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -7829,13 +7813,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_cbranch_execnz .LBB21_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, v6 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4 +; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -7847,38 +7831,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX90A-NEXT: s_mov_b32 s15, 0x7060302 ; GFX90A-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s14 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v9 +; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 -; GFX90A-NEXT: v_max_f32_e32 v6, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX90A-NEXT: v_add3_u32 v7, v7, v6, s14 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v8, v6, v4, s15 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_max_f32_e32 v5, v5, v10 +; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s14 +; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -7890,27 +7874,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX90A-NEXT: 
s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v9, v6 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 +; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -7922,39 +7906,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX908-NEXT: s_movk_i32 s14, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX908-NEXT: s_mov_b32 s15, 0x7060302 ; GFX908-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX908-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s14 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX908-NEXT: v_max_f32_e32 v4, v4, v8 +; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v4 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 -; GFX908-NEXT: v_max_f32_e32 v6, v7, v6 -; GFX908-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX908-NEXT: v_add3_u32 v7, v7, v6, s14 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cndmask_b32_e32 v6, v7, v10, vcc -; GFX908-NEXT: v_perm_b32 v7, v6, v4, s15 -; GFX908-NEXT: v_mov_b32_e32 v6, v7 +; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_max_f32_e32 v5, v5, v9 +; GFX908-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX908-NEXT: v_add3_u32 v10, v10, v5, s14 +; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v4, s15 +; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v7, v8 +; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; 
GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -7966,27 +7950,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v8, v6 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB21_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -7998,40 +7982,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX8-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX8-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX8-NEXT: v_max_f32_e32 v4, v4, v8 +; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 -; GFX8-NEXT: v_max_f32_e32 v6, v7, v6 -; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v10, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v7, v6, v4, 16 
-; GFX8-NEXT: v_mov_b32_e32 v6, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v9 +; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 +; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v7, v8 +; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -8043,21 +8027,21 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v8, v6 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB21_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -8236,19 +8220,19 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 +; GFX940-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 @@ -8287,19 +8271,19 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 
offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc @@ -8317,25 +8301,25 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v0, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v0, v1 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 @@ -8352,19 +8336,19 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 diff --git 
a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index 0d38ee36415b8..0bcaacc6b08e8 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -31,19 +31,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 +; GFX940-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 @@ -82,19 +82,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -110,25 +110,25 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v0, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v0, v1 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: 
v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB0_1 @@ -145,19 +145,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB0_1 @@ -207,24 +207,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v1, v0, v0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v6, s6 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX940-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX940-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_cbranch_execnz .LBB1_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -257,23 +257,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: 
v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v1, v0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, s6 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -284,24 +284,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v1, v0, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX908-NEXT: v_min_f32_e32 v1, v3, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -316,20 +316,20 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen 
glc +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -400,7 +400,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v10, 0x400, v4 +; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -412,22 +412,22 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024 +; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB2_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_max_f32_e32 v9, v5, v5 ; GFX940-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 ; GFX940-NEXT: ; Child Loop BB2_4 Depth 2 -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v6, v9, v9 -; GFX940-NEXT: v_min_f32_e32 v8, v6, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v7, v7 +; GFX940-NEXT: v_min_f32_e32 v6, v4, v9 ; GFX940-NEXT: s_mov_b64 s[8:9], exec -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[8:9] +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 @@ -441,21 +441,21 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB2_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v9, v6 +; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB2_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -520,7 +520,7 @@ 
define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4 +; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -532,22 +532,22 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_max_f32_e32 v9, v5, v5 ; GFX90A-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB2_4 Depth 2 -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v6, v9, v9 -; GFX90A-NEXT: v_min_f32_e32 v8, v6, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v7, v7 +; GFX90A-NEXT: v_min_f32_e32 v6, v4, v9 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -559,27 +559,27 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v9, v6 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 +; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -591,23 +591,23 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: 
buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_max_f32_e32 v8, v5, v5 ; GFX908-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB2_4 Depth 2 -; GFX908-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v6, v8, v8 -; GFX908-NEXT: v_min_f32_e32 v7, v6, v4 -; GFX908-NEXT: v_mov_b32_e32 v6, v7 +; GFX908-NEXT: v_max_f32_e32 v4, v6, v6 +; GFX908-NEXT: v_min_f32_e32 v5, v4, v8 +; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v7, v8 +; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -619,21 +619,21 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v8, v6 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB2_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -771,19 +771,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 +; GFX940-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 @@ -800,28 +800,27 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX11-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x400 -; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v0 -; GFX11-NEXT: v_max_f32_e32 v0, v2, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v1, v4, v4 -; GFX11-NEXT: v_min_f32_e32 v3, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX11-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v0, v3 -; GFX11-NEXT: v_mov_b32_e32 v1, v4 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -833,27 +832,27 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_max_f32_e32 v0, v2, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v1, v4, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX10-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; 
GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB3_1 @@ -864,19 +863,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -892,25 +891,25 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v0, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v0, v1 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 @@ -927,19 +926,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: 
v_mul_f32_e32 v0, 1.0, v5 +; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB3_1 @@ -956,19 +955,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GFX7-NEXT: v_min_f32_e32 v3, v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX7-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB3_1 @@ -985,20 +984,20 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GFX6-NEXT: v_min_f32_e32 v3, v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX6-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v0, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB3_1 @@ -1029,19 +1028,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start ; 
GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 +; GFX940-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 @@ -1080,19 +1079,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1108,25 +1107,25 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v0, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v0, v1 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 @@ -1143,19 +1142,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: 
.LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB4_1 @@ -1198,29 +1197,30 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 -; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[0:1] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v10, s5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1243,30 +1243,30 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX11: 
; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x800 -; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v10, s5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1298,29 +1298,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v1 -; GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX908-NEXT: v_mov_b32_e32 v10, s6 -; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 -; GFX908-NEXT: v_mov_b32_e32 v1, v7 -; GFX908-NEXT: v_mov_b32_e32 v2, v8 -; GFX908-NEXT: v_mov_b32_e32 v3, v9 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; 
GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB5_1 @@ -1331,29 +1331,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_mov_b32_e32 v10, s6 -; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-NEXT: v_mov_b32_e32 v1, v7 -; GFX8-NEXT: v_mov_b32_e32 v2, v8 -; GFX8-NEXT: v_mov_b32_e32 v3, v9 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB5_1 @@ -1393,27 +1393,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, s16 -; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[0:1], v[0:1] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, v5 -; GFX12-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-NEXT: v_mov_b32_e32 v6, v2 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1437,28 +1437,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v2, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x800 +; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f64 v[2:3], v[6:7], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, v5 -; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v6, v2 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], 0 offen glc +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1491,27 +1490,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: 
v_mov_b32_e32 v2, s20 -; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v10, s6 -; GFX908-NEXT: v_min_f64 v[2:3], v[6:7], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v9, v5 -; GFX908-NEXT: v_mov_b32_e32 v8, v4 -; GFX908-NEXT: v_mov_b32_e32 v7, v3 -; GFX908-NEXT: v_mov_b32_e32 v6, v2 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v10, v3 +; GFX908-NEXT: v_mov_b32_e32 v9, v2 +; GFX908-NEXT: v_mov_b32_e32 v8, v1 +; GFX908-NEXT: v_mov_b32_e32 v7, v0 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v6 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v2, v7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v7 +; GFX908-NEXT: v_mov_b32_e32 v3, v8 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1522,27 +1521,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v10, s6 -; GFX8-NEXT: v_min_f64 v[2:3], v[6:7], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v9, v5 -; GFX8-NEXT: v_mov_b32_e32 v8, v4 -; GFX8-NEXT: v_mov_b32_e32 v7, v3 -; GFX8-NEXT: v_mov_b32_e32 v6, v2 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v10, v3 +; GFX8-NEXT: v_mov_b32_e32 v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, v1 +; GFX8-NEXT: v_mov_b32_e32 v7, v0 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v6 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, v7 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v7 +; GFX8-NEXT: v_mov_b32_e32 v3, v8 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1606,17 +1605,17 @@ define double 
@buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[5:6], v[5:6] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB7_4 Depth 2 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[5:6], v[5:6] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[13:14], v[13:14] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[11:12], v[2:3], v[0:1] +; GFX12-NEXT: v_min_num_f64_e32 v[11:12], v[0:1], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 @@ -1710,17 +1709,17 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: s_cbranch_execnz .LBB7_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB7_4 Depth 2 -; GFX11-NEXT: v_max_f64 v[0:1], v[5:6], v[5:6] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[13:14], v[13:14] +; GFX11-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[11:12], v[2:3], v[0:1] +; GFX11-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] ; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX11-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 @@ -1838,15 +1837,15 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB7_4 Depth 2 -; GFX908-NEXT: v_max_f64 v[0:1], v[5:6], v[5:6] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[13:14], v[13:14] +; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_min_f64 v[11:12], v[2:3], v[0:1] +; GFX908-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v11 ; GFX908-NEXT: v_mov_b32_e32 v1, v12 ; GFX908-NEXT: v_mov_b32_e32 v2, v13 @@ -1904,15 +1903,15 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB7_4 Depth 2 -; GFX8-NEXT: v_max_f64 v[0:1], v[5:6], v[5:6] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[13:14], v[13:14] +; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; 
GFX8-NEXT: v_min_f64 v[11:12], v[2:3], v[0:1] +; GFX8-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v11 ; GFX8-NEXT: v_mov_b32_e32 v1, v12 ; GFX8-NEXT: v_mov_b32_e32 v2, v13 @@ -2012,29 +2011,30 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 -; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[0:1] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v10, s5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2057,30 +2057,30 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x800 -; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v10, s5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2092,31 +2092,31 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: s_add_i32 s5, s20, 0x800 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: s_add_i32 s4, s20, 0x800 +; GFX10-NEXT: v_mov_b32_e32 v6, s4 ; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v9, v1 -; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v10, s5 +; GFX10-NEXT: v_mov_b32_e32 v10, v1 +; GFX10-NEXT: v_mov_b32_e32 v9, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX10-NEXT: v_min_f64 v[6:7], v[2:3], v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v0, v6 -; GFX10-NEXT: v_mov_b32_e32 v1, v7 -; GFX10-NEXT: v_mov_b32_e32 v2, v8 -; GFX10-NEXT: v_mov_b32_e32 v3, v9 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX10-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX10-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, v7 +; GFX10-NEXT: v_mov_b32_e32 v1, v8 +; GFX10-NEXT: v_mov_b32_e32 v2, v9 +; GFX10-NEXT: v_mov_b32_e32 v3, v10 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: 
s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB8_1 @@ -2127,26 +2127,26 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x800 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX90A-NEXT: v_mov_b32_e32 v6, s6 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX90A-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX90A-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v10, s6 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[8:9], v[8:9] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] +; GFX90A-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 @@ -2157,29 +2157,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v1 -; GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX908-NEXT: v_mov_b32_e32 v10, s6 -; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 -; GFX908-NEXT: v_mov_b32_e32 v1, v7 -; GFX908-NEXT: v_mov_b32_e32 v2, v8 -; GFX908-NEXT: v_mov_b32_e32 v3, v9 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; 
GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB8_1 @@ -2190,29 +2190,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_mov_b32_e32 v10, s6 -; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-NEXT: v_mov_b32_e32 v1, v7 -; GFX8-NEXT: v_mov_b32_e32 v2, v8 -; GFX8-NEXT: v_mov_b32_e32 v3, v9 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB8_1 @@ -2223,29 +2223,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s20 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX7-NEXT: s_add_i32 s6, s20, 0x800 +; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX7-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX7-NEXT: v_mov_b32_e32 v10, s6 -; GFX7-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, v6 -; GFX7-NEXT: v_mov_b32_e32 v1, v7 -; GFX7-NEXT: v_mov_b32_e32 v2, v8 -; 
GFX7-NEXT: v_mov_b32_e32 v3, v9 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v10, v1 +; GFX7-NEXT: v_mov_b32_e32 v9, v0 +; GFX7-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX7-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v7 +; GFX7-NEXT: v_mov_b32_e32 v1, v8 +; GFX7-NEXT: v_mov_b32_e32 v2, v9 +; GFX7-NEXT: v_mov_b32_e32 v3, v10 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB8_1 @@ -2256,30 +2256,30 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v0, s20 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 +; GFX6-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v6, s6 ; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v9, v1 -; GFX6-NEXT: v_mov_b32_e32 v8, v0 +; GFX6-NEXT: v_mov_b32_e32 v10, v1 +; GFX6-NEXT: v_mov_b32_e32 v9, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX6-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX6-NEXT: v_mov_b32_e32 v10, s6 -; GFX6-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX6-NEXT: v_mov_b32_e32 v0, v6 -; GFX6-NEXT: v_mov_b32_e32 v1, v7 -; GFX6-NEXT: v_mov_b32_e32 v2, v8 -; GFX6-NEXT: v_mov_b32_e32 v3, v9 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX6-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX6-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX6-NEXT: v_mov_b32_e32 v0, v7 +; GFX6-NEXT: v_mov_b32_e32 v1, v8 +; GFX6-NEXT: v_mov_b32_e32 v2, v9 +; GFX6-NEXT: v_mov_b32_e32 v3, v10 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB8_1 @@ -2300,29 +2300,30 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 -; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 
0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[0:1] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v10, s5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2345,30 +2346,30 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x800 -; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v10, s5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: 
v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2400,29 +2401,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v1 -; GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX908-NEXT: v_mov_b32_e32 v10, s6 -; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 -; GFX908-NEXT: v_mov_b32_e32 v1, v7 -; GFX908-NEXT: v_mov_b32_e32 v2, v8 -; GFX908-NEXT: v_mov_b32_e32 v3, v9 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB9_1 @@ -2433,29 +2434,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: 
v_mov_b32_e32 v10, s6 -; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-NEXT: v_mov_b32_e32 v1, v7 -; GFX8-NEXT: v_mov_b32_e32 v2, v8 -; GFX8-NEXT: v_mov_b32_e32 v3, v9 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB9_1 @@ -2499,47 +2500,47 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 +; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_lshl_b32 s5, s5, 3 +; GFX12-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s7, s6 -; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen -; GFX12-NEXT: s_mov_b32 s6, 0 +; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v1, s5, v2 -; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1 -; GFX12-NEXT: v_min_num_f16_e32 v1, v1, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 -; GFX12-NEXT: v_mov_b32_e32 v3, v1 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; 
GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v3 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2547,263 +2548,256 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s6, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v1, s6 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s4, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s7, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX940-NEXT: s_not_b32 s8, s4 +; GFX940-NEXT: s_lshl_b32 s6, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v1, s7, v3 -; GFX940-NEXT: v_max_f16_e32 v2, v0, v0 -; GFX940-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX940-NEXT: v_min_f16_e32 v1, v1, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX940-NEXT: v_and_or_b32 v2, v3, s8, v1 -; GFX940-NEXT: v_mov_b32_e32 v6, s6 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX940-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX940-NEXT: v_min_f16_e32 v0, v0, v5 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_cbranch_execnz .LBB10_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s7, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_and_b32 s5, s16, 3 -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_lshl_b32 s5, s5, 3 -; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_and_b32 s4, s16, 3 +; GFX11-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_not_b32 s7, s6 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen -; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen +; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, s5, v2 -; GFX11-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX11-NEXT: v_min_f16_e32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, s5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v3, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: v_min_f16_e32 v0, v0, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB10_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v3 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 +; GFX10-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: s_and_b32 s5, s20, 3 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: s_lshl_b32 s5, s5, 3 -; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX10-NEXT: s_not_b32 s7, s6 -; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen -; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: s_and_b32 s4, s20, 3 +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX10-NEXT: s_not_b32 s6, s5 +; 
GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, s5, v2 -; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX10-NEXT: v_min_f16_e32 v1, v1, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f16_e32 v0, v0, v5 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s5, v3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s6, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX90A-NEXT: s_not_b32 s8, s4 +; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v1, s7, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v0, v0 -; GFX90A-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX90A-NEXT: v_min_f16_e32 v1, v1, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, s6 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX90A-NEXT: v_min_f16_e32 v0, v0, v5 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt 
vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s7, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s6, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v1, s6 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s7, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX908-NEXT: s_not_b32 s8, s4 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v1, s7, v2 -; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX908-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX908-NEXT: v_min_f16_e32 v1, v1, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX908-NEXT: v_and_or_b32 v1, v2, s8, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX908-NEXT: v_min_f16_e32 v0, v0, v5 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s7, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s6, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s7, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX8-NEXT: s_not_b32 s8, s4 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 
s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, s7, v2 -; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-NEXT: v_min_f16_e32 v1, v1, v3 -; GFX8-NEXT: v_and_b32_e32 v4, s8, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v5 +; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s7, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s6, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s4, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s7, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v0 -; GFX7-NEXT: s_not_b32 s8, s4 +; GFX7-NEXT: s_lshl_b32 s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s8, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -2813,7 +2807,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; 
GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -2821,31 +2815,30 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s6, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v1, s6 -; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s4, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s7, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v0 -; GFX6-NEXT: s_not_b32 s8, s4 +; GFX6-NEXT: s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s8, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX6-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -2855,7 +2848,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: s_cbranch_execnz .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2873,46 +2866,46 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 +; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_lshl_b32 s5, s5, 3 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s7, s6 -; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen -; GFX12-NEXT: s_mov_b32 s6, 0 +; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v1, s5, v2 -; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 +; GFX12-NEXT: 
v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1 -; GFX12-NEXT: v_min_num_f16_e32 v1, v1, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 -; GFX12-NEXT: v_mov_b32_e32 v3, v1 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2920,32 +2913,31 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s6, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v1, s6 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s4, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s7, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX940-NEXT: s_not_b32 s8, s4 +; GFX940-NEXT: s_lshl_b32 s6, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v1, s7, v3 -; GFX940-NEXT: v_max_f16_e32 v2, v0, v0 -; GFX940-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX940-NEXT: v_min_f16_e32 v1, v1, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX940-NEXT: v_and_or_b32 v2, v3, s8, v1 -; GFX940-NEXT: v_mov_b32_e32 v6, s6 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX940-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX940-NEXT: v_min_f16_e32 v0, v0, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] ; 
GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_cbranch_execnz .LBB11_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2956,114 +2948,111 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_and_b32 s5, s16, 3 -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_lshl_b32 s5, s5, 3 -; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: s_and_b32 s4, s16, 3 +; GFX11-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_not_b32 s7, s6 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen -; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen +; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, s5, v2 -; GFX11-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX11-NEXT: v_min_f16_e32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, s5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v3, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: v_min_f16_e32 v0, v0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB11_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 +; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: s_and_b32 s5, s20, 3 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: s_lshl_b32 s5, s5, 3 -; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX10-NEXT: s_not_b32 s7, s6 -; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen -; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: s_and_b32 s4, s20, 3 +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, s5, v2 -; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX10-NEXT: v_min_f16_e32 v1, v1, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f16_e32 v0, v0, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB11_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s6, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX90A-NEXT: s_not_b32 s8, s4 +; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: 
s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v1, s7, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v0, v0 -; GFX90A-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX90A-NEXT: v_min_f16_e32 v1, v1, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, s6 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX90A-NEXT: v_min_f16_e32 v0, v0, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3074,32 +3063,31 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s6, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v1, s6 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v2, s4 +; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s7, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX908-NEXT: s_not_b32 s8, s4 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v1, s7, v2 -; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX908-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX908-NEXT: v_min_f16_e32 v1, v1, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX908-NEXT: v_and_or_b32 v1, v2, s8, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX908-NEXT: v_min_f16_e32 v0, v0, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3110,33 +3098,32 @@ 
define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s6, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s7, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX8-NEXT: s_not_b32 s8, s4 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, s7, v2 -; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-NEXT: v_min_f16_e32 v1, v1, v3 -; GFX8-NEXT: v_and_b32_e32 v4, s8, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v3 +; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3147,35 +3134,34 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s6, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s4, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s7, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX7-NEXT: s_not_b32 s8, s4 +; GFX7-NEXT: s_lshl_b32 s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s8, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v4, 
v1 -; GFX7-NEXT: v_mov_b32_e32 v3, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3186,36 +3172,35 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s6, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v1, s6 -; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s4, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s7, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX6-NEXT: s_not_b32 s8, s4 +; GFX6-NEXT: s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v3, s8, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX6-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3235,15 +3220,15 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 +; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_and_b32_e32 v4, 3, v6 -; GFX12-NEXT: v_and_b32_e32 v10, -4, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX12-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX12-NEXT: 
v_lshlrev_b32_e32 v7, 3, v6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v11, v7 +; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff +; GFX12-NEXT: v_not_b32_e32 v9, v6 ; GFX12-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -3258,30 +3243,31 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen +; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: v_max_num_f16_e32 v10, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX12-NEXT: v_max_num_f16_e32 v8, v5, v5 +; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v6, v6, v6 -; GFX12-NEXT: v_min_num_f16_e32 v6, v6, v8 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4 +; GFX12-NEXT: v_min_num_f16_e32 v4, v4, v10 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX12-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v5 +; GFX12-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -3297,15 +3283,15 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 -; GFX12-NEXT: v_mov_b32_e32 v7, v8 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3313,7 +3299,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_cbranch_execnz .LBB12_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 
exec_lo, exec_lo, s1 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3321,12 +3307,12 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX940-NEXT: v_and_b32_e32 v10, -4, v4 +; GFX940-NEXT: v_and_b32_e32 v9, -4, v4 ; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v8, 3, v4 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v6, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v11, v6 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 +; GFX940-NEXT: v_not_b32_e32 v10, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -3338,24 +3324,24 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v10, s[4:7], 0 offen +; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB12_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_max_f16_e32 v11, v5, v5 ; GFX940-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 ; GFX940-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX940-NEXT: v_max_f16_e32 v8, v5, v5 -; GFX940-NEXT: v_min_f16_e32 v6, v6, v8 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX940-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v8, v7 +; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX940-NEXT: v_min_f16_e32 v4, v4, v11 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v8, v4 +; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX940-NEXT: s_mov_b64 s[8:9], exec -; GFX940-NEXT: v_mov_b64_e32 v[8:9], v[6:7] +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 @@ -3369,36 +3355,36 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[4:7], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB12_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v7, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB12_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; 
GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v4, 3, v6 -; GFX11-NEXT: v_and_b32_e32 v10, -4, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX11-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 3, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v11, v7 +; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff +; GFX11-NEXT: v_not_b32_e32 v9, v6 ; GFX11-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -3410,29 +3396,30 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen +; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB12_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX11-NEXT: v_max_f16_e32 v8, v5, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX11-NEXT: v_min_f16_e32 v6, v6, v8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX11-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX11-NEXT: v_min_f16_e32 v4, v4, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX11-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-NEXT: v_mov_b32_e32 v5, v6 ; GFX11-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 @@ -3446,14 +3433,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB12_4 ; 
GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 -; GFX11-NEXT: v_mov_b32_e32 v7, v8 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -3462,20 +3449,20 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_cbranch_execnz .LBB12_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: v_and_b32_e32 v4, 3, v6 -; GFX10-NEXT: v_and_b32_e32 v10, -4, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v11, v7 +; GFX10-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX10-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff +; GFX10-NEXT: v_not_b32_e32 v9, v6 ; GFX10-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -3485,26 +3472,26 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen +; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX10-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX10-NEXT: v_max_f16_e32 v8, v5, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX10-NEXT: v_min_f16_e32 v6, v6, v8 -; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX10-NEXT: v_mov_b32_e32 v9, v7 -; GFX10-NEXT: v_mov_b32_e32 v8, v6 +; GFX10-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX10-NEXT: v_min_f16_e32 v4, v4, v10 +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v5, v6 ; GFX10-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -3516,15 +3503,15 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; 
GFX10-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB12_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 -; GFX10-NEXT: v_mov_b32_e32 v7, v8 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -3533,19 +3520,19 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: s_cbranch_execnz .LBB12_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX90A-NEXT: v_and_b32_e32 v10, -4, v4 +; GFX90A-NEXT: v_and_b32_e32 v9, -4, v4 ; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v8, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v6, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v11, v6 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 +; GFX90A-NEXT: v_not_b32_e32 v10, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -3557,24 +3544,24 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen +; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_max_f16_e32 v11, v5, v5 ; GFX90A-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX90A-NEXT: v_max_f16_e32 v8, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v6, v6, v8 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v8, v7 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v11 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v8, v4 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -3586,33 +3573,33 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: 
buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX908-NEXT: v_and_b32_e32 v10, -4, v4 +; GFX908-NEXT: v_and_b32_e32 v8, -4, v4 ; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v7, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v6, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v11, v6 +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 +; GFX908-NEXT: v_not_b32_e32 v9, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -3624,25 +3611,25 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen +; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX908-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX908-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX908-NEXT: v_max_f16_e32 v8, v5, v5 -; GFX908-NEXT: v_min_f16_e32 v6, v6, v8 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX908-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX908-NEXT: v_mov_b32_e32 v9, v7 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX908-NEXT: v_min_f16_e32 v4, v4, v10 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v8, v6 +; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -3654,33 +3641,33 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, 
s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v8 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB12_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 -; GFX8-NEXT: v_and_b32_e32 v10, -4, v4 +; GFX8-NEXT: v_and_b32_e32 v8, -4, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v6, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v11, v6 +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 +; GFX8-NEXT: v_not_b32_e32 v9, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -3692,26 +3679,26 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen +; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX8-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX8-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX8-NEXT: v_max_f16_e32 v8, v5, v5 -; GFX8-NEXT: v_min_f16_e32 v6, v6, v8 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX8-NEXT: v_and_b32_e32 v8, v7, v11 -; GFX8-NEXT: v_or_b32_e32 v6, v8, v6 -; GFX8-NEXT: v_mov_b32_e32 v9, v7 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX8-NEXT: v_min_f16_e32 v4, v4, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX8-NEXT: v_and_b32_e32 v5, v6, v9 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v8, v6 +; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -3723,21 +3710,21 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: 
Header=BB12_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v8 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB12_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -3898,27 +3885,28 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_lshl_b32 s5, s5, 3 +; GFX12-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s7, s6 -; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen -; GFX12-NEXT: s_mov_b32 s6, 0 +; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v4 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0 @@ -3928,23 +3916,23 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s5, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; 
GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v2 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3952,33 +3940,32 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s6, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v1, s6 -; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s4, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s7, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX940-NEXT: s_not_b32 s8, s4 +; GFX940-NEXT: s_lshl_b32 s6, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX940-NEXT: s_movk_i32 s9, 0x7fff +; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: v_mov_b32_e32 v5, s6 -; GFX940-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX940-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX940-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v2, v2, v0, s9 +; GFX940-NEXT: v_add3_u32 v2, v2, v0, s8 ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: s_nop 1 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -3988,32 +3975,33 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX940-NEXT: s_cbranch_execnz .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_and_b32 s5, s16, 3 -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_lshl_b32 
s5, s5, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX11-NEXT: s_not_b32 s7, s6 -; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen -; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_and_b32 s4, s16, 3 +; GFX11-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen +; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0 @@ -4023,97 +4011,95 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, s5, v0 -; GFX11-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v5, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: s_and_b32 s5, s20, 3 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: s_lshl_b32 s5, s5, 3 -; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX10-NEXT: s_not_b32 s7, s6 -; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen -; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: s_and_b32 s4, s20, 3 +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; 
GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s5, v2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s6, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX90A-NEXT: s_not_b32 s8, s4 +; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX90A-NEXT: s_movk_i32 s9, 0x7fff +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s9 +; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s8 
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, s6 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4123,39 +4109,38 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s6, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v1, s6 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s7, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX908-NEXT: s_not_b32 s8, s4 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX908-NEXT: s_movk_i32 s9, 0x7fff +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v2, v2, v0, s9 +; GFX908-NEXT: v_add3_u32 v2, v2, v0, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4165,41 +4150,40 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: 
s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s6, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s7, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX8-NEXT: s_not_b32 s8, s4 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v2, s8, v1 +; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4209,38 +4193,37 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s6, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s4, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s7, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX7-NEXT: s_lshl_b32 s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s8, s4 +; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s8, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4250,7 +4233,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -4258,32 +4241,31 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s6, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v1, s6 -; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s4, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s7, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX6-NEXT: s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s8, s4 +; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX6-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s8, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4293,7 +4275,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 
s[30:31] @@ -4311,52 +4293,53 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_lshl_b32 s5, s5, 3 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s7, s6 -; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen -; GFX12-NEXT: s_mov_b32 s6, 0 +; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s5, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v3 -; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -4364,33 
+4347,32 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s6, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v1, s6 -; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s4, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s7, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX940-NEXT: s_not_b32 s8, s4 +; GFX940-NEXT: s_lshl_b32 s6, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX940-NEXT: s_movk_i32 s9, 0x7fff +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff ; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX940-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v4, v4, v0, s9 +; GFX940-NEXT: v_add3_u32 v4, v4, v0, s8 ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: s_nop 1 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -4406,123 +4388,122 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_and_b32 s5, s16, 3 -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_lshl_b32 s5, s5, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX11-NEXT: s_not_b32 s7, s6 -; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen -; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: s_and_b32 s4, s16, 3 +; GFX11-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen +; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start 
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, s5, v0 -; GFX11-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v3 -; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: s_and_b32 s5, s20, 3 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: s_lshl_b32 s5, s5, 3 -; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX10-NEXT: s_not_b32 s7, s6 -; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen -; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: s_and_b32 s4, s20, 3 +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: 
v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s6, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX90A-NEXT: s_not_b32 s8, s4 +; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX90A-NEXT: s_movk_i32 s9, 0x7fff +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s9 +; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -4538,37 +4519,36 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s6, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v1, s6 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v2, s4 +; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s7, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX908-NEXT: s_not_b32 s8, s4 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX908-NEXT: s_movk_i32 s9, 0x7fff +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v3, v3, v0, s9 +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX908-NEXT: v_add3_u32 v4, v4, v0, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s8, v0 -; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v3, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4579,39 +4559,38 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; 
GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s6, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s7, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX8-NEXT: s_not_b32 s8, s4 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v3, s8, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4622,36 +4601,35 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s6, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s4, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s7, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX7-NEXT: s_lshl_b32 s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s8, s4 +; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s8, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4662,37 +4640,36 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s6, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v1, s6 -; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s4, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s7, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX6-NEXT: s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s8, s4 +; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v3, s8, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: 
v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB14_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5419,27 +5396,29 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s5, s16, 0x400 -; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v3, s4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 +; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 -; GFX12-NEXT: v_pk_max_num_f16 v0, v2, v2 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v1, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v0, v5, v5 +; GFX12-NEXT: v_pk_min_num_f16 v4, v0, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -5452,20 +5431,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v0, v5, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: v_pk_min_f16 v4, v0, v1 ; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: v_pk_min_f16 v4, v0, v2 +; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -5481,27 +5461,28 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX11-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x400 -; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 -; GFX11-NEXT: v_pk_max_f16 v0, v2, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v1, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX11-NEXT: v_pk_min_f16 v4, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -5513,27 +5494,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v2, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_max_f16 v1, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX10-NEXT: v_pk_min_f16 v4, v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: 
s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB16_1 @@ -5544,19 +5525,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v0, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_pk_min_f16 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -5572,25 +5553,25 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_pk_max_f16 v1, v2, v2 -; GFX908-NEXT: v_pk_max_f16 v0, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v0, v1 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX908-NEXT: v_pk_min_f16 v4, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB16_1 @@ -5601,29 +5582,29 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v3, v1, v1 +; 
GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v0, v6, v0 -; GFX8-NEXT: v_or_b32_e32 v3, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v1, v6, v6 +; GFX8-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v1, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v5, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v5 +; GFX8-NEXT: v_mov_b32_e32 v1, v6 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 @@ -5646,30 +5627,30 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v0 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 @@ -5692,31 
+5673,31 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 @@ -5738,25 +5719,26 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s16 -; GFX12-NEXT: s_add_co_i32 s5, s16, 0x400 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 +; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen offset:1024 +; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v1, v0, v0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_pk_min_num_f16 v1, v3, v1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, v2 -; GFX12-NEXT: v_mov_b32_e32 v3, v1 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v2 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 
v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -5770,24 +5752,25 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v0, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX940-NEXT: v_mov_b32_e32 v6, s6 -; GFX940-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX940-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_cbranch_execnz .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5798,25 +5781,25 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x400 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: v_pk_max_f16 v2, v0, v0 +; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_pk_min_f16 v1, v3, v1 -; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v3, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -5829,25 +5812,25 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_pk_max_f16 v2, v0, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_min_f16 v1, v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 @@ -5859,23 +5842,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, s6 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX90A-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5886,24 +5869,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen 
offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v0, v0 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX908-NEXT: v_pk_min_f16 v1, v3, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX908-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5914,28 +5897,28 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v1, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v4, v0, v0 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_min_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v3, v5, v4 -; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v1, v1 +; GFX8-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: v_mov_b32_e32 v6, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5946,41 +5929,41 @@ define void 
@buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_mov_b32_e32 v7, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_min_f32_e32 v4, v4, v0 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_or_b32_e32 v4, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 @@ -5992,42 +5975,42 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: 
v_cvt_f32_f16_e32 v3, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_mov_b32_e32 v7, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_min_f32_e32 v4, v4, v0 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_or_b32_e32 v4, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v3, v5, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 @@ -6048,7 +6031,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 +; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -6064,26 +6047,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v8, v4, s[4:7], null offen offset:1024 +; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: v_pk_max_num_f16 v8, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX12-NEXT: v_pk_max_num_f16 v4, v5, v5 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v6, v8, v8 +; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v7, v6, v4 -; GFX12-NEXT: v_mov_b32_e32 v6, v7 
-; GFX12-NEXT: v_mov_b32_e32 v7, v8 +; GFX12-NEXT: v_pk_min_num_f16 v5, v4, v8 +; GFX12-NEXT: v_mov_b32_e32 v4, v5 +; GFX12-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -6099,15 +6082,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 -; GFX12-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6115,14 +6098,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_cbranch_execnz .LBB18_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v6 +; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v10, 0x400, v4 +; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -6134,23 +6117,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024 +; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB18_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_pk_max_f16 v9, v5, v5 ; GFX940-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 ; GFX940-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v6, v9, v9 +; GFX940-NEXT: v_pk_max_f16 v4, v7, v7 ; GFX940-NEXT: s_mov_b64 s[8:9], exec -; GFX940-NEXT: v_pk_min_f16 v8, v6, v4 +; GFX940-NEXT: v_pk_min_f16 v6, v4, v9 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[8:9] +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX940-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -6163,27 +6146,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) 
-; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB18_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v9, v6 +; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB18_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 @@ -6197,25 +6180,25 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024 +; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX11-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v6, v8, v8 +; GFX11-NEXT: v_pk_max_f16 v4, v6, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v7, v6, v4 -; GFX11-NEXT: v_mov_b32_e32 v6, v7 -; GFX11-NEXT: v_mov_b32_e32 v7, v8 +; GFX11-NEXT: v_pk_min_f16 v5, v4, v8 +; GFX11-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-NEXT: v_mov_b32_e32 v5, v6 ; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 @@ -6229,14 +6212,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB18_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 -; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 
s1, vcc_lo, s1 @@ -6245,13 +6228,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_cbranch_execnz .LBB18_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v6 +; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 @@ -6263,24 +6246,24 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB18_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX10-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX10-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v6, v8, v8 +; GFX10-NEXT: v_pk_max_f16 v4, v6, v6 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_min_f16 v7, v6, v4 -; GFX10-NEXT: v_mov_b32_e32 v6, v7 -; GFX10-NEXT: v_mov_b32_e32 v7, v8 +; GFX10-NEXT: v_pk_min_f16 v5, v4, v8 +; GFX10-NEXT: v_mov_b32_e32 v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v5, v6 ; GFX10-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -6292,15 +6275,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB18_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 -; GFX10-NEXT: v_mov_b32_e32 v8, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -6309,13 +6292,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_cbranch_execnz .LBB18_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, v6 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4 +; 
GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -6327,22 +6310,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_pk_max_f16 v9, v5, v5 ; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v6, v9, v9 -; GFX90A-NEXT: v_pk_min_f16 v8, v6, v4 +; GFX90A-NEXT: v_pk_max_f16 v4, v7, v7 +; GFX90A-NEXT: v_pk_min_f16 v6, v4, v9 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -6354,27 +6337,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v9, v6 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 +; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -6386,23 +6369,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB18_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 
exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX908-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX908-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v6, v8, v8 -; GFX908-NEXT: v_pk_min_f16 v7, v6, v4 -; GFX908-NEXT: v_mov_b32_e32 v6, v7 +; GFX908-NEXT: v_pk_max_f16 v4, v6, v6 +; GFX908-NEXT: v_pk_min_f16 v5, v4, v8 +; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v7, v8 +; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -6414,27 +6397,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB18_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v8, v6 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB18_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -6446,27 +6429,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_max_f16_sdwa v8, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v9, v5, v5 ; GFX8-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX8-NEXT: v_max_f16_sdwa v4, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v6, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_min_f16_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v6, v5, v5 -; GFX8-NEXT: 
v_max_f16_e32 v7, v8, v8 -; GFX8-NEXT: v_min_f16_e32 v6, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v7, v6, v4 -; GFX8-NEXT: v_mov_b32_e32 v6, v7 +; GFX8-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_min_f16_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v5, v9 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v7, v8 +; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -6478,21 +6461,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB18_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v8, v6 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB18_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -6669,43 +6652,45 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s6, s16, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 ; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v1, 0xffff0000, v2 +; GFX12-NEXT: v_mov_b32_e32 v6, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX12-NEXT: v_min_num_f32_e32 v1, v1, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX12-NEXT: v_min_num_f32_e32 v1, v5, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX12-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX12-NEXT: 
v_or_b32_e32 v9, 0x400000, v1 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX12-NEXT: v_add3_u32 v7, v7, v1, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_min_num_f32 v0, v3, v0 :: v_dual_cndmask_b32 v1, v5, v7 -; GFX12-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX12-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v5, s6 -; GFX12-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX12-NEXT: v_add3_u32 v5, v5, v0, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 -; GFX12-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 +; GFX12-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX12-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -6718,41 +6703,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s8, s16, 0x400 +; GFX940-NEXT: s_add_i32 s4, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: s_movk_i32 s9, 0x7fff -; GFX940-NEXT: s_mov_b32 s10, 0x7060302 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX940-NEXT: s_mov_b32 s9, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 ; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX940-NEXT: v_min_f32_e32 v1, v4, v1 -; GFX940-NEXT: v_min_f32_e32 v0, v6, v0 -; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX940-NEXT: v_add3_u32 v7, v7, v0, s9 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s8 -; 
GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] -; GFX940-NEXT: v_perm_b32 v4, v1, v0, s10 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GFX940-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] +; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7] +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_cbranch_execnz .LBB19_1 @@ -6763,45 +6748,47 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s6, s16, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v1, 0xffff0000, v2 +; GFX11-NEXT: v_mov_b32_e32 v6, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX11-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX11-NEXT: v_min_f32_e32 v1, v5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_min_f32 v0, v3, v0 :: v_dual_cndmask_b32 v1, v5, v7 -; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX11-NEXT: 
v_or_b32_e32 v6, 0x400000, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX11-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-NEXT: v_mov_b32_e32 v5, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 +; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -6814,41 +6801,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s6, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v0, v3, v0 -; GFX10-NEXT: v_min_f32_e32 v1, v5, v1 -; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v6, 
s4 -; GFX10-NEXT: v_mov_b32_e32 v5, s6 -; GFX10-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v0, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB19_1 @@ -6859,40 +6846,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 +; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: s_movk_i32 s9, 0x7fff -; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX90A-NEXT: v_min_f32_e32 v1, v4, v1 -; GFX90A-NEXT: v_min_f32_e32 v0, v6, v0 -; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s9 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v4, v1, v0, s10 -; GFX90A-NEXT: v_mov_b32_e32 v3, s8 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], 
v[6:7] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 @@ -6903,41 +6890,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s8, s20, 0x400 +; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: s_movk_i32 s9, 0x7fff -; GFX908-NEXT: s_mov_b32 s10, 0x7060302 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v1, v3, v1 -; GFX908-NEXT: v_min_f32_e32 v0, v6, v0 -; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v3, v3, v1, s9 -; GFX908-NEXT: v_add3_u32 v7, v7, v0, s9 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v3, v1, v0, s10 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s8 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v6, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX908-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9 +; GFX908-NEXT: v_mov_b32_e32 v0, v5 +; GFX908-NEXT: v_mov_b32_e32 v1, v6 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB19_1 @@ -6948,42 +6935,42 @@ define <2 x bfloat> 
@buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s8, s20, 0x400 +; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v1, v3, v1 -; GFX8-NEXT: v_min_f32_e32 v0, v6, v0 -; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v3, v1, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s8 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, v5 +; GFX8-NEXT: v_mov_b32_e32 v1, v6 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB19_1 @@ -6995,38 +6982,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; 
GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v3 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v6, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7037,39 +7024,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v2 +; 
GFX6-NEXT: v_min_f32_e32 v6, v6, v3 ; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v6, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7089,43 +7076,41 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s16 -; GFX12-NEXT: s_add_co_i32 s6, s16, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_min_num_f32 v5, v5, v3 :: v_dual_min_num_f32 v0, v0, v2 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v1, v3, v1 -; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s4, v1, v1 -; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 -; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo -; GFX12-NEXT: s_wait_alu 0xfffe -; 
GFX12-NEXT: v_mov_b32_e32 v5, s6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX12-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -7139,40 +7124,40 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s8, s16, 0x400 +; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_add_i32 s4, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: s_movk_i32 s9, 0x7fff -; GFX940-NEXT: s_mov_b32 s10, 0x7060302 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX940-NEXT: s_mov_b32 s9, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 ; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX940-NEXT: v_min_f32_e32 v1, v2, v1 -; GFX940-NEXT: v_min_f32_e32 v2, v5, v4 -; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s9 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v6, s8 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX940-NEXT: v_perm_b32 v2, v2, v1, s10 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX940-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, 
v7, s[4:5] +; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] +; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_cbranch_execnz .LBB20_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7182,45 +7167,43 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s6, s16, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_min_f32 v5, v5, v3 :: v_dual_min_f32 v0, v0, v2 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v1, v3, v1 -; GFX11-NEXT: v_min_f32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v1, v1 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo -; GFX11-NEXT: v_mov_b32_e32 v5, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: 
v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -7234,39 +7217,39 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s6, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_f32_e32 v1, v3, v1 -; GFX10-NEXT: v_min_f32_e32 v3, v5, v4 -; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v5, s6 -; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; 
GFX10-NEXT: s_cbranch_execnz .LBB20_1 @@ -7278,39 +7261,39 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: s_movk_i32 s9, 0x7fff -; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_min_f32_e32 v1, v2, v1 -; GFX90A-NEXT: v_min_f32_e32 v2, v5, v4 -; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s9 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v2, v2, v1, s10 -; GFX90A-NEXT: v_mov_b32_e32 v6, s8 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7321,40 +7304,40 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s8, s20, 0x400 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; 
GFX908-NEXT: s_movk_i32 s9, 0x7fff -; GFX908-NEXT: s_mov_b32 s10, 0x7060302 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX908-NEXT: v_min_f32_e32 v1, v3, v1 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v4 -; GFX908-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s9 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX908-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v1, v3, v1, s10 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v6, s8 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX908-NEXT: v_mov_b32_e32 v6, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7365,41 +7348,41 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s8, s20, 0x400 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: v_min_f32_e32 v1, v3, v1 -; GFX8-NEXT: v_min_f32_e32 v3, v5, v4 -; GFX8-NEXT: 
v_bfe_u32 v4, v1, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v1, v3, v1, 16 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v6, s8 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v6, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7411,37 +7394,37 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_min_f32_e32 v4, v4, v0 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v5, 16 +; GFX7-NEXT: v_and_b32_e32 
v5, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7453,38 +7436,38 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 ; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_min_f32_e32 v4, v4, v0 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v2, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v5, 16 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v3, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_alignbit_b32 v3, v3, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; 
GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7504,7 +7487,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 +; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -7520,43 +7503,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v8, v4, s[4:7], null offen offset:1024 +; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX12-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX12-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: v_min_num_f32_e32 v4, v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v6, v10, v7 -; GFX12-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_min_num_f32 v5, v5, v9 :: v_dual_min_num_f32 v4, v4, v8 +; GFX12-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v12, 0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_or_b32_e32 v12, 0x400000, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v4, 0x7fff -; GFX12-NEXT: v_add3_u32 v10, v10, v6, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo +; GFX12-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX12-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX12-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v7, v6, v4, 0x7060302 -; GFX12-NEXT: v_mov_b32_e32 v6, v7 -; GFX12-NEXT: v_mov_b32_e32 v7, v8 +; GFX12-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 +; GFX12-NEXT: 
v_mov_b32_e32 v4, v5 +; GFX12-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -7572,15 +7554,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 -; GFX12-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7588,14 +7570,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_cbranch_execnz .LBB21_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v6 +; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v10, 0x400, v4 +; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -7607,40 +7589,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024 +; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB21_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX940-NEXT: s_movk_i32 s10, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX940-NEXT: s_mov_b32 s11, 0x7060302 ; GFX940-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 ; GFX940-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; GFX940-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s10 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX940-NEXT: v_min_f32_e32 v4, v4, v9 +; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX940-NEXT: s_mov_b64 s[8:9], exec ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX940-NEXT: 
v_and_b32_e32 v7, 0xffff0000, v9 -; GFX940-NEXT: v_min_f32_e32 v6, v7, v6 -; GFX940-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX940-NEXT: v_add3_u32 v7, v7, v6, s10 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v6 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_min_f32_e32 v5, v5, v10 +; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX940-NEXT: v_add3_u32 v6, v6, v5, s10 +; GFX940-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc -; GFX940-NEXT: v_perm_b32 v8, v6, v4, s11 -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[8:9] +; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc +; GFX940-NEXT: v_perm_b32 v6, v5, v4, s11 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX940-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -7653,27 +7635,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB21_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v9, v6 +; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB21_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 @@ -7687,41 +7669,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024 +; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB21_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX11-NEXT: v_and_b32_e32 v7, 
0xffff0000, v5 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v6, v10, v7 -; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_min_f32 v5, v5, v9 :: v_dual_min_f32 v4, v4, v8 +; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v4, 0x7fff -; GFX11-NEXT: v_add3_u32 v10, v10, v6, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX11-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v7, v6, v4, 0x7060302 -; GFX11-NEXT: v_mov_b32_e32 v6, v7 -; GFX11-NEXT: v_mov_b32_e32 v7, v8 +; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 +; GFX11-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-NEXT: v_mov_b32_e32 v5, v6 ; GFX11-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 @@ -7735,14 +7718,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB21_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 -; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -7750,14 +7733,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_cbranch_execnz .LBB21_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v6 +; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 @@ -7769,38 +7753,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB21_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX10-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX10-NEXT: v_min_f32_e32 v6, v10, v7 -; GFX10-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX10-NEXT: v_min_f32_e32 v4, v4, v8 +; GFX10-NEXT: v_min_f32_e32 v5, v5, v9 +; GFX10-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v4 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v4, 0x7fff -; GFX10-NEXT: v_add3_u32 v10, v10, v6, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo -; GFX10-NEXT: v_perm_b32 v7, v6, v4, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v6, v7 -; GFX10-NEXT: v_mov_b32_e32 v7, v8 +; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX10-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX10-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v5, v6 ; GFX10-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -7812,15 +7796,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB21_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; 
GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 -; GFX10-NEXT: v_mov_b32_e32 v8, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -7829,13 +7813,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_cbranch_execnz .LBB21_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, v6 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4 +; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -7847,38 +7831,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX90A-NEXT: s_mov_b32 s15, 0x7060302 ; GFX90A-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s14 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v9 +; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 -; GFX90A-NEXT: v_min_f32_e32 v6, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX90A-NEXT: v_add3_u32 v7, v7, v6, s14 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v8, v6, v4, s15 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_min_f32_e32 v5, v5, v10 +; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s14 +; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] 
op_sel:[0,1] ; GFX90A-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -7890,27 +7874,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v9, v6 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 +; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -7922,39 +7906,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX908-NEXT: s_movk_i32 s14, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX908-NEXT: s_mov_b32 s15, 0x7060302 ; GFX908-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX908-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s14 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX908-NEXT: v_min_f32_e32 v4, v4, v8 +; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v4 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 -; GFX908-NEXT: v_min_f32_e32 v6, v7, v6 -; GFX908-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX908-NEXT: v_add3_u32 v7, v7, v6, s14 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cndmask_b32_e32 v6, v7, v10, vcc -; 
GFX908-NEXT: v_perm_b32 v7, v6, v4, s15 -; GFX908-NEXT: v_mov_b32_e32 v6, v7 +; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_min_f32_e32 v5, v5, v9 +; GFX908-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX908-NEXT: v_add3_u32 v10, v10, v5, s14 +; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v4, s15 +; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v7, v8 +; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -7966,27 +7950,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v8, v6 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB21_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -7998,40 +7982,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX8-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX8-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX8-NEXT: v_min_f32_e32 v4, v4, v8 +; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX8-NEXT: 
v_add_u32_e32 v5, vcc, v5, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 -; GFX8-NEXT: v_min_f32_e32 v6, v7, v6 -; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v10, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v7, v6, v4, 16 -; GFX8-NEXT: v_mov_b32_e32 v6, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v9 +; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 +; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v7, v8 +; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -8043,21 +8027,21 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v8, v6 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB21_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -8236,19 +8220,19 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; 
GFX940-NEXT: v_min_f32_e32 v4, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 +; GFX940-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 @@ -8287,19 +8271,19 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc @@ -8317,25 +8301,25 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v0, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v0, v1 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 @@ -8352,19 +8336,19 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 
v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll index 28a2245406842..3216e71e6221a 100644 --- a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll +++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll @@ -245,14 +245,14 @@ define void @recursive_phis(i1 %cond, ptr addrspace(5) %ptr) { ; GISEL-ASM-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GISEL-ASM-NEXT: ; %bb.2: ; %finallyendcf.split ; GISEL-ASM-NEXT: s_or_b64 exec, exec, s[6:7] -; GISEL-ASM-NEXT: s_mov_b64 s[6:7], 0 ; GISEL-ASM-NEXT: s_mov_b64 s[8:9], src_private_base +; GISEL-ASM-NEXT: s_mov_b64 s[6:7], 0 +; GISEL-ASM-NEXT: v_mov_b32_e32 v1, s9 ; GISEL-ASM-NEXT: v_mov_b32_e32 v2, 7 ; GISEL-ASM-NEXT: .LBB7_3: ; %finally ; GISEL-ASM-NEXT: ; =>This Inner Loop Header: Depth=1 -; GISEL-ASM-NEXT: s_and_b64 s[10:11], exec, s[4:5] -; GISEL-ASM-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] -; GISEL-ASM-NEXT: v_mov_b32_e32 v1, s9 +; GISEL-ASM-NEXT: s_and_b64 s[8:9], exec, s[4:5] +; GISEL-ASM-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] ; GISEL-ASM-NEXT: flat_store_dword v[0:1], v2 ; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) ; GISEL-ASM-NEXT: s_andn2_b64 exec, exec, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll index a8ddece611783..9104dc68eb9b4 100644 --- a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll +++ b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll @@ -6,90 +6,95 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_mov_b64 s[26:27], s[2:3] ; CHECK-NEXT: s_mov_b64 s[24:25], s[0:1] -; CHECK-NEXT: s_load_dword s4, s[8:9], 0x0 +; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CHECK-NEXT: s_load_dword s12, s[8:9], 0x4 +; CHECK-NEXT: s_load_dword s6, s[8:9], 0x4 ; CHECK-NEXT: s_add_u32 s24, s24, s15 ; CHECK-NEXT: s_addc_u32 s25, s25, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_bitcmp1_b32 s4, 0 +; CHECK-NEXT: s_bitcmp1_b32 s2, 0 +; CHECK-NEXT: s_cselect_b64 s[16:17], -1, 0 +; CHECK-NEXT: s_bitcmp1_b32 s2, 8 +; CHECK-NEXT: s_cselect_b64 s[10:11], -1, 0 +; CHECK-NEXT: s_bitcmp1_b32 s2, 16 ; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CHECK-NEXT: s_bitcmp1_b32 s4, 8 -; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0 -; CHECK-NEXT: s_bitcmp1_b32 s4, 16 -; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0 ; CHECK-NEXT: s_bitcmp1_b32 s0, 24 -; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0 -; CHECK-NEXT: s_xor_b64 s[14:15], s[4:5], -1 +; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0 +; CHECK-NEXT: s_xor_b64 s[4:5], s[8:9], -1 ; CHECK-NEXT: s_bitcmp1_b32 s1, 0 -; CHECK-NEXT: 
s_cselect_b64 s[10:11], -1, 0 -; CHECK-NEXT: s_bitcmp1_b32 s12, 8 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] ; CHECK-NEXT: s_cselect_b64 s[12:13], -1, 0 -; CHECK-NEXT: s_and_b64 s[0:1], exec, s[14:15] +; CHECK-NEXT: s_bitcmp1_b32 s6, 8 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[16:17] +; CHECK-NEXT: s_cselect_b64 s[14:15], -1, 0 +; CHECK-NEXT: s_and_b64 s[4:5], exec, s[4:5] +; CHECK-NEXT: s_and_b64 s[6:7], exec, s[10:11] +; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_branch .LBB0_3 ; CHECK-NEXT: .LBB0_1: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 s[16:17], 0 -; CHECK-NEXT: s_mov_b64 s[18:19], -1 -; CHECK-NEXT: s_mov_b64 s[14:15], -1 +; CHECK-NEXT: s_mov_b64 s[18:19], 0 ; CHECK-NEXT: s_mov_b64 s[20:21], -1 +; CHECK-NEXT: s_mov_b64 s[16:17], -1 +; CHECK-NEXT: s_mov_b64 s[22:23], -1 ; CHECK-NEXT: .LBB0_2: ; %Flow7 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_and_b64 vcc, exec, s[20:21] +; CHECK-NEXT: s_and_b64 vcc, exec, s[22:23] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_12 ; CHECK-NEXT: .LBB0_3: ; %bb7 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_andn2_b64 vcc, exec, s[8:9] +; CHECK-NEXT: s_and_b64 vcc, exec, s[2:3] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_1 ; CHECK-NEXT: ; %bb.4: ; %bb8 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 vcc, s[0:1] +; CHECK-NEXT: s_mov_b64 vcc, s[4:5] ; CHECK-NEXT: s_cbranch_vccz .LBB0_6 ; CHECK-NEXT: ; %bb.5: ; %bb9 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 s[14:15], 0 -; CHECK-NEXT: s_mov_b64 s[16:17], -1 -; CHECK-NEXT: s_mov_b64 s[20:21], s[6:7] +; CHECK-NEXT: s_mov_b64 s[16:17], 0 +; CHECK-NEXT: s_mov_b64 s[18:19], -1 +; CHECK-NEXT: s_mov_b64 s[22:23], s[10:11] ; CHECK-NEXT: s_cbranch_execz .LBB0_7 ; CHECK-NEXT: s_branch .LBB0_8 ; CHECK-NEXT: .LBB0_6: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 s[14:15], -1 -; CHECK-NEXT: s_mov_b64 s[16:17], 0 -; CHECK-NEXT: s_mov_b64 s[20:21], 0 +; CHECK-NEXT: s_mov_b64 s[16:17], -1 +; CHECK-NEXT: s_mov_b64 s[18:19], 0 +; CHECK-NEXT: s_mov_b64 s[22:23], 0 ; CHECK-NEXT: .LBB0_7: ; %bb10 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 s[16:17], -1 -; CHECK-NEXT: s_mov_b64 s[14:15], 0 -; CHECK-NEXT: s_mov_b64 s[20:21], s[12:13] +; CHECK-NEXT: s_mov_b64 s[18:19], -1 +; CHECK-NEXT: s_mov_b64 s[16:17], 0 +; CHECK-NEXT: s_mov_b64 s[22:23], s[14:15] ; CHECK-NEXT: .LBB0_8: ; %Flow9 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 s[18:19], -1 -; CHECK-NEXT: s_andn2_b64 vcc, exec, s[20:21] ; CHECK-NEXT: s_mov_b64 s[20:21], -1 +; CHECK-NEXT: s_andn2_b64 vcc, exec, s[22:23] +; CHECK-NEXT: s_mov_b64 s[22:23], -1 ; CHECK-NEXT: s_cbranch_vccnz .LBB0_2 ; CHECK-NEXT: ; %bb.9: ; %bb13 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 s[14:15], 0 -; CHECK-NEXT: s_and_b64 vcc, exec, s[6:7] -; CHECK-NEXT: s_mov_b64 s[18:19], 0 +; CHECK-NEXT: s_mov_b64 s[16:17], 0 +; CHECK-NEXT: s_mov_b64 s[20:21], 0 +; CHECK-NEXT: s_mov_b64 vcc, s[6:7] ; CHECK-NEXT: s_cbranch_vccz .LBB0_11 ; CHECK-NEXT: ; %bb.10: ; %bb16 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 s[18:19], -1 -; CHECK-NEXT: s_mov_b64 s[20:21], s[10:11] +; CHECK-NEXT: s_mov_b64 s[20:21], -1 +; CHECK-NEXT: s_mov_b64 s[22:23], s[12:13] ; CHECK-NEXT: .LBB0_11: ; %Flow11 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 s[16:17], 0 +; CHECK-NEXT: s_mov_b64 s[18:19], 0 ; 
CHECK-NEXT: s_branch .LBB0_2 ; CHECK-NEXT: .LBB0_12: ; %loop.exit.guard6 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_xor_b64 s[20:21], s[18:19], -1 -; CHECK-NEXT: s_mov_b64 s[18:19], -1 -; CHECK-NEXT: s_and_b64 vcc, exec, s[20:21] +; CHECK-NEXT: s_xor_b64 s[22:23], s[20:21], -1 +; CHECK-NEXT: s_mov_b64 s[20:21], -1 +; CHECK-NEXT: s_and_b64 vcc, exec, s[22:23] ; CHECK-NEXT: s_cbranch_vccz .LBB0_16 ; CHECK-NEXT: ; %bb.13: ; %bb14 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_15 ; CHECK-NEXT: ; %bb.14: ; %bb15 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 @@ -97,22 +102,22 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 ; CHECK-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; CHECK-NEXT: .LBB0_15: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 s[18:19], 0 +; CHECK-NEXT: s_mov_b64 s[20:21], 0 ; CHECK-NEXT: .LBB0_16: ; %Flow13 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_andn2_b64 vcc, exec, s[18:19] +; CHECK-NEXT: s_andn2_b64 vcc, exec, s[20:21] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_3 ; CHECK-NEXT: ; %bb.17: ; %loop.exit.guard -; CHECK-NEXT: s_and_b64 vcc, exec, s[14:15] +; CHECK-NEXT: s_and_b64 vcc, exec, s[16:17] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_22 ; CHECK-NEXT: ; %bb.18: ; %loop.exit.guard5 -; CHECK-NEXT: s_and_b64 vcc, exec, s[16:17] +; CHECK-NEXT: s_and_b64 vcc, exec, s[18:19] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_23 ; CHECK-NEXT: ; %bb.19: ; %bb17 -; CHECK-NEXT: s_and_b64 vcc, exec, s[4:5] +; CHECK-NEXT: s_and_b64 vcc, exec, s[8:9] ; CHECK-NEXT: s_cbranch_vccz .LBB0_21 ; CHECK-NEXT: ; %bb.20: ; %bb19 -; CHECK-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] ; CHECK-NEXT: s_cbranch_vccz .LBB0_22 ; CHECK-NEXT: .LBB0_21: ; %bb18 ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index 4281a4b74cbae..ff48a3fc98018 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -16515,39 +16515,41 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB68_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: 
v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16562,34 +16564,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB68_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; 
GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB68_1 @@ -16603,33 +16605,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB68_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB68_1 @@ -16643,33 +16645,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; 
GFX908-NEXT: .LBB68_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB68_1 @@ -16683,34 +16685,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB68_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; 
GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB68_1 @@ -16788,39 +16790,41 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB69_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, 
v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16836,35 +16840,35 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB69_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX10-NEXT: v_add_f32_e32 v0, v6, v0 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB69_1 @@ -16877,33 +16881,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; 
GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB69_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB69_1 @@ -16917,33 +16921,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB69_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: 
flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB69_1 @@ -16959,34 +16963,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB69_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_add_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; 
GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB69_1 @@ -17074,38 +17078,40 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 ; GFX11-NEXT: flat_load_b32 v0, v[4:5] ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_lshlrev_b32 v0, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: v_dual_add_f32 v0, v6, v0 :: v_dual_and_b32 v7, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v5, v7, v5 -; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v0 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -17120,35 +17126,35 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 
0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX10-NEXT: v_add_f32_e32 v0, v6, v0 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB70_1 @@ -17165,33 +17171,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX90A-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX90A-NEXT: v_add_f32_e32 v0, v7, v0 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v0, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, 
v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB70_1 @@ -17208,33 +17214,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_add_f32_e32 v0, v7, v0 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v0, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v0, s9 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, 
s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB70_1 @@ -17249,34 +17255,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_add_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB70_1 @@ -17353,41 +17359,41 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB71_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -17400,35 +17406,35 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB71_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo 
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB71_1 @@ -17439,36 +17445,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB71_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB71_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17478,36 +17484,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB71_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB71_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17517,37 +17523,37 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB71_1: ; %atomicrmw.start ; GFX8-NEXT: ; 
=>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB71_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17620,41 +17626,41 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB72_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 
v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -17669,35 +17675,35 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB72_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 
v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB72_1 @@ -17708,36 +17714,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB72_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: 
s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB72_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17747,36 +17753,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB72_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB72_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17788,37 +17794,37 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB72_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 
0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB72_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17901,41 +17907,41 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: flat_load_b32 v4, v[3:4] +; GFX11-NEXT: flat_load_b32 v3, v[3:4] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB73_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: 
v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -17950,35 +17956,35 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB73_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: 
v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB73_1 @@ -17995,28 +18001,28 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB73_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX90A-NEXT: v_add_f32_e32 v0, v3, v0 -; GFX90A-NEXT: v_add_f32_e32 v3, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -18038,28 +18044,28 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB73_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX908-NEXT: 
v_add_f32_e32 v0, v5, v0 -; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX908-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v0, v6, v0, s9 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -18077,37 +18083,37 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB73_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; 
GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB73_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18185,39 +18191,41 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv 
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -18233,35 +18241,35 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX10-NEXT: v_add_f32_e32 v0, v6, v0 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB74_1 @@ -18274,35 +18282,35 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB74_1 @@ -18316,33 +18324,33 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; 
GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB74_1 @@ -18358,34 +18366,34 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_add_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB74_1 @@ -18463,41 +18471,41 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX11-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX11-NEXT: flat_load_b32 v3, 
v[0:1] offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -18512,35 +18520,35 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; 
GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB75_1 @@ -18551,38 +18559,38 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, 
v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB75_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18592,36 +18600,36 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX908-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: 
v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB75_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18633,37 +18641,37 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB75_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18740,39 +18748,41 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -18787,34 +18797,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, 
v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB76_1 @@ -18828,33 +18838,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: 
v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB76_1 @@ -18868,33 +18878,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB76_1 @@ -18908,34 +18918,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX8-NEXT: v_and_b32_e32 
v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB76_1 @@ -19012,41 +19022,41 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: 
v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -19059,35 +19069,35 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt 
null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB77_1 @@ -19098,36 +19108,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB77_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19137,36 +19147,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; 
GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB77_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19176,37 +19186,37 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB77_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19280,39 +19290,41 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -19327,34 +19339,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB78_1 @@ -19368,33 +19380,33 @@ define <2 x bfloat> 
@flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB78_1 @@ -19408,33 +19420,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: 
v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB78_1 @@ -19448,34 +19460,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; 
GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB78_1 @@ -19552,41 +19564,41 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -19599,35 +19611,35 @@ define void 
@flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB79_1 @@ -19638,36 +19650,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v7, 
v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB79_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19677,36 +19689,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: 
v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB79_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19716,37 +19728,37 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: 
s_cbranch_execnz .LBB79_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index 36d7201c6fe46..36aa73fbf8e92 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -31,13 +31,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -76,13 +76,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -100,13 +100,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -173,13 +173,13 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -220,13 +220,13 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; 
GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -244,13 +244,13 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -329,18 +329,18 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX940-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX940-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_max_f32_e32 v0, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v0, v1 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB2_1 @@ -381,17 +381,17 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz 
.LBB2_1 @@ -408,17 +408,17 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX908-NEXT: v_max_f32_e32 v0, v0, v5 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v6, v6 +; GFX908-NEXT: v_max_f32_e32 v5, v0, v1 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_1 @@ -482,21 +482,21 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB3_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -528,20 +528,20 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: 
v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -551,20 +551,20 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -622,21 +622,21 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB4_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -670,20 +670,20 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; 
GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -693,20 +693,20 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -773,23 +773,23 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX940-NEXT: s_movk_i32 s0, 0xf800 ; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v5, v[4:5] +; GFX940-NEXT: flat_load_dword v3, v[4:5] ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB5_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -831,12 +831,12 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 
; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1 -; GFX90A-NEXT: v_max_f32_e32 v0, v3, v0 +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -858,12 +858,12 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v1, v1 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX908-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -935,13 +935,13 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -982,13 +982,13 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1008,13 +1008,13 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: 
flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1084,21 +1084,21 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX940-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB7_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1132,22 +1132,22 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1157,20 +1157,20 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; 
GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1235,13 +1235,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1259,14 +1259,15 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_max_f32 v3, v2, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX11-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX11-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1286,14 +1287,14 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX10-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1313,13 +1314,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: 
; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1337,13 +1338,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1426,13 +1427,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1471,13 +1472,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1495,13 +1496,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1572,13 +1573,13 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1617,13 +1618,13 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1641,13 +1642,13 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1714,13 +1715,13 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1761,13 +1762,13 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, 
v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1785,13 +1786,13 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1870,18 +1871,18 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX940-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX940-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_max_f32_e32 v0, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v0, v1 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB12_1 @@ -1922,17 +1923,17 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 @@ -1949,17 +1950,17 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; 
GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX908-NEXT: v_max_f32_e32 v0, v0, v5 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v6, v6 +; GFX908-NEXT: v_max_f32_e32 v5, v0, v1 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 @@ -2023,21 +2024,21 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2069,20 +2070,20 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2092,20 +2093,20 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2163,21 +2164,21 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2211,20 +2212,20 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; 
GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2234,20 +2235,20 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2314,23 +2315,23 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX940-NEXT: s_movk_i32 s0, 0xf800 ; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v5, v[4:5] +; GFX940-NEXT: flat_load_dword v3, v[4:5] ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB15_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2372,12 +2373,12 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1 -; GFX90A-NEXT: v_max_f32_e32 
v0, v3, v0 +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -2399,12 +2400,12 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v1, v1 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX908-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2476,13 +2477,13 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2523,13 +2524,13 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2549,13 +2550,13 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2625,21 +2626,21 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX940-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2673,22 +2674,22 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2698,20 +2699,20 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX908-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; 
GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2769,29 +2770,29 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB18_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -2800,25 +2801,24 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: .LBB18_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB18_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 
0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2865,29 +2865,29 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execz .LBB18_4 ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -2895,23 +2895,22 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: ; %bb.3: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: .LBB18_4: ; %Flow2 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB18_6 ; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX11-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ 
-3001,91 +3000,90 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB18_4 ; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v3 +; GFX908-NEXT: v_mov_b32_e32 v8, v2 +; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB18_2 ; GFX908-NEXT: ; %bb.3: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX908-NEXT: .LBB18_4: ; %Flow2 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB18_6 ; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: 
$vgpr2_vgpr3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB18_4 ; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v5, v[4:5] -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB18_2 ; GFX8-NEXT: ; %bb.3: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: .LBB18_4: ; %Flow2 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB18_6 @@ -3093,18 +3091,17 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 -; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX8-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -3162,6 +3159,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base @@ -3188,10 +3186,9 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: 
; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[6:7], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3210,7 +3207,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB19_2 ; GFX12-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3266,6 +3262,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base @@ -3288,10 +3285,9 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] -; GFX11-NEXT: v_max_f64 v[6:7], v[6:7], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3310,7 +3306,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: s_cbranch_execz .LBB19_2 ; GFX11-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX11-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3408,6 +3403,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base @@ -3430,9 +3426,8 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v1 ; 
GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[6:7] +; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] ; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3448,7 +3443,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_cbranch_execz .LBB19_2 ; GFX908-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 @@ -3464,6 +3458,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 @@ -3491,9 +3486,8 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[6:7] +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3509,7 +3503,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_cbranch_execz .LBB19_2 ; GFX8-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 ; GFX8-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen @@ -3578,6 +3571,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base @@ -3604,10 +3598,9 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[6:7], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3626,7 +3619,6 @@ define double 
@flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB20_2 ; GFX12-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3683,6 +3675,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base @@ -3705,10 +3698,9 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] -; GFX11-NEXT: v_max_f64 v[6:7], v[6:7], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3727,7 +3719,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: s_cbranch_execz .LBB20_2 ; GFX11-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX11-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3825,6 +3816,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base @@ -3847,9 +3839,8 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v1 ; GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[6:7] +; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] ; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3865,7 +3856,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_cbranch_execz .LBB20_2 ; GFX908-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX908-NEXT: 
buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 @@ -3881,6 +3871,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 @@ -3908,9 +3899,8 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[6:7] +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3926,7 +3916,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_cbranch_execz .LBB20_2 ; GFX8-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 ; GFX8-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen @@ -3995,6 +3984,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4011,21 +4001,20 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4034,19 +4023,18 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; 
implicit-def: $vgpr6_vgpr7 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB21_2 ; GFX12-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[6:7] +; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4092,6 +4080,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 @@ -4104,22 +4093,21 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -4127,19 +4115,18 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: ; %bb.5: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB21_2 ; GFX11-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], 
v[0:1], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4225,6 +4212,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -4237,40 +4225,38 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; GFX908-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB21_4 ; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB21_2 ; GFX908-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -4278,6 +4264,7 @@ define void 
@flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -4292,44 +4279,42 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; GFX8-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v7, v[4:5] -; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v5, v[2:3] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB21_4 ; GFX8-NEXT: ; %bb.5: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB21_2 ; GFX8-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 -; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 +; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4385,13 +4370,14 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, 
v1, vcc_lo +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB22_3 ; GFX12-NEXT: ; %bb.1: ; %Flow2 @@ -4404,21 +4390,20 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4426,20 +4411,19 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB22_4 ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB22_2 ; GFX12-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4487,12 +4471,13 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB22_3 ; GFX11-NEXT: ; %bb.1: ; %Flow2 @@ -4502,42 +4487,40 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_cbranch_execnz .LBB22_4 ; GFX11-NEXT: ; %bb.5: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4627,10 +4610,11 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 -; GFX908-NEXT: 
v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0x7f8, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_3 @@ -4641,40 +4625,38 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; GFX908-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB22_4 ; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB22_2 ; GFX908-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -4682,12 +4664,13 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] 
; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7f8, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v7 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_3 @@ -4698,44 +4681,42 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; GFX8-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v7, v[4:5] -; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v6 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB22_4 ; GFX8-NEXT: ; %bb.5: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB22_2 ; GFX8-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 -; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 +; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4794,13 +4775,14 @@ define void 
@flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB23_3 ; GFX12-NEXT: ; %bb.1: ; %Flow2 @@ -4813,21 +4795,20 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4835,20 +4816,19 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB23_4 ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB23_2 ; GFX12-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX12-NEXT: s_wait_alu 0xfffe @@ -4897,12 +4877,13 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB23_3 ; GFX11-NEXT: ; %bb.1: ; %Flow2 @@ -4912,42 +4893,40 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_cbranch_execnz .LBB23_4 ; GFX11-NEXT: ; %bb.5: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB23_2 ; GFX11-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -5037,10 +5016,11 @@ define void 
@flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_3 @@ -5051,40 +5031,38 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; GFX908-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB23_4 ; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB23_2 ; GFX908-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 
s[30:31] @@ -5092,12 +5070,13 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v7 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_3 @@ -5108,44 +5087,42 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; GFX8-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v7, v[4:5] -; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v6 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB23_4 ; GFX8-NEXT: ; %bb.5: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB23_2 ; GFX8-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 -; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 +; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; 
GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -5204,29 +5181,29 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB24_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5235,25 +5212,24 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: .LBB24_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB24_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: v_dual_mov_b32 
v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5300,29 +5276,29 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execz .LBB24_4 ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -5330,78 +5306,76 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX11-NEXT: ; %bb.3: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: .LBB24_4: ; %Flow2 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB24_6 ; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX11-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX10-NEXT: s_mov_b64 s[4:5], 
src_private_base -; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 ; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execz .LBB24_4 ; GFX10-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX10-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX10-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v9, v3 +; GFX10-NEXT: v_mov_b32_e32 v8, v2 +; GFX10-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX10-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB24_2 ; GFX10-NEXT: ; %bb.3: ; %Flow ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX10-NEXT: .LBB24_4: ; %Flow2 ; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 ; GFX10-NEXT: s_cbranch_execz .LBB24_6 ; GFX10-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX10-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX10-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX10-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX10-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: @@ -5409,140 +5383,138 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB24_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; 
GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB24_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB24_4: ; %Flow2 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB24_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB24_4 ; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 
v[4:5], v[4:5], v[8:9] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v3 +; GFX908-NEXT: v_mov_b32_e32 v8, v2 +; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB24_2 ; GFX908-NEXT: ; %bb.3: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX908-NEXT: .LBB24_4: ; %Flow2 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB24_6 ; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB24_4 ; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v5, v[4:5] -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_max_f64 
v[2:3], v[8:9], v[8:9] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB24_2 ; GFX8-NEXT: ; %bb.3: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: .LBB24_4: ; %Flow2 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB24_6 @@ -5550,18 +5522,17 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 -; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX8-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5570,37 +5541,37 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX7-NEXT: s_cbranch_execz .LBB24_4 ; GFX7-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v5, v[4:5] -; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v3, v[2:3] +; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX7-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX7-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB24_2 ; GFX7-NEXT: ; %bb.3: ; %Flow ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX7-NEXT: .LBB24_4: ; %Flow2 ; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_cbranch_execz .LBB24_6 @@ -5608,18 +5579,17 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 -; GFX7-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX7-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen -; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX7-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX7-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX7-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 @@ -5634,29 +5604,29 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB25_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv 
scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5665,25 +5635,24 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: .LBB25_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB25_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5730,29 +5699,29 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execz .LBB25_4 ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -5760,23 +5729,22 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX11-NEXT: ; %bb.3: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: .LBB25_4: ; %Flow2 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB25_6 ; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX11-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -5866,91 +5834,90 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB25_4 ; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v3 +; GFX908-NEXT: v_mov_b32_e32 v8, v2 +; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB25_2 ; GFX908-NEXT: ; %bb.3: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX908-NEXT: .LBB25_4: ; %Flow2 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; 
GFX908-NEXT: s_cbranch_execz .LBB25_6 ; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB25_4 ; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v5, v[4:5] -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB25_2 ; GFX8-NEXT: ; %bb.3: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: .LBB25_4: ; %Flow2 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB25_6 @@ -5958,18 +5925,17 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; 
GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 -; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX8-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -6032,8 +5998,9 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] @@ -6045,12 +6012,11 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v7 +; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6083,14 +6049,14 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX940-NEXT: v_max_f16_e32 v5, v6, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -6110,8 +6076,9 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-NEXT: flat_load_b32 v5, v[0:1] @@ -6123,12 +6090,11 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v5, v7 +; GFX11-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6152,6 +6118,7 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -6163,10 +6130,9 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v7 +; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6195,14 +6161,14 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc @@ -6229,14 +6195,14 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX908-NEXT: 
v_max_f16_e32 v7, v7, v7 -; GFX908-NEXT: v_max_f16_e32 v5, v7, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc @@ -6263,17 +6229,17 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-NEXT: v_max_f16_e32 v5, v7, v5 -; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -6336,10 +6302,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6349,12 +6316,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v7 +; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6389,14 +6355,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX940-NEXT: 
v_max_f16_e32 v5, v6, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -6417,10 +6383,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6430,12 +6397,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v5, v7 +; GFX11-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6460,9 +6426,10 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v5, v[0:1] ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6471,10 +6438,9 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v7 +; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6504,14 +6470,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: 
v_lshrrev_b32_e32 v6, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc @@ -6539,14 +6505,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX908-NEXT: v_max_f16_e32 v5, v7, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc @@ -6574,17 +6540,17 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-NEXT: v_max_f16_e32 v5, v7, v5 -; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -6649,10 +6615,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6662,12 +6629,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; 
GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v7 +; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6703,14 +6669,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX940-NEXT: v_max_f16_e32 v5, v6, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -6731,10 +6697,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6744,12 +6711,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v5, v7 +; GFX11-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6774,9 +6740,10 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v5, v[0:1] ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6785,10 +6752,9 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; 
GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v7 +; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6818,14 +6784,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc @@ -6853,14 +6819,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX908-NEXT: v_max_f16_e32 v5, v7, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc @@ -6888,17 +6854,17 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-NEXT: v_max_f16_e32 v5, v7, v5 -; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -6962,8 +6928,9 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; 
GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: flat_load_b32 v4, v[0:1] @@ -6975,10 +6942,9 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v7 +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 @@ -7012,13 +6978,13 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v6, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v7 +; GFX940-NEXT: v_max_f16_e32 v4, v4, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -7038,8 +7004,9 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-NEXT: flat_load_b32 v4, v[0:1] @@ -7051,10 +7018,9 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX11-NEXT: v_max_f16_e32 v3, v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 @@ -7079,6 +7045,7 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -7090,9 +7057,8 @@ define void 
@flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7121,13 +7087,13 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc @@ -7154,13 +7120,13 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc @@ -7187,16 +7153,16 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v7 -; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -7255,36 +7221,36 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; 
GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 +; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v7 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7302,29 +7268,29 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: flat_load_dword v3, v[0:1] +; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v7 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX940-NEXT: 
buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB30_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7334,37 +7300,37 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v4, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 +; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7376,31 +7342,31 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v5, v5 ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB30_1 @@ -7411,31 +7377,31 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v5, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc +; 
GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7445,31 +7411,31 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: flat_load_dword v4, v[0:1] -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v7 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7479,32 +7445,32 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v4, 3, 
v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v7 -; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7559,36 +7525,36 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 +; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v7 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; 
GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7607,29 +7573,29 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: flat_load_dword v3, v[0:1] +; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v7 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB31_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7639,37 +7605,37 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v4, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: 
v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 +; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7681,31 +7647,31 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v5, v5 ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; 
GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB31_1 @@ -7716,31 +7682,31 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v5, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7750,31 +7716,31 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: flat_load_dword v4, v[0:1] -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 
v4, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v7 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB31_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7784,32 +7750,32 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v7 -; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB31_1 ; 
GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7865,15 +7831,15 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_max_num_f16_e32 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v4, v4 -; GFX12-NEXT: v_max_num_f16_e32 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v4 +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 @@ -7898,14 +7864,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f16_e32 v3, v3, v4 +; GFX940-NEXT: v_max_f16_e32 v3, v3, v2 ; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 @@ -7924,15 +7890,15 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_max_f16_e32 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX11-NEXT: v_max_f16_e32 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v3, v4, v4 +; GFX11-NEXT: v_max_f16_e32 v3, v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 @@ -7956,23 +7922,23 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_max_f16_e32 v0, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v5, v1, v1 -; GFX10-NEXT: 
v_max_f16_e32 v0, v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_max_f16_e32 v0, v6, v6 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 +; GFX10-NEXT: v_and_or_b32 v5, 0xffff0000, v6, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB32_1 @@ -7985,14 +7951,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v3, v3, v4 +; GFX90A-NEXT: v_max_f16_e32 v3, v3, v2 ; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8011,14 +7977,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v2 ; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8039,19 +8005,19 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v0, v1, v1 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX8-NEXT: v_or_b32_e32 v0, v6, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_max_f16_e32 v0, v6, v6 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v5, v2, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], 
vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB32_1 @@ -8101,24 +8067,24 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2046 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_max_num_f16_e32 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f16_e32 v5, v4, v4 +; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v5, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v4 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -8132,23 +8098,23 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2046 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f16_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f16_e32 v3, v4, v3 -; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v4 +; GFX940-NEXT: v_and_or_b32 v2, v3, s2, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB33_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8158,25 +8124,25 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2046 +; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX11-NEXT: v_max_f16_e32 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v5, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v4 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8190,23 +8156,23 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX10-NEXT: v_max_f16_e32 v3, v5, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX10-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB33_1 @@ -8217,22 +8183,22 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2046 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f16_e32 
v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f16_e32 v4, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v3, v4, v3 -; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc +; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v4 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8242,22 +8208,22 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2046 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX908-NEXT: v_max_f16_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc +; GFX908-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v4 +; GFX908-NEXT: v_and_or_b32 v2, v3, s6, v2 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8269,22 +8235,22 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX8-NEXT: v_max_f16_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v4 +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 
s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8334,10 +8300,11 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -8347,12 +8314,11 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v7 +; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -8388,14 +8354,14 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX940-NEXT: v_max_f16_e32 v5, v6, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 @@ -8416,10 +8382,11 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -8429,12 +8396,11 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v5, v7 +; GFX11-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -8459,9 +8425,10 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v5, v[0:1] ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -8470,10 +8437,9 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v7 +; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8503,14 +8469,14 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: buffer_wbl2 @@ -8540,14 +8506,14 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX908-NEXT: v_max_f16_e32 v5, v7, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, 
v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc @@ -8575,17 +8541,17 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-NEXT: v_max_f16_e32 v5, v7, v5 -; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -8648,37 +8614,37 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 +; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v7 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; 
GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -8696,29 +8662,29 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: flat_load_dword v3, v[0:1] +; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v7 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB35_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8728,37 +8694,37 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v4, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 +; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .LBB35_1: ; 
%atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8770,31 +8736,31 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v5, v5 ; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB35_1 @@ -8805,33 +8771,33 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v5, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8841,31 +8807,31 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: flat_load_dword v4, v[0:1] -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 
v6, v2, v2 ; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v7 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB35_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8875,32 +8841,32 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v7 -; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12308,15 +12274,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_pk_max_num_f16 v2, 
v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12338,14 +12304,14 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v3, v4 +; GFX940-NEXT: v_pk_max_f16 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12363,15 +12329,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12391,14 +12357,14 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12418,13 +12384,13 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: 
v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v3, v4 +; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -12442,13 +12408,13 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v5 +; GFX908-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -12466,21 +12432,21 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v3, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v6, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v3, v7, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v5 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_max_f16_sdwa v3, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_max_f16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB46_1 @@ -12544,15 +12510,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12574,14 +12540,14 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v3, v4 +; GFX940-NEXT: v_pk_max_f16 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12599,15 +12565,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12628,21 +12594,21 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v5, v1, v1 -; GFX10-NEXT: v_pk_max_f16 v0, v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v6, v6 +; GFX10-NEXT: v_pk_max_f16 v5, v0, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz 
.LBB47_1 @@ -12655,13 +12621,13 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v3, v4 +; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -12679,13 +12645,13 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v5 +; GFX908-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -12705,21 +12671,21 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 -; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v0, v7, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB47_1 @@ -12783,15 +12749,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] 
offset:-2048 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12821,19 +12787,19 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX940-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_pk_max_f16 v0, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v0, v0, v3 +; GFX940-NEXT: v_pk_max_f16 v2, v0, v1 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB48_1 @@ -12852,20 +12818,20 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 ; GFX11-NEXT: flat_load_b32 v0, v[4:5] ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-NEXT: v_pk_max_f16 v0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v5, v1, v1 -; GFX11-NEXT: v_pk_max_f16 v0, v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v0, v6, v6 +; GFX11-NEXT: v_pk_max_f16 v5, v0, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -12879,21 +12845,21 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, 
v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v5, v1, v1 -; GFX10-NEXT: v_pk_max_f16 v0, v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v6, v6 +; GFX10-NEXT: v_pk_max_f16 v5, v0, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB48_1 @@ -12910,17 +12876,17 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX90A-NEXT: v_pk_max_f16 v0, v0, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_pk_max_f16 v0, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v0, v1 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 @@ -12937,17 +12903,17 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 -; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX908-NEXT: v_pk_max_f16 v0, v0, v5 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v0 +; GFX908-NEXT: v_pk_max_f16 v0, v6, v6 +; GFX908-NEXT: v_pk_max_f16 v5, v0, v1 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB48_1 @@ -12962,21 +12928,21 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 -; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v0, v7, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB48_1 @@ -13039,21 +13005,21 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13067,22 +13033,22 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: 
v_pk_max_f16 v4, v4, v3 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB49_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13092,22 +13058,22 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -13119,21 +13085,21 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX10-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB49_1 @@ -13144,20 +13110,20 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: 
flat_load_dword v5, v[0:1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v4, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13167,20 +13133,20 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13190,24 +13156,24 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX8-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 +; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13266,21 +13232,21 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13294,22 +13260,22 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v4, v3 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB50_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13319,22 +13285,22 @@ define void 
@flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -13348,21 +13314,21 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB50_1 @@ -13373,20 +13339,20 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v4, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, 
v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13396,20 +13362,20 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13421,24 +13387,24 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 +; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: 
v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13500,21 +13466,21 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13532,24 +13498,24 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX940-NEXT: s_movk_i32 s0, 0xf800 ; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v5, v[4:5] +; GFX940-NEXT: flat_load_dword v3, v[4:5] ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v4, v3 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB51_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13563,22 +13529,22 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: flat_load_b32 v4, v[3:4] +; GFX11-NEXT: flat_load_b32 v3, v[3:4] +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -13592,21 +13558,21 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB51_1 @@ -13623,12 +13589,12 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v0, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 -; GFX90A-NEXT: v_pk_max_f16 v0, v3, v0 +; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX90A-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -13650,12 +13616,12 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v0, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v5, v1, v1 -; GFX908-NEXT: v_pk_max_f16 v0, v5, v0 +; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX908-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX908-NEXT: 
flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -13673,24 +13639,24 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 +; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13753,15 +13719,15 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS @@ -13784,14 +13750,14 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v3, v4 +; GFX940-NEXT: v_pk_max_f16 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13809,15 +13775,15 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13838,21 +13804,21 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v5, v1, v1 -; GFX10-NEXT: v_pk_max_f16 v0, v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v6, v6 +; GFX10-NEXT: v_pk_max_f16 v5, v0, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB52_1 @@ -13865,13 +13831,13 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v3, v4 +; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13891,13 +13857,13 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v5 +; GFX908-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -13917,21 +13883,21 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 -; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v0, v7, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB52_1 @@ -13994,22 +13960,22 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; 
GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14023,22 +13989,22 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX940-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v4, v3 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB53_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14048,22 +14014,22 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -14077,21 +14043,21 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: 
.LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB53_1 @@ -14102,22 +14068,22 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v4, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14127,20 +14093,20 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, 
s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14152,24 +14118,24 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 +; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14236,36 +14202,38 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: 
v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14282,35 +14250,35 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; 
GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB54_1 @@ -14323,39 +14291,41 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_max_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14370,34 +14340,34 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_max_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB54_1 @@ -14411,33 +14381,33 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 
16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 @@ -14451,33 +14421,33 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; 
GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB54_1 @@ -14491,34 +14461,34 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB54_1 @@ -14578,36 +14548,38 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 
v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14624,35 +14596,35 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_max_f32_e32 v3, v7, v3 -; 
GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB55_1 @@ -14665,39 +14637,41 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_max_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14713,35 +14687,35 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX10-NEXT: v_max_f32_e32 v0, v6, v0 -; GFX10-NEXT: v_max_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB55_1 @@ -14754,33 +14728,33 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 @@ -14794,33 +14768,33 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 
v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB55_1 @@ -14836,34 +14810,34 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_max_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_max_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 
0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB55_1 @@ -14923,36 +14897,38 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14977,35 +14953,35 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX940-NEXT: v_max_f32_e32 v3, v6, v3 -; GFX940-NEXT: v_max_f32_e32 v0, v7, v0 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v0, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX940-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX940-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX940-NEXT: v_add3_u32 v6, v6, v0, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v3, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v0, v3, v0, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v3, v0, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB56_1 @@ -15024,38 +15000,40 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 ; GFX11-NEXT: flat_load_b32 v0, v[4:5] ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_lshlrev_b32 
v0, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: v_dual_max_f32 v0, v6, v0 :: v_dual_and_b32 v7, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v5, v7, v5 -; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v0 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15070,35 +15048,35 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX10-NEXT: v_max_f32_e32 v0, v6, v0 -; GFX10-NEXT: v_max_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, 
v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB56_1 @@ -15115,33 +15093,33 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX90A-NEXT: v_max_f32_e32 v3, v6, v3 -; GFX90A-NEXT: v_max_f32_e32 v0, v7, v0 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v0, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX90A-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5] +; GFX90A-NEXT: 
v_cndmask_b32_e32 v3, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 @@ -15158,33 +15136,33 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX908-NEXT: v_max_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_max_f32_e32 v0, v7, v0 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v0, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX908-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v0, s9 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB56_1 @@ -15199,34 +15177,34 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 
0xffff0000, v1 -; GFX8-NEXT: v_max_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_max_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB56_1 @@ -15285,38 +15263,38 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_max_num_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15330,38 +15308,38 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; 
GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB57_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15371,41 +15349,41 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: 
s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15418,35 +15396,35 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX10-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB57_1 @@ -15457,36 +15435,36 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: 
v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15496,36 +15474,36 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX908-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: 
v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15535,37 +15513,37 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX8-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 
v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15620,38 +15598,38 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_max_num_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15665,38 +15643,38 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; 
GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB58_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15706,41 +15684,41 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_max_f32_e32 v6, v6, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15755,35 +15733,35 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 
0xffff0000, v3 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB58_1 @@ -15794,36 +15772,36 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15833,36 +15811,36 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB58_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15874,37 +15852,37 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX8-NEXT: 
; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB58_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15962,38 +15940,38 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_max_num_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 
s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16011,40 +15989,40 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX940-NEXT: s_movk_i32 s0, 0xf800 ; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v5, v[4:5] +; GFX940-NEXT: flat_load_dword v3, v[4:5] ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 
v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB59_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16058,41 +16036,41 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: flat_load_b32 v4, v[3:4] +; GFX11-NEXT: flat_load_b32 v3, v[3:4] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 
v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16107,35 +16085,35 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB59_1 @@ -16152,28 +16130,28 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; 
GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX90A-NEXT: v_max_f32_e32 v0, v3, v0 -; GFX90A-NEXT: v_max_f32_e32 v3, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX90A-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -16195,28 +16173,28 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v0 -; GFX908-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX908-NEXT: v_max_f32_e32 v0, v0, v5 +; GFX908-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX908-NEXT: 
v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v0, v6, v0, s9 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -16234,37 +16212,37 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB59_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16323,37 +16301,39 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: 
v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16370,35 +16350,35 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX940-NEXT: 
v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB60_1 @@ -16411,39 +16391,41 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_max_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16459,35 +16441,35 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX10-NEXT: v_max_f32_e32 v0, v6, v0 -; GFX10-NEXT: v_max_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB60_1 @@ -16500,35 +16482,35 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 @@ -16542,33 +16524,33 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; 
GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB60_1 @@ -16584,34 +16566,34 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_max_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_max_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; 
GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB60_1 @@ -16670,39 +16652,39 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_max_num_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN 
scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16716,38 +16698,38 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX940-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB61_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16757,41 +16739,41 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX11-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: 
v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16806,35 +16788,35 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 
0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB61_1 @@ -16845,38 +16827,38 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 
v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16886,36 +16868,36 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX908-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; 
GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB61_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16927,37 +16909,37 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB61_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index daa3df680e5ca..d96d3db9f005d 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -31,13 +31,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; 
GFX940-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -76,13 +76,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -100,13 +100,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -173,13 +173,13 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -220,13 +220,13 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -244,13 +244,13 @@ define float 
@flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -329,18 +329,18 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX940-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX940-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_max_f32_e32 v0, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v0, v1 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB2_1 @@ -381,17 +381,17 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 @@ -408,17 +408,17 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX908-NEXT: v_min_f32_e32 v0, v0, v5 -; 
GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v6, v6 +; GFX908-NEXT: v_min_f32_e32 v5, v0, v1 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_1 @@ -482,21 +482,21 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB3_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -528,20 +528,20 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -551,20 +551,20 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 
v4, v2, v2 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -622,21 +622,21 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB4_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -670,20 +670,20 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; 
GFX90A-NEXT: s_cbranch_execnz .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -693,20 +693,20 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -773,23 +773,23 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX940-NEXT: s_movk_i32 s0, 0xf800 ; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v5, v[4:5] +; GFX940-NEXT: flat_load_dword v3, v[4:5] ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB5_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -831,12 +831,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1 -; GFX90A-NEXT: v_min_f32_e32 v0, v3, v0 +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: 
buffer_wbinvl1 @@ -858,12 +858,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v1, v1 -; GFX908-NEXT: v_min_f32_e32 v0, v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -935,13 +935,13 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -982,13 +982,13 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1008,13 +1008,13 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1084,21 +1084,21 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX940-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; 
GFX940-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB7_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1132,22 +1132,22 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1157,20 +1157,20 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; 
GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1235,13 +1235,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1259,14 +1259,15 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_max_f32 v3, v2, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX11-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX11-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1286,14 +1287,14 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX10-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1313,13 +1314,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1337,13 +1338,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: 
flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1426,13 +1427,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1471,13 +1472,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1495,13 +1496,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1572,13 +1573,13 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) @@ -1617,13 +1618,13 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1641,13 +1642,13 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1714,13 +1715,13 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1761,13 +1762,13 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1785,13 +1786,13 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: 
v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1870,18 +1871,18 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX940-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX940-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_max_f32_e32 v0, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v0, v1 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB12_1 @@ -1922,17 +1923,17 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 @@ -1949,17 +1950,17 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX908-NEXT: v_min_f32_e32 v0, v0, v5 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v6, v6 +; GFX908-NEXT: v_min_f32_e32 v5, v0, v1 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, 
s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 @@ -2023,21 +2024,21 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2069,20 +2070,20 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2092,20 +2093,20 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, 
v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2163,21 +2164,21 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2211,20 +2212,20 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2234,20 +2235,20 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] 
offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2314,23 +2315,23 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX940-NEXT: s_movk_i32 s0, 0xf800 ; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v5, v[4:5] +; GFX940-NEXT: flat_load_dword v3, v[4:5] ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB15_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2372,12 +2373,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1 -; GFX90A-NEXT: v_min_f32_e32 v0, v3, v0 +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -2399,12 +2400,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; 
GFX908-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v1, v1 -; GFX908-NEXT: v_min_f32_e32 v0, v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2476,13 +2477,13 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2523,13 +2524,13 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2549,13 +2550,13 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2625,21 +2626,21 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX940-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2673,22 +2674,22 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2698,20 +2699,20 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX908-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2769,29 +2770,29 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB18_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -2800,25 +2801,24 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: .LBB18_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB18_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2865,29 +2865,29 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: 
s_mov_b32 s0, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execz .LBB18_4 ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -2895,23 +2895,22 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: ; %bb.3: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: .LBB18_4: ; %Flow2 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB18_6 ; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX11-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -3001,91 +3000,90 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execz 
.LBB18_4 ; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v3 +; GFX908-NEXT: v_mov_b32_e32 v8, v2 +; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX908-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB18_2 ; GFX908-NEXT: ; %bb.3: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX908-NEXT: .LBB18_4: ; %Flow2 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB18_6 ; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB18_4 ; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v5, v[4:5] -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] ; 
GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX8-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB18_2 ; GFX8-NEXT: ; %bb.3: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: .LBB18_4: ; %Flow2 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB18_6 @@ -3093,18 +3091,17 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 -; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX8-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -3162,6 +3159,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base @@ -3188,10 +3186,9 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[6:7], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[0:1], v[2:3] ; GFX12-NEXT: 
s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3210,7 +3207,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB19_2 ; GFX12-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3266,6 +3262,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base @@ -3288,10 +3285,9 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] -; GFX11-NEXT: v_min_f64 v[6:7], v[6:7], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX11-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3310,7 +3306,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: s_cbranch_execz .LBB19_2 ; GFX11-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX11-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3408,6 +3403,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base @@ -3430,9 +3426,8 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v1 ; GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[6:7] +; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] ; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3448,7 +3443,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_cbranch_execz .LBB19_2 ; GFX908-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 
-; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 @@ -3464,6 +3458,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 @@ -3491,9 +3486,8 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[6:7] +; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3509,7 +3503,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_cbranch_execz .LBB19_2 ; GFX8-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 ; GFX8-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen @@ -3578,6 +3571,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base @@ -3604,10 +3598,9 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[6:7], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[0:1], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3626,7 +3619,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB20_2 ; GFX12-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3683,6 +3675,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base @@ -3705,10 +3698,9 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] -; GFX11-NEXT: v_min_f64 v[6:7], v[6:7], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX11-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3727,7 +3719,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: s_cbranch_execz .LBB20_2 ; GFX11-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX11-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3825,6 +3816,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base @@ -3847,9 +3839,8 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v1 ; GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[6:7] +; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] ; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3865,7 +3856,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_cbranch_execz .LBB20_2 ; GFX908-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 @@ -3881,6 +3871,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 @@ -3908,9 +3899,8 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[6:7] +; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3926,7 +3916,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_cbranch_execz .LBB20_2 ; GFX8-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 ; GFX8-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen @@ -3995,6 +3984,7 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4011,21 +4001,20 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4034,19 +4023,18 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB21_2 ; GFX12-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: 
v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[6:7] +; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4092,6 +4080,7 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 @@ -4104,22 +4093,21 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -4127,19 +4115,18 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: ; %bb.5: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB21_2 ; GFX11-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4225,6 +4212,7 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: 
v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -4237,40 +4225,38 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; GFX908-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB21_4 ; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB21_2 ; GFX908-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -4278,6 +4264,7 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -4292,44 +4279,42 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; GFX8-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; GFX8-NEXT: 
v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v7, v[4:5] -; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v5, v[2:3] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB21_4 ; GFX8-NEXT: ; %bb.5: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB21_2 ; GFX8-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 -; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 +; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4385,13 +4370,14 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB22_3 ; GFX12-NEXT: ; %bb.1: ; %Flow2 @@ -4404,21 +4390,20 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: 
s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4426,20 +4411,19 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB22_4 ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB22_2 ; GFX12-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4487,12 +4471,13 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB22_3 ; GFX11-NEXT: ; %bb.1: ; %Flow2 @@ -4502,42 +4487,40 @@ define void 
@flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_cbranch_execnz .LBB22_4 ; GFX11-NEXT: ; %bb.5: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4627,10 +4610,11 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0x7f8, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_3 @@ -4641,40 +4625,38 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; GFX908-NEXT: 
.LBB22_3: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB22_4 ; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB22_2 ; GFX908-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -4682,12 +4664,13 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7f8, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v7 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_3 @@ -4698,44 +4681,42 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; 
GFX8-NEXT: s_setpc_b64 s[30:31] ; GFX8-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v7, v[4:5] -; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v6 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB22_4 ; GFX8-NEXT: ; %bb.5: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB22_2 ; GFX8-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 -; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 +; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4794,13 +4775,14 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 ; 
GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB23_3 ; GFX12-NEXT: ; %bb.1: ; %Flow2 @@ -4813,21 +4795,20 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4835,20 +4816,19 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB23_4 ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB23_2 ; GFX12-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4897,12 +4877,13 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo ; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB23_3 ; GFX11-NEXT: ; %bb.1: ; %Flow2 @@ -4912,42 +4893,40 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_cbranch_execnz .LBB23_4 ; GFX11-NEXT: ; %bb.5: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB23_2 ; GFX11-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -5037,10 +5016,11 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 
s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_3 @@ -5051,40 +5031,38 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; GFX908-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB23_4 ; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB23_2 ; GFX908-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -5092,12 +5070,13 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v7 ; 
GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_3 @@ -5108,44 +5087,42 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; GFX8-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v7, v[4:5] -; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v6 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB23_4 ; GFX8-NEXT: ; %bb.5: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB23_2 ; GFX8-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 -; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 +; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -5204,29 +5181,29 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_wait_alu 0xfffe 
; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB24_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5235,25 +5212,24 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: .LBB24_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB24_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5300,29 +5276,29 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execz .LBB24_4 ; GFX11-NEXT: ; %bb.1: ; 
%atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -5330,78 +5306,76 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX11-NEXT: ; %bb.3: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: .LBB24_4: ; %Flow2 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB24_6 ; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX11-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 ; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execz .LBB24_4 ; GFX10-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX10-NEXT: 
v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX10-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v9, v3 +; GFX10-NEXT: v_mov_b32_e32 v8, v2 +; GFX10-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX10-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX10-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB24_2 ; GFX10-NEXT: ; %bb.3: ; %Flow ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX10-NEXT: .LBB24_4: ; %Flow2 ; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 ; GFX10-NEXT: s_cbranch_execz .LBB24_6 ; GFX10-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX10-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX10-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX10-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX10-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX10-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: @@ -5409,140 +5383,138 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB24_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX90A-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; 
GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB24_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB24_4: ; %Flow2 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB24_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX90A-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX90A-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB24_4 ; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v3 +; GFX908-NEXT: v_mov_b32_e32 v8, v2 +; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX908-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB24_2 ; GFX908-NEXT: ; %bb.3: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; 
GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX908-NEXT: .LBB24_4: ; %Flow2 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB24_6 ; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB24_4 ; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v5, v[4:5] -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX8-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB24_2 ; GFX8-NEXT: ; %bb.3: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: .LBB24_4: ; %Flow2 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; 
GFX8-NEXT: s_cbranch_execz .LBB24_6 @@ -5550,18 +5522,17 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 -; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX8-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5570,37 +5541,37 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX7-NEXT: s_cbranch_execz .LBB24_4 ; GFX7-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v5, v[4:5] -; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v3, v[2:3] +; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX7-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX7-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX7-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB24_2 ; GFX7-NEXT: ; %bb.3: ; %Flow ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX7-NEXT: .LBB24_4: ; %Flow2 ; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_cbranch_execz .LBB24_6 @@ -5608,18 +5579,17 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 
0, v[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 -; GFX7-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX7-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen -; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX7-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX7-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX7-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX7-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 @@ -5634,29 +5604,29 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB25_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5665,25 +5635,24 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: .LBB25_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB25_6 ; GFX12-NEXT: ; %bb.5: ; 
%atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5730,29 +5699,29 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execz .LBB25_4 ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -5760,23 +5729,22 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX11-NEXT: ; %bb.3: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: .LBB25_4: ; %Flow2 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB25_6 ; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX11-NEXT: 
scratch_load_b64 v[4:5], v6, off +; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX11-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -5866,91 +5834,90 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB25_4 ; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v3 +; GFX908-NEXT: v_mov_b32_e32 v8, v2 +; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX908-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB25_2 ; GFX908-NEXT: ; %bb.3: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX908-NEXT: .LBB25_4: ; %Flow2 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB25_6 ; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] 
; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB25_4 ; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v5, v[4:5] -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX8-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB25_2 ; GFX8-NEXT: ; %bb.3: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: .LBB25_4: ; %Flow2 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB25_6 @@ -5958,18 +5925,17 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 -; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX8-NEXT: 
.LBB25_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -6032,8 +5998,9 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] @@ -6045,12 +6012,11 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v7 +; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6083,14 +6049,14 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX940-NEXT: v_min_f16_e32 v5, v6, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX940-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -6110,8 +6076,9 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-NEXT: flat_load_b32 v5, v[0:1] @@ -6123,12 +6090,11 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v5, v5, v7 +; GFX11-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6152,6 +6118,7 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -6163,10 +6130,9 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_min_f16_e32 v5, v5, v7 +; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6195,14 +6161,14 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc @@ -6229,14 +6195,14 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX908-NEXT: v_min_f16_e32 v5, v7, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc @@ -6263,17 +6229,17 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-NEXT: v_min_f16_e32 v5, v7, v5 -; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -6336,10 +6302,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6349,12 +6316,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v7 +; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6389,14 +6355,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX940-NEXT: v_min_f16_e32 v5, v6, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX940-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -6417,10 +6383,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6430,12 +6397,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v5, v5, v7 +; GFX11-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6460,9 +6426,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v5, v[0:1] ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6471,10 +6438,9 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_min_f16_e32 v5, v5, v7 +; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6504,14 +6470,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc @@ -6539,14 +6505,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB27_1: ; 
%atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX908-NEXT: v_min_f16_e32 v5, v7, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc @@ -6574,17 +6540,17 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-NEXT: v_min_f16_e32 v5, v7, v5 -; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -6649,10 +6615,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6662,12 +6629,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v7 +; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6703,14 +6669,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX940-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX940-NEXT: v_min_f16_e32 v5, v6, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX940-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -6731,10 +6697,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6744,12 +6711,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v5, v5, v7 +; GFX11-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6774,9 +6740,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v5, v[0:1] ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6785,10 +6752,9 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_min_f16_e32 v5, v5, v7 +; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6818,14 +6784,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; 
GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc @@ -6853,14 +6819,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX908-NEXT: v_min_f16_e32 v5, v7, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc @@ -6888,17 +6854,17 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-NEXT: v_min_f16_e32 v5, v7, v5 -; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -6962,8 +6928,9 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: flat_load_b32 v4, v[0:1] @@ -6975,10 +6942,9 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v7 +; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 @@ -7012,13 +6978,13 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v6, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_min_f16_e32 v4, v4, v7 +; GFX940-NEXT: v_min_f16_e32 v4, v4, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -7038,8 +7004,9 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-NEXT: flat_load_b32 v4, v[0:1] @@ -7051,10 +7018,9 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX11-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 @@ -7079,6 +7045,7 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -7090,9 +7057,8 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX10-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7121,13 +7087,13 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: 
s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc @@ -7154,13 +7120,13 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX908-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc @@ -7187,16 +7153,16 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 +; GFX8-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -7255,36 +7221,36 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 +; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: 
v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v7 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7302,29 +7268,29 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: flat_load_dword v3, v[0:1] +; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_min_f16_e32 v4, v4, v7 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX940-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB30_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7334,37 +7300,37 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v4, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 +; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7376,31 +7342,31 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v5, v5 ; 
GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB30_1 @@ -7411,31 +7377,31 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v5, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7445,31 +7411,31 @@ define void 
@flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: flat_load_dword v4, v[0:1] -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX908-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7479,32 +7445,32 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v8, 
v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX8-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7559,36 +7525,36 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 +; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v7 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7607,29 +7573,29 @@ define void 
@flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: flat_load_dword v3, v[0:1] +; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_min_f16_e32 v4, v4, v7 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX940-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB31_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7639,37 +7605,37 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v4, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 +; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 
v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7681,31 +7647,31 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v5, v5 ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB31_1 @@ -7716,31 +7682,31 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-LABEL: 
flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v5, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7750,31 +7716,31 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: flat_load_dword v4, v[0:1] -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: 
flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX908-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB31_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7784,32 +7750,32 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX8-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7865,15 +7831,15 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_max_num_f16_e32 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v4, v4 -; GFX12-NEXT: 
v_min_num_f16_e32 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v4 +; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 @@ -7898,14 +7864,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f16_e32 v3, v3, v4 +; GFX940-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 @@ -7924,15 +7890,15 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_max_f16_e32 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX11-NEXT: v_min_f16_e32 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v3, v4, v4 +; GFX11-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 @@ -7956,23 +7922,23 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_max_f16_e32 v0, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v5, v1, v1 -; GFX10-NEXT: v_min_f16_e32 v0, v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_max_f16_e32 v0, v6, v6 +; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 +; GFX10-NEXT: v_and_or_b32 v5, 0xffff0000, v6, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: 
s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB32_1 @@ -7985,14 +7951,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v3, v3, v4 +; GFX90A-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8011,14 +7977,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f16_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8039,19 +8005,19 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v0, v1, v1 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX8-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX8-NEXT: v_or_b32_e32 v0, v6, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_max_f16_e32 v0, v6, v6 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v5, v2, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB32_1 @@ -8101,24 +8067,24 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2046 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_max_num_f16_e32 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f16_e32 v5, v4, 
v4 +; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v3, v5, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v4 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -8132,23 +8098,23 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2046 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f16_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f16_e32 v3, v4, v3 -; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f16_e32 v2, v2, v4 +; GFX940-NEXT: v_and_or_b32 v2, v3, s2, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB33_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8158,25 +8124,25 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2046 +; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX11-NEXT: v_max_f16_e32 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v3, v5, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_min_f16_e32 v2, v2, v4 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8190,23 +8156,23 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX10-NEXT: v_min_f16_e32 v3, v5, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX10-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX10-NEXT: v_min_f16_e32 v2, v2, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB33_1 @@ -8217,22 +8183,22 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2046 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f16_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v3, v4, v3 -; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc +; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f16_e32 v2, v2, v4 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8242,22 +8208,22 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2046 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f16_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc +; GFX908-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f16_e32 v2, v2, v4 +; GFX908-NEXT: v_and_or_b32 v2, v3, s6, v2 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8269,22 +8235,22 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX8-NEXT: v_min_f16_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_min_f16_e32 v2, v2, v4 +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8334,10 +8300,11 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | 
instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -8347,12 +8314,11 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v7 +; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -8388,14 +8354,14 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX940-NEXT: v_min_f16_e32 v5, v6, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX940-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 @@ -8416,10 +8382,11 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -8429,12 +8396,11 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v5, v5, v7 +; GFX11-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -8459,9 +8425,10 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v5, v[0:1] ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -8470,10 +8437,9 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_min_f16_e32 v5, v5, v7 +; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8503,14 +8469,14 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: buffer_wbl2 @@ -8540,14 +8506,14 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX908-NEXT: v_min_f16_e32 v5, v7, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc @@ -8575,17 +8541,17 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-NEXT: 
v_min_f16_e32 v5, v7, v5 -; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -8648,37 +8614,37 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 +; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v7 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -8696,29 +8662,29 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: flat_load_dword v3, v[0:1] +; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_min_f16_e32 v4, v4, v7 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX940-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB35_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8728,37 +8694,37 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v4, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 +; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8770,31 +8736,31 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v5, v5 ; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB35_1 @@ -8805,33 +8771,33 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 
v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v5, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8841,31 +8807,31 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: flat_load_dword v4, v[0:1] -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX908-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 +; 
GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB35_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8875,32 +8841,32 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX8-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12308,15 +12274,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12338,14 +12304,14 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v3, v4 +; GFX940-NEXT: v_pk_min_f16 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12363,15 +12329,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12391,14 +12357,14 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12418,13 +12384,13 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v3, v4 +; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -12442,13 +12408,13 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword 
v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v5 +; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -12466,21 +12432,21 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v3, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v6, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v3, v7, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v5 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_max_f16_sdwa v3, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_min_f16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB46_1 @@ -12544,15 +12510,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12574,14 +12540,14 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: 
flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v3, v4 +; GFX940-NEXT: v_pk_min_f16 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12599,15 +12565,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12628,21 +12594,21 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v5, v1, v1 -; GFX10-NEXT: v_pk_min_f16 v0, v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v6, v6 +; GFX10-NEXT: v_pk_min_f16 v5, v0, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB47_1 @@ -12655,13 +12621,13 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v3, v4 +; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] 
offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -12679,13 +12645,13 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v5 +; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -12705,21 +12671,21 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 -; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v0, v7, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB47_1 @@ -12783,15 +12749,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: 
flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12821,19 +12787,19 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX940-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_pk_max_f16 v0, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v0, v0, v3 +; GFX940-NEXT: v_pk_min_f16 v2, v0, v1 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB48_1 @@ -12852,20 +12818,20 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 ; GFX11-NEXT: flat_load_b32 v0, v[4:5] ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-NEXT: v_pk_max_f16 v0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v5, v1, v1 -; GFX11-NEXT: v_pk_min_f16 v0, v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v0, v6, v6 +; GFX11-NEXT: v_pk_min_f16 v5, v0, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -12879,21 +12845,21 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v5, v1, v1 -; GFX10-NEXT: v_pk_min_f16 v0, v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v6, v6 +; GFX10-NEXT: v_pk_min_f16 v5, v0, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; 
GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB48_1 @@ -12910,17 +12876,17 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX90A-NEXT: v_pk_min_f16 v0, v0, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_pk_max_f16 v0, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v0, v1 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 @@ -12937,17 +12903,17 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 -; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX908-NEXT: v_pk_min_f16 v0, v0, v5 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v0 +; GFX908-NEXT: v_pk_max_f16 v0, v6, v6 +; GFX908-NEXT: v_pk_min_f16 v5, v0, v1 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB48_1 @@ -12962,21 +12928,21 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 -; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: 
v_min_f16_e32 v0, v7, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB48_1 @@ -13039,21 +13005,21 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 +; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13067,22 +13033,22 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v4, v3 +; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB49_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13092,22 +13058,22 @@ define void 
@flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -13119,21 +13085,21 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX10-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB49_1 @@ -13144,20 +13110,20 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v4, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13167,20 +13133,20 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13190,24 +13156,24 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX8-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 +; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; 
GFX8-NEXT: s_cbranch_execnz .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13266,21 +13232,21 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 +; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13294,22 +13260,22 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v4, v3 +; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB50_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13319,22 +13285,22 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v2, 
v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -13348,21 +13314,21 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB50_1 @@ -13373,20 +13339,20 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v4, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13396,20 +13362,20 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: 
flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13421,24 +13387,24 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 +; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13500,21 +13466,21 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; 
GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 +; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13532,24 +13498,24 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX940-NEXT: s_movk_i32 s0, 0xf800 ; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v5, v[4:5] +; GFX940-NEXT: flat_load_dword v3, v[4:5] ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v4, v3 +; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB51_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13563,22 +13529,22 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: flat_load_b32 v4, v[3:4] +; GFX11-NEXT: flat_load_b32 v3, v[3:4] +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: 
v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -13592,21 +13558,21 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB51_1 @@ -13623,12 +13589,12 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v0, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 -; GFX90A-NEXT: v_pk_min_f16 v0, v3, v0 +; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX90A-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -13650,12 +13616,12 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v0, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v5, v1, v1 -; GFX908-NEXT: v_pk_min_f16 v0, v5, v0 +; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX908-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -13673,24 +13639,24 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: 
v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 +; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13753,15 +13719,15 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS @@ -13784,14 +13750,14 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v3, v4 +; GFX940-NEXT: v_pk_min_f16 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13809,15 +13775,15 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: 
v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13838,21 +13804,21 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v5, v1, v1 -; GFX10-NEXT: v_pk_min_f16 v0, v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v6, v6 +; GFX10-NEXT: v_pk_min_f16 v5, v0, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB52_1 @@ -13865,13 +13831,13 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v3, v4 +; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13891,13 +13857,13 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v5 +; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ 
-13917,21 +13883,21 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 -; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v0, v7, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB52_1 @@ -13994,22 +13960,22 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 +; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14023,22 +13989,22 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX940-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX940-NEXT: 
flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v4, v3 +; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB53_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14048,22 +14014,22 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -14077,21 +14043,21 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, 
v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB53_1 @@ -14102,22 +14068,22 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v4, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14127,20 +14093,20 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14152,24 +14118,24 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: 
.LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 +; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14236,36 +14202,38 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: 
v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14282,35 +14250,35 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: 
s_cbranch_execnz .LBB54_1 @@ -14323,39 +14291,41 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_min_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14370,34 +14340,34 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; 
GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_min_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB54_1 @@ -14411,33 +14381,33 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: 
v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 @@ -14451,33 +14421,33 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB54_1 @@ -14491,34 +14461,34 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX8-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB54_1 @@ -14578,36 +14548,38 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: 
v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14624,35 +14596,35 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: 
v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB55_1 @@ -14665,39 +14637,41 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_min_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14713,35 +14687,35 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX10-NEXT: v_min_f32_e32 v0, v6, v0 -; GFX10-NEXT: v_min_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB55_1 @@ -14754,33 +14728,33 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; 
GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 @@ -14794,33 +14768,33 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 
v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB55_1 @@ -14836,34 +14810,34 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_min_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_min_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: 
v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB55_1 @@ -14923,36 +14897,38 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14977,35 +14953,35 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX940-NEXT: 
s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX940-NEXT: v_min_f32_e32 v3, v6, v3 -; GFX940-NEXT: v_min_f32_e32 v0, v7, v0 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v0, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX940-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX940-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX940-NEXT: v_add3_u32 v6, v6, v0, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v3, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v0, v3, v0, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v3, v0, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB56_1 @@ -15024,38 +15000,40 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 ; GFX11-NEXT: flat_load_b32 v0, v[4:5] ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_lshlrev_b32 v0, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: v_dual_min_f32 v0, v6, v0 :: v_dual_and_b32 v7, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v5, v7, v5 -; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: 
v_mov_b32_e32 v6, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v0 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15070,35 +15048,35 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX10-NEXT: v_min_f32_e32 v0, v6, v0 -; GFX10-NEXT: v_min_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: 
v_add3_u32 v7, v7, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB56_1 @@ -15115,33 +15093,33 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX90A-NEXT: v_min_f32_e32 v3, v6, v3 -; GFX90A-NEXT: v_min_f32_e32 v0, v7, v0 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v0, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX90A-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 @@ -15158,33 +15136,33 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v0, 
v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX908-NEXT: v_min_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_min_f32_e32 v0, v7, v0 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v0, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v0, s9 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB56_1 @@ -15199,34 +15177,34 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_min_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_min_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; 
GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB56_1 @@ -15285,38 +15263,38 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_min_num_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: 
v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15330,38 +15308,38 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_min_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB57_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15371,41 +15349,41 @@ define void 
@flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15418,35 +15396,35 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX10-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; 
GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB57_1 @@ -15457,36 +15435,36 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_min_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, 
vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15496,36 +15474,36 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: 
buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15535,37 +15513,37 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX8-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15620,38 +15598,38 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 
; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_min_num_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15665,38 +15643,38 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 
0xffff0000, v5 -; GFX940-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_min_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB58_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15706,41 +15684,41 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: 
v_min_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15755,35 +15733,35 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 
0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB58_1 @@ -15794,36 +15772,36 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_min_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15833,36 +15811,36 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB58_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15874,37 +15852,37 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 
0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB58_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15962,38 +15940,38 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_min_num_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; 
GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16011,40 +15989,40 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX940-NEXT: s_movk_i32 s0, 0xf800 ; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v5, v[4:5] +; GFX940-NEXT: flat_load_dword v3, v[4:5] ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_min_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; 
GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB59_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16058,41 +16036,41 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: flat_load_b32 v4, v[3:4] +; GFX11-NEXT: flat_load_b32 v3, v[3:4] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s1 @@ -16107,35 +16085,35 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB59_1 @@ -16152,28 +16130,28 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX90A-NEXT: v_min_f32_e32 v0, v3, v0 -; GFX90A-NEXT: v_min_f32_e32 v3, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 
0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX90A-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -16195,28 +16173,28 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX908-NEXT: v_min_f32_e32 v0, v5, v0 -; GFX908-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX908-NEXT: v_min_f32_e32 v0, v0, v5 +; GFX908-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v0, v6, v0, s9 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -16234,37 +16212,37 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: 
s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB59_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16323,37 +16301,39 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 
v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16370,35 +16350,35 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 
16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB60_1 @@ -16411,39 +16391,41 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_min_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16459,35 +16441,35 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX10-NEXT: v_min_f32_e32 v0, v6, v0 -; GFX10-NEXT: v_min_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB60_1 @@ -16500,35 +16482,35 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: 
v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 @@ -16542,33 +16524,33 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, 
s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB60_1 @@ -16584,34 +16566,34 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_min_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_min_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB60_1 @@ -16670,39 +16652,39 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_min_num_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16716,38 +16698,38 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX940-LABEL: 
flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_min_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB61_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16757,41 +16739,41 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX11-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 
0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16806,35 +16788,35 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; 
GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB61_1 @@ -16845,38 +16827,38 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_min_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: 
buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16886,36 +16868,36 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX908-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB61_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16927,37 +16909,37 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, 
vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB61_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index 1b56fd020d502..14f75814128f1 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -13729,36 +13729,38 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_sub_f32 v5, v7, v5 :: 
v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13775,35 +13777,35 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_sub_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: 
v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB50_1 @@ -13816,39 +13818,41 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, 
vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -13863,34 +13867,34 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB50_1 @@ -13904,33 +13908,33 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: 
flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 @@ -13944,33 +13948,33 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; 
GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB50_1 @@ -13984,34 +13988,34 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; 
GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB50_1 @@ -14071,36 +14075,38 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14117,35 +14123,35 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: 
flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_sub_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB51_1 @@ -14158,39 +14164,41 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 
0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14206,35 +14214,35 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX10-NEXT: v_sub_f32_e32 v0, v6, v0 -; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: 
v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB51_1 @@ -14247,33 +14255,33 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 @@ -14287,33 +14295,33 @@ define <2 x bfloat> 
@flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB51_1 @@ -14329,34 +14337,34 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_sub_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; 
GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB51_1 @@ -14416,36 +14424,38 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: 
v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14470,35 +14480,35 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX940-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX940-NEXT: v_sub_f32_e32 v0, v7, v0 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v0, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX940-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX940-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX940-NEXT: v_add3_u32 v6, v6, v0, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v3, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v0, v3, v0, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v3, v0, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; 
GFX940-NEXT: s_cbranch_execnz .LBB52_1 @@ -14517,38 +14527,40 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 ; GFX11-NEXT: flat_load_b32 v0, v[4:5] ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_lshlrev_b32 v0, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: v_dual_sub_f32 v0, v6, v0 :: v_dual_and_b32 v7, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v5, v7, v5 -; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v0 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14563,35 +14575,35 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo +; 
GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX10-NEXT: v_sub_f32_e32 v0, v6, v0 -; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB52_1 @@ -14608,33 +14620,33 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX90A-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX90A-NEXT: v_sub_f32_e32 v0, v7, v0 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v0, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9 -; GFX90A-NEXT: 
flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX90A-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 @@ -14651,33 +14663,33 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_sub_f32_e32 v0, v7, v0 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v0, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v0, s9 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: 
s_cbranch_execnz .LBB52_1 @@ -14692,34 +14704,34 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_sub_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB52_1 @@ -14778,38 +14790,38 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; 
GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14823,38 +14835,38 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_sub_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: 
v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB53_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14864,41 +14876,41 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 
v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14911,35 +14923,35 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX10-LABEL: flat_agent_atomic_fsub_noret_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB53_1 @@ -14950,36 +14962,36 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX90A-LABEL: flat_agent_atomic_fsub_noret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; 
GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_sub_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14989,36 +15001,36 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX908-LABEL: flat_agent_atomic_fsub_noret_v2bf16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: 
v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15028,37 +15040,37 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX8-LABEL: flat_agent_atomic_fsub_noret_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: 
v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15113,38 +15125,38 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; 
GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15158,38 +15170,38 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_sub_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB54_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15199,41 +15211,41 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 
16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15248,35 +15260,35 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; 
GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB54_1 @@ -15287,36 +15299,36 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX90A-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_sub_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15326,36 +15338,36 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX908-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB54_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15367,37 +15379,37 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, 
vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB54_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15455,38 +15467,38 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15504,40 +15516,40 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX940-NEXT: s_movk_i32 s0, 0xf800 ; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v5, v[4:5] +; GFX940-NEXT: flat_load_dword v3, v[4:5] ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_sub_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; 
GFX940-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB55_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15551,41 +15563,41 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: flat_load_b32 v4, v[3:4] +; GFX11-NEXT: flat_load_b32 v3, v[3:4] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: 
v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15600,35 +15612,35 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB55_1 @@ -15645,28 +15657,28 @@ define 
void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX90A-NEXT: v_sub_f32_e32 v0, v3, v0 -; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX90A-NEXT: v_sub_f32_e32 v0, v0, v3 +; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -15688,28 +15700,28 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX908-NEXT: v_sub_f32_e32 v0, v5, v0 -; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX908-NEXT: v_sub_f32_e32 v0, v0, v5 +; GFX908-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 +; 
GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v0, v6, v0, s9 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -15727,37 +15739,37 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB55_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15816,37 +15828,39 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; 
GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15863,35 +15877,35 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; 
GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_sub_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB56_1 @@ -15904,39 +15918,41 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15952,35 +15968,35 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX10-NEXT: v_sub_f32_e32 v0, v6, v0 -; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; 
GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB56_1 @@ -15993,35 +16009,35 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 @@ -16035,33 +16051,33 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 
0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB56_1 @@ -16077,34 +16093,34 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_sub_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; 
GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB56_1 @@ -16163,39 +16179,39 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16209,38 +16225,38 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX940-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_sub_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB57_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16250,41 +16266,41 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; 
GFX11-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16299,35 +16315,35 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; 
GFX10-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB57_1 @@ -16338,38 +16354,38 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX90A-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_sub_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: 
v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16379,36 +16395,36 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX908-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: flat_atomic_cmpswap v2, 
v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16420,37 +16436,37 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll index 61549fa88f424..1311560715ddd 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll @@ -2057,21 +2057,19 @@ define amdgpu_gfx void 
@flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB54_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_and_b32_e32 v0, s6, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: v_not_b32_e32 v0, v0 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_and_b32_e32 v2, s6, v3 +; GCN1-NEXT: v_not_b32_e32 v2, v2 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB54_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2083,21 +2081,19 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB54_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_and_b32_e32 v0, s6, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: v_not_b32_e32 v0, v0 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_and_b32_e32 v2, s6, v3 +; GCN2-NEXT: v_not_b32_e32 v2, v2 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB54_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2109,21 +2105,19 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB54_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_and_b32_e32 v0, s6, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: v_not_b32_e32 v0, v0 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN3-NEXT: v_and_b32_e32 v2, s6, v3 +; GCN3-NEXT: v_not_b32_e32 v2, v2 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB54_1 ; 
GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2141,25 +2135,23 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v1, v[0:1] -; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB55_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_and_b32_e32 v0, s6, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: v_not_b32_e32 v0, v0 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_and_b32_e32 v2, s6, v3 +; GCN1-NEXT: v_not_b32_e32 v2, v2 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB55_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i32_noret_offset_scalar: @@ -2169,25 +2161,23 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: flat_load_dword v1, v[0:1] -; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB55_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_and_b32_e32 v0, s6, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: v_not_b32_e32 v0, v0 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_and_b32_e32 v2, s6, v3 +; GCN2-NEXT: v_not_b32_e32 v2, v2 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB55_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i32_noret_offset_scalar: @@ -2195,21 +2185,19 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB55_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; 
GCN3-NEXT: v_and_b32_e32 v0, s6, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: v_not_b32_e32 v0, v0 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc +; GCN3-NEXT: v_and_b32_e32 v2, s6, v3 +; GCN3-NEXT: v_not_b32_e32 v2, v2 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB55_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2227,19 +2215,19 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB56_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_and_b32_e32 v0, s6, v1 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: v_not_b32_e32 v0, v0 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: v_and_b32_e32 v0, s6, v4 +; GCN1-NEXT: v_not_b32_e32 v3, v0 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB56_1 @@ -2253,19 +2241,19 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB56_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_and_b32_e32 v0, s6, v1 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: v_not_b32_e32 v0, v0 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: v_and_b32_e32 v0, s6, v4 +; GCN2-NEXT: v_not_b32_e32 v3, v0 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB56_1 @@ -2279,19 +2267,19 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB56_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: 
v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_and_b32_e32 v0, s6, v1 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: v_not_b32_e32 v0, v0 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_and_b32_e32 v0, s6, v4 +; GCN3-NEXT: v_not_b32_e32 v3, v0 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB56_1 @@ -2308,27 +2296,25 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 16 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s35 +; GCN1-NEXT: flat_load_dword v0, v[1:2] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB57_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_and_b32_e32 v0, s6, v1 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: v_not_b32_e32 v0, v0 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: v_and_b32_e32 v0, s6, v4 +; GCN1-NEXT: v_not_b32_e32 v3, v0 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB57_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i32_ret_offset_scalar: @@ -2336,27 +2322,25 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 16 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s35 +; GCN2-NEXT: flat_load_dword v0, v[1:2] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB57_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_and_b32_e32 v0, s6, v1 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: v_not_b32_e32 v0, v0 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: v_and_b32_e32 v0, s6, v4 +; GCN2-NEXT: v_not_b32_e32 v3, v0 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; 
GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB57_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i32_ret_offset_scalar: @@ -2365,19 +2349,19 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB57_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_and_b32_e32 v0, s6, v1 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: v_not_b32_e32 v0, v0 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_and_b32_e32 v0, s6, v4 +; GCN3-NEXT: v_not_b32_e32 v3, v0 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB57_1 @@ -3532,20 +3516,18 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB84_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_max_i32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB84_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3557,20 +3539,18 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB84_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_max_i32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: 
buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB84_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3582,20 +3562,18 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB84_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB84_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3613,24 +3591,22 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v1, v[0:1] -; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB85_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_max_i32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB85_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i32_noret_offset_scalar: @@ -3640,24 +3616,22 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: flat_load_dword v1, v[0:1] -; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB85_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: 
flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_max_i32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB85_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i32_noret_offset_scalar: @@ -3665,20 +3639,18 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB85_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc +; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB85_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3696,18 +3668,18 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: v_max_i32_e32 v3, s6, v4 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB86_1 @@ -3721,18 +3693,18 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v1, v0 
-; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: v_max_i32_e32 v3, s6, v4 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB86_1 @@ -3746,18 +3718,18 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_max_i32_e32 v3, s6, v4 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB86_1 @@ -3774,26 +3746,24 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 16 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s35 +; GCN1-NEXT: flat_load_dword v0, v[1:2] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB87_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: v_max_i32_e32 v3, s6, v4 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB87_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i32_ret_offset_scalar: @@ -3801,26 +3771,24 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 16 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 
v1, s35 -; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s35 +; GCN2-NEXT: flat_load_dword v0, v[1:2] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB87_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: v_max_i32_e32 v3, s6, v4 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB87_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i32_ret_offset_scalar: @@ -3829,18 +3797,18 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB87_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_max_i32_e32 v3, s6, v4 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB87_1 @@ -3866,21 +3834,19 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v1, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_i32_e32 v0, s2, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_max_i32_e32 v2, s2, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: s_andn2_b64 
exec, exec, s[0:1] ; GCN1-NEXT: s_cbranch_execnz .LBB88_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm @@ -3898,21 +3864,19 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v1, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_i32_e32 v0, s2, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_max_i32_e32 v2, s2, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN2-NEXT: s_cbranch_execnz .LBB88_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm @@ -3928,21 +3892,19 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_i32_e32 v0, s2, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc +; GCN3-NEXT: v_max_i32_e32 v2, s2, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB88_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm @@ -3968,27 +3930,25 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: flat_load_dword v2, v[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: v_max_i32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN1-NEXT: s_cbranch_execnz .LBB89_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v1, s2 -; GCN1-NEXT: v_mov_b32_e32 v2, s3 -; GCN1-NEXT: flat_store_dword v[1:2], v0 +; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_ret_addr64_offset: @@ -4005,27 +3965,25 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: flat_load_dword v2, v[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: v_max_i32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN2-NEXT: s_cbranch_execnz .LBB89_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v1, s2 -; GCN2-NEXT: v_mov_b32_e32 v2, s3 -; GCN2-NEXT: flat_store_dword v[1:2], v0 +; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32_ret_addr64_offset: @@ -4040,27 +3998,25 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 
exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB89_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v1, s2 -; GCN3-NEXT: v_mov_b32_e32 v2, s3 -; GCN3-NEXT: flat_store_dword v[1:2], v0 +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i32 %index @@ -4082,21 +4038,19 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v1, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_i32_e32 v0, s2, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_max_i32_e32 v2, s2, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN1-NEXT: s_cbranch_execnz .LBB90_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm @@ -4112,21 +4066,19 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v1, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_i32_e32 v0, s2, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_max_i32_e32 v2, s2, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN2-NEXT: s_cbranch_execnz .LBB90_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm @@ -4142,21 +4094,19 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v1, v[0:1] -; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_i32_e32 
v0, s2, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN3-NEXT: v_max_i32_e32 v2, s2, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB90_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm @@ -4179,27 +4129,25 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: flat_load_dword v2, v[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: v_max_i32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN1-NEXT: s_cbranch_execnz .LBB91_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v1, s2 -; GCN1-NEXT: v_mov_b32_e32 v2, s3 -; GCN1-NEXT: flat_store_dword v[1:2], v0 +; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_ret_addr64: @@ -4214,27 +4162,25 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: flat_load_dword v2, v[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: v_max_i32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN2-NEXT: s_cbranch_execnz .LBB91_1 ; GCN2-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v1, s2 -; GCN2-NEXT: v_mov_b32_e32 v2, s3 -; GCN2-NEXT: flat_store_dword v[1:2], v0 +; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32_ret_addr64: @@ -4249,27 +4195,25 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: flat_load_dword v2, v[0:1] +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB91_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v1, s2 -; GCN3-NEXT: v_mov_b32_e32 v2, s3 -; GCN3-NEXT: flat_store_dword v[1:2], v0 +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i32 %index @@ -4715,20 +4659,18 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB98_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_max_u32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB98_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4740,20 +4682,18 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB98_1: ; %atomicrmw.start ; GCN2-NEXT: ; 
=>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_max_u32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB98_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4765,20 +4705,18 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB98_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN3-NEXT: v_max_u32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB98_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4796,24 +4734,22 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v1, v[0:1] -; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB99_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_max_u32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB99_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i32_noret_offset_scalar: @@ -4823,24 +4759,22 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: 
flat_load_dword v1, v[0:1] -; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB99_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_max_u32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB99_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i32_noret_offset_scalar: @@ -4848,20 +4782,18 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB99_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc +; GCN3-NEXT: v_max_u32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB99_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4879,18 +4811,18 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: v_max_u32_e32 v3, s6, v4 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB100_1 @@ -4904,18 +4836,18 @@ define amdgpu_gfx i32 
@flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: v_max_u32_e32 v3, s6, v4 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB100_1 @@ -4929,18 +4861,18 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_max_u32_e32 v3, s6, v4 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB100_1 @@ -4957,26 +4889,24 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 16 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s35 +; GCN1-NEXT: flat_load_dword v0, v[1:2] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB101_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: v_max_u32_e32 v3, s6, v4 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB101_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: 
s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i32_ret_offset_scalar: @@ -4984,26 +4914,24 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 16 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s35 +; GCN2-NEXT: flat_load_dword v0, v[1:2] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB101_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: v_max_u32_e32 v3, s6, v4 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB101_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i32_ret_offset_scalar: @@ -5012,18 +4940,18 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB101_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_max_u32_e32 v3, s6, v4 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB101_1 @@ -5049,21 +4977,19 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v1, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: .LBB102_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_u32_e32 v0, s2, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: 
v_max_u32_e32 v2, s2, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN1-NEXT: s_cbranch_execnz .LBB102_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm @@ -5081,21 +5007,19 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v1, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: .LBB102_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_u32_e32 v0, s2, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_max_u32_e32 v2, s2, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN2-NEXT: s_cbranch_execnz .LBB102_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm @@ -5111,21 +5035,19 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB102_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_u32_e32 v0, s2, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc +; GCN3-NEXT: v_max_u32_e32 v2, s2, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB102_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm @@ -5151,27 +5073,25 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: flat_load_dword v2, v[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: .LBB103_1: ; %atomicrmw.start ; 
GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: v_max_u32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN1-NEXT: s_cbranch_execnz .LBB103_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v1, s2 -; GCN1-NEXT: v_mov_b32_e32 v2, s3 -; GCN1-NEXT: flat_store_dword v[1:2], v0 +; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32_ret_addr64_offset: @@ -5188,27 +5108,25 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: flat_load_dword v2, v[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: v_max_u32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN2-NEXT: s_cbranch_execnz .LBB103_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v1, s2 -; GCN2-NEXT: v_mov_b32_e32 v2, s3 -; GCN2-NEXT: flat_store_dword v[1:2], v0 +; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32_ret_addr64_offset: @@ -5223,27 +5141,25 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] 
offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_max_u32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB103_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v1, s2 -; GCN3-NEXT: v_mov_b32_e32 v2, s3 -; GCN3-NEXT: flat_store_dword v[1:2], v0 +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i32 %index @@ -5266,27 +5182,25 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: flat_load_dword v2, v[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: .LBB104_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: v_max_u32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN1-NEXT: s_cbranch_execnz .LBB104_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v1, s2 -; GCN1-NEXT: v_mov_b32_e32 v2, s3 -; GCN1-NEXT: flat_store_dword v[1:2], v0 +; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32_ret_addr64: @@ -5301,27 +5215,25 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: flat_load_dword v2, v[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: .LBB104_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: v_max_u32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, 
s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN2-NEXT: s_cbranch_execnz .LBB104_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v1, s2 -; GCN2-NEXT: v_mov_b32_e32 v2, s3 -; GCN2-NEXT: flat_store_dword v[1:2], v0 +; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32_ret_addr64: @@ -5336,27 +5248,25 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: flat_load_dword v2, v[0:1] +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB104_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_max_u32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB104_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v1, s2 -; GCN3-NEXT: v_mov_b32_e32 v2, s3 -; GCN3-NEXT: flat_store_dword v[1:2], v0 +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i32 %index @@ -5802,20 +5712,18 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB111_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_min_u32_e32 v0, s6, v1 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_min_u32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB111_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5827,20 +5735,18 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB111_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_min_u32_e32 v0, s6, v1 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_min_u32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB111_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5852,20 +5758,18 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB111_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_u32_e32 v0, s6, v1 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN3-NEXT: v_min_u32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB111_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5883,24 +5787,22 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v1, v[0:1] -; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB112_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_min_u32_e32 v0, s6, v1 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_min_u32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB112_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: 
flat_atomic_umin_i32_noret_offset_scalar: @@ -5910,24 +5812,22 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: flat_load_dword v1, v[0:1] -; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB112_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_min_u32_e32 v0, s6, v1 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_min_u32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB112_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i32_noret_offset_scalar: @@ -5935,20 +5835,18 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB112_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_u32_e32 v0, s6, v1 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc +; GCN3-NEXT: v_min_u32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB112_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5966,18 +5864,18 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB113_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: v_min_u32_e32 v0, s6, v1 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: v_min_u32_e32 v3, s6, v4 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: 
v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB113_1 @@ -5991,18 +5889,18 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB113_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: v_min_u32_e32 v0, s6, v1 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: v_min_u32_e32 v3, s6, v4 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB113_1 @@ -6016,18 +5914,18 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB113_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: v_min_u32_e32 v0, s6, v1 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_min_u32_e32 v3, s6, v4 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB113_1 @@ -6044,26 +5942,24 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 16 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s35 +; GCN1-NEXT: flat_load_dword v0, v[1:2] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB114_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: v_min_u32_e32 v0, s6, v1 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: v_min_u32_e32 v3, s6, v4 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: s_andn2_b64 exec, 
exec, s[36:37] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB114_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i32_ret_offset_scalar: @@ -6071,26 +5967,24 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 16 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s35 +; GCN2-NEXT: flat_load_dword v0, v[1:2] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB114_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: v_min_u32_e32 v0, s6, v1 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: v_min_u32_e32 v3, s6, v4 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB114_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i32_ret_offset_scalar: @@ -6099,18 +5993,18 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB114_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: v_min_u32_e32 v0, s6, v1 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_min_u32_e32 v3, s6, v4 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB114_1 @@ -6559,20 +6453,18 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB121_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 
-; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_min_i32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB121_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6584,20 +6476,18 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB121_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_min_i32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB121_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6609,20 +6499,18 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB121_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN3-NEXT: v_min_i32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB121_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6640,24 +6528,22 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v1, v[0:1] -; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB122_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_min_i32_e32 
v0, s6, v1 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_min_i32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB122_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i32_noret_offset_scalar: @@ -6667,24 +6553,22 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: flat_load_dword v1, v[0:1] -; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB122_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_min_i32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB122_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i32_noret_offset_scalar: @@ -6692,20 +6576,18 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB122_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc +; GCN3-NEXT: v_min_i32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB122_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6723,18 +6605,18 @@ define amdgpu_gfx i32 
@flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: v_min_i32_e32 v3, s6, v4 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB123_1 @@ -6748,18 +6630,18 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: v_min_i32_e32 v3, s6, v4 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB123_1 @@ -6773,18 +6655,18 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_min_i32_e32 v3, s6, v4 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB123_1 @@ -6801,26 +6683,24 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 16 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: s_mov_b64 
s[36:37], 0 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s35 +; GCN1-NEXT: flat_load_dword v0, v[1:2] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB124_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: v_min_i32_e32 v3, s6, v4 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB124_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i32_ret_offset_scalar: @@ -6828,26 +6708,24 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 16 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s35 +; GCN2-NEXT: flat_load_dword v0, v[1:2] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB124_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: v_min_i32_e32 v3, s6, v4 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB124_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i32_ret_offset_scalar: @@ -6856,18 +6734,18 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB124_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; 
GCN3-NEXT: v_min_i32_e32 v3, s6, v4 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB124_1 @@ -6893,21 +6771,19 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v1, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_min_i32_e32 v0, s2, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_min_i32_e32 v2, s2, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN1-NEXT: s_cbranch_execnz .LBB125_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm @@ -6925,21 +6801,19 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v1, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_min_i32_e32 v0, s2, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_min_i32_e32 v2, s2, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN2-NEXT: s_cbranch_execnz .LBB125_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm @@ -6955,21 +6829,19 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_i32_e32 v0, s2, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; 
GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc +; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB125_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm @@ -6995,27 +6867,25 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: flat_load_dword v2, v[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: v_min_i32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN1-NEXT: s_cbranch_execnz .LBB126_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v1, s2 -; GCN1-NEXT: v_mov_b32_e32 v2, s3 -; GCN1-NEXT: flat_store_dword v[1:2], v0 +; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32_ret_addr64_offset: @@ -7032,27 +6902,25 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: flat_load_dword v2, v[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: v_min_i32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN2-NEXT: s_cbranch_execnz .LBB126_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: 
s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v1, s2 -; GCN2-NEXT: v_mov_b32_e32 v2, s3 -; GCN2-NEXT: flat_store_dword v[1:2], v0 +; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32_ret_addr64_offset: @@ -7067,27 +6935,25 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_min_i32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB126_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v1, s2 -; GCN3-NEXT: v_mov_b32_e32 v2, s3 -; GCN3-NEXT: flat_store_dword v[1:2], v0 +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i32 %index @@ -7100,78 +6966,72 @@ entry: define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_min_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb -; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb +; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_min_i32_e32 v0, s4, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_min_i32_e32 v2, s2, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN1-NEXT: s_cbranch_execnz .LBB127_1 ; GCN1-NEXT: ; %bb.2: 
; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c -; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_min_i32_e32 v0, s4, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_min_i32_e32 v2, s2, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN2-NEXT: s_cbranch_execnz .LBB127_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c -; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB127_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm @@ -7193,27 +7053,25 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: flat_load_dword v2, v[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; 
GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: v_min_i32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN1-NEXT: s_cbranch_execnz .LBB128_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v1, s2 -; GCN1-NEXT: v_mov_b32_e32 v2, s3 -; GCN1-NEXT: flat_store_dword v[1:2], v0 +; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32_ret_addr64: @@ -7228,27 +7086,25 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: flat_load_dword v2, v[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: v_min_i32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN2-NEXT: s_cbranch_execnz .LBB128_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v1, s2 -; GCN2-NEXT: v_mov_b32_e32 v2, s3 -; GCN2-NEXT: flat_store_dword v[1:2], v0 +; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32_ret_addr64: @@ -7263,27 +7119,25 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: flat_load_dword v2, v[0:1] +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_min_i32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: 
buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB128_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v1, s2 -; GCN3-NEXT: v_mov_b32_e32 v2, s3 -; GCN3-NEXT: flat_store_dword v[1:2], v0 +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i32 %index diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll index 66ac4d2198ea5..36bddb7ac2fd6 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll @@ -6998,8 +6998,6 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v0, s7, v3 ; GCN1-NEXT: v_and_b32_e32 v6, s6, v2 -; GCN1-NEXT: v_mov_b32_e32 v4, s4 -; GCN1-NEXT: v_mov_b32_e32 v5, s5 ; GCN1-NEXT: v_not_b32_e32 v1, v0 ; GCN1-NEXT: v_not_b32_e32 v0, v6 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -7065,8 +7063,6 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v0, s7, v3 ; GCN2-NEXT: v_and_b32_e32 v6, s6, v2 -; GCN2-NEXT: v_mov_b32_e32 v4, s4 -; GCN2-NEXT: v_mov_b32_e32 v5, s5 ; GCN2-NEXT: v_not_b32_e32 v1, v0 ; GCN2-NEXT: v_not_b32_e32 v0, v6 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -7115,17 +7111,15 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: .LBB54_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB54_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB54_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v0, s7, v3 ; GCN3-NEXT: v_and_b32_e32 v6, s6, v2 -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 ; GCN3-NEXT: v_not_b32_e32 v1, v0 ; GCN3-NEXT: v_not_b32_e32 v0, v6 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -7194,8 +7188,6 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v0, s7, v3 ; GCN1-NEXT: v_and_b32_e32 v6, s6, v2 -; GCN1-NEXT: v_mov_b32_e32 v4, s34 -; GCN1-NEXT: v_mov_b32_e32 v5, s35 ; GCN1-NEXT: v_not_b32_e32 v1, v0 ; GCN1-NEXT: v_not_b32_e32 v0, v6 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -7263,8 +7255,6 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v0, s7, v3 ; GCN2-NEXT: v_and_b32_e32 v6, s6, v2 -; GCN2-NEXT: v_mov_b32_e32 v4, s34 -; GCN2-NEXT: v_mov_b32_e32 v5, s35 ; GCN2-NEXT: v_not_b32_e32 v1, v0 ; GCN2-NEXT: v_not_b32_e32 v0, 
v6 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -7315,17 +7305,15 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: .LBB55_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB55_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: .LBB55_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v0, s7, v3 ; GCN3-NEXT: v_and_b32_e32 v6, s6, v2 -; GCN3-NEXT: v_mov_b32_e32 v4, s34 -; GCN3-NEXT: v_mov_b32_e32 v5, s35 ; GCN3-NEXT: v_not_b32_e32 v1, v0 ; GCN3-NEXT: v_not_b32_e32 v0, v6 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -7385,18 +7373,16 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: .LBB56_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: v_mov_b32_e32 v4, s4 -; GCN1-NEXT: v_and_b32_e32 v0, s7, v3 -; GCN1-NEXT: v_and_b32_e32 v6, s6, v2 -; GCN1-NEXT: v_mov_b32_e32 v5, s5 -; GCN1-NEXT: v_not_b32_e32 v1, v0 -; GCN1-NEXT: v_not_b32_e32 v0, v6 -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: v_and_b32_e32 v0, s7, v7 +; GCN1-NEXT: v_and_b32_e32 v1, s6, v6 +; GCN1-NEXT: v_not_b32_e32 v5, v0 +; GCN1-NEXT: v_not_b32_e32 v4, v1 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB56_2 @@ -7450,18 +7436,16 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: .LBB56_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: v_mov_b32_e32 v4, s4 -; GCN2-NEXT: v_and_b32_e32 v0, s7, v3 -; GCN2-NEXT: v_and_b32_e32 v6, s6, v2 -; GCN2-NEXT: v_mov_b32_e32 v5, s5 -; GCN2-NEXT: v_not_b32_e32 v1, v0 -; GCN2-NEXT: v_not_b32_e32 v0, v6 -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: v_and_b32_e32 v0, s7, v7 +; GCN2-NEXT: v_and_b32_e32 v1, s6, v6 +; GCN2-NEXT: v_not_b32_e32 v5, v0 +; GCN2-NEXT: v_not_b32_e32 v4, v1 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB56_2 @@ -7500,25 +7484,23 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_cbranch_vccz .LBB56_4 ; GCN3-NEXT: ; %bb.1: ; 
%atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB56_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: v_and_b32_e32 v0, s7, v3 -; GCN3-NEXT: v_and_b32_e32 v6, s6, v2 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: v_not_b32_e32 v1, v0 -; GCN3-NEXT: v_not_b32_e32 v0, v6 -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN3-NEXT: v_mov_b32_e32 v7, v1 +; GCN3-NEXT: v_mov_b32_e32 v6, v0 +; GCN3-NEXT: v_and_b32_e32 v0, s7, v7 +; GCN3-NEXT: v_and_b32_e32 v1, s6, v6 +; GCN3-NEXT: v_not_b32_e32 v5, v0 +; GCN3-NEXT: v_not_b32_e32 v4, v1 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB56_2 @@ -7575,18 +7557,16 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: .LBB57_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: v_mov_b32_e32 v4, s34 -; GCN1-NEXT: v_and_b32_e32 v0, s7, v3 -; GCN1-NEXT: v_and_b32_e32 v6, s6, v2 -; GCN1-NEXT: v_mov_b32_e32 v5, s35 -; GCN1-NEXT: v_not_b32_e32 v1, v0 -; GCN1-NEXT: v_not_b32_e32 v0, v6 -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: v_and_b32_e32 v0, s7, v7 +; GCN1-NEXT: v_and_b32_e32 v1, s6, v6 +; GCN1-NEXT: v_not_b32_e32 v5, v0 +; GCN1-NEXT: v_not_b32_e32 v4, v1 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB57_2 @@ -7642,18 +7622,16 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: .LBB57_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: v_mov_b32_e32 v4, s34 -; GCN2-NEXT: v_and_b32_e32 v0, s7, v3 -; GCN2-NEXT: v_and_b32_e32 v6, s6, v2 -; GCN2-NEXT: v_mov_b32_e32 v5, s35 -; GCN2-NEXT: v_not_b32_e32 v1, v0 -; GCN2-NEXT: v_not_b32_e32 v0, v6 -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: v_and_b32_e32 v0, s7, v7 +; GCN2-NEXT: v_and_b32_e32 v1, s6, v6 +; GCN2-NEXT: v_not_b32_e32 v5, v0 +; GCN2-NEXT: v_not_b32_e32 v4, v1 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; 
GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB57_2 @@ -7694,25 +7672,23 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN3-NEXT: s_cbranch_vccz .LBB57_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: v_mov_b32_e32 v3, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: .LBB57_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: v_mov_b32_e32 v4, s34 -; GCN3-NEXT: v_and_b32_e32 v0, s7, v3 -; GCN3-NEXT: v_and_b32_e32 v6, s6, v2 -; GCN3-NEXT: v_mov_b32_e32 v5, s35 -; GCN3-NEXT: v_not_b32_e32 v1, v0 -; GCN3-NEXT: v_not_b32_e32 v0, v6 -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN3-NEXT: v_mov_b32_e32 v7, v1 +; GCN3-NEXT: v_mov_b32_e32 v6, v0 +; GCN3-NEXT: v_and_b32_e32 v0, s7, v7 +; GCN3-NEXT: v_and_b32_e32 v1, s6, v6 +; GCN3-NEXT: v_not_b32_e32 v5, v0 +; GCN3-NEXT: v_not_b32_e32 v4, v1 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_cbranch_execnz .LBB57_2 @@ -11446,16 +11422,14 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v6, s7 +; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: .LBB84_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s7 -; GCN1-NEXT: v_mov_b32_e32 v6, s6 -; GCN1-NEXT: v_mov_b32_e32 v4, s4 -; GCN1-NEXT: v_mov_b32_e32 v5, s5 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -11514,16 +11488,14 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v6, s7 +; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: .LBB84_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s7 -; GCN2-NEXT: v_mov_b32_e32 v6, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s4 -; GCN2-NEXT: v_mov_b32_e32 v5, s5 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 
v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -11570,20 +11542,18 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: .LBB84_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB84_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v6, s7 +; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: .LBB84_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s7 -; GCN3-NEXT: v_mov_b32_e32 v6, s6 -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -11645,16 +11615,14 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: v_mov_b32_e32 v6, s7 +; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: .LBB85_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s7 -; GCN1-NEXT: v_mov_b32_e32 v6, s6 -; GCN1-NEXT: v_mov_b32_e32 v4, s34 -; GCN1-NEXT: v_mov_b32_e32 v5, s35 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -11715,16 +11683,14 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: v_mov_b32_e32 v6, s7 +; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: .LBB85_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s7 -; GCN2-NEXT: v_mov_b32_e32 v6, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s34 -; GCN2-NEXT: v_mov_b32_e32 v5, s35 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -11773,20 +11739,18 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: .LBB85_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB85_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] 
+; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: v_mov_b32_e32 v6, s7 +; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: .LBB85_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s7 -; GCN3-NEXT: v_mov_b32_e32 v6, s6 -; GCN3-NEXT: v_mov_b32_e32 v4, s34 -; GCN3-NEXT: v_mov_b32_e32 v5, s35 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -11841,22 +11805,20 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: .LBB86_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s7 -; GCN1-NEXT: v_mov_b32_e32 v6, s6 -; GCN1-NEXT: v_mov_b32_e32 v4, s4 -; GCN1-NEXT: v_mov_b32_e32 v5, s5 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB86_2 @@ -11907,22 +11869,20 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: .LBB86_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s7 -; GCN2-NEXT: v_mov_b32_e32 v6, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s4 -; GCN2-NEXT: v_mov_b32_e32 v5, s5 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 
v[0:1], v[2:3] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB86_2 @@ -11961,26 +11921,24 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_cbranch_vccz .LBB86_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v4, s7 +; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: .LBB86_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s7 -; GCN3-NEXT: v_mov_b32_e32 v6, s6 -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB86_2 @@ -12034,22 +11992,20 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: .LBB87_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s7 -; GCN1-NEXT: v_mov_b32_e32 v6, s6 -; GCN1-NEXT: v_mov_b32_e32 v4, s34 -; GCN1-NEXT: v_mov_b32_e32 v5, s35 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB87_2 @@ -12102,22 +12058,20 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: 
s_mov_b64 s[36:37], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: .LBB87_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s7 -; GCN2-NEXT: v_mov_b32_e32 v6, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s34 -; GCN2-NEXT: v_mov_b32_e32 v5, s35 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB87_2 @@ -12158,26 +12112,24 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN3-NEXT: s_cbranch_vccz .LBB87_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: v_mov_b32_e32 v3, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: v_mov_b32_e32 v4, s7 +; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: .LBB87_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s7 -; GCN3-NEXT: v_mov_b32_e32 v6, s6 -; GCN3-NEXT: v_mov_b32_e32 v4, s34 -; GCN3-NEXT: v_mov_b32_e32 v5, s35 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_cbranch_execnz .LBB87_2 @@ -12238,20 +12190,18 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: .LBB88_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB88_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v5, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 +; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v6, s3 +; GCN1-NEXT: v_mov_b32_e32 v7, s2 ; GCN1-NEXT: .LBB88_4: ; %atomicrmw.start ; 
GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s3 -; GCN1-NEXT: v_mov_b32_e32 v6, s2 -; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -12311,20 +12261,18 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: .LBB88_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB88_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: v_mov_b32_e32 v6, s3 +; GCN2-NEXT: v_mov_b32_e32 v7, s2 ; GCN2-NEXT: .LBB88_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s3 -; GCN2-NEXT: v_mov_b32_e32 v6, s2 -; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -12383,20 +12331,18 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN3-NEXT: .LBB88_2: ; %atomicrmw.phi ; GCN3-NEXT: s_endpgm ; GCN3-NEXT: .LBB88_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v5, s1 +; GCN3-NEXT: v_mov_b32_e32 v4, s0 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: v_mov_b32_e32 v6, s3 +; GCN3-NEXT: v_mov_b32_e32 v7, s2 ; GCN3-NEXT: .LBB88_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s3 -; GCN3-NEXT: v_mov_b32_e32 v6, s2 -; GCN3-NEXT: v_mov_b32_e32 v5, s1 -; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -12453,26 +12399,24 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB89_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s13 +; 
GCN1-NEXT: v_mov_b32_e32 v5, s12 ; GCN1-NEXT: .LBB89_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s13 -; GCN1-NEXT: v_mov_b32_e32 v6, s12 -; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] +; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN1-NEXT: s_cbranch_execnz .LBB89_2 @@ -12526,26 +12470,24 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB89_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 +; GCN2-NEXT: v_mov_b32_e32 v5, s12 ; GCN2-NEXT: .LBB89_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s13 -; GCN2-NEXT: v_mov_b32_e32 v6, s12 -; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] +; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN2-NEXT: s_cbranch_execnz .LBB89_2 @@ -12598,26 +12540,24 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN3-NEXT: s_cbranch_vccz .LBB89_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: v_mov_b32_e32 v4, s13 +; GCN3-NEXT: v_mov_b32_e32 v5, s12 ; GCN3-NEXT: .LBB89_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v3, v1
-; GCN3-NEXT: v_mov_b32_e32 v2, v0
-; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s13
-; GCN3-NEXT: v_mov_b32_e32 v6, s12
-; GCN3-NEXT: v_mov_b32_e32 v5, s1
-; GCN3-NEXT: v_mov_b32_e32 v4, s0
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
 ; GCN3-NEXT: s_cbranch_execnz .LBB89_2
@@ -12681,20 +12621,18 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN1-NEXT: .LBB90_2: ; %atomicrmw.phi
 ; GCN1-NEXT: s_endpgm
 ; GCN1-NEXT: .LBB90_3: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v1, s1
-; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v5, s1
+; GCN1-NEXT: v_mov_b32_e32 v4, s0
+; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: v_mov_b32_e32 v6, s3
+; GCN1-NEXT: v_mov_b32_e32 v7, s2
 ; GCN1-NEXT: .LBB90_4: ; %atomicrmw.start
 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s3
-; GCN1-NEXT: v_mov_b32_e32 v6, s2
-; GCN1-NEXT: v_mov_b32_e32 v5, s1
-; GCN1-NEXT: v_mov_b32_e32 v4, s0
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
@@ -12752,20 +12690,18 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT: .LBB90_2: ; %atomicrmw.phi
 ; GCN2-NEXT: s_endpgm
 ; GCN2-NEXT: .LBB90_3: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v5, s1
+; GCN2-NEXT: v_mov_b32_e32 v4, s0
+; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: v_mov_b32_e32 v6, s3
+; GCN2-NEXT: v_mov_b32_e32 v7, s2
 ; GCN2-NEXT: .LBB90_4: ; %atomicrmw.start
 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s3
-; GCN2-NEXT: v_mov_b32_e32 v6, s2
-; GCN2-NEXT: v_mov_b32_e32 v5, s1
-; GCN2-NEXT: v_mov_b32_e32 v4, s0
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
@@ -12822,20 +12758,18 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN3-NEXT: .LBB90_2: ; %atomicrmw.phi
 ; GCN3-NEXT: s_endpgm
 ; GCN3-NEXT: .LBB90_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
-; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v5, s1
+; GCN3-NEXT: v_mov_b32_e32 v4, s0
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: v_mov_b32_e32 v6, s3
+; GCN3-NEXT: v_mov_b32_e32 v7, s2
 ; GCN3-NEXT: .LBB90_4: ; %atomicrmw.start
 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s3
-; GCN3-NEXT: v_mov_b32_e32 v6, s2
-; GCN3-NEXT: v_mov_b32_e32 v5, s1
-; GCN3-NEXT: v_mov_b32_e32 v4, s0
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
@@ -12889,26 +12823,24 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
 ; GCN1-NEXT: s_cbranch_vccz .LBB91_4
 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v1, s1
-; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v3, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s0
+; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
 ; GCN1-NEXT: s_mov_b64 s[2:3], 0
+; GCN1-NEXT: v_mov_b32_e32 v4, s13
+; GCN1-NEXT: v_mov_b32_e32 v5, s12
 ; GCN1-NEXT: .LBB91_2: ; %atomicrmw.start
 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v3, v1
-; GCN1-NEXT: v_mov_b32_e32 v2, v0
-; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s13
-; GCN1-NEXT: v_mov_b32_e32 v6, s12
-; GCN1-NEXT: v_mov_b32_e32 v5, s1
-; GCN1-NEXT: v_mov_b32_e32 v4, s0
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN1-NEXT: v_mov_b32_e32 v9, v1
+; GCN1-NEXT: v_mov_b32_e32 v8, v0
+; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9]
+; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
 ; GCN1-NEXT: s_cbranch_execnz .LBB91_2
@@ -12960,26 +12892,24 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
 ; GCN2-NEXT: s_cbranch_vccz .LBB91_4
 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s0
+; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
 ; GCN2-NEXT: s_mov_b64 s[2:3], 0
+; GCN2-NEXT: v_mov_b32_e32 v4, s13
+; GCN2-NEXT: v_mov_b32_e32 v5, s12
 ; GCN2-NEXT: .LBB91_2: ; %atomicrmw.start
 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v3, v1
-; GCN2-NEXT: v_mov_b32_e32 v2, v0
-; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s13
-; GCN2-NEXT: v_mov_b32_e32 v6, s12
-; GCN2-NEXT: v_mov_b32_e32 v5, s1
-; GCN2-NEXT: v_mov_b32_e32 v4, s0
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN2-NEXT: v_mov_b32_e32 v9, v1
+; GCN2-NEXT: v_mov_b32_e32 v8, v0
+; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9]
+; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
 ; GCN2-NEXT: s_cbranch_execnz .LBB91_2
@@ -13030,26 +12960,24 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3]
 ; GCN3-NEXT: s_cbranch_vccz .LBB91_4
 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
-; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v3, s1
+; GCN3-NEXT: v_mov_b32_e32 v2, s0
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
 ; GCN3-NEXT: s_mov_b64 s[2:3], 0
+; GCN3-NEXT: v_mov_b32_e32 v4, s13
+; GCN3-NEXT: v_mov_b32_e32 v5, s12
 ; GCN3-NEXT: .LBB91_2: ; %atomicrmw.start
 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v3, v1
-; GCN3-NEXT: v_mov_b32_e32 v2, v0
-; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s13
-; GCN3-NEXT: v_mov_b32_e32 v6, s12
-; GCN3-NEXT: v_mov_b32_e32 v5, s1
-; GCN3-NEXT: v_mov_b32_e32 v4, s0
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
 ; GCN3-NEXT: s_cbranch_execnz .LBB91_2
@@ -14197,16 +14125,14 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
 ; GCN1-NEXT: flat_load_dword v2, v[4:5]
 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: v_mov_b32_e32 v6, s7
+; GCN1-NEXT: v_mov_b32_e32 v7, s6
 ; GCN1-NEXT: .LBB98_4: ; %atomicrmw.start
 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s7
-; GCN1-NEXT: v_mov_b32_e32 v6, s6
-; GCN1-NEXT: v_mov_b32_e32 v4, s4
-; GCN1-NEXT: v_mov_b32_e32 v5, s5
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
@@ -14265,16 +14191,14 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
 ; GCN2-NEXT: flat_load_dword v2, v[4:5]
 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: v_mov_b32_e32 v6, s7
+; GCN2-NEXT: v_mov_b32_e32 v7, s6
 ; GCN2-NEXT: .LBB98_4: ; %atomicrmw.start
 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s7
-; GCN2-NEXT: v_mov_b32_e32 v6, s6
-; GCN2-NEXT: v_mov_b32_e32 v4, s4
-; GCN2-NEXT: v_mov_b32_e32 v5, s5
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
@@ -14321,20 +14245,18 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN3-NEXT: .LBB98_2: ; %atomicrmw.phi
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 ; GCN3-NEXT: .LBB98_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v4, s4
+; GCN3-NEXT: v_mov_b32_e32 v5, s5
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v6, s7
+; GCN3-NEXT: v_mov_b32_e32 v7, s6
 ; GCN3-NEXT: .LBB98_4: ; %atomicrmw.start
 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s7
-; GCN3-NEXT: v_mov_b32_e32 v6, s6
-; GCN3-NEXT: v_mov_b32_e32 v4, s4
-; GCN3-NEXT: v_mov_b32_e32 v5, s5
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
@@ -14396,16 +14318,14 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
 ; GCN1-NEXT: flat_load_dword v2, v[4:5]
 ; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: v_mov_b32_e32 v6, s7
+; GCN1-NEXT: v_mov_b32_e32 v7, s6
 ; GCN1-NEXT: .LBB99_4: ; %atomicrmw.start
 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s7
-; GCN1-NEXT: v_mov_b32_e32 v6, s6
-; GCN1-NEXT: v_mov_b32_e32 v4, s34
-; GCN1-NEXT: v_mov_b32_e32 v5, s35
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
@@ -14466,16 +14386,14 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
 ; GCN2-NEXT: flat_load_dword v2, v[4:5]
 ; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: v_mov_b32_e32 v6, s7
+; GCN2-NEXT: v_mov_b32_e32 v7, s6
 ; GCN2-NEXT: .LBB99_4: ; %atomicrmw.start
 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s7
-; GCN2-NEXT: v_mov_b32_e32 v6, s6
-; GCN2-NEXT: v_mov_b32_e32 v4, s34
-; GCN2-NEXT: v_mov_b32_e32 v5, s35
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
@@ -14524,20 +14442,18 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT: .LBB99_2: ; %atomicrmw.phi
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 ; GCN3-NEXT: .LBB99_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v1, s35
-; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v4, s34
+; GCN3-NEXT: v_mov_b32_e32 v5, s35
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
 ; GCN3-NEXT: s_mov_b64 s[36:37], 0
+; GCN3-NEXT: v_mov_b32_e32 v6, s7
+; GCN3-NEXT: v_mov_b32_e32 v7, s6
 ; GCN3-NEXT: .LBB99_4: ; %atomicrmw.start
 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s7
-; GCN3-NEXT: v_mov_b32_e32 v6, s6
-; GCN3-NEXT: v_mov_b32_e32 v4, s34
-; GCN3-NEXT: v_mov_b32_e32 v5, s35
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
@@ -14592,22 +14508,20 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
 ; GCN1-NEXT: flat_load_dword v0, v[2:3]
 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: v_mov_b32_e32 v4, s7
+; GCN1-NEXT: v_mov_b32_e32 v5, s6
 ; GCN1-NEXT: .LBB100_2: ; %atomicrmw.start
 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v3, v1
-; GCN1-NEXT: v_mov_b32_e32 v2, v0
-; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s7
-; GCN1-NEXT: v_mov_b32_e32 v6, s6
-; GCN1-NEXT: v_mov_b32_e32 v4, s4
-; GCN1-NEXT: v_mov_b32_e32 v5, s5
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN1-NEXT: v_mov_b32_e32 v9, v1
+; GCN1-NEXT: v_mov_b32_e32 v8, v0
+; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
+; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT: s_cbranch_execnz .LBB100_2
@@ -14658,22 +14572,20 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
 ; GCN2-NEXT: flat_load_dword v0, v[2:3]
 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: v_mov_b32_e32 v4, s7
+; GCN2-NEXT: v_mov_b32_e32 v5, s6
 ; GCN2-NEXT: .LBB100_2: ; %atomicrmw.start
 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v3, v1
-; GCN2-NEXT: v_mov_b32_e32 v2, v0
-; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s7
-; GCN2-NEXT: v_mov_b32_e32 v6, s6
-; GCN2-NEXT: v_mov_b32_e32 v4, s4
-; GCN2-NEXT: v_mov_b32_e32 v5, s5
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN2-NEXT: v_mov_b32_e32 v9, v1
+; GCN2-NEXT: v_mov_b32_e32 v8, v0
+; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
+; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT: s_cbranch_execnz .LBB100_2
@@ -14712,26 +14624,24 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
 ; GCN3-NEXT: s_cbranch_vccz .LBB100_4
 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v2, s4
+; GCN3-NEXT: v_mov_b32_e32 v3, s5
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v4, s7
+; GCN3-NEXT: v_mov_b32_e32 v5, s6
 ; GCN3-NEXT: .LBB100_2: ; %atomicrmw.start
 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v3, v1
-; GCN3-NEXT: v_mov_b32_e32 v2, v0
-; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s7
-; GCN3-NEXT: v_mov_b32_e32 v6, s6
-; GCN3-NEXT: v_mov_b32_e32 v4, s4
-; GCN3-NEXT: v_mov_b32_e32 v5, s5
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT: s_cbranch_execnz .LBB100_2
@@ -14785,22 +14695,20 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
 ; GCN1-NEXT: flat_load_dword v0, v[2:3]
 ; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: v_mov_b32_e32 v4, s7
+; GCN1-NEXT: v_mov_b32_e32 v5, s6
 ; GCN1-NEXT: .LBB101_2: ; %atomicrmw.start
 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v3, v1
-; GCN1-NEXT: v_mov_b32_e32 v2, v0
-; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s7
-; GCN1-NEXT: v_mov_b32_e32 v6, s6
-; GCN1-NEXT: v_mov_b32_e32 v4, s34
-; GCN1-NEXT: v_mov_b32_e32 v5, s35
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN1-NEXT: v_mov_b32_e32 v9, v1
+; GCN1-NEXT: v_mov_b32_e32 v8, v0
+; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
+; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; GCN1-NEXT: s_cbranch_execnz .LBB101_2
@@ -14853,22 +14761,20 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
 ; GCN2-NEXT: flat_load_dword v0, v[2:3]
 ; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: v_mov_b32_e32 v4, s7
+; GCN2-NEXT: v_mov_b32_e32 v5, s6
 ; GCN2-NEXT: .LBB101_2: ; %atomicrmw.start
 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v3, v1
-; GCN2-NEXT: v_mov_b32_e32 v2, v0
-; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s7
-; GCN2-NEXT: v_mov_b32_e32 v6, s6
-; GCN2-NEXT: v_mov_b32_e32 v4, s34
-; GCN2-NEXT: v_mov_b32_e32 v5, s35
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN2-NEXT: v_mov_b32_e32 v9, v1
+; GCN2-NEXT: v_mov_b32_e32 v8, v0
+; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
+; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; GCN2-NEXT: s_cbranch_execnz .LBB101_2
@@ -14909,26 +14815,24 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
 ; GCN3-NEXT: s_cbranch_vccz .LBB101_4
 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v1, s35
-; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v2, s34
+; GCN3-NEXT: v_mov_b32_e32 v3, s35
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
 ; GCN3-NEXT: s_mov_b64 s[36:37], 0
+; GCN3-NEXT: v_mov_b32_e32 v4, s7
+; GCN3-NEXT: v_mov_b32_e32 v5, s6
 ; GCN3-NEXT: .LBB101_2: ; %atomicrmw.start
 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v3, v1
-; GCN3-NEXT: v_mov_b32_e32 v2, v0
-; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s7
-; GCN3-NEXT: v_mov_b32_e32 v6, s6
-; GCN3-NEXT: v_mov_b32_e32 v4, s34
-; GCN3-NEXT: v_mov_b32_e32 v5, s35
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; GCN3-NEXT: s_cbranch_execnz .LBB101_2
@@ -14989,20 +14893,18 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
 ; GCN1-NEXT: .LBB102_2: ; %atomicrmw.phi
 ; GCN1-NEXT: s_endpgm
 ; GCN1-NEXT: .LBB102_3: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v1, s1
-; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v5, s1
+; GCN1-NEXT: v_mov_b32_e32 v4, s0
+; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: v_mov_b32_e32 v6, s3
+; GCN1-NEXT: v_mov_b32_e32 v7, s2
 ; GCN1-NEXT: .LBB102_4: ; %atomicrmw.start
 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s3
-; GCN1-NEXT: v_mov_b32_e32 v6, s2
-; GCN1-NEXT: v_mov_b32_e32 v5, s1
-; GCN1-NEXT: v_mov_b32_e32 v4, s0
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
@@ -15062,20 +14964,18 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
 ; GCN2-NEXT: .LBB102_2: ; %atomicrmw.phi
 ; GCN2-NEXT: s_endpgm
 ; GCN2-NEXT: .LBB102_3: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v5, s1
+; GCN2-NEXT: v_mov_b32_e32 v4, s0
+; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: v_mov_b32_e32 v6, s3
+; GCN2-NEXT: v_mov_b32_e32 v7, s2
 ; GCN2-NEXT: .LBB102_4: ; %atomicrmw.start
 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s3
-; GCN2-NEXT: v_mov_b32_e32 v6, s2
-; GCN2-NEXT: v_mov_b32_e32 v5, s1
-; GCN2-NEXT: v_mov_b32_e32 v4, s0
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
@@ -15134,20 +15034,18 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
 ; GCN3-NEXT: .LBB102_2: ; %atomicrmw.phi
 ; GCN3-NEXT: s_endpgm
 ; GCN3-NEXT: .LBB102_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
-; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v5, s1
+; GCN3-NEXT: v_mov_b32_e32 v4, s0
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: v_mov_b32_e32 v6, s3
+; GCN3-NEXT: v_mov_b32_e32 v7, s2
 ; GCN3-NEXT: .LBB102_4: ; %atomicrmw.start
 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s3
-; GCN3-NEXT: v_mov_b32_e32 v6, s2
-; GCN3-NEXT: v_mov_b32_e32 v5, s1
-; GCN3-NEXT: v_mov_b32_e32 v4, s0
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
@@ -15204,26 +15102,24 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
 ; GCN1-NEXT: s_cbranch_vccz .LBB103_4
 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v1, s1
-; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v3, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s0
+; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
 ; GCN1-NEXT: s_mov_b64 s[2:3], 0
+; GCN1-NEXT: v_mov_b32_e32 v4, s13
+; GCN1-NEXT: v_mov_b32_e32 v5, s12
 ; GCN1-NEXT: .LBB103_2: ; %atomicrmw.start
 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v3, v1
-; GCN1-NEXT: v_mov_b32_e32 v2, v0
-; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s13
-; GCN1-NEXT: v_mov_b32_e32 v6, s12
-; GCN1-NEXT: v_mov_b32_e32 v5, s1
-; GCN1-NEXT: v_mov_b32_e32 v4, s0
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN1-NEXT: v_mov_b32_e32 v9, v1
+; GCN1-NEXT: v_mov_b32_e32 v8, v0
+; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9]
+; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
 ; GCN1-NEXT: s_cbranch_execnz .LBB103_2
@@ -15277,26 +15173,24 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
 ; GCN2-NEXT: s_cbranch_vccz .LBB103_4
 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s0
+; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
 ; GCN2-NEXT: s_mov_b64 s[2:3], 0
+; GCN2-NEXT: v_mov_b32_e32 v4, s13
+; GCN2-NEXT: v_mov_b32_e32 v5, s12
 ; GCN2-NEXT: .LBB103_2: ; %atomicrmw.start
 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v3, v1
-; GCN2-NEXT: v_mov_b32_e32 v2, v0
-; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s13
-; GCN2-NEXT: v_mov_b32_e32 v6, s12
-; GCN2-NEXT: v_mov_b32_e32 v5, s1
-; GCN2-NEXT: v_mov_b32_e32 v4, s0
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN2-NEXT: v_mov_b32_e32 v9, v1
+; GCN2-NEXT: v_mov_b32_e32 v8, v0
+; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9]
+; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
 ; GCN2-NEXT: s_cbranch_execnz .LBB103_2
@@ -15349,26 +15243,24 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3]
 ; GCN3-NEXT: s_cbranch_vccz .LBB103_4
 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
-; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v3, s1
+; GCN3-NEXT: v_mov_b32_e32 v2, s0
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
 ; GCN3-NEXT: s_mov_b64 s[2:3], 0
+; GCN3-NEXT: v_mov_b32_e32 v4, s13
+; GCN3-NEXT: v_mov_b32_e32 v5, s12
 ; GCN3-NEXT: .LBB103_2: ; %atomicrmw.start
 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v3, v1
-; GCN3-NEXT: v_mov_b32_e32 v2, v0
-; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s13
-; GCN3-NEXT: v_mov_b32_e32 v6, s12
-; GCN3-NEXT: v_mov_b32_e32 v5, s1
-; GCN3-NEXT: v_mov_b32_e32 v4, s0
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
 ; GCN3-NEXT: s_cbranch_execnz .LBB103_2
@@ -15425,26 +15317,24 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
 ; GCN1-NEXT: s_cbranch_vccz .LBB104_4
 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v1, s1
-; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v3, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s0
+; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
 ; GCN1-NEXT: s_mov_b64 s[2:3], 0
+; GCN1-NEXT: v_mov_b32_e32 v4, s13
+; GCN1-NEXT: v_mov_b32_e32 v5, s12
 ; GCN1-NEXT: .LBB104_2: ; %atomicrmw.start
 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v3, v1
-; GCN1-NEXT: v_mov_b32_e32 v2, v0
-; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s13
-; GCN1-NEXT: v_mov_b32_e32 v6, s12
-; GCN1-NEXT: v_mov_b32_e32 v5, s1
-; GCN1-NEXT: v_mov_b32_e32 v4, s0
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN1-NEXT: v_mov_b32_e32 v9, v1
+; GCN1-NEXT: v_mov_b32_e32 v8, v0
+; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9]
+; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
 ; GCN1-NEXT: s_cbranch_execnz .LBB104_2
@@ -15496,26 +15386,24 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
 ; GCN2-NEXT: s_cbranch_vccz .LBB104_4
 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s0
+; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
 ; GCN2-NEXT: s_mov_b64 s[2:3], 0
+; GCN2-NEXT: v_mov_b32_e32 v4, s13
+; GCN2-NEXT: v_mov_b32_e32 v5, s12
 ; GCN2-NEXT: .LBB104_2: ; %atomicrmw.start
 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v3, v1
-; GCN2-NEXT: v_mov_b32_e32 v2, v0
-; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s13
-; GCN2-NEXT: v_mov_b32_e32 v6, s12
-; GCN2-NEXT: v_mov_b32_e32 v5, s1
-; GCN2-NEXT: v_mov_b32_e32 v4, s0
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN2-NEXT: v_mov_b32_e32 v9, v1
+; GCN2-NEXT: v_mov_b32_e32 v8, v0
+; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9]
+; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
 ; GCN2-NEXT: s_cbranch_execnz .LBB104_2
@@ -15566,26 +15454,24 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3]
 ; GCN3-NEXT: s_cbranch_vccz .LBB104_4
 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
-; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v3, s1
+; GCN3-NEXT: v_mov_b32_e32 v2, s0
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
 ; GCN3-NEXT: s_mov_b64 s[2:3], 0
+; GCN3-NEXT: v_mov_b32_e32 v4, s13
+; GCN3-NEXT: v_mov_b32_e32 v5, s12
 ; GCN3-NEXT: .LBB104_2: ; %atomicrmw.start
 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v3, v1
-; GCN3-NEXT: v_mov_b32_e32 v2, v0
-; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s13
-; GCN3-NEXT: v_mov_b32_e32 v6, s12
-; GCN3-NEXT: v_mov_b32_e32 v5, s1
-; GCN3-NEXT: v_mov_b32_e32 v4, s0
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
 ; GCN3-NEXT: s_cbranch_execnz .LBB104_2
@@ -16733,16 +16619,14 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
 ; GCN1-NEXT: flat_load_dword v2, v[4:5]
 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: v_mov_b32_e32 v6, s7
+; GCN1-NEXT: v_mov_b32_e32 v7, s6
 ; GCN1-NEXT: .LBB111_4: ; %atomicrmw.start
 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s7
-; GCN1-NEXT: v_mov_b32_e32 v6, s6
-; GCN1-NEXT: v_mov_b32_e32 v4, s4
-; GCN1-NEXT: v_mov_b32_e32 v5, s5
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
@@ -16801,16 +16685,14 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
 ; GCN2-NEXT: flat_load_dword v2, v[4:5]
 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: v_mov_b32_e32 v6, s7
+; GCN2-NEXT: v_mov_b32_e32 v7, s6
 ; GCN2-NEXT: .LBB111_4: ; %atomicrmw.start
 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s7
-; GCN2-NEXT: v_mov_b32_e32 v6, s6
-; GCN2-NEXT: v_mov_b32_e32 v4, s4
-; GCN2-NEXT: v_mov_b32_e32 v5, s5
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
@@ -16857,20 +16739,18 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN3-NEXT: .LBB111_2: ; %atomicrmw.phi
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 ; GCN3-NEXT: .LBB111_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v4, s4
+; GCN3-NEXT: v_mov_b32_e32 v5, s5
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v6, s7
+; GCN3-NEXT: v_mov_b32_e32 v7, s6
 ; GCN3-NEXT: .LBB111_4: ; %atomicrmw.start
 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s7
-; GCN3-NEXT: v_mov_b32_e32 v6, s6
-; GCN3-NEXT: v_mov_b32_e32 v4, s4
-; GCN3-NEXT: v_mov_b32_e32 v5, s5
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
@@ -16932,16 +16812,14 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
 ; GCN1-NEXT: flat_load_dword v2, v[4:5]
 ; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: v_mov_b32_e32 v6, s7
+; GCN1-NEXT: v_mov_b32_e32 v7, s6
 ; GCN1-NEXT: .LBB112_4: ; %atomicrmw.start
 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s7
-; GCN1-NEXT: v_mov_b32_e32 v6, s6
-; GCN1-NEXT: v_mov_b32_e32 v4, s34
-; GCN1-NEXT: v_mov_b32_e32 v5, s35
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
@@ -17002,16 +16880,14 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
 ; GCN2-NEXT: flat_load_dword v2, v[4:5]
 ; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: v_mov_b32_e32 v6, s7
+; GCN2-NEXT: v_mov_b32_e32 v7, s6
 ; GCN2-NEXT: .LBB112_4: ; %atomicrmw.start
 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s7
-; GCN2-NEXT: v_mov_b32_e32 v6, s6
-; GCN2-NEXT: v_mov_b32_e32 v4, s34
-; GCN2-NEXT: v_mov_b32_e32 v5, s35
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
@@ -17060,20 +16936,18 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT: .LBB112_2: ; %atomicrmw.phi
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 ; GCN3-NEXT: .LBB112_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v1, s35
-; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v4, s34
+; GCN3-NEXT: v_mov_b32_e32 v5, s35
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
 ; GCN3-NEXT: s_mov_b64 s[36:37], 0
+; GCN3-NEXT: v_mov_b32_e32 v6, s7
+; GCN3-NEXT: v_mov_b32_e32 v7, s6
 ; GCN3-NEXT: .LBB112_4: ; %atomicrmw.start
 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s7
-; GCN3-NEXT: v_mov_b32_e32 v6, s6
-; GCN3-NEXT: v_mov_b32_e32 v4, s34
-; GCN3-NEXT: v_mov_b32_e32 v5, s35
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
@@ -17128,22 +17002,20 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
 ; GCN1-NEXT: flat_load_dword v0, v[2:3]
 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: v_mov_b32_e32 v4, s7
+; GCN1-NEXT: v_mov_b32_e32 v5, s6
 ; GCN1-NEXT: .LBB113_2: ; %atomicrmw.start
 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v3, v1
-; GCN1-NEXT: v_mov_b32_e32 v2, v0
-; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s7
-; GCN1-NEXT: v_mov_b32_e32 v6, s6
-; GCN1-NEXT: v_mov_b32_e32 v4, s4
-; GCN1-NEXT: v_mov_b32_e32 v5, s5
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN1-NEXT: v_mov_b32_e32 v9, v1
+; GCN1-NEXT: v_mov_b32_e32 v8, v0
+; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
+; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT: s_cbranch_execnz .LBB113_2
@@ -17194,22 +17066,20 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
 ; GCN2-NEXT: flat_load_dword v0, v[2:3]
 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: v_mov_b32_e32 v4, s7
+; GCN2-NEXT: v_mov_b32_e32 v5, s6
 ; GCN2-NEXT: .LBB113_2: ; %atomicrmw.start
 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v3, v1
-; GCN2-NEXT: v_mov_b32_e32 v2, v0
-; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s7
-; GCN2-NEXT: v_mov_b32_e32 v6, s6
-; GCN2-NEXT: v_mov_b32_e32 v4, s4
-; GCN2-NEXT: v_mov_b32_e32 v5, s5
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN2-NEXT: v_mov_b32_e32 v9, v1
+; GCN2-NEXT: v_mov_b32_e32 v8, v0
+; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
+; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT: s_cbranch_execnz .LBB113_2
@@ -17248,26 +17118,24 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
 ; GCN3-NEXT: s_cbranch_vccz .LBB113_4
 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v2, s4
+; GCN3-NEXT: v_mov_b32_e32 v3, s5
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v4, s7
+; GCN3-NEXT: v_mov_b32_e32 v5, s6
 ; GCN3-NEXT: .LBB113_2: ; %atomicrmw.start
 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v3, v1
-; GCN3-NEXT: v_mov_b32_e32 v2, v0
-; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s7
-; GCN3-NEXT: v_mov_b32_e32 v6, s6
-; GCN3-NEXT: v_mov_b32_e32 v4, s4
-; GCN3-NEXT: v_mov_b32_e32 v5, s5
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT: s_cbranch_execnz .LBB113_2
@@ -17321,22 +17189,20 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
 ; GCN1-NEXT: flat_load_dword v0, v[2:3]
 ; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: v_mov_b32_e32 v4, s7
+; GCN1-NEXT: v_mov_b32_e32 v5, s6
 ; GCN1-NEXT: .LBB114_2: ; %atomicrmw.start
 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v3, v1
-; GCN1-NEXT: v_mov_b32_e32 v2, v0
-; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s7
-; GCN1-NEXT: v_mov_b32_e32 v6, s6
-; GCN1-NEXT: v_mov_b32_e32 v4, s34
-; GCN1-NEXT: v_mov_b32_e32 v5, s35
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN1-NEXT: v_mov_b32_e32 v9, v1
+; GCN1-NEXT: v_mov_b32_e32 v8, v0
+; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
+; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; GCN1-NEXT: s_cbranch_execnz .LBB114_2
@@ -17389,22 +17255,20 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
 ; GCN2-NEXT: flat_load_dword v0, v[2:3]
 ; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: v_mov_b32_e32 v4, s7
+; GCN2-NEXT: v_mov_b32_e32 v5, s6
 ; GCN2-NEXT: .LBB114_2: ; %atomicrmw.start
 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v3, v1
-; GCN2-NEXT: v_mov_b32_e32 v2, v0
-; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s7
-; GCN2-NEXT: v_mov_b32_e32 v6, s6
-; GCN2-NEXT: v_mov_b32_e32 v4, s34
-; GCN2-NEXT: v_mov_b32_e32 v5, s35
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN2-NEXT: v_mov_b32_e32 v9, v1
+; GCN2-NEXT: v_mov_b32_e32 v8, v0
+; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
+; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; GCN2-NEXT: s_cbranch_execnz .LBB114_2
@@ -17445,26 +17309,24 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
 ; GCN3-NEXT: s_cbranch_vccz .LBB114_4
 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v1, s35
-; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v2, s34
+; GCN3-NEXT: v_mov_b32_e32 v3, s35
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
 ; GCN3-NEXT: s_mov_b64 s[36:37], 0
+; GCN3-NEXT: v_mov_b32_e32 v4, s7
+; GCN3-NEXT: v_mov_b32_e32 v5, s6
 ; GCN3-NEXT: .LBB114_2: ; %atomicrmw.start
 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v3, v1
-; GCN3-NEXT: v_mov_b32_e32 v2, v0
-; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s7
-; GCN3-NEXT: v_mov_b32_e32 v6, s6
-; GCN3-NEXT: v_mov_b32_e32 v4, s34
-; GCN3-NEXT: v_mov_b32_e32 v5, s35
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; GCN3-NEXT: s_cbranch_execnz .LBB114_2
@@ -18608,16 +18470,14 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
 ; GCN1-NEXT: flat_load_dword v2, v[4:5]
 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: v_mov_b32_e32 v6, s7
+; GCN1-NEXT: v_mov_b32_e32 v7, s6
 ; GCN1-NEXT: .LBB121_4: ; %atomicrmw.start
 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s7
-; GCN1-NEXT: v_mov_b32_e32 v6, s6
-; GCN1-NEXT: v_mov_b32_e32 v4, s4
-; GCN1-NEXT: v_mov_b32_e32 v5, s5
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
@@ -18676,16 +18536,14 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
 ; GCN2-NEXT: flat_load_dword v2, v[4:5]
 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: v_mov_b32_e32 v6, s7
+; GCN2-NEXT: v_mov_b32_e32 v7, s6
 ; GCN2-NEXT: .LBB121_4: ; %atomicrmw.start
 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s7
-; GCN2-NEXT: v_mov_b32_e32 v6, s6
-; GCN2-NEXT: v_mov_b32_e32 v4, s4
-; GCN2-NEXT: v_mov_b32_e32 v5, s5
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
@@ -18732,20 +18590,18 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN3-NEXT: .LBB121_2: ; %atomicrmw.phi
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 ; GCN3-NEXT: .LBB121_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v4, s4
+; GCN3-NEXT: v_mov_b32_e32 v5, s5
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v6, s7
+; GCN3-NEXT: v_mov_b32_e32 v7, s6
 ; GCN3-NEXT: .LBB121_4: ; %atomicrmw.start
 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s7
-; GCN3-NEXT: v_mov_b32_e32 v6, s6
-; GCN3-NEXT: v_mov_b32_e32 v4, s4
-; GCN3-NEXT: v_mov_b32_e32 v5, s5
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
@@ -18807,16 +18663,14 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
 ; GCN1-NEXT: flat_load_dword v2, v[4:5]
 ; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: v_mov_b32_e32 v6, s7
+; GCN1-NEXT: v_mov_b32_e32 v7, s6
 ; GCN1-NEXT: .LBB122_4: ; %atomicrmw.start
 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s7
-; GCN1-NEXT: v_mov_b32_e32 v6, s6
-; GCN1-NEXT: v_mov_b32_e32 v4, s34
-; GCN1-NEXT: v_mov_b32_e32 v5, s35
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
@@ -18877,16 +18731,14 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
 ; GCN2-NEXT: flat_load_dword v2, v[4:5]
 ; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: v_mov_b32_e32 v6, s7
+; GCN2-NEXT: v_mov_b32_e32 v7, s6
 ; GCN2-NEXT: .LBB122_4: ; %atomicrmw.start
 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s7
-; GCN2-NEXT: v_mov_b32_e32 v6, s6
-; GCN2-NEXT: v_mov_b32_e32 v4, s34
-; GCN2-NEXT: v_mov_b32_e32 v5, s35
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
@@ -18935,20 +18787,18 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT: .LBB122_2: ; %atomicrmw.phi
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 ; GCN3-NEXT: .LBB122_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v1, s35
-; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v4, s34
+; GCN3-NEXT: v_mov_b32_e32 v5, s35
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
 ; GCN3-NEXT: s_mov_b64 s[36:37], 0
+; GCN3-NEXT: v_mov_b32_e32 v6, s7
+; GCN3-NEXT: v_mov_b32_e32 v7, s6
 ; GCN3-NEXT: .LBB122_4: ; %atomicrmw.start
 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s7
-; GCN3-NEXT: v_mov_b32_e32 v6, s6
-; GCN3-NEXT: v_mov_b32_e32 v4, s34
-; GCN3-NEXT: v_mov_b32_e32 v5, s35
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
@@ -19003,22 +18853,20 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
 ; GCN1-NEXT: flat_load_dword v0, v[2:3]
 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: v_mov_b32_e32 v4, s7
+; GCN1-NEXT: v_mov_b32_e32 v5, s6
 ; GCN1-NEXT: .LBB123_2: ; %atomicrmw.start
 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v3, v1
-; GCN1-NEXT: v_mov_b32_e32 v2, v0
-; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s7
-; GCN1-NEXT: v_mov_b32_e32 v6, s6
-; GCN1-NEXT: v_mov_b32_e32 v4, s4
-; GCN1-NEXT: v_mov_b32_e32 v5, s5
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN1-NEXT: v_mov_b32_e32 v9, v1
+; GCN1-NEXT: v_mov_b32_e32 v8, v0
+; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
+; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT: s_cbranch_execnz .LBB123_2
@@ -19069,22 +18917,20 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
 ; GCN2-NEXT: flat_load_dword v0, v[2:3]
 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: v_mov_b32_e32 v4, s7
+; GCN2-NEXT: v_mov_b32_e32 v5, s6
 ; GCN2-NEXT: .LBB123_2: ; %atomicrmw.start
 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v3, v1
-; GCN2-NEXT: v_mov_b32_e32 v2, v0
-; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s7
-; GCN2-NEXT: v_mov_b32_e32 v6, s6
-; GCN2-NEXT: v_mov_b32_e32 v4, s4
-; GCN2-NEXT: v_mov_b32_e32 v5, s5
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN2-NEXT: v_mov_b32_e32 v9, v1
+; GCN2-NEXT: v_mov_b32_e32 v8, v0
+; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
+; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT: s_cbranch_execnz .LBB123_2
@@ -19123,26 +18969,24 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
 ; GCN3-NEXT: s_cbranch_vccz .LBB123_4
 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v2, s4
+; GCN3-NEXT: v_mov_b32_e32 v3, s5
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v4, s7
+; GCN3-NEXT: v_mov_b32_e32 v5, s6
 ; GCN3-NEXT: .LBB123_2: ; %atomicrmw.start
 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v3, v1
-; GCN3-NEXT: v_mov_b32_e32 v2, v0
-; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s7
-; GCN3-NEXT: v_mov_b32_e32 v6, s6
-; GCN3-NEXT: v_mov_b32_e32 v4, s4
-; GCN3-NEXT: v_mov_b32_e32 v5, s5
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT: s_cbranch_execnz .LBB123_2
@@ -19196,22 +19040,20 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
 ; GCN1-NEXT: flat_load_dword v0, v[2:3]
 ; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: v_mov_b32_e32 v4, s7
+; GCN1-NEXT: v_mov_b32_e32 v5, s6
 ; GCN1-NEXT: .LBB124_2: ; %atomicrmw.start
 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v3, v1
-; GCN1-NEXT: v_mov_b32_e32 v2, v0
-; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s7
-; GCN1-NEXT: v_mov_b32_e32 v6, s6
-; GCN1-NEXT: v_mov_b32_e32 v4, s34
-; GCN1-NEXT: v_mov_b32_e32 v5, s35
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN1-NEXT: v_mov_b32_e32 v9, v1
+; GCN1-NEXT: v_mov_b32_e32 v8, v0
+; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
+; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; GCN1-NEXT: s_cbranch_execnz .LBB124_2
@@ -19264,22 +19106,20 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
 ; GCN2-NEXT: flat_load_dword v0, v[2:3]
 ; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: v_mov_b32_e32 v4, s7
+; GCN2-NEXT: v_mov_b32_e32 v5, s6
 ; GCN2-NEXT: .LBB124_2: ; %atomicrmw.start
 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v3, v1
-; GCN2-NEXT: v_mov_b32_e32 v2, v0
-; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s7
-; GCN2-NEXT: v_mov_b32_e32 v6, s6
-; GCN2-NEXT: v_mov_b32_e32 v4, s34
-; GCN2-NEXT: v_mov_b32_e32 v5, s35
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN2-NEXT: v_mov_b32_e32 v9, v1
+; GCN2-NEXT: v_mov_b32_e32 v8, v0
+; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
+; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; GCN2-NEXT: s_cbranch_execnz .LBB124_2
@@ -19320,26 +19160,24 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
 ; GCN3-NEXT: s_cbranch_vccz .LBB124_4
 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v1, s35
-; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v2, s34
+; GCN3-NEXT: v_mov_b32_e32 v3, s35
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
 ; GCN3-NEXT: s_mov_b64 s[36:37], 0
+; GCN3-NEXT: v_mov_b32_e32 v4, s7
+; GCN3-NEXT: v_mov_b32_e32 v5, s6
 ; GCN3-NEXT: .LBB124_2: ; %atomicrmw.start
 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v3, v1
-; GCN3-NEXT: v_mov_b32_e32 v2, v0
-; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s7
-; GCN3-NEXT: v_mov_b32_e32 v6, s6
-; GCN3-NEXT: v_mov_b32_e32 v4, s34
-; GCN3-NEXT: v_mov_b32_e32 v5, s35
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; GCN3-NEXT: s_cbranch_execnz .LBB124_2
@@ -19400,20 +19238,18 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GCN1-NEXT: .LBB125_2: ; %atomicrmw.phi
 ; GCN1-NEXT: s_endpgm
 ; GCN1-NEXT: .LBB125_3: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v1, s1
-; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v5, s1
+; GCN1-NEXT: v_mov_b32_e32 v4, s0
+; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: v_mov_b32_e32 v6, s3
+; GCN1-NEXT: v_mov_b32_e32 v7, s2
 ; GCN1-NEXT: .LBB125_4: ; %atomicrmw.start
 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s3
-; GCN1-NEXT: v_mov_b32_e32 v6, s2
-; GCN1-NEXT: v_mov_b32_e32 v5, s1
-; GCN1-NEXT: v_mov_b32_e32 v4, s0
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; 
GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -19473,20 +19309,18 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: .LBB125_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB125_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: v_mov_b32_e32 v6, s3 +; GCN2-NEXT: v_mov_b32_e32 v7, s2 ; GCN2-NEXT: .LBB125_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s3 -; GCN2-NEXT: v_mov_b32_e32 v6, s2 -; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -19545,20 +19379,18 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN3-NEXT: .LBB125_2: ; %atomicrmw.phi ; GCN3-NEXT: s_endpgm ; GCN3-NEXT: .LBB125_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v5, s1 +; GCN3-NEXT: v_mov_b32_e32 v4, s0 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: v_mov_b32_e32 v6, s3 +; GCN3-NEXT: v_mov_b32_e32 v7, s2 ; GCN3-NEXT: .LBB125_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s3 -; GCN3-NEXT: v_mov_b32_e32 v6, s2 -; GCN3-NEXT: v_mov_b32_e32 v5, s1 -; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -19615,26 +19447,24 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB126_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s13 +; GCN1-NEXT: v_mov_b32_e32 v5, s12 ; GCN1-NEXT: .LBB126_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s13 -; GCN1-NEXT: v_mov_b32_e32 v6, s12 -; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: 
v_mov_b32_e32 v4, s0 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] +; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN1-NEXT: s_cbranch_execnz .LBB126_2 @@ -19688,26 +19518,24 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB126_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 +; GCN2-NEXT: v_mov_b32_e32 v5, s12 ; GCN2-NEXT: .LBB126_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s13 -; GCN2-NEXT: v_mov_b32_e32 v6, s12 -; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] +; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN2-NEXT: s_cbranch_execnz .LBB126_2 @@ -19760,26 +19588,24 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN3-NEXT: s_cbranch_vccz .LBB126_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: v_mov_b32_e32 v4, s13 +; GCN3-NEXT: v_mov_b32_e32 v5, s12 ; GCN3-NEXT: .LBB126_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s13 -; GCN3-NEXT: v_mov_b32_e32 v6, s12 -; GCN3-NEXT: v_mov_b32_e32 v5, s1 -; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; 
GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] +; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN3-NEXT: s_cbranch_execnz .LBB126_2 @@ -19839,20 +19665,18 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN1-NEXT: .LBB127_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB127_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v5, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 +; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v6, s3 +; GCN1-NEXT: v_mov_b32_e32 v7, s2 ; GCN1-NEXT: .LBB127_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s3 -; GCN1-NEXT: v_mov_b32_e32 v6, s2 -; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -19906,20 +19730,18 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN2-NEXT: .LBB127_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB127_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: v_mov_b32_e32 v6, s3 +; GCN2-NEXT: v_mov_b32_e32 v7, s2 ; GCN2-NEXT: .LBB127_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s3 -; GCN2-NEXT: v_mov_b32_e32 v6, s2 -; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -19972,20 +19794,18 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN3-NEXT: .LBB127_2: ; %atomicrmw.phi ; GCN3-NEXT: s_endpgm ; GCN3-NEXT: .LBB127_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v5, s1 +; GCN3-NEXT: v_mov_b32_e32 v4, s0 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: v_mov_b32_e32 v6, s3 +; GCN3-NEXT: v_mov_b32_e32 v7, 
s2 ; GCN3-NEXT: .LBB127_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s3 -; GCN3-NEXT: v_mov_b32_e32 v6, s2 -; GCN3-NEXT: v_mov_b32_e32 v5, s1 -; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -20038,26 +19858,24 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB128_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s13 +; GCN1-NEXT: v_mov_b32_e32 v5, s12 ; GCN1-NEXT: .LBB128_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s13 -; GCN1-NEXT: v_mov_b32_e32 v6, s12 -; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] +; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN1-NEXT: s_cbranch_execnz .LBB128_2 @@ -20109,26 +19927,24 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB128_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 +; GCN2-NEXT: v_mov_b32_e32 v5, s12 ; GCN2-NEXT: .LBB128_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s13 -; GCN2-NEXT: v_mov_b32_e32 v6, s12 -; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: 
v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] +; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN2-NEXT: s_cbranch_execnz .LBB128_2 @@ -20179,26 +19995,24 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN3-NEXT: s_cbranch_vccz .LBB128_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: v_mov_b32_e32 v4, s13 +; GCN3-NEXT: v_mov_b32_e32 v5, s12 ; GCN3-NEXT: .LBB128_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s13 -; GCN3-NEXT: v_mov_b32_e32 v6, s12 -; GCN3-NEXT: v_mov_b32_e32 v5, s1 -; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] +; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN3-NEXT: s_cbranch_execnz .LBB128_2 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll index 4a4fcfe5afcc8..fe47461ebf956 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll @@ -2187,14 +2187,14 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, s7, v3 ; GFX7-NEXT: v_and_b32_e32 v6, s6, v2 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: v_not_b32_e32 v1, v0 ; GFX7-NEXT: v_not_b32_e32 v0, v6 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -2221,14 +2221,14 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 
v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, s7, v3 ; GFX8-NEXT: v_and_b32_e32 v6, s6, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: v_not_b32_e32 v1, v0 ; GFX8-NEXT: v_not_b32_e32 v0, v6 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -2250,14 +2250,14 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, s7, v3 ; GFX9-NEXT: v_and_b32_e32 v6, s6, v2 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: v_not_b32_e32 v1, v0 ; GFX9-NEXT: v_not_b32_e32 v0, v6 ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -2290,14 +2290,12 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: v_mov_b32_e32 v5, s35 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: flat_load_dword v2, v[4:5] -; GFX7-NEXT: s_mov_b64 s[36:37], 0 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, s7, v3 ; GFX7-NEXT: v_and_b32_e32 v6, s6, v2 -; GFX7-NEXT: v_mov_b32_e32 v4, s34 -; GFX7-NEXT: v_mov_b32_e32 v5, s35 ; GFX7-NEXT: v_not_b32_e32 v1, v0 ; GFX7-NEXT: v_not_b32_e32 v0, v6 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -2305,12 +2303,12 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB55_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_nand_i64_noret_offset_scalar: @@ -2326,14 +2324,12 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: v_mov_b32_e32 v5, s35 ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[4:5] -; GFX8-NEXT: s_mov_b64 s[36:37], 0 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, s7, v3 ; GFX8-NEXT: v_and_b32_e32 v6, s6, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, s34 -; GFX8-NEXT: v_mov_b32_e32 v5, s35 ; GFX8-NEXT: v_not_b32_e32 v1, v0 ; GFX8-NEXT: v_not_b32_e32 v0, v6 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -2341,12 +2337,12 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 
-; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB55_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_nand_i64_noret_offset_scalar: @@ -2355,14 +2351,14 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, s7, v3 ; GFX9-NEXT: v_and_b32_e32 v6, s6, v2 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: v_not_b32_e32 v1, v0 ; GFX9-NEXT: v_not_b32_e32 v0, v6 ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc @@ -2394,22 +2390,22 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: v_and_b32_e32 v0, s7, v3 -; GFX7-NEXT: v_and_b32_e32 v6, s6, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-NEXT: v_not_b32_e32 v1, v0 -; GFX7-NEXT: v_not_b32_e32 v0, v6 -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_and_b32_e32 v0, s7, v7 +; GFX7-NEXT: v_and_b32_e32 v1, s6, v6 +; GFX7-NEXT: v_not_b32_e32 v5, v0 +; GFX7-NEXT: v_not_b32_e32 v4, v1 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB56_1 @@ -2428,22 +2424,22 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_and_b32_e32 v0, s7, v3 -; GFX8-NEXT: v_and_b32_e32 v6, s6, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: v_not_b32_e32 v1, v0 -; GFX8-NEXT: v_not_b32_e32 v0, v6 -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_and_b32_e32 v0, s7, 
v7 +; GFX8-NEXT: v_and_b32_e32 v1, s6, v6 +; GFX8-NEXT: v_not_b32_e32 v5, v0 +; GFX8-NEXT: v_not_b32_e32 v4, v1 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB56_1 @@ -2457,22 +2453,22 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_and_b32_e32 v0, s7, v3 -; GFX9-NEXT: v_and_b32_e32 v6, s6, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_not_b32_e32 v1, v0 -; GFX9-NEXT: v_not_b32_e32 v0, v6 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_and_b32_e32 v0, s7, v7 +; GFX9-NEXT: v_and_b32_e32 v1, s6, v6 +; GFX9-NEXT: v_not_b32_e32 v5, v0 +; GFX9-NEXT: v_not_b32_e32 v4, v1 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB56_1 @@ -2497,27 +2493,25 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[2:3] -; GFX7-NEXT: s_mov_b64 s[36:37], 0 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v4, s34 -; GFX7-NEXT: v_and_b32_e32 v0, s7, v3 -; GFX7-NEXT: v_and_b32_e32 v6, s6, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, s35 -; GFX7-NEXT: v_not_b32_e32 v1, v0 -; GFX7-NEXT: v_not_b32_e32 v0, v6 -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_and_b32_e32 v0, s7, v7 +; GFX7-NEXT: v_and_b32_e32 v1, s6, v6 +; GFX7-NEXT: v_not_b32_e32 v5, v0 +; GFX7-NEXT: v_not_b32_e32 v4, v1 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB57_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: 
flat_atomic_nand_i64_ret_offset_scalar: @@ -2533,27 +2527,25 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[2:3] -; GFX8-NEXT: s_mov_b64 s[36:37], 0 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s34 -; GFX8-NEXT: v_and_b32_e32 v0, s7, v3 -; GFX8-NEXT: v_and_b32_e32 v6, s6, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, s35 -; GFX8-NEXT: v_not_b32_e32 v1, v0 -; GFX8-NEXT: v_not_b32_e32 v0, v6 -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_and_b32_e32 v0, s7, v7 +; GFX8-NEXT: v_and_b32_e32 v1, s6, v6 +; GFX8-NEXT: v_not_b32_e32 v5, v0 +; GFX8-NEXT: v_not_b32_e32 v4, v1 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_nand_i64_ret_offset_scalar: @@ -2562,22 +2554,22 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_and_b32_e32 v0, s7, v3 -; GFX9-NEXT: v_and_b32_e32 v6, s6, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_not_b32_e32 v1, v0 -; GFX9-NEXT: v_not_b32_e32 v0, v6 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_and_b32_e32 v0, s7, v7 +; GFX9-NEXT: v_and_b32_e32 v1, s6, v6 +; GFX9-NEXT: v_not_b32_e32 v5, v0 +; GFX9-NEXT: v_not_b32_e32 v4, v1 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB57_1 @@ -3857,17 +3849,17 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; 
GFX7-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -3892,17 +3884,17 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol @@ -3922,17 +3914,17 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -3963,28 +3955,26 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: v_mov_b32_e32 v5, s35 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: flat_load_dword v2, v[4:5] -; GFX7-NEXT: s_mov_b64 s[36:37], 0 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 ; GFX7-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s34 -; GFX7-NEXT: v_mov_b32_e32 v5, s35 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: 
v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB85_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_max_i64_noret_offset_scalar: @@ -4000,28 +3990,26 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: v_mov_b32_e32 v5, s35 ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[4:5] -; GFX8-NEXT: s_mov_b64 s[36:37], 0 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 ; GFX8-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s34 -; GFX8-NEXT: v_mov_b32_e32 v5, s35 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB85_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_max_i64_noret_offset_scalar: @@ -4030,17 +4018,17 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4070,23 +4058,23 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-NEXT: 
v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB86_1 @@ -4105,23 +4093,23 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB86_1 @@ -4135,23 +4123,23 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, 
v0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB86_1 @@ -4176,28 +4164,26 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[2:3] -; GFX7-NEXT: s_mov_b64 s[36:37], 0 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s34 -; GFX7-NEXT: v_mov_b32_e32 v5, s35 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB87_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_max_i64_ret_offset_scalar: @@ -4213,28 +4199,26 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[2:3] -; GFX8-NEXT: s_mov_b64 s[36:37], 0 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s34 -; GFX8-NEXT: v_mov_b32_e32 v5, s35 -; GFX8-NEXT: v_cndmask_b32_e32 
v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB87_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_max_i64_ret_offset_scalar: @@ -4243,23 +4227,23 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB87_1 @@ -4282,28 +4266,26 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 ; GFX7-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s3 -; GFX7-NEXT: v_mov_b32_e32 v6, s2 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; 
GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7-NEXT: s_cbranch_execnz .LBB88_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -4318,28 +4300,26 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 ; GFX8-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_mov_b32_e32 v6, s2 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_cbranch_execnz .LBB88_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -4352,28 +4332,26 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX9-NEXT: s_add_u32 s0, s0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s3 +; GFX9-NEXT: v_mov_b32_e32 v7, s2 ; GFX9-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: 
buffer_wbinvl1_vol
 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX9-NEXT: s_cbranch_execnz .LBB88_1
 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT: s_endpgm
@@ -4396,32 +4374,30 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX7-NEXT: s_addc_u32 s1, s1, 0
 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX7-NEXT: s_mov_b64 s[6:7], 0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s4
 ; GFX7-NEXT: .LBB89_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s5
-; GFX7-NEXT: v_mov_b32_e32 v6, s4
-; GFX7-NEXT: v_mov_b32_e32 v5, s1
-; GFX7-NEXT: v_mov_b32_e32 v4, s0
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: v_mov_b32_e32 v9, v3
+; GFX7-NEXT: v_mov_b32_e32 v8, v2
+; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX7-NEXT: s_cbranch_execnz .LBB89_1
 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX7-NEXT: s_endpgm
 ;
 ; GFX8-LABEL: atomic_max_i64_ret_addr64_offset:
@@ -4435,69 +4411,65 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX8-NEXT: s_addc_u32 s1, s1, 0
 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s4
 ; GFX8-NEXT: .LBB89_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s5
-; GFX8-NEXT: v_mov_b32_e32 v6, s4
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX8-NEXT: s_cbranch_execnz .LBB89_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: atomic_max_i64_ret_addr64_offset:
 ; GFX9: ; %bb.0: ; %entry
 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
 ; GFX9-NEXT: s_add_u32 s0, s8, s0
 ; GFX9-NEXT: s_addc_u32 s1, s9, s1
 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
+; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s13
+; GFX9-NEXT: v_mov_b32_e32 v5, s12
 ; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s13
-; GFX9-NEXT: v_mov_b32_e32 v6, s12
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
+; GFX9-NEXT: v_mov_b32_e32 v9, v3
+; GFX9-NEXT: v_mov_b32_e32 v8, v2
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9]
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX9-NEXT: s_cbranch_execnz .LBB89_1
 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v2, s10
-; GFX9-NEXT: v_mov_b32_e32 v3, s11
-; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s10
+; GFX9-NEXT: v_mov_b32_e32 v1, s11
+; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX9-NEXT: s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -4516,28 +4488,26 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX7-NEXT: s_add_u32 s0, s0, s4
 ; GFX7-NEXT: s_addc_u32 s1, s1, s5
-; GFX7-NEXT: v_mov_b32_e32 v0, s0
-; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s3
+; GFX7-NEXT: v_mov_b32_e32 v7, s2
 ; GFX7-NEXT: .LBB90_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s3
-; GFX7-NEXT: v_mov_b32_e32 v6, s2
-; GFX7-NEXT: v_mov_b32_e32 v5, s1
-; GFX7-NEXT: v_mov_b32_e32 v4, s0
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1_vol
 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
 ; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX7-NEXT: s_cbranch_execnz .LBB90_1
 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT: s_endpgm
@@ -4550,28 +4520,26 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX8-NEXT: s_add_u32 s0, s0, s4
 ; GFX8-NEXT: s_addc_u32 s1, s1, s5
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NEXT: v_mov_b32_e32 v7, s2
 ; GFX8-NEXT: .LBB90_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NEXT: v_mov_b32_e32 v6, s2
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1_vol
 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
 ; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX8-NEXT: s_cbranch_execnz .LBB90_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT: s_endpgm
@@ -4584,28 +4552,26 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX9-NEXT: s_add_u32 s0, s0, s4
 ; GFX9-NEXT: s_addc_u32 s1, s1, s5
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s1
+; GFX9-NEXT: v_mov_b32_e32 v4, s0
+; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-NEXT: v_mov_b32_e32 v6, s3
+; GFX9-NEXT: v_mov_b32_e32 v7, s2
 ; GFX9-NEXT: .LBB90_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v6, s2
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX9-NEXT: s_cbranch_execnz .LBB90_1
 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT: s_endpgm
@@ -4625,32 +4591,30 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GFX7-NEXT: s_addc_u32 s1, s1, s7
 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX7-NEXT: s_mov_b64 s[6:7], 0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s4
 ; GFX7-NEXT: .LBB91_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s5
-; GFX7-NEXT: v_mov_b32_e32 v6, s4
-; GFX7-NEXT: v_mov_b32_e32 v5, s1
-; GFX7-NEXT: v_mov_b32_e32 v4, s0
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: v_mov_b32_e32 v9, v3
+; GFX7-NEXT: v_mov_b32_e32 v8, v2
+; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX7-NEXT: s_cbranch_execnz .LBB91_1
 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX7-NEXT: s_endpgm
 ;
 ; GFX8-LABEL: atomic_max_i64_ret_addr64:
@@ -4662,69 +4626,65 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GFX8-NEXT: s_addc_u32 s1, s1, s7
 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s4
 ; GFX8-NEXT: .LBB91_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s5
-; GFX8-NEXT: v_mov_b32_e32 v6, s4
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX8-NEXT: s_cbranch_execnz .LBB91_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: atomic_max_i64_ret_addr64:
 ; GFX9: ; %bb.0: ; %entry
 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
 ; GFX9-NEXT: s_add_u32 s0, s8, s0
 ; GFX9-NEXT: s_addc_u32 s1, s9, s1
 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s13
+; GFX9-NEXT: v_mov_b32_e32 v5, s12
 ; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s13
-; GFX9-NEXT: v_mov_b32_e32 v6, s12
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX9-NEXT: v_mov_b32_e32 v9, v3
+; GFX9-NEXT: v_mov_b32_e32 v8, v2
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9]
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX9-NEXT: s_cbranch_execnz .LBB91_1
 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v2, s10
-; GFX9-NEXT: v_mov_b32_e32 v3, s11
-; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s10
+; GFX9-NEXT: v_mov_b32_e32 v1, s11
+; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX9-NEXT: s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -5271,17 +5231,17 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GFX7-NEXT: v_mov_b32_e32 v4, s35
 ; GFX7-NEXT: flat_load_dword v2, v[0:1]
 ; GFX7-NEXT: flat_load_dword v3, v[3:4]
+; GFX7-NEXT: v_mov_b32_e32 v4, s4
 ; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s7
+; GFX7-NEXT: v_mov_b32_e32 v7, s6
+; GFX7-NEXT: v_mov_b32_e32 v5, s5
 ; GFX7-NEXT: .LBB98_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s7
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
-; GFX7-NEXT: v_mov_b32_e32 v4, s4
-; GFX7-NEXT: v_mov_b32_e32 v5, s5
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1_vol
@@ -5306,17 +5266,17 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GFX8-NEXT: v_mov_b32_e32 v4, s35
 ; GFX8-NEXT: flat_load_dword v2, v[0:1]
 ; GFX8-NEXT: flat_load_dword v3, v[3:4]
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
 ; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s7
+; GFX8-NEXT: v_mov_b32_e32 v7, s6
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
 ; GFX8-NEXT: .LBB98_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s7
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1_vol
@@ -5336,17 +5296,17 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v6, s7
+; GFX9-NEXT: v_mov_b32_e32 v7, s6
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
 ; GFX9-NEXT: .LBB98_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
@@ -5377,28 +5337,26 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out,
 ; GFX7-NEXT: v_mov_b32_e32 v5, s35
 ; GFX7-NEXT: flat_load_dword v3, v[0:1]
 ; GFX7-NEXT: flat_load_dword v2, v[4:5]
-; GFX7-NEXT: s_mov_b64 s[36:37], 0
+; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s7
+; GFX7-NEXT: v_mov_b32_e32 v7, s6
 ; GFX7-NEXT: .LBB99_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s7
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
-; GFX7-NEXT: v_mov_b32_e32 v4, s34
-; GFX7-NEXT: v_mov_b32_e32 v5, s35
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1_vol
 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
 ; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
 ; GFX7-NEXT: s_cbranch_execnz .LBB99_1
 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[36:37]
+; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX7-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: flat_atomic_umax_i64_noret_offset_scalar:
@@ -5414,28 +5372,26 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out,
 ; GFX8-NEXT: v_mov_b32_e32 v5, s35
 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
 ; GFX8-NEXT: flat_load_dword v2, v[4:5]
-; GFX8-NEXT: s_mov_b64 s[36:37], 0
+; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s7
+; GFX8-NEXT: v_mov_b32_e32 v7, s6
 ; GFX8-NEXT: .LBB99_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s7
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
-; GFX8-NEXT: v_mov_b32_e32 v4, s34
-; GFX8-NEXT: v_mov_b32_e32 v5, s35
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1_vol
 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
 ; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
 ; GFX8-NEXT: s_cbranch_execnz .LBB99_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[36:37]
+; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: flat_atomic_umax_i64_noret_offset_scalar:
@@ -5444,17 +5400,17 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out,
 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v6, s7
+; GFX9-NEXT: v_mov_b32_e32 v7, s6
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
 ; GFX9-NEXT: .LBB99_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
@@ -5484,23 +5440,23 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GFX7-NEXT: v_mov_b32_e32 v3, s35
 ; GFX7-NEXT: flat_load_dword v0, v[0:1]
 ; GFX7-NEXT: flat_load_dword v1, v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
 ; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s7
+; GFX7-NEXT: v_mov_b32_e32 v5, s6
+; GFX7-NEXT: v_mov_b32_e32 v3, s5
 ; GFX7-NEXT: .LBB100_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s7
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
-; GFX7-NEXT: v_mov_b32_e32 v4, s4
-; GFX7-NEXT: v_mov_b32_e32 v5, s5
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: v_mov_b32_e32 v9, v1
+; GFX7-NEXT: v_mov_b32_e32 v8, v0
+; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
 ; GFX7-NEXT: s_cbranch_execnz .LBB100_1
@@ -5519,23 +5475,23 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GFX8-NEXT: v_mov_b32_e32 v3, s35
 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
 ; GFX8-NEXT: flat_load_dword v1, v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
 ; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s7
+; GFX8-NEXT: v_mov_b32_e32 v5, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
 ; GFX8-NEXT: .LBB100_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s7
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: v_mov_b32_e32 v9, v1
+; GFX8-NEXT: v_mov_b32_e32 v8, v0
+; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
 ; GFX8-NEXT: s_cbranch_execnz .LBB100_1
@@ -5549,23 +5505,23 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s7
+; GFX9-NEXT: v_mov_b32_e32 v5, s6
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
 ; GFX9-NEXT: .LBB100_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX9-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-NEXT: v_mov_b32_e32 v8, v0
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT: s_cbranch_execnz .LBB100_1
@@ -5590,28 +5546,26 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GFX7-NEXT: v_mov_b32_e32 v3, s35
 ; GFX7-NEXT: flat_load_dword v1, v[0:1]
 ; GFX7-NEXT: flat_load_dword v0, v[2:3]
-; GFX7-NEXT: s_mov_b64 s[36:37], 0
+; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s7
+; GFX7-NEXT: v_mov_b32_e32 v5, s6
 ; GFX7-NEXT: .LBB101_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s7
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
-; GFX7-NEXT: v_mov_b32_e32 v4, s34
-; GFX7-NEXT: v_mov_b32_e32 v5, s35
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: v_mov_b32_e32 v9, v1
+; GFX7-NEXT: v_mov_b32_e32 v8, v0
+; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
 ; GFX7-NEXT: s_cbranch_execnz .LBB101_1
 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[36:37]
+; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX7-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: flat_atomic_umax_i64_ret_offset_scalar:
@@ -5627,28 +5581,26 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GFX8-NEXT: v_mov_b32_e32 v3, s35
 ; GFX8-NEXT: flat_load_dword v1, v[0:1]
 ; GFX8-NEXT: flat_load_dword v0, v[2:3]
-; GFX8-NEXT: s_mov_b64 s[36:37], 0
+; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s7
+; GFX8-NEXT: v_mov_b32_e32 v5, s6
 ; GFX8-NEXT: .LBB101_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s7
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
-; GFX8-NEXT: v_mov_b32_e32 v4, s34
-; GFX8-NEXT: v_mov_b32_e32 v5, s35
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: v_mov_b32_e32 v9, v1
+; GFX8-NEXT: v_mov_b32_e32 v8, v0
+; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
 ; GFX8-NEXT: s_cbranch_execnz .LBB101_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[36:37]
+; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: flat_atomic_umax_i64_ret_offset_scalar:
@@ -5657,23 +5609,23 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s7
+; GFX9-NEXT: v_mov_b32_e32 v5, s6
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
 ; GFX9-NEXT: .LBB101_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
+; GFX9-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-NEXT: v_mov_b32_e32 v8, v0
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT: s_cbranch_execnz .LBB101_1
@@ -5696,28 +5648,26 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
 ; GFX7-NEXT: s_addc_u32 s1, s1, s5
 ; GFX7-NEXT: s_add_u32 s0, s0, 32
 ; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v0, s0
-; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s3
+; GFX7-NEXT: v_mov_b32_e32 v7, s2
 ; GFX7-NEXT: .LBB102_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s3
-; GFX7-NEXT: v_mov_b32_e32 v6, s2
-; GFX7-NEXT: v_mov_b32_e32 v5, s1
-; GFX7-NEXT: v_mov_b32_e32 v4, s0
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1_vol
 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
 ; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX7-NEXT: s_cbranch_execnz .LBB102_1
 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT: s_endpgm
@@ -5732,28 +5682,26 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
 ; GFX8-NEXT: s_addc_u32 s1, s1, s5
 ; GFX8-NEXT: s_add_u32 s0, s0, 32
 ; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NEXT: v_mov_b32_e32 v7, s2
 ; GFX8-NEXT: .LBB102_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NEXT: v_mov_b32_e32 v6, s2
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1_vol
 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
 ; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX8-NEXT: s_cbranch_execnz .LBB102_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT: s_endpgm
@@ -5766,28 +5714,26 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX9-NEXT: s_add_u32 s0, s0, s4
 ; GFX9-NEXT: s_addc_u32 s1, s1, s5
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s1
+; GFX9-NEXT: v_mov_b32_e32 v4, s0
+; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-NEXT: v_mov_b32_e32 v6, s3
+; GFX9-NEXT: v_mov_b32_e32 v7, s2
 ; GFX9-NEXT: .LBB102_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v6, s2
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX9-NEXT: s_cbranch_execnz .LBB102_1
 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT: s_endpgm
@@ -5810,32 +5756,30 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GFX7-NEXT: s_addc_u32 s1, s1, 0
 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX7-NEXT: s_mov_b64 s[6:7], 0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s4
 ; GFX7-NEXT: .LBB103_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s5
-; GFX7-NEXT: v_mov_b32_e32 v6, s4
-; GFX7-NEXT: v_mov_b32_e32 v5, s1
-; GFX7-NEXT: v_mov_b32_e32 v4, s0
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: v_mov_b32_e32 v9, v3
+; GFX7-NEXT: v_mov_b32_e32 v8, v2
+; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX7-NEXT: s_cbranch_execnz .LBB103_1
 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX7-NEXT: s_endpgm
 ;
 ; GFX8-LABEL: atomic_umax_i64_ret_addr64_offset:
@@ -5849,69 +5793,65 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GFX8-NEXT: s_addc_u32 s1, s1, 0
 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s4
 ; GFX8-NEXT: .LBB103_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s5
-; GFX8-NEXT: v_mov_b32_e32 v6, s4
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX8-NEXT: s_cbranch_execnz .LBB103_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset:
 ; GFX9: ; %bb.0: ; %entry
 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
 ; GFX9-NEXT: s_add_u32 s0, s8, s0
 ; GFX9-NEXT: s_addc_u32 s1, s9, s1
 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
+; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s13
+; GFX9-NEXT: v_mov_b32_e32 v5, s12
 ; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s13
-; GFX9-NEXT: v_mov_b32_e32 v6, s12
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
+; GFX9-NEXT: v_mov_b32_e32 v9, v3
+; GFX9-NEXT: v_mov_b32_e32 v8, v2
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9]
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX9-NEXT: s_cbranch_execnz .LBB103_1
 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v2, s10
-; GFX9-NEXT: v_mov_b32_e32 v3, s11
-; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s10
+; GFX9-NEXT: v_mov_b32_e32 v1, s11
+; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX9-NEXT: s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -5931,32 +5871,30 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GFX7-NEXT: s_addc_u32 s1, s1, s7
 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX7-NEXT: s_mov_b64 s[6:7], 0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s4
 ; GFX7-NEXT: .LBB104_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s5
-; GFX7-NEXT: v_mov_b32_e32 v6, s4
-; GFX7-NEXT: v_mov_b32_e32 v5, s1
-; GFX7-NEXT: v_mov_b32_e32 v4, s0
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: v_mov_b32_e32 v9, v3
+; GFX7-NEXT: v_mov_b32_e32 v8, v2
+; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX7-NEXT: s_cbranch_execnz .LBB104_1
 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX7-NEXT: s_endpgm
 ;
 ; GFX8-LABEL: atomic_umax_i64_ret_addr64:
@@ -5968,69 +5906,65 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GFX8-NEXT: s_addc_u32 s1, s1, s7
 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s4
 ; GFX8-NEXT: .LBB104_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s5
-; GFX8-NEXT: v_mov_b32_e32 v6, s4
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX8-NEXT: s_cbranch_execnz .LBB104_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: atomic_umax_i64_ret_addr64:
 ; GFX9: ; %bb.0: ; %entry
 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
 ; GFX9-NEXT: s_add_u32 s0, s8, s0
 ; GFX9-NEXT: s_addc_u32 s1, s9, s1
 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s13
+; GFX9-NEXT: v_mov_b32_e32 v5, s12
 ; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s13
-; GFX9-NEXT: v_mov_b32_e32 v6, s12
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX9-NEXT: v_mov_b32_e32 v9, v3
+; GFX9-NEXT: v_mov_b32_e32 v8, v2
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9]
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX9-NEXT: s_cbranch_execnz .LBB104_1
 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v2, s10
-; GFX9-NEXT: v_mov_b32_e32 v3, s11
-; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s10
+; GFX9-NEXT: v_mov_b32_e32 v1, s11
+; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX9-NEXT: s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -6577,17 +6511,17 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GFX7-NEXT: v_mov_b32_e32 v4, s35
 ; GFX7-NEXT: flat_load_dword v2, v[0:1]
 ; GFX7-NEXT: flat_load_dword v3, v[3:4]
+; GFX7-NEXT: v_mov_b32_e32 v4, s4
 ; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s7
+; GFX7-NEXT: v_mov_b32_e32 v7, s6
+; GFX7-NEXT: v_mov_b32_e32 v5, s5
 ; GFX7-NEXT: .LBB111_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s7
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
-; GFX7-NEXT: v_mov_b32_e32 v4, s4
-; GFX7-NEXT: v_mov_b32_e32 v5, s5
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1_vol
@@ -6612,17 +6546,17 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GFX8-NEXT: v_mov_b32_e32 v4, s35
 ; GFX8-NEXT: flat_load_dword v2, v[0:1]
 ; GFX8-NEXT: flat_load_dword v3, v[3:4]
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
 ; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s7
+; GFX8-NEXT: v_mov_b32_e32 v7, s6
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
 ; GFX8-NEXT: .LBB111_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s7
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1_vol
@@ -6642,17 +6576,17 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v6, s7
+; GFX9-NEXT: v_mov_b32_e32 v7, s6
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
 ; GFX9-NEXT: .LBB111_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
@@ -6683,28 +6617,26 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out,
 ; GFX7-NEXT: v_mov_b32_e32 v5, s35
 ; GFX7-NEXT: flat_load_dword v3, v[0:1]
 ; GFX7-NEXT: flat_load_dword v2, v[4:5]
-; GFX7-NEXT: s_mov_b64 s[36:37], 0
+; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s7
+; GFX7-NEXT: v_mov_b32_e32 v7, s6
 ; GFX7-NEXT: .LBB112_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s7
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
-; GFX7-NEXT: v_mov_b32_e32 v4, s34
-; GFX7-NEXT: v_mov_b32_e32 v5, s35
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1_vol
 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
 ; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
 ; GFX7-NEXT: s_cbranch_execnz .LBB112_1
 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[36:37]
+; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX7-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: flat_atomic_umin_i64_noret_offset_scalar:
@@ -6720,28 +6652,26 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out,
 ; GFX8-NEXT: v_mov_b32_e32 v5, s35
 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
 ; GFX8-NEXT: flat_load_dword v2, v[4:5]
-; GFX8-NEXT: s_mov_b64 s[36:37], 0
+; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s7
+; GFX8-NEXT: v_mov_b32_e32 v7, s6
 ; GFX8-NEXT: .LBB112_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s7
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
-; GFX8-NEXT: v_mov_b32_e32 v4, s34
-; GFX8-NEXT: v_mov_b32_e32 v5, s35
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1_vol
 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
 ; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
 ; GFX8-NEXT: s_cbranch_execnz .LBB112_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[36:37]
+; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: flat_atomic_umin_i64_noret_offset_scalar:
@@ -6750,17 +6680,17 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out,
 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v6, s7
+; GFX9-NEXT: v_mov_b32_e32 v7, s6
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
 ; GFX9-NEXT: .LBB112_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
@@ -6790,23 +6720,23 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GFX7-NEXT: v_mov_b32_e32 v3, s35
 ; GFX7-NEXT: flat_load_dword v0, v[0:1]
 ; GFX7-NEXT: flat_load_dword v1, v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
 ; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s7
+; GFX7-NEXT: v_mov_b32_e32 v5, s6
+; GFX7-NEXT: v_mov_b32_e32 v3, s5
 ; GFX7-NEXT: .LBB113_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s7
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
-; GFX7-NEXT: v_mov_b32_e32 v4, s4
-; GFX7-NEXT: v_mov_b32_e32 v5, s5
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: v_mov_b32_e32 v9, v1
+; GFX7-NEXT: v_mov_b32_e32 v8, v0
+; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
 ; GFX7-NEXT: s_cbranch_execnz .LBB113_1
@@ -6825,23 +6755,23 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GFX8-NEXT: v_mov_b32_e32 v3, s35
 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
 ; GFX8-NEXT: flat_load_dword v1, v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
 ; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s7
+; GFX8-NEXT: v_mov_b32_e32 v5, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
 ; GFX8-NEXT: .LBB113_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s7
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: v_mov_b32_e32 v9, v1
+; GFX8-NEXT: v_mov_b32_e32 v8, v0
+; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
 ; GFX8-NEXT: s_cbranch_execnz .LBB113_1
@@ -6855,23 +6785,23 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s7
+; GFX9-NEXT: v_mov_b32_e32 v5, s6
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
 ; GFX9-NEXT: .LBB113_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX9-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-NEXT: v_mov_b32_e32 v8, v0
+; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT: s_cbranch_execnz .LBB113_1
@@ -6896,28 +6826,26 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GFX7-NEXT: v_mov_b32_e32 v3, s35
 ; GFX7-NEXT: flat_load_dword v1, v[0:1]
 ; GFX7-NEXT: flat_load_dword v0, v[2:3]
-; GFX7-NEXT: s_mov_b64 s[36:37], 0
+; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s7
+; GFX7-NEXT: v_mov_b32_e32 v5, s6
 ; GFX7-NEXT: .LBB114_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s7
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
-; GFX7-NEXT: v_mov_b32_e32 v4, s34
-; GFX7-NEXT: v_mov_b32_e32 v5, s35
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: v_mov_b32_e32 v9, v1
+; GFX7-NEXT: v_mov_b32_e32 v8, v0
+; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
 ; GFX7-NEXT: s_cbranch_execnz .LBB114_1
 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[36:37]
+; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX7-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: flat_atomic_umin_i64_ret_offset_scalar:
@@ -6933,28 +6861,26 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GFX8-NEXT: v_mov_b32_e32 v3, s35
 ; GFX8-NEXT: flat_load_dword v1, v[0:1]
 ; GFX8-NEXT: flat_load_dword v0, v[2:3]
-; GFX8-NEXT: s_mov_b64 s[36:37], 0
+; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s7
+; GFX8-NEXT: v_mov_b32_e32 v5, s6
 ; GFX8-NEXT: .LBB114_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3,
v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s34 -; GFX8-NEXT: v_mov_b32_e32 v5, s35 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB114_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_umin_i64_ret_offset_scalar: @@ -6963,23 +6889,23 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB114_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB114_1 @@ -7529,17 +7455,17 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; 
GFX7-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -7564,17 +7490,17 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol @@ -7594,17 +7520,17 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -7635,28 +7561,26 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: v_mov_b32_e32 v5, s35 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: flat_load_dword v2, v[4:5] -; GFX7-NEXT: s_mov_b64 s[36:37], 0 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 ; GFX7-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s34 -; GFX7-NEXT: v_mov_b32_e32 v5, s35 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: 
v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB122_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_min_i64_noret_offset_scalar: @@ -7672,28 +7596,26 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: v_mov_b32_e32 v5, s35 ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[4:5] -; GFX8-NEXT: s_mov_b64 s[36:37], 0 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 ; GFX8-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s34 -; GFX8-NEXT: v_mov_b32_e32 v5, s35 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB122_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_min_i64_noret_offset_scalar: @@ -7702,17 +7624,17 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -7742,23 +7664,23 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; 
GFX7-NEXT: .LBB123_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB123_1 @@ -7777,23 +7699,23 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB123_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB123_1 @@ -7807,23 +7729,23 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB123_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 
v0, v6, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB123_1 @@ -7848,28 +7770,26 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[2:3] -; GFX7-NEXT: s_mov_b64 s[36:37], 0 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s34 -; GFX7-NEXT: v_mov_b32_e32 v5, s35 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB124_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_min_i64_ret_offset_scalar: @@ -7885,28 +7805,26 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[2:3] -; GFX8-NEXT: s_mov_b64 s[36:37], 0 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s34 -; GFX8-NEXT: v_mov_b32_e32 v5, s35 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: 
v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB124_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_min_i64_ret_offset_scalar: @@ -7915,23 +7833,23 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB124_1 @@ -7954,28 +7872,26 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 ; GFX7-NEXT: .LBB125_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s3 -; GFX7-NEXT: v_mov_b32_e32 v6, s2 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: 
buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7-NEXT: s_cbranch_execnz .LBB125_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -7990,28 +7906,26 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 ; GFX8-NEXT: .LBB125_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_mov_b32_e32 v6, s2 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_cbranch_execnz .LBB125_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -8024,28 +7938,26 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX9-NEXT: s_add_u32 s0, s0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s3 +; GFX9-NEXT: v_mov_b32_e32 v7, s2 ; GFX9-NEXT: .LBB125_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, 
exec, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB125_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -8068,32 +7980,30 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 ; GFX7-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-NEXT: v_mov_b32_e32 v6, s4 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7-NEXT: s_cbranch_execnz .LBB126_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_min_i64_ret_addr64_offset: @@ -8107,69 +8017,65 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 ; GFX8-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s5 -; GFX8-NEXT: v_mov_b32_e32 v6, s4 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX8-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_cbranch_execnz .LBB126_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s13 +; GFX9-NEXT: v_mov_b32_e32 v5, s12 ; GFX9-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s13 -; GFX9-NEXT: v_mov_b32_e32 v6, s12 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB126_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -8188,16 +8094,16 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: 
v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s3 -; GFX7-NEXT: v_mov_b32_e32 v6, s2 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -8218,16 +8124,16 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_mov_b32_e32 v6, s2 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol @@ -8248,16 +8154,16 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v6, s3 +; GFX9-NEXT: v_mov_b32_e32 v7, s2 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -8284,32 +8190,30 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: s_addc_u32 s1, s1, s7 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 ; GFX7-NEXT: .LBB128_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-NEXT: v_mov_b32_e32 v6, s4 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: 
v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7-NEXT: s_cbranch_execnz .LBB128_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_min_i64_ret_addr64: @@ -8321,69 +8225,65 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: s_addc_u32 s1, s1, s7 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 ; GFX8-NEXT: .LBB128_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s5 -; GFX8-NEXT: v_mov_b32_e32 v6, s4 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_cbranch_execnz .LBB128_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: flat_load_dwordx2 v[2:3], 
v[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s13 +; GFX9-NEXT: v_mov_b32_e32 v5, s12 ; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s13 -; GFX9-NEXT: v_mov_b32_e32 v6, s12 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX9-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB128_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index 09230d21544ff..4aec2ffead437 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -1426,23 +1426,23 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX90A-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX90A-NEXT: s_mov_b64 s[0:1], 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm diff --git 
a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index d0fc798e55b6e..ec4ea232e661c 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -18501,39 +18501,41 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -18548,34 +18550,34 @@ define <2 x bfloat> 
@global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB78_1 @@ -18589,33 +18591,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, 
v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB78_1 @@ -18629,33 +18631,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB78_1 @@ -18669,34 +18671,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB78_1 @@ -18827,39 +18829,41 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: 
v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -18874,34 +18878,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, 
v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB79_1 @@ -18915,33 +18919,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB79_1 @@ -18955,33 +18959,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB79_1 @@ -18997,34 +19001,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_add_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, 
vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB79_1 @@ -19155,39 +19159,41 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -19202,34 +19208,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: 
buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB80_1 @@ -19243,33 +19249,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB80_1 @@ -19283,33 +19289,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 
v5, v6, v5 -; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB80_1 @@ -19325,34 +19331,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_add_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, 
v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB80_1 @@ -19486,41 +19492,41 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 
v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -19533,35 +19539,35 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB81_1 @@ -19572,36 +19578,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 
16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB81_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19611,36 +19617,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: 
v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB81_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19650,37 +19656,37 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; 
GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB81_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19804,41 +19810,41 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -19851,35 +19857,35 @@ define void 
@global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB82_1 @@ -19890,36 +19896,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; 
GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB82_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19929,36 +19935,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB82_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19970,37 +19976,37 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 
v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB82_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -20125,41 +20131,41 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -20172,35 +20178,35 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX10-LABEL: 
global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB83_1 @@ -20211,36 +20217,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 -; 
GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB83_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -20250,36 +20256,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 
1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB83_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -20291,37 +20297,37 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 
exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB83_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -20456,39 +20462,41 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -20503,34 +20511,34 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: 
global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB84_1 @@ -20544,35 +20552,35 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, 
v3, s9 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB84_1 @@ -20586,33 +20594,33 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB84_1 @@ -20628,34 +20636,34 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_add_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB84_1 @@ -20786,41 +20794,41 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX11-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: 
v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -20833,35 +20841,35 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX10-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, 
v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB85_1 @@ -20872,38 +20880,38 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX90A-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: 
v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB85_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -20913,36 +20921,36 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX908-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB85_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -20954,37 +20962,37 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB85_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -21110,39 +21118,41 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -21157,34 +21167,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; 
GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB86_1 @@ -21198,33 +21208,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB86_1 
@@ -21238,33 +21248,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB86_1 @@ -21278,34 +21288,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB86_1 @@ -21435,41 +21445,41 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -21482,35 +21492,35 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 
exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB87_1 @@ -21521,36 +21531,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB87_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -21560,36 +21570,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; 
GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB87_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -21599,37 +21609,37 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; 
GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB87_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -21754,39 +21764,41 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -21801,34 +21813,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB88_1 @@ -21842,33 +21854,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB88_1 @@ -21882,33 +21894,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: 
v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB88_1 @@ -21922,34 +21934,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB88_1 @@ -22079,41 +22091,41 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-LABEL: 
global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -22126,35 +22138,35 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: 
s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB89_1 @@ -22165,36 +22177,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, 
v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB89_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -22204,36 +22216,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; 
GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB89_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -22243,37 +22255,37 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB89_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -22398,39 +22410,41 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: 
s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -22445,34 +22459,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; 
GFX10-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB90_1 @@ -22486,33 +22500,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; 
GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB90_1 @@ -22526,33 +22540,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB90_1 @@ -22566,34 +22580,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: 
v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB90_1 @@ -22723,41 +22737,41 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 
0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -22770,35 +22784,35 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: 
v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB91_1 @@ -22809,36 +22823,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB91_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -22848,36 +22862,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: 
global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB91_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -22887,37 +22901,37 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, 
v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB91_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -23156,34 +23170,34 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; ; GFX8-LABEL: infer_as_before_atomic: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB92_3 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 -; GFX8-NEXT: s_mov_b64 s[2:3], 0 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX8-NEXT: s_bcnt1_i32_b64 s5, s[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v4, s5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: .LBB92_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_add_f32_e32 v2, v3, v4 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, 
s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_cbranch_execnz .LBB92_2 ; GFX8-NEXT: .LBB92_3: ; GFX8-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index d70159b0b0ac7..e2fde562d36b1 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -32,13 +32,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -77,13 +77,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -101,13 +101,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -193,13 +193,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -238,13 +238,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -262,13 +262,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -356,13 +356,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -401,13 +401,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -425,13 +425,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; 
GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -517,21 +517,21 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB3_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -561,20 +561,20 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -584,20 +584,20 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_max_f32_e32 
v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -672,21 +672,21 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB4_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -716,20 +716,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -739,20 +739,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -830,21 +830,21 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB5_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -874,20 +874,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 
v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -897,20 +897,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -991,13 +991,13 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1036,13 +1036,13 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1062,13 +1062,13 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1155,21 +1155,21 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB7_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1199,22 +1199,22 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1224,20 +1224,20 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword 
v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1317,13 +1317,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1341,14 +1341,15 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_max_f32 v3, v2, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX11-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX11-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1368,14 +1369,14 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX10-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1395,13 +1396,13 @@ define float 
@global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1419,13 +1420,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1546,13 +1547,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1591,13 +1592,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1615,13 +1616,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: 
global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1711,13 +1712,13 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1756,13 +1757,13 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1780,13 +1781,13 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1872,13 +1873,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1917,13 +1918,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: 
v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1941,13 +1942,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2035,13 +2036,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -2080,13 +2081,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -2104,13 +2105,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2196,21 +2197,21 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX940-LABEL: 
global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2240,20 +2241,20 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2263,20 +2264,20 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; 
GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2351,21 +2352,21 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2395,20 +2396,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2418,20 +2419,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; 
GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2509,21 +2510,21 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB15_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2553,20 +2554,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; 
GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2576,20 +2577,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2670,13 +2671,13 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -2715,13 +2716,13 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -2741,13 +2742,13 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; 
GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2834,21 +2835,21 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2878,22 +2879,22 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2903,20 +2904,20 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; 
GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2990,15 +2991,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3028,15 +3029,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3074,15 +3075,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: 
v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3100,15 +3101,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3163,15 +3164,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3201,15 +3202,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3247,15 +3248,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3275,15 +3276,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[6:7] +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3337,15 +3338,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3375,15 +3376,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], 
v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3421,15 +3422,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3449,15 +3450,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[6:7] +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3510,21 +3511,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3547,22 +3548,22 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-LABEL: 
global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3592,21 +3593,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3616,21 +3617,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] -; GFX8-NEXT: 
flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3673,21 +3674,21 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3710,22 +3711,22 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: 
v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3755,21 +3756,21 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040 +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3781,21 +3782,21 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3839,21 +3840,21 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: 
s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3876,22 +3877,22 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3921,21 +3922,21 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], 
off offset:-2048 glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3947,21 +3948,21 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4006,15 +4007,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4044,15 +4045,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: 
; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -4072,15 +4073,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX10-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX10-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX10-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4101,13 +4102,13 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -4125,15 +4126,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -4151,15 +4152,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; 
GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -4177,28 +4178,26 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 +; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v11, v1 ; GFX7-NEXT: v_mov_b32_e32 v10, v0 -; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX7-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] -; GFX7-NEXT: v_max_f64 v[8:9], v[0:1], v[2:3] +; GFX7-NEXT: v_max_f64 v[8:9], v[0:1], v[6:7] ; GFX7-NEXT: v_mov_b32_e32 v0, v8 ; GFX7-NEXT: v_mov_b32_e32 v1, v9 ; GFX7-NEXT: v_mov_b32_e32 v2, v10 ; GFX7-NEXT: v_mov_b32_e32 v3, v11 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] @@ -4213,14 +4212,13 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_mov_b32_e32 v7, v1 -; GFX6-NEXT: v_mov_b32_e32 v6, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 +; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4228,14 +4226,13 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX6-NEXT: v_mov_b32_e32 v11, v1 ; GFX6-NEXT: v_mov_b32_e32 v10, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX6-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] -; GFX6-NEXT: v_max_f64 v[8:9], v[0:1], v[2:3] +; GFX6-NEXT: v_max_f64 v[8:9], v[0:1], v[6:7] ; GFX6-NEXT: v_mov_b32_e32 v0, v8 ; GFX6-NEXT: v_mov_b32_e32 v1, v9 ; GFX6-NEXT: v_mov_b32_e32 v2, v10 ; GFX6-NEXT: v_mov_b32_e32 v3, v11 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: 
buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] @@ -4259,15 +4256,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4297,15 +4294,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -4343,15 +4340,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -4369,15 +4366,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: 
v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -4436,8 +4433,9 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off @@ -4449,12 +4447,11 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v7 +; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -4487,14 +4484,14 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX940-NEXT: v_max_f16_e32 v5, v6, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -4514,8 +4511,9 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-NEXT: global_load_b32 v5, v[0:1], off @@ -4527,12 +4525,11 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v5, v7 +; GFX11-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -4556,6 +4553,7 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -4567,10 +4565,9 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v7 +; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4599,14 +4596,14 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc @@ -4633,14 +4630,14 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX908-NEXT: v_max_f16_e32 v5, v7, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc @@ -4667,17 +4664,17 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-NEXT: v_max_f16_e32 v5, v7, v5 -; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -4790,10 +4787,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -4803,12 +4801,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v7 +; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -4843,14 +4840,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX940-NEXT: v_max_f16_e32 v5, v6, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -4871,10 +4868,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -4884,12 +4882,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v5, v7 +; GFX11-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -4914,9 +4911,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -4925,10 +4923,9 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v7 +; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4958,14 +4955,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc @@ -4993,14 +4990,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB27_1: ; 
%atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX908-NEXT: v_max_f16_e32 v5, v7, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc @@ -5028,17 +5025,17 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-NEXT: v_max_f16_e32 v5, v7, v5 -; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -5155,10 +5152,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -5168,12 +5166,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v7 +; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -5209,14 +5206,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: 
Depth=1
 ; GFX940-NEXT: s_waitcnt vmcnt(0)
 ; GFX940-NEXT: v_mov_b32_e32 v7, v5
-; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX940-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX940-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX940-NEXT: v_max_f16_e32 v5, v6, v5
+; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX940-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX940-NEXT: v_max_f16_e32 v5, v5, v2
 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5
 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5
 ; GFX940-NEXT: buffer_wbl2 sc1
@@ -5237,10 +5234,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
 ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_mov_b32 s0, 0
 ; GFX11-NEXT: global_load_b32 v5, v[0:1], off
 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -5250,12 +5248,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v5, v5, v7
+; GFX11-NEXT: v_max_f16_e32 v5, v5, v2
 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
@@ -5280,9 +5277,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: s_mov_b32 s4, 0
 ; GFX10-NEXT: global_load_dword v5, v[0:1], off
 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -5291,10 +5289,9 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: v_mov_b32_e32 v6, v5
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX10-NEXT: v_max_f16_e32 v5, v5, v7
+; GFX10-NEXT: v_max_f16_e32 v5, v5, v2
 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -5324,14 +5321,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
 ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
 ; GFX90A-NEXT: v_not_b32_e32 v4, v4
 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
 ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2
 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5
 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
 ; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
@@ -5359,14 +5356,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
 ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4
 ; GFX908-NEXT: v_not_b32_e32 v4, v4
 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
 ; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: v_mov_b32_e32 v6, v5
-; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX908-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX908-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX908-NEXT: v_max_f16_e32 v5, v7, v5
+; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX908-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX908-NEXT: v_max_f16_e32 v5, v5, v2
 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5
 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5
 ; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc
@@ -5394,17 +5391,17 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
 ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
 ; GFX8-NEXT: v_not_b32_e32 v4, v4
 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
 ; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: v_mov_b32_e32 v6, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX8-NEXT: v_max_f16_e32 v5, v7, v5
-; GFX8-NEXT: v_and_b32_e32 v8, v6, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX8-NEXT: v_or_b32_e32 v5, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
@@ -5520,8 +5517,9 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p
 ; GFX12-NEXT: s_wait_bvhcnt 0x0
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
 ; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
 ; GFX12-NEXT: global_load_b32 v4, v[0:1], off
@@ -5533,10 +5531,9 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p
 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v7
+; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
@@ -5570,13 +5567,13 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p
 ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
 ; GFX940-NEXT: v_not_b32_e32 v6, v4
 ; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
 ; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start
 ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT: s_waitcnt vmcnt(0)
 ; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX940-NEXT: v_max_f16_e32 v7, v2, v2
 ; GFX940-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX940-NEXT: v_max_f16_e32 v4, v4, v7
+; GFX940-NEXT: v_max_f16_e32 v4, v4, v2
 ; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
 ; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
 ; GFX940-NEXT: buffer_wbl2 sc1
@@ -5596,8 +5593,9 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
 ; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
 ; GFX11-NEXT: global_load_b32 v4, v[0:1], off
@@ -5609,10 +5607,9 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p
 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v7
+; GFX11-NEXT: v_max_f16_e32 v3, v3, v2
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
@@ -5637,6 +5634,7 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
 ; GFX10-NEXT: s_mov_b32 s4, 0
 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
@@ -5648,9 +5646,8 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
 ; GFX10-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX10-NEXT: v_max_f16_e32 v3, v3, v7
+; GFX10-NEXT: v_max_f16_e32 v3, v3, v2
 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -5679,13 +5676,13 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p
 ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
 ; GFX90A-NEXT: v_not_b32_e32 v6, v4
 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2
 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7
+; GFX90A-NEXT: v_max_f16_e32 v4, v4, v2
 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
 ; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
 ; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
@@ -5712,13 +5709,13 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p
 ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
 ; GFX908-NEXT: v_not_b32_e32 v6, v3
 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
 ; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX908-NEXT: v_max_f16_e32 v7, v2, v2
 ; GFX908-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX908-NEXT: v_max_f16_e32 v3, v3, v7
+; GFX908-NEXT: v_max_f16_e32 v3, v3, v2
 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
 ; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
@@ -5745,16 +5742,16 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p
 ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
 ; GFX8-NEXT: v_not_b32_e32 v6, v3
 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
 ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX8-NEXT: v_max_f16_e32 v7, v2, v2
 ; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT: v_max_f16_e32 v3, v3, v7
-; GFX8-NEXT: v_and_b32_e32 v8, v4, v6
+; GFX8-NEXT: v_max_f16_e32 v3, v3, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v4, v6
 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v8, v3
+; GFX8-NEXT: v_or_b32_e32 v3, v7, v3
 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
@@ -5862,36 +5859,36 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g
 ; GFX12-NEXT: s_wait_samplecnt 0x0
 ; GFX12-NEXT: s_wait_bvhcnt 0x0
 ; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
 ; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
+; GFX12-NEXT: v_not_b32_e32 v5, v5
 ; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start
 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v7
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
 ; GFX12-NEXT: s_wait_alu 0xfffe
 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT: s_wait_alu 0xfffe
@@ -5909,29 +5906,29 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g
 ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
 ; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
 ; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_load_dword v5, v[0:1], off
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: global_load_dword v3, v[0:1], off
+; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4
 ; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v6, v4
+; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX940-NEXT: v_not_b32_e32 v5, v5
 ; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v6, v2, v2
 ; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start
 ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX940-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX940-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX940-NEXT: v_max_f16_e32 v4, v4, v7
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2
 ; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
 ; GFX940-NEXT: s_waitcnt vmcnt(0)
 ; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v4
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX940-NEXT: s_cbranch_execnz .LBB30_1
 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5941,37 +5938,37 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g
 ; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
 ; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
+; GFX11-NEXT: v_not_b32_e32 v5, v5
 ; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start
 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: buffer_gl1_inv
 ; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -5983,31 +5980,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g
 ; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX10-NEXT: v_and_b32_e32 v4, 3, v4
 ; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX10-NEXT: global_load_dword v4, v[0:1], off
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v6, v3
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v5, v5
 ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX10-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX10-NEXT: v_max_f16_e32 v3, v3, v7
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: buffer_gl1_inv
 ; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_cbranch_execnz .LBB30_1
@@ -6018,31 +6015,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g
 ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v6, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2
 ; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
-; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB30_1
 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6052,31 +6049,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g
 ; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908: ; %bb.0:
 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
-; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
+; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4
 ; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX908-NEXT: v_not_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX908-NEXT: v_not_b32_e32 v5, v5
 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v6, v2, v2
 ; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX908-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX908-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX908-NEXT: v_max_f16_e32 v3, v3, v7
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT: s_cbranch_execnz .LBB30_1
 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6086,32 +6083,32 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g
 ; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0
 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4
 ; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX8-NEXT: v_not_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX8-NEXT: v_not_b32_e32 v5, v5
 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v6, v2, v2
 ; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX8-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT: v_max_f16_e32 v3, v3, v7
-; GFX8-NEXT: v_and_b32_e32 v8, v4, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v8, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX8-NEXT: v_and_b32_e32 v7, v3, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT: s_cbranch_execnz .LBB30_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6216,36 +6213,36 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g
 ; GFX12-NEXT: s_wait_samplecnt 0x0
 ; GFX12-NEXT: s_wait_bvhcnt 0x0
 ; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
 ; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
+; GFX12-NEXT: v_not_b32_e32 v5, v5
 ; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start
 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v7
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
 ; GFX12-NEXT: s_wait_alu 0xfffe
 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT: s_wait_alu 0xfffe
@@ -6264,29 +6261,29 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g
 ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
 ; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
 ; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_load_dword v5, v[0:1], off
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: global_load_dword v3, v[0:1], off
+; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4
 ; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v6, v4
+; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX940-NEXT: v_not_b32_e32 v5, v5
 ; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v6, v2, v2
 ; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start
 ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX940-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX940-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX940-NEXT: v_max_f16_e32 v4, v4, v7
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2
 ; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
 ; GFX940-NEXT: s_waitcnt vmcnt(0)
 ; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v4
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX940-NEXT: s_cbranch_execnz .LBB31_1
 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6296,37 +6293,37 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g
 ; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
 ; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
+; GFX11-NEXT: v_not_b32_e32 v5, v5
 ; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start
 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: buffer_gl1_inv
 ; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -6338,31 +6335,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g
 ; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX10-NEXT: v_and_b32_e32 v4, 3, v4
 ; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX10-NEXT: global_load_dword v4, v[0:1], off
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v6, v3
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v5, v5
 ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX10-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX10-NEXT: v_max_f16_e32 v3, v3, v7
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: buffer_gl1_inv
 ; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_cbranch_execnz .LBB31_1
@@ -6373,31 +6370,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g
 ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v6, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2
 ; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
-; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB31_1
 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6407,31 +6404,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g
 ; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908: ; %bb.0:
 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
-; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
+; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4
 ; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX908-NEXT: v_not_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX908-NEXT: v_not_b32_e32 v5, v5
 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v6, v2, v2
 ; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX908-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX908-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX908-NEXT: v_max_f16_e32 v3, v3, v7
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT: s_cbranch_execnz .LBB31_1
 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6441,32 +6438,32 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g
 ; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0
 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4
 ; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX8-NEXT: v_not_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX8-NEXT: v_not_b32_e32 v5, v5
 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v6, v2, v2
 ; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX8-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT: v_max_f16_e32 v3, v3, v7
-; GFX8-NEXT: v_and_b32_e32 v8, v4, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v8, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX8-NEXT: v_and_b32_e32 v7, v3, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT: s_cbranch_execnz .LBB31_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6572,15 +6569,15 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_
 ; GFX12-NEXT: s_wait_bvhcnt 0x0
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
 ; GFX12-NEXT: s_mov_b32 s0, 0
 ; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start
 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: v_max_num_f16_e32 v3, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v5, v4, v4
-; GFX12-NEXT: v_max_num_f16_e32 v3, v5, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v4
+; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
 ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
@@ -6605,14 +6602,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_
 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046
 ; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000
 ; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start
 ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT: s_waitcnt vmcnt(0)
 ; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f16_e32 v4, v2, v2
 ; GFX940-NEXT: v_max_f16_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f16_e32 v3, v3, v4
+; GFX940-NEXT: v_max_f16_e32 v3, v3, v2
 ; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3
 ; GFX940-NEXT: buffer_wbl2 sc1
 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0
@@ -6631,15 +6628,15 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
 ; GFX11-NEXT: s_mov_b32 s0, 0
 ; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start
 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: v_max_f16_e32 v3, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v5, v4, v4
-; GFX11-NEXT: v_max_f16_e32 v3, v5, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f16_e32 v3, v4, v4
+; GFX11-NEXT: v_max_f16_e32 v3, v3, v2
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
 ; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
@@ -6662,14 +6659,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
 ; GFX10-NEXT: s_mov_b32 s4, 0
 ; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f16_e32 v3, v2, v2
-; GFX10-NEXT: v_max_f16_e32 v5, v4, v4
-; GFX10-NEXT: v_max_f16_e32 v3, v5, v3
+; GFX10-NEXT: v_max_f16_e32 v3, v4, v4
+; GFX10-NEXT: v_max_f16_e32 v3, v3, v2
 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
 ; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -6691,14 +6688,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046
 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000
 ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2
 ; GFX90A-NEXT: v_max_f16_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f16_e32 v3, v3, v4
+; GFX90A-NEXT: v_max_f16_e32 v3, v3, v2
 ; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3
 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -6717,14 +6714,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_
 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046
 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000
 ; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f16_e32 v5, v2, v2
 ; GFX908-NEXT: v_max_f16_e32 v3, v4, v4
-; GFX908-NEXT: v_max_f16_e32 v3, v3, v5
+; GFX908-NEXT: v_max_f16_e32 v3, v3, v2
 ; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3
 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -6745,19 +6742,19 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_
 ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
 ; GFX8-NEXT: flat_load_dword v0, v[3:4]
 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v1, v2, v2
 ; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v0, v1, v1
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
-; GFX8-NEXT: v_max_f16_e32 v0, v0, v5
-; GFX8-NEXT: v_or_b32_e32 v0, v6, v0
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_max_f16_e32 v0, v6, v6
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX8-NEXT: v_or_b32_e32 v5, v2, v0
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT: s_cbranch_execnz .LBB32_1
@@ -6847,24 +6844,24 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n
 ; GFX12-NEXT: s_wait_samplecnt 0x0
 ; GFX12-NEXT: s_wait_bvhcnt 0x0
 ; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2046
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2
 ; GFX12-NEXT: s_mov_b32 s0, 0
 ; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start
 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_max_num_f16_e32 v3, v2, v2
 ; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f16_e32 v5, v4, v4
+; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v3, v5, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v4
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
 ; GFX12-NEXT: s_wait_alu 0xfffe
 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT: s_wait_alu 0xfffe
@@ -6878,23 +6875,23 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n
 ; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX940: ; %bb.0:
 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2046
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046
 ; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v4, v2, v2
 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000
 ; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start
 ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f16_e32 v3, v2, v2
 ; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f16_e32 v4, v5, v5
-; GFX940-NEXT: v_max_f16_e32 v3, v4, v3
-; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3
+; GFX940-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v4
+; GFX940-NEXT: v_and_or_b32 v2, v3, s2, v2
 ; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0
 ; GFX940-NEXT: s_waitcnt vmcnt(0)
 ; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX940-NEXT: s_cbranch_execnz .LBB33_1
 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6904,25 +6901,25 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n
 ; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2046
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX11-NEXT: v_max_f16_e32 v4, v2, v2
 ; GFX11-NEXT: s_mov_b32 s0, 0
 ; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start
 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_max_f16_e32 v3, v2, v2
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f16_e32 v5, v4, v4
+; GFX11-NEXT: v_max_f16_e32 v2, v3, v3
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v3, v5, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v4
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: buffer_gl1_inv
 ; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -6934,24 +6931,24 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n
 ; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2046
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046
+; GFX10-NEXT: v_max_f16_e32 v4, v2, v2
 ; GFX10-NEXT: s_mov_b32 s4, 0
 ; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_max_f16_e32 v3, v2, v2
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f16_e32 v5, v4, v4
-; GFX10-NEXT: v_max_f16_e32 v3, v5, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX10-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v4
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: buffer_gl1_inv
 ; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_cbranch_execnz .LBB33_1
 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6961,22 +6958,22 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n
 ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2046
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046
 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2
 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000
 ; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f16_e32 v3, v2, v2
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f16_e32 v4, v5, v5
-; GFX90A-NEXT: v_max_f16_e32 v3, v4, v3
-; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc
+; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v4
+; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB33_1
 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6986,22 +6983,22 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n
 ; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX908: ; %bb.0:
 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2046
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046
 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v4, v2, v2
 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000
 ; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f16_e32 v3, v2, v2
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f16_e32 v5, v4, v4
-; GFX908-NEXT: v_max_f16_e32 v3, v5, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX908-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v4
+; GFX908-NEXT: v_and_or_b32 v2, v3, s6, v2
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT: s_cbranch_execnz .LBB33_1
 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7013,22 +7010,22 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0
 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v4, v2, v2
 ; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f16_e32 v3, v2, v2
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f16_e32 v5, v4, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX8-NEXT: v_max_f16_e32 v3, v5, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v6, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v4
+; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT: s_cbranch_execnz .LBB33_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7117,10 +7114,11 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
 ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_mov_b32 s0, 0
 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off
 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -7130,12 +7128,11 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v7
+; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2
 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
@@ -7171,14 +7168,14 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
 ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
 ; GFX940-NEXT: v_not_b32_e32 v4, v4
 ; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
 ; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start
 ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT: s_waitcnt vmcnt(0)
 ; GFX940-NEXT: v_mov_b32_e32 v7, v5
-; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX940-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX940-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX940-NEXT: v_max_f16_e32 v5, v6, v5
+; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX940-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX940-NEXT: v_max_f16_e32 v5, v5, v2
 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5
 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5
 ; GFX940-NEXT: buffer_wbl2 sc0 sc1
@@ -7199,10 +7196,11 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
 ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_mov_b32 s0, 0
 ; GFX11-NEXT: global_load_b32 v5, v[0:1], off
 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -7212,12 +7210,11 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v5, v5, v7
+; GFX11-NEXT: v_max_f16_e32 v5, v5, v2
 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
@@ -7242,9 +7239,10 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: s_mov_b32 s4, 0
 ; GFX10-NEXT: global_load_dword v5, v[0:1], off
 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -7253,10 +7251,9 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: v_mov_b32_e32 v6, v5
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX10-NEXT: v_max_f16_e32 v5, v5, v7
+; GFX10-NEXT: v_max_f16_e32 v5, v5, v2
 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -7286,14 +7283,14 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
 ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
 ; GFX90A-NEXT: v_not_b32_e32 v4, v4
 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
 ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2
 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5
 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
 ; GFX90A-NEXT: buffer_wbl2
@@ -7323,14 +7320,14 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
 ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4
 ; GFX908-NEXT: v_not_b32_e32 v4, v4
 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
 ; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: v_mov_b32_e32 v6, v5
-; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX908-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX908-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX908-NEXT: v_max_f16_e32 v5, v7, v5
+; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX908-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX908-NEXT: v_max_f16_e32 v5, v5, v2
 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5
 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5
 ; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc
@@ -7358,17 +7355,17 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
 ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
 ; GFX8-NEXT: v_not_b32_e32 v4, v4
 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
 ; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: v_mov_b32_e32 v6, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX8-NEXT: v_max_f16_e32 v5, v7, v5
-; GFX8-NEXT: v_and_b32_e32 v8, v6, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX8-NEXT: v_or_b32_e32 v5, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
@@ -7483,37 +7480,37 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_
 ; GFX12-NEXT: s_wait_samplecnt 0x0
 ; GFX12-NEXT: s_wait_bvhcnt 0x0
 ; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
 ; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
+; GFX12-NEXT: v_not_b32_e32 v5, v5
 ; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start
 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v7
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
 ; GFX12-NEXT: s_wait_alu 0xfffe
 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT: s_wait_alu 0xfffe
@@ -7531,29 +7528,29 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_
 ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
 ; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
 ; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_load_dword v5, v[0:1], off
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: global_load_dword v3, v[0:1], off
+; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4
 ; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v6, v4
+; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX940-NEXT: v_not_b32_e32 v5, v5
 ; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v6, v2, v2
 ; GFX940-NEXT: .LBB35_1: ;
%atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v7 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB35_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7563,37 +7560,37 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 +; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; 
GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7605,31 +7602,31 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX10-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v5, v5 ; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB35_1 @@ -7640,33 +7637,33 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; 
GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7676,31 +7673,31 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v7 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB35_1 ; GFX908-NEXT: ; 
%bb.2: ; %atomicrmw.end @@ -7710,32 +7707,32 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v7 -; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11684,15 +11681,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -11714,14 +11711,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v3, v4 +; GFX940-NEXT: v_pk_max_f16 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -11739,15 +11736,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -11767,14 +11764,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -11794,13 +11791,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v3, v4 +; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -11818,13 +11815,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v5 +; GFX908-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; 
GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -11842,21 +11839,21 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v3, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v6, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v3, v7, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v5 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_max_f16_sdwa v3, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_max_f16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB46_1 @@ -11977,15 +11974,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -12007,14 +12004,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v3, v4 +; GFX940-NEXT: v_pk_max_f16 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: 
global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -12032,15 +12029,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -12060,14 +12057,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -12087,13 +12084,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v3, v4 +; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -12111,13 +12108,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v5 +; GFX908-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -12137,21 +12134,21 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX8-NEXT: 
v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 -; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v0, v7, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB47_1 @@ -12272,15 +12269,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -12302,14 +12299,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v3, v4 +; GFX940-NEXT: v_pk_max_f16 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -12327,15 +12324,15 @@ define <2 x half> 
@global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -12355,14 +12352,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -12382,13 +12379,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v3, v4 +; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -12406,13 +12403,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v5 +; GFX908-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -12432,21 +12429,21 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v1, 
v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 -; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v0, v7, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB48_1 @@ -12570,21 +12567,21 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -12598,22 +12595,22 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; 
GFX940-NEXT: v_pk_max_f16 v4, v4, v3 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB49_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12623,22 +12620,22 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -12650,21 +12647,21 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB49_1 @@ -12675,20 +12672,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12698,20 +12695,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12721,24 +12718,24 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: 
v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 +; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12852,21 +12849,21 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -12880,22 +12877,22 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v4, v3 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB50_1 ; GFX940-NEXT: ; %bb.2: ; 
%atomicrmw.end @@ -12905,22 +12902,22 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -12932,21 +12929,21 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB50_1 @@ -12957,20 +12954,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v4, v3 -; GFX90A-NEXT: 
global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12980,20 +12977,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13005,24 +13002,24 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 +; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; 
GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13137,21 +13134,21 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13165,22 +13162,22 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v4, v3 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB51_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13190,22 +13187,22 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 +; GFX11-NEXT: 
global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -13217,21 +13214,21 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB51_1 @@ -13242,20 +13239,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: 
v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13265,20 +13262,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB51_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13290,24 +13287,24 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 +; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB51_1 ; 
GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13431,15 +13428,15 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS @@ -13462,14 +13459,14 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v3, v4 +; GFX940-NEXT: v_pk_max_f16 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -13487,15 +13484,15 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -13515,14 +13512,14 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 
0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -13542,13 +13539,13 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v3, v4 +; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -13568,13 +13565,13 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v5 +; GFX908-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -13594,21 +13591,21 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 -; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v0, v7, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB52_1 @@ -13728,22 +13725,22 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: 
s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13757,22 +13754,22 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX940-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v4, v3 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB53_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13782,22 +13779,22 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX11-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX11-NEXT: 
s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -13809,21 +13806,21 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX10-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB53_1 @@ -13834,22 +13831,22 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX90A-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v4, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13859,20 +13856,20 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX908-LABEL: 
global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13884,24 +13881,24 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 +; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14021,36 +14018,38 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB54_1: ; 
%atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14067,35 +14066,35 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 
0xffff0000, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB54_1 @@ -14108,39 +14107,41 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_max_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: 
v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14155,34 +14156,34 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_max_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; 
GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB54_1 @@ -14196,33 +14197,33 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 @@ -14236,33 +14237,33 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_max_f32_e32 
v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB54_1 @@ -14276,34 +14277,34 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: 
v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB54_1 @@ -14416,36 +14417,38 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 
0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14462,35 +14465,35 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB55_1 @@ -14503,39 +14506,41 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_max_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14550,34 +14555,34 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_max_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; 
GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB55_1 @@ -14591,33 +14596,33 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], 
v[6:7], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 @@ -14631,33 +14636,33 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB55_1 @@ -14673,34 +14678,34 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_max_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_max_f32_e32 v0, v7, v0 -; 
GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB55_1 @@ -14813,36 +14818,38 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14859,35 +14866,35 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; 
GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB56_1 @@ -14900,39 +14907,41 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_max_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, 
v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14947,34 +14956,34 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_max_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB56_1 @@ -14988,33 +14997,33 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: 
v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 @@ -15028,33 +15037,33 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; 
GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB56_1 @@ -15070,34 +15079,34 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_max_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_max_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB56_1 @@ -15213,38 +15222,38 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: 
s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_max_num_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15258,38 +15267,38 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 
v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB57_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15299,41 +15308,41 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 
0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15346,35 +15355,35 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB57_1 @@ -15385,36 +15394,36 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15424,36 +15433,36 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB57_1: ; 
%atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15463,37 +15472,37 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: 
flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15599,38 +15608,38 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_max_num_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v6, 
v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15644,38 +15653,38 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB58_1 ; 
GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15685,41 +15694,41 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_max_f32_e32 v6, v6, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15732,35 +15741,35 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: 
global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB58_1 @@ -15771,36 +15780,36 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: 
v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15810,36 +15819,36 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; 
GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB58_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15851,37 +15860,37 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB58_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15988,38 +15997,38 @@ define void 
@global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_max_num_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16033,38 +16042,38 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; 
GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB59_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16074,41 +16083,41 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: 
v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16121,35 +16130,35 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_max_f32_e32 v6, v6, v5 +; 
GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB59_1 @@ -16160,36 +16169,36 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, 
v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB59_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16199,36 +16208,36 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB59_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16240,37 +16249,37 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: 
v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB59_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16386,37 +16395,39 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 +; 
GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16433,35 +16444,35 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; 
GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB60_1 @@ -16474,39 +16485,41 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_max_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 
v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16521,34 +16534,34 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_max_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB60_1 @@ -16562,35 +16575,35 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; 
GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 @@ -16604,33 +16617,33 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 
0xffff0000, v6 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB60_1 @@ -16646,34 +16659,34 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_max_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_max_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB60_1 @@ -16785,39 +16798,39 @@ define void 
@global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_max_num_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16831,38 +16844,38 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX940-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: 
v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB61_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16872,41 +16885,41 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX11-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: 
v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16919,35 +16932,35 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX10-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 +; 
GFX10-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB61_1 @@ -16958,38 +16971,38 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX90A-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: 
buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16999,36 +17012,36 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX908-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB61_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17040,37 +17053,37 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: 
.LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB61_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index ecea5fdecfcd0..903e80b15814f 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -32,13 +32,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -77,13 +77,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, 
v2, v2 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -101,13 +101,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -193,13 +193,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -238,13 +238,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -262,13 +262,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -356,13 +356,13 @@ define float 
@global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -401,13 +401,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -425,13 +425,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -517,21 +517,21 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB3_1 ; 
GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -561,20 +561,20 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -584,20 +584,20 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -672,21 +672,21 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 
sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB4_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -716,20 +716,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -739,20 +739,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -830,21 +830,21 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; 
GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB5_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -874,20 +874,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -897,20 +897,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; 
GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -991,13 +991,13 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1036,13 +1036,13 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1062,13 +1062,13 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1155,21 +1155,21 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; 
GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB7_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1199,22 +1199,22 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1224,20 +1224,20 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1317,13 +1317,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1341,14 +1341,15 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_max_f32 v3, v2, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX11-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX11-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1368,14 +1369,14 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX10-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1395,13 +1396,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1419,13 +1420,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: 
buffer_wbinvl1 @@ -1546,13 +1547,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1591,13 +1592,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1615,13 +1616,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1711,13 +1712,13 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1756,13 +1757,13 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 
+; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1780,13 +1781,13 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1872,13 +1873,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1917,13 +1918,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1941,13 +1942,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2035,13 +2036,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start ; 
GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -2080,13 +2081,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -2104,13 +2105,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2196,21 +2197,21 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2240,20 +2241,20 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2263,20 +2264,20 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2351,21 +2352,21 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 
s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2395,20 +2396,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2418,20 +2419,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2509,21 +2510,21 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; 
GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB15_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2553,20 +2554,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2576,20 +2577,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: ; 
%bb.2: ; %atomicrmw.end @@ -2670,13 +2671,13 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -2715,13 +2716,13 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -2741,13 +2742,13 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2834,21 +2835,21 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; 
GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2878,22 +2879,22 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2903,20 +2904,20 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2990,15 +2991,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3028,15 +3029,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3074,15 +3075,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3100,15 +3101,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3163,15 +3164,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; 
GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3201,15 +3202,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3247,15 +3248,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3275,15 +3276,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[6:7] +; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], 
v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3337,15 +3338,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3375,15 +3376,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3421,15 +3422,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3449,15 +3450,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[6:7] +; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3510,21 +3511,21 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3547,22 +3548,22 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3592,21 +3593,21 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; 
GFX908-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3616,21 +3617,21 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3673,21 +3674,21 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] 
+; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3710,22 +3711,22 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3755,21 +3756,21 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040 +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; 
GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3781,21 +3782,21 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3839,21 +3840,21 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3876,22 +3877,22 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 +; 
GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3921,21 +3922,21 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3947,21 +3948,21 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[2:3], 
v[4:5], v[4:5] +; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4006,15 +4007,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4044,15 +4045,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -4072,15 +4073,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX10-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX10-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX10-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: 
global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4101,13 +4102,13 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -4125,15 +4126,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -4151,15 +4152,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -4177,28 +4178,26 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 +; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v11, v1 ; GFX7-NEXT: v_mov_b32_e32 v10, v0 -; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX7-NEXT: v_max_f64 
v[0:1], v[10:11], v[10:11] -; GFX7-NEXT: v_min_f64 v[8:9], v[0:1], v[2:3] +; GFX7-NEXT: v_min_f64 v[8:9], v[0:1], v[6:7] ; GFX7-NEXT: v_mov_b32_e32 v0, v8 ; GFX7-NEXT: v_mov_b32_e32 v1, v9 ; GFX7-NEXT: v_mov_b32_e32 v2, v10 ; GFX7-NEXT: v_mov_b32_e32 v3, v11 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] @@ -4213,14 +4212,13 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_mov_b32_e32 v7, v1 -; GFX6-NEXT: v_mov_b32_e32 v6, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 +; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4228,14 +4226,13 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX6-NEXT: v_mov_b32_e32 v11, v1 ; GFX6-NEXT: v_mov_b32_e32 v10, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX6-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] -; GFX6-NEXT: v_min_f64 v[8:9], v[0:1], v[2:3] +; GFX6-NEXT: v_min_f64 v[8:9], v[0:1], v[6:7] ; GFX6-NEXT: v_mov_b32_e32 v0, v8 ; GFX6-NEXT: v_mov_b32_e32 v1, v9 ; GFX6-NEXT: v_mov_b32_e32 v2, v10 ; GFX6-NEXT: v_mov_b32_e32 v3, v11 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] @@ -4259,15 +4256,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4297,15 +4294,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 
0 ; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -4343,15 +4340,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -4369,15 +4366,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -4436,8 +4433,9 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off @@ -4449,12 +4447,11 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v7 +; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -4487,14 +4484,14 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX940-NEXT: v_min_f16_e32 v5, v6, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX940-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -4514,8 +4511,9 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-NEXT: global_load_b32 v5, v[0:1], off @@ -4527,12 +4525,11 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v5, v5, v7 +; GFX11-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -4556,6 +4553,7 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -4567,10 +4565,9 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_min_f16_e32 v5, v5, v7 +; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4599,14 
+4596,14 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc @@ -4633,14 +4630,14 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX908-NEXT: v_min_f16_e32 v5, v7, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc @@ -4667,17 +4664,17 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-NEXT: v_min_f16_e32 v5, v7, v5 -; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -4790,10 +4787,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -4803,12 +4801,11 @@ define half 
@global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v7 +; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -4843,14 +4840,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX940-NEXT: v_min_f16_e32 v5, v6, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX940-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -4871,10 +4868,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -4884,12 +4882,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v5, v5, v7 +; GFX11-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -4914,9 +4911,10 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; 
GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -4925,10 +4923,9 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_min_f16_e32 v5, v5, v7 +; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4958,14 +4955,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc @@ -4993,14 +4990,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX908-NEXT: v_min_f16_e32 v5, v7, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc @@ -5028,17 +5025,17 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-NEXT: v_min_f16_e32 v5, v7, v5 -; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX8-NEXT: v_or_b32_e32 
v5, v7, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -5155,10 +5152,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -5168,12 +5166,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v7 +; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -5209,14 +5206,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX940-NEXT: v_min_f16_e32 v5, v6, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX940-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -5237,10 +5234,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -5250,12 +5248,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: 
v_mov_b32_e32 v6, v5 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v5, v5, v7 +; GFX11-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -5280,9 +5277,10 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -5291,10 +5289,9 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_min_f16_e32 v5, v5, v7 +; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5324,14 +5321,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc @@ -5359,14 +5356,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX908-NEXT: v_min_f16_e32 v5, v7, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: 
global_atomic_cmpswap v5, v[0:1], v[5:6], off glc @@ -5394,17 +5391,17 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-NEXT: v_min_f16_e32 v5, v7, v5 -; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -5520,8 +5517,9 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: global_load_b32 v4, v[0:1], off @@ -5533,10 +5531,9 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v7 +; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 @@ -5570,13 +5567,13 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v6, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_min_f16_e32 v4, v4, v7 +; GFX940-NEXT: v_min_f16_e32 v4, v4, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -5596,8 +5593,9 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; 
GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-NEXT: global_load_b32 v4, v[0:1], off @@ -5609,10 +5607,9 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX11-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 @@ -5637,6 +5634,7 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -5648,9 +5646,8 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX10-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5679,13 +5676,13 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc @@ -5712,13 +5709,13 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX908-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc @@ -5745,16 +5742,16 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: 
.LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 +; GFX8-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -5862,36 +5859,36 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 +; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v7 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5909,29 +5906,29 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX940-NEXT: v_and_b32_e32 v0, 
-4, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_min_f16_e32 v4, v4, v7 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX940-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB30_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5941,37 +5938,37 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 +; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: 
v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -5983,31 +5980,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v5, v5 ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB30_1 @@ -6018,31 +6015,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6052,31 +6049,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX908-NEXT: v_min_f16_e32 v2, v2, 
v6 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6086,32 +6083,32 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX8-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6216,36 +6213,36 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 
v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 +; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v7 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6264,29 +6261,29 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_min_f16_e32 v4, v4, v7 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX940-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; 
GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB31_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6296,37 +6293,37 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 +; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6338,31 +6335,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_max_f16_e32 
v6, v2, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v5, v5 ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB31_1 @@ -6373,31 +6370,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_and_or_b32 v2, 
v3, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6407,31 +6404,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX908-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB31_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6441,32 +6438,32 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 +; GFX8-NEXT: 
v_lshlrev_b32_e64 v5, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX8-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6572,15 +6569,15 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_max_num_f16_e32 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v4, v4 -; GFX12-NEXT: v_min_num_f16_e32 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v4 +; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 @@ -6605,14 +6602,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f16_e32 v3, v3, v4 +; GFX940-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 @@ -6631,15 +6628,15 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; 
GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_max_f16_e32 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX11-NEXT: v_min_f16_e32 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v3, v4, v4 +; GFX11-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 @@ -6662,14 +6659,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f16_e32 v3, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX10-NEXT: v_min_f16_e32 v3, v5, v3 +; GFX10-NEXT: v_max_f16_e32 v3, v4, v4 +; GFX10-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6691,14 +6688,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v3, v3, v4 +; GFX90A-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6717,14 +6714,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f16_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -6745,19 +6742,19 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v0, v1, v1 -; GFX8-NEXT: v_and_b32_e32 v6, 
0xffff0000, v1 -; GFX8-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX8-NEXT: v_or_b32_e32 v0, v6, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_max_f16_e32 v0, v6, v6 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v5, v2, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB32_1 @@ -6847,24 +6844,24 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2046 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_max_num_f16_e32 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f16_e32 v5, v4, v4 +; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v3, v5, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v4 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6878,23 +6875,23 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2046 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f16_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f16_e32 v3, v4, v3 -; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f16_e32 v2, v2, v4 +; GFX940-NEXT: v_and_or_b32 v2, v3, s2, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB33_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6904,25 +6901,25 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2046 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX11-NEXT: v_max_f16_e32 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v3, v5, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_min_f16_e32 v2, v2, v4 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6934,24 +6931,24 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2046 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX10-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX10-NEXT: v_min_f16_e32 v3, v5, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX10-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX10-NEXT: v_min_f16_e32 v2, v2, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; 
GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB33_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6961,22 +6958,22 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2046 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f16_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v3, v4, v3 -; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc +; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f16_e32 v2, v2, v4 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6986,22 +6983,22 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2046 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f16_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc +; GFX908-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f16_e32 v2, v2, v4 +; GFX908-NEXT: v_and_or_b32 v2, v3, s6, v2 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7013,22 +7010,22 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX8-NEXT: 
.LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX8-NEXT: v_min_f16_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_min_f16_e32 v2, v2, v4 +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7117,10 +7114,11 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -7130,12 +7128,11 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v7 +; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -7171,14 +7168,14 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX940-NEXT: v_min_f16_e32 v5, v6, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX940-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 @@ -7199,10 +7196,11 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -7212,12 +7210,11 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v5, v5, v7 +; GFX11-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -7242,9 +7239,10 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -7253,10 +7251,9 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_min_f16_e32 v5, v5, v7 +; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7286,14 +7283,14 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: buffer_wbl2 @@ -7323,14 +7320,14 @@ define half 
@global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX908-NEXT: v_min_f16_e32 v5, v7, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc @@ -7358,17 +7355,17 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-NEXT: v_min_f16_e32 v5, v7, v5 -; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -7483,37 +7480,37 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 +; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v7 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 
0xffff, v3 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7531,29 +7528,29 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_min_f16_e32 v4, v4, v7 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX940-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB35_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7563,37 +7560,37 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 +; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7605,31 +7602,31 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX10-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v5, v5 ; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_min_f16_e32 
v3, v3, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB35_1 @@ -7640,33 +7637,33 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7676,31 +7673,31 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: 
v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX908-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB35_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7710,32 +7707,32 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX8-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 
v4, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11684,15 +11681,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -11714,14 +11711,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v3, v4 +; GFX940-NEXT: v_pk_min_f16 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -11739,15 +11736,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -11767,14 +11764,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start ; 
GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -11794,13 +11791,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v3, v4 +; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -11818,13 +11815,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v5 +; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -11842,21 +11839,21 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v3, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v6, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v3, v7, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v5 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_max_f16_sdwa v3, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_min_f16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB46_1 @@ -11977,15 +11974,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -12007,14 +12004,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v3, v4 +; GFX940-NEXT: v_pk_min_f16 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -12032,15 +12029,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -12060,14 +12057,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 +; 
GFX10-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -12087,13 +12084,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v3, v4 +; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -12111,13 +12108,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v5 +; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -12137,21 +12134,21 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 -; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v0, v7, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB47_1 @@ -12272,15 +12269,15 @@ define <2 x 
half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -12302,14 +12299,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v3, v4 +; GFX940-NEXT: v_pk_min_f16 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -12327,15 +12324,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -12355,14 +12352,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; 
GFX10-NEXT: s_waitcnt vmcnt(0) @@ -12382,13 +12379,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v3, v4 +; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -12406,13 +12403,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v5 +; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -12432,21 +12429,21 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 -; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v0, v7, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB48_1 @@ -12570,21 +12567,21 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: 
global_load_b32 v4, v[0:1], off +; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 +; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -12598,22 +12595,22 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v4, v3 +; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB49_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12623,22 +12620,22 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -12650,21 +12647,21 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB49_1 @@ -12675,20 +12672,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12698,20 +12695,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 -; 
GFX908-NEXT: v_pk_min_f16 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12721,24 +12718,24 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 +; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12852,21 +12849,21 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 +; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: 
global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -12880,22 +12877,22 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v4, v3 +; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB50_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12905,22 +12902,22 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -12932,21 +12929,21 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB50_1 @@ -12957,20 +12954,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12980,20 +12977,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: 
v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13005,24 +13002,24 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 +; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13137,21 +13134,21 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 +; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: 
s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13165,22 +13162,22 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v4, v3 +; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB51_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13190,22 +13187,22 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -13217,21 +13214,21 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; 
GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB51_1 @@ -13242,20 +13239,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13265,20 +13262,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB51_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13290,24 +13287,24 @@ define void 
@global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 +; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13431,15 +13428,15 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS @@ -13462,14 +13459,14 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v3, v4 +; 
GFX940-NEXT: v_pk_min_f16 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -13487,15 +13484,15 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -13515,14 +13512,14 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -13542,13 +13539,13 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v3, v4 +; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -13568,13 +13565,13 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v5 +; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -13594,21 +13591,21 @@ define <2 x half> 
@global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 -; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v0, v7, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB52_1 @@ -13728,22 +13725,22 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 +; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13757,22 +13754,22 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX940-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; 
GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v4, v3 +; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB53_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13782,22 +13779,22 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX11-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -13809,21 +13806,21 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX10-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB53_1 @@ -13834,22 +13831,22 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX90A-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v4, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13859,20 +13856,20 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX908-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13884,24 +13881,24 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: 
v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 +; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14021,36 +14018,38 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: 
v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14067,35 +14066,35 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: 
s_cbranch_execnz .LBB54_1 @@ -14108,39 +14107,41 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_min_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14155,34 +14156,34 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 
0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_min_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB54_1 @@ -14196,33 +14197,33 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; 
GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 @@ -14236,33 +14237,33 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB54_1 @@ -14276,34 +14277,34 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; 
GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB54_1 @@ -14416,36 +14417,38 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 
v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14462,35 +14465,35 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, 
v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB55_1 @@ -14503,39 +14506,41 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_min_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, 
v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14550,34 +14555,34 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_min_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB55_1 @@ -14591,33 +14596,33 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; 
GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 @@ -14631,33 +14636,33 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; 
GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB55_1 @@ -14673,34 +14678,34 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_min_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_min_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; 
GFX8-NEXT: s_cbranch_execnz .LBB55_1 @@ -14813,36 +14818,38 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14859,35 +14866,35 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: 
s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB56_1 @@ -14900,39 +14907,41 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_min_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14947,34 +14956,34 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_min_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; 
GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB56_1 @@ -14988,33 +14997,33 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 @@ -15028,33 +15037,33 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: 
s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB56_1 @@ -15070,34 +15079,34 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_min_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_min_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: 
v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB56_1 @@ -15213,38 +15222,38 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_min_num_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, 
v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15258,38 +15267,38 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_min_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB57_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15299,41 +15308,41 @@ define void 
@global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15346,35 +15355,35 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 
0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB57_1 @@ -15385,36 +15394,36 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_min_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 
v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15424,36 +15433,36 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt 
vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15463,37 +15472,37 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15599,38 +15608,38 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; 
GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_min_num_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15644,38 +15653,38 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_min_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB58_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15685,41 +15694,41 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15732,35 +15741,35 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; 
GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB58_1 @@ -15771,36 +15780,36 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_min_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15810,36 +15819,36 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX908-LABEL: 
global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB58_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15851,37 +15860,37 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, 
vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB58_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15988,38 +15997,38 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_min_num_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16033,38 +16042,38 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_min_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: 
buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB59_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16074,41 +16083,41 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: 
v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16121,35 +16130,35 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB59_1 @@ -16160,36 +16169,36 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start ; 
GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_min_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB59_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16199,36 +16208,36 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: 
v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB59_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16240,37 +16249,37 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 
v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB59_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16386,37 +16395,39 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16433,35 +16444,35 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB60_1 @@ -16474,39 +16485,41 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_min_f32 v5, v7, 
v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16521,34 +16534,34 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_min_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_min_f32_e32 
v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB60_1 @@ -16562,35 +16575,35 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 
; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 @@ -16604,33 +16617,33 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB60_1 @@ -16646,34 +16659,34 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_min_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_min_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, 
v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB60_1 @@ -16785,39 +16798,39 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_min_num_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: 
v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16831,38 +16844,38 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX940-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_min_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: 
global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB61_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16872,41 +16885,41 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX11-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; 
GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16919,35 +16932,35 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX10-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB61_1 @@ -16958,38 +16971,38 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX90A-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner 
Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_min_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16999,36 +17012,36 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX908-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, 
vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB61_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17040,37 +17053,37 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: 
v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB61_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index 88a7a5f4d9c6b..3dbf6477a7cb8 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -14477,36 +14477,38 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, 
v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14523,35 +14525,35 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_sub_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB50_1 @@ -14564,39 +14566,41 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14611,34 +14615,34 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; 
GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB50_1 @@ -14652,33 +14656,33 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 @@ -14692,33 +14696,33 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB50_1 @@ -14732,34 +14736,34 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX8-NEXT: 
v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB50_1 @@ -14872,36 +14876,38 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 
v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14918,35 +14924,35 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_sub_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap 
v3, v[0:1], v[6:7], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB51_1 @@ -14959,39 +14965,41 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s1 @@ -15006,34 +15014,34 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB51_1 @@ -15047,33 +15055,33 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; 
GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 @@ -15087,33 +15095,33 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: 
global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB51_1 @@ -15129,34 +15137,34 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_sub_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB51_1 @@ -15269,36 +15277,38 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15315,35 +15325,35 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_sub_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: 
v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB52_1 @@ -15356,39 +15366,41 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; 
GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: buffer_gl1_inv
 ; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -15403,34 +15415,34 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT: s_mov_b32 s5, 0
 ; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_sub_f32_e32 v3, v6, v3
-; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
 ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: buffer_gl1_inv
 ; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT: s_cbranch_execnz .LBB52_1
@@ -15444,33 +15456,33 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB52_1
@@ -15484,33 +15496,33 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048
 ; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302
 ; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_sub_f32_e32 v3, v7, v3
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
 ; GFX908-NEXT: s_cbranch_execnz .LBB52_1
@@ -15526,34 +15538,34 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
 ; GFX8-NEXT: flat_load_dword v0, v[3:4]
 ; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_sub_f32_e32 v0, v7, v0
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
 ; GFX8-NEXT: s_cbranch_execnz .LBB52_1
@@ -15669,38 +15681,38 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX12-NEXT: s_wait_samplecnt 0x0
 ; GFX12-NEXT: s_wait_bvhcnt 0x0
 ; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX12-NEXT: s_mov_b32 s1, 0
 ; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start
 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2
 ; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX12-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
 ; GFX12-NEXT: s_wait_alu 0xfffe
 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT: s_wait_alu 0xfffe
@@ -15714,38 +15726,38 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2bf16:
 ; GFX940: ; %bb.0:
 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off
+; GFX940-NEXT: global_load_dword v3, v[0:1], off
 ; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302
 ; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start
 ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2
 ; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3
-; GFX940-NEXT: v_sub_f32_e32 v4, v7, v6
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
 ; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
 ; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
 ; GFX940-NEXT: s_waitcnt vmcnt(0)
 ; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
 ; GFX940-NEXT: s_cbranch_execnz .LBB53_1
 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15755,41 +15767,41 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX11-LABEL: global_agent_atomic_fsub_noret_v2bf16:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX11-NEXT: s_mov_b32 s1, 0
 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT: .p2align 6
 ; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start
 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: buffer_gl1_inv
 ; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -15802,35 +15814,35 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX10-LABEL: global_agent_atomic_fsub_noret_v2bf16:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX10-NEXT: s_mov_b32 s5, 0
 ; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: buffer_gl1_inv
 ; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT: s_cbranch_execnz .LBB53_1
@@ -15841,36 +15853,36 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX90A-LABEL: global_agent_atomic_fsub_noret_v2bf16:
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
 ; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_sub_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB53_1
 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15880,36 +15892,36 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX908-LABEL: global_agent_atomic_fsub_noret_v2bf16:
 ; GFX908: ; %bb.0:
 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
 ; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302
 ; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
 ; GFX908-NEXT: s_cbranch_execnz .LBB53_1
 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15919,37 +15931,37 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX8-LABEL: global_agent_atomic_fsub_noret_v2bf16:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
 ; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
 ; GFX8-NEXT: s_cbranch_execnz .LBB53_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16055,38 +16067,38 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX12-NEXT: s_wait_samplecnt 0x0
 ; GFX12-NEXT: s_wait_bvhcnt 0x0
 ; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX12-NEXT: s_mov_b32 s1, 0
 ; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start
 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2
 ; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX12-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
 ; GFX12-NEXT: s_wait_alu 0xfffe
 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT: s_wait_alu 0xfffe
@@ -16100,38 +16112,38 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
 ; GFX940: ; %bb.0:
 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
 ; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302
 ; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start
 ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2
 ; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3
-; GFX940-NEXT: v_sub_f32_e32 v4, v7, v6
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
 ; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
 ; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0
 ; GFX940-NEXT: s_waitcnt vmcnt(0)
 ; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
 ; GFX940-NEXT: s_cbranch_execnz .LBB54_1
 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16141,41 +16153,41 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX11-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX11-NEXT: s_mov_b32 s1, 0
 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT: .p2align 6
 ; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start
 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: buffer_gl1_inv
 ; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -16188,35 +16200,35 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX10-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX10-NEXT: s_mov_b32 s5, 0
 ; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: buffer_gl1_inv
 ; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT: s_cbranch_execnz .LBB54_1
@@ -16227,36 +16239,36 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX90A-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
 ; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_sub_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB54_1
 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16266,36 +16278,36 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX908-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
 ; GFX908: ; %bb.0:
 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
 ; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302
 ; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
 ; GFX908-NEXT: s_cbranch_execnz .LBB54_1
 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16307,37 +16319,37 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
 ; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
 ; GFX8-NEXT: s_cbranch_execnz .LBB54_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16444,38 +16456,38 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX12-NEXT: s_wait_samplecnt 0x0
 ; GFX12-NEXT: s_wait_bvhcnt 0x0
 ; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-2048
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX12-NEXT: s_mov_b32 s1, 0
 ; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start
 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2
 ; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX12-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
 ; GFX12-NEXT: s_wait_alu 0xfffe
 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT: s_wait_alu 0xfffe
@@ -16489,38 +16501,38 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
 ; GFX940: ; %bb.0:
 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048
 ; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302
 ; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start
 ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2
 ; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3
-; GFX940-NEXT: v_sub_f32_e32 v4, v7, v6
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
 ; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
 ; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0
 ; GFX940-NEXT: s_waitcnt vmcnt(0)
 ; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
 ; GFX940-NEXT: s_cbranch_execnz .LBB55_1
 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16530,41 +16542,41 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX11-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX11-NEXT: s_mov_b32 s1, 0
 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT: .p2align 6
 ; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start
 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: buffer_gl1_inv
 ; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -16577,35 +16589,35 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX10-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX10-NEXT: s_mov_b32 s5, 0
 ; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: buffer_gl1_inv
 ; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT: s_cbranch_execnz .LBB55_1
@@ -16616,36 +16628,36 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX90A-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
 ; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_sub_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB55_1
 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16655,36 +16667,36 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX908-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
 ; GFX908: ; %bb.0:
 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048
 ; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302
 ; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
 ; GFX908-NEXT: s_cbranch_execnz .LBB55_1
 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16696,37 +16708,37 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0
 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
 ; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
 ; GFX8-NEXT: s_cbranch_execnz .LBB55_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16842,37 +16854,39 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add
 ; GFX12-NEXT: s_wait_bvhcnt 0x0
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX12-NEXT: s_mov_b32 s1, 0
 ; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start
 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_sub_f32_e32 v3, v6, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
 ; GFX12-NEXT: s_wait_alu 0xfffe
 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT: s_wait_alu 0xfffe
@@ -16889,35 +16903,35 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add
 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
 ; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302
 ; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start
 ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_sub_f32_e32 v4, v6, v4
-; GFX940-NEXT: v_sub_f32_e32 v3, v7, v3
-; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
+; GFX940-NEXT:
v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB56_1 @@ -16930,39 +16944,41 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, 
v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16977,34 +16993,34 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB56_1 @@ -17018,35 +17034,35 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 @@ -17060,33 +17076,33 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 
v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB56_1 @@ -17102,34 +17118,34 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_sub_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; 
GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB56_1 @@ -17241,39 +17257,39 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -17287,38 +17303,38 @@ define void 
@global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX940-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_sub_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB57_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17328,41 +17344,41 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX11-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -17375,35 +17391,35 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX10-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; 
GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB57_1 @@ -17414,38 +17430,38 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX90A-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_sub_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: 
v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17455,36 +17471,36 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX908-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17496,37 +17512,37 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: 
v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll index dde6671178af7..b9592a9ff9073 100644 --- a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll +++ b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll @@ -56,19 +56,19 @@ define amdgpu_kernel void @test_move_load_address_to_vgpr_d16_hi(ptr addrspace(1 ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_movk_i32 s2, 0x100 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_ushort v0, v1, s[0:1] glc ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: s_movk_i32 s1, 0x100 ; GCN-NEXT: .LBB1_1: ; %bb3 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_lshlrev_b64 v[2:3], 1, v[0:1] -; GCN-NEXT: v_mov_b32_e32 v0, s1 -; GCN-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v3, vcc -; GCN-NEXT: global_load_short_d16_hi v0, v[2:3], off glc +; 
GCN-NEXT: v_lshlrev_b64 v[3:4], 1, v[0:1] +; GCN-NEXT: v_add_co_u32_e32 v3, vcc, s0, v3 +; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, v2, v4, vcc +; GCN-NEXT: global_load_short_d16_hi v0, v[3:4], off glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s2, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s1, v0 ; GCN-NEXT: s_cbranch_vccz .LBB1_1 ; GCN-NEXT: ; %bb.2: ; %bb2 ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll index 10d6a54baf665..f7882e6f12022 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll @@ -2528,21 +2528,19 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB55_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, s6, v1 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_not_b32_e32 v0, v0 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_and_b32_e32 v2, s6, v3 +; VI-NEXT: v_not_b32_e32 v2, v2 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB55_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2623,25 +2621,23 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dword v1, v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB56_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, s6, v1 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: v_not_b32_e32 v0, v0 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_and_b32_e32 v2, s6, v3 +; VI-NEXT: v_not_b32_e32 v2, v2 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB56_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i32_noret_offset_scalar: @@ -2718,19 +2714,19 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: 
.LBB57_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_and_b32_e32 v0, s6, v1 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_not_b32_e32 v0, v0 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_and_b32_e32 v0, s6, v4 +; VI-NEXT: v_not_b32_e32 v3, v0 +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB57_1 @@ -2810,27 +2806,25 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 16 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: v_mov_b32_e32 v1, s34 +; VI-NEXT: v_mov_b32_e32 v2, s35 +; VI-NEXT: flat_load_dword v0, v[1:2] +; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB58_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_and_b32_e32 v0, s6, v1 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: v_not_b32_e32 v0, v0 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_and_b32_e32 v0, s6, v4 +; VI-NEXT: v_not_b32_e32 v3, v0 +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB58_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i32_ret_offset_scalar: @@ -4304,20 +4298,18 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB87_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_max_i32_e32 v0, s6, v1 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_max_i32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB87_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4396,24 +4388,22 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: 
v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dword v1, v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB88_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_max_i32_e32 v0, s6, v1 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_max_i32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB88_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i32_noret_offset_scalar: @@ -4488,18 +4478,18 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB89_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_max_i32_e32 v0, s6, v1 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_max_i32_e32 v3, s6, v4 +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB89_1 @@ -4577,26 +4567,24 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 16 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: v_mov_b32_e32 v1, s34 +; VI-NEXT: v_mov_b32_e32 v2, s35 +; VI-NEXT: flat_load_dword v0, v[1:2] +; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB90_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: v_max_i32_e32 v0, s6, v1 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_max_i32_e32 v3, s6, v4 +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB90_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; 
VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i32_ret_offset_scalar: @@ -4665,26 +4653,26 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s0, s0, s4 -; VI-NEXT: s_addc_u32 s1, s1, s5 -; VI-NEXT: s_load_dword s3, s[0:1], 0x10 -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_add_u32 s4, s0, s4 +; VI-NEXT: s_addc_u32 s5, s1, s5 +; VI-NEXT: s_load_dword s3, s[4:5], 0x10 +; VI-NEXT: s_add_u32 s4, s4, 16 +; VI-NEXT: s_addc_u32 s5, s5, 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: .LBB91_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_max_i32_e32 v0, s2, v1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_max_i32_e32 v2, s2, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; VI-NEXT: s_cbranch_execnz .LBB91_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm @@ -4771,32 +4759,32 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_ashr_i32 s5, s7, 31 ; VI-NEXT: s_mov_b32 s4, s7 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s0, s0, s4 -; VI-NEXT: s_addc_u32 s1, s1, s5 -; VI-NEXT: s_load_dword s7, s[0:1], 0x10 -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_add_u32 s4, s0, s4 +; VI-NEXT: s_addc_u32 s5, s1, s5 +; VI-NEXT: s_load_dword s7, s[4:5], 0x10 +; VI-NEXT: s_add_u32 s4, s4, 16 +; VI-NEXT: s_addc_u32 s5, s5, 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: .LBB92_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_max_i32_e32 v0, s6, v1 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_max_i32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; VI-NEXT: s_cbranch_execnz .LBB92_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: flat_store_dword v[1:2], v0 +; VI-NEXT: s_or_b64 exec, exec, s[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s2 +; 
VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i32_ret_addr64_offset: @@ -4878,24 +4866,24 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s0, s0, s4 -; VI-NEXT: s_addc_u32 s1, s1, s5 -; VI-NEXT: s_load_dword s3, s[0:1], 0x0 -; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_add_u32 s4, s0, s4 +; VI-NEXT: s_addc_u32 s5, s1, s5 +; VI-NEXT: s_load_dword s3, s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: .LBB93_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_max_i32_e32 v0, s2, v1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_max_i32_e32 v2, s2, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; VI-NEXT: s_cbranch_execnz .LBB93_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm @@ -4981,30 +4969,30 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: s_ashr_i32 s5, s7, 31 ; VI-NEXT: s_mov_b32 s4, s7 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s0, s0, s4 -; VI-NEXT: s_addc_u32 s1, s1, s5 -; VI-NEXT: s_load_dword s7, s[0:1], 0x0 -; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_add_u32 s4, s0, s4 +; VI-NEXT: s_addc_u32 s5, s1, s5 +; VI-NEXT: s_load_dword s7, s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v2, s7 ; VI-NEXT: .LBB94_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_max_i32_e32 v0, s6, v1 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_max_i32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; VI-NEXT: s_cbranch_execnz .LBB94_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: flat_store_dword v[1:2], v0 +; VI-NEXT: s_or_b64 exec, exec, s[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i32_ret_addr64: @@ -5563,20 +5551,18 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: 
v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB101_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_max_u32_e32 v0, s6, v1 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_max_u32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB101_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5655,24 +5641,22 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dword v1, v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB102_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_max_u32_e32 v0, s6, v1 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_max_u32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB102_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i32_noret_offset_scalar: @@ -5747,18 +5731,18 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB103_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_max_u32_e32 v0, s6, v1 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_max_u32_e32 v3, s6, v4 +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB103_1 @@ -5836,26 +5820,24 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 16 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, 
s35 -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: v_mov_b32_e32 v1, s34 +; VI-NEXT: v_mov_b32_e32 v2, s35 +; VI-NEXT: flat_load_dword v0, v[1:2] +; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB104_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: v_max_u32_e32 v0, s6, v1 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_max_u32_e32 v3, s6, v4 +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB104_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i32_ret_offset_scalar: @@ -5924,26 +5906,26 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s0, s0, s4 -; VI-NEXT: s_addc_u32 s1, s1, s5 -; VI-NEXT: s_load_dword s3, s[0:1], 0x10 -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_add_u32 s4, s0, s4 +; VI-NEXT: s_addc_u32 s5, s1, s5 +; VI-NEXT: s_load_dword s3, s[4:5], 0x10 +; VI-NEXT: s_add_u32 s4, s4, 16 +; VI-NEXT: s_addc_u32 s5, s5, 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: .LBB105_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_max_u32_e32 v0, s2, v1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_max_u32_e32 v2, s2, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; VI-NEXT: s_cbranch_execnz .LBB105_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm @@ -6030,32 +6012,32 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; VI-NEXT: s_ashr_i32 s5, s7, 31 ; VI-NEXT: s_mov_b32 s4, s7 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s0, s0, s4 -; VI-NEXT: s_addc_u32 s1, s1, s5 -; VI-NEXT: s_load_dword s7, s[0:1], 0x10 -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_add_u32 s4, s0, s4 +; VI-NEXT: s_addc_u32 s5, s1, s5 +; VI-NEXT: s_load_dword s7, s[4:5], 0x10 +; VI-NEXT: s_add_u32 s4, s4, 16 +; VI-NEXT: s_addc_u32 s5, s5, 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: 
v_mov_b32_e32 v2, s7 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: .LBB106_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_max_u32_e32 v0, s6, v1 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_max_u32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; VI-NEXT: s_cbranch_execnz .LBB106_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: flat_store_dword v[1:2], v0 +; VI-NEXT: s_or_b64 exec, exec, s[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i32_ret_addr64_offset: @@ -6145,30 +6127,30 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: s_ashr_i32 s5, s7, 31 ; VI-NEXT: s_mov_b32 s4, s7 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s0, s0, s4 -; VI-NEXT: s_addc_u32 s1, s1, s5 -; VI-NEXT: s_load_dword s7, s[0:1], 0x0 -; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_add_u32 s4, s0, s4 +; VI-NEXT: s_addc_u32 s5, s1, s5 +; VI-NEXT: s_load_dword s7, s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v2, s7 ; VI-NEXT: .LBB107_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_max_u32_e32 v0, s6, v1 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_max_u32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; VI-NEXT: s_cbranch_execnz .LBB107_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: flat_store_dword v[1:2], v0 +; VI-NEXT: s_or_b64 exec, exec, s[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i32_ret_addr64: @@ -6727,20 +6709,18 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB114_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_min_u32_e32 v0, s6, v1 -; VI-NEXT: v_mov_b32_e32 v3, s5 
-; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_min_u32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB114_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6819,24 +6799,22 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dword v1, v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB115_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_min_u32_e32 v0, s6, v1 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_min_u32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB115_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i32_noret_offset_scalar: @@ -6911,18 +6889,18 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB116_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_min_u32_e32 v0, s6, v1 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_min_u32_e32 v3, s6, v4 +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB116_1 @@ -7000,26 +6978,24 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 16 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: v_mov_b32_e32 v1, s34 +; VI-NEXT: v_mov_b32_e32 v2, s35 +; VI-NEXT: flat_load_dword v0, v[1:2] +; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB117_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: 
v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: v_min_u32_e32 v0, s6, v1 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_min_u32_e32 v3, s6, v4 +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB117_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i32_ret_offset_scalar: @@ -7566,20 +7542,18 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB124_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_min_i32_e32 v0, s6, v1 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_min_i32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB124_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7658,24 +7632,22 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dword v1, v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB125_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_min_i32_e32 v0, s6, v1 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_min_i32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB125_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i32_noret_offset_scalar: @@ -7750,18 +7722,18 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; 
VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB126_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_min_i32_e32 v0, s6, v1 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_min_i32_e32 v3, s6, v4 +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB126_1 @@ -7839,26 +7811,24 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 16 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: v_mov_b32_e32 v1, s34 +; VI-NEXT: v_mov_b32_e32 v2, s35 +; VI-NEXT: flat_load_dword v0, v[1:2] +; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB127_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: v_min_i32_e32 v0, s6, v1 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_min_i32_e32 v3, s6, v4 +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB127_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i32_ret_offset_scalar: @@ -7927,26 +7897,26 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s0, s0, s4 -; VI-NEXT: s_addc_u32 s1, s1, s5 -; VI-NEXT: s_load_dword s3, s[0:1], 0x10 -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_add_u32 s4, s0, s4 +; VI-NEXT: s_addc_u32 s5, s1, s5 +; VI-NEXT: s_load_dword s3, s[4:5], 0x10 +; VI-NEXT: s_add_u32 s4, s4, 16 +; VI-NEXT: s_addc_u32 s5, s5, 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: .LBB128_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_min_i32_e32 v0, s2, v1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_min_i32_e32 v2, s2, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: 
s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; VI-NEXT: s_cbranch_execnz .LBB128_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm @@ -8033,32 +8003,32 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_ashr_i32 s5, s7, 31 ; VI-NEXT: s_mov_b32 s4, s7 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s0, s0, s4 -; VI-NEXT: s_addc_u32 s1, s1, s5 -; VI-NEXT: s_load_dword s7, s[0:1], 0x10 -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_add_u32 s4, s0, s4 +; VI-NEXT: s_addc_u32 s5, s1, s5 +; VI-NEXT: s_load_dword s7, s[4:5], 0x10 +; VI-NEXT: s_add_u32 s4, s4, 16 +; VI-NEXT: s_addc_u32 s5, s5, 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: .LBB129_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_min_i32_e32 v0, s6, v1 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_min_i32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; VI-NEXT: s_cbranch_execnz .LBB129_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: flat_store_dword v[1:2], v0 +; VI-NEXT: s_or_b64 exec, exec, s[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i32_ret_addr64_offset: @@ -8131,25 +8101,25 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_min_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; VI-NEXT: s_load_dword s4, s[4:5], 0x2c -; VI-NEXT: s_mov_b64 s[2:3], 0 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s5, s[0:1], 0x0 +; VI-NEXT: s_load_dword s3, s[6:7], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: .LBB130_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_min_i32_e32 v0, s4, v1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_min_i32_e32 v2, s2, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: 
v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; VI-NEXT: s_cbranch_execnz .LBB130_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm @@ -8230,30 +8200,30 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: s_ashr_i32 s5, s7, 31 ; VI-NEXT: s_mov_b32 s4, s7 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s0, s0, s4 -; VI-NEXT: s_addc_u32 s1, s1, s5 -; VI-NEXT: s_load_dword s7, s[0:1], 0x0 -; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_add_u32 s4, s0, s4 +; VI-NEXT: s_addc_u32 s5, s1, s5 +; VI-NEXT: s_load_dword s7, s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v2, s7 ; VI-NEXT: .LBB131_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_min_i32_e32 v0, s6, v1 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_min_i32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; VI-NEXT: s_cbranch_execnz .LBB131_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: flat_store_dword v[1:2], v0 +; VI-NEXT: s_or_b64 exec, exec, s[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i32_ret_addr64: diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll index 645bb0b117ccb..59a99a6a0328d 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll @@ -2626,14 +2626,14 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB54_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, s7, v3 ; VI-NEXT: v_and_b32_e32 v6, s6, v2 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_not_b32_e32 v1, v0 ; VI-NEXT: v_not_b32_e32 v0, v6 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -2730,17 +2730,15 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB55_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt 
vmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, s7, v3 ; VI-NEXT: v_and_b32_e32 v6, s6, v2 -; VI-NEXT: v_mov_b32_e32 v4, s34 -; VI-NEXT: v_mov_b32_e32 v5, s35 ; VI-NEXT: v_not_b32_e32 v1, v0 ; VI-NEXT: v_not_b32_e32 v0, v6 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -2748,12 +2746,12 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB55_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i64_noret_offset_scalar: @@ -2839,22 +2837,22 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB56_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_and_b32_e32 v0, s7, v3 -; VI-NEXT: v_and_b32_e32 v6, s6, v2 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_not_b32_e32 v1, v0 -; VI-NEXT: v_not_b32_e32 v0, v6 -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; VI-NEXT: v_mov_b32_e32 v7, v1 +; VI-NEXT: v_mov_b32_e32 v6, v0 +; VI-NEXT: v_and_b32_e32 v0, s7, v7 +; VI-NEXT: v_and_b32_e32 v1, s6, v6 +; VI-NEXT: v_not_b32_e32 v5, v0 +; VI-NEXT: v_not_b32_e32 v4, v1 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB56_1 @@ -2943,30 +2941,28 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB57_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: v_mov_b32_e32 v4, s34 -; VI-NEXT: v_and_b32_e32 v0, s7, v3 -; VI-NEXT: v_and_b32_e32 v6, s6, v2 -; VI-NEXT: v_mov_b32_e32 v5, s35 -; VI-NEXT: v_not_b32_e32 v1, v0 -; VI-NEXT: v_not_b32_e32 v0, v6 -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; VI-NEXT: v_mov_b32_e32 v7, v1 +; VI-NEXT: v_mov_b32_e32 v6, v0 +; VI-NEXT: v_and_b32_e32 v0, s7, v7 +; VI-NEXT: v_and_b32_e32 v1, s6, v6 +; VI-NEXT: v_not_b32_e32 v5, v0 +; VI-NEXT: v_not_b32_e32 v4, v1 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: 
buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB57_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i64_ret_offset_scalar: @@ -4438,45 +4434,45 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v8, s6, 0 -; SI-NEXT: v_writelane_b32 v8, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB84_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v0, s35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; SI-NEXT: v_mov_b32_e32 v0, s34 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_mov_b32_e32 v6, v2 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v9, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v2, v6 +; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB84_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v8, 1 -; SI-NEXT: v_readlane_b32 s6, v8, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -4487,17 +4483,17 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v6, s7 +; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: v_mov_b32_e32 
v5, s5 ; VI-NEXT: .LBB84_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v6, s6 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -4517,14 +4513,14 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4546,45 +4542,45 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v8, s6, 0 -; SI-NEXT: v_writelane_b32 v8, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB85_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v0, s35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; SI-NEXT: v_mov_b32_e32 v0, s34 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_mov_b32_e32 v6, v2 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v9, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v2, v6 +; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 
exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB85_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v8, 1 -; SI-NEXT: v_readlane_b32 s6, v8, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -4594,31 +4590,29 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v6, s7 +; VI-NEXT: v_mov_b32_e32 v7, s6 ; VI-NEXT: .LBB85_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v6, s6 -; VI-NEXT: v_mov_b32_e32 v4, s34 -; VI-NEXT: v_mov_b32_e32 v5, s35 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB85_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i64_noret_offset_scalar: @@ -4627,14 +4621,14 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4657,45 +4651,45 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded 
Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v6, s6, 0 -; SI-NEXT: v_writelane_b32 v6, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB86_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[4:5] -; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc -; SI-NEXT: v_mov_b32_e32 v0, s34 -; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB86_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v6, 1 -; SI-NEXT: v_readlane_b32 s6, v6, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -4706,23 +4700,23 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v4, s7 +; VI-NEXT: v_mov_b32_e32 v5, s6 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB86_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v6, s6 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: 
v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB86_1 @@ -4736,20 +4730,20 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[5:6] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[7:8] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB86_1 @@ -4765,45 +4759,45 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v6, s6, 0 -; SI-NEXT: v_writelane_b32 v6, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB87_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[4:5] -; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc -; SI-NEXT: v_mov_b32_e32 v0, s34 -; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, 
s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB87_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v6, 1 -; SI-NEXT: v_readlane_b32 s6, v6, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -4813,31 +4807,29 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v4, s7 +; VI-NEXT: v_mov_b32_e32 v5, s6 ; VI-NEXT: .LBB87_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v6, s6 -; VI-NEXT: v_mov_b32_e32 v4, s34 -; VI-NEXT: v_mov_b32_e32 v5, s35 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB87_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i64_ret_offset_scalar: @@ -4846,20 +4838,20 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[5:6] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[7:8] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc +; GFX9-NEXT: 
v_cndmask_b32_e32 v5, v4, v7, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] offset:32 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT: s_cbranch_execnz .LBB87_1
@@ -4883,29 +4875,29 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i
 ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
 ; SI-NEXT: s_mov_b64 s[0:1], 0
 ; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: v_mov_b32_e32 v4, s3
+; SI-NEXT: v_mov_b32_e32 v5, s2
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v2, s8
 ; SI-NEXT: v_mov_b32_e32 v3, s9
 ; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: .LBB88_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_mov_b32_e32 v0, s3
 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v7, v3
-; SI-NEXT: v_mov_b32_e32 v6, v2
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc
+; SI-NEXT: v_mov_b32_e32 v9, v3
+; SI-NEXT: v_mov_b32_e32 v8, v2
+; SI-NEXT: v_mov_b32_e32 v7, v1
+; SI-NEXT: v_mov_b32_e32 v6, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: v_mov_b32_e32 v2, v6
+; SI-NEXT: v_mov_b32_e32 v3, v7
 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; SI-NEXT: s_cbranch_execnz .LBB88_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4915,26 +4907,26 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i
 ; VI: ; %bb.0: ; %entry
 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_mov_b64 s[4:5], 0
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; VI-NEXT: s_add_u32 s0, s0, s6
+; VI-NEXT: s_addc_u32 s1, s1, s7
 ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
 ; VI-NEXT: s_add_u32 s0, s0, 32
 ; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v6, s3
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v7, s2
 ; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v4, s0
 ; VI-NEXT: .LBB88_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: v_mov_b32_e32 v6, s2
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
@@ -4951,24 +4943,24 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i
 ; GFX9: ; %bb.0: ; %entry
 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX9-NEXT: s_add_u32 s0, s0, s4
 ; GFX9-NEXT: s_addc_u32 s1, s1, s5
 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_mov_b32_e32 v5, s2
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
 ; GFX9-NEXT: .LBB88_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v5, s2
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:32 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -4997,17 +4989,17 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
 ; SI-NEXT: s_mov_b64 s[0:1], 0
 ; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: v_mov_b32_e32 v8, s5
+; SI-NEXT: v_mov_b32_e32 v9, s4
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v2, s6
 ; SI-NEXT: v_mov_b32_e32 v3, s7
 ; SI-NEXT: s_mov_b32 s10, -1
 ; SI-NEXT: .LBB89_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_mov_b32_e32 v0, s5
 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v7, v3
 ; SI-NEXT: v_mov_b32_e32 v6, v2
@@ -5034,68 +5026,68 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ; VI-LABEL: atomic_max_i64_ret_addr64_offset:
 ; VI: ; %bb.0: ; %entry
 ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; VI-NEXT: s_mov_b64 s[8:9], 0
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
 ; VI-NEXT: s_add_u32 s0, s0, s6
 ; VI-NEXT: s_addc_u32 s1, s1, s7
-; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20
+; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
 ; VI-NEXT: s_add_u32 s0, s0, 32
 ; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: s_mov_b64 s[6:7], 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v4, s5
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v5, s4
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v1, s1
 ; VI-NEXT: .LBB89_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s5
-; VI-NEXT: v_mov_b32_e32 v6, s4
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; VI-NEXT: v_mov_b32_e32 v9, v3
+; VI-NEXT: v_mov_b32_e32 v8, v2
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; VI-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; VI-NEXT: s_andn2_b64 exec, exec, s[8:9]
 ; VI-NEXT: s_cbranch_execnz .LBB89_1
 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[6:7]
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_or_b64 exec, exec, s[8:9]
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: atomic_max_i64_ret_addr64_offset:
 ; GFX9: ; %bb.0: ; %entry
 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
 ; GFX9-NEXT: s_add_u32 s0, s8, s0
 ; GFX9-NEXT: s_addc_u32 s1, s9, s1
 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20
+; GFX9-NEXT: v_mov_b32_e32 v2, s13
+; GFX9-NEXT: v_mov_b32_e32 v3, s12
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
 ; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[5:6]
-; GFX9-NEXT: v_mov_b32_e32 v0, s13
-; GFX9-NEXT: v_mov_b32_e32 v1, s12
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] offset:32 glc
+; GFX9-NEXT: v_mov_b32_e32 v8, v1
+; GFX9-NEXT: v_mov_b32_e32 v7, v0
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[7:8]
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT: s_cbranch_execnz .LBB89_1
@@ -5124,29 +5116,29 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in,
 ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; SI-NEXT: s_mov_b64 s[0:1], 0
 ; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: v_mov_b32_e32 v4, s3
+; SI-NEXT: v_mov_b32_e32 v5, s2
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v2, s8
 ; SI-NEXT: v_mov_b32_e32 v3, s9
 ; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: .LBB90_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_mov_b32_e32 v0, s3
 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v7, v3
-; SI-NEXT: v_mov_b32_e32 v6, v2
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v9, v3
+; SI-NEXT: v_mov_b32_e32 v8, v2
+; SI-NEXT: v_mov_b32_e32 v7, v1
+; SI-NEXT: v_mov_b32_e32 v6, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: v_mov_b32_e32 v2, v6
+; SI-NEXT: v_mov_b32_e32 v3, v7
 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; SI-NEXT: s_cbranch_execnz .LBB90_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5158,30 +5150,30 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in,
 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
-; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: s_add_u32 s4, s0, s4
+; VI-NEXT: s_addc_u32 s5, s1, s5
+; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: s_mov_b64 s[0:1], 0
+; VI-NEXT: v_mov_b32_e32 v6, s3
+; VI-NEXT: v_mov_b32_e32 v7, s2
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NEXT: v_mov_b32_e32 v2, s6
 ; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v5, s5
 ; VI-NEXT: .LBB90_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: v_mov_b32_e32 v6, s2
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
 ; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
 ; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; VI-NEXT: s_cbranch_execnz .LBB90_1
 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; VI-NEXT: s_endpgm
@@ -5190,24 +5182,24 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in,
 ; GFX9: ; %bb.0: ; %entry
 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX9-NEXT: s_add_u32 s0, s0, s4
 ; GFX9-NEXT: s_addc_u32 s1, s1, s5
 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_mov_b32_e32 v5, s2
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
 ; GFX9-NEXT: .LBB90_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v5, s2
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -5235,17 +5227,17 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
 ; SI-NEXT: s_mov_b64 s[0:1], 0
 ; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: v_mov_b32_e32 v8, s5
+; SI-NEXT: v_mov_b32_e32 v9, s4
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v2, s6
 ; SI-NEXT: v_mov_b32_e32 v3, s7
 ; SI-NEXT: s_mov_b32 s10, -1
 ; SI-NEXT: .LBB91_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_mov_b32_e32 v0, s5
 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v7, v3
 ; SI-NEXT: v_mov_b32_e32 v6, v2
@@ -5274,64 +5266,64 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s6
-; VI-NEXT: s_addc_u32 s1, s1, s7
-; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
-; VI-NEXT: s_mov_b64 s[6:7], 0
+; VI-NEXT: s_add_u32 s6, s0, s6
+; VI-NEXT: s_addc_u32 s7, s1, s7
+; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_mov_b64 s[0:1], 0
+; VI-NEXT: v_mov_b32_e32 v4, s5
+; VI-NEXT: v_mov_b32_e32 v5, s4
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s8
+; VI-NEXT: v_mov_b32_e32 v3, s9
+; VI-NEXT: v_mov_b32_e32 v1, s7
 ; VI-NEXT: .LBB91_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s5
-; VI-NEXT: v_mov_b32_e32 v6, s4
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; VI-NEXT: v_mov_b32_e32 v9, v3
+; VI-NEXT: v_mov_b32_e32 v8, v2
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; VI-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; VI-NEXT: s_cbranch_execnz .LBB91_1
 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[6:7]
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_or_b64 exec, exec, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: atomic_max_i64_ret_addr64:
 ; GFX9: ; %bb.0: ; %entry
 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
 ; GFX9-NEXT: s_add_u32 s0, s8, s0
 ; GFX9-NEXT: s_addc_u32 s1, s9, s1
 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v2, s13
+; GFX9-NEXT: v_mov_b32_e32 v3, s12
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
 ; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[5:6]
-; GFX9-NEXT: v_mov_b32_e32 v0, s13
-; GFX9-NEXT: v_mov_b32_e32 v1, s12
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] glc
+; GFX9-NEXT: v_mov_b32_e32 v8, v1
+; GFX9-NEXT: v_mov_b32_e32 v7, v0
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[7:8]
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT: s_cbranch_execnz .LBB91_1
@@ -5904,45 +5896,45 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v8, s6, 0
-; SI-NEXT: v_writelane_b32 v8, s7, 1
+; SI-NEXT: v_writelane_b32 v10, s6, 0
+; SI-NEXT: v_writelane_b32 v10, s7, 1
 ; SI-NEXT: s_mov_b32 s35, s7
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
 ; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: v_mov_b32_e32 v5, s34
 ; SI-NEXT: .LBB98_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_mov_b32_e32 v0, s35
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s34
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v7, v3
-; SI-NEXT: v_mov_b32_e32 v6, v2
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v9, v3
+; SI-NEXT: v_mov_b32_e32 v8, v2
+; SI-NEXT: v_mov_b32_e32 v7, v1
+; SI-NEXT: v_mov_b32_e32 v6, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: v_mov_b32_e32 v2, v6
+; SI-NEXT: v_mov_b32_e32 v3, v7
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT: s_cbranch_execnz .LBB98_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v8, 1
-; SI-NEXT: v_readlane_b32 s6, v8, 0
+; SI-NEXT: v_readlane_b32 s7, v10, 1
+; SI-NEXT: v_readlane_b32 s6, v10, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -5953,17 +5945,17 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr
 ; VI-NEXT: v_mov_b32_e32 v0, s4
 ; VI-NEXT: v_mov_b32_e32 v1, s5
 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v4, s4
 ; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v6, s7
+; VI-NEXT: v_mov_b32_e32 v7, s6
+; VI-NEXT: v_mov_b32_e32 v5, s5
 ; VI-NEXT: .LBB98_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s7
-; VI-NEXT: v_mov_b32_e32 v6, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
@@ -5983,14 +5975,14 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr
 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5]
 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-NEXT: v_mov_b32_e32 v6, s6
 ; GFX9-NEXT: .LBB98_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v5, s6
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
@@ -6012,45 +6004,45 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v8, s6, 0
-; SI-NEXT: v_writelane_b32 v8, s7, 1
+; SI-NEXT: v_writelane_b32 v10, s6, 0
+; SI-NEXT: v_writelane_b32 v10, s7, 1
 ; SI-NEXT: s_mov_b32 s35, s7
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
 ; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: v_mov_b32_e32 v5, s34
 ; SI-NEXT: .LBB99_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_mov_b32_e32 v0, s35
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s34
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v7, v3
-; SI-NEXT: v_mov_b32_e32 v6, v2
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc
+; SI-NEXT: v_mov_b32_e32 v9, v3
+; SI-NEXT: v_mov_b32_e32 v8, v2
+; SI-NEXT: v_mov_b32_e32 v7, v1
+; SI-NEXT: v_mov_b32_e32 v6, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: v_mov_b32_e32 v2, v6
+; SI-NEXT: v_mov_b32_e32 v3, v7
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT: s_cbranch_execnz .LBB99_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v8, 1
-; SI-NEXT: v_readlane_b32 s6, v8, 0
+; SI-NEXT: v_readlane_b32 s7, v10, 1
+; SI-NEXT: v_readlane_b32 s6, v10, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -6060,31 +6052,29 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT: s_add_u32 s34, s4, 32
 ; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s34
-; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; VI-NEXT: s_mov_b64 s[36:37], 0
+; VI-NEXT: v_mov_b32_e32 v4, s34
+; VI-NEXT: v_mov_b32_e32 v5, s35
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v6, s7
+; VI-NEXT: v_mov_b32_e32 v7, s6
 ; VI-NEXT: .LBB99_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s7
-; VI-NEXT: v_mov_b32_e32 v6, s6
-; VI-NEXT: v_mov_b32_e32 v4, s34
-; VI-NEXT: v_mov_b32_e32 v5, s35
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
 ; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_cbranch_execnz .LBB99_1
 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[36:37]
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_umax_i64_noret_offset_scalar:
@@ -6093,14 +6083,14 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace
 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32
 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-NEXT: v_mov_b32_e32 v6, s6
 ; GFX9-NEXT: .LBB99_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v5, s6
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
@@ -6123,45 +6113,45 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v6, s6, 0
-; SI-NEXT: v_writelane_b32 v6, s7, 1
+; SI-NEXT: v_writelane_b32 v10, s6, 0
+; SI-NEXT: v_writelane_b32 v10, s7, 1
 ; SI-NEXT: s_mov_b32 s35, s7
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: v_mov_b32_e32 v5, s34
 ; SI-NEXT: .LBB100_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: v_mov_b32_e32 v9, v1
+; SI-NEXT: v_mov_b32_e32 v8, v0
+; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[8:9]
+; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s35
-; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[4:5]
-; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s34
-; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_mov_b32_e32 v1, v3
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: v_mov_b32_e32 v0, v6
+; SI-NEXT: v_mov_b32_e32 v1, v7
+; SI-NEXT: v_mov_b32_e32 v2, v8
+; SI-NEXT: v_mov_b32_e32 v3, v9
 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT: s_cbranch_execnz .LBB100_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v6, 1
-; SI-NEXT: v_readlane_b32 s6, v6, 0
+; SI-NEXT: v_readlane_b32 s7, v10, 1
+; SI-NEXT: v_readlane_b32 s6, v10, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -6172,23 +6162,23 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg
 ; VI-NEXT: v_mov_b32_e32 v0, s4
 ; VI-NEXT: v_mov_b32_e32 v1, s5
 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v2, s4
 ; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v4, s7
+; VI-NEXT: v_mov_b32_e32 v5, s6
+; VI-NEXT: v_mov_b32_e32 v3, s5
 ; VI-NEXT: .LBB100_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s7
-; VI-NEXT: v_mov_b32_e32 v6, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; VI-NEXT: v_mov_b32_e32 v9, v1
+; VI-NEXT: v_mov_b32_e32 v8, v0
+; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
+; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_cbranch_execnz .LBB100_1
@@ -6202,20 +6192,20 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg
 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: v_mov_b32_e32 v4, s6
 ; GFX9-NEXT: .LBB100_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[5:6]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc
+; GFX9-NEXT: v_mov_b32_e32 v8, v1
+; GFX9-NEXT: v_mov_b32_e32 v7, v0
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[7:8]
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT: s_cbranch_execnz .LBB100_1
@@ -6231,45 +6221,45 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1)
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v6, s6, 0
-; SI-NEXT: v_writelane_b32 v6, s7, 1
+; SI-NEXT: v_writelane_b32 v10, s6, 0
+; SI-NEXT: v_writelane_b32 v10, s7, 1
 ; SI-NEXT: s_mov_b32 s35, s7
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
 ; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: v_mov_b32_e32 v5, s34
 ; SI-NEXT: .LBB101_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: v_mov_b32_e32 v9, v1
+; SI-NEXT: v_mov_b32_e32 v8, v0
+; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[8:9]
+; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s35
-; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[4:5]
-; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s34
-; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_mov_b32_e32 v1, v3
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: v_mov_b32_e32 v0, v6
+; SI-NEXT: v_mov_b32_e32 v1, v7
+; SI-NEXT: v_mov_b32_e32 v2, v8
+; SI-NEXT: v_mov_b32_e32 v3, v9
 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT: s_cbranch_execnz .LBB101_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v6, 1
-; SI-NEXT: v_readlane_b32 s6, v6, 0
+; SI-NEXT: v_readlane_b32 s7, v10, 1
+; SI-NEXT: v_readlane_b32 s6, v10, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -6279,31 +6269,29 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1)
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT: s_add_u32 s34, s4, 32
 ; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s34
-; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: s_mov_b64 s[36:37], 0
+; VI-NEXT: v_mov_b32_e32 v2, s34
+; VI-NEXT: v_mov_b32_e32 v3, s35
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v4, s7
+; VI-NEXT: v_mov_b32_e32 v5, s6
 ; VI-NEXT: .LBB101_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s7
-; VI-NEXT: v_mov_b32_e32 v6, s6
-; VI-NEXT: v_mov_b32_e32 v4, s34
-; VI-NEXT: v_mov_b32_e32 v5, s35
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; VI-NEXT: v_mov_b32_e32 v9, v1
+; VI-NEXT: v_mov_b32_e32 v8, v0
+; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
+; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_cbranch_execnz .LBB101_1
 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[36:37]
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_umax_i64_ret_offset_scalar:
@@ -6312,20 +6300,20 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1)
 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32
 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: v_mov_b32_e32 v4, s6
 ; GFX9-NEXT: .LBB101_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[5:6]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc
+; GFX9-NEXT: v_mov_b32_e32 v8, v1
+; GFX9-NEXT: v_mov_b32_e32 v7, v0
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[7:8]
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] offset:32 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT: s_cbranch_execnz .LBB101_1
@@ -6349,29 +6337,29 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out,
 ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
 ; SI-NEXT: s_mov_b64 s[0:1], 0
 ; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: v_mov_b32_e32 v4, s3
+; SI-NEXT: v_mov_b32_e32 v5, s2
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v2, s8
 ; SI-NEXT: v_mov_b32_e32 v3, s9
 ; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: .LBB102_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_mov_b32_e32 v0, s3
 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v7, v3
-; SI-NEXT: v_mov_b32_e32 v6, v2
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc
+; SI-NEXT: v_mov_b32_e32 v9, v3
+; SI-NEXT: v_mov_b32_e32 v8, v2
+; SI-NEXT: v_mov_b32_e32 v7, v1
+; SI-NEXT: v_mov_b32_e32 v6, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: v_mov_b32_e32 v2, v6
+; SI-NEXT: v_mov_b32_e32 v3, v7
 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; SI-NEXT: s_cbranch_execnz .LBB102_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6381,26 +6369,26 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out,
 ; VI: ; %bb.0: ; %entry
 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_mov_b64 s[4:5], 0
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; VI-NEXT: s_add_u32 s0, s0, s6
+; VI-NEXT: s_addc_u32 s1, s1, s7
 ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
 ; VI-NEXT: s_add_u32 s0, s0, 32
 ; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v6, s3
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v7, s2
 ; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v4, s0
 ; VI-NEXT: .LBB102_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: v_mov_b32_e32 v6, s2
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
@@ -6417,24 +6405,24 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out,
 ; GFX9: ; %bb.0: ; %entry
 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX9-NEXT: s_add_u32 s0, s0, s4
 ; GFX9-NEXT: s_addc_u32 s1, s1, s5
 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_mov_b32_e32 v5, s2
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
 ; GFX9-NEXT: .LBB102_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v5, s2
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:32 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -6463,17 +6451,17 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o
 ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
 ; SI-NEXT: s_mov_b64 s[0:1], 0
 ; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: v_mov_b32_e32 v8, s5
+; SI-NEXT: v_mov_b32_e32 v9, s4
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v2, s6
 ; SI-NEXT: v_mov_b32_e32 v3, s7
 ; SI-NEXT: s_mov_b32 s10, -1
 ; SI-NEXT: .LBB103_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_mov_b32_e32 v0, s5
 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v7, v3
 ; SI-NEXT: v_mov_b32_e32 v6, v2
@@ -6500,68 +6488,68 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o
 ; VI-LABEL: atomic_umax_i64_ret_addr64_offset:
 ; VI: ; %bb.0: ; %entry
 ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; VI-NEXT: s_mov_b64 s[8:9], 0
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
 ; VI-NEXT: s_add_u32 s0, s0, s6
 ; VI-NEXT: s_addc_u32 s1, s1, s7
-; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20
+; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
 ; VI-NEXT: s_add_u32 s0, s0, 32
 ; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: s_mov_b64 s[6:7], 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v4, s5
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v5, s4
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v1, s1
 ; VI-NEXT: .LBB103_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s5
-; VI-NEXT: v_mov_b32_e32 v6, s4
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; VI-NEXT: v_mov_b32_e32 v9, v3
+; VI-NEXT: v_mov_b32_e32 v8, v2
+; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; VI-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; VI-NEXT: s_andn2_b64 exec, exec, s[8:9]
 ; VI-NEXT: s_cbranch_execnz .LBB103_1
 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[6:7]
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_or_b64 exec, exec, s[8:9]
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset:
 ; GFX9: ; %bb.0: ; %entry
 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
 ; GFX9-NEXT: s_add_u32 s0, s8, s0
 ; GFX9-NEXT: s_addc_u32 s1, s9, s1
 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20
+; GFX9-NEXT: v_mov_b32_e32 v2, s13
+; GFX9-NEXT: v_mov_b32_e32 v3, s12
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
 ; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[5:6]
-; GFX9-NEXT: v_mov_b32_e32 v0, s13
-; GFX9-NEXT: v_mov_b32_e32 v1, s12
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] offset:32 glc
+; GFX9-NEXT: v_mov_b32_e32 v8, v1
+; GFX9-NEXT: v_mov_b32_e32 v7, v0
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[7:8]
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT: s_cbranch_execnz .LBB103_1
@@ -6589,17 +6577,17 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
 ; SI-NEXT: s_mov_b64 s[0:1], 0
 ; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: v_mov_b32_e32 v8, s5
+; SI-NEXT: v_mov_b32_e32 v9, s4
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v2, s6
 ; SI-NEXT: v_mov_b32_e32 v3, s7
 ; SI-NEXT: s_mov_b32 s10, -1
 ; SI-NEXT: .LBB104_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_mov_b32_e32 v0, s5
 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v7, v3
 ; SI-NEXT: v_mov_b32_e32 v6, v2
@@ -6628,64 +6616,64 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s6
-; VI-NEXT: s_addc_u32 s1, s1, s7
-; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
-; VI-NEXT: s_mov_b64 s[6:7], 0
+; VI-NEXT: s_add_u32 s6, s0, s6
+; VI-NEXT: s_addc_u32 s7, s1, s7
+; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_mov_b64 s[0:1], 0
+; VI-NEXT: v_mov_b32_e32 v4, s5
+; VI-NEXT: v_mov_b32_e32 v5, s4
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s8
+; VI-NEXT: v_mov_b32_e32 v3, s9
+; VI-NEXT: v_mov_b32_e32 v1, s7
 ; VI-NEXT: .LBB104_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s5
-; VI-NEXT: v_mov_b32_e32 v6, s4
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; VI-NEXT: v_mov_b32_e32 v9, v3
+; VI-NEXT: v_mov_b32_e32 v8, v2
+; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; VI-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; VI-NEXT: s_cbranch_execnz .LBB104_1
 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[6:7]
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_or_b64 exec, exec, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: atomic_umax_i64_ret_addr64:
 ; GFX9: ; %bb.0: ; %entry
 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
 ; GFX9-NEXT: s_add_u32 s0, s8, s0
 ; GFX9-NEXT: s_addc_u32 s1, s9, s1
 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v2, s13
+; GFX9-NEXT: v_mov_b32_e32 v3, s12
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
 ; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[5:6]
-; GFX9-NEXT: v_mov_b32_e32 v0, s13
-; GFX9-NEXT: v_mov_b32_e32 v1, s12
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] glc
+; GFX9-NEXT: v_mov_b32_e32 v8, v1
+; GFX9-NEXT: v_mov_b32_e32 v7, v0
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[7:8]
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT: s_cbranch_execnz .LBB104_1
@@ -7258,45 +7246,45 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v8, s6, 0
-; SI-NEXT: v_writelane_b32 v8, s7, 1
+; SI-NEXT: v_writelane_b32 v10, s6, 0
+; SI-NEXT: v_writelane_b32 v10, s7, 1
 ; SI-NEXT: s_mov_b32 s35, s7
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
 ; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: v_mov_b32_e32 v5, s34
 ; SI-NEXT: .LBB111_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_mov_b32_e32 v0, s35
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s34
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v7, v3
-; SI-NEXT: v_mov_b32_e32 v6, v2
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v9, v3
+; SI-NEXT: v_mov_b32_e32 v8, v2
+; SI-NEXT: v_mov_b32_e32 v7, v1
+; SI-NEXT: v_mov_b32_e32 v6, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: v_mov_b32_e32 v2, v6
+; SI-NEXT: v_mov_b32_e32 v3, v7
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT: s_cbranch_execnz .LBB111_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v8, 1
-; SI-NEXT: v_readlane_b32 s6, v8, 0
+; SI-NEXT: v_readlane_b32 s7, v10, 1
+; SI-NEXT: v_readlane_b32 s6, v10, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -7307,17 +7295,17 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr
 ; VI-NEXT: v_mov_b32_e32 v0, s4
 ; VI-NEXT: v_mov_b32_e32 v1, s5
 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v4, s4
 ; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v6, s7
+; VI-NEXT: v_mov_b32_e32 v7, s6
+; VI-NEXT: v_mov_b32_e32 v5, s5
 ; VI-NEXT: .LBB111_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s7
-; VI-NEXT: v_mov_b32_e32 v6, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
@@ -7337,14 +7325,14 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr
 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5]
 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-NEXT: v_mov_b32_e32 v6, s6
 ; GFX9-NEXT: .LBB111_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v5, s6
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
@@ -7366,45 +7354,45 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v8, s6, 0
-; SI-NEXT: v_writelane_b32 v8, s7, 1
+; SI-NEXT: v_writelane_b32 v10, s6, 0
+; SI-NEXT: v_writelane_b32 v10, s7, 1
 ; SI-NEXT: s_mov_b32 s35, s7
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
 ; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: v_mov_b32_e32 v5, s34
 ; SI-NEXT: .LBB112_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_mov_b32_e32 v0, s35
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s34
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v7, v3
-; SI-NEXT: v_mov_b32_e32 v6, v2
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc
+; SI-NEXT: v_mov_b32_e32 v9, v3
+; SI-NEXT: v_mov_b32_e32 v8, v2
+; SI-NEXT: v_mov_b32_e32 v7, v1
+; SI-NEXT: v_mov_b32_e32 v6, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: v_mov_b32_e32 v2, v6
+; SI-NEXT: v_mov_b32_e32 v3, v7
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT: s_cbranch_execnz .LBB112_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v8, 1
-; SI-NEXT: v_readlane_b32 s6, v8, 0
+; SI-NEXT: v_readlane_b32 s7, v10, 1
+; SI-NEXT: v_readlane_b32 s6, v10, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -7414,31 +7402,29 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT: s_add_u32 s34, s4, 32
 ; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s34
-; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; VI-NEXT: s_mov_b64 s[36:37], 0
+; VI-NEXT: v_mov_b32_e32 v4, s34
+; VI-NEXT: v_mov_b32_e32 v5, s35
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v6, s7
+; VI-NEXT: v_mov_b32_e32 v7, s6
 ; VI-NEXT: .LBB112_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s7
-; VI-NEXT: v_mov_b32_e32 v6, s6
-; VI-NEXT: v_mov_b32_e32 v4, s34
-; VI-NEXT: v_mov_b32_e32 v5, s35
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
 ; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_cbranch_execnz .LBB112_1
 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[36:37]
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_umin_i64_noret_offset_scalar:
@@ -7447,14 +7433,14 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace
 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32
 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-NEXT: v_mov_b32_e32 v6, s6
 ; GFX9-NEXT: .LBB112_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v5, s6
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
@@ -7477,45 +7463,45 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v6, s6, 0
-; SI-NEXT: v_writelane_b32 v6, s7, 1
+; SI-NEXT: v_writelane_b32 v10, s6, 0
+; SI-NEXT: v_writelane_b32 v10, s7, 1
 ; SI-NEXT: s_mov_b32 s35, s7
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: v_mov_b32_e32 v5, s34
 ; SI-NEXT: .LBB113_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: v_mov_b32_e32 v9, v1
+; SI-NEXT: v_mov_b32_e32 v8, v0
+; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[8:9]
+; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s35
-; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[4:5]
-; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s34
-; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_mov_b32_e32 v1, v3
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: v_mov_b32_e32 v0, v6
+; SI-NEXT: v_mov_b32_e32 v1, v7
+; SI-NEXT: v_mov_b32_e32 v2, v8
+; SI-NEXT: v_mov_b32_e32 v3, v9
 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT: s_cbranch_execnz .LBB113_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v6, 1
-; SI-NEXT: v_readlane_b32 s6, v6, 0
+; SI-NEXT: v_readlane_b32 s7, v10, 1
+; SI-NEXT: v_readlane_b32 s6, v10, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -7526,23 +7512,23 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg
 ; VI-NEXT: v_mov_b32_e32 v0, s4
 ; VI-NEXT: v_mov_b32_e32 v1, s5
 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v2, s4
 ; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v4, s7
+; VI-NEXT: v_mov_b32_e32 v5, s6
+; VI-NEXT: v_mov_b32_e32 v3, s5
 ; VI-NEXT: .LBB113_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s7
-; VI-NEXT: v_mov_b32_e32 v6, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; VI-NEXT: v_mov_b32_e32 v9, v1
+; VI-NEXT: v_mov_b32_e32 v8, v0
+; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
+; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_cbranch_execnz .LBB113_1
@@ -7556,20 +7542,20 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg
 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: v_mov_b32_e32 v4, s6
 ; GFX9-NEXT: .LBB113_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[5:6]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc
+; GFX9-NEXT: v_mov_b32_e32 v8, v1
+; GFX9-NEXT: v_mov_b32_e32 v7, v0
+; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[7:8]
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT: s_cbranch_execnz .LBB113_1
@@ -7585,45 +7571,45 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1)
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v6, s6, 0
-; SI-NEXT: v_writelane_b32 v6, s7, 1
+; SI-NEXT: v_writelane_b32 v10, s6, 0
+; SI-NEXT: v_writelane_b32 v10, s7, 1
 ; SI-NEXT: s_mov_b32 s35, s7
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
 ; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: v_mov_b32_e32 v5, s34
 ; SI-NEXT: .LBB114_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: v_mov_b32_e32 v9, v1
+; SI-NEXT: v_mov_b32_e32 v8, v0
+; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[8:9]
+; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v5, 
v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[4:5] -; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc -; SI-NEXT: v_mov_b32_e32 v0, s34 -; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB114_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v6, 1 -; SI-NEXT: v_readlane_b32 s6, v6, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7633,31 +7619,29 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v4, s7 +; VI-NEXT: v_mov_b32_e32 v5, s6 ; VI-NEXT: .LBB114_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v6, s6 -; VI-NEXT: v_mov_b32_e32 v4, s34 -; VI-NEXT: v_mov_b32_e32 v5, s35 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] +; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB114_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i64_ret_offset_scalar: @@ -7666,20 +7650,20 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 
; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: .LBB114_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[5:6] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[7:8] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB114_1 @@ -8248,45 +8232,45 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v8, s6, 0 -; SI-NEXT: v_writelane_b32 v8, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB121_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v0, s35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; SI-NEXT: v_mov_b32_e32 v0, s34 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_mov_b32_e32 v6, v2 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v9, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v2, v6 +; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB121_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v8, 1 -; SI-NEXT: v_readlane_b32 s6, v8, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: 
s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -8297,17 +8281,17 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v6, s7 +; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB121_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v6, s6 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -8327,14 +8311,14 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -8356,45 +8340,45 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v8, s6, 0 -; SI-NEXT: v_writelane_b32 v8, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB122_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v0, s35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; SI-NEXT: v_mov_b32_e32 v0, s34 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_mov_b32_e32 v6, v2 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; 
SI-NEXT: v_mov_b32_e32 v4, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v9, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v2, v6 +; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB122_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v8, 1 -; SI-NEXT: v_readlane_b32 s6, v8, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -8404,31 +8388,29 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v6, s7 +; VI-NEXT: v_mov_b32_e32 v7, s6 ; VI-NEXT: .LBB122_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v6, s6 -; VI-NEXT: v_mov_b32_e32 v4, s34 -; VI-NEXT: v_mov_b32_e32 v5, s35 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB122_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i64_noret_offset_scalar: @@ -8437,14 +8419,14 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: 
v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -8467,45 +8449,45 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v6, s6, 0 -; SI-NEXT: v_writelane_b32 v6, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB123_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[4:5] -; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc -; SI-NEXT: v_mov_b32_e32 v0, s34 -; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB123_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v6, 1 -; SI-NEXT: v_readlane_b32 s6, v6, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -8516,23 +8498,23 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v4, s7 +; VI-NEXT: v_mov_b32_e32 v5, s6 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB123_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], 
v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v6, s6 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB123_1 @@ -8546,20 +8528,20 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: .LBB123_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[5:6] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[7:8] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB123_1 @@ -8575,45 +8557,45 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v6, s6, 0 -; SI-NEXT: v_writelane_b32 v6, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB124_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: 
v_cmp_ge_i64_e32 vcc, s[34:35], v[4:5] -; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc -; SI-NEXT: v_mov_b32_e32 v0, s34 -; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB124_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v6, 1 -; SI-NEXT: v_readlane_b32 s6, v6, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -8623,31 +8605,29 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v4, s7 +; VI-NEXT: v_mov_b32_e32 v5, s6 ; VI-NEXT: .LBB124_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v6, s6 -; VI-NEXT: v_mov_b32_e32 v4, s34 -; VI-NEXT: v_mov_b32_e32 v5, s35 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB124_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i64_ret_offset_scalar: @@ -8656,20 +8636,20 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: 
v_mov_b32_e32 v4, s6 ; GFX9-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[5:6] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[7:8] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB124_1 @@ -8693,29 +8673,29 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: v_mov_b32_e32 v4, s3 +; SI-NEXT: v_mov_b32_e32 v5, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: .LBB125_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_mov_b32_e32 v6, v2 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v9, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v2, v6 +; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB125_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8725,26 +8705,26 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; VI-NEXT: s_add_u32 s0, s0, s4 -; VI-NEXT: s_addc_u32 s1, s1, s5 +; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; VI-NEXT: s_add_u32 s0, s0, s6 +; VI-NEXT: s_addc_u32 s1, s1, s7 ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v6, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 
v2, s6 +; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: .LBB125_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v6, s2 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -8761,24 +8741,24 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX9-NEXT: s_add_u32 s0, s0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: .LBB125_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -8807,17 +8787,17 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: v_mov_b32_e32 v8, s5 +; SI-NEXT: v_mov_b32_e32 v9, s4 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: .LBB126_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v7, v3 ; SI-NEXT: v_mov_b32_e32 v6, v2 @@ -8844,68 +8824,68 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-LABEL: atomic_min_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; VI-NEXT: s_add_u32 s0, s0, s6 ; VI-NEXT: s_addc_u32 s1, s1, s7 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 +; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: s_mov_b64 s[6:7], 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v4, s5 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: .LBB126_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: v_mov_b32_e32 v6, s4 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; VI-NEXT: v_mov_b32_e32 v9, v3 +; VI-NEXT: v_mov_b32_e32 v8, v2 +; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; VI-NEXT: s_cbranch_execnz .LBB126_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[6:7] -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_or_b64 exec, exec, s[8:9] +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[5:6] -; GFX9-NEXT: v_mov_b32_e32 v0, s13 -; GFX9-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[7:8] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB126_1 @@ -8927,35 +8907,35 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) 
-; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: v_mov_b32_e32 v4, s3 +; SI-NEXT: v_mov_b32_e32 v5, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_mov_b32_e32 v3, s9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v3, s5 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: .LBB127_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_mov_b32_e32 v6, v2 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc +; SI-NEXT: v_mov_b32_e32 v9, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; SI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v2, v6 +; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB127_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm @@ -8966,18 +8946,18 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v6, s3 +; VI-NEXT: v_mov_b32_e32 v7, s2 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: .LBB127_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v6, s2 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -8994,20 +8974,20 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: 
v_mov_b32_e32 v3, s7 ; GFX9-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -9034,17 +9014,17 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: v_mov_b32_e32 v8, s5 +; SI-NEXT: v_mov_b32_e32 v9, s4 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: .LBB128_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v7, v3 ; SI-NEXT: v_mov_b32_e32 v6, v2 @@ -9073,64 +9053,64 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; VI-NEXT: s_add_u32 s0, s0, s6 -; VI-NEXT: s_addc_u32 s1, s1, s7 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 -; VI-NEXT: s_mov_b64 s[6:7], 0 +; VI-NEXT: s_add_u32 s6, s0, s6 +; VI-NEXT: s_addc_u32 s7, s1, s7 +; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: .LBB128_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: v_mov_b32_e32 v6, s4 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; VI-NEXT: v_mov_b32_e32 v9, v3 +; VI-NEXT: v_mov_b32_e32 v8, v2 +; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; VI-NEXT: s_cbranch_execnz .LBB128_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[6:7] -; VI-NEXT: v_mov_b32_e32 
v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_or_b64 exec, exec, s[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[5:6] -; GFX9-NEXT: v_mov_b32_e32 v0, s13 -; GFX9-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] glc +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[7:8] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB128_1 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index 785114cdbf39a..7792422291998 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -328,12 +328,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7LESS-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v3, 1.0, v1 -; GFX7LESS-NEXT: v_max_f32_e32 v0, v3, v0 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc @@ -399,14 +400,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v4, v1, 
v1 -; GFX9-NEXT: v_max_f32_e32 v0, v4, v0 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -757,14 +758,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: .LBB1_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v6, v1, v1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v6, v0 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v0, v6 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -1366,12 +1367,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v3, 1.0, v1 -; GFX7LESS-NEXT: v_max_f32_e32 v0, v3, v0 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc @@ -1437,14 +1439,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v4, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v0, v4, v0 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -1795,14 +1797,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: .LBB3_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v6, v1, v1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v6, v0 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v0, v6 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; 
GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -2404,12 +2406,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v3, 1.0, v1 -; GFX7LESS-NEXT: v_max_f32_e32 v0, v3, v0 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc @@ -2475,14 +2478,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v4, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v0, v4, v0 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -2833,14 +2836,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v6, v1, v1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v6, v0 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v0, v6 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -4051,12 +4054,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mov_b32_e32 v41, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX7LESS-NEXT: .LBB7_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 ; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 ; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 @@ -4064,7 +4067,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX7LESS-NEXT: 
v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB7_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -4080,16 +4083,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 ; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX7LESS-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] ; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[4:5], v[2:3] +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[41:42] ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -4163,20 +4166,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX9-NEXT: .LBB7_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: v_readlane_b32 s3, v1, s4 ; GFX9-NEXT: v_readlane_b32 s2, v0, s4 -; GFX9-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4188,14 +4191,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX9-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; GFX9-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX9-NEXT: s_add_u32 s8, s36, 44 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] @@ -4203,19 +4206,19 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: 
buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-NEXT: s_mov_b32 s12, s43 ; GFX9-NEXT: s_mov_b32 s13, s42 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s44 @@ -4226,8 +4229,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] @@ -4272,20 +4275,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: v_mov_b32_e32 v41, 0 -; GFX1064-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] +; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4297,26 +4300,28 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1064-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; GFX1064-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1064-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1064-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 
s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -4326,18 +4331,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] @@ -4382,20 +4385,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: v_mov_b32_e32 v41, 0 -; GFX1032-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] +; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4407,25 +4410,27 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; GFX1032-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1032-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1032-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -4435,18 +4440,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 @@ -4481,14 +4484,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: v_mov_b32_e32 v41, 0 -; GFX1164-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] -; GFX1164-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1164-NEXT: 
v_readlane_b32 s3, v1, s4 ; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) @@ -4497,7 +4500,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] +; GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4510,16 +4513,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1164-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[44:45] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1164-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] @@ -4527,7 +4530,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -4535,19 +4540,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: s_mov_b32 s12, s43 ; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1164-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] -; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 @@ -4584,14 +4586,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: s_mov_b64 
s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: v_mov_b32_e32 v41, 0 -; GFX1132-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 @@ -4600,7 +4602,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] +; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4613,22 +4615,24 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[44:45] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: v_mov_b32_e32 v31, v40 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_mov_b32_e32 v3, s45 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -4636,16 +4640,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: s_mov_b32 s12, s43 ; GFX1132-NEXT: s_mov_b32 s13, s42 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1132-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] -; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 -; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s44 +; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off +; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8 +; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-NEXT: 
v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 @@ -4684,32 +4688,29 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[40:41], v[40:41] ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[4:5], v[2:3] +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -4730,17 +4731,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 
offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_1 @@ -5093,23 +5094,23 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] @@ -5122,6 +5123,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -5131,9 +5134,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 @@ -5228,9 +5229,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -5239,6 +5240,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[44:45], 
s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] @@ -5246,9 +5248,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] @@ -5257,6 +5258,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -5264,13 +5267,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -5353,9 +5353,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v41, v8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -5363,15 +5363,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; 
GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] @@ -5379,6 +5379,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -5386,11 +5387,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5807,12 +5807,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX7LESS-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[6:7], v[0:1] +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 ; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 @@ -5882,15 +5883,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] ; GFX9-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] -; GFX9-NEXT: v_max_f64 v[0:1], v[7:8], v[0:1] +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -6067,16 +6068,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1164-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1164-NEXT: 
; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] +; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_f64 v[0:1], v[7:8], v[0:1] +; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -6137,15 +6138,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1132-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] +; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_f64 v[0:1], v[7:8], v[0:1] +; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] @@ -6189,24 +6191,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] -; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[6:7], v[2:3] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6560,16 +6563,17 @@ define amdgpu_kernel void 
@global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[11:12], v[8:9], v[8:9] +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[11:12], v[6:7] +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[0:1] ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[8:9] @@ -6651,15 +6655,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[11:12], v[8:9], v[8:9] +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[11:12], v[6:7] +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[0:1] ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[8:9] @@ -7598,12 +7602,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mov_b32_e32 v41, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX7LESS-NEXT: .LBB11_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 ; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 ; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 @@ -7611,7 +7615,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB11_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -7627,16 +7631,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], 
off, s[44:47], 0 ; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX7LESS-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] ; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[4:5], v[2:3] +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[41:42] ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -7710,20 +7714,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX9-NEXT: .LBB11_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: v_readlane_b32 s3, v1, s4 ; GFX9-NEXT: v_readlane_b32 s2, v0, s4 -; GFX9-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7735,14 +7739,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX9-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; GFX9-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX9-NEXT: s_add_u32 s8, s36, 44 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] @@ -7750,19 +7754,19 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-NEXT: s_mov_b32 s12, s43 ; GFX9-NEXT: s_mov_b32 s13, s42 ; 
GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s44 @@ -7773,8 +7777,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] @@ -7819,20 +7823,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: v_mov_b32_e32 v41, 0 -; GFX1064-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] +; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7844,26 +7848,28 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1064-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; GFX1064-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1064-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1064-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; 
GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -7873,18 +7879,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] @@ -7929,20 +7933,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: v_mov_b32_e32 v41, 0 -; GFX1032-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] +; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7954,25 +7958,27 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; GFX1032-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1032-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1032-NEXT: 
s_add_u32 s8, s34, 44 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -7982,18 +7988,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 @@ -8028,14 +8032,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: v_mov_b32_e32 v41, 0 -; GFX1164-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] -; GFX1164-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) @@ -8044,7 +8048,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] +; 
GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8057,16 +8061,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1164-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[44:45] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1164-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] @@ -8074,7 +8078,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -8082,19 +8088,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: s_mov_b32 s12, s43 ; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1164-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] -; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 @@ -8131,14 +8134,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: v_mov_b32_e32 v41, 0 -; GFX1132-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX1132-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 @@ -8147,7 +8150,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] +; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8160,22 +8163,24 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[44:45] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: v_mov_b32_e32 v31, v40 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_mov_b32_e32 v3, s45 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -8183,16 +8188,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: s_mov_b32 s12, s43 ; GFX1132-NEXT: s_mov_b32 s13, s42 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1132-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] -; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 -; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s44 +; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off +; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8 +; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 @@ -8231,32 +8236,29 @@ define amdgpu_kernel void 
@global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[40:41], v[40:41] ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[4:5], v[2:3] +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -8277,17 +8279,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_1 @@ -8640,23 +8642,23 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 
exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] @@ -8669,6 +8671,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -8678,9 +8682,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 @@ -8775,9 +8777,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -8786,6 +8788,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] @@ -8793,9 +8796,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; 
GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] @@ -8804,6 +8806,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -8811,13 +8815,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -8900,9 +8901,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v41, v8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -8910,15 +8911,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] @@ -8926,6 +8927,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 ; 
GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -8933,11 +8935,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index 1f521f2444984..cb3291df891af 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -328,12 +328,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7LESS-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v3, 1.0, v1 -; GFX7LESS-NEXT: v_min_f32_e32 v0, v3, v0 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc @@ -399,14 +400,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v4, v1, v1 -; GFX9-NEXT: v_min_f32_e32 v0, v4, v0 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -757,14 +758,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: .LBB1_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v6, v1, v1 -; GFX9-DPP-NEXT: v_min_f32_e32 v0, v6, v0 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, v0, v6 ; GFX9-DPP-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -1366,12 +1367,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v3, 1.0, v1 -; GFX7LESS-NEXT: v_min_f32_e32 v0, v3, v0 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc @@ -1437,14 +1439,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v4, v1, v1 -; GFX9-NEXT: v_min_f32_e32 v0, v4, v0 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -1795,14 +1797,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: .LBB3_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v6, v1, v1 -; GFX9-DPP-NEXT: v_min_f32_e32 v0, v6, v0 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, v0, v6 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -2404,12 +2406,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v3, 1.0, v1 -; GFX7LESS-NEXT: v_min_f32_e32 v0, v3, v0 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc @@ -2475,14 +2478,14 @@ define amdgpu_kernel void 
@global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v4, v1, v1 -; GFX9-NEXT: v_min_f32_e32 v0, v4, v0 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -2833,14 +2836,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v6, v1, v1 -; GFX9-DPP-NEXT: v_min_f32_e32 v0, v6, v0 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, v0, v6 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -4051,12 +4054,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mov_b32_e32 v41, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX7LESS-NEXT: .LBB7_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 ; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 ; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 @@ -4064,7 +4067,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] +; GFX7LESS-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB7_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -4080,16 +4083,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 ; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX7LESS-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] ; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: 
buffer_store_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_min_f64 v[0:1], v[4:5], v[2:3] +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], v[41:42] ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -4163,20 +4166,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX9-NEXT: .LBB7_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: v_readlane_b32 s3, v1, s4 ; GFX9-NEXT: v_readlane_b32 s2, v0, s4 -; GFX9-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] +; GFX9-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4188,14 +4191,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX9-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; GFX9-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX9-NEXT: s_add_u32 s8, s36, 44 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] @@ -4203,19 +4206,19 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-NEXT: s_mov_b32 s12, s43 ; GFX9-NEXT: s_mov_b32 s13, s42 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s44 @@ -4226,8 +4229,8 @@ define 
amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] @@ -4272,20 +4275,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: v_mov_b32_e32 v41, 0 -; GFX1064-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] +; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4297,26 +4300,28 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1064-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; GFX1064-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1064-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1064-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -4326,18 +4331,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; 
GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] @@ -4382,20 +4385,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: v_mov_b32_e32 v41, 0 -; GFX1032-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1032-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] +; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4407,25 +4410,27 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; GFX1032-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1032-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1032-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 
0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -4435,18 +4440,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 @@ -4481,14 +4484,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: v_mov_b32_e32 v41, 0 -; GFX1164-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] -; GFX1164-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) @@ -4497,7 +4500,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] +; GFX1164-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4510,16 +4513,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1164-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[44:45] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1164-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] @@ -4527,7 +4530,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -4535,19 +4540,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: s_mov_b32 s12, s43 ; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1164-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] -; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 @@ -4584,14 +4586,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: v_mov_b32_e32 v41, 0 -; GFX1132-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 @@ -4600,7 +4602,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; 
GFX1132-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] +; GFX1132-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4613,22 +4615,24 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[44:45] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: v_mov_b32_e32 v31, v40 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_mov_b32_e32 v3, s45 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -4636,16 +4640,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: s_mov_b32 s12, s43 ; GFX1132-NEXT: s_mov_b32 s13, s42 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1132-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] -; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 -; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s44 +; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off +; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8 +; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 @@ -4684,32 +4688,29 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; 
GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[40:41], v[40:41] ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[4:5], v[2:3] +; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -4730,17 +4731,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_1 @@ -5093,23 +5094,23 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 
s[44:45], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] @@ -5122,6 +5123,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -5131,9 +5134,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 @@ -5228,9 +5229,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -5239,6 +5240,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] @@ -5246,9 +5248,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] @@ -5257,6 +5258,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; 
GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -5264,13 +5267,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -5353,9 +5353,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v41, v8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -5363,15 +5363,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] @@ -5379,6 +5379,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -5386,11 +5387,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] ; 
GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5807,12 +5807,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX7LESS-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_min_f64 v[0:1], v[6:7], v[0:1] +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 ; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 @@ -5882,15 +5883,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] ; GFX9-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] -; GFX9-NEXT: v_min_f64 v[0:1], v[7:8], v[0:1] +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -6067,16 +6068,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1164-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] +; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_f64 v[0:1], v[7:8], v[0:1] +; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -6137,15 +6138,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1132-NEXT: .LBB9_4: ; %atomicrmw.start ; 
GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] +; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_f64 v[0:1], v[7:8], v[0:1] +; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] @@ -6189,24 +6191,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] -; GFX7LESS-DPP-NEXT: v_min_f64 v[2:3], v[6:7], v[2:3] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6560,16 +6563,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[11:12], v[8:9], v[8:9] +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f64 v[6:7], v[11:12], v[6:7] +; GFX1164-DPP-NEXT: v_min_f64 v[6:7], v[6:7], v[0:1] ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; 
GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[8:9] @@ -6651,15 +6655,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[11:12], v[8:9], v[8:9] +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f64 v[6:7], v[11:12], v[6:7] +; GFX1132-DPP-NEXT: v_min_f64 v[6:7], v[6:7], v[0:1] ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[8:9] @@ -7598,12 +7602,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mov_b32_e32 v41, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX7LESS-NEXT: .LBB11_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 ; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 ; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 @@ -7611,7 +7615,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] +; GFX7LESS-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB11_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -7627,16 +7631,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 ; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX7LESS-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] ; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_min_f64 v[0:1], v[4:5], v[2:3] +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], v[41:42] ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -7710,20 +7714,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX9-NEXT: .LBB11_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: v_readlane_b32 s3, v1, s4 ; GFX9-NEXT: v_readlane_b32 s2, v0, s4 -; GFX9-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] +; GFX9-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7735,14 +7739,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX9-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; GFX9-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX9-NEXT: s_add_u32 s8, s36, 44 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] @@ -7750,19 +7754,19 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-NEXT: s_mov_b32 s12, s43 ; GFX9-NEXT: s_mov_b32 s13, s42 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s44 @@ -7773,8 +7777,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 
s[46:47], vcc, s[46:47] @@ -7819,20 +7823,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: v_mov_b32_e32 v41, 0 -; GFX1064-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] +; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7844,26 +7848,28 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1064-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; GFX1064-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1064-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1064-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -7873,18 +7879,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1064-NEXT: 
buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] @@ -7929,20 +7933,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: v_mov_b32_e32 v41, 0 -; GFX1032-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1032-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] +; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7954,25 +7958,27 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; GFX1032-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1032-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1032-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -7982,18 +7988,16 @@ define 
amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 @@ -8028,14 +8032,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: v_mov_b32_e32 v41, 0 -; GFX1164-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] -; GFX1164-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) @@ -8044,7 +8048,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] +; GFX1164-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8057,16 +8061,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1164-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[44:45] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1164-NEXT: 
s_add_u32 s8, s34, 44 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] @@ -8074,7 +8078,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -8082,19 +8088,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: s_mov_b32 s12, s43 ; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1164-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] -; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 @@ -8131,14 +8134,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: v_mov_b32_e32 v41, 0 -; GFX1132-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 @@ -8147,7 +8150,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] +; GFX1132-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8160,22 +8163,24 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: 
v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[44:45] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: v_mov_b32_e32 v31, v40 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_mov_b32_e32 v3, s45 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -8183,16 +8188,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: s_mov_b32 s12, s43 ; GFX1132-NEXT: s_mov_b32 s13, s42 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1132-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] -; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 -; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s44 +; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off +; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8 +; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 @@ -8231,32 +8236,29 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], 
off, s[44:47], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[40:41], v[40:41] ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[4:5], v[2:3] +; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -8277,17 +8279,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_1 @@ -8640,23 +8642,23 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1032-DPP-NEXT: 
s_add_u32 s8, s34, 44 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] @@ -8669,6 +8671,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -8678,9 +8682,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 @@ -8775,9 +8777,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -8786,6 +8788,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] @@ -8793,9 +8796,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] @@ -8804,6 +8806,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -8811,13 +8815,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_4) -; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -8900,9 +8901,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v41, v8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -8910,15 +8911,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] @@ -8926,6 +8927,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -8933,11 +8935,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index f1d58814f9f04..5c5a769178dd9 
100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -54,18 +54,18 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_mov_b64 s[16:17], s[4:5] ; GFX11-NEXT: v_mov_b32_e32 v31, v0 -; GFX11-NEXT: s_load_b32 s24, s[16:17], 0x24 +; GFX11-NEXT: s_load_b32 s19, s[16:17], 0x24 ; GFX11-NEXT: s_mov_b32 s12, s13 ; GFX11-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX11-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v31 ; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX11-NEXT: s_mov_b32 s19, 0 +; GFX11-NEXT: s_mov_b32 s20, 0 ; GFX11-NEXT: s_mov_b32 s0, -1 ; GFX11-NEXT: s_mov_b32 s3, exec_lo ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_lo_u32 v0, s24, v0 +; GFX11-NEXT: v_mul_lo_u32 v0, s19, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11-NEXT: s_cbranch_execz .LBB2_13 @@ -74,7 +74,7 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: s_mov_b32 s18, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_bitcmp1_b32 s21, 0 -; GFX11-NEXT: s_cselect_b32 s19, -1, 0 +; GFX11-NEXT: s_cselect_b32 s24, -1, 0 ; GFX11-NEXT: s_bitcmp0_b32 s21, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX11-NEXT: ; %bb.2: ; %bb15 @@ -110,60 +110,58 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: s_cbranch_scc0 .LBB2_8 ; GFX11-NEXT: ; %bb.5: ; %bb18.preheader ; GFX11-NEXT: s_load_b128 s[28:31], s[16:17], 0x44 -; GFX11-NEXT: s_mov_b32 vcc_lo, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mul_hi_u32 s0, s29, s28 ; GFX11-NEXT: s_mul_i32 s1, s29, s28 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 1 ; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_mov_b32 s13, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, 1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshr_b32 s0, s0, s30 -; GFX11-NEXT: s_mul_i32 s0, s0, s22 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mul_i32 s0, s0, s22 ; GFX11-NEXT: s_mul_i32 s0, s0, s20 -; GFX11-NEXT: s_or_b32 s0, s24, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s19, s0 ; GFX11-NEXT: s_lshl_b64 s[20:21], s[0:1], 1 -; GFX11-NEXT: global_load_u16 v0, v0, s[20:21] +; GFX11-NEXT: s_mov_b32 s0, s1 +; GFX11-NEXT: global_load_u16 v1, v0, s[20:21] +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s24 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: s_mov_b32 vcc_lo, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB2_6: ; %bb18 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s19 +; GFX11-NEXT: v_readfirstlane_b32 s13, v0 ; GFX11-NEXT: 
s_cmp_lg_u32 s1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX11-NEXT: s_cselect_b32 s1, -1, 0 -; GFX11-NEXT: v_readfirstlane_b32 s20, v0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 ; GFX11-NEXT: s_and_b32 s1, s8, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: s_and_b32 s1, s1, exec_lo -; GFX11-NEXT: v_readfirstlane_b32 s21, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: s_cselect_b32 s1, s21, s20 -; GFX11-NEXT: s_and_b32 s20, 0xffff, s13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_readfirstlane_b32 s19, v2 +; GFX11-NEXT: s_cselect_b32 s1, s19, s13 +; GFX11-NEXT: s_and_b32 s13, 0xffff, s0 ; GFX11-NEXT: s_and_b32 s1, s1, 1 -; GFX11-NEXT: s_cmp_lg_u32 s20, 0 -; GFX11-NEXT: s_cselect_b32 s20, -1, 0 -; GFX11-NEXT: s_and_b32 s22, s9, exec_lo -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s20 -; GFX11-NEXT: v_readfirstlane_b32 s20, v1 +; GFX11-NEXT: s_cmp_lg_u32 s13, 0 +; GFX11-NEXT: s_cselect_b32 s13, -1, 0 +; GFX11-NEXT: s_and_b32 s20, s9, exec_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13 +; GFX11-NEXT: v_readfirstlane_b32 s13, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_readfirstlane_b32 s21, v0 -; GFX11-NEXT: s_cselect_b32 s20, s21, s20 +; GFX11-NEXT: v_readfirstlane_b32 s19, v2 +; GFX11-NEXT: s_cselect_b32 s13, s19, s13 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_bitcmp1_b32 s20, 0 -; GFX11-NEXT: s_cselect_b32 s20, 0x100, 0 -; GFX11-NEXT: s_or_b32 s13, s20, s13 +; GFX11-NEXT: s_bitcmp1_b32 s13, 0 +; GFX11-NEXT: s_cselect_b32 s13, 0x100, 0 +; GFX11-NEXT: s_or_b32 s0, s13, s0 ; GFX11-NEXT: s_cbranch_vccz .LBB2_6 ; GFX11-NEXT: ; %bb.7: ; %Flow ; GFX11-NEXT: s_mov_b32 s0, 0 @@ -181,7 +179,7 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: ; %bb.11: ; %Flow6 ; GFX11-NEXT: s_mov_b32 s18, -1 ; GFX11-NEXT: .LBB2_12: ; %Flow11 -; GFX11-NEXT: s_and_b32 s19, s2, exec_lo +; GFX11-NEXT: s_and_b32 s20, s2, exec_lo ; GFX11-NEXT: s_or_not1_b32 s0, s18, exec_lo ; GFX11-NEXT: .LBB2_13: ; %Flow9 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s3 @@ -198,10 +196,10 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: s_mov_b32 s14, s15 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_or_b32 s19, s19, exec_lo +; GFX11-NEXT: s_or_b32 s20, s20, exec_lo ; GFX11-NEXT: .LBB2_15: ; %Flow14 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX11-NEXT: s_and_saveexec_b32 s0, s19 +; GFX11-NEXT: s_and_saveexec_b32 s0, s20 ; GFX11-NEXT: ; %bb.16: ; %UnifiedUnreachableBlock ; GFX11-NEXT: ; divergent unreachable ; GFX11-NEXT: ; %bb.17: ; %UnifiedReturnBlock diff --git a/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir b/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir index 67cdf196a4693..dd478f94e1039 100644 --- a/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir +++ b/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir @@ -32,38 +32,38 @@ body: | ; GCN-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: 
%bb.2(0x04000000), %bb.1(0x7c000000) - ; GCN-NEXT: liveins: $vcc - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc ; GCN-NEXT: [[V_CVT_F64_I32_e32_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY1]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_1]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_2:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY2]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_2]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_3:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY3]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_3]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_4:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY4]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_4]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_5:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY5]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_5]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_6:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY6]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_6]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_7:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY7]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_7]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_8:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY8]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_8]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_9:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY9]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_9]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_10:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY10]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_10]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_11:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY11]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_11]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_12:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY12]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_12]], implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) + ; GCN-NEXT: liveins: $vcc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_1]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_2]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_3]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 
$vcc, [[V_CVT_F64_I32_e32_4]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_5]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_6]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_7]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_8]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_9]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_10]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_11]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_12]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_13:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY13]], implicit $mode, implicit $exec ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_13]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_14:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY14]], implicit $mode, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll index b079637103fa4..eb2d95e4db2d5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll @@ -6,20 +6,20 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32(<4 x i32> %addr, i32 %i ; CHECK-LABEL: struct_atomic_buffer_load_i32: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB0_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen glc +; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -67,20 +67,20 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_off(<4 x i32> %addr, i3 ; CHECK-LABEL: struct_atomic_buffer_load_i32_off: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB2_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen glc +; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_cmp_ne_u32_e32 
vcc_lo, v2, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB2_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -99,20 +99,20 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_soff(<4 x i32> %addr, i ; CHECK-LABEL: struct_atomic_buffer_load_i32_soff: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB3_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 4 idxen offset:4 glc +; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 4 idxen offset:4 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB3_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -130,20 +130,20 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_dlc(<4 x i32> %addr, i3 ; CHECK-LABEL: struct_atomic_buffer_load_i32_dlc: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB4_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen offset:4 dlc +; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen offset:4 dlc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB4_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -194,20 +194,20 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i64(<4 x i32> %addr, i32 %i ; CHECK-LABEL: struct_atomic_buffer_load_i64: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v2, s6 ; CHECK-NEXT: .LBB6_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v2, s4 -; CHECK-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 idxen offset:4 glc +; 
CHECK-NEXT: buffer_load_b64 v[3:4], v2, s[0:3], 0 idxen offset:4 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1] -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[3:4], v[0:1] +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB6_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -227,20 +227,20 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v2i16(<4 x i32> %addr, i32 ; CHECK-LABEL: struct_atomic_buffer_load_v2i16: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB7_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen glc +; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB7_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -260,23 +260,23 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i16(<4 x i32> %addr, i32 ; CHECK-LABEL: struct_atomic_buffer_load_v4i16: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB8_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: buffer_load_b64 v[1:2], v1, s[0:3], 0 idxen offset:4 glc +; CHECK-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; CHECK-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; CHECK-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB8_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -297,20 +297,20 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i32(<4 x i32> %addr, i32 ; CHECK-LABEL: struct_atomic_buffer_load_v4i32: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], 
s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB9_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: buffer_load_b128 v[1:4], v1, s[0:3], 0 idxen offset:4 glc +; CHECK-NEXT: buffer_load_b128 v[2:5], v1, s[0:3], 0 idxen offset:4 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v0 -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB9_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -330,22 +330,22 @@ define amdgpu_kernel void @struct_atomic_buffer_load_ptr(<4 x i32> %addr, i32 %i ; CHECK-LABEL: struct_atomic_buffer_load_ptr: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB10_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: buffer_load_b64 v[1:2], v1, s[0:3], 0 idxen offset:4 glc +; CHECK-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_load_b32 v1, v[1:2] +; CHECK-NEXT: flat_load_b32 v2, v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB10_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll index e056f05aea451..bc50b12b59049 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll @@ -6,20 +6,20 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32(ptr addrspace(8) %p ; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB0_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen glc +; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 
vcc_lo, v1, v0 -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -67,20 +67,20 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_off(ptr addrspace(8 ; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32_off: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB2_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen glc +; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB2_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -99,20 +99,20 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_soff(ptr addrspace( ; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32_soff: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB3_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 4 idxen offset:4 glc +; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 4 idxen offset:4 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB3_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -130,20 +130,20 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_dlc(ptr addrspace(8 ; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32_dlc: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB4_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: 
buffer_load_b32 v1, v1, s[0:3], 0 idxen offset:4 dlc +; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen offset:4 dlc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB4_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -194,20 +194,20 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i64(ptr addrspace(8) %p ; CHECK-LABEL: struct_ptr_atomic_buffer_load_i64: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v2, s6 ; CHECK-NEXT: .LBB6_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v2, s4 -; CHECK-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 idxen offset:4 glc +; CHECK-NEXT: buffer_load_b64 v[3:4], v2, s[0:3], 0 idxen offset:4 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1] -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[3:4], v[0:1] +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB6_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -227,20 +227,20 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v2i16(ptr addrspace(8) ; CHECK-LABEL: struct_ptr_atomic_buffer_load_v2i16: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB7_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen glc +; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB7_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -260,23 +260,23 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) ; CHECK-LABEL: struct_ptr_atomic_buffer_load_v4i16: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: 
s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB8_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: buffer_load_b64 v[1:2], v1, s[0:3], 0 idxen offset:4 glc +; CHECK-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; CHECK-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; CHECK-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB8_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -297,20 +297,20 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i32(ptr addrspace(8) ; CHECK-LABEL: struct_ptr_atomic_buffer_load_v4i32: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB9_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: buffer_load_b128 v[1:4], v1, s[0:3], 0 idxen offset:4 glc +; CHECK-NEXT: buffer_load_b128 v[2:5], v1, s[0:3], 0 idxen offset:4 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v0 -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB9_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -330,22 +330,22 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_ptr(ptr addrspace(8) %p ; CHECK-LABEL: struct_ptr_atomic_buffer_load_ptr: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB10_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: buffer_load_b64 v[1:2], v1, s[0:3], 0 idxen offset:4 glc +; CHECK-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_load_b32 v1, v[1:2] +; CHECK-NEXT: flat_load_b32 v2, v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, 
s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB10_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index f961b7d9d5223..23b57a7efa586 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -5818,38 +5818,39 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: v_dual_add_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v2, v5, v2 -; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s1 @@ -5864,33 +5865,33 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v5, v2 -; GFX10-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4 -; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB24_1 @@ -5904,29 +5905,29 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX90A-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v6, v2 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; 
GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5943,29 +5944,29 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v6, v2 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5983,30 +5984,30 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; 
GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v6, v2 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6124,38 +6125,39 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: v_dual_add_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v2, v5, v2 -; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, 
v5, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6170,33 +6172,33 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v5, v2 -; GFX10-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4 -; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; 
GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB25_1 @@ -6210,29 +6212,29 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX90A-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v6, v2 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6249,29 +6251,29 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v6, v2 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], 
v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6289,30 +6291,30 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v6, v2 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6430,39 +6432,39 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-LABEL: local_atomic_fadd_noret_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v2, v0 +; GFX11-NEXT: ds_load_b32 v3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 
16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_add_f32_e32 v4, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6475,34 +6477,34 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX10-LABEL: local_atomic_fadd_noret_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v2, v0 +; GFX10-NEXT: ds_read_b32 v3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_add_f32_e32 v4, v6, v5 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 
16, v3 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v1 ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1 @@ -6513,35 +6515,35 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX90A-LABEL: local_atomic_fadd_noret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 +; GFX90A-NEXT: ds_read_b32 v3, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 
; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6551,35 +6553,35 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX908-LABEL: local_atomic_fadd_noret_v2bf16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v2, v0 +; GFX908-NEXT: ds_read_b32 v3, v0 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX908-NEXT: v_add_f32_e32 v4, v6, v5 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6590,36 +6592,36 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v2, v0 +; GFX8-NEXT: ds_read_b32 v3, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX8-NEXT: v_add_f32_e32 v4, v6, v5 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6727,39 +6729,39 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-LABEL: local_atomic_fadd_noret_v2bf16__ofset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX11-NEXT: ds_load_b32 v3, v0 offset:65532 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_add_f32_e32 v4, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 ; 
GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6772,34 +6774,34 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX10-LABEL: local_atomic_fadd_noret_v2bf16__ofset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX10-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_add_f32_e32 v4, v6, v5 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v1 ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1 @@ -6810,35 +6812,35 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX90A-LABEL: local_atomic_fadd_noret_v2bf16__ofset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6848,35 +6850,35 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX908-LABEL: local_atomic_fadd_noret_v2bf16__ofset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX908-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; 
GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX908-NEXT: v_add_f32_e32 v4, v6, v5 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6887,36 +6889,36 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX8-NEXT: v_add_f32_e32 v4, v6, v5 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_add_f32_e32 
v5, v5, v1 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7628,22 +7630,21 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_cbranch_execz .LBB28_4 ; GFX7-NEXT: ; %bb.1: -; GFX7-NEXT: s_lshl_b32 s10, s3, 3 -; GFX7-NEXT: v_mov_b32_e32 v1, s10 -; GFX7-NEXT: ds_read_b32 v1, v1 +; GFX7-NEXT: s_lshl_b32 s8, s3, 3 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: ds_read_b32 v1, v2 ; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 -; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 +; GFX7-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB28_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v1, s10 -; GFX7-NEXT: v_add_f32_e32 v4, v3, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v1, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v1 +; GFX7-NEXT: v_add_f32_e32 v1, v4, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v2, v4, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v3 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4 ; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB28_2 @@ -7659,23 +7660,22 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] ; GFX7-NEXT: s_cbranch_execz .LBB28_7 ; GFX7-NEXT: ; %bb.5: -; GFX7-NEXT: s_lshl_b32 s3, s3, 4 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: ds_read_b32 v2, v1 +; GFX7-NEXT: s_lshl_b32 s0, s3, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_read_b32 v3, v1 ; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX7-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 +; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB28_6: ; %atomicrmw.start2 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v2, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, s3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v4, v2, v3 +; GFX7-NEXT: v_add_f32_e32 v4, v3, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v3, v2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] -; GFX7-NEXT: 
v_mov_b32_e32 v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB28_6 ; GFX7-NEXT: .LBB28_7: ; %Flow22 @@ -7710,24 +7710,23 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: s_xor_b64 s[6:7], exec, s[0:1] ; GFX7-NEXT: s_cbranch_execz .LBB28_13 ; GFX7-NEXT: ; %bb.10: -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: ds_read_b32 v2, v3 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: .LBB28_11: ; %atomicrmw.start8 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_add_f32_e32 v4, v3, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v2, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_add_f32_e32 v2, v4, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v3 -; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 +; GFX7-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX7-NEXT: s_cbranch_execnz .LBB28_11 ; GFX7-NEXT: ; %bb.12: ; %Flow -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7-NEXT: .LBB28_13: ; %Flow20 ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -7755,22 +7754,21 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_cbranch_execz .LBB28_4 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_lshl_b32 s10, s3, 3 -; GFX6-NEXT: v_mov_b32_e32 v1, s10 -; GFX6-NEXT: ds_read_b32 v1, v1 +; GFX6-NEXT: s_lshl_b32 s8, s3, 3 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ds_read_b32 v1, v2 ; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 -; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 +; GFX6-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB28_2: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_mov_b32_e32 v1, s10 -; GFX6-NEXT: v_add_f32_e32 v4, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v1, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v1 +; GFX6-NEXT: v_add_f32_e32 v1, v4, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v2, v4, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v3 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4 ; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB28_2 @@ -7786,23 +7784,22 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] ; GFX6-NEXT: s_cbranch_execz .LBB28_7 ; GFX6-NEXT: ; %bb.5: -; GFX6-NEXT: s_lshl_b32 s3, s3, 4 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: ds_read_b32 v2, v1 +; GFX6-NEXT: s_lshl_b32 s0, s3, 4 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_read_b32 v3, v1 ; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX6-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 +; GFX6-NEXT: v_mul_f32_e32 v2, 
0x42280000, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB28_6: ; %atomicrmw.start2 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v2, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, s3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v4, v2, v3 +; GFX6-NEXT: v_add_f32_e32 v4, v3, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v3, v2 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB28_6 ; GFX6-NEXT: .LBB28_7: ; %Flow20 @@ -7837,24 +7834,23 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: s_xor_b64 s[6:7], exec, s[0:1] ; GFX6-NEXT: s_cbranch_execz .LBB28_13 ; GFX6-NEXT: ; %bb.10: -; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_mov_b32_e32 v3, s2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: ds_read_b32 v2, v3 +; GFX6-NEXT: s_mov_b64 s[2:3], 0 ; GFX6-NEXT: .LBB28_11: ; %atomicrmw.start8 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v3, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: v_add_f32_e32 v4, v3, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v2, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_add_f32_e32 v2, v4, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v3 -; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 +; GFX6-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX6-NEXT: s_cbranch_execnz .LBB28_11 ; GFX6-NEXT: ; %bb.12: ; %Flow -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX6-NEXT: .LBB28_13: ; %Flow18 ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -8471,22 +8467,21 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_cbranch_execz .LBB29_4 ; GFX7-NEXT: ; %bb.1: -; GFX7-NEXT: s_lshl_b32 s10, s3, 3 -; GFX7-NEXT: v_mov_b32_e32 v1, s10 -; GFX7-NEXT: ds_read_b32 v1, v1 +; GFX7-NEXT: s_lshl_b32 s8, s3, 3 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: ds_read_b32 v1, v2 ; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 -; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 +; GFX7-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB29_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v1, s10 -; GFX7-NEXT: v_add_f32_e32 v4, v3, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v1, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v1 +; GFX7-NEXT: v_add_f32_e32 v1, v4, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v2, v4, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v3 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4 ; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB29_2 @@ -8502,23 +8497,22 @@ define amdgpu_kernel void 
@local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] ; GFX7-NEXT: s_cbranch_execz .LBB29_7 ; GFX7-NEXT: ; %bb.5: -; GFX7-NEXT: s_lshl_b32 s3, s3, 4 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: ds_read_b32 v2, v1 +; GFX7-NEXT: s_lshl_b32 s0, s3, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_read_b32 v3, v1 ; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX7-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 +; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB29_6: ; %atomicrmw.start2 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v2, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, s3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v4, v2, v3 +; GFX7-NEXT: v_add_f32_e32 v4, v3, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v3, v2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB29_6 ; GFX7-NEXT: .LBB29_7: ; %Flow22 @@ -8553,24 +8547,23 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_xor_b64 s[6:7], exec, s[0:1] ; GFX7-NEXT: s_cbranch_execz .LBB29_13 ; GFX7-NEXT: ; %bb.10: -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: ds_read_b32 v2, v3 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: .LBB29_11: ; %atomicrmw.start8 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_add_f32_e32 v4, v3, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v2, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_add_f32_e32 v2, v4, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v3 -; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 +; GFX7-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX7-NEXT: s_cbranch_execnz .LBB29_11 ; GFX7-NEXT: ; %bb.12: ; %Flow -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7-NEXT: .LBB29_13: ; %Flow20 ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -8598,22 +8591,21 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_cbranch_execz .LBB29_4 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_lshl_b32 s10, s3, 3 -; GFX6-NEXT: v_mov_b32_e32 v1, s10 -; GFX6-NEXT: ds_read_b32 v1, v1 +; GFX6-NEXT: s_lshl_b32 s8, s3, 3 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ds_read_b32 v1, v2 ; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 -; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 +; GFX6-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB29_2: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: 
v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_mov_b32_e32 v1, s10 -; GFX6-NEXT: v_add_f32_e32 v4, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v1, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v1 +; GFX6-NEXT: v_add_f32_e32 v1, v4, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v2, v4, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v3 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4 ; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB29_2 @@ -8629,23 +8621,22 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] ; GFX6-NEXT: s_cbranch_execz .LBB29_7 ; GFX6-NEXT: ; %bb.5: -; GFX6-NEXT: s_lshl_b32 s3, s3, 4 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: ds_read_b32 v2, v1 +; GFX6-NEXT: s_lshl_b32 s0, s3, 4 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_read_b32 v3, v1 ; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX6-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 +; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB29_6: ; %atomicrmw.start2 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v2, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, s3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v4, v2, v3 +; GFX6-NEXT: v_add_f32_e32 v4, v3, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v3, v2 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB29_6 ; GFX6-NEXT: .LBB29_7: ; %Flow20 @@ -8680,24 +8671,23 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: s_xor_b64 s[6:7], exec, s[0:1] ; GFX6-NEXT: s_cbranch_execz .LBB29_13 ; GFX6-NEXT: ; %bb.10: -; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_mov_b32_e32 v3, s2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: ds_read_b32 v2, v3 +; GFX6-NEXT: s_mov_b64 s[2:3], 0 ; GFX6-NEXT: .LBB29_11: ; %atomicrmw.start8 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v3, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: v_add_f32_e32 v4, v3, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v2, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_add_f32_e32 v2, v4, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v3 -; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 +; GFX6-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX6-NEXT: s_cbranch_execnz .LBB29_11 ; GFX6-NEXT: ; %bb.12: ; %Flow -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX6-NEXT: .LBB29_13: ; %Flow18 ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll index 47058de71e7f4..d419b0cdfdd1a 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll +++ 
b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll @@ -4520,15 +4520,15 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 +; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v4, v3, v3 -; GFX12-NEXT: v_pk_max_num_f16 v2, v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4550,17 +4550,17 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_pk_max_f16 v3, v1, v1 -; GFX940-NEXT: v_pk_max_f16 v2, v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v3 -; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB20_1 @@ -4573,15 +4573,15 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v4, v3, v3 -; GFX11-NEXT: v_pk_max_f16 v2, v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -4600,14 +4600,14 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 +; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX10-NEXT: v_pk_max_f16 v4, v3, v3 -; GFX10-NEXT: v_pk_max_f16 v2, v4, v2 +; GFX10-NEXT: v_pk_max_f16 
v2, v3, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -4626,16 +4626,16 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 -; GFX90A-NEXT: v_pk_max_f16 v2, v4, v4 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 @@ -4649,16 +4649,16 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_pk_max_f16 v3, v1, v1 -; GFX908-NEXT: v_pk_max_f16 v2, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v3 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB20_1 @@ -4673,17 +4673,17 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v4, v4 -; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v2, v6, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX8-NEXT: v_max_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v5, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX8-NEXT: 
ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -4792,15 +4792,15 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v4, v3, v3 -; GFX12-NEXT: v_pk_max_num_f16 v2, v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4822,17 +4822,17 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX940-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_pk_max_f16 v3, v1, v1 -; GFX940-NEXT: v_pk_max_f16 v2, v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v3 -; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB21_1 @@ -4845,15 +4845,15 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v4, v3, v3 -; GFX11-NEXT: v_pk_max_f16 v2, v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -4872,14 +4872,14 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: 
v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX10-NEXT: v_pk_max_f16 v4, v3, v3 -; GFX10-NEXT: v_pk_max_f16 v2, v4, v2 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -4898,16 +4898,16 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 -; GFX90A-NEXT: v_pk_max_f16 v2, v4, v4 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 @@ -4921,16 +4921,16 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_pk_max_f16 v3, v1, v1 -; GFX908-NEXT: v_pk_max_f16 v2, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v3 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_1 @@ -4945,17 +4945,17 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v4, v4 -; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v2, v6, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX8-NEXT: v_max_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v5, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5064,14 +5064,14 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 +; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v1, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v3 +; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5093,13 +5093,13 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v3, v4, v3 +; GFX940-NEXT: v_pk_max_f16 v3, v3, v1 ; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5115,14 +5115,14 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v3 +; GFX11-NEXT: v_pk_max_f16 v3, v3, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -5141,13 +5141,13 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 +; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v3, v4, v3 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v3, v3, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -5166,12 +5166,12 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: v_pk_max_f16 v3, v4, v3 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX90A-NEXT: v_pk_max_f16 v3, v3, v1 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5188,12 +5188,12 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: v_pk_max_f16 v3, v4, v3 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v3, v1 ; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5209,23 +5209,23 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v2, v0 +; GFX8-NEXT: ds_read_b32 v3, v0 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v1, v1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v4, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v3, v3 +; GFX8-NEXT: v_max_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v5, v1 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5326,14 +5326,14 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: 
v_pk_max_num_f16 v3, v1, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v3 +; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5355,13 +5355,13 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX940-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v3, v4, v3 +; GFX940-NEXT: v_pk_max_f16 v3, v3, v1 ; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5377,14 +5377,14 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v3 +; GFX11-NEXT: v_pk_max_f16 v3, v3, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -5403,13 +5403,13 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v3, v4, v3 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v3, v3, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -5428,12 +5428,12 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: v_pk_max_f16 v3, v4, v3 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX90A-NEXT: v_pk_max_f16 v3, v3, v1 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5450,12 +5450,12 @@ define void 
@local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: v_pk_max_f16 v3, v4, v3 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v3, v1 ; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5471,23 +5471,23 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v1, v1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v4, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v3, v3 +; GFX8-NEXT: v_max_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v5, v1 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5594,36 +5594,37 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: v_dual_max_num_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v2, v5, v2 -; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 -; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5640,30 +5641,30 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v6, v2 -; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX940-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX940-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 
+; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v3, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v5, v2, s5 ; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5679,38 +5680,39 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: v_dual_max_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v2, v5, v2 -; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX11-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 +; GFX11-NEXT: 
ds_cmpstore_rtn_b32 v2, v0, v2, v4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -5725,33 +5727,33 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_max_f32_e32 v2, v5, v2 -; GFX10-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4 -; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB24_1 @@ -5765,29 +5767,29 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v6, v2 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: 
v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5804,29 +5806,29 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v6, v2 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5844,30 +5846,30 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v6, v2 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5969,36 +5971,37 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: v_dual_max_num_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v2, v5, v2 -; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v1 +; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6015,30 +6018,30 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v6, v2 -; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX940-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX940-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v3, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: 
v_cndmask_b32_e64 v2, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v5, v2, s5 ; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6054,38 +6057,39 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: v_dual_max_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v2, v5, v2 -; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX11-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6100,33 +6104,33 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX10: ; %bb.0: ; 
GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_max_f32_e32 v2, v5, v2 -; GFX10-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4 -; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB25_1 @@ -6140,29 +6144,29 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v6, v2 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; 
GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6179,29 +6183,29 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v6, v2 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6219,30 +6223,30 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; 
GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v6, v2 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6344,37 +6348,37 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v2, v0 +; GFX12-NEXT: ds_load_b32 v3, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_max_num_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_max_num_f32_e32 v4, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_max_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-NEXT: v_max_num_f32_e32 v4, v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6388,36 +6392,36 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX940-LABEL: local_atomic_fmax_noret_v2bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_read_b32 v2, v0 +; GFX940-NEXT: ds_read_b32 v3, v0 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v6, v5 -; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX940-NEXT: v_add3_u32 v7, v7, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX940-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1] -; GFX940-NEXT: v_perm_b32 v3, v4, v3, s5 -; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v5, v4, s5 +; GFX940-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB26_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 
@@ -6427,39 +6431,39 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-LABEL: local_atomic_fmax_noret_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v2, v0 +; GFX11-NEXT: ds_load_b32 v3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_max_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_max_f32_e32 v4, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_max_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6472,34 +6476,34 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX10-LABEL: local_atomic_fmax_noret_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v2, v0 +; GFX10-NEXT: ds_read_b32 v3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: 
v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_max_f32_e32 v4, v6, v5 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX10-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v1 ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1 @@ -6510,35 +6514,35 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX90A-LABEL: local_atomic_fmax_noret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 +; GFX90A-NEXT: ds_read_b32 v3, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 
+; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6548,35 +6552,35 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX908-LABEL: local_atomic_fmax_noret_v2bf16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v2, v0 +; GFX908-NEXT: ds_read_b32 v3, v0 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX908-NEXT: v_max_f32_e32 v4, v6, v5 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX908-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX908-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6587,36 +6591,36 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v2, v0 +; GFX8-NEXT: ds_read_b32 v3, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: 
v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX8-NEXT: v_max_f32_e32 v4, v6, v5 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6708,37 +6712,37 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX12-NEXT: ds_load_b32 v3, v0 offset:65532 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_max_num_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_max_num_f32_e32 v4, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_max_num_f32 v5, v5, 
v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-NEXT: v_max_num_f32_e32 v4, v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6752,36 +6756,36 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX940-LABEL: local_atomic_fmax_noret_v2bf16__ofset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX940-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v6, v5 -; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX940-NEXT: v_add3_u32 v7, v7, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX940-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX940-NEXT: 
v_cndmask_b32_e64 v3, v5, v6, s[0:1] -; GFX940-NEXT: v_perm_b32 v3, v4, v3, s5 -; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v5, v4, s5 +; GFX940-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB27_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6791,39 +6795,39 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-LABEL: local_atomic_fmax_noret_v2bf16__ofset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX11-NEXT: ds_load_b32 v3, v0 offset:65532 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_max_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_max_f32_e32 v4, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_max_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 
v4, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6836,34 +6840,34 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX10-LABEL: local_atomic_fmax_noret_v2bf16__ofset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX10-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_max_f32_e32 v4, v6, v5 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX10-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v1 ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1 @@ -6874,35 +6878,35 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX90A-LABEL: local_atomic_fmax_noret_v2bf16__ofset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: 
v_bfe_u32 v7, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6912,35 +6916,35 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX908-LABEL: local_atomic_fmax_noret_v2bf16__ofset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX908-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX908-NEXT: v_max_f32_e32 v4, v6, v5 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX908-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX908-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 
vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6951,36 +6955,36 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX8-NEXT: v_max_f32_e32 v4, v6, v5 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll index 1c9cff7326d65..282947afa409a 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll +++ 
b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll @@ -4520,15 +4520,15 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 +; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v4, v3, v3 -; GFX12-NEXT: v_pk_min_num_f16 v2, v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4550,17 +4550,17 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_pk_max_f16 v3, v1, v1 -; GFX940-NEXT: v_pk_max_f16 v2, v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v2, v2, v3 -; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX940-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB20_1 @@ -4573,15 +4573,15 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v4, v3, v3 -; GFX11-NEXT: v_pk_min_f16 v2, v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_min_f16 v2, v2, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -4600,14 +4600,14 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 +; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX10-NEXT: v_pk_max_f16 v4, v3, v3 -; GFX10-NEXT: v_pk_min_f16 v2, v4, v2 +; GFX10-NEXT: v_pk_max_f16 
v2, v3, v3 +; GFX10-NEXT: v_pk_min_f16 v2, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -4626,16 +4626,16 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 -; GFX90A-NEXT: v_pk_max_f16 v2, v4, v4 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 @@ -4649,16 +4649,16 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_pk_max_f16 v3, v1, v1 -; GFX908-NEXT: v_pk_max_f16 v2, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v2, v2, v3 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB20_1 @@ -4673,17 +4673,17 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v2, v6, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v5, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX8-NEXT: 
ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -4792,15 +4792,15 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v4, v3, v3 -; GFX12-NEXT: v_pk_min_num_f16 v2, v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4822,17 +4822,17 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX940-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_pk_max_f16 v3, v1, v1 -; GFX940-NEXT: v_pk_max_f16 v2, v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v2, v2, v3 -; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 +; GFX940-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB21_1 @@ -4845,15 +4845,15 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v4, v3, v3 -; GFX11-NEXT: v_pk_min_f16 v2, v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_min_f16 v2, v2, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -4872,14 +4872,14 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: 
v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX10-NEXT: v_pk_max_f16 v4, v3, v3 -; GFX10-NEXT: v_pk_min_f16 v2, v4, v2 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_min_f16 v2, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -4898,16 +4898,16 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 -; GFX90A-NEXT: v_pk_max_f16 v2, v4, v4 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 @@ -4921,16 +4921,16 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_pk_max_f16 v3, v1, v1 -; GFX908-NEXT: v_pk_max_f16 v2, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v2, v2, v3 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_1 @@ -4945,17 +4945,17 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v2, v6, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v5, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5064,14 +5064,14 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 +; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v1, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v3 +; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5093,13 +5093,13 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v3, v4, v3 +; GFX940-NEXT: v_pk_min_f16 v3, v3, v1 ; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5115,14 +5115,14 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v3, v4, v3 +; GFX11-NEXT: v_pk_min_f16 v3, v3, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -5141,13 +5141,13 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 +; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX10-NEXT: v_pk_min_f16 v3, v4, v3 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_pk_min_f16 v3, v3, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -5166,12 +5166,12 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: v_pk_min_f16 v3, v4, v3 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX90A-NEXT: v_pk_min_f16 v3, v3, v1 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5188,12 +5188,12 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: v_pk_min_f16 v3, v4, v3 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX908-NEXT: v_pk_min_f16 v3, v3, v1 ; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5209,23 +5209,23 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v2, v0 +; GFX8-NEXT: ds_read_b32 v3, v0 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v1, v1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_min_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v4, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v3, v3 +; GFX8-NEXT: v_min_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v5, v1 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5326,14 +5326,14 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: 
v_pk_max_num_f16 v3, v1, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v3 +; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5355,13 +5355,13 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX940-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v3, v4, v3 +; GFX940-NEXT: v_pk_min_f16 v3, v3, v1 ; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5377,14 +5377,14 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v3, v4, v3 +; GFX11-NEXT: v_pk_min_f16 v3, v3, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -5403,13 +5403,13 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX10-NEXT: v_pk_min_f16 v3, v4, v3 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_pk_min_f16 v3, v3, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -5428,12 +5428,12 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: v_pk_min_f16 v3, v4, v3 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX90A-NEXT: v_pk_min_f16 v3, v3, v1 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5450,12 +5450,12 @@ define void 
@local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: v_pk_min_f16 v3, v4, v3 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX908-NEXT: v_pk_min_f16 v3, v3, v1 ; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5471,23 +5471,23 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v1, v1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_min_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v4, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v3, v3 +; GFX8-NEXT: v_min_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v5, v1 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5594,36 +5594,37 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: v_dual_min_num_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v2, v5, v2 -; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 -; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5640,30 +5641,30 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX940-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v6, v2 -; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX940-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX940-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 
+; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v3, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v5, v2, s5 ; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5679,38 +5680,39 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: v_dual_min_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v2, v5, v2 -; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX11-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 +; GFX11-NEXT: 
ds_cmpstore_rtn_b32 v2, v0, v2, v4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -5725,33 +5727,33 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_min_f32_e32 v2, v5, v2 -; GFX10-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX10-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4 -; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB24_1 @@ -5765,29 +5767,29 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX90A-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v6, v2 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: 
v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5804,29 +5806,29 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v6, v2 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX908-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5844,30 +5846,30 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v6, v2 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5969,36 +5971,37 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: v_dual_min_num_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v2, v5, v2 -; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v1 +; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6015,30 +6018,30 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX940-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v6, v2 -; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX940-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX940-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v3, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: 
v_cndmask_b32_e64 v2, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v5, v2, s5 ; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6054,38 +6057,39 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: v_dual_min_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v2, v5, v2 -; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX11-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6100,33 +6104,33 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX10: ; %bb.0: ; 
GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_min_f32_e32 v2, v5, v2 -; GFX10-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX10-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4 -; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB25_1 @@ -6140,29 +6144,29 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX90A-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v6, v2 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; 
GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6179,29 +6183,29 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v6, v2 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX908-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6219,30 +6223,30 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; 
GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v6, v2 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6344,37 +6348,37 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v2, v0 +; GFX12-NEXT: ds_load_b32 v3, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_min_num_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_min_num_f32_e32 v4, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_min_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-NEXT: v_min_num_f32_e32 v4, v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6388,36 +6392,36 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX940-LABEL: local_atomic_fmin_noret_v2bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_read_b32 v2, v0 +; GFX940-NEXT: ds_read_b32 v3, v0 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_min_f32_e32 v4, v6, v5 -; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX940-NEXT: v_add3_u32 v7, v7, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX940-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX940-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1] -; GFX940-NEXT: v_perm_b32 v3, v4, v3, s5 -; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v5, v4, s5 +; GFX940-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB26_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 
@@ -6427,39 +6431,39 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-LABEL: local_atomic_fmin_noret_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v2, v0 +; GFX11-NEXT: ds_load_b32 v3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_min_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_min_f32_e32 v4, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_min_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6472,34 +6476,34 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX10-LABEL: local_atomic_fmin_noret_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v2, v0 +; GFX10-NEXT: ds_read_b32 v3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: 
v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_min_f32_e32 v4, v6, v5 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX10-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX10-NEXT: v_min_f32_e32 v5, v5, v1 ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1 @@ -6510,35 +6514,35 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX90A-LABEL: local_atomic_fmin_noret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 +; GFX90A-NEXT: ds_read_b32 v3, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_min_f32_e32 v4, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 
+; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6548,35 +6552,35 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX908-LABEL: local_atomic_fmin_noret_v2bf16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v2, v0 +; GFX908-NEXT: ds_read_b32 v3, v0 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX908-NEXT: v_min_f32_e32 v4, v6, v5 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX908-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX908-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6587,36 +6591,36 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v2, v0 +; GFX8-NEXT: ds_read_b32 v3, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: 
v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX8-NEXT: v_min_f32_e32 v4, v6, v5 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6708,37 +6712,37 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX12-NEXT: ds_load_b32 v3, v0 offset:65532 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_min_num_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_min_num_f32_e32 v4, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_min_num_f32 v5, v5, 
v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-NEXT: v_min_num_f32_e32 v4, v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6752,36 +6756,36 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX940-LABEL: local_atomic_fmin_noret_v2bf16__ofset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX940-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_min_f32_e32 v4, v6, v5 -; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX940-NEXT: v_add3_u32 v7, v7, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX940-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX940-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX940-NEXT: 
v_cndmask_b32_e64 v3, v5, v6, s[0:1] -; GFX940-NEXT: v_perm_b32 v3, v4, v3, s5 -; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v5, v4, s5 +; GFX940-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB27_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6791,39 +6795,39 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-LABEL: local_atomic_fmin_noret_v2bf16__ofset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX11-NEXT: ds_load_b32 v3, v0 offset:65532 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_min_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_min_f32_e32 v4, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_min_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 
v4, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6836,34 +6840,34 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX10-LABEL: local_atomic_fmin_noret_v2bf16__ofset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX10-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_min_f32_e32 v4, v6, v5 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX10-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX10-NEXT: v_min_f32_e32 v5, v5, v1 ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1 @@ -6874,35 +6878,35 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX90A-LABEL: local_atomic_fmin_noret_v2bf16__ofset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_min_f32_e32 v4, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: 
v_bfe_u32 v7, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6912,35 +6916,35 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX908-LABEL: local_atomic_fmin_noret_v2bf16__ofset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX908-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX908-NEXT: v_min_f32_e32 v4, v6, v5 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX908-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX908-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 
vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6951,36 +6955,36 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX8-NEXT: v_min_f32_e32 v4, v6, v5 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll index 2433eca80b23c..1b08b64b046b4 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll +++ 
b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll @@ -6390,36 +6390,37 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: v_dual_sub_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v2, v5, v2 -; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX12-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6436,30 +6437,30 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB24_1: ; %atomicrmw.start ; 
GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX940-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX940-NEXT: v_sub_f32_e32 v2, v6, v2 -; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX940-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX940-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX940-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v3, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v5, v2, s5 ; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6475,38 +6476,39 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: v_dual_sub_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v2, v5, v2 -; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX11-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6521,33 +6523,33 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_sub_f32_e32 v2, v5, v2 -; GFX10-NEXT: v_sub_f32_e32 v4, v6, v4 -; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX10-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4 -; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX10-NEXT: 
s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB24_1 @@ -6561,29 +6563,29 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX90A-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX90A-NEXT: v_sub_f32_e32 v2, v6, v2 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6600,29 +6602,29 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_sub_f32_e32 v2, v6, v2 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v2, v3, 
v2, s9 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX908-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6640,30 +6642,30 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_sub_f32_e32 v2, v6, v2 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6765,36 +6767,37 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: 
v_dual_lshlrev_b32 v2, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: v_dual_sub_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v2, v5, v2 -; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX12-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6811,30 +6814,30 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX940-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX940-NEXT: v_sub_f32_e32 v2, v6, v2 -; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 
0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX940-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX940-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX940-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v3, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v5, v2, s5 ; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6850,38 +6853,39 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: v_dual_sub_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v2, v5, v2 -; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX11-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: 
v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6896,33 +6900,33 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_sub_f32_e32 v2, v5, v2 -; GFX10-NEXT: v_sub_f32_e32 v4, v6, v4 -; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX10-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4 -; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB25_1 @@ -6936,29 +6940,29 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 
0x7fff +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX90A-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX90A-NEXT: v_sub_f32_e32 v2, v6, v2 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6975,29 +6979,29 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_sub_f32_e32 v2, v6, v2 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX908-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, 
v6, v2, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -7015,30 +7019,30 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_sub_f32_e32 v2, v6, v2 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -7140,37 +7144,37 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v2, v0 +; GFX12-NEXT: ds_load_b32 v3, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_sub_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_sub_f32_e32 v4, v6, v5 -; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7184,36 +7188,36 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX940-LABEL: local_atomic_fsub_noret_v2bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_read_b32 v2, v0 +; GFX940-NEXT: ds_read_b32 v3, v0 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_sub_f32_e32 v4, v6, v5 -; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX940-NEXT: v_add3_u32 v7, v7, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX940-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX940-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_or_b32_e32 v9, 
0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1] -; GFX940-NEXT: v_perm_b32 v3, v4, v3, s5 -; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v5, v4, s5 +; GFX940-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB26_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7223,39 +7227,39 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-LABEL: local_atomic_fsub_noret_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v2, v0 +; GFX11-NEXT: ds_load_b32 v3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_sub_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_sub_f32_e32 v4, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, 
v3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -7268,34 +7272,34 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX10-LABEL: local_atomic_fsub_noret_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v2, v0 +; GFX10-NEXT: ds_read_b32 v3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_sub_f32_e32 v4, v6, v5 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX10-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX10-NEXT: v_sub_f32_e32 v5, v5, v1 ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1 @@ -7306,35 +7310,35 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX90A-LABEL: local_atomic_fsub_noret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 +; GFX90A-NEXT: ds_read_b32 v3, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3 -; 
GFX90A-NEXT: v_sub_f32_e32 v4, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7344,35 +7348,35 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX908-LABEL: local_atomic_fsub_noret_v2bf16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v2, v0 +; GFX908-NEXT: ds_read_b32 v3, v0 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX908-NEXT: v_sub_f32_e32 v4, v6, v5 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX908-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX908-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; 
GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7383,36 +7387,36 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v2, v0 +; GFX8-NEXT: ds_read_b32 v3, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX8-NEXT: v_sub_f32_e32 v4, v6, v5 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7504,37 +7508,37 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX12-NEXT: 
ds_load_b32 v3, v0 offset:65532 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_sub_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_sub_f32_e32 v4, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7548,36 +7552,36 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX940-LABEL: local_atomic_fsub_noret_v2bf16__ofset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX940-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_sub_f32_e32 v4, v6, v5 -; 
GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX940-NEXT: v_add3_u32 v7, v7, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX940-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX940-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1] -; GFX940-NEXT: v_perm_b32 v3, v4, v3, s5 -; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v5, v4, s5 +; GFX940-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB27_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7587,39 +7591,39 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-LABEL: local_atomic_fsub_noret_v2bf16__ofset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX11-NEXT: ds_load_b32 v3, v0 offset:65532 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_sub_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_sub_f32_e32 v4, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 
0x7fff ; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -7632,34 +7636,34 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX10-LABEL: local_atomic_fsub_noret_v2bf16__ofset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX10-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_sub_f32_e32 v4, v6, v5 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX10-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX10-NEXT: v_sub_f32_e32 v5, v5, v1 ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1 @@ -7670,35 +7674,35 @@ define 
void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX90A-LABEL: local_atomic_fsub_noret_v2bf16__ofset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7708,35 +7712,35 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX908-LABEL: local_atomic_fsub_noret_v2bf16__ofset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX908-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX908-NEXT: v_sub_f32_e32 v4, v6, v5 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: 
v_bfe_u32 v7, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX908-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX908-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7747,36 +7751,36 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX8-NEXT: v_sub_f32_e32 v4, v6, v5 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll index be8a6295c8a71..8157b1a7f7c80 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll @@ -161,13 +161,13 @@ define void @issue63986_reduced_expanded(i64 %idxprom) { ; CHECK-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; CHECK-NEXT: s_cbranch_execnz .LBB1_8 ; CHECK-NEXT: .LBB1_5: ; %loop-memcpy-residual.preheader -; CHECK-NEXT: s_mov_b64 s[6:7], 0 -; CHECK-NEXT: .LBB1_6: ; %loop-memcpy-residual -; CHECK-NEXT: s_add_u32 s6, s6, 1 ; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: s_mov_b64 s[6:7], 0 ; CHECK-NEXT: v_mov_b32_e32 v1, s5 -; CHECK-NEXT: s_addc_u32 s7, s7, 0 -; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; CHECK-NEXT: .LBB1_6: ; %loop-memcpy-residual +; CHECK-NEXT: s_add_u32 s4, s6, 1 +; CHECK-NEXT: s_addc_u32 s5, s7, 0 +; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] ; CHECK-NEXT: s_mov_b64 s[6:7], 1 ; CHECK-NEXT: s_cbranch_vccnz .LBB1_6 ; CHECK-NEXT: ; %bb.7: ; %Flow diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll index 86a6ad46e8768..a9b8663a48dea 100644 --- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll @@ -5,21 +5,23 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX942-LABEL: matmul_kernel: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NEXT: v_mov_b32_e32 v1, 0 -; GFX942-NEXT: s_mov_b32 s0, 0 +; GFX942-NEXT: s_mov_b32 s2, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a0, v1 -; GFX942-NEXT: s_mov_b32 s1, 0 +; GFX942-NEXT: s_mov_b32 s3, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_cmp_lg_u32 s2, 0 -; GFX942-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX942-NEXT: s_cmp_lg_u32 s0, 0 +; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 ; GFX942-NEXT: s_branch .LBB0_2 ; GFX942-NEXT: .LBB0_1: ; %bb2 ; GFX942-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GFX942-NEXT: s_or_b32 s4, s1, 1 -; GFX942-NEXT: s_ashr_i32 s5, s1, 31 -; GFX942-NEXT: s_mov_b32 s1, s0 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX942-NEXT: s_or_b32 s4, s3, 1 +; GFX942-NEXT: s_ashr_i32 s5, s3, 31 +; GFX942-NEXT: s_mov_b32 s3, s2 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[2:3] ; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 @@ -27,54 +29,56 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX942-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 ; GFX942-NEXT: v_accvgpr_write_b32 a3, v3 -; 
GFX942-NEXT: s_and_b32 s1, s5, s4 +; GFX942-NEXT: s_and_b32 s3, s5, s4 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[0:3], v[4:5], v[4:5], a[0:3] ; GFX942-NEXT: s_cbranch_execz .LBB0_4 ; GFX942-NEXT: .LBB0_2: ; %bb ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX942-NEXT: s_and_b64 vcc, exec, s[0:1] ; GFX942-NEXT: s_cbranch_vccz .LBB0_1 ; GFX942-NEXT: ; %bb.3: -; GFX942-NEXT: ; implicit-def: $sgpr1 +; GFX942-NEXT: ; implicit-def: $sgpr3 ; GFX942-NEXT: .LBB0_4: ; %common.ret ; GFX942-NEXT: s_endpgm ; ; GFX908-LABEL: matmul_kernel: ; GFX908: ; %bb.0: ; %entry -; GFX908-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GFX908-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX908-NEXT: v_mov_b32_e32 v1, 0 -; GFX908-NEXT: s_mov_b32 s0, 0 -; GFX908-NEXT: s_mov_b32 s1, 0 +; GFX908-NEXT: s_mov_b32 s2, 0 +; GFX908-NEXT: s_mov_b32 s3, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a0, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_cmp_lg_u32 s2, 0 -; GFX908-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX908-NEXT: s_cmp_lg_u32 s0, 0 +; GFX908-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 ; GFX908-NEXT: s_branch .LBB0_2 ; GFX908-NEXT: .LBB0_1: ; %bb2 ; GFX908-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GFX908-NEXT: s_or_b32 s4, s1, 1 -; GFX908-NEXT: s_ashr_i32 s5, s1, 31 -; GFX908-NEXT: s_mov_b32 s1, s0 +; GFX908-NEXT: s_or_b32 s4, s3, 1 +; GFX908-NEXT: s_ashr_i32 s5, s3, 31 +; GFX908-NEXT: s_mov_b32 s3, s2 ; GFX908-NEXT: s_nop 3 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX908-NEXT: v_mov_b32_e32 v5, s1 -; GFX908-NEXT: v_mov_b32_e32 v4, s0 +; GFX908-NEXT: v_mov_b32_e32 v5, s3 +; GFX908-NEXT: v_mov_b32_e32 v4, s2 ; GFX908-NEXT: v_mov_b32_e32 v2, v1 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 ; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 -; GFX908-NEXT: s_and_b32 s1, s5, s4 +; GFX908-NEXT: s_and_b32 s3, s5, s4 ; GFX908-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[4:5], v[4:5], a[0:3] ; GFX908-NEXT: s_cbranch_execz .LBB0_4 ; GFX908-NEXT: .LBB0_2: ; %bb ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX908-NEXT: s_and_b64 vcc, exec, s[0:1] ; GFX908-NEXT: s_cbranch_vccz .LBB0_1 ; GFX908-NEXT: ; %bb.3: -; GFX908-NEXT: ; implicit-def: $sgpr1 +; GFX908-NEXT: ; implicit-def: $sgpr3 ; GFX908-NEXT: .LBB0_4: ; %common.ret ; GFX908-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll index c80254210109a..3e45a2d0df43d 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll +++ b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll @@ -23,8 +23,10 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) { ; GCN-NEXT: ; Child Loop BB0_4 Depth 2 ; GCN-NEXT: buffer_load_dword v1, off, s[8:11], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 ; GCN-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, v1 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1 ; GCN-NEXT: s_mov_b32 s12, s6 ; GCN-NEXT: s_branch .LBB0_4 ; GCN-NEXT: .LBB0_3: ; %Flow1 @@ -34,7 +36,7 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) { ; GCN-NEXT: .LBB0_4: ; %bb2 ; GCN-NEXT: ; Parent Loop BB0_2 Depth=1 ; GCN-NEXT: ; => This Inner Loop Header: Depth=2 
-; GCN-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_and_b64 vcc, exec, s[0:1] ; GCN-NEXT: s_lshl_b32 s12, s12, 5 ; GCN-NEXT: s_cbranch_vccz .LBB0_6 ; GCN-NEXT: ; %bb.5: ; in Loop: Header=BB0_4 Depth=2 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index cb9f6c1f38eea..96dd6276f7e38 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -1744,8 +1744,8 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_addc_u32_e64 v5, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 ; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 ; GCN-IR-NEXT: .LBB13_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index c1cc61bbacd0f..23364e860d154 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -1865,8 +1865,8 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_addc_u32_e64 v7, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: .LBB13_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll index 15dc545175055..12eec4fa3bd59 100644 --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -32,51 +32,71 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-LABEL: kernel: ; GLOBALNESS1: ; %bb.0: ; %bb ; GLOBALNESS1-NEXT: s_mov_b64 s[36:37], s[6:7] -; GLOBALNESS1-NEXT: s_load_dwordx4 s[64:67], s[8:9], 0x0 +; GLOBALNESS1-NEXT: s_load_dwordx4 s[76:79], s[8:9], 0x0 ; GLOBALNESS1-NEXT: s_load_dword s6, s[8:9], 0x14 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v42, 0 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS1-NEXT: global_store_dword v[0:1], v42, off ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: global_load_dword v2, v42, s[64:65] -; GLOBALNESS1-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GLOBALNESS1-NEXT: s_add_u32 s0, s0, s17 -; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0 +; GLOBALNESS1-NEXT: global_load_dword v2, v42, s[76:77] ; GLOBALNESS1-NEXT: s_mov_b64 s[40:41], s[4:5] ; GLOBALNESS1-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 ; GLOBALNESS1-NEXT: s_load_dword s7, s[8:9], 0x20 -; GLOBALNESS1-NEXT: s_bitcmp1_b32 s66, 0 -; GLOBALNESS1-NEXT: s_cselect_b64 s[68:69], -1, 0 +; GLOBALNESS1-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GLOBALNESS1-NEXT: s_add_u32 s0, s0, s17 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0 -; GLOBALNESS1-NEXT: s_xor_b64 s[70:71], s[68:69], -1 +; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x40994400 -; GLOBALNESS1-NEXT: s_bitcmp1_b32 s6, 0 +; GLOBALNESS1-NEXT: s_bitcmp1_b32 s78, 0 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[42:43], s[4:5], v[0:1] -; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[64:65], s[4:5], 0 +; 
GLOBALNESS1-NEXT: v_cmp_ngt_f64_e32 vcc, s[4:5], v[0:1] +; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] +; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GLOBALNESS1-NEXT: s_bitcmp1_b32 s6, 0 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v0 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS1-NEXT: s_xor_b64 s[72:73], s[4:5], -1 +; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s7, 0 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[48:49], 1, v0 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS1-NEXT: s_mov_b32 s60, s16 +; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v0 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v1 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[46:47], 1, v3 +; GLOBALNESS1-NEXT: s_mov_b32 s70, s16 ; GLOBALNESS1-NEXT: s_mov_b64 s[38:39], s[8:9] -; GLOBALNESS1-NEXT: s_mov_b32 s61, s15 -; GLOBALNESS1-NEXT: s_mov_b32 s62, s14 +; GLOBALNESS1-NEXT: s_mov_b32 s71, s15 +; GLOBALNESS1-NEXT: s_mov_b32 s72, s14 ; GLOBALNESS1-NEXT: s_mov_b64 s[34:35], s[10:11] -; GLOBALNESS1-NEXT: s_xor_b64 s[74:75], s[4:5], -1 ; GLOBALNESS1-NEXT: s_mov_b32 s32, 0 ; GLOBALNESS1-NEXT: ; implicit-def: $vgpr44_vgpr45 ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[44:45], 0, v2 -; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[46:47], 1, v2 -; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[48:49], 1, v2 -; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[50:51], 0, v2 +; GLOBALNESS1-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GLOBALNESS1-NEXT: v_cmp_gt_i32_e32 vcc, 1, v2 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[54:55], 1, v0 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[56:57], 1, v1 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[58:59], 1, v3 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[60:61], 1, v2 ; GLOBALNESS1-NEXT: s_branch .LBB1_4 ; GLOBALNESS1-NEXT: .LBB1_1: ; %bb70.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[50:51] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[60:61] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_29 ; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow15 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -105,23 +125,23 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s62 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s61 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s60 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[16:17] -; 
GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[68:69] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[46:47] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_9 ; GLOBALNESS1-NEXT: ; %bb.5: ; %NodeBlock ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_cmp_lt_i32 s67, 1 +; GLOBALNESS1-NEXT: s_cmp_lt_i32 s79, 1 ; GLOBALNESS1-NEXT: s_cbranch_scc1 .LBB1_7 ; GLOBALNESS1-NEXT: ; %bb.6: ; %LeafBlock12 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_cmp_lg_u32 s67, 1 +; GLOBALNESS1-NEXT: s_cmp_lg_u32 s79, 1 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], -1 ; GLOBALNESS1-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_8 @@ -131,7 +151,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GLOBALNESS1-NEXT: .LBB1_8: ; %LeafBlock ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_cmp_lg_u32 s67, 0 +; GLOBALNESS1-NEXT: s_cmp_lg_u32 s79, 0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], 0 ; GLOBALNESS1-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GLOBALNESS1-NEXT: .LBB1_9: ; %Flow25 @@ -143,15 +163,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GLOBALNESS1-NEXT: flat_load_dword v0, v[2:3] ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[52:53], 0, v0 +; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[62:63], 0, v0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x3ff00000 -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[76:77], s[52:53] +; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[74:75], s[62:63] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_26 ; GLOBALNESS1-NEXT: ; %bb.11: ; %bb33.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[2:3], off -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[44:45] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[54:55] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_13 ; GLOBALNESS1-NEXT: ; %bb.12: ; %bb39.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -163,70 +183,72 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46 ; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e64 s[54:55], 0, v[0:1] -; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[56:57], 0, v2 +; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[64:65], 0, v2 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0 ; GLOBALNESS1-NEXT: s_branch .LBB1_16 ; GLOBALNESS1-NEXT: .LBB1_14: ; %Flow16 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[4:5] ; GLOBALNESS1-NEXT: .LBB1_15: ; %bb63.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[74:75] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[52:53] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_25 ; GLOBALNESS1-NEXT: .LBB1_16: ; %bb44.i ; GLOBALNESS1-NEXT: ; Parent Loop BB1_4 Depth=1 ; GLOBALNESS1-NEXT: ; => This Inner Loop Header: Depth=2 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[70:71] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[48:49] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS1-NEXT: ; %bb.17: ; %bb46.i ; 
GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[72:73] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[50:51] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS1-NEXT: ; %bb.18: ; %bb50.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[42:43] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[42:43] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS1-NEXT: ; %bb.19: ; %bb3.i.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[64:65] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[44:45] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS1-NEXT: ; %bb.20: ; %bb6.i.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[54:55] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[66:67] ; GLOBALNESS1-NEXT: .LBB1_21: ; %spam.exit.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[46:47] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[56:57] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS1-NEXT: ; %bb.22: ; %bb55.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_add_u32 s58, s38, 40 -; GLOBALNESS1-NEXT: s_addc_u32 s59, s39, 0 +; GLOBALNESS1-NEXT: s_add_u32 s68, s38, 40 +; GLOBALNESS1-NEXT: s_addc_u32 s69, s39, 0 ; GLOBALNESS1-NEXT: s_getpc_b64 s[4:5] ; GLOBALNESS1-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 ; GLOBALNESS1-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12 -; GLOBALNESS1-NEXT: s_load_dwordx2 s[78:79], s[4:5], 0x0 +; GLOBALNESS1-NEXT: s_load_dwordx2 s[76:77], s[4:5], 0x0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] -; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[58:59] +; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[68:69] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s62 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s61 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s60 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[78:79] +; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77] ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[46:47], 0, 0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] -; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[58:59] +; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[68:69] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s62 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s61 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s60 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[44:45], off -; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[78:79] -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[56:57] +; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77] +; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[64:65] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_14 ; GLOBALNESS1-NEXT: ; %bb.23: ; %bb62.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 @@ -242,12 +264,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS1-NEXT: .LBB1_26: ; %Flow24 ; GLOBALNESS1-NEXT: ; in 
Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[76:77] -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[52:53] +; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[74:75] +; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[62:63] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_2 ; GLOBALNESS1-NEXT: ; %bb.27: ; %bb67.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[48:49] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[58:59] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS1-NEXT: ; %bb.28: ; %bb69.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -271,9 +293,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s62 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s61 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s60 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 @@ -289,9 +311,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s62 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s61 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s60 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 @@ -302,51 +324,71 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-LABEL: kernel: ; GLOBALNESS0: ; %bb.0: ; %bb ; GLOBALNESS0-NEXT: s_mov_b64 s[36:37], s[6:7] -; GLOBALNESS0-NEXT: s_load_dwordx4 s[64:67], s[8:9], 0x0 +; GLOBALNESS0-NEXT: s_load_dwordx4 s[72:75], s[8:9], 0x0 ; GLOBALNESS0-NEXT: s_load_dword s6, s[8:9], 0x14 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v42, 0 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS0-NEXT: global_store_dword v[0:1], v42, off ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: global_load_dword v2, v42, s[64:65] -; GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GLOBALNESS0-NEXT: s_add_u32 s0, s0, s17 -; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0 +; GLOBALNESS0-NEXT: global_load_dword v2, v42, s[72:73] ; GLOBALNESS0-NEXT: s_mov_b64 s[40:41], s[4:5] ; GLOBALNESS0-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 ; GLOBALNESS0-NEXT: s_load_dword s7, s[8:9], 0x20 -; GLOBALNESS0-NEXT: s_bitcmp1_b32 s66, 0 -; GLOBALNESS0-NEXT: s_cselect_b64 s[68:69], -1, 0 +; GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GLOBALNESS0-NEXT: s_add_u32 s0, s0, s17 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0 -; GLOBALNESS0-NEXT: s_xor_b64 s[70:71], s[68:69], -1 +; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x40994400 -; GLOBALNESS0-NEXT: s_bitcmp1_b32 s6, 0 +; GLOBALNESS0-NEXT: s_bitcmp1_b32 s74, 0 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[42:43], s[4:5], 
v[0:1] -; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[64:65], s[4:5], 0 +; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e32 vcc, s[4:5], v[0:1] +; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] +; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GLOBALNESS0-NEXT: s_bitcmp1_b32 s6, 0 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v0 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS0-NEXT: s_xor_b64 s[72:73], s[4:5], -1 +; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s7, 0 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[48:49], 1, v0 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS0-NEXT: s_mov_b32 s58, s16 +; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v0 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v1 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[46:47], 1, v3 +; GLOBALNESS0-NEXT: s_mov_b32 s68, s16 ; GLOBALNESS0-NEXT: s_mov_b64 s[38:39], s[8:9] -; GLOBALNESS0-NEXT: s_mov_b32 s59, s15 -; GLOBALNESS0-NEXT: s_mov_b32 s60, s14 +; GLOBALNESS0-NEXT: s_mov_b32 s69, s15 +; GLOBALNESS0-NEXT: s_mov_b32 s70, s14 ; GLOBALNESS0-NEXT: s_mov_b64 s[34:35], s[10:11] -; GLOBALNESS0-NEXT: s_xor_b64 s[74:75], s[4:5], -1 ; GLOBALNESS0-NEXT: s_mov_b32 s32, 0 ; GLOBALNESS0-NEXT: ; implicit-def: $vgpr44_vgpr45 ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[44:45], 0, v2 -; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[46:47], 1, v2 -; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[48:49], 1, v2 -; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[50:51], 0, v2 +; GLOBALNESS0-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GLOBALNESS0-NEXT: v_cmp_gt_i32_e32 vcc, 1, v2 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[54:55], 1, v0 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[56:57], 1, v1 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[58:59], 1, v3 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[60:61], 1, v2 ; GLOBALNESS0-NEXT: s_branch .LBB1_4 ; GLOBALNESS0-NEXT: .LBB1_1: ; %bb70.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[50:51] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[60:61] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_29 ; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow15 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -375,23 +417,23 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s60 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s59 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s58 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_waitcnt 
lgkmcnt(0) ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[68:69] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[46:47] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_9 ; GLOBALNESS0-NEXT: ; %bb.5: ; %NodeBlock ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_cmp_lt_i32 s67, 1 +; GLOBALNESS0-NEXT: s_cmp_lt_i32 s75, 1 ; GLOBALNESS0-NEXT: s_cbranch_scc1 .LBB1_7 ; GLOBALNESS0-NEXT: ; %bb.6: ; %LeafBlock12 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_cmp_lg_u32 s67, 1 +; GLOBALNESS0-NEXT: s_cmp_lg_u32 s75, 1 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], -1 ; GLOBALNESS0-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_8 @@ -401,7 +443,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GLOBALNESS0-NEXT: .LBB1_8: ; %LeafBlock ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_cmp_lg_u32 s67, 0 +; GLOBALNESS0-NEXT: s_cmp_lg_u32 s75, 0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], 0 ; GLOBALNESS0-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GLOBALNESS0-NEXT: .LBB1_9: ; %Flow25 @@ -413,15 +455,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GLOBALNESS0-NEXT: flat_load_dword v0, v[2:3] ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[52:53], 0, v0 +; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[62:63], 0, v0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x3ff00000 -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[76:77], s[52:53] +; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[76:77], s[62:63] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_26 ; GLOBALNESS0-NEXT: ; %bb.11: ; %bb33.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[2:3], off -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[44:45] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[54:55] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_13 ; GLOBALNESS0-NEXT: ; %bb.12: ; %bb39.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -433,70 +475,72 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46 ; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e64 s[54:55], 0, v[0:1] -; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[56:57], 0, v2 +; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[64:65], 0, v2 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0 ; GLOBALNESS0-NEXT: s_branch .LBB1_16 ; GLOBALNESS0-NEXT: .LBB1_14: ; %Flow16 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[4:5] ; GLOBALNESS0-NEXT: .LBB1_15: ; %bb63.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[74:75] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[52:53] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_25 ; GLOBALNESS0-NEXT: .LBB1_16: ; %bb44.i ; GLOBALNESS0-NEXT: ; Parent Loop BB1_4 Depth=1 ; GLOBALNESS0-NEXT: ; => This Inner Loop Header: Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[70:71] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[48:49] ; GLOBALNESS0-NEXT: 
s_cbranch_vccnz .LBB1_15 ; GLOBALNESS0-NEXT: ; %bb.17: ; %bb46.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[72:73] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[50:51] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS0-NEXT: ; %bb.18: ; %bb50.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[42:43] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[42:43] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS0-NEXT: ; %bb.19: ; %bb3.i.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[64:65] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[44:45] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS0-NEXT: ; %bb.20: ; %bb6.i.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[54:55] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[66:67] ; GLOBALNESS0-NEXT: .LBB1_21: ; %spam.exit.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[46:47] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[56:57] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS0-NEXT: ; %bb.22: ; %bb55.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_add_u32 s62, s38, 40 -; GLOBALNESS0-NEXT: s_addc_u32 s63, s39, 0 +; GLOBALNESS0-NEXT: s_add_u32 s72, s38, 40 +; GLOBALNESS0-NEXT: s_addc_u32 s73, s39, 0 ; GLOBALNESS0-NEXT: s_getpc_b64 s[4:5] ; GLOBALNESS0-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 ; GLOBALNESS0-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12 ; GLOBALNESS0-NEXT: s_load_dwordx2 s[78:79], s[4:5], 0x0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] -; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[62:63] +; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[72:73] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s60 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s59 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s58 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79] ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[46:47], 0, 0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] -; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[62:63] +; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[72:73] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s60 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s59 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s58 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[44:45], off ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79] -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[56:57] +; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[64:65] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_14 ; GLOBALNESS0-NEXT: ; %bb.23: ; %bb62.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 @@ -513,11 +557,11 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: .LBB1_26: ; %Flow24 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[76:77] -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[52:53] +; 
GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[62:63] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_2 ; GLOBALNESS0-NEXT: ; %bb.27: ; %bb67.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[48:49] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[58:59] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS0-NEXT: ; %bb.28: ; %bb69.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -541,9 +585,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s60 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s59 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s58 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 @@ -559,9 +603,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s60 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s59 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s58 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index 089f3d255e87a..db7d816386a70 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -1187,8 +1187,8 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: .LBB10_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 diff --git a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll index 945b87b6e1628..0acee5bd5ac19 100644 --- a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll +++ b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll @@ -52,10 +52,13 @@ define amdgpu_ps float @valley_partially_undef_copy() #0 { ; CHECK-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], 0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; CHECK-NEXT: s_waitcnt expcnt(1) +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1 ; CHECK-NEXT: .LBB1_1: ; %bb9 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] ; CHECK-NEXT: s_cbranch_vccnz .LBB1_1 ; CHECK-NEXT: ; %bb.2: ; %bb11 ; CHECK-NEXT: s_mov_b32 s3, 0xf000 diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index bc7b375cd404d..a794d139063d5 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll 
@@ -1294,8 +1294,8 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_addc_u32_e64 v7, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: .LBB9_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll index 41d324dd7abfc..0211c5111c31d 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll @@ -6,15 +6,15 @@ define void @vgpr_descriptor_waterfall_loop_idom_update(ptr %arg) #0 { ; GCN-LABEL: vgpr_descriptor_waterfall_loop_idom_update: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_add_co_u32 v6, vcc_lo, v0, 8 +; GCN-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo ; GCN-NEXT: .LBB0_1: ; %bb0 ; GCN-NEXT: ; =>This Loop Header: Depth=1 ; GCN-NEXT: ; Child Loop BB0_2 Depth 2 -; GCN-NEXT: v_add_co_u32 v6, vcc_lo, v0, 8 -; GCN-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo -; GCN-NEXT: s_mov_b32 s5, exec_lo ; GCN-NEXT: s_clause 0x1 ; GCN-NEXT: flat_load_dwordx2 v[4:5], v[6:7] ; GCN-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN-NEXT: s_mov_b32 s5, exec_lo ; GCN-NEXT: .LBB0_2: ; Parent Loop BB0_1 Depth=1 ; GCN-NEXT: ; => This Inner Loop Header: Depth=2 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/LoongArch/jr-without-ra.ll b/llvm/test/CodeGen/LoongArch/jr-without-ra.ll index 2bd89dacb2b37..d1c4459aaa6ee 100644 --- a/llvm/test/CodeGen/LoongArch/jr-without-ra.ll +++ b/llvm/test/CodeGen/LoongArch/jr-without-ra.ll @@ -20,101 +20,101 @@ define void @jr_without_ra(ptr %rtwdev, ptr %chan, ptr %h2c, i8 %.pre, i1 %cmp.i ; CHECK-NEXT: st.d $s6, $sp, 24 # 8-byte Folded Spill ; CHECK-NEXT: st.d $s7, $sp, 16 # 8-byte Folded Spill ; CHECK-NEXT: st.d $s8, $sp, 8 # 8-byte Folded Spill -; CHECK-NEXT: move $s6, $zero -; CHECK-NEXT: move $s1, $zero +; CHECK-NEXT: move $s7, $zero +; CHECK-NEXT: move $s0, $zero ; CHECK-NEXT: ld.d $t0, $sp, 184 -; CHECK-NEXT: ld.d $t1, $sp, 176 -; CHECK-NEXT: ld.d $s2, $sp, 168 -; CHECK-NEXT: ld.d $t2, $sp, 160 -; CHECK-NEXT: ld.d $t3, $sp, 152 -; CHECK-NEXT: ld.d $t4, $sp, 144 -; CHECK-NEXT: ld.d $t5, $sp, 136 -; CHECK-NEXT: ld.d $t6, $sp, 128 -; CHECK-NEXT: ld.d $t7, $sp, 120 -; CHECK-NEXT: ld.d $t8, $sp, 112 -; CHECK-NEXT: ld.d $fp, $sp, 104 -; CHECK-NEXT: ld.d $s0, $sp, 96 +; CHECK-NEXT: ld.d $s2, $sp, 176 +; CHECK-NEXT: ld.d $s1, $sp, 168 +; CHECK-NEXT: ld.d $t1, $sp, 160 +; CHECK-NEXT: ld.d $t2, $sp, 152 +; CHECK-NEXT: ld.d $t3, $sp, 144 +; CHECK-NEXT: ld.d $t4, $sp, 136 +; CHECK-NEXT: ld.d $t5, $sp, 128 +; CHECK-NEXT: ld.d $t6, $sp, 120 +; CHECK-NEXT: ld.d $t7, $sp, 112 +; CHECK-NEXT: ld.d $t8, $sp, 104 +; CHECK-NEXT: ld.d $fp, $sp, 96 ; CHECK-NEXT: andi $a4, $a4, 1 -; CHECK-NEXT: alsl.d $a6, $a6, $s2, 4 -; CHECK-NEXT: pcalau12i $s2, %pc_hi20(.LJTI0_0) -; CHECK-NEXT: addi.d $s2, $s2, %pc_lo12(.LJTI0_0) +; CHECK-NEXT: alsl.d $a6, $a6, $s1, 4 +; CHECK-NEXT: pcalau12i $s1, %pc_hi20(.LJTI0_0) +; CHECK-NEXT: addi.d $s1, $s1, %pc_lo12(.LJTI0_0) +; CHECK-NEXT: slli.d $s3, $s2, 2 +; CHECK-NEXT: alsl.d $s2, $s2, $s3, 1 +; CHECK-NEXT: add.d $s2, $t5, $s2 +; CHECK-NEXT: addi.w $s4, $zero, -41 ; CHECK-NEXT: ori $s3, 
$zero, 1 -; CHECK-NEXT: ori $s4, $zero, 50 -; CHECK-NEXT: ori $s5, $zero, 3 -; CHECK-NEXT: lu32i.d $s5, 262144 +; CHECK-NEXT: slli.d $s4, $s4, 3 +; CHECK-NEXT: ori $s6, $zero, 3 +; CHECK-NEXT: lu32i.d $s6, 262144 ; CHECK-NEXT: b .LBB0_4 ; CHECK-NEXT: .p2align 4, , 16 ; CHECK-NEXT: .LBB0_1: # %sw.bb27.i.i ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: ori $s7, $zero, 1 +; CHECK-NEXT: ori $s8, $zero, 1 ; CHECK-NEXT: .LBB0_2: # %if.else.i106 ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: alsl.d $s8, $s1, $s1, 3 -; CHECK-NEXT: alsl.d $s1, $s8, $s1, 1 -; CHECK-NEXT: add.d $s1, $t0, $s1 -; CHECK-NEXT: ldx.bu $s7, $s1, $s7 +; CHECK-NEXT: alsl.d $s5, $s0, $s0, 3 +; CHECK-NEXT: alsl.d $s0, $s5, $s0, 1 +; CHECK-NEXT: add.d $s0, $t0, $s0 +; CHECK-NEXT: ldx.bu $s8, $s0, $s8 ; CHECK-NEXT: .LBB0_3: # %phy_tssi_get_ofdm_de.exit ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: st.b $zero, $t6, 0 -; CHECK-NEXT: st.b $s6, $t4, 0 -; CHECK-NEXT: st.b $zero, $fp, 0 -; CHECK-NEXT: st.b $zero, $t2, 0 +; CHECK-NEXT: st.b $zero, $t5, 0 +; CHECK-NEXT: st.b $s7, $t3, 0 +; CHECK-NEXT: st.b $zero, $t8, 0 +; CHECK-NEXT: st.b $zero, $t1, 0 ; CHECK-NEXT: st.b $zero, $a1, 0 -; CHECK-NEXT: st.b $zero, $t3, 0 -; CHECK-NEXT: st.b $s7, $a5, 0 -; CHECK-NEXT: ori $s1, $zero, 1 -; CHECK-NEXT: move $s6, $a3 +; CHECK-NEXT: st.b $zero, $t2, 0 +; CHECK-NEXT: st.b $s8, $a5, 0 +; CHECK-NEXT: ori $s0, $zero, 1 +; CHECK-NEXT: move $s7, $a3 ; CHECK-NEXT: .LBB0_4: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: beqz $a4, .LBB0_9 ; CHECK-NEXT: # %bb.5: # %calc_6g.i ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: move $s6, $zero +; CHECK-NEXT: move $s7, $zero ; CHECK-NEXT: bnez $zero, .LBB0_8 ; CHECK-NEXT: # %bb.6: # %calc_6g.i ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: slli.d $s7, $zero, 3 -; CHECK-NEXT: ldx.d $s7, $s7, $s2 -; CHECK-NEXT: jr $s7 +; CHECK-NEXT: slli.d $s8, $zero, 3 +; CHECK-NEXT: ldx.d $s8, $s8, $s1 +; CHECK-NEXT: jr $s8 ; CHECK-NEXT: .LBB0_7: # %sw.bb12.i.i ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: ori $s6, $zero, 1 +; CHECK-NEXT: ori $s7, $zero, 1 ; CHECK-NEXT: .LBB0_8: # %if.else58.i ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: ldx.bu $s6, $a6, $s6 +; CHECK-NEXT: ldx.bu $s7, $a6, $s7 ; CHECK-NEXT: b .LBB0_11 ; CHECK-NEXT: .p2align 4, , 16 ; CHECK-NEXT: .LBB0_9: # %if.end.i ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: andi $s6, $s6, 255 -; CHECK-NEXT: bltu $s4, $s6, .LBB0_15 +; CHECK-NEXT: andi $s7, $s7, 255 +; CHECK-NEXT: ori $s5, $zero, 50 +; CHECK-NEXT: bltu $s5, $s7, .LBB0_15 ; CHECK-NEXT: # %bb.10: # %if.end.i ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: sll.d $s6, $s3, $s6 -; CHECK-NEXT: and $s7, $s6, $s5 -; CHECK-NEXT: move $s6, $s0 -; CHECK-NEXT: beqz $s7, .LBB0_15 +; CHECK-NEXT: sll.d $s7, $s3, $s7 +; CHECK-NEXT: and $s8, $s7, $s6 +; CHECK-NEXT: move $s7, $fp +; CHECK-NEXT: beqz $s8, .LBB0_15 ; CHECK-NEXT: .LBB0_11: # %phy_tssi_get_ofdm_trim_de.exit ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: move $s7, $zero -; CHECK-NEXT: st.b $zero, $t8, 0 -; CHECK-NEXT: slli.d $s8, $t1, 2 -; CHECK-NEXT: alsl.d $s8, $t1, $s8, 1 -; CHECK-NEXT: add.d $s8, $t6, $s8 -; CHECK-NEXT: ldx.b $s8, $s8, $t5 +; CHECK-NEXT: move $s8, $zero +; CHECK-NEXT: st.b $zero, $t7, 0 +; CHECK-NEXT: ldx.b $ra, $s2, $t4 ; CHECK-NEXT: st.b $zero, $a2, 0 ; CHECK-NEXT: st.b $zero, $a7, 0 -; CHECK-NEXT: st.b $zero, $t7, 0 -; CHECK-NEXT: st.b $s8, $a0, 0 +; CHECK-NEXT: st.b 
$zero, $t6, 0 +; CHECK-NEXT: st.b $ra, $a0, 0 ; CHECK-NEXT: bnez $s3, .LBB0_13 ; CHECK-NEXT: # %bb.12: # %phy_tssi_get_ofdm_trim_de.exit ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: addi.w $s8, $zero, -41 -; CHECK-NEXT: slli.d $s8, $s8, 3 ; CHECK-NEXT: pcalau12i $ra, %pc_hi20(.LJTI0_1) ; CHECK-NEXT: addi.d $ra, $ra, %pc_lo12(.LJTI0_1) -; CHECK-NEXT: ldx.d $s8, $s8, $ra -; CHECK-NEXT: jr $s8 +; CHECK-NEXT: ldx.d $s5, $s4, $ra +; CHECK-NEXT: jr $s5 ; CHECK-NEXT: .LBB0_13: # %phy_tssi_get_ofdm_trim_de.exit ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 ; CHECK-NEXT: bnez $s3, .LBB0_1 diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll index ec2448cb3965f..c35f05be304cc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll @@ -489,9 +489,8 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_ ; RV64-NEXT: j .LBB0_11 ; RV64-NEXT: .LBB0_8: # %vector.ph ; RV64-NEXT: # in Loop: Header=BB0_6 Depth=1 -; RV64-NEXT: slli t6, t0, 1 -; RV64-NEXT: slli s0, t0, 28 -; RV64-NEXT: sub t6, s0, t6 +; RV64-NEXT: slli t6, t0, 28 +; RV64-NEXT: sub t6, t6, t1 ; RV64-NEXT: and t6, t6, a6 ; RV64-NEXT: csrwi vxrm, 0 ; RV64-NEXT: mv s0, a2 diff --git a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll index 6d082802f9cd7..d076cb00ad7e0 100644 --- a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll +++ b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll @@ -353,8 +353,8 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: mov lr, r0 @@ -364,48 +364,50 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: @ %bb.1: @ %for.cond2.preheader.lr.ph ; CHECK-NEXT: movs r0, #1 ; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: csel r3, r2, r0, lt +; CHECK-NEXT: csel r7, r2, r0, lt ; CHECK-NEXT: mov r12, r1 -; CHECK-NEXT: mov r1, r3 -; CHECK-NEXT: cmp r3, #3 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: cmp r7, #3 ; CHECK-NEXT: it ls ; CHECK-NEXT: movls r1, #3 ; CHECK-NEXT: mov r4, r2 -; CHECK-NEXT: subs r1, r1, r3 +; CHECK-NEXT: subs r1, r1, r7 ; CHECK-NEXT: movw r2, #43691 ; CHECK-NEXT: adds r1, #2 ; CHECK-NEXT: movt r2, #43690 -; CHECK-NEXT: ldr r6, [sp, #112] -; CHECK-NEXT: movw r9, :lower16:c +; CHECK-NEXT: ldr r6, [sp, #128] +; CHECK-NEXT: movw r8, :lower16:c ; CHECK-NEXT: umull r1, r2, r1, r2 -; CHECK-NEXT: adr.w r8, .LCPI1_1 +; CHECK-NEXT: movt r8, :upper16:c ; CHECK-NEXT: movs r1, #4 +; CHECK-NEXT: @ implicit-def: $r10 ; CHECK-NEXT: @ implicit-def: $r5 ; CHECK-NEXT: @ implicit-def: $r11 -; CHECK-NEXT: @ implicit-def: $r7 -; CHECK-NEXT: movt r9, :upper16:c -; CHECK-NEXT: mov.w r10, #12 -; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: mov.w r9, #12 +; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: add.w r1, r1, r2, lsr #1 ; CHECK-NEXT: add.w r0, r0, r2, lsr #1 -; CHECK-NEXT: bic r2, r1, #3 +; CHECK-NEXT: bic r3, r1, #3 ; CHECK-NEXT: adr r1, .LCPI1_0 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vdup.32 q5, r0 +; CHECK-NEXT: adr r1, .LCPI1_1 +; CHECK-NEXT: vldrw.u32 q5, [r1] ; CHECK-NEXT: 
vdup.32 q6, r0 -; CHECK-NEXT: strd r2, r4, [sp, #4] @ 8-byte Folded Spill -; CHECK-NEXT: vadd.i32 q4, q0, r3 +; CHECK-NEXT: vadd.i32 q4, q0, r7 +; CHECK-NEXT: vdup.32 q7, r0 +; CHECK-NEXT: strd r3, r7, [sp, #4] @ 8-byte Folded Spill ; CHECK-NEXT: b .LBB1_6 ; CHECK-NEXT: .LBB1_2: @ %for.body6.preheader ; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: cmn.w r7, #4 +; CHECK-NEXT: mov r0, r11 +; CHECK-NEXT: cmn.w r11, #4 ; CHECK-NEXT: it le ; CHECK-NEXT: mvnle r0, #3 ; CHECK-NEXT: movw r2, #18725 ; CHECK-NEXT: adds r0, #6 ; CHECK-NEXT: movt r2, #9362 -; CHECK-NEXT: subs r1, r0, r7 +; CHECK-NEXT: sub.w r1, r0, r11 +; CHECK-NEXT: mov r10, r3 ; CHECK-NEXT: umull r2, r3, r1, r2 ; CHECK-NEXT: subs r2, r1, r3 ; CHECK-NEXT: add.w r2, r3, r2, lsr #1 @@ -413,18 +415,19 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: lsls r3, r3, #3 ; CHECK-NEXT: sub.w r2, r3, r2, lsr #2 ; CHECK-NEXT: subs r1, r2, r1 +; CHECK-NEXT: mov r3, r10 ; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: .LBB1_3: @ %for.cond.cleanup5.loopexit134.split.loop.exit139 ; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: adds r7, r0, #7 +; CHECK-NEXT: add.w r11, r0, #7 ; CHECK-NEXT: .LBB1_4: @ %for.cond.cleanup5 ; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: mov.w r10, #0 ; CHECK-NEXT: .LBB1_5: @ %for.cond.cleanup5 ; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: add.w r11, r11, #2 -; CHECK-NEXT: subs.w r1, r11, lr -; CHECK-NEXT: asr.w r0, r11, #31 +; CHECK-NEXT: adds r5, #2 +; CHECK-NEXT: subs.w r1, r5, lr +; CHECK-NEXT: asr.w r0, r5, #31 ; CHECK-NEXT: sbcs.w r0, r0, r12 ; CHECK-NEXT: bge.w .LBB1_28 ; CHECK-NEXT: .LBB1_6: @ %for.cond2.preheader @@ -433,35 +436,36 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: @ Child Loop BB1_10 Depth 2 ; CHECK-NEXT: @ Child Loop BB1_12 Depth 3 ; CHECK-NEXT: @ Child Loop BB1_14 Depth 3 -; CHECK-NEXT: cmp r7, #2 +; CHECK-NEXT: cmp.w r11, #2 ; CHECK-NEXT: bgt .LBB1_5 ; CHECK-NEXT: @ %bb.7: @ %for.body6.lr.ph ; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: cmp r0, #5 +; CHECK-NEXT: cmp r7, #5 ; CHECK-NEXT: bhi .LBB1_17 ; CHECK-NEXT: @ %bb.8: @ %for.body6.us.preheader ; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: ldrd r2, r3, [sp, #104] +; CHECK-NEXT: ldrd r2, r3, [sp, #120] ; CHECK-NEXT: movs r0, #32 ; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: mov r6, r12 -; CHECK-NEXT: mov r4, lr +; CHECK-NEXT: mov r4, r6 +; CHECK-NEXT: mov r7, r12 +; CHECK-NEXT: mov r6, lr ; CHECK-NEXT: bl __aeabi_ldivmod -; CHECK-NEXT: mov lr, r4 -; CHECK-NEXT: mov r12, r6 -; CHECK-NEXT: ldr r4, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: vdup.32 q0, r2 -; CHECK-NEXT: ldr r6, [sp, #112] -; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov lr, r6 +; CHECK-NEXT: mov r6, r4 +; CHECK-NEXT: mov r12, r7 ; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr r4, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: vdup.32 q0, r2 +; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: mov r0, r11 ; CHECK-NEXT: b .LBB1_10 ; CHECK-NEXT: .LBB1_9: @ %for.cond.cleanup17.us ; CHECK-NEXT: @ in Loop: Header=BB1_10 Depth=2 -; CHECK-NEXT: adds r7, r0, #7 +; CHECK-NEXT: add.w r11, r0, #7 ; CHECK-NEXT: cmn.w r0, #4 -; CHECK-NEXT: mov.w r5, #0 -; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov.w r10, #0 +; CHECK-NEXT: mov r0, r11 ; CHECK-NEXT: bge .LBB1_5 ; CHECK-NEXT: .LBB1_10: @ %for.body6.us ; CHECK-NEXT: @ Parent Loop BB1_6 Depth=1 @@ -484,14 +488,13 @@ define i32 @d(i64 
%e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: @ Parent Loop BB1_6 Depth=1 ; CHECK-NEXT: @ Parent Loop BB1_10 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: vldrw.u32 q2, [r8] -; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vqadd.u32 q2, q2, r1 -; CHECK-NEXT: add.w r1, r1, #4 -; CHECK-NEXT: vcmp.u32 hi, q6, q2 +; CHECK-NEXT: vqadd.u32 q2, q5, r1 +; CHECK-NEXT: adds r1, #4 +; CHECK-NEXT: vcmp.u32 hi, q7, q2 ; CHECK-NEXT: vshl.i32 q2, q1, #2 -; CHECK-NEXT: vadd.i32 q2, q2, r9 -; CHECK-NEXT: vadd.i32 q1, q1, r10 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vadd.i32 q2, q2, r8 +; CHECK-NEXT: vadd.i32 q1, q1, r9 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrwt.32 q0, [q2] ; CHECK-NEXT: bne .LBB1_12 @@ -504,14 +507,13 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: @ Parent Loop BB1_6 Depth=1 ; CHECK-NEXT: @ Parent Loop BB1_10 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: vldrw.u32 q2, [r8] -; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vqadd.u32 q2, q2, r1 -; CHECK-NEXT: add.w r1, r1, #4 -; CHECK-NEXT: vcmp.u32 hi, q5, q2 +; CHECK-NEXT: vqadd.u32 q2, q5, r1 +; CHECK-NEXT: adds r1, #4 +; CHECK-NEXT: vcmp.u32 hi, q6, q2 ; CHECK-NEXT: vshl.i32 q2, q1, #2 -; CHECK-NEXT: vadd.i32 q2, q2, r9 -; CHECK-NEXT: vadd.i32 q1, q1, r10 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vadd.i32 q2, q2, r8 +; CHECK-NEXT: vadd.i32 q1, q1, r9 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrwt.32 q0, [q2] ; CHECK-NEXT: bne .LBB1_14 @@ -521,7 +523,7 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: beq .LBB1_9 ; CHECK-NEXT: @ %bb.16: @ %for.cond9.for.cond15.preheader_crit_edge.us ; CHECK-NEXT: @ in Loop: Header=BB1_10 Depth=2 -; CHECK-NEXT: eor r1, r5, #1 +; CHECK-NEXT: eor r1, r10, #1 ; CHECK-NEXT: lsls r1, r1, #31 ; CHECK-NEXT: bne .LBB1_9 ; CHECK-NEXT: b .LBB1_26 @@ -530,11 +532,11 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: beq.w .LBB1_2 ; CHECK-NEXT: @ %bb.18: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r0, r11 ; CHECK-NEXT: .LBB1_19: @ %for.body6.us60 ; CHECK-NEXT: @ Parent Loop BB1_6 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: lsls r1, r5, #31 +; CHECK-NEXT: lsls.w r1, r10, #31 ; CHECK-NEXT: bne .LBB1_27 ; CHECK-NEXT: @ %bb.20: @ %for.cond.cleanup17.us63 ; CHECK-NEXT: @ in Loop: Header=BB1_19 Depth=2 @@ -550,19 +552,19 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: bgt .LBB1_25 ; CHECK-NEXT: @ %bb.23: @ %for.cond.cleanup17.us63.3 ; CHECK-NEXT: @ in Loop: Header=BB1_19 Depth=2 -; CHECK-NEXT: add.w r7, r0, #28 +; CHECK-NEXT: add.w r11, r0, #28 ; CHECK-NEXT: cmn.w r0, #25 -; CHECK-NEXT: mov.w r5, #0 -; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov.w r10, #0 +; CHECK-NEXT: mov r0, r11 ; CHECK-NEXT: blt .LBB1_19 ; CHECK-NEXT: b .LBB1_5 ; CHECK-NEXT: .LBB1_24: @ %for.cond.cleanup5.loopexit134.split.loop.exit137 ; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: add.w r7, r0, #14 +; CHECK-NEXT: add.w r11, r0, #14 ; CHECK-NEXT: b .LBB1_4 ; CHECK-NEXT: .LBB1_25: @ %for.cond.cleanup5.loopexit134.split.loop.exit135 ; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: add.w r7, r0, #21 +; CHECK-NEXT: add.w r11, r0, #21 ; CHECK-NEXT: b .LBB1_4 ; CHECK-NEXT: .LBB1_26: @ %for.inc19.us ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -572,7 +574,7 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: b .LBB1_27 ; CHECK-NEXT: .LBB1_28: @ %for.cond.cleanup ; CHECK-NEXT: add sp, #16 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} 
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll index 83ea44704e63f..e63c62574dafb 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll @@ -594,71 +594,71 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(ptr noalias nocapture reado ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: .pad #28 +; CHECK-NEXT: sub sp, #28 ; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: strd r1, r2, [sp, #8] @ 8-byte Folded Spill +; CHECK-NEXT: strd r1, r2, [sp, #4] @ 8-byte Folded Spill ; CHECK-NEXT: blt .LBB13_5 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: mov r9, r0 +; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: mov.w r10, #8 +; CHECK-NEXT: add r2, sp, #12 +; CHECK-NEXT: mov.w r9, #8 ; CHECK-NEXT: bic r1, r1, #7 -; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: sub.w r7, r1, #8 -; CHECK-NEXT: add.w r0, r6, r7, lsr #3 -; CHECK-NEXT: str r0, [sp] @ 4-byte Spill -; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: str r1, [sp] @ 4-byte Spill +; CHECK-NEXT: sub.w r3, r1, #8 +; CHECK-NEXT: add.w r8, r6, r3, lsr #3 +; CHECK-NEXT: adr r3, .LCPI13_0 +; CHECK-NEXT: vldrw.u32 q0, [r3] ; CHECK-NEXT: .LBB13_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB13_3 Depth 2 -; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload -; CHECK-NEXT: dls lr, r1 -; CHECK-NEXT: adr r1, .LCPI13_0 -; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: dls lr, r8 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: .LBB13_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB13_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: vadd.i16 q0, q0, r10 -; CHECK-NEXT: vldrh.s32 q1, [r0, #8] -; CHECK-NEXT: vshl.i32 q1, q1, #1 -; CHECK-NEXT: vadd.i32 q1, q1, r9 -; CHECK-NEXT: vmov r3, r6, d3 -; CHECK-NEXT: vmov r5, r4, d2 -; CHECK-NEXT: vldrh.s32 q1, [r0] -; CHECK-NEXT: vshl.i32 q1, q1, #1 -; CHECK-NEXT: vadd.i32 q1, q1, r9 -; CHECK-NEXT: vmov r12, r11, d3 -; CHECK-NEXT: ldrh.w r8, [r6] -; CHECK-NEXT: vmov r2, r6, d2 +; CHECK-NEXT: vstrw.32 q1, [r2] +; CHECK-NEXT: mov r12, r2 +; CHECK-NEXT: vldrh.s32 q2, [r2, #8] +; CHECK-NEXT: vadd.i16 q1, q1, r9 +; CHECK-NEXT: vshl.i32 q2, q2, #1 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vmov r7, r5, d5 +; CHECK-NEXT: vmov r3, r4, d4 +; CHECK-NEXT: vldrh.s32 q2, [r2] +; CHECK-NEXT: vshl.i32 q2, q2, #1 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vmov r1, r10, d5 +; CHECK-NEXT: ldrh r7, [r7] ; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: ldrh.w r2, [r10] +; CHECK-NEXT: ldrh.w r10, [r3] +; CHECK-NEXT: vmov r3, r11, d4 +; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: ldrh.w r1, [r11] -; CHECK-NEXT: ldrh.w r11, [r5] -; CHECK-NEXT: ldrh.w r5, [r12] -; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: ldrh r6, [r6] -; CHECK-NEXT: vmov.16 q1[0], r2 -; CHECK-NEXT: vmov.16 q1[1], r6 -; CHECK-NEXT: vmov.16 q1[2], r5 -; CHECK-NEXT: vmov.16 q1[3], r1 -; 
CHECK-NEXT: vmov.16 q1[4], r11 -; CHECK-NEXT: vmov.16 q1[5], r4 -; CHECK-NEXT: vmov.16 q1[6], r3 -; CHECK-NEXT: vmov.16 q1[7], r8 -; CHECK-NEXT: vstrb.8 q1, [r7], #16 +; CHECK-NEXT: ldrh.w r11, [r11] +; CHECK-NEXT: vmov.16 q2[0], r3 +; CHECK-NEXT: vmov.16 q2[1], r11 +; CHECK-NEXT: vmov.16 q2[2], r1 +; CHECK-NEXT: vmov.16 q2[3], r2 +; CHECK-NEXT: mov r2, r12 +; CHECK-NEXT: vmov.16 q2[4], r10 +; CHECK-NEXT: vmov.16 q2[5], r4 +; CHECK-NEXT: vmov.16 q2[6], r7 +; CHECK-NEXT: vmov.16 q2[7], r5 +; CHECK-NEXT: vstrb.8 q2, [r6], #16 ; CHECK-NEXT: le lr, .LBB13_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB13_2 Depth=1 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: cmp r2, r1 +; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r3, [sp] @ 4-byte Reload +; CHECK-NEXT: cmp r3, r1 ; CHECK-NEXT: bne .LBB13_2 ; CHECK-NEXT: .LBB13_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #32 +; CHECK-NEXT: add sp, #28 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.6: @@ -711,144 +711,145 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #88 -; CHECK-NEXT: sub sp, #88 +; CHECK-NEXT: .pad #136 +; CHECK-NEXT: sub sp, #136 ; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: strd r1, r2, [sp, #8] @ 8-byte Folded Spill +; CHECK-NEXT: strd r1, r2, [sp, #64] @ 8-byte Folded Spill ; CHECK-NEXT: blt.w .LBB14_5 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: add r4, sp, #72 -; CHECK-NEXT: add r7, sp, #40 -; CHECK-NEXT: add r5, sp, #56 +; CHECK-NEXT: ldr r1, [sp, #68] @ 4-byte Reload +; CHECK-NEXT: adr r3, .LCPI14_2 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: movs r2, #1 ; CHECK-NEXT: bic r1, r1, #7 ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: subs r1, #8 -; CHECK-NEXT: vmov.i16 q6, #0x18 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: add.w r1, r3, r1, lsr #3 -; CHECK-NEXT: str r1, [sp] @ 4-byte Spill +; CHECK-NEXT: vstrw.32 q0, [sp, #40] @ 16-byte Spill +; CHECK-NEXT: vmov.i16 q2, #0x18 +; CHECK-NEXT: add.w r1, r2, r1, lsr #3 +; CHECK-NEXT: str r1, [sp, #60] @ 4-byte Spill +; CHECK-NEXT: adr r1, .LCPI14_0 +; CHECK-NEXT: adr r2, .LCPI14_1 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q0, [sp, #24] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: add r2, sp, #120 +; CHECK-NEXT: vstrw.32 q0, [sp, #8] @ 16-byte Spill ; CHECK-NEXT: .LBB14_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB14_3 Depth 2 -; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload +; CHECK-NEXT: add.w r10, sp, #104 ; CHECK-NEXT: dls lr, r1 -; CHECK-NEXT: adr r1, .LCPI14_2 -; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: adr r1, .LCPI14_0 -; CHECK-NEXT: vldrw.u32 q2, [r1] -; CHECK-NEXT: adr r1, .LCPI14_1 -; CHECK-NEXT: ldr.w r12, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: vldrw.u32 q3, [r1] +; CHECK-NEXT: ldr r7, [sp, #64] @ 4-byte Reload +; CHECK-NEXT: vldrw.u32 q4, [sp, #24] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q5, [sp, #40] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q6, [sp, #8] @ 16-byte Reload ; CHECK-NEXT: .LBB14_3: @ 
%vector.body ; CHECK-NEXT: @ Parent Loop BB14_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vstrw.32 q1, [r4] -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: vldrh.s32 q0, [r4, #8] -; CHECK-NEXT: mov r11, r4 -; CHECK-NEXT: mov r5, r7 +; CHECK-NEXT: vstrw.32 q5, [r2] +; CHECK-NEXT: mov r8, r2 +; CHECK-NEXT: vldrh.s32 q0, [r2, #8] ; CHECK-NEXT: vshl.i32 q0, q0, #1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: vmov r6, r10, d1 -; CHECK-NEXT: vldrh.s32 q0, [r4] +; CHECK-NEXT: vmov r1, r3, d0 +; CHECK-NEXT: vmov r4, r5, d1 +; CHECK-NEXT: vldrh.s32 q0, [r2] ; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vadd.i32 q6, q0, r0 -; CHECK-NEXT: vmov r7, r4, d12 -; CHECK-NEXT: ldrh.w r9, [r2] -; CHECK-NEXT: ldrh.w r2, [r10] -; CHECK-NEXT: str r2, [sp, #36] @ 4-byte Spill -; CHECK-NEXT: ldrh.w r8, [r3] -; CHECK-NEXT: ldrh r3, [r6] -; CHECK-NEXT: ldrh r2, [r7] -; CHECK-NEXT: mov r7, r5 -; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: vstrw.32 q3, [r7] -; CHECK-NEXT: vldrh.s32 q0, [r7] -; CHECK-NEXT: vmov.16 q4[0], r2 -; CHECK-NEXT: vmov.16 q4[1], r4 -; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: vadd.i32 q2, q0, r0 +; CHECK-NEXT: vmov r6, r2, d4 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: ldrh.w r12, [r4] +; CHECK-NEXT: add r4, sp, #88 +; CHECK-NEXT: ldrh.w r11, [r5] +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: ldrh r5, [r6] +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: vstrw.32 q6, [r4] +; CHECK-NEXT: vldrh.s32 q0, [r4] +; CHECK-NEXT: vmov.16 q7[0], r5 +; CHECK-NEXT: vmov.16 q7[1], r2 ; CHECK-NEXT: vshl.i32 q0, q0, #1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r4, r6, d0 -; CHECK-NEXT: vmov r1, r2, d1 -; CHECK-NEXT: vldrh.s32 q0, [r7, #8] +; CHECK-NEXT: vmov r6, r9, d0 +; CHECK-NEXT: vmov r2, r5, d1 +; CHECK-NEXT: vldrh.s32 q0, [r4, #8] ; CHECK-NEXT: vshl.i32 q0, q0, #1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vmov.16 q5[0], r4 -; CHECK-NEXT: ldrh r4, [r6] +; CHECK-NEXT: ldrh r6, [r6] ; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: vmov.16 q5[1], r4 -; CHECK-NEXT: vmov.16 q5[2], r1 -; CHECK-NEXT: vmov r1, r4, d0 -; CHECK-NEXT: vmov.16 q5[3], r2 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: vmov.16 q5[4], r1 -; CHECK-NEXT: vmov r1, r2, d1 -; CHECK-NEXT: vmov.16 q5[5], r4 -; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q1[0], r6 +; CHECK-NEXT: ldrh.w r6, [r9] +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: vmov.16 q1[1], r6 +; CHECK-NEXT: vmov.16 q1[2], r2 +; CHECK-NEXT: vmov r2, r6, d0 +; CHECK-NEXT: vmov.16 q1[3], r5 ; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: vstrw.32 q2, [r5] -; CHECK-NEXT: vldrh.s32 q0, [r5] -; CHECK-NEXT: vmov.16 q5[6], r1 -; CHECK-NEXT: vmov.16 q5[7], r2 +; CHECK-NEXT: ldrh r6, [r6] +; CHECK-NEXT: vmov.16 q1[4], r2 +; CHECK-NEXT: vmov r2, r5, d1 +; CHECK-NEXT: vmov.16 q1[5], r6 +; CHECK-NEXT: mov r6, r10 +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: vstrw.32 q4, [r10] +; CHECK-NEXT: vldrh.s32 q0, [r6] +; CHECK-NEXT: vmov.16 q1[6], r2 +; CHECK-NEXT: vmov.16 q1[7], r5 ; CHECK-NEXT: vshl.i32 q0, q0, #1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r1, r2, d0 -; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov r2, r5, d0 ; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: vmov.16 q7[0], r1 -; CHECK-NEXT: vmov.16 q7[1], r2 -; CHECK-NEXT: vmov r1, r2, d13 -; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vadd.i16 q3, q3, q6 -; CHECK-NEXT: vadd.i16 q1, q1, q6 -; CHECK-NEXT: vadd.i16 q2, q2, q6 -; 
CHECK-NEXT: ldrh.w r10, [r2] +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: vmov.16 q3[0], r2 +; CHECK-NEXT: vmov.16 q3[1], r5 +; CHECK-NEXT: vmov r2, r5, d5 +; CHECK-NEXT: vldrw.u32 q2, [sp, #72] @ 16-byte Reload +; CHECK-NEXT: vadd.i16 q6, q6, q2 +; CHECK-NEXT: vadd.i16 q5, q5, q2 +; CHECK-NEXT: vadd.i16 q4, q4, q2 +; CHECK-NEXT: ldrh.w r9, [r2] ; CHECK-NEXT: vmov r2, r4, d1 -; CHECK-NEXT: vldrh.s32 q0, [r5, #8] -; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vldrh.s32 q0, [r6, #8] +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: vmov.16 q7[2], r9 ; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vmov.16 q4[2], r1 +; CHECK-NEXT: vmov.16 q7[3], r5 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov.16 q4[3], r10 -; CHECK-NEXT: vmov.16 q4[4], r9 -; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload -; CHECK-NEXT: vmov.16 q4[5], r8 -; CHECK-NEXT: vmov.16 q4[6], r3 -; CHECK-NEXT: vmov.16 q4[7], r1 +; CHECK-NEXT: vmov.16 q7[4], r1 +; CHECK-NEXT: vmov.16 q7[5], r3 +; CHECK-NEXT: vmov.16 q7[6], r12 +; CHECK-NEXT: vmov.16 q7[7], r11 ; CHECK-NEXT: ldrh r2, [r2] ; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: vmov.16 q7[2], r2 -; CHECK-NEXT: vmov.16 q7[3], r4 +; CHECK-NEXT: vmov.16 q3[2], r2 +; CHECK-NEXT: vmov.16 q3[3], r4 ; CHECK-NEXT: vmov r2, r4, d0 ; CHECK-NEXT: ldrh r2, [r2] ; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: vmov.16 q7[4], r2 -; CHECK-NEXT: vmov.16 q7[5], r4 +; CHECK-NEXT: vmov.16 q3[4], r2 +; CHECK-NEXT: vmov.16 q3[5], r4 ; CHECK-NEXT: vmov r2, r4, d1 ; CHECK-NEXT: ldrh r2, [r2] ; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: vmov.16 q7[6], r2 -; CHECK-NEXT: vmov.16 q7[7], r4 -; CHECK-NEXT: mov r4, r11 -; CHECK-NEXT: vadd.i16 q0, q7, q5 -; CHECK-NEXT: vadd.i16 q0, q0, q4 -; CHECK-NEXT: vstrb.8 q0, [r12], #16 +; CHECK-NEXT: vmov.16 q3[6], r2 +; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: vmov.16 q3[7], r4 +; CHECK-NEXT: vadd.i16 q0, q3, q1 +; CHECK-NEXT: vadd.i16 q0, q0, q7 +; CHECK-NEXT: vstrb.8 q0, [r7], #16 ; CHECK-NEXT: le lr, .LBB14_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB14_2 Depth=1 ; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: cmp r1, r2 +; CHECK-NEXT: ldr r3, [sp, #68] @ 4-byte Reload +; CHECK-NEXT: cmp r1, r3 ; CHECK-NEXT: bne.w .LBB14_2 ; CHECK-NEXT: .LBB14_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #88 +; CHECK-NEXT: add sp, #136 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -924,246 +925,260 @@ for.cond.cleanup: ; preds = %for.body, %middle.b define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n) { ; CHECK-LABEL: gather_inc_v16i8_complex: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: it lt -; CHECK-NEXT: bxlt lr -; CHECK-NEXT: .LBB15_1: @ %vector.ph.preheader ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #160 -; CHECK-NEXT: sub sp, #160 -; CHECK-NEXT: bic lr, r2, #7 -; CHECK-NEXT: mov r12, r1 -; CHECK-NEXT: vmov.i32 q0, #0x30 -; CHECK-NEXT: .LBB15_2: @ %vector.ph -; CHECK-NEXT: @ =>This Loop Header: Depth=1 -; CHECK-NEXT: @ Child Loop BB15_3 Depth 2 +; CHECK-NEXT: .pad #312 +; CHECK-NEXT: sub sp, #312 +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: str r1, [sp, 
#116] @ 4-byte Spill +; CHECK-NEXT: blt.w .LBB15_5 +; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader ; CHECK-NEXT: adr r1, .LCPI15_0 -; CHECK-NEXT: mov r8, r12 -; CHECK-NEXT: vldrw.u32 q2, [r1] +; CHECK-NEXT: adr r6, .LCPI15_8 +; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: adr r1, .LCPI15_1 -; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: adr r1, .LCPI15_8 -; CHECK-NEXT: vldrw.u32 q4, [r1] -; CHECK-NEXT: adr r1, .LCPI15_7 -; CHECK-NEXT: vldrw.u32 q5, [r1] -; CHECK-NEXT: adr r1, .LCPI15_9 -; CHECK-NEXT: vstrw.32 q1, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: adr r7, .LCPI15_7 +; CHECK-NEXT: adr r3, .LCPI15_6 +; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: adr r1, .LCPI15_5 -; CHECK-NEXT: mov r9, lr -; CHECK-NEXT: vstrw.32 q1, [sp, #144] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: adr r1, .LCPI15_6 -; CHECK-NEXT: vstrw.32 q1, [sp, #128] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: bic r10, r2, #7 +; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r6] +; CHECK-NEXT: adr r6, .LCPI15_9 +; CHECK-NEXT: vmov.i32 q2, #0x30 +; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r7] +; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r6] +; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: .LBB15_2: @ %vector.ph +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB15_3 Depth 2 ; CHECK-NEXT: adr r1, .LCPI15_3 -; CHECK-NEXT: vldrw.u32 q6, [r1] +; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: adr r1, .LCPI15_4 -; CHECK-NEXT: vldrw.u32 q7, [r1] +; CHECK-NEXT: vldrw.u32 q5, [r1] ; CHECK-NEXT: adr r1, .LCPI15_2 -; CHECK-NEXT: vstrw.32 q7, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q7, [r1] +; CHECK-NEXT: vldrw.u32 q3, [r1] ; CHECK-NEXT: adr r1, .LCPI15_10 -; CHECK-NEXT: vstrw.32 q7, [sp, #112] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q7, [r1] -; CHECK-NEXT: adr r1, .LCPI15_11 +; CHECK-NEXT: vstrw.32 q6, [sp, #280] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q6, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q3, [sp, #296] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q3, [r1] -; CHECK-NEXT: vstrw.32 q7, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: adr r1, .LCPI15_11 +; CHECK-NEXT: ldr.w r8, [sp, #116] @ 4-byte Reload +; CHECK-NEXT: vstrw.32 q3, [sp, #248] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q3, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q6, [sp, #264] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q3, [sp, #216] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q7, [r1] +; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q3, [sp, #200] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q3, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: mov r11, r10 +; CHECK-NEXT: vstrw.32 q6, [sp, #232] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q3, [sp, #184] @ 16-byte Spill ; CHECK-NEXT: .LBB15_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB15_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vadd.i32 q7, q6, r0 -; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vadd.i32 q6, q3, r0 -; 
CHECK-NEXT: vstrw.32 q3, [sp] @ 16-byte Spill -; CHECK-NEXT: vadd.i32 q3, q1, r0 -; CHECK-NEXT: vmov r10, r1, d15 -; CHECK-NEXT: vmov r7, r11, d6 -; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vmov r5, r3, d13 -; CHECK-NEXT: vldrw.u32 q2, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vmov q0, q1 -; CHECK-NEXT: vldrw.u32 q1, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: subs.w r9, r9, #16 +; CHECK-NEXT: vadd.i32 q4, q1, r0 +; CHECK-NEXT: vstrw.32 q7, [sp, #136] @ 16-byte Spill +; CHECK-NEXT: vmov r1, lr, d8 +; CHECK-NEXT: vadd.i32 q7, q7, r0 +; CHECK-NEXT: vmov r5, r4, d15 +; CHECK-NEXT: vadd.i32 q6, q0, r0 +; CHECK-NEXT: vmov r6, r7, d13 +; CHECK-NEXT: vstrw.32 q1, [sp, #152] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [sp, #296] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q0, [sp, #168] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [sp, #248] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q3, [sp, #216] @ 16-byte Reload ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: ldrb r6, [r1] -; CHECK-NEXT: ldrb r1, [r7] -; CHECK-NEXT: vmov r7, r4, d12 +; CHECK-NEXT: vstrw.32 q5, [sp, #120] @ 16-byte Spill +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: subs.w r11, r11, #16 +; CHECK-NEXT: ldrb.w r9, [r1] +; CHECK-NEXT: vmov r1, r3, d14 ; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb r3, [r3] ; CHECK-NEXT: ldrb r7, [r7] -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: vmov.8 q6[0], r7 -; CHECK-NEXT: vmov.8 q6[1], r4 -; CHECK-NEXT: vmov.8 q6[2], r5 -; CHECK-NEXT: vmov r4, r5, d14 -; CHECK-NEXT: vmov.8 q6[3], r3 -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: ldrb r7, [r5] -; CHECK-NEXT: vmov.8 q7[0], r4 -; CHECK-NEXT: ldrb.w r5, [r10] -; CHECK-NEXT: vmov.8 q7[1], r7 -; CHECK-NEXT: ldrb.w r7, [r11] +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[0], r1 +; CHECK-NEXT: ldrb r1, [r3] +; CHECK-NEXT: vmov.8 q7[1], r1 +; CHECK-NEXT: vmov r1, r3, d12 ; CHECK-NEXT: vmov.8 q7[2], r5 -; CHECK-NEXT: vmov r5, r10, d5 +; CHECK-NEXT: ldrb r5, [r6] +; CHECK-NEXT: ldrb r6, [r4] ; CHECK-NEXT: vmov.8 q7[3], r6 -; CHECK-NEXT: vmov r3, r4, d4 -; CHECK-NEXT: vmov.8 q7[4], r1 -; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmov.8 q7[5], r7 -; CHECK-NEXT: ldrb r6, [r5] -; CHECK-NEXT: vmov r1, r5, d7 -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q3, q3, q0 ; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: vmov.8 q6[0], r1 +; CHECK-NEXT: vmov r6, r1, d2 +; CHECK-NEXT: vmov.8 q6[1], r3 +; CHECK-NEXT: vmov.8 q6[2], r5 +; CHECK-NEXT: vmov.8 q6[3], r7 +; CHECK-NEXT: ldrb.w r7, [lr] +; CHECK-NEXT: vmov.8 q6[4], r9 +; CHECK-NEXT: vmov.8 q6[5], r7 +; CHECK-NEXT: ldrb r4, [r1] +; CHECK-NEXT: vmov r1, r5, d3 +; CHECK-NEXT: vldrw.u32 q1, [sp, #232] @ 16-byte Reload +; CHECK-NEXT: ldrb.w r12, [r1] +; CHECK-NEXT: vmov r1, r3, d9 ; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vmov.8 q7[6], r1 -; CHECK-NEXT: vmov r1, r7, d2 -; CHECK-NEXT: vmov.8 q7[7], r5 +; CHECK-NEXT: vldrw.u32 q4, [sp, #184] @ 16-byte Reload +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: vmov.8 q6[6], r1 +; CHECK-NEXT: vmov r1, r7, d0 +; CHECK-NEXT: vmov.8 q6[7], r3 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: ldrb r7, [r7] -; CHECK-NEXT: vmov.8 q6[4], r1 -; CHECK-NEXT: vmov r1, r5, d3 -; CHECK-NEXT: vldrw.u32 q1, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vmov.8 q6[5], r7 -; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov.8 
q7[4], r1 +; CHECK-NEXT: vmov r1, r3, d1 +; CHECK-NEXT: vldrw.u32 q0, [sp, #264] @ 16-byte Reload +; CHECK-NEXT: vmov.8 q7[5], r7 +; CHECK-NEXT: vadd.i32 q0, q0, r0 ; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vmov.8 q6[6], r1 -; CHECK-NEXT: ldrb r1, [r3] -; CHECK-NEXT: vmov.8 q6[7], r5 -; CHECK-NEXT: vmov r3, r7, d2 -; CHECK-NEXT: vmov.8 q6[8], r1 -; CHECK-NEXT: vmov r1, r11, d3 -; CHECK-NEXT: vldrw.u32 q1, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vmov.8 q6[9], r4 -; CHECK-NEXT: vmov.8 q6[10], r6 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r5, r6, d2 -; CHECK-NEXT: ldrb r4, [r7] -; CHECK-NEXT: ldrb.w r7, [r10] ; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q6[11], r7 -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q7[6], r1 +; CHECK-NEXT: ldrb r1, [r6] +; CHECK-NEXT: vmov r7, r6, d0 +; CHECK-NEXT: vmov.8 q7[7], r3 +; CHECK-NEXT: vmov r3, lr, d1 +; CHECK-NEXT: vldrw.u32 q0, [sp, #280] @ 16-byte Reload +; CHECK-NEXT: vmov.8 q7[8], r1 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov.8 q7[9], r4 +; CHECK-NEXT: vmov r4, r1, d0 +; CHECK-NEXT: vmov.8 q7[10], r12 +; CHECK-NEXT: vmov.8 q7[11], r5 +; CHECK-NEXT: ldrb r7, [r7] ; CHECK-NEXT: ldrb r6, [r6] -; CHECK-NEXT: vmov.8 q7[8], r5 -; CHECK-NEXT: vmov r5, r7, d3 -; CHECK-NEXT: vmov.8 q7[9], r6 -; CHECK-NEXT: vadd.i32 q1, q2, r0 -; CHECK-NEXT: vadd.i32 q2, q2, q0 -; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q2, q2, q0 +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q6[8], r4 +; CHECK-NEXT: vmov r5, r4, d1 +; CHECK-NEXT: vmov.8 q6[9], r1 +; CHECK-NEXT: vadd.i32 q0, q5, r0 +; CHECK-NEXT: vldrw.u32 q5, [sp, #200] @ 16-byte Reload ; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb r7, [r7] -; CHECK-NEXT: vmov.8 q7[10], r5 -; CHECK-NEXT: vmov.8 q7[11], r7 -; CHECK-NEXT: vmov.8 q7[12], r3 -; CHECK-NEXT: vmov.8 q7[13], r4 -; CHECK-NEXT: vmov.8 q7[14], r1 -; CHECK-NEXT: vmov r1, r3, d2 +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: vmov.8 q6[10], r5 +; CHECK-NEXT: vmov.8 q6[11], r4 +; CHECK-NEXT: vmov.8 q6[12], r7 +; CHECK-NEXT: vmov.8 q6[13], r6 +; CHECK-NEXT: vmov.8 q6[14], r3 +; CHECK-NEXT: vmov r1, r3, d0 ; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.8 q6[12], r1 +; CHECK-NEXT: vmov.8 q7[12], r1 ; CHECK-NEXT: ldrb r1, [r3] -; CHECK-NEXT: vmov.8 q6[13], r1 -; CHECK-NEXT: vmov r1, r3, d3 -; CHECK-NEXT: vadd.i32 q1, q5, r0 -; CHECK-NEXT: vadd.i32 q5, q5, q0 +; CHECK-NEXT: vmov.8 q7[13], r1 +; CHECK-NEXT: vmov r1, r3, d1 +; CHECK-NEXT: vadd.i32 q0, q1, r0 +; CHECK-NEXT: vadd.i32 q1, q1, q2 +; CHECK-NEXT: vstrw.32 q1, [sp, #232] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [sp, #248] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q1, q1, q2 +; CHECK-NEXT: vstrw.32 q1, [sp, #248] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [sp, #152] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q1, q1, q2 ; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.8 q6[14], r1 +; CHECK-NEXT: vmov.8 q7[14], r1 ; CHECK-NEXT: ldrb r1, [r3] -; CHECK-NEXT: vmov.8 q6[15], r1 -; CHECK-NEXT: ldrb.w r1, [r11] ; CHECK-NEXT: vmov.8 q7[15], r1 -; CHECK-NEXT: vmov r1, r3, d2 -; CHECK-NEXT: vadd.i8 q6, q7, q6 +; CHECK-NEXT: ldrb.w r1, [lr] +; CHECK-NEXT: vmov.8 q6[15], r1 +; CHECK-NEXT: vmov r1, r3, d0 +; CHECK-NEXT: vadd.i8 q6, q6, q7 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: ldrb r3, [r3] ; CHECK-NEXT: vmov.8 q7[0], r1 ; CHECK-NEXT: vmov.8 q7[1], r3 -; CHECK-NEXT: vmov r1, r3, d3 -; 
CHECK-NEXT: vldrw.u32 q1, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r1, r3, d1 +; CHECK-NEXT: vadd.i32 q0, q3, r0 +; CHECK-NEXT: vadd.i32 q3, q3, q2 +; CHECK-NEXT: vstrw.32 q3, [sp, #216] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q3, [sp, #296] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q3, q3, q2 +; CHECK-NEXT: vstrw.32 q3, [sp, #296] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q3, [sp, #280] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q3, q3, q2 +; CHECK-NEXT: vstrw.32 q3, [sp, #280] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q3, [sp, #264] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q3, q3, q2 +; CHECK-NEXT: vstrw.32 q3, [sp, #264] @ 16-byte Spill ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[2], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[3], r1 -; CHECK-NEXT: vmov r1, r3, d2 +; CHECK-NEXT: vmov r1, r3, d0 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[4], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[5], r1 -; CHECK-NEXT: vmov r1, r3, d3 -; CHECK-NEXT: vadd.i32 q1, q4, r0 -; CHECK-NEXT: vadd.i32 q4, q4, q0 +; CHECK-NEXT: vmov r1, r3, d1 +; CHECK-NEXT: vadd.i32 q0, q5, r0 +; CHECK-NEXT: vadd.i32 q5, q5, q2 +; CHECK-NEXT: vstrw.32 q5, [sp, #200] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q5, [sp, #120] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q5, q5, q2 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[6], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[7], r1 -; CHECK-NEXT: vmov r1, r3, d2 +; CHECK-NEXT: vmov r1, r3, d0 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[8], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[9], r1 -; CHECK-NEXT: vmov r1, r3, d3 -; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r1, r3, d1 +; CHECK-NEXT: vadd.i32 q0, q4, r0 +; CHECK-NEXT: vadd.i32 q4, q4, q2 +; CHECK-NEXT: vstrw.32 q4, [sp, #184] @ 16-byte Spill ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[10], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[11], r1 -; CHECK-NEXT: vmov r1, r3, d2 +; CHECK-NEXT: vmov r1, r3, d0 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[12], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[13], r1 -; CHECK-NEXT: vmov r1, r3, d3 +; CHECK-NEXT: vmov r1, r3, d1 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[14], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[15], r1 -; CHECK-NEXT: vadd.i8 q1, q6, q7 -; CHECK-NEXT: vldrw.u32 q7, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vstrb.8 q1, [r8], #16 -; CHECK-NEXT: vadd.i32 q7, q7, q0 -; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q7, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q7, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q6, q6, q0 -; CHECK-NEXT: vadd.i32 q1, q1, q0 -; CHECK-NEXT: vadd.i32 q7, q7, q0 -; CHECK-NEXT: vstrw.32 q7, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q7, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q7, q7, q0 -; CHECK-NEXT: vstrw.32 q7, [sp, #112] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q7, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q7, q7, q0 -; CHECK-NEXT: vstrw.32 q7, [sp, #128] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q7, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q7, q7, q0 -; CHECK-NEXT: vstrw.32 q7, [sp, #144] @ 16-byte Spill +; CHECK-NEXT: vadd.i8 q0, q6, q7 +; CHECK-NEXT: vldrw.u32 q7, [sp, #136] @ 16-byte Reload +; CHECK-NEXT: vstrb.8 q0, [r8], #16 +; CHECK-NEXT: vldrw.u32 q0, [sp, #168] @ 16-byte 
Reload +; CHECK-NEXT: vadd.i32 q7, q7, q2 +; CHECK-NEXT: vadd.i32 q0, q0, q2 ; CHECK-NEXT: bne.w .LBB15_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB15_2 Depth=1 -; CHECK-NEXT: cmp lr, r2 +; CHECK-NEXT: cmp r10, r2 ; CHECK-NEXT: bne.w .LBB15_2 -; CHECK-NEXT: @ %bb.5: -; CHECK-NEXT: add sp, #160 +; CHECK-NEXT: .LBB15_5: @ %for.cond.cleanup +; CHECK-NEXT: add sp, #312 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: bx lr +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.6: ; CHECK-NEXT: .LCPI15_0: @@ -1272,95 +1287,102 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #64 +; CHECK-NEXT: sub sp, #64 ; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: strd r1, r2, [sp, #8] @ 8-byte Folded Spill -; CHECK-NEXT: blt .LBB16_5 +; CHECK-NEXT: strd r1, r2, [sp, #56] @ 8-byte Folded Spill +; CHECK-NEXT: blt.w .LBB16_5 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: mov.w r11, #16 -; CHECK-NEXT: bic r3, r1, #7 -; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: adr r5, .LCPI16_3 +; CHECK-NEXT: adr r7, .LCPI16_1 +; CHECK-NEXT: vldrw.u32 q0, [r5] +; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload +; CHECK-NEXT: adr r3, .LCPI16_0 +; CHECK-NEXT: adr r6, .LCPI16_2 +; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r7] +; CHECK-NEXT: bic r9, r1, #7 +; CHECK-NEXT: vldrw.u32 q3, [r3] +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r6] +; CHECK-NEXT: mov.w lr, #16 +; CHECK-NEXT: str.w r9, [sp, #52] @ 4-byte Spill +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill ; CHECK-NEXT: .LBB16_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB16_3 Depth 2 -; CHECK-NEXT: adr r1, .LCPI16_3 -; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: vldrw.u32 q5, [r1] -; CHECK-NEXT: adr r1, .LCPI16_1 -; CHECK-NEXT: vldrw.u32 q4, [r1] -; CHECK-NEXT: adr r1, .LCPI16_2 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: adr r1, .LCPI16_0 -; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: ldr.w r8, [sp, #56] @ 4-byte Reload +; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: .LBB16_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB16_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vadd.i32 q6, q1, r0 -; CHECK-NEXT: vadd.i32 q2, q0, r0 -; CHECK-NEXT: vmov r4, r5, d13 -; CHECK-NEXT: vadd.i32 q3, q5, r11 -; CHECK-NEXT: vmov lr, r8, d4 -; CHECK-NEXT: subs r3, #16 -; CHECK-NEXT: vmov r6, r12, d5 -; CHECK-NEXT: vadd.i32 q2, q4, r11 -; CHECK-NEXT: vadd.i32 q1, q1, r11 -; CHECK-NEXT: vadd.i32 q0, q0, r11 -; CHECK-NEXT: ldrb.w r10, [r5] -; CHECK-NEXT: vmov r2, r5, d12 -; CHECK-NEXT: vadd.i32 q6, q5, r0 -; CHECK-NEXT: vadd.i32 q5, q4, r0 -; CHECK-NEXT: ldrb.w r1, [r8] -; CHECK-NEXT: 
ldrb.w r9, [r4] -; CHECK-NEXT: ldrb r4, [r6] -; CHECK-NEXT: ldrb.w r6, [lr] -; CHECK-NEXT: ldrb.w r12, [r12] -; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: vadd.i32 q1, q5, r0 +; CHECK-NEXT: vadd.i32 q2, q4, r0 +; CHECK-NEXT: vmov r7, r3, d3 +; CHECK-NEXT: vadd.i32 q6, q0, lr +; CHECK-NEXT: vmov r5, r6, d5 +; CHECK-NEXT: subs.w r9, r9, #16 +; CHECK-NEXT: vmov r4, r10, d2 +; CHECK-NEXT: vadd.i32 q1, q7, lr +; CHECK-NEXT: vadd.i32 q4, q4, lr +; CHECK-NEXT: vadd.i32 q5, q5, lr +; CHECK-NEXT: ldrb.w r11, [r3] +; CHECK-NEXT: ldrb r3, [r7] +; CHECK-NEXT: vmov r7, r12, d4 +; CHECK-NEXT: vadd.i32 q2, q7, r0 +; CHECK-NEXT: vadd.i32 q7, q0, r0 ; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vmov.8 q4[0], r2 -; CHECK-NEXT: vmov.8 q4[1], r5 -; CHECK-NEXT: vmov r8, r5, d11 -; CHECK-NEXT: vmov.8 q4[2], r9 -; CHECK-NEXT: vmov.8 q4[3], r10 -; CHECK-NEXT: vmov.8 q4[4], r6 -; CHECK-NEXT: vmov.8 q4[5], r1 -; CHECK-NEXT: vmov.8 q4[6], r4 -; CHECK-NEXT: vmov r4, r6, d10 -; CHECK-NEXT: vmov.8 q4[7], r12 -; CHECK-NEXT: vmov q5, q3 -; CHECK-NEXT: ldrb.w lr, [r5] -; CHECK-NEXT: vmov r5, r2, d13 -; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb r6, [r6] -; CHECK-NEXT: vmov.8 q4[8], r4 -; CHECK-NEXT: vmov.8 q4[9], r6 -; CHECK-NEXT: ldrb.w r9, [r2] -; CHECK-NEXT: vmov r1, r2, d12 -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb.w r10, [r2] -; CHECK-NEXT: ldrb.w r2, [r8] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb.w r10, [r10] +; CHECK-NEXT: ldrb r7, [r7] +; CHECK-NEXT: ldrb.w r1, [r12] +; CHECK-NEXT: vmov.8 q0[0], r7 +; CHECK-NEXT: vmov.8 q0[1], r1 +; CHECK-NEXT: vmov r1, r7, d15 +; CHECK-NEXT: vmov.8 q0[2], r5 +; CHECK-NEXT: vmov.8 q0[3], r6 +; CHECK-NEXT: vmov.8 q0[4], r4 +; CHECK-NEXT: vmov r4, r2, d4 +; CHECK-NEXT: vmov.8 q0[5], r10 +; CHECK-NEXT: vmov.8 q0[6], r3 +; CHECK-NEXT: vmov.8 q0[7], r11 +; CHECK-NEXT: ldrb r6, [r7] +; CHECK-NEXT: vmov r5, r7, d5 ; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.8 q4[10], r2 -; CHECK-NEXT: vmov.8 q4[11], lr -; CHECK-NEXT: vmov.8 q4[12], r1 -; CHECK-NEXT: vmov.8 q4[13], r10 -; CHECK-NEXT: vmov.8 q4[14], r5 -; CHECK-NEXT: vmov.8 q4[15], r9 -; CHECK-NEXT: vstrb.8 q4, [r7], #16 -; CHECK-NEXT: vmov q4, q2 +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r3, [r5] +; CHECK-NEXT: ldrb.w r12, [r7] +; CHECK-NEXT: ldrb r5, [r4] +; CHECK-NEXT: vmov r4, r7, d14 +; CHECK-NEXT: vmov q7, q1 +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb r7, [r7] +; CHECK-NEXT: vmov.8 q0[8], r4 +; CHECK-NEXT: vmov.8 q0[9], r7 +; CHECK-NEXT: vmov.8 q0[10], r1 +; CHECK-NEXT: vmov.8 q0[11], r6 +; CHECK-NEXT: vmov.8 q0[12], r5 +; CHECK-NEXT: vmov.8 q0[13], r2 +; CHECK-NEXT: vmov.8 q0[14], r3 +; CHECK-NEXT: vmov.8 q0[15], r12 +; CHECK-NEXT: vstrb.8 q0, [r8], #16 +; CHECK-NEXT: vmov q0, q6 ; CHECK-NEXT: bne .LBB16_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB16_2 Depth=1 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: cmp r3, r1 +; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload +; CHECK-NEXT: ldr.w r9, [sp, #52] @ 4-byte Reload +; CHECK-NEXT: cmp r9, r1 ; CHECK-NEXT: bne .LBB16_2 ; CHECK-NEXT: .LBB16_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #16 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: add sp, #64 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll 
index 82ec62ec9f7a1..7b8b884576d13 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll @@ -602,57 +602,60 @@ define dso_local void @arm_mat_mult_q15(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: .pad #24 -; CHECK-NEXT: sub sp, #24 -; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: strd r0, r2, [sp, #24] @ 8-byte Folded Spill ; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrne r0, [sp, #112] +; CHECK-NEXT: ldrne r0, [sp, #136] ; CHECK-NEXT: cmpne r0, #0 ; CHECK-NEXT: bne .LBB10_2 ; CHECK-NEXT: .LBB10_1: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #24 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: add sp, #32 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .LBB10_2: @ %for.cond1.preheader.us.preheader -; CHECK-NEXT: ldr.w r12, [sp, #116] +; CHECK-NEXT: ldr.w r12, [sp, #140] ; CHECK-NEXT: movs r7, #1 +; CHECK-NEXT: mov.w r11, #0 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: bic r0, r12, #3 -; CHECK-NEXT: subs r3, r0, #4 -; CHECK-NEXT: add.w r3, r7, r3, lsr #2 -; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: ldr r3, [sp, #112] -; CHECK-NEXT: lsl.w r7, r12, #1 -; CHECK-NEXT: str r7, [sp] @ 4-byte Spill -; CHECK-NEXT: movs r7, #0 -; CHECK-NEXT: vdup.32 q1, r3 -; CHECK-NEXT: lsls r6, r3, #1 -; CHECK-NEXT: vshl.i32 q2, q1, #2 -; CHECK-NEXT: ldr r3, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: str r3, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: bic r2, r12, #3 +; CHECK-NEXT: subs r3, r2, #4 +; CHECK-NEXT: add.w r0, r7, r3, lsr #2 +; CHECK-NEXT: ldr r7, [sp, #136] +; CHECK-NEXT: adr r3, .LCPI10_0 +; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: lsl.w r0, r12, #1 +; CHECK-NEXT: vdup.32 q1, r7 +; CHECK-NEXT: vldrw.u32 q2, [r3] +; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: lsls r6, r7, #1 +; CHECK-NEXT: vshl.i32 q3, q1, #2 +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: b .LBB10_5 ; CHECK-NEXT: .LBB10_3: @ %for.cond5.preheader.us73.preheader ; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1 -; CHECK-NEXT: add.w r3, r2, r8, lsl #1 +; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: add.w r3, r0, r5, lsl #1 ; CHECK-NEXT: wlstp.8 lr, r6, .LBB10_4 ; CHECK-NEXT: b .LBB10_15 ; CHECK-NEXT: .LBB10_4: @ %for.cond1.for.cond.cleanup3_crit_edge.us ; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1 -; CHECK-NEXT: ldr r3, [sp] @ 4-byte Reload -; CHECK-NEXT: add r7, r12 -; CHECK-NEXT: ldr r5, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: add r5, r3 -; CHECK-NEXT: str r5, [sp, #16] @ 4-byte Spill -; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: adds r5, #1 -; CHECK-NEXT: cmp r5, r3 +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: add r11, r12 +; CHECK-NEXT: ldr r3, [sp, #20] @ 4-byte 
Reload +; CHECK-NEXT: add r3, r0 +; CHECK-NEXT: str r3, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: adds r3, #1 +; CHECK-NEXT: cmp r3, r0 ; CHECK-NEXT: beq .LBB10_1 ; CHECK-NEXT: .LBB10_5: @ %for.cond1.preheader.us ; CHECK-NEXT: @ =>This Loop Header: Depth=1 @@ -660,22 +663,21 @@ define dso_local void @arm_mat_mult_q15(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: @ Child Loop BB10_11 Depth 3 ; CHECK-NEXT: @ Child Loop BB10_14 Depth 3 ; CHECK-NEXT: @ Child Loop BB10_15 Depth 2 -; CHECK-NEXT: ldr r3, [sp, #112] +; CHECK-NEXT: mul r5, r3, r7 ; CHECK-NEXT: cmp.w r12, #0 -; CHECK-NEXT: str r5, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: mul r8, r5, r3 +; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: beq .LBB10_3 ; CHECK-NEXT: @ %bb.6: @ %for.cond5.preheader.us.us.preheader ; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1 -; CHECK-NEXT: mov.w r9, #0 +; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: b .LBB10_8 ; CHECK-NEXT: .LBB10_7: @ %for.cond5.for.cond.cleanup7_crit_edge.us.us ; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 -; CHECK-NEXT: add.w r3, r9, r8 -; CHECK-NEXT: add.w r9, r9, #1 -; CHECK-NEXT: strh.w r10, [r2, r3, lsl #1] -; CHECK-NEXT: ldr r3, [sp, #112] -; CHECK-NEXT: cmp r9, r3 +; CHECK-NEXT: ldr r3, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: add.w r0, r8, r5 +; CHECK-NEXT: add.w r8, r8, #1 +; CHECK-NEXT: cmp r8, r7 +; CHECK-NEXT: strh.w r10, [r3, r0, lsl #1] ; CHECK-NEXT: beq .LBB10_4 ; CHECK-NEXT: .LBB10_8: @ %for.cond5.preheader.us.us ; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1 @@ -690,48 +692,46 @@ define dso_local void @arm_mat_mult_q15(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: b .LBB10_13 ; CHECK-NEXT: .LBB10_10: @ %vector.ph ; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 -; CHECK-NEXT: adr r3, .LCPI10_0 -; CHECK-NEXT: vmov q4, q1 -; CHECK-NEXT: vldrw.u32 q5, [r3] -; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: vmov.i32 q3, #0x0 -; CHECK-NEXT: dls lr, r3 -; CHECK-NEXT: vmlas.i32 q4, q5, r9 -; CHECK-NEXT: ldr r3, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: vmov q5, q1 +; CHECK-NEXT: vmov.i32 q4, #0x0 +; CHECK-NEXT: vmlas.i32 q5, q2, r8 +; CHECK-NEXT: dls lr, r0 +; CHECK-NEXT: ldr r3, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: .LBB10_11: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1 ; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: vadd.i32 q5, q4, q2 -; CHECK-NEXT: vldrh.s32 q6, [r1, q4, uxtw #1] -; CHECK-NEXT: vldrh.s32 q4, [r3], #8 -; CHECK-NEXT: vmul.i32 q4, q6, q4 -; CHECK-NEXT: vadd.i32 q3, q4, q3 -; CHECK-NEXT: vmov q4, q5 +; CHECK-NEXT: vadd.i32 q6, q5, q3 +; CHECK-NEXT: vldrh.s32 q7, [r1, q5, uxtw #1] +; CHECK-NEXT: vldrh.s32 q5, [r3], #8 +; CHECK-NEXT: vmul.i32 q5, q7, q5 +; CHECK-NEXT: vadd.i32 q4, q5, q4 +; CHECK-NEXT: vmov q5, q6 ; CHECK-NEXT: le lr, .LBB10_11 ; CHECK-NEXT: @ %bb.12: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 -; CHECK-NEXT: vaddv.u32 r10, q3 -; CHECK-NEXT: cmp r0, r12 -; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: vaddv.u32 r10, q4 +; CHECK-NEXT: cmp r2, r12 +; CHECK-NEXT: mov r4, r2 ; CHECK-NEXT: beq .LBB10_7 ; CHECK-NEXT: .LBB10_13: @ %for.body8.us.us.preheader ; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 -; CHECK-NEXT: ldr r3, [sp, #112] +; CHECK-NEXT: mla r3, r7, r4, r8 +; CHECK-NEXT: add.w r0, r11, r4 +; CHECK-NEXT: ldr r7, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: sub.w lr, r12, r4 -; 
CHECK-NEXT: ldr r5, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: mla r3, r3, r4, r9 -; CHECK-NEXT: add.w r11, r1, r3, lsl #1 -; CHECK-NEXT: adds r3, r7, r4 -; CHECK-NEXT: add.w r3, r5, r3, lsl #1 +; CHECK-NEXT: add.w r9, r7, r0, lsl #1 +; CHECK-NEXT: ldr r7, [sp, #136] +; CHECK-NEXT: add.w r3, r1, r3, lsl #1 ; CHECK-NEXT: .LBB10_14: @ %for.body8.us.us ; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1 ; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: ldrsh.w r5, [r11] -; CHECK-NEXT: add r11, r6 -; CHECK-NEXT: ldrsh r4, [r3], #2 -; CHECK-NEXT: smlabb r10, r5, r4, r10 +; CHECK-NEXT: ldrsh.w r4, [r3] +; CHECK-NEXT: add r3, r6 +; CHECK-NEXT: ldrsh r0, [r9], #2 +; CHECK-NEXT: smlabb r10, r4, r0, r10 ; CHECK-NEXT: le lr, .LBB10_14 ; CHECK-NEXT: b .LBB10_7 ; CHECK-NEXT: .LBB10_15: @ Parent Loop BB10_5 Depth=1 diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll index 9dc2d62f21d95..6633cec659d8e 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll @@ -121,29 +121,31 @@ define protected amdgpu_kernel void @InferPHI(i32 %a, ptr addrspace(1) %b, doubl ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_ashr_i32 s7, s6, 31 ; CHECK-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CHECK-NEXT: s_add_u32 s4, s0, s4 -; CHECK-NEXT: s_addc_u32 s5, s1, s5 -; CHECK-NEXT: s_add_u32 s0, s4, -8 -; CHECK-NEXT: s_addc_u32 s1, s5, -1 -; CHECK-NEXT: s_cmp_eq_u64 s[4:5], 9 -; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s4 +; CHECK-NEXT: s_addc_u32 s1, s1, s5 +; CHECK-NEXT: s_add_u32 s4, s0, -8 +; CHECK-NEXT: s_addc_u32 s5, s1, -1 +; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 9 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 ; CHECK-NEXT: .LBB3_1: ; %bb0 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] ; CHECK-NEXT: s_cbranch_vccnz .LBB3_1 ; CHECK-NEXT: ; %bb.2: ; %bb1 -; CHECK-NEXT: s_mov_b64 s[4:5], exec -; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; CHECK-NEXT: s_mov_b64 s[0:1], exec +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc ; CHECK-NEXT: s_cbranch_execz .LBB3_4 ; CHECK-NEXT: ; %bb.3: -; CHECK-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], s4 +; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; CHECK-NEXT: v_mul_f64 v[0:1], s[2:3], v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol ; CHECK-NEXT: .LBB3_4: From 75a4563fc164e268d2a7af5735d5e84ceee865e7 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 10 Jan 2025 09:12:44 +0100 Subject: [PATCH 030/408] [LLVM] Update windows codegen maintainer (#119576) I think that nowadays the go-to contact for Windows codegen is rnk. 
--- llvm/Maintainers.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/Maintainers.md b/llvm/Maintainers.md index 2ccf30b8139aa..bad029ca6fb3c 100644 --- a/llvm/Maintainers.md +++ b/llvm/Maintainers.md @@ -164,10 +164,10 @@ quentin.colombet@gmail.com (email), [qcolombet](https://github.com/qcolombet) (G Fangrui Song \ i@maskray.me (email), [MaskRay](https://github.com/MaskRay) (GitHub) -#### Windows codegen +#### Windows ABI and codegen -Anton Korobeynikov \ -anton@korobeynikov.info (email), [asl](https://github.com/asl) (GitHub) +Reid Kleckner \ +rnk@google.com (email), [rnk](https://github.com/rnk) (GitHub) ### Backends / Targets @@ -479,7 +479,7 @@ Hal Finkel (hfinkel@anl.gov, [hfinkel](https://github.com/hfinkel) -- AliasAnaly Renato Golin (rengolin@systemcall.eu, [rengolin](https://github.com/rengolin)) -- ARM backend \ Venkatraman Govindaraju (venkatra@cs.wisc.edu, [vegovin](https://github.com/vegovin) -- Sparc backend \ James Grosbach (grosbach@apple.com) -- MC layer \ -Anton Korobeynikov (anton@korobeynikov.info, [asl](https://github.com/asl)) -- ARM EABI \ +Anton Korobeynikov (anton@korobeynikov.info, [asl](https://github.com/asl)) -- ARM EABI, Windows codegen \ Benjamin Kramer (benny.kra@gmail.com, [d0k](https://github.com/d0k)) -- DWARF Parser \ David Majnemer (david.majnemer@gmail.com, [majnemer](https://github.com/majnemer)) -- InstCombine, ConstantFold \ Chad Rosier (mcrosier@codeaurora.org) -- FastISel \ From 4adeb6cf556df10da668916b22eb39d3f1313e8a Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Fri, 10 Jan 2025 09:15:18 +0100 Subject: [PATCH 031/408] [mlir][spirv] Add convergent attribute to builtin (#122131) Add the `convergent` attribute to builtin functions and builtin function calls when lowering SPIR-V non-uniform group functions to LLVM dialect. 
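For illustration, a lowered group builtin now carries the attribute on both the declaration and the call site (adapted from the updated test below; the SSA value names here are placeholders):

    llvm.func spir_funccc @_Z27__spirv_GroupNonUniformIAddiij(i32, i32, i32) -> i32 attributes {convergent, no_unwind, will_return}
    %res = llvm.call spir_funccc @_Z27__spirv_GroupNonUniformIAddiij(%scope, %group_op, %val) {convergent, no_unwind, will_return} : (i32, i32, i32) -> i32

Marking the calls `convergent` prevents LLVM transformations that could change the set of threads executing them together, which would break the cross-lane semantics of the group operation.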
--------- Signed-off-by: Lukas Sommer --- .../Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp | 4 +- .../SPIRVToLLVM/non-uniform-ops-to-llvm.mlir | 72 +++++++++---------- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/mlir/lib/Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp b/mlir/lib/Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp index b11511f21d03d..dbaac030aa0a2 100644 --- a/mlir/lib/Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp +++ b/mlir/lib/Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp @@ -1253,8 +1253,8 @@ class GroupReducePattern : public SPIRVToLLVMConversion { Operation *symbolTable = op->template getParentWithTrait(); - LLVM::LLVMFuncOp func = lookupOrCreateSPIRVFn( - symbolTable, funcName, paramTypes, retTy, !NonUniform); + LLVM::LLVMFuncOp func = + lookupOrCreateSPIRVFn(symbolTable, funcName, paramTypes, retTy); Location loc = op.getLoc(); Value scope = rewriter.create( diff --git a/mlir/test/Conversion/SPIRVToLLVM/non-uniform-ops-to-llvm.mlir b/mlir/test/Conversion/SPIRVToLLVM/non-uniform-ops-to-llvm.mlir index e81048792c45d..ab174ba2b41e4 100644 --- a/mlir/test/Conversion/SPIRVToLLVM/non-uniform-ops-to-llvm.mlir +++ b/mlir/test/Conversion/SPIRVToLLVM/non-uniform-ops-to-llvm.mlir @@ -2,30 +2,30 @@ // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py -// CHECK-LABEL: llvm.func spir_funccc @_Z33__spirv_GroupNonUniformLogicalXoriib(i32, i32, i1) -> i1 attributes {no_unwind, will_return} -// CHECK: llvm.func spir_funccc @_Z32__spirv_GroupNonUniformLogicalOriib(i32, i32, i1) -> i1 attributes {no_unwind, will_return} -// CHECK: llvm.func spir_funccc @_Z33__spirv_GroupNonUniformLogicalAndiib(i32, i32, i1) -> i1 attributes {no_unwind, will_return} -// CHECK: llvm.func spir_funccc @_Z33__spirv_GroupNonUniformBitwiseXoriij(i32, i32, i32) -> i32 attributes {no_unwind, will_return} -// CHECK: llvm.func spir_funccc @_Z32__spirv_GroupNonUniformBitwiseOriij(i32, i32, i32) -> i32 attributes {no_unwind, will_return} -// CHECK: llvm.func spir_funccc @_Z33__spirv_GroupNonUniformBitwiseAndiij(i32, i32, i32) -> i32 attributes {no_unwind, will_return} -// CHECK: llvm.func spir_funccc @_Z27__spirv_GroupNonUniformSMaxiijj(i32, i32, i32, i32) -> i32 attributes {no_unwind, will_return} -// CHECK: llvm.func spir_funccc @_Z27__spirv_GroupNonUniformFMaxiif(i32, i32, f32) -> f32 attributes {no_unwind, will_return} -// CHECK: llvm.func spir_funccc @_Z27__spirv_GroupNonUniformUMaxiij(i32, i32, i32) -> i32 attributes {no_unwind, will_return} -// CHECK: llvm.func spir_funccc @_Z27__spirv_GroupNonUniformSMaxiij(i32, i32, i32) -> i32 attributes {no_unwind, will_return} -// CHECK: llvm.func spir_funccc @_Z27__spirv_GroupNonUniformFMiniifj(i32, i32, f32, i32) -> f32 attributes {no_unwind, will_return} -// CHECK: llvm.func spir_funccc @_Z27__spirv_GroupNonUniformFMiniif(i32, i32, f32) -> f32 attributes {no_unwind, will_return} -// CHECK: llvm.func spir_funccc @_Z27__spirv_GroupNonUniformUMiniij(i32, i32, i32) -> i32 attributes {no_unwind, will_return} -// CHECK: llvm.func spir_funccc @_Z27__spirv_GroupNonUniformSMiniij(i32, i32, i32) -> i32 attributes {no_unwind, will_return} -// CHECK: llvm.func spir_funccc @_Z27__spirv_GroupNonUniformFMuliif(i32, i32, f32) -> f32 attributes {no_unwind, will_return} -// CHECK: llvm.func spir_funccc @_Z27__spirv_GroupNonUniformIMuliijj(i32, i32, i32, i32) -> i32 attributes {no_unwind, will_return} -// CHECK: llvm.func spir_funccc @_Z27__spirv_GroupNonUniformFAddiifj(i32, i32, f32, i32) -> f32 attributes {no_unwind, will_return} -// CHECK: llvm.func spir_funccc 
@_Z27__spirv_GroupNonUniformIAddiij(i32, i32, i32) -> i32 attributes {no_unwind, will_return} +// CHECK-LABEL: llvm.func spir_funccc @_Z33__spirv_GroupNonUniformLogicalXoriib(i32, i32, i1) -> i1 attributes {convergent, no_unwind, will_return} +// CHECK: llvm.func spir_funccc @_Z32__spirv_GroupNonUniformLogicalOriib(i32, i32, i1) -> i1 attributes {convergent, no_unwind, will_return} +// CHECK: llvm.func spir_funccc @_Z33__spirv_GroupNonUniformLogicalAndiib(i32, i32, i1) -> i1 attributes {convergent, no_unwind, will_return} +// CHECK: llvm.func spir_funccc @_Z33__spirv_GroupNonUniformBitwiseXoriij(i32, i32, i32) -> i32 attributes {convergent, no_unwind, will_return} +// CHECK: llvm.func spir_funccc @_Z32__spirv_GroupNonUniformBitwiseOriij(i32, i32, i32) -> i32 attributes {convergent, no_unwind, will_return} +// CHECK: llvm.func spir_funccc @_Z33__spirv_GroupNonUniformBitwiseAndiij(i32, i32, i32) -> i32 attributes {convergent, no_unwind, will_return} +// CHECK: llvm.func spir_funccc @_Z27__spirv_GroupNonUniformSMaxiijj(i32, i32, i32, i32) -> i32 attributes {convergent, no_unwind, will_return} +// CHECK: llvm.func spir_funccc @_Z27__spirv_GroupNonUniformFMaxiif(i32, i32, f32) -> f32 attributes {convergent, no_unwind, will_return} +// CHECK: llvm.func spir_funccc @_Z27__spirv_GroupNonUniformUMaxiij(i32, i32, i32) -> i32 attributes {convergent, no_unwind, will_return} +// CHECK: llvm.func spir_funccc @_Z27__spirv_GroupNonUniformSMaxiij(i32, i32, i32) -> i32 attributes {convergent, no_unwind, will_return} +// CHECK: llvm.func spir_funccc @_Z27__spirv_GroupNonUniformFMiniifj(i32, i32, f32, i32) -> f32 attributes {convergent, no_unwind, will_return} +// CHECK: llvm.func spir_funccc @_Z27__spirv_GroupNonUniformFMiniif(i32, i32, f32) -> f32 attributes {convergent, no_unwind, will_return} +// CHECK: llvm.func spir_funccc @_Z27__spirv_GroupNonUniformUMiniij(i32, i32, i32) -> i32 attributes {convergent, no_unwind, will_return} +// CHECK: llvm.func spir_funccc @_Z27__spirv_GroupNonUniformSMiniij(i32, i32, i32) -> i32 attributes {convergent, no_unwind, will_return} +// CHECK: llvm.func spir_funccc @_Z27__spirv_GroupNonUniformFMuliif(i32, i32, f32) -> f32 attributes {convergent, no_unwind, will_return} +// CHECK: llvm.func spir_funccc @_Z27__spirv_GroupNonUniformIMuliijj(i32, i32, i32, i32) -> i32 attributes {convergent, no_unwind, will_return} +// CHECK: llvm.func spir_funccc @_Z27__spirv_GroupNonUniformFAddiifj(i32, i32, f32, i32) -> f32 attributes {convergent, no_unwind, will_return} +// CHECK: llvm.func spir_funccc @_Z27__spirv_GroupNonUniformIAddiij(i32, i32, i32) -> i32 attributes {convergent, no_unwind, will_return} // CHECK-LABEL: llvm.func @non_uniform_iadd( // CHECK-SAME: %[[VAL_0:.*]]: i32) -> i32 { // CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(3 : i32) : i32 // CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(0 : i32) : i32 -// CHECK: %[[VAL_3:.*]] = llvm.call spir_funccc @_Z27__spirv_GroupNonUniformIAddiij(%[[VAL_1]], %[[VAL_2]], %[[VAL_0]]) {no_unwind, will_return} : (i32, i32, i32) -> i32 +// CHECK: %[[VAL_3:.*]] = llvm.call spir_funccc @_Z27__spirv_GroupNonUniformIAddiij(%[[VAL_1]], %[[VAL_2]], %[[VAL_0]]) {convergent, no_unwind, will_return} : (i32, i32, i32) -> i32 // CHECK: llvm.return %[[VAL_3]] : i32 // CHECK: } spirv.func @non_uniform_iadd(%arg0: i32) -> i32 "None" { @@ -38,7 +38,7 @@ spirv.func @non_uniform_iadd(%arg0: i32) -> i32 "None" { // CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(16 : i32) : i32 // CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(3 : i32) : i32 // CHECK: %[[VAL_3:.*]] = 
llvm.mlir.constant(3 : i32) : i32 -// CHECK: %[[VAL_4:.*]] = llvm.call spir_funccc @_Z27__spirv_GroupNonUniformFAddiifj(%[[VAL_2]], %[[VAL_3]], %[[VAL_0]], %[[VAL_1]]) {no_unwind, will_return} : (i32, i32, f32, i32) -> f32 +// CHECK: %[[VAL_4:.*]] = llvm.call spir_funccc @_Z27__spirv_GroupNonUniformFAddiifj(%[[VAL_2]], %[[VAL_3]], %[[VAL_0]], %[[VAL_1]]) {convergent, no_unwind, will_return} : (i32, i32, f32, i32) -> f32 // CHECK: llvm.return %[[VAL_4]] : f32 // CHECK: } spirv.func @non_uniform_fadd(%arg0: f32) -> f32 "None" { @@ -52,7 +52,7 @@ spirv.func @non_uniform_fadd(%arg0: f32) -> f32 "None" { // CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(16 : i32) : i32 // CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(3 : i32) : i32 // CHECK: %[[VAL_3:.*]] = llvm.mlir.constant(3 : i32) : i32 -// CHECK: %[[VAL_4:.*]] = llvm.call spir_funccc @_Z27__spirv_GroupNonUniformIMuliijj(%[[VAL_2]], %[[VAL_3]], %[[VAL_0]], %[[VAL_1]]) {no_unwind, will_return} : (i32, i32, i32, i32) -> i32 +// CHECK: %[[VAL_4:.*]] = llvm.call spir_funccc @_Z27__spirv_GroupNonUniformIMuliijj(%[[VAL_2]], %[[VAL_3]], %[[VAL_0]], %[[VAL_1]]) {convergent, no_unwind, will_return} : (i32, i32, i32, i32) -> i32 // CHECK: llvm.return %[[VAL_4]] : i32 // CHECK: } spirv.func @non_uniform_imul(%arg0: i32) -> i32 "None" { @@ -65,7 +65,7 @@ spirv.func @non_uniform_imul(%arg0: i32) -> i32 "None" { // CHECK-SAME: %[[VAL_0:.*]]: f32) -> f32 { // CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(3 : i32) : i32 // CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(0 : i32) : i32 -// CHECK: %[[VAL_3:.*]] = llvm.call spir_funccc @_Z27__spirv_GroupNonUniformFMuliif(%[[VAL_1]], %[[VAL_2]], %[[VAL_0]]) {no_unwind, will_return} : (i32, i32, f32) -> f32 +// CHECK: %[[VAL_3:.*]] = llvm.call spir_funccc @_Z27__spirv_GroupNonUniformFMuliif(%[[VAL_1]], %[[VAL_2]], %[[VAL_0]]) {convergent, no_unwind, will_return} : (i32, i32, f32) -> f32 // CHECK: llvm.return %[[VAL_3]] : f32 // CHECK: } spirv.func @non_uniform_fmul(%arg0: f32) -> f32 "None" { @@ -77,7 +77,7 @@ spirv.func @non_uniform_fmul(%arg0: f32) -> f32 "None" { // CHECK-SAME: %[[VAL_0:.*]]: i32) -> i32 { // CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(3 : i32) : i32 // CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(0 : i32) : i32 -// CHECK: %[[VAL_3:.*]] = llvm.call spir_funccc @_Z27__spirv_GroupNonUniformSMiniij(%[[VAL_1]], %[[VAL_2]], %[[VAL_0]]) {no_unwind, will_return} : (i32, i32, i32) -> i32 +// CHECK: %[[VAL_3:.*]] = llvm.call spir_funccc @_Z27__spirv_GroupNonUniformSMiniij(%[[VAL_1]], %[[VAL_2]], %[[VAL_0]]) {convergent, no_unwind, will_return} : (i32, i32, i32) -> i32 // CHECK: llvm.return %[[VAL_3]] : i32 // CHECK: } spirv.func @non_uniform_smin(%arg0: i32) -> i32 "None" { @@ -89,7 +89,7 @@ spirv.func @non_uniform_smin(%arg0: i32) -> i32 "None" { // CHECK-SAME: %[[VAL_0:.*]]: i32) -> i32 { // CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(3 : i32) : i32 // CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(0 : i32) : i32 -// CHECK: %[[VAL_3:.*]] = llvm.call spir_funccc @_Z27__spirv_GroupNonUniformUMiniij(%[[VAL_1]], %[[VAL_2]], %[[VAL_0]]) {no_unwind, will_return} : (i32, i32, i32) -> i32 +// CHECK: %[[VAL_3:.*]] = llvm.call spir_funccc @_Z27__spirv_GroupNonUniformUMiniij(%[[VAL_1]], %[[VAL_2]], %[[VAL_0]]) {convergent, no_unwind, will_return} : (i32, i32, i32) -> i32 // CHECK: llvm.return %[[VAL_3]] : i32 // CHECK: } spirv.func @non_uniform_umin(%arg0: i32) -> i32 "None" { @@ -101,7 +101,7 @@ spirv.func @non_uniform_umin(%arg0: i32) -> i32 "None" { // CHECK-SAME: %[[VAL_0:.*]]: f32) -> f32 { // CHECK: %[[VAL_1:.*]] = 
llvm.mlir.constant(3 : i32) : i32 // CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(0 : i32) : i32 -// CHECK: %[[VAL_3:.*]] = llvm.call spir_funccc @_Z27__spirv_GroupNonUniformFMiniif(%[[VAL_1]], %[[VAL_2]], %[[VAL_0]]) {no_unwind, will_return} : (i32, i32, f32) -> f32 +// CHECK: %[[VAL_3:.*]] = llvm.call spir_funccc @_Z27__spirv_GroupNonUniformFMiniif(%[[VAL_1]], %[[VAL_2]], %[[VAL_0]]) {convergent, no_unwind, will_return} : (i32, i32, f32) -> f32 // CHECK: llvm.return %[[VAL_3]] : f32 // CHECK: } spirv.func @non_uniform_fmin(%arg0: f32) -> f32 "None" { @@ -114,7 +114,7 @@ spirv.func @non_uniform_fmin(%arg0: f32) -> f32 "None" { // CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(16 : i32) : i32 // CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(3 : i32) : i32 // CHECK: %[[VAL_3:.*]] = llvm.mlir.constant(3 : i32) : i32 -// CHECK: %[[VAL_4:.*]] = llvm.call spir_funccc @_Z27__spirv_GroupNonUniformFMiniifj(%[[VAL_2]], %[[VAL_3]], %[[VAL_0]], %[[VAL_1]]) {no_unwind, will_return} : (i32, i32, f32, i32) -> f32 +// CHECK: %[[VAL_4:.*]] = llvm.call spir_funccc @_Z27__spirv_GroupNonUniformFMiniifj(%[[VAL_2]], %[[VAL_3]], %[[VAL_0]], %[[VAL_1]]) {convergent, no_unwind, will_return} : (i32, i32, f32, i32) -> f32 // CHECK: llvm.return %[[VAL_4]] : f32 // CHECK: } spirv.func @non_uniform_fmin_cluster(%arg0: f32) -> f32 "None" { @@ -127,7 +127,7 @@ spirv.func @non_uniform_fmin_cluster(%arg0: f32) -> f32 "None" { // CHECK-SAME: %[[VAL_0:.*]]: i32) -> i32 { // CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(3 : i32) : i32 // CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(0 : i32) : i32 -// CHECK: %[[VAL_3:.*]] = llvm.call spir_funccc @_Z27__spirv_GroupNonUniformSMaxiij(%[[VAL_1]], %[[VAL_2]], %[[VAL_0]]) {no_unwind, will_return} : (i32, i32, i32) -> i32 +// CHECK: %[[VAL_3:.*]] = llvm.call spir_funccc @_Z27__spirv_GroupNonUniformSMaxiij(%[[VAL_1]], %[[VAL_2]], %[[VAL_0]]) {convergent, no_unwind, will_return} : (i32, i32, i32) -> i32 // CHECK: llvm.return %[[VAL_3]] : i32 // CHECK: } spirv.func @non_uniform_smax(%arg0: i32) -> i32 "None" { @@ -139,7 +139,7 @@ spirv.func @non_uniform_smax(%arg0: i32) -> i32 "None" { // CHECK-SAME: %[[VAL_0:.*]]: i32) -> i32 { // CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(3 : i32) : i32 // CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(0 : i32) : i32 -// CHECK: %[[VAL_3:.*]] = llvm.call spir_funccc @_Z27__spirv_GroupNonUniformUMaxiij(%[[VAL_1]], %[[VAL_2]], %[[VAL_0]]) {no_unwind, will_return} : (i32, i32, i32) -> i32 +// CHECK: %[[VAL_3:.*]] = llvm.call spir_funccc @_Z27__spirv_GroupNonUniformUMaxiij(%[[VAL_1]], %[[VAL_2]], %[[VAL_0]]) {convergent, no_unwind, will_return} : (i32, i32, i32) -> i32 // CHECK: llvm.return %[[VAL_3]] : i32 // CHECK: } spirv.func @non_uniform_umax(%arg0: i32) -> i32 "None" { @@ -151,7 +151,7 @@ spirv.func @non_uniform_umax(%arg0: i32) -> i32 "None" { // CHECK-SAME: %[[VAL_0:.*]]: f32) -> f32 { // CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(3 : i32) : i32 // CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(0 : i32) : i32 -// CHECK: %[[VAL_3:.*]] = llvm.call spir_funccc @_Z27__spirv_GroupNonUniformFMaxiif(%[[VAL_1]], %[[VAL_2]], %[[VAL_0]]) {no_unwind, will_return} : (i32, i32, f32) -> f32 +// CHECK: %[[VAL_3:.*]] = llvm.call spir_funccc @_Z27__spirv_GroupNonUniformFMaxiif(%[[VAL_1]], %[[VAL_2]], %[[VAL_0]]) {convergent, no_unwind, will_return} : (i32, i32, f32) -> f32 // CHECK: llvm.return %[[VAL_3]] : f32 // CHECK: } spirv.func @non_uniform_fmax(%arg0: f32) -> f32 "None" { @@ -164,7 +164,7 @@ spirv.func @non_uniform_fmax(%arg0: f32) -> f32 "None" { // CHECK: %[[VAL_1:.*]] = 
llvm.mlir.constant(16 : i32) : i32 // CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(3 : i32) : i32 // CHECK: %[[VAL_3:.*]] = llvm.mlir.constant(3 : i32) : i32 -// CHECK: %[[VAL_4:.*]] = llvm.call spir_funccc @_Z27__spirv_GroupNonUniformSMaxiijj(%[[VAL_2]], %[[VAL_3]], %[[VAL_0]], %[[VAL_1]]) {no_unwind, will_return} : (i32, i32, i32, i32) -> i32 +// CHECK: %[[VAL_4:.*]] = llvm.call spir_funccc @_Z27__spirv_GroupNonUniformSMaxiijj(%[[VAL_2]], %[[VAL_3]], %[[VAL_0]], %[[VAL_1]]) {convergent, no_unwind, will_return} : (i32, i32, i32, i32) -> i32 // CHECK: llvm.return %[[VAL_4]] : i32 // CHECK: } spirv.func @non_uniform_smax_cluster(%arg0: i32) -> i32 "None" { @@ -177,7 +177,7 @@ spirv.func @non_uniform_smax_cluster(%arg0: i32) -> i32 "None" { // CHECK-SAME: %[[VAL_0:.*]]: i32) -> i32 { // CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(3 : i32) : i32 // CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(0 : i32) : i32 -// CHECK: %[[VAL_3:.*]] = llvm.call spir_funccc @_Z33__spirv_GroupNonUniformBitwiseAndiij(%[[VAL_1]], %[[VAL_2]], %[[VAL_0]]) {no_unwind, will_return} : (i32, i32, i32) -> i32 +// CHECK: %[[VAL_3:.*]] = llvm.call spir_funccc @_Z33__spirv_GroupNonUniformBitwiseAndiij(%[[VAL_1]], %[[VAL_2]], %[[VAL_0]]) {convergent, no_unwind, will_return} : (i32, i32, i32) -> i32 // CHECK: llvm.return %[[VAL_3]] : i32 // CHECK: } spirv.func @non_uniform_bitwise_and(%arg0: i32) -> i32 "None" { @@ -189,7 +189,7 @@ spirv.func @non_uniform_bitwise_and(%arg0: i32) -> i32 "None" { // CHECK-SAME: %[[VAL_0:.*]]: i32) -> i32 { // CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(3 : i32) : i32 // CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(0 : i32) : i32 -// CHECK: %[[VAL_3:.*]] = llvm.call spir_funccc @_Z32__spirv_GroupNonUniformBitwiseOriij(%[[VAL_1]], %[[VAL_2]], %[[VAL_0]]) {no_unwind, will_return} : (i32, i32, i32) -> i32 +// CHECK: %[[VAL_3:.*]] = llvm.call spir_funccc @_Z32__spirv_GroupNonUniformBitwiseOriij(%[[VAL_1]], %[[VAL_2]], %[[VAL_0]]) {convergent, no_unwind, will_return} : (i32, i32, i32) -> i32 // CHECK: llvm.return %[[VAL_3]] : i32 // CHECK: } spirv.func @non_uniform_bitwise_or(%arg0: i32) -> i32 "None" { @@ -201,7 +201,7 @@ spirv.func @non_uniform_bitwise_or(%arg0: i32) -> i32 "None" { // CHECK-SAME: %[[VAL_0:.*]]: i32) -> i32 { // CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(3 : i32) : i32 // CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(0 : i32) : i32 -// CHECK: %[[VAL_3:.*]] = llvm.call spir_funccc @_Z33__spirv_GroupNonUniformBitwiseXoriij(%[[VAL_1]], %[[VAL_2]], %[[VAL_0]]) {no_unwind, will_return} : (i32, i32, i32) -> i32 +// CHECK: %[[VAL_3:.*]] = llvm.call spir_funccc @_Z33__spirv_GroupNonUniformBitwiseXoriij(%[[VAL_1]], %[[VAL_2]], %[[VAL_0]]) {convergent, no_unwind, will_return} : (i32, i32, i32) -> i32 // CHECK: llvm.return %[[VAL_3]] : i32 // CHECK: } spirv.func @non_uniform_bitwise_xor(%arg0: i32) -> i32 "None" { @@ -213,7 +213,7 @@ spirv.func @non_uniform_bitwise_xor(%arg0: i32) -> i32 "None" { // CHECK-SAME: %[[VAL_0:.*]]: i1) -> i1 { // CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(3 : i32) : i32 // CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(0 : i32) : i32 -// CHECK: %[[VAL_3:.*]] = llvm.call spir_funccc @_Z33__spirv_GroupNonUniformLogicalAndiib(%[[VAL_1]], %[[VAL_2]], %[[VAL_0]]) {no_unwind, will_return} : (i32, i32, i1) -> i1 +// CHECK: %[[VAL_3:.*]] = llvm.call spir_funccc @_Z33__spirv_GroupNonUniformLogicalAndiib(%[[VAL_1]], %[[VAL_2]], %[[VAL_0]]) {convergent, no_unwind, will_return} : (i32, i32, i1) -> i1 // CHECK: llvm.return %[[VAL_3]] : i1 // CHECK: } spirv.func @non_uniform_logical_and(%arg0: i1) -> 
i1 "None" { @@ -225,7 +225,7 @@ spirv.func @non_uniform_logical_and(%arg0: i1) -> i1 "None" { // CHECK-SAME: %[[VAL_0:.*]]: i1) -> i1 { // CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(3 : i32) : i32 // CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(0 : i32) : i32 -// CHECK: %[[VAL_3:.*]] = llvm.call spir_funccc @_Z32__spirv_GroupNonUniformLogicalOriib(%[[VAL_1]], %[[VAL_2]], %[[VAL_0]]) {no_unwind, will_return} : (i32, i32, i1) -> i1 +// CHECK: %[[VAL_3:.*]] = llvm.call spir_funccc @_Z32__spirv_GroupNonUniformLogicalOriib(%[[VAL_1]], %[[VAL_2]], %[[VAL_0]]) {convergent, no_unwind, will_return} : (i32, i32, i1) -> i1 // CHECK: llvm.return %[[VAL_3]] : i1 // CHECK: } spirv.func @non_uniform_logical_or(%arg0: i1) -> i1 "None" { @@ -237,7 +237,7 @@ spirv.func @non_uniform_logical_or(%arg0: i1) -> i1 "None" { // CHECK-SAME: %[[VAL_0:.*]]: i1) -> i1 { // CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(3 : i32) : i32 // CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(0 : i32) : i32 -// CHECK: %[[VAL_3:.*]] = llvm.call spir_funccc @_Z33__spirv_GroupNonUniformLogicalXoriib(%[[VAL_1]], %[[VAL_2]], %[[VAL_0]]) {no_unwind, will_return} : (i32, i32, i1) -> i1 +// CHECK: %[[VAL_3:.*]] = llvm.call spir_funccc @_Z33__spirv_GroupNonUniformLogicalXoriib(%[[VAL_1]], %[[VAL_2]], %[[VAL_0]]) {convergent, no_unwind, will_return} : (i32, i32, i1) -> i1 // CHECK: llvm.return %[[VAL_3]] : i1 // CHECK: } spirv.func @non_uniform_logical_xor(%arg0: i1) -> i1 "None" { From 05dfbc146d87866f0ef22dc88f729b5b9fdfe1a0 Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Fri, 10 Jan 2025 09:23:52 +0100 Subject: [PATCH 032/408] [lldb] Regularize DWARFDIE::Get{TypeLookup,Decl}Context names (#122273) The functions call GetName for everything except variables, where they call GetPubname instead. The difference is that the latter prefers to return the linkage name, if it is available. This doesn't seem particularly useful given that the linkage name already kind of contains the context of the variable, and I doubt that anything depends on it as these functions are currently called on type and subprogram DIEs -- not variables. This makes it easier to simplify/deduplicate these functions later. 
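To make the difference concrete, here is an illustrative sketch (the variable and its mangled name are hypothetical, not taken from this patch):

    // A variable DIE carrying both attributes:
    //   DW_AT_name:         "g_var"
    //   DW_AT_linkage_name: "_ZN2ns5g_varE"
    die.GetName();    // returns "g_var"
    die.GetPubname(); // prefers the linkage name: "_ZN2ns5g_varE"

After this change, both context-building functions use GetName() for variable DIEs as well.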
--- lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp index 96b13efe58351..4b864b549f8ce 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp @@ -412,7 +412,7 @@ static void GetDeclContextImpl(DWARFDIE die, push_ctx(CompilerContextKind::Function, die.GetName()); break; case DW_TAG_variable: - push_ctx(CompilerContextKind::Variable, die.GetPubname()); + push_ctx(CompilerContextKind::Variable, die.GetName()); break; case DW_TAG_typedef: push_ctx(CompilerContextKind::Typedef, die.GetName()); @@ -457,7 +457,7 @@ static void GetTypeLookupContextImpl(DWARFDIE die, push_ctx(CompilerContextKind::Enum, die.GetName()); break; case DW_TAG_variable: - push_ctx(CompilerContextKind::Variable, die.GetPubname()); + push_ctx(CompilerContextKind::Variable, die.GetName()); break; case DW_TAG_typedef: push_ctx(CompilerContextKind::Typedef, die.GetName()); From 6504546abcd38159256c3030286b1c02b401c4f8 Mon Sep 17 00:00:00 2001 From: maflcko <6399679+maflcko@users.noreply.github.com> Date: Fri, 10 Jan 2025 09:24:24 +0100 Subject: [PATCH 033/408] [clang-tidy][use-internal-linkage] fix false positive for consteval function (#122141) Fixes https://github.com/llvm/llvm-project/issues/122096 --------- Co-authored-by: MarcoFalke <*~=`'#}+{/-|&$^_@721217.xyz> --- .../clang-tidy/misc/UseInternalLinkageCheck.cpp | 2 +- clang-tools-extra/docs/ReleaseNotes.rst | 6 +++--- .../checkers/misc/use-internal-linkage-consteval.cpp | 7 +++++++ 3 files changed, 11 insertions(+), 4 deletions(-) create mode 100644 clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-consteval.cpp diff --git a/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp b/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp index 1e0f398a4a560..4778182944abd 100644 --- a/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp @@ -125,7 +125,7 @@ void UseInternalLinkageCheck::registerMatchers(MatchFinder *Finder) { exportDecl())))))); Finder->addMatcher( functionDecl(Common, hasBody(), - unless(anyOf(cxxMethodDecl(), + unless(anyOf(cxxMethodDecl(), isConsteval(), isAllocationOrDeallocationOverloadedFunction(), isMain()))) .bind("fn"), diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 19c59f5b32eed..1a7d48a0b4dc7 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -286,9 +286,9 @@ Changes in existing checks - Improved :doc:`misc-use-internal-linkage ` check to insert ``static`` - keyword before type qualifiers such as ``const`` and ``volatile`` and fix - false positives for function declaration without body and fix false positives - for C++20 export declarations and fix false positives for global scoped + keyword before type qualifiers such as ``const`` and ``volatile``. Also, fix + false positives for function declaration without body, C++20 consteval + functions, C++20 export declarations, and global scoped overloaded ``operator new`` and ``operator delete``. 
- Improved :doc:`modernize-avoid-c-arrays diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-consteval.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-consteval.cpp new file mode 100644 index 0000000000000..62c9818e07c4f --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-consteval.cpp @@ -0,0 +1,7 @@ +// RUN: %check_clang_tidy -std=c++20 %s misc-use-internal-linkage %t -- -- -I%S/Inputs/use-internal-linkage + +consteval void gh122096() {} + +constexpr void cxf() {} +// CHECK-MESSAGES: :[[@LINE-1]]:16: warning: function 'cxf' +// CHECK-FIXES: static constexpr void cxf() {} From 86b1b0671cafd462c0aa681e2d320ce597300f69 Mon Sep 17 00:00:00 2001 From: Guy David <49722543+guy-david@users.noreply.github.com> Date: Fri, 10 Jan 2025 10:33:02 +0200 Subject: [PATCH 034/408] MachineVerifier: Check stack protector is top-most in frame (#121481) Somewhat paranoid, but mitigates potential bugs in the future that might place it elsewhere and render the mechanism useless. --- llvm/lib/CodeGen/MachineVerifier.cpp | 52 ++++++++++++++- .../stack-protector-offset.mir | 63 +++++++++++++++++++ 2 files changed, 114 insertions(+), 1 deletion(-) create mode 100644 llvm/test/MachineVerifier/stack-protector-offset.mir diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index bec36b728ae32..2558799c19f4d 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -353,6 +353,8 @@ struct MachineVerifier { LaneBitmask LaneMask = LaneBitmask::getNone()); void verifyStackFrame(); + // Check that the stack protector is the top-most object in the stack. + void verifyStackProtector(); void verifySlotIndexes() const; void verifyProperties(const MachineFunction &MF); @@ -709,8 +711,10 @@ void MachineVerifier::visitMachineFunctionBefore() { // Check that the register use lists are sane. MRI->verifyUseLists(); - if (!MF->empty()) + if (!MF->empty()) { verifyStackFrame(); + verifyStackProtector(); + } } void @@ -4038,3 +4042,49 @@ void MachineVerifier::verifyStackFrame() { } } } + +void MachineVerifier::verifyStackProtector() { + const MachineFrameInfo &MFI = MF->getFrameInfo(); + if (!MFI.hasStackProtectorIndex()) + return; + // Only applicable when the offsets of frame objects have been determined, + // which is indicated by a non-zero stack size. + if (!MFI.getStackSize()) + return; + const TargetFrameLowering &TFI = *MF->getSubtarget().getFrameLowering(); + bool StackGrowsDown = + TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown; + // Collect the frame indices of the callee-saved registers which are spilled + // to the stack. These are the registers that are stored above the stack + // protector. + SmallSet CalleeSavedFrameIndices; + if (MFI.isCalleeSavedInfoValid()) { + for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo()) { + if (!Info.isSpilledToReg()) + CalleeSavedFrameIndices.insert(Info.getFrameIdx()); + } + } + unsigned FI = MFI.getStackProtectorIndex(); + int64_t SPStart = MFI.getObjectOffset(FI); + int64_t SPEnd = SPStart + MFI.getObjectSize(FI); + for (unsigned I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) { + if (I == FI) + continue; + // Variable-sized objects do not have a fixed offset. 
+    if (MFI.isVariableSizedObjectIndex(I))
+      continue;
+    if (CalleeSavedFrameIndices.contains(I))
+      continue;
+    int64_t ObjStart = MFI.getObjectOffset(I);
+    int64_t ObjEnd = ObjStart + MFI.getObjectSize(I);
+    if (SPStart < ObjEnd && ObjStart < SPEnd) {
+      report("Stack protector overlaps with another stack object", MF);
+      break;
+    }
+    if ((StackGrowsDown && SPStart <= ObjStart) ||
+        (!StackGrowsDown && SPStart >= ObjStart)) {
+      report("Stack protector is not the top-most object on the stack", MF);
+      break;
+    }
+  }
+}
diff --git a/llvm/test/MachineVerifier/stack-protector-offset.mir b/llvm/test/MachineVerifier/stack-protector-offset.mir
new file mode 100644
index 0000000000000..47008e1b12354
--- /dev/null
+++ b/llvm/test/MachineVerifier/stack-protector-offset.mir
@@ -0,0 +1,63 @@
+# REQUIRES: aarch64-registered-target, amdgpu-registered-target
+
+# RUN: split-file %s %t
+
+# RUN: llc -mtriple=aarch64 -run-pass=none -o - %t/valid.mir
+# RUN: not --crash llc -mtriple=aarch64 -run-pass=none -o - %t/lower.mir 2>&1 | FileCheck %t/lower.mir
+# RUN: not --crash llc -mtriple=aarch64 -run-pass=none -o - %t/overlap.mir 2>&1 | FileCheck %t/overlap.mir
+# RUN: not --crash llc -mtriple=amdgcn -run-pass=none -o - %t/higher.mir 2>&1 | FileCheck %t/higher.mir
+
+;--- valid.mir
+---
+name: valid
+frameInfo:
+  stackSize: 16
+  stackProtector: '%stack.1'
+stack:
+  - { id: 0, offset: -24, size: 8, alignment: 8, stack-id: default }
+  - { id: 1, offset: -16, size: 8, alignment: 8, stack-id: default }
+body: |
+  bb.0:
+...
+
+;--- lower.mir
+# CHECK: *** Bad machine code: Stack protector is not the top-most object on the stack ***
+---
+name: lower
+frameInfo:
+  stackSize: 16
+  stackProtector: '%stack.1'
+stack:
+  - { id: 0, offset: -16, size: 8, alignment: 8, stack-id: default }
+  - { id: 1, offset: -24, size: 8, alignment: 8, stack-id: default }
+body: |
+  bb.0:
+...
+
+;--- overlap.mir
+# CHECK: *** Bad machine code: Stack protector overlaps with another stack object ***
+---
+name: overlap
+frameInfo:
+  stackSize: 16
+  stackProtector: '%stack.1'
+stack:
+  - { id: 0, offset: -20, size: 8, alignment: 4, stack-id: default }
+  - { id: 1, offset: -16, size: 8, alignment: 8, stack-id: default }
+body: |
+  bb.0:
+...
+
+;--- higher.mir
+# CHECK: *** Bad machine code: Stack protector is not the top-most object on the stack ***
+---
+name: higher
+frameInfo:
+  stackSize: 16
+  stackProtector: '%stack.1'
+stack:
+  - { id: 0, offset: 16, size: 8, alignment: 8, stack-id: default }
+  - { id: 1, offset: 24, size: 8, alignment: 8, stack-id: default }
+body: |
+  bb.0:
+...

From 66a88f62cd56e55b5fa0ddb1bdffa549f7565f8f Mon Sep 17 00:00:00 2001
From: Pavel Labath
Date: Fri, 10 Jan 2025 09:56:55 +0100
Subject: [PATCH 035/408] [lldb] Add Function::GetAddress and redirect some uses (#115836)

Many calls to Function::GetAddressRange() were not interested in the range
itself. Instead they wanted to find the address of the function (its entry
point) or the base address for relocation of function-scoped entities
(technically, the two don't need to be the same, but there isn't a good
reason for them not to be). This PR creates a separate function for
retrieving this, and changes the existing (non-controversial) uses to call
that instead.
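As a sketch, a call site that only needs the entry point changes from

    Address addr = sc.function->GetAddressRange().GetBaseAddress();

to

    Address addr = sc.function->GetAddress();

(the variable name is illustrative; the actual call-site updates are in the diff below).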
--- lldb/include/lldb/Symbol/Function.h | 8 ++++++++ lldb/source/API/SBBlock.cpp | 3 +-- lldb/source/API/SBFunction.cpp | 3 +-- lldb/source/Breakpoint/BreakpointResolver.cpp | 2 +- lldb/source/Breakpoint/BreakpointResolverName.cpp | 2 +- lldb/source/Core/SearchFilter.cpp | 2 +- lldb/source/Expression/DWARFExpressionList.cpp | 3 +-- lldb/source/Expression/IRExecutionUnit.cpp | 3 +-- .../Architecture/Mips/ArchitectureMips.cpp | 2 +- .../MacOSX-DYLD/DynamicLoaderDarwin.cpp | 6 ++---- .../POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp | 3 +-- .../Clang/ClangExpressionDeclMap.cpp | 2 +- .../SymbolFile/Breakpad/SymbolFileBreakpad.cpp | 4 ++-- .../SymbolFile/DWARF/SymbolFileDWARFDebugMap.cpp | 3 +-- .../SymbolFile/NativePDB/SymbolFileNativePDB.cpp | 12 ++++-------- lldb/source/Symbol/Block.cpp | 14 ++++++-------- lldb/source/Symbol/Function.cpp | 15 +++++++-------- lldb/source/Symbol/SymbolContext.cpp | 10 +++------- lldb/source/Symbol/Variable.cpp | 8 +++----- lldb/source/Target/StackFrame.cpp | 3 +-- lldb/source/Target/ThreadPlanStepInRange.cpp | 2 +- lldb/source/ValueObject/ValueObjectVariable.cpp | 3 +-- 22 files changed, 49 insertions(+), 64 deletions(-) diff --git a/lldb/include/lldb/Symbol/Function.h b/lldb/include/lldb/Symbol/Function.h index e4118c1f9be86..157c007bdf0e8 100644 --- a/lldb/include/lldb/Symbol/Function.h +++ b/lldb/include/lldb/Symbol/Function.h @@ -449,6 +449,11 @@ class Function : public UserID, public SymbolContextScope { AddressRanges GetAddressRanges() { return m_block.GetRanges(); } + /// Return the address of the function (its entry point). This address is also + /// used as a base address for relocation of function-scope entities (blocks + /// and variables). + const Address &GetAddress() const { return m_address; } + lldb::LanguageType GetLanguage() const; /// Find the file and line number of the source location of the start of the /// function. This will use the declaration if present and fall back on the @@ -658,6 +663,9 @@ class Function : public UserID, public SymbolContextScope { /// include addresses belonging to other functions. AddressRange m_range; + /// The address (entry point) of the function. + Address m_address; + /// The frame base expression for variables that are relative to the frame /// pointer. 
DWARFExpressionList m_frame_base; diff --git a/lldb/source/API/SBBlock.cpp b/lldb/source/API/SBBlock.cpp index b921ccd980245..2ef4cc7227cf9 100644 --- a/lldb/source/API/SBBlock.cpp +++ b/lldb/source/API/SBBlock.cpp @@ -176,8 +176,7 @@ bool SBBlock::GetDescription(SBStream &description) { m_opaque_ptr->CalculateSymbolContext(&sc); if (sc.function) { m_opaque_ptr->DumpAddressRanges( - &strm, - sc.function->GetAddressRange().GetBaseAddress().GetFileAddress()); + &strm, sc.function->GetAddress().GetFileAddress()); } } else strm.PutCString("No value"); diff --git a/lldb/source/API/SBFunction.cpp b/lldb/source/API/SBFunction.cpp index 3f6b4eea98318..414eccc357c0e 100644 --- a/lldb/source/API/SBFunction.cpp +++ b/lldb/source/API/SBFunction.cpp @@ -120,8 +120,7 @@ SBInstructionList SBFunction::GetInstructions(SBTarget target, if (m_opaque_ptr) { TargetSP target_sp(target.GetSP()); std::unique_lock lock; - ModuleSP module_sp( - m_opaque_ptr->GetAddressRange().GetBaseAddress().GetModule()); + ModuleSP module_sp(m_opaque_ptr->GetAddress().GetModule()); if (target_sp && module_sp) { lock = std::unique_lock(target_sp->GetAPIMutex()); const bool force_live_memory = true; diff --git a/lldb/source/Breakpoint/BreakpointResolver.cpp b/lldb/source/Breakpoint/BreakpointResolver.cpp index 9643602d78c75..5fe544908c39e 100644 --- a/lldb/source/Breakpoint/BreakpointResolver.cpp +++ b/lldb/source/Breakpoint/BreakpointResolver.cpp @@ -325,7 +325,7 @@ void BreakpointResolver::AddLocation(SearchFilter &filter, // If the line number is before the prologue end, move it there... bool skipped_prologue = false; if (skip_prologue && sc.function) { - Address prologue_addr(sc.function->GetAddressRange().GetBaseAddress()); + Address prologue_addr = sc.function->GetAddress(); if (prologue_addr.IsValid() && (line_start == prologue_addr)) { const uint32_t prologue_byte_size = sc.function->GetPrologueByteSize(); if (prologue_byte_size) { diff --git a/lldb/source/Breakpoint/BreakpointResolverName.cpp b/lldb/source/Breakpoint/BreakpointResolverName.cpp index 264638eb836dc..b9c8a1635d844 100644 --- a/lldb/source/Breakpoint/BreakpointResolverName.cpp +++ b/lldb/source/Breakpoint/BreakpointResolverName.cpp @@ -339,7 +339,7 @@ BreakpointResolverName::SearchCallback(SearchFilter &filter, if (!sc.block->GetStartAddress(break_addr)) break_addr.Clear(); } else if (sc.function) { - break_addr = sc.function->GetAddressRange().GetBaseAddress(); + break_addr = sc.function->GetAddress(); if (m_skip_prologue && break_addr.IsValid()) { const uint32_t prologue_byte_size = sc.function->GetPrologueByteSize(); if (prologue_byte_size) diff --git a/lldb/source/Core/SearchFilter.cpp b/lldb/source/Core/SearchFilter.cpp index 8027a62414bac..6cd00af64870a 100644 --- a/lldb/source/Core/SearchFilter.cpp +++ b/lldb/source/Core/SearchFilter.cpp @@ -152,7 +152,7 @@ bool SearchFilter::FunctionPasses(Function &function) { // This is a slightly cheesy job, but since we don't have finer grained // filters yet, just checking that the start address passes is probably // good enough for the base class behavior. 
- Address addr = function.GetAddressRange().GetBaseAddress(); + Address addr = function.GetAddress(); return AddressPasses(addr); } diff --git a/lldb/source/Expression/DWARFExpressionList.cpp b/lldb/source/Expression/DWARFExpressionList.cpp index 7a5cf9f9a0be4..be6c0a151159d 100644 --- a/lldb/source/Expression/DWARFExpressionList.cpp +++ b/lldb/source/Expression/DWARFExpressionList.cpp @@ -126,8 +126,7 @@ bool DWARFExpressionList::MatchesOperand( if (!sc.function) return false; - addr_t load_function_start = - sc.function->GetAddressRange().GetBaseAddress().GetFileAddress(); + addr_t load_function_start = sc.function->GetAddress().GetFileAddress(); if (load_function_start == LLDB_INVALID_ADDRESS) return false; diff --git a/lldb/source/Expression/IRExecutionUnit.cpp b/lldb/source/Expression/IRExecutionUnit.cpp index 9158b43b1c5c0..c8b4ddf705ec4 100644 --- a/lldb/source/Expression/IRExecutionUnit.cpp +++ b/lldb/source/Expression/IRExecutionUnit.cpp @@ -731,8 +731,7 @@ class LoadAddressResolver { // If that didn't work, try the function. if (load_address == LLDB_INVALID_ADDRESS && candidate_sc.function) { - Address addr = - candidate_sc.function->GetAddressRange().GetBaseAddress(); + Address addr = candidate_sc.function->GetAddress(); load_address = m_target->GetProcessSP() ? addr.GetLoadAddress(m_target) : addr.GetFileAddress(); } diff --git a/lldb/source/Plugins/Architecture/Mips/ArchitectureMips.cpp b/lldb/source/Plugins/Architecture/Mips/ArchitectureMips.cpp index 27ba10524141e..5aa903443c760 100644 --- a/lldb/source/Plugins/Architecture/Mips/ArchitectureMips.cpp +++ b/lldb/source/Plugins/Architecture/Mips/ArchitectureMips.cpp @@ -97,7 +97,7 @@ lldb::addr_t ArchitectureMips::GetBreakableLoadAddress(lldb::addr_t addr, resolve_scope, sc); Address sym_addr; if (sc.function) - sym_addr = sc.function->GetAddressRange().GetBaseAddress(); + sym_addr = sc.function->GetAddress(); else if (sc.symbol) sym_addr = sc.symbol->GetAddress(); diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp index 3659dfcd3c4ca..8949b14c57189 100644 --- a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp +++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp @@ -796,10 +796,8 @@ bool DynamicLoaderDarwin::AlwaysRelyOnEHUnwindInfo(SymbolContext &sym_ctx) { if (sym_ctx.symbol) { module_sp = sym_ctx.symbol->GetAddressRef().GetModule(); } - if (module_sp.get() == nullptr && sym_ctx.function) { - module_sp = - sym_ctx.function->GetAddressRange().GetBaseAddress().GetModule(); - } + if (module_sp.get() == nullptr && sym_ctx.function) + module_sp = sym_ctx.function->GetAddress().GetModule(); if (module_sp.get() == nullptr) return false; diff --git a/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp b/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp index 34aca50df0ac4..5614bc32468d5 100644 --- a/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp +++ b/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp @@ -856,8 +856,7 @@ bool DynamicLoaderPOSIXDYLD::AlwaysRelyOnEHUnwindInfo( if (sym_ctx.symbol) module_sp = sym_ctx.symbol->GetAddressRef().GetModule(); if (!module_sp && sym_ctx.function) - module_sp = - sym_ctx.function->GetAddressRange().GetBaseAddress().GetModule(); + module_sp = sym_ctx.function->GetAddress().GetModule(); if (!module_sp) return false; diff --git 
a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionDeclMap.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionDeclMap.cpp index bbe401b827f17..9e96f6557c7ba 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionDeclMap.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionDeclMap.cpp @@ -1883,7 +1883,7 @@ void ClangExpressionDeclMap::AddOneFunction(NameSearchContext &context, return; } - fun_address = function->GetAddressRange().GetBaseAddress(); + fun_address = function->GetAddress(); CompilerType copied_function_type = GuardedCopyType(function_clang_type); if (copied_function_type) { diff --git a/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.cpp b/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.cpp index bc886259d6fa5..45c8f121db2bc 100644 --- a/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.cpp +++ b/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.cpp @@ -302,7 +302,7 @@ size_t SymbolFileBreakpad::ParseBlocksRecursive(Function &func) { blocks.push_back(&func.GetBlock(false)); size_t blocks_added = 0; - addr_t func_base = func.GetAddressRange().GetBaseAddress().GetOffset(); + addr_t func_base = func.GetAddress().GetOffset(); CompUnitData &data = m_cu_data->GetEntryRef(comp_unit->GetID()).data; LineIterator It(*m_objfile_sp, Record::Func, data.bookmark), End(*m_objfile_sp); @@ -396,7 +396,7 @@ SymbolFileBreakpad::ResolveSymbolContext(const Address &so_addr, Block &block = func_sp->GetBlock(true); sc.block = block.FindInnermostBlockByOffset( so_addr.GetFileAddress() - - sc.function->GetAddressRange().GetBaseAddress().GetFileAddress()); + sc.function->GetAddress().GetFileAddress()); if (sc.block) result |= eSymbolContextBlock; } diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.cpp index cbe077bfbaef5..0ecf47a3c7869 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.cpp @@ -1061,8 +1061,7 @@ static void RemoveFunctionsWithModuleNotEqualTo(const ModuleSP &module_sp, SymbolContext sc; sc_list.GetContextAtIndex(i, sc); if (sc.function) { - const SectionSP section_sp( - sc.function->GetAddressRange().GetBaseAddress().GetSection()); + const SectionSP section_sp = sc.function->GetAddress().GetSection(); if (section_sp->GetModule() != module_sp) { sc_list.RemoveContextAtIndex(i); continue; diff --git a/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp b/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp index 27d51bbd1cb56..53c4c126658ba 100644 --- a/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp +++ b/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp @@ -416,8 +416,7 @@ Block *SymbolFileNativePDB::CreateBlock(PdbCompilandSymId block_id) { lldbassert(func); lldb::addr_t block_base = m_index->MakeVirtualAddress(block.Segment, block.CodeOffset); - lldb::addr_t func_base = - func->GetAddressRange().GetBaseAddress().GetFileAddress(); + lldb::addr_t func_base = func->GetAddress().GetFileAddress(); BlockSP child_block = parent_block->CreateChild(opaque_block_uid); if (block_base >= func_base) child_block->AddRange(Block::Range(block_base - func_base, block.CodeSize)); @@ -1113,8 +1112,7 @@ uint32_t SymbolFileNativePDB::ResolveSymbolContext( sc.function = GetOrCreateFunction(csid, *sc.comp_unit).get(); if (sc.function) { Block &block = sc.function->GetBlock(true); 
- addr_t func_base = - sc.function->GetAddressRange().GetBaseAddress().GetFileAddress(); + addr_t func_base = sc.function->GetAddress().GetFileAddress(); addr_t offset = file_addr - func_base; sc.block = block.FindInnermostBlockByOffset(offset); } @@ -1127,8 +1125,7 @@ uint32_t SymbolFileNativePDB::ResolveSymbolContext( sc.function = block->CalculateSymbolContextFunction(); if (sc.function) { sc.function->GetBlock(true); - addr_t func_base = - sc.function->GetAddressRange().GetBaseAddress().GetFileAddress(); + addr_t func_base = sc.function->GetAddress().GetFileAddress(); addr_t offset = file_addr - func_base; sc.block = block->FindInnermostBlockByOffset(offset); } @@ -1857,8 +1854,7 @@ VariableSP SymbolFileNativePDB::CreateLocalVariable(PdbCompilandSymId scope_id, // when lookuping local variables in this scope. if (!var_info.location.IsValid()) var_info.location = DWARFExpressionList(module, DWARFExpression(), nullptr); - var_info.location.SetFuncFileAddress( - func->GetAddressRange().GetBaseAddress().GetFileAddress()); + var_info.location.SetFuncFileAddress(func->GetAddress().GetFileAddress()); CompilandIndexItem *cii = m_index->compilands().GetCompiland(var_id.modi); CompUnitSP comp_unit_sp = GetOrCreateCompileUnit(*cii); TypeSP type_sp = GetOrCreateType(var_info.type); diff --git a/lldb/source/Symbol/Block.cpp b/lldb/source/Symbol/Block.cpp index 5cc87240f552a..139fa06d08fca 100644 --- a/lldb/source/Symbol/Block.cpp +++ b/lldb/source/Symbol/Block.cpp @@ -39,10 +39,9 @@ void Block::GetDescription(Stream *s, Function *function, addr_t base_addr = LLDB_INVALID_ADDRESS; if (target) - base_addr = - function->GetAddressRange().GetBaseAddress().GetLoadAddress(target); + base_addr = function->GetAddress().GetLoadAddress(target); if (base_addr == LLDB_INVALID_ADDRESS) - base_addr = function->GetAddressRange().GetBaseAddress().GetFileAddress(); + base_addr = function->GetAddress().GetFileAddress(); s->Printf(", range%s = ", num_ranges > 1 ? 
"s" : ""); for (size_t i = 0; i < num_ranges; ++i) { @@ -299,7 +298,7 @@ bool Block::GetRangeAtIndex(uint32_t range_idx, AddressRange &range) { Function *function = CalculateSymbolContextFunction(); if (function) { const Range &vm_range = m_ranges.GetEntryRef(range_idx); - range.GetBaseAddress() = function->GetAddressRange().GetBaseAddress(); + range.GetBaseAddress() = function->GetAddress(); range.GetBaseAddress().Slide(vm_range.GetRangeBase()); range.SetByteSize(vm_range.GetByteSize()); return true; @@ -317,7 +316,7 @@ AddressRanges Block::GetRanges() { ranges.emplace_back(); auto &range = ranges.back(); const Range &vm_range = m_ranges.GetEntryRef(i); - range.GetBaseAddress() = function->GetAddressRange().GetBaseAddress(); + range.GetBaseAddress() = function->GetAddress(); range.GetBaseAddress().Slide(vm_range.GetRangeBase()); range.SetByteSize(vm_range.GetByteSize()); } @@ -330,7 +329,7 @@ bool Block::GetStartAddress(Address &addr) { Function *function = CalculateSymbolContextFunction(); if (function) { - addr = function->GetAddressRange().GetBaseAddress(); + addr = function->GetAddress(); addr.Slide(m_ranges.GetEntryRef(0).GetRangeBase()); return true; } @@ -349,8 +348,7 @@ void Block::AddRange(const Range &range) { if (log) { ModuleSP module_sp(m_parent_scope.CalculateSymbolContextModule()); Function *function = m_parent_scope.CalculateSymbolContextFunction(); - const addr_t function_file_addr = - function->GetAddressRange().GetBaseAddress().GetFileAddress(); + const addr_t function_file_addr = function->GetAddress().GetFileAddress(); const addr_t block_start_addr = function_file_addr + range.GetRangeBase(); const addr_t block_end_addr = function_file_addr + range.GetRangeEnd(); Type *func_type = function->GetType(); diff --git a/lldb/source/Symbol/Function.cpp b/lldb/source/Symbol/Function.cpp index 4f07b946353a4..c9523281dc565 100644 --- a/lldb/source/Symbol/Function.cpp +++ b/lldb/source/Symbol/Function.cpp @@ -133,7 +133,7 @@ lldb::addr_t CallEdge::GetLoadAddress(lldb::addr_t unresolved_pc, Function &caller, Target &target) { Log *log = GetLog(LLDBLog::Step); - const Address &caller_start_addr = caller.GetAddressRange().GetBaseAddress(); + const Address &caller_start_addr = caller.GetAddress(); ModuleSP caller_module_sp = caller_start_addr.GetModule(); if (!caller_module_sp) { @@ -279,9 +279,10 @@ Function::Function(CompileUnit *comp_unit, lldb::user_id_t func_uid, AddressRanges ranges) : UserID(func_uid), m_comp_unit(comp_unit), m_type_uid(type_uid), m_type(type), m_mangled(mangled), m_block(*this, func_uid), - m_range(CollapseRanges(ranges)), m_prologue_byte_size(0) { + m_range(CollapseRanges(ranges)), m_address(m_range.GetBaseAddress()), + m_prologue_byte_size(0) { assert(comp_unit != nullptr); - lldb::addr_t base_file_addr = m_range.GetBaseAddress().GetFileAddress(); + lldb::addr_t base_file_addr = m_address.GetFileAddress(); for (const AddressRange &range : ranges) m_block.AddRange( Block::Range(range.GetBaseAddress().GetFileAddress() - base_file_addr, @@ -312,8 +313,7 @@ void Function::GetStartLineSourceInfo(SupportFileSP &source_file_sp, return; LineEntry line_entry; - if (line_table->FindLineEntryByAddress(GetAddressRange().GetBaseAddress(), - line_entry, nullptr)) { + if (line_table->FindLineEntryByAddress(GetAddress(), line_entry, nullptr)) { line_no = line_entry.line; source_file_sp = line_entry.file_sp; } @@ -484,7 +484,7 @@ Function *Function::CalculateSymbolContextFunction() { return this; } lldb::DisassemblerSP Function::GetInstructions(const ExecutionContext 
&exe_ctx, const char *flavor, bool prefer_file_cache) { - ModuleSP module_sp(GetAddressRange().GetBaseAddress().GetModule()); + ModuleSP module_sp = GetAddress().GetModule(); if (module_sp && exe_ctx.HasTargetScope()) { return Disassembler::DisassembleRange( module_sp->GetArchitecture(), nullptr, nullptr, nullptr, flavor, @@ -601,8 +601,7 @@ uint32_t Function::GetPrologueByteSize() { if (line_table) { LineEntry first_line_entry; uint32_t first_line_entry_idx = UINT32_MAX; - if (line_table->FindLineEntryByAddress(GetAddressRange().GetBaseAddress(), - first_line_entry, + if (line_table->FindLineEntryByAddress(GetAddress(), first_line_entry, &first_line_entry_idx)) { // Make sure the first line entry isn't already the end of the prologue addr_t prologue_end_file_addr = LLDB_INVALID_ADDRESS; diff --git a/lldb/source/Symbol/SymbolContext.cpp b/lldb/source/Symbol/SymbolContext.cpp index 1a291ca3c0ea7..19b6ff6a5302b 100644 --- a/lldb/source/Symbol/SymbolContext.cpp +++ b/lldb/source/Symbol/SymbolContext.cpp @@ -105,8 +105,7 @@ bool SymbolContext::DumpStopContext( if (addr_t file_addr = addr.GetFileAddress(); file_addr != LLDB_INVALID_ADDRESS) { const addr_t function_offset = - file_addr - - function->GetAddressRange().GetBaseAddress().GetFileAddress(); + file_addr - function->GetAddress().GetFileAddress(); if (!show_function_name) { // Print +offset even if offset is 0 dumped_something = true; @@ -700,9 +699,7 @@ LineEntry SymbolContext::GetFunctionStartLineEntry() const { } if (function) { - if (function->GetAddressRange() - .GetBaseAddress() - .CalculateSymbolContextLineEntry(line_entry)) + if (function->GetAddress().CalculateSymbolContextLineEntry(line_entry)) return line_entry; } return LineEntry(); @@ -1228,8 +1225,7 @@ bool SymbolContextList::AppendIfUnique(const SymbolContext &sc, continue; if (pos->function) { - if (pos->function->GetAddressRange().GetBaseAddress() == - sc.symbol->GetAddressRef()) { + if (pos->function->GetAddress() == sc.symbol->GetAddressRef()) { // Do we already have a function with this symbol? if (pos->symbol == sc.symbol) return false; diff --git a/lldb/source/Symbol/Variable.cpp b/lldb/source/Symbol/Variable.cpp index a63e4f973537f..8244725aba545 100644 --- a/lldb/source/Symbol/Variable.cpp +++ b/lldb/source/Symbol/Variable.cpp @@ -221,8 +221,7 @@ bool Variable::LocationIsValidForFrame(StackFrame *frame) { TargetSP target_sp(frame->CalculateTarget()); addr_t loclist_base_load_addr = - function->GetAddressRange().GetBaseAddress().GetLoadAddress( - target_sp.get()); + function->GetAddress().GetLoadAddress(target_sp.get()); if (loclist_base_load_addr == LLDB_INVALID_ADDRESS) return false; // It is a location list. We just need to tell if the location list @@ -259,7 +258,7 @@ bool Variable::LocationIsValidForAddress(const Address &address) { if (sc.function) { addr_t loclist_base_file_addr = - sc.function->GetAddressRange().GetBaseAddress().GetFileAddress(); + sc.function->GetAddress().GetFileAddress(); if (loclist_base_file_addr == LLDB_INVALID_ADDRESS) return false; // It is a location list. 
We just need to tell if the location list @@ -450,8 +449,7 @@ bool Variable::DumpLocations(Stream *s, const Address &address) { const addr_t file_addr = address.GetFileAddress(); if (sc.function) { - addr_t loclist_base_file_addr = - sc.function->GetAddressRange().GetBaseAddress().GetFileAddress(); + addr_t loclist_base_file_addr = sc.function->GetAddress().GetFileAddress(); if (loclist_base_file_addr == LLDB_INVALID_ADDRESS) return false; return m_location_list.DumpLocations(s, eDescriptionLevelBrief, diff --git a/lldb/source/Target/StackFrame.cpp b/lldb/source/Target/StackFrame.cpp index 2633c976c13bf..4d068638f42b6 100644 --- a/lldb/source/Target/StackFrame.cpp +++ b/lldb/source/Target/StackFrame.cpp @@ -1121,8 +1121,7 @@ llvm::Error StackFrame::GetFrameBaseValue(Scalar &frame_base) { addr_t loclist_base_addr = LLDB_INVALID_ADDRESS; if (!m_sc.function->GetFrameBaseExpression().IsAlwaysValidSingleExpr()) loclist_base_addr = - m_sc.function->GetAddressRange().GetBaseAddress().GetLoadAddress( - exe_ctx.GetTargetPtr()); + m_sc.function->GetAddress().GetLoadAddress(exe_ctx.GetTargetPtr()); llvm::Expected expr_value = m_sc.function->GetFrameBaseExpression().Evaluate( diff --git a/lldb/source/Target/ThreadPlanStepInRange.cpp b/lldb/source/Target/ThreadPlanStepInRange.cpp index 224a17d896ccf..4a2ede8b39728 100644 --- a/lldb/source/Target/ThreadPlanStepInRange.cpp +++ b/lldb/source/Target/ThreadPlanStepInRange.cpp @@ -250,7 +250,7 @@ bool ThreadPlanStepInRange::ShouldStop(Event *event_ptr) { eSymbolContextSymbol); if (sc.function) { - func_start_address = sc.function->GetAddressRange().GetBaseAddress(); + func_start_address = sc.function->GetAddress(); if (curr_addr == func_start_address.GetLoadAddress(&GetTarget())) bytes_to_skip = sc.function->GetPrologueByteSize(); } else if (sc.symbol) { diff --git a/lldb/source/ValueObject/ValueObjectVariable.cpp b/lldb/source/ValueObject/ValueObjectVariable.cpp index db664ce9a7a20..6a482b91ad4be 100644 --- a/lldb/source/ValueObject/ValueObjectVariable.cpp +++ b/lldb/source/ValueObject/ValueObjectVariable.cpp @@ -160,8 +160,7 @@ bool ValueObjectVariable::UpdateValue() { variable->CalculateSymbolContext(&sc); if (sc.function) loclist_base_load_addr = - sc.function->GetAddressRange().GetBaseAddress().GetLoadAddress( - target); + sc.function->GetAddress().GetLoadAddress(target); } Value old_value(m_value); llvm::Expected maybe_value = expr_list.Evaluate( From fd922c4b4f6bcb7043228b003ddf956131c6b4ea Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 10 Jan 2025 09:19:25 +0000 Subject: [PATCH 036/408] [CodeGen] Add const to getAddrModeArguments argument. NFC. (#122335) --- llvm/include/llvm/CodeGen/TargetLowering.h | 6 +++--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +- llvm/lib/Target/AMDGPU/SIISelLowering.h | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 3751aac4df8ea..ce58777655e06 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -2795,9 +2795,9 @@ class TargetLoweringBase { /// possible to be done in the address mode for that operand. This hook lets /// targets also pass back when this should be done on intrinsics which /// load/store. 
-  virtual bool getAddrModeArguments(IntrinsicInst * /*I*/,
-                                    SmallVectorImpl<Value *> &/*Ops*/,
-                                    Type *&/*AccessTy*/) const {
+  virtual bool getAddrModeArguments(const IntrinsicInst * /*I*/,
+                                    SmallVectorImpl<Value *> & /*Ops*/,
+                                    Type *& /*AccessTy*/) const {
     return false;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e057c665e39da..4a39443582444 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1486,7 +1486,7 @@ void SITargetLowering::CollectTargetIntrinsicOperands(
   }
 }
-bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
+bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
                                             SmallVectorImpl<Value *> &Ops,
                                             Type *&AccessTy) const {
   Value *Ptr = nullptr;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 27960a0940923..5c215f76552d9 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -315,9 +315,9 @@ class SITargetLowering final : public AMDGPUTargetLowering {
                                        SmallVectorImpl<SDValue> &Ops,
                                        SelectionDAG &DAG) const override;
-  bool getAddrModeArguments(IntrinsicInst * /*I*/,
-                            SmallVectorImpl<Value *> &/*Ops*/,
-                            Type *&/*AccessTy*/) const override;
+  bool getAddrModeArguments(const IntrinsicInst *I,
+                            SmallVectorImpl<Value *> &Ops,
+                            Type *&AccessTy) const override;
   bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const;
   bool isLegalGlobalAddressingMode(const AddrMode &AM) const;
From 46ca6dfb5f0783d68cd738501a26a1a9455ff74e Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Fri, 10 Jan 2025 16:21:53 +0700
Subject: [PATCH 037/408] AMDGPU: Add disjoint to or produced from lowering vector ops (#122424)

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4a39443582444..529d9ba17d4f6 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7458,7 +7458,8 @@ SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
       DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
   // 4. Get (2) and (3) ORed into the target vector.
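+  // Editorial note, inferred from the construction above: the operands from
+  // (2) and (3) are masked with BFM and ~BFM respectively, so they share no
+  // set bits and the OR below can carry the disjoint flag (it then behaves
+  // like an add).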
-  SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
+  SDValue BFI =
+      DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
   return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
 }
@@ -7666,7 +7667,8 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
     Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
     Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
-    SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
+    SDValue Or =
+        DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
     return DAG.getNode(ISD::BITCAST, SL, VT, Or);
   }
From 98e5962b7c9fee60b81164025dc17ab31f49f5b7 Mon Sep 17 00:00:00 2001
From: LiqinWeng
Date: Fri, 10 Jan 2025 17:22:51 +0800
Subject: [PATCH 038/408] [RISCV][CostModel] Add cost for fabs/fsqrt of type bf16/f16 (#118608)

---
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |  68 +++++--
 .../CostModel/RISCV/fp-min-max-abs.ll         | 164 +++++++++--------
 .../Analysis/CostModel/RISCV/fp-sqrt-pow.ll   | 174 ++++++++++--------
 3 files changed, 243 insertions(+), 163 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index a8f04f038f810..add82dc80c429 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -13,6 +13,7 @@
 #include "llvm/CodeGen/BasicTTIImpl.h"
 #include "llvm/CodeGen/CostTable.h"
 #include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/PatternMatch.h"
 #include <cmath>
@@ -1035,21 +1036,66 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     }
     break;
   }
-  case Intrinsic::fabs:
+  case Intrinsic::fabs: {
+    auto LT = getTypeLegalizationCost(RetTy);
+    if (ST->hasVInstructions() && LT.second.isVector()) {
+      // lui a0, 8
+      // addi a0, a0, -1
+      // vsetvli a1, zero, e16, m1, ta, ma
+      // vand.vx v8, v8, a0
+      // f16 with zvfhmin and bf16 with zvfbfmin
+      if (LT.second.getVectorElementType() == MVT::bf16 ||
+          (LT.second.getVectorElementType() == MVT::f16 &&
+           !ST->hasVInstructionsF16()))
+        return LT.first * getRISCVInstructionCost(RISCV::VAND_VX, LT.second,
+                                                  CostKind) +
+               2;
+      else
+        return LT.first *
+               getRISCVInstructionCost(RISCV::VFSGNJX_VV, LT.second, CostKind);
+    }
+    break;
+  }
   case Intrinsic::sqrt: {
     auto LT = getTypeLegalizationCost(RetTy);
-    // TODO: add f16/bf16, bf16 with zvfbfmin && f16 with zvfhmin
     if (ST->hasVInstructions() && LT.second.isVector()) {
-      unsigned Op;
-      switch (ICA.getID()) {
-      case Intrinsic::fabs:
-        Op = RISCV::VFSGNJX_VV;
-        break;
-      case Intrinsic::sqrt:
-        Op = RISCV::VFSQRT_V;
-        break;
+      SmallVector<unsigned> ConvOp;
+      SmallVector<unsigned> FsqrtOp;
+      MVT ConvType = LT.second;
+      MVT FsqrtType = LT.second;
+      // f16 with zvfhmin and bf16 with zvfbfmin and the type of nxv32[b]f16
+      // will be split.
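+      // Editorial note: for these types the sqrt is costed as a widening
+      // convert to f32, an f32 vfsqrt.v, and a narrowing convert back; a
+      // widened nxv32[b]f16 no longer fits one register group, so the work
+      // is done in two nxv16 halves and every step is counted twice.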
+ if (LT.second.getVectorElementType() == MVT::bf16) { + if (LT.second == MVT::nxv32bf16) { + ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVTBF16_F_F_V, + RISCV::VFNCVTBF16_F_F_W, RISCV::VFNCVTBF16_F_F_W}; + FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V}; + ConvType = MVT::nxv16f16; + FsqrtType = MVT::nxv16f32; + } else { + ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFNCVTBF16_F_F_W}; + FsqrtOp = {RISCV::VFSQRT_V}; + FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType); + } + } else if (LT.second.getVectorElementType() == MVT::f16 && + !ST->hasVInstructionsF16()) { + if (LT.second == MVT::nxv32f16) { + ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_F_F_V, + RISCV::VFNCVT_F_F_W, RISCV::VFNCVT_F_F_W}; + FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V}; + ConvType = MVT::nxv16f16; + FsqrtType = MVT::nxv16f32; + } else { + ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFNCVT_F_F_W}; + FsqrtOp = {RISCV::VFSQRT_V}; + FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType); + } + } else { + FsqrtOp = {RISCV::VFSQRT_V}; } - return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind); + + return LT.first * (getRISCVInstructionCost(FsqrtOp, FsqrtType, CostKind) + + getRISCVInstructionCost(ConvOp, ConvType, CostKind)); } break; } diff --git a/llvm/test/Analysis/CostModel/RISCV/fp-min-max-abs.ll b/llvm/test/Analysis/CostModel/RISCV/fp-min-max-abs.ll index 9eb06a07f135f..bdaf423c53bc4 100644 --- a/llvm/test/Analysis/CostModel/RISCV/fp-min-max-abs.ll +++ b/llvm/test/Analysis/CostModel/RISCV/fp-min-max-abs.ll @@ -4,89 +4,101 @@ define void @fabs() { ; CHECK-LABEL: 'fabs' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call bfloat @llvm.fabs.bf16(bfloat undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x bfloat> @llvm.fabs.v4bf16(<4 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x bfloat> @llvm.fabs.v8bf16(<8 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <16 x bfloat> @llvm.fabs.v16bf16(<16 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call @llvm.fabs.nxv2bf16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call @llvm.fabs.nxv4bf16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call @llvm.fabs.nxv8bf16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = call @llvm.fabs.nxv16bf16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call float @llvm.fabs.f32(float undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = call <2 x float> @llvm.fabs.v2f32(<2 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = call <4 x float> @llvm.fabs.v4f32(<4 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %13 = call <8 x float> @llvm.fabs.v8f32(<8 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %14 = call <16 x float> @llvm.fabs.v16f32(<16 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = call @llvm.fabs.nxv1f32( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = call @llvm.fabs.nxv2f32( 
undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = call @llvm.fabs.nxv4f32( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %18 = call @llvm.fabs.nxv8f32( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %19 = call @llvm.fabs.nxv16f32( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = call double @llvm.fabs.f64(double undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = call <2 x double> @llvm.fabs.v2f64(<2 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %22 = call <4 x double> @llvm.fabs.v4f64(<4 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %23 = call <8 x double> @llvm.fabs.v8f64(<8 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %24 = call <16 x double> @llvm.fabs.v16f64(<16 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %25 = call @llvm.fabs.nxv1f64( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %26 = call @llvm.fabs.nxv2f64( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %27 = call @llvm.fabs.nxv4f64( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %28 = call @llvm.fabs.nxv8f64( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call bfloat @llvm.fabs.bf16(bfloat poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %2 = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %3 = call <4 x bfloat> @llvm.fabs.v4bf16(<4 x bfloat> poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %4 = call <8 x bfloat> @llvm.fabs.v8bf16(<8 x bfloat> poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %5 = call <16 x bfloat> @llvm.fabs.v16bf16(<16 x bfloat> poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %6 = call @llvm.fabs.nxv2bf16( poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %7 = call @llvm.fabs.nxv4bf16( poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %8 = call @llvm.fabs.nxv8bf16( poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %9 = call @llvm.fabs.nxv16bf16( poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call float @llvm.fabs.f32(float poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = call <2 x float> @llvm.fabs.v2f32(<2 x float> poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = call <4 x float> @llvm.fabs.v4f32(<4 x float> poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %13 = call <8 x float> @llvm.fabs.v8f32(<8 x float> poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %14 = call <16 x float> @llvm.fabs.v16f32(<16 x float> poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = call @llvm.fabs.nxv1f32( poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = call @llvm.fabs.nxv2f32( poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = call @llvm.fabs.nxv4f32( poison) +; 
CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %18 = call @llvm.fabs.nxv8f32( poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %19 = call @llvm.fabs.nxv16f32( poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = call double @llvm.fabs.f64(double poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = call <2 x double> @llvm.fabs.v2f64(<2 x double> poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %22 = call <4 x double> @llvm.fabs.v4f64(<4 x double> poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %23 = call <8 x double> @llvm.fabs.v8f64(<8 x double> poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %24 = call <16 x double> @llvm.fabs.v16f64(<16 x double> poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %25 = call @llvm.fabs.nxv1f64( poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %26 = call @llvm.fabs.nxv2f64( poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %27 = call @llvm.fabs.nxv4f64( poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %28 = call @llvm.fabs.nxv8f64( poison) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - call bfloat @llvm.fabs.bf16(bfloat undef) - call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> undef) - call <4 x bfloat> @llvm.fabs.v4bf16(<4 x bfloat> undef) - call <8 x bfloat> @llvm.fabs.v8bf16(<8 x bfloat> undef) - call <16 x bfloat> @llvm.fabs.v16f16(<16 x bfloat> undef) - call @llvm.fabs.nxv2bf16( undef) - call @llvm.fabs.nxv4bf16( undef) - call @llvm.fabs.nxv8bf16( undef) - call @llvm.fabs.nxv16f16( undef) - call float @llvm.fabs.f32(float undef) - call <2 x float> @llvm.fabs.v2f32(<2 x float> undef) - call <4 x float> @llvm.fabs.v4f32(<4 x float> undef) - call <8 x float> @llvm.fabs.v8f32(<8 x float> undef) - call <16 x float> @llvm.fabs.v16f32(<16 x float> undef) - call @llvm.fabs.nxv1f32( undef) - call @llvm.fabs.nxv2f32( undef) - call @llvm.fabs.nxv4f32( undef) - call @llvm.fabs.nxv8f32( undef) - call @llvm.fabs.nxv16f32( undef) - call double @llvm.fabs.f64(double undef) - call <2 x double> @llvm.fabs.v2f64(<2 x double> undef) - call <4 x double> @llvm.fabs.v4f64(<4 x double> undef) - call <8 x double> @llvm.fabs.v8f64(<8 x double> undef) - call <16 x double> @llvm.fabs.v16f64(<16 x double> undef) - call @llvm.fabs.nxv1f64( undef) - call @llvm.fabs.nxv2f64( undef) - call @llvm.fabs.nxv4f64( undef) - call @llvm.fabs.nxv8f64( undef) + call bfloat @llvm.fabs.bf16(bfloat poison) + call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> poison) + call <4 x bfloat> @llvm.fabs.v4bf16(<4 x bfloat> poison) + call <8 x bfloat> @llvm.fabs.v8bf16(<8 x bfloat> poison) + call <16 x bfloat> @llvm.fabs.v16f16(<16 x bfloat> poison) + call @llvm.fabs.nxv2bf16( poison) + call @llvm.fabs.nxv4bf16( poison) + call @llvm.fabs.nxv8bf16( poison) + call @llvm.fabs.nxv16f16( poison) + call float @llvm.fabs.f32(float poison) + call <2 x float> @llvm.fabs.v2f32(<2 x float> poison) + call <4 x float> @llvm.fabs.v4f32(<4 x float> poison) + call <8 x float> @llvm.fabs.v8f32(<8 x float> poison) + call <16 x float> @llvm.fabs.v16f32(<16 x float> poison) + call @llvm.fabs.nxv1f32( poison) + call @llvm.fabs.nxv2f32( poison) + call @llvm.fabs.nxv4f32( poison) + call @llvm.fabs.nxv8f32( poison) + call @llvm.fabs.nxv16f32( poison) + 
call double @llvm.fabs.f64(double poison) + call <2 x double> @llvm.fabs.v2f64(<2 x double> poison) + call <4 x double> @llvm.fabs.v4f64(<4 x double> poison) + call <8 x double> @llvm.fabs.v8f64(<8 x double> poison) + call <16 x double> @llvm.fabs.v16f64(<16 x double> poison) + call @llvm.fabs.nxv1f64( poison) + call @llvm.fabs.nxv2f64( poison) + call @llvm.fabs.nxv4f64( poison) + call @llvm.fabs.nxv8f64( poison) ret void } define void @fabs_f16() { -; CHECK-LABEL: 'fabs_f16' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.fabs.f16(half undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x half> @llvm.fabs.v2f16(<2 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x half> @llvm.fabs.v4f16(<4 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x half> @llvm.fabs.v8f16(<8 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <16 x half> @llvm.fabs.v16f16(<16 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call @llvm.fabs.nxv2f16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call @llvm.fabs.nxv4f16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call @llvm.fabs.nxv8f16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = call @llvm.fabs.nxv16f16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; ZVFH-LABEL: 'fabs_f16' +; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.fabs.f16(half poison) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x half> @llvm.fabs.v2f16(<2 x half> poison) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x half> @llvm.fabs.v4f16(<4 x half> poison) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x half> @llvm.fabs.v8f16(<8 x half> poison) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <16 x half> @llvm.fabs.v16f16(<16 x half> poison) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call @llvm.fabs.nxv2f16( poison) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call @llvm.fabs.nxv4f16( poison) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call @llvm.fabs.nxv8f16( poison) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = call @llvm.fabs.nxv16f16( poison) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; ZVFHMIN-LABEL: 'fabs_f16' +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.fabs.f16(half poison) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %2 = call <2 x half> @llvm.fabs.v2f16(<2 x half> poison) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %3 = call <4 x half> @llvm.fabs.v4f16(<4 x half> poison) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %4 = call <8 x half> @llvm.fabs.v8f16(<8 x half> poison) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %5 = call <16 x half> @llvm.fabs.v16f16(<16 x half> poison) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost 
of 3 for instruction: %6 = call @llvm.fabs.nxv2f16( poison) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %7 = call @llvm.fabs.nxv4f16( poison) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %8 = call @llvm.fabs.nxv8f16( poison) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %9 = call @llvm.fabs.nxv16f16( poison) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - call half @llvm.fabs.f16(half undef) - call <2 x half> @llvm.fabs.v2f16(<2 x half> undef) - call <4 x half> @llvm.fabs.v4f16(<4 x half> undef) - call <8 x half> @llvm.fabs.v8f16(<8 x half> undef) - call <16 x half> @llvm.fabs.v16f16(<16 x half> undef) - call @llvm.fabs.nxv2f16( undef) - call @llvm.fabs.nxv4f16( undef) - call @llvm.fabs.nxv8f16( undef) - call @llvm.fabs.nxv16f16( undef) + call half @llvm.fabs.f16(half poison) + call <2 x half> @llvm.fabs.v2f16(<2 x half> poison) + call <4 x half> @llvm.fabs.v4f16(<4 x half> poison) + call <8 x half> @llvm.fabs.v8f16(<8 x half> poison) + call <16 x half> @llvm.fabs.v16f16(<16 x half> poison) + call @llvm.fabs.nxv2f16( poison) + call @llvm.fabs.nxv4f16( poison) + call @llvm.fabs.nxv8f16( poison) + call @llvm.fabs.nxv16f16( poison) ret void } diff --git a/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll b/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll index 446627f6bf3c0..32ad44f7dda7b 100644 --- a/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll +++ b/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll @@ -4,89 +4,111 @@ define void @sqrt() { ; CHECK-LABEL: 'sqrt' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call bfloat @llvm.sqrt.bf16(bfloat undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call @llvm.sqrt.nxv2bf16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call @llvm.sqrt.nxv4bf16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call @llvm.sqrt.nxv8bf16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = call @llvm.sqrt.nxv16bf16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call float @llvm.sqrt.f32(float undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = call <2 x float> @llvm.sqrt.v2f32(<2 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %13 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %14 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = call @llvm.sqrt.nxv1f32( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %16 = call @llvm.sqrt.nxv2f32( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = call @llvm.sqrt.nxv4f32( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %18 = call @llvm.sqrt.nxv8f32( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %19 = call @llvm.sqrt.nxv16f32( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = call double @llvm.sqrt.f64(double undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %22 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %23 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %24 = call <16 x double> @llvm.sqrt.v16f64(<16 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %25 = call @llvm.sqrt.nxv1f64( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %26 = call @llvm.sqrt.nxv2f64( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %27 = call @llvm.sqrt.nxv4f64( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %28 = call @llvm.sqrt.nxv8f64( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call bfloat @llvm.sqrt.bf16(bfloat poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %2 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %3 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %4 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %5 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %6 = call <32 x bfloat> @llvm.sqrt.v32bf16(<32 x bfloat> poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %7 = call @llvm.sqrt.nxv2bf16( poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %8 = call @llvm.sqrt.nxv4bf16( poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %9 = call @llvm.sqrt.nxv8bf16( poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %10 = call @llvm.sqrt.nxv16bf16( poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %11 = call @llvm.sqrt.nxv32bf16( poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = call float @llvm.sqrt.f32(float poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = call <2 x float> @llvm.sqrt.v2f32(<2 x float> poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %15 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %16 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> poison) +; CHECK-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %17 = call @llvm.sqrt.nxv1f32( poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = call @llvm.sqrt.nxv2f32( poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = call @llvm.sqrt.nxv4f32( poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %20 = call @llvm.sqrt.nxv8f32( poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %21 = call @llvm.sqrt.nxv16f32( poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = call double @llvm.sqrt.f64(double poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %24 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %25 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %26 = call <16 x double> @llvm.sqrt.v16f64(<16 x double> poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = call @llvm.sqrt.nxv1f64( poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %28 = call @llvm.sqrt.nxv2f64( poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %29 = call @llvm.sqrt.nxv4f64( poison) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %30 = call @llvm.sqrt.nxv8f64( poison) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - call bfloat @llvm.sqrt.bf16(bfloat undef) - call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef) - call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef) - call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef) - call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef) - call @llvm.sqrt.nxv2bf16( undef) - call @llvm.sqrt.nxv4bf16( undef) - call @llvm.sqrt.nxv8bf16( undef) - call @llvm.sqrt.nxv16bf16( undef) - call float @llvm.sqrt.f32(float undef) - call <2 x float> @llvm.sqrt.v2f32(<2 x float> undef) - call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef) - call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef) - call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef) - call @llvm.sqrt.nxv1f32( undef) - call @llvm.sqrt.nxv2f32( undef) - call @llvm.sqrt.nxv4f32( undef) - call @llvm.sqrt.nxv8f32( undef) - call @llvm.sqrt.nxv16f32( undef) - call double @llvm.sqrt.f64(double undef) - call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef) - call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef) - call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef) - call <16 x double> @llvm.sqrt.v16f64(<16 x double> undef) - call @llvm.sqrt.nxv1f64( undef) - call @llvm.sqrt.nxv2f64( undef) - call @llvm.sqrt.nxv4f64( undef) - call @llvm.sqrt.nxv8f64( undef) + call bfloat @llvm.sqrt.bf16(bfloat poison) + call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> poison) + call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> poison) + call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> poison) + call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> poison) + call <32 x bfloat> @llvm.sqrt.v32bf16(<32 x bfloat> poison) + call @llvm.sqrt.nxv2bf16( poison) + call @llvm.sqrt.nxv4bf16( poison) + call @llvm.sqrt.nxv8bf16( poison) + call @llvm.sqrt.nxv16bf16( poison) + call @llvm.sqrt.nxv32bf16( poison) + call float @llvm.sqrt.f32(float poison) 
+  call <2 x float> @llvm.sqrt.v2f32(<2 x float> poison)
+  call <4 x float> @llvm.sqrt.v4f32(<4 x float> poison)
+  call <8 x float> @llvm.sqrt.v8f32(<8 x float> poison)
+  call <16 x float> @llvm.sqrt.v16f32(<16 x float> poison)
+  call <vscale x 1 x float> @llvm.sqrt.nxv1f32(<vscale x 1 x float> poison)
+  call <vscale x 2 x float> @llvm.sqrt.nxv2f32(<vscale x 2 x float> poison)
+  call <vscale x 4 x float> @llvm.sqrt.nxv4f32(<vscale x 4 x float> poison)
+  call <vscale x 8 x float> @llvm.sqrt.nxv8f32(<vscale x 8 x float> poison)
+  call <vscale x 16 x float> @llvm.sqrt.nxv16f32(<vscale x 16 x float> poison)
+  call double @llvm.sqrt.f64(double poison)
+  call <2 x double> @llvm.sqrt.v2f64(<2 x double> poison)
+  call <4 x double> @llvm.sqrt.v4f64(<4 x double> poison)
+  call <8 x double> @llvm.sqrt.v8f64(<8 x double> poison)
+  call <16 x double> @llvm.sqrt.v16f64(<16 x double> poison)
+  call <vscale x 1 x double> @llvm.sqrt.nxv1f64(<vscale x 1 x double> poison)
+  call <vscale x 2 x double> @llvm.sqrt.nxv2f64(<vscale x 2 x double> poison)
+  call <vscale x 4 x double> @llvm.sqrt.nxv4f64(<vscale x 4 x double> poison)
+  call <vscale x 8 x double> @llvm.sqrt.nxv8f64(<vscale x 8 x double> poison)
   ret void
 }
 
 define void @sqrt_f16() {
-; CHECK-LABEL: 'sqrt_f16'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.sqrt.f16(half undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 2 x half> @llvm.sqrt.nxv2f16(<vscale x 2 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 4 x half> @llvm.sqrt.nxv4f16(<vscale x 4 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <vscale x 8 x half> @llvm.sqrt.nxv8f16(<vscale x 8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = call <vscale x 16 x half> @llvm.sqrt.nxv16f16(<vscale x 16 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; ZVFH-LABEL: 'sqrt_f16'
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.sqrt.f16(half poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %6 = call <32 x half> @llvm.sqrt.v32f16(<32 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x half> @llvm.sqrt.nxv2f16(<vscale x 2 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x half> @llvm.sqrt.nxv4f16(<vscale x 4 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call <vscale x 8 x half> @llvm.sqrt.nxv8f16(<vscale x 8 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %10 = call <vscale x 16 x half> @llvm.sqrt.nxv16f16(<vscale x 16 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %11 = call <vscale x 32 x half> @llvm.sqrt.nxv32f16(<vscale x 32 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'sqrt_f16'
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.sqrt.f16(half poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %2 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %3 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %4 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %5 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %6 = call <32 x half> @llvm.sqrt.v32f16(<32 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %7 = call <vscale x 2 x half> @llvm.sqrt.nxv2f16(<vscale x 2 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %8 = call <vscale x 4 x half> @llvm.sqrt.nxv4f16(<vscale x 4 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %9 = call <vscale x 8 x half> @llvm.sqrt.nxv8f16(<vscale x 8 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %10 = call <vscale x 16 x half> @llvm.sqrt.nxv16f16(<vscale x 16 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %11 = call <vscale x 32 x half> @llvm.sqrt.nxv32f16(<vscale x 32 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  call half @llvm.sqrt.f16(half undef)
-  call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
-  call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
-  call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
-  call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
-  call <vscale x 2 x half> @llvm.sqrt.nxv2f16(<vscale x 2 x half> undef)
-  call <vscale x 4 x half> @llvm.sqrt.nxv4f16(<vscale x 4 x half> undef)
-  call <vscale x 8 x half> @llvm.sqrt.nxv8f16(<vscale x 8 x half> undef)
-  call <vscale x 16 x half> @llvm.sqrt.nxv16f16(<vscale x 16 x half> undef)
+  call half @llvm.sqrt.f16(half poison)
+  call <2 x half> @llvm.sqrt.v2f16(<2 x half> poison)
+  call <4 x half> @llvm.sqrt.v4f16(<4 x half> poison)
+  call <8 x half> @llvm.sqrt.v8f16(<8 x half> poison)
+  call <16 x half> @llvm.sqrt.v16f16(<16 x half> poison)
+  call <32 x half> @llvm.sqrt.v32f16(<32 x half> poison)
+  call <vscale x 2 x half> @llvm.sqrt.nxv2f16(<vscale x 2 x half> poison)
+  call <vscale x 4 x half> @llvm.sqrt.nxv4f16(<vscale x 4 x half> poison)
+  call <vscale x 8 x half> @llvm.sqrt.nxv8f16(<vscale x 8 x half> poison)
+  call <vscale x 16 x half> @llvm.sqrt.nxv16f16(<vscale x 16 x half> poison)
+  call <vscale x 32 x half> @llvm.sqrt.nxv32f16(<vscale x 32 x half> poison)
   ret void
 }

From 66e41a1a20f2190a800669028a0e80bd86e735ce Mon Sep 17 00:00:00 2001
From: Guray Ozen
Date: Fri, 10 Jan 2025 10:32:25 +0100
Subject: [PATCH 039/408] [MLIR][NVVM] Declare InferIntRangeInterface for RangeableRegisterOp (#122263)

---
 .../include/mlir/Dialect/LLVMIR/NVVMDialect.h |  1 +
 mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td   | 16 +++++++--
 mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp    | 11 ++++++
 mlir/test/Dialect/LLVMIR/nvvm-test-range.mlir | 35 +++++++++++++++++++
 4 files changed, 61 insertions(+), 2 deletions(-)
 create mode 100644 mlir/test/Dialect/LLVMIR/nvvm-test-range.mlir

diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h b/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h
index 4fd00ff929bd7..50d1a39126ea3 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h
@@ -19,6 +19,7 @@
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/OpDefinition.h"
+#include "mlir/Interfaces/InferIntRangeInterface.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "llvm/IR/IntrinsicsNVPTX.h"
 
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index a2d2102b59dec..0b9097e9bbca2 100644
---
a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -18,6 +18,7 @@ include "mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td" include "mlir/Dialect/LLVMIR/LLVMOpBase.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/Dialect/LLVMIR/BasicPtxBuilderInterface.td" +include "mlir/Interfaces/InferIntRangeInterface.td" def LLVM_PointerGeneric : LLVM_PointerInAddressSpace<0>; def LLVM_PointerGlobal : LLVM_PointerInAddressSpace<1>; @@ -134,8 +135,8 @@ class NVVM_SpecialRegisterOp traits = []> : let assemblyFormat = "attr-dict `:` type($res)"; } -class NVVM_SpecialRangeableRegisterOp traits = []> : - NVVM_SpecialRegisterOp { +class NVVM_SpecialRangeableRegisterOp : + NVVM_SpecialRegisterOp]> { let arguments = (ins OptionalAttr:$range); let assemblyFormat = "(`range` $range^)? attr-dict `:` type($res)"; let llvmBuilder = baseLlvmBuilder # setRangeRetAttrCode # baseLlvmBuilderCoda; @@ -147,6 +148,17 @@ class NVVM_SpecialRangeableRegisterOp traits = []> build($_builder, $_state, resultType, ::mlir::LLVM::ConstantRangeAttr{}); }]> ]; + + // Define this method for the InferIntRangeInterface. + let extraClassDefinition = [{ + // Infer the result ranges based on the range attribute. + void $cppClass::inferResultRanges( + ArrayRef<::mlir::ConstantIntRanges> argRanges, + SetIntRangeFn setResultRanges) { + nvvmInferResultRanges(getOperation(), getResult(), argRanges, setResultRanges); + } + }]; + } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index 8b09c0f386d6b..838159d676545 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -1158,6 +1158,17 @@ llvm::Intrinsic::ID CpAsyncBulkTensorReduceOp::getIntrinsicID( llvm_unreachable("Invalid Reduction Op for CpAsyncBulkTensorReduceOp"); } +/// Infer the result ranges for the NVVM SpecialRangeableRegisterOp that might +/// have ConstantRangeAttr. +static void nvvmInferResultRanges(Operation *op, Value result, + ArrayRef<::mlir::ConstantIntRanges> argRanges, + SetIntRangeFn setResultRanges) { + if (auto rangeAttr = op->getAttrOfType("range")) { + setResultRanges(result, {rangeAttr.getLower(), rangeAttr.getUpper(), + rangeAttr.getLower(), rangeAttr.getUpper()}); + } +} + //===----------------------------------------------------------------------===// // NVVMDialect initialization, type parsing, and registration. 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/LLVMIR/nvvm-test-range.mlir b/mlir/test/Dialect/LLVMIR/nvvm-test-range.mlir
new file mode 100644
index 0000000000000..fae40dc7806ba
--- /dev/null
+++ b/mlir/test/Dialect/LLVMIR/nvvm-test-range.mlir
@@ -0,0 +1,35 @@
+// RUN: mlir-opt -int-range-optimizations %s | FileCheck %s
+gpu.module @module{
+  gpu.func @kernel_1() kernel {
+    %tidx = nvvm.read.ptx.sreg.tid.x range : i32
+    %tidy = nvvm.read.ptx.sreg.tid.y range : i32
+    %tidz = nvvm.read.ptx.sreg.tid.z range : i32
+    %c64 = arith.constant 64 : i32
+
+    %1 = arith.cmpi sgt, %tidx, %c64 : i32
+    scf.if %1 {
+      gpu.printf "threadidx"
+    }
+    %2 = arith.cmpi sgt, %tidy, %c64 : i32
+    scf.if %2 {
+      gpu.printf "threadidy"
+    }
+    %3 = arith.cmpi sgt, %tidz, %c64 : i32
+    scf.if %3 {
+      gpu.printf "threadidz"
+    }
+    gpu.return
+  }
+}
+
+// CHECK-LABEL: gpu.func @kernel_1
+// CHECK: %[[false:.+]] = arith.constant false
+// CHECK: %[[c64_i32:.+]] = arith.constant 64 : i32
+// CHECK: %[[S0:.+]] = nvvm.read.ptx.sreg.tid.y range : i32
+// CHECK: scf.if %[[false]] {
+// CHECK: gpu.printf "threadidx"
+// CHECK: %[[S1:.+]] = arith.cmpi sgt, %[[S0]], %[[c64_i32]] : i32
+// CHECK: scf.if %[[S1]] {
+// CHECK: gpu.printf "threadidy"
+// CHECK: scf.if %[[false]] {
+// CHECK: gpu.printf "threadidz"

From eb63cd62a4a1907dbd58f12660efd8244e7d81e9 Mon Sep 17 00:00:00 2001
From: Momchil Velikov
Date: Fri, 10 Jan 2025 10:43:12 +0100
Subject: [PATCH 040/408] [GVN] MemorySSA for GVN: add optional `AllowMemorySSA`

Preparatory work to migrate from MemoryDependenceAnalysis towards
MemorySSA in GVN.

Co-authored-by: Antonio Frighetto
---
 llvm/include/llvm/Transforms/Scalar/GVN.h | 13 +++++++--
 llvm/lib/Passes/PassBuilder.cpp           |  2 ++
 llvm/lib/Passes/PassRegistry.def          |  2 +-
 llvm/lib/Transforms/Scalar/GVN.cpp        | 35 ++++++++++++++++-------
 llvm/test/Other/new-pm-print-pipeline.ll  |  4 +--
 5 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Scalar/GVN.h b/llvm/include/llvm/Transforms/Scalar/GVN.h
index be6c0ec5edab0..c8be390799836 100644
--- a/llvm/include/llvm/Transforms/Scalar/GVN.h
+++ b/llvm/include/llvm/Transforms/Scalar/GVN.h
@@ -77,6 +77,7 @@ struct GVNOptions {
   std::optional<bool> AllowLoadInLoopPRE;
   std::optional<bool> AllowLoadPRESplitBackedge;
   std::optional<bool> AllowMemDep;
+  std::optional<bool> AllowMemorySSA;
 
   GVNOptions() = default;
 
@@ -108,6 +109,12 @@ struct GVNOptions {
     AllowMemDep = MemDep;
     return *this;
   }
+
+  /// Enables or disables use of MemorySSA.
+  GVNOptions &setMemorySSA(bool MemSSA) {
+    AllowMemorySSA = MemSSA;
+    return *this;
+  }
 };
 
 /// The core GVN pass object.
@@ -144,6 +151,7 @@ class GVNPass : public PassInfoMixin<GVNPass> {
   bool isLoadInLoopPREEnabled() const;
   bool isLoadPRESplitBackedgeEnabled() const;
   bool isMemDepEnabled() const;
+  bool isMemorySSAEnabled() const;
 
   /// This class holds the mapping between values and value numbers. It is used
   /// as an efficient mechanism to determine the expression-wise equivalence of
@@ -383,9 +391,8 @@ class GVNPass : public PassInfoMixin<GVNPass> {
   void assignBlockRPONumber(Function &F);
 };
 
-/// Create a legacy GVN pass. This also allows parameterizing whether or not
-/// MemDep is enabled.
-FunctionPass *createGVNPass(bool NoMemDepAnalysis = false);
+/// Create a legacy GVN pass.
+FunctionPass *createGVNPass();
 
 /// A simple and fast domtree-based GVN pass to hoist common expressions
 /// from sibling branches.
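Illustration (editor's sketch, not part of the patch): with the `AllowMemorySSA` option added above, MemorySSA-backed GVN can be requested through the new `memoryssa` pass parameter, e.g. `opt -passes='function(gvn<memoryssa>)' in.ll`, or built programmatically. `makeMemorySSAGVN` below is a hypothetical helper name, not an API from the patch:

    // Minimal sketch: construct a GVN pass that uses MemorySSA instead of MemDep.
    #include "llvm/Transforms/Scalar/GVN.h"
    using namespace llvm;

    GVNPass makeMemorySSAGVN() {
      // The pass asserts that MemDep and MemorySSA are never enabled together,
      // so MemDep is explicitly turned off here.
      return GVNPass(GVNOptions().setMemDep(false).setMemorySSA(true));
    }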
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index aac4407740055..90d11956d62a7 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -1042,6 +1042,8 @@ Expected<GVNOptions> parseGVNOptions(StringRef Params) {
       Result.setLoadPRESplitBackedge(Enable);
     } else if (ParamName == "memdep") {
       Result.setMemDep(Enable);
+    } else if (ParamName == "memoryssa") {
+      Result.setMemorySSA(Enable);
     } else {
       return make_error<StringError>(
           formatv("invalid GVN pass parameter '{0}' ", ParamName).str(),
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 1021d7fcd9247..a93a995655a14 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -526,7 +526,7 @@ FUNCTION_PASS_WITH_PARAMS(
     "gvn", "GVNPass", [](GVNOptions Opts) { return GVNPass(Opts); },
     parseGVNOptions,
     "no-pre;pre;no-load-pre;load-pre;no-split-backedge-load-pre;"
-    "split-backedge-load-pre;no-memdep;memdep")
+    "split-backedge-load-pre;no-memdep;memdep;no-memoryssa;memoryssa")
 FUNCTION_PASS_WITH_PARAMS(
     "hardware-loops", "HardwareLoopsPass",
     [](HardwareLoopOptions Opts) { return HardwareLoopsPass(Opts); },
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index 229fffe92b99c..8d27a22570e9c 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -113,6 +113,8 @@
 static cl::opt<bool>
     GVNEnableSplitBackedgeInLoadPRE("enable-split-backedge-in-load-pre",
                                     cl::init(false));
 static cl::opt<bool> GVNEnableMemDep("enable-gvn-memdep", cl::init(true));
+static cl::opt<bool> GVNEnableMemorySSA("enable-gvn-memoryssa",
+                                        cl::init(false));
 
 static cl::opt<uint32_t> MaxNumDeps(
     "gvn-max-num-deps", cl::Hidden, cl::init(100),
@@ -820,6 +822,10 @@ bool GVNPass::isMemDepEnabled() const {
   return Options.AllowMemDep.value_or(GVNEnableMemDep);
 }
 
+bool GVNPass::isMemorySSAEnabled() const {
+  return Options.AllowMemorySSA.value_or(GVNEnableMemorySSA);
+}
+
 PreservedAnalyses GVNPass::run(Function &F, FunctionAnalysisManager &AM) {
   // FIXME: The order of evaluation of these 'getResult' calls is very
   // significant! Re-ordering these variables will cause GVN when run alone to
@@ -832,7 +838,10 @@ PreservedAnalyses GVNPass::run(Function &F, FunctionAnalysisManager &AM) {
   auto *MemDep =
       isMemDepEnabled() ? &AM.getResult<MemoryDependenceAnalysis>(F) : nullptr;
   auto &LI = AM.getResult<LoopAnalysis>(F);
-  auto *MSSA = AM.getCachedResult<MemorySSAAnalysis>(F);
+  auto *MSSA =
+      isMemorySSAEnabled() ? &AM.getResult<MemorySSAAnalysis>(F) : nullptr;
+  assert(!(MemDep && MSSA) &&
+         "Should not use both MemDep and MemorySSA simultaneously!");
   auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
   bool Changed = runImpl(F, AC, DT, TLI, AA, MemDep, LI, &ORE,
                          MSSA ? &MSSA->getMSSA() : nullptr);
@@ -861,7 +870,9 @@ void GVNPass::printPipeline(
     OS << (*Options.AllowLoadPRESplitBackedge ? "" : "no-")
        << "split-backedge-load-pre;";
   if (Options.AllowMemDep != std::nullopt)
-    OS << (*Options.AllowMemDep ? "" : "no-") << "memdep";
+    OS << (*Options.AllowMemDep ? "" : "no-") << "memdep;";
+  if (Options.AllowMemorySSA != std::nullopt)
+    OS << (*Options.AllowMemorySSA ? "" : "no-") << "memoryssa";
 
   OS << '>';
 }
@@ -3293,8 +3304,11 @@ class llvm::gvn::GVNLegacyPass : public FunctionPass {
 public:
   static char ID; // Pass identification, replacement for typeid
 
-  explicit GVNLegacyPass(bool NoMemDepAnalysis = !GVNEnableMemDep)
-      : FunctionPass(ID), Impl(GVNOptions().setMemDep(!NoMemDepAnalysis)) {
+  explicit GVNLegacyPass(bool MemDepAnalysis = GVNEnableMemDep,
+                         bool MemSSAAnalysis = GVNEnableMemorySSA)
+      : FunctionPass(ID), Impl(GVNOptions()
+                                   .setMemDep(MemDepAnalysis)
+                                   .setMemorySSA(MemSSAAnalysis)) {
     initializeGVNLegacyPassPass(*PassRegistry::getPassRegistry());
   }
 
@@ -3302,7 +3316,6 @@ class llvm::gvn::GVNLegacyPass : public FunctionPass {
     if (skipFunction(F))
       return false;
 
-    auto *MSSAWP = getAnalysisIfAvailable<MemorySSAWrapperPass>();
     return Impl.runImpl(
         F, getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
        getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
@@ -3313,7 +3326,9 @@ class llvm::gvn::GVNLegacyPass : public FunctionPass {
             : nullptr,
         getAnalysis<LoopInfoWrapperPass>().getLoopInfo(),
         &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(),
-        MSSAWP ? &MSSAWP->getMSSA() : nullptr);
+        Impl.isMemorySSAEnabled()
+            ? &getAnalysis<MemorySSAWrapperPass>().getMSSA()
+            : nullptr);
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -3329,7 +3344,8 @@ class llvm::gvn::GVNLegacyPass : public FunctionPass {
     AU.addPreserved();
     AU.addPreserved();
     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
-    AU.addPreserved<MemorySSAWrapperPass>();
+    if (Impl.isMemorySSAEnabled())
+      AU.addRequired<MemorySSAWrapperPass>();
   }
 
 private:
@@ -3341,6 +3357,7 @@ char GVNLegacyPass::ID = 0;
 
 INITIALIZE_PASS_BEGIN(GVNLegacyPass, "gvn", "Global Value Numbering", false, false)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
@@ -3349,6 +3366,4 @@ INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
 INITIALIZE_PASS_END(GVNLegacyPass, "gvn", "Global Value Numbering", false, false)
 
 // The public interface to this file...
-FunctionPass *llvm::createGVNPass(bool NoMemDepAnalysis) { - return new GVNLegacyPass(NoMemDepAnalysis); -} +FunctionPass *llvm::createGVNPass() { return new GVNLegacyPass(); } diff --git a/llvm/test/Other/new-pm-print-pipeline.ll b/llvm/test/Other/new-pm-print-pipeline.ll index 9016473b36ba4..eb3ffe3a098dd 100644 --- a/llvm/test/Other/new-pm-print-pipeline.ll +++ b/llvm/test/Other/new-pm-print-pipeline.ll @@ -31,8 +31,8 @@ ; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(loop-unroll<>,loop-unroll,loop-unroll)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-10 ; CHECK-10: function(loop-unroll,loop-unroll,loop-unroll) -; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(gvn<>,gvn,gvn)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-11 -; CHECK-11: function(gvn<>,gvn,gvn) +; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(gvn<>,gvn,gvn)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-11 +; CHECK-11: function(gvn<>,gvn,gvn) ; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(early-cse<>,early-cse)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-12 ; CHECK-12: function(early-cse<>,early-cse) From 4c853be6673fd95b4b900a6c0e1804bf33a0629c Mon Sep 17 00:00:00 2001 From: Usha Gupta Date: Fri, 10 Jan 2025 09:47:50 +0000 Subject: [PATCH 041/408] [AArch64] Replace uaddlv with addv for popcount operation (#121934) Replace `uaddlv` with `addv` for popcount operation as it is simpler operation. On certain platforms like Cortex-A510, `addv` has a latency of 3 cycles whereas `uaddlv` has a latency of 4 cycles GCC generates `addv` as well: https://godbolt.org/z/MnYG9jcEo --- .../Target/AArch64/AArch64ISelLowering.cpp | 29 ++++---- llvm/test/CodeGen/AArch64/arm64-popcnt.ll | 8 +-- llvm/test/CodeGen/AArch64/dp1.ll | 66 ++++++++++--------- llvm/test/CodeGen/AArch64/parity.ll | 16 ++--- llvm/test/CodeGen/AArch64/popcount.ll | 16 ++--- 5 files changed, 66 insertions(+), 69 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 443f5f71b1084..5686ef5c25154 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -10764,37 +10764,30 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op, // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd // CNT V0.8B, V0.8B // 8xbyte pop-counts // ADDV B0, V0.8B // sum 8xbyte pop-counts - // UMOV X0, V0.B[0] // copy byte result back to integer reg + // FMOV X0, D0 // copy result back to integer reg if (VT == MVT::i32 || VT == MVT::i64) { if (VT == MVT::i32) Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val); Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val); SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val); - SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop); - UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV, + SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v8i8, CtPop); + if (VT == MVT::i32) + AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, AddV, DAG.getConstant(0, DL, MVT::i64)); - + AddV = DAG.getNode(ISD::BITCAST, DL, VT, AddV); if (IsParity) - UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV, - DAG.getConstant(1, DL, MVT::i32)); - - if (VT == MVT::i64) - UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV); - return UaddLV; + AddV = 
DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT)); + return AddV; } else if (VT == MVT::i128) { Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val); SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val); - SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop); - UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV, - DAG.getConstant(0, DL, MVT::i64)); - + SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v16i8, CtPop); + AddV = DAG.getNode(ISD::BITCAST, DL, VT, AddV); if (IsParity) - UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV, - DAG.getConstant(1, DL, MVT::i32)); - - return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV); + AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT)); + return AddV; } assert(!IsParity && "ISD::PARITY of vector types not supported"); diff --git a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll index 0030e9ce80abb..ad0904ff98080 100644 --- a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll +++ b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll @@ -8,7 +8,7 @@ define i32 @cnt32_advsimd(i32 %x) nounwind readnone { ; CHECK: // %bb.0: ; CHECK-NEXT: fmov s0, w0 ; CHECK-NEXT: cnt.8b v0, v0 -; CHECK-NEXT: uaddlv.8b h0, v0 +; CHECK-NEXT: addv.8b b0, v0 ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret ; @@ -43,7 +43,7 @@ define i32 @cnt32_advsimd_2(<2 x i32> %x) { ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: fmov s0, w8 ; CHECK-NEXT: cnt.8b v0, v0 -; CHECK-NEXT: uaddlv.8b h0, v0 +; CHECK-NEXT: addv.8b b0, v0 ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret ; @@ -79,8 +79,8 @@ define i64 @cnt64_advsimd(i64 %x) nounwind readnone { ; CHECK: // %bb.0: ; CHECK-NEXT: fmov d0, x0 ; CHECK-NEXT: cnt.8b v0, v0 -; CHECK-NEXT: uaddlv.8b h0, v0 -; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: addv.8b b0, v0 +; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret ; ; CHECK-NONEON-LABEL: cnt64_advsimd: diff --git a/llvm/test/CodeGen/AArch64/dp1.ll b/llvm/test/CodeGen/AArch64/dp1.ll index 949dad7798a6c..4f48aac72ebc3 100644 --- a/llvm/test/CodeGen/AArch64/dp1.ll +++ b/llvm/test/CodeGen/AArch64/dp1.ll @@ -197,52 +197,58 @@ define void @cttz_zeroundef_i64() { } define void @ctpop_i32() { -; CHECK-LABEL: ctpop_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, :got:var32 -; CHECK-NEXT: ldr x8, [x8, :got_lo12:var32] -; CHECK-NEXT: ldr w9, [x8] -; CHECK-NEXT: fmov d0, x9 -; CHECK-NEXT: cnt v0.8b, v0.8b -; CHECK-NEXT: uaddlv h0, v0.8b -; CHECK-NEXT: str s0, [x8] -; CHECK-NEXT: ret +; CHECK-SDAG-LABEL: ctpop_i32: +; CHECK-SDAG: // %bb.0: +; CHECK-SDAG-NEXT: adrp x8, :got:var32 +; CHECK-SDAG-NEXT: ldr x8, [x8, :got_lo12:var32] +; CHECK-SDAG-NEXT: ldr w9, [x8] +; CHECK-SDAG-NEXT: fmov d0, x9 +; CHECK-SDAG-NEXT: cnt v0.8b, v0.8b +; CHECK-SDAG-NEXT: addv b0, v0.8b +; CHECK-SDAG-NEXT: fmov w9, s0 +; CHECK-SDAG-NEXT: str w9, [x8] +; CHECK-SDAG-NEXT: ret +; +; CHECK-GISEL-LABEL: ctpop_i32: +; CHECK-GISEL: // %bb.0: +; CHECK-GISEL-NEXT: adrp x8, :got:var32 +; CHECK-GISEL-NEXT: ldr x8, [x8, :got_lo12:var32] +; CHECK-GISEL-NEXT: ldr w9, [x8] +; CHECK-GISEL-NEXT: fmov d0, x9 +; CHECK-GISEL-NEXT: cnt v0.8b, v0.8b +; CHECK-GISEL-NEXT: uaddlv h0, v0.8b +; CHECK-GISEL-NEXT: str s0, [x8] +; CHECK-GISEL-NEXT: ret %val0_tmp = load i32, ptr @var32 %val4_tmp = call i32 @llvm.ctpop.i32(i32 %val0_tmp) store volatile i32 %val4_tmp, ptr @var32 ret void } -define void @ctpop_i64() { -; CHECK-SDAG-LABEL: ctpop_i64: +define i64 @popcnt(i64 %a, ptr %p) { +; CHECK-SDAG-LABEL: popcnt: ; CHECK-SDAG: // %bb.0: -; CHECK-SDAG-NEXT: adrp x8, :got:var64 
-; CHECK-SDAG-NEXT: ldr x8, [x8, :got_lo12:var64] -; CHECK-SDAG-NEXT: ldr d0, [x8] +; CHECK-SDAG-NEXT: fmov d0, x0 +; CHECK-SDAG-NEXT: mov x0, xzr ; CHECK-SDAG-NEXT: cnt v0.8b, v0.8b -; CHECK-SDAG-NEXT: uaddlv h0, v0.8b -; CHECK-SDAG-NEXT: fmov w9, s0 -; CHECK-SDAG-NEXT: str x9, [x8] +; CHECK-SDAG-NEXT: addv b0, v0.8b +; CHECK-SDAG-NEXT: str d0, [x1] ; CHECK-SDAG-NEXT: ret ; -; CHECK-GISEL-LABEL: ctpop_i64: +; CHECK-GISEL-LABEL: popcnt: ; CHECK-GISEL: // %bb.0: -; CHECK-GISEL-NEXT: adrp x8, :got:var64 -; CHECK-GISEL-NEXT: ldr x8, [x8, :got_lo12:var64] -; CHECK-GISEL-NEXT: ldr x9, [x8] -; CHECK-GISEL-NEXT: fmov d0, x9 +; CHECK-GISEL-NEXT: fmov d0, x0 +; CHECK-GISEL-NEXT: mov x0, xzr ; CHECK-GISEL-NEXT: cnt v0.8b, v0.8b ; CHECK-GISEL-NEXT: uaddlv h0, v0.8b -; CHECK-GISEL-NEXT: mov w9, v0.s[0] -; CHECK-GISEL-NEXT: str x9, [x8] +; CHECK-GISEL-NEXT: mov w8, v0.s[0] +; CHECK-GISEL-NEXT: str x8, [x1] ; CHECK-GISEL-NEXT: ret - %val0_tmp = load i64, ptr @var64 - %val4_tmp = call i64 @llvm.ctpop.i64(i64 %val0_tmp) - store volatile i64 %val4_tmp, ptr @var64 - ret void + %2 = call i64 @llvm.ctpop(i64 %a) + store i64 %2, ptr %p + ret i64 0 } - declare i32 @llvm.bswap.i32(i32) declare i64 @llvm.bswap.i64(i64) declare i32 @llvm.ctlz.i32 (i32, i1) diff --git a/llvm/test/CodeGen/AArch64/parity.ll b/llvm/test/CodeGen/AArch64/parity.ll index 19dd185a6cb78..1e51793fb5f91 100644 --- a/llvm/test/CodeGen/AArch64/parity.ll +++ b/llvm/test/CodeGen/AArch64/parity.ll @@ -114,9 +114,9 @@ define i64 @parity_64(i64 %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: fmov d0, x0 ; CHECK-NEXT: cnt v0.8b, v0.8b -; CHECK-NEXT: uaddlv h0, v0.8b -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: addv b0, v0.8b +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: and x0, x8, #0x1 ; CHECK-NEXT: ret ; ; CHECK-CSSC-LABEL: parity_64: @@ -136,9 +136,9 @@ define i128 @parity_128(i128 %x) { ; CHECK-NEXT: mov v0.d[1], x1 ; CHECK-NEXT: mov x1, xzr ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlv h0, v0.16b -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: addv b0, v0.16b +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: and x0, x8, #0x1 ; CHECK-NEXT: ret ; ; CHECK-CSSC-LABEL: parity_128: @@ -158,8 +158,8 @@ define i32 @parity_64_trunc(i64 %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: fmov d0, x0 ; CHECK-NEXT: cnt v0.8b, v0.8b -; CHECK-NEXT: uaddlv h0, v0.8b -; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: addv b0, v0.8b +; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll index f9f1cd4b1fcf7..89b1ac0a0edf1 100644 --- a/llvm/test/CodeGen/AArch64/popcount.ll +++ b/llvm/test/CodeGen/AArch64/popcount.ll @@ -28,7 +28,7 @@ define i8 @popcount128(ptr nocapture nonnull readonly %0) { ; CHECK-NEXT: add x8, x0, #8 ; CHECK-NEXT: ld1 { v0.d }[1], [x8] ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlv h0, v0.16b +; CHECK-NEXT: addv b0, v0.16b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret ; @@ -104,8 +104,8 @@ define i16 @popcount256(ptr nocapture nonnull readonly %0) { ; CHECK-NEXT: ld1 { v1.d }[1], [x8] ; CHECK-NEXT: cnt v0.16b, v0.16b ; CHECK-NEXT: cnt v1.16b, v1.16b -; CHECK-NEXT: uaddlv h0, v0.16b -; CHECK-NEXT: uaddlv h1, v1.16b +; CHECK-NEXT: addv b0, v0.16b +; CHECK-NEXT: addv b1, v1.16b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: fmov w9, s1 ; CHECK-NEXT: add w0, w9, w8 @@ -191,12 +191,10 @@ define <1 x i128> @popcount1x128(<1 x i128> %0) { ; ; CHECK-LABEL: popcount1x128: ; CHECK: // %bb.0: // %Entry -; CHECK-NEXT: fmov 
d1, x0 -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: mov v1.d[1], x1 -; CHECK-NEXT: cnt v1.16b, v1.16b -; CHECK-NEXT: uaddlv h1, v1.16b -; CHECK-NEXT: mov v0.s[0], v1.s[0] +; CHECK-NEXT: fmov d0, x0 +; CHECK-NEXT: mov v0.d[1], x1 +; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: addv b0, v0.16b ; CHECK-NEXT: mov x1, v0.d[1] ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret From 4e32271e8b304eb018c69f74c16edd1668fcdaf3 Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin Date: Fri, 10 Jan 2025 09:54:48 +0000 Subject: [PATCH 042/408] [AArch64][SME] Add diagnostics for SME attributes on lambda functions (#121777) CheckFunctionDeclaration emits diagnostics if any SME attributes are used by a function definition without the required +sme or +sme2 target features. This patch moves these diagnostics to a new function in SemaARM and also adds a call to this from ActOnStartOfLambdaDefinition. --- clang/include/clang/Sema/SemaARM.h | 2 + clang/lib/Sema/SemaARM.cpp | 53 ++++++++++++++++++ clang/lib/Sema/SemaDecl.cpp | 55 +------------------ clang/lib/Sema/SemaLambda.cpp | 4 ++ ...-sme-func-attrs-without-target-feature.cpp | 8 ++- 5 files changed, 69 insertions(+), 53 deletions(-) diff --git a/clang/include/clang/Sema/SemaARM.h b/clang/include/clang/Sema/SemaARM.h index 8c4c56e222130..7beb1906a122f 100644 --- a/clang/include/clang/Sema/SemaARM.h +++ b/clang/include/clang/Sema/SemaARM.h @@ -79,6 +79,8 @@ class SemaARM : public SemaBase { void handleNewAttr(Decl *D, const ParsedAttr &AL); void handleCmseNSEntryAttr(Decl *D, const ParsedAttr &AL); void handleInterruptAttr(Decl *D, const ParsedAttr &AL); + + void CheckSMEFunctionDefAttributes(const FunctionDecl *FD); }; SemaARM::ArmStreamingType getArmStreamingFnType(const FunctionDecl *FD); diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp index 411baa066f709..eafd43eb979ba 100644 --- a/clang/lib/Sema/SemaARM.cpp +++ b/clang/lib/Sema/SemaARM.cpp @@ -1328,4 +1328,57 @@ void SemaARM::handleInterruptAttr(Decl *D, const ParsedAttr &AL) { ARMInterruptAttr(getASTContext(), AL, Kind)); } +// Check if the function definition uses any AArch64 SME features without +// having the '+sme' feature enabled and warn user if sme locally streaming +// function returns or uses arguments with VL-based types. 
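+// (Editor's illustration, not part of the patch: compiled without the +sme
+// target feature, a definition such as
+//   void f() __arm_streaming { }
+// takes the UsesSM path below and is diagnosed with
+// err_sme_definition_using_sm_in_non_sme_target.)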
+void SemaARM::CheckSMEFunctionDefAttributes(const FunctionDecl *FD) {
+  const auto *Attr = FD->getAttr<ArmNewAttr>();
+  bool UsesSM = FD->hasAttr<ArmLocallyStreamingAttr>();
+  bool UsesZA = Attr && Attr->isNewZA();
+  bool UsesZT0 = Attr && Attr->isNewZT0();
+
+  if (FD->hasAttr<ArmLocallyStreamingAttr>()) {
+    if (FD->getReturnType()->isSizelessVectorType())
+      Diag(FD->getLocation(),
+           diag::warn_sme_locally_streaming_has_vl_args_returns)
+          << /*IsArg=*/false;
+    if (llvm::any_of(FD->parameters(), [](ParmVarDecl *P) {
+          return P->getOriginalType()->isSizelessVectorType();
+        }))
+      Diag(FD->getLocation(),
+           diag::warn_sme_locally_streaming_has_vl_args_returns)
+          << /*IsArg=*/true;
+  }
+  if (const auto *FPT = FD->getType()->getAs<FunctionProtoType>()) {
+    FunctionProtoType::ExtProtoInfo EPI = FPT->getExtProtoInfo();
+    UsesSM |= EPI.AArch64SMEAttributes & FunctionType::SME_PStateSMEnabledMask;
+    UsesZA |= FunctionType::getArmZAState(EPI.AArch64SMEAttributes) !=
+              FunctionType::ARM_None;
+    UsesZT0 |= FunctionType::getArmZT0State(EPI.AArch64SMEAttributes) !=
+               FunctionType::ARM_None;
+  }
+
+  ASTContext &Context = getASTContext();
+  if (UsesSM || UsesZA) {
+    llvm::StringMap<bool> FeatureMap;
+    Context.getFunctionFeatureMap(FeatureMap, FD);
+    if (!FeatureMap.contains("sme")) {
+      if (UsesSM)
+        Diag(FD->getLocation(),
+             diag::err_sme_definition_using_sm_in_non_sme_target);
+      else
+        Diag(FD->getLocation(),
+             diag::err_sme_definition_using_za_in_non_sme_target);
+    }
+  }
+  if (UsesZT0) {
+    llvm::StringMap<bool> FeatureMap;
+    Context.getFunctionFeatureMap(FeatureMap, FD);
+    if (!FeatureMap.contains("sme2")) {
+      Diag(FD->getLocation(),
+           diag::err_sme_definition_using_zt0_in_non_sme2_target);
+    }
+  }
+}
+
 } // namespace clang
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 75920052c4f0c..0d476fea880b9 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -45,6 +45,7 @@
 #include "clang/Sema/ParsedTemplate.h"
 #include "clang/Sema/Scope.h"
 #include "clang/Sema/ScopeInfo.h"
+#include "clang/Sema/SemaARM.h"
 #include "clang/Sema/SemaCUDA.h"
 #include "clang/Sema/SemaHLSL.h"
 #include "clang/Sema/SemaInternal.h"
@@ -12297,58 +12298,8 @@ bool Sema::CheckFunctionDeclaration(Scope *S, FunctionDecl *NewFD,
     }
   }
 
-  // Check if the function definition uses any AArch64 SME features without
-  // having the '+sme' feature enabled and warn user if sme locally streaming
-  // function returns or uses arguments with VL-based types.
- if (DeclIsDefn) { - const auto *Attr = NewFD->getAttr(); - bool UsesSM = NewFD->hasAttr(); - bool UsesZA = Attr && Attr->isNewZA(); - bool UsesZT0 = Attr && Attr->isNewZT0(); - - if (NewFD->hasAttr()) { - if (NewFD->getReturnType()->isSizelessVectorType()) - Diag(NewFD->getLocation(), - diag::warn_sme_locally_streaming_has_vl_args_returns) - << /*IsArg=*/false; - if (llvm::any_of(NewFD->parameters(), [](ParmVarDecl *P) { - return P->getOriginalType()->isSizelessVectorType(); - })) - Diag(NewFD->getLocation(), - diag::warn_sme_locally_streaming_has_vl_args_returns) - << /*IsArg=*/true; - } - if (const auto *FPT = NewFD->getType()->getAs()) { - FunctionProtoType::ExtProtoInfo EPI = FPT->getExtProtoInfo(); - UsesSM |= - EPI.AArch64SMEAttributes & FunctionType::SME_PStateSMEnabledMask; - UsesZA |= FunctionType::getArmZAState(EPI.AArch64SMEAttributes) != - FunctionType::ARM_None; - UsesZT0 |= FunctionType::getArmZT0State(EPI.AArch64SMEAttributes) != - FunctionType::ARM_None; - } - - if (UsesSM || UsesZA) { - llvm::StringMap FeatureMap; - Context.getFunctionFeatureMap(FeatureMap, NewFD); - if (!FeatureMap.contains("sme")) { - if (UsesSM) - Diag(NewFD->getLocation(), - diag::err_sme_definition_using_sm_in_non_sme_target); - else - Diag(NewFD->getLocation(), - diag::err_sme_definition_using_za_in_non_sme_target); - } - } - if (UsesZT0) { - llvm::StringMap FeatureMap; - Context.getFunctionFeatureMap(FeatureMap, NewFD); - if (!FeatureMap.contains("sme2")) { - Diag(NewFD->getLocation(), - diag::err_sme_definition_using_zt0_in_non_sme2_target); - } - } - } + if (DeclIsDefn && Context.getTargetInfo().getTriple().isAArch64()) + ARM().CheckSMEFunctionDefAttributes(NewFD); return Redeclaration; } diff --git a/clang/lib/Sema/SemaLambda.cpp b/clang/lib/Sema/SemaLambda.cpp index f2c3a816b3b5d..0c5467cfd54af 100644 --- a/clang/lib/Sema/SemaLambda.cpp +++ b/clang/lib/Sema/SemaLambda.cpp @@ -21,6 +21,7 @@ #include "clang/Sema/Lookup.h" #include "clang/Sema/Scope.h" #include "clang/Sema/ScopeInfo.h" +#include "clang/Sema/SemaARM.h" #include "clang/Sema/SemaCUDA.h" #include "clang/Sema/SemaInternal.h" #include "clang/Sema/SemaOpenMP.h" @@ -1455,6 +1456,9 @@ void Sema::ActOnStartOfLambdaDefinition(LambdaIntroducer &Intro, // Attributes on the lambda apply to the method. ProcessDeclAttributes(CurScope, Method, ParamInfo); + if (Context.getTargetInfo().getTriple().isAArch64()) + ARM().CheckSMEFunctionDefAttributes(Method); + // CUDA lambdas get implicit host and device attributes. if (getLangOpts().CUDA) CUDA().SetLambdaAttrs(Method); diff --git a/clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp b/clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp index ec6bb6f503578..2ba266a93f94f 100644 --- a/clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp +++ b/clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -std=c++23 -fsyntax-only -verify %s // This test is testing the diagnostics that Clang emits when compiling without '+sme'. @@ -48,3 +48,9 @@ void streaming_compatible_def2(void (*streaming_fn_ptr)(void) __arm_streaming, // Also test when call-site is not a function. 
 int streaming_decl_ret_int() __arm_streaming;
 int x = streaming_decl_ret_int(); // expected-error {{call to a streaming function requires 'sme'}}
+
+void sme_attrs_lambdas() {
+  [] __arm_locally_streaming () { return; }(); // expected-error {{function executed in streaming-SVE mode requires 'sme'}}
+  [] __arm_new("za") () { return; }(); // expected-error {{function using ZA state requires 'sme'}}
+  [] __arm_new("zt0") () { return; }(); // expected-error {{function using ZT0 state requires 'sme2'}}
+}

From 854cbbf4a8e7e98b7461eae2c2a37cfa767f791c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bal=C3=A1zs=20K=C3=A9ri?=
Date: Fri, 10 Jan 2025 11:17:06 +0100
Subject: [PATCH 043/408] [clang][analyzer] Split NullDereferenceChecker into modeling and reporting (#122139)

Besides null dereferences, the checker currently also reports
dereferences of undefined pointer values and of label addresses. If we
want to add more kinds of invalid dereference (or split the existing
functionality), it is more useful to have separate checkers. To make
this possible, the existing checker is split into a DereferenceModeling
part and a NullDereference checker that only switches on the
null-dereference check. This is a similar architecture to the one used
in MallocChecker and CStringChecker.

The change is almost NFC, but a new (modeling) checker is added. If the
NullDereference checker is turned off, any invalid dereference that is
found still stops the analysis, but without an emitted warning (this
differs from the old behavior).
---
 .../clang/StaticAnalyzer/Checkers/Checkers.td | 10 +++-
 .../Checkers/DereferenceChecker.cpp           | 51 ++++++++++++++-----
 .../test/Analysis/analyzer-enabled-checkers.c |  1 +
 ...c-library-functions-arg-enabled-checkers.c |  1 +
 4 files changed, 47 insertions(+), 16 deletions(-)

diff --git a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
index b34e940682fc5..1361da46c3c81 100644
--- a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
+++ b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
@@ -206,7 +206,12 @@ def CallAndMessageChecker : Checker<"CallAndMessage">,
   Documentation,
   Dependencies<[CallAndMessageModeling]>;
 
-def DereferenceChecker : Checker<"NullDereference">,
+def DereferenceModeling : Checker<"DereferenceModeling">,
+  HelpText<"General support for dereference related checkers">,
+  Documentation,
+  Hidden;
+
+def NullDereferenceChecker : Checker<"NullDereference">,
   HelpText<"Check for dereferences of null pointers">,
   CheckerOptions<[
     CmdLineOption, "true", Released>
   ]>,
-  Documentation;
+  Documentation,
+  Dependencies<[DereferenceModeling]>;
 
 def NonNullParamChecker : Checker<"NonNullParamChecker">,
   HelpText<"Check for null pointers passed as arguments to a function whose "
diff --git a/clang/lib/StaticAnalyzer/Checkers/DereferenceChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/DereferenceChecker.cpp
index 0355eede75eae..e9e2771c739b6 100644
--- a/clang/lib/StaticAnalyzer/Checkers/DereferenceChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/DereferenceChecker.cpp
@@ -33,12 +33,6 @@ class DereferenceChecker
                       EventDispatcher<ImplicitNullDerefEvent> > {
   enum DerefKind { NullPointer, UndefinedPointerValue, AddressOfLabel };
 
-  BugType BT_Null{this, "Dereference of null pointer", categories::LogicError};
-  BugType BT_Undef{this, "Dereference of undefined pointer value",
-                   categories::LogicError};
-  BugType BT_Label{this, "Dereference of the address of a label",
-                   categories::LogicError};
-
   void reportBug(DerefKind K, ProgramStateRef State, const Stmt *S,
                 CheckerContext &C) const;
 
@@ -56,6 +50,12 @@ class DereferenceChecker
                          bool loadedFrom = false);
 
   bool SuppressAddressSpaces = false;
+
+  bool CheckNullDereference = false;
+
+  std::unique_ptr<BugType> BT_Null;
+  std::unique_ptr<BugType> BT_Undef;
+  std::unique_ptr<BugType> BT_Label;
 };
 } // end anonymous namespace
 
@@ -155,22 +155,27 @@ static bool isDeclRefExprToReference(const Expr *E) {
 
 void DereferenceChecker::reportBug(DerefKind K, ProgramStateRef State,
                                    const Stmt *S, CheckerContext &C) const {
+  if (!CheckNullDereference) {
+    C.addSink();
+    return;
+  }
+
   const BugType *BT = nullptr;
   llvm::StringRef DerefStr1;
   llvm::StringRef DerefStr2;
   switch (K) {
   case DerefKind::NullPointer:
-    BT = &BT_Null;
+    BT = BT_Null.get();
     DerefStr1 = " results in a null pointer dereference";
     DerefStr2 = " results in a dereference of a null pointer";
     break;
   case DerefKind::UndefinedPointerValue:
-    BT = &BT_Undef;
+    BT = BT_Undef.get();
    DerefStr1 = " results in an undefined pointer dereference";
     DerefStr2 = " results in a dereference of an undefined pointer value";
     break;
   case DerefKind::AddressOfLabel:
-    BT = &BT_Label;
+    BT = BT_Label.get();
     DerefStr1 = " results in an undefined pointer dereference";
     DerefStr2 = " results in a dereference of an address of a label";
     break;
@@ -351,12 +356,30 @@ void DereferenceChecker::checkBind(SVal L, SVal V, const Stmt *S,
   C.addTransition(State, this);
 }
 
-void ento::registerDereferenceChecker(CheckerManager &mgr) {
-  auto *Chk = mgr.registerChecker<DereferenceChecker>();
-  Chk->SuppressAddressSpaces = mgr.getAnalyzerOptions().getCheckerBooleanOption(
-      mgr.getCurrentCheckerName(), "SuppressAddressSpaces");
+void ento::registerDereferenceModeling(CheckerManager &Mgr) {
+  Mgr.registerChecker<DereferenceChecker>();
+}
+
+bool ento::shouldRegisterDereferenceModeling(const CheckerManager &) {
+  return true;
+}
+
+void ento::registerNullDereferenceChecker(CheckerManager &Mgr) {
+  auto *Chk = Mgr.getChecker<DereferenceChecker>();
+  Chk->CheckNullDereference = true;
+  Chk->SuppressAddressSpaces = Mgr.getAnalyzerOptions().getCheckerBooleanOption(
+      Mgr.getCurrentCheckerName(), "SuppressAddressSpaces");
+  Chk->BT_Null.reset(new BugType(Mgr.getCurrentCheckerName(),
+                                 "Dereference of null pointer",
+                                 categories::LogicError));
+  Chk->BT_Undef.reset(new BugType(Mgr.getCurrentCheckerName(),
+                                  "Dereference of undefined pointer value",
+                                  categories::LogicError));
+  Chk->BT_Label.reset(new BugType(Mgr.getCurrentCheckerName(),
+                                  "Dereference of the address of a label",
+                                  categories::LogicError));
 }
 
-bool ento::shouldRegisterDereferenceChecker(const CheckerManager &mgr) {
+bool ento::shouldRegisterNullDereferenceChecker(const CheckerManager &) {
   return true;
 }
diff --git a/clang/test/Analysis/analyzer-enabled-checkers.c b/clang/test/Analysis/analyzer-enabled-checkers.c
index 160e35c77462d..c70aeb21ab045 100644
--- a/clang/test/Analysis/analyzer-enabled-checkers.c
+++ b/clang/test/Analysis/analyzer-enabled-checkers.c
@@ -14,6 +14,7 @@
 // CHECK-NEXT: core.BitwiseShift
 // CHECK-NEXT: core.CallAndMessageModeling
 // CHECK-NEXT: core.CallAndMessage
+// CHECK-NEXT: core.DereferenceModeling
 // CHECK-NEXT: core.DivideZero
 // CHECK-NEXT: core.DynamicTypePropagation
 // CHECK-NEXT: core.NonNullParamChecker
diff --git a/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c b/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c
index 0910f030b0f07..faf0a8f19d919 100644
--- a/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c
+++ b/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c
@@ -22,6 +22,7 @@
 // CHECK-NEXT: core.BitwiseShift
 //
CHECK-NEXT: core.CallAndMessageModeling // CHECK-NEXT: core.CallAndMessage +// CHECK-NEXT: core.DereferenceModeling // CHECK-NEXT: core.DivideZero // CHECK-NEXT: core.DynamicTypePropagation // CHECK-NEXT: core.NonNullParamChecker From 1ef258097293fb008bdf3a8955feae0f08fdd9ae Mon Sep 17 00:00:00 2001 From: Guray Ozen Date: Fri, 10 Jan 2025 11:26:59 +0100 Subject: [PATCH 044/408] [MLIR][NVVM] Add missing cmake dependency NVVMdialect uses InferIntRangeInterface, but its dependence was missing in cmake. This PR adds that. --- mlir/lib/Dialect/LLVMIR/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/mlir/lib/Dialect/LLVMIR/CMakeLists.txt b/mlir/lib/Dialect/LLVMIR/CMakeLists.txt index fce24b556036f..73da0598aaeb4 100644 --- a/mlir/lib/Dialect/LLVMIR/CMakeLists.txt +++ b/mlir/lib/Dialect/LLVMIR/CMakeLists.txt @@ -51,6 +51,7 @@ add_mlir_dialect_library(MLIRNVVMDialect MLIRNVVMOpsIncGen MLIRNVVMConversionsIncGen MLIRBasicPtxBuilderInterfaceIncGen + MLIRInferIntRangeInterface intrinsics_gen LINK_COMPONENTS From 3def49cb64ec1298290724081bd37dbdeb2ea5f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20Brku=C5=A1anin?= Date: Fri, 10 Jan 2025 11:30:22 +0100 Subject: [PATCH 045/408] [AMDGPU] Remove s_wakeup_barrier instruction (#122277) --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 1 - .../CodeGenOpenCL/builtins-amdgcn-gfx12.cl | 15 ----------- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 6 ----- .../AMDGPU/AMDGPUInstructionSelector.cpp | 5 ---- llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp | 1 - .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2 -- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 27 ++++--------------- llvm/lib/Target/AMDGPU/SOPInstructions.td | 12 --------- llvm/test/CodeGen/AMDGPU/s-barrier.ll | 9 ------- llvm/test/MC/AMDGPU/gfx12_asm_sop1.s | 9 ------- .../Disassembler/AMDGPU/gfx12_dasm_sop1.txt | 9 ------- 11 files changed, 5 insertions(+), 91 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 14c1746716cdd..1b29a8e359c20 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -489,7 +489,6 @@ TARGET_BUILTIN(__builtin_amdgcn_s_barrier_wait, "vIs", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_isfirst, "bIi", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_barrier_init, "vv*i", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_barrier_join, "vv*", "n", "gfx12-insts") -TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vv*", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "vIs", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_get_named_barrier_state, "Uiv*", "n", "gfx12-insts") diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl index b1866a8e492c8..5b5ae419f0a4a 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl @@ -173,21 +173,6 @@ void test_s_barrier_join(void *bar) __builtin_amdgcn_s_barrier_join(bar); } -// CHECK-LABEL: @test_s_wakeup_barrier( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) -// CHECK-NEXT: [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr -// CHECK-NEXT: store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8 -// 
CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3) -// CHECK-NEXT: call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) [[TMP1]]) -// CHECK-NEXT: ret void -// -void test_s_wakeup_barrier(void *bar) -{ - __builtin_amdgcn_s_wakeup_barrier(bar); -} - // CHECK-LABEL: @test_s_barrier_leave( // CHECK-NEXT: entry: // CHECK-NEXT: call void @llvm.amdgcn.s.barrier.leave(i16 1) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 92418b9104ad1..b930d6983e225 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -284,12 +284,6 @@ def int_amdgcn_s_barrier_join : ClangBuiltin<"__builtin_amdgcn_s_barrier_join">, Intrinsic<[], [local_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; -// void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) %barrier) -// The %barrier argument must be uniform, otherwise behavior is undefined. -def int_amdgcn_s_wakeup_barrier : ClangBuiltin<"__builtin_amdgcn_s_wakeup_barrier">, - Intrinsic<[], [local_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, - IntrNoCallback, IntrNoFree]>; - // void @llvm.amdgcn.s.barrier.wait(i16 %barrierType) def int_amdgcn_s_barrier_wait : ClangBuiltin<"__builtin_amdgcn_s_barrier_wait">, Intrinsic<[], [llvm_i16_ty], [ImmArg>, IntrNoMem, IntrHasSideEffects, IntrConvergent, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 1e654b260cbfa..926c1e4b23b4a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2239,7 +2239,6 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( case Intrinsic::amdgcn_s_barrier_signal_var: return selectNamedBarrierInit(I, IntrinsicID); case Intrinsic::amdgcn_s_barrier_join: - case Intrinsic::amdgcn_s_wakeup_barrier: case Intrinsic::amdgcn_s_get_named_barrier_state: return selectNamedBarrierInst(I, IntrinsicID); case Intrinsic::amdgcn_s_get_barrier_state: @@ -5838,8 +5837,6 @@ unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) { llvm_unreachable("not a named barrier op"); case Intrinsic::amdgcn_s_barrier_join: return AMDGPU::S_BARRIER_JOIN_IMM; - case Intrinsic::amdgcn_s_wakeup_barrier: - return AMDGPU::S_WAKEUP_BARRIER_IMM; case Intrinsic::amdgcn_s_get_named_barrier_state: return AMDGPU::S_GET_BARRIER_STATE_IMM; }; @@ -5849,8 +5846,6 @@ unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) { llvm_unreachable("not a named barrier op"); case Intrinsic::amdgcn_s_barrier_join: return AMDGPU::S_BARRIER_JOIN_M0; - case Intrinsic::amdgcn_s_wakeup_barrier: - return AMDGPU::S_WAKEUP_BARRIER_M0; case Intrinsic::amdgcn_s_get_named_barrier_state: return AMDGPU::S_GET_BARRIER_STATE_M0; }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp index 2df068d8fb007..0406ba9c68ccd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp @@ -326,7 +326,6 @@ bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) { case Intrinsic::amdgcn_s_barrier_wait: case Intrinsic::amdgcn_s_barrier_leave: case Intrinsic::amdgcn_s_get_barrier_state: - case Intrinsic::amdgcn_s_wakeup_barrier: case Intrinsic::amdgcn_wave_barrier: case Intrinsic::amdgcn_sched_barrier: case Intrinsic::amdgcn_sched_group_barrier: diff --git 
a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 08e23cbf34e42..224c368cff4a1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3304,7 +3304,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl( constrainOpWithReadfirstlane(B, MI, 1); return; case Intrinsic::amdgcn_s_barrier_join: - case Intrinsic::amdgcn_s_wakeup_barrier: constrainOpWithReadfirstlane(B, MI, 1); return; case Intrinsic::amdgcn_s_barrier_init: @@ -5272,7 +5271,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); break; case Intrinsic::amdgcn_s_barrier_join: - case Intrinsic::amdgcn_s_wakeup_barrier: OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); break; case Intrinsic::amdgcn_s_barrier_init: diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 529d9ba17d4f6..69dca988b2cad 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -10107,8 +10107,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); return SDValue(NewMI, 0); } - case Intrinsic::amdgcn_s_barrier_join: - case Intrinsic::amdgcn_s_wakeup_barrier: { + case Intrinsic::amdgcn_s_barrier_join: { // these three intrinsics have one operand: barrier pointer SDValue Chain = Op->getOperand(0); SmallVector Ops; @@ -10117,32 +10116,16 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, if (isa(BarOp)) { uint64_t BarVal = cast(BarOp)->getZExtValue(); - switch (IntrinsicID) { - default: - return SDValue(); - case Intrinsic::amdgcn_s_barrier_join: - Opc = AMDGPU::S_BARRIER_JOIN_IMM; - break; - case Intrinsic::amdgcn_s_wakeup_barrier: - Opc = AMDGPU::S_WAKEUP_BARRIER_IMM; - break; - } + Opc = AMDGPU::S_BARRIER_JOIN_IMM; + // extract the BarrierID from bits 4-9 of the immediate unsigned BarID = (BarVal >> 4) & 0x3F; SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32); Ops.push_back(K); Ops.push_back(Chain); } else { - switch (IntrinsicID) { - default: - return SDValue(); - case Intrinsic::amdgcn_s_barrier_join: - Opc = AMDGPU::S_BARRIER_JOIN_M0; - break; - case Intrinsic::amdgcn_s_wakeup_barrier: - Opc = AMDGPU::S_WAKEUP_BARRIER_M0; - break; - } + Opc = AMDGPU::S_BARRIER_JOIN_M0; + // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0] SDValue M0Val; M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp, diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 46ac2a4992c45..da186f7058d18 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -488,11 +488,6 @@ def S_BARRIER_JOIN_M0 : SOP1_Pseudo <"s_barrier_join m0", (outs), (ins), let isConvergent = 1; } -def S_WAKEUP_BARRIER_M0 : SOP1_Pseudo <"s_wakeup_barrier m0", (outs), (ins), - "", []>{ - let SchedRW = [WriteBarrier]; - let isConvergent = 1; -} } // End Uses = [M0] def S_BARRIER_SIGNAL_IMM : SOP1_Pseudo <"s_barrier_signal", (outs), @@ -514,11 +509,6 @@ def S_BARRIER_JOIN_IMM : SOP1_Pseudo <"s_barrier_join", (outs), let isConvergent = 1; } -def S_WAKEUP_BARRIER_IMM : SOP1_Pseudo <"s_wakeup_barrier", (outs), - (ins SplitBarrier:$src0), "$src0", []>{ - let SchedRW = [WriteBarrier]; - let isConvergent = 1; -} } // End has_sdst = 0 def S_GET_BARRIER_STATE_IMM : SOP1_Pseudo 
<"s_get_barrier_state", (outs SSrc_b32:$sdst), @@ -2092,13 +2082,11 @@ defm S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_M0_Real_gfx12<0x04f>; defm S_GET_BARRIER_STATE_M0 : SOP1_M0_Real_gfx12<0x050>; defm S_BARRIER_INIT_M0 : SOP1_M0_Real_gfx12<0x051>; defm S_BARRIER_JOIN_M0 : SOP1_M0_Real_gfx12<0x052>; -defm S_WAKEUP_BARRIER_M0 : SOP1_M0_Real_gfx12<0x057>; defm S_BARRIER_SIGNAL_IMM : SOP1_IMM_Real_gfx12<0x04e>; defm S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_IMM_Real_gfx12<0x04f>; defm S_GET_BARRIER_STATE_IMM : SOP1_IMM_Real_gfx12<0x050>; defm S_BARRIER_INIT_IMM : SOP1_IMM_Real_gfx12<0x051>; defm S_BARRIER_JOIN_IMM : SOP1_IMM_Real_gfx12<0x052>; -defm S_WAKEUP_BARRIER_IMM : SOP1_IMM_Real_gfx12<0x057>; defm S_SLEEP_VAR : SOP1_IMM_Real_gfx12<0x058>; //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier.ll b/llvm/test/CodeGen/AMDGPU/s-barrier.ll index a2624e5f61307..83a077f7f74db 100644 --- a/llvm/test/CodeGen/AMDGPU/s-barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/s-barrier.ll @@ -112,10 +112,6 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) ; GFX12-SDAG-NEXT: s_mov_b32 m0, 2 ; GFX12-SDAG-NEXT: s_barrier_wait 1 ; GFX12-SDAG-NEXT: s_barrier_leave -; GFX12-SDAG-NEXT: s_wakeup_barrier m0 -; GFX12-SDAG-NEXT: s_mov_b32 m0, s2 -; GFX12-SDAG-NEXT: s_wakeup_barrier m0 -; GFX12-SDAG-NEXT: s_mov_b32 m0, 2 ; GFX12-SDAG-NEXT: s_get_barrier_state s3, m0 ; GFX12-SDAG-NEXT: s_mov_b32 m0, s2 ; GFX12-SDAG-NEXT: s_get_barrier_state s2, m0 @@ -176,8 +172,6 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) ; GFX12-GISEL-NEXT: s_barrier_join m0 ; GFX12-GISEL-NEXT: s_barrier_wait 1 ; GFX12-GISEL-NEXT: s_barrier_leave -; GFX12-GISEL-NEXT: s_wakeup_barrier 2 -; GFX12-GISEL-NEXT: s_wakeup_barrier m0 ; GFX12-GISEL-NEXT: s_get_barrier_state s0, 2 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_get_barrier_state s0, m0 @@ -218,8 +212,6 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) %in) call void @llvm.amdgcn.s.barrier.wait(i16 1) call void @llvm.amdgcn.s.barrier.leave(i16 1) - call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) @bar) - call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) %in) %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar) %state2 = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) %in) call void @llvm.amdgcn.s.barrier() @@ -295,7 +287,6 @@ declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1 declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1 declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1 declare void @llvm.amdgcn.s.barrier.leave(i16) #1 -declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1 declare i32 @llvm.amdgcn.s.get.barrier.state(i32) #1 declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1 diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s index 939320e9ef2dc..d93ea2e82c1d2 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s @@ -726,15 +726,6 @@ s_barrier_join -2 s_barrier_join m0 // GFX12: encoding: [0x7d,0x52,0x80,0xbe] -s_wakeup_barrier 1 -// GFX12: encoding: [0x81,0x57,0x80,0xbe] - -s_wakeup_barrier -1 -// GFX12: encoding: [0xc1,0x57,0x80,0xbe] - -s_wakeup_barrier m0 -// GFX12: encoding: [0x7d,0x57,0x80,0xbe] - s_get_barrier_state s3, -1 // GFX12: encoding: [0xc1,0x50,0x83,0xbe] 
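Editor's note, illustration only (not part of the patch): IR that used the removed intrinsic, such as the line deleted from s-barrier.ll above,

    call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) @bar)

is no longer accepted; only the remaining split-barrier intrinsics (s.barrier.init, s.barrier.join, s.barrier.leave, s.get.named.barrier.state) stay available.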
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt index f8c235f77b5f5..2cb6da42213e3 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt @@ -726,15 +726,6 @@ # GFX12: s_barrier_join m0 ; encoding: [0x7d,0x52,0x80,0xbe] 0x7d,0x52,0x80,0xbe -# GFX12: s_wakeup_barrier 1 ; encoding: [0x81,0x57,0x80,0xbe] -0x81,0x57,0x80,0xbe - -# GFX12: s_wakeup_barrier -1 ; encoding: [0xc1,0x57,0x80,0xbe] -0xc1,0x57,0x80,0xbe - -# GFX12: s_wakeup_barrier m0 ; encoding: [0x7d,0x57,0x80,0xbe] -0x7d,0x57,0x80,0xbe - # GFX12: s_get_barrier_state s3, -1 ; encoding: [0xc1,0x50,0x83,0xbe] 0xc1,0x50,0x83,0xbe From 2e6030ef6a1792bea40aa6b0421f9a5fc9243214 Mon Sep 17 00:00:00 2001 From: Guray Ozen Date: Fri, 10 Jan 2025 12:22:20 +0100 Subject: [PATCH 046/408] [MLIR][NVVM] Add missing cmake dependency Another fix --- mlir/lib/Dialect/LLVMIR/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/LLVMIR/CMakeLists.txt b/mlir/lib/Dialect/LLVMIR/CMakeLists.txt index 73da0598aaeb4..c9a3b97294562 100644 --- a/mlir/lib/Dialect/LLVMIR/CMakeLists.txt +++ b/mlir/lib/Dialect/LLVMIR/CMakeLists.txt @@ -51,7 +51,6 @@ add_mlir_dialect_library(MLIRNVVMDialect MLIRNVVMOpsIncGen MLIRNVVMConversionsIncGen MLIRBasicPtxBuilderInterfaceIncGen - MLIRInferIntRangeInterface intrinsics_gen LINK_COMPONENTS @@ -62,6 +61,7 @@ add_mlir_dialect_library(MLIRNVVMDialect MLIRIR MLIRLLVMDialect MLIRSideEffectInterfaces + MLIRInferIntRangeInterface ) add_mlir_dialect_library(MLIRROCDLDialect From e9e7b2adcf28c702f4ad37bad34ac437ee680799 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 10 Jan 2025 12:25:36 +0100 Subject: [PATCH 047/408] [SDAG] Set IsPostTypeLegalization flag in LegalizeDAG (#122278) This runs after type legalization and as such should set IsPostTypeLegalization when creating libcalls. I don't think this makes any observable difference right now, but I ran into this issue in an upcoming patch. --- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 595a410101eca..c6475f0219903 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -2153,6 +2153,7 @@ void SelectionDAGLegalize::ExpandFPLibCall(SDNode* Node, EVT RetVT = Node->getValueType(0); SmallVector Ops(drop_begin(Node->ops())); TargetLowering::MakeLibCallOptions CallOptions; + CallOptions.IsPostTypeLegalization = true; // FIXME: This doesn't support tail calls. std::pair Tmp = TLI.makeLibCall(DAG, LC, RetVT, Ops, CallOptions, @@ -4342,6 +4343,8 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { LLVM_DEBUG(dbgs() << "Trying to convert node to libcall\n"); SmallVector Results; SDLoc dl(Node); + TargetLowering::MakeLibCallOptions CallOptions; + CallOptions.IsPostTypeLegalization = true; // FIXME: Check flags on the node to see if we can use a finite call. 
unsigned Opc = Node->getOpcode(); switch (Opc) { @@ -4384,7 +4387,6 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { AtomicOrdering Order = cast(Node)->getMergedOrdering(); RTLIB::Libcall LC = RTLIB::getOUTLINE_ATOMIC(Opc, Order, VT); EVT RetVT = Node->getValueType(0); - TargetLowering::MakeLibCallOptions CallOptions; SmallVector Ops; if (TLI.getLibcallName(LC)) { // If outline atomic available, prepare its arguments and expand. @@ -4422,7 +4424,6 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { break; } case ISD::CLEAR_CACHE: { - TargetLowering::MakeLibCallOptions CallOptions; SDValue InputChain = Node->getOperand(0); SDValue StartVal = Node->getOperand(1); SDValue EndVal = Node->getOperand(2); @@ -4728,7 +4729,6 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { break; case ISD::STRICT_BF16_TO_FP: if (Node->getValueType(0) == MVT::f32) { - TargetLowering::MakeLibCallOptions CallOptions; std::pair Tmp = TLI.makeLibCall( DAG, RTLIB::FPEXT_BF16_F32, MVT::f32, Node->getOperand(1), CallOptions, SDLoc(Node), Node->getOperand(0)); @@ -4738,7 +4738,6 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { break; case ISD::STRICT_FP16_TO_FP: { if (Node->getValueType(0) == MVT::f32) { - TargetLowering::MakeLibCallOptions CallOptions; std::pair Tmp = TLI.makeLibCall( DAG, RTLIB::FPEXT_F16_F32, MVT::f32, Node->getOperand(1), CallOptions, SDLoc(Node), Node->getOperand(0)); @@ -4793,7 +4792,6 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { // Sign/zero extend the argument if the libcall takes a larger type. SDValue Op = DAG.getNode(Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl, NVT, Node->getOperand(IsStrict ? 1 : 0)); - TargetLowering::MakeLibCallOptions CallOptions; CallOptions.setIsSigned(Signed); std::pair Tmp = TLI.makeLibCall(DAG, LC, RVT, Op, CallOptions, dl, Chain); @@ -4833,7 +4831,6 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unable to legalize as libcall"); SDValue Chain = IsStrict ? Node->getOperand(0) : SDValue(); - TargetLowering::MakeLibCallOptions CallOptions; std::pair Tmp = TLI.makeLibCall(DAG, LC, NVT, Op, CallOptions, dl, Chain); @@ -4861,7 +4858,6 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { RTLIB::Libcall LC = RTLIB::getFPROUND(Op.getValueType(), VT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unable to legalize as libcall"); - TargetLowering::MakeLibCallOptions CallOptions; std::pair Tmp = TLI.makeLibCall(DAG, LC, VT, Op, CallOptions, SDLoc(Node), Chain); Results.push_back(Tmp.first); @@ -4890,7 +4886,6 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unable to legalize as libcall"); - TargetLowering::MakeLibCallOptions CallOptions; std::pair Tmp = TLI.makeLibCall(DAG, LC, Node->getValueType(0), Node->getOperand(1), CallOptions, SDLoc(Node), Node->getOperand(0)); From 4f69f4579132900949a7886fe3ba92d693430da0 Mon Sep 17 00:00:00 2001 From: Maksim Ivanov Date: Fri, 10 Jan 2025 12:32:19 +0100 Subject: [PATCH 048/408] (reland) [clang] Warn [[clang::lifetimebound]] misusages on types (#118501) This relands #118281 as-is, after it got reverted in commit 356df2dd72e8299b5de58e9390283110c19f7c76. The reland can go in after we fixed some downstream codebases that had incorrectly placed attributes. 
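As a rough illustration (hypothetical declarations, not code from the affected projects), the downstream fix-ups amounted to moving the attribute off the parameter's type and onto the parameter itself:

```
// Rejected after this change: the attribute is written on the type.
const int &first_elem(const int * [[clang::lifetimebound]] p);

// Accepted placement: the attribute applies to the parameter declaration.
const int &first_elem(const int *p [[clang::lifetimebound]]);
```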
Original commit description: > Emit the "cannot be applied to types" warning instead of silently ignoring the attribute when it's attempted to be used on a type (instead of a function argument or the function definition). > > Before this commit, the warning has been printed when the attribute was (mis)used on a decl-specifier, but not in other places in a declarator. > > Examples where the warning starts being emitted with this commit: > > ``` > int * [[clang::lifetimebound]] x; > > void f(int * [[clang::lifetimebound]] x); > > void g(int * [[clang::lifetimebound]]); > ``` > > Note that the last example is the case of an unnamed function parameter. While in theory Clang could've supported the `[[clang::lifetimebound]]` analysis for unnamed parameters, it doesn't currently, so the commit at least makes the situation better by highlighting this as a warning instead of a silent ignore - which was reported at #96034. --- clang/docs/ReleaseNotes.rst | 17 ++++++++++++++++- clang/lib/Sema/SemaType.cpp | 4 ++++ clang/test/SemaCXX/attr-lifetimebound.cpp | 18 ++++++++++++++++-- 3 files changed, 36 insertions(+), 3 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 07a1a4195427d..511a28c5554bb 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -147,7 +147,7 @@ C++ Specific Potentially Breaking Changes // Fixed version: unsigned operator""_udl_name(unsigned long long); -- Clang will now produce an error diagnostic when [[clang::lifetimebound]] is +- Clang will now produce an error diagnostic when ``[[clang::lifetimebound]]`` is applied on a parameter or an implicit object parameter of a function that returns void. This was previously ignored and had no effect. (#GH107556) @@ -156,6 +156,21 @@ C++ Specific Potentially Breaking Changes // Now diagnoses with an error. void f(int& i [[clang::lifetimebound]]); +- Clang will now produce an error diagnostic when ``[[clang::lifetimebound]]`` + is applied on a type (instead of a function parameter or an implicit object + parameter); this includes the case when the attribute is specified for an + unnamed function parameter. These were previously ignored and had no effect. + (#GH118281) + + .. code-block:: c++ + + // Now diagnoses with an error. + int* [[clang::lifetimebound]] x; + // Now diagnoses with an error. + void f(int* [[clang::lifetimebound]] i); + // Now diagnoses with an error. + void g(int* [[clang::lifetimebound]]); + - Clang now rejects all field accesses on null pointers in constant expressions. 
The following code used to work but will now be rejected: diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 83464c50b4b23..d46fbca9ee976 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -8628,7 +8628,11 @@ static void HandleLifetimeBoundAttr(TypeProcessingState &State, CurType = State.getAttributedType( createSimpleAttr(State.getSema().Context, Attr), CurType, CurType); + return; } + State.getSema().Diag(Attr.getLoc(), diag::err_attribute_wrong_decl_type_str) + << Attr << Attr.isRegularKeywordAttribute() + << "parameters and implicit object parameters"; } static void HandleLifetimeCaptureByAttr(TypeProcessingState &State, diff --git a/clang/test/SemaCXX/attr-lifetimebound.cpp b/clang/test/SemaCXX/attr-lifetimebound.cpp index f89b556f5bba0..c7abec61873ef 100644 --- a/clang/test/SemaCXX/attr-lifetimebound.cpp +++ b/clang/test/SemaCXX/attr-lifetimebound.cpp @@ -9,11 +9,25 @@ namespace usage_invalid { ~A() [[clang::lifetimebound]]; // expected-error {{cannot be applied to a destructor}} static int *static_class_member() [[clang::lifetimebound]]; // expected-error {{static member function has no implicit object parameter}} int *explicit_object(this A&) [[clang::lifetimebound]]; // expected-error {{explicit object member function has no implicit object parameter}} - int not_function [[clang::lifetimebound]]; // expected-error {{only applies to parameters and implicit object parameters}} - int [[clang::lifetimebound]] also_not_function; // expected-error {{cannot be applied to types}} + int attr_on_var [[clang::lifetimebound]]; // expected-error {{only applies to parameters and implicit object parameters}} + int [[clang::lifetimebound]] attr_on_int; // expected-error {{cannot be applied to types}} + int * [[clang::lifetimebound]] attr_on_int_ptr; // expected-error {{'lifetimebound' attribute only applies to parameters and implicit object parameters}} + int * [[clang::lifetimebound]] * attr_on_int_ptr_ptr; // expected-error {{'lifetimebound' attribute only applies to parameters and implicit object parameters}} + int (* [[clang::lifetimebound]] attr_on_func_ptr)(); // expected-error {{'lifetimebound' attribute only applies to parameters and implicit object parameters}} void void_return_member() [[clang::lifetimebound]]; // expected-error {{'lifetimebound' attribute cannot be applied to an implicit object parameter of a function that returns void; did you mean 'lifetime_capture_by(X)'}} }; int *attr_with_param(int ¶m [[clang::lifetimebound(42)]]); // expected-error {{takes no arguments}} + + void attr_on_ptr_arg(int * [[clang::lifetimebound]] ptr); // expected-error {{'lifetimebound' attribute only applies to parameters and implicit object parameters}} + static_assert((int [[clang::lifetimebound]]) 12); // expected-error {{cannot be applied to types}} + int* attr_on_unnamed_arg(const int& [[clang::lifetimebound]]); // expected-error {{'lifetimebound' attribute only applies to parameters and implicit object parameters}} + template + int* attr_on_template_ptr_arg(T * [[clang::lifetimebound]] ptr); // expected-error {{'lifetimebound' attribute only applies to parameters and implicit object parameters}} + + int (*func_ptr)(int) [[clang::lifetimebound]]; // expected-error {{'lifetimebound' attribute only applies to parameters and implicit object parameters}} + int (*(*func_ptr_ptr)(int) [[clang::lifetimebound]])(int); // expected-error {{'lifetimebound' attribute only applies to parameters and implicit object parameters}} + struct X {}; + int 
(X::*member_func_ptr)(int) [[clang::lifetimebound]]; // expected-error {{'lifetimebound' attribute only applies to parameters and implicit object parameters}} } namespace usage_ok { From eca8ec0c95355992e24f0dfcdec88c8bfc3d014a Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 10 Jan 2025 12:35:37 +0100 Subject: [PATCH 049/408] [LLVM][Maintainers] Remove disclaimer This list is mostly up to date now, so remove the disclaimer. --- llvm/Maintainers.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/llvm/Maintainers.md b/llvm/Maintainers.md index bad029ca6fb3c..e2af991ed37b1 100644 --- a/llvm/Maintainers.md +++ b/llvm/Maintainers.md @@ -10,8 +10,6 @@ The following people are the active maintainers for the project. Please reach out to them for code reviews, questions about their area of expertise, or other assistance. -**Warning: The maintainer list for LLVM is currently not up to date.** - ### Lead maintainer The lead maintainer is responsible for all parts of LLVM not covered by somebody else. From 5a069eac5fbb7752e7602b783ee0102e8269c47a Mon Sep 17 00:00:00 2001 From: David Green Date: Fri, 10 Jan 2025 11:54:46 +0000 Subject: [PATCH 050/408] [AArch64] Don't try to sink and(load) (#122274) If we sink the and in and(load), CGP can hoist it back again to the load, getting into an infinite loop. This prevents sinking the and in this case. Fixes #122074 --- .../AArch64/AArch64TargetTransformInfo.cpp | 5 +- .../CodeGen/AArch64/aarch64-dup-ext-crash.ll | 50 ++++++++++++++++++- 2 files changed, 52 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 29ea098386cec..932a6f9ce23fd 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -5514,7 +5514,10 @@ bool AArch64TTIImpl::isProfitableToSinkOperands( NumZExts++; } - Ops.push_back(&Insert->getOperandUse(1)); + // And(Load) is excluded to prevent CGP getting stuck in a loop of sinking + // the And, just to hoist it again back to the load. + if (!match(OperandInstr, m_And(m_Load(m_Value()), m_Value()))) + Ops.push_back(&Insert->getOperandUse(1)); Ops.push_back(&Shuffle->getOperandUse(0)); Ops.push_back(&Op); } diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext-crash.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext-crash.ll index 95c54cd8b0151..478c1be8821f6 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-dup-ext-crash.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext-crash.ll @@ -9,7 +9,7 @@ target triple = "aarch64-unknown-linux-gnu" ; here, only that this case no longer causes said crash.
define dso_local i32 @dupext_crashtest(i32 %e) local_unnamed_addr { ; CHECK-LABEL: dupext_crashtest: -; CHECK: // %bb.0: // %for.body.lr.ph +; CHECK: // %bb.0: // %entry ; CHECK-NEXT: dup v0.2s, w0 ; CHECK-NEXT: .LBB0_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 @@ -18,7 +18,7 @@ define dso_local i32 @dupext_crashtest(i32 %e) local_unnamed_addr { ; CHECK-NEXT: xtn v1.2s, v1.2d ; CHECK-NEXT: str d1, [x8] ; CHECK-NEXT: b .LBB0_1 -for.body.lr.ph: +entry: %conv314 = zext i32 %e to i64 br label %vector.memcheck @@ -40,3 +40,49 @@ vector.body: ; preds = %vector.body, %vecto store <2 x i32> %3, ptr %4, align 4 br label %vector.body } + +; This test got stuck in a loop hoisting the and to the load, and sinking it back to the mull +define i32 @dup_and_load(ptr %p, i1 %c) { +; CHECK-LABEL: dup_and_load: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: ldrb w0, [x0] +; CHECK-NEXT: tbz w1, #0, .LBB1_3 +; CHECK-NEXT: // %bb.1: // %ph +; CHECK-NEXT: dup v0.8h, w0 +; CHECK-NEXT: mov w9, wzr +; CHECK-NEXT: .LBB1_2: // %vector.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr d1, [x8] +; CHECK-NEXT: add w9, w9, #1 +; CHECK-NEXT: cmp w9, #100 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: umull2 v2.4s, v0.8h, v1.8h +; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h +; CHECK-NEXT: stp q1, q2, [x8] +; CHECK-NEXT: b.lt .LBB1_2 +; CHECK-NEXT: .LBB1_3: // %end +; CHECK-NEXT: ret +entry: + %l = load i32, ptr %p + %and255 = and i32 %l, 255 + br i1 %c, label %ph, label %end + +ph: + %broadcast.splatinsert = insertelement <8 x i32> poison, i32 %and255, i32 0 + %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> poison, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %iv = phi i32 [ 0, %ph ], [ %iv.next, %vector.body ] + %wide.load = load <8 x i8>, ptr %p, align 4 + %0 = zext <8 x i8> %wide.load to <8 x i32> + %1 = mul <8 x i32> %broadcast.splat, %0 + store <8 x i32> %1, ptr %p, align 4 + %iv.next = add i32 %iv, 1 + %e = icmp slt i32 %iv.next, 100 + br i1 %e, label %vector.body, label %end + +end: + ret i32 %and255 +} From c39500f88c93f668c68bdafe56bd8d16e8abbec1 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 10 Jan 2025 12:57:00 +0100 Subject: [PATCH 051/408] Revert "[GVN] MemorySSA for GVN: add optional `AllowMemorySSA`" This reverts commit eb63cd62a4a1907dbd58f12660efd8244e7d81e9. This changes the preservation behavior for MSSA when the new flag is not enabled. --- llvm/include/llvm/Transforms/Scalar/GVN.h | 13 ++------- llvm/lib/Passes/PassBuilder.cpp | 2 -- llvm/lib/Passes/PassRegistry.def | 2 +- llvm/lib/Transforms/Scalar/GVN.cpp | 35 +++++++---------------- llvm/test/Other/new-pm-print-pipeline.ll | 4 +-- 5 files changed, 16 insertions(+), 40 deletions(-) diff --git a/llvm/include/llvm/Transforms/Scalar/GVN.h b/llvm/include/llvm/Transforms/Scalar/GVN.h index c8be390799836..be6c0ec5edab0 100644 --- a/llvm/include/llvm/Transforms/Scalar/GVN.h +++ b/llvm/include/llvm/Transforms/Scalar/GVN.h @@ -77,7 +77,6 @@ struct GVNOptions { std::optional AllowLoadInLoopPRE; std::optional AllowLoadPRESplitBackedge; std::optional AllowMemDep; - std::optional AllowMemorySSA; GVNOptions() = default; @@ -109,12 +108,6 @@ struct GVNOptions { AllowMemDep = MemDep; return *this; } - - /// Enables or disables use of MemorySSA. - GVNOptions &setMemorySSA(bool MemSSA) { - AllowMemorySSA = MemSSA; - return *this; - } }; /// The core GVN pass object. 
@@ -151,7 +144,6 @@ class GVNPass : public PassInfoMixin { bool isLoadInLoopPREEnabled() const; bool isLoadPRESplitBackedgeEnabled() const; bool isMemDepEnabled() const; - bool isMemorySSAEnabled() const; /// This class holds the mapping between values and value numbers. It is used /// as an efficient mechanism to determine the expression-wise equivalence of @@ -391,8 +383,9 @@ class GVNPass : public PassInfoMixin { void assignBlockRPONumber(Function &F); }; -/// Create a legacy GVN pass. -FunctionPass *createGVNPass(); +/// Create a legacy GVN pass. This also allows parameterizing whether or not +/// MemDep is enabled. +FunctionPass *createGVNPass(bool NoMemDepAnalysis = false); /// A simple and fast domtree-based GVN pass to hoist common expressions /// from sibling branches. diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 90d11956d62a7..aac4407740055 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -1042,8 +1042,6 @@ Expected parseGVNOptions(StringRef Params) { Result.setLoadPRESplitBackedge(Enable); } else if (ParamName == "memdep") { Result.setMemDep(Enable); - } else if (ParamName == "memoryssa") { - Result.setMemorySSA(Enable); } else { return make_error( formatv("invalid GVN pass parameter '{0}' ", ParamName).str(), diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index a93a995655a14..1021d7fcd9247 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -526,7 +526,7 @@ FUNCTION_PASS_WITH_PARAMS( "gvn", "GVNPass", [](GVNOptions Opts) { return GVNPass(Opts); }, parseGVNOptions, "no-pre;pre;no-load-pre;load-pre;no-split-backedge-load-pre;" - "split-backedge-load-pre;no-memdep;memdep;no-memoryssa;memoryssa") + "split-backedge-load-pre;no-memdep;memdep") FUNCTION_PASS_WITH_PARAMS( "hardware-loops", "HardwareLoopsPass", [](HardwareLoopOptions Opts) { return HardwareLoopsPass(Opts); }, diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index 8d27a22570e9c..229fffe92b99c 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -113,8 +113,6 @@ static cl::opt GVNEnableSplitBackedgeInLoadPRE("enable-split-backedge-in-load-pre", cl::init(false)); static cl::opt GVNEnableMemDep("enable-gvn-memdep", cl::init(true)); -static cl::opt GVNEnableMemorySSA("enable-gvn-memoryssa", - cl::init(false)); static cl::opt MaxNumDeps( "gvn-max-num-deps", cl::Hidden, cl::init(100), @@ -822,10 +820,6 @@ bool GVNPass::isMemDepEnabled() const { return Options.AllowMemDep.value_or(GVNEnableMemDep); } -bool GVNPass::isMemorySSAEnabled() const { - return Options.AllowMemorySSA.value_or(GVNEnableMemorySSA); -} - PreservedAnalyses GVNPass::run(Function &F, FunctionAnalysisManager &AM) { // FIXME: The order of evaluation of these 'getResult' calls is very // significant! Re-ordering these variables will cause GVN when run alone to @@ -838,10 +832,7 @@ PreservedAnalyses GVNPass::run(Function &F, FunctionAnalysisManager &AM) { auto *MemDep = isMemDepEnabled() ? &AM.getResult(F) : nullptr; auto &LI = AM.getResult(F); - auto *MSSA = - isMemorySSAEnabled() ? &AM.getResult(F) : nullptr; - assert(!(MemDep && MSSA) && - "Should not use both MemDep and MemorySSA simultaneously!"); + auto *MSSA = AM.getCachedResult(F); auto &ORE = AM.getResult(F); bool Changed = runImpl(F, AC, DT, TLI, AA, MemDep, LI, &ORE, MSSA ? 
&MSSA->getMSSA() : nullptr); @@ -870,9 +861,7 @@ void GVNPass::printPipeline( OS << (*Options.AllowLoadPRESplitBackedge ? "" : "no-") << "split-backedge-load-pre;"; if (Options.AllowMemDep != std::nullopt) - OS << (*Options.AllowMemDep ? "" : "no-") << "memdep;"; - if (Options.AllowMemorySSA != std::nullopt) - OS << (*Options.AllowMemorySSA ? "" : "no-") << "memoryssa"; + OS << (*Options.AllowMemDep ? "" : "no-") << "memdep"; OS << '>'; } @@ -3304,11 +3293,8 @@ class llvm::gvn::GVNLegacyPass : public FunctionPass { public: static char ID; // Pass identification, replacement for typeid - explicit GVNLegacyPass(bool MemDepAnalysis = GVNEnableMemDep, - bool MemSSAAnalysis = GVNEnableMemorySSA) - : FunctionPass(ID), Impl(GVNOptions() - .setMemDep(MemDepAnalysis) - .setMemorySSA(MemSSAAnalysis)) { + explicit GVNLegacyPass(bool NoMemDepAnalysis = !GVNEnableMemDep) + : FunctionPass(ID), Impl(GVNOptions().setMemDep(!NoMemDepAnalysis)) { initializeGVNLegacyPassPass(*PassRegistry::getPassRegistry()); } @@ -3316,6 +3302,7 @@ class llvm::gvn::GVNLegacyPass : public FunctionPass { if (skipFunction(F)) return false; + auto *MSSAWP = getAnalysisIfAvailable(); return Impl.runImpl( F, getAnalysis().getAssumptionCache(F), getAnalysis().getDomTree(), @@ -3326,9 +3313,7 @@ class llvm::gvn::GVNLegacyPass : public FunctionPass { : nullptr, getAnalysis().getLoopInfo(), &getAnalysis().getORE(), - Impl.isMemorySSAEnabled() - ? &getAnalysis().getMSSA() - : nullptr); + MSSAWP ? &MSSAWP->getMSSA() : nullptr); } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -3344,8 +3329,7 @@ class llvm::gvn::GVNLegacyPass : public FunctionPass { AU.addPreserved(); AU.addPreserved(); AU.addRequired(); - if (Impl.isMemorySSAEnabled()) - AU.addRequired(); + AU.addPreserved(); } private: @@ -3357,7 +3341,6 @@ char GVNLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(GVNLegacyPass, "gvn", "Global Value Numbering", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass) -INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) @@ -3366,4 +3349,6 @@ INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) INITIALIZE_PASS_END(GVNLegacyPass, "gvn", "Global Value Numbering", false, false) // The public interface to this file... 
-FunctionPass *llvm::createGVNPass() { return new GVNLegacyPass(); } +FunctionPass *llvm::createGVNPass(bool NoMemDepAnalysis) { + return new GVNLegacyPass(NoMemDepAnalysis); +} diff --git a/llvm/test/Other/new-pm-print-pipeline.ll b/llvm/test/Other/new-pm-print-pipeline.ll index eb3ffe3a098dd..9016473b36ba4 100644 --- a/llvm/test/Other/new-pm-print-pipeline.ll +++ b/llvm/test/Other/new-pm-print-pipeline.ll @@ -31,8 +31,8 @@ ; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(loop-unroll<>,loop-unroll,loop-unroll)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-10 ; CHECK-10: function(loop-unroll,loop-unroll,loop-unroll) -; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(gvn<>,gvn,gvn)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-11 -; CHECK-11: function(gvn<>,gvn,gvn) +; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(gvn<>,gvn,gvn)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-11 +; CHECK-11: function(gvn<>,gvn,gvn) ; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(early-cse<>,early-cse)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-12 ; CHECK-12: function(early-cse<>,early-cse) From b53e79422adb83870f44c55d977989da3e5c8c69 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Fri, 10 Jan 2025 12:07:56 +0000 Subject: [PATCH 052/408] VT: teach isImpliedCondOperands about samesign (#120263) isImpliedCondICmps() and its callers in ValueTracking can greatly benefit from being taught about samesign. As a first step, teach one caller, namely isImpliedCondOperands(). Very minimal changes are required for this, as CmpPredicate::getMatching() does most of the work. --- llvm/lib/Analysis/ValueTracking.cpp | 24 +++++++++---------- .../implied-condition-samesign.ll | 24 ++++--------------- 2 files changed, 15 insertions(+), 33 deletions(-) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 0eb43dd581acc..db244148a3b1e 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -9405,36 +9405,34 @@ static std::optional isImpliedCondCommonOperandWithCR( /// Return true if LHS implies RHS (expanded to its components as "R0 RPred R1") /// is true. Return false if LHS implies RHS is false. Otherwise, return /// std::nullopt if we can't infer anything. -static std::optional isImpliedCondICmps(const ICmpInst *LHS, - CmpInst::Predicate RPred, - const Value *R0, const Value *R1, - const DataLayout &DL, - bool LHSIsTrue) { +static std::optional +isImpliedCondICmps(const ICmpInst *LHS, CmpPredicate RPred, const Value *R0, + const Value *R1, const DataLayout &DL, bool LHSIsTrue) { Value *L0 = LHS->getOperand(0); Value *L1 = LHS->getOperand(1); // The rest of the logic assumes the LHS condition is true. If that's not the // case, invert the predicate to make it so. - CmpInst::Predicate LPred = - LHSIsTrue ? LHS->getPredicate() : LHS->getInversePredicate(); + CmpPredicate LPred = + LHSIsTrue ? LHS->getCmpPredicate() : LHS->getInverseCmpPredicate(); // We can have non-canonical operands, so try to normalize any common operand // to L0/R0. 
if (L0 == R1) { std::swap(R0, R1); - RPred = ICmpInst::getSwappedPredicate(RPred); + RPred = ICmpInst::getSwappedCmpPredicate(RPred); } if (R0 == L1) { std::swap(L0, L1); - LPred = ICmpInst::getSwappedPredicate(LPred); + LPred = ICmpInst::getSwappedCmpPredicate(LPred); } if (L1 == R1) { // If we have L0 == R0 and L1 == R1, then make L1/R1 the constants. if (L0 != R0 || match(L0, m_ImmConstant())) { std::swap(L0, L1); - LPred = ICmpInst::getSwappedPredicate(LPred); + LPred = ICmpInst::getSwappedCmpPredicate(LPred); std::swap(R0, R1); - RPred = ICmpInst::getSwappedPredicate(RPred); + RPred = ICmpInst::getSwappedCmpPredicate(RPred); } } @@ -9493,8 +9491,8 @@ static std::optional isImpliedCondICmps(const ICmpInst *LHS, match(L0, m_c_Add(m_Specific(L1), m_Specific(R1)))) return CmpPredicate::getMatching(LPred, RPred).has_value(); - if (LPred == RPred) - return isImpliedCondOperands(LPred, L0, L1, R0, R1); + if (auto P = CmpPredicate::getMatching(LPred, RPred)) + return isImpliedCondOperands(*P, L0, L1, R0, R1); return std::nullopt; } diff --git a/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll b/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll index 0ea69c2cd4d47..3abe4c76aaab6 100644 --- a/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll +++ b/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll @@ -4,11 +4,7 @@ define i1 @incr_sle(i32 %i, i32 %len) { ; CHECK-LABEL: define i1 @incr_sle( ; CHECK-SAME: i32 [[I:%.*]], i32 [[LEN:%.*]]) { -; CHECK-NEXT: [[I_INCR:%.*]] = add nuw nsw i32 [[I]], 1 -; CHECK-NEXT: [[I_GT_LEN:%.*]] = icmp samesign ugt i32 [[I]], [[LEN]] -; CHECK-NEXT: [[I_INCR_SGT_LEN:%.*]] = icmp sgt i32 [[I_INCR]], [[LEN]] -; CHECK-NEXT: [[RES:%.*]] = icmp sle i1 [[I_INCR_SGT_LEN]], [[I_GT_LEN]] -; CHECK-NEXT: ret i1 [[RES]] +; CHECK-NEXT: ret i1 true ; %i.incr = add nsw nuw i32 %i, 1 %i.gt.len = icmp samesign ugt i32 %i, %len @@ -36,11 +32,7 @@ define i1 @incr_sle_no_nsw_nuw(i32 %i, i32 %len) { define i1 @incr_sge(i32 %i, i32 %len) { ; CHECK-LABEL: define i1 @incr_sge( ; CHECK-SAME: i32 [[I:%.*]], i32 [[LEN:%.*]]) { -; CHECK-NEXT: [[I_INCR:%.*]] = add nuw nsw i32 [[I]], 1 -; CHECK-NEXT: [[I_LT_LEN:%.*]] = icmp samesign ult i32 [[I]], [[LEN]] -; CHECK-NEXT: [[I_INCR_SLT_LEN:%.*]] = icmp slt i32 [[I_INCR]], [[LEN]] -; CHECK-NEXT: [[RES:%.*]] = icmp sge i1 [[I_INCR_SLT_LEN]], [[I_LT_LEN]] -; CHECK-NEXT: ret i1 [[RES]] +; CHECK-NEXT: ret i1 true ; %i.incr = add nsw nuw i32 %i, 1 %i.lt.len = icmp samesign ult i32 %i, %len @@ -68,11 +60,7 @@ define i1 @incr_sge_no_nsw_nuw(i32 %i, i32 %len) { define i1 @incr_ule(i32 %i, i32 %len) { ; CHECK-LABEL: define i1 @incr_ule( ; CHECK-SAME: i32 [[I:%.*]], i32 [[LEN:%.*]]) { -; CHECK-NEXT: [[I_INCR:%.*]] = add nuw nsw i32 [[I]], 1 -; CHECK-NEXT: [[I_GT_LEN:%.*]] = icmp samesign ugt i32 [[I]], [[LEN]] -; CHECK-NEXT: [[I_INCR_SGT_LEN:%.*]] = icmp sgt i32 [[I_INCR]], [[LEN]] -; CHECK-NEXT: [[RES:%.*]] = icmp ule i1 [[I_GT_LEN]], [[I_INCR_SGT_LEN]] -; CHECK-NEXT: ret i1 [[RES]] +; CHECK-NEXT: ret i1 true ; %i.incr = add nsw nuw i32 %i, 1 %i.gt.len = icmp samesign ugt i32 %i, %len @@ -100,11 +88,7 @@ define i1 @incr_ule_no_nsw_nuw(i32 %i, i32 %len) { define i1 @incr_uge(i32 %i, i32 %len) { ; CHECK-LABEL: define i1 @incr_uge( ; CHECK-SAME: i32 [[I:%.*]], i32 [[LEN:%.*]]) { -; CHECK-NEXT: [[I_INCR:%.*]] = add nuw nsw i32 [[I]], 1 -; CHECK-NEXT: [[I_LT_LEN:%.*]] = icmp samesign ult i32 [[I]], [[LEN]] -; CHECK-NEXT: [[I_INCR_SLT_LEN:%.*]] = icmp slt i32 [[I_INCR]], [[LEN]] -; CHECK-NEXT: [[RES:%.*]] = icmp 
uge i1 [[I_LT_LEN]], [[I_INCR_SLT_LEN]] -; CHECK-NEXT: ret i1 [[RES]] +; CHECK-NEXT: ret i1 true ; %i.incr = add nsw nuw i32 %i, 1 %i.lt.len = icmp samesign ult i32 %i, %len From 9b49da2b3169544355192dfd8d6909213169d0c1 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 10 Jan 2025 12:10:45 +0000 Subject: [PATCH 053/408] Revert 86b1b0671cafd "MachineVerifier: Check stack protector is top-most in frame" (#122444) Reverts llvm/llvm-project#121481 This is causing build failures on EXPENSIVE_CHECKS builds: https://lab.llvm.org/buildbot/#/builders/187/builds/3653 https://lab.llvm.org/buildbot/#/builders/16/builds/11758 --- llvm/lib/CodeGen/MachineVerifier.cpp | 52 +-------------- .../stack-protector-offset.mir | 63 ------------------- 2 files changed, 1 insertion(+), 114 deletions(-) delete mode 100644 llvm/test/MachineVerifier/stack-protector-offset.mir diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index 2558799c19f4d..bec36b728ae32 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -353,8 +353,6 @@ struct MachineVerifier { LaneBitmask LaneMask = LaneBitmask::getNone()); void verifyStackFrame(); - // Check that the stack protector is the top-most object in the stack. - void verifyStackProtector(); void verifySlotIndexes() const; void verifyProperties(const MachineFunction &MF); @@ -711,10 +709,8 @@ void MachineVerifier::visitMachineFunctionBefore() { // Check that the register use lists are sane. MRI->verifyUseLists(); - if (!MF->empty()) { + if (!MF->empty()) verifyStackFrame(); - verifyStackProtector(); - } } void @@ -4042,49 +4038,3 @@ void MachineVerifier::verifyStackFrame() { } } } - -void MachineVerifier::verifyStackProtector() { - const MachineFrameInfo &MFI = MF->getFrameInfo(); - if (!MFI.hasStackProtectorIndex()) - return; - // Only applicable when the offsets of frame objects have been determined, - // which is indicated by a non-zero stack size. - if (!MFI.getStackSize()) - return; - const TargetFrameLowering &TFI = *MF->getSubtarget().getFrameLowering(); - bool StackGrowsDown = - TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown; - // Collect the frame indices of the callee-saved registers which are spilled - // to the stack. These are the registers that are stored above the stack - // protector. - SmallSet CalleeSavedFrameIndices; - if (MFI.isCalleeSavedInfoValid()) { - for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo()) { - if (!Info.isSpilledToReg()) - CalleeSavedFrameIndices.insert(Info.getFrameIdx()); - } - } - unsigned FI = MFI.getStackProtectorIndex(); - int64_t SPStart = MFI.getObjectOffset(FI); - int64_t SPEnd = SPStart + MFI.getObjectSize(FI); - for (unsigned I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) { - if (I == FI) - continue; - // Variable-sized objects do not have a fixed offset. 
- if (MFI.isVariableSizedObjectIndex(I)) - continue; - if (CalleeSavedFrameIndices.contains(I)) - continue; - int64_t ObjStart = MFI.getObjectOffset(I); - int64_t ObjEnd = ObjStart + MFI.getObjectSize(I); - if (SPStart < ObjEnd && ObjStart < SPEnd) { - report("Stack protector overlaps with another stack object", MF); - break; - } - if ((StackGrowsDown && SPStart <= ObjStart) || - (!StackGrowsDown && SPStart >= ObjStart)) { - report("Stack protector is not the top-most object on the stack", MF); - break; - } - } -} diff --git a/llvm/test/MachineVerifier/stack-protector-offset.mir b/llvm/test/MachineVerifier/stack-protector-offset.mir deleted file mode 100644 index 47008e1b12354..0000000000000 --- a/llvm/test/MachineVerifier/stack-protector-offset.mir +++ /dev/null @@ -1,63 +0,0 @@ -# REQUIRES: aarch64-registered-target, amdgpu-registered-target - -# RUN: split-file %s %t - -# RUN: llc -mtriple=aarch64 -run-pass=none -o - %t/valid.mir -# RUN: not --crash llc -mtriple=aarch64 -run-pass=none -o - %t/lower.mir 2>&1 | FileCheck %t/lower.mir -# RUN: not --crash llc -mtriple=aarch64 -run-pass=none -o - %t/overlap.mir 2>&1 | FileCheck %t/overlap.mir -# RUN: not --crash llc -mtriple=amdgcn -run-pass=none -o - %t/higher.mir 2>&1 | FileCheck %t/higher.mir - -;--- valid.mir ---- -name: valid -frameInfo: - stackSize: 16 - stackProtector: '%stack.1' -stack: - - { id: 0, offset: -24, size: 8, alignment: 8, stack-id: default } - - { id: 1, offset: -16, size: 8, alignment: 8, stack-id: default } -body: | - bb.0: -... - -;--- lower.mir -# CHECK: *** Bad machine code: Stack protector is not the top-most object on the stack *** ---- -name: lower -frameInfo: - stackSize: 16 - stackProtector: '%stack.1' -stack: - - { id: 0, offset: -16, size: 8, alignment: 8, stack-id: default } - - { id: 1, offset: -24, size: 8, alignment: 8, stack-id: default } -body: | - bb.0: -... - -;--- overlap.mir -# CHECK: *** Bad machine code: Stack protector overlaps with another stack object *** ---- -name: overlap -frameInfo: - stackSize: 16 - stackProtector: '%stack.1' -stack: - - { id: 0, offset: -20, size: 8, alignment: 4, stack-id: default } - - { id: 1, offset: -16, size: 8, alignment: 8, stack-id: default } -body: | - bb.0: -... - -;--- higher.mir -# CHECK: *** Bad machine code: Stack protector is not the top-most object on the stack *** ---- -name: higher -frameInfo: - stackSize: 16 - stackProtector: '%stack.1' -stack: - - { id: 0, offset: 16, size: 8, alignment: 8, stack-id: default } - - { id: 1, offset: 24, size: 8, alignment: 8, stack-id: default } -body: | - bb.0: -... 
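For reference, the frame-layout invariant the reverted tests above exercised can be sketched outside of LLVM. This is a standalone toy under the same "offset plus size" interval convention as the MIR stack objects; `StackObject` and `overlaps` are names invented here, not LLVM API:

```
#include <cassert>
#include <cstdint>

struct StackObject {
  int64_t Offset; // frame offset of the object's start
  int64_t Size;
};

// Half-open intervals [Offset, Offset + Size) collide.
static bool overlaps(const StackObject &A, const StackObject &B) {
  return A.Offset < B.Offset + B.Size && B.Offset < A.Offset + A.Size;
}

int main() {
  StackObject Protector{-16, 8};
  assert(!overlaps(Protector, {-24, 8})); // valid.mir layout: disjoint slots
  assert(overlaps(Protector, {-20, 8}));  // overlap.mir layout: 4-byte collision
  // lower.mir placed the protector at -24, below the local at -16; the removed
  // ordering check (protector must be top-most when the stack grows down)
  // rejected that layout.
}
```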
From f44ed64864642b008f0c757a5ff37c150ce47d48 Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Fri, 10 Jan 2025 13:21:58 +0100 Subject: [PATCH 054/408] [lldb] Fix some log messages in NativeProcessLinux --- lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp b/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp index 38b7092682873..7f2aba0e4eb2c 100644 --- a/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp +++ b/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp @@ -353,7 +353,7 @@ static std::optional> WaitPid() { if (wait_pid == -1) { Status error(errno, eErrorTypePOSIX); - LLDB_LOG(log, "waitpid(-1, &status, _) failed: {1}", error); + LLDB_LOG(log, "waitpid(-1, &status, _) failed: {0}", error); return std::nullopt; } @@ -874,7 +874,7 @@ void NativeProcessLinux::MonitorSignal(const siginfo_t &info, LLDB_LOG(log, "received signal {0} ({1}) with code {2}, (siginfo pid = {3}, " "waitpid pid = {4})", - Host::GetSignalAsCString(signo), signo, info.si_code, + Host::GetSignalAsCString(signo), signo, info.si_code, info.si_pid, thread.GetID()); // Check for thread stop notification. From 85ca5517633e06d7cf58688c9b246bf14f61e5bd Mon Sep 17 00:00:00 2001 From: "A. Jiang" Date: Fri, 10 Jan 2025 20:49:52 +0800 Subject: [PATCH 055/408] [libc++][chrono] Entirely remove relational operators for `std::chrono::weekday` (#122428) Follow-up to #98730. --- libcxx/docs/ReleaseNotes/20.rst | 5 ++--- libcxx/include/__chrono/weekday.h | 19 ------------------- libcxx/include/__cxx03/__chrono/weekday.h | 19 ------------------- 3 files changed, 2 insertions(+), 41 deletions(-) diff --git a/libcxx/docs/ReleaseNotes/20.rst b/libcxx/docs/ReleaseNotes/20.rst index 9a520d8452b06..228c3f3432c29 100644 --- a/libcxx/docs/ReleaseNotes/20.rst +++ b/libcxx/docs/ReleaseNotes/20.rst @@ -117,9 +117,8 @@ Deprecations and Removals removed in language modes prior to C++20. If you are using these features prior to C++20, you will need to update to ``-std=c++20``. -- TODO: The relational operators for ``std::chrono::weekday`` will be removed entirely, and the - ``_LIBCPP_ENABLE_REMOVED_WEEKDAY_RELATIONAL_OPERATORS`` macro that was used to re-enable this extension will be - ignored in LLVM 20. +- The relational operators for ``std::chrono::weekday`` have been removed entirely, and the + ``_LIBCPP_ENABLE_REMOVED_WEEKDAY_RELATIONAL_OPERATORS`` macro is now ignored. - The ``_LIBCPP_ENABLE_REMOVED_ALLOCATOR_CONST`` macro no longer has any effect. ``std::allocator<const T>`` is not supported as an extension anymore, please migrate any code that uses e.g.
``std::vector`` to be diff --git a/libcxx/include/__chrono/weekday.h b/libcxx/include/__chrono/weekday.h index 86c780cc71825..728cbb844633f 100644 --- a/libcxx/include/__chrono/weekday.h +++ b/libcxx/include/__chrono/weekday.h @@ -79,25 +79,6 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr bool operator==(const weekday& __lhs, con return __lhs.c_encoding() == __rhs.c_encoding(); } -// TODO(LLVM 20): Remove the escape hatch -# ifdef _LIBCPP_ENABLE_REMOVED_WEEKDAY_RELATIONAL_OPERATORS -_LIBCPP_HIDE_FROM_ABI inline constexpr bool operator<(const weekday& __lhs, const weekday& __rhs) noexcept { - return __lhs.c_encoding() < __rhs.c_encoding(); -} - -_LIBCPP_HIDE_FROM_ABI inline constexpr bool operator>(const weekday& __lhs, const weekday& __rhs) noexcept { - return __rhs < __lhs; -} - -_LIBCPP_HIDE_FROM_ABI inline constexpr bool operator<=(const weekday& __lhs, const weekday& __rhs) noexcept { - return !(__rhs < __lhs); -} - -_LIBCPP_HIDE_FROM_ABI inline constexpr bool operator>=(const weekday& __lhs, const weekday& __rhs) noexcept { - return !(__lhs < __rhs); -} -# endif // _LIBCPP_ENABLE_REMOVED_WEEKDAY_RELATIONAL_OPERATORS - _LIBCPP_HIDE_FROM_ABI inline constexpr weekday operator+(const weekday& __lhs, const days& __rhs) noexcept { auto const __mu = static_cast(__lhs.c_encoding()) + __rhs.count(); auto const __yr = (__mu >= 0 ? __mu : __mu - 6) / 7; diff --git a/libcxx/include/__cxx03/__chrono/weekday.h b/libcxx/include/__cxx03/__chrono/weekday.h index 3c152653118d4..fd8081fecc5b9 100644 --- a/libcxx/include/__cxx03/__chrono/weekday.h +++ b/libcxx/include/__cxx03/__chrono/weekday.h @@ -79,25 +79,6 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr bool operator==(const weekday& __lhs, con return __lhs.c_encoding() == __rhs.c_encoding(); } -// TODO(LLVM 20): Remove the escape hatch -# ifdef _LIBCPP_ENABLE_REMOVED_WEEKDAY_RELATIONAL_OPERATORS -_LIBCPP_HIDE_FROM_ABI inline constexpr bool operator<(const weekday& __lhs, const weekday& __rhs) noexcept { - return __lhs.c_encoding() < __rhs.c_encoding(); -} - -_LIBCPP_HIDE_FROM_ABI inline constexpr bool operator>(const weekday& __lhs, const weekday& __rhs) noexcept { - return __rhs < __lhs; -} - -_LIBCPP_HIDE_FROM_ABI inline constexpr bool operator<=(const weekday& __lhs, const weekday& __rhs) noexcept { - return !(__rhs < __lhs); -} - -_LIBCPP_HIDE_FROM_ABI inline constexpr bool operator>=(const weekday& __lhs, const weekday& __rhs) noexcept { - return !(__lhs < __rhs); -} -# endif // _LIBCPP_ENABLE_REMOVED_WEEKDAY_RELATIONAL_OPERATORS - _LIBCPP_HIDE_FROM_ABI inline constexpr weekday operator+(const weekday& __lhs, const days& __rhs) noexcept { auto const __mu = static_cast(__lhs.c_encoding()) + __rhs.count(); auto const __yr = (__mu >= 0 ? __mu : __mu - 6) / 7; From 6b12272353b45def33bf5814cdf9e8587f32d40e Mon Sep 17 00:00:00 2001 From: Maksim Ivanov Date: Fri, 10 Jan 2025 14:16:59 +0100 Subject: [PATCH 056/408] [clang] Informative error for lifetimebound in decl-spec (#118567) Emit a bit more informative error when the `[[clang::lifetimebound]]` attribute is wrongly appearing on a decl-spec: ``` 'lifetimebound' attribute only applies to parameters and implicit object parameters ``` instead of: ``` 'lifetimebound' attribute cannot be applied to types ``` The new error is also consistent with the diagnostic emitted when the attribute is misplaced in other parts of a declarator. 
--- clang/lib/Parse/ParseDecl.cpp | 10 ++++++++-- clang/test/SemaCXX/attr-lifetimebound.cpp | 4 ++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index 937a94b02458c..7f3f6d568e28c 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -3706,8 +3706,14 @@ void Parser::ParseDeclarationSpecifiers( if (PA.isTypeAttr() && PA.getKind() != ParsedAttr::AT_LifetimeBound && PA.getKind() != ParsedAttr::AT_AnyX86NoCfCheck) continue; - Diag(PA.getLoc(), diag::err_attribute_not_type_attr) - << PA << PA.isRegularKeywordAttribute(); + + if (PA.getKind() == ParsedAttr::AT_LifetimeBound) + Diag(PA.getLoc(), diag::err_attribute_wrong_decl_type_str) + << PA << PA.isRegularKeywordAttribute() + << "parameters and implicit object parameters"; + else + Diag(PA.getLoc(), diag::err_attribute_not_type_attr) + << PA << PA.isRegularKeywordAttribute(); PA.setInvalid(); } diff --git a/clang/test/SemaCXX/attr-lifetimebound.cpp b/clang/test/SemaCXX/attr-lifetimebound.cpp index c7abec61873ef..896793f996666 100644 --- a/clang/test/SemaCXX/attr-lifetimebound.cpp +++ b/clang/test/SemaCXX/attr-lifetimebound.cpp @@ -10,7 +10,7 @@ namespace usage_invalid { static int *static_class_member() [[clang::lifetimebound]]; // expected-error {{static member function has no implicit object parameter}} int *explicit_object(this A&) [[clang::lifetimebound]]; // expected-error {{explicit object member function has no implicit object parameter}} int attr_on_var [[clang::lifetimebound]]; // expected-error {{only applies to parameters and implicit object parameters}} - int [[clang::lifetimebound]] attr_on_int; // expected-error {{cannot be applied to types}} + int [[clang::lifetimebound]] attr_on_int; // expected-error {{'lifetimebound' attribute only applies to parameters and implicit object parameters}} int * [[clang::lifetimebound]] attr_on_int_ptr; // expected-error {{'lifetimebound' attribute only applies to parameters and implicit object parameters}} int * [[clang::lifetimebound]] * attr_on_int_ptr_ptr; // expected-error {{'lifetimebound' attribute only applies to parameters and implicit object parameters}} int (* [[clang::lifetimebound]] attr_on_func_ptr)(); // expected-error {{'lifetimebound' attribute only applies to parameters and implicit object parameters}} @@ -19,7 +19,7 @@ namespace usage_invalid { int *attr_with_param(int ¶m [[clang::lifetimebound(42)]]); // expected-error {{takes no arguments}} void attr_on_ptr_arg(int * [[clang::lifetimebound]] ptr); // expected-error {{'lifetimebound' attribute only applies to parameters and implicit object parameters}} - static_assert((int [[clang::lifetimebound]]) 12); // expected-error {{cannot be applied to types}} + static_assert((int [[clang::lifetimebound]]) 12); // expected-error {{'lifetimebound' attribute only applies to parameters and implicit object parameters}} int* attr_on_unnamed_arg(const int& [[clang::lifetimebound]]); // expected-error {{'lifetimebound' attribute only applies to parameters and implicit object parameters}} template int* attr_on_template_ptr_arg(T * [[clang::lifetimebound]] ptr); // expected-error {{'lifetimebound' attribute only applies to parameters and implicit object parameters}} From 799e9883eaf7c7bdebfb8ddb3366d9137527b29d Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Fri, 10 Jan 2025 14:38:22 +0100 Subject: [PATCH 057/408] [LLD][COFF] Silence GCC warning in Arm64XDynamicRelocEntry::getSize (NFC) (#122382) Fixes 
71bbafba31699bdabe289654d157ae961432e52a. --- lld/COFF/Chunks.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/lld/COFF/Chunks.cpp b/lld/COFF/Chunks.cpp index d87a4d8d1ae79..f2d0111a19558 100644 --- a/lld/COFF/Chunks.cpp +++ b/lld/COFF/Chunks.cpp @@ -1174,6 +1174,7 @@ size_t Arm64XDynamicRelocEntry::getSize() const { case IMAGE_DVRT_ARM64X_FIXUP_TYPE_ZEROFILL: llvm_unreachable("unsupported type"); } + llvm_unreachable("invalid type"); } void Arm64XDynamicRelocEntry::writeTo(uint8_t *buf) const { From 7b0536794349734c8862fc140808e4e5a2ab8f8d Mon Sep 17 00:00:00 2001 From: Sergei Barannikov Date: Fri, 10 Jan 2025 16:43:19 +0300 Subject: [PATCH 058/408] [ADT] Fix specialization of ValueIsPresent for PointerUnion (#121847) Two instances of `PointerUnion` with different active members and null value compare unequal. Currently, this results in counterintuitive behavior when using functions from `Casting.h`, e.g.: ```C++ PointerUnion U; // U = (int *)nullptr; dyn_cast(U); // Aborts dyn_cast(U); // Aborts U = (float *)nullptr; dyn_cast(U); // OK dyn_cast(U); // OK ``` `dyn_cast` should abort in all cases because the argument is null. Currently, it aborts only if the first member is active. This happens because the partial template specialization of `ValueIsPresent` for nullable types compares the union with a union constructed from nullptr, and the two unions compare equal only if their active members are the same. This patch changed the specialization of `ValueIsPresent` for nullable types to make `isPresent()` return false for all possible null values of a PointerUnion, and fixes two places where the old behavior was exploited. Pull Request: https://github.com/llvm/llvm-project/pull/121847 --- llvm/include/llvm/Support/Casting.h | 8 ++++---- llvm/lib/CodeGen/RegisterBankInfo.cpp | 4 ++-- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 4 ++-- llvm/unittests/ADT/PointerUnionTest.cpp | 5 +++++ 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/llvm/include/llvm/Support/Casting.h b/llvm/include/llvm/Support/Casting.h index 66fdcb44ea2c0..2ce70e732e2ec 100644 --- a/llvm/include/llvm/Support/Casting.h +++ b/llvm/include/llvm/Support/Casting.h @@ -614,12 +614,12 @@ template struct ValueIsPresent> { static inline decltype(auto) unwrapValue(std::optional &t) { return *t; } }; -// If something is "nullable" then we just compare it to nullptr to see if it -// exists. +// If something is "nullable" then we just cast it to bool to see if it exists. template -struct ValueIsPresent>> { +struct ValueIsPresent< + T, std::enable_if_t && std::is_constructible_v>> { using UnwrappedType = T; - static inline bool isPresent(const T &t) { return t != T(nullptr); } + static inline bool isPresent(const T &t) { return static_cast(t); } static inline decltype(auto) unwrapValue(T &t) { return t; } }; diff --git a/llvm/lib/CodeGen/RegisterBankInfo.cpp b/llvm/lib/CodeGen/RegisterBankInfo.cpp index e1720b038e236..5a8cf13ad11fd 100644 --- a/llvm/lib/CodeGen/RegisterBankInfo.cpp +++ b/llvm/lib/CodeGen/RegisterBankInfo.cpp @@ -134,10 +134,10 @@ const TargetRegisterClass *RegisterBankInfo::constrainGenericRegister( // If the register already has a class, fallback to MRI::constrainRegClass. 
auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); - if (isa(RegClassOrBank)) + if (isa_and_present(RegClassOrBank)) return MRI.constrainRegClass(Reg, &RC); - const RegisterBank *RB = cast(RegClassOrBank); + const auto *RB = dyn_cast_if_present(RegClassOrBank); // Otherwise, all we can do is ensure the bank covers the class, and set it. if (RB && !RB->covers(RC)) return nullptr; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 704435dad65d7..8fa656c77e90e 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -3708,10 +3708,10 @@ const TargetRegisterClass * SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO, const MachineRegisterInfo &MRI) const { const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg()); - if (const RegisterBank *RB = dyn_cast(RCOrRB)) + if (const auto *RB = dyn_cast_if_present(RCOrRB)) return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB); - if (const auto *RC = dyn_cast(RCOrRB)) + if (const auto *RC = dyn_cast_if_present(RCOrRB)) return getAllocatableClass(RC); return nullptr; diff --git a/llvm/unittests/ADT/PointerUnionTest.cpp b/llvm/unittests/ADT/PointerUnionTest.cpp index acddb78960149..a28d532865cbc 100644 --- a/llvm/unittests/ADT/PointerUnionTest.cpp +++ b/llvm/unittests/ADT/PointerUnionTest.cpp @@ -208,6 +208,11 @@ TEST_F(PointerUnionTest, NewCastInfra) { EXPECT_FALSE(isa(d4null)); EXPECT_FALSE(isa(d4null)); + EXPECT_FALSE(isa_and_present(i4null)); + EXPECT_FALSE(isa_and_present(f4null)); + EXPECT_FALSE(isa_and_present(l4null)); + EXPECT_FALSE(isa_and_present(d4null)); + // test cast<> EXPECT_EQ(cast(a), &f); EXPECT_EQ(cast(b), &i); From cfee344dda7394631f2177a15e56cfeee1d61fc4 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Fri, 10 Jan 2025 14:26:49 +0000 Subject: [PATCH 059/408] VT: teach implied-cond-cr about samesign (#122447) Teach isImpliedCondCommonOperandWithCR about samesign, noting that the only case we need to handle is when exactly one of the icmps have samesign. --- llvm/lib/Analysis/ValueTracking.cpp | 35 +++++++++++++------ .../implied-condition-samesign.ll | 8 ++--- 2 files changed, 27 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index db244148a3b1e..9a61b36efa51d 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -9388,17 +9388,32 @@ isImpliedCondMatchingOperands(CmpInst::Predicate LPred, /// Return true if "icmp LPred X, LCR" implies "icmp RPred X, RCR" is true. /// Return false if "icmp LPred X, LCR" implies "icmp RPred X, RCR" is false. /// Otherwise, return std::nullopt if we can't infer anything. 
-static std::optional isImpliedCondCommonOperandWithCR( - CmpInst::Predicate LPred, const ConstantRange &LCR, - CmpInst::Predicate RPred, const ConstantRange &RCR) { - ConstantRange DomCR = ConstantRange::makeAllowedICmpRegion(LPred, LCR); - // If all true values for lhs and true for rhs, lhs implies rhs - if (DomCR.icmp(RPred, RCR)) - return true; +static std::optional +isImpliedCondCommonOperandWithCR(CmpPredicate LPred, const ConstantRange &LCR, + CmpPredicate RPred, const ConstantRange &RCR) { + auto CRImpliesPred = [&](ConstantRange CR, + CmpInst::Predicate Pred) -> std::optional { + // If all true values for lhs and true for rhs, lhs implies rhs + if (CR.icmp(Pred, RCR)) + return true; - // If there is no overlap, lhs implies not rhs - if (DomCR.icmp(CmpInst::getInversePredicate(RPred), RCR)) - return false; + // If there is no overlap, lhs implies not rhs + if (CR.icmp(CmpInst::getInversePredicate(Pred), RCR)) + return false; + + return std::nullopt; + }; + if (auto Res = CRImpliesPred(ConstantRange::makeAllowedICmpRegion(LPred, LCR), + RPred)) + return Res; + if (LPred.hasSameSign() ^ RPred.hasSameSign()) { + LPred = LPred.hasSameSign() ? ICmpInst::getFlippedSignednessPredicate(LPred) + : static_cast(LPred); + RPred = RPred.hasSameSign() ? ICmpInst::getFlippedSignednessPredicate(RPred) + : static_cast(RPred); + return CRImpliesPred(ConstantRange::makeAllowedICmpRegion(LPred, LCR), + RPred); + } return std::nullopt; } diff --git a/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll b/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll index 3abe4c76aaab6..042155ae2bb79 100644 --- a/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll +++ b/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll @@ -160,9 +160,7 @@ define i32 @gt_implies_sge_dominating_cr(i32 %a, i32 %len) { ; CHECK-NEXT: [[A_GT_20:%.*]] = icmp samesign ugt i32 [[A]], 20 ; CHECK-NEXT: br i1 [[A_GT_20]], label %[[TAKEN:.*]], label %[[END:.*]] ; CHECK: [[TAKEN]]: -; CHECK-NEXT: [[A_SGE_10:%.*]] = icmp sge i32 [[A]], 10 -; CHECK-NEXT: [[RES:%.*]] = select i1 [[A_SGE_10]], i32 30, i32 0 -; CHECK-NEXT: ret i32 [[RES]] +; CHECK-NEXT: ret i32 30 ; CHECK: [[END]]: ; CHECK-NEXT: ret i32 -1 ; @@ -186,9 +184,7 @@ define i32 @sgt_implies_ge_dominating_cr(i32 %a, i32 %len) { ; CHECK-NEXT: [[A_SGT_MINUS_10:%.*]] = icmp sgt i32 [[A]], -10 ; CHECK-NEXT: br i1 [[A_SGT_MINUS_10]], label %[[TAKEN:.*]], label %[[END:.*]] ; CHECK: [[TAKEN]]: -; CHECK-NEXT: [[A_GE_MINUS_20:%.*]] = icmp samesign uge i32 [[A]], -20 -; CHECK-NEXT: [[RES:%.*]] = select i1 [[A_GE_MINUS_20]], i32 30, i32 0 -; CHECK-NEXT: ret i32 [[RES]] +; CHECK-NEXT: ret i32 30 ; CHECK: [[END]]: ; CHECK-NEXT: ret i32 -1 ; From c575a7d1e9b732432bf95c7905067b779f43d1a4 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Fri, 10 Jan 2025 15:29:42 +0100 Subject: [PATCH 060/408] [AMDGPU] Provide default value in get intrinsic param type (#122448) Make sure that a default value (nullptr) is returned from `getIntrinsicParamType`, also validate uses of this helper function. 
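The failure mode being closed off can be shown with a self-contained toy (hypothetical enum and helper, not the AMDGPU code): a switch that only assigns its result for known cases silently leaves a null result for everything else, and the pre-patch callers consumed that result unconditionally.

```
#include <cassert>

enum class ArgType { I32, F32, Sampler }; // Sampler stands in for an unhandled case

static const char *typeName(ArgType A) {
  switch (A) {
  case ArgType::I32:
    return "i32";
  case ArgType::F32:
    return "f32";
  default:
    return nullptr; // explicit "unsupported" marker, as the patch adds
  }
}

int main() {
  // Callers must now check for the "unsupported" result before using it.
  assert(typeName(ArgType::Sampler) == nullptr);
}
```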
--- llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp | 50 ++++++++++++++++-------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp index 264b4d43248c6..82233c0c891ad 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp @@ -929,23 +929,38 @@ AMDGPULibFuncBase::Param AMDGPULibFuncBase::Param::getFromTy(Type *Ty, return P; } -static Type* getIntrinsicParamType( - LLVMContext& C, - const AMDGPULibFunc::Param& P, - bool useAddrSpace) { - Type* T = nullptr; +static Type *getIntrinsicParamType(LLVMContext &C, + const AMDGPULibFunc::Param &P, + bool UseAddrSpace) { + Type *T = nullptr; switch (P.ArgType) { + default: + return nullptr; case AMDGPULibFunc::U8: - case AMDGPULibFunc::I8: T = Type::getInt8Ty(C); break; + case AMDGPULibFunc::I8: + T = Type::getInt8Ty(C); + break; case AMDGPULibFunc::U16: - case AMDGPULibFunc::I16: T = Type::getInt16Ty(C); break; + case AMDGPULibFunc::I16: + T = Type::getInt16Ty(C); + break; case AMDGPULibFunc::U32: - case AMDGPULibFunc::I32: T = Type::getInt32Ty(C); break; + case AMDGPULibFunc::I32: + T = Type::getInt32Ty(C); + break; case AMDGPULibFunc::U64: - case AMDGPULibFunc::I64: T = Type::getInt64Ty(C); break; - case AMDGPULibFunc::F16: T = Type::getHalfTy(C); break; - case AMDGPULibFunc::F32: T = Type::getFloatTy(C); break; - case AMDGPULibFunc::F64: T = Type::getDoubleTy(C); break; + case AMDGPULibFunc::I64: + T = Type::getInt64Ty(C); + break; + case AMDGPULibFunc::F16: + T = Type::getHalfTy(C); + break; + case AMDGPULibFunc::F32: + T = Type::getFloatTy(C); + break; + case AMDGPULibFunc::F64: + T = Type::getDoubleTy(C); + break; case AMDGPULibFunc::IMG1DA: case AMDGPULibFunc::IMG1DB: @@ -972,7 +987,7 @@ static Type* getIntrinsicParamType( T = FixedVectorType::get(T, P.VectorSize); if (P.PtrKind != AMDGPULibFunc::BYVALUE) T = PointerType::get( - C, useAddrSpace ? ((P.PtrKind & AMDGPULibFunc::ADDR_SPACE) - 1) : 0); + C, UseAddrSpace ? ((P.PtrKind & AMDGPULibFunc::ADDR_SPACE) - 1) : 0); return T; } @@ -989,9 +1004,11 @@ FunctionType *AMDGPUMangledLibFunc::getFunctionType(const Module &M) const { Args.push_back(ParamTy); } - return FunctionType::get( - getIntrinsicParamType(C, getRetType(FuncId, Leads), true), - Args, false); + Type *RetTy = getIntrinsicParamType(C, getRetType(FuncId, Leads), true); + if (!RetTy) + return nullptr; + + return FunctionType::get(RetTy, Args, false); } unsigned AMDGPUMangledLibFunc::getNumArgs() const { @@ -1080,6 +1097,7 @@ FunctionCallee AMDGPULibFunc::getOrInsertFunction(Module *M, } FunctionType *FuncTy = fInfo.getFunctionType(*M); + assert(FuncTy); bool hasPtr = false; for (FunctionType::param_iterator From a2995cb4bb21ba2fe6277bbcd24b8ab1b357e12d Mon Sep 17 00:00:00 2001 From: Ayokunle Amodu Date: Fri, 10 Jan 2025 07:30:51 -0700 Subject: [PATCH 061/408] [LangRef] Fix code segment and numbering issue in the 'call' instruction section (#122294) Fixes issue #122084. Under "Arguments" in the 'call' instruction section, there was some text included in the code segment so I edited it out. Also fixed the numbering issue in that section. 
--- llvm/docs/LangRef.rst | 54 +++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 4ee340c9a0315..33acb5e73d5ff 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -12780,11 +12780,11 @@ This instruction requires several arguments: attributes like "disable-tail-calls". The ``musttail`` marker provides these guarantees: - #. The call will not cause unbounded stack growth if it is part of a + - The call will not cause unbounded stack growth if it is part of a recursive cycle in the call graph. - #. Arguments with the :ref:`inalloca ` or + - Arguments with the :ref:`inalloca ` or :ref:`preallocated ` attribute are forwarded in place. - #. If the musttail call appears in a function with the ``"thunk"`` attribute + - If the musttail call appears in a function with the ``"thunk"`` attribute and the caller and callee both have varargs, then any unprototyped arguments in register or memory are forwarded to the callee. Similarly, the return value of the callee is returned to the caller's caller, even @@ -12795,7 +12795,7 @@ This instruction requires several arguments: argument may be passed to the callee as a byval argument, which can be dereferenced inside the callee. For example: -.. code-block:: llvm + .. code-block:: llvm declare void @take_byval(ptr byval(i64)) declare void @take_ptr(ptr) @@ -12849,31 +12849,30 @@ This instruction requires several arguments: ret void } - Calls marked ``musttail`` must obey the following additional rules: - - The call must immediately precede a :ref:`ret ` instruction, - or a pointer bitcast followed by a ret instruction. - - The ret instruction must return the (possibly bitcasted) value - produced by the call, undef, or void. - - The calling conventions of the caller and callee must match. - - The callee must be varargs iff the caller is varargs. Bitcasting a - non-varargs function to the appropriate varargs type is legal so - long as the non-varargs prefixes obey the other rules. - - The return type must not undergo automatic conversion to an `sret` pointer. + - The call must immediately precede a :ref:`ret ` instruction, + or a pointer bitcast followed by a ret instruction. + - The ret instruction must return the (possibly bitcasted) value + produced by the call, undef, or void. + - The calling conventions of the caller and callee must match. + - The callee must be varargs iff the caller is varargs. Bitcasting a + non-varargs function to the appropriate varargs type is legal so + long as the non-varargs prefixes obey the other rules. + - The return type must not undergo automatic conversion to an `sret` pointer. - In addition, if the calling convention is not `swifttailcc` or `tailcc`: + In addition, if the calling convention is not `swifttailcc` or `tailcc`: - - All ABI-impacting function attributes, such as sret, byval, inreg, - returned, and inalloca, must match. - - The caller and callee prototypes must match. Pointer types of parameters - or return types may differ in pointee type, but not in address space. + - All ABI-impacting function attributes, such as sret, byval, inreg, + returned, and inalloca, must match. + - The caller and callee prototypes must match. Pointer types of parameters + or return types may differ in pointee type, but not in address space. 
- On the other hand, if the calling convention is `swifttailcc` or `tailcc`: + On the other hand, if the calling convention is `swifttailcc` or `tailcc`: - - Only these ABI-impacting attributes attributes are allowed: sret, byval, - swiftself, and swiftasync. - - Prototypes are not required to match. + - Only these ABI-impacting attributes attributes are allowed: sret, byval, + swiftself, and swiftasync. + - Prototypes are not required to match. Tail call optimization for calls marked ``tail`` is guaranteed to occur if the following conditions are met: @@ -12881,11 +12880,10 @@ This instruction requires several arguments: - Caller and callee both have the calling convention ``fastcc`` or ``tailcc``. - The call is in tail position (ret immediately follows call and ret uses value of call or is void). - - Option ``-tailcallopt`` is enabled, - ``llvm::GuaranteedTailCallOpt`` is ``true``, or the calling convention - is ``tailcc`` - - `Platform-specific constraints are - met. `_ + - Option ``-tailcallopt`` is enabled, ``llvm::GuaranteedTailCallOpt`` is + ``true``, or the calling convention is ``tailcc``. + - `Platform-specific constraints are met. + `_ #. The optional ``notail`` marker indicates that the optimizers should not add ``tail`` or ``musttail`` markers to the call. It is used to prevent tail From 9c85cdec4ad29389c27cc2372d45f73d1ca8053a Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Fri, 10 Jan 2025 09:43:20 -0500 Subject: [PATCH 062/408] [AMDGPU][True16][MC][NFC]update vopc dasm test with latest update script (#122360) This is a NFC. Update VOPC dasm test with +real-true16 and run latest update script. --- .../gfx11_dasm_vop3_dpp16_from_vopc.txt | 1743 +++--- .../gfx11_dasm_vop3_dpp16_from_vopcx.txt | 1747 +++--- .../AMDGPU/gfx11_dasm_vop3_dpp8_from_vopc.txt | 627 +- .../gfx11_dasm_vop3_dpp8_from_vopcx.txt | 387 +- .../AMDGPU/gfx11_dasm_vop3_from_vopc.txt | 4823 +++++++-------- .../AMDGPU/gfx11_dasm_vop3_from_vopcx.txt | 2667 ++++----- .../Disassembler/AMDGPU/gfx11_dasm_vopc.txt | 5165 +++++++++-------- .../AMDGPU/gfx11_dasm_vopc_dpp16.txt | 1737 +++--- .../AMDGPU/gfx11_dasm_vopc_dpp8.txt | 249 +- .../Disassembler/AMDGPU/gfx11_dasm_vopcx.txt | 2653 ++++----- .../AMDGPU/gfx11_dasm_vopcx_dpp16.txt | 1737 +++--- .../AMDGPU/gfx11_dasm_vopcx_dpp8.txt | 249 +- .../Disassembler/AMDGPU/gfx12_dasm_vop3c.txt | 4127 ++++++------- .../AMDGPU/gfx12_dasm_vop3c_dpp16.txt | 1631 +++--- .../AMDGPU/gfx12_dasm_vop3c_dpp8.txt | 663 +-- .../Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt | 2281 ++++---- .../AMDGPU/gfx12_dasm_vop3cx_dpp16.txt | 1635 +++--- .../AMDGPU/gfx12_dasm_vop3cx_dpp8.txt | 455 +- .../Disassembler/AMDGPU/gfx12_dasm_vopc.txt | 4409 +++++++------- .../AMDGPU/gfx12_dasm_vopc_dpp16.txt | 1513 ++--- .../AMDGPU/gfx12_dasm_vopc_dpp8.txt | 217 +- .../Disassembler/AMDGPU/gfx12_dasm_vopcx.txt | 2269 ++++---- .../AMDGPU/gfx12_dasm_vopcx_dpp16.txt | 1513 ++--- .../AMDGPU/gfx12_dasm_vopcx_dpp8.txt | 217 +- 24 files changed, 22381 insertions(+), 22333 deletions(-) diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopc.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopc.txt index 4dae6c2cf076c..a8974243b755e 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopc.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopc.txt @@ -1,3412 +1,3415 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W32 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 
-mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W64 %s +# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W32 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W32 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W64 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W64 %s +0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_class_f16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_class_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_class_f16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_class_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_class_f16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_class_f16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_class_f16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_class_f16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_class_f16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_class_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_class_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_class_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_class_f16_e64_dpp s10, v1, 
v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_class_f16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_class_f16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_class_f16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_class_f16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_class_f16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_class_f16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_class_f16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_class_f16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_class_f16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_class_f16_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_class_f16_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_class_f16_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_class_f16_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -0x7a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_cmp_class_f16_e64_dpp null, -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x01,0x7d,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30] 0x7c,0x01,0x7d,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmp_class_f16_e64_dpp null, -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x01,0x7d,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_class_f32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; 
encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_class_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_class_f32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_class_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_class_f32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_class_f32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_class_f32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_class_f32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_class_f32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_class_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_class_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_class_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_class_f32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_class_f32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_class_f32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_class_f32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_class_f32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_class_f32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_class_f32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_class_f32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_class_f32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_class_f32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_class_f32_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_class_f32_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_class_f32_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_class_f32_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -0x7a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_cmp_class_f32_e64_dpp null, -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x01,0x7e,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30] 0x7c,0x01,0x7e,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmp_class_f32_e64_dpp null, -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x01,0x7e,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_eq_f16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_eq_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_eq_f16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_eq_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_eq_f16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_eq_f16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_eq_f16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_eq_f16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_eq_f16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_eq_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_eq_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_eq_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_eq_f16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_eq_f16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_eq_f16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_eq_f16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_eq_f16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_eq_f16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_eq_f16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_eq_f16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_eq_f16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_eq_f16_e64_dpp 
s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x01,0x02,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 # W32: v_cmp_eq_f16_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x02,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] # W64: v_cmp_eq_f16_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x02,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -0x6a,0x01,0x02,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +0x7a,0x02,0x02,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 # W32: v_cmp_eq_f16_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x02,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] # W64: v_cmp_eq_f16_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x02,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -0x7a,0x02,0x02,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX11: v_cmp_eq_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x02,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7c,0x83,0x02,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmp_eq_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x02,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_eq_f32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_eq_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_eq_f32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_eq_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_eq_f32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_eq_f32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_eq_f32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_eq_f32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_eq_f32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_eq_f32_e64_dpp s[10:11], v1, v2 
row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_eq_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_eq_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_eq_f32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_eq_f32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_eq_f32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_eq_f32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_eq_f32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_eq_f32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_eq_f32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_eq_f32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_eq_f32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_eq_f32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x01,0x12,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 # W32: v_cmp_eq_f32_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x12,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] # W64: v_cmp_eq_f32_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x12,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -0x6a,0x01,0x12,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +0x7a,0x02,0x12,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 # W32: v_cmp_eq_f32_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x12,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] # W64: v_cmp_eq_f32_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0x7a,0x02,0x12,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -0x7a,0x02,0x12,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX11: v_cmp_eq_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x12,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7c,0x83,0x12,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmp_eq_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x12,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_eq_i16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_eq_i16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_eq_i16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_eq_i16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_eq_i16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_eq_i16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_eq_i16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_eq_i16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_eq_i16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_eq_i16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_eq_i16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_eq_i16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_eq_i16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_eq_i16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_eq_i16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_eq_i16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_eq_i16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_eq_i16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_eq_i16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_eq_i16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_eq_i16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_eq_i16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_eq_i16_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_eq_i16_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_eq_i16_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_eq_i16_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -0x7a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_cmp_eq_i16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x32,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7c,0x00,0x32,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmp_eq_i16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x32,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_eq_i32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_eq_i32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 
-0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_eq_i32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_eq_i32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_eq_i32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_eq_i32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_eq_i32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_eq_i32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_eq_i32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_eq_i32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_eq_i32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_eq_i32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_eq_i32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_eq_i32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_eq_i32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_eq_i32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_eq_i32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_eq_i32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff 
+0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_eq_i32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_eq_i32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_eq_i32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_eq_i32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_eq_i32_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_eq_i32_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_eq_i32_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_eq_i32_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -0x7a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_cmp_eq_i32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x42,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7c,0x00,0x42,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmp_eq_i32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x42,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_eq_u16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_eq_u16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_eq_u16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_eq_u16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_eq_u16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_eq_u16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # 
W32: v_cmp_eq_u16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_eq_u16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_eq_u16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_eq_u16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_eq_u16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_eq_u16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_eq_u16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_eq_u16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_eq_u16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_eq_u16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_eq_u16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_eq_u16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_eq_u16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_eq_u16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_eq_u16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_eq_u16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_eq_u16_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x6a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_eq_u16_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_eq_u16_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_eq_u16_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -0x7a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_cmp_eq_u16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x3a,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7c,0x00,0x3a,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmp_eq_u16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x3a,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_eq_u32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_eq_u32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_eq_u32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_eq_u32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_eq_u32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_eq_u32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_eq_u32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_eq_u32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_eq_u32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_eq_u32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_eq_u32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: 
v_cmp_eq_u32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_eq_u32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_eq_u32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_eq_u32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_eq_u32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_eq_u32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_eq_u32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_eq_u32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_eq_u32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_eq_u32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_eq_u32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_eq_u32_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_eq_u32_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_eq_u32_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_eq_u32_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -0x7a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_cmp_eq_u32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x4a,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7c,0x00,0x4a,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmp_eq_u32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 
bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x4a,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_f_f16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_f_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_f_f16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_f_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_f_f16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_f_f16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_f_f16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_f_f16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_f_f16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_f_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_f_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_f_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_f_f16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_f_f16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_f_f16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_f_f16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 
-0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_f_f16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_f_f16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_f_f16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_f_f16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_f_f16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_f_f16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x00,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x01,0x00,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 # W32: v_cmp_f_f16_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x00,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] # W64: v_cmp_f_f16_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x00,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -0x6a,0x01,0x00,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +0x7a,0x02,0x00,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 # W32: v_cmp_f_f16_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x00,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] # W64: v_cmp_f_f16_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x00,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -0x7a,0x02,0x00,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX11: v_cmp_f_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x00,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7c,0x83,0x00,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmp_f_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x00,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_f_f32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_f_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_f_f32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_f_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 
-0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_f_f32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_f_f32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_f_f32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_f_f32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_f_f32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_f_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_f_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_f_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_f_f32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_f_f32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_f_f32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_f_f32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_f_f32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_f_f32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_f_f32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_f_f32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_f_f32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_f_f32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x10,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x01,0x10,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
# W32: v_cmp_f_f32_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x10,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
# W64: v_cmp_f_f32_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x10,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-0x6a,0x01,0x10,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+0x7a,0x02,0x10,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
# W32: v_cmp_f_f32_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x10,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
# W64: v_cmp_f_f32_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x10,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-0x7a,0x02,0x10,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmp_f_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x10,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0x7c,0x83,0x10,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmp_f_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x10,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_f_i32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_f_i32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_f_i32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_f_i32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_f_i32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_f_i32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_f_i32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_f_i32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_f_i32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_f_i32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_f_i32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_f_i32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_f_i32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_f_i32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_f_i32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_f_i32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_f_i32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_f_i32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_f_i32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_f_i32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_f_i32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_f_i32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
# W32: v_cmp_f_i32_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
# W64: v_cmp_f_i32_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-0x6a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+0x7a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
# W32: v_cmp_f_i32_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
# W64: v_cmp_f_i32_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-0x7a,0x00,0x40,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
-# GFX11: v_cmp_f_i32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x40,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0x7c,0x00,0x40,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmp_f_i32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x40,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_f_u32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_f_u32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_f_u32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_f_u32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_f_u32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_f_u32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_f_u32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_f_u32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_f_u32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_f_u32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_f_u32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_f_u32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_f_u32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_f_u32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_f_u32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_f_u32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_f_u32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_f_u32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_f_u32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_f_u32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_f_u32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_f_u32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
# W32: v_cmp_f_u32_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
# W64: v_cmp_f_u32_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-0x6a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+0x7a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
# W32: v_cmp_f_u32_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
# W64: v_cmp_f_u32_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-0x7a,0x00,0x48,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
-# GFX11: v_cmp_f_u32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x48,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0x7c,0x00,0x48,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmp_f_u32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x48,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_ge_f16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_ge_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_ge_f16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_ge_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_ge_f16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_ge_f16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_ge_f16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_ge_f16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_ge_f16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_ge_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_ge_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ge_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_ge_f16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_ge_f16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_ge_f16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_ge_f16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_ge_f16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_ge_f16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_ge_f16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_ge_f16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_ge_f16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_ge_f16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x01,0x06,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
# W32: v_cmp_ge_f16_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x06,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
# W64: v_cmp_ge_f16_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x06,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-0x6a,0x01,0x06,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+0x7a,0x02,0x06,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
# W32: v_cmp_ge_f16_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x06,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
# W64: v_cmp_ge_f16_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x06,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-0x7a,0x02,0x06,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmp_ge_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x06,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0x7c,0x83,0x06,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmp_ge_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x06,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_ge_f32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_ge_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_ge_f32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_ge_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_ge_f32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_ge_f32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_ge_f32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_ge_f32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_ge_f32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_ge_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_ge_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ge_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_ge_f32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_ge_f32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_ge_f32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_ge_f32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_ge_f32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_ge_f32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_ge_f32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_ge_f32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_ge_f32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_ge_f32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x01,0x16,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
# W32: v_cmp_ge_f32_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x16,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
# W64: v_cmp_ge_f32_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x16,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-0x6a,0x01,0x16,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+0x7a,0x02,0x16,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
# W32: v_cmp_ge_f32_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x16,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
# W64: v_cmp_ge_f32_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x16,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-0x7a,0x02,0x16,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmp_ge_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x16,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0x7c,0x83,0x16,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmp_ge_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x16,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_ge_i16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_ge_i16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_ge_i16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_ge_i16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_ge_i16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_ge_i16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_ge_i16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_ge_i16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_ge_i16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_ge_i16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_ge_i16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ge_i16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_ge_i16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_ge_i16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_ge_i16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_ge_i16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_ge_i16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_ge_i16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_ge_i16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_ge_i16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_ge_i16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_ge_i16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
# W32: v_cmp_ge_i16_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
# W64: v_cmp_ge_i16_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-0x6a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+0x7a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
# W32: v_cmp_ge_i16_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
# W64: v_cmp_ge_i16_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-0x7a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
-# GFX11: v_cmp_ge_i16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x36,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0x7c,0x00,0x36,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmp_ge_i16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x36,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_ge_i32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_ge_i32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_ge_i32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_ge_i32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_ge_i32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_ge_i32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_ge_i32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_ge_i32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_ge_i32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_ge_i32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_ge_i32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_ge_i32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
# W32: v_cmp_ge_i32_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
# W64: v_cmp_ge_i32_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-0x6a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+0x7a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
# W32: v_cmp_ge_i32_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
# W64: v_cmp_ge_i32_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-0x7a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
-# GFX11: v_cmp_ge_i32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x46,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0x7c,0x00,0x46,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmp_ge_i32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x46,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_ge_u16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_ge_u16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_ge_u16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_ge_u16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_ge_u16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_ge_u16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_ge_u16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_ge_u16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_ge_u16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_ge_u16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_ge_u16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ge_u16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_ge_u16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_ge_u16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_ge_u16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_ge_u16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_ge_u16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_ge_u16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_ge_u16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_ge_u16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_ge_u16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_ge_u16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
# W32: v_cmp_ge_u16_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
# W64: v_cmp_ge_u16_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-0x6a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+0x7a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
# W32: v_cmp_ge_u16_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
# W64: v_cmp_ge_u16_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-0x7a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
-# GFX11: v_cmp_ge_u16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x3e,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0x7c,0x00,0x3e,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmp_ge_u16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x3e,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_ge_u32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_ge_u32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_ge_u32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_ge_u32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_ge_u32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_ge_u32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_ge_u32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_ge_u32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_ge_u32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_ge_u32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_ge_u32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_ge_u32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
# W32: v_cmp_ge_u32_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
# W64: v_cmp_ge_u32_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-0x6a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+0x7a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
# W32: v_cmp_ge_u32_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
# W64: v_cmp_ge_u32_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-0x7a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
-# GFX11: v_cmp_ge_u32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x4e,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0x7c,0x00,0x4e,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmp_ge_u32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x4e,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_gt_f16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_gt_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_gt_f16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_gt_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_gt_f16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_gt_f16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_gt_f16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_gt_f16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_gt_f16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_gt_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_gt_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_gt_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_gt_f16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_gt_f16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_gt_f16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_gt_f16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_gt_f16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_gt_f16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_gt_f16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_gt_f16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_gt_f16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_gt_f16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x01,0x04,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
# W32: v_cmp_gt_f16_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x04,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
# W64: v_cmp_gt_f16_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x04,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-0x6a,0x01,0x04,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+0x7a,0x02,0x04,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
# W32: v_cmp_gt_f16_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x04,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
# W64: v_cmp_gt_f16_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x04,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-0x7a,0x02,0x04,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmp_gt_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x04,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0x7c,0x83,0x04,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmp_gt_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x04,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_gt_f32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_gt_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_gt_f32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_gt_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_gt_f32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_gt_f32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_gt_f32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_gt_f32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_gt_f32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_gt_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_gt_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_gt_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_gt_f32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_gt_f32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_gt_f32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_gt_f32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_gt_f32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_gt_f32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_gt_f32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_gt_f32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_gt_f32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_gt_f32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x01,0x14,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
# W32: v_cmp_gt_f32_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x14,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
# W64: v_cmp_gt_f32_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x14,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-0x6a,0x01,0x14,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+0x7a,0x02,0x14,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
# W32: v_cmp_gt_f32_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x14,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
# W64: v_cmp_gt_f32_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x14,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-0x7a,0x02,0x14,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmp_gt_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x14,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0x7c,0x83,0x14,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmp_gt_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x14,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_gt_i16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_gt_i16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_gt_i16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_gt_i16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_gt_i16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding:
[0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_gt_i16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_gt_i16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_gt_i16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_gt_i16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_gt_i16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_gt_i16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_gt_i16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_gt_i16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_gt_i16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_gt_i16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_gt_i16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_gt_i16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_gt_i16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_gt_i16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_gt_i16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_gt_i16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_gt_i16_e64_dpp 
s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_gt_i16_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_gt_i16_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_gt_i16_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_gt_i16_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -0x7a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_cmp_gt_i16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x34,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7c,0x00,0x34,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmp_gt_i16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x34,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_gt_i32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_gt_i32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_gt_i32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_gt_i32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_gt_i32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_gt_i32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_gt_i32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_gt_i32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_gt_i32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_gt_i32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; 
encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_gt_i32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_gt_i32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_gt_i32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_gt_i32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_gt_i32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_gt_i32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_gt_i32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_gt_i32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_gt_i32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_gt_i32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_gt_i32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_gt_i32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_gt_i32_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_gt_i32_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_gt_i32_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_gt_i32_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 
-0x7a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_cmp_gt_i32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x44,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7c,0x00,0x44,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmp_gt_i32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x44,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_gt_u16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_gt_u16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_gt_u16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_gt_u16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_gt_u16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_gt_u16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_gt_u16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_gt_u16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_gt_u16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_gt_u16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_gt_u16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_gt_u16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_gt_u16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_gt_u16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff 
+0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_gt_u16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_gt_u16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_gt_u16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_gt_u16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_gt_u16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_gt_u16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_gt_u16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_gt_u16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_gt_u16_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_gt_u16_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_gt_u16_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_gt_u16_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -0x7a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_cmp_gt_u16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x3c,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7c,0x00,0x3c,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmp_gt_u16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x3c,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_gt_u32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_gt_u32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: 
v_cmp_gt_u32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_gt_u32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_gt_u32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_gt_u32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_gt_u32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_gt_u32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_gt_u32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_gt_u32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_gt_u32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_gt_u32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_gt_u32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_gt_u32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_gt_u32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_gt_u32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_gt_u32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_gt_u32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_gt_u32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_gt_u32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_gt_u32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_gt_u32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_gt_u32_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_gt_u32_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_gt_u32_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_gt_u32_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -0x7a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_cmp_gt_u32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x4c,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7c,0x00,0x4c,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmp_gt_u32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x4c,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_le_f16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_le_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_le_f16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_le_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_le_f16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_le_f16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_le_f16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 
# W64: v_cmp_le_f16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_le_f16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_le_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_le_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_le_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_le_f16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_le_f16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_le_f16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_le_f16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_le_f16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_le_f16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_le_f16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_le_f16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_le_f16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_le_f16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x01,0x03,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 # W32: v_cmp_le_f16_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x03,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] # W64: v_cmp_le_f16_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x6a,0x01,0x03,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -0x6a,0x01,0x03,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +0x7a,0x02,0x03,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 # W32: v_cmp_le_f16_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x03,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] # W64: v_cmp_le_f16_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x03,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -0x7a,0x02,0x03,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX11: v_cmp_le_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x03,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7c,0x83,0x03,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmp_le_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x03,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_le_f32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_le_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_le_f32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_le_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_le_f32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_le_f32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_le_f32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_le_f32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_le_f32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_le_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_le_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_le_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_le_f32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_le_f32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_le_f32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_le_f32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_le_f32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_le_f32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_le_f32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_le_f32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_le_f32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_le_f32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x01,0x13,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 # W32: v_cmp_le_f32_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x13,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] # W64: v_cmp_le_f32_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x13,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -0x6a,0x01,0x13,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +0x7a,0x02,0x13,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 # W32: v_cmp_le_f32_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x13,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] # W64: v_cmp_le_f32_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x13,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -0x7a,0x02,0x13,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX11: v_cmp_le_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x13,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7c,0x83,0x13,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmp_le_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0x7c,0x83,0x13,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_le_i16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_le_i16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_le_i16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_le_i16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_le_i16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_le_i16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_le_i16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_le_i16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_le_i16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_le_i16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_le_i16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_le_i16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_le_i16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_le_i16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_le_i16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_le_i16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff 
+0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_le_i16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_le_i16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_le_i16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_le_i16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_le_i16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_le_i16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_le_i16_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_le_i16_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_le_i16_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_le_i16_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -0x7a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_cmp_le_i16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x33,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7c,0x00,0x33,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmp_le_i16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x33,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_le_i32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_le_i32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_le_i32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_le_i32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: 
v_cmp_le_i32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_le_i32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_le_i32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_le_i32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_le_i32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_le_i32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_le_i32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_le_i32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_le_i32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_le_i32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_le_i32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_le_i32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_le_i32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_le_i32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_le_i32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_le_i32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_le_i32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0x68,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_le_i32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_le_i32_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_le_i32_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_le_i32_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_le_i32_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -0x7a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_cmp_le_i32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x43,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7c,0x00,0x43,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmp_le_i32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x43,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_le_u16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_le_u16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_le_u16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_le_u16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_le_u16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_le_u16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_le_u16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_le_u16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_le_u16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_le_u16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_le_u16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_le_u16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_le_u16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_le_u16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_le_u16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_le_u16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_le_u16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_le_u16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_le_u16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_le_u16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_le_u16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_le_u16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_le_u16_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_le_u16_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_le_u16_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_le_u16_e64_dpp 
ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -0x7a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_cmp_le_u16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x3b,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7c,0x00,0x3b,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmp_le_u16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x3b,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_le_u32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_le_u32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_le_u32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_le_u32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_le_u32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_le_u32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_le_u32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_le_u32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_le_u32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_le_u32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_le_u32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_le_u32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_le_u32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_le_u32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_le_u32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_le_u32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_le_u32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_le_u32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_le_u32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_le_u32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_le_u32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_le_u32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_le_u32_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_le_u32_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_le_u32_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_le_u32_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -0x7a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_cmp_le_u32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x4b,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7c,0x00,0x4b,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmp_le_u32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x4b,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_lg_f16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_lg_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 
-0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_lg_f16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_lg_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_lg_f16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_lg_f16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_lg_f16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_lg_f16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_lg_f16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_lg_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_lg_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_lg_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_lg_f16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_lg_f16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_lg_f16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_lg_f16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_lg_f16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_lg_f16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff 
+0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_lg_f16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_lg_f16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_lg_f16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_lg_f16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x01,0x05,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 # W32: v_cmp_lg_f16_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x05,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] # W64: v_cmp_lg_f16_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x05,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -0x6a,0x01,0x05,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +0x7a,0x02,0x05,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 # W32: v_cmp_lg_f16_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x05,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] # W64: v_cmp_lg_f16_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x05,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -0x7a,0x02,0x05,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX11: v_cmp_lg_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x05,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7c,0x83,0x05,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmp_lg_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x05,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_lg_f32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_lg_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_lg_f32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_lg_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_lg_f32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_lg_f32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff 
+0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_lg_f32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_lg_f32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_lg_f32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_lg_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_lg_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_lg_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_lg_f32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_lg_f32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_lg_f32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_lg_f32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_lg_f32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_lg_f32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_lg_f32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_lg_f32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_lg_f32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_lg_f32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x01,0x15,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 # W32: v_cmp_lg_f32_e64_dpp vcc_lo, 
|v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x15,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] # W64: v_cmp_lg_f32_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x15,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -0x6a,0x01,0x15,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +0x7a,0x02,0x15,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 # W32: v_cmp_lg_f32_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x15,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] # W64: v_cmp_lg_f32_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x15,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -0x7a,0x02,0x15,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX11: v_cmp_lg_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x15,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7c,0x83,0x15,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmp_lg_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x15,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_lt_f16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_lt_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_lt_f16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_lt_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_lt_f16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_lt_f16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_lt_f16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_lt_f16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_lt_f16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_lt_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_lt_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf 
bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_lt_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_lt_f16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_lt_f16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_lt_f16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_lt_f16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_lt_f16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_lt_f16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_lt_f16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_lt_f16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_lt_f16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_lt_f16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x01,0x01,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 # W32: v_cmp_lt_f16_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x01,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] # W64: v_cmp_lt_f16_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x01,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -0x6a,0x01,0x01,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +0x7a,0x02,0x01,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 # W32: v_cmp_lt_f16_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x01,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] # W64: v_cmp_lt_f16_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x01,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -0x7a,0x02,0x01,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX11: v_cmp_lt_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x01,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 
0x7c,0x83,0x01,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmp_lt_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x01,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_lt_f32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_lt_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_lt_f32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_lt_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_lt_f32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_lt_f32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_lt_f32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_lt_f32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_lt_f32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_lt_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_lt_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_lt_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_lt_f32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_lt_f32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_lt_f32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_lt_f32_e64_dpp 
s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_lt_f32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_lt_f32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_lt_f32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_lt_f32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_lt_f32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_lt_f32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x01,0x11,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 # W32: v_cmp_lt_f32_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x11,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] # W64: v_cmp_lt_f32_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x11,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -0x6a,0x01,0x11,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +0x7a,0x02,0x11,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 # W32: v_cmp_lt_f32_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x11,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] # W64: v_cmp_lt_f32_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x11,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -0x7a,0x02,0x11,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX11: v_cmp_lt_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x11,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7c,0x83,0x11,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmp_lt_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x11,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_lt_i16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_lt_i16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_lt_i16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_lt_i16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] 
row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_lt_i16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_lt_i16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_lt_i16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_lt_i16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_lt_i16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_lt_i16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_lt_i16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_lt_i16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_lt_i16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_lt_i16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_lt_i16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_lt_i16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_lt_i16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_lt_i16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_lt_i16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_lt_i16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 
-0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_lt_i16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_lt_i16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_lt_i16_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_lt_i16_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_lt_i16_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_lt_i16_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -0x7a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_cmp_lt_i16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x31,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7c,0x00,0x31,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmp_lt_i16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x31,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_lt_i32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_lt_i32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_lt_i32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_lt_i32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_lt_i32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_lt_i32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_lt_i32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_lt_i32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 
-0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_lt_i32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_lt_i32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_lt_i32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_lt_i32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_lt_i32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_lt_i32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_lt_i32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_lt_i32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_lt_i32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_lt_i32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_lt_i32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_lt_i32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_lt_i32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_lt_i32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_lt_i32_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_lt_i32_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 
+0x7a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_lt_i32_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_lt_i32_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -0x7a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_cmp_lt_i32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x41,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7c,0x00,0x41,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmp_lt_i32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x41,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_lt_u16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_lt_u16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_lt_u16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_lt_u16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_lt_u16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_lt_u16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_lt_u16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_lt_u16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_lt_u16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_lt_u16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_lt_u16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_lt_u16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # 
W32: v_cmp_lt_u16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_lt_u16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_lt_u16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_lt_u16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_lt_u16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_lt_u16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_lt_u16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_lt_u16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_lt_u16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_lt_u16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_lt_u16_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_lt_u16_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_lt_u16_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_lt_u16_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -0x7a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_cmp_lt_u16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x39,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7c,0x00,0x39,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmp_lt_u16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x39,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_lt_u32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_lt_u32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_lt_u32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_lt_u32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_lt_u32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_lt_u32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_lt_u32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_lt_u32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_lt_u32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_lt_u32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_lt_u32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_lt_u32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_lt_u32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_lt_u32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_lt_u32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_lt_u32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_lt_u32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: 
v_cmp_lt_u32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_lt_u32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_lt_u32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_lt_u32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_lt_u32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
# W32: v_cmp_lt_u32_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
# W64: v_cmp_lt_u32_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-0x6a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+0x7a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
# W32: v_cmp_lt_u32_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
# W64: v_cmp_lt_u32_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-0x7a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
-# GFX11: v_cmp_lt_u32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x49,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0x7c,0x00,0x49,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmp_lt_u32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x49,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_ne_i16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_ne_i16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_ne_i16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_ne_i16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_ne_i16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_ne_i16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_ne_i16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_ne_i16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_ne_i16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_ne_i16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_ne_i16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ne_i16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_ne_i16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_ne_i16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_ne_i16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_ne_i16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_ne_i16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_ne_i16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_ne_i16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_ne_i16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_ne_i16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_ne_i16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
# W32: v_cmp_ne_i16_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
# W64: v_cmp_ne_i16_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-0x6a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+0x7a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
# W32: v_cmp_ne_i16_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
# W64: v_cmp_ne_i16_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-0x7a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
-# GFX11: v_cmp_ne_i16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x35,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0x7c,0x00,0x35,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmp_ne_i16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x35,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_ne_i32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_ne_i32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_ne_i32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_ne_i32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_ne_i32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_ne_i32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_ne_i32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_ne_i32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_ne_i32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_ne_i32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_ne_i32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ne_i32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_ne_i32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_ne_i32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_ne_i32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_ne_i32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_ne_i32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_ne_i32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_ne_i32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_ne_i32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_ne_i32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_ne_i32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
# W32: v_cmp_ne_i32_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
# W64: v_cmp_ne_i32_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-0x6a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+0x7a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
# W32: v_cmp_ne_i32_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
# W64: v_cmp_ne_i32_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-0x7a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
-# GFX11: v_cmp_ne_i32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x45,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0x7c,0x00,0x45,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmp_ne_i32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x45,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_ne_u16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_ne_u16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_ne_u16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_ne_u16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_ne_u16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_ne_u16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_ne_u16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_ne_u16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_ne_u16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_ne_u16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_ne_u16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ne_u16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_ne_u16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_ne_u16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_ne_u16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_ne_u16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_ne_u16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_ne_u16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_ne_u16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_ne_u16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_ne_u16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_ne_u16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
# W32: v_cmp_ne_u16_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
# W64: v_cmp_ne_u16_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-0x6a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+0x7a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
# W32: v_cmp_ne_u16_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
# W64: v_cmp_ne_u16_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-0x7a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
-# GFX11: v_cmp_ne_u16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x3d,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0x7c,0x00,0x3d,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmp_ne_u16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x3d,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_ne_u32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_ne_u32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_ne_u32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_ne_u32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_ne_u32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_ne_u32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_ne_u32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_ne_u32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_ne_u32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_ne_u32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_ne_u32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ne_u32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_ne_u32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_ne_u32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_ne_u32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_ne_u32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_ne_u32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_ne_u32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_ne_u32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_ne_u32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_ne_u32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_ne_u32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
# W32: v_cmp_ne_u32_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
# W64: v_cmp_ne_u32_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-0x6a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+0x7a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
# W32: v_cmp_ne_u32_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
# W64: v_cmp_ne_u32_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-0x7a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
-# GFX11: v_cmp_ne_u32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x4d,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0x7c,0x00,0x4d,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmp_ne_u32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x4d,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_neq_f16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_neq_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_neq_f16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_neq_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_neq_f16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_neq_f16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_neq_f16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_neq_f16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_neq_f16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_neq_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_neq_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_neq_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_neq_f16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_neq_f16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_neq_f16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_neq_f16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_neq_f16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_neq_f16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_neq_f16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_neq_f16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_neq_f16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_neq_f16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x01,0x0d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
# W32: v_cmp_neq_f16_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x0d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
# W64: v_cmp_neq_f16_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x0d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-0x6a,0x01,0x0d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+0x7a,0x02,0x0d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
# W32: v_cmp_neq_f16_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x0d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
# W64: v_cmp_neq_f16_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x0d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-0x7a,0x02,0x0d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmp_neq_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x0d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0x7c,0x83,0x0d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmp_neq_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x0d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_neq_f32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_neq_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_neq_f32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_neq_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_neq_f32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_neq_f32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_neq_f32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_neq_f32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_neq_f32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_neq_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_neq_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_neq_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_neq_f32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_neq_f32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_neq_f32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_neq_f32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_neq_f32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_neq_f32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_neq_f32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_neq_f32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_neq_f32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_neq_f32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x01,0x1d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
# W32: v_cmp_neq_f32_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x1d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
# W64: v_cmp_neq_f32_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x1d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-0x6a,0x01,0x1d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+0x7a,0x02,0x1d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
# W32: v_cmp_neq_f32_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x1d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
# W64: v_cmp_neq_f32_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x1d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-0x7a,0x02,0x1d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmp_neq_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x1d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0x7c,0x83,0x1d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmp_neq_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x1d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_nge_f16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_nge_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_nge_f16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_nge_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_nge_f16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_nge_f16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_nge_f16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_nge_f16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_nge_f16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_nge_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_nge_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_nge_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_nge_f16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_nge_f16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_nge_f16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_nge_f16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_nge_f16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_nge_f16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_nge_f16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_nge_f16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_nge_f16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_nge_f16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x01,0x09,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
# W32: v_cmp_nge_f16_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x09,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
# W64: v_cmp_nge_f16_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x09,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-0x6a,0x01,0x09,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+0x7a,0x02,0x09,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
# W32: v_cmp_nge_f16_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x09,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
# W64: v_cmp_nge_f16_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x09,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-0x7a,0x02,0x09,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmp_nge_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x09,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0x7c,0x83,0x09,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmp_nge_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x09,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_nge_f32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_nge_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_nge_f32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_nge_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_nge_f32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_nge_f32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_nge_f32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_nge_f32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_nge_f32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_nge_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_nge_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_nge_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_nge_f32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_nge_f32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_nge_f32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_nge_f32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_nge_f32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_nge_f32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_nge_f32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_nge_f32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_nge_f32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_nge_f32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x01,0x19,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
# W32: v_cmp_nge_f32_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x19,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
# W64: v_cmp_nge_f32_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x19,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-0x6a,0x01,0x19,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+0x7a,0x02,0x19,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
# W32: v_cmp_nge_f32_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x19,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
# W64: v_cmp_nge_f32_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x19,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-0x7a,0x02,0x19,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmp_nge_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x19,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0x7c,0x83,0x19,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmp_nge_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x19,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_ngt_f16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_ngt_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_ngt_f16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_ngt_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_ngt_f16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_ngt_f16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_ngt_f16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_ngt_f16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_ngt_f16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_ngt_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_ngt_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ngt_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_ngt_f16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_ngt_f16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_ngt_f16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_ngt_f16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_ngt_f16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_ngt_f16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_ngt_f16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_ngt_f16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_ngt_f16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_ngt_f16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x01,0x0b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
# W32: v_cmp_ngt_f16_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x0b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
# W64: v_cmp_ngt_f16_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x0b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-0x6a,0x01,0x0b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+0x7a,0x02,0x0b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
# W32: v_cmp_ngt_f16_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x0b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
# W64: v_cmp_ngt_f16_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x0b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-0x7a,0x02,0x0b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmp_ngt_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x0b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0x7c,0x83,0x0b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmp_ngt_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x0b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_ngt_f32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_ngt_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_ngt_f32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_ngt_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_ngt_f32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_ngt_f32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_ngt_f32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_ngt_f32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_ngt_f32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_ngt_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_ngt_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ngt_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_ngt_f32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_ngt_f32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_ngt_f32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_ngt_f32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_ngt_f32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_ngt_f32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_ngt_f32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_ngt_f32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_ngt_f32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_ngt_f32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x01,0x1b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
# W32: v_cmp_ngt_f32_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x1b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
# W64: v_cmp_ngt_f32_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x1b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-0x6a,0x01,0x1b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+0x7a,0x02,0x1b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
# W32: v_cmp_ngt_f32_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x1b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
# W64: v_cmp_ngt_f32_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x1b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-0x7a,0x02,0x1b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmp_ngt_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x1b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0x7c,0x83,0x1b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmp_ngt_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x1b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_nle_f16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_nle_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_nle_f16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_nle_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_nle_f16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_nle_f16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_nle_f16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_nle_f16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_nle_f16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_nle_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_nle_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_nle_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_nle_f16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_nle_f16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_nle_f16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_nle_f16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_nle_f16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_nle_f16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_nle_f16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_nle_f16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_nle_f16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_nle_f16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x01,0x0c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
# W32: v_cmp_nle_f16_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x0c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
# W64: v_cmp_nle_f16_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x0c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-0x6a,0x01,0x0c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+0x7a,0x02,0x0c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
# W32: v_cmp_nle_f16_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x0c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
# W64: v_cmp_nle_f16_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x0c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-0x7a,0x02,0x0c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmp_nle_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x0c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0x7c,0x83,0x0c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmp_nle_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x0c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_nle_f32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_nle_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_nle_f32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_nle_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_nle_f32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_nle_f32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_nle_f32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_nle_f32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_nle_f32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_nle_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_nle_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_nle_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_nle_f32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_nle_f32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_nle_f32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_nle_f32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_nle_f32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_nle_f32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_nle_f32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_nle_f32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_nle_f32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64:
v_cmp_nle_f32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x01,0x1c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 # W32: v_cmp_nle_f32_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x1c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] # W64: v_cmp_nle_f32_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x1c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -0x6a,0x01,0x1c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +0x7a,0x02,0x1c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 # W32: v_cmp_nle_f32_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x1c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] # W64: v_cmp_nle_f32_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x1c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -0x7a,0x02,0x1c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX11: v_cmp_nle_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x1c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7c,0x83,0x1c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmp_nle_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x1c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_nlg_f16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_nlg_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_nlg_f16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_nlg_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_nlg_f16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_nlg_f16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_nlg_f16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_nlg_f16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_nlg_f16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: 
v_cmp_nlg_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_nlg_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_nlg_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_nlg_f16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_nlg_f16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_nlg_f16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_nlg_f16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_nlg_f16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_nlg_f16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_nlg_f16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_nlg_f16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_nlg_f16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_nlg_f16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x01,0x0a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 # W32: v_cmp_nlg_f16_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x0a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] # W64: v_cmp_nlg_f16_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x0a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -0x6a,0x01,0x0a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +0x7a,0x02,0x0a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 # W32: v_cmp_nlg_f16_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x0a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] # W64: v_cmp_nlg_f16_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 
bank_mask:0x3 ; encoding: [0x7a,0x02,0x0a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -0x7a,0x02,0x0a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX11: v_cmp_nlg_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x0a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7c,0x83,0x0a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmp_nlg_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x0a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_nlg_f32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_nlg_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_nlg_f32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_nlg_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_nlg_f32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_nlg_f32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_nlg_f32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_nlg_f32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_nlg_f32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_nlg_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_nlg_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_nlg_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_nlg_f32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_nlg_f32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_nlg_f32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_nlg_f32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_nlg_f32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_nlg_f32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_nlg_f32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_nlg_f32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_nlg_f32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_nlg_f32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x01,0x1a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 # W32: v_cmp_nlg_f32_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x1a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] # W64: v_cmp_nlg_f32_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x1a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -0x6a,0x01,0x1a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +0x7a,0x02,0x1a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 # W32: v_cmp_nlg_f32_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x1a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] # W64: v_cmp_nlg_f32_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x1a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -0x7a,0x02,0x1a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX11: v_cmp_nlg_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x1a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7c,0x83,0x1a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmp_nlg_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x1a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_nlt_f16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_nlt_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_nlt_f16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_nlt_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_nlt_f16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_nlt_f16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_nlt_f16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_nlt_f16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_nlt_f16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_nlt_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_nlt_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_nlt_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_nlt_f16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_nlt_f16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_nlt_f16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_nlt_f16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_nlt_f16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_nlt_f16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 
-0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
 # W32: v_cmp_nlt_f16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 # W64: v_cmp_nlt_f16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
 # W32: v_cmp_nlt_f16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 # W64: v_cmp_nlt_f16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x01,0x0e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
 # W32: v_cmp_nlt_f16_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x0e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
 # W64: v_cmp_nlt_f16_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x0e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-0x6a,0x01,0x0e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+0x7a,0x02,0x0e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
 # W32: v_cmp_nlt_f16_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x0e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 # W64: v_cmp_nlt_f16_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x0e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-0x7a,0x02,0x0e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmp_nlt_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x0e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0x7c,0x83,0x0e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmp_nlt_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x0e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
 # W32: v_cmp_nlt_f32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 # W64: v_cmp_nlt_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
 # W32: v_cmp_nlt_f32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 # W64: v_cmp_nlt_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
 # W32: v_cmp_nlt_f32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 # W64: v_cmp_nlt_f32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
 # W32: v_cmp_nlt_f32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 # W64: v_cmp_nlt_f32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
 # W32: v_cmp_nlt_f32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 # W64: v_cmp_nlt_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
 # W32: v_cmp_nlt_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 # W64: v_cmp_nlt_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
 # W32: v_cmp_nlt_f32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 # W64: v_cmp_nlt_f32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
 # W32: v_cmp_nlt_f32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 # W64: v_cmp_nlt_f32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
 # W32: v_cmp_nlt_f32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 # W64: v_cmp_nlt_f32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
 # W32: v_cmp_nlt_f32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 # W64: v_cmp_nlt_f32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
 # W32: v_cmp_nlt_f32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 # W64: v_cmp_nlt_f32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x01,0x1e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
 # W32: v_cmp_nlt_f32_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x1e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
 # W64: v_cmp_nlt_f32_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x1e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-0x6a,0x01,0x1e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+0x7a,0x02,0x1e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
 # W32: v_cmp_nlt_f32_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x1e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 # W64: v_cmp_nlt_f32_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x1e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-0x7a,0x02,0x1e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmp_nlt_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x1e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0x7c,0x83,0x1e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmp_nlt_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x1e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
 # W32: v_cmp_o_f16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 # W64: v_cmp_o_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
 # W32: v_cmp_o_f16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 # W64: v_cmp_o_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
 # W32: v_cmp_o_f16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 # W64: v_cmp_o_f16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
 # W32: v_cmp_o_f16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 # W64: v_cmp_o_f16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
 # W32: v_cmp_o_f16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 # W64: v_cmp_o_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
 # W32: v_cmp_o_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 # W64: v_cmp_o_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
 # W32: v_cmp_o_f16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 # W64: v_cmp_o_f16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
 # W32: v_cmp_o_f16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 # W64: v_cmp_o_f16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
 # W32: v_cmp_o_f16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 # W64: v_cmp_o_f16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
 # W32: v_cmp_o_f16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 # W64: v_cmp_o_f16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
 # W32: v_cmp_o_f16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 # W64: v_cmp_o_f16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x01,0x07,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
 # W32: v_cmp_o_f16_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x07,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
 # W64: v_cmp_o_f16_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x07,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-0x6a,0x01,0x07,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+0x7a,0x02,0x07,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
 # W32: v_cmp_o_f16_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x07,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 # W64: v_cmp_o_f16_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x07,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-0x7a,0x02,0x07,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmp_o_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x07,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0x7c,0x83,0x07,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmp_o_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x07,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
 # W32: v_cmp_o_f32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 # W64: v_cmp_o_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
 # W32: v_cmp_o_f32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 # W64: v_cmp_o_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
 # W32: v_cmp_o_f32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 # W64: v_cmp_o_f32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
 # W32: v_cmp_o_f32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 # W64: v_cmp_o_f32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
 # W32: v_cmp_o_f32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 # W64: v_cmp_o_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
 # W32: v_cmp_o_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 # W64: v_cmp_o_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
 # W32: v_cmp_o_f32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 # W64: v_cmp_o_f32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
 # W32: v_cmp_o_f32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 # W64: v_cmp_o_f32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
 # W32: v_cmp_o_f32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 # W64: v_cmp_o_f32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
 # W32: v_cmp_o_f32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 # W64: v_cmp_o_f32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
 # W32: v_cmp_o_f32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 # W64: v_cmp_o_f32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x01,0x17,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
 # W32: v_cmp_o_f32_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x17,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
 # W64: v_cmp_o_f32_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x17,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-0x6a,0x01,0x17,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+0x7a,0x02,0x17,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
 # W32: v_cmp_o_f32_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x17,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 # W64: v_cmp_o_f32_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x17,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-0x7a,0x02,0x17,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmp_o_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x17,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0x7c,0x83,0x17,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmp_o_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x17,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
 # W32: v_cmp_t_f16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 # W64: v_cmp_t_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
 # W32: v_cmp_t_f16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 # W64: v_cmp_t_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
 # W32: v_cmp_t_f16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 # W64: v_cmp_t_f16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
 # W32: v_cmp_t_f16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 # W64: v_cmp_t_f16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
 # W32: v_cmp_t_f16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 # W64: v_cmp_t_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
 # W32: v_cmp_t_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 # W64: v_cmp_t_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
 # W32: v_cmp_t_f16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 # W64: v_cmp_t_f16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
 # W32: v_cmp_t_f16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 # W64: v_cmp_t_f16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
 # W32: v_cmp_t_f16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 # W64: v_cmp_t_f16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
 # W32: v_cmp_t_f16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 # W64: v_cmp_t_f16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
 # W32: v_cmp_t_f16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 # W64: v_cmp_t_f16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x0f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x01,0x0f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
 # W32: v_cmp_t_f16_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x0f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
 # W64: v_cmp_t_f16_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x0f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-0x6a,0x01,0x0f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+0x7a,0x02,0x0f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
 # W32: v_cmp_t_f16_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x0f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 # W64: v_cmp_t_f16_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x0f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-0x7a,0x02,0x0f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmp_t_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x0f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0x7c,0x83,0x0f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmp_t_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x0f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
 # W32: v_cmp_t_f32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 # W64: v_cmp_t_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
 # W32: v_cmp_t_f32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 # W64: v_cmp_t_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
 # W32: v_cmp_t_f32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 # W64: v_cmp_t_f32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
 # W32: v_cmp_t_f32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 # W64: v_cmp_t_f32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
 # W32: v_cmp_t_f32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 # W64: v_cmp_t_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
 # W32: v_cmp_t_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 # W64: v_cmp_t_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
 # W32: v_cmp_t_f32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 # W64: v_cmp_t_f32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
 # W32: v_cmp_t_f32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 # W64: v_cmp_t_f32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
 # W32: v_cmp_t_f32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 # W64: v_cmp_t_f32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
 # W32: v_cmp_t_f32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 # W64: v_cmp_t_f32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
 # W32: v_cmp_t_f32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 # W64: v_cmp_t_f32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x1f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x01,0x1f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
 # W32: v_cmp_t_f32_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x1f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
 # W64: v_cmp_t_f32_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x1f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-0x6a,0x01,0x1f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+0x7a,0x02,0x1f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
 # W32: v_cmp_t_f32_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x1f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 # W64: v_cmp_t_f32_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x1f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-0x7a,0x02,0x1f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmp_t_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x1f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0x7c,0x83,0x1f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmp_t_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x1f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
 # W32: v_cmp_t_i32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 # W64: v_cmp_t_i32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
 # W32: v_cmp_t_i32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 # W64: v_cmp_t_i32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
 # W32: v_cmp_t_i32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 # W64: v_cmp_t_i32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
 # W32: v_cmp_t_i32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 # W64: v_cmp_t_i32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
 # W32: v_cmp_t_i32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 # W64: v_cmp_t_i32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
 # W32: v_cmp_t_i32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 # W64: v_cmp_t_i32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_t_i32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_t_i32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_t_i32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_t_i32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_t_i32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_t_i32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_t_i32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_t_i32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_t_i32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_t_i32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_t_i32_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_t_i32_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_t_i32_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_t_i32_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -0x7a,0x00,0x47,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_cmp_t_i32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x47,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7c,0x00,0x47,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmp_t_i32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x47,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff 
# W32: v_cmp_t_u32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_t_u32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_t_u32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_t_u32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_t_u32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_t_u32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_t_u32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_t_u32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_t_u32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_t_u32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_t_u32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_t_u32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_t_u32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_t_u32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_t_u32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_t_u32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_t_u32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; 
encoding: [0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_t_u32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_t_u32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_t_u32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_t_u32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_t_u32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_t_u32_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_t_u32_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_t_u32_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_t_u32_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -0x7a,0x00,0x4f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_cmp_t_u32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x4f,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7c,0x00,0x4f,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmp_t_u32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x4f,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_u_f16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_u_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_u_f16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_u_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_u_f16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: 
v_cmp_u_f16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_u_f16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_u_f16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_u_f16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_u_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_u_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_u_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_u_f16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_u_f16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_u_f16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_u_f16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_u_f16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_u_f16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_u_f16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_u_f16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_u_f16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_u_f16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0x68,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x01,0x08,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 # W32: v_cmp_u_f16_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x08,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] # W64: v_cmp_u_f16_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x08,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -0x6a,0x01,0x08,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +0x7a,0x02,0x08,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 # W32: v_cmp_u_f16_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x08,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] # W64: v_cmp_u_f16_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x08,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -0x7a,0x02,0x08,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX11: v_cmp_u_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x08,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7c,0x83,0x08,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmp_u_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x08,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_u_f32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_u_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_u_f32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_u_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_u_f32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_u_f32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_u_f32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_u_f32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_u_f32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_u_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_u_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_u_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_u_f32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_u_f32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_u_f32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_u_f32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_u_f32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_u_f32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_u_f32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_u_f32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_u_f32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_u_f32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x01,0x18,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 # W32: v_cmp_u_f32_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x18,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] # W64: v_cmp_u_f32_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x18,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -0x6a,0x01,0x18,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +0x7a,0x02,0x18,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 # W32: v_cmp_u_f32_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x18,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] # W64: v_cmp_u_f32_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x18,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 
-0x7a,0x02,0x18,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX11: v_cmp_u_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x18,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7c,0x83,0x18,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmp_u_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x18,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt index cd97006838d3e..4ce26199bcc08 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt @@ -1,2614 +1,2617 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s +# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s -# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX11: 
v_cmpx_class_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_class_f16_e64_dpp -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x01,0xfd,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30] 0x7e,0x01,0xfd,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmpx_class_f16_e64_dpp -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x01,0xfd,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30] -# GFX11: v_cmpx_class_f32_e64_dpp v1, v2 
quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_class_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_class_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_class_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_class_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_class_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_class_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_class_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_class_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_class_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_class_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_class_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_class_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_class_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_class_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_class_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_class_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_class_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_class_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX11: 
v_cmpx_class_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_class_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_class_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_class_f32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_class_f32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_class_f32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_class_f32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_class_f32_e64_dpp -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x01,0xfe,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30] 0x7e,0x01,0xfe,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmpx_class_f32_e64_dpp -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x01,0xfe,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30] -# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX11: 
v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_eq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x01,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_eq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_eq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x02,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_eq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x83,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] -# GFX11: v_cmpx_eq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 
0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_eq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_eq_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_eq_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_eq_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_eq_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_eq_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_eq_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_eq_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_eq_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_eq_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_eq_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_eq_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_eq_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_eq_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_eq_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_eq_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_eq_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_eq_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_eq_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_eq_f32_e64_dpp v1, v2 row_share:0 
row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_eq_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_eq_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x92,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x01,0x92,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_eq_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x92,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_eq_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x92,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x02,0x92,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_eq_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x92,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_eq_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x92,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x83,0x92,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmpx_eq_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x92,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] -# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff 
+# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_eq_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmpx_eq_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] -# GFX11: v_cmpx_eq_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_eq_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_eq_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf 
bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_eq_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_eq_i32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_eq_i32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_eq_i32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_eq_i32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_eq_i32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_eq_i32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_eq_i32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_eq_i32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_eq_i32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_eq_i32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_eq_i32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_eq_i32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_eq_i32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_eq_i32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_eq_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_eq_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_eq_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_eq_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_eq_i32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_eq_i32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_eq_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_eq_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_eq_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0x7e,0x00,0xc2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_eq_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_eq_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0x7e,0x00,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_eq_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_eq_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_eq_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_eq_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_eq_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_eq_u32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_eq_u32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_eq_u32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_eq_u32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_eq_u32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_eq_u32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_eq_u32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_eq_u32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_eq_u32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_eq_u32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_eq_u32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_eq_u32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_eq_u32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_eq_u32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_eq_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_eq_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_eq_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_eq_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_eq_u32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_eq_u32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_eq_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_eq_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_eq_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0x7e,0x00,0xca,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_eq_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_f_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x80,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
 0x7e,0x01,0x80,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_f_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x80,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_f_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x80,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 0x7e,0x02,0x80,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_f_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x80,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_f_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x80,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0x7e,0x83,0x80,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_f_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x80,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_f_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_f_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_f_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_f_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_f_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_f_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_f_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_f_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_f_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_f_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_f_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_f_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_f_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_f_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_f_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_f_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_f_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_f_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_f_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_f_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_f_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_f_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_f_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x90,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
 0x7e,0x01,0x90,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_f_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x90,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_f_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x90,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 0x7e,0x02,0x90,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_f_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x90,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_f_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x90,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0x7e,0x83,0x90,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_f_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x90,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_f_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_f_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_f_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_f_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_f_i32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_f_i32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_f_i32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_f_i32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_f_i32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_f_i32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_f_i32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_f_i32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_f_i32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_f_i32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_f_i32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_f_i32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_f_i32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_f_i32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_f_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_f_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_f_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_f_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_f_i32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_f_i32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_f_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_f_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc0,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_f_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc0,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0x7e,0x00,0xc0,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_f_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc0,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_f_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_f_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_f_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_f_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_f_u32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_f_u32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_f_u32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_f_u32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_f_u32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_f_u32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_f_u32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_f_u32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_f_u32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_f_u32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_f_u32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_f_u32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_f_u32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_f_u32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_f_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_f_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_f_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_f_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_f_u32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_f_u32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_f_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_f_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc8,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_f_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc8,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0x7e,0x00,0xc8,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_f_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc8,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_ge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
 0x7e,0x01,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_ge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_ge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 0x7e,0x02,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_ge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0x7e,0x83,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_ge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_ge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_ge_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_ge_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_ge_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_ge_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_ge_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_ge_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_ge_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_ge_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_ge_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_ge_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_ge_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_ge_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_ge_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_ge_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_ge_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_ge_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_ge_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_ge_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_ge_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_ge_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_ge_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x96,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
 0x7e,0x01,0x96,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_ge_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x96,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_ge_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x96,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 0x7e,0x02,0x96,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_ge_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x96,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_ge_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x96,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0x7e,0x83,0x96,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_ge_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x96,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_ge_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0x7e,0x00,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_ge_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_ge_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_ge_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_ge_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_ge_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_ge_i32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_ge_i32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_ge_i32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_ge_i32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_ge_i32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_ge_i32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_ge_i32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_ge_i32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_ge_i32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_ge_i32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_ge_i32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_ge_i32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_ge_i32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_ge_i32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_ge_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_ge_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_ge_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_ge_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_ge_i32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_ge_i32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_ge_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_ge_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_ge_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0x7e,0x00,0xc6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_ge_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_ge_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0x7e,0x00,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_ge_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_ge_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_ge_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_ge_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_ge_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_ge_u32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_ge_u32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_ge_u32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_ge_u32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_ge_u32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_ge_u32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_ge_u32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_ge_u32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_ge_u32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_ge_u32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_ge_u32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_ge_u32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_ge_u32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_ge_u32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_ge_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_ge_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_ge_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_ge_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_ge_u32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_ge_u32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_ge_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_ge_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_ge_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0x7e,0x00,0xce,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_ge_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_gt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
 0x7e,0x01,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_gt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_gt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 0x7e,0x02,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_gt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0x7e,0x83,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_gt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_gt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_gt_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_gt_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_gt_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_gt_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_gt_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_gt_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_gt_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_gt_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_gt_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_gt_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_gt_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_gt_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_gt_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_gt_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_gt_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_gt_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_gt_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_gt_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_gt_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_gt_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_gt_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x94,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
 0x7e,0x01,0x94,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_gt_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x94,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_gt_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x94,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 0x7e,0x02,0x94,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_gt_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x94,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_gt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x94,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0x7e,0x83,0x94,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_gt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x94,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding:
[0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX11: 
v_cmpx_gt_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_gt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmpx_gt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] -# GFX11: v_cmpx_gt_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_gt_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_gt_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_gt_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_gt_i32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_gt_i32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_gt_i32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_gt_i32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_gt_i32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_gt_i32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_gt_i32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_gt_i32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_gt_i32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_gt_i32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_gt_i32_e64_dpp v1, v2 row_shr:15 
row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_gt_i32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_gt_i32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_gt_i32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_gt_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_gt_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_gt_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_gt_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_gt_i32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_gt_i32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_gt_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_gt_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_gt_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xc4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmpx_gt_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] -# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 
row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 
0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_gt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmpx_gt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] -# GFX11: v_cmpx_gt_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_gt_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_gt_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_gt_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_gt_u32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_gt_u32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_gt_u32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_gt_u32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_gt_u32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_gt_u32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_gt_u32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_gt_u32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_gt_u32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_gt_u32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_gt_u32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_gt_u32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_gt_u32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_gt_u32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_gt_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_gt_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_gt_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_gt_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_gt_u32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_gt_u32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_gt_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_gt_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_gt_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xcc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmpx_gt_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] -# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_le_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x01,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_le_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_le_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x02,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_le_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -# 
GFX11: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x83,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] -# GFX11: v_cmpx_le_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_le_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_le_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_le_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_le_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_le_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_le_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_le_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_le_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_le_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_le_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_le_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_le_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_le_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_le_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_le_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_le_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 
0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_le_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_le_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_le_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_le_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_le_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_le_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x93,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x01,0x93,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_le_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x93,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_le_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x93,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x02,0x93,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_le_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x93,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_le_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x93,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x83,0x93,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmpx_le_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x93,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] -# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_le_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 
0x7e,0x00,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmpx_le_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] -# GFX11: v_cmpx_le_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_le_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_le_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_le_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_le_i32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_le_i32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_le_i32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_le_i32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_le_i32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_le_i32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_le_i32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_le_i32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_le_i32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_le_i32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_le_i32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_le_i32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_le_i32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_le_i32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX11: 
v_cmpx_le_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_le_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_le_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_le_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_le_i32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_le_i32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_le_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_le_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_le_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xc3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmpx_le_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] -# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 
0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_le_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmpx_le_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] -# GFX11: v_cmpx_le_u32_e64_dpp 
v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_le_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_le_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_le_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_le_u32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_le_u32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_le_u32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_le_u32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_le_u32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_le_u32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_le_u32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_le_u32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_le_u32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_le_u32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_le_u32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_le_u32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_le_u32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_le_u32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_le_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_le_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_le_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_le_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_le_u32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_le_u32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_le_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_le_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_le_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0x7e,0x00,0xcb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_le_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_lg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
 0x7e,0x01,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_lg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_lg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 0x7e,0x02,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_lg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0x7e,0x83,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_lg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_lg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_lg_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_lg_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_lg_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_lg_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_lg_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_lg_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_lg_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_lg_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_lg_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_lg_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_lg_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_lg_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_lg_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_lg_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_lg_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_lg_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_lg_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_lg_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_lg_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_lg_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_lg_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x95,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
 0x7e,0x01,0x95,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_lg_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x95,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_lg_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x95,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 0x7e,0x02,0x95,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_lg_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x95,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_lg_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x95,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0x7e,0x83,0x95,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_lg_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x95,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_lt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x81,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
 0x7e,0x01,0x81,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_lt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x81,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_lt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x81,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 0x7e,0x02,0x81,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_lt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x81,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_lt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x81,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0x7e,0x83,0x81,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_lt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x81,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_lt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_lt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_lt_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_lt_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_lt_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_lt_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_lt_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_lt_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_lt_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_lt_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_lt_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_lt_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_lt_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_lt_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_lt_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_lt_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_lt_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_lt_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_lt_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_lt_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_lt_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_lt_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_lt_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x91,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
 0x7e,0x01,0x91,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_lt_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x91,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_lt_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x91,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 0x7e,0x02,0x91,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_lt_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x91,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_lt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x91,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0x7e,0x83,0x91,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_lt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x91,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_lt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0x7e,0x00,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_lt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_lt_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_lt_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_lt_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_lt_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_lt_i32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_lt_i32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_lt_i32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_lt_i32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_lt_i32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_lt_i32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_lt_i32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_lt_i32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_lt_i32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_lt_i32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_lt_i32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_lt_i32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_lt_i32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_lt_i32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_lt_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_lt_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_lt_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_lt_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_lt_i32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_lt_i32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_lt_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_lt_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_lt_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0x7e,0x00,0xc1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_lt_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_lt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0x7e,0x00,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_lt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_lt_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_lt_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_lt_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_lt_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_lt_u32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_lt_u32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_lt_u32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_lt_u32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_lt_u32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_lt_u32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_lt_u32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_lt_u32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_lt_u32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_lt_u32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_lt_u32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_lt_u32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_lt_u32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_lt_u32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_lt_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_lt_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_lt_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_lt_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_lt_u32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_lt_u32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_lt_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_lt_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_lt_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0x7e,0x00,0xc9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_lt_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_ne_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0x7e,0x00,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_ne_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_ne_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_ne_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_ne_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_ne_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_ne_i32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_ne_i32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_ne_i32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_ne_i32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_ne_i32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_ne_i32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_ne_i32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_ne_i32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_ne_i32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_ne_i32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_ne_i32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_ne_i32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_ne_i32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_ne_i32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_ne_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_ne_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_ne_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_ne_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_ne_i32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_ne_i32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_ne_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_ne_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_ne_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0x7e,0x00,0xc5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_ne_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_ne_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0x7e,0x00,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_ne_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_ne_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_ne_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_ne_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_ne_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_ne_u32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_ne_u32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_ne_u32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_ne_u32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_ne_u32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_ne_u32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_ne_u32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_ne_u32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_ne_u32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_ne_u32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_ne_u32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_ne_u32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_ne_u32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_ne_u32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_ne_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_ne_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_ne_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_ne_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_ne_u32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_ne_u32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_ne_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_ne_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_ne_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0x7e,0x00,0xcd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_ne_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_neq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
 0x7e,0x01,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_neq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_neq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 0x7e,0x02,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_neq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0x7e,0x83,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_neq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_neq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_neq_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_neq_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_neq_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_neq_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_neq_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_neq_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_neq_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_neq_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ;
encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_neq_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_neq_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_neq_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_neq_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_neq_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_neq_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_neq_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_neq_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_neq_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_neq_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_neq_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_neq_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_neq_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x9d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x01,0x9d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_neq_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x9d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_neq_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x9d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x02,0x9d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_neq_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x9d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_neq_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x83,0x9d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmpx_neq_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] -# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_nge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x01,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_nge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_nge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x02,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_nge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x83,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] -# GFX11: v_cmpx_nge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_nge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_nge_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_nge_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_nge_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_nge_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_nge_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_nge_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_nge_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_nge_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_nge_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; 
encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_nge_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_nge_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_nge_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_nge_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_nge_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_nge_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_nge_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_nge_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_nge_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_nge_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_nge_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_nge_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x99,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x01,0x99,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_nge_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x99,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_nge_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x99,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x02,0x99,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_nge_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x99,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x99,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x83,0x99,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x99,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] -# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 
quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x01,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_ngt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x02,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_ngt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x83,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] -# GFX11: v_cmpx_ngt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_ngt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_ngt_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_ngt_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_ngt_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_ngt_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_ngt_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_ngt_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_ngt_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_ngt_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_ngt_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX11: 
v_cmpx_ngt_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_ngt_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_ngt_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_ngt_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_ngt_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_ngt_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_ngt_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_ngt_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_ngt_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_ngt_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_ngt_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_ngt_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x9b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x01,0x9b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_ngt_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x9b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_ngt_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x9b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x02,0x9b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_ngt_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x9b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x83,0x9b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] -# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX11: 
v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_share:0 
row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_nle_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x01,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_nle_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_nle_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x02,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_nle_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x83,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] -# GFX11: v_cmpx_nle_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_nle_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_nle_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_nle_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_nle_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_nle_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_nle_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_nle_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_nle_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_nle_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_nle_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_nle_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_nle_f32_e64_dpp v1, v2 
row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_nle_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_nle_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_nle_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_nle_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_nle_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_nle_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_nle_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_nle_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_nle_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_nle_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x9c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x01,0x9c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_nle_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x9c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_nle_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x9c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x02,0x9c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_nle_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x9c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x83,0x9c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] -# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 
0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 
row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x01,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_nlg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x02,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_nlg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x83,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] -# GFX11: v_cmpx_nlg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_nlg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_nlg_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_nlg_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_nlg_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_nlg_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_nlg_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_nlg_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_nlg_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_nlg_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_nlg_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_nlg_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_nlg_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 
0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_nlg_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_nlg_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_nlg_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_nlg_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_nlg_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_nlg_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_nlg_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_nlg_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_nlg_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_nlg_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x9a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x01,0x9a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_nlg_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x9a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_nlg_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x9a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x02,0x9a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_nlg_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x9a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x83,0x9a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] -# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
0x7e,0x01,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_nlt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
0x7e,0x02,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_nlt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0x7e,0x83,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_nlt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_nlt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_nlt_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_nlt_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_nlt_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_nlt_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_nlt_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_nlt_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_nlt_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_nlt_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_nlt_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_nlt_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_nlt_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_nlt_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_nlt_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_nlt_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_nlt_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_nlt_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_nlt_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_nlt_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_nlt_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_nlt_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_nlt_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x9e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
0x7e,0x01,0x9e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_nlt_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x9e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_nlt_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x9e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
0x7e,0x02,0x9e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_nlt_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x9e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_nlt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0x7e,0x83,0x9e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_nlt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_o_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
0x7e,0x01,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_o_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_o_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
0x7e,0x02,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_o_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0x7e,0x83,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_o_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_o_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_o_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_o_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_o_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_o_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_o_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_o_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_o_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_o_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_o_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_o_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_o_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_o_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_o_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_o_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_o_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_o_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_o_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_o_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_o_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_o_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_o_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x97,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
0x7e,0x01,0x97,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_o_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x97,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_o_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x97,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
0x7e,0x02,0x97,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_o_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x97,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x97,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0x7e,0x83,0x97,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x97,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_t_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
0x7e,0x01,0x8f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_t_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_t_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
0x7e,0x02,0x8f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_t_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_t_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0x7e,0x83,0x8f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_t_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_t_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_t_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_t_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_t_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_t_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_t_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_t_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_t_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_t_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_t_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_t_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_t_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_t_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_t_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_t_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_t_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_t_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_t_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_t_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_t_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_t_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_t_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_t_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x9f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
0x7e,0x01,0x9f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_t_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x9f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_t_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x9f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
0x7e,0x02,0x9f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_t_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x9f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_t_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0x7e,0x83,0x9f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_t_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_t_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_t_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_t_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_t_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_t_i32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_t_i32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_t_i32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_t_i32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_t_i32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_t_i32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_t_i32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_t_i32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_t_i32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_t_i32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_t_i32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_t_i32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_t_i32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_t_i32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_t_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_t_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_t_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_t_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_t_i32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_t_i32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_t_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_t_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc7,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_t_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc7,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0x7e,0x00,0xc7,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_t_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc7,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_t_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_t_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_t_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_t_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_t_u32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_t_u32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_t_u32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_t_u32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_t_u32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_t_u32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_t_u32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_t_u32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_t_u32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_t_u32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_t_u32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_t_u32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_t_u32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_t_u32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_t_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_t_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_t_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_t_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_t_u32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_t_u32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_t_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_t_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xcf,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_t_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xcf,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0x7e,0x00,0xcf,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_t_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xcf,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_u_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
0x7e,0x01,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_u_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_u_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
0x7e,0x02,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_u_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0x7e,0x83,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_u_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_u_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_u_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_u_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_u_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_u_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_u_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_u_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_u_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_u_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_u_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_u_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_u_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_u_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_u_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_u_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_u_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_u_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_u_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_u_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_u_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_u_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_u_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x98,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
0x7e,0x01,0x98,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_u_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x98,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_u_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x98,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
0x7e,0x02,0x98,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_u_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x98,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_u_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x98,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0x7e,0x83,0x98,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_u_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x98,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
# Check that dst value does not affect disassembly
-# GFX11: v_cmpx_u_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x00,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_u_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# Check that dst value does not affect disassembly
-# GFX11: v_cmpx_u_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0xff,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_u_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopc.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopc.txt
index 3587f13b6684f..902cbc2d6f20f 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopc.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopc.txt
@@ -1,1180 +1,1183 @@
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W32 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W64 %s
+# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W32 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W32 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W64 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W64 %s
+0x0a,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_class_f16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_class_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_class_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_class_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_class_f16_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_class_f16_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x6a,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x7a,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_class_f16_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_class_f16_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x7a,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_class_f16_e64_dpp null, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x01,0x7d,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]
0x7c,0x01,0x7d,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_class_f16_e64_dpp null, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x01,0x7d,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_class_f32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_class_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_class_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_class_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_class_f32_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_class_f32_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x6a,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x7a,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_class_f32_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_class_f32_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x7a,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_class_f32_e64_dpp null, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x01,0x7e,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]
0x7c,0x01,0x7e,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_class_f32_e64_dpp null, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x01,0x7e,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x02,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_f16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x02,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x02,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x02,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x02,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x02,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x02,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x02,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x02,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_f16_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x02,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_f16_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x02,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x02,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x02,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_f16_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x02,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_f16_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x02,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x02,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_eq_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x02,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
0x7c,0x83,0x02,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_eq_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x02,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x12,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_f32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x12,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x12,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x12,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x12,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x12,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x12,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x12,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x12,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_f32_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x12,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_f32_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x12,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x12,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x12,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_f32_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x12,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_f32_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x12,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x12,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_eq_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x12,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
0x7c,0x83,0x12,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_eq_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x12,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_i16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_i16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_i16_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_i16_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x6a,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x7a,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_i16_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_i16_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x7a,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_eq_i16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x32,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7c,0x00,0x32,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_eq_i16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x32,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_i32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_i32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_i32_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_i32_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x6a,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x7a,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_i32_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_i32_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x7a,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_eq_i32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x42,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7c,0x00,0x42,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_eq_i32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x42,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_u16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_u16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_u16_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_u16_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x6a,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x7a,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_u16_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_u16_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x7a,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_eq_u16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x3a,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7c,0x00,0x3a,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_eq_u16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x3a,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_u32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_u32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_u32_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_u32_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x6a,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x7a,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_u32_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_u32_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x7a,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_eq_u32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x4a,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7c,0x00,0x4a,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_eq_u32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x4a,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x00,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_f_f16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x00,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_f_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x00,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x00,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x00,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_f_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x00,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_f_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x00,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x00,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x00,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
# W32: v_cmp_f_f16_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x00,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
# W64: v_cmp_f_f16_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x00,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x00,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x00,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
# W32: v_cmp_f_f16_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x00,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
# W64: v_cmp_f_f16_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x00,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x00,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_f_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x00,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
0x7c,0x83,0x00,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_f_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x00,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x10,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_f_f32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x10,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_f_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x10,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x10,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x10,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_f_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x10,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_f_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x10,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x10,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x10,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
# W32: v_cmp_f_f32_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x10,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
# W64: v_cmp_f_f32_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x10,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x10,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x10,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
# W32: v_cmp_f_f32_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x10,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
# W64: v_cmp_f_f32_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x10,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x10,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_f_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x10,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
0x7c,0x83,0x10,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_f_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x10,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x40,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_f_i32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x40,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_f_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x40,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x40,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x40,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_f_i32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x40,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_f_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x40,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x40,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x00,0x40,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_f_i32_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x40,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_f_i32_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding:
[0x6a,0x00,0x40,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x6a,0x00,0x40,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x7a,0x00,0x40,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_f_i32_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x40,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_f_i32_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x40,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x7a,0x00,0x40,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmp_f_i32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x40,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7c,0x00,0x40,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11: v_cmp_f_i32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x40,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0x0a,0x00,0x48,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_f_u32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x48,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_f_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x48,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x48,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x48,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_f_u32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x48,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_f_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x48,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x48,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x00,0x48,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_f_u32_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x48,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_f_u32_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x48,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x6a,0x00,0x48,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x7a,0x00,0x48,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_f_u32_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x48,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_f_u32_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x48,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x7a,0x00,0x48,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmp_f_u32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x48,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7c,0x00,0x48,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11: v_cmp_f_u32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x48,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0x0a,0x00,0x06,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_f16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x06,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ge_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x06,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x06,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x06,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x06,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ge_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x68,0x00,0x06,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x06,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x01,0x06,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_f16_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x06,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] # W64: v_cmp_ge_f16_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x06,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -0x6a,0x01,0x06,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +0x7a,0x02,0x06,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_f16_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x06,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] # W64: v_cmp_ge_f16_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x06,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -0x7a,0x02,0x06,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmp_ge_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x06,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7c,0x83,0x06,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11: v_cmp_ge_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x06,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +0x0a,0x00,0x16,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_f32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x16,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ge_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x16,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x16,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x16,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x16,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ge_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x16,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x16,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x01,0x16,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_f32_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x16,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] # W64: v_cmp_ge_f32_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x16,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -0x6a,0x01,0x16,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +0x7a,0x02,0x16,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_f32_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x16,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] # W64: v_cmp_ge_f32_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x16,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -0x7a,0x02,0x16,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmp_ge_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x16,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7c,0x83,0x16,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11: v_cmp_ge_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x16,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +0x0a,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_i16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: 
v_cmp_ge_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_i16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ge_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_i16_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ge_i16_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x6a,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x7a,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_i16_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ge_i16_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x7a,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmp_ge_i16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x36,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7c,0x00,0x36,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11: v_cmp_ge_i16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x36,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0x0a,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_i32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_i32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ge_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_i32_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ge_i32_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x6a,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x7a,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_i32_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ge_i32_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x7a,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmp_ge_i32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x46,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 
0x7c,0x00,0x46,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11: v_cmp_ge_i32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x46,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0x0a,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_u16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ge_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_u16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ge_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_u16_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ge_u16_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x6a,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x7a,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_u16_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ge_u16_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x7a,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmp_ge_u16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x3e,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7c,0x00,0x3e,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11: v_cmp_ge_u16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x3e,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0x0a,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_u32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_u32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ge_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_u32_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ge_u32_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x6a,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x7a,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_u32_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7a,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ge_u32_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x7a,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmp_ge_u32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x4e,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7c,0x00,0x4e,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11: v_cmp_ge_u32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x4e,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0x0a,0x00,0x04,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_gt_f16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x04,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_gt_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x04,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x04,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x04,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_gt_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x04,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_gt_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x04,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x04,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x01,0x04,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 # W32: v_cmp_gt_f16_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x04,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] # W64: v_cmp_gt_f16_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x04,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -0x6a,0x01,0x04,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +0x7a,0x02,0x04,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 # W32: v_cmp_gt_f16_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x04,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] # W64: v_cmp_gt_f16_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x04,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -0x7a,0x02,0x04,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmp_gt_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x04,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7c,0x83,0x04,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11: v_cmp_gt_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x04,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +0x0a,0x00,0x14,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_gt_f32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x14,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_gt_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x14,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x14,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x14,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_gt_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x14,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_gt_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x14,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x14,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x01,0x14,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 # W32: v_cmp_gt_f32_e64_dpp vcc_lo, |v1|, -v2 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x14,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] # W64: v_cmp_gt_f32_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x14,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -0x6a,0x01,0x14,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +0x7a,0x02,0x14,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 # W32: v_cmp_gt_f32_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x14,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] # W64: v_cmp_gt_f32_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x14,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -0x7a,0x02,0x14,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmp_gt_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x14,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7c,0x83,0x14,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11: v_cmp_gt_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x14,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +0x0a,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_gt_i16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_gt_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_gt_i16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_gt_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_gt_i16_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_gt_i16_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x6a,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x7a,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_gt_i16_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_gt_i16_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x7a,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmp_gt_i16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x34,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7c,0x00,0x34,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11: v_cmp_gt_i16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x34,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0x0a,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_gt_i32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_gt_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: 
v_cmp_gt_i32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_gt_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_gt_i32_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_gt_i32_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x6a,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x7a,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_gt_i32_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_gt_i32_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x7a,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmp_gt_i32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x44,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7c,0x00,0x44,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11: v_cmp_gt_i32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x44,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0x0a,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_gt_u16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_gt_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_gt_u16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_gt_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_gt_u16_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_gt_u16_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x6a,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x7a,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_gt_u16_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_gt_u16_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x7a,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmp_gt_u16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x3c,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7c,0x00,0x3c,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11: v_cmp_gt_u16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x3c,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0x0a,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: 
v_cmp_gt_u32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_gt_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_gt_u32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_gt_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_gt_u32_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_gt_u32_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x6a,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x7a,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_gt_u32_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_gt_u32_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x7a,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmp_gt_u32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x4c,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7c,0x00,0x4c,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11: v_cmp_gt_u32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x4c,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0x0a,0x00,0x03,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_le_f16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x03,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_le_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x03,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x03,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x03,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_le_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x03,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_le_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x03,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x03,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x01,0x03,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 # W32: v_cmp_le_f16_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x03,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] # W64: v_cmp_le_f16_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x03,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -0x6a,0x01,0x03,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +0x7a,0x02,0x03,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 # W32: v_cmp_le_f16_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x03,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] # W64: v_cmp_le_f16_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x03,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -0x7a,0x02,0x03,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: 
v_cmp_le_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x03,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7c,0x83,0x03,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11: v_cmp_le_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x03,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +0x0a,0x00,0x13,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_le_f32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x13,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_le_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x13,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x13,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x13,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_le_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x13,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_le_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x13,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x13,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x01,0x13,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 # W32: v_cmp_le_f32_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x13,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] # W64: v_cmp_le_f32_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x13,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -0x6a,0x01,0x13,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +0x7a,0x02,0x13,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 # W32: v_cmp_le_f32_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x13,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] # W64: v_cmp_le_f32_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x13,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -0x7a,0x02,0x13,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmp_le_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x13,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7c,0x83,0x13,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11: v_cmp_le_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x13,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +0x0a,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_le_i16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_le_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_le_i16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_le_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_le_i16_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_le_i16_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 
-0x6a,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x7a,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_le_i16_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_le_i16_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x7a,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmp_le_i16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x33,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7c,0x00,0x33,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11: v_cmp_le_i16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x33,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0x0a,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_le_i32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_le_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_le_i32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_le_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_le_i32_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_le_i32_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x6a,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x7a,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_le_i32_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_le_i32_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x7a,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmp_le_i32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x43,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7c,0x00,0x43,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11: v_cmp_le_i32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x43,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0x0a,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_le_u16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_le_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_le_u16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_le_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 
-0x68,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_le_u16_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_le_u16_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x6a,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x7a,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_le_u16_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_le_u16_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x7a,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmp_le_u16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x3b,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7c,0x00,0x3b,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11: v_cmp_le_u16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x3b,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0x0a,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_le_u32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_le_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_le_u32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_le_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_le_u32_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_le_u32_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x6a,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x7a,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_le_u32_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_le_u32_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x7a,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmp_le_u32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x4b,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7c,0x00,0x4b,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11: v_cmp_le_u32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x4b,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0x0a,0x00,0x05,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_lg_f16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x05,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_lg_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x05,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 
-0x0a,0x00,0x05,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x05,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_lg_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x05,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_lg_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x05,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x05,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x01,0x05,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 # W32: v_cmp_lg_f16_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x05,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] # W64: v_cmp_lg_f16_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x05,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -0x6a,0x01,0x05,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +0x7a,0x02,0x05,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 # W32: v_cmp_lg_f16_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x05,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] # W64: v_cmp_lg_f16_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x05,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -0x7a,0x02,0x05,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmp_lg_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x05,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7c,0x83,0x05,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11: v_cmp_lg_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x05,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +0x0a,0x00,0x15,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_lg_f32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x15,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_lg_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x15,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x15,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x15,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_lg_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x15,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_lg_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x15,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x15,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x01,0x15,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 # W32: v_cmp_lg_f32_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x15,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] # W64: v_cmp_lg_f32_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x15,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -0x6a,0x01,0x15,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +0x7a,0x02,0x15,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 # W32: v_cmp_lg_f32_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x15,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] # W64: v_cmp_lg_f32_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x15,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -0x7a,0x02,0x15,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmp_lg_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x15,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7c,0x83,0x15,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11: v_cmp_lg_f32_e64_dpp null, 
-|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x15,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +0x0a,0x00,0x01,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_lt_f16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x01,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_lt_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x01,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x01,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x01,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_lt_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x01,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_lt_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x01,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x01,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x01,0x01,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 # W32: v_cmp_lt_f16_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x01,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] # W64: v_cmp_lt_f16_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x01,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -0x6a,0x01,0x01,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +0x7a,0x02,0x01,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 # W32: v_cmp_lt_f16_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x01,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] # W64: v_cmp_lt_f16_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x01,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -0x7a,0x02,0x01,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmp_lt_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x01,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7c,0x83,0x01,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11: v_cmp_lt_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x01,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +0x0a,0x00,0x11,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_lt_f32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x11,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_lt_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x11,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x11,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x11,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_lt_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x11,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_lt_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x11,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x11,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x01,0x11,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 # W32: v_cmp_lt_f32_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x11,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] # W64: v_cmp_lt_f32_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x11,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -0x6a,0x01,0x11,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +0x7a,0x02,0x11,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 # W32: v_cmp_lt_f32_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x11,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] # W64: 
v_cmp_lt_f32_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x11,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x11,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_lt_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x11,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7c,0x83,0x11,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_lt_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x11,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_lt_i16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_lt_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_lt_i16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_lt_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_lt_i16_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_lt_i16_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x6a,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x7a,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_lt_i16_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_lt_i16_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x7a,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_lt_i16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x31,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0x7c,0x00,0x31,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_lt_i16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x31,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_lt_i32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_lt_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_lt_i32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_lt_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_lt_i32_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_lt_i32_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x6a,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x7a,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_lt_i32_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_lt_i32_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x7a,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_lt_i32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x41,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0x7c,0x00,0x41,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_lt_i32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x41,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_lt_u16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_lt_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_lt_u16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_lt_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_lt_u16_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_lt_u16_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x6a,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x7a,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_lt_u16_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_lt_u16_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x7a,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_lt_u16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x39,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0x7c,0x00,0x39,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_lt_u16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x39,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_lt_u32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_lt_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_lt_u32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_lt_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_lt_u32_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_lt_u32_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x6a,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x7a,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_lt_u32_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_lt_u32_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x7a,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_lt_u32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x49,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0x7c,0x00,0x49,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_lt_u32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x49,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_ne_i16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_ne_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_ne_i16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_ne_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_ne_i16_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_ne_i16_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x6a,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x7a,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_ne_i16_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_ne_i16_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x7a,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_ne_i16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x35,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0x7c,0x00,0x35,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_ne_i16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x35,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_ne_i32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_ne_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_ne_i32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_ne_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_ne_i32_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_ne_i32_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x6a,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x7a,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_ne_i32_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_ne_i32_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x7a,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_ne_i32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x45,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0x7c,0x00,0x45,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_ne_i32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x45,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_ne_u16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_ne_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_ne_u16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_ne_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_ne_u16_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_ne_u16_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x6a,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x7a,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_ne_u16_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_ne_u16_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x7a,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_ne_u16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x3d,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0x7c,0x00,0x3d,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_ne_u16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x3d,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_ne_u32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_ne_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_ne_u32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_ne_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_ne_u32_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_ne_u32_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x6a,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x7a,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_ne_u32_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_ne_u32_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x7a,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_ne_u32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x4d,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0x7c,0x00,0x4d,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_ne_u32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x4d,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x0d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_neq_f16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_neq_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x0d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x0d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_neq_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_neq_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x0d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x0d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
 # W32: v_cmp_neq_f16_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x0d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
 # W64: v_cmp_neq_f16_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x0d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x0d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x0d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
 # W32: v_cmp_neq_f16_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x0d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 # W64: v_cmp_neq_f16_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x0d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x0d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_neq_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x0d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7c,0x83,0x0d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_neq_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x0d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x1d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_neq_f32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_neq_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x1d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x1d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_neq_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_neq_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x1d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x1d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
 # W32: v_cmp_neq_f32_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x1d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
 # W64: v_cmp_neq_f32_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x1d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x1d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x1d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
 # W32: v_cmp_neq_f32_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x1d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 # W64: v_cmp_neq_f32_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x1d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x1d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_neq_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x1d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7c,0x83,0x1d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_neq_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x1d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x09,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_nge_f16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x09,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_nge_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x09,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x09,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x09,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_nge_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x09,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_nge_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x09,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x09,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x09,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
 # W32: v_cmp_nge_f16_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x09,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
 # W64: v_cmp_nge_f16_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x09,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x09,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x09,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
 # W32: v_cmp_nge_f16_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x09,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 # W64: v_cmp_nge_f16_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x09,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x09,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_nge_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x09,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7c,0x83,0x09,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_nge_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x09,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x19,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_nge_f32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x19,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_nge_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x19,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x19,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x19,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_nge_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x19,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_nge_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x19,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x19,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x19,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
 # W32: v_cmp_nge_f32_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x19,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
 # W64: v_cmp_nge_f32_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x19,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x19,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x19,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
 # W32: v_cmp_nge_f32_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x19,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 # W64: v_cmp_nge_f32_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x19,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x19,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_nge_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x19,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7c,0x83,0x19,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_nge_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x19,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x0b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_ngt_f16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_ngt_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x0b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x0b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_ngt_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_ngt_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x0b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x0b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
 # W32: v_cmp_ngt_f16_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x0b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
 # W64: v_cmp_ngt_f16_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x0b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x0b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x0b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
 # W32: v_cmp_ngt_f16_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x0b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 # W64: v_cmp_ngt_f16_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x0b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x0b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_ngt_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x0b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7c,0x83,0x0b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_ngt_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x0b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x1b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_ngt_f32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_ngt_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x1b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x1b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_ngt_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_ngt_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x1b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x1b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
 # W32: v_cmp_ngt_f32_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x1b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
 # W64: v_cmp_ngt_f32_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x1b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x1b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x1b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
 # W32: v_cmp_ngt_f32_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x1b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 # W64: v_cmp_ngt_f32_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x1b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x1b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_ngt_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x1b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7c,0x83,0x1b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_ngt_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x1b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x0c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_nle_f16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_nle_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x0c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x0c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_nle_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_nle_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x0c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x0c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
 # W32: v_cmp_nle_f16_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x0c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
 # W64: v_cmp_nle_f16_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x0c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x0c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x0c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
 # W32: v_cmp_nle_f16_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x0c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 # W64: v_cmp_nle_f16_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x0c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x0c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_nle_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x0c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7c,0x83,0x0c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_nle_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x0c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x1c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_nle_f32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_nle_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x1c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x1c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_nle_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_nle_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x1c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x1c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
 # W32: v_cmp_nle_f32_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x1c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
 # W64: v_cmp_nle_f32_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x1c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x1c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x1c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
 # W32: v_cmp_nle_f32_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x1c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 # W64: v_cmp_nle_f32_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x1c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x1c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_nle_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x1c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7c,0x83,0x1c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_nle_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x1c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x0a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_nlg_f16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_nlg_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x0a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x0a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_nlg_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_nlg_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x0a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x0a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
 # W32: v_cmp_nlg_f16_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x0a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
 # W64: v_cmp_nlg_f16_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x0a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x0a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x0a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
 # W32: v_cmp_nlg_f16_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x0a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 # W64: v_cmp_nlg_f16_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x0a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x0a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_nlg_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x0a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7c,0x83,0x0a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_nlg_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x0a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x1a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_nlg_f32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_nlg_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x1a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x1a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_nlg_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_nlg_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x1a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x1a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
 # W32: v_cmp_nlg_f32_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x1a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
 # W64: v_cmp_nlg_f32_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x1a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x1a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x1a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
 # W32: v_cmp_nlg_f32_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x1a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 # W64: v_cmp_nlg_f32_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x1a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x1a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_nlg_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x1a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7c,0x83,0x1a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_nlg_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x1a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x0e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_nlt_f16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_nlt_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x0e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x0e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_nlt_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_nlt_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x0e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x0e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
 # W32: v_cmp_nlt_f16_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x0e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
 # W64: v_cmp_nlt_f16_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x0e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x0e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x0e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
 # W32: v_cmp_nlt_f16_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x0e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 # W64: v_cmp_nlt_f16_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x0e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x0e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_nlt_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x0e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7c,0x83,0x0e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_nlt_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x0e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x1e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_nlt_f32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_nlt_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x1e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x1e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_nlt_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_nlt_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x1e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x1e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
 # W32: v_cmp_nlt_f32_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x1e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
 # W64: v_cmp_nlt_f32_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x1e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x1e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x1e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
 # W32: v_cmp_nlt_f32_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x1e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 # W64: v_cmp_nlt_f32_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x1e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x1e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_nlt_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x1e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7c,0x83,0x1e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_nlt_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x1e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x07,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_o_f16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x07,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_o_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x07,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x07,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x07,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_o_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x07,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_o_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x07,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x07,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x07,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
 # W32: v_cmp_o_f16_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x07,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
 # W64: v_cmp_o_f16_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x07,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x07,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x07,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
 # W32: v_cmp_o_f16_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x07,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 # W64: v_cmp_o_f16_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x07,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x07,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_o_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x07,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7c,0x83,0x07,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_o_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x07,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x17,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_o_f32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x17,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_o_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x17,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x17,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x17,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_o_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x17,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_o_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x17,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x17,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x17,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
 # W32: v_cmp_o_f32_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x17,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
 # W64: v_cmp_o_f32_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x17,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x17,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x17,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
 # W32: v_cmp_o_f32_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x17,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 # W64: v_cmp_o_f32_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x17,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x17,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_o_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x17,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7c,0x83,0x17,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_o_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x17,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x0f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_t_f16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_t_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x0f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x0f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_t_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_t_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x0f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x0f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
 # W32: v_cmp_t_f16_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x0f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
 # W64: v_cmp_t_f16_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x0f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x0f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x0f,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
 # W32: v_cmp_t_f16_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x0f,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 # W64: v_cmp_t_f16_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x0f,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x0f,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_t_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x0f,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7c,0x83,0x0f,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_t_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x0f,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x1f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_t_f32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_t_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x1f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x1f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_t_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_t_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x1f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x1f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
 # W32: v_cmp_t_f32_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x1f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
 # W64: v_cmp_t_f32_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x1f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x1f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x1f,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
 # W32: v_cmp_t_f32_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x1f,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 # W64: v_cmp_t_f32_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x1f,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x1f,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_t_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x1f,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7c,0x83,0x1f,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_t_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x1f,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x47,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_t_i32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x47,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_t_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x47,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x47,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x47,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_t_i32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x47,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_t_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x47,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x47,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x00,0x47,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_t_i32_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x47,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_t_i32_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x47,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x6a,0x00,0x47,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x7a,0x00,0x47,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_t_i32_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x47,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_t_i32_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x47,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x7a,0x00,0x47,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_t_i32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x47,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0x7c,0x00,0x47,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_t_i32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x47,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x4f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_t_u32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_t_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x4f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x4f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_t_u32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_t_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x4f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x00,0x4f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_t_u32_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x4f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_t_u32_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x4f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x6a,0x00,0x4f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x7a,0x00,0x4f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_t_u32_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x4f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_t_u32_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x4f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x7a,0x00,0x4f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_t_u32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x4f,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0x7c,0x00,0x4f,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_t_u32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x4f,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x08,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_u_f16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x08,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_u_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x08,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x08,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x08,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_u_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x08,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_u_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x08,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x08,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x08,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
 # W32: v_cmp_u_f16_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x08,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
 # W64: v_cmp_u_f16_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x08,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x08,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x08,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
 # W32: v_cmp_u_f16_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x08,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 # W64: v_cmp_u_f16_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x08,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x08,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_u_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x08,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7c,0x83,0x08,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_u_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x08,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x18,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_u_f32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x18,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_u_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x18,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x18,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x18,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # W32: v_cmp_u_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x18,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_u_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x18,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x18,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x18,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
 # W32: v_cmp_u_f32_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x18,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
 # W64: v_cmp_u_f32_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x18,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x18,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x18,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
 # W32: v_cmp_u_f32_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x18,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 # W64: v_cmp_u_f32_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x18,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x18,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX11: v_cmp_u_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x18,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7c,0x83,0x18,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX11: v_cmp_u_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x18,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt
index 14a4c1b295210..a703568f5c6f2 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt
@@ -1,574 +1,577 @@
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s
+# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s
-# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_class_f16_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x01,0xfd,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]
 0x7e,0x01,0xfd,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00
+# GFX11: v_cmpx_class_f16_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x01,0xfd,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]
-# GFX11: v_cmpx_class_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xfe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_class_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_class_f32_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x01,0xfe,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]
 0x7e,0x01,0xfe,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00
+# GFX11: v_cmpx_class_f32_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x01,0xfe,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]
-# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_eq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
 0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_eq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_eq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x82,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 0x7e,0x02,0x82,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_eq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x82,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7e,0x83,0x82,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX11: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
-# GFX11: v_cmpx_eq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x92,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0x92,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_eq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x92,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_eq_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x92,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
 0x7e,0x01,0x92,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_eq_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x92,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_eq_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x92,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 0x7e,0x02,0x92,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_eq_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x92,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_eq_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x92,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7e,0x83,0x92,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX11: v_cmpx_eq_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x92,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
-# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_eq_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0x7e,0x00,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX11: v_cmpx_eq_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-# GFX11: v_cmpx_eq_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xc2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_eq_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_eq_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0x7e,0x00,0xc2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX11: v_cmpx_eq_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_eq_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0x7e,0x00,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX11: v_cmpx_eq_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-# GFX11: v_cmpx_eq_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xca,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xca,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_eq_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xca,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_eq_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xca,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0x7e,0x00,0xca,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX11: v_cmpx_eq_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xca,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x80,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0x80,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x80,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_f_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x80,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
 0x7e,0x01,0x80,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_f_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x80,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_f_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x80,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 0x7e,0x02,0x80,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_f_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x80,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_f_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x80,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7e,0x83,0x80,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX11: v_cmpx_f_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x80,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
-# GFX11: v_cmpx_f_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x90,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0x90,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_f_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x90,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_f_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x90,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
 0x7e,0x01,0x90,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_f_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x90,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_f_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x90,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 0x7e,0x02,0x90,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_f_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x90,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_f_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x90,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7e,0x83,0x90,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX11: v_cmpx_f_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x90,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
-# GFX11: v_cmpx_f_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc0,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xc0,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_f_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc0,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_f_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc0,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0x7e,0x00,0xc0,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX11: v_cmpx_f_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc0,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-# GFX11: v_cmpx_f_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc8,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xc8,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_f_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc8,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_f_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc8,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0x7e,0x00,0xc8,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX11: v_cmpx_f_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc8,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_ge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
 0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_ge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_ge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x86,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 0x7e,0x02,0x86,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_ge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x86,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7e,0x83,0x86,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX11: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
-# GFX11: v_cmpx_ge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x96,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0x96,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_ge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x96,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_ge_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x96,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
 0x7e,0x01,0x96,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_ge_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x96,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_ge_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x96,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 0x7e,0x02,0x96,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_ge_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x96,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_ge_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x96,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7e,0x83,0x96,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX11: v_cmpx_ge_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x96,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
-# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_ge_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0x7e,0x00,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX11: v_cmpx_ge_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-# GFX11: v_cmpx_ge_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xc6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_ge_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_ge_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0x7e,0x00,0xc6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX11: v_cmpx_ge_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_ge_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0x7e,0x00,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX11: v_cmpx_ge_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-# GFX11: v_cmpx_ge_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xce,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xce,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_ge_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xce,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_ge_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xce,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0x7e,0x00,0xce,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX11: v_cmpx_ge_u32_e64_dpp v255, v255
dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xce,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_gt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_gt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_gt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x84,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x84,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_gt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x84,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x84,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_gt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x94,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x94,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_gt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x94,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_gt_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x94,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x94,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_gt_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x94,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_gt_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x94,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x94,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_gt_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x94,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_gt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x94,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x94,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_gt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x94,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_gt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_gt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# 
GFX11: v_cmpx_gt_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xc4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_gt_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_gt_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xc4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_gt_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_gt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_gt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_gt_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xcc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_gt_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_gt_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xcc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xcc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_gt_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xcc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_le_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_le_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_le_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x83,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x83,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_le_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x83,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x83,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_le_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x00,0x93,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x93,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_le_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x93,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_le_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x93,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x93,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_le_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x93,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_le_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x93,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x93,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_le_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x93,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_le_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x93,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x93,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_le_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x93,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_le_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_le_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_le_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xc3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_le_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_le_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xc3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_le_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_le_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_le_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_le_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xcb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# 
GFX11: v_cmpx_le_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_le_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xcb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xcb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_le_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xcb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_lg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_lg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_lg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x85,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x85,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_lg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x85,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x85,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_lg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x95,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x95,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_lg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x95,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_lg_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x95,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x95,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_lg_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x95,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_lg_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x95,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x95,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_lg_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x95,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_lg_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x95,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x95,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_lg_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x95,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x81,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x81,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x00,0x81,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_lt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x81,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x81,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_lt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x81,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_lt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x81,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x81,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_lt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x81,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_lt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x81,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x81,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_lt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x81,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_lt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x91,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x91,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_lt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x91,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_lt_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x91,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x91,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_lt_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x91,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_lt_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x91,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x91,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_lt_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x91,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_lt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x91,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x91,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_lt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x91,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_lt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_lt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_lt_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xc1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_lt_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_lt_i32_e64_dpp v255, v255 
dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xc1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_lt_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_lt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_lt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_lt_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xc9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_lt_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_lt_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xc9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_lt_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ne_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_ne_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_ne_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xc5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_ne_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ne_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xc5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_ne_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ne_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 
0x7e,0x00,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_ne_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_ne_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xcd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_ne_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ne_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xcd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xcd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_ne_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xcd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_neq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_neq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_neq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x8d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_neq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x8d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_neq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x9d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_neq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_neq_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x9d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x9d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_neq_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x9d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_neq_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x9d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x9d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_neq_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x9d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_neq_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x9d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_neq_f32_e64_dpp -|v255|, 
-|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_nge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_nge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_nge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x89,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x89,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_nge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x89,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x89,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_nge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x99,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x99,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_nge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x99,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_nge_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x99,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x99,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_nge_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x99,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_nge_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x99,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x99,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_nge_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x99,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x99,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x99,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x99,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ngt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x8b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_ngt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x8b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_ngt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x9b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_ngt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ngt_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x9b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x9b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_ngt_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x9b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ngt_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x9b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x9b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_ngt_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x9b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x9b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_nle_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_nle_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_nle_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x8c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_nle_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x8c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0x7e,0x83,0x8c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_nle_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x9c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_nle_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_nle_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x9c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x9c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_nle_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x9c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_nle_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x9c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x9c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_nle_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x9c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x9c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_nlg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x8a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_nlg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x8a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_nlg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x9a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_nlg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_nlg_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x9a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x9a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_nlg_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x9a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_nlg_f32_e64_dpp -v1, 
|v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x9a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x9a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_nlg_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x9a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x9a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_nlt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x8e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_nlt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x8e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_nlt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x9e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_nlt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_nlt_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x9e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x9e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_nlt_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x9e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_nlt_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x9e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x9e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_nlt_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x9e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_nlt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x9e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_nlt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_o_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_o_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_o_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x87,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x87,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_o_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x87,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x87,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_o_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x97,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x97,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_o_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x97,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_o_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x97,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x97,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_o_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x97,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_o_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x97,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x97,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_o_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x97,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x97,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x97,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x97,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x8f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_t_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x8f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_t_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_t_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8f,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x8f,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11: 
v_cmpx_t_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8f,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_t_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8f,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x8f,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_t_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8f,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_t_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x9f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_t_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_t_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x9f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x9f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_t_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x9f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_t_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x9f,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x9f,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_t_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x9f,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_t_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9f,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x9f,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_t_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9f,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_t_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc7,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xc7,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_t_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc7,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_t_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc7,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xc7,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_t_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc7,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_t_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcf,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xcf,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_t_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcf,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_t_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xcf,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xcf,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_t_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xcf,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_u_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
 0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_u_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_u_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x88,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 0x7e,0x02,0x88,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_u_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x88,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7e,0x83,0x88,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX11: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
-# GFX11: v_cmpx_u_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x98,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0x98,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_u_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x98,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_u_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x98,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
 0x7e,0x01,0x98,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_u_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x98,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_u_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x98,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 0x7e,0x02,0x98,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_u_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x98,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-# GFX11: v_cmpx_u_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x98,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7e,0x83,0x98,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX11: v_cmpx_u_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x98,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 # Check that dst value does not affect disassembly
-# GFX11: v_cmpx_u_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x98,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x00,0x00,0x98,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_u_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x98,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # Check that dst value does not affect disassembly
-# GFX11: v_cmpx_u_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x98,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0xff,0x00,0x98,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_u_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x98,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopc.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopc.txt
index a5978690d4ef1..ac1263916df85 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopc.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopc.txt
@@ -1,5227 +1,5230 @@
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W32 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W64 %s
+# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W32 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W32 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W64 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W64 %s
-# W32: v_cmp_class_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x7d,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_class_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x7d,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x7d,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_class_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x7d,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_class_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x7d,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_class_f16_e64 s10, v1, 0.5 ; encoding: [0x0a,0x00,0x7d,0xd4,0x01,0xe1,0x01,0x00]
-# W64: v_cmp_class_f16_e64 s[10:11], v1, 0.5 ; encoding: [0x0a,0x00,0x7d,0xd4,0x01,0xe1,0x01,0x00]
 0x0a,0x00,0x7d,0xd4,0x01,0xe1,0x01,0x00
+# W32: v_cmp_class_f16_e64 s10, v1, 0.5 ; encoding: [0x0a,0x00,0x7d,0xd4,0x01,0xe1,0x01,0x00]
+# W64: v_cmp_class_f16_e64 s[10:11], v1, 0.5 ; encoding: [0x0a,0x00,0x7d,0xd4,0x01,0xe1,0x01,0x00]
-# W32: v_cmp_class_f16_e64 s10, v255, v2 ; encoding: [0x0a,0x00,0x7d,0xd4,0xff,0x05,0x02,0x00]
-# W64: v_cmp_class_f16_e64 s[10:11], v255, v2 ; encoding: [0x0a,0x00,0x7d,0xd4,0xff,0x05,0x02,0x00]
 0x0a,0x00,0x7d,0xd4,0xff,0x05,0x02,0x00
+# W32: v_cmp_class_f16_e64 s10, v255, v2 ; encoding: [0x0a,0x00,0x7d,0xd4,0xff,0x05,0x02,0x00]
+# W64: v_cmp_class_f16_e64 s[10:11], v255, v2 ; encoding: [0x0a,0x00,0x7d,0xd4,0xff,0x05,0x02,0x00]
-# W32: v_cmp_class_f16_e64 s10, s1, v2 ; encoding: [0x0a,0x00,0x7d,0xd4,0x01,0x04,0x02,0x00]
-# W64: v_cmp_class_f16_e64 s[10:11], s1, v2 ; encoding: [0x0a,0x00,0x7d,0xd4,0x01,0x04,0x02,0x00]
 0x0a,0x00,0x7d,0xd4,0x01,0x04,0x02,0x00
+# W32: v_cmp_class_f16_e64 s10, s1, v2 ; encoding: [0x0a,0x00,0x7d,0xd4,0x01,0x04,0x02,0x00]
+# W64: v_cmp_class_f16_e64 s[10:11], s1, v2 ; encoding: [0x0a,0x00,0x7d,0xd4,0x01,0x04,0x02,0x00]
-# W32: v_cmp_class_f16_e64 s10, s105, v255 ; encoding: [0x0a,0x00,0x7d,0xd4,0x69,0xfe,0x03,0x00]
-# W64: v_cmp_class_f16_e64 s[10:11], s105, v255 ; encoding: [0x0a,0x00,0x7d,0xd4,0x69,0xfe,0x03,0x00]
 0x0a,0x00,0x7d,0xd4,0x69,0xfe,0x03,0x00
+# W32: v_cmp_class_f16_e64 s10, s105, v255 ; encoding: [0x0a,0x00,0x7d,0xd4,0x69,0xfe,0x03,0x00]
+# W64: v_cmp_class_f16_e64 s[10:11], s105, v255 ; encoding: [0x0a,0x00,0x7d,0xd4,0x69,0xfe,0x03,0x00]
-# W32: v_cmp_class_f16_e64 s10, vcc_lo, s2 ; encoding: [0x0a,0x00,0x7d,0xd4,0x6a,0x04,0x00,0x00]
-# W64: v_cmp_class_f16_e64 s[10:11], vcc_lo, s2 ; encoding: [0x0a,0x00,0x7d,0xd4,0x6a,0x04,0x00,0x00]
 0x0a,0x00,0x7d,0xd4,0x6a,0x04,0x00,0x00
+# W32: v_cmp_class_f16_e64 s10, vcc_lo, s2 ; encoding: [0x0a,0x00,0x7d,0xd4,0x6a,0x04,0x00,0x00]
+# W64: v_cmp_class_f16_e64 s[10:11], vcc_lo, s2 ; encoding: [0x0a,0x00,0x7d,0xd4,0x6a,0x04,0x00,0x00]
v_cmp_class_f16_e64 s[10:11], vcc_lo, s2 ; encoding: [0x0a,0x00,0x7d,0xd4,0x6a,0x04,0x00,0x00] -# W32: v_cmp_class_f16_e64 s10, vcc_hi, s105 ; encoding: [0x0a,0x00,0x7d,0xd4,0x6b,0xd2,0x00,0x00] -# W64: v_cmp_class_f16_e64 s[10:11], vcc_hi, s105 ; encoding: [0x0a,0x00,0x7d,0xd4,0x6b,0xd2,0x00,0x00] 0x0a,0x00,0x7d,0xd4,0x6b,0xd2,0x00,0x00 +# W32: v_cmp_class_f16_e64 s10, vcc_hi, s105 ; encoding: [0x0a,0x00,0x7d,0xd4,0x6b,0xd2,0x00,0x00] +# W64: v_cmp_class_f16_e64 s[10:11], vcc_hi, s105 ; encoding: [0x0a,0x00,0x7d,0xd4,0x6b,0xd2,0x00,0x00] -# W32: v_cmp_class_f16_e64 s10, ttmp15, ttmp15 ; encoding: [0x0a,0x00,0x7d,0xd4,0x7b,0xf6,0x00,0x00] -# W64: v_cmp_class_f16_e64 s[10:11], ttmp15, ttmp15 ; encoding: [0x0a,0x00,0x7d,0xd4,0x7b,0xf6,0x00,0x00] 0x0a,0x00,0x7d,0xd4,0x7b,0xf6,0x00,0x00 +# W32: v_cmp_class_f16_e64 s10, ttmp15, ttmp15 ; encoding: [0x0a,0x00,0x7d,0xd4,0x7b,0xf6,0x00,0x00] +# W64: v_cmp_class_f16_e64 s[10:11], ttmp15, ttmp15 ; encoding: [0x0a,0x00,0x7d,0xd4,0x7b,0xf6,0x00,0x00] -# W32: v_cmp_class_f16_e64 s10, m0, src_scc ; encoding: [0x0a,0x00,0x7d,0xd4,0x7d,0xfa,0x01,0x00] -# W64: v_cmp_class_f16_e64 s[10:11], m0, src_scc ; encoding: [0x0a,0x00,0x7d,0xd4,0x7d,0xfa,0x01,0x00] 0x0a,0x00,0x7d,0xd4,0x7d,0xfa,0x01,0x00 +# W32: v_cmp_class_f16_e64 s10, m0, src_scc ; encoding: [0x0a,0x00,0x7d,0xd4,0x7d,0xfa,0x01,0x00] +# W64: v_cmp_class_f16_e64 s[10:11], m0, src_scc ; encoding: [0x0a,0x00,0x7d,0xd4,0x7d,0xfa,0x01,0x00] -# W32: v_cmp_class_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x7d,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_class_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x7d,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x7d,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_class_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x7d,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_class_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x7d,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_class_f16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x7d,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_class_f16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x7d,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x00,0x7d,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_class_f16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x7d,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_class_f16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x7d,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_class_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x7d,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_class_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x7d,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x7d,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_class_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x7d,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_class_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x7d,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_class_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x7d,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_class_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x7d,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x7d,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_class_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x7d,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_class_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x7d,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_class_f16_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x7d,0xd4,0xf0,0xfa,0x00,0x00] -# W64: v_cmp_class_f16_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x7d,0xd4,0xf0,0xfa,0x00,0x00] 0x6a,0x00,0x7d,0xd4,0xf0,0xfa,0x00,0x00 +# W32: v_cmp_class_f16_e64 vcc_lo, 0.5, m0 ; encoding: 
[0x6a,0x00,0x7d,0xd4,0xf0,0xfa,0x00,0x00] +# W64: v_cmp_class_f16_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x7d,0xd4,0xf0,0xfa,0x00,0x00] +0x7a,0x00,0x7d,0xd4,0xfd,0xd4,0x00,0x00 # W32: v_cmp_class_f16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x7d,0xd4,0xfd,0xd4,0x00,0x00] # W64: v_cmp_class_f16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x7d,0xd4,0xfd,0xd4,0x00,0x00] -0x7a,0x00,0x7d,0xd4,0xfd,0xd4,0x00,0x00 -# GFX11: v_cmp_class_f16_e64 null, -|0xfe0b|, vcc_hi ; encoding: [0x7c,0x01,0x7d,0xd4,0xff,0xd6,0x00,0x20,0x0b,0xfe,0x00,0x00] 0x7c,0x01,0x7d,0xd4,0xff,0xd6,0x00,0x20,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmp_class_f16_e64 null, -|0xfe0b|, vcc_hi ; encoding: [0x7c,0x01,0x7d,0xd4,0xff,0xd6,0x00,0x20,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_class_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x7e,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_class_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x7e,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x7e,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_class_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x7e,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_class_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x7e,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_class_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x7e,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_class_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x7e,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x7e,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_class_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x7e,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_class_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x7e,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_class_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x7e,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_class_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x7e,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x7e,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_class_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x7e,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_class_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x7e,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_class_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x7e,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_class_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x7e,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x7e,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_class_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x7e,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_class_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x7e,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_class_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x7e,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_class_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x7e,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x7e,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_class_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x7e,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_class_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x7e,0xd4,0x6a,0xf6,0x00,0x00] +0x0a,0x00,0x7e,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_class_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x7e,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: v_cmp_class_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x7e,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x7e,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_class_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x7e,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_class_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x7e,0xd4,0x7b,0xfa,0x01,0x00] 
0x0a,0x00,0x7e,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_class_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x7e,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_class_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x7e,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_class_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x7e,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_class_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x7e,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x7e,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_class_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x7e,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_class_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x7e,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_class_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x7e,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_class_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x7e,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x7e,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_class_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x7e,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_class_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x7e,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_class_f32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x7e,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_class_f32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x7e,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x00,0x7e,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_class_f32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x7e,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_class_f32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x7e,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_class_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x7e,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_class_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x7e,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x7e,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_class_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x7e,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_class_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x7e,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_class_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x7e,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_class_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x7e,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x7e,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_class_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x7e,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_class_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x7e,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_class_f32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x7e,0xd4,0xf0,0xfa,0x00,0x00] -# W64: v_cmp_class_f32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x7e,0xd4,0xf0,0xfa,0x00,0x00] 0x6a,0x00,0x7e,0xd4,0xf0,0xfa,0x00,0x00 +# W32: v_cmp_class_f32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x7e,0xd4,0xf0,0xfa,0x00,0x00] +# W64: v_cmp_class_f32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x7e,0xd4,0xf0,0xfa,0x00,0x00] +0x7a,0x00,0x7e,0xd4,0xfd,0xd4,0x00,0x00 # W32: v_cmp_class_f32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x7e,0xd4,0xfd,0xd4,0x00,0x00] # W64: v_cmp_class_f32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x7e,0xd4,0xfd,0xd4,0x00,0x00] -0x7a,0x00,0x7e,0xd4,0xfd,0xd4,0x00,0x00 -# GFX11: v_cmp_class_f32_e64 null, -|0xaf123456|, vcc_hi ; encoding: [0x7c,0x01,0x7e,0xd4,0xff,0xd6,0x00,0x20,0x56,0x34,0x12,0xaf] 0x7c,0x01,0x7e,0xd4,0xff,0xd6,0x00,0x20,0x56,0x34,0x12,0xaf +# GFX11: v_cmp_class_f32_e64 null, -|0xaf123456|, vcc_hi ; encoding: [0x7c,0x01,0x7e,0xd4,0xff,0xd6,0x00,0x20,0x56,0x34,0x12,0xaf] -# W32: 
v_cmp_class_f64_e64 s10, v[1:2], v2 ; encoding: [0x0a,0x00,0x7f,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_class_f64_e64 s[10:11], v[1:2], v2 ; encoding: [0x0a,0x00,0x7f,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x7f,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_class_f64_e64 s10, v[1:2], v2 ; encoding: [0x0a,0x00,0x7f,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_class_f64_e64 s[10:11], v[1:2], v2 ; encoding: [0x0a,0x00,0x7f,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_class_f64_e64 s10, v[1:2], v255 ; encoding: [0x0a,0x00,0x7f,0xd4,0x01,0xff,0x03,0x00] -# W64: v_cmp_class_f64_e64 s[10:11], v[1:2], v255 ; encoding: [0x0a,0x00,0x7f,0xd4,0x01,0xff,0x03,0x00] 0x0a,0x00,0x7f,0xd4,0x01,0xff,0x03,0x00 +# W32: v_cmp_class_f64_e64 s10, v[1:2], v255 ; encoding: [0x0a,0x00,0x7f,0xd4,0x01,0xff,0x03,0x00] +# W64: v_cmp_class_f64_e64 s[10:11], v[1:2], v255 ; encoding: [0x0a,0x00,0x7f,0xd4,0x01,0xff,0x03,0x00] -# W32: v_cmp_class_f64_e64 s10, v[1:2], s2 ; encoding: [0x0a,0x00,0x7f,0xd4,0x01,0x05,0x00,0x00] -# W64: v_cmp_class_f64_e64 s[10:11], v[1:2], s2 ; encoding: [0x0a,0x00,0x7f,0xd4,0x01,0x05,0x00,0x00] 0x0a,0x00,0x7f,0xd4,0x01,0x05,0x00,0x00 +# W32: v_cmp_class_f64_e64 s10, v[1:2], s2 ; encoding: [0x0a,0x00,0x7f,0xd4,0x01,0x05,0x00,0x00] +# W64: v_cmp_class_f64_e64 s[10:11], v[1:2], s2 ; encoding: [0x0a,0x00,0x7f,0xd4,0x01,0x05,0x00,0x00] -# W32: v_cmp_class_f64_e64 s10, v[1:2], s105 ; encoding: [0x0a,0x00,0x7f,0xd4,0x01,0xd3,0x00,0x00] -# W64: v_cmp_class_f64_e64 s[10:11], v[1:2], s105 ; encoding: [0x0a,0x00,0x7f,0xd4,0x01,0xd3,0x00,0x00] 0x0a,0x00,0x7f,0xd4,0x01,0xd3,0x00,0x00 +# W32: v_cmp_class_f64_e64 s10, v[1:2], s105 ; encoding: [0x0a,0x00,0x7f,0xd4,0x01,0xd3,0x00,0x00] +# W64: v_cmp_class_f64_e64 s[10:11], v[1:2], s105 ; encoding: [0x0a,0x00,0x7f,0xd4,0x01,0xd3,0x00,0x00] +0x0a,0x00,0x7f,0xd4,0xfe,0xf7,0x00,0x00 # W32: v_cmp_class_f64_e64 s10, v[254:255], ttmp15 ; encoding: [0x0a,0x00,0x7f,0xd4,0xfe,0xf7,0x00,0x00] # W64: v_cmp_class_f64_e64 s[10:11], v[254:255], ttmp15 ; encoding: [0x0a,0x00,0x7f,0xd4,0xfe,0xf7,0x00,0x00] -0x0a,0x00,0x7f,0xd4,0xfe,0xf7,0x00,0x00 -# W32: v_cmp_class_f64_e64 s10, s[2:3], vcc_hi ; encoding: [0x0a,0x00,0x7f,0xd4,0x02,0xd6,0x00,0x00] -# W64: v_cmp_class_f64_e64 s[10:11], s[2:3], vcc_hi ; encoding: [0x0a,0x00,0x7f,0xd4,0x02,0xd6,0x00,0x00] 0x0a,0x00,0x7f,0xd4,0x02,0xd6,0x00,0x00 +# W32: v_cmp_class_f64_e64 s10, s[2:3], vcc_hi ; encoding: [0x0a,0x00,0x7f,0xd4,0x02,0xd6,0x00,0x00] +# W64: v_cmp_class_f64_e64 s[10:11], s[2:3], vcc_hi ; encoding: [0x0a,0x00,0x7f,0xd4,0x02,0xd6,0x00,0x00] +0x0a,0x00,0x7f,0xd4,0x68,0xd4,0x00,0x00 # W32: v_cmp_class_f64_e64 s10, s[104:105], vcc_lo ; encoding: [0x0a,0x00,0x7f,0xd4,0x68,0xd4,0x00,0x00] # W64: v_cmp_class_f64_e64 s[10:11], s[104:105], vcc_lo ; encoding: [0x0a,0x00,0x7f,0xd4,0x68,0xd4,0x00,0x00] -0x0a,0x00,0x7f,0xd4,0x68,0xd4,0x00,0x00 -# W32: v_cmp_class_f64_e64 s10, vcc, m0 ; encoding: [0x0a,0x00,0x7f,0xd4,0x6a,0xfa,0x00,0x00] -# W64: v_cmp_class_f64_e64 s[10:11], vcc, m0 ; encoding: [0x0a,0x00,0x7f,0xd4,0x6a,0xfa,0x00,0x00] 0x0a,0x00,0x7f,0xd4,0x6a,0xfa,0x00,0x00 +# W32: v_cmp_class_f64_e64 s10, vcc, m0 ; encoding: [0x0a,0x00,0x7f,0xd4,0x6a,0xfa,0x00,0x00] +# W64: v_cmp_class_f64_e64 s[10:11], vcc, m0 ; encoding: [0x0a,0x00,0x7f,0xd4,0x6a,0xfa,0x00,0x00] +0x0a,0x00,0x7f,0xd4,0x7a,0xfe,0x00,0x00 # W32: v_cmp_class_f64_e64 s10, ttmp[14:15], exec_hi ; encoding: [0x0a,0x00,0x7f,0xd4,0x7a,0xfe,0x00,0x00] # W64: v_cmp_class_f64_e64 s[10:11], ttmp[14:15], exec_hi ; encoding: [0x0a,0x00,0x7f,0xd4,0x7a,0xfe,0x00,0x00] 
-0x0a,0x00,0x7f,0xd4,0x7a,0xfe,0x00,0x00 -# W32: v_cmp_class_f64_e64 s10, exec, exec_lo ; encoding: [0x0a,0x00,0x7f,0xd4,0x7e,0xfc,0x00,0x00] -# W64: v_cmp_class_f64_e64 s[10:11], exec, exec_lo ; encoding: [0x0a,0x00,0x7f,0xd4,0x7e,0xfc,0x00,0x00] 0x0a,0x00,0x7f,0xd4,0x7e,0xfc,0x00,0x00 +# W32: v_cmp_class_f64_e64 s10, exec, exec_lo ; encoding: [0x0a,0x00,0x7f,0xd4,0x7e,0xfc,0x00,0x00] +# W64: v_cmp_class_f64_e64 s[10:11], exec, exec_lo ; encoding: [0x0a,0x00,0x7f,0xd4,0x7e,0xfc,0x00,0x00] -# W32: v_cmp_class_f64_e64 s10, null, null ; encoding: [0x0a,0x00,0x7f,0xd4,0x7c,0xf8,0x00,0x00] -# W64: v_cmp_class_f64_e64 s[10:11], null, null ; encoding: [0x0a,0x00,0x7f,0xd4,0x7c,0xf8,0x00,0x00] 0x0a,0x00,0x7f,0xd4,0x7c,0xf8,0x00,0x00 +# W32: v_cmp_class_f64_e64 s10, null, null ; encoding: [0x0a,0x00,0x7f,0xd4,0x7c,0xf8,0x00,0x00] +# W64: v_cmp_class_f64_e64 s[10:11], null, null ; encoding: [0x0a,0x00,0x7f,0xd4,0x7c,0xf8,0x00,0x00] -# W32: v_cmp_class_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x7f,0xd4,0xc1,0x82,0x01,0x00] -# W64: v_cmp_class_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x7f,0xd4,0xc1,0x82,0x01,0x00] 0x68,0x00,0x7f,0xd4,0xc1,0x82,0x01,0x00 +# W32: v_cmp_class_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x7f,0xd4,0xc1,0x82,0x01,0x00] +# W64: v_cmp_class_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x7f,0xd4,0xc1,0x82,0x01,0x00] -# W32: v_cmp_class_f64_e64 vcc_lo, 0.5, 0.5 ; encoding: [0x6a,0x00,0x7f,0xd4,0xf0,0xe0,0x01,0x00] -# W64: v_cmp_class_f64_e64 vcc, 0.5, 0.5 ; encoding: [0x6a,0x00,0x7f,0xd4,0xf0,0xe0,0x01,0x00] 0x6a,0x00,0x7f,0xd4,0xf0,0xe0,0x01,0x00 +# W32: v_cmp_class_f64_e64 vcc_lo, 0.5, 0.5 ; encoding: [0x6a,0x00,0x7f,0xd4,0xf0,0xe0,0x01,0x00] +# W64: v_cmp_class_f64_e64 vcc, 0.5, 0.5 ; encoding: [0x6a,0x00,0x7f,0xd4,0xf0,0xe0,0x01,0x00] +0x7a,0x01,0x7f,0xd4,0xfd,0xfa,0x01,0x20 # W32: v_cmp_class_f64_e64 ttmp14, -|src_scc|, src_scc ; encoding: [0x7a,0x01,0x7f,0xd4,0xfd,0xfa,0x01,0x20] # W64: v_cmp_class_f64_e64 ttmp[14:15], -|src_scc|, src_scc ; encoding: [0x7a,0x01,0x7f,0xd4,0xfd,0xfa,0x01,0x20] -0x7a,0x01,0x7f,0xd4,0xfd,0xfa,0x01,0x20 -# GFX11: v_cmp_class_f64_e64 null, 0xaf123456, 0xaf123456 ; encoding: [0x7c,0x00,0x7f,0xd4,0xff,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7c,0x00,0x7f,0xd4,0xff,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmp_class_f64_e64 null, 0xaf123456, 0xaf123456 ; encoding: [0x7c,0x00,0x7f,0xd4,0xff,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_eq_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x02,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_eq_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x02,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x02,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_eq_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x02,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_eq_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x02,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_eq_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x02,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_eq_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x02,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x02,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_eq_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x02,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_eq_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x02,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_eq_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x02,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_eq_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x02,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x02,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_eq_f16_e64 s10, s1, s2 ; encoding: 
[0x0a,0x00,0x02,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_eq_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x02,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_eq_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x02,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_eq_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x02,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x02,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_eq_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x02,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_eq_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x02,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_eq_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x02,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_eq_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x02,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x02,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_eq_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x02,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_eq_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x02,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_eq_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x02,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_eq_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x02,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x0a,0x00,0x02,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_eq_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x02,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_eq_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x02,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_eq_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x02,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_eq_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x02,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x02,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_eq_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x02,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_eq_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x02,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_eq_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x02,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_eq_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x02,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x02,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_eq_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x02,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_eq_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x02,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_eq_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x02,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_eq_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x02,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x02,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_eq_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x02,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_eq_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x02,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_eq_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x02,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_eq_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x02,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x01,0x02,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_eq_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x02,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_eq_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x02,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_eq_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x02,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_eq_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x02,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x02,0xd4,0x7c,0xfc,0x00,0x00 
+# W32: v_cmp_eq_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x02,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_eq_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x02,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_eq_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x02,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_eq_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x02,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x02,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_eq_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x02,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_eq_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x02,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_eq_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x02,0xd4,0xf0,0xfa,0x00,0x40] -# W64: v_cmp_eq_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x02,0xd4,0xf0,0xfa,0x00,0x40] 0x6a,0x00,0x02,0xd4,0xf0,0xfa,0x00,0x40 +# W32: v_cmp_eq_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x02,0xd4,0xf0,0xfa,0x00,0x40] +# W64: v_cmp_eq_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x02,0xd4,0xf0,0xfa,0x00,0x40] +0x7a,0x02,0x02,0xd4,0xfd,0xd4,0x00,0x20 # W32: v_cmp_eq_f16_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x02,0xd4,0xfd,0xd4,0x00,0x20] # W64: v_cmp_eq_f16_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x02,0xd4,0xfd,0xd4,0x00,0x20] -0x7a,0x02,0x02,0xd4,0xfd,0xd4,0x00,0x20 -# GFX11: v_cmp_eq_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x02,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7c,0x83,0x02,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmp_eq_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x02,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_eq_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x12,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_eq_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x12,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x12,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_eq_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x12,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_eq_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x12,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_eq_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x12,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_eq_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x12,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x12,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_eq_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x12,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_eq_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x12,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_eq_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x12,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_eq_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x12,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x12,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_eq_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x12,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_eq_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x12,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_eq_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x12,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_eq_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x12,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x12,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_eq_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x12,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_eq_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x12,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_eq_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x12,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_eq_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: 
[0x0a,0x00,0x12,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x12,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_eq_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x12,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_eq_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x12,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_eq_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x12,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W64: v_cmp_eq_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x12,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x0a,0x00,0x12,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# W32: v_cmp_eq_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x12,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] +# W64: v_cmp_eq_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x12,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_eq_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x12,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_eq_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x12,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x12,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_eq_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x12,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_eq_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x12,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_eq_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x12,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_eq_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x12,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x12,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_eq_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x12,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_eq_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x12,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_eq_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x12,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_eq_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x12,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x12,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_eq_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x12,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_eq_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x12,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_eq_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x12,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_eq_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x12,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x01,0x12,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_eq_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x12,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_eq_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x12,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_eq_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x12,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_eq_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x12,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x12,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_eq_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x12,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_eq_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x12,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_eq_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x12,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_eq_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x12,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x12,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_eq_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x12,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_eq_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x12,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_eq_f32_e64 vcc_lo, 0.5, -m0 ; encoding: 
[0x6a,0x00,0x12,0xd4,0xf0,0xfa,0x00,0x40] -# W64: v_cmp_eq_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x12,0xd4,0xf0,0xfa,0x00,0x40] 0x6a,0x00,0x12,0xd4,0xf0,0xfa,0x00,0x40 +# W32: v_cmp_eq_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x12,0xd4,0xf0,0xfa,0x00,0x40] +# W64: v_cmp_eq_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x12,0xd4,0xf0,0xfa,0x00,0x40] +0x7a,0x02,0x12,0xd4,0xfd,0xd4,0x00,0x20 # W32: v_cmp_eq_f32_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x12,0xd4,0xfd,0xd4,0x00,0x20] # W64: v_cmp_eq_f32_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x12,0xd4,0xfd,0xd4,0x00,0x20] -0x7a,0x02,0x12,0xd4,0xfd,0xd4,0x00,0x20 -# GFX11: v_cmp_eq_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x12,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] 0x7c,0x83,0x12,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf +# GFX11: v_cmp_eq_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x12,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] -# W32: v_cmp_eq_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x22,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_eq_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x22,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x22,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_eq_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x22,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_eq_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x22,0xd4,0x01,0x05,0x02,0x00] +0x0a,0x00,0x22,0xd4,0xfe,0xfd,0x03,0x00 # W32: v_cmp_eq_f64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x22,0xd4,0xfe,0xfd,0x03,0x00] # W64: v_cmp_eq_f64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x22,0xd4,0xfe,0xfd,0x03,0x00] -0x0a,0x00,0x22,0xd4,0xfe,0xfd,0x03,0x00 -# W32: v_cmp_eq_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x22,0xd4,0x02,0x08,0x00,0x00] -# W64: v_cmp_eq_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x22,0xd4,0x02,0x08,0x00,0x00] 0x0a,0x00,0x22,0xd4,0x02,0x08,0x00,0x00 +# W32: v_cmp_eq_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x22,0xd4,0x02,0x08,0x00,0x00] +# W64: v_cmp_eq_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x22,0xd4,0x02,0x08,0x00,0x00] +0x0a,0x00,0x22,0xd4,0x68,0xd0,0x00,0x00 # W32: v_cmp_eq_f64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x22,0xd4,0x68,0xd0,0x00,0x00] # W64: v_cmp_eq_f64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x22,0xd4,0x68,0xd0,0x00,0x00] -0x0a,0x00,0x22,0xd4,0x68,0xd0,0x00,0x00 -# W32: v_cmp_eq_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x22,0xd4,0x6a,0xf4,0x00,0x00] -# W64: v_cmp_eq_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x22,0xd4,0x6a,0xf4,0x00,0x00] 0x0a,0x00,0x22,0xd4,0x6a,0xf4,0x00,0x00 +# W32: v_cmp_eq_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x22,0xd4,0x6a,0xf4,0x00,0x00] +# W64: v_cmp_eq_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x22,0xd4,0x6a,0xf4,0x00,0x00] +0x0a,0x00,0x22,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_eq_f64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x22,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: v_cmp_eq_f64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x22,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x22,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_eq_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x22,0xd4,0x7e,0xfa,0x01,0x20] -# W64: v_cmp_eq_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x22,0xd4,0x7e,0xfa,0x01,0x20] 
0x0a,0x01,0x22,0xd4,0x7e,0xfa,0x01,0x20 +# W32: v_cmp_eq_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x22,0xd4,0x7e,0xfa,0x01,0x20] +# W64: v_cmp_eq_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x22,0xd4,0x7e,0xfa,0x01,0x20] -# W32: v_cmp_eq_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x22,0xd4,0x7c,0xe0,0x01,0x00] -# W64: v_cmp_eq_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x22,0xd4,0x7c,0xe0,0x01,0x00] 0x0a,0x00,0x22,0xd4,0x7c,0xe0,0x01,0x00 +# W32: v_cmp_eq_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x22,0xd4,0x7c,0xe0,0x01,0x00] +# W64: v_cmp_eq_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x22,0xd4,0x7c,0xe0,0x01,0x00] -# W32: v_cmp_eq_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x22,0xd4,0xc1,0x82,0x01,0x00] -# W64: v_cmp_eq_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x22,0xd4,0xc1,0x82,0x01,0x00] 0x68,0x00,0x22,0xd4,0xc1,0x82,0x01,0x00 +# W32: v_cmp_eq_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x22,0xd4,0xc1,0x82,0x01,0x00] +# W64: v_cmp_eq_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x22,0xd4,0xc1,0x82,0x01,0x00] -# W32: v_cmp_eq_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x22,0xd4,0xf0,0xf8,0x00,0x00] -# W64: v_cmp_eq_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x22,0xd4,0xf0,0xf8,0x00,0x00] 0x6a,0x00,0x22,0xd4,0xf0,0xf8,0x00,0x00 +# W32: v_cmp_eq_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x22,0xd4,0xf0,0xf8,0x00,0x00] +# W64: v_cmp_eq_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x22,0xd4,0xf0,0xf8,0x00,0x00] +0x7a,0x03,0x22,0xd4,0xfd,0xfc,0x00,0x60 # W32: v_cmp_eq_f64_e64 ttmp14, -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x22,0xd4,0xfd,0xfc,0x00,0x60] # W64: v_cmp_eq_f64_e64 ttmp[14:15], -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x22,0xd4,0xfd,0xfc,0x00,0x60] -0x7a,0x03,0x22,0xd4,0xfd,0xfc,0x00,0x60 -# GFX11: v_cmp_eq_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x22,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7c,0x82,0x22,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf +# GFX11: v_cmp_eq_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x22,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -# W32: v_cmp_eq_i16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x32,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_eq_i16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x32,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x32,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_eq_i16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x32,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_eq_i16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x32,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_eq_i16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x32,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_eq_i16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x32,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x32,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_eq_i16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x32,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_eq_i16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x32,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_eq_i16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x32,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_eq_i16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x32,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x32,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_eq_i16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x32,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_eq_i16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x32,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_eq_i16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x32,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_eq_i16_e64 s[10:11], s105, s105 ; encoding: 
[0x0a,0x00,0x32,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x32,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_eq_i16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x32,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_eq_i16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x32,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_eq_i16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x32,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_eq_i16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x32,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x32,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_eq_i16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x32,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_eq_i16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x32,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_eq_i16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x32,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_eq_i16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x32,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x0a,0x00,0x32,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_eq_i16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x32,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_eq_i16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x32,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_eq_i16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x32,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_eq_i16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x32,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x32,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_eq_i16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x32,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_eq_i16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x32,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_eq_i16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x32,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] -# W64: v_cmp_eq_i16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x32,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] 0x0a,0x00,0x32,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_eq_i16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x32,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64: v_cmp_eq_i16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x32,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] -# W32: v_cmp_eq_i16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x32,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_eq_i16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x32,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x32,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_eq_i16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x32,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_eq_i16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x32,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_eq_i16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x32,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_eq_i16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x32,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x00,0x32,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_eq_i16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x32,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_eq_i16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x32,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_eq_i16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x32,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_eq_i16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x32,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x32,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_eq_i16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x32,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_eq_i16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x32,0xd4,0x7c,0xfc,0x00,0x00] -# W32: 
v_cmp_eq_i16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x32,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_eq_i16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x32,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x32,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_eq_i16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x32,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_eq_i16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x32,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_eq_i16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x32,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] -# W64: v_cmp_eq_i16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x32,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] 0x6a,0x00,0x32,0xd4,0xf0,0xfa,0x00,0x00 +# W32: v_cmp_eq_i16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x32,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64: v_cmp_eq_i16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x32,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] -# W32: v_cmp_eq_i16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x32,0xd4,0xfd,0xd4,0x00,0x00] -# W64: v_cmp_eq_i16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x32,0xd4,0xfd,0xd4,0x00,0x00] 0x7a,0x00,0x32,0xd4,0xfd,0xd4,0x00,0x00 +# W32: v_cmp_eq_i16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x32,0xd4,0xfd,0xd4,0x00,0x00] +# W64: v_cmp_eq_i16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x32,0xd4,0xfd,0xd4,0x00,0x00] -# GFX11: v_cmp_eq_i16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x32,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7c,0x00,0x32,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmp_eq_i16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x32,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_eq_i32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x42,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_eq_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x42,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x42,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_eq_i32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x42,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_eq_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x42,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_eq_i32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x42,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_eq_i32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x42,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x42,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_eq_i32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x42,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_eq_i32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x42,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_eq_i32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x42,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_eq_i32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x42,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x42,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_eq_i32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x42,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_eq_i32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x42,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_eq_i32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x42,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_eq_i32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x42,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x42,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_eq_i32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x42,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_eq_i32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x42,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_eq_i32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x42,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_eq_i32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: 
[0x0a,0x00,0x42,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x42,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_eq_i32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x42,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_eq_i32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x42,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_eq_i32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x42,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W64: v_cmp_eq_i32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x42,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x0a,0x00,0x42,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# W32: v_cmp_eq_i32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x42,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] +# W64: v_cmp_eq_i32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x42,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_eq_i32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x42,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_eq_i32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x42,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x42,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_eq_i32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x42,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_eq_i32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x42,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_eq_i32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x42,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_eq_i32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x42,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x42,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_eq_i32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x42,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_eq_i32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x42,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_eq_i32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x42,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_eq_i32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x42,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x42,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_eq_i32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x42,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_eq_i32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x42,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_eq_i32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x42,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_eq_i32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x42,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x00,0x42,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_eq_i32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x42,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_eq_i32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x42,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_eq_i32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x42,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_eq_i32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x42,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x42,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_eq_i32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x42,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_eq_i32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x42,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_eq_i32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x42,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_eq_i32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x42,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x42,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_eq_i32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x42,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_eq_i32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x42,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_eq_i32_e64 vcc_lo, 0.5, m0 ; encoding: 
[0x6a,0x00,0x42,0xd4,0xf0,0xfa,0x00,0x00] -# W64: v_cmp_eq_i32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x42,0xd4,0xf0,0xfa,0x00,0x00] 0x6a,0x00,0x42,0xd4,0xf0,0xfa,0x00,0x00 +# W32: v_cmp_eq_i32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x42,0xd4,0xf0,0xfa,0x00,0x00] +# W64: v_cmp_eq_i32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x42,0xd4,0xf0,0xfa,0x00,0x00] -# W32: v_cmp_eq_i32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x42,0xd4,0xfd,0xd4,0x00,0x00] -# W64: v_cmp_eq_i32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x42,0xd4,0xfd,0xd4,0x00,0x00] 0x7a,0x00,0x42,0xd4,0xfd,0xd4,0x00,0x00 +# W32: v_cmp_eq_i32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x42,0xd4,0xfd,0xd4,0x00,0x00] +# W64: v_cmp_eq_i32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x42,0xd4,0xfd,0xd4,0x00,0x00] -# GFX11: v_cmp_eq_i32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x42,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7c,0x00,0x42,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmp_eq_i32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x42,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_eq_i64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x52,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_eq_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x52,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x52,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_eq_i64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x52,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_eq_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x52,0xd4,0x01,0x05,0x02,0x00] +0x0a,0x00,0x52,0xd4,0xfe,0xfd,0x03,0x00 # W32: v_cmp_eq_i64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x52,0xd4,0xfe,0xfd,0x03,0x00] # W64: v_cmp_eq_i64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x52,0xd4,0xfe,0xfd,0x03,0x00] -0x0a,0x00,0x52,0xd4,0xfe,0xfd,0x03,0x00 -# W32: v_cmp_eq_i64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x52,0xd4,0x02,0x08,0x00,0x00] -# W64: v_cmp_eq_i64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x52,0xd4,0x02,0x08,0x00,0x00] 0x0a,0x00,0x52,0xd4,0x02,0x08,0x00,0x00 +# W32: v_cmp_eq_i64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x52,0xd4,0x02,0x08,0x00,0x00] +# W64: v_cmp_eq_i64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x52,0xd4,0x02,0x08,0x00,0x00] +0x0a,0x00,0x52,0xd4,0x68,0xd0,0x00,0x00 # W32: v_cmp_eq_i64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x52,0xd4,0x68,0xd0,0x00,0x00] # W64: v_cmp_eq_i64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x52,0xd4,0x68,0xd0,0x00,0x00] -0x0a,0x00,0x52,0xd4,0x68,0xd0,0x00,0x00 -# W32: v_cmp_eq_i64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x52,0xd4,0x6a,0xf4,0x00,0x00] -# W64: v_cmp_eq_i64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x52,0xd4,0x6a,0xf4,0x00,0x00] 0x0a,0x00,0x52,0xd4,0x6a,0xf4,0x00,0x00 +# W32: v_cmp_eq_i64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x52,0xd4,0x6a,0xf4,0x00,0x00] +# W64: v_cmp_eq_i64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x52,0xd4,0x6a,0xf4,0x00,0x00] +0x0a,0x00,0x52,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_eq_i64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x52,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: v_cmp_eq_i64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x52,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x52,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_eq_i64_e64 s10, exec, src_scc ; encoding: 
[0x0a,0x00,0x52,0xd4,0x7e,0xfa,0x01,0x00] -# W64: v_cmp_eq_i64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x52,0xd4,0x7e,0xfa,0x01,0x00] 0x0a,0x00,0x52,0xd4,0x7e,0xfa,0x01,0x00 +# W32: v_cmp_eq_i64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x52,0xd4,0x7e,0xfa,0x01,0x00] +# W64: v_cmp_eq_i64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x52,0xd4,0x7e,0xfa,0x01,0x00] -# W32: v_cmp_eq_i64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x52,0xd4,0x7c,0xe0,0x01,0x00] -# W64: v_cmp_eq_i64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x52,0xd4,0x7c,0xe0,0x01,0x00] 0x0a,0x00,0x52,0xd4,0x7c,0xe0,0x01,0x00 +# W32: v_cmp_eq_i64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x52,0xd4,0x7c,0xe0,0x01,0x00] +# W64: v_cmp_eq_i64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x52,0xd4,0x7c,0xe0,0x01,0x00] -# W32: v_cmp_eq_i64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x52,0xd4,0xc1,0x82,0x01,0x00] -# W64: v_cmp_eq_i64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x52,0xd4,0xc1,0x82,0x01,0x00] 0x68,0x00,0x52,0xd4,0xc1,0x82,0x01,0x00 +# W32: v_cmp_eq_i64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x52,0xd4,0xc1,0x82,0x01,0x00] +# W64: v_cmp_eq_i64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x52,0xd4,0xc1,0x82,0x01,0x00] -# W32: v_cmp_eq_i64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x52,0xd4,0xf0,0xf8,0x00,0x00] -# W64: v_cmp_eq_i64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x52,0xd4,0xf0,0xf8,0x00,0x00] 0x6a,0x00,0x52,0xd4,0xf0,0xf8,0x00,0x00 +# W32: v_cmp_eq_i64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x52,0xd4,0xf0,0xf8,0x00,0x00] +# W64: v_cmp_eq_i64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x52,0xd4,0xf0,0xf8,0x00,0x00] -# W32: v_cmp_eq_i64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x52,0xd4,0xfd,0xfc,0x00,0x00] -# W64: v_cmp_eq_i64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x52,0xd4,0xfd,0xfc,0x00,0x00] 0x7a,0x00,0x52,0xd4,0xfd,0xfc,0x00,0x00 +# W32: v_cmp_eq_i64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x52,0xd4,0xfd,0xfc,0x00,0x00] +# W64: v_cmp_eq_i64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x52,0xd4,0xfd,0xfc,0x00,0x00] -# GFX11: v_cmp_eq_i64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x52,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7c,0x00,0x52,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmp_eq_i64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x52,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_eq_u16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x3a,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_eq_u16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x3a,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x3a,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_eq_u16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x3a,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_eq_u16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x3a,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_eq_u16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x3a,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_eq_u16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x3a,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x3a,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_eq_u16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x3a,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_eq_u16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x3a,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_eq_u16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x3a,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_eq_u16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x3a,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x3a,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_eq_u16_e64 s10, s1, s2 ; encoding: 
[0x0a,0x00,0x3a,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_eq_u16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x3a,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_eq_u16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x3a,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_eq_u16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x3a,0xd4,0x69,0xd2,0x00,0x00]
 0x0a,0x00,0x3a,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_eq_u16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x3a,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_eq_u16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x3a,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_eq_u16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3a,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_eq_u16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3a,0xd4,0x6a,0xf6,0x00,0x00]
 0x0a,0x00,0x3a,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_eq_u16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3a,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_eq_u16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3a,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_eq_u16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3a,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_eq_u16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3a,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x0a,0x00,0x3a,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_eq_u16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3a,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_eq_u16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3a,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_eq_u16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x3a,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_eq_u16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x3a,0xd4,0x7b,0xfa,0x01,0x00]
 0x0a,0x00,0x3a,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_eq_u16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x3a,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_eq_u16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x3a,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_eq_u16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x3a,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# W64: v_cmp_eq_u16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x3a,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
 0x0a,0x00,0x3a,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_eq_u16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x3a,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64: v_cmp_eq_u16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x3a,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# W32: v_cmp_eq_u16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x3a,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_eq_u16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x3a,0xd4,0x7e,0x82,0x01,0x00]
 0x0a,0x00,0x3a,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_eq_u16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x3a,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_eq_u16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x3a,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_eq_u16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x3a,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_eq_u16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x3a,0xd4,0x7f,0xf8,0x00,0x00]
 0x0a,0x00,0x3a,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_eq_u16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x3a,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_eq_u16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x3a,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_eq_u16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x3a,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_eq_u16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x3a,0xd4,0x7c,0xfc,0x00,0x00]
 0x0a,0x00,0x3a,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_eq_u16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x3a,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_eq_u16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x3a,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_eq_u16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x3a,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_eq_u16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x3a,0xd4,0xc1,0xfe,0x00,0x00]
 0x68,0x00,0x3a,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_eq_u16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x3a,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_eq_u16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x3a,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_eq_u16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x3a,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# W64: v_cmp_eq_u16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x3a,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
 0x6a,0x00,0x3a,0xd4,0xf0,0xfa,0x00,0x00
+# W32: v_cmp_eq_u16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x3a,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64: v_cmp_eq_u16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x3a,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# W32: v_cmp_eq_u16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3a,0xd4,0xfd,0xd4,0x00,0x00]
-# W64: v_cmp_eq_u16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3a,0xd4,0xfd,0xd4,0x00,0x00]
 0x7a,0x00,0x3a,0xd4,0xfd,0xd4,0x00,0x00
+# W32: v_cmp_eq_u16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3a,0xd4,0xfd,0xd4,0x00,0x00]
+# W64: v_cmp_eq_u16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3a,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX11: v_cmp_eq_u16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x3a,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 0x7c,0x00,0x3a,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmp_eq_u16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x3a,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_eq_u32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x4a,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_eq_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x4a,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x4a,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_eq_u32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x4a,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_eq_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x4a,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_eq_u32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x4a,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_eq_u32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x4a,0xd4,0xff,0xff,0x03,0x00]
 0x0a,0x00,0x4a,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_eq_u32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x4a,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_eq_u32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x4a,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_eq_u32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x4a,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_eq_u32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x4a,0xd4,0x01,0x04,0x00,0x00]
 0x0a,0x00,0x4a,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_eq_u32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x4a,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_eq_u32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x4a,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_eq_u32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x4a,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_eq_u32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x4a,0xd4,0x69,0xd2,0x00,0x00]
 0x0a,0x00,0x4a,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_eq_u32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x4a,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_eq_u32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x4a,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_eq_u32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4a,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_eq_u32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4a,0xd4,0x6a,0xf6,0x00,0x00]
 0x0a,0x00,0x4a,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_eq_u32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4a,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_eq_u32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4a,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_eq_u32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4a,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_eq_u32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4a,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x0a,0x00,0x4a,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# W32: v_cmp_eq_u32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4a,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_eq_u32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4a,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_eq_u32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x4a,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_eq_u32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x4a,0xd4,0x7b,0xfa,0x01,0x00]
 0x0a,0x00,0x4a,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_eq_u32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x4a,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_eq_u32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x4a,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_eq_u32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x4a,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_eq_u32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x4a,0xd4,0x7d,0xe0,0x01,0x00]
 0x0a,0x00,0x4a,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_eq_u32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x4a,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_eq_u32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x4a,0xd4,0x7d,0xe0,0x01,0x00]
-# W32: v_cmp_eq_u32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x4a,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_eq_u32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x4a,0xd4,0x7e,0x82,0x01,0x00]
 0x0a,0x00,0x4a,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_eq_u32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x4a,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_eq_u32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x4a,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_eq_u32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x4a,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_eq_u32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x4a,0xd4,0x7f,0xf8,0x00,0x00]
 0x0a,0x00,0x4a,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_eq_u32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x4a,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_eq_u32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x4a,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_eq_u32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x4a,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_eq_u32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x4a,0xd4,0x7c,0xfc,0x00,0x00]
 0x0a,0x00,0x4a,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_eq_u32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x4a,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_eq_u32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x4a,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_eq_u32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x4a,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_eq_u32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x4a,0xd4,0xc1,0xfe,0x00,0x00]
 0x68,0x00,0x4a,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_eq_u32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x4a,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_eq_u32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x4a,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_eq_u32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x4a,0xd4,0xf0,0xfa,0x00,0x00]
-# W64: v_cmp_eq_u32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x4a,0xd4,0xf0,0xfa,0x00,0x00]
 0x6a,0x00,0x4a,0xd4,0xf0,0xfa,0x00,0x00
+# W32: v_cmp_eq_u32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x4a,0xd4,0xf0,0xfa,0x00,0x00]
+# W64: v_cmp_eq_u32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x4a,0xd4,0xf0,0xfa,0x00,0x00]
-# W32: v_cmp_eq_u32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4a,0xd4,0xfd,0xd4,0x00,0x00]
-# W64: v_cmp_eq_u32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4a,0xd4,0xfd,0xd4,0x00,0x00]
 0x7a,0x00,0x4a,0xd4,0xfd,0xd4,0x00,0x00
+# W32: v_cmp_eq_u32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4a,0xd4,0xfd,0xd4,0x00,0x00]
+# W64: v_cmp_eq_u32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4a,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX11: v_cmp_eq_u32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x4a,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7c,0x00,0x4a,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_eq_u32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x4a,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_eq_u64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5a,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_eq_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5a,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x5a,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_eq_u64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5a,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_eq_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5a,0xd4,0x01,0x05,0x02,0x00]
+0x0a,0x00,0x5a,0xd4,0xfe,0xfd,0x03,0x00
 # W32: v_cmp_eq_u64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x5a,0xd4,0xfe,0xfd,0x03,0x00]
 # W64: v_cmp_eq_u64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x5a,0xd4,0xfe,0xfd,0x03,0x00]
-0x0a,0x00,0x5a,0xd4,0xfe,0xfd,0x03,0x00
-# W32: v_cmp_eq_u64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5a,0xd4,0x02,0x08,0x00,0x00]
-# W64: v_cmp_eq_u64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5a,0xd4,0x02,0x08,0x00,0x00]
 0x0a,0x00,0x5a,0xd4,0x02,0x08,0x00,0x00
+# W32: v_cmp_eq_u64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5a,0xd4,0x02,0x08,0x00,0x00]
+# W64: v_cmp_eq_u64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5a,0xd4,0x02,0x08,0x00,0x00]
+0x0a,0x00,0x5a,0xd4,0x68,0xd0,0x00,0x00
 # W32: v_cmp_eq_u64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x5a,0xd4,0x68,0xd0,0x00,0x00]
 # W64: v_cmp_eq_u64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x5a,0xd4,0x68,0xd0,0x00,0x00]
-0x0a,0x00,0x5a,0xd4,0x68,0xd0,0x00,0x00
-# W32: v_cmp_eq_u64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5a,0xd4,0x6a,0xf4,0x00,0x00]
-# W64: v_cmp_eq_u64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5a,0xd4,0x6a,0xf4,0x00,0x00]
 0x0a,0x00,0x5a,0xd4,0x6a,0xf4,0x00,0x00
+# W32: v_cmp_eq_u64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5a,0xd4,0x6a,0xf4,0x00,0x00]
+# W64: v_cmp_eq_u64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5a,0xd4,0x6a,0xf4,0x00,0x00]
+0x0a,0x00,0x5a,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
 # W32: v_cmp_eq_u64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x5a,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 # W64: v_cmp_eq_u64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x5a,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x5a,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
-# W32: v_cmp_eq_u64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x5a,0xd4,0x7e,0xfa,0x01,0x00]
-# W64: v_cmp_eq_u64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x5a,0xd4,0x7e,0xfa,0x01,0x00]
 0x0a,0x00,0x5a,0xd4,0x7e,0xfa,0x01,0x00
+# W32: v_cmp_eq_u64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x5a,0xd4,0x7e,0xfa,0x01,0x00]
+# W64: v_cmp_eq_u64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x5a,0xd4,0x7e,0xfa,0x01,0x00]
-# W32: v_cmp_eq_u64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x5a,0xd4,0x7c,0xe0,0x01,0x00]
-# W64: v_cmp_eq_u64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x5a,0xd4,0x7c,0xe0,0x01,0x00]
 0x0a,0x00,0x5a,0xd4,0x7c,0xe0,0x01,0x00
+# W32: v_cmp_eq_u64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x5a,0xd4,0x7c,0xe0,0x01,0x00]
+# W64: v_cmp_eq_u64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x5a,0xd4,0x7c,0xe0,0x01,0x00]
-# W32: v_cmp_eq_u64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x5a,0xd4,0xc1,0x82,0x01,0x00]
-# W64: v_cmp_eq_u64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x5a,0xd4,0xc1,0x82,0x01,0x00]
 0x68,0x00,0x5a,0xd4,0xc1,0x82,0x01,0x00
+# W32: v_cmp_eq_u64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x5a,0xd4,0xc1,0x82,0x01,0x00]
+# W64: v_cmp_eq_u64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x5a,0xd4,0xc1,0x82,0x01,0x00]
-# W32: v_cmp_eq_u64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x5a,0xd4,0xf0,0xf8,0x00,0x00]
-# W64: v_cmp_eq_u64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x5a,0xd4,0xf0,0xf8,0x00,0x00]
 0x6a,0x00,0x5a,0xd4,0xf0,0xf8,0x00,0x00
+# W32: v_cmp_eq_u64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x5a,0xd4,0xf0,0xf8,0x00,0x00]
+# W64: v_cmp_eq_u64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x5a,0xd4,0xf0,0xf8,0x00,0x00]
-# W32: v_cmp_eq_u64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x5a,0xd4,0xfd,0xfc,0x00,0x00]
-# W64: v_cmp_eq_u64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x5a,0xd4,0xfd,0xfc,0x00,0x00]
 0x7a,0x00,0x5a,0xd4,0xfd,0xfc,0x00,0x00
+# W32: v_cmp_eq_u64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x5a,0xd4,0xfd,0xfc,0x00,0x00]
+# W64: v_cmp_eq_u64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x5a,0xd4,0xfd,0xfc,0x00,0x00]
-# GFX11: v_cmp_eq_u64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x5a,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7c,0x00,0x5a,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_eq_u64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x5a,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_f_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x00,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_f_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x00,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x00,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_f_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x00,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_f_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x00,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_f_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x00,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_f_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x00,0xd4,0xff,0xff,0x03,0x00]
 0x0a,0x00,0x00,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_f_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x00,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_f_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x00,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_f_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x00,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_f_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x00,0xd4,0x01,0x04,0x00,0x00]
 0x0a,0x00,0x00,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_f_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x00,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_f_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x00,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_f_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x00,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_f_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x00,0xd4,0x69,0xd2,0x00,0x00]
 0x0a,0x00,0x00,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_f_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x00,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_f_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x00,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_f_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x00,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_f_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x00,0xd4,0x6a,0xf6,0x00,0x00]
 0x0a,0x00,0x00,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_f_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x00,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_f_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x00,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_f_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x00,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_f_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x00,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x0a,0x00,0x00,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_f_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x00,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_f_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x00,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_f_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x00,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_f_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x00,0xd4,0x7b,0xfa,0x01,0x00]
 0x0a,0x00,0x00,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_f_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x00,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_f_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x00,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_f_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x00,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_f_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x00,0xd4,0x7d,0xe0,0x01,0x00]
 0x0a,0x00,0x00,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_f_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x00,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_f_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x00,0xd4,0x7d,0xe0,0x01,0x00]
-# W32: v_cmp_f_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x00,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_f_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x00,0xd4,0x7e,0x82,0x01,0x00]
 0x0a,0x00,0x00,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_f_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x00,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_f_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x00,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_f_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x00,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_f_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x00,0xd4,0x7f,0xf8,0x00,0x00]
 0x0a,0x01,0x00,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_f_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x00,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_f_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x00,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_f_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x00,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_f_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x00,0xd4,0x7c,0xfc,0x00,0x00]
 0x0a,0x00,0x00,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_f_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x00,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_f_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x00,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_f_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x00,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_f_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x00,0xd4,0xc1,0xfe,0x00,0x00]
 0x68,0x00,0x00,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_f_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x00,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_f_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x00,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_f_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x00,0xd4,0xf0,0xfa,0x00,0x40]
-# W64: v_cmp_f_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x00,0xd4,0xf0,0xfa,0x00,0x40]
 0x6a,0x00,0x00,0xd4,0xf0,0xfa,0x00,0x40
+# W32: v_cmp_f_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x00,0xd4,0xf0,0xfa,0x00,0x40]
+# W64: v_cmp_f_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x00,0xd4,0xf0,0xfa,0x00,0x40]
+0x7a,0x02,0x00,0xd4,0xfd,0xd4,0x00,0x20
 # W32: v_cmp_f_f16_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x00,0xd4,0xfd,0xd4,0x00,0x20]
 # W64: v_cmp_f_f16_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x00,0xd4,0xfd,0xd4,0x00,0x20]
-0x7a,0x02,0x00,0xd4,0xfd,0xd4,0x00,0x20
-# GFX11: v_cmp_f_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x00,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
 0x7c,0x83,0x00,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmp_f_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x00,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_f_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x10,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_f_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x10,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x10,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_f_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x10,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_f_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x10,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_f_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x10,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_f_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x10,0xd4,0xff,0xff,0x03,0x00]
 0x0a,0x00,0x10,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_f_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x10,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_f_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x10,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_f_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x10,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_f_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x10,0xd4,0x01,0x04,0x00,0x00]
 0x0a,0x00,0x10,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_f_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x10,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_f_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x10,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_f_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x10,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_f_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x10,0xd4,0x69,0xd2,0x00,0x00]
 0x0a,0x00,0x10,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_f_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x10,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_f_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x10,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_f_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x10,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_f_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x10,0xd4,0x6a,0xf6,0x00,0x00]
 0x0a,0x00,0x10,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_f_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x10,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_f_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x10,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_f_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x10,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_f_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x10,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x0a,0x00,0x10,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# W32: v_cmp_f_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x10,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_f_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x10,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_f_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x10,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_f_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x10,0xd4,0x7b,0xfa,0x01,0x00]
 0x0a,0x00,0x10,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_f_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x10,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_f_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x10,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_f_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x10,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_f_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x10,0xd4,0x7d,0xe0,0x01,0x00]
 0x0a,0x00,0x10,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_f_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x10,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_f_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x10,0xd4,0x7d,0xe0,0x01,0x00]
-# W32: v_cmp_f_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x10,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_f_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x10,0xd4,0x7e,0x82,0x01,0x00]
 0x0a,0x00,0x10,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_f_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x10,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_f_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x10,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_f_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x10,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_f_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x10,0xd4,0x7f,0xf8,0x00,0x00]
 0x0a,0x01,0x10,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_f_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x10,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_f_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x10,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_f_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x10,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_f_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x10,0xd4,0x7c,0xfc,0x00,0x00]
 0x0a,0x00,0x10,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_f_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x10,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_f_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x10,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_f_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x10,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_f_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x10,0xd4,0xc1,0xfe,0x00,0x00]
 0x68,0x00,0x10,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_f_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x10,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_f_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x10,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_f_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x10,0xd4,0xf0,0xfa,0x00,0x40]
-# W64: v_cmp_f_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x10,0xd4,0xf0,0xfa,0x00,0x40]
 0x6a,0x00,0x10,0xd4,0xf0,0xfa,0x00,0x40
+# W32: v_cmp_f_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x10,0xd4,0xf0,0xfa,0x00,0x40]
+# W64: v_cmp_f_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x10,0xd4,0xf0,0xfa,0x00,0x40]
+0x7a,0x02,0x10,0xd4,0xfd,0xd4,0x00,0x20
 # W32: v_cmp_f_f32_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x10,0xd4,0xfd,0xd4,0x00,0x20]
 # W64: v_cmp_f_f32_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x10,0xd4,0xfd,0xd4,0x00,0x20]
-0x7a,0x02,0x10,0xd4,0xfd,0xd4,0x00,0x20
-# GFX11: v_cmp_f_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x10,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
 0x7c,0x83,0x10,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_f_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x10,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_f_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x20,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_f_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x20,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x20,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_f_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x20,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_f_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x20,0xd4,0x01,0x05,0x02,0x00]
+0x0a,0x00,0x20,0xd4,0xfe,0xfd,0x03,0x00
 # W32: v_cmp_f_f64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x20,0xd4,0xfe,0xfd,0x03,0x00]
 # W64: v_cmp_f_f64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x20,0xd4,0xfe,0xfd,0x03,0x00]
-0x0a,0x00,0x20,0xd4,0xfe,0xfd,0x03,0x00
-# W32: v_cmp_f_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x20,0xd4,0x02,0x08,0x00,0x00]
-# W64: v_cmp_f_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x20,0xd4,0x02,0x08,0x00,0x00]
 0x0a,0x00,0x20,0xd4,0x02,0x08,0x00,0x00
+# W32: v_cmp_f_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x20,0xd4,0x02,0x08,0x00,0x00]
+# W64: v_cmp_f_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x20,0xd4,0x02,0x08,0x00,0x00]
+0x0a,0x00,0x20,0xd4,0x68,0xd0,0x00,0x00
 # W32: v_cmp_f_f64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x20,0xd4,0x68,0xd0,0x00,0x00]
 # W64: v_cmp_f_f64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x20,0xd4,0x68,0xd0,0x00,0x00]
-0x0a,0x00,0x20,0xd4,0x68,0xd0,0x00,0x00
-# W32: v_cmp_f_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x20,0xd4,0x6a,0xf4,0x00,0x00]
-# W64: v_cmp_f_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x20,0xd4,0x6a,0xf4,0x00,0x00]
 0x0a,0x00,0x20,0xd4,0x6a,0xf4,0x00,0x00
+# W32: v_cmp_f_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x20,0xd4,0x6a,0xf4,0x00,0x00]
+# W64: v_cmp_f_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x20,0xd4,0x6a,0xf4,0x00,0x00]
+0x0a,0x00,0x20,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
 # W32: v_cmp_f_f64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x20,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 # W64: v_cmp_f_f64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x20,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x20,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
-# W32: v_cmp_f_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x20,0xd4,0x7e,0xfa,0x01,0x20]
-# W64: v_cmp_f_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x20,0xd4,0x7e,0xfa,0x01,0x20]
 0x0a,0x01,0x20,0xd4,0x7e,0xfa,0x01,0x20
+# W32: v_cmp_f_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x20,0xd4,0x7e,0xfa,0x01,0x20]
+# W64: v_cmp_f_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x20,0xd4,0x7e,0xfa,0x01,0x20]
-# W32: v_cmp_f_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x20,0xd4,0x7c,0xe0,0x01,0x00]
-# W64: v_cmp_f_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x20,0xd4,0x7c,0xe0,0x01,0x00]
 0x0a,0x00,0x20,0xd4,0x7c,0xe0,0x01,0x00
+# W32: v_cmp_f_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x20,0xd4,0x7c,0xe0,0x01,0x00]
+# W64: v_cmp_f_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x20,0xd4,0x7c,0xe0,0x01,0x00]
-# W32: v_cmp_f_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x20,0xd4,0xc1,0x82,0x01,0x00]
-# W64: v_cmp_f_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x20,0xd4,0xc1,0x82,0x01,0x00]
 0x68,0x00,0x20,0xd4,0xc1,0x82,0x01,0x00
+# W32: v_cmp_f_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x20,0xd4,0xc1,0x82,0x01,0x00]
+# W64: v_cmp_f_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x20,0xd4,0xc1,0x82,0x01,0x00]
-# W32: v_cmp_f_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x20,0xd4,0xf0,0xf8,0x00,0x00]
-# W64: v_cmp_f_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x20,0xd4,0xf0,0xf8,0x00,0x00]
 0x6a,0x00,0x20,0xd4,0xf0,0xf8,0x00,0x00
+# W32: v_cmp_f_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x20,0xd4,0xf0,0xf8,0x00,0x00]
+# W64: v_cmp_f_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x20,0xd4,0xf0,0xf8,0x00,0x00]
+0x7a,0x03,0x20,0xd4,0xfd,0xfc,0x00,0x60
 # W32: v_cmp_f_f64_e64 ttmp14, -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x20,0xd4,0xfd,0xfc,0x00,0x60]
 # W64: v_cmp_f_f64_e64 ttmp[14:15], -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x20,0xd4,0xfd,0xfc,0x00,0x60]
-0x7a,0x03,0x20,0xd4,0xfd,0xfc,0x00,0x60
-# GFX11: v_cmp_f_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x20,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
 0x7c,0x82,0x20,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_f_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x20,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_f_i32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x40,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_f_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x40,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x40,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_f_i32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x40,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_f_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x40,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_f_i32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x40,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_f_i32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x40,0xd4,0xff,0xff,0x03,0x00]
 0x0a,0x00,0x40,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_f_i32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x40,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_f_i32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x40,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_f_i32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x40,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_f_i32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x40,0xd4,0x01,0x04,0x00,0x00]
 0x0a,0x00,0x40,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_f_i32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x40,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_f_i32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x40,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_f_i32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x40,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_f_i32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x40,0xd4,0x69,0xd2,0x00,0x00]
 0x0a,0x00,0x40,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_f_i32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x40,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_f_i32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x40,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_f_i32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x40,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_f_i32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x40,0xd4,0x6a,0xf6,0x00,0x00]
 0x0a,0x00,0x40,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_f_i32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x40,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_f_i32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x40,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_f_i32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x40,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_f_i32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x40,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x0a,0x00,0x40,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# W32: v_cmp_f_i32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x40,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_f_i32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x40,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_f_i32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x40,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_f_i32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x40,0xd4,0x7b,0xfa,0x01,0x00]
 0x0a,0x00,0x40,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_f_i32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x40,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_f_i32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x40,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_f_i32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x40,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_f_i32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x40,0xd4,0x7d,0xe0,0x01,0x00]
 0x0a,0x00,0x40,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_f_i32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x40,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_f_i32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x40,0xd4,0x7d,0xe0,0x01,0x00]
-# W32: v_cmp_f_i32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x40,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_f_i32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x40,0xd4,0x7e,0x82,0x01,0x00]
 0x0a,0x00,0x40,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_f_i32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x40,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_f_i32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x40,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_f_i32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x40,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_f_i32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x40,0xd4,0x7f,0xf8,0x00,0x00]
 0x0a,0x00,0x40,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_f_i32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x40,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_f_i32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x40,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_f_i32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x40,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_f_i32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x40,0xd4,0x7c,0xfc,0x00,0x00]
 0x0a,0x00,0x40,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_f_i32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x40,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_f_i32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x40,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_f_i32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x40,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_f_i32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x40,0xd4,0xc1,0xfe,0x00,0x00]
 0x68,0x00,0x40,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_f_i32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x40,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_f_i32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x40,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_f_i32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x40,0xd4,0xf0,0xfa,0x00,0x00]
-# W64: v_cmp_f_i32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x40,0xd4,0xf0,0xfa,0x00,0x00]
 0x6a,0x00,0x40,0xd4,0xf0,0xfa,0x00,0x00
+# W32: v_cmp_f_i32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x40,0xd4,0xf0,0xfa,0x00,0x00]
+# W64: v_cmp_f_i32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x40,0xd4,0xf0,0xfa,0x00,0x00]
-# W32: v_cmp_f_i32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x40,0xd4,0xfd,0xd4,0x00,0x00]
-# W64: v_cmp_f_i32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x40,0xd4,0xfd,0xd4,0x00,0x00]
 0x7a,0x00,0x40,0xd4,0xfd,0xd4,0x00,0x00
+# W32: v_cmp_f_i32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x40,0xd4,0xfd,0xd4,0x00,0x00]
+# W64: v_cmp_f_i32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x40,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX11: v_cmp_f_i32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x40,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7c,0x00,0x40,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_f_i32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x40,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_f_i64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x50,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_f_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x50,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x50,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_f_i64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x50,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_f_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x50,0xd4,0x01,0x05,0x02,0x00]
+0x0a,0x00,0x50,0xd4,0xfe,0xfd,0x03,0x00
 # W32: v_cmp_f_i64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x50,0xd4,0xfe,0xfd,0x03,0x00]
 # W64: v_cmp_f_i64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x50,0xd4,0xfe,0xfd,0x03,0x00]
-0x0a,0x00,0x50,0xd4,0xfe,0xfd,0x03,0x00
-# W32: v_cmp_f_i64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x50,0xd4,0x02,0x08,0x00,0x00]
-# W64: v_cmp_f_i64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x50,0xd4,0x02,0x08,0x00,0x00]
 0x0a,0x00,0x50,0xd4,0x02,0x08,0x00,0x00
+# W32: v_cmp_f_i64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x50,0xd4,0x02,0x08,0x00,0x00]
+# W64: v_cmp_f_i64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x50,0xd4,0x02,0x08,0x00,0x00]
+0x0a,0x00,0x50,0xd4,0x68,0xd0,0x00,0x00
 # W32: v_cmp_f_i64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x50,0xd4,0x68,0xd0,0x00,0x00]
 # W64: v_cmp_f_i64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x50,0xd4,0x68,0xd0,0x00,0x00]
-0x0a,0x00,0x50,0xd4,0x68,0xd0,0x00,0x00
-# W32: v_cmp_f_i64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x50,0xd4,0x6a,0xf4,0x00,0x00]
-# W64: v_cmp_f_i64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x50,0xd4,0x6a,0xf4,0x00,0x00]
 0x0a,0x00,0x50,0xd4,0x6a,0xf4,0x00,0x00
+# W32: v_cmp_f_i64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x50,0xd4,0x6a,0xf4,0x00,0x00]
+# W64: v_cmp_f_i64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x50,0xd4,0x6a,0xf4,0x00,0x00]
+0x0a,0x00,0x50,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
 # W32: v_cmp_f_i64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x50,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 # W64: v_cmp_f_i64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x50,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x50,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
-# W32: v_cmp_f_i64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x50,0xd4,0x7e,0xfa,0x01,0x00]
-# W64: v_cmp_f_i64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x50,0xd4,0x7e,0xfa,0x01,0x00]
 0x0a,0x00,0x50,0xd4,0x7e,0xfa,0x01,0x00
+# W32: v_cmp_f_i64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x50,0xd4,0x7e,0xfa,0x01,0x00]
+# W64: v_cmp_f_i64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x50,0xd4,0x7e,0xfa,0x01,0x00]
-# W32: v_cmp_f_i64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x50,0xd4,0x7c,0xe0,0x01,0x00]
-# W64: v_cmp_f_i64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x50,0xd4,0x7c,0xe0,0x01,0x00]
 0x0a,0x00,0x50,0xd4,0x7c,0xe0,0x01,0x00
+# W32: v_cmp_f_i64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x50,0xd4,0x7c,0xe0,0x01,0x00]
+# W64: v_cmp_f_i64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x50,0xd4,0x7c,0xe0,0x01,0x00]
-# W32: v_cmp_f_i64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x50,0xd4,0xc1,0x82,0x01,0x00]
-# W64: v_cmp_f_i64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x50,0xd4,0xc1,0x82,0x01,0x00]
 0x68,0x00,0x50,0xd4,0xc1,0x82,0x01,0x00
+# W32: v_cmp_f_i64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x50,0xd4,0xc1,0x82,0x01,0x00]
+# W64: v_cmp_f_i64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x50,0xd4,0xc1,0x82,0x01,0x00]
-# W32: v_cmp_f_i64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x50,0xd4,0xf0,0xf8,0x00,0x00]
-# W64: v_cmp_f_i64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x50,0xd4,0xf0,0xf8,0x00,0x00]
 0x6a,0x00,0x50,0xd4,0xf0,0xf8,0x00,0x00
+# W32: v_cmp_f_i64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x50,0xd4,0xf0,0xf8,0x00,0x00]
+# W64: v_cmp_f_i64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x50,0xd4,0xf0,0xf8,0x00,0x00]
-# W32: v_cmp_f_i64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x50,0xd4,0xfd,0xfc,0x00,0x00]
-# W64: v_cmp_f_i64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x50,0xd4,0xfd,0xfc,0x00,0x00]
 0x7a,0x00,0x50,0xd4,0xfd,0xfc,0x00,0x00
+# W32: v_cmp_f_i64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x50,0xd4,0xfd,0xfc,0x00,0x00]
+# W64: v_cmp_f_i64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x50,0xd4,0xfd,0xfc,0x00,0x00]
-# GFX11: v_cmp_f_i64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x50,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7c,0x00,0x50,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_f_i64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x50,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_f_u32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x48,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_f_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x48,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x48,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_f_u32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x48,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_f_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x48,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_f_u32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x48,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_f_u32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x48,0xd4,0xff,0xff,0x03,0x00]
 0x0a,0x00,0x48,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_f_u32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x48,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_f_u32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x48,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_f_u32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x48,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_f_u32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x48,0xd4,0x01,0x04,0x00,0x00]
 0x0a,0x00,0x48,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_f_u32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x48,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_f_u32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x48,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_f_u32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x48,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_f_u32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x48,0xd4,0x69,0xd2,0x00,0x00]
 0x0a,0x00,0x48,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_f_u32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x48,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_f_u32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x48,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_f_u32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x48,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_f_u32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x48,0xd4,0x6a,0xf6,0x00,0x00]
 0x0a,0x00,0x48,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_f_u32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x48,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_f_u32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x48,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_f_u32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x48,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_f_u32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x48,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x0a,0x00,0x48,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# W32: v_cmp_f_u32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x48,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_f_u32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x48,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_f_u32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x48,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_f_u32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x48,0xd4,0x7b,0xfa,0x01,0x00]
 0x0a,0x00,0x48,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_f_u32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x48,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_f_u32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x48,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_f_u32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x48,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_f_u32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x48,0xd4,0x7d,0xe0,0x01,0x00]
 0x0a,0x00,0x48,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_f_u32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x48,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_f_u32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x48,0xd4,0x7d,0xe0,0x01,0x00]
-# W32: v_cmp_f_u32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x48,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_f_u32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x48,0xd4,0x7e,0x82,0x01,0x00]
 0x0a,0x00,0x48,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_f_u32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x48,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_f_u32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x48,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_f_u32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x48,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_f_u32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x48,0xd4,0x7f,0xf8,0x00,0x00]
 0x0a,0x00,0x48,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_f_u32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x48,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_f_u32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x48,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_f_u32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x48,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_f_u32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x48,0xd4,0x7c,0xfc,0x00,0x00]
 0x0a,0x00,0x48,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_f_u32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x48,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_f_u32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x48,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_f_u32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x48,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_f_u32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x48,0xd4,0xc1,0xfe,0x00,0x00]
 0x68,0x00,0x48,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_f_u32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x48,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_f_u32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x48,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_f_u32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x48,0xd4,0xf0,0xfa,0x00,0x00]
-# W64: v_cmp_f_u32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x48,0xd4,0xf0,0xfa,0x00,0x00]
 0x6a,0x00,0x48,0xd4,0xf0,0xfa,0x00,0x00
+# W32: v_cmp_f_u32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x48,0xd4,0xf0,0xfa,0x00,0x00]
+# W64: v_cmp_f_u32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x48,0xd4,0xf0,0xfa,0x00,0x00]
-# W32: v_cmp_f_u32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x48,0xd4,0xfd,0xd4,0x00,0x00]
-# W64: v_cmp_f_u32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x48,0xd4,0xfd,0xd4,0x00,0x00]
 0x7a,0x00,0x48,0xd4,0xfd,0xd4,0x00,0x00
+# W32: v_cmp_f_u32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x48,0xd4,0xfd,0xd4,0x00,0x00]
+# W64: v_cmp_f_u32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x48,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX11: v_cmp_f_u32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x48,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7c,0x00,0x48,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_f_u32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x48,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_f_u64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x58,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_f_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x58,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x58,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_f_u64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x58,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_f_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x58,0xd4,0x01,0x05,0x02,0x00]
+0x0a,0x00,0x58,0xd4,0xfe,0xfd,0x03,0x00
 # W32: v_cmp_f_u64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x58,0xd4,0xfe,0xfd,0x03,0x00]
 # W64: v_cmp_f_u64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x58,0xd4,0xfe,0xfd,0x03,0x00]
-0x0a,0x00,0x58,0xd4,0xfe,0xfd,0x03,0x00
-# W32: v_cmp_f_u64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x58,0xd4,0x02,0x08,0x00,0x00]
-# W64: v_cmp_f_u64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x58,0xd4,0x02,0x08,0x00,0x00]
 0x0a,0x00,0x58,0xd4,0x02,0x08,0x00,0x00
+# W32: v_cmp_f_u64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x58,0xd4,0x02,0x08,0x00,0x00]
+# W64: v_cmp_f_u64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x58,0xd4,0x02,0x08,0x00,0x00]
+0x0a,0x00,0x58,0xd4,0x68,0xd0,0x00,0x00
 # W32: v_cmp_f_u64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x58,0xd4,0x68,0xd0,0x00,0x00]
 # W64: v_cmp_f_u64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x58,0xd4,0x68,0xd0,0x00,0x00]
-0x0a,0x00,0x58,0xd4,0x68,0xd0,0x00,0x00
-# W32: v_cmp_f_u64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x58,0xd4,0x6a,0xf4,0x00,0x00]
-# W64: v_cmp_f_u64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x58,0xd4,0x6a,0xf4,0x00,0x00]
 0x0a,0x00,0x58,0xd4,0x6a,0xf4,0x00,0x00
+# W32: v_cmp_f_u64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x58,0xd4,0x6a,0xf4,0x00,0x00]
+# W64: v_cmp_f_u64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x58,0xd4,0x6a,0xf4,0x00,0x00]
+0x0a,0x00,0x58,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
 # W32: v_cmp_f_u64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x58,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 # W64: v_cmp_f_u64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x58,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x58,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
-# W32: v_cmp_f_u64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x58,0xd4,0x7e,0xfa,0x01,0x00]
-# W64: v_cmp_f_u64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x58,0xd4,0x7e,0xfa,0x01,0x00]
 0x0a,0x00,0x58,0xd4,0x7e,0xfa,0x01,0x00
+# W32: v_cmp_f_u64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x58,0xd4,0x7e,0xfa,0x01,0x00]
+# W64: v_cmp_f_u64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x58,0xd4,0x7e,0xfa,0x01,0x00]
-# W32: v_cmp_f_u64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x58,0xd4,0x7c,0xe0,0x01,0x00]
-# W64: v_cmp_f_u64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x58,0xd4,0x7c,0xe0,0x01,0x00]
 0x0a,0x00,0x58,0xd4,0x7c,0xe0,0x01,0x00
+# W32: v_cmp_f_u64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x58,0xd4,0x7c,0xe0,0x01,0x00]
+# W64: v_cmp_f_u64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x58,0xd4,0x7c,0xe0,0x01,0x00]
-# W32: v_cmp_f_u64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x58,0xd4,0xc1,0x82,0x01,0x00]
-# W64: v_cmp_f_u64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x58,0xd4,0xc1,0x82,0x01,0x00]
 0x68,0x00,0x58,0xd4,0xc1,0x82,0x01,0x00
+# W32: v_cmp_f_u64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x58,0xd4,0xc1,0x82,0x01,0x00]
+# W64: v_cmp_f_u64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x58,0xd4,0xc1,0x82,0x01,0x00]
-# W32: v_cmp_f_u64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x58,0xd4,0xf0,0xf8,0x00,0x00]
-# W64: v_cmp_f_u64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x58,0xd4,0xf0,0xf8,0x00,0x00]
 0x6a,0x00,0x58,0xd4,0xf0,0xf8,0x00,0x00
+# W32: v_cmp_f_u64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x58,0xd4,0xf0,0xf8,0x00,0x00]
+# W64: v_cmp_f_u64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x58,0xd4,0xf0,0xf8,0x00,0x00]
-# W32: v_cmp_f_u64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x58,0xd4,0xfd,0xfc,0x00,0x00]
-# W64: v_cmp_f_u64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x58,0xd4,0xfd,0xfc,0x00,0x00]
 0x7a,0x00,0x58,0xd4,0xfd,0xfc,0x00,0x00
+# W32: v_cmp_f_u64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x58,0xd4,0xfd,0xfc,0x00,0x00]
+# W64: v_cmp_f_u64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x58,0xd4,0xfd,0xfc,0x00,0x00]
-# GFX11: v_cmp_f_u64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x58,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7c,0x00,0x58,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_f_u64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x58,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_ge_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x06,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_ge_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x06,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x06,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_ge_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x06,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_ge_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x06,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_ge_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x06,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_ge_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x06,0xd4,0xff,0xff,0x03,0x00]
 0x0a,0x00,0x06,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_ge_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x06,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_ge_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x06,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_ge_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x06,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_ge_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x06,0xd4,0x01,0x04,0x00,0x00]
 0x0a,0x00,0x06,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_ge_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x06,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_ge_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x06,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_ge_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x06,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_ge_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x06,0xd4,0x69,0xd2,0x00,0x00]
 0x0a,0x00,0x06,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_ge_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x06,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_ge_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x06,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_ge_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x06,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_ge_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x06,0xd4,0x6a,0xf6,0x00,0x00]
 0x0a,0x00,0x06,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_ge_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x06,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_ge_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x06,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_ge_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x06,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_ge_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x06,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x0a,0x00,0x06,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_ge_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x06,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_ge_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x06,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_ge_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x06,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_ge_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x06,0xd4,0x7b,0xfa,0x01,0x00]
 0x0a,0x00,0x06,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_ge_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x06,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_ge_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x06,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_ge_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x06,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_ge_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x06,0xd4,0x7d,0xe0,0x01,0x00]
 0x0a,0x00,0x06,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_ge_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x06,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_ge_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x06,0xd4,0x7d,0xe0,0x01,0x00]
-# W32: v_cmp_ge_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x06,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_ge_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x06,0xd4,0x7e,0x82,0x01,0x00]
 0x0a,0x00,0x06,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_ge_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x06,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_ge_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x06,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_ge_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x06,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_ge_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x06,0xd4,0x7f,0xf8,0x00,0x00]
 0x0a,0x01,0x06,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_ge_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x06,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_ge_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x06,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_ge_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x06,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_ge_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x06,0xd4,0x7c,0xfc,0x00,0x00]
 0x0a,0x00,0x06,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_ge_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x06,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_ge_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x06,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_ge_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x06,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_ge_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x06,0xd4,0xc1,0xfe,0x00,0x00]
 0x68,0x00,0x06,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_ge_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x06,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_ge_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x06,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_ge_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x06,0xd4,0xf0,0xfa,0x00,0x40]
-# W64: v_cmp_ge_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x06,0xd4,0xf0,0xfa,0x00,0x40]
 0x6a,0x00,0x06,0xd4,0xf0,0xfa,0x00,0x40
+# W32: v_cmp_ge_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x06,0xd4,0xf0,0xfa,0x00,0x40]
+# W64: v_cmp_ge_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x06,0xd4,0xf0,0xfa,0x00,0x40]
+0x7a,0x02,0x06,0xd4,0xfd,0xd4,0x00,0x20
 # W32: v_cmp_ge_f16_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x06,0xd4,0xfd,0xd4,0x00,0x20]
 # W64: v_cmp_ge_f16_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x06,0xd4,0xfd,0xd4,0x00,0x20]
-0x7a,0x02,0x06,0xd4,0xfd,0xd4,0x00,0x20
-# GFX11: v_cmp_ge_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x06,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
 0x7c,0x83,0x06,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmp_ge_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x06,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_ge_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x16,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_ge_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x16,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x16,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_ge_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x16,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_ge_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x16,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_ge_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x16,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_ge_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x16,0xd4,0xff,0xff,0x03,0x00]
 0x0a,0x00,0x16,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_ge_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x16,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_ge_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x16,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_ge_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x16,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_ge_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x16,0xd4,0x01,0x04,0x00,0x00]
 0x0a,0x00,0x16,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_ge_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x16,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_ge_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x16,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_ge_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x16,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_ge_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x16,0xd4,0x69,0xd2,0x00,0x00]
 0x0a,0x00,0x16,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_ge_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x16,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_ge_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x16,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_ge_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x16,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_ge_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x16,0xd4,0x6a,0xf6,0x00,0x00]
 0x0a,0x00,0x16,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_ge_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x16,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_ge_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x16,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_ge_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x16,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_ge_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x16,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x0a,0x00,0x16,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# W32: v_cmp_ge_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x16,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_ge_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x16,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_ge_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x16,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_ge_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x16,0xd4,0x7b,0xfa,0x01,0x00]
 0x0a,0x00,0x16,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_ge_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x16,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_ge_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x16,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_ge_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x16,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_ge_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x16,0xd4,0x7d,0xe0,0x01,0x00]
 0x0a,0x00,0x16,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_ge_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x16,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_ge_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x16,0xd4,0x7d,0xe0,0x01,0x00]
-# W32: v_cmp_ge_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x16,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_ge_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x16,0xd4,0x7e,0x82,0x01,0x00]
 0x0a,0x00,0x16,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_ge_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x16,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_ge_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x16,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_ge_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x16,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_ge_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x16,0xd4,0x7f,0xf8,0x00,0x00]
 0x0a,0x01,0x16,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_ge_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x16,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_ge_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x16,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_ge_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x16,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_ge_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x16,0xd4,0x7c,0xfc,0x00,0x00]
 0x0a,0x00,0x16,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_ge_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x16,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_ge_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x16,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_ge_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x16,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_ge_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x16,0xd4,0xc1,0xfe,0x00,0x00]
 0x68,0x00,0x16,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_ge_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x16,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_ge_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x16,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_ge_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x16,0xd4,0xf0,0xfa,0x00,0x40]
-# W64: v_cmp_ge_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x16,0xd4,0xf0,0xfa,0x00,0x40]
 0x6a,0x00,0x16,0xd4,0xf0,0xfa,0x00,0x40
+# W32: v_cmp_ge_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x16,0xd4,0xf0,0xfa,0x00,0x40]
+# W64: v_cmp_ge_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x16,0xd4,0xf0,0xfa,0x00,0x40]
+0x7a,0x02,0x16,0xd4,0xfd,0xd4,0x00,0x20
 # W32: v_cmp_ge_f32_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x16,0xd4,0xfd,0xd4,0x00,0x20]
 # W64: v_cmp_ge_f32_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x16,0xd4,0xfd,0xd4,0x00,0x20]
-0x7a,0x02,0x16,0xd4,0xfd,0xd4,0x00,0x20
-# GFX11: v_cmp_ge_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x16,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
 0x7c,0x83,0x16,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_ge_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x16,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_ge_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x26,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_ge_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x26,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x26,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_ge_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x26,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_ge_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x26,0xd4,0x01,0x05,0x02,0x00]
+0x0a,0x00,0x26,0xd4,0xfe,0xfd,0x03,0x00
 # W32: v_cmp_ge_f64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x26,0xd4,0xfe,0xfd,0x03,0x00]
 # W64: v_cmp_ge_f64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x26,0xd4,0xfe,0xfd,0x03,0x00]
-0x0a,0x00,0x26,0xd4,0xfe,0xfd,0x03,0x00
-# W32: v_cmp_ge_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x26,0xd4,0x02,0x08,0x00,0x00]
-# W64: v_cmp_ge_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x26,0xd4,0x02,0x08,0x00,0x00]
 0x0a,0x00,0x26,0xd4,0x02,0x08,0x00,0x00
+# W32: v_cmp_ge_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x26,0xd4,0x02,0x08,0x00,0x00]
+# W64: v_cmp_ge_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x26,0xd4,0x02,0x08,0x00,0x00]
+0x0a,0x00,0x26,0xd4,0x68,0xd0,0x00,0x00
 # W32: v_cmp_ge_f64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x26,0xd4,0x68,0xd0,0x00,0x00]
 # W64: v_cmp_ge_f64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x26,0xd4,0x68,0xd0,0x00,0x00]
-0x0a,0x00,0x26,0xd4,0x68,0xd0,0x00,0x00
-# W32: v_cmp_ge_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x26,0xd4,0x6a,0xf4,0x00,0x00]
-# W64: v_cmp_ge_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x26,0xd4,0x6a,0xf4,0x00,0x00]
 0x0a,0x00,0x26,0xd4,0x6a,0xf4,0x00,0x00
+# W32: v_cmp_ge_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x26,0xd4,0x6a,0xf4,0x00,0x00]
+# W64: v_cmp_ge_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x26,0xd4,0x6a,0xf4,0x00,0x00]
+0x0a,0x00,0x26,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
 # W32: v_cmp_ge_f64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x26,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 # W64: v_cmp_ge_f64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x26,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x26,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
-# W32: v_cmp_ge_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x26,0xd4,0x7e,0xfa,0x01,0x20]
-# W64: v_cmp_ge_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x26,0xd4,0x7e,0xfa,0x01,0x20]
 0x0a,0x01,0x26,0xd4,0x7e,0xfa,0x01,0x20
+# W32: v_cmp_ge_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x26,0xd4,0x7e,0xfa,0x01,0x20]
+# W64: v_cmp_ge_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x26,0xd4,0x7e,0xfa,0x01,0x20]
-# W32: v_cmp_ge_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x26,0xd4,0x7c,0xe0,0x01,0x00]
-# W64: v_cmp_ge_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x26,0xd4,0x7c,0xe0,0x01,0x00]
 0x0a,0x00,0x26,0xd4,0x7c,0xe0,0x01,0x00
+# W32: v_cmp_ge_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x26,0xd4,0x7c,0xe0,0x01,0x00]
+# W64: v_cmp_ge_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x26,0xd4,0x7c,0xe0,0x01,0x00]
-# W32: v_cmp_ge_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x26,0xd4,0xc1,0x82,0x01,0x00]
-# W64: v_cmp_ge_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x26,0xd4,0xc1,0x82,0x01,0x00]
 0x68,0x00,0x26,0xd4,0xc1,0x82,0x01,0x00
+# W32: v_cmp_ge_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x26,0xd4,0xc1,0x82,0x01,0x00]
+# W64: v_cmp_ge_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x26,0xd4,0xc1,0x82,0x01,0x00]
-# W32: v_cmp_ge_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x26,0xd4,0xf0,0xf8,0x00,0x00]
-# W64: v_cmp_ge_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x26,0xd4,0xf0,0xf8,0x00,0x00]
 0x6a,0x00,0x26,0xd4,0xf0,0xf8,0x00,0x00
+# W32: v_cmp_ge_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x26,0xd4,0xf0,0xf8,0x00,0x00]
+# W64: v_cmp_ge_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x26,0xd4,0xf0,0xf8,0x00,0x00]
+0x7a,0x03,0x26,0xd4,0xfd,0xfc,0x00,0x60
 # W32: v_cmp_ge_f64_e64 ttmp14, -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x26,0xd4,0xfd,0xfc,0x00,0x60]
 # W64: v_cmp_ge_f64_e64 ttmp[14:15], -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x26,0xd4,0xfd,0xfc,0x00,0x60]
-0x7a,0x03,0x26,0xd4,0xfd,0xfc,0x00,0x60
-# GFX11: v_cmp_ge_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x26,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
 0x7c,0x82,0x26,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_ge_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x26,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_ge_i16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x36,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_ge_i16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x36,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x36,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_ge_i16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x36,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_ge_i16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x36,0xd4,0x01,0x05,0x02,0x00]
-# W32:
v_cmp_ge_i16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x36,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_ge_i16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x36,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x36,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_ge_i16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x36,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_ge_i16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x36,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_ge_i16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x36,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_ge_i16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x36,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x36,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_ge_i16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x36,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_ge_i16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x36,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_ge_i16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x36,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_ge_i16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x36,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x36,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_ge_i16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x36,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_ge_i16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x36,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_ge_i16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x36,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_ge_i16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x36,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x36,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_ge_i16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x36,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_ge_i16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x36,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_ge_i16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x36,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_ge_i16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x36,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x0a,0x00,0x36,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_ge_i16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x36,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_ge_i16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x36,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_ge_i16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x36,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_ge_i16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x36,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x36,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_ge_i16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x36,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_ge_i16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x36,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_ge_i16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x36,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] -# W64: v_cmp_ge_i16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x36,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] 0x0a,0x00,0x36,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_ge_i16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x36,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64: v_cmp_ge_i16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x36,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] -# W32: v_cmp_ge_i16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x36,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_ge_i16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x36,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x36,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_ge_i16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x36,0xd4,0x7e,0x82,0x01,0x00] +# 
W64: v_cmp_ge_i16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x36,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_ge_i16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x36,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_ge_i16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x36,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x00,0x36,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_ge_i16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x36,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_ge_i16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x36,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_ge_i16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x36,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_ge_i16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x36,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x36,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_ge_i16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x36,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_ge_i16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x36,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_ge_i16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x36,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_ge_i16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x36,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x36,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_ge_i16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x36,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_ge_i16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x36,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_ge_i16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x36,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] -# W64: v_cmp_ge_i16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x36,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] 0x6a,0x00,0x36,0xd4,0xf0,0xfa,0x00,0x00 +# W32: v_cmp_ge_i16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x36,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64: v_cmp_ge_i16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x36,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] -# W32: v_cmp_ge_i16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x36,0xd4,0xfd,0xd4,0x00,0x00] -# W64: v_cmp_ge_i16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x36,0xd4,0xfd,0xd4,0x00,0x00] 0x7a,0x00,0x36,0xd4,0xfd,0xd4,0x00,0x00 +# W32: v_cmp_ge_i16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x36,0xd4,0xfd,0xd4,0x00,0x00] +# W64: v_cmp_ge_i16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x36,0xd4,0xfd,0xd4,0x00,0x00] -# GFX11: v_cmp_ge_i16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x36,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7c,0x00,0x36,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmp_ge_i16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x36,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_ge_i32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x46,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_ge_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x46,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x46,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_ge_i32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x46,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_ge_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x46,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_ge_i32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x46,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_ge_i32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x46,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x46,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_ge_i32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x46,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_ge_i32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x46,0xd4,0xff,0xff,0x03,0x00] -# W32: 
v_cmp_ge_i32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x46,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_ge_i32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x46,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x46,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_ge_i32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x46,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_ge_i32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x46,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_ge_i32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x46,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_ge_i32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x46,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x46,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_ge_i32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x46,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_ge_i32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x46,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_ge_i32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x46,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_ge_i32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x46,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x46,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_ge_i32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x46,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_ge_i32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x46,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_ge_i32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x46,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W64: v_cmp_ge_i32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x46,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x0a,0x00,0x46,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# W32: v_cmp_ge_i32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x46,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] +# W64: v_cmp_ge_i32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x46,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_ge_i32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x46,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_ge_i32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x46,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x46,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_ge_i32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x46,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_ge_i32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x46,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_ge_i32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x46,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_ge_i32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x46,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x46,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_ge_i32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x46,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_ge_i32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x46,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_ge_i32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x46,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_ge_i32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x46,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x46,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_ge_i32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x46,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_ge_i32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x46,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_ge_i32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x46,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_ge_i32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x46,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x00,0x46,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_ge_i32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x46,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_ge_i32_e64 s[10:11], exec_hi, null ; encoding: 
[0x0a,0x00,0x46,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_ge_i32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x46,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_ge_i32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x46,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x46,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_ge_i32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x46,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_ge_i32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x46,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_ge_i32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x46,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_ge_i32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x46,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x46,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_ge_i32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x46,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_ge_i32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x46,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_ge_i32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x46,0xd4,0xf0,0xfa,0x00,0x00] -# W64: v_cmp_ge_i32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x46,0xd4,0xf0,0xfa,0x00,0x00] 0x6a,0x00,0x46,0xd4,0xf0,0xfa,0x00,0x00 +# W32: v_cmp_ge_i32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x46,0xd4,0xf0,0xfa,0x00,0x00] +# W64: v_cmp_ge_i32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x46,0xd4,0xf0,0xfa,0x00,0x00] -# W32: v_cmp_ge_i32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x46,0xd4,0xfd,0xd4,0x00,0x00] -# W64: v_cmp_ge_i32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x46,0xd4,0xfd,0xd4,0x00,0x00] 0x7a,0x00,0x46,0xd4,0xfd,0xd4,0x00,0x00 +# W32: v_cmp_ge_i32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x46,0xd4,0xfd,0xd4,0x00,0x00] +# W64: v_cmp_ge_i32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x46,0xd4,0xfd,0xd4,0x00,0x00] -# GFX11: v_cmp_ge_i32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x46,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7c,0x00,0x46,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmp_ge_i32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x46,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_ge_i64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x56,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_ge_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x56,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x56,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_ge_i64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x56,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_ge_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x56,0xd4,0x01,0x05,0x02,0x00] +0x0a,0x00,0x56,0xd4,0xfe,0xfd,0x03,0x00 # W32: v_cmp_ge_i64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x56,0xd4,0xfe,0xfd,0x03,0x00] # W64: v_cmp_ge_i64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x56,0xd4,0xfe,0xfd,0x03,0x00] -0x0a,0x00,0x56,0xd4,0xfe,0xfd,0x03,0x00 -# W32: v_cmp_ge_i64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x56,0xd4,0x02,0x08,0x00,0x00] -# W64: v_cmp_ge_i64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x56,0xd4,0x02,0x08,0x00,0x00] 0x0a,0x00,0x56,0xd4,0x02,0x08,0x00,0x00 +# W32: v_cmp_ge_i64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x56,0xd4,0x02,0x08,0x00,0x00] +# W64: v_cmp_ge_i64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x56,0xd4,0x02,0x08,0x00,0x00] +0x0a,0x00,0x56,0xd4,0x68,0xd0,0x00,0x00 # W32: v_cmp_ge_i64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x56,0xd4,0x68,0xd0,0x00,0x00] # W64: v_cmp_ge_i64_e64 s[10:11], s[104:105], s[104:105] ; encoding: 
[0x0a,0x00,0x56,0xd4,0x68,0xd0,0x00,0x00] -0x0a,0x00,0x56,0xd4,0x68,0xd0,0x00,0x00 -# W32: v_cmp_ge_i64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x56,0xd4,0x6a,0xf4,0x00,0x00] -# W64: v_cmp_ge_i64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x56,0xd4,0x6a,0xf4,0x00,0x00] 0x0a,0x00,0x56,0xd4,0x6a,0xf4,0x00,0x00 +# W32: v_cmp_ge_i64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x56,0xd4,0x6a,0xf4,0x00,0x00] +# W64: v_cmp_ge_i64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x56,0xd4,0x6a,0xf4,0x00,0x00] +0x0a,0x00,0x56,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_ge_i64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x56,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: v_cmp_ge_i64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x56,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x56,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_ge_i64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x56,0xd4,0x7e,0xfa,0x01,0x00] -# W64: v_cmp_ge_i64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x56,0xd4,0x7e,0xfa,0x01,0x00] 0x0a,0x00,0x56,0xd4,0x7e,0xfa,0x01,0x00 +# W32: v_cmp_ge_i64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x56,0xd4,0x7e,0xfa,0x01,0x00] +# W64: v_cmp_ge_i64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x56,0xd4,0x7e,0xfa,0x01,0x00] -# W32: v_cmp_ge_i64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x56,0xd4,0x7c,0xe0,0x01,0x00] -# W64: v_cmp_ge_i64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x56,0xd4,0x7c,0xe0,0x01,0x00] 0x0a,0x00,0x56,0xd4,0x7c,0xe0,0x01,0x00 +# W32: v_cmp_ge_i64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x56,0xd4,0x7c,0xe0,0x01,0x00] +# W64: v_cmp_ge_i64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x56,0xd4,0x7c,0xe0,0x01,0x00] -# W32: v_cmp_ge_i64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x56,0xd4,0xc1,0x82,0x01,0x00] -# W64: v_cmp_ge_i64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x56,0xd4,0xc1,0x82,0x01,0x00] 0x68,0x00,0x56,0xd4,0xc1,0x82,0x01,0x00 +# W32: v_cmp_ge_i64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x56,0xd4,0xc1,0x82,0x01,0x00] +# W64: v_cmp_ge_i64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x56,0xd4,0xc1,0x82,0x01,0x00] -# W32: v_cmp_ge_i64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x56,0xd4,0xf0,0xf8,0x00,0x00] -# W64: v_cmp_ge_i64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x56,0xd4,0xf0,0xf8,0x00,0x00] 0x6a,0x00,0x56,0xd4,0xf0,0xf8,0x00,0x00 +# W32: v_cmp_ge_i64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x56,0xd4,0xf0,0xf8,0x00,0x00] +# W64: v_cmp_ge_i64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x56,0xd4,0xf0,0xf8,0x00,0x00] -# W32: v_cmp_ge_i64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x56,0xd4,0xfd,0xfc,0x00,0x00] -# W64: v_cmp_ge_i64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x56,0xd4,0xfd,0xfc,0x00,0x00] 0x7a,0x00,0x56,0xd4,0xfd,0xfc,0x00,0x00 +# W32: v_cmp_ge_i64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x56,0xd4,0xfd,0xfc,0x00,0x00] +# W64: v_cmp_ge_i64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x56,0xd4,0xfd,0xfc,0x00,0x00] -# GFX11: v_cmp_ge_i64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x56,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7c,0x00,0x56,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmp_ge_i64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x56,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_ge_u16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x3e,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_ge_u16_e64 s[10:11], v1, v2 ; encoding: 
[0x0a,0x00,0x3e,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x3e,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_ge_u16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x3e,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_ge_u16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x3e,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_ge_u16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x3e,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_ge_u16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x3e,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x3e,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_ge_u16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x3e,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_ge_u16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x3e,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_ge_u16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x3e,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_ge_u16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x3e,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x3e,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_ge_u16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x3e,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_ge_u16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x3e,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_ge_u16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x3e,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_ge_u16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x3e,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x3e,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_ge_u16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x3e,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_ge_u16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x3e,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_ge_u16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3e,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_ge_u16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3e,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x3e,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_ge_u16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3e,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_ge_u16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3e,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_ge_u16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3e,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_ge_u16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3e,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x0a,0x00,0x3e,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_ge_u16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3e,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_ge_u16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3e,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_ge_u16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x3e,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_ge_u16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x3e,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x3e,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_ge_u16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x3e,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_ge_u16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x3e,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_ge_u16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x3e,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] -# W64: v_cmp_ge_u16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x3e,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] 0x0a,0x00,0x3e,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_ge_u16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x3e,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64: v_cmp_ge_u16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x3e,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] -# W32: v_cmp_ge_u16_e64 s10, exec_lo, -1 ; encoding: 
[0x0a,0x00,0x3e,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_ge_u16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x3e,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x3e,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_ge_u16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x3e,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_ge_u16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x3e,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_ge_u16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x3e,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_ge_u16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x3e,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x00,0x3e,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_ge_u16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x3e,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_ge_u16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x3e,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_ge_u16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x3e,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_ge_u16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x3e,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x3e,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_ge_u16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x3e,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_ge_u16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x3e,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_ge_u16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x3e,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_ge_u16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x3e,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x3e,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_ge_u16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x3e,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_ge_u16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x3e,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_ge_u16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x3e,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] -# W64: v_cmp_ge_u16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x3e,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] 0x6a,0x00,0x3e,0xd4,0xf0,0xfa,0x00,0x00 +# W32: v_cmp_ge_u16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x3e,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64: v_cmp_ge_u16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x3e,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] -# W32: v_cmp_ge_u16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3e,0xd4,0xfd,0xd4,0x00,0x00] -# W64: v_cmp_ge_u16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3e,0xd4,0xfd,0xd4,0x00,0x00] 0x7a,0x00,0x3e,0xd4,0xfd,0xd4,0x00,0x00 +# W32: v_cmp_ge_u16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3e,0xd4,0xfd,0xd4,0x00,0x00] +# W64: v_cmp_ge_u16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3e,0xd4,0xfd,0xd4,0x00,0x00] -# GFX11: v_cmp_ge_u16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x3e,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7c,0x00,0x3e,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmp_ge_u16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x3e,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_ge_u32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x4e,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_ge_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x4e,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x4e,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_ge_u32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x4e,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_ge_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x4e,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_ge_u32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x4e,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_ge_u32_e64 s[10:11], v255, v255 ; encoding: 
[0x0a,0x00,0x4e,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x4e,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_ge_u32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x4e,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_ge_u32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x4e,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_ge_u32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x4e,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_ge_u32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x4e,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x4e,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_ge_u32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x4e,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_ge_u32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x4e,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_ge_u32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x4e,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_ge_u32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x4e,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x4e,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_ge_u32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x4e,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_ge_u32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x4e,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_ge_u32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4e,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_ge_u32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4e,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x4e,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_ge_u32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4e,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_ge_u32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4e,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_ge_u32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4e,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W64: v_cmp_ge_u32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4e,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x0a,0x00,0x4e,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# W32: v_cmp_ge_u32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4e,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] +# W64: v_cmp_ge_u32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4e,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_ge_u32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x4e,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_ge_u32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x4e,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x4e,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_ge_u32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x4e,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_ge_u32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x4e,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_ge_u32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x4e,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_ge_u32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x4e,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x4e,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_ge_u32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x4e,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_ge_u32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x4e,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_ge_u32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x4e,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_ge_u32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x4e,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x4e,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_ge_u32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x4e,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_ge_u32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x4e,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_ge_u32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x4e,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_ge_u32_e64 
s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x4e,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x00,0x4e,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_ge_u32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x4e,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_ge_u32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x4e,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_ge_u32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x4e,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_ge_u32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x4e,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x4e,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_ge_u32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x4e,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_ge_u32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x4e,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_ge_u32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x4e,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_ge_u32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x4e,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x4e,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_ge_u32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x4e,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_ge_u32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x4e,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_ge_u32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x4e,0xd4,0xf0,0xfa,0x00,0x00] -# W64: v_cmp_ge_u32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x4e,0xd4,0xf0,0xfa,0x00,0x00] 0x6a,0x00,0x4e,0xd4,0xf0,0xfa,0x00,0x00 +# W32: v_cmp_ge_u32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x4e,0xd4,0xf0,0xfa,0x00,0x00] +# W64: v_cmp_ge_u32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x4e,0xd4,0xf0,0xfa,0x00,0x00] -# W32: v_cmp_ge_u32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4e,0xd4,0xfd,0xd4,0x00,0x00] -# W64: v_cmp_ge_u32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4e,0xd4,0xfd,0xd4,0x00,0x00] 0x7a,0x00,0x4e,0xd4,0xfd,0xd4,0x00,0x00 +# W32: v_cmp_ge_u32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4e,0xd4,0xfd,0xd4,0x00,0x00] +# W64: v_cmp_ge_u32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4e,0xd4,0xfd,0xd4,0x00,0x00] -# GFX11: v_cmp_ge_u32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x4e,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7c,0x00,0x4e,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmp_ge_u32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x4e,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_ge_u64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5e,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_ge_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5e,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x5e,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_ge_u64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5e,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_ge_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5e,0xd4,0x01,0x05,0x02,0x00] +0x0a,0x00,0x5e,0xd4,0xfe,0xfd,0x03,0x00 # W32: v_cmp_ge_u64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x5e,0xd4,0xfe,0xfd,0x03,0x00] # W64: v_cmp_ge_u64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x5e,0xd4,0xfe,0xfd,0x03,0x00] -0x0a,0x00,0x5e,0xd4,0xfe,0xfd,0x03,0x00 -# W32: v_cmp_ge_u64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5e,0xd4,0x02,0x08,0x00,0x00] -# W64: v_cmp_ge_u64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5e,0xd4,0x02,0x08,0x00,0x00] 0x0a,0x00,0x5e,0xd4,0x02,0x08,0x00,0x00 +# W32: v_cmp_ge_u64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5e,0xd4,0x02,0x08,0x00,0x00] +# W64: v_cmp_ge_u64_e64 s[10:11], s[2:3], s[4:5] ; encoding: 
[0x0a,0x00,0x5e,0xd4,0x02,0x08,0x00,0x00] +0x0a,0x00,0x5e,0xd4,0x68,0xd0,0x00,0x00 # W32: v_cmp_ge_u64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x5e,0xd4,0x68,0xd0,0x00,0x00] # W64: v_cmp_ge_u64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x5e,0xd4,0x68,0xd0,0x00,0x00] -0x0a,0x00,0x5e,0xd4,0x68,0xd0,0x00,0x00 -# W32: v_cmp_ge_u64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5e,0xd4,0x6a,0xf4,0x00,0x00] -# W64: v_cmp_ge_u64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5e,0xd4,0x6a,0xf4,0x00,0x00] 0x0a,0x00,0x5e,0xd4,0x6a,0xf4,0x00,0x00 +# W32: v_cmp_ge_u64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5e,0xd4,0x6a,0xf4,0x00,0x00] +# W64: v_cmp_ge_u64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5e,0xd4,0x6a,0xf4,0x00,0x00] +0x0a,0x00,0x5e,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_ge_u64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x5e,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: v_cmp_ge_u64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x5e,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x5e,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_ge_u64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x5e,0xd4,0x7e,0xfa,0x01,0x00] -# W64: v_cmp_ge_u64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x5e,0xd4,0x7e,0xfa,0x01,0x00] 0x0a,0x00,0x5e,0xd4,0x7e,0xfa,0x01,0x00 +# W32: v_cmp_ge_u64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x5e,0xd4,0x7e,0xfa,0x01,0x00] +# W64: v_cmp_ge_u64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x5e,0xd4,0x7e,0xfa,0x01,0x00] -# W32: v_cmp_ge_u64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x5e,0xd4,0x7c,0xe0,0x01,0x00] -# W64: v_cmp_ge_u64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x5e,0xd4,0x7c,0xe0,0x01,0x00] 0x0a,0x00,0x5e,0xd4,0x7c,0xe0,0x01,0x00 +# W32: v_cmp_ge_u64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x5e,0xd4,0x7c,0xe0,0x01,0x00] +# W64: v_cmp_ge_u64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x5e,0xd4,0x7c,0xe0,0x01,0x00] -# W32: v_cmp_ge_u64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x5e,0xd4,0xc1,0x82,0x01,0x00] -# W64: v_cmp_ge_u64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x5e,0xd4,0xc1,0x82,0x01,0x00] 0x68,0x00,0x5e,0xd4,0xc1,0x82,0x01,0x00 +# W32: v_cmp_ge_u64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x5e,0xd4,0xc1,0x82,0x01,0x00] +# W64: v_cmp_ge_u64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x5e,0xd4,0xc1,0x82,0x01,0x00] -# W32: v_cmp_ge_u64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x5e,0xd4,0xf0,0xf8,0x00,0x00] -# W64: v_cmp_ge_u64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x5e,0xd4,0xf0,0xf8,0x00,0x00] 0x6a,0x00,0x5e,0xd4,0xf0,0xf8,0x00,0x00 +# W32: v_cmp_ge_u64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x5e,0xd4,0xf0,0xf8,0x00,0x00] +# W64: v_cmp_ge_u64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x5e,0xd4,0xf0,0xf8,0x00,0x00] -# W32: v_cmp_ge_u64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x5e,0xd4,0xfd,0xfc,0x00,0x00] -# W64: v_cmp_ge_u64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x5e,0xd4,0xfd,0xfc,0x00,0x00] 0x7a,0x00,0x5e,0xd4,0xfd,0xfc,0x00,0x00 +# W32: v_cmp_ge_u64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x5e,0xd4,0xfd,0xfc,0x00,0x00] +# W64: v_cmp_ge_u64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x5e,0xd4,0xfd,0xfc,0x00,0x00] -# GFX11: v_cmp_ge_u64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x5e,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7c,0x00,0x5e,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmp_ge_u64_e64 
null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x5e,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_gt_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x04,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_gt_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x04,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x04,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_gt_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x04,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_gt_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x04,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_gt_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x04,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_gt_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x04,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x04,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_gt_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x04,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_gt_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x04,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_gt_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x04,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_gt_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x04,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x04,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_gt_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x04,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_gt_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x04,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_gt_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x04,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_gt_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x04,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x04,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_gt_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x04,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_gt_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x04,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_gt_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x04,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_gt_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x04,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x04,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_gt_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x04,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_gt_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x04,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_gt_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x04,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_gt_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x04,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x0a,0x00,0x04,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_gt_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x04,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_gt_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x04,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_gt_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x04,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_gt_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x04,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x04,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_gt_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x04,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_gt_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x04,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_gt_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x04,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_gt_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x04,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x04,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_gt_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x04,0xd4,0x7d,0xe0,0x01,0x00] +# W64: 
v_cmp_gt_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x04,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_gt_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x04,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_gt_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x04,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x04,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_gt_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x04,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_gt_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x04,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_gt_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x04,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_gt_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x04,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x01,0x04,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_gt_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x04,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_gt_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x04,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_gt_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x04,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_gt_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x04,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x04,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_gt_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x04,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_gt_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x04,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_gt_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x04,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_gt_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x04,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x04,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_gt_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x04,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_gt_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x04,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_gt_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x04,0xd4,0xf0,0xfa,0x00,0x40] -# W64: v_cmp_gt_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x04,0xd4,0xf0,0xfa,0x00,0x40] 0x6a,0x00,0x04,0xd4,0xf0,0xfa,0x00,0x40 +# W32: v_cmp_gt_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x04,0xd4,0xf0,0xfa,0x00,0x40] +# W64: v_cmp_gt_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x04,0xd4,0xf0,0xfa,0x00,0x40] +0x7a,0x02,0x04,0xd4,0xfd,0xd4,0x00,0x20 # W32: v_cmp_gt_f16_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x04,0xd4,0xfd,0xd4,0x00,0x20] # W64: v_cmp_gt_f16_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x04,0xd4,0xfd,0xd4,0x00,0x20] -0x7a,0x02,0x04,0xd4,0xfd,0xd4,0x00,0x20 -# GFX11: v_cmp_gt_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x04,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7c,0x83,0x04,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmp_gt_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x04,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_gt_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x14,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_gt_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x14,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x14,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_gt_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x14,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_gt_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x14,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_gt_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x14,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_gt_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x14,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x14,0xd4,0xff,0xff,0x03,0x00 +# W32: 
v_cmp_gt_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x14,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_gt_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x14,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_gt_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x14,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_gt_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x14,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x14,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_gt_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x14,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_gt_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x14,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_gt_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x14,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_gt_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x14,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x14,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_gt_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x14,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_gt_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x14,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_gt_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x14,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_gt_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x14,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x14,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_gt_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x14,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_gt_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x14,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_gt_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x14,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W64: v_cmp_gt_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x14,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x0a,0x00,0x14,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# W32: v_cmp_gt_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x14,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] +# W64: v_cmp_gt_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x14,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_gt_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x14,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_gt_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x14,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x14,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_gt_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x14,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_gt_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x14,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_gt_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x14,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_gt_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x14,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x14,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_gt_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x14,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_gt_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x14,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_gt_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x14,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_gt_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x14,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x14,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_gt_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x14,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_gt_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x14,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_gt_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x14,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_gt_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x14,0xd4,0x7f,0xf8,0x00,0x00] 
0x0a,0x01,0x14,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_gt_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x14,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_gt_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x14,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_gt_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x14,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_gt_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x14,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x14,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_gt_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x14,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_gt_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x14,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_gt_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x14,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_gt_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x14,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x14,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_gt_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x14,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_gt_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x14,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_gt_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x14,0xd4,0xf0,0xfa,0x00,0x40] -# W64: v_cmp_gt_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x14,0xd4,0xf0,0xfa,0x00,0x40] 0x6a,0x00,0x14,0xd4,0xf0,0xfa,0x00,0x40 +# W32: v_cmp_gt_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x14,0xd4,0xf0,0xfa,0x00,0x40] +# W64: v_cmp_gt_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x14,0xd4,0xf0,0xfa,0x00,0x40] +0x7a,0x02,0x14,0xd4,0xfd,0xd4,0x00,0x20 # W32: v_cmp_gt_f32_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x14,0xd4,0xfd,0xd4,0x00,0x20] # W64: v_cmp_gt_f32_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x14,0xd4,0xfd,0xd4,0x00,0x20] -0x7a,0x02,0x14,0xd4,0xfd,0xd4,0x00,0x20 -# GFX11: v_cmp_gt_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x14,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] 0x7c,0x83,0x14,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf +# GFX11: v_cmp_gt_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x14,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] -# W32: v_cmp_gt_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x24,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_gt_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x24,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x24,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_gt_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x24,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_gt_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x24,0xd4,0x01,0x05,0x02,0x00] +0x0a,0x00,0x24,0xd4,0xfe,0xfd,0x03,0x00 # W32: v_cmp_gt_f64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x24,0xd4,0xfe,0xfd,0x03,0x00] # W64: v_cmp_gt_f64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x24,0xd4,0xfe,0xfd,0x03,0x00] -0x0a,0x00,0x24,0xd4,0xfe,0xfd,0x03,0x00 -# W32: v_cmp_gt_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x24,0xd4,0x02,0x08,0x00,0x00] -# W64: v_cmp_gt_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x24,0xd4,0x02,0x08,0x00,0x00] 0x0a,0x00,0x24,0xd4,0x02,0x08,0x00,0x00 +# W32: v_cmp_gt_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x24,0xd4,0x02,0x08,0x00,0x00] +# W64: v_cmp_gt_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x24,0xd4,0x02,0x08,0x00,0x00] +0x0a,0x00,0x24,0xd4,0x68,0xd0,0x00,0x00 # W32: v_cmp_gt_f64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x24,0xd4,0x68,0xd0,0x00,0x00] # W64: v_cmp_gt_f64_e64 s[10:11], 
s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x24,0xd4,0x68,0xd0,0x00,0x00]
-0x0a,0x00,0x24,0xd4,0x68,0xd0,0x00,0x00
-# W32: v_cmp_gt_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x24,0xd4,0x6a,0xf4,0x00,0x00]
-# W64: v_cmp_gt_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x24,0xd4,0x6a,0xf4,0x00,0x00]
0x0a,0x00,0x24,0xd4,0x6a,0xf4,0x00,0x00
+# W32: v_cmp_gt_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x24,0xd4,0x6a,0xf4,0x00,0x00]
+# W64: v_cmp_gt_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x24,0xd4,0x6a,0xf4,0x00,0x00]
+0x0a,0x00,0x24,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
# W32: v_cmp_gt_f64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x24,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
# W64: v_cmp_gt_f64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x24,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x24,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
-# W32: v_cmp_gt_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x24,0xd4,0x7e,0xfa,0x01,0x20]
-# W64: v_cmp_gt_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x24,0xd4,0x7e,0xfa,0x01,0x20]
0x0a,0x01,0x24,0xd4,0x7e,0xfa,0x01,0x20
+# W32: v_cmp_gt_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x24,0xd4,0x7e,0xfa,0x01,0x20]
+# W64: v_cmp_gt_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x24,0xd4,0x7e,0xfa,0x01,0x20]
-# W32: v_cmp_gt_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x24,0xd4,0x7c,0xe0,0x01,0x00]
-# W64: v_cmp_gt_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x24,0xd4,0x7c,0xe0,0x01,0x00]
0x0a,0x00,0x24,0xd4,0x7c,0xe0,0x01,0x00
+# W32: v_cmp_gt_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x24,0xd4,0x7c,0xe0,0x01,0x00]
+# W64: v_cmp_gt_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x24,0xd4,0x7c,0xe0,0x01,0x00]
-# W32: v_cmp_gt_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x24,0xd4,0xc1,0x82,0x01,0x00]
-# W64: v_cmp_gt_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x24,0xd4,0xc1,0x82,0x01,0x00]
0x68,0x00,0x24,0xd4,0xc1,0x82,0x01,0x00
+# W32: v_cmp_gt_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x24,0xd4,0xc1,0x82,0x01,0x00]
+# W64: v_cmp_gt_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x24,0xd4,0xc1,0x82,0x01,0x00]
-# W32: v_cmp_gt_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x24,0xd4,0xf0,0xf8,0x00,0x00]
-# W64: v_cmp_gt_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x24,0xd4,0xf0,0xf8,0x00,0x00]
0x6a,0x00,0x24,0xd4,0xf0,0xf8,0x00,0x00
+# W32: v_cmp_gt_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x24,0xd4,0xf0,0xf8,0x00,0x00]
+# W64: v_cmp_gt_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x24,0xd4,0xf0,0xf8,0x00,0x00]
+0x7a,0x03,0x24,0xd4,0xfd,0xfc,0x00,0x60
# W32: v_cmp_gt_f64_e64 ttmp14, -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x24,0xd4,0xfd,0xfc,0x00,0x60]
# W64: v_cmp_gt_f64_e64 ttmp[14:15], -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x24,0xd4,0xfd,0xfc,0x00,0x60]
-0x7a,0x03,0x24,0xd4,0xfd,0xfc,0x00,0x60
-# GFX11: v_cmp_gt_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x24,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
0x7c,0x82,0x24,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_gt_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x24,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_gt_i16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x34,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_gt_i16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x34,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x34,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_gt_i16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x34,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_gt_i16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x34,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_gt_i16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x34,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_gt_i16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x34,0xd4,0xff,0xff,0x03,0x00]
0x0a,0x00,0x34,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_gt_i16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x34,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_gt_i16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x34,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_gt_i16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x34,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_gt_i16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x34,0xd4,0x01,0x04,0x00,0x00]
0x0a,0x00,0x34,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_gt_i16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x34,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_gt_i16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x34,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_gt_i16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x34,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_gt_i16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x34,0xd4,0x69,0xd2,0x00,0x00]
0x0a,0x00,0x34,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_gt_i16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x34,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_gt_i16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x34,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_gt_i16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x34,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_gt_i16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x34,0xd4,0x6a,0xf6,0x00,0x00]
0x0a,0x00,0x34,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_gt_i16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x34,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_gt_i16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x34,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_gt_i16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x34,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_gt_i16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x34,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
0x0a,0x00,0x34,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_gt_i16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x34,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_gt_i16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x34,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_gt_i16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x34,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_gt_i16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x34,0xd4,0x7b,0xfa,0x01,0x00]
0x0a,0x00,0x34,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_gt_i16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x34,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_gt_i16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x34,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_gt_i16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x34,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# W64: v_cmp_gt_i16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x34,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
0x0a,0x00,0x34,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_gt_i16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x34,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64: v_cmp_gt_i16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x34,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# W32: v_cmp_gt_i16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x34,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_gt_i16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x34,0xd4,0x7e,0x82,0x01,0x00]
0x0a,0x00,0x34,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_gt_i16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x34,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_gt_i16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x34,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_gt_i16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x34,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_gt_i16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x34,0xd4,0x7f,0xf8,0x00,0x00]
0x0a,0x00,0x34,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_gt_i16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x34,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_gt_i16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x34,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_gt_i16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x34,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_gt_i16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x34,0xd4,0x7c,0xfc,0x00,0x00]
0x0a,0x00,0x34,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_gt_i16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x34,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_gt_i16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x34,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_gt_i16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x34,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_gt_i16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x34,0xd4,0xc1,0xfe,0x00,0x00]
0x68,0x00,0x34,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_gt_i16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x34,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_gt_i16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x34,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_gt_i16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x34,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# W64: v_cmp_gt_i16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x34,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
0x6a,0x00,0x34,0xd4,0xf0,0xfa,0x00,0x00
+# W32: v_cmp_gt_i16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x34,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64: v_cmp_gt_i16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x34,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# W32: v_cmp_gt_i16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x34,0xd4,0xfd,0xd4,0x00,0x00]
-# W64: v_cmp_gt_i16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x34,0xd4,0xfd,0xd4,0x00,0x00]
0x7a,0x00,0x34,0xd4,0xfd,0xd4,0x00,0x00
+# W32: v_cmp_gt_i16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x34,0xd4,0xfd,0xd4,0x00,0x00]
+# W64: v_cmp_gt_i16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x34,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX11: v_cmp_gt_i16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x34,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
0x7c,0x00,0x34,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmp_gt_i16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x34,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_gt_i32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x44,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_gt_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x44,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x44,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_gt_i32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x44,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_gt_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x44,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_gt_i32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x44,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_gt_i32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x44,0xd4,0xff,0xff,0x03,0x00]
0x0a,0x00,0x44,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_gt_i32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x44,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_gt_i32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x44,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_gt_i32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x44,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_gt_i32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x44,0xd4,0x01,0x04,0x00,0x00]
0x0a,0x00,0x44,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_gt_i32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x44,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_gt_i32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x44,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_gt_i32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x44,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_gt_i32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x44,0xd4,0x69,0xd2,0x00,0x00]
0x0a,0x00,0x44,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_gt_i32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x44,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_gt_i32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x44,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_gt_i32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x44,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_gt_i32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x44,0xd4,0x6a,0xf6,0x00,0x00]
0x0a,0x00,0x44,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_gt_i32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x44,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_gt_i32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x44,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_gt_i32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x44,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_gt_i32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x44,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
0x0a,0x00,0x44,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# W32: v_cmp_gt_i32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x44,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_gt_i32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x44,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_gt_i32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x44,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_gt_i32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x44,0xd4,0x7b,0xfa,0x01,0x00]
0x0a,0x00,0x44,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_gt_i32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x44,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_gt_i32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x44,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_gt_i32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x44,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_gt_i32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x44,0xd4,0x7d,0xe0,0x01,0x00]
0x0a,0x00,0x44,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_gt_i32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x44,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_gt_i32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x44,0xd4,0x7d,0xe0,0x01,0x00]
-# W32: v_cmp_gt_i32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x44,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_gt_i32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x44,0xd4,0x7e,0x82,0x01,0x00]
0x0a,0x00,0x44,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_gt_i32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x44,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_gt_i32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x44,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_gt_i32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x44,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_gt_i32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x44,0xd4,0x7f,0xf8,0x00,0x00]
0x0a,0x00,0x44,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_gt_i32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x44,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_gt_i32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x44,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_gt_i32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x44,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_gt_i32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x44,0xd4,0x7c,0xfc,0x00,0x00]
0x0a,0x00,0x44,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_gt_i32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x44,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_gt_i32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x44,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_gt_i32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x44,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_gt_i32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x44,0xd4,0xc1,0xfe,0x00,0x00]
0x68,0x00,0x44,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_gt_i32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x44,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_gt_i32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x44,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_gt_i32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x44,0xd4,0xf0,0xfa,0x00,0x00]
-# W64: v_cmp_gt_i32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x44,0xd4,0xf0,0xfa,0x00,0x00]
0x6a,0x00,0x44,0xd4,0xf0,0xfa,0x00,0x00
+# W32: v_cmp_gt_i32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x44,0xd4,0xf0,0xfa,0x00,0x00]
+# W64: v_cmp_gt_i32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x44,0xd4,0xf0,0xfa,0x00,0x00]
-# W32: v_cmp_gt_i32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x44,0xd4,0xfd,0xd4,0x00,0x00]
-# W64: v_cmp_gt_i32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x44,0xd4,0xfd,0xd4,0x00,0x00]
0x7a,0x00,0x44,0xd4,0xfd,0xd4,0x00,0x00
+# W32: v_cmp_gt_i32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x44,0xd4,0xfd,0xd4,0x00,0x00]
+# W64: v_cmp_gt_i32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x44,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX11: v_cmp_gt_i32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x44,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
0x7c,0x00,0x44,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_gt_i32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x44,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_gt_i64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x54,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_gt_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x54,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x54,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_gt_i64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x54,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_gt_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x54,0xd4,0x01,0x05,0x02,0x00]
+0x0a,0x00,0x54,0xd4,0xfe,0xfd,0x03,0x00
# W32: v_cmp_gt_i64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x54,0xd4,0xfe,0xfd,0x03,0x00]
# W64: v_cmp_gt_i64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x54,0xd4,0xfe,0xfd,0x03,0x00]
-0x0a,0x00,0x54,0xd4,0xfe,0xfd,0x03,0x00
-# W32: v_cmp_gt_i64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x54,0xd4,0x02,0x08,0x00,0x00]
-# W64: v_cmp_gt_i64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x54,0xd4,0x02,0x08,0x00,0x00]
0x0a,0x00,0x54,0xd4,0x02,0x08,0x00,0x00
+# W32: v_cmp_gt_i64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x54,0xd4,0x02,0x08,0x00,0x00]
+# W64: v_cmp_gt_i64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x54,0xd4,0x02,0x08,0x00,0x00]
+0x0a,0x00,0x54,0xd4,0x68,0xd0,0x00,0x00
# W32: v_cmp_gt_i64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x54,0xd4,0x68,0xd0,0x00,0x00]
# W64: v_cmp_gt_i64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x54,0xd4,0x68,0xd0,0x00,0x00]
-0x0a,0x00,0x54,0xd4,0x68,0xd0,0x00,0x00
-# W32: v_cmp_gt_i64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x54,0xd4,0x6a,0xf4,0x00,0x00]
-# W64: v_cmp_gt_i64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x54,0xd4,0x6a,0xf4,0x00,0x00]
0x0a,0x00,0x54,0xd4,0x6a,0xf4,0x00,0x00
+# W32: v_cmp_gt_i64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x54,0xd4,0x6a,0xf4,0x00,0x00]
+# W64: v_cmp_gt_i64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x54,0xd4,0x6a,0xf4,0x00,0x00]
+0x0a,0x00,0x54,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
# W32: v_cmp_gt_i64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x54,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
# W64: v_cmp_gt_i64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x54,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x54,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
-# W32: v_cmp_gt_i64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x54,0xd4,0x7e,0xfa,0x01,0x00]
-# W64: v_cmp_gt_i64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x54,0xd4,0x7e,0xfa,0x01,0x00]
0x0a,0x00,0x54,0xd4,0x7e,0xfa,0x01,0x00
+# W32: v_cmp_gt_i64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x54,0xd4,0x7e,0xfa,0x01,0x00]
+# W64: v_cmp_gt_i64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x54,0xd4,0x7e,0xfa,0x01,0x00]
-# W32: v_cmp_gt_i64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x54,0xd4,0x7c,0xe0,0x01,0x00]
-# W64: v_cmp_gt_i64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x54,0xd4,0x7c,0xe0,0x01,0x00]
0x0a,0x00,0x54,0xd4,0x7c,0xe0,0x01,0x00
+# W32: v_cmp_gt_i64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x54,0xd4,0x7c,0xe0,0x01,0x00]
+# W64: v_cmp_gt_i64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x54,0xd4,0x7c,0xe0,0x01,0x00]
-# W32: v_cmp_gt_i64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x54,0xd4,0xc1,0x82,0x01,0x00]
-# W64: v_cmp_gt_i64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x54,0xd4,0xc1,0x82,0x01,0x00]
0x68,0x00,0x54,0xd4,0xc1,0x82,0x01,0x00
+# W32: v_cmp_gt_i64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x54,0xd4,0xc1,0x82,0x01,0x00]
+# W64: v_cmp_gt_i64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x54,0xd4,0xc1,0x82,0x01,0x00]
-# W32: v_cmp_gt_i64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x54,0xd4,0xf0,0xf8,0x00,0x00]
-# W64: v_cmp_gt_i64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x54,0xd4,0xf0,0xf8,0x00,0x00]
0x6a,0x00,0x54,0xd4,0xf0,0xf8,0x00,0x00
+# W32: v_cmp_gt_i64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x54,0xd4,0xf0,0xf8,0x00,0x00]
+# W64: v_cmp_gt_i64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x54,0xd4,0xf0,0xf8,0x00,0x00]
-# W32: v_cmp_gt_i64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x54,0xd4,0xfd,0xfc,0x00,0x00]
-# W64: v_cmp_gt_i64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x54,0xd4,0xfd,0xfc,0x00,0x00]
0x7a,0x00,0x54,0xd4,0xfd,0xfc,0x00,0x00
+# W32: v_cmp_gt_i64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x54,0xd4,0xfd,0xfc,0x00,0x00]
+# W64: v_cmp_gt_i64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x54,0xd4,0xfd,0xfc,0x00,0x00]
-# GFX11: v_cmp_gt_i64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x54,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
0x7c,0x00,0x54,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_gt_i64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x54,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_gt_u16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x3c,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_gt_u16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x3c,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x3c,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_gt_u16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x3c,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_gt_u16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x3c,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_gt_u16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x3c,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_gt_u16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x3c,0xd4,0xff,0xff,0x03,0x00]
0x0a,0x00,0x3c,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_gt_u16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x3c,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_gt_u16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x3c,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_gt_u16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x3c,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_gt_u16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x3c,0xd4,0x01,0x04,0x00,0x00]
0x0a,0x00,0x3c,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_gt_u16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x3c,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_gt_u16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x3c,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_gt_u16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x3c,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_gt_u16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x3c,0xd4,0x69,0xd2,0x00,0x00]
0x0a,0x00,0x3c,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_gt_u16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x3c,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_gt_u16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x3c,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_gt_u16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3c,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_gt_u16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3c,0xd4,0x6a,0xf6,0x00,0x00]
0x0a,0x00,0x3c,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_gt_u16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3c,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_gt_u16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3c,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_gt_u16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3c,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_gt_u16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3c,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
0x0a,0x00,0x3c,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_gt_u16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3c,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_gt_u16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3c,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_gt_u16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x3c,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_gt_u16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x3c,0xd4,0x7b,0xfa,0x01,0x00]
0x0a,0x00,0x3c,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_gt_u16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x3c,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_gt_u16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x3c,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_gt_u16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x3c,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# W64: v_cmp_gt_u16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x3c,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
0x0a,0x00,0x3c,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_gt_u16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x3c,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64: v_cmp_gt_u16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x3c,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# W32: v_cmp_gt_u16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x3c,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_gt_u16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x3c,0xd4,0x7e,0x82,0x01,0x00]
0x0a,0x00,0x3c,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_gt_u16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x3c,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_gt_u16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x3c,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_gt_u16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x3c,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_gt_u16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x3c,0xd4,0x7f,0xf8,0x00,0x00]
0x0a,0x00,0x3c,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_gt_u16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x3c,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_gt_u16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x3c,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_gt_u16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x3c,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_gt_u16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x3c,0xd4,0x7c,0xfc,0x00,0x00]
0x0a,0x00,0x3c,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_gt_u16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x3c,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_gt_u16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x3c,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_gt_u16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x3c,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_gt_u16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x3c,0xd4,0xc1,0xfe,0x00,0x00]
0x68,0x00,0x3c,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_gt_u16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x3c,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_gt_u16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x3c,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_gt_u16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x3c,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# W64: v_cmp_gt_u16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x3c,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
0x6a,0x00,0x3c,0xd4,0xf0,0xfa,0x00,0x00
+# W32: v_cmp_gt_u16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x3c,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64: v_cmp_gt_u16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x3c,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# W32: v_cmp_gt_u16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3c,0xd4,0xfd,0xd4,0x00,0x00]
-# W64: v_cmp_gt_u16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3c,0xd4,0xfd,0xd4,0x00,0x00]
0x7a,0x00,0x3c,0xd4,0xfd,0xd4,0x00,0x00
+# W32: v_cmp_gt_u16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3c,0xd4,0xfd,0xd4,0x00,0x00]
+# W64: v_cmp_gt_u16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3c,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX11: v_cmp_gt_u16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x3c,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
0x7c,0x00,0x3c,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmp_gt_u16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x3c,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_gt_u32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x4c,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_gt_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x4c,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x4c,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_gt_u32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x4c,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_gt_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x4c,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_gt_u32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x4c,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_gt_u32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x4c,0xd4,0xff,0xff,0x03,0x00]
0x0a,0x00,0x4c,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_gt_u32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x4c,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_gt_u32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x4c,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_gt_u32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x4c,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_gt_u32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x4c,0xd4,0x01,0x04,0x00,0x00]
0x0a,0x00,0x4c,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_gt_u32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x4c,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_gt_u32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x4c,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_gt_u32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x4c,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_gt_u32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x4c,0xd4,0x69,0xd2,0x00,0x00]
0x0a,0x00,0x4c,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_gt_u32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x4c,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_gt_u32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x4c,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_gt_u32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4c,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_gt_u32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4c,0xd4,0x6a,0xf6,0x00,0x00]
0x0a,0x00,0x4c,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_gt_u32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4c,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_gt_u32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4c,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_gt_u32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4c,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_gt_u32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4c,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
0x0a,0x00,0x4c,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# W32: v_cmp_gt_u32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4c,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_gt_u32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4c,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_gt_u32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x4c,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_gt_u32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x4c,0xd4,0x7b,0xfa,0x01,0x00]
0x0a,0x00,0x4c,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_gt_u32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x4c,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_gt_u32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x4c,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_gt_u32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x4c,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_gt_u32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x4c,0xd4,0x7d,0xe0,0x01,0x00]
0x0a,0x00,0x4c,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_gt_u32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x4c,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_gt_u32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x4c,0xd4,0x7d,0xe0,0x01,0x00]
-# W32: v_cmp_gt_u32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x4c,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_gt_u32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x4c,0xd4,0x7e,0x82,0x01,0x00]
0x0a,0x00,0x4c,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_gt_u32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x4c,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_gt_u32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x4c,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_gt_u32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x4c,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_gt_u32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x4c,0xd4,0x7f,0xf8,0x00,0x00]
0x0a,0x00,0x4c,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_gt_u32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x4c,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_gt_u32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x4c,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_gt_u32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x4c,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_gt_u32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x4c,0xd4,0x7c,0xfc,0x00,0x00]
0x0a,0x00,0x4c,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_gt_u32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x4c,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_gt_u32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x4c,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_gt_u32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x4c,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_gt_u32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x4c,0xd4,0xc1,0xfe,0x00,0x00]
0x68,0x00,0x4c,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_gt_u32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x4c,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_gt_u32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x4c,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_gt_u32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x4c,0xd4,0xf0,0xfa,0x00,0x00]
-# W64: v_cmp_gt_u32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x4c,0xd4,0xf0,0xfa,0x00,0x00]
0x6a,0x00,0x4c,0xd4,0xf0,0xfa,0x00,0x00
+# W32: v_cmp_gt_u32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x4c,0xd4,0xf0,0xfa,0x00,0x00]
+# W64: v_cmp_gt_u32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x4c,0xd4,0xf0,0xfa,0x00,0x00]
-# W32: v_cmp_gt_u32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4c,0xd4,0xfd,0xd4,0x00,0x00]
-# W64: v_cmp_gt_u32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4c,0xd4,0xfd,0xd4,0x00,0x00]
0x7a,0x00,0x4c,0xd4,0xfd,0xd4,0x00,0x00
+# W32: v_cmp_gt_u32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4c,0xd4,0xfd,0xd4,0x00,0x00]
+# W64: v_cmp_gt_u32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4c,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX11: v_cmp_gt_u32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x4c,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
0x7c,0x00,0x4c,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_gt_u32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x4c,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_gt_u64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5c,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_gt_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5c,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x5c,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_gt_u64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5c,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_gt_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5c,0xd4,0x01,0x05,0x02,0x00]
+0x0a,0x00,0x5c,0xd4,0xfe,0xfd,0x03,0x00
# W32: v_cmp_gt_u64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x5c,0xd4,0xfe,0xfd,0x03,0x00]
# W64: v_cmp_gt_u64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x5c,0xd4,0xfe,0xfd,0x03,0x00]
-0x0a,0x00,0x5c,0xd4,0xfe,0xfd,0x03,0x00
-# W32: v_cmp_gt_u64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5c,0xd4,0x02,0x08,0x00,0x00]
-# W64: v_cmp_gt_u64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5c,0xd4,0x02,0x08,0x00,0x00]
0x0a,0x00,0x5c,0xd4,0x02,0x08,0x00,0x00
+# W32: v_cmp_gt_u64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5c,0xd4,0x02,0x08,0x00,0x00]
+# W64: v_cmp_gt_u64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5c,0xd4,0x02,0x08,0x00,0x00]
+0x0a,0x00,0x5c,0xd4,0x68,0xd0,0x00,0x00
# W32: v_cmp_gt_u64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x5c,0xd4,0x68,0xd0,0x00,0x00]
# W64: v_cmp_gt_u64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x5c,0xd4,0x68,0xd0,0x00,0x00]
-0x0a,0x00,0x5c,0xd4,0x68,0xd0,0x00,0x00
-# W32: v_cmp_gt_u64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5c,0xd4,0x6a,0xf4,0x00,0x00]
-# W64: v_cmp_gt_u64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5c,0xd4,0x6a,0xf4,0x00,0x00]
0x0a,0x00,0x5c,0xd4,0x6a,0xf4,0x00,0x00
+# W32: v_cmp_gt_u64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5c,0xd4,0x6a,0xf4,0x00,0x00]
+# W64: v_cmp_gt_u64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5c,0xd4,0x6a,0xf4,0x00,0x00]
+0x0a,0x00,0x5c,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
# W32: v_cmp_gt_u64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x5c,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
# W64: v_cmp_gt_u64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x5c,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x5c,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
-# W32: v_cmp_gt_u64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x5c,0xd4,0x7e,0xfa,0x01,0x00]
-# W64: v_cmp_gt_u64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x5c,0xd4,0x7e,0xfa,0x01,0x00]
0x0a,0x00,0x5c,0xd4,0x7e,0xfa,0x01,0x00
+# W32: v_cmp_gt_u64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x5c,0xd4,0x7e,0xfa,0x01,0x00]
+# W64: v_cmp_gt_u64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x5c,0xd4,0x7e,0xfa,0x01,0x00]
-# W32: v_cmp_gt_u64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x5c,0xd4,0x7c,0xe0,0x01,0x00]
-# W64: v_cmp_gt_u64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x5c,0xd4,0x7c,0xe0,0x01,0x00]
0x0a,0x00,0x5c,0xd4,0x7c,0xe0,0x01,0x00
+# W32: v_cmp_gt_u64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x5c,0xd4,0x7c,0xe0,0x01,0x00]
+# W64: v_cmp_gt_u64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x5c,0xd4,0x7c,0xe0,0x01,0x00]
-# W32: v_cmp_gt_u64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x5c,0xd4,0xc1,0x82,0x01,0x00]
-# W64: v_cmp_gt_u64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x5c,0xd4,0xc1,0x82,0x01,0x00]
0x68,0x00,0x5c,0xd4,0xc1,0x82,0x01,0x00
+# W32: v_cmp_gt_u64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x5c,0xd4,0xc1,0x82,0x01,0x00]
+# W64: v_cmp_gt_u64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x5c,0xd4,0xc1,0x82,0x01,0x00]
-# W32: v_cmp_gt_u64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x5c,0xd4,0xf0,0xf8,0x00,0x00]
-# W64: v_cmp_gt_u64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x5c,0xd4,0xf0,0xf8,0x00,0x00]
0x6a,0x00,0x5c,0xd4,0xf0,0xf8,0x00,0x00
+# W32: v_cmp_gt_u64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x5c,0xd4,0xf0,0xf8,0x00,0x00]
+# W64: v_cmp_gt_u64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x5c,0xd4,0xf0,0xf8,0x00,0x00]
-# W32: v_cmp_gt_u64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x5c,0xd4,0xfd,0xfc,0x00,0x00]
-# W64: v_cmp_gt_u64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x5c,0xd4,0xfd,0xfc,0x00,0x00]
0x7a,0x00,0x5c,0xd4,0xfd,0xfc,0x00,0x00
+# W32: v_cmp_gt_u64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x5c,0xd4,0xfd,0xfc,0x00,0x00]
+# W64: v_cmp_gt_u64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x5c,0xd4,0xfd,0xfc,0x00,0x00]
-# GFX11: v_cmp_gt_u64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x5c,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
0x7c,0x00,0x5c,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_gt_u64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x5c,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_le_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x03,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_le_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x03,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x03,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_le_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x03,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_le_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x03,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_le_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x03,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_le_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x03,0xd4,0xff,0xff,0x03,0x00]
0x0a,0x00,0x03,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_le_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x03,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_le_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x03,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_le_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x03,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_le_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x03,0xd4,0x01,0x04,0x00,0x00]
0x0a,0x00,0x03,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_le_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x03,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_le_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x03,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_le_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x03,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_le_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x03,0xd4,0x69,0xd2,0x00,0x00]
0x0a,0x00,0x03,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_le_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x03,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_le_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x03,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_le_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x03,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_le_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x03,0xd4,0x6a,0xf6,0x00,0x00]
0x0a,0x00,0x03,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_le_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x03,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_le_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x03,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_le_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x03,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_le_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x03,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
0x0a,0x00,0x03,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_le_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x03,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_le_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x03,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_le_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x03,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_le_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x03,0xd4,0x7b,0xfa,0x01,0x00]
0x0a,0x00,0x03,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_le_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x03,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_le_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x03,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_le_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x03,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_le_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x03,0xd4,0x7d,0xe0,0x01,0x00]
0x0a,0x00,0x03,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_le_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x03,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_le_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x03,0xd4,0x7d,0xe0,0x01,0x00]
-# W32: v_cmp_le_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x03,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_le_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x03,0xd4,0x7e,0x82,0x01,0x00]
0x0a,0x00,0x03,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_le_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x03,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_le_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x03,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_le_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x03,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_le_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x03,0xd4,0x7f,0xf8,0x00,0x00]
0x0a,0x01,0x03,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_le_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x03,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_le_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x03,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_le_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x03,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_le_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x03,0xd4,0x7c,0xfc,0x00,0x00]
0x0a,0x00,0x03,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_le_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x03,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_le_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x03,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_le_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x03,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_le_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x03,0xd4,0xc1,0xfe,0x00,0x00]
0x68,0x00,0x03,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_le_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x03,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_le_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x03,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_le_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x03,0xd4,0xf0,0xfa,0x00,0x40]
-# W64: v_cmp_le_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x03,0xd4,0xf0,0xfa,0x00,0x40]
0x6a,0x00,0x03,0xd4,0xf0,0xfa,0x00,0x40
+# W32: v_cmp_le_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x03,0xd4,0xf0,0xfa,0x00,0x40]
+# W64: v_cmp_le_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x03,0xd4,0xf0,0xfa,0x00,0x40]
+0x7a,0x02,0x03,0xd4,0xfd,0xd4,0x00,0x20
# W32: v_cmp_le_f16_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x03,0xd4,0xfd,0xd4,0x00,0x20]
# W64: v_cmp_le_f16_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x03,0xd4,0xfd,0xd4,0x00,0x20]
-0x7a,0x02,0x03,0xd4,0xfd,0xd4,0x00,0x20
-# GFX11: v_cmp_le_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x03,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
0x7c,0x83,0x03,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmp_le_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x03,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_le_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x13,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_le_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x13,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x13,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_le_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x13,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_le_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x13,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_le_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x13,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_le_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x13,0xd4,0xff,0xff,0x03,0x00]
0x0a,0x00,0x13,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_le_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x13,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_le_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x13,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_le_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x13,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_le_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x13,0xd4,0x01,0x04,0x00,0x00]
0x0a,0x00,0x13,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_le_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x13,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_le_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x13,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_le_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x13,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_le_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x13,0xd4,0x69,0xd2,0x00,0x00]
0x0a,0x00,0x13,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_le_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x13,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_le_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x13,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_le_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x13,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_le_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x13,0xd4,0x6a,0xf6,0x00,0x00]
0x0a,0x00,0x13,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_le_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x13,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_le_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x13,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_le_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x13,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_le_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x13,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
0x0a,0x00,0x13,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# W32: v_cmp_le_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x13,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_le_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x13,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_le_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x13,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_le_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x13,0xd4,0x7b,0xfa,0x01,0x00]
0x0a,0x00,0x13,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_le_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x13,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_le_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x13,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_le_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x13,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_le_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x13,0xd4,0x7d,0xe0,0x01,0x00]
0x0a,0x00,0x13,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_le_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x13,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_le_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x13,0xd4,0x7d,0xe0,0x01,0x00]
-# W32: v_cmp_le_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x13,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_le_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x13,0xd4,0x7e,0x82,0x01,0x00]
0x0a,0x00,0x13,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_le_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x13,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_le_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x13,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_le_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x13,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_le_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x13,0xd4,0x7f,0xf8,0x00,0x00]
0x0a,0x01,0x13,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_le_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x13,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_le_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x13,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_le_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x13,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_le_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x13,0xd4,0x7c,0xfc,0x00,0x00]
0x0a,0x00,0x13,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_le_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x13,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_le_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x13,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_le_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x13,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_le_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x13,0xd4,0xc1,0xfe,0x00,0x00]
0x68,0x00,0x13,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_le_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x13,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_le_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x13,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_le_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x13,0xd4,0xf0,0xfa,0x00,0x40]
-# W64: v_cmp_le_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x13,0xd4,0xf0,0xfa,0x00,0x40]
0x6a,0x00,0x13,0xd4,0xf0,0xfa,0x00,0x40
+# W32: v_cmp_le_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x13,0xd4,0xf0,0xfa,0x00,0x40]
+# W64: v_cmp_le_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x13,0xd4,0xf0,0xfa,0x00,0x40]
+0x7a,0x02,0x13,0xd4,0xfd,0xd4,0x00,0x20
# W32: v_cmp_le_f32_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x13,0xd4,0xfd,0xd4,0x00,0x20]
# W64: v_cmp_le_f32_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x13,0xd4,0xfd,0xd4,0x00,0x20]
-0x7a,0x02,0x13,0xd4,0xfd,0xd4,0x00,0x20
-# GFX11: v_cmp_le_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x13,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
0x7c,0x83,0x13,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_le_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x13,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_le_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x23,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_le_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x23,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x23,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_le_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x23,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_le_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x23,0xd4,0x01,0x05,0x02,0x00]
+0x0a,0x00,0x23,0xd4,0xfe,0xfd,0x03,0x00
# W32: v_cmp_le_f64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x23,0xd4,0xfe,0xfd,0x03,0x00]
# W64: v_cmp_le_f64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x23,0xd4,0xfe,0xfd,0x03,0x00]
-0x0a,0x00,0x23,0xd4,0xfe,0xfd,0x03,0x00
-# W32: v_cmp_le_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x23,0xd4,0x02,0x08,0x00,0x00]
-# W64: v_cmp_le_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x23,0xd4,0x02,0x08,0x00,0x00]
0x0a,0x00,0x23,0xd4,0x02,0x08,0x00,0x00
+# W32: v_cmp_le_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x23,0xd4,0x02,0x08,0x00,0x00]
+# W64: v_cmp_le_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x23,0xd4,0x02,0x08,0x00,0x00]
+0x0a,0x00,0x23,0xd4,0x68,0xd0,0x00,0x00
# W32: v_cmp_le_f64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x23,0xd4,0x68,0xd0,0x00,0x00]
# W64: v_cmp_le_f64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x23,0xd4,0x68,0xd0,0x00,0x00]
-0x0a,0x00,0x23,0xd4,0x68,0xd0,0x00,0x00
-# W32: v_cmp_le_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x23,0xd4,0x6a,0xf4,0x00,0x00]
-# W64: v_cmp_le_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x23,0xd4,0x6a,0xf4,0x00,0x00]
0x0a,0x00,0x23,0xd4,0x6a,0xf4,0x00,0x00
+# W32: v_cmp_le_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x23,0xd4,0x6a,0xf4,0x00,0x00]
+# W64: v_cmp_le_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x23,0xd4,0x6a,0xf4,0x00,0x00]
+0x0a,0x00,0x23,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
# W32: v_cmp_le_f64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x23,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
# W64: v_cmp_le_f64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x23,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x23,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
-# W32: v_cmp_le_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x23,0xd4,0x7e,0xfa,0x01,0x20]
-# W64: v_cmp_le_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x23,0xd4,0x7e,0xfa,0x01,0x20]
0x0a,0x01,0x23,0xd4,0x7e,0xfa,0x01,0x20
+# W32: v_cmp_le_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x23,0xd4,0x7e,0xfa,0x01,0x20]
+# W64: v_cmp_le_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x23,0xd4,0x7e,0xfa,0x01,0x20]
-# W32: v_cmp_le_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x23,0xd4,0x7c,0xe0,0x01,0x00]
-# W64: v_cmp_le_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x23,0xd4,0x7c,0xe0,0x01,0x00]
0x0a,0x00,0x23,0xd4,0x7c,0xe0,0x01,0x00
+# W32: v_cmp_le_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x23,0xd4,0x7c,0xe0,0x01,0x00]
+# W64: v_cmp_le_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x23,0xd4,0x7c,0xe0,0x01,0x00]
-# W32: v_cmp_le_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x23,0xd4,0xc1,0x82,0x01,0x00]
-# W64: v_cmp_le_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x23,0xd4,0xc1,0x82,0x01,0x00]
0x68,0x00,0x23,0xd4,0xc1,0x82,0x01,0x00
+# W32: v_cmp_le_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x23,0xd4,0xc1,0x82,0x01,0x00]
+# W64: v_cmp_le_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x23,0xd4,0xc1,0x82,0x01,0x00]
-# W32: v_cmp_le_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x23,0xd4,0xf0,0xf8,0x00,0x00]
-# W64: v_cmp_le_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x23,0xd4,0xf0,0xf8,0x00,0x00]
0x6a,0x00,0x23,0xd4,0xf0,0xf8,0x00,0x00
+# W32: v_cmp_le_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x23,0xd4,0xf0,0xf8,0x00,0x00]
+# W64: v_cmp_le_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x23,0xd4,0xf0,0xf8,0x00,0x00]
+0x7a,0x03,0x23,0xd4,0xfd,0xfc,0x00,0x60
# W32: v_cmp_le_f64_e64 ttmp14, -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x23,0xd4,0xfd,0xfc,0x00,0x60]
# W64: v_cmp_le_f64_e64 ttmp[14:15], -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x23,0xd4,0xfd,0xfc,0x00,0x60]
-0x7a,0x03,0x23,0xd4,0xfd,0xfc,0x00,0x60
-# GFX11: v_cmp_le_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x23,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
0x7c,0x82,0x23,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_le_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x23,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_le_i16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x33,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_le_i16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x33,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x33,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_le_i16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x33,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_le_i16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x33,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_le_i16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x33,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_le_i16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x33,0xd4,0xff,0xff,0x03,0x00]
0x0a,0x00,0x33,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_le_i16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x33,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_le_i16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x33,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_le_i16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x33,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_le_i16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x33,0xd4,0x01,0x04,0x00,0x00]
0x0a,0x00,0x33,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_le_i16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x33,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_le_i16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x33,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_le_i16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x33,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_le_i16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x33,0xd4,0x69,0xd2,0x00,0x00]
0x0a,0x00,0x33,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_le_i16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x33,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_le_i16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x33,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_le_i16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x33,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_le_i16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x33,0xd4,0x6a,0xf6,0x00,0x00]
0x0a,0x00,0x33,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_le_i16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x33,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_le_i16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x33,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_le_i16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x33,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_le_i16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x33,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
0x0a,0x00,0x33,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_le_i16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x33,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_le_i16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x33,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_le_i16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x33,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_le_i16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x33,0xd4,0x7b,0xfa,0x01,0x00]
0x0a,0x00,0x33,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_le_i16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x33,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_le_i16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x33,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_le_i16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x33,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# W64: v_cmp_le_i16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x33,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
0x0a,0x00,0x33,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_le_i16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x33,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64: v_cmp_le_i16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x33,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# W32: v_cmp_le_i16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x33,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_le_i16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x33,0xd4,0x7e,0x82,0x01,0x00]
0x0a,0x00,0x33,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_le_i16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x33,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_le_i16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x33,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_le_i16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x33,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_le_i16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x33,0xd4,0x7f,0xf8,0x00,0x00]
0x0a,0x00,0x33,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_le_i16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x33,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_le_i16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x33,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_le_i16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x33,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_le_i16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x33,0xd4,0x7c,0xfc,0x00,0x00]
0x0a,0x00,0x33,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_le_i16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x33,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_le_i16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x33,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_le_i16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x33,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_le_i16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x33,0xd4,0xc1,0xfe,0x00,0x00]
0x68,0x00,0x33,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_le_i16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x33,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_le_i16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x33,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_le_i16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x33,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# W64: v_cmp_le_i16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x33,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
0x6a,0x00,0x33,0xd4,0xf0,0xfa,0x00,0x00
+# W32: v_cmp_le_i16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x33,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64: v_cmp_le_i16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x33,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# W32: v_cmp_le_i16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x33,0xd4,0xfd,0xd4,0x00,0x00]
-# W64: v_cmp_le_i16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x33,0xd4,0xfd,0xd4,0x00,0x00]
0x7a,0x00,0x33,0xd4,0xfd,0xd4,0x00,0x00
+# W32: v_cmp_le_i16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x33,0xd4,0xfd,0xd4,0x00,0x00]
+# W64: v_cmp_le_i16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x33,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX11: v_cmp_le_i16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x33,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
0x7c,0x00,0x33,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmp_le_i16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x33,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_le_i32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x43,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_le_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x43,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x43,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_le_i32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x43,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_le_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x43,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_le_i32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x43,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_le_i32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x43,0xd4,0xff,0xff,0x03,0x00]
0x0a,0x00,0x43,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_le_i32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x43,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_le_i32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x43,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_le_i32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x43,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_le_i32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x43,0xd4,0x01,0x04,0x00,0x00]
0x0a,0x00,0x43,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_le_i32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x43,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_le_i32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x43,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_le_i32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x43,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_le_i32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x43,0xd4,0x69,0xd2,0x00,0x00]
0x0a,0x00,0x43,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_le_i32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x43,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_le_i32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x43,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_le_i32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x43,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_le_i32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x43,0xd4,0x6a,0xf6,0x00,0x00]
0x0a,0x00,0x43,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_le_i32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x43,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_le_i32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x43,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_le_i32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x43,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_le_i32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x43,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
0x0a,0x00,0x43,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# W32: v_cmp_le_i32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x43,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_le_i32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x43,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_le_i32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x43,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_le_i32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x43,0xd4,0x7b,0xfa,0x01,0x00]
0x0a,0x00,0x43,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_le_i32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x43,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_le_i32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x43,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_le_i32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x43,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_le_i32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x43,0xd4,0x7d,0xe0,0x01,0x00]
0x0a,0x00,0x43,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_le_i32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x43,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_le_i32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x43,0xd4,0x7d,0xe0,0x01,0x00]
-# W32: v_cmp_le_i32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x43,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_le_i32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x43,0xd4,0x7e,0x82,0x01,0x00]
0x0a,0x00,0x43,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_le_i32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x43,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_le_i32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x43,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_le_i32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x43,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_le_i32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x43,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x00,0x43,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_le_i32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x43,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_le_i32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x43,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_le_i32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x43,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_le_i32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x43,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x43,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_le_i32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x43,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_le_i32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x43,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_le_i32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x43,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_le_i32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x43,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x43,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_le_i32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x43,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_le_i32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x43,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_le_i32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x43,0xd4,0xf0,0xfa,0x00,0x00] -# W64: v_cmp_le_i32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x43,0xd4,0xf0,0xfa,0x00,0x00] 0x6a,0x00,0x43,0xd4,0xf0,0xfa,0x00,0x00 +# W32: v_cmp_le_i32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x43,0xd4,0xf0,0xfa,0x00,0x00] +# W64: v_cmp_le_i32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x43,0xd4,0xf0,0xfa,0x00,0x00] -# W32: v_cmp_le_i32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x43,0xd4,0xfd,0xd4,0x00,0x00] -# W64: v_cmp_le_i32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x43,0xd4,0xfd,0xd4,0x00,0x00] 0x7a,0x00,0x43,0xd4,0xfd,0xd4,0x00,0x00 +# W32: v_cmp_le_i32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x43,0xd4,0xfd,0xd4,0x00,0x00] +# W64: v_cmp_le_i32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x43,0xd4,0xfd,0xd4,0x00,0x00] -# GFX11: v_cmp_le_i32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x43,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7c,0x00,0x43,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmp_le_i32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x43,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_le_i64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x53,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_le_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x53,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x53,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_le_i64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x53,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_le_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x53,0xd4,0x01,0x05,0x02,0x00] +0x0a,0x00,0x53,0xd4,0xfe,0xfd,0x03,0x00 # W32: v_cmp_le_i64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x53,0xd4,0xfe,0xfd,0x03,0x00] # W64: v_cmp_le_i64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x53,0xd4,0xfe,0xfd,0x03,0x00] -0x0a,0x00,0x53,0xd4,0xfe,0xfd,0x03,0x00 -# W32: v_cmp_le_i64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x53,0xd4,0x02,0x08,0x00,0x00] -# W64: v_cmp_le_i64_e64 s[10:11], s[2:3], s[4:5] ; encoding: 
[0x0a,0x00,0x53,0xd4,0x02,0x08,0x00,0x00] 0x0a,0x00,0x53,0xd4,0x02,0x08,0x00,0x00 +# W32: v_cmp_le_i64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x53,0xd4,0x02,0x08,0x00,0x00] +# W64: v_cmp_le_i64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x53,0xd4,0x02,0x08,0x00,0x00] +0x0a,0x00,0x53,0xd4,0x68,0xd0,0x00,0x00 # W32: v_cmp_le_i64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x53,0xd4,0x68,0xd0,0x00,0x00] # W64: v_cmp_le_i64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x53,0xd4,0x68,0xd0,0x00,0x00] -0x0a,0x00,0x53,0xd4,0x68,0xd0,0x00,0x00 -# W32: v_cmp_le_i64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x53,0xd4,0x6a,0xf4,0x00,0x00] -# W64: v_cmp_le_i64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x53,0xd4,0x6a,0xf4,0x00,0x00] 0x0a,0x00,0x53,0xd4,0x6a,0xf4,0x00,0x00 +# W32: v_cmp_le_i64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x53,0xd4,0x6a,0xf4,0x00,0x00] +# W64: v_cmp_le_i64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x53,0xd4,0x6a,0xf4,0x00,0x00] +0x0a,0x00,0x53,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_le_i64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x53,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: v_cmp_le_i64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x53,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x53,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_le_i64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x53,0xd4,0x7e,0xfa,0x01,0x00] -# W64: v_cmp_le_i64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x53,0xd4,0x7e,0xfa,0x01,0x00] 0x0a,0x00,0x53,0xd4,0x7e,0xfa,0x01,0x00 +# W32: v_cmp_le_i64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x53,0xd4,0x7e,0xfa,0x01,0x00] +# W64: v_cmp_le_i64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x53,0xd4,0x7e,0xfa,0x01,0x00] -# W32: v_cmp_le_i64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x53,0xd4,0x7c,0xe0,0x01,0x00] -# W64: v_cmp_le_i64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x53,0xd4,0x7c,0xe0,0x01,0x00] 0x0a,0x00,0x53,0xd4,0x7c,0xe0,0x01,0x00 +# W32: v_cmp_le_i64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x53,0xd4,0x7c,0xe0,0x01,0x00] +# W64: v_cmp_le_i64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x53,0xd4,0x7c,0xe0,0x01,0x00] -# W32: v_cmp_le_i64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x53,0xd4,0xc1,0x82,0x01,0x00] -# W64: v_cmp_le_i64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x53,0xd4,0xc1,0x82,0x01,0x00] 0x68,0x00,0x53,0xd4,0xc1,0x82,0x01,0x00 +# W32: v_cmp_le_i64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x53,0xd4,0xc1,0x82,0x01,0x00] +# W64: v_cmp_le_i64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x53,0xd4,0xc1,0x82,0x01,0x00] -# W32: v_cmp_le_i64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x53,0xd4,0xf0,0xf8,0x00,0x00] -# W64: v_cmp_le_i64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x53,0xd4,0xf0,0xf8,0x00,0x00] 0x6a,0x00,0x53,0xd4,0xf0,0xf8,0x00,0x00 +# W32: v_cmp_le_i64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x53,0xd4,0xf0,0xf8,0x00,0x00] +# W64: v_cmp_le_i64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x53,0xd4,0xf0,0xf8,0x00,0x00] -# W32: v_cmp_le_i64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x53,0xd4,0xfd,0xfc,0x00,0x00] -# W64: v_cmp_le_i64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x53,0xd4,0xfd,0xfc,0x00,0x00] 0x7a,0x00,0x53,0xd4,0xfd,0xfc,0x00,0x00 +# W32: v_cmp_le_i64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x53,0xd4,0xfd,0xfc,0x00,0x00] +# W64: v_cmp_le_i64_e64 ttmp[14:15], src_scc, exec ; encoding: 
[0x7a,0x00,0x53,0xd4,0xfd,0xfc,0x00,0x00] -# GFX11: v_cmp_le_i64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x53,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7c,0x00,0x53,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmp_le_i64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x53,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_le_u16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x3b,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_le_u16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x3b,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x3b,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_le_u16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x3b,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_le_u16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x3b,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_le_u16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x3b,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_le_u16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x3b,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x3b,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_le_u16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x3b,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_le_u16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x3b,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_le_u16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x3b,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_le_u16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x3b,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x3b,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_le_u16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x3b,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_le_u16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x3b,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_le_u16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x3b,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_le_u16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x3b,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x3b,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_le_u16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x3b,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_le_u16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x3b,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_le_u16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3b,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_le_u16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3b,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x3b,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_le_u16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3b,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_le_u16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3b,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_le_u16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3b,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_le_u16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3b,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x0a,0x00,0x3b,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_le_u16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3b,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_le_u16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3b,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_le_u16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x3b,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_le_u16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x3b,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x3b,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_le_u16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x3b,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_le_u16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x3b,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_le_u16_e64 s10, m0, 0x3800 ; encoding: 
[0x0a,0x00,0x3b,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] -# W64: v_cmp_le_u16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x3b,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] 0x0a,0x00,0x3b,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_le_u16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x3b,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64: v_cmp_le_u16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x3b,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] -# W32: v_cmp_le_u16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x3b,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_le_u16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x3b,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x3b,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_le_u16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x3b,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_le_u16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x3b,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_le_u16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x3b,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_le_u16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x3b,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x00,0x3b,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_le_u16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x3b,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_le_u16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x3b,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_le_u16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x3b,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_le_u16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x3b,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x3b,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_le_u16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x3b,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_le_u16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x3b,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_le_u16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x3b,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_le_u16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x3b,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x3b,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_le_u16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x3b,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_le_u16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x3b,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_le_u16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x3b,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] -# W64: v_cmp_le_u16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x3b,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] 0x6a,0x00,0x3b,0xd4,0xf0,0xfa,0x00,0x00 +# W32: v_cmp_le_u16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x3b,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64: v_cmp_le_u16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x3b,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] -# W32: v_cmp_le_u16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3b,0xd4,0xfd,0xd4,0x00,0x00] -# W64: v_cmp_le_u16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3b,0xd4,0xfd,0xd4,0x00,0x00] 0x7a,0x00,0x3b,0xd4,0xfd,0xd4,0x00,0x00 +# W32: v_cmp_le_u16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3b,0xd4,0xfd,0xd4,0x00,0x00] +# W64: v_cmp_le_u16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3b,0xd4,0xfd,0xd4,0x00,0x00] -# GFX11: v_cmp_le_u16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x3b,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7c,0x00,0x3b,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmp_le_u16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x3b,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_le_u32_e64 s10, v1, v2 ; encoding: 
[0x0a,0x00,0x4b,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_le_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x4b,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x4b,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_le_u32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x4b,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_le_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x4b,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_le_u32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x4b,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_le_u32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x4b,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x4b,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_le_u32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x4b,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_le_u32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x4b,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_le_u32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x4b,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_le_u32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x4b,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x4b,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_le_u32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x4b,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_le_u32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x4b,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_le_u32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x4b,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_le_u32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x4b,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x4b,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_le_u32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x4b,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_le_u32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x4b,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_le_u32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4b,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_le_u32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4b,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x4b,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_le_u32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4b,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_le_u32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4b,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_le_u32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4b,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W64: v_cmp_le_u32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4b,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x0a,0x00,0x4b,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# W32: v_cmp_le_u32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4b,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] +# W64: v_cmp_le_u32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4b,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_le_u32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x4b,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_le_u32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x4b,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x4b,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_le_u32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x4b,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_le_u32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x4b,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_le_u32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x4b,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_le_u32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x4b,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x4b,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_le_u32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x4b,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_le_u32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x4b,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_le_u32_e64 s10, exec_lo, 
-1 ; encoding: [0x0a,0x00,0x4b,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_le_u32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x4b,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x4b,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_le_u32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x4b,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_le_u32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x4b,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_le_u32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x4b,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_le_u32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x4b,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x00,0x4b,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_le_u32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x4b,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_le_u32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x4b,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_le_u32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x4b,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_le_u32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x4b,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x4b,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_le_u32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x4b,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_le_u32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x4b,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_le_u32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x4b,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_le_u32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x4b,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x4b,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_le_u32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x4b,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_le_u32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x4b,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_le_u32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x4b,0xd4,0xf0,0xfa,0x00,0x00] -# W64: v_cmp_le_u32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x4b,0xd4,0xf0,0xfa,0x00,0x00] 0x6a,0x00,0x4b,0xd4,0xf0,0xfa,0x00,0x00 +# W32: v_cmp_le_u32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x4b,0xd4,0xf0,0xfa,0x00,0x00] +# W64: v_cmp_le_u32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x4b,0xd4,0xf0,0xfa,0x00,0x00] -# W32: v_cmp_le_u32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4b,0xd4,0xfd,0xd4,0x00,0x00] -# W64: v_cmp_le_u32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4b,0xd4,0xfd,0xd4,0x00,0x00] 0x7a,0x00,0x4b,0xd4,0xfd,0xd4,0x00,0x00 +# W32: v_cmp_le_u32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4b,0xd4,0xfd,0xd4,0x00,0x00] +# W64: v_cmp_le_u32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4b,0xd4,0xfd,0xd4,0x00,0x00] -# GFX11: v_cmp_le_u32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x4b,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7c,0x00,0x4b,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmp_le_u32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x4b,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_le_u64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5b,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_le_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5b,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x5b,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_le_u64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5b,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_le_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5b,0xd4,0x01,0x05,0x02,0x00] +0x0a,0x00,0x5b,0xd4,0xfe,0xfd,0x03,0x00 # W32: v_cmp_le_u64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x5b,0xd4,0xfe,0xfd,0x03,0x00] # W64: v_cmp_le_u64_e64 s[10:11], v[254:255], 
v[254:255] ; encoding: [0x0a,0x00,0x5b,0xd4,0xfe,0xfd,0x03,0x00] -0x0a,0x00,0x5b,0xd4,0xfe,0xfd,0x03,0x00 -# W32: v_cmp_le_u64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5b,0xd4,0x02,0x08,0x00,0x00] -# W64: v_cmp_le_u64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5b,0xd4,0x02,0x08,0x00,0x00] 0x0a,0x00,0x5b,0xd4,0x02,0x08,0x00,0x00 +# W32: v_cmp_le_u64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5b,0xd4,0x02,0x08,0x00,0x00] +# W64: v_cmp_le_u64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5b,0xd4,0x02,0x08,0x00,0x00] +0x0a,0x00,0x5b,0xd4,0x68,0xd0,0x00,0x00 # W32: v_cmp_le_u64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x5b,0xd4,0x68,0xd0,0x00,0x00] # W64: v_cmp_le_u64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x5b,0xd4,0x68,0xd0,0x00,0x00] -0x0a,0x00,0x5b,0xd4,0x68,0xd0,0x00,0x00 -# W32: v_cmp_le_u64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5b,0xd4,0x6a,0xf4,0x00,0x00] -# W64: v_cmp_le_u64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5b,0xd4,0x6a,0xf4,0x00,0x00] 0x0a,0x00,0x5b,0xd4,0x6a,0xf4,0x00,0x00 +# W32: v_cmp_le_u64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5b,0xd4,0x6a,0xf4,0x00,0x00] +# W64: v_cmp_le_u64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5b,0xd4,0x6a,0xf4,0x00,0x00] +0x0a,0x00,0x5b,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_le_u64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x5b,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: v_cmp_le_u64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x5b,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x5b,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_le_u64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x5b,0xd4,0x7e,0xfa,0x01,0x00] -# W64: v_cmp_le_u64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x5b,0xd4,0x7e,0xfa,0x01,0x00] 0x0a,0x00,0x5b,0xd4,0x7e,0xfa,0x01,0x00 +# W32: v_cmp_le_u64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x5b,0xd4,0x7e,0xfa,0x01,0x00] +# W64: v_cmp_le_u64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x5b,0xd4,0x7e,0xfa,0x01,0x00] -# W32: v_cmp_le_u64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x5b,0xd4,0x7c,0xe0,0x01,0x00] -# W64: v_cmp_le_u64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x5b,0xd4,0x7c,0xe0,0x01,0x00] 0x0a,0x00,0x5b,0xd4,0x7c,0xe0,0x01,0x00 +# W32: v_cmp_le_u64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x5b,0xd4,0x7c,0xe0,0x01,0x00] +# W64: v_cmp_le_u64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x5b,0xd4,0x7c,0xe0,0x01,0x00] -# W32: v_cmp_le_u64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x5b,0xd4,0xc1,0x82,0x01,0x00] -# W64: v_cmp_le_u64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x5b,0xd4,0xc1,0x82,0x01,0x00] 0x68,0x00,0x5b,0xd4,0xc1,0x82,0x01,0x00 +# W32: v_cmp_le_u64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x5b,0xd4,0xc1,0x82,0x01,0x00] +# W64: v_cmp_le_u64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x5b,0xd4,0xc1,0x82,0x01,0x00] -# W32: v_cmp_le_u64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x5b,0xd4,0xf0,0xf8,0x00,0x00] -# W64: v_cmp_le_u64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x5b,0xd4,0xf0,0xf8,0x00,0x00] 0x6a,0x00,0x5b,0xd4,0xf0,0xf8,0x00,0x00 +# W32: v_cmp_le_u64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x5b,0xd4,0xf0,0xf8,0x00,0x00] +# W64: v_cmp_le_u64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x5b,0xd4,0xf0,0xf8,0x00,0x00] -# W32: v_cmp_le_u64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x5b,0xd4,0xfd,0xfc,0x00,0x00] -# W64: v_cmp_le_u64_e64 ttmp[14:15], src_scc, exec ; 
encoding: [0x7a,0x00,0x5b,0xd4,0xfd,0xfc,0x00,0x00] 0x7a,0x00,0x5b,0xd4,0xfd,0xfc,0x00,0x00 +# W32: v_cmp_le_u64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x5b,0xd4,0xfd,0xfc,0x00,0x00] +# W64: v_cmp_le_u64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x5b,0xd4,0xfd,0xfc,0x00,0x00] -# GFX11: v_cmp_le_u64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x5b,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7c,0x00,0x5b,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmp_le_u64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x5b,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_lg_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x05,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_lg_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x05,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x05,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_lg_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x05,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_lg_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x05,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_lg_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x05,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_lg_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x05,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x05,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_lg_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x05,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_lg_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x05,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_lg_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x05,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_lg_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x05,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x05,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_lg_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x05,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_lg_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x05,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_lg_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x05,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_lg_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x05,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x05,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_lg_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x05,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_lg_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x05,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_lg_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x05,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_lg_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x05,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x05,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_lg_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x05,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_lg_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x05,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_lg_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x05,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_lg_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x05,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x0a,0x00,0x05,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_lg_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x05,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_lg_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x05,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_lg_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x05,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_lg_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x05,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x05,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_lg_f16_e64 
s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x05,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_lg_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x05,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_lg_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x05,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_lg_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x05,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x05,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_lg_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x05,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_lg_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x05,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_lg_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x05,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_lg_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x05,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x05,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_lg_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x05,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_lg_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x05,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_lg_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x05,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_lg_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x05,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x01,0x05,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_lg_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x05,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_lg_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x05,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_lg_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x05,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_lg_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x05,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x05,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_lg_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x05,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_lg_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x05,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_lg_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x05,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_lg_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x05,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x05,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_lg_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x05,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_lg_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x05,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_lg_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x05,0xd4,0xf0,0xfa,0x00,0x40] -# W64: v_cmp_lg_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x05,0xd4,0xf0,0xfa,0x00,0x40] 0x6a,0x00,0x05,0xd4,0xf0,0xfa,0x00,0x40 +# W32: v_cmp_lg_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x05,0xd4,0xf0,0xfa,0x00,0x40] +# W64: v_cmp_lg_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x05,0xd4,0xf0,0xfa,0x00,0x40] +0x7a,0x02,0x05,0xd4,0xfd,0xd4,0x00,0x20 # W32: v_cmp_lg_f16_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x05,0xd4,0xfd,0xd4,0x00,0x20] # W64: v_cmp_lg_f16_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x05,0xd4,0xfd,0xd4,0x00,0x20] -0x7a,0x02,0x05,0xd4,0xfd,0xd4,0x00,0x20 -# GFX11: v_cmp_lg_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x05,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7c,0x83,0x05,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmp_lg_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x05,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_lg_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x15,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_lg_f32_e64 s[10:11], v1, v2 ; encoding: 
[0x0a,0x00,0x15,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x15,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_lg_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x15,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_lg_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x15,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_lg_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x15,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_lg_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x15,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x15,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_lg_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x15,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_lg_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x15,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_lg_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x15,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_lg_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x15,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x15,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_lg_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x15,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_lg_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x15,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_lg_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x15,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_lg_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x15,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x15,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_lg_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x15,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_lg_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x15,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_lg_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x15,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_lg_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x15,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x15,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_lg_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x15,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_lg_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x15,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_lg_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x15,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W64: v_cmp_lg_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x15,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x0a,0x00,0x15,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# W32: v_cmp_lg_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x15,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] +# W64: v_cmp_lg_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x15,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_lg_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x15,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_lg_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x15,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x15,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_lg_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x15,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_lg_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x15,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_lg_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x15,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_lg_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x15,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x15,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_lg_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x15,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_lg_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x15,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_lg_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x15,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_lg_f32_e64 s[10:11], 
exec_lo, -1 ; encoding: [0x0a,0x00,0x15,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x15,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_lg_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x15,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_lg_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x15,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_lg_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x15,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_lg_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x15,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x01,0x15,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_lg_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x15,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_lg_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x15,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_lg_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x15,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_lg_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x15,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x15,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_lg_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x15,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_lg_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x15,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_lg_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x15,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_lg_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x15,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x15,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_lg_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x15,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_lg_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x15,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_lg_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x15,0xd4,0xf0,0xfa,0x00,0x40] -# W64: v_cmp_lg_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x15,0xd4,0xf0,0xfa,0x00,0x40] 0x6a,0x00,0x15,0xd4,0xf0,0xfa,0x00,0x40 +# W32: v_cmp_lg_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x15,0xd4,0xf0,0xfa,0x00,0x40] +# W64: v_cmp_lg_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x15,0xd4,0xf0,0xfa,0x00,0x40] +0x7a,0x02,0x15,0xd4,0xfd,0xd4,0x00,0x20 # W32: v_cmp_lg_f32_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x15,0xd4,0xfd,0xd4,0x00,0x20] # W64: v_cmp_lg_f32_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x15,0xd4,0xfd,0xd4,0x00,0x20] -0x7a,0x02,0x15,0xd4,0xfd,0xd4,0x00,0x20 -# GFX11: v_cmp_lg_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x15,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] 0x7c,0x83,0x15,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf +# GFX11: v_cmp_lg_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x15,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] -# W32: v_cmp_lg_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x25,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_lg_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x25,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x25,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_lg_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x25,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_lg_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x25,0xd4,0x01,0x05,0x02,0x00] +0x0a,0x00,0x25,0xd4,0xfe,0xfd,0x03,0x00 # W32: v_cmp_lg_f64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x25,0xd4,0xfe,0xfd,0x03,0x00] # W64: v_cmp_lg_f64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x25,0xd4,0xfe,0xfd,0x03,0x00] -0x0a,0x00,0x25,0xd4,0xfe,0xfd,0x03,0x00 -# W32: v_cmp_lg_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x25,0xd4,0x02,0x08,0x00,0x00] -# W64: v_cmp_lg_f64_e64 
s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x25,0xd4,0x02,0x08,0x00,0x00] 0x0a,0x00,0x25,0xd4,0x02,0x08,0x00,0x00 +# W32: v_cmp_lg_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x25,0xd4,0x02,0x08,0x00,0x00] +# W64: v_cmp_lg_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x25,0xd4,0x02,0x08,0x00,0x00] +0x0a,0x00,0x25,0xd4,0x68,0xd0,0x00,0x00 # W32: v_cmp_lg_f64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x25,0xd4,0x68,0xd0,0x00,0x00] # W64: v_cmp_lg_f64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x25,0xd4,0x68,0xd0,0x00,0x00] -0x0a,0x00,0x25,0xd4,0x68,0xd0,0x00,0x00 -# W32: v_cmp_lg_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x25,0xd4,0x6a,0xf4,0x00,0x00] -# W64: v_cmp_lg_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x25,0xd4,0x6a,0xf4,0x00,0x00] 0x0a,0x00,0x25,0xd4,0x6a,0xf4,0x00,0x00 +# W32: v_cmp_lg_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x25,0xd4,0x6a,0xf4,0x00,0x00] +# W64: v_cmp_lg_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x25,0xd4,0x6a,0xf4,0x00,0x00] +0x0a,0x00,0x25,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_lg_f64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x25,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: v_cmp_lg_f64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x25,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x25,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_lg_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x25,0xd4,0x7e,0xfa,0x01,0x20] -# W64: v_cmp_lg_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x25,0xd4,0x7e,0xfa,0x01,0x20] 0x0a,0x01,0x25,0xd4,0x7e,0xfa,0x01,0x20 +# W32: v_cmp_lg_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x25,0xd4,0x7e,0xfa,0x01,0x20] +# W64: v_cmp_lg_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x25,0xd4,0x7e,0xfa,0x01,0x20] -# W32: v_cmp_lg_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x25,0xd4,0x7c,0xe0,0x01,0x00] -# W64: v_cmp_lg_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x25,0xd4,0x7c,0xe0,0x01,0x00] 0x0a,0x00,0x25,0xd4,0x7c,0xe0,0x01,0x00 +# W32: v_cmp_lg_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x25,0xd4,0x7c,0xe0,0x01,0x00] +# W64: v_cmp_lg_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x25,0xd4,0x7c,0xe0,0x01,0x00] -# W32: v_cmp_lg_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x25,0xd4,0xc1,0x82,0x01,0x00] -# W64: v_cmp_lg_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x25,0xd4,0xc1,0x82,0x01,0x00] 0x68,0x00,0x25,0xd4,0xc1,0x82,0x01,0x00 +# W32: v_cmp_lg_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x25,0xd4,0xc1,0x82,0x01,0x00] +# W64: v_cmp_lg_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x25,0xd4,0xc1,0x82,0x01,0x00] -# W32: v_cmp_lg_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x25,0xd4,0xf0,0xf8,0x00,0x00] -# W64: v_cmp_lg_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x25,0xd4,0xf0,0xf8,0x00,0x00] 0x6a,0x00,0x25,0xd4,0xf0,0xf8,0x00,0x00 +# W32: v_cmp_lg_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x25,0xd4,0xf0,0xf8,0x00,0x00] +# W64: v_cmp_lg_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x25,0xd4,0xf0,0xf8,0x00,0x00] +0x7a,0x03,0x25,0xd4,0xfd,0xfc,0x00,0x60 # W32: v_cmp_lg_f64_e64 ttmp14, -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x25,0xd4,0xfd,0xfc,0x00,0x60] # W64: v_cmp_lg_f64_e64 ttmp[14:15], -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x25,0xd4,0xfd,0xfc,0x00,0x60] -0x7a,0x03,0x25,0xd4,0xfd,0xfc,0x00,0x60 -# GFX11: v_cmp_lg_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: 
[0x7c,0x82,0x25,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7c,0x82,0x25,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf +# GFX11: v_cmp_lg_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x25,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -# W32: v_cmp_lt_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x01,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_lt_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x01,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x01,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_lt_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x01,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_lt_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x01,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_lt_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x01,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_lt_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x01,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x01,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_lt_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x01,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_lt_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x01,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_lt_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x01,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_lt_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x01,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x01,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_lt_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x01,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_lt_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x01,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_lt_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x01,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_lt_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x01,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x01,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_lt_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x01,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_lt_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x01,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_lt_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x01,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_lt_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x01,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x01,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_lt_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x01,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_lt_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x01,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_lt_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x01,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_lt_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x01,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x0a,0x00,0x01,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_lt_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x01,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_lt_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x01,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_lt_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x01,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_lt_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x01,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x01,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_lt_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x01,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_lt_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x01,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_lt_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x01,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_lt_f16_e64 s[10:11], m0, 0.5 ; encoding: 
[0x0a,0x00,0x01,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x01,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_lt_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x01,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_lt_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x01,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_lt_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x01,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_lt_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x01,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x01,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_lt_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x01,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_lt_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x01,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_lt_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x01,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_lt_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x01,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x01,0x01,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_lt_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x01,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_lt_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x01,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_lt_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x01,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_lt_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x01,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x01,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_lt_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x01,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_lt_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x01,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_lt_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x01,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_lt_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x01,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x01,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_lt_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x01,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_lt_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x01,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_lt_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x01,0xd4,0xf0,0xfa,0x00,0x40] -# W64: v_cmp_lt_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x01,0xd4,0xf0,0xfa,0x00,0x40] 0x6a,0x00,0x01,0xd4,0xf0,0xfa,0x00,0x40 +# W32: v_cmp_lt_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x01,0xd4,0xf0,0xfa,0x00,0x40] +# W64: v_cmp_lt_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x01,0xd4,0xf0,0xfa,0x00,0x40] +0x7a,0x02,0x01,0xd4,0xfd,0xd4,0x00,0x20 # W32: v_cmp_lt_f16_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x01,0xd4,0xfd,0xd4,0x00,0x20] # W64: v_cmp_lt_f16_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x01,0xd4,0xfd,0xd4,0x00,0x20] -0x7a,0x02,0x01,0xd4,0xfd,0xd4,0x00,0x20 -# GFX11: v_cmp_lt_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x01,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7c,0x83,0x01,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmp_lt_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x01,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_lt_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x11,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_lt_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x11,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x11,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_lt_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x11,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_lt_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x11,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_lt_f32_e64 s10, v255, v255 ; encoding: 
[0x0a,0x00,0x11,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_lt_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x11,0xd4,0xff,0xff,0x03,0x00]
 0x0a,0x00,0x11,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_lt_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x11,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_lt_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x11,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_lt_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x11,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_lt_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x11,0xd4,0x01,0x04,0x00,0x00]
 0x0a,0x00,0x11,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_lt_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x11,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_lt_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x11,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_lt_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x11,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_lt_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x11,0xd4,0x69,0xd2,0x00,0x00]
 0x0a,0x00,0x11,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_lt_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x11,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_lt_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x11,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_lt_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x11,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_lt_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x11,0xd4,0x6a,0xf6,0x00,0x00]
 0x0a,0x00,0x11,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_lt_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x11,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_lt_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x11,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_lt_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x11,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_lt_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x11,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x0a,0x00,0x11,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# W32: v_cmp_lt_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x11,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_lt_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x11,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_lt_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x11,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_lt_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x11,0xd4,0x7b,0xfa,0x01,0x00]
 0x0a,0x00,0x11,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_lt_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x11,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_lt_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x11,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_lt_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x11,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_lt_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x11,0xd4,0x7d,0xe0,0x01,0x00]
 0x0a,0x00,0x11,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_lt_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x11,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_lt_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x11,0xd4,0x7d,0xe0,0x01,0x00]
-# W32: v_cmp_lt_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x11,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_lt_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x11,0xd4,0x7e,0x82,0x01,0x00]
 0x0a,0x00,0x11,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_lt_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x11,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_lt_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x11,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_lt_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x11,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_lt_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x11,0xd4,0x7f,0xf8,0x00,0x00]
 0x0a,0x01,0x11,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_lt_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x11,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_lt_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x11,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_lt_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x11,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_lt_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x11,0xd4,0x7c,0xfc,0x00,0x00]
 0x0a,0x00,0x11,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_lt_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x11,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_lt_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x11,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_lt_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x11,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_lt_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x11,0xd4,0xc1,0xfe,0x00,0x00]
 0x68,0x00,0x11,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_lt_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x11,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_lt_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x11,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_lt_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x11,0xd4,0xf0,0xfa,0x00,0x40]
-# W64: v_cmp_lt_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x11,0xd4,0xf0,0xfa,0x00,0x40]
 0x6a,0x00,0x11,0xd4,0xf0,0xfa,0x00,0x40
+# W32: v_cmp_lt_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x11,0xd4,0xf0,0xfa,0x00,0x40]
+# W64: v_cmp_lt_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x11,0xd4,0xf0,0xfa,0x00,0x40]
+0x7a,0x02,0x11,0xd4,0xfd,0xd4,0x00,0x20
 # W32: v_cmp_lt_f32_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x11,0xd4,0xfd,0xd4,0x00,0x20]
 # W64: v_cmp_lt_f32_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x11,0xd4,0xfd,0xd4,0x00,0x20]
-0x7a,0x02,0x11,0xd4,0xfd,0xd4,0x00,0x20
-# GFX11: v_cmp_lt_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x11,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
 0x7c,0x83,0x11,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_lt_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x11,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_lt_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x21,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_lt_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x21,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x21,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_lt_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x21,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_lt_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x21,0xd4,0x01,0x05,0x02,0x00]
+0x0a,0x00,0x21,0xd4,0xfe,0xfd,0x03,0x00
 # W32: v_cmp_lt_f64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x21,0xd4,0xfe,0xfd,0x03,0x00]
 # W64: v_cmp_lt_f64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x21,0xd4,0xfe,0xfd,0x03,0x00]
-0x0a,0x00,0x21,0xd4,0xfe,0xfd,0x03,0x00
-# W32: v_cmp_lt_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x21,0xd4,0x02,0x08,0x00,0x00]
-# W64: v_cmp_lt_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x21,0xd4,0x02,0x08,0x00,0x00]
 0x0a,0x00,0x21,0xd4,0x02,0x08,0x00,0x00
+# W32: v_cmp_lt_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x21,0xd4,0x02,0x08,0x00,0x00]
+# W64: v_cmp_lt_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x21,0xd4,0x02,0x08,0x00,0x00]
+0x0a,0x00,0x21,0xd4,0x68,0xd0,0x00,0x00
 # W32: v_cmp_lt_f64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x21,0xd4,0x68,0xd0,0x00,0x00]
 # W64: v_cmp_lt_f64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x21,0xd4,0x68,0xd0,0x00,0x00]
-0x0a,0x00,0x21,0xd4,0x68,0xd0,0x00,0x00
-# W32: v_cmp_lt_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x21,0xd4,0x6a,0xf4,0x00,0x00]
-# W64: v_cmp_lt_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x21,0xd4,0x6a,0xf4,0x00,0x00]
 0x0a,0x00,0x21,0xd4,0x6a,0xf4,0x00,0x00
+# W32: v_cmp_lt_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x21,0xd4,0x6a,0xf4,0x00,0x00]
+# W64: v_cmp_lt_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x21,0xd4,0x6a,0xf4,0x00,0x00]
+0x0a,0x00,0x21,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
 # W32: v_cmp_lt_f64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x21,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 # W64: v_cmp_lt_f64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x21,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x21,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
-# W32: v_cmp_lt_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x21,0xd4,0x7e,0xfa,0x01,0x20]
-# W64: v_cmp_lt_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x21,0xd4,0x7e,0xfa,0x01,0x20]
 0x0a,0x01,0x21,0xd4,0x7e,0xfa,0x01,0x20
+# W32: v_cmp_lt_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x21,0xd4,0x7e,0xfa,0x01,0x20]
+# W64: v_cmp_lt_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x21,0xd4,0x7e,0xfa,0x01,0x20]
-# W32: v_cmp_lt_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x21,0xd4,0x7c,0xe0,0x01,0x00]
-# W64: v_cmp_lt_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x21,0xd4,0x7c,0xe0,0x01,0x00]
 0x0a,0x00,0x21,0xd4,0x7c,0xe0,0x01,0x00
+# W32: v_cmp_lt_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x21,0xd4,0x7c,0xe0,0x01,0x00]
+# W64: v_cmp_lt_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x21,0xd4,0x7c,0xe0,0x01,0x00]
-# W32: v_cmp_lt_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x21,0xd4,0xc1,0x82,0x01,0x00]
-# W64: v_cmp_lt_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x21,0xd4,0xc1,0x82,0x01,0x00]
 0x68,0x00,0x21,0xd4,0xc1,0x82,0x01,0x00
+# W32: v_cmp_lt_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x21,0xd4,0xc1,0x82,0x01,0x00]
+# W64: v_cmp_lt_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x21,0xd4,0xc1,0x82,0x01,0x00]
-# W32: v_cmp_lt_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x21,0xd4,0xf0,0xf8,0x00,0x00]
-# W64: v_cmp_lt_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x21,0xd4,0xf0,0xf8,0x00,0x00]
 0x6a,0x00,0x21,0xd4,0xf0,0xf8,0x00,0x00
+# W32: v_cmp_lt_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x21,0xd4,0xf0,0xf8,0x00,0x00]
+# W64: v_cmp_lt_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x21,0xd4,0xf0,0xf8,0x00,0x00]
+0x7a,0x03,0x21,0xd4,0xfd,0xfc,0x00,0x60
 # W32: v_cmp_lt_f64_e64 ttmp14, -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x21,0xd4,0xfd,0xfc,0x00,0x60]
 # W64: v_cmp_lt_f64_e64 ttmp[14:15], -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x21,0xd4,0xfd,0xfc,0x00,0x60]
-0x7a,0x03,0x21,0xd4,0xfd,0xfc,0x00,0x60
-# GFX11: v_cmp_lt_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x21,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
 0x7c,0x82,0x21,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_lt_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x21,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_lt_i16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x31,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_lt_i16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x31,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x31,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_lt_i16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x31,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_lt_i16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x31,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_lt_i16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x31,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_lt_i16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x31,0xd4,0xff,0xff,0x03,0x00]
 0x0a,0x00,0x31,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_lt_i16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x31,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_lt_i16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x31,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_lt_i16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x31,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_lt_i16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x31,0xd4,0x01,0x04,0x00,0x00]
 0x0a,0x00,0x31,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_lt_i16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x31,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_lt_i16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x31,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_lt_i16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x31,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_lt_i16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x31,0xd4,0x69,0xd2,0x00,0x00]
 0x0a,0x00,0x31,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_lt_i16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x31,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_lt_i16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x31,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_lt_i16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x31,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_lt_i16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x31,0xd4,0x6a,0xf6,0x00,0x00]
 0x0a,0x00,0x31,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_lt_i16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x31,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_lt_i16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x31,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_lt_i16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x31,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_lt_i16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x31,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x0a,0x00,0x31,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_lt_i16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x31,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_lt_i16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x31,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_lt_i16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x31,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_lt_i16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x31,0xd4,0x7b,0xfa,0x01,0x00]
 0x0a,0x00,0x31,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_lt_i16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x31,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_lt_i16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x31,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_lt_i16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x31,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# W64: v_cmp_lt_i16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x31,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
 0x0a,0x00,0x31,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_lt_i16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x31,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64: v_cmp_lt_i16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x31,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# W32: v_cmp_lt_i16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x31,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_lt_i16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x31,0xd4,0x7e,0x82,0x01,0x00]
 0x0a,0x00,0x31,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_lt_i16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x31,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_lt_i16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x31,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_lt_i16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x31,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_lt_i16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x31,0xd4,0x7f,0xf8,0x00,0x00]
 0x0a,0x00,0x31,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_lt_i16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x31,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_lt_i16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x31,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_lt_i16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x31,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_lt_i16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x31,0xd4,0x7c,0xfc,0x00,0x00]
 0x0a,0x00,0x31,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_lt_i16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x31,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_lt_i16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x31,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_lt_i16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x31,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_lt_i16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x31,0xd4,0xc1,0xfe,0x00,0x00]
 0x68,0x00,0x31,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_lt_i16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x31,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_lt_i16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x31,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_lt_i16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x31,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# W64: v_cmp_lt_i16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x31,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
 0x6a,0x00,0x31,0xd4,0xf0,0xfa,0x00,0x00
+# W32: v_cmp_lt_i16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x31,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64: v_cmp_lt_i16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x31,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# W32: v_cmp_lt_i16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x31,0xd4,0xfd,0xd4,0x00,0x00]
-# W64: v_cmp_lt_i16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x31,0xd4,0xfd,0xd4,0x00,0x00]
 0x7a,0x00,0x31,0xd4,0xfd,0xd4,0x00,0x00
+# W32: v_cmp_lt_i16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x31,0xd4,0xfd,0xd4,0x00,0x00]
+# W64: v_cmp_lt_i16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x31,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX11: v_cmp_lt_i16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x31,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 0x7c,0x00,0x31,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmp_lt_i16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x31,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_lt_i32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x41,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_lt_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x41,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x41,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_lt_i32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x41,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_lt_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x41,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_lt_i32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x41,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_lt_i32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x41,0xd4,0xff,0xff,0x03,0x00]
 0x0a,0x00,0x41,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_lt_i32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x41,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_lt_i32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x41,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_lt_i32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x41,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_lt_i32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x41,0xd4,0x01,0x04,0x00,0x00]
 0x0a,0x00,0x41,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_lt_i32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x41,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_lt_i32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x41,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_lt_i32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x41,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_lt_i32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x41,0xd4,0x69,0xd2,0x00,0x00]
 0x0a,0x00,0x41,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_lt_i32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x41,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_lt_i32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x41,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_lt_i32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x41,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_lt_i32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x41,0xd4,0x6a,0xf6,0x00,0x00]
 0x0a,0x00,0x41,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_lt_i32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x41,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_lt_i32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x41,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_lt_i32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x41,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_lt_i32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x41,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x0a,0x00,0x41,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# W32: v_cmp_lt_i32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x41,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_lt_i32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x41,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_lt_i32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x41,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_lt_i32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x41,0xd4,0x7b,0xfa,0x01,0x00]
 0x0a,0x00,0x41,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_lt_i32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x41,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_lt_i32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x41,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_lt_i32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x41,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_lt_i32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x41,0xd4,0x7d,0xe0,0x01,0x00]
 0x0a,0x00,0x41,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_lt_i32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x41,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_lt_i32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x41,0xd4,0x7d,0xe0,0x01,0x00]
-# W32: v_cmp_lt_i32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x41,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_lt_i32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x41,0xd4,0x7e,0x82,0x01,0x00]
 0x0a,0x00,0x41,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_lt_i32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x41,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_lt_i32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x41,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_lt_i32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x41,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_lt_i32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x41,0xd4,0x7f,0xf8,0x00,0x00]
 0x0a,0x00,0x41,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_lt_i32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x41,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_lt_i32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x41,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_lt_i32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x41,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_lt_i32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x41,0xd4,0x7c,0xfc,0x00,0x00]
 0x0a,0x00,0x41,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_lt_i32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x41,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_lt_i32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x41,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_lt_i32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x41,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_lt_i32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x41,0xd4,0xc1,0xfe,0x00,0x00]
 0x68,0x00,0x41,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_lt_i32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x41,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_lt_i32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x41,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_lt_i32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x41,0xd4,0xf0,0xfa,0x00,0x00]
-# W64: v_cmp_lt_i32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x41,0xd4,0xf0,0xfa,0x00,0x00]
 0x6a,0x00,0x41,0xd4,0xf0,0xfa,0x00,0x00
+# W32: v_cmp_lt_i32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x41,0xd4,0xf0,0xfa,0x00,0x00]
+# W64: v_cmp_lt_i32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x41,0xd4,0xf0,0xfa,0x00,0x00]
-# W32: v_cmp_lt_i32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x41,0xd4,0xfd,0xd4,0x00,0x00]
-# W64: v_cmp_lt_i32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x41,0xd4,0xfd,0xd4,0x00,0x00]
 0x7a,0x00,0x41,0xd4,0xfd,0xd4,0x00,0x00
+# W32: v_cmp_lt_i32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x41,0xd4,0xfd,0xd4,0x00,0x00]
+# W64: v_cmp_lt_i32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x41,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX11: v_cmp_lt_i32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x41,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7c,0x00,0x41,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_lt_i32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x41,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_lt_i64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x51,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_lt_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x51,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x51,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_lt_i64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x51,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_lt_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x51,0xd4,0x01,0x05,0x02,0x00]
+0x0a,0x00,0x51,0xd4,0xfe,0xfd,0x03,0x00
 # W32: v_cmp_lt_i64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x51,0xd4,0xfe,0xfd,0x03,0x00]
 # W64: v_cmp_lt_i64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x51,0xd4,0xfe,0xfd,0x03,0x00]
-0x0a,0x00,0x51,0xd4,0xfe,0xfd,0x03,0x00
-# W32: v_cmp_lt_i64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x51,0xd4,0x02,0x08,0x00,0x00]
-# W64: v_cmp_lt_i64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x51,0xd4,0x02,0x08,0x00,0x00]
 0x0a,0x00,0x51,0xd4,0x02,0x08,0x00,0x00
+# W32: v_cmp_lt_i64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x51,0xd4,0x02,0x08,0x00,0x00]
+# W64: v_cmp_lt_i64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x51,0xd4,0x02,0x08,0x00,0x00]
+0x0a,0x00,0x51,0xd4,0x68,0xd0,0x00,0x00
 # W32: v_cmp_lt_i64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x51,0xd4,0x68,0xd0,0x00,0x00]
 # W64: v_cmp_lt_i64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x51,0xd4,0x68,0xd0,0x00,0x00]
-0x0a,0x00,0x51,0xd4,0x68,0xd0,0x00,0x00
-# W32: v_cmp_lt_i64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x51,0xd4,0x6a,0xf4,0x00,0x00]
-# W64: v_cmp_lt_i64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x51,0xd4,0x6a,0xf4,0x00,0x00]
 0x0a,0x00,0x51,0xd4,0x6a,0xf4,0x00,0x00
+# W32: v_cmp_lt_i64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x51,0xd4,0x6a,0xf4,0x00,0x00]
+# W64: v_cmp_lt_i64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x51,0xd4,0x6a,0xf4,0x00,0x00]
+0x0a,0x00,0x51,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
 # W32: v_cmp_lt_i64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x51,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 # W64: v_cmp_lt_i64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x51,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x51,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
-# W32: v_cmp_lt_i64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x51,0xd4,0x7e,0xfa,0x01,0x00]
-# W64: v_cmp_lt_i64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x51,0xd4,0x7e,0xfa,0x01,0x00]
 0x0a,0x00,0x51,0xd4,0x7e,0xfa,0x01,0x00
+# W32: v_cmp_lt_i64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x51,0xd4,0x7e,0xfa,0x01,0x00]
+# W64: v_cmp_lt_i64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x51,0xd4,0x7e,0xfa,0x01,0x00]
-# W32: v_cmp_lt_i64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x51,0xd4,0x7c,0xe0,0x01,0x00]
-# W64: v_cmp_lt_i64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x51,0xd4,0x7c,0xe0,0x01,0x00]
 0x0a,0x00,0x51,0xd4,0x7c,0xe0,0x01,0x00
+# W32: v_cmp_lt_i64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x51,0xd4,0x7c,0xe0,0x01,0x00]
+# W64: v_cmp_lt_i64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x51,0xd4,0x7c,0xe0,0x01,0x00]
-# W32: v_cmp_lt_i64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x51,0xd4,0xc1,0x82,0x01,0x00]
-# W64: v_cmp_lt_i64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x51,0xd4,0xc1,0x82,0x01,0x00]
 0x68,0x00,0x51,0xd4,0xc1,0x82,0x01,0x00
+# W32: v_cmp_lt_i64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x51,0xd4,0xc1,0x82,0x01,0x00]
+# W64: v_cmp_lt_i64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x51,0xd4,0xc1,0x82,0x01,0x00]
-# W32: v_cmp_lt_i64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x51,0xd4,0xf0,0xf8,0x00,0x00]
-# W64: v_cmp_lt_i64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x51,0xd4,0xf0,0xf8,0x00,0x00]
 0x6a,0x00,0x51,0xd4,0xf0,0xf8,0x00,0x00
+# W32: v_cmp_lt_i64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x51,0xd4,0xf0,0xf8,0x00,0x00]
+# W64: v_cmp_lt_i64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x51,0xd4,0xf0,0xf8,0x00,0x00]
-# W32: v_cmp_lt_i64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x51,0xd4,0xfd,0xfc,0x00,0x00]
-# W64: v_cmp_lt_i64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x51,0xd4,0xfd,0xfc,0x00,0x00]
 0x7a,0x00,0x51,0xd4,0xfd,0xfc,0x00,0x00
+# W32: v_cmp_lt_i64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x51,0xd4,0xfd,0xfc,0x00,0x00]
+# W64: v_cmp_lt_i64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x51,0xd4,0xfd,0xfc,0x00,0x00]
-# GFX11: v_cmp_lt_i64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x51,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7c,0x00,0x51,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_lt_i64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x51,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_lt_u16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x39,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_lt_u16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x39,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x39,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_lt_u16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x39,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_lt_u16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x39,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_lt_u16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x39,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_lt_u16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x39,0xd4,0xff,0xff,0x03,0x00]
 0x0a,0x00,0x39,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_lt_u16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x39,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_lt_u16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x39,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_lt_u16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x39,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_lt_u16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x39,0xd4,0x01,0x04,0x00,0x00]
 0x0a,0x00,0x39,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_lt_u16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x39,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_lt_u16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x39,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_lt_u16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x39,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_lt_u16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x39,0xd4,0x69,0xd2,0x00,0x00]
 0x0a,0x00,0x39,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_lt_u16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x39,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_lt_u16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x39,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_lt_u16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x39,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_lt_u16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x39,0xd4,0x6a,0xf6,0x00,0x00]
 0x0a,0x00,0x39,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_lt_u16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x39,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_lt_u16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x39,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_lt_u16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x39,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_lt_u16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x39,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x0a,0x00,0x39,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_lt_u16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x39,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_lt_u16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x39,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_lt_u16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x39,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_lt_u16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x39,0xd4,0x7b,0xfa,0x01,0x00]
 0x0a,0x00,0x39,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_lt_u16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x39,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_lt_u16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x39,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_lt_u16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x39,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# W64: v_cmp_lt_u16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x39,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
 0x0a,0x00,0x39,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_lt_u16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x39,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64: v_cmp_lt_u16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x39,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# W32: v_cmp_lt_u16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x39,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_lt_u16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x39,0xd4,0x7e,0x82,0x01,0x00]
 0x0a,0x00,0x39,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_lt_u16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x39,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_lt_u16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x39,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_lt_u16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x39,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_lt_u16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x39,0xd4,0x7f,0xf8,0x00,0x00]
 0x0a,0x00,0x39,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_lt_u16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x39,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_lt_u16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x39,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_lt_u16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x39,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_lt_u16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x39,0xd4,0x7c,0xfc,0x00,0x00]
 0x0a,0x00,0x39,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_lt_u16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x39,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_lt_u16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x39,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_lt_u16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x39,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_lt_u16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x39,0xd4,0xc1,0xfe,0x00,0x00]
 0x68,0x00,0x39,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_lt_u16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x39,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_lt_u16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x39,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_lt_u16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x39,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# W64: v_cmp_lt_u16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x39,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
 0x6a,0x00,0x39,0xd4,0xf0,0xfa,0x00,0x00
+# W32: v_cmp_lt_u16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x39,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64: v_cmp_lt_u16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x39,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# W32: v_cmp_lt_u16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x39,0xd4,0xfd,0xd4,0x00,0x00]
-# W64: v_cmp_lt_u16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x39,0xd4,0xfd,0xd4,0x00,0x00]
 0x7a,0x00,0x39,0xd4,0xfd,0xd4,0x00,0x00
+# W32: v_cmp_lt_u16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x39,0xd4,0xfd,0xd4,0x00,0x00]
+# W64: v_cmp_lt_u16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x39,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX11: v_cmp_lt_u16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x39,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 0x7c,0x00,0x39,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmp_lt_u16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x39,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_lt_u32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x49,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_lt_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x49,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x49,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_lt_u32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x49,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_lt_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x49,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_lt_u32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x49,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_lt_u32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x49,0xd4,0xff,0xff,0x03,0x00]
 0x0a,0x00,0x49,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_lt_u32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x49,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_lt_u32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x49,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_lt_u32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x49,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_lt_u32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x49,0xd4,0x01,0x04,0x00,0x00]
 0x0a,0x00,0x49,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_lt_u32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x49,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_lt_u32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x49,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_lt_u32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x49,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_lt_u32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x49,0xd4,0x69,0xd2,0x00,0x00]
 0x0a,0x00,0x49,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_lt_u32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x49,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_lt_u32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x49,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_lt_u32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x49,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_lt_u32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x49,0xd4,0x6a,0xf6,0x00,0x00]
 0x0a,0x00,0x49,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_lt_u32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x49,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_lt_u32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x49,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_lt_u32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x49,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_lt_u32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x49,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x0a,0x00,0x49,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# W32: v_cmp_lt_u32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x49,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_lt_u32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x49,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_lt_u32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x49,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_lt_u32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x49,0xd4,0x7b,0xfa,0x01,0x00]
 0x0a,0x00,0x49,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_lt_u32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x49,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_lt_u32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x49,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_lt_u32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x49,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_lt_u32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x49,0xd4,0x7d,0xe0,0x01,0x00]
 0x0a,0x00,0x49,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_lt_u32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x49,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_lt_u32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x49,0xd4,0x7d,0xe0,0x01,0x00]
-# W32: v_cmp_lt_u32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x49,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_lt_u32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x49,0xd4,0x7e,0x82,0x01,0x00]
 0x0a,0x00,0x49,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_lt_u32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x49,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_lt_u32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x49,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_lt_u32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x49,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_lt_u32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x49,0xd4,0x7f,0xf8,0x00,0x00]
 0x0a,0x00,0x49,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_lt_u32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x49,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_lt_u32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x49,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_lt_u32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x49,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_lt_u32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x49,0xd4,0x7c,0xfc,0x00,0x00]
 0x0a,0x00,0x49,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_lt_u32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x49,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_lt_u32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x49,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_lt_u32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x49,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_lt_u32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x49,0xd4,0xc1,0xfe,0x00,0x00]
 0x68,0x00,0x49,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_lt_u32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x49,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_lt_u32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x49,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_lt_u32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x49,0xd4,0xf0,0xfa,0x00,0x00]
-# W64: v_cmp_lt_u32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x49,0xd4,0xf0,0xfa,0x00,0x00]
 0x6a,0x00,0x49,0xd4,0xf0,0xfa,0x00,0x00
+# W32: v_cmp_lt_u32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x49,0xd4,0xf0,0xfa,0x00,0x00]
+# W64: v_cmp_lt_u32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x49,0xd4,0xf0,0xfa,0x00,0x00]
-# W32: v_cmp_lt_u32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x49,0xd4,0xfd,0xd4,0x00,0x00]
-# W64: v_cmp_lt_u32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x49,0xd4,0xfd,0xd4,0x00,0x00]
 0x7a,0x00,0x49,0xd4,0xfd,0xd4,0x00,0x00
+# W32: v_cmp_lt_u32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x49,0xd4,0xfd,0xd4,0x00,0x00]
+# W64: v_cmp_lt_u32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x49,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX11: v_cmp_lt_u32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x49,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7c,0x00,0x49,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_lt_u32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x49,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_lt_u64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x59,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_lt_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x59,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x59,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_lt_u64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x59,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_lt_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x59,0xd4,0x01,0x05,0x02,0x00]
+0x0a,0x00,0x59,0xd4,0xfe,0xfd,0x03,0x00
 # W32: v_cmp_lt_u64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x59,0xd4,0xfe,0xfd,0x03,0x00]
 # W64: v_cmp_lt_u64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x59,0xd4,0xfe,0xfd,0x03,0x00]
-0x0a,0x00,0x59,0xd4,0xfe,0xfd,0x03,0x00
-# W32: v_cmp_lt_u64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x59,0xd4,0x02,0x08,0x00,0x00]
-# W64: v_cmp_lt_u64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x59,0xd4,0x02,0x08,0x00,0x00]
 0x0a,0x00,0x59,0xd4,0x02,0x08,0x00,0x00
+# W32: v_cmp_lt_u64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x59,0xd4,0x02,0x08,0x00,0x00]
+# W64: v_cmp_lt_u64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x59,0xd4,0x02,0x08,0x00,0x00]
+0x0a,0x00,0x59,0xd4,0x68,0xd0,0x00,0x00
 # W32: v_cmp_lt_u64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x59,0xd4,0x68,0xd0,0x00,0x00]
 # W64: v_cmp_lt_u64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x59,0xd4,0x68,0xd0,0x00,0x00]
-0x0a,0x00,0x59,0xd4,0x68,0xd0,0x00,0x00
-# W32: v_cmp_lt_u64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x59,0xd4,0x6a,0xf4,0x00,0x00]
-# W64: v_cmp_lt_u64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x59,0xd4,0x6a,0xf4,0x00,0x00]
 0x0a,0x00,0x59,0xd4,0x6a,0xf4,0x00,0x00
+# W32: v_cmp_lt_u64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x59,0xd4,0x6a,0xf4,0x00,0x00]
+# W64: v_cmp_lt_u64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x59,0xd4,0x6a,0xf4,0x00,0x00]
+0x0a,0x00,0x59,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
 # W32: v_cmp_lt_u64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x59,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 # W64: v_cmp_lt_u64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x59,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x59,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
-# W32: v_cmp_lt_u64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x59,0xd4,0x7e,0xfa,0x01,0x00]
-# W64: v_cmp_lt_u64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x59,0xd4,0x7e,0xfa,0x01,0x00]
 0x0a,0x00,0x59,0xd4,0x7e,0xfa,0x01,0x00
+# W32: v_cmp_lt_u64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x59,0xd4,0x7e,0xfa,0x01,0x00]
+# W64: v_cmp_lt_u64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x59,0xd4,0x7e,0xfa,0x01,0x00]
-# W32: v_cmp_lt_u64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x59,0xd4,0x7c,0xe0,0x01,0x00]
-# W64: v_cmp_lt_u64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x59,0xd4,0x7c,0xe0,0x01,0x00]
 0x0a,0x00,0x59,0xd4,0x7c,0xe0,0x01,0x00
+# W32: v_cmp_lt_u64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x59,0xd4,0x7c,0xe0,0x01,0x00]
+# W64: v_cmp_lt_u64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x59,0xd4,0x7c,0xe0,0x01,0x00]
-# W32: v_cmp_lt_u64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x59,0xd4,0xc1,0x82,0x01,0x00]
-# W64: v_cmp_lt_u64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x59,0xd4,0xc1,0x82,0x01,0x00]
 0x68,0x00,0x59,0xd4,0xc1,0x82,0x01,0x00
+# W32: v_cmp_lt_u64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x59,0xd4,0xc1,0x82,0x01,0x00]
+# W64: v_cmp_lt_u64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x59,0xd4,0xc1,0x82,0x01,0x00]
-# W32: v_cmp_lt_u64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x59,0xd4,0xf0,0xf8,0x00,0x00]
-# W64: v_cmp_lt_u64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x59,0xd4,0xf0,0xf8,0x00,0x00]
 0x6a,0x00,0x59,0xd4,0xf0,0xf8,0x00,0x00
+# W32: v_cmp_lt_u64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x59,0xd4,0xf0,0xf8,0x00,0x00]
+# W64: v_cmp_lt_u64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x59,0xd4,0xf0,0xf8,0x00,0x00]
-# W32: v_cmp_lt_u64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x59,0xd4,0xfd,0xfc,0x00,0x00]
-# W64: v_cmp_lt_u64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x59,0xd4,0xfd,0xfc,0x00,0x00]
 0x7a,0x00,0x59,0xd4,0xfd,0xfc,0x00,0x00
+# W32: v_cmp_lt_u64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x59,0xd4,0xfd,0xfc,0x00,0x00]
+# W64: v_cmp_lt_u64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x59,0xd4,0xfd,0xfc,0x00,0x00]
-# GFX11: v_cmp_lt_u64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x59,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7c,0x00,0x59,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_lt_u64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x59,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_ne_i16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x35,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_ne_i16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x35,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x35,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_ne_i16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x35,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_ne_i16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x35,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_ne_i16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x35,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_ne_i16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x35,0xd4,0xff,0xff,0x03,0x00]
 0x0a,0x00,0x35,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_ne_i16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x35,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_ne_i16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x35,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_ne_i16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x35,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_ne_i16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x35,0xd4,0x01,0x04,0x00,0x00]
 0x0a,0x00,0x35,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_ne_i16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x35,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_ne_i16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x35,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_ne_i16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x35,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_ne_i16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x35,0xd4,0x69,0xd2,0x00,0x00]
 0x0a,0x00,0x35,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_ne_i16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x35,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_ne_i16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x35,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_ne_i16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x35,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_ne_i16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x35,0xd4,0x6a,0xf6,0x00,0x00]
 0x0a,0x00,0x35,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_ne_i16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x35,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_ne_i16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x35,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_ne_i16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x35,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_ne_i16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x35,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x0a,0x00,0x35,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_ne_i16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x35,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_ne_i16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x35,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_ne_i16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x35,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_ne_i16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x35,0xd4,0x7b,0xfa,0x01,0x00]
 0x0a,0x00,0x35,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_ne_i16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x35,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_ne_i16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x35,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_ne_i16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x35,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# W64: v_cmp_ne_i16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x35,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
 0x0a,0x00,0x35,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_ne_i16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x35,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64: v_cmp_ne_i16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x35,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# W32: v_cmp_ne_i16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x35,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_ne_i16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x35,0xd4,0x7e,0x82,0x01,0x00]
 0x0a,0x00,0x35,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_ne_i16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x35,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_ne_i16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x35,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_ne_i16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x35,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_ne_i16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x35,0xd4,0x7f,0xf8,0x00,0x00]
 0x0a,0x00,0x35,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_ne_i16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x35,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_ne_i16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x35,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_ne_i16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x35,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_ne_i16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x35,0xd4,0x7c,0xfc,0x00,0x00]
 0x0a,0x00,0x35,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_ne_i16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x35,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_ne_i16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x35,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_ne_i16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x35,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_ne_i16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x35,0xd4,0xc1,0xfe,0x00,0x00]
 0x68,0x00,0x35,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_ne_i16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x35,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_ne_i16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x35,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_ne_i16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x35,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# W64: v_cmp_ne_i16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x35,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
 0x6a,0x00,0x35,0xd4,0xf0,0xfa,0x00,0x00
+# W32: v_cmp_ne_i16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x35,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64: v_cmp_ne_i16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x35,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# W32: v_cmp_ne_i16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x35,0xd4,0xfd,0xd4,0x00,0x00]
-# W64: v_cmp_ne_i16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x35,0xd4,0xfd,0xd4,0x00,0x00]
 0x7a,0x00,0x35,0xd4,0xfd,0xd4,0x00,0x00
+# W32: v_cmp_ne_i16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x35,0xd4,0xfd,0xd4,0x00,0x00]
+# W64: v_cmp_ne_i16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x35,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX11: v_cmp_ne_i16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x35,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 0x7c,0x00,0x35,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmp_ne_i16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x35,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_ne_i32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x45,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_ne_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x45,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x45,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_ne_i32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x45,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_ne_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x45,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_ne_i32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x45,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_ne_i32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x45,0xd4,0xff,0xff,0x03,0x00]
 0x0a,0x00,0x45,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_ne_i32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x45,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_ne_i32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x45,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_ne_i32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x45,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_ne_i32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x45,0xd4,0x01,0x04,0x00,0x00]
 0x0a,0x00,0x45,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_ne_i32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x45,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_ne_i32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x45,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_ne_i32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x45,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_ne_i32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x45,0xd4,0x69,0xd2,0x00,0x00]
 0x0a,0x00,0x45,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_ne_i32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x45,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_ne_i32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x45,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_ne_i32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x45,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_ne_i32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x45,0xd4,0x6a,0xf6,0x00,0x00]
 0x0a,0x00,0x45,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_ne_i32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x45,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_ne_i32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x45,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_ne_i32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x45,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_ne_i32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x45,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x0a,0x00,0x45,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# W32: v_cmp_ne_i32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x45,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_ne_i32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x45,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_ne_i32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x45,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_ne_i32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x45,0xd4,0x7b,0xfa,0x01,0x00]
 0x0a,0x00,0x45,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_ne_i32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x45,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_ne_i32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x45,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_ne_i32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x45,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_ne_i32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x45,0xd4,0x7d,0xe0,0x01,0x00]
 0x0a,0x00,0x45,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_ne_i32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x45,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_ne_i32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x45,0xd4,0x7d,0xe0,0x01,0x00]
-# W32: v_cmp_ne_i32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x45,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_ne_i32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x45,0xd4,0x7e,0x82,0x01,0x00]
 0x0a,0x00,0x45,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_ne_i32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x45,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_ne_i32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x45,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_ne_i32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x45,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_ne_i32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x45,0xd4,0x7f,0xf8,0x00,0x00]
 0x0a,0x00,0x45,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_ne_i32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x45,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_ne_i32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x45,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_ne_i32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x45,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_ne_i32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x45,0xd4,0x7c,0xfc,0x00,0x00]
 0x0a,0x00,0x45,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_ne_i32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x45,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_ne_i32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x45,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_ne_i32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x45,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_ne_i32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x45,0xd4,0xc1,0xfe,0x00,0x00]
 0x68,0x00,0x45,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_ne_i32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x45,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_ne_i32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x45,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_ne_i32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x45,0xd4,0xf0,0xfa,0x00,0x00]
-# W64: v_cmp_ne_i32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x45,0xd4,0xf0,0xfa,0x00,0x00]
 0x6a,0x00,0x45,0xd4,0xf0,0xfa,0x00,0x00
+# W32: v_cmp_ne_i32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x45,0xd4,0xf0,0xfa,0x00,0x00]
+# W64: v_cmp_ne_i32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x45,0xd4,0xf0,0xfa,0x00,0x00]
-# W32: v_cmp_ne_i32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x45,0xd4,0xfd,0xd4,0x00,0x00]
-# W64: v_cmp_ne_i32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x45,0xd4,0xfd,0xd4,0x00,0x00]
 0x7a,0x00,0x45,0xd4,0xfd,0xd4,0x00,0x00
+# W32: v_cmp_ne_i32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x45,0xd4,0xfd,0xd4,0x00,0x00]
+# W64: v_cmp_ne_i32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x45,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX11: v_cmp_ne_i32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x45,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7c,0x00,0x45,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_ne_i32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x45,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_ne_i64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x55,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_ne_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x55,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x55,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_ne_i64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x55,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_ne_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x55,0xd4,0x01,0x05,0x02,0x00]
+0x0a,0x00,0x55,0xd4,0xfe,0xfd,0x03,0x00
 # W32: v_cmp_ne_i64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x55,0xd4,0xfe,0xfd,0x03,0x00]
 # W64: v_cmp_ne_i64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x55,0xd4,0xfe,0xfd,0x03,0x00]
-0x0a,0x00,0x55,0xd4,0xfe,0xfd,0x03,0x00
-# W32: v_cmp_ne_i64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x55,0xd4,0x02,0x08,0x00,0x00]
-# W64: v_cmp_ne_i64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x55,0xd4,0x02,0x08,0x00,0x00]
 0x0a,0x00,0x55,0xd4,0x02,0x08,0x00,0x00
+# W32: v_cmp_ne_i64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x55,0xd4,0x02,0x08,0x00,0x00]
+# W64: v_cmp_ne_i64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x55,0xd4,0x02,0x08,0x00,0x00]
+0x0a,0x00,0x55,0xd4,0x68,0xd0,0x00,0x00
 # W32: v_cmp_ne_i64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x55,0xd4,0x68,0xd0,0x00,0x00]
 # W64: v_cmp_ne_i64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x55,0xd4,0x68,0xd0,0x00,0x00]
-0x0a,0x00,0x55,0xd4,0x68,0xd0,0x00,0x00
-# W32: v_cmp_ne_i64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x55,0xd4,0x6a,0xf4,0x00,0x00]
-# W64: v_cmp_ne_i64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x55,0xd4,0x6a,0xf4,0x00,0x00]
 0x0a,0x00,0x55,0xd4,0x6a,0xf4,0x00,0x00
+# W32: v_cmp_ne_i64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x55,0xd4,0x6a,0xf4,0x00,0x00]
+# W64: v_cmp_ne_i64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x55,0xd4,0x6a,0xf4,0x00,0x00]
+0x0a,0x00,0x55,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
 # W32: v_cmp_ne_i64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x55,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 # W64: v_cmp_ne_i64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x55,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x55,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
-# W32: v_cmp_ne_i64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x55,0xd4,0x7e,0xfa,0x01,0x00]
-# W64: v_cmp_ne_i64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x55,0xd4,0x7e,0xfa,0x01,0x00]
 0x0a,0x00,0x55,0xd4,0x7e,0xfa,0x01,0x00
+# W32: v_cmp_ne_i64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x55,0xd4,0x7e,0xfa,0x01,0x00]
+# W64: v_cmp_ne_i64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x55,0xd4,0x7e,0xfa,0x01,0x00]
-# W32: v_cmp_ne_i64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x55,0xd4,0x7c,0xe0,0x01,0x00]
-# W64: v_cmp_ne_i64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x55,0xd4,0x7c,0xe0,0x01,0x00]
 0x0a,0x00,0x55,0xd4,0x7c,0xe0,0x01,0x00
+# W32: v_cmp_ne_i64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x55,0xd4,0x7c,0xe0,0x01,0x00]
+# W64: v_cmp_ne_i64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x55,0xd4,0x7c,0xe0,0x01,0x00]
-# W32: v_cmp_ne_i64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x55,0xd4,0xc1,0x82,0x01,0x00]
-# W64: v_cmp_ne_i64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x55,0xd4,0xc1,0x82,0x01,0x00]
 0x68,0x00,0x55,0xd4,0xc1,0x82,0x01,0x00
+# W32: v_cmp_ne_i64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x55,0xd4,0xc1,0x82,0x01,0x00]
+# W64: v_cmp_ne_i64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x55,0xd4,0xc1,0x82,0x01,0x00]
-# W32: v_cmp_ne_i64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x55,0xd4,0xf0,0xf8,0x00,0x00]
-# W64: v_cmp_ne_i64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x55,0xd4,0xf0,0xf8,0x00,0x00]
 0x6a,0x00,0x55,0xd4,0xf0,0xf8,0x00,0x00
+# W32: v_cmp_ne_i64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x55,0xd4,0xf0,0xf8,0x00,0x00]
+# W64: v_cmp_ne_i64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x55,0xd4,0xf0,0xf8,0x00,0x00]
-# W32: v_cmp_ne_i64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x55,0xd4,0xfd,0xfc,0x00,0x00]
-# W64: v_cmp_ne_i64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x55,0xd4,0xfd,0xfc,0x00,0x00]
 0x7a,0x00,0x55,0xd4,0xfd,0xfc,0x00,0x00
+# W32: v_cmp_ne_i64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x55,0xd4,0xfd,0xfc,0x00,0x00]
+# W64: v_cmp_ne_i64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x55,0xd4,0xfd,0xfc,0x00,0x00]
-# GFX11: v_cmp_ne_i64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x55,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7c,0x00,0x55,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_ne_i64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x55,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_ne_u16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x3d,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_ne_u16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x3d,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x3d,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_ne_u16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x3d,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_ne_u16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x3d,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_ne_u16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x3d,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_ne_u16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x3d,0xd4,0xff,0xff,0x03,0x00]
 0x0a,0x00,0x3d,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_ne_u16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x3d,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_ne_u16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x3d,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_ne_u16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x3d,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_ne_u16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x3d,0xd4,0x01,0x04,0x00,0x00]
 0x0a,0x00,0x3d,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_ne_u16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x3d,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_ne_u16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x3d,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_ne_u16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x3d,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_ne_u16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x3d,0xd4,0x69,0xd2,0x00,0x00]
 0x0a,0x00,0x3d,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_ne_u16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x3d,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_ne_u16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x3d,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_ne_u16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3d,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_ne_u16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3d,0xd4,0x6a,0xf6,0x00,0x00]
 0x0a,0x00,0x3d,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_ne_u16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3d,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_ne_u16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3d,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_ne_u16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3d,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_ne_u16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3d,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x0a,0x00,0x3d,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_ne_u16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3d,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_ne_u16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3d,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_ne_u16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x3d,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_ne_u16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x3d,0xd4,0x7b,0xfa,0x01,0x00]
 0x0a,0x00,0x3d,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_ne_u16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x3d,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_ne_u16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x3d,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_ne_u16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x3d,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# W64: v_cmp_ne_u16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x3d,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
 0x0a,0x00,0x3d,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_ne_u16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x3d,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64: v_cmp_ne_u16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x3d,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# W32: v_cmp_ne_u16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x3d,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_ne_u16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x3d,0xd4,0x7e,0x82,0x01,0x00]
 0x0a,0x00,0x3d,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_ne_u16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x3d,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_ne_u16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x3d,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_ne_u16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x3d,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_ne_u16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x3d,0xd4,0x7f,0xf8,0x00,0x00]
 0x0a,0x00,0x3d,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_ne_u16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x3d,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_ne_u16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x3d,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_ne_u16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x3d,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_ne_u16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x3d,0xd4,0x7c,0xfc,0x00,0x00]
 0x0a,0x00,0x3d,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_ne_u16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x3d,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_ne_u16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x3d,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_ne_u16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x3d,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_ne_u16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x3d,0xd4,0xc1,0xfe,0x00,0x00]
 0x68,0x00,0x3d,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_ne_u16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x3d,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_ne_u16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x3d,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_ne_u16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x3d,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# W64: v_cmp_ne_u16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x3d,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
 0x6a,0x00,0x3d,0xd4,0xf0,0xfa,0x00,0x00
+# W32: v_cmp_ne_u16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x3d,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64: v_cmp_ne_u16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x3d,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# W32: v_cmp_ne_u16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3d,0xd4,0xfd,0xd4,0x00,0x00]
-# W64: v_cmp_ne_u16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3d,0xd4,0xfd,0xd4,0x00,0x00]
 0x7a,0x00,0x3d,0xd4,0xfd,0xd4,0x00,0x00
+# W32: v_cmp_ne_u16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3d,0xd4,0xfd,0xd4,0x00,0x00]
+# W64: v_cmp_ne_u16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3d,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX11: v_cmp_ne_u16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x3d,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 0x7c,0x00,0x3d,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmp_ne_u16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x3d,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_ne_u32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x4d,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_ne_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x4d,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x4d,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_ne_u32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x4d,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_ne_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x4d,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_ne_u32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x4d,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_ne_u32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x4d,0xd4,0xff,0xff,0x03,0x00]
 0x0a,0x00,0x4d,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_ne_u32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x4d,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_ne_u32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x4d,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_ne_u32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x4d,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_ne_u32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x4d,0xd4,0x01,0x04,0x00,0x00]
 0x0a,0x00,0x4d,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_ne_u32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x4d,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_ne_u32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x4d,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_ne_u32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x4d,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_ne_u32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x4d,0xd4,0x69,0xd2,0x00,0x00]
 0x0a,0x00,0x4d,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_ne_u32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x4d,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_ne_u32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x4d,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_ne_u32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4d,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_ne_u32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4d,0xd4,0x6a,0xf6,0x00,0x00]
 0x0a,0x00,0x4d,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_ne_u32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4d,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_ne_u32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4d,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_ne_u32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4d,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_ne_u32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4d,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x0a,0x00,0x4d,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# W32: v_cmp_ne_u32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4d,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_ne_u32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4d,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_ne_u32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x4d,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_ne_u32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x4d,0xd4,0x7b,0xfa,0x01,0x00]
 0x0a,0x00,0x4d,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_ne_u32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x4d,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_ne_u32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x4d,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_ne_u32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x4d,0xd4,0x7d,0xe0,0x01,0x00]
-# W64:
v_cmp_ne_u32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x4d,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x4d,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_ne_u32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x4d,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_ne_u32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x4d,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_ne_u32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x4d,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_ne_u32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x4d,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x4d,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_ne_u32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x4d,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_ne_u32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x4d,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_ne_u32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x4d,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_ne_u32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x4d,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x00,0x4d,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_ne_u32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x4d,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_ne_u32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x4d,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_ne_u32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x4d,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_ne_u32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x4d,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x4d,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_ne_u32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x4d,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_ne_u32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x4d,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_ne_u32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x4d,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_ne_u32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x4d,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x4d,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_ne_u32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x4d,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_ne_u32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x4d,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_ne_u32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x4d,0xd4,0xf0,0xfa,0x00,0x00] -# W64: v_cmp_ne_u32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x4d,0xd4,0xf0,0xfa,0x00,0x00] 0x6a,0x00,0x4d,0xd4,0xf0,0xfa,0x00,0x00 +# W32: v_cmp_ne_u32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x4d,0xd4,0xf0,0xfa,0x00,0x00] +# W64: v_cmp_ne_u32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x4d,0xd4,0xf0,0xfa,0x00,0x00] -# W32: v_cmp_ne_u32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4d,0xd4,0xfd,0xd4,0x00,0x00] -# W64: v_cmp_ne_u32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4d,0xd4,0xfd,0xd4,0x00,0x00] 0x7a,0x00,0x4d,0xd4,0xfd,0xd4,0x00,0x00 +# W32: v_cmp_ne_u32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4d,0xd4,0xfd,0xd4,0x00,0x00] +# W64: v_cmp_ne_u32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4d,0xd4,0xfd,0xd4,0x00,0x00] -# GFX11: v_cmp_ne_u32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x4d,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7c,0x00,0x4d,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmp_ne_u32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x4d,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_ne_u64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5d,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_ne_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5d,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x5d,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_ne_u64_e64 s10, v[1:2], v[2:3] ; 
encoding: [0x0a,0x00,0x5d,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_ne_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5d,0xd4,0x01,0x05,0x02,0x00] +0x0a,0x00,0x5d,0xd4,0xfe,0xfd,0x03,0x00 # W32: v_cmp_ne_u64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x5d,0xd4,0xfe,0xfd,0x03,0x00] # W64: v_cmp_ne_u64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x5d,0xd4,0xfe,0xfd,0x03,0x00] -0x0a,0x00,0x5d,0xd4,0xfe,0xfd,0x03,0x00 -# W32: v_cmp_ne_u64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5d,0xd4,0x02,0x08,0x00,0x00] -# W64: v_cmp_ne_u64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5d,0xd4,0x02,0x08,0x00,0x00] 0x0a,0x00,0x5d,0xd4,0x02,0x08,0x00,0x00 +# W32: v_cmp_ne_u64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5d,0xd4,0x02,0x08,0x00,0x00] +# W64: v_cmp_ne_u64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5d,0xd4,0x02,0x08,0x00,0x00] +0x0a,0x00,0x5d,0xd4,0x68,0xd0,0x00,0x00 # W32: v_cmp_ne_u64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x5d,0xd4,0x68,0xd0,0x00,0x00] # W64: v_cmp_ne_u64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x5d,0xd4,0x68,0xd0,0x00,0x00] -0x0a,0x00,0x5d,0xd4,0x68,0xd0,0x00,0x00 -# W32: v_cmp_ne_u64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5d,0xd4,0x6a,0xf4,0x00,0x00] -# W64: v_cmp_ne_u64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5d,0xd4,0x6a,0xf4,0x00,0x00] 0x0a,0x00,0x5d,0xd4,0x6a,0xf4,0x00,0x00 +# W32: v_cmp_ne_u64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5d,0xd4,0x6a,0xf4,0x00,0x00] +# W64: v_cmp_ne_u64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5d,0xd4,0x6a,0xf4,0x00,0x00] +0x0a,0x00,0x5d,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_ne_u64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x5d,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: v_cmp_ne_u64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x5d,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x5d,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_ne_u64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x5d,0xd4,0x7e,0xfa,0x01,0x00] -# W64: v_cmp_ne_u64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x5d,0xd4,0x7e,0xfa,0x01,0x00] 0x0a,0x00,0x5d,0xd4,0x7e,0xfa,0x01,0x00 +# W32: v_cmp_ne_u64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x5d,0xd4,0x7e,0xfa,0x01,0x00] +# W64: v_cmp_ne_u64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x5d,0xd4,0x7e,0xfa,0x01,0x00] -# W32: v_cmp_ne_u64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x5d,0xd4,0x7c,0xe0,0x01,0x00] -# W64: v_cmp_ne_u64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x5d,0xd4,0x7c,0xe0,0x01,0x00] 0x0a,0x00,0x5d,0xd4,0x7c,0xe0,0x01,0x00 +# W32: v_cmp_ne_u64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x5d,0xd4,0x7c,0xe0,0x01,0x00] +# W64: v_cmp_ne_u64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x5d,0xd4,0x7c,0xe0,0x01,0x00] -# W32: v_cmp_ne_u64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x5d,0xd4,0xc1,0x82,0x01,0x00] -# W64: v_cmp_ne_u64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x5d,0xd4,0xc1,0x82,0x01,0x00] 0x68,0x00,0x5d,0xd4,0xc1,0x82,0x01,0x00 +# W32: v_cmp_ne_u64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x5d,0xd4,0xc1,0x82,0x01,0x00] +# W64: v_cmp_ne_u64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x5d,0xd4,0xc1,0x82,0x01,0x00] -# W32: v_cmp_ne_u64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x5d,0xd4,0xf0,0xf8,0x00,0x00] -# W64: v_cmp_ne_u64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x5d,0xd4,0xf0,0xf8,0x00,0x00] 0x6a,0x00,0x5d,0xd4,0xf0,0xf8,0x00,0x00 
+# W32: v_cmp_ne_u64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x5d,0xd4,0xf0,0xf8,0x00,0x00]
+# W64: v_cmp_ne_u64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x5d,0xd4,0xf0,0xf8,0x00,0x00]

-# W32: v_cmp_ne_u64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x5d,0xd4,0xfd,0xfc,0x00,0x00]
-# W64: v_cmp_ne_u64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x5d,0xd4,0xfd,0xfc,0x00,0x00]
0x7a,0x00,0x5d,0xd4,0xfd,0xfc,0x00,0x00
+# W32: v_cmp_ne_u64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x5d,0xd4,0xfd,0xfc,0x00,0x00]
+# W64: v_cmp_ne_u64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x5d,0xd4,0xfd,0xfc,0x00,0x00]

-# GFX11: v_cmp_ne_u64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x5d,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
0x7c,0x00,0x5d,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_ne_u64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x5d,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_neq_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x0d,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_neq_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x0d,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x0d,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_neq_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x0d,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_neq_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x0d,0xd4,0x01,0x05,0x02,0x00]

-# W32: v_cmp_neq_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x0d,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_neq_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x0d,0xd4,0xff,0xff,0x03,0x00]
0x0a,0x00,0x0d,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_neq_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x0d,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_neq_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x0d,0xd4,0xff,0xff,0x03,0x00]

-# W32: v_cmp_neq_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x0d,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_neq_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x0d,0xd4,0x01,0x04,0x00,0x00]
0x0a,0x00,0x0d,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_neq_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x0d,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_neq_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x0d,0xd4,0x01,0x04,0x00,0x00]

-# W32: v_cmp_neq_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x0d,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_neq_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x0d,0xd4,0x69,0xd2,0x00,0x00]
0x0a,0x00,0x0d,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_neq_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x0d,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_neq_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x0d,0xd4,0x69,0xd2,0x00,0x00]

-# W32: v_cmp_neq_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0d,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_neq_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0d,0xd4,0x6a,0xf6,0x00,0x00]
0x0a,0x00,0x0d,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_neq_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0d,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_neq_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0d,0xd4,0x6a,0xf6,0x00,0x00]

-# W32: v_cmp_neq_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0d,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_neq_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0d,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
0x0a,0x00,0x0d,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_neq_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0d,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_neq_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0d,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_neq_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x0d,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_neq_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x0d,0xd4,0x7b,0xfa,0x01,0x00]
0x0a,0x00,0x0d,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_neq_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x0d,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_neq_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x0d,0xd4,0x7b,0xfa,0x01,0x00]

-# W32: v_cmp_neq_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x0d,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_neq_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x0d,0xd4,0x7d,0xe0,0x01,0x00]
0x0a,0x00,0x0d,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_neq_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x0d,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_neq_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x0d,0xd4,0x7d,0xe0,0x01,0x00]

-# W32: v_cmp_neq_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x0d,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_neq_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x0d,0xd4,0x7e,0x82,0x01,0x00]
0x0a,0x00,0x0d,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_neq_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x0d,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_neq_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x0d,0xd4,0x7e,0x82,0x01,0x00]

-# W32: v_cmp_neq_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x0d,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_neq_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x0d,0xd4,0x7f,0xf8,0x00,0x00]
0x0a,0x01,0x0d,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_neq_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x0d,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_neq_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x0d,0xd4,0x7f,0xf8,0x00,0x00]

-# W32: v_cmp_neq_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x0d,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_neq_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x0d,0xd4,0x7c,0xfc,0x00,0x00]
0x0a,0x00,0x0d,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_neq_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x0d,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_neq_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x0d,0xd4,0x7c,0xfc,0x00,0x00]

-# W32: v_cmp_neq_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x0d,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_neq_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x0d,0xd4,0xc1,0xfe,0x00,0x00]
0x68,0x00,0x0d,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_neq_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x0d,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_neq_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x0d,0xd4,0xc1,0xfe,0x00,0x00]

-# W32: v_cmp_neq_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x0d,0xd4,0xf0,0xfa,0x00,0x40]
-# W64: v_cmp_neq_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x0d,0xd4,0xf0,0xfa,0x00,0x40]
0x6a,0x00,0x0d,0xd4,0xf0,0xfa,0x00,0x40
+# W32: v_cmp_neq_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x0d,0xd4,0xf0,0xfa,0x00,0x40]
+# W64: v_cmp_neq_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x0d,0xd4,0xf0,0xfa,0x00,0x40]

+0x7a,0x02,0x0d,0xd4,0xfd,0xd4,0x00,0x20
# W32: v_cmp_neq_f16_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x0d,0xd4,0xfd,0xd4,0x00,0x20]
# W64: v_cmp_neq_f16_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x0d,0xd4,0xfd,0xd4,0x00,0x20]
-0x7a,0x02,0x0d,0xd4,0xfd,0xd4,0x00,0x20

-# GFX11: v_cmp_neq_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x0d,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
0x7c,0x83,0x0d,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmp_neq_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x0d,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_neq_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x1d,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_neq_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x1d,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x1d,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_neq_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x1d,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_neq_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x1d,0xd4,0x01,0x05,0x02,0x00]

-# W32: v_cmp_neq_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x1d,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_neq_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x1d,0xd4,0xff,0xff,0x03,0x00]
0x0a,0x00,0x1d,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_neq_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x1d,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_neq_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x1d,0xd4,0xff,0xff,0x03,0x00]

-# W32: v_cmp_neq_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x1d,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_neq_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x1d,0xd4,0x01,0x04,0x00,0x00]
0x0a,0x00,0x1d,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_neq_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x1d,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_neq_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x1d,0xd4,0x01,0x04,0x00,0x00]

-# W32: v_cmp_neq_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x1d,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_neq_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x1d,0xd4,0x69,0xd2,0x00,0x00]
0x0a,0x00,0x1d,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_neq_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x1d,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_neq_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x1d,0xd4,0x69,0xd2,0x00,0x00]

-# W32: v_cmp_neq_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1d,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_neq_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1d,0xd4,0x6a,0xf6,0x00,0x00]
0x0a,0x00,0x1d,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_neq_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1d,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_neq_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1d,0xd4,0x6a,0xf6,0x00,0x00]

+0x0a,0x00,0x1d,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
# W32: v_cmp_neq_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x1d,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
# W64: v_cmp_neq_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x1d,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x1d,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf

-# W32: v_cmp_neq_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x1d,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_neq_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x1d,0xd4,0x7b,0xfa,0x01,0x00]
0x0a,0x00,0x1d,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_neq_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x1d,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_neq_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x1d,0xd4,0x7b,0xfa,0x01,0x00]

-# W32: v_cmp_neq_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x1d,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_neq_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x1d,0xd4,0x7d,0xe0,0x01,0x00]
0x0a,0x00,0x1d,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_neq_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x1d,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_neq_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x1d,0xd4,0x7d,0xe0,0x01,0x00]

-# W32: v_cmp_neq_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x1d,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_neq_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x1d,0xd4,0x7e,0x82,0x01,0x00]
0x0a,0x00,0x1d,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_neq_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x1d,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_neq_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x1d,0xd4,0x7e,0x82,0x01,0x00]

-# W32: v_cmp_neq_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x1d,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_neq_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x1d,0xd4,0x7f,0xf8,0x00,0x00]
0x0a,0x01,0x1d,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_neq_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x1d,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_neq_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x1d,0xd4,0x7f,0xf8,0x00,0x00]

-# W32: v_cmp_neq_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x1d,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_neq_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x1d,0xd4,0x7c,0xfc,0x00,0x00]
0x0a,0x00,0x1d,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_neq_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x1d,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_neq_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x1d,0xd4,0x7c,0xfc,0x00,0x00]

-# W32: v_cmp_neq_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x1d,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_neq_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x1d,0xd4,0xc1,0xfe,0x00,0x00]
0x68,0x00,0x1d,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_neq_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x1d,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_neq_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x1d,0xd4,0xc1,0xfe,0x00,0x00]

-# W32: v_cmp_neq_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x1d,0xd4,0xf0,0xfa,0x00,0x40]
-# W64: v_cmp_neq_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x1d,0xd4,0xf0,0xfa,0x00,0x40]
0x6a,0x00,0x1d,0xd4,0xf0,0xfa,0x00,0x40
+# W32: v_cmp_neq_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x1d,0xd4,0xf0,0xfa,0x00,0x40]
+# W64: v_cmp_neq_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x1d,0xd4,0xf0,0xfa,0x00,0x40]

+0x7a,0x02,0x1d,0xd4,0xfd,0xd4,0x00,0x20
# W32: v_cmp_neq_f32_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x1d,0xd4,0xfd,0xd4,0x00,0x20]
# W64: v_cmp_neq_f32_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x1d,0xd4,0xfd,0xd4,0x00,0x20]
-0x7a,0x02,0x1d,0xd4,0xfd,0xd4,0x00,0x20

-# GFX11: v_cmp_neq_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x1d,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
0x7c,0x83,0x1d,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_neq_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x1d,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_neq_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2d,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_neq_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2d,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x2d,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_neq_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2d,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_neq_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2d,0xd4,0x01,0x05,0x02,0x00]

+0x0a,0x00,0x2d,0xd4,0xfe,0xfd,0x03,0x00
# W32: v_cmp_neq_f64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x2d,0xd4,0xfe,0xfd,0x03,0x00]
# W64: v_cmp_neq_f64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x2d,0xd4,0xfe,0xfd,0x03,0x00]
-0x0a,0x00,0x2d,0xd4,0xfe,0xfd,0x03,0x00

-# W32: v_cmp_neq_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2d,0xd4,0x02,0x08,0x00,0x00]
-# W64: v_cmp_neq_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2d,0xd4,0x02,0x08,0x00,0x00]
0x0a,0x00,0x2d,0xd4,0x02,0x08,0x00,0x00
+# W32: v_cmp_neq_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2d,0xd4,0x02,0x08,0x00,0x00]
+# W64: v_cmp_neq_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2d,0xd4,0x02,0x08,0x00,0x00]

+0x0a,0x00,0x2d,0xd4,0x68,0xd0,0x00,0x00
# W32: v_cmp_neq_f64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x2d,0xd4,0x68,0xd0,0x00,0x00]
# W64: v_cmp_neq_f64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x2d,0xd4,0x68,0xd0,0x00,0x00]
-0x0a,0x00,0x2d,0xd4,0x68,0xd0,0x00,0x00

-# W32: v_cmp_neq_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2d,0xd4,0x6a,0xf4,0x00,0x00]
-# W64: v_cmp_neq_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2d,0xd4,0x6a,0xf4,0x00,0x00]
0x0a,0x00,0x2d,0xd4,0x6a,0xf4,0x00,0x00
+# W32: v_cmp_neq_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2d,0xd4,0x6a,0xf4,0x00,0x00]
+# W64: v_cmp_neq_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2d,0xd4,0x6a,0xf4,0x00,0x00]

+0x0a,0x00,0x2d,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
# W32: v_cmp_neq_f64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x2d,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
# W64: v_cmp_neq_f64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x2d,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x2d,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf

-# W32: v_cmp_neq_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x2d,0xd4,0x7e,0xfa,0x01,0x20]
-# W64: v_cmp_neq_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x2d,0xd4,0x7e,0xfa,0x01,0x20]
0x0a,0x01,0x2d,0xd4,0x7e,0xfa,0x01,0x20
+# W32: v_cmp_neq_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x2d,0xd4,0x7e,0xfa,0x01,0x20]
+# W64: v_cmp_neq_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x2d,0xd4,0x7e,0xfa,0x01,0x20]

-# W32: v_cmp_neq_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x2d,0xd4,0x7c,0xe0,0x01,0x00]
-# W64: v_cmp_neq_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x2d,0xd4,0x7c,0xe0,0x01,0x00]
0x0a,0x00,0x2d,0xd4,0x7c,0xe0,0x01,0x00
+# W32: v_cmp_neq_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x2d,0xd4,0x7c,0xe0,0x01,0x00]
+# W64: v_cmp_neq_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x2d,0xd4,0x7c,0xe0,0x01,0x00]

-# W32: v_cmp_neq_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x2d,0xd4,0xc1,0x82,0x01,0x00]
-# W64: v_cmp_neq_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x2d,0xd4,0xc1,0x82,0x01,0x00]
0x68,0x00,0x2d,0xd4,0xc1,0x82,0x01,0x00
+# W32: v_cmp_neq_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x2d,0xd4,0xc1,0x82,0x01,0x00]
+# W64: v_cmp_neq_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x2d,0xd4,0xc1,0x82,0x01,0x00]

-# W32: v_cmp_neq_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x2d,0xd4,0xf0,0xf8,0x00,0x00]
-# W64: v_cmp_neq_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x2d,0xd4,0xf0,0xf8,0x00,0x00]
0x6a,0x00,0x2d,0xd4,0xf0,0xf8,0x00,0x00
+# W32: v_cmp_neq_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x2d,0xd4,0xf0,0xf8,0x00,0x00]
+# W64: v_cmp_neq_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x2d,0xd4,0xf0,0xf8,0x00,0x00]
+0x7a,0x03,0x2d,0xd4,0xfd,0xfc,0x00,0x60
# W32: v_cmp_neq_f64_e64 ttmp14, -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x2d,0xd4,0xfd,0xfc,0x00,0x60]
# W64: v_cmp_neq_f64_e64 ttmp[14:15], -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x2d,0xd4,0xfd,0xfc,0x00,0x60]
-0x7a,0x03,0x2d,0xd4,0xfd,0xfc,0x00,0x60

-# GFX11: v_cmp_neq_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x2d,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
0x7c,0x82,0x2d,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_neq_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x2d,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_nge_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x09,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_nge_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x09,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x09,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_nge_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x09,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_nge_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x09,0xd4,0x01,0x05,0x02,0x00]

-# W32: v_cmp_nge_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x09,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_nge_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x09,0xd4,0xff,0xff,0x03,0x00]
0x0a,0x00,0x09,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_nge_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x09,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_nge_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x09,0xd4,0xff,0xff,0x03,0x00]

-# W32: v_cmp_nge_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x09,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_nge_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x09,0xd4,0x01,0x04,0x00,0x00]
0x0a,0x00,0x09,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_nge_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x09,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_nge_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x09,0xd4,0x01,0x04,0x00,0x00]

-# W32: v_cmp_nge_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x09,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_nge_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x09,0xd4,0x69,0xd2,0x00,0x00]
0x0a,0x00,0x09,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_nge_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x09,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_nge_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x09,0xd4,0x69,0xd2,0x00,0x00]

-# W32: v_cmp_nge_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x09,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_nge_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x09,0xd4,0x6a,0xf6,0x00,0x00]
0x0a,0x00,0x09,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_nge_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x09,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_nge_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x09,0xd4,0x6a,0xf6,0x00,0x00]

-# W32: v_cmp_nge_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x09,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_nge_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x09,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
0x0a,0x00,0x09,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_nge_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x09,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_nge_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x09,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_nge_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x09,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_nge_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x09,0xd4,0x7b,0xfa,0x01,0x00]
0x0a,0x00,0x09,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_nge_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x09,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_nge_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x09,0xd4,0x7b,0xfa,0x01,0x00]

-# W32: v_cmp_nge_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x09,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_nge_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x09,0xd4,0x7d,0xe0,0x01,0x00]
0x0a,0x00,0x09,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_nge_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x09,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_nge_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x09,0xd4,0x7d,0xe0,0x01,0x00]

-# W32: v_cmp_nge_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x09,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_nge_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x09,0xd4,0x7e,0x82,0x01,0x00]
0x0a,0x00,0x09,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_nge_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x09,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_nge_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x09,0xd4,0x7e,0x82,0x01,0x00]

-# W32: v_cmp_nge_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x09,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_nge_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x09,0xd4,0x7f,0xf8,0x00,0x00]
0x0a,0x01,0x09,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_nge_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x09,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_nge_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x09,0xd4,0x7f,0xf8,0x00,0x00]

-# W32: v_cmp_nge_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x09,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_nge_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x09,0xd4,0x7c,0xfc,0x00,0x00]
0x0a,0x00,0x09,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_nge_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x09,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_nge_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x09,0xd4,0x7c,0xfc,0x00,0x00]

-# W32: v_cmp_nge_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x09,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_nge_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x09,0xd4,0xc1,0xfe,0x00,0x00]
0x68,0x00,0x09,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_nge_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x09,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_nge_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x09,0xd4,0xc1,0xfe,0x00,0x00]

-# W32: v_cmp_nge_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x09,0xd4,0xf0,0xfa,0x00,0x40]
-# W64: v_cmp_nge_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x09,0xd4,0xf0,0xfa,0x00,0x40]
0x6a,0x00,0x09,0xd4,0xf0,0xfa,0x00,0x40
+# W32: v_cmp_nge_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x09,0xd4,0xf0,0xfa,0x00,0x40]
+# W64: v_cmp_nge_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x09,0xd4,0xf0,0xfa,0x00,0x40]

+0x7a,0x02,0x09,0xd4,0xfd,0xd4,0x00,0x20
# W32: v_cmp_nge_f16_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x09,0xd4,0xfd,0xd4,0x00,0x20]
# W64: v_cmp_nge_f16_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x09,0xd4,0xfd,0xd4,0x00,0x20]
-0x7a,0x02,0x09,0xd4,0xfd,0xd4,0x00,0x20

-# GFX11: v_cmp_nge_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x09,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
0x7c,0x83,0x09,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmp_nge_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x09,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_nge_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x19,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_nge_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x19,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x19,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_nge_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x19,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_nge_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x19,0xd4,0x01,0x05,0x02,0x00]

-# W32: v_cmp_nge_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x19,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_nge_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x19,0xd4,0xff,0xff,0x03,0x00]
0x0a,0x00,0x19,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_nge_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x19,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_nge_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x19,0xd4,0xff,0xff,0x03,0x00]

-# W32: v_cmp_nge_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x19,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_nge_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x19,0xd4,0x01,0x04,0x00,0x00]
0x0a,0x00,0x19,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_nge_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x19,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_nge_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x19,0xd4,0x01,0x04,0x00,0x00]

-# W32: v_cmp_nge_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x19,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_nge_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x19,0xd4,0x69,0xd2,0x00,0x00]
0x0a,0x00,0x19,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_nge_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x19,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_nge_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x19,0xd4,0x69,0xd2,0x00,0x00]

-# W32: v_cmp_nge_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x19,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_nge_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x19,0xd4,0x6a,0xf6,0x00,0x00]
0x0a,0x00,0x19,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_nge_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x19,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_nge_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x19,0xd4,0x6a,0xf6,0x00,0x00]

+0x0a,0x00,0x19,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
# W32: v_cmp_nge_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x19,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
# W64: v_cmp_nge_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x19,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x19,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf

-# W32: v_cmp_nge_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x19,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_nge_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x19,0xd4,0x7b,0xfa,0x01,0x00]
0x0a,0x00,0x19,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_nge_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x19,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_nge_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x19,0xd4,0x7b,0xfa,0x01,0x00]

-# W32: v_cmp_nge_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x19,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_nge_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x19,0xd4,0x7d,0xe0,0x01,0x00]
0x0a,0x00,0x19,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_nge_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x19,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_nge_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x19,0xd4,0x7d,0xe0,0x01,0x00]

-# W32: v_cmp_nge_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x19,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_nge_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x19,0xd4,0x7e,0x82,0x01,0x00]
0x0a,0x00,0x19,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_nge_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x19,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_nge_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x19,0xd4,0x7e,0x82,0x01,0x00]

-# W32: v_cmp_nge_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x19,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_nge_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x19,0xd4,0x7f,0xf8,0x00,0x00]
0x0a,0x01,0x19,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_nge_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x19,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_nge_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x19,0xd4,0x7f,0xf8,0x00,0x00]

-# W32: v_cmp_nge_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x19,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_nge_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x19,0xd4,0x7c,0xfc,0x00,0x00]
0x0a,0x00,0x19,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_nge_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x19,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_nge_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x19,0xd4,0x7c,0xfc,0x00,0x00]

-# W32: v_cmp_nge_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x19,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_nge_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x19,0xd4,0xc1,0xfe,0x00,0x00]
0x68,0x00,0x19,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_nge_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x19,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_nge_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x19,0xd4,0xc1,0xfe,0x00,0x00]

-# W32: v_cmp_nge_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x19,0xd4,0xf0,0xfa,0x00,0x40]
-# W64: v_cmp_nge_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x19,0xd4,0xf0,0xfa,0x00,0x40]
0x6a,0x00,0x19,0xd4,0xf0,0xfa,0x00,0x40
+# W32: v_cmp_nge_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x19,0xd4,0xf0,0xfa,0x00,0x40]
+# W64: v_cmp_nge_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x19,0xd4,0xf0,0xfa,0x00,0x40]

+0x7a,0x02,0x19,0xd4,0xfd,0xd4,0x00,0x20
# W32: v_cmp_nge_f32_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x19,0xd4,0xfd,0xd4,0x00,0x20]
# W64: v_cmp_nge_f32_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x19,0xd4,0xfd,0xd4,0x00,0x20]
-0x7a,0x02,0x19,0xd4,0xfd,0xd4,0x00,0x20

-# GFX11: v_cmp_nge_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x19,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
0x7c,0x83,0x19,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_nge_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x19,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_nge_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x29,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_nge_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x29,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x29,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_nge_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x29,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_nge_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x29,0xd4,0x01,0x05,0x02,0x00]

+0x0a,0x00,0x29,0xd4,0xfe,0xfd,0x03,0x00
# W32: v_cmp_nge_f64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x29,0xd4,0xfe,0xfd,0x03,0x00]
# W64: v_cmp_nge_f64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x29,0xd4,0xfe,0xfd,0x03,0x00]
-0x0a,0x00,0x29,0xd4,0xfe,0xfd,0x03,0x00

-# W32: v_cmp_nge_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x29,0xd4,0x02,0x08,0x00,0x00]
-# W64: v_cmp_nge_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x29,0xd4,0x02,0x08,0x00,0x00]
0x0a,0x00,0x29,0xd4,0x02,0x08,0x00,0x00
+# W32: v_cmp_nge_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x29,0xd4,0x02,0x08,0x00,0x00]
+# W64: v_cmp_nge_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x29,0xd4,0x02,0x08,0x00,0x00]

+0x0a,0x00,0x29,0xd4,0x68,0xd0,0x00,0x00
# W32: v_cmp_nge_f64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x29,0xd4,0x68,0xd0,0x00,0x00]
# W64: v_cmp_nge_f64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x29,0xd4,0x68,0xd0,0x00,0x00]
-0x0a,0x00,0x29,0xd4,0x68,0xd0,0x00,0x00

-# W32: v_cmp_nge_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x29,0xd4,0x6a,0xf4,0x00,0x00]
-# W64: v_cmp_nge_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x29,0xd4,0x6a,0xf4,0x00,0x00]
0x0a,0x00,0x29,0xd4,0x6a,0xf4,0x00,0x00
+# W32: v_cmp_nge_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x29,0xd4,0x6a,0xf4,0x00,0x00]
+# W64: v_cmp_nge_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x29,0xd4,0x6a,0xf4,0x00,0x00]

+0x0a,0x00,0x29,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
# W32: v_cmp_nge_f64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x29,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
# W64: v_cmp_nge_f64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x29,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x29,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf

-# W32: v_cmp_nge_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x29,0xd4,0x7e,0xfa,0x01,0x20]
-# W64: v_cmp_nge_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x29,0xd4,0x7e,0xfa,0x01,0x20]
0x0a,0x01,0x29,0xd4,0x7e,0xfa,0x01,0x20
+# W32: v_cmp_nge_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x29,0xd4,0x7e,0xfa,0x01,0x20]
+# W64: v_cmp_nge_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x29,0xd4,0x7e,0xfa,0x01,0x20]

-# W32: v_cmp_nge_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x29,0xd4,0x7c,0xe0,0x01,0x00]
-# W64: v_cmp_nge_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x29,0xd4,0x7c,0xe0,0x01,0x00]
0x0a,0x00,0x29,0xd4,0x7c,0xe0,0x01,0x00
+# W32: v_cmp_nge_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x29,0xd4,0x7c,0xe0,0x01,0x00]
+# W64: v_cmp_nge_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x29,0xd4,0x7c,0xe0,0x01,0x00]

-# W32: v_cmp_nge_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x29,0xd4,0xc1,0x82,0x01,0x00]
-# W64: v_cmp_nge_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x29,0xd4,0xc1,0x82,0x01,0x00]
0x68,0x00,0x29,0xd4,0xc1,0x82,0x01,0x00
+# W32: v_cmp_nge_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x29,0xd4,0xc1,0x82,0x01,0x00]
+# W64: v_cmp_nge_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x29,0xd4,0xc1,0x82,0x01,0x00]

-# W32: v_cmp_nge_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x29,0xd4,0xf0,0xf8,0x00,0x00]
-# W64: v_cmp_nge_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x29,0xd4,0xf0,0xf8,0x00,0x00]
0x6a,0x00,0x29,0xd4,0xf0,0xf8,0x00,0x00
+# W32: v_cmp_nge_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x29,0xd4,0xf0,0xf8,0x00,0x00]
+# W64: v_cmp_nge_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x29,0xd4,0xf0,0xf8,0x00,0x00]

+0x7a,0x03,0x29,0xd4,0xfd,0xfc,0x00,0x60
# W32: v_cmp_nge_f64_e64 ttmp14, -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x29,0xd4,0xfd,0xfc,0x00,0x60]
# W64: v_cmp_nge_f64_e64 ttmp[14:15], -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x29,0xd4,0xfd,0xfc,0x00,0x60]
-0x7a,0x03,0x29,0xd4,0xfd,0xfc,0x00,0x60

-# GFX11: v_cmp_nge_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x29,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
0x7c,0x82,0x29,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_nge_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x29,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_ngt_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x0b,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_ngt_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x0b,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x0b,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_ngt_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x0b,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_ngt_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x0b,0xd4,0x01,0x05,0x02,0x00]

-# W32: v_cmp_ngt_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x0b,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_ngt_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x0b,0xd4,0xff,0xff,0x03,0x00]
0x0a,0x00,0x0b,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_ngt_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x0b,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_ngt_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x0b,0xd4,0xff,0xff,0x03,0x00]

-# W32: v_cmp_ngt_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x0b,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_ngt_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x0b,0xd4,0x01,0x04,0x00,0x00]
0x0a,0x00,0x0b,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_ngt_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x0b,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_ngt_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x0b,0xd4,0x01,0x04,0x00,0x00]

-# W32: v_cmp_ngt_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x0b,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_ngt_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x0b,0xd4,0x69,0xd2,0x00,0x00]
0x0a,0x00,0x0b,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_ngt_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x0b,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_ngt_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x0b,0xd4,0x69,0xd2,0x00,0x00]

-# W32: v_cmp_ngt_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0b,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_ngt_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0b,0xd4,0x6a,0xf6,0x00,0x00]
0x0a,0x00,0x0b,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_ngt_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0b,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_ngt_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0b,0xd4,0x6a,0xf6,0x00,0x00]

-# W32: v_cmp_ngt_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0b,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_ngt_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0b,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
0x0a,0x00,0x0b,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_ngt_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0b,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_ngt_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0b,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_ngt_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x0b,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_ngt_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x0b,0xd4,0x7b,0xfa,0x01,0x00]
0x0a,0x00,0x0b,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_ngt_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x0b,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_ngt_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x0b,0xd4,0x7b,0xfa,0x01,0x00]

-# W32: v_cmp_ngt_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x0b,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_ngt_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x0b,0xd4,0x7d,0xe0,0x01,0x00]
0x0a,0x00,0x0b,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_ngt_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x0b,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_ngt_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x0b,0xd4,0x7d,0xe0,0x01,0x00]

-# W32: v_cmp_ngt_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x0b,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_ngt_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x0b,0xd4,0x7e,0x82,0x01,0x00]
0x0a,0x00,0x0b,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_ngt_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x0b,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_ngt_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x0b,0xd4,0x7e,0x82,0x01,0x00]

-# W32: v_cmp_ngt_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x0b,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_ngt_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x0b,0xd4,0x7f,0xf8,0x00,0x00]
0x0a,0x01,0x0b,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_ngt_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x0b,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_ngt_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x0b,0xd4,0x7f,0xf8,0x00,0x00]

-# W32: v_cmp_ngt_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x0b,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_ngt_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x0b,0xd4,0x7c,0xfc,0x00,0x00]
0x0a,0x00,0x0b,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_ngt_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x0b,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_ngt_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x0b,0xd4,0x7c,0xfc,0x00,0x00]

-# W32: v_cmp_ngt_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x0b,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_ngt_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x0b,0xd4,0xc1,0xfe,0x00,0x00]
0x68,0x00,0x0b,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_ngt_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x0b,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_ngt_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x0b,0xd4,0xc1,0xfe,0x00,0x00]

-# W32: v_cmp_ngt_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x0b,0xd4,0xf0,0xfa,0x00,0x40]
-# W64: v_cmp_ngt_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x0b,0xd4,0xf0,0xfa,0x00,0x40]
0x6a,0x00,0x0b,0xd4,0xf0,0xfa,0x00,0x40
+# W32: v_cmp_ngt_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x0b,0xd4,0xf0,0xfa,0x00,0x40]
+# W64: v_cmp_ngt_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x0b,0xd4,0xf0,0xfa,0x00,0x40]

+0x7a,0x02,0x0b,0xd4,0xfd,0xd4,0x00,0x20
# W32: v_cmp_ngt_f16_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x0b,0xd4,0xfd,0xd4,0x00,0x20]
# W64: v_cmp_ngt_f16_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x0b,0xd4,0xfd,0xd4,0x00,0x20]
-0x7a,0x02,0x0b,0xd4,0xfd,0xd4,0x00,0x20

-# GFX11: v_cmp_ngt_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x0b,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
0x7c,0x83,0x0b,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmp_ngt_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x0b,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_ngt_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x1b,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_ngt_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x1b,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x1b,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_ngt_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x1b,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_ngt_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x1b,0xd4,0x01,0x05,0x02,0x00]

-# W32: v_cmp_ngt_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x1b,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_ngt_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x1b,0xd4,0xff,0xff,0x03,0x00]
0x0a,0x00,0x1b,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_ngt_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x1b,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_ngt_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x1b,0xd4,0xff,0xff,0x03,0x00]

-# W32: v_cmp_ngt_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x1b,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_ngt_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x1b,0xd4,0x01,0x04,0x00,0x00]
0x0a,0x00,0x1b,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_ngt_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x1b,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_ngt_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x1b,0xd4,0x01,0x04,0x00,0x00]

-# W32: v_cmp_ngt_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x1b,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_ngt_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x1b,0xd4,0x69,0xd2,0x00,0x00]
0x0a,0x00,0x1b,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_ngt_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x1b,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_ngt_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x1b,0xd4,0x69,0xd2,0x00,0x00]

-# W32: v_cmp_ngt_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1b,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_ngt_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1b,0xd4,0x6a,0xf6,0x00,0x00]
0x0a,0x00,0x1b,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_ngt_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1b,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_ngt_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1b,0xd4,0x6a,0xf6,0x00,0x00]

+0x0a,0x00,0x1b,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
# W32: v_cmp_ngt_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x1b,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
# W64: v_cmp_ngt_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x1b,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x1b,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf

-# W32: v_cmp_ngt_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x1b,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_ngt_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x1b,0xd4,0x7b,0xfa,0x01,0x00]
0x0a,0x00,0x1b,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_ngt_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x1b,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_ngt_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x1b,0xd4,0x7b,0xfa,0x01,0x00]

-# W32: v_cmp_ngt_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x1b,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_ngt_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x1b,0xd4,0x7d,0xe0,0x01,0x00]
0x0a,0x00,0x1b,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_ngt_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x1b,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_ngt_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x1b,0xd4,0x7d,0xe0,0x01,0x00]

-# W32: v_cmp_ngt_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x1b,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_ngt_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x1b,0xd4,0x7e,0x82,0x01,0x00]
0x0a,0x00,0x1b,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_ngt_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x1b,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_ngt_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x1b,0xd4,0x7e,0x82,0x01,0x00]

-# W32: v_cmp_ngt_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x1b,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_ngt_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x1b,0xd4,0x7f,0xf8,0x00,0x00]
0x0a,0x01,0x1b,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_ngt_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x1b,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_ngt_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x1b,0xd4,0x7f,0xf8,0x00,0x00]

-# W32: v_cmp_ngt_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x1b,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_ngt_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x1b,0xd4,0x7c,0xfc,0x00,0x00]
0x0a,0x00,0x1b,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_ngt_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x1b,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_ngt_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x1b,0xd4,0x7c,0xfc,0x00,0x00]

-# W32: v_cmp_ngt_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x1b,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_ngt_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x1b,0xd4,0xc1,0xfe,0x00,0x00]
0x68,0x00,0x1b,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_ngt_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x1b,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_ngt_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x1b,0xd4,0xc1,0xfe,0x00,0x00]

-# W32: v_cmp_ngt_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x1b,0xd4,0xf0,0xfa,0x00,0x40]
-# W64: v_cmp_ngt_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x1b,0xd4,0xf0,0xfa,0x00,0x40]
0x6a,0x00,0x1b,0xd4,0xf0,0xfa,0x00,0x40
+# W32: v_cmp_ngt_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x1b,0xd4,0xf0,0xfa,0x00,0x40]
+# W64: v_cmp_ngt_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x1b,0xd4,0xf0,0xfa,0x00,0x40]

+0x7a,0x02,0x1b,0xd4,0xfd,0xd4,0x00,0x20
# W32: v_cmp_ngt_f32_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x1b,0xd4,0xfd,0xd4,0x00,0x20]
# W64: v_cmp_ngt_f32_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x1b,0xd4,0xfd,0xd4,0x00,0x20]
-0x7a,0x02,0x1b,0xd4,0xfd,0xd4,0x00,0x20

-# GFX11: v_cmp_ngt_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x1b,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
0x7c,0x83,0x1b,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_ngt_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x1b,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_ngt_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2b,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_ngt_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2b,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x2b,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_ngt_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2b,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_ngt_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2b,0xd4,0x01,0x05,0x02,0x00]

+0x0a,0x00,0x2b,0xd4,0xfe,0xfd,0x03,0x00
# W32: v_cmp_ngt_f64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x2b,0xd4,0xfe,0xfd,0x03,0x00]
# W64: v_cmp_ngt_f64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x2b,0xd4,0xfe,0xfd,0x03,0x00]
-0x0a,0x00,0x2b,0xd4,0xfe,0xfd,0x03,0x00

-# W32: v_cmp_ngt_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2b,0xd4,0x02,0x08,0x00,0x00]
-# W64: v_cmp_ngt_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2b,0xd4,0x02,0x08,0x00,0x00]
0x0a,0x00,0x2b,0xd4,0x02,0x08,0x00,0x00
+# W32: v_cmp_ngt_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2b,0xd4,0x02,0x08,0x00,0x00]
+# W64: v_cmp_ngt_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2b,0xd4,0x02,0x08,0x00,0x00]

+0x0a,0x00,0x2b,0xd4,0x68,0xd0,0x00,0x00
# W32: v_cmp_ngt_f64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x2b,0xd4,0x68,0xd0,0x00,0x00]
# W64: v_cmp_ngt_f64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x2b,0xd4,0x68,0xd0,0x00,0x00]
-0x0a,0x00,0x2b,0xd4,0x68,0xd0,0x00,0x00

-# W32: v_cmp_ngt_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2b,0xd4,0x6a,0xf4,0x00,0x00]
-# W64: v_cmp_ngt_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2b,0xd4,0x6a,0xf4,0x00,0x00]
0x0a,0x00,0x2b,0xd4,0x6a,0xf4,0x00,0x00
+# W32: v_cmp_ngt_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2b,0xd4,0x6a,0xf4,0x00,0x00]
+# W64: v_cmp_ngt_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2b,0xd4,0x6a,0xf4,0x00,0x00]

+0x0a,0x00,0x2b,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
# W32: v_cmp_ngt_f64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x2b,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
# W64: v_cmp_ngt_f64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x2b,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x2b,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf

-# W32: v_cmp_ngt_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x2b,0xd4,0x7e,0xfa,0x01,0x20]
-# W64: v_cmp_ngt_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x2b,0xd4,0x7e,0xfa,0x01,0x20]
0x0a,0x01,0x2b,0xd4,0x7e,0xfa,0x01,0x20
+# W32: v_cmp_ngt_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x2b,0xd4,0x7e,0xfa,0x01,0x20]
+# W64: v_cmp_ngt_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x2b,0xd4,0x7e,0xfa,0x01,0x20]

-# W32: v_cmp_ngt_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x2b,0xd4,0x7c,0xe0,0x01,0x00]
-# W64: v_cmp_ngt_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x2b,0xd4,0x7c,0xe0,0x01,0x00]
0x0a,0x00,0x2b,0xd4,0x7c,0xe0,0x01,0x00
+# W32: v_cmp_ngt_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x2b,0xd4,0x7c,0xe0,0x01,0x00]
+# W64: v_cmp_ngt_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x2b,0xd4,0x7c,0xe0,0x01,0x00]

-# W32: v_cmp_ngt_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x2b,0xd4,0xc1,0x82,0x01,0x00]
-# W64: v_cmp_ngt_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x2b,0xd4,0xc1,0x82,0x01,0x00]
0x68,0x00,0x2b,0xd4,0xc1,0x82,0x01,0x00
+# W32: v_cmp_ngt_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x2b,0xd4,0xc1,0x82,0x01,0x00]
+# W64: v_cmp_ngt_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x2b,0xd4,0xc1,0x82,0x01,0x00]

-# W32: v_cmp_ngt_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x2b,0xd4,0xf0,0xf8,0x00,0x00]
-# W64: v_cmp_ngt_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x2b,0xd4,0xf0,0xf8,0x00,0x00]
0x6a,0x00,0x2b,0xd4,0xf0,0xf8,0x00,0x00
+# W32: v_cmp_ngt_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x2b,0xd4,0xf0,0xf8,0x00,0x00]
+# W64: v_cmp_ngt_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x2b,0xd4,0xf0,0xf8,0x00,0x00]

+0x7a,0x03,0x2b,0xd4,0xfd,0xfc,0x00,0x60
# W32: v_cmp_ngt_f64_e64 ttmp14, -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x2b,0xd4,0xfd,0xfc,0x00,0x60]
# W64: v_cmp_ngt_f64_e64 ttmp[14:15], -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x2b,0xd4,0xfd,0xfc,0x00,0x60]
-0x7a,0x03,0x2b,0xd4,0xfd,0xfc,0x00,0x60

-# GFX11: v_cmp_ngt_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x2b,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
0x7c,0x82,0x2b,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_ngt_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x2b,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_nle_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x0c,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_nle_f16_e64 s[10:11],
v1, v2 ; encoding: [0x0a,0x00,0x0c,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x0c,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_nle_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x0c,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_nle_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x0c,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_nle_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x0c,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_nle_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x0c,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x0c,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_nle_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x0c,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_nle_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x0c,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_nle_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x0c,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_nle_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x0c,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x0c,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_nle_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x0c,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_nle_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x0c,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_nle_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x0c,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_nle_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x0c,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x0c,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_nle_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x0c,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_nle_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x0c,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_nle_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0c,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_nle_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0c,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x0c,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_nle_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0c,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_nle_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0c,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_nle_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0c,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_nle_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0c,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x0a,0x00,0x0c,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_nle_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0c,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_nle_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0c,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_nle_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x0c,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_nle_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x0c,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x0c,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_nle_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x0c,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_nle_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x0c,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_nle_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x0c,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_nle_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x0c,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x0c,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_nle_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x0c,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_nle_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x0c,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_nle_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x0c,0xd4,0x7e,0x82,0x01,0x00] -# 
W64: v_cmp_nle_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x0c,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x0c,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_nle_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x0c,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_nle_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x0c,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_nle_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x0c,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_nle_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x0c,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x01,0x0c,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_nle_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x0c,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_nle_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x0c,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_nle_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x0c,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_nle_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x0c,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x0c,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_nle_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x0c,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_nle_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x0c,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_nle_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x0c,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_nle_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x0c,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x0c,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_nle_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x0c,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_nle_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x0c,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_nle_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x0c,0xd4,0xf0,0xfa,0x00,0x40] -# W64: v_cmp_nle_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x0c,0xd4,0xf0,0xfa,0x00,0x40] 0x6a,0x00,0x0c,0xd4,0xf0,0xfa,0x00,0x40 +# W32: v_cmp_nle_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x0c,0xd4,0xf0,0xfa,0x00,0x40] +# W64: v_cmp_nle_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x0c,0xd4,0xf0,0xfa,0x00,0x40] +0x7a,0x02,0x0c,0xd4,0xfd,0xd4,0x00,0x20 # W32: v_cmp_nle_f16_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x0c,0xd4,0xfd,0xd4,0x00,0x20] # W64: v_cmp_nle_f16_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x0c,0xd4,0xfd,0xd4,0x00,0x20] -0x7a,0x02,0x0c,0xd4,0xfd,0xd4,0x00,0x20 -# GFX11: v_cmp_nle_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x0c,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7c,0x83,0x0c,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmp_nle_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x0c,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_nle_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x1c,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_nle_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x1c,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x1c,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_nle_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x1c,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_nle_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x1c,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_nle_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x1c,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_nle_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x1c,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x1c,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_nle_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x1c,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_nle_f32_e64 s[10:11], v255, v255 ; encoding: 
[0x0a,0x00,0x1c,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_nle_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x1c,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_nle_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x1c,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x1c,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_nle_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x1c,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_nle_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x1c,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_nle_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x1c,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_nle_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x1c,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x1c,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_nle_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x1c,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_nle_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x1c,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_nle_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1c,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_nle_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1c,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x1c,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_nle_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1c,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_nle_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1c,0xd4,0x6a,0xf6,0x00,0x00] +0x0a,0x00,0x1c,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_nle_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x1c,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: v_cmp_nle_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x1c,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x1c,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_nle_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x1c,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_nle_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x1c,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x1c,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_nle_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x1c,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_nle_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x1c,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_nle_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x1c,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_nle_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x1c,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x1c,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_nle_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x1c,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_nle_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x1c,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_nle_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x1c,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_nle_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x1c,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x1c,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_nle_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x1c,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_nle_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x1c,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_nle_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x1c,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_nle_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x1c,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x01,0x1c,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_nle_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x1c,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_nle_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x1c,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_nle_f32_e64 s10, null, exec_lo ; encoding: 
[0x0a,0x00,0x1c,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_nle_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x1c,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x1c,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_nle_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x1c,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_nle_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x1c,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_nle_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x1c,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_nle_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x1c,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x1c,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_nle_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x1c,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_nle_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x1c,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_nle_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x1c,0xd4,0xf0,0xfa,0x00,0x40] -# W64: v_cmp_nle_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x1c,0xd4,0xf0,0xfa,0x00,0x40] 0x6a,0x00,0x1c,0xd4,0xf0,0xfa,0x00,0x40 +# W32: v_cmp_nle_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x1c,0xd4,0xf0,0xfa,0x00,0x40] +# W64: v_cmp_nle_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x1c,0xd4,0xf0,0xfa,0x00,0x40] +0x7a,0x02,0x1c,0xd4,0xfd,0xd4,0x00,0x20 # W32: v_cmp_nle_f32_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x1c,0xd4,0xfd,0xd4,0x00,0x20] # W64: v_cmp_nle_f32_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x1c,0xd4,0xfd,0xd4,0x00,0x20] -0x7a,0x02,0x1c,0xd4,0xfd,0xd4,0x00,0x20 -# GFX11: v_cmp_nle_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x1c,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] 0x7c,0x83,0x1c,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf +# GFX11: v_cmp_nle_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x1c,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] -# W32: v_cmp_nle_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2c,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_nle_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2c,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x2c,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_nle_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2c,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_nle_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2c,0xd4,0x01,0x05,0x02,0x00] +0x0a,0x00,0x2c,0xd4,0xfe,0xfd,0x03,0x00 # W32: v_cmp_nle_f64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x2c,0xd4,0xfe,0xfd,0x03,0x00] # W64: v_cmp_nle_f64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x2c,0xd4,0xfe,0xfd,0x03,0x00] -0x0a,0x00,0x2c,0xd4,0xfe,0xfd,0x03,0x00 -# W32: v_cmp_nle_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2c,0xd4,0x02,0x08,0x00,0x00] -# W64: v_cmp_nle_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2c,0xd4,0x02,0x08,0x00,0x00] 0x0a,0x00,0x2c,0xd4,0x02,0x08,0x00,0x00 +# W32: v_cmp_nle_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2c,0xd4,0x02,0x08,0x00,0x00] +# W64: v_cmp_nle_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2c,0xd4,0x02,0x08,0x00,0x00] +0x0a,0x00,0x2c,0xd4,0x68,0xd0,0x00,0x00 # W32: v_cmp_nle_f64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x2c,0xd4,0x68,0xd0,0x00,0x00] # W64: v_cmp_nle_f64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x2c,0xd4,0x68,0xd0,0x00,0x00] -0x0a,0x00,0x2c,0xd4,0x68,0xd0,0x00,0x00 -# W32: v_cmp_nle_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2c,0xd4,0x6a,0xf4,0x00,0x00] -# W64: v_cmp_nle_f64_e64 s[10:11], vcc, ttmp[14:15] ; 
encoding: [0x0a,0x00,0x2c,0xd4,0x6a,0xf4,0x00,0x00] 0x0a,0x00,0x2c,0xd4,0x6a,0xf4,0x00,0x00 +# W32: v_cmp_nle_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2c,0xd4,0x6a,0xf4,0x00,0x00] +# W64: v_cmp_nle_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2c,0xd4,0x6a,0xf4,0x00,0x00] +0x0a,0x00,0x2c,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_nle_f64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x2c,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: v_cmp_nle_f64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x2c,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x2c,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_nle_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x2c,0xd4,0x7e,0xfa,0x01,0x20] -# W64: v_cmp_nle_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x2c,0xd4,0x7e,0xfa,0x01,0x20] 0x0a,0x01,0x2c,0xd4,0x7e,0xfa,0x01,0x20 +# W32: v_cmp_nle_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x2c,0xd4,0x7e,0xfa,0x01,0x20] +# W64: v_cmp_nle_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x2c,0xd4,0x7e,0xfa,0x01,0x20] -# W32: v_cmp_nle_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x2c,0xd4,0x7c,0xe0,0x01,0x00] -# W64: v_cmp_nle_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x2c,0xd4,0x7c,0xe0,0x01,0x00] 0x0a,0x00,0x2c,0xd4,0x7c,0xe0,0x01,0x00 +# W32: v_cmp_nle_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x2c,0xd4,0x7c,0xe0,0x01,0x00] +# W64: v_cmp_nle_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x2c,0xd4,0x7c,0xe0,0x01,0x00] -# W32: v_cmp_nle_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x2c,0xd4,0xc1,0x82,0x01,0x00] -# W64: v_cmp_nle_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x2c,0xd4,0xc1,0x82,0x01,0x00] 0x68,0x00,0x2c,0xd4,0xc1,0x82,0x01,0x00 +# W32: v_cmp_nle_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x2c,0xd4,0xc1,0x82,0x01,0x00] +# W64: v_cmp_nle_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x2c,0xd4,0xc1,0x82,0x01,0x00] -# W32: v_cmp_nle_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x2c,0xd4,0xf0,0xf8,0x00,0x00] -# W64: v_cmp_nle_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x2c,0xd4,0xf0,0xf8,0x00,0x00] 0x6a,0x00,0x2c,0xd4,0xf0,0xf8,0x00,0x00 +# W32: v_cmp_nle_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x2c,0xd4,0xf0,0xf8,0x00,0x00] +# W64: v_cmp_nle_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x2c,0xd4,0xf0,0xf8,0x00,0x00] +0x7a,0x03,0x2c,0xd4,0xfd,0xfc,0x00,0x60 # W32: v_cmp_nle_f64_e64 ttmp14, -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x2c,0xd4,0xfd,0xfc,0x00,0x60] # W64: v_cmp_nle_f64_e64 ttmp[14:15], -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x2c,0xd4,0xfd,0xfc,0x00,0x60] -0x7a,0x03,0x2c,0xd4,0xfd,0xfc,0x00,0x60 -# GFX11: v_cmp_nle_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x2c,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7c,0x82,0x2c,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf +# GFX11: v_cmp_nle_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x2c,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -# W32: v_cmp_nlg_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x0a,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_nlg_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x0a,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x0a,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_nlg_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x0a,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_nlg_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x0a,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_nlg_f16_e64 s10, v255, v255 ; encoding: 
[0x0a,0x00,0x0a,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_nlg_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x0a,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x0a,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_nlg_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x0a,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_nlg_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x0a,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_nlg_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x0a,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_nlg_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x0a,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x0a,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_nlg_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x0a,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_nlg_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x0a,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_nlg_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x0a,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_nlg_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x0a,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x0a,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_nlg_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x0a,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_nlg_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x0a,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_nlg_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0a,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_nlg_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0a,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x0a,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_nlg_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0a,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_nlg_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0a,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_nlg_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0a,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_nlg_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0a,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x0a,0x00,0x0a,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_nlg_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0a,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_nlg_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0a,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_nlg_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x0a,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_nlg_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x0a,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x0a,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_nlg_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x0a,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_nlg_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x0a,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_nlg_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x0a,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_nlg_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x0a,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x0a,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_nlg_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x0a,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_nlg_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x0a,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_nlg_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x0a,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_nlg_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x0a,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x0a,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_nlg_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x0a,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_nlg_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x0a,0xd4,0x7e,0x82,0x01,0x00] -# W32: 
v_cmp_nlg_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x0a,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_nlg_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x0a,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x01,0x0a,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_nlg_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x0a,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_nlg_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x0a,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_nlg_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x0a,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_nlg_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x0a,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x0a,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_nlg_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x0a,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_nlg_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x0a,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_nlg_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x0a,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_nlg_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x0a,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x0a,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_nlg_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x0a,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_nlg_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x0a,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_nlg_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x0a,0xd4,0xf0,0xfa,0x00,0x40] -# W64: v_cmp_nlg_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x0a,0xd4,0xf0,0xfa,0x00,0x40] 0x6a,0x00,0x0a,0xd4,0xf0,0xfa,0x00,0x40 +# W32: v_cmp_nlg_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x0a,0xd4,0xf0,0xfa,0x00,0x40] +# W64: v_cmp_nlg_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x0a,0xd4,0xf0,0xfa,0x00,0x40] +0x7a,0x02,0x0a,0xd4,0xfd,0xd4,0x00,0x20 # W32: v_cmp_nlg_f16_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x0a,0xd4,0xfd,0xd4,0x00,0x20] # W64: v_cmp_nlg_f16_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x0a,0xd4,0xfd,0xd4,0x00,0x20] -0x7a,0x02,0x0a,0xd4,0xfd,0xd4,0x00,0x20 -# GFX11: v_cmp_nlg_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x0a,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7c,0x83,0x0a,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmp_nlg_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x0a,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_nlg_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x1a,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_nlg_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x1a,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x1a,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_nlg_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x1a,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_nlg_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x1a,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_nlg_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x1a,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_nlg_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x1a,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x1a,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_nlg_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x1a,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_nlg_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x1a,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_nlg_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x1a,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_nlg_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x1a,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x1a,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_nlg_f32_e64 s10, s1, s2 ; encoding: 
[0x0a,0x00,0x1a,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_nlg_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x1a,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_nlg_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x1a,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_nlg_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x1a,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x1a,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_nlg_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x1a,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_nlg_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x1a,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_nlg_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1a,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_nlg_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1a,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x1a,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_nlg_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1a,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_nlg_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1a,0xd4,0x6a,0xf6,0x00,0x00] +0x0a,0x00,0x1a,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_nlg_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x1a,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: v_cmp_nlg_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x1a,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x1a,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_nlg_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x1a,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_nlg_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x1a,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x1a,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_nlg_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x1a,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_nlg_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x1a,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_nlg_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x1a,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_nlg_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x1a,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x1a,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_nlg_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x1a,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_nlg_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x1a,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_nlg_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x1a,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_nlg_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x1a,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x1a,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_nlg_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x1a,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_nlg_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x1a,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_nlg_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x1a,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_nlg_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x1a,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x01,0x1a,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_nlg_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x1a,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_nlg_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x1a,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_nlg_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x1a,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_nlg_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x1a,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x1a,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_nlg_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x1a,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_nlg_f32_e64 s[10:11], null, 
exec_lo ; encoding: [0x0a,0x00,0x1a,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_nlg_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x1a,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_nlg_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x1a,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x1a,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_nlg_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x1a,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_nlg_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x1a,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_nlg_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x1a,0xd4,0xf0,0xfa,0x00,0x40] -# W64: v_cmp_nlg_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x1a,0xd4,0xf0,0xfa,0x00,0x40] 0x6a,0x00,0x1a,0xd4,0xf0,0xfa,0x00,0x40 +# W32: v_cmp_nlg_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x1a,0xd4,0xf0,0xfa,0x00,0x40] +# W64: v_cmp_nlg_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x1a,0xd4,0xf0,0xfa,0x00,0x40] +0x7a,0x02,0x1a,0xd4,0xfd,0xd4,0x00,0x20 # W32: v_cmp_nlg_f32_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x1a,0xd4,0xfd,0xd4,0x00,0x20] # W64: v_cmp_nlg_f32_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x1a,0xd4,0xfd,0xd4,0x00,0x20] -0x7a,0x02,0x1a,0xd4,0xfd,0xd4,0x00,0x20 -# GFX11: v_cmp_nlg_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x1a,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] 0x7c,0x83,0x1a,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf +# GFX11: v_cmp_nlg_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x1a,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] -# W32: v_cmp_nlg_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2a,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_nlg_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2a,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x2a,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_nlg_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2a,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_nlg_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2a,0xd4,0x01,0x05,0x02,0x00] +0x0a,0x00,0x2a,0xd4,0xfe,0xfd,0x03,0x00 # W32: v_cmp_nlg_f64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x2a,0xd4,0xfe,0xfd,0x03,0x00] # W64: v_cmp_nlg_f64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x2a,0xd4,0xfe,0xfd,0x03,0x00] -0x0a,0x00,0x2a,0xd4,0xfe,0xfd,0x03,0x00 -# W32: v_cmp_nlg_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2a,0xd4,0x02,0x08,0x00,0x00] -# W64: v_cmp_nlg_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2a,0xd4,0x02,0x08,0x00,0x00] 0x0a,0x00,0x2a,0xd4,0x02,0x08,0x00,0x00 +# W32: v_cmp_nlg_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2a,0xd4,0x02,0x08,0x00,0x00] +# W64: v_cmp_nlg_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2a,0xd4,0x02,0x08,0x00,0x00] +0x0a,0x00,0x2a,0xd4,0x68,0xd0,0x00,0x00 # W32: v_cmp_nlg_f64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x2a,0xd4,0x68,0xd0,0x00,0x00] # W64: v_cmp_nlg_f64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x2a,0xd4,0x68,0xd0,0x00,0x00] -0x0a,0x00,0x2a,0xd4,0x68,0xd0,0x00,0x00 -# W32: v_cmp_nlg_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2a,0xd4,0x6a,0xf4,0x00,0x00] -# W64: v_cmp_nlg_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2a,0xd4,0x6a,0xf4,0x00,0x00] 0x0a,0x00,0x2a,0xd4,0x6a,0xf4,0x00,0x00 +# W32: v_cmp_nlg_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2a,0xd4,0x6a,0xf4,0x00,0x00] +# W64: v_cmp_nlg_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2a,0xd4,0x6a,0xf4,0x00,0x00] 
+0x0a,0x00,0x2a,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_nlg_f64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x2a,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: v_cmp_nlg_f64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x2a,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x2a,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_nlg_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x2a,0xd4,0x7e,0xfa,0x01,0x20] -# W64: v_cmp_nlg_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x2a,0xd4,0x7e,0xfa,0x01,0x20] 0x0a,0x01,0x2a,0xd4,0x7e,0xfa,0x01,0x20 +# W32: v_cmp_nlg_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x2a,0xd4,0x7e,0xfa,0x01,0x20] +# W64: v_cmp_nlg_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x2a,0xd4,0x7e,0xfa,0x01,0x20] -# W32: v_cmp_nlg_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x2a,0xd4,0x7c,0xe0,0x01,0x00] -# W64: v_cmp_nlg_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x2a,0xd4,0x7c,0xe0,0x01,0x00] 0x0a,0x00,0x2a,0xd4,0x7c,0xe0,0x01,0x00 +# W32: v_cmp_nlg_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x2a,0xd4,0x7c,0xe0,0x01,0x00] +# W64: v_cmp_nlg_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x2a,0xd4,0x7c,0xe0,0x01,0x00] -# W32: v_cmp_nlg_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x2a,0xd4,0xc1,0x82,0x01,0x00] -# W64: v_cmp_nlg_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x2a,0xd4,0xc1,0x82,0x01,0x00] 0x68,0x00,0x2a,0xd4,0xc1,0x82,0x01,0x00 +# W32: v_cmp_nlg_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x2a,0xd4,0xc1,0x82,0x01,0x00] +# W64: v_cmp_nlg_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x2a,0xd4,0xc1,0x82,0x01,0x00] -# W32: v_cmp_nlg_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x2a,0xd4,0xf0,0xf8,0x00,0x00] -# W64: v_cmp_nlg_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x2a,0xd4,0xf0,0xf8,0x00,0x00] 0x6a,0x00,0x2a,0xd4,0xf0,0xf8,0x00,0x00 +# W32: v_cmp_nlg_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x2a,0xd4,0xf0,0xf8,0x00,0x00] +# W64: v_cmp_nlg_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x2a,0xd4,0xf0,0xf8,0x00,0x00] +0x7a,0x03,0x2a,0xd4,0xfd,0xfc,0x00,0x60 # W32: v_cmp_nlg_f64_e64 ttmp14, -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x2a,0xd4,0xfd,0xfc,0x00,0x60] # W64: v_cmp_nlg_f64_e64 ttmp[14:15], -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x2a,0xd4,0xfd,0xfc,0x00,0x60] -0x7a,0x03,0x2a,0xd4,0xfd,0xfc,0x00,0x60 -# GFX11: v_cmp_nlg_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x2a,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7c,0x82,0x2a,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf +# GFX11: v_cmp_nlg_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x2a,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -# W32: v_cmp_nlt_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x0e,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_nlt_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x0e,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x0e,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_nlt_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x0e,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_nlt_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x0e,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_nlt_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x0e,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_nlt_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x0e,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x0e,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_nlt_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x0e,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_nlt_f16_e64 s[10:11], v255, v255 ; 
encoding: [0x0a,0x00,0x0e,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_nlt_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x0e,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_nlt_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x0e,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x0e,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_nlt_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x0e,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_nlt_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x0e,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_nlt_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x0e,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_nlt_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x0e,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x0e,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_nlt_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x0e,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_nlt_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x0e,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_nlt_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0e,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_nlt_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0e,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x0e,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_nlt_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0e,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_nlt_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0e,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_nlt_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0e,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_nlt_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0e,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x0a,0x00,0x0e,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_nlt_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0e,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_nlt_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0e,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_nlt_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x0e,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_nlt_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x0e,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x0e,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_nlt_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x0e,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_nlt_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x0e,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_nlt_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x0e,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_nlt_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x0e,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x0e,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_nlt_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x0e,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_nlt_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x0e,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_nlt_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x0e,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_nlt_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x0e,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x0e,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_nlt_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x0e,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_nlt_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x0e,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_nlt_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x0e,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_nlt_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x0e,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x01,0x0e,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_nlt_f16_e64 s10, |exec_hi|, null ; encoding: 
[0x0a,0x01,0x0e,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_nlt_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x0e,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_nlt_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x0e,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_nlt_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x0e,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x0e,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_nlt_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x0e,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_nlt_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x0e,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_nlt_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x0e,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_nlt_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x0e,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x0e,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_nlt_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x0e,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_nlt_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x0e,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_nlt_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x0e,0xd4,0xf0,0xfa,0x00,0x40] -# W64: v_cmp_nlt_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x0e,0xd4,0xf0,0xfa,0x00,0x40] 0x6a,0x00,0x0e,0xd4,0xf0,0xfa,0x00,0x40 +# W32: v_cmp_nlt_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x0e,0xd4,0xf0,0xfa,0x00,0x40] +# W64: v_cmp_nlt_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x0e,0xd4,0xf0,0xfa,0x00,0x40] +0x7a,0x02,0x0e,0xd4,0xfd,0xd4,0x00,0x20 # W32: v_cmp_nlt_f16_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x0e,0xd4,0xfd,0xd4,0x00,0x20] # W64: v_cmp_nlt_f16_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x0e,0xd4,0xfd,0xd4,0x00,0x20] -0x7a,0x02,0x0e,0xd4,0xfd,0xd4,0x00,0x20 -# GFX11: v_cmp_nlt_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x0e,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7c,0x83,0x0e,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmp_nlt_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x0e,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_nlt_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x1e,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_nlt_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x1e,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x1e,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_nlt_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x1e,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_nlt_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x1e,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_nlt_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x1e,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_nlt_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x1e,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x1e,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_nlt_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x1e,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_nlt_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x1e,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_nlt_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x1e,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_nlt_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x1e,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x1e,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_nlt_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x1e,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_nlt_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x1e,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_nlt_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x1e,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_nlt_f32_e64 s[10:11], s105, s105 ; encoding: 
[0x0a,0x00,0x1e,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x1e,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_nlt_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x1e,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_nlt_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x1e,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_nlt_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1e,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_nlt_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1e,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x1e,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_nlt_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1e,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_nlt_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1e,0xd4,0x6a,0xf6,0x00,0x00] +0x0a,0x00,0x1e,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_nlt_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x1e,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: v_cmp_nlt_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x1e,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x1e,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_nlt_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x1e,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_nlt_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x1e,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x1e,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_nlt_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x1e,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_nlt_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x1e,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_nlt_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x1e,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_nlt_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x1e,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x1e,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_nlt_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x1e,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_nlt_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x1e,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_nlt_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x1e,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_nlt_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x1e,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x1e,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_nlt_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x1e,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_nlt_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x1e,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_nlt_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x1e,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_nlt_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x1e,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x01,0x1e,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_nlt_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x1e,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_nlt_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x1e,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_nlt_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x1e,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_nlt_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x1e,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x1e,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_nlt_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x1e,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_nlt_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x1e,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_nlt_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x1e,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_nlt_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x1e,0xd4,0xc1,0xfe,0x00,0x00] 
0x68,0x00,0x1e,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_nlt_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x1e,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_nlt_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x1e,0xd4,0xc1,0xfe,0x00,0x00]

-# W32: v_cmp_nlt_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x1e,0xd4,0xf0,0xfa,0x00,0x40]
-# W64: v_cmp_nlt_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x1e,0xd4,0xf0,0xfa,0x00,0x40]
0x6a,0x00,0x1e,0xd4,0xf0,0xfa,0x00,0x40
+# W32: v_cmp_nlt_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x1e,0xd4,0xf0,0xfa,0x00,0x40]
+# W64: v_cmp_nlt_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x1e,0xd4,0xf0,0xfa,0x00,0x40]

+0x7a,0x02,0x1e,0xd4,0xfd,0xd4,0x00,0x20
# W32: v_cmp_nlt_f32_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x1e,0xd4,0xfd,0xd4,0x00,0x20]
# W64: v_cmp_nlt_f32_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x1e,0xd4,0xfd,0xd4,0x00,0x20]
-0x7a,0x02,0x1e,0xd4,0xfd,0xd4,0x00,0x20

-# GFX11: v_cmp_nlt_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x1e,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
0x7c,0x83,0x1e,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_nlt_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x1e,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_nlt_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2e,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_nlt_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2e,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x2e,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_nlt_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2e,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_nlt_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2e,0xd4,0x01,0x05,0x02,0x00]

+0x0a,0x00,0x2e,0xd4,0xfe,0xfd,0x03,0x00
# W32: v_cmp_nlt_f64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x2e,0xd4,0xfe,0xfd,0x03,0x00]
# W64: v_cmp_nlt_f64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x2e,0xd4,0xfe,0xfd,0x03,0x00]
-0x0a,0x00,0x2e,0xd4,0xfe,0xfd,0x03,0x00

-# W32: v_cmp_nlt_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2e,0xd4,0x02,0x08,0x00,0x00]
-# W64: v_cmp_nlt_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2e,0xd4,0x02,0x08,0x00,0x00]
0x0a,0x00,0x2e,0xd4,0x02,0x08,0x00,0x00
+# W32: v_cmp_nlt_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2e,0xd4,0x02,0x08,0x00,0x00]
+# W64: v_cmp_nlt_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2e,0xd4,0x02,0x08,0x00,0x00]

+0x0a,0x00,0x2e,0xd4,0x68,0xd0,0x00,0x00
# W32: v_cmp_nlt_f64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x2e,0xd4,0x68,0xd0,0x00,0x00]
# W64: v_cmp_nlt_f64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x2e,0xd4,0x68,0xd0,0x00,0x00]
-0x0a,0x00,0x2e,0xd4,0x68,0xd0,0x00,0x00

-# W32: v_cmp_nlt_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2e,0xd4,0x6a,0xf4,0x00,0x00]
-# W64: v_cmp_nlt_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2e,0xd4,0x6a,0xf4,0x00,0x00]
0x0a,0x00,0x2e,0xd4,0x6a,0xf4,0x00,0x00
+# W32: v_cmp_nlt_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2e,0xd4,0x6a,0xf4,0x00,0x00]
+# W64: v_cmp_nlt_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2e,0xd4,0x6a,0xf4,0x00,0x00]

+0x0a,0x00,0x2e,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
# W32: v_cmp_nlt_f64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x2e,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
# W64: v_cmp_nlt_f64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x2e,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x2e,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf

-# W32: v_cmp_nlt_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x2e,0xd4,0x7e,0xfa,0x01,0x20]
-# W64: v_cmp_nlt_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x2e,0xd4,0x7e,0xfa,0x01,0x20]
0x0a,0x01,0x2e,0xd4,0x7e,0xfa,0x01,0x20
+# W32: v_cmp_nlt_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x2e,0xd4,0x7e,0xfa,0x01,0x20]
+# W64: v_cmp_nlt_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x2e,0xd4,0x7e,0xfa,0x01,0x20]

-# W32: v_cmp_nlt_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x2e,0xd4,0x7c,0xe0,0x01,0x00]
-# W64: v_cmp_nlt_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x2e,0xd4,0x7c,0xe0,0x01,0x00]
0x0a,0x00,0x2e,0xd4,0x7c,0xe0,0x01,0x00
+# W32: v_cmp_nlt_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x2e,0xd4,0x7c,0xe0,0x01,0x00]
+# W64: v_cmp_nlt_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x2e,0xd4,0x7c,0xe0,0x01,0x00]

-# W32: v_cmp_nlt_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x2e,0xd4,0xc1,0x82,0x01,0x00]
-# W64: v_cmp_nlt_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x2e,0xd4,0xc1,0x82,0x01,0x00]
0x68,0x00,0x2e,0xd4,0xc1,0x82,0x01,0x00
+# W32: v_cmp_nlt_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x2e,0xd4,0xc1,0x82,0x01,0x00]
+# W64: v_cmp_nlt_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x2e,0xd4,0xc1,0x82,0x01,0x00]

-# W32: v_cmp_nlt_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x2e,0xd4,0xf0,0xf8,0x00,0x00]
-# W64: v_cmp_nlt_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x2e,0xd4,0xf0,0xf8,0x00,0x00]
0x6a,0x00,0x2e,0xd4,0xf0,0xf8,0x00,0x00
+# W32: v_cmp_nlt_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x2e,0xd4,0xf0,0xf8,0x00,0x00]
+# W64: v_cmp_nlt_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x2e,0xd4,0xf0,0xf8,0x00,0x00]

+0x7a,0x03,0x2e,0xd4,0xfd,0xfc,0x00,0x60
# W32: v_cmp_nlt_f64_e64 ttmp14, -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x2e,0xd4,0xfd,0xfc,0x00,0x60]
# W64: v_cmp_nlt_f64_e64 ttmp[14:15], -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x2e,0xd4,0xfd,0xfc,0x00,0x60]
-0x7a,0x03,0x2e,0xd4,0xfd,0xfc,0x00,0x60

-# GFX11: v_cmp_nlt_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x2e,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
0x7c,0x82,0x2e,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_nlt_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x2e,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_o_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x07,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_o_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x07,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x07,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_o_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x07,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_o_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x07,0xd4,0x01,0x05,0x02,0x00]

-# W32: v_cmp_o_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x07,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_o_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x07,0xd4,0xff,0xff,0x03,0x00]
0x0a,0x00,0x07,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_o_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x07,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_o_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x07,0xd4,0xff,0xff,0x03,0x00]

-# W32: v_cmp_o_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x07,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_o_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x07,0xd4,0x01,0x04,0x00,0x00]
0x0a,0x00,0x07,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_o_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x07,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_o_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x07,0xd4,0x01,0x04,0x00,0x00]

-# W32: v_cmp_o_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x07,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_o_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x07,0xd4,0x69,0xd2,0x00,0x00]
0x0a,0x00,0x07,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_o_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x07,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_o_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x07,0xd4,0x69,0xd2,0x00,0x00]

-# W32: v_cmp_o_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x07,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_o_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x07,0xd4,0x6a,0xf6,0x00,0x00]
0x0a,0x00,0x07,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_o_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x07,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_o_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x07,0xd4,0x6a,0xf6,0x00,0x00]

-# W32: v_cmp_o_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x07,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_o_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x07,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
0x0a,0x00,0x07,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_o_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x07,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_o_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x07,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_o_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x07,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_o_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x07,0xd4,0x7b,0xfa,0x01,0x00]
0x0a,0x00,0x07,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_o_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x07,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_o_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x07,0xd4,0x7b,0xfa,0x01,0x00]

-# W32: v_cmp_o_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x07,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_o_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x07,0xd4,0x7d,0xe0,0x01,0x00]
0x0a,0x00,0x07,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_o_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x07,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_o_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x07,0xd4,0x7d,0xe0,0x01,0x00]

-# W32: v_cmp_o_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x07,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_o_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x07,0xd4,0x7e,0x82,0x01,0x00]
0x0a,0x00,0x07,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_o_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x07,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_o_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x07,0xd4,0x7e,0x82,0x01,0x00]

-# W32: v_cmp_o_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x07,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_o_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x07,0xd4,0x7f,0xf8,0x00,0x00]
0x0a,0x01,0x07,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_o_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x07,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_o_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x07,0xd4,0x7f,0xf8,0x00,0x00]

-# W32: v_cmp_o_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x07,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_o_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x07,0xd4,0x7c,0xfc,0x00,0x00]
0x0a,0x00,0x07,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_o_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x07,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_o_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x07,0xd4,0x7c,0xfc,0x00,0x00]

-# W32: v_cmp_o_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x07,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_o_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x07,0xd4,0xc1,0xfe,0x00,0x00]
0x68,0x00,0x07,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_o_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x07,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_o_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x07,0xd4,0xc1,0xfe,0x00,0x00]

-# W32: v_cmp_o_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x07,0xd4,0xf0,0xfa,0x00,0x40]
-# W64: v_cmp_o_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x07,0xd4,0xf0,0xfa,0x00,0x40]
0x6a,0x00,0x07,0xd4,0xf0,0xfa,0x00,0x40
+# W32: v_cmp_o_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x07,0xd4,0xf0,0xfa,0x00,0x40]
+# W64: v_cmp_o_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x07,0xd4,0xf0,0xfa,0x00,0x40]

+0x7a,0x02,0x07,0xd4,0xfd,0xd4,0x00,0x20
# W32: v_cmp_o_f16_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x07,0xd4,0xfd,0xd4,0x00,0x20]
# W64: v_cmp_o_f16_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x07,0xd4,0xfd,0xd4,0x00,0x20]
-0x7a,0x02,0x07,0xd4,0xfd,0xd4,0x00,0x20

-# GFX11: v_cmp_o_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x07,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
0x7c,0x83,0x07,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmp_o_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x07,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_o_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x17,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_o_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x17,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x17,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_o_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x17,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_o_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x17,0xd4,0x01,0x05,0x02,0x00]

-# W32: v_cmp_o_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x17,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_o_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x17,0xd4,0xff,0xff,0x03,0x00]
0x0a,0x00,0x17,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_o_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x17,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_o_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x17,0xd4,0xff,0xff,0x03,0x00]

-# W32: v_cmp_o_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x17,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_o_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x17,0xd4,0x01,0x04,0x00,0x00]
0x0a,0x00,0x17,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_o_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x17,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_o_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x17,0xd4,0x01,0x04,0x00,0x00]

-# W32: v_cmp_o_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x17,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_o_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x17,0xd4,0x69,0xd2,0x00,0x00]
0x0a,0x00,0x17,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_o_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x17,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_o_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x17,0xd4,0x69,0xd2,0x00,0x00]

-# W32: v_cmp_o_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x17,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_o_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x17,0xd4,0x6a,0xf6,0x00,0x00]
0x0a,0x00,0x17,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_o_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x17,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_o_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x17,0xd4,0x6a,0xf6,0x00,0x00]

-# W32: v_cmp_o_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x17,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_o_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x17,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
0x0a,0x00,0x17,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# W32: v_cmp_o_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x17,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_o_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x17,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_o_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x17,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_o_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x17,0xd4,0x7b,0xfa,0x01,0x00]
0x0a,0x00,0x17,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_o_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x17,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_o_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x17,0xd4,0x7b,0xfa,0x01,0x00]

-# W32: v_cmp_o_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x17,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_o_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x17,0xd4,0x7d,0xe0,0x01,0x00]
0x0a,0x00,0x17,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_o_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x17,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_o_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x17,0xd4,0x7d,0xe0,0x01,0x00]

-# W32: v_cmp_o_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x17,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_o_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x17,0xd4,0x7e,0x82,0x01,0x00]
0x0a,0x00,0x17,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_o_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x17,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_o_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x17,0xd4,0x7e,0x82,0x01,0x00]

-# W32: v_cmp_o_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x17,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_o_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x17,0xd4,0x7f,0xf8,0x00,0x00]
0x0a,0x01,0x17,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_o_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x17,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_o_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x17,0xd4,0x7f,0xf8,0x00,0x00]

-# W32: v_cmp_o_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x17,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_o_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x17,0xd4,0x7c,0xfc,0x00,0x00]
0x0a,0x00,0x17,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_o_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x17,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_o_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x17,0xd4,0x7c,0xfc,0x00,0x00]

-# W32: v_cmp_o_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x17,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_o_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x17,0xd4,0xc1,0xfe,0x00,0x00]
0x68,0x00,0x17,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_o_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x17,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_o_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x17,0xd4,0xc1,0xfe,0x00,0x00]

-# W32: v_cmp_o_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x17,0xd4,0xf0,0xfa,0x00,0x40]
-# W64: v_cmp_o_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x17,0xd4,0xf0,0xfa,0x00,0x40]
0x6a,0x00,0x17,0xd4,0xf0,0xfa,0x00,0x40
+# W32: v_cmp_o_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x17,0xd4,0xf0,0xfa,0x00,0x40]
+# W64: v_cmp_o_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x17,0xd4,0xf0,0xfa,0x00,0x40]

+0x7a,0x02,0x17,0xd4,0xfd,0xd4,0x00,0x20
# W32: v_cmp_o_f32_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x17,0xd4,0xfd,0xd4,0x00,0x20]
# W64: v_cmp_o_f32_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x17,0xd4,0xfd,0xd4,0x00,0x20]
-0x7a,0x02,0x17,0xd4,0xfd,0xd4,0x00,0x20

-# GFX11: v_cmp_o_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x17,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
0x7c,0x83,0x17,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_o_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x17,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_o_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x27,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_o_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x27,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x27,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_o_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x27,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_o_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x27,0xd4,0x01,0x05,0x02,0x00]

+0x0a,0x00,0x27,0xd4,0xfe,0xfd,0x03,0x00
# W32: v_cmp_o_f64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x27,0xd4,0xfe,0xfd,0x03,0x00]
# W64: v_cmp_o_f64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x27,0xd4,0xfe,0xfd,0x03,0x00]
-0x0a,0x00,0x27,0xd4,0xfe,0xfd,0x03,0x00

-# W32: v_cmp_o_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x27,0xd4,0x02,0x08,0x00,0x00]
-# W64: v_cmp_o_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x27,0xd4,0x02,0x08,0x00,0x00]
0x0a,0x00,0x27,0xd4,0x02,0x08,0x00,0x00
+# W32: v_cmp_o_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x27,0xd4,0x02,0x08,0x00,0x00]
+# W64: v_cmp_o_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x27,0xd4,0x02,0x08,0x00,0x00]

+0x0a,0x00,0x27,0xd4,0x68,0xd0,0x00,0x00
# W32: v_cmp_o_f64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x27,0xd4,0x68,0xd0,0x00,0x00]
# W64: v_cmp_o_f64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x27,0xd4,0x68,0xd0,0x00,0x00]
-0x0a,0x00,0x27,0xd4,0x68,0xd0,0x00,0x00

-# W32: v_cmp_o_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x27,0xd4,0x6a,0xf4,0x00,0x00]
-# W64: v_cmp_o_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x27,0xd4,0x6a,0xf4,0x00,0x00]
0x0a,0x00,0x27,0xd4,0x6a,0xf4,0x00,0x00
+# W32: v_cmp_o_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x27,0xd4,0x6a,0xf4,0x00,0x00]
+# W64: v_cmp_o_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x27,0xd4,0x6a,0xf4,0x00,0x00]

+0x0a,0x00,0x27,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
# W32: v_cmp_o_f64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x27,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
# W64: v_cmp_o_f64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x27,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x27,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf

-# W32: v_cmp_o_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x27,0xd4,0x7e,0xfa,0x01,0x20]
-# W64: v_cmp_o_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x27,0xd4,0x7e,0xfa,0x01,0x20]
0x0a,0x01,0x27,0xd4,0x7e,0xfa,0x01,0x20
+# W32: v_cmp_o_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x27,0xd4,0x7e,0xfa,0x01,0x20]
+# W64: v_cmp_o_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x27,0xd4,0x7e,0xfa,0x01,0x20]

-# W32: v_cmp_o_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x27,0xd4,0x7c,0xe0,0x01,0x00]
-# W64: v_cmp_o_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x27,0xd4,0x7c,0xe0,0x01,0x00]
0x0a,0x00,0x27,0xd4,0x7c,0xe0,0x01,0x00
+# W32: v_cmp_o_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x27,0xd4,0x7c,0xe0,0x01,0x00]
+# W64: v_cmp_o_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x27,0xd4,0x7c,0xe0,0x01,0x00]

-# W32: v_cmp_o_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x27,0xd4,0xc1,0x82,0x01,0x00]
-# W64: v_cmp_o_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x27,0xd4,0xc1,0x82,0x01,0x00]
0x68,0x00,0x27,0xd4,0xc1,0x82,0x01,0x00
+# W32: v_cmp_o_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x27,0xd4,0xc1,0x82,0x01,0x00]
+# W64: v_cmp_o_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x27,0xd4,0xc1,0x82,0x01,0x00]

-# W32: v_cmp_o_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x27,0xd4,0xf0,0xf8,0x00,0x00]
-# W64: v_cmp_o_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x27,0xd4,0xf0,0xf8,0x00,0x00]
0x6a,0x00,0x27,0xd4,0xf0,0xf8,0x00,0x00
+# W32: v_cmp_o_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x27,0xd4,0xf0,0xf8,0x00,0x00]
+# W64: v_cmp_o_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x27,0xd4,0xf0,0xf8,0x00,0x00]

+0x7a,0x03,0x27,0xd4,0xfd,0xfc,0x00,0x60
# W32: v_cmp_o_f64_e64 ttmp14, -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x27,0xd4,0xfd,0xfc,0x00,0x60]
# W64: v_cmp_o_f64_e64 ttmp[14:15], -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x27,0xd4,0xfd,0xfc,0x00,0x60]
-0x7a,0x03,0x27,0xd4,0xfd,0xfc,0x00,0x60

-# GFX11: v_cmp_o_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x27,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
0x7c,0x82,0x27,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_o_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x27,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_t_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x0f,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_t_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x0f,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x0f,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_t_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x0f,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_t_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x0f,0xd4,0x01,0x05,0x02,0x00]

-# W32: v_cmp_t_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x0f,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_t_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x0f,0xd4,0xff,0xff,0x03,0x00]
0x0a,0x00,0x0f,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_t_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x0f,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_t_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x0f,0xd4,0xff,0xff,0x03,0x00]

-# W32: v_cmp_t_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x0f,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_t_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x0f,0xd4,0x01,0x04,0x00,0x00]
0x0a,0x00,0x0f,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_t_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x0f,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_t_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x0f,0xd4,0x01,0x04,0x00,0x00]

-# W32: v_cmp_t_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x0f,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_t_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x0f,0xd4,0x69,0xd2,0x00,0x00]
0x0a,0x00,0x0f,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_t_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x0f,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_t_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x0f,0xd4,0x69,0xd2,0x00,0x00]

-# W32: v_cmp_t_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0f,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_t_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0f,0xd4,0x6a,0xf6,0x00,0x00]
0x0a,0x00,0x0f,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_t_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0f,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_t_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0f,0xd4,0x6a,0xf6,0x00,0x00]

-# W32: v_cmp_t_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0f,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_t_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0f,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
0x0a,0x00,0x0f,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_t_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0f,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_t_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0f,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_t_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x0f,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_t_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x0f,0xd4,0x7b,0xfa,0x01,0x00]
0x0a,0x00,0x0f,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_t_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x0f,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_t_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x0f,0xd4,0x7b,0xfa,0x01,0x00]

-# W32: v_cmp_t_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x0f,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_t_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x0f,0xd4,0x7d,0xe0,0x01,0x00]
0x0a,0x00,0x0f,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_t_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x0f,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_t_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x0f,0xd4,0x7d,0xe0,0x01,0x00]

-# W32: v_cmp_t_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x0f,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_t_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x0f,0xd4,0x7e,0x82,0x01,0x00]
0x0a,0x00,0x0f,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_t_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x0f,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_t_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x0f,0xd4,0x7e,0x82,0x01,0x00]

-# W32: v_cmp_t_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x0f,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_t_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x0f,0xd4,0x7f,0xf8,0x00,0x00]
0x0a,0x01,0x0f,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_t_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x0f,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_t_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x0f,0xd4,0x7f,0xf8,0x00,0x00]

-# W32: v_cmp_t_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x0f,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_t_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x0f,0xd4,0x7c,0xfc,0x00,0x00]
0x0a,0x00,0x0f,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_t_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x0f,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_t_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x0f,0xd4,0x7c,0xfc,0x00,0x00]

-# W32: v_cmp_t_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x0f,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_t_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x0f,0xd4,0xc1,0xfe,0x00,0x00]
0x68,0x00,0x0f,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_t_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x0f,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_t_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x0f,0xd4,0xc1,0xfe,0x00,0x00]

-# W32: v_cmp_t_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x0f,0xd4,0xf0,0xfa,0x00,0x40]
-# W64: v_cmp_t_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x0f,0xd4,0xf0,0xfa,0x00,0x40]
0x6a,0x00,0x0f,0xd4,0xf0,0xfa,0x00,0x40
+# W32: v_cmp_t_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x0f,0xd4,0xf0,0xfa,0x00,0x40]
+# W64: v_cmp_t_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x0f,0xd4,0xf0,0xfa,0x00,0x40]

+0x7a,0x02,0x0f,0xd4,0xfd,0xd4,0x00,0x20
# W32: v_cmp_t_f16_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x0f,0xd4,0xfd,0xd4,0x00,0x20]
# W64: v_cmp_t_f16_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x0f,0xd4,0xfd,0xd4,0x00,0x20]
-0x7a,0x02,0x0f,0xd4,0xfd,0xd4,0x00,0x20

-# GFX11: v_cmp_t_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x0f,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
0x7c,0x83,0x0f,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmp_t_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x0f,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_t_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x1f,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_t_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x1f,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x1f,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_t_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x1f,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_t_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x1f,0xd4,0x01,0x05,0x02,0x00]

-# W32: v_cmp_t_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x1f,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_t_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x1f,0xd4,0xff,0xff,0x03,0x00]
0x0a,0x00,0x1f,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_t_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x1f,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_t_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x1f,0xd4,0xff,0xff,0x03,0x00]

-# W32: v_cmp_t_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x1f,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_t_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x1f,0xd4,0x01,0x04,0x00,0x00]
0x0a,0x00,0x1f,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_t_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x1f,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_t_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x1f,0xd4,0x01,0x04,0x00,0x00]

-# W32: v_cmp_t_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x1f,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_t_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x1f,0xd4,0x69,0xd2,0x00,0x00]
0x0a,0x00,0x1f,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_t_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x1f,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_t_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x1f,0xd4,0x69,0xd2,0x00,0x00]

-# W32: v_cmp_t_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1f,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_t_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1f,0xd4,0x6a,0xf6,0x00,0x00]
0x0a,0x00,0x1f,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_t_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1f,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_t_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1f,0xd4,0x6a,0xf6,0x00,0x00]

-# W32: v_cmp_t_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x1f,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_t_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x1f,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
0x0a,0x00,0x1f,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# W32: v_cmp_t_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x1f,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_t_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x1f,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_t_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x1f,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_t_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x1f,0xd4,0x7b,0xfa,0x01,0x00]
0x0a,0x00,0x1f,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_t_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x1f,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_t_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x1f,0xd4,0x7b,0xfa,0x01,0x00]

-# W32: v_cmp_t_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x1f,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_t_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x1f,0xd4,0x7d,0xe0,0x01,0x00]
0x0a,0x00,0x1f,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_t_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x1f,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_t_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x1f,0xd4,0x7d,0xe0,0x01,0x00]

-# W32: v_cmp_t_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x1f,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_t_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x1f,0xd4,0x7e,0x82,0x01,0x00]
0x0a,0x00,0x1f,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_t_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x1f,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_t_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x1f,0xd4,0x7e,0x82,0x01,0x00]

-# W32: v_cmp_t_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x1f,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_t_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x1f,0xd4,0x7f,0xf8,0x00,0x00]
0x0a,0x01,0x1f,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_t_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x1f,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_t_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x1f,0xd4,0x7f,0xf8,0x00,0x00]

-# W32: v_cmp_t_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x1f,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_t_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x1f,0xd4,0x7c,0xfc,0x00,0x00]
0x0a,0x00,0x1f,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_t_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x1f,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_t_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x1f,0xd4,0x7c,0xfc,0x00,0x00]

-# W32: v_cmp_t_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x1f,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_t_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x1f,0xd4,0xc1,0xfe,0x00,0x00]
0x68,0x00,0x1f,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_t_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x1f,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_t_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x1f,0xd4,0xc1,0xfe,0x00,0x00]

-# W32: v_cmp_t_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x1f,0xd4,0xf0,0xfa,0x00,0x40]
-# W64: v_cmp_t_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x1f,0xd4,0xf0,0xfa,0x00,0x40]
0x6a,0x00,0x1f,0xd4,0xf0,0xfa,0x00,0x40
+# W32: v_cmp_t_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x1f,0xd4,0xf0,0xfa,0x00,0x40]
+# W64: v_cmp_t_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x1f,0xd4,0xf0,0xfa,0x00,0x40]

+0x7a,0x02,0x1f,0xd4,0xfd,0xd4,0x00,0x20
# W32: v_cmp_t_f32_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x1f,0xd4,0xfd,0xd4,0x00,0x20]
# W64: v_cmp_t_f32_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x1f,0xd4,0xfd,0xd4,0x00,0x20]
-0x7a,0x02,0x1f,0xd4,0xfd,0xd4,0x00,0x20

-# GFX11: v_cmp_t_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x1f,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
0x7c,0x83,0x1f,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_t_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x1f,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_t_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2f,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_t_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2f,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x2f,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_t_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2f,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_t_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2f,0xd4,0x01,0x05,0x02,0x00]

+0x0a,0x00,0x2f,0xd4,0xfe,0xfd,0x03,0x00
# W32: v_cmp_t_f64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x2f,0xd4,0xfe,0xfd,0x03,0x00]
# W64: v_cmp_t_f64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x2f,0xd4,0xfe,0xfd,0x03,0x00]
-0x0a,0x00,0x2f,0xd4,0xfe,0xfd,0x03,0x00

-# W32: v_cmp_t_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2f,0xd4,0x02,0x08,0x00,0x00]
-# W64: v_cmp_t_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2f,0xd4,0x02,0x08,0x00,0x00]
0x0a,0x00,0x2f,0xd4,0x02,0x08,0x00,0x00
+# W32: v_cmp_t_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2f,0xd4,0x02,0x08,0x00,0x00]
+# W64: v_cmp_t_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2f,0xd4,0x02,0x08,0x00,0x00]

+0x0a,0x00,0x2f,0xd4,0x68,0xd0,0x00,0x00
# W32: v_cmp_t_f64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x2f,0xd4,0x68,0xd0,0x00,0x00]
# W64: v_cmp_t_f64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x2f,0xd4,0x68,0xd0,0x00,0x00]
-0x0a,0x00,0x2f,0xd4,0x68,0xd0,0x00,0x00

-# W32: v_cmp_t_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2f,0xd4,0x6a,0xf4,0x00,0x00]
-# W64: v_cmp_t_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2f,0xd4,0x6a,0xf4,0x00,0x00]
0x0a,0x00,0x2f,0xd4,0x6a,0xf4,0x00,0x00
+# W32: v_cmp_t_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2f,0xd4,0x6a,0xf4,0x00,0x00]
+# W64: v_cmp_t_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2f,0xd4,0x6a,0xf4,0x00,0x00]

+0x0a,0x00,0x2f,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
# W32: v_cmp_t_f64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x2f,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
# W64: v_cmp_t_f64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x2f,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x2f,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf

-# W32: v_cmp_t_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x2f,0xd4,0x7e,0xfa,0x01,0x20]
-# W64: v_cmp_t_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x2f,0xd4,0x7e,0xfa,0x01,0x20]
0x0a,0x01,0x2f,0xd4,0x7e,0xfa,0x01,0x20
+# W32: v_cmp_t_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x2f,0xd4,0x7e,0xfa,0x01,0x20]
+# W64: v_cmp_t_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x2f,0xd4,0x7e,0xfa,0x01,0x20]

-# W32: v_cmp_t_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x2f,0xd4,0x7c,0xe0,0x01,0x00]
-# W64: v_cmp_t_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x2f,0xd4,0x7c,0xe0,0x01,0x00]
0x0a,0x00,0x2f,0xd4,0x7c,0xe0,0x01,0x00
+# W32: v_cmp_t_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x2f,0xd4,0x7c,0xe0,0x01,0x00]
+# W64: v_cmp_t_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x2f,0xd4,0x7c,0xe0,0x01,0x00]

-# W32: v_cmp_t_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x2f,0xd4,0xc1,0x82,0x01,0x00]
-# W64: v_cmp_t_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x2f,0xd4,0xc1,0x82,0x01,0x00]
0x68,0x00,0x2f,0xd4,0xc1,0x82,0x01,0x00
+# W32: v_cmp_t_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x2f,0xd4,0xc1,0x82,0x01,0x00]
+# W64: v_cmp_t_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x2f,0xd4,0xc1,0x82,0x01,0x00]

-# W32: v_cmp_t_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x2f,0xd4,0xf0,0xf8,0x00,0x00]
-# W64: v_cmp_t_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x2f,0xd4,0xf0,0xf8,0x00,0x00]
0x6a,0x00,0x2f,0xd4,0xf0,0xf8,0x00,0x00
+# W32: v_cmp_t_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x2f,0xd4,0xf0,0xf8,0x00,0x00]
+# W64: v_cmp_t_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x2f,0xd4,0xf0,0xf8,0x00,0x00]

+0x7a,0x03,0x2f,0xd4,0xfd,0xfc,0x00,0x60
# W32: v_cmp_t_f64_e64 ttmp14, -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x2f,0xd4,0xfd,0xfc,0x00,0x60]
# W64: v_cmp_t_f64_e64 ttmp[14:15], -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x2f,0xd4,0xfd,0xfc,0x00,0x60]
-0x7a,0x03,0x2f,0xd4,0xfd,0xfc,0x00,0x60

-# GFX11: v_cmp_t_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x2f,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
0x7c,0x82,0x2f,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_t_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x2f,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_t_i32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x47,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_t_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x47,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x47,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_t_i32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x47,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_t_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x47,0xd4,0x01,0x05,0x02,0x00]

-# W32: v_cmp_t_i32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x47,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_t_i32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x47,0xd4,0xff,0xff,0x03,0x00]
0x0a,0x00,0x47,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_t_i32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x47,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_t_i32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x47,0xd4,0xff,0xff,0x03,0x00]

-# W32: v_cmp_t_i32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x47,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_t_i32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x47,0xd4,0x01,0x04,0x00,0x00]
0x0a,0x00,0x47,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_t_i32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x47,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_t_i32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x47,0xd4,0x01,0x04,0x00,0x00]

-# W32: v_cmp_t_i32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x47,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_t_i32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x47,0xd4,0x69,0xd2,0x00,0x00]
0x0a,0x00,0x47,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_t_i32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x47,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_t_i32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x47,0xd4,0x69,0xd2,0x00,0x00]

-# W32: v_cmp_t_i32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x47,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_t_i32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x47,0xd4,0x6a,0xf6,0x00,0x00]
0x0a,0x00,0x47,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_t_i32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x47,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_t_i32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x47,0xd4,0x6a,0xf6,0x00,0x00]

-# W32: v_cmp_t_i32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x47,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_t_i32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x47,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
0x0a,0x00,0x47,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# W32: v_cmp_t_i32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x47,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_t_i32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x47,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_t_i32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x47,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_t_i32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x47,0xd4,0x7b,0xfa,0x01,0x00]
0x0a,0x00,0x47,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_t_i32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x47,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_t_i32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x47,0xd4,0x7b,0xfa,0x01,0x00]

-# W32: v_cmp_t_i32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x47,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_t_i32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x47,0xd4,0x7d,0xe0,0x01,0x00]
0x0a,0x00,0x47,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_t_i32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x47,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_t_i32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x47,0xd4,0x7d,0xe0,0x01,0x00]

-# W32: v_cmp_t_i32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x47,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_t_i32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x47,0xd4,0x7e,0x82,0x01,0x00]
0x0a,0x00,0x47,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_t_i32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x47,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_t_i32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x47,0xd4,0x7e,0x82,0x01,0x00]

-# W32: v_cmp_t_i32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x47,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_t_i32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x47,0xd4,0x7f,0xf8,0x00,0x00]
0x0a,0x00,0x47,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_t_i32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x47,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_t_i32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x47,0xd4,0x7f,0xf8,0x00,0x00]

-# W32: v_cmp_t_i32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x47,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_t_i32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x47,0xd4,0x7c,0xfc,0x00,0x00]
0x0a,0x00,0x47,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_t_i32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x47,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_t_i32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x47,0xd4,0x7c,0xfc,0x00,0x00]

-# W32: v_cmp_t_i32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x47,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_t_i32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x47,0xd4,0xc1,0xfe,0x00,0x00]
0x68,0x00,0x47,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_t_i32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x47,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_t_i32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x47,0xd4,0xc1,0xfe,0x00,0x00]

-# W32: v_cmp_t_i32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x47,0xd4,0xf0,0xfa,0x00,0x00]
-# W64: v_cmp_t_i32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x47,0xd4,0xf0,0xfa,0x00,0x00]
0x6a,0x00,0x47,0xd4,0xf0,0xfa,0x00,0x00
+# W32: v_cmp_t_i32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x47,0xd4,0xf0,0xfa,0x00,0x00]
+# W64: v_cmp_t_i32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x47,0xd4,0xf0,0xfa,0x00,0x00]

-# W32: v_cmp_t_i32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x47,0xd4,0xfd,0xd4,0x00,0x00]
-# W64: v_cmp_t_i32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x47,0xd4,0xfd,0xd4,0x00,0x00]
0x7a,0x00,0x47,0xd4,0xfd,0xd4,0x00,0x00
+# W32: v_cmp_t_i32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x47,0xd4,0xfd,0xd4,0x00,0x00]
+# W64: v_cmp_t_i32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x47,0xd4,0xfd,0xd4,0x00,0x00]

-# GFX11: v_cmp_t_i32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x47,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
0x7c,0x00,0x47,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_t_i32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x47,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_t_i64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x57,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_t_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x57,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x57,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_t_i64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x57,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_t_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x57,0xd4,0x01,0x05,0x02,0x00]

+0x0a,0x00,0x57,0xd4,0xfe,0xfd,0x03,0x00
# W32: v_cmp_t_i64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x57,0xd4,0xfe,0xfd,0x03,0x00]
# W64: v_cmp_t_i64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x57,0xd4,0xfe,0xfd,0x03,0x00]
-0x0a,0x00,0x57,0xd4,0xfe,0xfd,0x03,0x00

-# W32: v_cmp_t_i64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x57,0xd4,0x02,0x08,0x00,0x00]
-# W64: v_cmp_t_i64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x57,0xd4,0x02,0x08,0x00,0x00]
0x0a,0x00,0x57,0xd4,0x02,0x08,0x00,0x00
+# W32: v_cmp_t_i64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x57,0xd4,0x02,0x08,0x00,0x00]
+# W64: v_cmp_t_i64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x57,0xd4,0x02,0x08,0x00,0x00]

+0x0a,0x00,0x57,0xd4,0x68,0xd0,0x00,0x00
# W32: v_cmp_t_i64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x57,0xd4,0x68,0xd0,0x00,0x00]
# W64: v_cmp_t_i64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x57,0xd4,0x68,0xd0,0x00,0x00]
-0x0a,0x00,0x57,0xd4,0x68,0xd0,0x00,0x00

-# W32: v_cmp_t_i64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x57,0xd4,0x6a,0xf4,0x00,0x00]
-# W64: v_cmp_t_i64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x57,0xd4,0x6a,0xf4,0x00,0x00]
0x0a,0x00,0x57,0xd4,0x6a,0xf4,0x00,0x00
+# W32: v_cmp_t_i64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x57,0xd4,0x6a,0xf4,0x00,0x00]
+# W64: v_cmp_t_i64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x57,0xd4,0x6a,0xf4,0x00,0x00]

+0x0a,0x00,0x57,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
# W32: v_cmp_t_i64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x57,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
# W64: v_cmp_t_i64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x57,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x57,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf

-# W32: v_cmp_t_i64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x57,0xd4,0x7e,0xfa,0x01,0x00]
-# W64: v_cmp_t_i64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x57,0xd4,0x7e,0xfa,0x01,0x00]
0x0a,0x00,0x57,0xd4,0x7e,0xfa,0x01,0x00
+# W32: v_cmp_t_i64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x57,0xd4,0x7e,0xfa,0x01,0x00]
+# W64: v_cmp_t_i64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x57,0xd4,0x7e,0xfa,0x01,0x00]

-# W32: v_cmp_t_i64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x57,0xd4,0x7c,0xe0,0x01,0x00]
-# W64: v_cmp_t_i64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x57,0xd4,0x7c,0xe0,0x01,0x00]
0x0a,0x00,0x57,0xd4,0x7c,0xe0,0x01,0x00
+# W32: v_cmp_t_i64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x57,0xd4,0x7c,0xe0,0x01,0x00]
+# W64: v_cmp_t_i64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x57,0xd4,0x7c,0xe0,0x01,0x00]

-# W32: v_cmp_t_i64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x57,0xd4,0xc1,0x82,0x01,0x00]
-# W64: v_cmp_t_i64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x57,0xd4,0xc1,0x82,0x01,0x00]
0x68,0x00,0x57,0xd4,0xc1,0x82,0x01,0x00
+# W32: v_cmp_t_i64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x57,0xd4,0xc1,0x82,0x01,0x00]
+# W64: v_cmp_t_i64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x57,0xd4,0xc1,0x82,0x01,0x00]

-# W32: v_cmp_t_i64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x57,0xd4,0xf0,0xf8,0x00,0x00]
-# W64: v_cmp_t_i64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x57,0xd4,0xf0,0xf8,0x00,0x00]
0x6a,0x00,0x57,0xd4,0xf0,0xf8,0x00,0x00
+# W32: v_cmp_t_i64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x57,0xd4,0xf0,0xf8,0x00,0x00]
+# W64: v_cmp_t_i64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x57,0xd4,0xf0,0xf8,0x00,0x00]

-# W32: v_cmp_t_i64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x57,0xd4,0xfd,0xfc,0x00,0x00]
-# W64: v_cmp_t_i64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x57,0xd4,0xfd,0xfc,0x00,0x00]
0x7a,0x00,0x57,0xd4,0xfd,0xfc,0x00,0x00
+# W32: v_cmp_t_i64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x57,0xd4,0xfd,0xfc,0x00,0x00]
+# W64: v_cmp_t_i64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x57,0xd4,0xfd,0xfc,0x00,0x00]

-# GFX11: v_cmp_t_i64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x57,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
0x7c,0x00,0x57,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_t_i64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x57,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_t_u32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x4f,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_t_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x4f,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x4f,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_t_u32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x4f,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_t_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x4f,0xd4,0x01,0x05,0x02,0x00]

-# W32: v_cmp_t_u32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x4f,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_t_u32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x4f,0xd4,0xff,0xff,0x03,0x00]
0x0a,0x00,0x4f,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_t_u32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x4f,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_t_u32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x4f,0xd4,0xff,0xff,0x03,0x00]

-# W32: v_cmp_t_u32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x4f,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_t_u32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x4f,0xd4,0x01,0x04,0x00,0x00]
0x0a,0x00,0x4f,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_t_u32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x4f,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_t_u32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x4f,0xd4,0x01,0x04,0x00,0x00]

-# W32: v_cmp_t_u32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x4f,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_t_u32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x4f,0xd4,0x69,0xd2,0x00,0x00]
0x0a,0x00,0x4f,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_t_u32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x4f,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_t_u32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x4f,0xd4,0x69,0xd2,0x00,0x00]

-# W32: v_cmp_t_u32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4f,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_t_u32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4f,0xd4,0x6a,0xf6,0x00,0x00]
0x0a,0x00,0x4f,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_t_u32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4f,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_t_u32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4f,0xd4,0x6a,0xf6,0x00,0x00]

-# W32: v_cmp_t_u32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4f,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_t_u32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4f,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
0x0a,0x00,0x4f,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# W32: v_cmp_t_u32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4f,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_t_u32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4f,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_t_u32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x4f,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_t_u32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x4f,0xd4,0x7b,0xfa,0x01,0x00]
0x0a,0x00,0x4f,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_t_u32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x4f,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_t_u32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x4f,0xd4,0x7b,0xfa,0x01,0x00]

-# W32: v_cmp_t_u32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x4f,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_t_u32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x4f,0xd4,0x7d,0xe0,0x01,0x00]
0x0a,0x00,0x4f,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_t_u32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x4f,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_t_u32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x4f,0xd4,0x7d,0xe0,0x01,0x00]

-# W32: v_cmp_t_u32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x4f,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_t_u32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x4f,0xd4,0x7e,0x82,0x01,0x00]
0x0a,0x00,0x4f,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_t_u32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x4f,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_t_u32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x4f,0xd4,0x7e,0x82,0x01,0x00]

-# W32: v_cmp_t_u32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x4f,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_t_u32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x4f,0xd4,0x7f,0xf8,0x00,0x00]
0x0a,0x00,0x4f,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_t_u32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x4f,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_t_u32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x4f,0xd4,0x7f,0xf8,0x00,0x00]

-# W32: v_cmp_t_u32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x4f,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_t_u32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x4f,0xd4,0x7c,0xfc,0x00,0x00]
0x0a,0x00,0x4f,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_t_u32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x4f,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_t_u32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x4f,0xd4,0x7c,0xfc,0x00,0x00]

-# W32: v_cmp_t_u32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x4f,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_t_u32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x4f,0xd4,0xc1,0xfe,0x00,0x00]
0x68,0x00,0x4f,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_t_u32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x4f,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_t_u32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x4f,0xd4,0xc1,0xfe,0x00,0x00]

-# W32: v_cmp_t_u32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x4f,0xd4,0xf0,0xfa,0x00,0x00]
-# W64: v_cmp_t_u32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x4f,0xd4,0xf0,0xfa,0x00,0x00]
0x6a,0x00,0x4f,0xd4,0xf0,0xfa,0x00,0x00
+# W32: v_cmp_t_u32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x4f,0xd4,0xf0,0xfa,0x00,0x00]
+# W64: v_cmp_t_u32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x4f,0xd4,0xf0,0xfa,0x00,0x00]

-# W32: v_cmp_t_u32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4f,0xd4,0xfd,0xd4,0x00,0x00]
-# W64: v_cmp_t_u32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4f,0xd4,0xfd,0xd4,0x00,0x00]
0x7a,0x00,0x4f,0xd4,0xfd,0xd4,0x00,0x00
+# W32: v_cmp_t_u32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4f,0xd4,0xfd,0xd4,0x00,0x00]
+# W64: v_cmp_t_u32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4f,0xd4,0xfd,0xd4,0x00,0x00]

-# GFX11: v_cmp_t_u32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x4f,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
0x7c,0x00,0x4f,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_t_u32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x4f,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_t_u64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5f,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_t_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5f,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x5f,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_t_u64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5f,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_t_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5f,0xd4,0x01,0x05,0x02,0x00]

+0x0a,0x00,0x5f,0xd4,0xfe,0xfd,0x03,0x00
# W32: v_cmp_t_u64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x5f,0xd4,0xfe,0xfd,0x03,0x00]
# W64: v_cmp_t_u64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x5f,0xd4,0xfe,0xfd,0x03,0x00]
-0x0a,0x00,0x5f,0xd4,0xfe,0xfd,0x03,0x00

-# W32: v_cmp_t_u64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5f,0xd4,0x02,0x08,0x00,0x00]
-# W64: v_cmp_t_u64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5f,0xd4,0x02,0x08,0x00,0x00]
0x0a,0x00,0x5f,0xd4,0x02,0x08,0x00,0x00
+# W32: v_cmp_t_u64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5f,0xd4,0x02,0x08,0x00,0x00]
+# W64: v_cmp_t_u64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5f,0xd4,0x02,0x08,0x00,0x00]

+0x0a,0x00,0x5f,0xd4,0x68,0xd0,0x00,0x00
# W32: v_cmp_t_u64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x5f,0xd4,0x68,0xd0,0x00,0x00]
# W64: v_cmp_t_u64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x5f,0xd4,0x68,0xd0,0x00,0x00]
-0x0a,0x00,0x5f,0xd4,0x68,0xd0,0x00,0x00

-# W32: v_cmp_t_u64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5f,0xd4,0x6a,0xf4,0x00,0x00]
-# W64: v_cmp_t_u64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5f,0xd4,0x6a,0xf4,0x00,0x00]
0x0a,0x00,0x5f,0xd4,0x6a,0xf4,0x00,0x00
+# W32: v_cmp_t_u64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5f,0xd4,0x6a,0xf4,0x00,0x00]
+# W64: v_cmp_t_u64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5f,0xd4,0x6a,0xf4,0x00,0x00]

+0x0a,0x00,0x5f,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
# W32: v_cmp_t_u64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x5f,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
# W64: v_cmp_t_u64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x5f,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x5f,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf

-# W32: v_cmp_t_u64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x5f,0xd4,0x7e,0xfa,0x01,0x00]
-# W64: v_cmp_t_u64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x5f,0xd4,0x7e,0xfa,0x01,0x00]
0x0a,0x00,0x5f,0xd4,0x7e,0xfa,0x01,0x00
+# W32: v_cmp_t_u64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x5f,0xd4,0x7e,0xfa,0x01,0x00]
+# W64: v_cmp_t_u64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x5f,0xd4,0x7e,0xfa,0x01,0x00]

-# W32: v_cmp_t_u64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x5f,0xd4,0x7c,0xe0,0x01,0x00]
-# W64: v_cmp_t_u64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x5f,0xd4,0x7c,0xe0,0x01,0x00]
0x0a,0x00,0x5f,0xd4,0x7c,0xe0,0x01,0x00
+# W32: v_cmp_t_u64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x5f,0xd4,0x7c,0xe0,0x01,0x00]
+# W64: v_cmp_t_u64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x5f,0xd4,0x7c,0xe0,0x01,0x00]

-# W32: v_cmp_t_u64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x5f,0xd4,0xc1,0x82,0x01,0x00]
-# W64: v_cmp_t_u64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x5f,0xd4,0xc1,0x82,0x01,0x00]
0x68,0x00,0x5f,0xd4,0xc1,0x82,0x01,0x00
+# W32: v_cmp_t_u64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x5f,0xd4,0xc1,0x82,0x01,0x00]
+# W64: v_cmp_t_u64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x5f,0xd4,0xc1,0x82,0x01,0x00]

-# W32: v_cmp_t_u64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x5f,0xd4,0xf0,0xf8,0x00,0x00]
-# W64: v_cmp_t_u64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x5f,0xd4,0xf0,0xf8,0x00,0x00]
0x6a,0x00,0x5f,0xd4,0xf0,0xf8,0x00,0x00
+# W32: v_cmp_t_u64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x5f,0xd4,0xf0,0xf8,0x00,0x00]
+# W64: v_cmp_t_u64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x5f,0xd4,0xf0,0xf8,0x00,0x00]

-# W32: v_cmp_t_u64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x5f,0xd4,0xfd,0xfc,0x00,0x00]
-# W64: v_cmp_t_u64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x5f,0xd4,0xfd,0xfc,0x00,0x00]
0x7a,0x00,0x5f,0xd4,0xfd,0xfc,0x00,0x00
+# W32: v_cmp_t_u64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x5f,0xd4,0xfd,0xfc,0x00,0x00]
+# W64: v_cmp_t_u64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x5f,0xd4,0xfd,0xfc,0x00,0x00]

-# GFX11: v_cmp_t_u64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x5f,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
0x7c,0x00,0x5f,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_t_u64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x5f,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_u_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x08,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_u_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x08,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x08,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_u_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x08,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_u_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x08,0xd4,0x01,0x05,0x02,0x00]

-# W32: v_cmp_u_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x08,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_u_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x08,0xd4,0xff,0xff,0x03,0x00]
0x0a,0x00,0x08,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_u_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x08,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_u_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x08,0xd4,0xff,0xff,0x03,0x00]

-# W32: v_cmp_u_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x08,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_u_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x08,0xd4,0x01,0x04,0x00,0x00]
0x0a,0x00,0x08,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_u_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x08,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_u_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x08,0xd4,0x01,0x04,0x00,0x00]

-# W32: v_cmp_u_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x08,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_u_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x08,0xd4,0x69,0xd2,0x00,0x00]
0x0a,0x00,0x08,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_u_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x08,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_u_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x08,0xd4,0x69,0xd2,0x00,0x00]

-# W32: v_cmp_u_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x08,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_u_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x08,0xd4,0x6a,0xf6,0x00,0x00]
0x0a,0x00,0x08,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_u_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x08,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_u_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x08,0xd4,0x6a,0xf6,0x00,0x00]

-# W32: v_cmp_u_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x08,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_u_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x08,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
0x0a,0x00,0x08,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_u_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x08,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_u_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x08,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_u_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x08,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_u_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x08,0xd4,0x7b,0xfa,0x01,0x00]
0x0a,0x00,0x08,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_u_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x08,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_u_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x08,0xd4,0x7b,0xfa,0x01,0x00]

-# W32: v_cmp_u_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x08,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_u_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x08,0xd4,0x7d,0xe0,0x01,0x00]
0x0a,0x00,0x08,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_u_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x08,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_u_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x08,0xd4,0x7d,0xe0,0x01,0x00]

-# W32: v_cmp_u_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x08,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_u_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x08,0xd4,0x7e,0x82,0x01,0x00]
0x0a,0x00,0x08,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_u_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x08,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_u_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x08,0xd4,0x7e,0x82,0x01,0x00]

-# W32: v_cmp_u_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x08,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_u_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x08,0xd4,0x7f,0xf8,0x00,0x00]
0x0a,0x01,0x08,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_u_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x08,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_u_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x08,0xd4,0x7f,0xf8,0x00,0x00]

-# W32: v_cmp_u_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x08,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_u_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x08,0xd4,0x7c,0xfc,0x00,0x00]
0x0a,0x00,0x08,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_u_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x08,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_u_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x08,0xd4,0x7c,0xfc,0x00,0x00]

-# W32: v_cmp_u_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x08,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_u_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x08,0xd4,0xc1,0xfe,0x00,0x00]
0x68,0x00,0x08,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_u_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x08,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_u_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x08,0xd4,0xc1,0xfe,0x00,0x00]

-# W32: v_cmp_u_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x08,0xd4,0xf0,0xfa,0x00,0x40]
-# W64: v_cmp_u_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x08,0xd4,0xf0,0xfa,0x00,0x40]
0x6a,0x00,0x08,0xd4,0xf0,0xfa,0x00,0x40
+# W32: v_cmp_u_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x08,0xd4,0xf0,0xfa,0x00,0x40]
+# W64: v_cmp_u_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x08,0xd4,0xf0,0xfa,0x00,0x40]

+0x7a,0x02,0x08,0xd4,0xfd,0xd4,0x00,0x20
# W32: v_cmp_u_f16_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x08,0xd4,0xfd,0xd4,0x00,0x20]
# W64: v_cmp_u_f16_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x08,0xd4,0xfd,0xd4,0x00,0x20]
-0x7a,0x02,0x08,0xd4,0xfd,0xd4,0x00,0x20

-# GFX11: v_cmp_u_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x08,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
0x7c,0x83,0x08,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmp_u_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x08,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_u_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x18,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_u_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x18,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x18,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_u_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x18,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_u_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x18,0xd4,0x01,0x05,0x02,0x00]

-# W32: v_cmp_u_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x18,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_u_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x18,0xd4,0xff,0xff,0x03,0x00]
0x0a,0x00,0x18,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_u_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x18,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_u_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x18,0xd4,0xff,0xff,0x03,0x00]

-# W32: v_cmp_u_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x18,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_u_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x18,0xd4,0x01,0x04,0x00,0x00]
0x0a,0x00,0x18,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_u_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x18,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_u_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x18,0xd4,0x01,0x04,0x00,0x00]

-# W32: v_cmp_u_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x18,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_u_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x18,0xd4,0x69,0xd2,0x00,0x00]
0x0a,0x00,0x18,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_u_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x18,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_u_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x18,0xd4,0x69,0xd2,0x00,0x00]

-# W32: v_cmp_u_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x18,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_u_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x18,0xd4,0x6a,0xf6,0x00,0x00]
0x0a,0x00,0x18,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_u_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x18,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_u_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x18,0xd4,0x6a,0xf6,0x00,0x00]

-# W32: v_cmp_u_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x18,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_u_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x18,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
0x0a,0x00,0x18,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# W32: v_cmp_u_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x18,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_u_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x18,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_u_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x18,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_u_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x18,0xd4,0x7b,0xfa,0x01,0x00]
0x0a,0x00,0x18,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_u_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x18,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_u_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x18,0xd4,0x7b,0xfa,0x01,0x00]

-# W32: v_cmp_u_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x18,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_u_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x18,0xd4,0x7d,0xe0,0x01,0x00]
0x0a,0x00,0x18,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_u_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x18,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_u_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x18,0xd4,0x7d,0xe0,0x01,0x00]

-# W32: v_cmp_u_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x18,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_u_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x18,0xd4,0x7e,0x82,0x01,0x00]
0x0a,0x00,0x18,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_u_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x18,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_u_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x18,0xd4,0x7e,0x82,0x01,0x00]

-# W32: v_cmp_u_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x18,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_u_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x18,0xd4,0x7f,0xf8,0x00,0x00]
0x0a,0x01,0x18,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_u_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x18,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_u_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x18,0xd4,0x7f,0xf8,0x00,0x00]

-# W32: v_cmp_u_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x18,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_u_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x18,0xd4,0x7c,0xfc,0x00,0x00]
0x0a,0x00,0x18,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_u_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x18,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_u_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x18,0xd4,0x7c,0xfc,0x00,0x00]

-# W32: v_cmp_u_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x18,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_u_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x18,0xd4,0xc1,0xfe,0x00,0x00]
0x68,0x00,0x18,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_u_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x18,0xd4,0xc1,0xfe,0x00,0x00]
+#
W64: v_cmp_u_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x18,0xd4,0xc1,0xfe,0x00,0x00]

-# W32: v_cmp_u_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x18,0xd4,0xf0,0xfa,0x00,0x40]
-# W64: v_cmp_u_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x18,0xd4,0xf0,0xfa,0x00,0x40]
 0x6a,0x00,0x18,0xd4,0xf0,0xfa,0x00,0x40
+# W32: v_cmp_u_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x18,0xd4,0xf0,0xfa,0x00,0x40]
+# W64: v_cmp_u_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x18,0xd4,0xf0,0xfa,0x00,0x40]

+0x7a,0x02,0x18,0xd4,0xfd,0xd4,0x00,0x20
 # W32: v_cmp_u_f32_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x18,0xd4,0xfd,0xd4,0x00,0x20]
 # W64: v_cmp_u_f32_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x18,0xd4,0xfd,0xd4,0x00,0x20]
-0x7a,0x02,0x18,0xd4,0xfd,0xd4,0x00,0x20

-# GFX11: v_cmp_u_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x18,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
 0x7c,0x83,0x18,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_u_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x18,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_u_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x28,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_u_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x28,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x28,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_u_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x28,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_u_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x28,0xd4,0x01,0x05,0x02,0x00]

+0x0a,0x00,0x28,0xd4,0xfe,0xfd,0x03,0x00
 # W32: v_cmp_u_f64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x28,0xd4,0xfe,0xfd,0x03,0x00]
 # W64: v_cmp_u_f64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x28,0xd4,0xfe,0xfd,0x03,0x00]
-0x0a,0x00,0x28,0xd4,0xfe,0xfd,0x03,0x00

-# W32: v_cmp_u_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x28,0xd4,0x02,0x08,0x00,0x00]
-# W64: v_cmp_u_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x28,0xd4,0x02,0x08,0x00,0x00]
 0x0a,0x00,0x28,0xd4,0x02,0x08,0x00,0x00
+# W32: v_cmp_u_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x28,0xd4,0x02,0x08,0x00,0x00]
+# W64: v_cmp_u_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x28,0xd4,0x02,0x08,0x00,0x00]

+0x0a,0x00,0x28,0xd4,0x68,0xd0,0x00,0x00
 # W32: v_cmp_u_f64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x28,0xd4,0x68,0xd0,0x00,0x00]
 # W64: v_cmp_u_f64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x28,0xd4,0x68,0xd0,0x00,0x00]
-0x0a,0x00,0x28,0xd4,0x68,0xd0,0x00,0x00

-# W32: v_cmp_u_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x28,0xd4,0x6a,0xf4,0x00,0x00]
-# W64: v_cmp_u_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x28,0xd4,0x6a,0xf4,0x00,0x00]
 0x0a,0x00,0x28,0xd4,0x6a,0xf4,0x00,0x00
+# W32: v_cmp_u_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x28,0xd4,0x6a,0xf4,0x00,0x00]
+# W64: v_cmp_u_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x28,0xd4,0x6a,0xf4,0x00,0x00]

+0x0a,0x00,0x28,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
 # W32: v_cmp_u_f64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x28,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 # W64: v_cmp_u_f64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x28,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x28,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf

-# W32: v_cmp_u_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x28,0xd4,0x7e,0xfa,0x01,0x20]
-# W64: v_cmp_u_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x28,0xd4,0x7e,0xfa,0x01,0x20]
 0x0a,0x01,0x28,0xd4,0x7e,0xfa,0x01,0x20
+# W32: v_cmp_u_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x28,0xd4,0x7e,0xfa,0x01,0x20]
+# W64: v_cmp_u_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x28,0xd4,0x7e,0xfa,0x01,0x20]

-# W32: v_cmp_u_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x28,0xd4,0x7c,0xe0,0x01,0x00]
-# W64: v_cmp_u_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x28,0xd4,0x7c,0xe0,0x01,0x00]
 0x0a,0x00,0x28,0xd4,0x7c,0xe0,0x01,0x00
+# W32: v_cmp_u_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x28,0xd4,0x7c,0xe0,0x01,0x00]
+# W64: v_cmp_u_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x28,0xd4,0x7c,0xe0,0x01,0x00]

-# W32: v_cmp_u_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x28,0xd4,0xc1,0x82,0x01,0x00]
-# W64: v_cmp_u_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x28,0xd4,0xc1,0x82,0x01,0x00]
 0x68,0x00,0x28,0xd4,0xc1,0x82,0x01,0x00
+# W32: v_cmp_u_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x28,0xd4,0xc1,0x82,0x01,0x00]
+# W64: v_cmp_u_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x28,0xd4,0xc1,0x82,0x01,0x00]

-# W32: v_cmp_u_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x28,0xd4,0xf0,0xf8,0x00,0x00]
-# W64: v_cmp_u_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x28,0xd4,0xf0,0xf8,0x00,0x00]
 0x6a,0x00,0x28,0xd4,0xf0,0xf8,0x00,0x00
+# W32: v_cmp_u_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x28,0xd4,0xf0,0xf8,0x00,0x00]
+# W64: v_cmp_u_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x28,0xd4,0xf0,0xf8,0x00,0x00]

+0x7a,0x03,0x28,0xd4,0xfd,0xfc,0x00,0x60
 # W32: v_cmp_u_f64_e64 ttmp14, -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x28,0xd4,0xfd,0xfc,0x00,0x60]
 # W64: v_cmp_u_f64_e64 ttmp[14:15], -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x28,0xd4,0xfd,0xfc,0x00,0x60]
-0x7a,0x03,0x28,0xd4,0xfd,0xfc,0x00,0x60

-# GFX11: v_cmp_u_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x28,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
 0x7c,0x82,0x28,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf
+# GFX11: v_cmp_u_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x28,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt
index 3f2d1164029eb..d519c0ffa66c6 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt
@@ -1,3992 +1,3995 @@
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s
+# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s

-# GFX11: v_cmpx_class_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xfd,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_class_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x05,0x02,0x00]

-# GFX11: v_cmpx_class_f16_e64 v1, 0.5 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0xe1,0x01,0x00]
 0x7e,0x00,0xfd,0xd4,0x01,0xe1,0x01,0x00
+# GFX11: v_cmpx_class_f16_e64 v1, 0.5 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0xe1,0x01,0x00]

-# GFX11: v_cmpx_class_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0xff,0x05,0x02,0x00]
 0x7e,0x00,0xfd,0xd4,0xff,0x05,0x02,0x00
+# GFX11: v_cmpx_class_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0xff,0x05,0x02,0x00]

-# GFX11: v_cmpx_class_f16_e64 s1, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x04,0x02,0x00]
 0x7e,0x00,0xfd,0xd4,0x01,0x04,0x02,0x00
+# GFX11: v_cmpx_class_f16_e64 s1, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x04,0x02,0x00]

-# GFX11: v_cmpx_class_f16_e64 s105, v255 ; encoding: [0x7e,0x00,0xfd,0xd4,0x69,0xfe,0x03,0x00]
 0x7e,0x00,0xfd,0xd4,0x69,0xfe,0x03,0x00
+# GFX11: v_cmpx_class_f16_e64 s105, v255 ; encoding: [0x7e,0x00,0xfd,0xd4,0x69,0xfe,0x03,0x00]

-# GFX11: v_cmpx_class_f16_e64 vcc_lo, s2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x6a,0x04,0x00,0x00]
 0x7e,0x00,0xfd,0xd4,0x6a,0x04,0x00,0x00
+# GFX11: v_cmpx_class_f16_e64 vcc_lo, s2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x6a,0x04,0x00,0x00]

-# GFX11: v_cmpx_class_f16_e64 vcc_hi, s105 ; encoding: [0x7e,0x00,0xfd,0xd4,0x6b,0xd2,0x00,0x00]
 0x7e,0x00,0xfd,0xd4,0x6b,0xd2,0x00,0x00
+# GFX11: v_cmpx_class_f16_e64 vcc_hi, s105 ; encoding: [0x7e,0x00,0xfd,0xd4,0x6b,0xd2,0x00,0x00]

-# GFX11: v_cmpx_class_f16_e64 ttmp15, ttmp15 ; encoding: [0x7e,0x00,0xfd,0xd4,0x7b,0xf6,0x00,0x00]
 0x7e,0x00,0xfd,0xd4,0x7b,0xf6,0x00,0x00
+# GFX11: v_cmpx_class_f16_e64 ttmp15, ttmp15 ; encoding: [0x7e,0x00,0xfd,0xd4,0x7b,0xf6,0x00,0x00]

-# GFX11: v_cmpx_class_f16_e64 m0, src_scc ; encoding: [0x7e,0x00,0xfd,0xd4,0x7d,0xfa,0x01,0x00]
 0x7e,0x00,0xfd,0xd4,0x7d,0xfa,0x01,0x00
+# GFX11: v_cmpx_class_f16_e64 m0, src_scc ; encoding: [0x7e,0x00,0xfd,0xd4,0x7d,0xfa,0x01,0x00]

-# GFX11: v_cmpx_class_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xfd,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0xfd,0xd4,0x7e,0x82,0x01,0x00
+# GFX11: v_cmpx_class_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xfd,0xd4,0x7e,0x82,0x01,0x00]

-# GFX11: v_cmpx_class_f16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xfd,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x00,0xfd,0xd4,0x7f,0xf8,0x00,0x00
+# GFX11: v_cmpx_class_f16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xfd,0xd4,0x7f,0xf8,0x00,0x00]

-# GFX11: v_cmpx_class_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xfd,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0xfd,0xd4,0x7c,0xfc,0x00,0x00
+# GFX11: v_cmpx_class_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xfd,0xd4,0x7c,0xfc,0x00,0x00]

-# GFX11: v_cmpx_class_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xfd,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0xfd,0xd4,0xc1,0xfe,0x00,0x00
+# GFX11: v_cmpx_class_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xfd,0xd4,0xc1,0xfe,0x00,0x00]

-# GFX11: v_cmpx_class_f16_e64 0.5, m0 ; encoding: [0x7e,0x00,0xfd,0xd4,0xf0,0xfa,0x00,0x00]
 0x7e,0x00,0xfd,0xd4,0xf0,0xfa,0x00,0x00
+# GFX11: v_cmpx_class_f16_e64 0.5, m0 ; encoding: [0x7e,0x00,0xfd,0xd4,0xf0,0xfa,0x00,0x00]

-# GFX11: v_cmpx_class_f16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xfd,0xd4,0xfd,0xd4,0x00,0x00]
 0x7e,0x00,0xfd,0xd4,0xfd,0xd4,0x00,0x00
+# GFX11: v_cmpx_class_f16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xfd,0xd4,0xfd,0xd4,0x00,0x00]

-# GFX11: v_cmpx_class_f16_e64 -|0xfe0b|, vcc_hi ; encoding:
[0x7e,0x01,0xfd,0xd4,0xff,0xd6,0x00,0x20,0x0b,0xfe,0x00,0x00] 0x7e,0x01,0xfd,0xd4,0xff,0xd6,0x00,0x20,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_class_f16_e64 -|0xfe0b|, vcc_hi ; encoding: [0x7e,0x01,0xfd,0xd4,0xff,0xd6,0x00,0x20,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_class_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0xfe,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xfe,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_class_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0xfe,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_class_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0xfe,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xfe,0xd4,0xff,0xff,0x03,0x00 +# GFX11: v_cmpx_class_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0xfe,0xd4,0xff,0xff,0x03,0x00] -# GFX11: v_cmpx_class_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0xfe,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0xfe,0xd4,0x01,0x04,0x00,0x00 +# GFX11: v_cmpx_class_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0xfe,0xd4,0x01,0x04,0x00,0x00] -# GFX11: v_cmpx_class_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0xfe,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0xfe,0xd4,0x69,0xd2,0x00,0x00 +# GFX11: v_cmpx_class_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0xfe,0xd4,0x69,0xd2,0x00,0x00] -# GFX11: v_cmpx_class_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xfe,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0xfe,0xd4,0x6a,0xf6,0x00,0x00 +# GFX11: v_cmpx_class_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xfe,0xd4,0x6a,0xf6,0x00,0x00] -# GFX11: v_cmpx_class_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xfe,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xfe,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_class_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xfe,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_class_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xfe,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0xfe,0xd4,0x7b,0xfa,0x01,0x00 +# GFX11: v_cmpx_class_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xfe,0xd4,0x7b,0xfa,0x01,0x00] -# GFX11: v_cmpx_class_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xfe,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0xfe,0xd4,0x7d,0xe0,0x01,0x00 +# GFX11: v_cmpx_class_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xfe,0xd4,0x7d,0xe0,0x01,0x00] -# GFX11: v_cmpx_class_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xfe,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0xfe,0xd4,0x7e,0x82,0x01,0x00 +# GFX11: v_cmpx_class_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xfe,0xd4,0x7e,0x82,0x01,0x00] -# GFX11: v_cmpx_class_f32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xfe,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x00,0xfe,0xd4,0x7f,0xf8,0x00,0x00 +# GFX11: v_cmpx_class_f32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xfe,0xd4,0x7f,0xf8,0x00,0x00] -# GFX11: v_cmpx_class_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xfe,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0xfe,0xd4,0x7c,0xfc,0x00,0x00 +# GFX11: v_cmpx_class_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xfe,0xd4,0x7c,0xfc,0x00,0x00] -# GFX11: v_cmpx_class_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xfe,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0xfe,0xd4,0xc1,0xfe,0x00,0x00 +# GFX11: v_cmpx_class_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xfe,0xd4,0xc1,0xfe,0x00,0x00] -# GFX11: v_cmpx_class_f32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xfe,0xd4,0xf0,0xfa,0x00,0x00] 0x7e,0x00,0xfe,0xd4,0xf0,0xfa,0x00,0x00 +# GFX11: v_cmpx_class_f32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xfe,0xd4,0xf0,0xfa,0x00,0x00] -# GFX11: v_cmpx_class_f32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xfe,0xd4,0xfd,0xd4,0x00,0x00] 0x7e,0x00,0xfe,0xd4,0xfd,0xd4,0x00,0x00 +# GFX11: v_cmpx_class_f32_e64 src_scc, vcc_lo ; encoding: 
[0x7e,0x00,0xfe,0xd4,0xfd,0xd4,0x00,0x00] -# GFX11: v_cmpx_class_f32_e64 -|0xaf123456|, vcc_hi ; encoding: [0x7e,0x01,0xfe,0xd4,0xff,0xd6,0x00,0x20,0x56,0x34,0x12,0xaf] 0x7e,0x01,0xfe,0xd4,0xff,0xd6,0x00,0x20,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_class_f32_e64 -|0xaf123456|, vcc_hi ; encoding: [0x7e,0x01,0xfe,0xd4,0xff,0xd6,0x00,0x20,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_class_f64_e64 v[1:2], v2 ; encoding: [0x7e,0x00,0xff,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xff,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_class_f64_e64 v[1:2], v2 ; encoding: [0x7e,0x00,0xff,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_class_f64_e64 v[1:2], v255 ; encoding: [0x7e,0x00,0xff,0xd4,0x01,0xff,0x03,0x00] 0x7e,0x00,0xff,0xd4,0x01,0xff,0x03,0x00 +# GFX11: v_cmpx_class_f64_e64 v[1:2], v255 ; encoding: [0x7e,0x00,0xff,0xd4,0x01,0xff,0x03,0x00] -# GFX11: v_cmpx_class_f64_e64 v[1:2], s2 ; encoding: [0x7e,0x00,0xff,0xd4,0x01,0x05,0x00,0x00] 0x7e,0x00,0xff,0xd4,0x01,0x05,0x00,0x00 +# GFX11: v_cmpx_class_f64_e64 v[1:2], s2 ; encoding: [0x7e,0x00,0xff,0xd4,0x01,0x05,0x00,0x00] -# GFX11: v_cmpx_class_f64_e64 v[1:2], s105 ; encoding: [0x7e,0x00,0xff,0xd4,0x01,0xd3,0x00,0x00] 0x7e,0x00,0xff,0xd4,0x01,0xd3,0x00,0x00 +# GFX11: v_cmpx_class_f64_e64 v[1:2], s105 ; encoding: [0x7e,0x00,0xff,0xd4,0x01,0xd3,0x00,0x00] -# GFX11: v_cmpx_class_f64_e64 v[254:255], ttmp15 ; encoding: [0x7e,0x00,0xff,0xd4,0xfe,0xf7,0x00,0x00] 0x7e,0x00,0xff,0xd4,0xfe,0xf7,0x00,0x00 +# GFX11: v_cmpx_class_f64_e64 v[254:255], ttmp15 ; encoding: [0x7e,0x00,0xff,0xd4,0xfe,0xf7,0x00,0x00] -# GFX11: v_cmpx_class_f64_e64 s[2:3], vcc_hi ; encoding: [0x7e,0x00,0xff,0xd4,0x02,0xd6,0x00,0x00] 0x7e,0x00,0xff,0xd4,0x02,0xd6,0x00,0x00 +# GFX11: v_cmpx_class_f64_e64 s[2:3], vcc_hi ; encoding: [0x7e,0x00,0xff,0xd4,0x02,0xd6,0x00,0x00] -# GFX11: v_cmpx_class_f64_e64 s[104:105], vcc_lo ; encoding: [0x7e,0x00,0xff,0xd4,0x68,0xd4,0x00,0x00] 0x7e,0x00,0xff,0xd4,0x68,0xd4,0x00,0x00 +# GFX11: v_cmpx_class_f64_e64 s[104:105], vcc_lo ; encoding: [0x7e,0x00,0xff,0xd4,0x68,0xd4,0x00,0x00] -# GFX11: v_cmpx_class_f64_e64 vcc, m0 ; encoding: [0x7e,0x00,0xff,0xd4,0x6a,0xfa,0x00,0x00] 0x7e,0x00,0xff,0xd4,0x6a,0xfa,0x00,0x00 +# GFX11: v_cmpx_class_f64_e64 vcc, m0 ; encoding: [0x7e,0x00,0xff,0xd4,0x6a,0xfa,0x00,0x00] -# GFX11: v_cmpx_class_f64_e64 ttmp[14:15], exec_hi ; encoding: [0x7e,0x00,0xff,0xd4,0x7a,0xfe,0x00,0x00] 0x7e,0x00,0xff,0xd4,0x7a,0xfe,0x00,0x00 +# GFX11: v_cmpx_class_f64_e64 ttmp[14:15], exec_hi ; encoding: [0x7e,0x00,0xff,0xd4,0x7a,0xfe,0x00,0x00] -# GFX11: v_cmpx_class_f64_e64 exec, exec_lo ; encoding: [0x7e,0x00,0xff,0xd4,0x7e,0xfc,0x00,0x00] 0x7e,0x00,0xff,0xd4,0x7e,0xfc,0x00,0x00 +# GFX11: v_cmpx_class_f64_e64 exec, exec_lo ; encoding: [0x7e,0x00,0xff,0xd4,0x7e,0xfc,0x00,0x00] -# GFX11: v_cmpx_class_f64_e64 null, null ; encoding: [0x7e,0x00,0xff,0xd4,0x7c,0xf8,0x00,0x00] 0x7e,0x00,0xff,0xd4,0x7c,0xf8,0x00,0x00 +# GFX11: v_cmpx_class_f64_e64 null, null ; encoding: [0x7e,0x00,0xff,0xd4,0x7c,0xf8,0x00,0x00] -# GFX11: v_cmpx_class_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xff,0xd4,0xc1,0x82,0x01,0x00] 0x7e,0x00,0xff,0xd4,0xc1,0x82,0x01,0x00 +# GFX11: v_cmpx_class_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xff,0xd4,0xc1,0x82,0x01,0x00] -# GFX11: v_cmpx_class_f64_e64 0.5, 0.5 ; encoding: [0x7e,0x00,0xff,0xd4,0xf0,0xe0,0x01,0x00] 0x7e,0x00,0xff,0xd4,0xf0,0xe0,0x01,0x00 +# GFX11: v_cmpx_class_f64_e64 0.5, 0.5 ; encoding: [0x7e,0x00,0xff,0xd4,0xf0,0xe0,0x01,0x00] -# GFX11: v_cmpx_class_f64_e64 -|src_scc|, src_scc ; encoding: [0x7e,0x01,0xff,0xd4,0xfd,0xfa,0x01,0x20] 
0x7e,0x01,0xff,0xd4,0xfd,0xfa,0x01,0x20 +# GFX11: v_cmpx_class_f64_e64 -|src_scc|, src_scc ; encoding: [0x7e,0x01,0xff,0xd4,0xfd,0xfa,0x01,0x20] -# GFX11: v_cmpx_class_f64_e64 0xaf123456, 0xaf123456 ; encoding: [0x7e,0x00,0xff,0xd4,0xff,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xff,0xd4,0xff,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_class_f64_e64 0xaf123456, 0xaf123456 ; encoding: [0x7e,0x00,0xff,0xd4,0xff,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_eq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_eq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_eq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00 +# GFX11: v_cmpx_eq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00] -# GFX11: v_cmpx_eq_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0x82,0xd4,0x01,0x04,0x00,0x00 +# GFX11: v_cmpx_eq_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x04,0x00,0x00] -# GFX11: v_cmpx_eq_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x82,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0x82,0xd4,0x69,0xd2,0x00,0x00 +# GFX11: v_cmpx_eq_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x82,0xd4,0x69,0xd2,0x00,0x00] -# GFX11: v_cmpx_eq_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x82,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0x82,0xd4,0x6a,0xf6,0x00,0x00 +# GFX11: v_cmpx_eq_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x82,0xd4,0x6a,0xf6,0x00,0x00] -# GFX11: v_cmpx_eq_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x82,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0x82,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_eq_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x82,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_eq_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x82,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0x82,0xd4,0x7b,0xfa,0x01,0x00 +# GFX11: v_cmpx_eq_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x82,0xd4,0x7b,0xfa,0x01,0x00] -# GFX11: v_cmpx_eq_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x82,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0x82,0xd4,0x7d,0xe0,0x01,0x00 +# GFX11: v_cmpx_eq_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x82,0xd4,0x7d,0xe0,0x01,0x00] -# GFX11: v_cmpx_eq_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x82,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0x82,0xd4,0x7e,0x82,0x01,0x00 +# GFX11: v_cmpx_eq_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x82,0xd4,0x7e,0x82,0x01,0x00] -# GFX11: v_cmpx_eq_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x82,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x01,0x82,0xd4,0x7f,0xf8,0x00,0x00 +# GFX11: v_cmpx_eq_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x82,0xd4,0x7f,0xf8,0x00,0x00] -# GFX11: v_cmpx_eq_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x82,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0x82,0xd4,0x7c,0xfc,0x00,0x00 +# GFX11: v_cmpx_eq_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x82,0xd4,0x7c,0xfc,0x00,0x00] -# GFX11: v_cmpx_eq_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x82,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0x82,0xd4,0xc1,0xfe,0x00,0x00 +# GFX11: v_cmpx_eq_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x82,0xd4,0xc1,0xfe,0x00,0x00] -# GFX11: v_cmpx_eq_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x82,0xd4,0xf0,0xfa,0x00,0x40] 0x7e,0x00,0x82,0xd4,0xf0,0xfa,0x00,0x40 +# GFX11: v_cmpx_eq_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x82,0xd4,0xf0,0xfa,0x00,0x40] -# GFX11: v_cmpx_eq_f16_e64 -src_scc, |vcc_lo| ; encoding: 
[0x7e,0x02,0x82,0xd4,0xfd,0xd4,0x00,0x20] 0x7e,0x02,0x82,0xd4,0xfd,0xd4,0x00,0x20 +# GFX11: v_cmpx_eq_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x82,0xd4,0xfd,0xd4,0x00,0x20] -# GFX11: v_cmpx_eq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x82,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x83,0x82,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_eq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x82,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_eq_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x92,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x92,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_eq_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x92,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_eq_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x92,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x92,0xd4,0xff,0xff,0x03,0x00 +# GFX11: v_cmpx_eq_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x92,0xd4,0xff,0xff,0x03,0x00] -# GFX11: v_cmpx_eq_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x92,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0x92,0xd4,0x01,0x04,0x00,0x00 +# GFX11: v_cmpx_eq_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x92,0xd4,0x01,0x04,0x00,0x00] -# GFX11: v_cmpx_eq_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x92,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0x92,0xd4,0x69,0xd2,0x00,0x00 +# GFX11: v_cmpx_eq_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x92,0xd4,0x69,0xd2,0x00,0x00] -# GFX11: v_cmpx_eq_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x92,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0x92,0xd4,0x6a,0xf6,0x00,0x00 +# GFX11: v_cmpx_eq_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x92,0xd4,0x6a,0xf6,0x00,0x00] -# GFX11: v_cmpx_eq_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x92,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x92,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_eq_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x92,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_eq_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x92,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0x92,0xd4,0x7b,0xfa,0x01,0x00 +# GFX11: v_cmpx_eq_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x92,0xd4,0x7b,0xfa,0x01,0x00] -# GFX11: v_cmpx_eq_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x92,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0x92,0xd4,0x7d,0xe0,0x01,0x00 +# GFX11: v_cmpx_eq_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x92,0xd4,0x7d,0xe0,0x01,0x00] -# GFX11: v_cmpx_eq_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x92,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0x92,0xd4,0x7e,0x82,0x01,0x00 +# GFX11: v_cmpx_eq_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x92,0xd4,0x7e,0x82,0x01,0x00] -# GFX11: v_cmpx_eq_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x92,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x01,0x92,0xd4,0x7f,0xf8,0x00,0x00 +# GFX11: v_cmpx_eq_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x92,0xd4,0x7f,0xf8,0x00,0x00] -# GFX11: v_cmpx_eq_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x92,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0x92,0xd4,0x7c,0xfc,0x00,0x00 +# GFX11: v_cmpx_eq_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x92,0xd4,0x7c,0xfc,0x00,0x00] -# GFX11: v_cmpx_eq_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x92,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0x92,0xd4,0xc1,0xfe,0x00,0x00 +# GFX11: v_cmpx_eq_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x92,0xd4,0xc1,0xfe,0x00,0x00] -# GFX11: v_cmpx_eq_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x92,0xd4,0xf0,0xfa,0x00,0x40] 0x7e,0x00,0x92,0xd4,0xf0,0xfa,0x00,0x40 +# GFX11: v_cmpx_eq_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x92,0xd4,0xf0,0xfa,0x00,0x40] -# GFX11: v_cmpx_eq_f32_e64 
-src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x92,0xd4,0xfd,0xd4,0x00,0x20] 0x7e,0x02,0x92,0xd4,0xfd,0xd4,0x00,0x20 +# GFX11: v_cmpx_eq_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x92,0xd4,0xfd,0xd4,0x00,0x20] -# GFX11: v_cmpx_eq_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x92,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] 0x7e,0x83,0x92,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_eq_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x92,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_eq_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa2,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xa2,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_eq_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa2,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_eq_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa2,0xd4,0xfe,0xfd,0x03,0x00] 0x7e,0x00,0xa2,0xd4,0xfe,0xfd,0x03,0x00 +# GFX11: v_cmpx_eq_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa2,0xd4,0xfe,0xfd,0x03,0x00] -# GFX11: v_cmpx_eq_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa2,0xd4,0x02,0x08,0x00,0x00] 0x7e,0x00,0xa2,0xd4,0x02,0x08,0x00,0x00 +# GFX11: v_cmpx_eq_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa2,0xd4,0x02,0x08,0x00,0x00] -# GFX11: v_cmpx_eq_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa2,0xd4,0x68,0xd0,0x00,0x00] 0x7e,0x00,0xa2,0xd4,0x68,0xd0,0x00,0x00 +# GFX11: v_cmpx_eq_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa2,0xd4,0x68,0xd0,0x00,0x00] -# GFX11: v_cmpx_eq_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa2,0xd4,0x6a,0xf4,0x00,0x00] 0x7e,0x00,0xa2,0xd4,0x6a,0xf4,0x00,0x00 +# GFX11: v_cmpx_eq_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa2,0xd4,0x6a,0xf4,0x00,0x00] -# GFX11: v_cmpx_eq_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa2,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xa2,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_eq_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa2,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_eq_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa2,0xd4,0x7e,0xfa,0x01,0x20] 0x7e,0x01,0xa2,0xd4,0x7e,0xfa,0x01,0x20 +# GFX11: v_cmpx_eq_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa2,0xd4,0x7e,0xfa,0x01,0x20] -# GFX11: v_cmpx_eq_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa2,0xd4,0x7c,0xe0,0x01,0x00] 0x7e,0x00,0xa2,0xd4,0x7c,0xe0,0x01,0x00 +# GFX11: v_cmpx_eq_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa2,0xd4,0x7c,0xe0,0x01,0x00] -# GFX11: v_cmpx_eq_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa2,0xd4,0xc1,0x82,0x01,0x00] 0x7e,0x00,0xa2,0xd4,0xc1,0x82,0x01,0x00 +# GFX11: v_cmpx_eq_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa2,0xd4,0xc1,0x82,0x01,0x00] -# GFX11: v_cmpx_eq_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa2,0xd4,0xf0,0xf8,0x00,0x00] 0x7e,0x00,0xa2,0xd4,0xf0,0xf8,0x00,0x00 +# GFX11: v_cmpx_eq_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa2,0xd4,0xf0,0xf8,0x00,0x00] -# GFX11: v_cmpx_eq_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa2,0xd4,0xfd,0xfc,0x00,0x60] 0x7e,0x03,0xa2,0xd4,0xfd,0xfc,0x00,0x60 +# GFX11: v_cmpx_eq_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa2,0xd4,0xfd,0xfc,0x00,0x60] -# GFX11: v_cmpx_eq_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa2,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7e,0x82,0xa2,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_eq_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa2,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_eq_i16_e64 v1, v2 
; encoding: [0x7e,0x00,0xb2,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xb2,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_eq_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb2,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_eq_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xb2,0xd4,0xff,0xff,0x03,0x00 +# GFX11: v_cmpx_eq_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xff,0x03,0x00] -# GFX11: v_cmpx_eq_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb2,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0xb2,0xd4,0x01,0x04,0x00,0x00 +# GFX11: v_cmpx_eq_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb2,0xd4,0x01,0x04,0x00,0x00] -# GFX11: v_cmpx_eq_i16_e64 s105, s105 ; encoding: [0x7e,0x00,0xb2,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0xb2,0xd4,0x69,0xd2,0x00,0x00 +# GFX11: v_cmpx_eq_i16_e64 s105, s105 ; encoding: [0x7e,0x00,0xb2,0xd4,0x69,0xd2,0x00,0x00] -# GFX11: v_cmpx_eq_i16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xb2,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0xb2,0xd4,0x6a,0xf6,0x00,0x00 +# GFX11: v_cmpx_eq_i16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xb2,0xd4,0x6a,0xf6,0x00,0x00] -# GFX11: v_cmpx_eq_i16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xb2,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0xb2,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_eq_i16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xb2,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_eq_i16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xb2,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0xb2,0xd4,0x7b,0xfa,0x01,0x00 +# GFX11: v_cmpx_eq_i16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xb2,0xd4,0x7b,0xfa,0x01,0x00] -# GFX11: v_cmpx_eq_i16_e64 m0, 0x3800 0x7e,0x00,0xb2,0xd4,0x7d,0xe0,0x01,0x00 +# GFX11: v_cmpx_eq_i16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xb2,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] -# GFX11: v_cmpx_eq_i16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xb2,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0xb2,0xd4,0x7e,0x82,0x01,0x00 +# GFX11: v_cmpx_eq_i16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xb2,0xd4,0x7e,0x82,0x01,0x00] -# GFX11: v_cmpx_eq_i16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xb2,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x00,0xb2,0xd4,0x7f,0xf8,0x00,0x00 +# GFX11: v_cmpx_eq_i16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xb2,0xd4,0x7f,0xf8,0x00,0x00] -# GFX11: v_cmpx_eq_i16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xb2,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0xb2,0xd4,0x7c,0xfc,0x00,0x00 +# GFX11: v_cmpx_eq_i16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xb2,0xd4,0x7c,0xfc,0x00,0x00] -# GFX11: v_cmpx_eq_i16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xb2,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0xb2,0xd4,0xc1,0xfe,0x00,0x00 +# GFX11: v_cmpx_eq_i16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xb2,0xd4,0xc1,0xfe,0x00,0x00] -# GFX11: v_cmpx_eq_i16_e64 0x3800, m0 0x7e,0x00,0xb2,0xd4,0xf0,0xfa,0x00,0x00 +# GFX11: v_cmpx_eq_i16_e64 0x3800, m0 ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] -# GFX11: v_cmpx_eq_i16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xb2,0xd4,0xfd,0xd4,0x00,0x00] 0x7e,0x00,0xb2,0xd4,0xfd,0xd4,0x00,0x00 +# GFX11: v_cmpx_eq_i16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xb2,0xd4,0xfd,0xd4,0x00,0x00] -# GFX11: v_cmpx_eq_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0xb2,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_eq_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_eq_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc2,0xd4,0x01,0x05,0x02,0x00] 
0x7e,0x00,0xc2,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_eq_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc2,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_eq_i32_e64 v255, v255 ; encoding: [0x7e,0x00,0xc2,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xc2,0xd4,0xff,0xff,0x03,0x00 +# GFX11: v_cmpx_eq_i32_e64 v255, v255 ; encoding: [0x7e,0x00,0xc2,0xd4,0xff,0xff,0x03,0x00] -# GFX11: v_cmpx_eq_i32_e64 s1, s2 ; encoding: [0x7e,0x00,0xc2,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0xc2,0xd4,0x01,0x04,0x00,0x00 +# GFX11: v_cmpx_eq_i32_e64 s1, s2 ; encoding: [0x7e,0x00,0xc2,0xd4,0x01,0x04,0x00,0x00] -# GFX11: v_cmpx_eq_i32_e64 s105, s105 ; encoding: [0x7e,0x00,0xc2,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0xc2,0xd4,0x69,0xd2,0x00,0x00 +# GFX11: v_cmpx_eq_i32_e64 s105, s105 ; encoding: [0x7e,0x00,0xc2,0xd4,0x69,0xd2,0x00,0x00] -# GFX11: v_cmpx_eq_i32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xc2,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0xc2,0xd4,0x6a,0xf6,0x00,0x00 +# GFX11: v_cmpx_eq_i32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xc2,0xd4,0x6a,0xf6,0x00,0x00] -# GFX11: v_cmpx_eq_i32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xc2,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xc2,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_eq_i32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xc2,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_eq_i32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xc2,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0xc2,0xd4,0x7b,0xfa,0x01,0x00 +# GFX11: v_cmpx_eq_i32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xc2,0xd4,0x7b,0xfa,0x01,0x00] -# GFX11: v_cmpx_eq_i32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xc2,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0xc2,0xd4,0x7d,0xe0,0x01,0x00 +# GFX11: v_cmpx_eq_i32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xc2,0xd4,0x7d,0xe0,0x01,0x00] -# GFX11: v_cmpx_eq_i32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xc2,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0xc2,0xd4,0x7e,0x82,0x01,0x00 +# GFX11: v_cmpx_eq_i32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xc2,0xd4,0x7e,0x82,0x01,0x00] -# GFX11: v_cmpx_eq_i32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xc2,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x00,0xc2,0xd4,0x7f,0xf8,0x00,0x00 +# GFX11: v_cmpx_eq_i32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xc2,0xd4,0x7f,0xf8,0x00,0x00] -# GFX11: v_cmpx_eq_i32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xc2,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0xc2,0xd4,0x7c,0xfc,0x00,0x00 +# GFX11: v_cmpx_eq_i32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xc2,0xd4,0x7c,0xfc,0x00,0x00] -# GFX11: v_cmpx_eq_i32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xc2,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0xc2,0xd4,0xc1,0xfe,0x00,0x00 +# GFX11: v_cmpx_eq_i32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xc2,0xd4,0xc1,0xfe,0x00,0x00] -# GFX11: v_cmpx_eq_i32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xc2,0xd4,0xf0,0xfa,0x00,0x00] 0x7e,0x00,0xc2,0xd4,0xf0,0xfa,0x00,0x00 +# GFX11: v_cmpx_eq_i32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xc2,0xd4,0xf0,0xfa,0x00,0x00] -# GFX11: v_cmpx_eq_i32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xc2,0xd4,0xfd,0xd4,0x00,0x00] 0x7e,0x00,0xc2,0xd4,0xfd,0xd4,0x00,0x00 +# GFX11: v_cmpx_eq_i32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xc2,0xd4,0xfd,0xd4,0x00,0x00] -# GFX11: v_cmpx_eq_i32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xc2,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xc2,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_eq_i32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xc2,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_eq_i64_e64 v[1:2], v[2:3] ; encoding: 
[0x7e,0x00,0xd2,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xd2,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_eq_i64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xd2,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_eq_i64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xd2,0xd4,0xfe,0xfd,0x03,0x00] 0x7e,0x00,0xd2,0xd4,0xfe,0xfd,0x03,0x00 +# GFX11: v_cmpx_eq_i64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xd2,0xd4,0xfe,0xfd,0x03,0x00] -# GFX11: v_cmpx_eq_i64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xd2,0xd4,0x02,0x08,0x00,0x00] 0x7e,0x00,0xd2,0xd4,0x02,0x08,0x00,0x00 +# GFX11: v_cmpx_eq_i64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xd2,0xd4,0x02,0x08,0x00,0x00] -# GFX11: v_cmpx_eq_i64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xd2,0xd4,0x68,0xd0,0x00,0x00] 0x7e,0x00,0xd2,0xd4,0x68,0xd0,0x00,0x00 +# GFX11: v_cmpx_eq_i64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xd2,0xd4,0x68,0xd0,0x00,0x00] -# GFX11: v_cmpx_eq_i64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xd2,0xd4,0x6a,0xf4,0x00,0x00] 0x7e,0x00,0xd2,0xd4,0x6a,0xf4,0x00,0x00 +# GFX11: v_cmpx_eq_i64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xd2,0xd4,0x6a,0xf4,0x00,0x00] -# GFX11: v_cmpx_eq_i64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xd2,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xd2,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_eq_i64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xd2,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_eq_i64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xd2,0xd4,0x7e,0xfa,0x01,0x00] 0x7e,0x00,0xd2,0xd4,0x7e,0xfa,0x01,0x00 +# GFX11: v_cmpx_eq_i64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xd2,0xd4,0x7e,0xfa,0x01,0x00] -# GFX11: v_cmpx_eq_i64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xd2,0xd4,0x7c,0xe0,0x01,0x00] 0x7e,0x00,0xd2,0xd4,0x7c,0xe0,0x01,0x00 +# GFX11: v_cmpx_eq_i64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xd2,0xd4,0x7c,0xe0,0x01,0x00] -# GFX11: v_cmpx_eq_i64_e64 -1, -1 ; encoding: [0x7e,0x00,0xd2,0xd4,0xc1,0x82,0x01,0x00] 0x7e,0x00,0xd2,0xd4,0xc1,0x82,0x01,0x00 +# GFX11: v_cmpx_eq_i64_e64 -1, -1 ; encoding: [0x7e,0x00,0xd2,0xd4,0xc1,0x82,0x01,0x00] -# GFX11: v_cmpx_eq_i64_e64 0.5, null ; encoding: [0x7e,0x00,0xd2,0xd4,0xf0,0xf8,0x00,0x00] 0x7e,0x00,0xd2,0xd4,0xf0,0xf8,0x00,0x00 +# GFX11: v_cmpx_eq_i64_e64 0.5, null ; encoding: [0x7e,0x00,0xd2,0xd4,0xf0,0xf8,0x00,0x00] -# GFX11: v_cmpx_eq_i64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xd2,0xd4,0xfd,0xfc,0x00,0x00] 0x7e,0x00,0xd2,0xd4,0xfd,0xfc,0x00,0x00 +# GFX11: v_cmpx_eq_i64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xd2,0xd4,0xfd,0xfc,0x00,0x00] -# GFX11: v_cmpx_eq_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd2,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xd2,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_eq_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd2,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_eq_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xba,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xba,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_eq_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xba,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_eq_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xba,0xd4,0xff,0xff,0x03,0x00 +# GFX11: v_cmpx_eq_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xff,0x03,0x00] -# GFX11: v_cmpx_eq_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xba,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0xba,0xd4,0x01,0x04,0x00,0x00 +# GFX11: v_cmpx_eq_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xba,0xd4,0x01,0x04,0x00,0x00] -# 
GFX11: v_cmpx_eq_u16_e64 s105, s105 ; encoding: [0x7e,0x00,0xba,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0xba,0xd4,0x69,0xd2,0x00,0x00 +# GFX11: v_cmpx_eq_u16_e64 s105, s105 ; encoding: [0x7e,0x00,0xba,0xd4,0x69,0xd2,0x00,0x00] -# GFX11: v_cmpx_eq_u16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xba,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0xba,0xd4,0x6a,0xf6,0x00,0x00 +# GFX11: v_cmpx_eq_u16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xba,0xd4,0x6a,0xf6,0x00,0x00] -# GFX11: v_cmpx_eq_u16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xba,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0xba,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_eq_u16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xba,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_eq_u16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xba,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0xba,0xd4,0x7b,0xfa,0x01,0x00 +# GFX11: v_cmpx_eq_u16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xba,0xd4,0x7b,0xfa,0x01,0x00] -# GFX11: v_cmpx_eq_u16_e64 m0, 0x3800 0x7e,0x00,0xba,0xd4,0x7d,0xe0,0x01,0x00 +# GFX11: v_cmpx_eq_u16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xba,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] -# GFX11: v_cmpx_eq_u16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xba,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0xba,0xd4,0x7e,0x82,0x01,0x00 +# GFX11: v_cmpx_eq_u16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xba,0xd4,0x7e,0x82,0x01,0x00] -# GFX11: v_cmpx_eq_u16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xba,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x00,0xba,0xd4,0x7f,0xf8,0x00,0x00 +# GFX11: v_cmpx_eq_u16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xba,0xd4,0x7f,0xf8,0x00,0x00] -# GFX11: v_cmpx_eq_u16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xba,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0xba,0xd4,0x7c,0xfc,0x00,0x00 +# GFX11: v_cmpx_eq_u16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xba,0xd4,0x7c,0xfc,0x00,0x00] -# GFX11: v_cmpx_eq_u16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xba,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0xba,0xd4,0xc1,0xfe,0x00,0x00 +# GFX11: v_cmpx_eq_u16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xba,0xd4,0xc1,0xfe,0x00,0x00] -# GFX11: v_cmpx_eq_u16_e64 0x3800, m0 0x7e,0x00,0xba,0xd4,0xf0,0xfa,0x00,0x00 +# GFX11: v_cmpx_eq_u16_e64 0x3800, m0 ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] -# GFX11: v_cmpx_eq_u16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xba,0xd4,0xfd,0xd4,0x00,0x00] 0x7e,0x00,0xba,0xd4,0xfd,0xd4,0x00,0x00 +# GFX11: v_cmpx_eq_u16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xba,0xd4,0xfd,0xd4,0x00,0x00] -# GFX11: v_cmpx_eq_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0xba,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_eq_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_eq_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xca,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xca,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_eq_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xca,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_eq_u32_e64 v255, v255 ; encoding: [0x7e,0x00,0xca,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xca,0xd4,0xff,0xff,0x03,0x00 +# GFX11: v_cmpx_eq_u32_e64 v255, v255 ; encoding: [0x7e,0x00,0xca,0xd4,0xff,0xff,0x03,0x00] -# GFX11: v_cmpx_eq_u32_e64 s1, s2 ; encoding: [0x7e,0x00,0xca,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0xca,0xd4,0x01,0x04,0x00,0x00 +# GFX11: v_cmpx_eq_u32_e64 s1, s2 ; encoding: [0x7e,0x00,0xca,0xd4,0x01,0x04,0x00,0x00] -# GFX11: v_cmpx_eq_u32_e64 s105, s105 ; encoding: 
[0x7e,0x00,0xca,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0xca,0xd4,0x69,0xd2,0x00,0x00 +# GFX11: v_cmpx_eq_u32_e64 s105, s105 ; encoding: [0x7e,0x00,0xca,0xd4,0x69,0xd2,0x00,0x00] -# GFX11: v_cmpx_eq_u32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xca,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0xca,0xd4,0x6a,0xf6,0x00,0x00 +# GFX11: v_cmpx_eq_u32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xca,0xd4,0x6a,0xf6,0x00,0x00] -# GFX11: v_cmpx_eq_u32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xca,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xca,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_eq_u32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xca,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_eq_u32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xca,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0xca,0xd4,0x7b,0xfa,0x01,0x00 +# GFX11: v_cmpx_eq_u32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xca,0xd4,0x7b,0xfa,0x01,0x00] -# GFX11: v_cmpx_eq_u32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xca,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0xca,0xd4,0x7d,0xe0,0x01,0x00 +# GFX11: v_cmpx_eq_u32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xca,0xd4,0x7d,0xe0,0x01,0x00] -# GFX11: v_cmpx_eq_u32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xca,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0xca,0xd4,0x7e,0x82,0x01,0x00 +# GFX11: v_cmpx_eq_u32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xca,0xd4,0x7e,0x82,0x01,0x00] -# GFX11: v_cmpx_eq_u32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xca,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x00,0xca,0xd4,0x7f,0xf8,0x00,0x00 +# GFX11: v_cmpx_eq_u32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xca,0xd4,0x7f,0xf8,0x00,0x00] -# GFX11: v_cmpx_eq_u32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xca,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0xca,0xd4,0x7c,0xfc,0x00,0x00 +# GFX11: v_cmpx_eq_u32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xca,0xd4,0x7c,0xfc,0x00,0x00] -# GFX11: v_cmpx_eq_u32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xca,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0xca,0xd4,0xc1,0xfe,0x00,0x00 +# GFX11: v_cmpx_eq_u32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xca,0xd4,0xc1,0xfe,0x00,0x00] -# GFX11: v_cmpx_eq_u32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xca,0xd4,0xf0,0xfa,0x00,0x00] 0x7e,0x00,0xca,0xd4,0xf0,0xfa,0x00,0x00 +# GFX11: v_cmpx_eq_u32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xca,0xd4,0xf0,0xfa,0x00,0x00] -# GFX11: v_cmpx_eq_u32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xca,0xd4,0xfd,0xd4,0x00,0x00] 0x7e,0x00,0xca,0xd4,0xfd,0xd4,0x00,0x00 +# GFX11: v_cmpx_eq_u32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xca,0xd4,0xfd,0xd4,0x00,0x00] -# GFX11: v_cmpx_eq_u32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xca,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xca,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_eq_u32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xca,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_eq_u64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xda,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xda,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_eq_u64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xda,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_eq_u64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xda,0xd4,0xfe,0xfd,0x03,0x00] 0x7e,0x00,0xda,0xd4,0xfe,0xfd,0x03,0x00 +# GFX11: v_cmpx_eq_u64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xda,0xd4,0xfe,0xfd,0x03,0x00] -# GFX11: v_cmpx_eq_u64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xda,0xd4,0x02,0x08,0x00,0x00] 0x7e,0x00,0xda,0xd4,0x02,0x08,0x00,0x00 +# GFX11: v_cmpx_eq_u64_e64 s[2:3], s[4:5] ; encoding: 
[0x7e,0x00,0xda,0xd4,0x02,0x08,0x00,0x00] -# GFX11: v_cmpx_eq_u64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xda,0xd4,0x68,0xd0,0x00,0x00] 0x7e,0x00,0xda,0xd4,0x68,0xd0,0x00,0x00 +# GFX11: v_cmpx_eq_u64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xda,0xd4,0x68,0xd0,0x00,0x00] -# GFX11: v_cmpx_eq_u64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xda,0xd4,0x6a,0xf4,0x00,0x00] 0x7e,0x00,0xda,0xd4,0x6a,0xf4,0x00,0x00 +# GFX11: v_cmpx_eq_u64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xda,0xd4,0x6a,0xf4,0x00,0x00] -# GFX11: v_cmpx_eq_u64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xda,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xda,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_eq_u64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xda,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_eq_u64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xda,0xd4,0x7e,0xfa,0x01,0x00] 0x7e,0x00,0xda,0xd4,0x7e,0xfa,0x01,0x00 +# GFX11: v_cmpx_eq_u64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xda,0xd4,0x7e,0xfa,0x01,0x00] -# GFX11: v_cmpx_eq_u64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xda,0xd4,0x7c,0xe0,0x01,0x00] 0x7e,0x00,0xda,0xd4,0x7c,0xe0,0x01,0x00 +# GFX11: v_cmpx_eq_u64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xda,0xd4,0x7c,0xe0,0x01,0x00] -# GFX11: v_cmpx_eq_u64_e64 -1, -1 ; encoding: [0x7e,0x00,0xda,0xd4,0xc1,0x82,0x01,0x00] 0x7e,0x00,0xda,0xd4,0xc1,0x82,0x01,0x00 +# GFX11: v_cmpx_eq_u64_e64 -1, -1 ; encoding: [0x7e,0x00,0xda,0xd4,0xc1,0x82,0x01,0x00] -# GFX11: v_cmpx_eq_u64_e64 0.5, null ; encoding: [0x7e,0x00,0xda,0xd4,0xf0,0xf8,0x00,0x00] 0x7e,0x00,0xda,0xd4,0xf0,0xf8,0x00,0x00 +# GFX11: v_cmpx_eq_u64_e64 0.5, null ; encoding: [0x7e,0x00,0xda,0xd4,0xf0,0xf8,0x00,0x00] -# GFX11: v_cmpx_eq_u64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xda,0xd4,0xfd,0xfc,0x00,0x00] 0x7e,0x00,0xda,0xd4,0xfd,0xfc,0x00,0x00 +# GFX11: v_cmpx_eq_u64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xda,0xd4,0xfd,0xfc,0x00,0x00] -# GFX11: v_cmpx_eq_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xda,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xda,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_eq_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xda,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_f_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x80,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x80,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_f_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x80,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_f_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x80,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x80,0xd4,0xff,0xff,0x03,0x00 +# GFX11: v_cmpx_f_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x80,0xd4,0xff,0xff,0x03,0x00] -# GFX11: v_cmpx_f_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x80,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0x80,0xd4,0x01,0x04,0x00,0x00 +# GFX11: v_cmpx_f_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x80,0xd4,0x01,0x04,0x00,0x00] -# GFX11: v_cmpx_f_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x80,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0x80,0xd4,0x69,0xd2,0x00,0x00 +# GFX11: v_cmpx_f_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x80,0xd4,0x69,0xd2,0x00,0x00] -# GFX11: v_cmpx_f_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x80,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0x80,0xd4,0x6a,0xf6,0x00,0x00 +# GFX11: v_cmpx_f_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x80,0xd4,0x6a,0xf6,0x00,0x00] -# GFX11: v_cmpx_f_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x80,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 
0x7e,0x00,0x80,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_f_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x80,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_f_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x80,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0x80,0xd4,0x7b,0xfa,0x01,0x00 +# GFX11: v_cmpx_f_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x80,0xd4,0x7b,0xfa,0x01,0x00] -# GFX11: v_cmpx_f_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x80,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0x80,0xd4,0x7d,0xe0,0x01,0x00 +# GFX11: v_cmpx_f_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x80,0xd4,0x7d,0xe0,0x01,0x00] -# GFX11: v_cmpx_f_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x80,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0x80,0xd4,0x7e,0x82,0x01,0x00 +# GFX11: v_cmpx_f_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x80,0xd4,0x7e,0x82,0x01,0x00] -# GFX11: v_cmpx_f_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x80,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x01,0x80,0xd4,0x7f,0xf8,0x00,0x00 +# GFX11: v_cmpx_f_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x80,0xd4,0x7f,0xf8,0x00,0x00] -# GFX11: v_cmpx_f_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x80,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0x80,0xd4,0x7c,0xfc,0x00,0x00 +# GFX11: v_cmpx_f_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x80,0xd4,0x7c,0xfc,0x00,0x00] -# GFX11: v_cmpx_f_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x80,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0x80,0xd4,0xc1,0xfe,0x00,0x00 +# GFX11: v_cmpx_f_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x80,0xd4,0xc1,0xfe,0x00,0x00] -# GFX11: v_cmpx_f_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x80,0xd4,0xf0,0xfa,0x00,0x40] 0x7e,0x00,0x80,0xd4,0xf0,0xfa,0x00,0x40 +# GFX11: v_cmpx_f_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x80,0xd4,0xf0,0xfa,0x00,0x40] -# GFX11: v_cmpx_f_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x80,0xd4,0xfd,0xd4,0x00,0x20] 0x7e,0x02,0x80,0xd4,0xfd,0xd4,0x00,0x20 +# GFX11: v_cmpx_f_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x80,0xd4,0xfd,0xd4,0x00,0x20] -# GFX11: v_cmpx_f_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x80,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x83,0x80,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_f_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x80,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_f_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x90,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x90,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_f_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x90,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_f_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x90,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x90,0xd4,0xff,0xff,0x03,0x00 +# GFX11: v_cmpx_f_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x90,0xd4,0xff,0xff,0x03,0x00] -# GFX11: v_cmpx_f_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x90,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0x90,0xd4,0x01,0x04,0x00,0x00 +# GFX11: v_cmpx_f_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x90,0xd4,0x01,0x04,0x00,0x00] -# GFX11: v_cmpx_f_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x90,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0x90,0xd4,0x69,0xd2,0x00,0x00 +# GFX11: v_cmpx_f_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x90,0xd4,0x69,0xd2,0x00,0x00] -# GFX11: v_cmpx_f_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x90,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0x90,0xd4,0x6a,0xf6,0x00,0x00 +# GFX11: v_cmpx_f_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x90,0xd4,0x6a,0xf6,0x00,0x00] -# GFX11: v_cmpx_f_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x90,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 
 0x7e,0x00,0x90,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_f_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x90,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_f_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x90,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0x90,0xd4,0x7b,0xfa,0x01,0x00
+# GFX11: v_cmpx_f_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x90,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX11: v_cmpx_f_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x90,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0x90,0xd4,0x7d,0xe0,0x01,0x00
+# GFX11: v_cmpx_f_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x90,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX11: v_cmpx_f_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x90,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0x90,0xd4,0x7e,0x82,0x01,0x00
+# GFX11: v_cmpx_f_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x90,0xd4,0x7e,0x82,0x01,0x00]
-# GFX11: v_cmpx_f_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x90,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x90,0xd4,0x7f,0xf8,0x00,0x00
+# GFX11: v_cmpx_f_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x90,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX11: v_cmpx_f_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x90,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x90,0xd4,0x7c,0xfc,0x00,0x00
+# GFX11: v_cmpx_f_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x90,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX11: v_cmpx_f_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x90,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x90,0xd4,0xc1,0xfe,0x00,0x00
+# GFX11: v_cmpx_f_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x90,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX11: v_cmpx_f_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x90,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x90,0xd4,0xf0,0xfa,0x00,0x40
+# GFX11: v_cmpx_f_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x90,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX11: v_cmpx_f_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x90,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x90,0xd4,0xfd,0xd4,0x00,0x20
+# GFX11: v_cmpx_f_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x90,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX11: v_cmpx_f_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x90,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
 0x7e,0x83,0x90,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_f_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x90,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_f_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa0,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xa0,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_f_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa0,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_f_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa0,0xd4,0xfe,0xfd,0x03,0x00]
 0x7e,0x00,0xa0,0xd4,0xfe,0xfd,0x03,0x00
+# GFX11: v_cmpx_f_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa0,0xd4,0xfe,0xfd,0x03,0x00]
-# GFX11: v_cmpx_f_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa0,0xd4,0x02,0x08,0x00,0x00]
 0x7e,0x00,0xa0,0xd4,0x02,0x08,0x00,0x00
+# GFX11: v_cmpx_f_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa0,0xd4,0x02,0x08,0x00,0x00]
-# GFX11: v_cmpx_f_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa0,0xd4,0x68,0xd0,0x00,0x00]
 0x7e,0x00,0xa0,0xd4,0x68,0xd0,0x00,0x00
+# GFX11: v_cmpx_f_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa0,0xd4,0x68,0xd0,0x00,0x00]
-# GFX11: v_cmpx_f_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa0,0xd4,0x6a,0xf4,0x00,0x00]
 0x7e,0x00,0xa0,0xd4,0x6a,0xf4,0x00,0x00
+# GFX11: v_cmpx_f_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa0,0xd4,0x6a,0xf4,0x00,0x00]
-# GFX11: v_cmpx_f_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa0,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xa0,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_f_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa0,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_f_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa0,0xd4,0x7e,0xfa,0x01,0x20]
 0x7e,0x01,0xa0,0xd4,0x7e,0xfa,0x01,0x20
+# GFX11: v_cmpx_f_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa0,0xd4,0x7e,0xfa,0x01,0x20]
-# GFX11: v_cmpx_f_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa0,0xd4,0x7c,0xe0,0x01,0x00]
 0x7e,0x00,0xa0,0xd4,0x7c,0xe0,0x01,0x00
+# GFX11: v_cmpx_f_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa0,0xd4,0x7c,0xe0,0x01,0x00]
-# GFX11: v_cmpx_f_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa0,0xd4,0xc1,0x82,0x01,0x00]
 0x7e,0x00,0xa0,0xd4,0xc1,0x82,0x01,0x00
+# GFX11: v_cmpx_f_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa0,0xd4,0xc1,0x82,0x01,0x00]
-# GFX11: v_cmpx_f_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa0,0xd4,0xf0,0xf8,0x00,0x00]
 0x7e,0x00,0xa0,0xd4,0xf0,0xf8,0x00,0x00
+# GFX11: v_cmpx_f_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa0,0xd4,0xf0,0xf8,0x00,0x00]
-# GFX11: v_cmpx_f_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa0,0xd4,0xfd,0xfc,0x00,0x60]
 0x7e,0x03,0xa0,0xd4,0xfd,0xfc,0x00,0x60
+# GFX11: v_cmpx_f_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa0,0xd4,0xfd,0xfc,0x00,0x60]
-# GFX11: v_cmpx_f_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa0,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
 0x7e,0x82,0xa0,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_f_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa0,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_f_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc0,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xc0,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_f_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc0,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_f_i32_e64 v255, v255 ; encoding: [0x7e,0x00,0xc0,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0xc0,0xd4,0xff,0xff,0x03,0x00
+# GFX11: v_cmpx_f_i32_e64 v255, v255 ; encoding: [0x7e,0x00,0xc0,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_f_i32_e64 s1, s2 ; encoding: [0x7e,0x00,0xc0,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0xc0,0xd4,0x01,0x04,0x00,0x00
+# GFX11: v_cmpx_f_i32_e64 s1, s2 ; encoding: [0x7e,0x00,0xc0,0xd4,0x01,0x04,0x00,0x00]
-# GFX11: v_cmpx_f_i32_e64 s105, s105 ; encoding: [0x7e,0x00,0xc0,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0xc0,0xd4,0x69,0xd2,0x00,0x00
+# GFX11: v_cmpx_f_i32_e64 s105, s105 ; encoding: [0x7e,0x00,0xc0,0xd4,0x69,0xd2,0x00,0x00]
-# GFX11: v_cmpx_f_i32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xc0,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0xc0,0xd4,0x6a,0xf6,0x00,0x00
+# GFX11: v_cmpx_f_i32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xc0,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX11: v_cmpx_f_i32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xc0,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xc0,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_f_i32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xc0,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_f_i32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xc0,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0xc0,0xd4,0x7b,0xfa,0x01,0x00
+# GFX11: v_cmpx_f_i32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xc0,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX11: v_cmpx_f_i32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xc0,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0xc0,0xd4,0x7d,0xe0,0x01,0x00
+# GFX11: v_cmpx_f_i32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xc0,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX11: v_cmpx_f_i32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xc0,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0xc0,0xd4,0x7e,0x82,0x01,0x00
+# GFX11: v_cmpx_f_i32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xc0,0xd4,0x7e,0x82,0x01,0x00]
-# GFX11: v_cmpx_f_i32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xc0,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x00,0xc0,0xd4,0x7f,0xf8,0x00,0x00
+# GFX11: v_cmpx_f_i32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xc0,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX11: v_cmpx_f_i32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xc0,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0xc0,0xd4,0x7c,0xfc,0x00,0x00
+# GFX11: v_cmpx_f_i32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xc0,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX11: v_cmpx_f_i32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xc0,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0xc0,0xd4,0xc1,0xfe,0x00,0x00
+# GFX11: v_cmpx_f_i32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xc0,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX11: v_cmpx_f_i32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xc0,0xd4,0xf0,0xfa,0x00,0x00]
 0x7e,0x00,0xc0,0xd4,0xf0,0xfa,0x00,0x00
+# GFX11: v_cmpx_f_i32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xc0,0xd4,0xf0,0xfa,0x00,0x00]
-# GFX11: v_cmpx_f_i32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xc0,0xd4,0xfd,0xd4,0x00,0x00]
 0x7e,0x00,0xc0,0xd4,0xfd,0xd4,0x00,0x00
+# GFX11: v_cmpx_f_i32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xc0,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX11: v_cmpx_f_i32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xc0,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xc0,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_f_i32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xc0,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_f_i64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xd0,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xd0,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_f_i64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xd0,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_f_i64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xd0,0xd4,0xfe,0xfd,0x03,0x00]
 0x7e,0x00,0xd0,0xd4,0xfe,0xfd,0x03,0x00
+# GFX11: v_cmpx_f_i64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xd0,0xd4,0xfe,0xfd,0x03,0x00]
-# GFX11: v_cmpx_f_i64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xd0,0xd4,0x02,0x08,0x00,0x00]
 0x7e,0x00,0xd0,0xd4,0x02,0x08,0x00,0x00
+# GFX11: v_cmpx_f_i64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xd0,0xd4,0x02,0x08,0x00,0x00]
-# GFX11: v_cmpx_f_i64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xd0,0xd4,0x68,0xd0,0x00,0x00]
 0x7e,0x00,0xd0,0xd4,0x68,0xd0,0x00,0x00
+# GFX11: v_cmpx_f_i64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xd0,0xd4,0x68,0xd0,0x00,0x00]
-# GFX11: v_cmpx_f_i64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xd0,0xd4,0x6a,0xf4,0x00,0x00]
 0x7e,0x00,0xd0,0xd4,0x6a,0xf4,0x00,0x00
+# GFX11: v_cmpx_f_i64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xd0,0xd4,0x6a,0xf4,0x00,0x00]
-# GFX11: v_cmpx_f_i64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xd0,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xd0,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_f_i64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xd0,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_f_i64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xd0,0xd4,0x7e,0xfa,0x01,0x00]
 0x7e,0x00,0xd0,0xd4,0x7e,0xfa,0x01,0x00
+# GFX11: v_cmpx_f_i64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xd0,0xd4,0x7e,0xfa,0x01,0x00]
-# GFX11: v_cmpx_f_i64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xd0,0xd4,0x7c,0xe0,0x01,0x00]
 0x7e,0x00,0xd0,0xd4,0x7c,0xe0,0x01,0x00
+# GFX11: v_cmpx_f_i64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xd0,0xd4,0x7c,0xe0,0x01,0x00]
-# GFX11: v_cmpx_f_i64_e64 -1, -1 ; encoding: [0x7e,0x00,0xd0,0xd4,0xc1,0x82,0x01,0x00]
 0x7e,0x00,0xd0,0xd4,0xc1,0x82,0x01,0x00
+# GFX11: v_cmpx_f_i64_e64 -1, -1 ; encoding: [0x7e,0x00,0xd0,0xd4,0xc1,0x82,0x01,0x00]
-# GFX11: v_cmpx_f_i64_e64 0.5, null ; encoding: [0x7e,0x00,0xd0,0xd4,0xf0,0xf8,0x00,0x00]
 0x7e,0x00,0xd0,0xd4,0xf0,0xf8,0x00,0x00
+# GFX11: v_cmpx_f_i64_e64 0.5, null ; encoding: [0x7e,0x00,0xd0,0xd4,0xf0,0xf8,0x00,0x00]
-# GFX11: v_cmpx_f_i64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xd0,0xd4,0xfd,0xfc,0x00,0x00]
 0x7e,0x00,0xd0,0xd4,0xfd,0xfc,0x00,0x00
+# GFX11: v_cmpx_f_i64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xd0,0xd4,0xfd,0xfc,0x00,0x00]
-# GFX11: v_cmpx_f_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd0,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xd0,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_f_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd0,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_f_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc8,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xc8,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_f_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc8,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_f_u32_e64 v255, v255 ; encoding: [0x7e,0x00,0xc8,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0xc8,0xd4,0xff,0xff,0x03,0x00
+# GFX11: v_cmpx_f_u32_e64 v255, v255 ; encoding: [0x7e,0x00,0xc8,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_f_u32_e64 s1, s2 ; encoding: [0x7e,0x00,0xc8,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0xc8,0xd4,0x01,0x04,0x00,0x00
+# GFX11: v_cmpx_f_u32_e64 s1, s2 ; encoding: [0x7e,0x00,0xc8,0xd4,0x01,0x04,0x00,0x00]
-# GFX11: v_cmpx_f_u32_e64 s105, s105 ; encoding: [0x7e,0x00,0xc8,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0xc8,0xd4,0x69,0xd2,0x00,0x00
+# GFX11: v_cmpx_f_u32_e64 s105, s105 ; encoding: [0x7e,0x00,0xc8,0xd4,0x69,0xd2,0x00,0x00]
-# GFX11: v_cmpx_f_u32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xc8,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0xc8,0xd4,0x6a,0xf6,0x00,0x00
+# GFX11: v_cmpx_f_u32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xc8,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX11: v_cmpx_f_u32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xc8,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xc8,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_f_u32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xc8,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_f_u32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xc8,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0xc8,0xd4,0x7b,0xfa,0x01,0x00
+# GFX11: v_cmpx_f_u32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xc8,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX11: v_cmpx_f_u32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xc8,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0xc8,0xd4,0x7d,0xe0,0x01,0x00
+# GFX11: v_cmpx_f_u32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xc8,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX11: v_cmpx_f_u32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xc8,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0xc8,0xd4,0x7e,0x82,0x01,0x00
+# GFX11: v_cmpx_f_u32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xc8,0xd4,0x7e,0x82,0x01,0x00]
-# GFX11: v_cmpx_f_u32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xc8,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x00,0xc8,0xd4,0x7f,0xf8,0x00,0x00
+# GFX11: v_cmpx_f_u32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xc8,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX11: v_cmpx_f_u32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xc8,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0xc8,0xd4,0x7c,0xfc,0x00,0x00
+# GFX11: v_cmpx_f_u32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xc8,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX11: v_cmpx_f_u32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xc8,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0xc8,0xd4,0xc1,0xfe,0x00,0x00
+# GFX11: v_cmpx_f_u32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xc8,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX11: v_cmpx_f_u32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xc8,0xd4,0xf0,0xfa,0x00,0x00]
 0x7e,0x00,0xc8,0xd4,0xf0,0xfa,0x00,0x00
+# GFX11: v_cmpx_f_u32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xc8,0xd4,0xf0,0xfa,0x00,0x00]
-# GFX11: v_cmpx_f_u32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xc8,0xd4,0xfd,0xd4,0x00,0x00]
 0x7e,0x00,0xc8,0xd4,0xfd,0xd4,0x00,0x00
+# GFX11: v_cmpx_f_u32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xc8,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX11: v_cmpx_f_u32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xc8,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xc8,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_f_u32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xc8,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_f_u64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xd8,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xd8,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_f_u64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xd8,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_f_u64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xd8,0xd4,0xfe,0xfd,0x03,0x00]
 0x7e,0x00,0xd8,0xd4,0xfe,0xfd,0x03,0x00
+# GFX11: v_cmpx_f_u64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xd8,0xd4,0xfe,0xfd,0x03,0x00]
-# GFX11: v_cmpx_f_u64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xd8,0xd4,0x02,0x08,0x00,0x00]
 0x7e,0x00,0xd8,0xd4,0x02,0x08,0x00,0x00
+# GFX11: v_cmpx_f_u64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xd8,0xd4,0x02,0x08,0x00,0x00]
-# GFX11: v_cmpx_f_u64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xd8,0xd4,0x68,0xd0,0x00,0x00]
 0x7e,0x00,0xd8,0xd4,0x68,0xd0,0x00,0x00
+# GFX11: v_cmpx_f_u64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xd8,0xd4,0x68,0xd0,0x00,0x00]
-# GFX11: v_cmpx_f_u64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xd8,0xd4,0x6a,0xf4,0x00,0x00]
 0x7e,0x00,0xd8,0xd4,0x6a,0xf4,0x00,0x00
+# GFX11: v_cmpx_f_u64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xd8,0xd4,0x6a,0xf4,0x00,0x00]
-# GFX11: v_cmpx_f_u64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xd8,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xd8,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_f_u64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xd8,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_f_u64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xd8,0xd4,0x7e,0xfa,0x01,0x00]
 0x7e,0x00,0xd8,0xd4,0x7e,0xfa,0x01,0x00
+# GFX11: v_cmpx_f_u64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xd8,0xd4,0x7e,0xfa,0x01,0x00]
-# GFX11: v_cmpx_f_u64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xd8,0xd4,0x7c,0xe0,0x01,0x00]
 0x7e,0x00,0xd8,0xd4,0x7c,0xe0,0x01,0x00
+# GFX11: v_cmpx_f_u64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xd8,0xd4,0x7c,0xe0,0x01,0x00]
-# GFX11: v_cmpx_f_u64_e64 -1, -1 ; encoding: [0x7e,0x00,0xd8,0xd4,0xc1,0x82,0x01,0x00]
 0x7e,0x00,0xd8,0xd4,0xc1,0x82,0x01,0x00
+# GFX11: v_cmpx_f_u64_e64 -1, -1 ; encoding: [0x7e,0x00,0xd8,0xd4,0xc1,0x82,0x01,0x00]
-# GFX11: v_cmpx_f_u64_e64 0.5, null ; encoding: [0x7e,0x00,0xd8,0xd4,0xf0,0xf8,0x00,0x00]
 0x7e,0x00,0xd8,0xd4,0xf0,0xf8,0x00,0x00
+# GFX11: v_cmpx_f_u64_e64 0.5, null ; encoding: [0x7e,0x00,0xd8,0xd4,0xf0,0xf8,0x00,0x00]
-# GFX11: v_cmpx_f_u64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xd8,0xd4,0xfd,0xfc,0x00,0x00]
 0x7e,0x00,0xd8,0xd4,0xfd,0xfc,0x00,0x00
+# GFX11: v_cmpx_f_u64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xd8,0xd4,0xfd,0xfc,0x00,0x00]
-# GFX11: v_cmpx_f_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd8,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xd8,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_f_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd8,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_ge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_ge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_ge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00
+# GFX11: v_cmpx_ge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_ge_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0x86,0xd4,0x01,0x04,0x00,0x00
+# GFX11: v_cmpx_ge_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x04,0x00,0x00]
-# GFX11: v_cmpx_ge_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x86,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0x86,0xd4,0x69,0xd2,0x00,0x00
+# GFX11: v_cmpx_ge_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x86,0xd4,0x69,0xd2,0x00,0x00]
-# GFX11: v_cmpx_ge_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x86,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0x86,0xd4,0x6a,0xf6,0x00,0x00
+# GFX11: v_cmpx_ge_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x86,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX11: v_cmpx_ge_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x86,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x7e,0x00,0x86,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_ge_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x86,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_ge_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x86,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0x86,0xd4,0x7b,0xfa,0x01,0x00
+# GFX11: v_cmpx_ge_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x86,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX11: v_cmpx_ge_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x86,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0x86,0xd4,0x7d,0xe0,0x01,0x00
+# GFX11: v_cmpx_ge_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x86,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX11: v_cmpx_ge_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x86,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0x86,0xd4,0x7e,0x82,0x01,0x00
+# GFX11: v_cmpx_ge_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x86,0xd4,0x7e,0x82,0x01,0x00]
-# GFX11: v_cmpx_ge_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x86,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x86,0xd4,0x7f,0xf8,0x00,0x00
+# GFX11: v_cmpx_ge_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x86,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX11: v_cmpx_ge_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x86,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x86,0xd4,0x7c,0xfc,0x00,0x00
+# GFX11: v_cmpx_ge_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x86,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX11: v_cmpx_ge_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x86,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x86,0xd4,0xc1,0xfe,0x00,0x00
+# GFX11: v_cmpx_ge_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x86,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX11: v_cmpx_ge_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x86,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x86,0xd4,0xf0,0xfa,0x00,0x40
+# GFX11: v_cmpx_ge_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x86,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX11: v_cmpx_ge_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x86,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x86,0xd4,0xfd,0xd4,0x00,0x20
+# GFX11: v_cmpx_ge_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x86,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX11: v_cmpx_ge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x86,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
 0x7e,0x83,0x86,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_ge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x86,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_ge_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x96,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0x96,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_ge_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x96,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_ge_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x96,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0x96,0xd4,0xff,0xff,0x03,0x00
+# GFX11: v_cmpx_ge_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x96,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_ge_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x96,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0x96,0xd4,0x01,0x04,0x00,0x00
+# GFX11: v_cmpx_ge_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x96,0xd4,0x01,0x04,0x00,0x00]
-# GFX11: v_cmpx_ge_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x96,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0x96,0xd4,0x69,0xd2,0x00,0x00
+# GFX11: v_cmpx_ge_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x96,0xd4,0x69,0xd2,0x00,0x00]
-# GFX11: v_cmpx_ge_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x96,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0x96,0xd4,0x6a,0xf6,0x00,0x00
+# GFX11: v_cmpx_ge_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x96,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX11: v_cmpx_ge_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x96,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0x96,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_ge_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x96,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_ge_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x96,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0x96,0xd4,0x7b,0xfa,0x01,0x00
+# GFX11: v_cmpx_ge_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x96,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX11: v_cmpx_ge_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x96,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0x96,0xd4,0x7d,0xe0,0x01,0x00
+# GFX11: v_cmpx_ge_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x96,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX11: v_cmpx_ge_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x96,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0x96,0xd4,0x7e,0x82,0x01,0x00
+# GFX11: v_cmpx_ge_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x96,0xd4,0x7e,0x82,0x01,0x00]
-# GFX11: v_cmpx_ge_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x96,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x96,0xd4,0x7f,0xf8,0x00,0x00
+# GFX11: v_cmpx_ge_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x96,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX11: v_cmpx_ge_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x96,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x96,0xd4,0x7c,0xfc,0x00,0x00
+# GFX11: v_cmpx_ge_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x96,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX11: v_cmpx_ge_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x96,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x96,0xd4,0xc1,0xfe,0x00,0x00
+# GFX11: v_cmpx_ge_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x96,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX11: v_cmpx_ge_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x96,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x96,0xd4,0xf0,0xfa,0x00,0x40
+# GFX11: v_cmpx_ge_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x96,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX11: v_cmpx_ge_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x96,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x96,0xd4,0xfd,0xd4,0x00,0x20
+# GFX11: v_cmpx_ge_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x96,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX11: v_cmpx_ge_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x96,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
 0x7e,0x83,0x96,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_ge_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x96,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_ge_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa6,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xa6,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_ge_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa6,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_ge_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa6,0xd4,0xfe,0xfd,0x03,0x00]
 0x7e,0x00,0xa6,0xd4,0xfe,0xfd,0x03,0x00
+# GFX11: v_cmpx_ge_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa6,0xd4,0xfe,0xfd,0x03,0x00]
-# GFX11: v_cmpx_ge_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa6,0xd4,0x02,0x08,0x00,0x00]
 0x7e,0x00,0xa6,0xd4,0x02,0x08,0x00,0x00
+# GFX11: v_cmpx_ge_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa6,0xd4,0x02,0x08,0x00,0x00]
-# GFX11: v_cmpx_ge_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa6,0xd4,0x68,0xd0,0x00,0x00]
 0x7e,0x00,0xa6,0xd4,0x68,0xd0,0x00,0x00
+# GFX11: v_cmpx_ge_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa6,0xd4,0x68,0xd0,0x00,0x00]
-# GFX11: v_cmpx_ge_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa6,0xd4,0x6a,0xf4,0x00,0x00]
 0x7e,0x00,0xa6,0xd4,0x6a,0xf4,0x00,0x00
+# GFX11: v_cmpx_ge_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa6,0xd4,0x6a,0xf4,0x00,0x00]
-# GFX11: v_cmpx_ge_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa6,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xa6,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_ge_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa6,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_ge_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa6,0xd4,0x7e,0xfa,0x01,0x20]
 0x7e,0x01,0xa6,0xd4,0x7e,0xfa,0x01,0x20
+# GFX11: v_cmpx_ge_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa6,0xd4,0x7e,0xfa,0x01,0x20]
-# GFX11: v_cmpx_ge_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa6,0xd4,0x7c,0xe0,0x01,0x00]
 0x7e,0x00,0xa6,0xd4,0x7c,0xe0,0x01,0x00
+# GFX11: v_cmpx_ge_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa6,0xd4,0x7c,0xe0,0x01,0x00]
-# GFX11: v_cmpx_ge_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa6,0xd4,0xc1,0x82,0x01,0x00]
 0x7e,0x00,0xa6,0xd4,0xc1,0x82,0x01,0x00
+# GFX11: v_cmpx_ge_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa6,0xd4,0xc1,0x82,0x01,0x00]
-# GFX11: v_cmpx_ge_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa6,0xd4,0xf0,0xf8,0x00,0x00]
 0x7e,0x00,0xa6,0xd4,0xf0,0xf8,0x00,0x00
+# GFX11: v_cmpx_ge_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa6,0xd4,0xf0,0xf8,0x00,0x00]
-# GFX11: v_cmpx_ge_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa6,0xd4,0xfd,0xfc,0x00,0x60]
 0x7e,0x03,0xa6,0xd4,0xfd,0xfc,0x00,0x60
+# GFX11: v_cmpx_ge_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa6,0xd4,0xfd,0xfc,0x00,0x60]
-# GFX11: v_cmpx_ge_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa6,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
 0x7e,0x82,0xa6,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_ge_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa6,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_ge_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb6,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xb6,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_ge_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb6,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_ge_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0xb6,0xd4,0xff,0xff,0x03,0x00
+# GFX11: v_cmpx_ge_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_ge_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb6,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0xb6,0xd4,0x01,0x04,0x00,0x00
+# GFX11: v_cmpx_ge_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb6,0xd4,0x01,0x04,0x00,0x00]
-# GFX11: v_cmpx_ge_i16_e64 s105, s105 ; encoding: [0x7e,0x00,0xb6,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0xb6,0xd4,0x69,0xd2,0x00,0x00
+# GFX11: v_cmpx_ge_i16_e64 s105, s105 ; encoding: [0x7e,0x00,0xb6,0xd4,0x69,0xd2,0x00,0x00]
-# GFX11: v_cmpx_ge_i16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xb6,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0xb6,0xd4,0x6a,0xf6,0x00,0x00
+# GFX11: v_cmpx_ge_i16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xb6,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX11: v_cmpx_ge_i16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xb6,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x7e,0x00,0xb6,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_ge_i16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xb6,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_ge_i16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xb6,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0xb6,0xd4,0x7b,0xfa,0x01,0x00
+# GFX11: v_cmpx_ge_i16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xb6,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX11: v_cmpx_ge_i16_e64 m0, 0x3800
 0x7e,0x00,0xb6,0xd4,0x7d,0xe0,0x01,0x00
+# GFX11: v_cmpx_ge_i16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xb6,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# GFX11: v_cmpx_ge_i16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xb6,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0xb6,0xd4,0x7e,0x82,0x01,0x00
+# GFX11: v_cmpx_ge_i16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xb6,0xd4,0x7e,0x82,0x01,0x00]
-# GFX11: v_cmpx_ge_i16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xb6,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x00,0xb6,0xd4,0x7f,0xf8,0x00,0x00
+# GFX11: v_cmpx_ge_i16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xb6,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX11: v_cmpx_ge_i16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xb6,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0xb6,0xd4,0x7c,0xfc,0x00,0x00
+# GFX11: v_cmpx_ge_i16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xb6,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX11: v_cmpx_ge_i16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xb6,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0xb6,0xd4,0xc1,0xfe,0x00,0x00
+# GFX11: v_cmpx_ge_i16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xb6,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX11: v_cmpx_ge_i16_e64 0x3800, m0
 0x7e,0x00,0xb6,0xd4,0xf0,0xfa,0x00,0x00
+# GFX11: v_cmpx_ge_i16_e64 0x3800, m0 ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# GFX11: v_cmpx_ge_i16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xb6,0xd4,0xfd,0xd4,0x00,0x00]
 0x7e,0x00,0xb6,0xd4,0xfd,0xd4,0x00,0x00
+# GFX11: v_cmpx_ge_i16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xb6,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX11: v_cmpx_ge_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 0x7e,0x00,0xb6,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_ge_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_ge_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc6,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xc6,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_ge_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc6,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_ge_i32_e64 v255, v255 ; encoding: [0x7e,0x00,0xc6,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0xc6,0xd4,0xff,0xff,0x03,0x00
+# GFX11: v_cmpx_ge_i32_e64 v255, v255 ; encoding: [0x7e,0x00,0xc6,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_ge_i32_e64 s1, s2 ; encoding: [0x7e,0x00,0xc6,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0xc6,0xd4,0x01,0x04,0x00,0x00
+# GFX11: v_cmpx_ge_i32_e64 s1, s2 ; encoding: [0x7e,0x00,0xc6,0xd4,0x01,0x04,0x00,0x00]
-# GFX11: v_cmpx_ge_i32_e64 s105, s105 ; encoding: [0x7e,0x00,0xc6,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0xc6,0xd4,0x69,0xd2,0x00,0x00
+# GFX11: v_cmpx_ge_i32_e64 s105, s105 ; encoding: [0x7e,0x00,0xc6,0xd4,0x69,0xd2,0x00,0x00]
-# GFX11: v_cmpx_ge_i32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xc6,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0xc6,0xd4,0x6a,0xf6,0x00,0x00
+# GFX11: v_cmpx_ge_i32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xc6,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX11: v_cmpx_ge_i32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xc6,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xc6,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_ge_i32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xc6,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_ge_i32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xc6,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0xc6,0xd4,0x7b,0xfa,0x01,0x00
+# GFX11: v_cmpx_ge_i32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xc6,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX11: v_cmpx_ge_i32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xc6,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0xc6,0xd4,0x7d,0xe0,0x01,0x00
+# GFX11: v_cmpx_ge_i32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xc6,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX11: v_cmpx_ge_i32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xc6,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0xc6,0xd4,0x7e,0x82,0x01,0x00
+# GFX11: v_cmpx_ge_i32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xc6,0xd4,0x7e,0x82,0x01,0x00]
-# GFX11: v_cmpx_ge_i32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xc6,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x00,0xc6,0xd4,0x7f,0xf8,0x00,0x00
+# GFX11: v_cmpx_ge_i32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xc6,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX11: v_cmpx_ge_i32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xc6,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0xc6,0xd4,0x7c,0xfc,0x00,0x00
+# GFX11: v_cmpx_ge_i32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xc6,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX11: v_cmpx_ge_i32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xc6,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0xc6,0xd4,0xc1,0xfe,0x00,0x00
+# GFX11: v_cmpx_ge_i32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xc6,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX11: v_cmpx_ge_i32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xc6,0xd4,0xf0,0xfa,0x00,0x00]
 0x7e,0x00,0xc6,0xd4,0xf0,0xfa,0x00,0x00
+# GFX11: v_cmpx_ge_i32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xc6,0xd4,0xf0,0xfa,0x00,0x00]
-# GFX11: v_cmpx_ge_i32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xc6,0xd4,0xfd,0xd4,0x00,0x00]
 0x7e,0x00,0xc6,0xd4,0xfd,0xd4,0x00,0x00
+# GFX11: v_cmpx_ge_i32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xc6,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX11: v_cmpx_ge_i32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xc6,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xc6,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_ge_i32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xc6,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_ge_i64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xd6,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xd6,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_ge_i64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xd6,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_ge_i64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xd6,0xd4,0xfe,0xfd,0x03,0x00]
 0x7e,0x00,0xd6,0xd4,0xfe,0xfd,0x03,0x00
+# GFX11: v_cmpx_ge_i64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xd6,0xd4,0xfe,0xfd,0x03,0x00]
-# GFX11: v_cmpx_ge_i64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xd6,0xd4,0x02,0x08,0x00,0x00]
 0x7e,0x00,0xd6,0xd4,0x02,0x08,0x00,0x00
+# GFX11: v_cmpx_ge_i64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xd6,0xd4,0x02,0x08,0x00,0x00]
-# GFX11: v_cmpx_ge_i64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xd6,0xd4,0x68,0xd0,0x00,0x00]
 0x7e,0x00,0xd6,0xd4,0x68,0xd0,0x00,0x00
+# GFX11: v_cmpx_ge_i64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xd6,0xd4,0x68,0xd0,0x00,0x00]
-# GFX11: v_cmpx_ge_i64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xd6,0xd4,0x6a,0xf4,0x00,0x00]
 0x7e,0x00,0xd6,0xd4,0x6a,0xf4,0x00,0x00
+# GFX11: v_cmpx_ge_i64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xd6,0xd4,0x6a,0xf4,0x00,0x00]
-# GFX11: v_cmpx_ge_i64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xd6,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xd6,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_ge_i64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xd6,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_ge_i64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xd6,0xd4,0x7e,0xfa,0x01,0x00]
 0x7e,0x00,0xd6,0xd4,0x7e,0xfa,0x01,0x00
+# GFX11: v_cmpx_ge_i64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xd6,0xd4,0x7e,0xfa,0x01,0x00]
-# GFX11: v_cmpx_ge_i64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xd6,0xd4,0x7c,0xe0,0x01,0x00]
 0x7e,0x00,0xd6,0xd4,0x7c,0xe0,0x01,0x00
+# GFX11: v_cmpx_ge_i64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xd6,0xd4,0x7c,0xe0,0x01,0x00]
-# GFX11: v_cmpx_ge_i64_e64 -1, -1 ; encoding: [0x7e,0x00,0xd6,0xd4,0xc1,0x82,0x01,0x00]
 0x7e,0x00,0xd6,0xd4,0xc1,0x82,0x01,0x00
+# GFX11: v_cmpx_ge_i64_e64 -1, -1 ; encoding: [0x7e,0x00,0xd6,0xd4,0xc1,0x82,0x01,0x00]
-# GFX11: v_cmpx_ge_i64_e64 0.5, null ; encoding: [0x7e,0x00,0xd6,0xd4,0xf0,0xf8,0x00,0x00]
 0x7e,0x00,0xd6,0xd4,0xf0,0xf8,0x00,0x00
+# GFX11: v_cmpx_ge_i64_e64 0.5, null ; encoding: [0x7e,0x00,0xd6,0xd4,0xf0,0xf8,0x00,0x00]
-# GFX11: v_cmpx_ge_i64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xd6,0xd4,0xfd,0xfc,0x00,0x00]
 0x7e,0x00,0xd6,0xd4,0xfd,0xfc,0x00,0x00
+# GFX11: v_cmpx_ge_i64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xd6,0xd4,0xfd,0xfc,0x00,0x00]
-# GFX11: v_cmpx_ge_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd6,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xd6,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_ge_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd6,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_ge_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbe,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xbe,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_ge_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbe,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_ge_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0xbe,0xd4,0xff,0xff,0x03,0x00
+# GFX11: v_cmpx_ge_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_ge_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbe,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0xbe,0xd4,0x01,0x04,0x00,0x00
+# GFX11: v_cmpx_ge_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbe,0xd4,0x01,0x04,0x00,0x00]
-# GFX11: v_cmpx_ge_u16_e64 s105, s105 ; encoding: [0x7e,0x00,0xbe,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0xbe,0xd4,0x69,0xd2,0x00,0x00
+# GFX11: v_cmpx_ge_u16_e64 s105, s105 ; encoding: [0x7e,0x00,0xbe,0xd4,0x69,0xd2,0x00,0x00]
-# GFX11: v_cmpx_ge_u16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xbe,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0xbe,0xd4,0x6a,0xf6,0x00,0x00
+# GFX11: v_cmpx_ge_u16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xbe,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX11: v_cmpx_ge_u16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xbe,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x7e,0x00,0xbe,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_ge_u16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xbe,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_ge_u16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xbe,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0xbe,0xd4,0x7b,0xfa,0x01,0x00
+# GFX11: v_cmpx_ge_u16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xbe,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX11: v_cmpx_ge_u16_e64 m0, 0x3800
 0x7e,0x00,0xbe,0xd4,0x7d,0xe0,0x01,0x00
+# GFX11: v_cmpx_ge_u16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xbe,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# GFX11: v_cmpx_ge_u16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xbe,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0xbe,0xd4,0x7e,0x82,0x01,0x00
+# GFX11: v_cmpx_ge_u16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xbe,0xd4,0x7e,0x82,0x01,0x00]
-# GFX11: v_cmpx_ge_u16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xbe,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x00,0xbe,0xd4,0x7f,0xf8,0x00,0x00
+# GFX11: v_cmpx_ge_u16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xbe,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX11: v_cmpx_ge_u16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xbe,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0xbe,0xd4,0x7c,0xfc,0x00,0x00
+# GFX11: v_cmpx_ge_u16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xbe,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX11: v_cmpx_ge_u16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xbe,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0xbe,0xd4,0xc1,0xfe,0x00,0x00
+# GFX11: v_cmpx_ge_u16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xbe,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX11: v_cmpx_ge_u16_e64 0x3800, m0
 0x7e,0x00,0xbe,0xd4,0xf0,0xfa,0x00,0x00
+# GFX11: v_cmpx_ge_u16_e64 0x3800, m0 ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# GFX11: v_cmpx_ge_u16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xbe,0xd4,0xfd,0xd4,0x00,0x00]
 0x7e,0x00,0xbe,0xd4,0xfd,0xd4,0x00,0x00
+# GFX11: v_cmpx_ge_u16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xbe,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX11: v_cmpx_ge_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 0x7e,0x00,0xbe,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_ge_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_ge_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xce,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xce,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_ge_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xce,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_ge_u32_e64 v255, v255 ; encoding: [0x7e,0x00,0xce,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0xce,0xd4,0xff,0xff,0x03,0x00
+# GFX11: v_cmpx_ge_u32_e64 v255, v255 ; encoding: [0x7e,0x00,0xce,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_ge_u32_e64 s1, s2 ; encoding: [0x7e,0x00,0xce,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0xce,0xd4,0x01,0x04,0x00,0x00
+# GFX11: v_cmpx_ge_u32_e64 s1, s2 ; encoding: [0x7e,0x00,0xce,0xd4,0x01,0x04,0x00,0x00]
-# GFX11: v_cmpx_ge_u32_e64 s105, s105 ; encoding: [0x7e,0x00,0xce,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0xce,0xd4,0x69,0xd2,0x00,0x00
+# GFX11: v_cmpx_ge_u32_e64 s105, s105 ; encoding: [0x7e,0x00,0xce,0xd4,0x69,0xd2,0x00,0x00]
-# GFX11: v_cmpx_ge_u32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xce,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0xce,0xd4,0x6a,0xf6,0x00,0x00
+# GFX11: v_cmpx_ge_u32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xce,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX11: v_cmpx_ge_u32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xce,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xce,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_ge_u32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xce,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_ge_u32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xce,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0xce,0xd4,0x7b,0xfa,0x01,0x00
+# GFX11: v_cmpx_ge_u32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xce,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX11: v_cmpx_ge_u32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xce,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0xce,0xd4,0x7d,0xe0,0x01,0x00
+# GFX11: v_cmpx_ge_u32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xce,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX11: v_cmpx_ge_u32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xce,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0xce,0xd4,0x7e,0x82,0x01,0x00
+# GFX11: v_cmpx_ge_u32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xce,0xd4,0x7e,0x82,0x01,0x00]
-# GFX11: v_cmpx_ge_u32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xce,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x00,0xce,0xd4,0x7f,0xf8,0x00,0x00
+# GFX11: v_cmpx_ge_u32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xce,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX11: v_cmpx_ge_u32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xce,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0xce,0xd4,0x7c,0xfc,0x00,0x00
+# GFX11: v_cmpx_ge_u32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xce,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX11: v_cmpx_ge_u32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xce,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0xce,0xd4,0xc1,0xfe,0x00,0x00
+# GFX11: v_cmpx_ge_u32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xce,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX11: v_cmpx_ge_u32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xce,0xd4,0xf0,0xfa,0x00,0x00]
 0x7e,0x00,0xce,0xd4,0xf0,0xfa,0x00,0x00
+# GFX11: v_cmpx_ge_u32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xce,0xd4,0xf0,0xfa,0x00,0x00]
-# GFX11: v_cmpx_ge_u32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xce,0xd4,0xfd,0xd4,0x00,0x00]
 0x7e,0x00,0xce,0xd4,0xfd,0xd4,0x00,0x00
+# GFX11: v_cmpx_ge_u32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xce,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX11: v_cmpx_ge_u32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xce,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xce,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_ge_u32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xce,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_ge_u64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xde,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xde,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_ge_u64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xde,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_ge_u64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xde,0xd4,0xfe,0xfd,0x03,0x00]
 0x7e,0x00,0xde,0xd4,0xfe,0xfd,0x03,0x00
+# GFX11: v_cmpx_ge_u64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xde,0xd4,0xfe,0xfd,0x03,0x00]
-# GFX11: v_cmpx_ge_u64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xde,0xd4,0x02,0x08,0x00,0x00]
 0x7e,0x00,0xde,0xd4,0x02,0x08,0x00,0x00
+# GFX11: v_cmpx_ge_u64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xde,0xd4,0x02,0x08,0x00,0x00]
-# GFX11: v_cmpx_ge_u64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xde,0xd4,0x68,0xd0,0x00,0x00]
 0x7e,0x00,0xde,0xd4,0x68,0xd0,0x00,0x00
+# GFX11: v_cmpx_ge_u64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xde,0xd4,0x68,0xd0,0x00,0x00]
-# GFX11: v_cmpx_ge_u64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xde,0xd4,0x6a,0xf4,0x00,0x00]
 0x7e,0x00,0xde,0xd4,0x6a,0xf4,0x00,0x00
+# GFX11: v_cmpx_ge_u64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xde,0xd4,0x6a,0xf4,0x00,0x00]
-# GFX11: v_cmpx_ge_u64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xde,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xde,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_ge_u64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xde,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_ge_u64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xde,0xd4,0x7e,0xfa,0x01,0x00]
 0x7e,0x00,0xde,0xd4,0x7e,0xfa,0x01,0x00
+# GFX11: v_cmpx_ge_u64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xde,0xd4,0x7e,0xfa,0x01,0x00]
-# GFX11: v_cmpx_ge_u64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xde,0xd4,0x7c,0xe0,0x01,0x00]
 0x7e,0x00,0xde,0xd4,0x7c,0xe0,0x01,0x00
+# GFX11: v_cmpx_ge_u64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xde,0xd4,0x7c,0xe0,0x01,0x00]
-# GFX11: v_cmpx_ge_u64_e64 -1, -1 ; encoding: [0x7e,0x00,0xde,0xd4,0xc1,0x82,0x01,0x00]
 0x7e,0x00,0xde,0xd4,0xc1,0x82,0x01,0x00
+# GFX11: v_cmpx_ge_u64_e64 -1, -1 ; encoding: [0x7e,0x00,0xde,0xd4,0xc1,0x82,0x01,0x00]
-# GFX11: v_cmpx_ge_u64_e64 0.5, null ; encoding: [0x7e,0x00,0xde,0xd4,0xf0,0xf8,0x00,0x00]
 0x7e,0x00,0xde,0xd4,0xf0,0xf8,0x00,0x00
+# GFX11: v_cmpx_ge_u64_e64 0.5, null ; encoding: [0x7e,0x00,0xde,0xd4,0xf0,0xf8,0x00,0x00]
-# GFX11: v_cmpx_ge_u64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xde,0xd4,0xfd,0xfc,0x00,0x00]
 0x7e,0x00,0xde,0xd4,0xfd,0xfc,0x00,0x00
+# GFX11: v_cmpx_ge_u64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xde,0xd4,0xfd,0xfc,0x00,0x00]
-# GFX11: v_cmpx_ge_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xde,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xde,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_ge_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xde,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_gt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_gt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_gt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00
+# GFX11: v_cmpx_gt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_gt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0x84,0xd4,0x01,0x04,0x00,0x00
+# GFX11: v_cmpx_gt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x04,0x00,0x00]
-# GFX11: v_cmpx_gt_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x84,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0x84,0xd4,0x69,0xd2,0x00,0x00
+# GFX11: v_cmpx_gt_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x84,0xd4,0x69,0xd2,0x00,0x00]
-# GFX11: v_cmpx_gt_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x84,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0x84,0xd4,0x6a,0xf6,0x00,0x00
+# GFX11: v_cmpx_gt_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x84,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX11: v_cmpx_gt_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x84,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x7e,0x00,0x84,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_gt_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x84,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_gt_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x84,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0x84,0xd4,0x7b,0xfa,0x01,0x00
+# GFX11: v_cmpx_gt_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x84,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX11: v_cmpx_gt_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x84,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0x84,0xd4,0x7d,0xe0,0x01,0x00
+# GFX11: v_cmpx_gt_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x84,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX11: v_cmpx_gt_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x84,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0x84,0xd4,0x7e,0x82,0x01,0x00
+# GFX11: v_cmpx_gt_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x84,0xd4,0x7e,0x82,0x01,0x00]
-# GFX11: v_cmpx_gt_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x84,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x84,0xd4,0x7f,0xf8,0x00,0x00
+# GFX11: v_cmpx_gt_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x84,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX11: v_cmpx_gt_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x84,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x84,0xd4,0x7c,0xfc,0x00,0x00
+# GFX11: v_cmpx_gt_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x84,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX11: v_cmpx_gt_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x84,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x84,0xd4,0xc1,0xfe,0x00,0x00
+# GFX11: v_cmpx_gt_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x84,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX11: v_cmpx_gt_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x84,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x84,0xd4,0xf0,0xfa,0x00,0x40
+# GFX11: v_cmpx_gt_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x84,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX11: v_cmpx_gt_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x84,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x84,0xd4,0xfd,0xd4,0x00,0x20
+# GFX11: v_cmpx_gt_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x84,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX11: v_cmpx_gt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x84,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
 0x7e,0x83,0x84,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_gt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x84,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_gt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x94,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0x94,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_gt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x94,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_gt_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x94,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0x94,0xd4,0xff,0xff,0x03,0x00
+# GFX11: v_cmpx_gt_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x94,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_gt_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x94,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0x94,0xd4,0x01,0x04,0x00,0x00
+# GFX11: v_cmpx_gt_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x94,0xd4,0x01,0x04,0x00,0x00]
-# GFX11: v_cmpx_gt_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x94,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0x94,0xd4,0x69,0xd2,0x00,0x00
+# GFX11: v_cmpx_gt_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x94,0xd4,0x69,0xd2,0x00,0x00]
-# GFX11: v_cmpx_gt_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x94,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0x94,0xd4,0x6a,0xf6,0x00,0x00
+# GFX11: v_cmpx_gt_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x94,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX11: v_cmpx_gt_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x94,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0x94,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_gt_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x94,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_gt_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x94,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0x94,0xd4,0x7b,0xfa,0x01,0x00
+# GFX11: v_cmpx_gt_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x94,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX11: v_cmpx_gt_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x94,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0x94,0xd4,0x7d,0xe0,0x01,0x00
+# GFX11: v_cmpx_gt_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x94,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX11: v_cmpx_gt_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x94,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0x94,0xd4,0x7e,0x82,0x01,0x00
+# GFX11: v_cmpx_gt_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x94,0xd4,0x7e,0x82,0x01,0x00]
-# GFX11: v_cmpx_gt_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x94,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x94,0xd4,0x7f,0xf8,0x00,0x00
+# GFX11: v_cmpx_gt_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x94,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX11: v_cmpx_gt_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x94,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x94,0xd4,0x7c,0xfc,0x00,0x00
+# GFX11: v_cmpx_gt_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x94,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX11: v_cmpx_gt_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x94,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x94,0xd4,0xc1,0xfe,0x00,0x00
+# GFX11: v_cmpx_gt_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x94,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX11: v_cmpx_gt_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x94,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x94,0xd4,0xf0,0xfa,0x00,0x40
+# GFX11: v_cmpx_gt_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x94,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX11: v_cmpx_gt_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x94,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x94,0xd4,0xfd,0xd4,0x00,0x20
+# GFX11: v_cmpx_gt_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x94,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX11: v_cmpx_gt_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x94,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
 0x7e,0x83,0x94,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_gt_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x94,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_gt_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa4,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xa4,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_gt_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa4,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_gt_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa4,0xd4,0xfe,0xfd,0x03,0x00]
 0x7e,0x00,0xa4,0xd4,0xfe,0xfd,0x03,0x00
+# GFX11: v_cmpx_gt_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa4,0xd4,0xfe,0xfd,0x03,0x00]
-# GFX11: v_cmpx_gt_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa4,0xd4,0x02,0x08,0x00,0x00]
 0x7e,0x00,0xa4,0xd4,0x02,0x08,0x00,0x00
+# GFX11: v_cmpx_gt_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa4,0xd4,0x02,0x08,0x00,0x00]
-# GFX11: v_cmpx_gt_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa4,0xd4,0x68,0xd0,0x00,0x00]
 0x7e,0x00,0xa4,0xd4,0x68,0xd0,0x00,0x00
+# GFX11: v_cmpx_gt_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa4,0xd4,0x68,0xd0,0x00,0x00]
-# GFX11: v_cmpx_gt_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa4,0xd4,0x6a,0xf4,0x00,0x00]
 0x7e,0x00,0xa4,0xd4,0x6a,0xf4,0x00,0x00
+# GFX11: v_cmpx_gt_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa4,0xd4,0x6a,0xf4,0x00,0x00]
-# GFX11: v_cmpx_gt_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa4,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xa4,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_gt_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa4,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_gt_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa4,0xd4,0x7e,0xfa,0x01,0x20]
 0x7e,0x01,0xa4,0xd4,0x7e,0xfa,0x01,0x20
+# GFX11: v_cmpx_gt_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa4,0xd4,0x7e,0xfa,0x01,0x20]
-# GFX11: v_cmpx_gt_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa4,0xd4,0x7c,0xe0,0x01,0x00]
 0x7e,0x00,0xa4,0xd4,0x7c,0xe0,0x01,0x00
+# GFX11: v_cmpx_gt_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa4,0xd4,0x7c,0xe0,0x01,0x00]
-# GFX11: v_cmpx_gt_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa4,0xd4,0xc1,0x82,0x01,0x00]
 0x7e,0x00,0xa4,0xd4,0xc1,0x82,0x01,0x00
+# GFX11: v_cmpx_gt_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa4,0xd4,0xc1,0x82,0x01,0x00]
-# GFX11: v_cmpx_gt_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa4,0xd4,0xf0,0xf8,0x00,0x00]
 0x7e,0x00,0xa4,0xd4,0xf0,0xf8,0x00,0x00
+# GFX11: v_cmpx_gt_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa4,0xd4,0xf0,0xf8,0x00,0x00]
-# GFX11: v_cmpx_gt_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa4,0xd4,0xfd,0xfc,0x00,0x60]
 0x7e,0x03,0xa4,0xd4,0xfd,0xfc,0x00,0x60
+# GFX11: v_cmpx_gt_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa4,0xd4,0xfd,0xfc,0x00,0x60]
-# GFX11: v_cmpx_gt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa4,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
 0x7e,0x82,0xa4,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_gt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa4,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_gt_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb4,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xb4,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_gt_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb4,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_gt_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0xb4,0xd4,0xff,0xff,0x03,0x00
+# GFX11: v_cmpx_gt_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_gt_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb4,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0xb4,0xd4,0x01,0x04,0x00,0x00
+# GFX11: v_cmpx_gt_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb4,0xd4,0x01,0x04,0x00,0x00]
-# GFX11: v_cmpx_gt_i16_e64 s105, s105 ; encoding: [0x7e,0x00,0xb4,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0xb4,0xd4,0x69,0xd2,0x00,0x00
+# GFX11: v_cmpx_gt_i16_e64 s105, s105 ; encoding: [0x7e,0x00,0xb4,0xd4,0x69,0xd2,0x00,0x00]
-# GFX11: v_cmpx_gt_i16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xb4,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0xb4,0xd4,0x6a,0xf6,0x00,0x00
+# GFX11: v_cmpx_gt_i16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xb4,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX11: v_cmpx_gt_i16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xb4,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x7e,0x00,0xb4,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_gt_i16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xb4,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_gt_i16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xb4,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0xb4,0xd4,0x7b,0xfa,0x01,0x00
+# GFX11: v_cmpx_gt_i16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xb4,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX11: v_cmpx_gt_i16_e64 m0, 0x3800
0x7e,0x00,0xb4,0xd4,0x7d,0xe0,0x01,0x00 +# GFX11: v_cmpx_gt_i16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xb4,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] -# GFX11: v_cmpx_gt_i16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xb4,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0xb4,0xd4,0x7e,0x82,0x01,0x00 +# GFX11: v_cmpx_gt_i16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xb4,0xd4,0x7e,0x82,0x01,0x00] -# GFX11: v_cmpx_gt_i16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xb4,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x00,0xb4,0xd4,0x7f,0xf8,0x00,0x00 +# GFX11: v_cmpx_gt_i16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xb4,0xd4,0x7f,0xf8,0x00,0x00] -# GFX11: v_cmpx_gt_i16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xb4,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0xb4,0xd4,0x7c,0xfc,0x00,0x00 +# GFX11: v_cmpx_gt_i16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xb4,0xd4,0x7c,0xfc,0x00,0x00] -# GFX11: v_cmpx_gt_i16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xb4,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0xb4,0xd4,0xc1,0xfe,0x00,0x00 +# GFX11: v_cmpx_gt_i16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xb4,0xd4,0xc1,0xfe,0x00,0x00] -# GFX11: v_cmpx_gt_i16_e64 0x3800, m0 0x7e,0x00,0xb4,0xd4,0xf0,0xfa,0x00,0x00 +# GFX11: v_cmpx_gt_i16_e64 0x3800, m0 ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] -# GFX11: v_cmpx_gt_i16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xb4,0xd4,0xfd,0xd4,0x00,0x00] 0x7e,0x00,0xb4,0xd4,0xfd,0xd4,0x00,0x00 +# GFX11: v_cmpx_gt_i16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xb4,0xd4,0xfd,0xd4,0x00,0x00] -# GFX11: v_cmpx_gt_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0xb4,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_gt_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_gt_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc4,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xc4,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_gt_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc4,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_gt_i32_e64 v255, v255 ; encoding: [0x7e,0x00,0xc4,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xc4,0xd4,0xff,0xff,0x03,0x00 +# GFX11: v_cmpx_gt_i32_e64 v255, v255 ; encoding: [0x7e,0x00,0xc4,0xd4,0xff,0xff,0x03,0x00] -# GFX11: v_cmpx_gt_i32_e64 s1, s2 ; encoding: [0x7e,0x00,0xc4,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0xc4,0xd4,0x01,0x04,0x00,0x00 +# GFX11: v_cmpx_gt_i32_e64 s1, s2 ; encoding: [0x7e,0x00,0xc4,0xd4,0x01,0x04,0x00,0x00] -# GFX11: v_cmpx_gt_i32_e64 s105, s105 ; encoding: [0x7e,0x00,0xc4,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0xc4,0xd4,0x69,0xd2,0x00,0x00 +# GFX11: v_cmpx_gt_i32_e64 s105, s105 ; encoding: [0x7e,0x00,0xc4,0xd4,0x69,0xd2,0x00,0x00] -# GFX11: v_cmpx_gt_i32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xc4,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0xc4,0xd4,0x6a,0xf6,0x00,0x00 +# GFX11: v_cmpx_gt_i32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xc4,0xd4,0x6a,0xf6,0x00,0x00] -# GFX11: v_cmpx_gt_i32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xc4,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xc4,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_gt_i32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xc4,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_gt_i32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xc4,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0xc4,0xd4,0x7b,0xfa,0x01,0x00 +# GFX11: v_cmpx_gt_i32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xc4,0xd4,0x7b,0xfa,0x01,0x00] -# GFX11: v_cmpx_gt_i32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xc4,0xd4,0x7d,0xe0,0x01,0x00] 
0x7e,0x00,0xc4,0xd4,0x7d,0xe0,0x01,0x00 +# GFX11: v_cmpx_gt_i32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xc4,0xd4,0x7d,0xe0,0x01,0x00] -# GFX11: v_cmpx_gt_i32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xc4,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0xc4,0xd4,0x7e,0x82,0x01,0x00 +# GFX11: v_cmpx_gt_i32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xc4,0xd4,0x7e,0x82,0x01,0x00] -# GFX11: v_cmpx_gt_i32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xc4,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x00,0xc4,0xd4,0x7f,0xf8,0x00,0x00 +# GFX11: v_cmpx_gt_i32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xc4,0xd4,0x7f,0xf8,0x00,0x00] -# GFX11: v_cmpx_gt_i32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xc4,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0xc4,0xd4,0x7c,0xfc,0x00,0x00 +# GFX11: v_cmpx_gt_i32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xc4,0xd4,0x7c,0xfc,0x00,0x00] -# GFX11: v_cmpx_gt_i32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xc4,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0xc4,0xd4,0xc1,0xfe,0x00,0x00 +# GFX11: v_cmpx_gt_i32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xc4,0xd4,0xc1,0xfe,0x00,0x00] -# GFX11: v_cmpx_gt_i32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xc4,0xd4,0xf0,0xfa,0x00,0x00] 0x7e,0x00,0xc4,0xd4,0xf0,0xfa,0x00,0x00 +# GFX11: v_cmpx_gt_i32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xc4,0xd4,0xf0,0xfa,0x00,0x00] -# GFX11: v_cmpx_gt_i32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xc4,0xd4,0xfd,0xd4,0x00,0x00] 0x7e,0x00,0xc4,0xd4,0xfd,0xd4,0x00,0x00 +# GFX11: v_cmpx_gt_i32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xc4,0xd4,0xfd,0xd4,0x00,0x00] -# GFX11: v_cmpx_gt_i32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xc4,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xc4,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_gt_i32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xc4,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_gt_i64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xd4,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xd4,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_gt_i64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xd4,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_gt_i64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xd4,0xd4,0xfe,0xfd,0x03,0x00] 0x7e,0x00,0xd4,0xd4,0xfe,0xfd,0x03,0x00 +# GFX11: v_cmpx_gt_i64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xd4,0xd4,0xfe,0xfd,0x03,0x00] -# GFX11: v_cmpx_gt_i64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xd4,0xd4,0x02,0x08,0x00,0x00] 0x7e,0x00,0xd4,0xd4,0x02,0x08,0x00,0x00 +# GFX11: v_cmpx_gt_i64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xd4,0xd4,0x02,0x08,0x00,0x00] -# GFX11: v_cmpx_gt_i64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xd4,0xd4,0x68,0xd0,0x00,0x00] 0x7e,0x00,0xd4,0xd4,0x68,0xd0,0x00,0x00 +# GFX11: v_cmpx_gt_i64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xd4,0xd4,0x68,0xd0,0x00,0x00] -# GFX11: v_cmpx_gt_i64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xd4,0xd4,0x6a,0xf4,0x00,0x00] 0x7e,0x00,0xd4,0xd4,0x6a,0xf4,0x00,0x00 +# GFX11: v_cmpx_gt_i64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xd4,0xd4,0x6a,0xf4,0x00,0x00] -# GFX11: v_cmpx_gt_i64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xd4,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xd4,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_gt_i64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xd4,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_gt_i64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xd4,0xd4,0x7e,0xfa,0x01,0x00] 0x7e,0x00,0xd4,0xd4,0x7e,0xfa,0x01,0x00 +# GFX11: v_cmpx_gt_i64_e64 exec, src_scc ; encoding: 
[0x7e,0x00,0xd4,0xd4,0x7e,0xfa,0x01,0x00] -# GFX11: v_cmpx_gt_i64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xd4,0xd4,0x7c,0xe0,0x01,0x00] 0x7e,0x00,0xd4,0xd4,0x7c,0xe0,0x01,0x00 +# GFX11: v_cmpx_gt_i64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xd4,0xd4,0x7c,0xe0,0x01,0x00] -# GFX11: v_cmpx_gt_i64_e64 -1, -1 ; encoding: [0x7e,0x00,0xd4,0xd4,0xc1,0x82,0x01,0x00] 0x7e,0x00,0xd4,0xd4,0xc1,0x82,0x01,0x00 +# GFX11: v_cmpx_gt_i64_e64 -1, -1 ; encoding: [0x7e,0x00,0xd4,0xd4,0xc1,0x82,0x01,0x00] -# GFX11: v_cmpx_gt_i64_e64 0.5, null ; encoding: [0x7e,0x00,0xd4,0xd4,0xf0,0xf8,0x00,0x00] 0x7e,0x00,0xd4,0xd4,0xf0,0xf8,0x00,0x00 +# GFX11: v_cmpx_gt_i64_e64 0.5, null ; encoding: [0x7e,0x00,0xd4,0xd4,0xf0,0xf8,0x00,0x00] -# GFX11: v_cmpx_gt_i64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xd4,0xd4,0xfd,0xfc,0x00,0x00] 0x7e,0x00,0xd4,0xd4,0xfd,0xfc,0x00,0x00 +# GFX11: v_cmpx_gt_i64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xd4,0xd4,0xfd,0xfc,0x00,0x00] -# GFX11: v_cmpx_gt_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd4,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xd4,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_gt_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd4,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_gt_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbc,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xbc,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_gt_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbc,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_gt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xbc,0xd4,0xff,0xff,0x03,0x00 +# GFX11: v_cmpx_gt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xff,0x03,0x00] -# GFX11: v_cmpx_gt_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbc,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0xbc,0xd4,0x01,0x04,0x00,0x00 +# GFX11: v_cmpx_gt_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbc,0xd4,0x01,0x04,0x00,0x00] -# GFX11: v_cmpx_gt_u16_e64 s105, s105 ; encoding: [0x7e,0x00,0xbc,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0xbc,0xd4,0x69,0xd2,0x00,0x00 +# GFX11: v_cmpx_gt_u16_e64 s105, s105 ; encoding: [0x7e,0x00,0xbc,0xd4,0x69,0xd2,0x00,0x00] -# GFX11: v_cmpx_gt_u16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xbc,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0xbc,0xd4,0x6a,0xf6,0x00,0x00 +# GFX11: v_cmpx_gt_u16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xbc,0xd4,0x6a,0xf6,0x00,0x00] -# GFX11: v_cmpx_gt_u16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xbc,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0xbc,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_gt_u16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xbc,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_gt_u16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xbc,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0xbc,0xd4,0x7b,0xfa,0x01,0x00 +# GFX11: v_cmpx_gt_u16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xbc,0xd4,0x7b,0xfa,0x01,0x00] -# GFX11: v_cmpx_gt_u16_e64 m0, 0x3800 0x7e,0x00,0xbc,0xd4,0x7d,0xe0,0x01,0x00 +# GFX11: v_cmpx_gt_u16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xbc,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] -# GFX11: v_cmpx_gt_u16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xbc,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0xbc,0xd4,0x7e,0x82,0x01,0x00 +# GFX11: v_cmpx_gt_u16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xbc,0xd4,0x7e,0x82,0x01,0x00] -# GFX11: v_cmpx_gt_u16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xbc,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x00,0xbc,0xd4,0x7f,0xf8,0x00,0x00 +# GFX11: v_cmpx_gt_u16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xbc,0xd4,0x7f,0xf8,0x00,0x00] -# GFX11: 
v_cmpx_gt_u16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xbc,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0xbc,0xd4,0x7c,0xfc,0x00,0x00 +# GFX11: v_cmpx_gt_u16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xbc,0xd4,0x7c,0xfc,0x00,0x00] -# GFX11: v_cmpx_gt_u16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xbc,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0xbc,0xd4,0xc1,0xfe,0x00,0x00 +# GFX11: v_cmpx_gt_u16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xbc,0xd4,0xc1,0xfe,0x00,0x00] -# GFX11: v_cmpx_gt_u16_e64 0x3800, m0 0x7e,0x00,0xbc,0xd4,0xf0,0xfa,0x00,0x00 +# GFX11: v_cmpx_gt_u16_e64 0x3800, m0 ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] -# GFX11: v_cmpx_gt_u16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xbc,0xd4,0xfd,0xd4,0x00,0x00] 0x7e,0x00,0xbc,0xd4,0xfd,0xd4,0x00,0x00 +# GFX11: v_cmpx_gt_u16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xbc,0xd4,0xfd,0xd4,0x00,0x00] -# GFX11: v_cmpx_gt_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0xbc,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_gt_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_gt_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcc,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xcc,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_gt_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcc,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_gt_u32_e64 v255, v255 ; encoding: [0x7e,0x00,0xcc,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xcc,0xd4,0xff,0xff,0x03,0x00 +# GFX11: v_cmpx_gt_u32_e64 v255, v255 ; encoding: [0x7e,0x00,0xcc,0xd4,0xff,0xff,0x03,0x00] -# GFX11: v_cmpx_gt_u32_e64 s1, s2 ; encoding: [0x7e,0x00,0xcc,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0xcc,0xd4,0x01,0x04,0x00,0x00 +# GFX11: v_cmpx_gt_u32_e64 s1, s2 ; encoding: [0x7e,0x00,0xcc,0xd4,0x01,0x04,0x00,0x00] -# GFX11: v_cmpx_gt_u32_e64 s105, s105 ; encoding: [0x7e,0x00,0xcc,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0xcc,0xd4,0x69,0xd2,0x00,0x00 +# GFX11: v_cmpx_gt_u32_e64 s105, s105 ; encoding: [0x7e,0x00,0xcc,0xd4,0x69,0xd2,0x00,0x00] -# GFX11: v_cmpx_gt_u32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xcc,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0xcc,0xd4,0x6a,0xf6,0x00,0x00 +# GFX11: v_cmpx_gt_u32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xcc,0xd4,0x6a,0xf6,0x00,0x00] -# GFX11: v_cmpx_gt_u32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xcc,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xcc,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_gt_u32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xcc,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_gt_u32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xcc,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0xcc,0xd4,0x7b,0xfa,0x01,0x00 +# GFX11: v_cmpx_gt_u32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xcc,0xd4,0x7b,0xfa,0x01,0x00] -# GFX11: v_cmpx_gt_u32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xcc,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0xcc,0xd4,0x7d,0xe0,0x01,0x00 +# GFX11: v_cmpx_gt_u32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xcc,0xd4,0x7d,0xe0,0x01,0x00] -# GFX11: v_cmpx_gt_u32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xcc,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0xcc,0xd4,0x7e,0x82,0x01,0x00 +# GFX11: v_cmpx_gt_u32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xcc,0xd4,0x7e,0x82,0x01,0x00] -# GFX11: v_cmpx_gt_u32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xcc,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x00,0xcc,0xd4,0x7f,0xf8,0x00,0x00 +# GFX11: v_cmpx_gt_u32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xcc,0xd4,0x7f,0xf8,0x00,0x00] -# GFX11: v_cmpx_gt_u32_e64 null, exec_lo 
; encoding: [0x7e,0x00,0xcc,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0xcc,0xd4,0x7c,0xfc,0x00,0x00 +# GFX11: v_cmpx_gt_u32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xcc,0xd4,0x7c,0xfc,0x00,0x00] -# GFX11: v_cmpx_gt_u32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xcc,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0xcc,0xd4,0xc1,0xfe,0x00,0x00 +# GFX11: v_cmpx_gt_u32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xcc,0xd4,0xc1,0xfe,0x00,0x00] -# GFX11: v_cmpx_gt_u32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xcc,0xd4,0xf0,0xfa,0x00,0x00] 0x7e,0x00,0xcc,0xd4,0xf0,0xfa,0x00,0x00 +# GFX11: v_cmpx_gt_u32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xcc,0xd4,0xf0,0xfa,0x00,0x00] -# GFX11: v_cmpx_gt_u32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xcc,0xd4,0xfd,0xd4,0x00,0x00] 0x7e,0x00,0xcc,0xd4,0xfd,0xd4,0x00,0x00 +# GFX11: v_cmpx_gt_u32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xcc,0xd4,0xfd,0xd4,0x00,0x00] -# GFX11: v_cmpx_gt_u32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xcc,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xcc,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_gt_u32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xcc,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_gt_u64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xdc,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xdc,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_gt_u64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xdc,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_gt_u64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xdc,0xd4,0xfe,0xfd,0x03,0x00] 0x7e,0x00,0xdc,0xd4,0xfe,0xfd,0x03,0x00 +# GFX11: v_cmpx_gt_u64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xdc,0xd4,0xfe,0xfd,0x03,0x00] -# GFX11: v_cmpx_gt_u64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xdc,0xd4,0x02,0x08,0x00,0x00] 0x7e,0x00,0xdc,0xd4,0x02,0x08,0x00,0x00 +# GFX11: v_cmpx_gt_u64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xdc,0xd4,0x02,0x08,0x00,0x00] -# GFX11: v_cmpx_gt_u64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xdc,0xd4,0x68,0xd0,0x00,0x00] 0x7e,0x00,0xdc,0xd4,0x68,0xd0,0x00,0x00 +# GFX11: v_cmpx_gt_u64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xdc,0xd4,0x68,0xd0,0x00,0x00] -# GFX11: v_cmpx_gt_u64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xdc,0xd4,0x6a,0xf4,0x00,0x00] 0x7e,0x00,0xdc,0xd4,0x6a,0xf4,0x00,0x00 +# GFX11: v_cmpx_gt_u64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xdc,0xd4,0x6a,0xf4,0x00,0x00] -# GFX11: v_cmpx_gt_u64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xdc,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xdc,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_gt_u64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xdc,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_gt_u64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xdc,0xd4,0x7e,0xfa,0x01,0x00] 0x7e,0x00,0xdc,0xd4,0x7e,0xfa,0x01,0x00 +# GFX11: v_cmpx_gt_u64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xdc,0xd4,0x7e,0xfa,0x01,0x00] -# GFX11: v_cmpx_gt_u64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xdc,0xd4,0x7c,0xe0,0x01,0x00] 0x7e,0x00,0xdc,0xd4,0x7c,0xe0,0x01,0x00 +# GFX11: v_cmpx_gt_u64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xdc,0xd4,0x7c,0xe0,0x01,0x00] -# GFX11: v_cmpx_gt_u64_e64 -1, -1 ; encoding: [0x7e,0x00,0xdc,0xd4,0xc1,0x82,0x01,0x00] 0x7e,0x00,0xdc,0xd4,0xc1,0x82,0x01,0x00 +# GFX11: v_cmpx_gt_u64_e64 -1, -1 ; encoding: [0x7e,0x00,0xdc,0xd4,0xc1,0x82,0x01,0x00] -# GFX11: v_cmpx_gt_u64_e64 0.5, null ; encoding: [0x7e,0x00,0xdc,0xd4,0xf0,0xf8,0x00,0x00] 0x7e,0x00,0xdc,0xd4,0xf0,0xf8,0x00,0x00 +# GFX11: v_cmpx_gt_u64_e64 0.5, null ; encoding: 
[0x7e,0x00,0xdc,0xd4,0xf0,0xf8,0x00,0x00] -# GFX11: v_cmpx_gt_u64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xdc,0xd4,0xfd,0xfc,0x00,0x00] 0x7e,0x00,0xdc,0xd4,0xfd,0xfc,0x00,0x00 +# GFX11: v_cmpx_gt_u64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xdc,0xd4,0xfd,0xfc,0x00,0x00] -# GFX11: v_cmpx_gt_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdc,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xdc,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_gt_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdc,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_le_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_le_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_le_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00 +# GFX11: v_cmpx_le_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00] -# GFX11: v_cmpx_le_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0x83,0xd4,0x01,0x04,0x00,0x00 +# GFX11: v_cmpx_le_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x04,0x00,0x00] -# GFX11: v_cmpx_le_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x83,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0x83,0xd4,0x69,0xd2,0x00,0x00 +# GFX11: v_cmpx_le_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x83,0xd4,0x69,0xd2,0x00,0x00] -# GFX11: v_cmpx_le_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x83,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0x83,0xd4,0x6a,0xf6,0x00,0x00 +# GFX11: v_cmpx_le_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x83,0xd4,0x6a,0xf6,0x00,0x00] -# GFX11: v_cmpx_le_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x83,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0x83,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_le_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x83,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_le_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x83,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0x83,0xd4,0x7b,0xfa,0x01,0x00 +# GFX11: v_cmpx_le_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x83,0xd4,0x7b,0xfa,0x01,0x00] -# GFX11: v_cmpx_le_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x83,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0x83,0xd4,0x7d,0xe0,0x01,0x00 +# GFX11: v_cmpx_le_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x83,0xd4,0x7d,0xe0,0x01,0x00] -# GFX11: v_cmpx_le_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x83,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0x83,0xd4,0x7e,0x82,0x01,0x00 +# GFX11: v_cmpx_le_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x83,0xd4,0x7e,0x82,0x01,0x00] -# GFX11: v_cmpx_le_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x83,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x01,0x83,0xd4,0x7f,0xf8,0x00,0x00 +# GFX11: v_cmpx_le_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x83,0xd4,0x7f,0xf8,0x00,0x00] -# GFX11: v_cmpx_le_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x83,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0x83,0xd4,0x7c,0xfc,0x00,0x00 +# GFX11: v_cmpx_le_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x83,0xd4,0x7c,0xfc,0x00,0x00] -# GFX11: v_cmpx_le_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x83,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0x83,0xd4,0xc1,0xfe,0x00,0x00 +# GFX11: v_cmpx_le_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x83,0xd4,0xc1,0xfe,0x00,0x00] -# GFX11: v_cmpx_le_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x83,0xd4,0xf0,0xfa,0x00,0x40] 0x7e,0x00,0x83,0xd4,0xf0,0xfa,0x00,0x40 +# GFX11: v_cmpx_le_f16_e64 0.5, -m0 ; encoding: 
[0x7e,0x00,0x83,0xd4,0xf0,0xfa,0x00,0x40] -# GFX11: v_cmpx_le_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x83,0xd4,0xfd,0xd4,0x00,0x20] 0x7e,0x02,0x83,0xd4,0xfd,0xd4,0x00,0x20 +# GFX11: v_cmpx_le_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x83,0xd4,0xfd,0xd4,0x00,0x20] -# GFX11: v_cmpx_le_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x83,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x83,0x83,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_le_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x83,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_le_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x93,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x93,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_le_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x93,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_le_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x93,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x93,0xd4,0xff,0xff,0x03,0x00 +# GFX11: v_cmpx_le_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x93,0xd4,0xff,0xff,0x03,0x00] -# GFX11: v_cmpx_le_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x93,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0x93,0xd4,0x01,0x04,0x00,0x00 +# GFX11: v_cmpx_le_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x93,0xd4,0x01,0x04,0x00,0x00] -# GFX11: v_cmpx_le_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x93,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0x93,0xd4,0x69,0xd2,0x00,0x00 +# GFX11: v_cmpx_le_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x93,0xd4,0x69,0xd2,0x00,0x00] -# GFX11: v_cmpx_le_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x93,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0x93,0xd4,0x6a,0xf6,0x00,0x00 +# GFX11: v_cmpx_le_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x93,0xd4,0x6a,0xf6,0x00,0x00] -# GFX11: v_cmpx_le_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x93,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x93,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_le_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x93,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_le_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x93,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0x93,0xd4,0x7b,0xfa,0x01,0x00 +# GFX11: v_cmpx_le_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x93,0xd4,0x7b,0xfa,0x01,0x00] -# GFX11: v_cmpx_le_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x93,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0x93,0xd4,0x7d,0xe0,0x01,0x00 +# GFX11: v_cmpx_le_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x93,0xd4,0x7d,0xe0,0x01,0x00] -# GFX11: v_cmpx_le_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x93,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0x93,0xd4,0x7e,0x82,0x01,0x00 +# GFX11: v_cmpx_le_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x93,0xd4,0x7e,0x82,0x01,0x00] -# GFX11: v_cmpx_le_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x93,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x01,0x93,0xd4,0x7f,0xf8,0x00,0x00 +# GFX11: v_cmpx_le_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x93,0xd4,0x7f,0xf8,0x00,0x00] -# GFX11: v_cmpx_le_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x93,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0x93,0xd4,0x7c,0xfc,0x00,0x00 +# GFX11: v_cmpx_le_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x93,0xd4,0x7c,0xfc,0x00,0x00] -# GFX11: v_cmpx_le_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x93,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0x93,0xd4,0xc1,0xfe,0x00,0x00 +# GFX11: v_cmpx_le_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x93,0xd4,0xc1,0xfe,0x00,0x00] -# GFX11: v_cmpx_le_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x93,0xd4,0xf0,0xfa,0x00,0x40] 0x7e,0x00,0x93,0xd4,0xf0,0xfa,0x00,0x40 +# GFX11: 
v_cmpx_le_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x93,0xd4,0xf0,0xfa,0x00,0x40] -# GFX11: v_cmpx_le_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x93,0xd4,0xfd,0xd4,0x00,0x20] 0x7e,0x02,0x93,0xd4,0xfd,0xd4,0x00,0x20 +# GFX11: v_cmpx_le_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x93,0xd4,0xfd,0xd4,0x00,0x20] -# GFX11: v_cmpx_le_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x93,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] 0x7e,0x83,0x93,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_le_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x93,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_le_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa3,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xa3,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_le_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa3,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_le_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa3,0xd4,0xfe,0xfd,0x03,0x00] 0x7e,0x00,0xa3,0xd4,0xfe,0xfd,0x03,0x00 +# GFX11: v_cmpx_le_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa3,0xd4,0xfe,0xfd,0x03,0x00] -# GFX11: v_cmpx_le_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa3,0xd4,0x02,0x08,0x00,0x00] 0x7e,0x00,0xa3,0xd4,0x02,0x08,0x00,0x00 +# GFX11: v_cmpx_le_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa3,0xd4,0x02,0x08,0x00,0x00] -# GFX11: v_cmpx_le_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa3,0xd4,0x68,0xd0,0x00,0x00] 0x7e,0x00,0xa3,0xd4,0x68,0xd0,0x00,0x00 +# GFX11: v_cmpx_le_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa3,0xd4,0x68,0xd0,0x00,0x00] -# GFX11: v_cmpx_le_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa3,0xd4,0x6a,0xf4,0x00,0x00] 0x7e,0x00,0xa3,0xd4,0x6a,0xf4,0x00,0x00 +# GFX11: v_cmpx_le_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa3,0xd4,0x6a,0xf4,0x00,0x00] -# GFX11: v_cmpx_le_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa3,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xa3,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_le_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa3,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_le_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa3,0xd4,0x7e,0xfa,0x01,0x20] 0x7e,0x01,0xa3,0xd4,0x7e,0xfa,0x01,0x20 +# GFX11: v_cmpx_le_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa3,0xd4,0x7e,0xfa,0x01,0x20] -# GFX11: v_cmpx_le_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa3,0xd4,0x7c,0xe0,0x01,0x00] 0x7e,0x00,0xa3,0xd4,0x7c,0xe0,0x01,0x00 +# GFX11: v_cmpx_le_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa3,0xd4,0x7c,0xe0,0x01,0x00] -# GFX11: v_cmpx_le_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa3,0xd4,0xc1,0x82,0x01,0x00] 0x7e,0x00,0xa3,0xd4,0xc1,0x82,0x01,0x00 +# GFX11: v_cmpx_le_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa3,0xd4,0xc1,0x82,0x01,0x00] -# GFX11: v_cmpx_le_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa3,0xd4,0xf0,0xf8,0x00,0x00] 0x7e,0x00,0xa3,0xd4,0xf0,0xf8,0x00,0x00 +# GFX11: v_cmpx_le_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa3,0xd4,0xf0,0xf8,0x00,0x00] -# GFX11: v_cmpx_le_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa3,0xd4,0xfd,0xfc,0x00,0x60] 0x7e,0x03,0xa3,0xd4,0xfd,0xfc,0x00,0x60 +# GFX11: v_cmpx_le_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa3,0xd4,0xfd,0xfc,0x00,0x60] -# GFX11: v_cmpx_le_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa3,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7e,0x82,0xa3,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_le_f64_e64 0xaf123456, -|vcc| clamp 
; encoding: [0x7e,0x82,0xa3,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_le_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb3,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xb3,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_le_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb3,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_le_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xb3,0xd4,0xff,0xff,0x03,0x00 +# GFX11: v_cmpx_le_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xff,0x03,0x00] -# GFX11: v_cmpx_le_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb3,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0xb3,0xd4,0x01,0x04,0x00,0x00 +# GFX11: v_cmpx_le_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb3,0xd4,0x01,0x04,0x00,0x00] -# GFX11: v_cmpx_le_i16_e64 s105, s105 ; encoding: [0x7e,0x00,0xb3,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0xb3,0xd4,0x69,0xd2,0x00,0x00 +# GFX11: v_cmpx_le_i16_e64 s105, s105 ; encoding: [0x7e,0x00,0xb3,0xd4,0x69,0xd2,0x00,0x00] -# GFX11: v_cmpx_le_i16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xb3,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0xb3,0xd4,0x6a,0xf6,0x00,0x00 +# GFX11: v_cmpx_le_i16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xb3,0xd4,0x6a,0xf6,0x00,0x00] -# GFX11: v_cmpx_le_i16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xb3,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0xb3,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_le_i16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xb3,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_le_i16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xb3,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0xb3,0xd4,0x7b,0xfa,0x01,0x00 +# GFX11: v_cmpx_le_i16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xb3,0xd4,0x7b,0xfa,0x01,0x00] -# GFX11: v_cmpx_le_i16_e64 m0, 0x3800 0x7e,0x00,0xb3,0xd4,0x7d,0xe0,0x01,0x00 +# GFX11: v_cmpx_le_i16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xb3,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] -# GFX11: v_cmpx_le_i16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xb3,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0xb3,0xd4,0x7e,0x82,0x01,0x00 +# GFX11: v_cmpx_le_i16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xb3,0xd4,0x7e,0x82,0x01,0x00] -# GFX11: v_cmpx_le_i16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xb3,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x00,0xb3,0xd4,0x7f,0xf8,0x00,0x00 +# GFX11: v_cmpx_le_i16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xb3,0xd4,0x7f,0xf8,0x00,0x00] -# GFX11: v_cmpx_le_i16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xb3,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0xb3,0xd4,0x7c,0xfc,0x00,0x00 +# GFX11: v_cmpx_le_i16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xb3,0xd4,0x7c,0xfc,0x00,0x00] -# GFX11: v_cmpx_le_i16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xb3,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0xb3,0xd4,0xc1,0xfe,0x00,0x00 +# GFX11: v_cmpx_le_i16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xb3,0xd4,0xc1,0xfe,0x00,0x00] -# GFX11: v_cmpx_le_i16_e64 0x3800, m0 0x7e,0x00,0xb3,0xd4,0xf0,0xfa,0x00,0x00 +# GFX11: v_cmpx_le_i16_e64 0x3800, m0 ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] -# GFX11: v_cmpx_le_i16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xb3,0xd4,0xfd,0xd4,0x00,0x00] 0x7e,0x00,0xb3,0xd4,0xfd,0xd4,0x00,0x00 +# GFX11: v_cmpx_le_i16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xb3,0xd4,0xfd,0xd4,0x00,0x00] -# GFX11: v_cmpx_le_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0xb3,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_le_i16_e64 0xfe0b, vcc_hi ; encoding: 
[0x7e,0x00,0xb3,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_le_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc3,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xc3,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_le_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc3,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_le_i32_e64 v255, v255 ; encoding: [0x7e,0x00,0xc3,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xc3,0xd4,0xff,0xff,0x03,0x00 +# GFX11: v_cmpx_le_i32_e64 v255, v255 ; encoding: [0x7e,0x00,0xc3,0xd4,0xff,0xff,0x03,0x00] -# GFX11: v_cmpx_le_i32_e64 s1, s2 ; encoding: [0x7e,0x00,0xc3,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0xc3,0xd4,0x01,0x04,0x00,0x00 +# GFX11: v_cmpx_le_i32_e64 s1, s2 ; encoding: [0x7e,0x00,0xc3,0xd4,0x01,0x04,0x00,0x00] -# GFX11: v_cmpx_le_i32_e64 s105, s105 ; encoding: [0x7e,0x00,0xc3,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0xc3,0xd4,0x69,0xd2,0x00,0x00 +# GFX11: v_cmpx_le_i32_e64 s105, s105 ; encoding: [0x7e,0x00,0xc3,0xd4,0x69,0xd2,0x00,0x00] -# GFX11: v_cmpx_le_i32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xc3,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0xc3,0xd4,0x6a,0xf6,0x00,0x00 +# GFX11: v_cmpx_le_i32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xc3,0xd4,0x6a,0xf6,0x00,0x00] -# GFX11: v_cmpx_le_i32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xc3,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xc3,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_le_i32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xc3,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_le_i32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xc3,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0xc3,0xd4,0x7b,0xfa,0x01,0x00 +# GFX11: v_cmpx_le_i32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xc3,0xd4,0x7b,0xfa,0x01,0x00] -# GFX11: v_cmpx_le_i32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xc3,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0xc3,0xd4,0x7d,0xe0,0x01,0x00 +# GFX11: v_cmpx_le_i32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xc3,0xd4,0x7d,0xe0,0x01,0x00] -# GFX11: v_cmpx_le_i32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xc3,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0xc3,0xd4,0x7e,0x82,0x01,0x00 +# GFX11: v_cmpx_le_i32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xc3,0xd4,0x7e,0x82,0x01,0x00] -# GFX11: v_cmpx_le_i32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xc3,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x00,0xc3,0xd4,0x7f,0xf8,0x00,0x00 +# GFX11: v_cmpx_le_i32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xc3,0xd4,0x7f,0xf8,0x00,0x00] -# GFX11: v_cmpx_le_i32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xc3,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0xc3,0xd4,0x7c,0xfc,0x00,0x00 +# GFX11: v_cmpx_le_i32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xc3,0xd4,0x7c,0xfc,0x00,0x00] -# GFX11: v_cmpx_le_i32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xc3,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0xc3,0xd4,0xc1,0xfe,0x00,0x00 +# GFX11: v_cmpx_le_i32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xc3,0xd4,0xc1,0xfe,0x00,0x00] -# GFX11: v_cmpx_le_i32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xc3,0xd4,0xf0,0xfa,0x00,0x00] 0x7e,0x00,0xc3,0xd4,0xf0,0xfa,0x00,0x00 +# GFX11: v_cmpx_le_i32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xc3,0xd4,0xf0,0xfa,0x00,0x00] -# GFX11: v_cmpx_le_i32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xc3,0xd4,0xfd,0xd4,0x00,0x00] 0x7e,0x00,0xc3,0xd4,0xfd,0xd4,0x00,0x00 +# GFX11: v_cmpx_le_i32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xc3,0xd4,0xfd,0xd4,0x00,0x00] -# GFX11: v_cmpx_le_i32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xc3,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xc3,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_le_i32_e64 0xaf123456, vcc_hi ; 
encoding: [0x7e,0x00,0xc3,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_le_i64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xd3,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xd3,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_le_i64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xd3,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_le_i64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xd3,0xd4,0xfe,0xfd,0x03,0x00] 0x7e,0x00,0xd3,0xd4,0xfe,0xfd,0x03,0x00 +# GFX11: v_cmpx_le_i64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xd3,0xd4,0xfe,0xfd,0x03,0x00] -# GFX11: v_cmpx_le_i64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xd3,0xd4,0x02,0x08,0x00,0x00] 0x7e,0x00,0xd3,0xd4,0x02,0x08,0x00,0x00 +# GFX11: v_cmpx_le_i64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xd3,0xd4,0x02,0x08,0x00,0x00] -# GFX11: v_cmpx_le_i64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xd3,0xd4,0x68,0xd0,0x00,0x00] 0x7e,0x00,0xd3,0xd4,0x68,0xd0,0x00,0x00 +# GFX11: v_cmpx_le_i64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xd3,0xd4,0x68,0xd0,0x00,0x00] -# GFX11: v_cmpx_le_i64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xd3,0xd4,0x6a,0xf4,0x00,0x00] 0x7e,0x00,0xd3,0xd4,0x6a,0xf4,0x00,0x00 +# GFX11: v_cmpx_le_i64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xd3,0xd4,0x6a,0xf4,0x00,0x00] -# GFX11: v_cmpx_le_i64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xd3,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xd3,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_le_i64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xd3,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_le_i64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xd3,0xd4,0x7e,0xfa,0x01,0x00] 0x7e,0x00,0xd3,0xd4,0x7e,0xfa,0x01,0x00 +# GFX11: v_cmpx_le_i64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xd3,0xd4,0x7e,0xfa,0x01,0x00] -# GFX11: v_cmpx_le_i64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xd3,0xd4,0x7c,0xe0,0x01,0x00] 0x7e,0x00,0xd3,0xd4,0x7c,0xe0,0x01,0x00 +# GFX11: v_cmpx_le_i64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xd3,0xd4,0x7c,0xe0,0x01,0x00] -# GFX11: v_cmpx_le_i64_e64 -1, -1 ; encoding: [0x7e,0x00,0xd3,0xd4,0xc1,0x82,0x01,0x00] 0x7e,0x00,0xd3,0xd4,0xc1,0x82,0x01,0x00 +# GFX11: v_cmpx_le_i64_e64 -1, -1 ; encoding: [0x7e,0x00,0xd3,0xd4,0xc1,0x82,0x01,0x00] -# GFX11: v_cmpx_le_i64_e64 0.5, null ; encoding: [0x7e,0x00,0xd3,0xd4,0xf0,0xf8,0x00,0x00] 0x7e,0x00,0xd3,0xd4,0xf0,0xf8,0x00,0x00 +# GFX11: v_cmpx_le_i64_e64 0.5, null ; encoding: [0x7e,0x00,0xd3,0xd4,0xf0,0xf8,0x00,0x00] -# GFX11: v_cmpx_le_i64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xd3,0xd4,0xfd,0xfc,0x00,0x00] 0x7e,0x00,0xd3,0xd4,0xfd,0xfc,0x00,0x00 +# GFX11: v_cmpx_le_i64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xd3,0xd4,0xfd,0xfc,0x00,0x00] -# GFX11: v_cmpx_le_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd3,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xd3,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_le_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd3,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_le_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbb,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xbb,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_le_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbb,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_le_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xbb,0xd4,0xff,0xff,0x03,0x00 +# GFX11: v_cmpx_le_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xff,0x03,0x00] -# GFX11: v_cmpx_le_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbb,0xd4,0x01,0x04,0x00,0x00] 
0x7e,0x00,0xbb,0xd4,0x01,0x04,0x00,0x00 +# GFX11: v_cmpx_le_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbb,0xd4,0x01,0x04,0x00,0x00] -# GFX11: v_cmpx_le_u16_e64 s105, s105 ; encoding: [0x7e,0x00,0xbb,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0xbb,0xd4,0x69,0xd2,0x00,0x00 +# GFX11: v_cmpx_le_u16_e64 s105, s105 ; encoding: [0x7e,0x00,0xbb,0xd4,0x69,0xd2,0x00,0x00] -# GFX11: v_cmpx_le_u16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xbb,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0xbb,0xd4,0x6a,0xf6,0x00,0x00 +# GFX11: v_cmpx_le_u16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xbb,0xd4,0x6a,0xf6,0x00,0x00] -# GFX11: v_cmpx_le_u16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xbb,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0xbb,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_le_u16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xbb,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_le_u16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xbb,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0xbb,0xd4,0x7b,0xfa,0x01,0x00 +# GFX11: v_cmpx_le_u16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xbb,0xd4,0x7b,0xfa,0x01,0x00] -# GFX11: v_cmpx_le_u16_e64 m0, 0x3800 0x7e,0x00,0xbb,0xd4,0x7d,0xe0,0x01,0x00 +# GFX11: v_cmpx_le_u16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xbb,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] -# GFX11: v_cmpx_le_u16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xbb,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0xbb,0xd4,0x7e,0x82,0x01,0x00 +# GFX11: v_cmpx_le_u16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xbb,0xd4,0x7e,0x82,0x01,0x00] -# GFX11: v_cmpx_le_u16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xbb,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x00,0xbb,0xd4,0x7f,0xf8,0x00,0x00 +# GFX11: v_cmpx_le_u16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xbb,0xd4,0x7f,0xf8,0x00,0x00] -# GFX11: v_cmpx_le_u16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xbb,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0xbb,0xd4,0x7c,0xfc,0x00,0x00 +# GFX11: v_cmpx_le_u16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xbb,0xd4,0x7c,0xfc,0x00,0x00] -# GFX11: v_cmpx_le_u16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xbb,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0xbb,0xd4,0xc1,0xfe,0x00,0x00 +# GFX11: v_cmpx_le_u16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xbb,0xd4,0xc1,0xfe,0x00,0x00] -# GFX11: v_cmpx_le_u16_e64 0x3800, m0 0x7e,0x00,0xbb,0xd4,0xf0,0xfa,0x00,0x00 +# GFX11: v_cmpx_le_u16_e64 0x3800, m0 ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] -# GFX11: v_cmpx_le_u16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xbb,0xd4,0xfd,0xd4,0x00,0x00] 0x7e,0x00,0xbb,0xd4,0xfd,0xd4,0x00,0x00 +# GFX11: v_cmpx_le_u16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xbb,0xd4,0xfd,0xd4,0x00,0x00] -# GFX11: v_cmpx_le_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0xbb,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_le_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_le_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcb,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xcb,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_le_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcb,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_le_u32_e64 v255, v255 ; encoding: [0x7e,0x00,0xcb,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xcb,0xd4,0xff,0xff,0x03,0x00 +# GFX11: v_cmpx_le_u32_e64 v255, v255 ; encoding: [0x7e,0x00,0xcb,0xd4,0xff,0xff,0x03,0x00] -# GFX11: v_cmpx_le_u32_e64 s1, s2 ; encoding: [0x7e,0x00,0xcb,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0xcb,0xd4,0x01,0x04,0x00,0x00 +# GFX11: v_cmpx_le_u32_e64 
s1, s2 ; encoding: [0x7e,0x00,0xcb,0xd4,0x01,0x04,0x00,0x00] -# GFX11: v_cmpx_le_u32_e64 s105, s105 ; encoding: [0x7e,0x00,0xcb,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0xcb,0xd4,0x69,0xd2,0x00,0x00 +# GFX11: v_cmpx_le_u32_e64 s105, s105 ; encoding: [0x7e,0x00,0xcb,0xd4,0x69,0xd2,0x00,0x00] -# GFX11: v_cmpx_le_u32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xcb,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0xcb,0xd4,0x6a,0xf6,0x00,0x00 +# GFX11: v_cmpx_le_u32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xcb,0xd4,0x6a,0xf6,0x00,0x00] -# GFX11: v_cmpx_le_u32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xcb,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xcb,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_le_u32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xcb,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_le_u32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xcb,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0xcb,0xd4,0x7b,0xfa,0x01,0x00 +# GFX11: v_cmpx_le_u32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xcb,0xd4,0x7b,0xfa,0x01,0x00] -# GFX11: v_cmpx_le_u32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xcb,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0xcb,0xd4,0x7d,0xe0,0x01,0x00 +# GFX11: v_cmpx_le_u32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xcb,0xd4,0x7d,0xe0,0x01,0x00] -# GFX11: v_cmpx_le_u32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xcb,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0xcb,0xd4,0x7e,0x82,0x01,0x00 +# GFX11: v_cmpx_le_u32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xcb,0xd4,0x7e,0x82,0x01,0x00] -# GFX11: v_cmpx_le_u32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xcb,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x00,0xcb,0xd4,0x7f,0xf8,0x00,0x00 +# GFX11: v_cmpx_le_u32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xcb,0xd4,0x7f,0xf8,0x00,0x00] -# GFX11: v_cmpx_le_u32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xcb,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0xcb,0xd4,0x7c,0xfc,0x00,0x00 +# GFX11: v_cmpx_le_u32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xcb,0xd4,0x7c,0xfc,0x00,0x00] -# GFX11: v_cmpx_le_u32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xcb,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0xcb,0xd4,0xc1,0xfe,0x00,0x00 +# GFX11: v_cmpx_le_u32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xcb,0xd4,0xc1,0xfe,0x00,0x00] -# GFX11: v_cmpx_le_u32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xcb,0xd4,0xf0,0xfa,0x00,0x00] 0x7e,0x00,0xcb,0xd4,0xf0,0xfa,0x00,0x00 +# GFX11: v_cmpx_le_u32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xcb,0xd4,0xf0,0xfa,0x00,0x00] -# GFX11: v_cmpx_le_u32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xcb,0xd4,0xfd,0xd4,0x00,0x00] 0x7e,0x00,0xcb,0xd4,0xfd,0xd4,0x00,0x00 +# GFX11: v_cmpx_le_u32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xcb,0xd4,0xfd,0xd4,0x00,0x00] -# GFX11: v_cmpx_le_u32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xcb,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xcb,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_le_u32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xcb,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_le_u64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xdb,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xdb,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_le_u64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xdb,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_le_u64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xdb,0xd4,0xfe,0xfd,0x03,0x00] 0x7e,0x00,0xdb,0xd4,0xfe,0xfd,0x03,0x00 +# GFX11: v_cmpx_le_u64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xdb,0xd4,0xfe,0xfd,0x03,0x00] -# GFX11: v_cmpx_le_u64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xdb,0xd4,0x02,0x08,0x00,0x00] 
0x7e,0x00,0xdb,0xd4,0x02,0x08,0x00,0x00 +# GFX11: v_cmpx_le_u64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xdb,0xd4,0x02,0x08,0x00,0x00] -# GFX11: v_cmpx_le_u64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xdb,0xd4,0x68,0xd0,0x00,0x00] 0x7e,0x00,0xdb,0xd4,0x68,0xd0,0x00,0x00 +# GFX11: v_cmpx_le_u64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xdb,0xd4,0x68,0xd0,0x00,0x00] -# GFX11: v_cmpx_le_u64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xdb,0xd4,0x6a,0xf4,0x00,0x00] 0x7e,0x00,0xdb,0xd4,0x6a,0xf4,0x00,0x00 +# GFX11: v_cmpx_le_u64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xdb,0xd4,0x6a,0xf4,0x00,0x00] -# GFX11: v_cmpx_le_u64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xdb,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xdb,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_le_u64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xdb,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_le_u64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xdb,0xd4,0x7e,0xfa,0x01,0x00] 0x7e,0x00,0xdb,0xd4,0x7e,0xfa,0x01,0x00 +# GFX11: v_cmpx_le_u64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xdb,0xd4,0x7e,0xfa,0x01,0x00] -# GFX11: v_cmpx_le_u64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xdb,0xd4,0x7c,0xe0,0x01,0x00] 0x7e,0x00,0xdb,0xd4,0x7c,0xe0,0x01,0x00 +# GFX11: v_cmpx_le_u64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xdb,0xd4,0x7c,0xe0,0x01,0x00] -# GFX11: v_cmpx_le_u64_e64 -1, -1 ; encoding: [0x7e,0x00,0xdb,0xd4,0xc1,0x82,0x01,0x00] 0x7e,0x00,0xdb,0xd4,0xc1,0x82,0x01,0x00 +# GFX11: v_cmpx_le_u64_e64 -1, -1 ; encoding: [0x7e,0x00,0xdb,0xd4,0xc1,0x82,0x01,0x00] -# GFX11: v_cmpx_le_u64_e64 0.5, null ; encoding: [0x7e,0x00,0xdb,0xd4,0xf0,0xf8,0x00,0x00] 0x7e,0x00,0xdb,0xd4,0xf0,0xf8,0x00,0x00 +# GFX11: v_cmpx_le_u64_e64 0.5, null ; encoding: [0x7e,0x00,0xdb,0xd4,0xf0,0xf8,0x00,0x00] -# GFX11: v_cmpx_le_u64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xdb,0xd4,0xfd,0xfc,0x00,0x00] 0x7e,0x00,0xdb,0xd4,0xfd,0xfc,0x00,0x00 +# GFX11: v_cmpx_le_u64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xdb,0xd4,0xfd,0xfc,0x00,0x00] -# GFX11: v_cmpx_le_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdb,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xdb,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_le_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdb,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_lg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_lg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_lg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00 +# GFX11: v_cmpx_lg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00] -# GFX11: v_cmpx_lg_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0x85,0xd4,0x01,0x04,0x00,0x00 +# GFX11: v_cmpx_lg_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x04,0x00,0x00] -# GFX11: v_cmpx_lg_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x85,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0x85,0xd4,0x69,0xd2,0x00,0x00 +# GFX11: v_cmpx_lg_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x85,0xd4,0x69,0xd2,0x00,0x00] -# GFX11: v_cmpx_lg_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x85,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0x85,0xd4,0x6a,0xf6,0x00,0x00 +# GFX11: v_cmpx_lg_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x85,0xd4,0x6a,0xf6,0x00,0x00] -# GFX11: v_cmpx_lg_f16_e64 vcc_hi, 0xfe0b ; encoding: 
[0x7e,0x00,0x85,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0x85,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_lg_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x85,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_lg_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x85,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0x85,0xd4,0x7b,0xfa,0x01,0x00 +# GFX11: v_cmpx_lg_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x85,0xd4,0x7b,0xfa,0x01,0x00] -# GFX11: v_cmpx_lg_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x85,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0x85,0xd4,0x7d,0xe0,0x01,0x00 +# GFX11: v_cmpx_lg_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x85,0xd4,0x7d,0xe0,0x01,0x00] -# GFX11: v_cmpx_lg_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x85,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0x85,0xd4,0x7e,0x82,0x01,0x00 +# GFX11: v_cmpx_lg_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x85,0xd4,0x7e,0x82,0x01,0x00] -# GFX11: v_cmpx_lg_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x85,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x01,0x85,0xd4,0x7f,0xf8,0x00,0x00 +# GFX11: v_cmpx_lg_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x85,0xd4,0x7f,0xf8,0x00,0x00] -# GFX11: v_cmpx_lg_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x85,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0x85,0xd4,0x7c,0xfc,0x00,0x00 +# GFX11: v_cmpx_lg_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x85,0xd4,0x7c,0xfc,0x00,0x00] -# GFX11: v_cmpx_lg_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x85,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0x85,0xd4,0xc1,0xfe,0x00,0x00 +# GFX11: v_cmpx_lg_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x85,0xd4,0xc1,0xfe,0x00,0x00] -# GFX11: v_cmpx_lg_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x85,0xd4,0xf0,0xfa,0x00,0x40] 0x7e,0x00,0x85,0xd4,0xf0,0xfa,0x00,0x40 +# GFX11: v_cmpx_lg_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x85,0xd4,0xf0,0xfa,0x00,0x40] -# GFX11: v_cmpx_lg_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x85,0xd4,0xfd,0xd4,0x00,0x20] 0x7e,0x02,0x85,0xd4,0xfd,0xd4,0x00,0x20 +# GFX11: v_cmpx_lg_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x85,0xd4,0xfd,0xd4,0x00,0x20] -# GFX11: v_cmpx_lg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x85,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x83,0x85,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_lg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x85,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_lg_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x95,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x95,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_lg_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x95,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_lg_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x95,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x95,0xd4,0xff,0xff,0x03,0x00 +# GFX11: v_cmpx_lg_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x95,0xd4,0xff,0xff,0x03,0x00] -# GFX11: v_cmpx_lg_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x95,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0x95,0xd4,0x01,0x04,0x00,0x00 +# GFX11: v_cmpx_lg_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x95,0xd4,0x01,0x04,0x00,0x00] -# GFX11: v_cmpx_lg_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x95,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0x95,0xd4,0x69,0xd2,0x00,0x00 +# GFX11: v_cmpx_lg_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x95,0xd4,0x69,0xd2,0x00,0x00] -# GFX11: v_cmpx_lg_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x95,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0x95,0xd4,0x6a,0xf6,0x00,0x00 +# GFX11: v_cmpx_lg_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x95,0xd4,0x6a,0xf6,0x00,0x00] -# GFX11: v_cmpx_lg_f32_e64 
vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x95,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x95,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_lg_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x95,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_lg_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x95,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0x95,0xd4,0x7b,0xfa,0x01,0x00 +# GFX11: v_cmpx_lg_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x95,0xd4,0x7b,0xfa,0x01,0x00] -# GFX11: v_cmpx_lg_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x95,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0x95,0xd4,0x7d,0xe0,0x01,0x00 +# GFX11: v_cmpx_lg_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x95,0xd4,0x7d,0xe0,0x01,0x00] -# GFX11: v_cmpx_lg_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x95,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0x95,0xd4,0x7e,0x82,0x01,0x00 +# GFX11: v_cmpx_lg_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x95,0xd4,0x7e,0x82,0x01,0x00] -# GFX11: v_cmpx_lg_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x95,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x01,0x95,0xd4,0x7f,0xf8,0x00,0x00 +# GFX11: v_cmpx_lg_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x95,0xd4,0x7f,0xf8,0x00,0x00] -# GFX11: v_cmpx_lg_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x95,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0x95,0xd4,0x7c,0xfc,0x00,0x00 +# GFX11: v_cmpx_lg_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x95,0xd4,0x7c,0xfc,0x00,0x00] -# GFX11: v_cmpx_lg_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x95,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0x95,0xd4,0xc1,0xfe,0x00,0x00 +# GFX11: v_cmpx_lg_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x95,0xd4,0xc1,0xfe,0x00,0x00] -# GFX11: v_cmpx_lg_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x95,0xd4,0xf0,0xfa,0x00,0x40] 0x7e,0x00,0x95,0xd4,0xf0,0xfa,0x00,0x40 +# GFX11: v_cmpx_lg_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x95,0xd4,0xf0,0xfa,0x00,0x40] -# GFX11: v_cmpx_lg_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x95,0xd4,0xfd,0xd4,0x00,0x20] 0x7e,0x02,0x95,0xd4,0xfd,0xd4,0x00,0x20 +# GFX11: v_cmpx_lg_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x95,0xd4,0xfd,0xd4,0x00,0x20] -# GFX11: v_cmpx_lg_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x95,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] 0x7e,0x83,0x95,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_lg_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x95,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_lg_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa5,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xa5,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_lg_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa5,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_lg_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa5,0xd4,0xfe,0xfd,0x03,0x00] 0x7e,0x00,0xa5,0xd4,0xfe,0xfd,0x03,0x00 +# GFX11: v_cmpx_lg_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa5,0xd4,0xfe,0xfd,0x03,0x00] -# GFX11: v_cmpx_lg_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa5,0xd4,0x02,0x08,0x00,0x00] 0x7e,0x00,0xa5,0xd4,0x02,0x08,0x00,0x00 +# GFX11: v_cmpx_lg_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa5,0xd4,0x02,0x08,0x00,0x00] -# GFX11: v_cmpx_lg_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa5,0xd4,0x68,0xd0,0x00,0x00] 0x7e,0x00,0xa5,0xd4,0x68,0xd0,0x00,0x00 +# GFX11: v_cmpx_lg_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa5,0xd4,0x68,0xd0,0x00,0x00] -# GFX11: v_cmpx_lg_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa5,0xd4,0x6a,0xf4,0x00,0x00] 0x7e,0x00,0xa5,0xd4,0x6a,0xf4,0x00,0x00 +# 
GFX11: v_cmpx_lg_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa5,0xd4,0x6a,0xf4,0x00,0x00]
-# GFX11: v_cmpx_lg_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa5,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
0x7e,0x00,0xa5,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_lg_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa5,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_lg_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa5,0xd4,0x7e,0xfa,0x01,0x20]
0x7e,0x01,0xa5,0xd4,0x7e,0xfa,0x01,0x20
+# GFX11: v_cmpx_lg_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa5,0xd4,0x7e,0xfa,0x01,0x20]
-# GFX11: v_cmpx_lg_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa5,0xd4,0x7c,0xe0,0x01,0x00]
0x7e,0x00,0xa5,0xd4,0x7c,0xe0,0x01,0x00
+# GFX11: v_cmpx_lg_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa5,0xd4,0x7c,0xe0,0x01,0x00]
-# GFX11: v_cmpx_lg_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa5,0xd4,0xc1,0x82,0x01,0x00]
0x7e,0x00,0xa5,0xd4,0xc1,0x82,0x01,0x00
+# GFX11: v_cmpx_lg_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa5,0xd4,0xc1,0x82,0x01,0x00]
-# GFX11: v_cmpx_lg_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa5,0xd4,0xf0,0xf8,0x00,0x00]
0x7e,0x00,0xa5,0xd4,0xf0,0xf8,0x00,0x00
+# GFX11: v_cmpx_lg_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa5,0xd4,0xf0,0xf8,0x00,0x00]
-# GFX11: v_cmpx_lg_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa5,0xd4,0xfd,0xfc,0x00,0x60]
0x7e,0x03,0xa5,0xd4,0xfd,0xfc,0x00,0x60
+# GFX11: v_cmpx_lg_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa5,0xd4,0xfd,0xfc,0x00,0x60]
-# GFX11: v_cmpx_lg_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa5,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
0x7e,0x82,0xa5,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_lg_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa5,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_lt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x81,0xd4,0x01,0x05,0x02,0x00]
0x7e,0x00,0x81,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_lt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x81,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_lt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x81,0xd4,0xff,0xff,0x03,0x00]
0x7e,0x00,0x81,0xd4,0xff,0xff,0x03,0x00
+# GFX11: v_cmpx_lt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x81,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_lt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x81,0xd4,0x01,0x04,0x00,0x00]
0x7e,0x00,0x81,0xd4,0x01,0x04,0x00,0x00
+# GFX11: v_cmpx_lt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x81,0xd4,0x01,0x04,0x00,0x00]
-# GFX11: v_cmpx_lt_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x81,0xd4,0x69,0xd2,0x00,0x00]
0x7e,0x00,0x81,0xd4,0x69,0xd2,0x00,0x00
+# GFX11: v_cmpx_lt_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x81,0xd4,0x69,0xd2,0x00,0x00]
-# GFX11: v_cmpx_lt_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x81,0xd4,0x6a,0xf6,0x00,0x00]
0x7e,0x00,0x81,0xd4,0x6a,0xf6,0x00,0x00
+# GFX11: v_cmpx_lt_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x81,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX11: v_cmpx_lt_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x81,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
0x7e,0x00,0x81,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_lt_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x81,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_lt_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x81,0xd4,0x7b,0xfa,0x01,0x00]
0x7e,0x00,0x81,0xd4,0x7b,0xfa,0x01,0x00
+# GFX11: v_cmpx_lt_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x81,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX11: v_cmpx_lt_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x81,0xd4,0x7d,0xe0,0x01,0x00]
0x7e,0x00,0x81,0xd4,0x7d,0xe0,0x01,0x00
+# GFX11: v_cmpx_lt_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x81,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX11: v_cmpx_lt_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x81,0xd4,0x7e,0x82,0x01,0x00]
0x7e,0x00,0x81,0xd4,0x7e,0x82,0x01,0x00
+# GFX11: v_cmpx_lt_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x81,0xd4,0x7e,0x82,0x01,0x00]
-# GFX11: v_cmpx_lt_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x81,0xd4,0x7f,0xf8,0x00,0x00]
0x7e,0x01,0x81,0xd4,0x7f,0xf8,0x00,0x00
+# GFX11: v_cmpx_lt_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x81,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX11: v_cmpx_lt_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x81,0xd4,0x7c,0xfc,0x00,0x00]
0x7e,0x00,0x81,0xd4,0x7c,0xfc,0x00,0x00
+# GFX11: v_cmpx_lt_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x81,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX11: v_cmpx_lt_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x81,0xd4,0xc1,0xfe,0x00,0x00]
0x7e,0x00,0x81,0xd4,0xc1,0xfe,0x00,0x00
+# GFX11: v_cmpx_lt_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x81,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX11: v_cmpx_lt_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x81,0xd4,0xf0,0xfa,0x00,0x40]
0x7e,0x00,0x81,0xd4,0xf0,0xfa,0x00,0x40
+# GFX11: v_cmpx_lt_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x81,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX11: v_cmpx_lt_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x81,0xd4,0xfd,0xd4,0x00,0x20]
0x7e,0x02,0x81,0xd4,0xfd,0xd4,0x00,0x20
+# GFX11: v_cmpx_lt_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x81,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX11: v_cmpx_lt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x81,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
0x7e,0x83,0x81,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_lt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x81,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_lt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x91,0xd4,0x01,0x05,0x02,0x00]
0x7e,0x00,0x91,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_lt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x91,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_lt_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x91,0xd4,0xff,0xff,0x03,0x00]
0x7e,0x00,0x91,0xd4,0xff,0xff,0x03,0x00
+# GFX11: v_cmpx_lt_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x91,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_lt_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x91,0xd4,0x01,0x04,0x00,0x00]
0x7e,0x00,0x91,0xd4,0x01,0x04,0x00,0x00
+# GFX11: v_cmpx_lt_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x91,0xd4,0x01,0x04,0x00,0x00]
-# GFX11: v_cmpx_lt_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x91,0xd4,0x69,0xd2,0x00,0x00]
0x7e,0x00,0x91,0xd4,0x69,0xd2,0x00,0x00
+# GFX11: v_cmpx_lt_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x91,0xd4,0x69,0xd2,0x00,0x00]
-# GFX11: v_cmpx_lt_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x91,0xd4,0x6a,0xf6,0x00,0x00]
0x7e,0x00,0x91,0xd4,0x6a,0xf6,0x00,0x00
+# GFX11: v_cmpx_lt_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x91,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX11: v_cmpx_lt_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x91,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
0x7e,0x00,0x91,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_lt_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x91,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_lt_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x91,0xd4,0x7b,0xfa,0x01,0x00]
0x7e,0x00,0x91,0xd4,0x7b,0xfa,0x01,0x00
+# GFX11: v_cmpx_lt_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x91,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX11: v_cmpx_lt_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x91,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0x91,0xd4,0x7d,0xe0,0x01,0x00 +# GFX11: v_cmpx_lt_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x91,0xd4,0x7d,0xe0,0x01,0x00] -# GFX11: v_cmpx_lt_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x91,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0x91,0xd4,0x7e,0x82,0x01,0x00 +# GFX11: v_cmpx_lt_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x91,0xd4,0x7e,0x82,0x01,0x00] -# GFX11: v_cmpx_lt_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x91,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x01,0x91,0xd4,0x7f,0xf8,0x00,0x00 +# GFX11: v_cmpx_lt_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x91,0xd4,0x7f,0xf8,0x00,0x00] -# GFX11: v_cmpx_lt_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x91,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0x91,0xd4,0x7c,0xfc,0x00,0x00 +# GFX11: v_cmpx_lt_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x91,0xd4,0x7c,0xfc,0x00,0x00] -# GFX11: v_cmpx_lt_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x91,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0x91,0xd4,0xc1,0xfe,0x00,0x00 +# GFX11: v_cmpx_lt_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x91,0xd4,0xc1,0xfe,0x00,0x00] -# GFX11: v_cmpx_lt_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x91,0xd4,0xf0,0xfa,0x00,0x40] 0x7e,0x00,0x91,0xd4,0xf0,0xfa,0x00,0x40 +# GFX11: v_cmpx_lt_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x91,0xd4,0xf0,0xfa,0x00,0x40] -# GFX11: v_cmpx_lt_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x91,0xd4,0xfd,0xd4,0x00,0x20] 0x7e,0x02,0x91,0xd4,0xfd,0xd4,0x00,0x20 +# GFX11: v_cmpx_lt_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x91,0xd4,0xfd,0xd4,0x00,0x20] -# GFX11: v_cmpx_lt_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x91,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] 0x7e,0x83,0x91,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_lt_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x91,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_lt_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa1,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xa1,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_lt_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa1,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_lt_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa1,0xd4,0xfe,0xfd,0x03,0x00] 0x7e,0x00,0xa1,0xd4,0xfe,0xfd,0x03,0x00 +# GFX11: v_cmpx_lt_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa1,0xd4,0xfe,0xfd,0x03,0x00] -# GFX11: v_cmpx_lt_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa1,0xd4,0x02,0x08,0x00,0x00] 0x7e,0x00,0xa1,0xd4,0x02,0x08,0x00,0x00 +# GFX11: v_cmpx_lt_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa1,0xd4,0x02,0x08,0x00,0x00] -# GFX11: v_cmpx_lt_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa1,0xd4,0x68,0xd0,0x00,0x00] 0x7e,0x00,0xa1,0xd4,0x68,0xd0,0x00,0x00 +# GFX11: v_cmpx_lt_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa1,0xd4,0x68,0xd0,0x00,0x00] -# GFX11: v_cmpx_lt_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa1,0xd4,0x6a,0xf4,0x00,0x00] 0x7e,0x00,0xa1,0xd4,0x6a,0xf4,0x00,0x00 +# GFX11: v_cmpx_lt_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa1,0xd4,0x6a,0xf4,0x00,0x00] -# GFX11: v_cmpx_lt_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa1,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xa1,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_lt_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa1,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_lt_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa1,0xd4,0x7e,0xfa,0x01,0x20] 
0x7e,0x01,0xa1,0xd4,0x7e,0xfa,0x01,0x20 +# GFX11: v_cmpx_lt_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa1,0xd4,0x7e,0xfa,0x01,0x20] -# GFX11: v_cmpx_lt_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa1,0xd4,0x7c,0xe0,0x01,0x00] 0x7e,0x00,0xa1,0xd4,0x7c,0xe0,0x01,0x00 +# GFX11: v_cmpx_lt_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa1,0xd4,0x7c,0xe0,0x01,0x00] -# GFX11: v_cmpx_lt_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa1,0xd4,0xc1,0x82,0x01,0x00] 0x7e,0x00,0xa1,0xd4,0xc1,0x82,0x01,0x00 +# GFX11: v_cmpx_lt_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa1,0xd4,0xc1,0x82,0x01,0x00] -# GFX11: v_cmpx_lt_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa1,0xd4,0xf0,0xf8,0x00,0x00] 0x7e,0x00,0xa1,0xd4,0xf0,0xf8,0x00,0x00 +# GFX11: v_cmpx_lt_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa1,0xd4,0xf0,0xf8,0x00,0x00] -# GFX11: v_cmpx_lt_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa1,0xd4,0xfd,0xfc,0x00,0x60] 0x7e,0x03,0xa1,0xd4,0xfd,0xfc,0x00,0x60 +# GFX11: v_cmpx_lt_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa1,0xd4,0xfd,0xfc,0x00,0x60] -# GFX11: v_cmpx_lt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa1,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7e,0x82,0xa1,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_lt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa1,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_lt_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb1,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xb1,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_lt_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb1,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_lt_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xb1,0xd4,0xff,0xff,0x03,0x00 +# GFX11: v_cmpx_lt_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xff,0x03,0x00] -# GFX11: v_cmpx_lt_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb1,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0xb1,0xd4,0x01,0x04,0x00,0x00 +# GFX11: v_cmpx_lt_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb1,0xd4,0x01,0x04,0x00,0x00] -# GFX11: v_cmpx_lt_i16_e64 s105, s105 ; encoding: [0x7e,0x00,0xb1,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0xb1,0xd4,0x69,0xd2,0x00,0x00 +# GFX11: v_cmpx_lt_i16_e64 s105, s105 ; encoding: [0x7e,0x00,0xb1,0xd4,0x69,0xd2,0x00,0x00] -# GFX11: v_cmpx_lt_i16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xb1,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0xb1,0xd4,0x6a,0xf6,0x00,0x00 +# GFX11: v_cmpx_lt_i16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xb1,0xd4,0x6a,0xf6,0x00,0x00] -# GFX11: v_cmpx_lt_i16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xb1,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0xb1,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_lt_i16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xb1,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_lt_i16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xb1,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0xb1,0xd4,0x7b,0xfa,0x01,0x00 +# GFX11: v_cmpx_lt_i16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xb1,0xd4,0x7b,0xfa,0x01,0x00] -# GFX11: v_cmpx_lt_i16_e64 m0, 0x3800 0x7e,0x00,0xb1,0xd4,0x7d,0xe0,0x01,0x00 +# GFX11: v_cmpx_lt_i16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xb1,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] -# GFX11: v_cmpx_lt_i16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xb1,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0xb1,0xd4,0x7e,0x82,0x01,0x00 +# GFX11: v_cmpx_lt_i16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xb1,0xd4,0x7e,0x82,0x01,0x00] -# GFX11: v_cmpx_lt_i16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xb1,0xd4,0x7f,0xf8,0x00,0x00] 
0x7e,0x00,0xb1,0xd4,0x7f,0xf8,0x00,0x00 +# GFX11: v_cmpx_lt_i16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xb1,0xd4,0x7f,0xf8,0x00,0x00] -# GFX11: v_cmpx_lt_i16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xb1,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0xb1,0xd4,0x7c,0xfc,0x00,0x00 +# GFX11: v_cmpx_lt_i16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xb1,0xd4,0x7c,0xfc,0x00,0x00] -# GFX11: v_cmpx_lt_i16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xb1,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0xb1,0xd4,0xc1,0xfe,0x00,0x00 +# GFX11: v_cmpx_lt_i16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xb1,0xd4,0xc1,0xfe,0x00,0x00] -# GFX11: v_cmpx_lt_i16_e64 0x3800, m0 0x7e,0x00,0xb1,0xd4,0xf0,0xfa,0x00,0x00 +# GFX11: v_cmpx_lt_i16_e64 0x3800, m0 ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] -# GFX11: v_cmpx_lt_i16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xb1,0xd4,0xfd,0xd4,0x00,0x00] 0x7e,0x00,0xb1,0xd4,0xfd,0xd4,0x00,0x00 +# GFX11: v_cmpx_lt_i16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xb1,0xd4,0xfd,0xd4,0x00,0x00] -# GFX11: v_cmpx_lt_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0xb1,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_lt_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_lt_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc1,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xc1,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_lt_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc1,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_lt_i32_e64 v255, v255 ; encoding: [0x7e,0x00,0xc1,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xc1,0xd4,0xff,0xff,0x03,0x00 +# GFX11: v_cmpx_lt_i32_e64 v255, v255 ; encoding: [0x7e,0x00,0xc1,0xd4,0xff,0xff,0x03,0x00] -# GFX11: v_cmpx_lt_i32_e64 s1, s2 ; encoding: [0x7e,0x00,0xc1,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0xc1,0xd4,0x01,0x04,0x00,0x00 +# GFX11: v_cmpx_lt_i32_e64 s1, s2 ; encoding: [0x7e,0x00,0xc1,0xd4,0x01,0x04,0x00,0x00] -# GFX11: v_cmpx_lt_i32_e64 s105, s105 ; encoding: [0x7e,0x00,0xc1,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0xc1,0xd4,0x69,0xd2,0x00,0x00 +# GFX11: v_cmpx_lt_i32_e64 s105, s105 ; encoding: [0x7e,0x00,0xc1,0xd4,0x69,0xd2,0x00,0x00] -# GFX11: v_cmpx_lt_i32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xc1,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0xc1,0xd4,0x6a,0xf6,0x00,0x00 +# GFX11: v_cmpx_lt_i32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xc1,0xd4,0x6a,0xf6,0x00,0x00] -# GFX11: v_cmpx_lt_i32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xc1,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xc1,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_lt_i32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xc1,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_lt_i32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xc1,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0xc1,0xd4,0x7b,0xfa,0x01,0x00 +# GFX11: v_cmpx_lt_i32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xc1,0xd4,0x7b,0xfa,0x01,0x00] -# GFX11: v_cmpx_lt_i32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xc1,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0xc1,0xd4,0x7d,0xe0,0x01,0x00 +# GFX11: v_cmpx_lt_i32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xc1,0xd4,0x7d,0xe0,0x01,0x00] -# GFX11: v_cmpx_lt_i32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xc1,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0xc1,0xd4,0x7e,0x82,0x01,0x00 +# GFX11: v_cmpx_lt_i32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xc1,0xd4,0x7e,0x82,0x01,0x00] -# GFX11: v_cmpx_lt_i32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xc1,0xd4,0x7f,0xf8,0x00,0x00] 
0x7e,0x00,0xc1,0xd4,0x7f,0xf8,0x00,0x00
+# GFX11: v_cmpx_lt_i32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xc1,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX11: v_cmpx_lt_i32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xc1,0xd4,0x7c,0xfc,0x00,0x00]
0x7e,0x00,0xc1,0xd4,0x7c,0xfc,0x00,0x00
+# GFX11: v_cmpx_lt_i32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xc1,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX11: v_cmpx_lt_i32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xc1,0xd4,0xc1,0xfe,0x00,0x00]
0x7e,0x00,0xc1,0xd4,0xc1,0xfe,0x00,0x00
+# GFX11: v_cmpx_lt_i32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xc1,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX11: v_cmpx_lt_i32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xc1,0xd4,0xf0,0xfa,0x00,0x00]
0x7e,0x00,0xc1,0xd4,0xf0,0xfa,0x00,0x00
+# GFX11: v_cmpx_lt_i32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xc1,0xd4,0xf0,0xfa,0x00,0x00]
-# GFX11: v_cmpx_lt_i32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xc1,0xd4,0xfd,0xd4,0x00,0x00]
0x7e,0x00,0xc1,0xd4,0xfd,0xd4,0x00,0x00
+# GFX11: v_cmpx_lt_i32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xc1,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX11: v_cmpx_lt_i32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xc1,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
0x7e,0x00,0xc1,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_lt_i32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xc1,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_lt_i64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xd1,0xd4,0x01,0x05,0x02,0x00]
0x7e,0x00,0xd1,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_lt_i64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xd1,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_lt_i64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xd1,0xd4,0xfe,0xfd,0x03,0x00]
0x7e,0x00,0xd1,0xd4,0xfe,0xfd,0x03,0x00
+# GFX11: v_cmpx_lt_i64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xd1,0xd4,0xfe,0xfd,0x03,0x00]
-# GFX11: v_cmpx_lt_i64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xd1,0xd4,0x02,0x08,0x00,0x00]
0x7e,0x00,0xd1,0xd4,0x02,0x08,0x00,0x00
+# GFX11: v_cmpx_lt_i64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xd1,0xd4,0x02,0x08,0x00,0x00]
-# GFX11: v_cmpx_lt_i64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xd1,0xd4,0x68,0xd0,0x00,0x00]
0x7e,0x00,0xd1,0xd4,0x68,0xd0,0x00,0x00
+# GFX11: v_cmpx_lt_i64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xd1,0xd4,0x68,0xd0,0x00,0x00]
-# GFX11: v_cmpx_lt_i64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xd1,0xd4,0x6a,0xf4,0x00,0x00]
0x7e,0x00,0xd1,0xd4,0x6a,0xf4,0x00,0x00
+# GFX11: v_cmpx_lt_i64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xd1,0xd4,0x6a,0xf4,0x00,0x00]
-# GFX11: v_cmpx_lt_i64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xd1,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
0x7e,0x00,0xd1,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_lt_i64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xd1,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_lt_i64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xd1,0xd4,0x7e,0xfa,0x01,0x00]
0x7e,0x00,0xd1,0xd4,0x7e,0xfa,0x01,0x00
+# GFX11: v_cmpx_lt_i64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xd1,0xd4,0x7e,0xfa,0x01,0x00]
-# GFX11: v_cmpx_lt_i64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xd1,0xd4,0x7c,0xe0,0x01,0x00]
0x7e,0x00,0xd1,0xd4,0x7c,0xe0,0x01,0x00
+# GFX11: v_cmpx_lt_i64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xd1,0xd4,0x7c,0xe0,0x01,0x00]
-# GFX11: v_cmpx_lt_i64_e64 -1, -1 ; encoding: [0x7e,0x00,0xd1,0xd4,0xc1,0x82,0x01,0x00]
0x7e,0x00,0xd1,0xd4,0xc1,0x82,0x01,0x00
+# GFX11: v_cmpx_lt_i64_e64 -1, -1 ; encoding: [0x7e,0x00,0xd1,0xd4,0xc1,0x82,0x01,0x00]
-# GFX11: v_cmpx_lt_i64_e64 0.5, null ; encoding: [0x7e,0x00,0xd1,0xd4,0xf0,0xf8,0x00,0x00]
0x7e,0x00,0xd1,0xd4,0xf0,0xf8,0x00,0x00
+# GFX11: v_cmpx_lt_i64_e64 0.5, null ; encoding: [0x7e,0x00,0xd1,0xd4,0xf0,0xf8,0x00,0x00]
-# GFX11: v_cmpx_lt_i64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xd1,0xd4,0xfd,0xfc,0x00,0x00]
0x7e,0x00,0xd1,0xd4,0xfd,0xfc,0x00,0x00
+# GFX11: v_cmpx_lt_i64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xd1,0xd4,0xfd,0xfc,0x00,0x00]
-# GFX11: v_cmpx_lt_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd1,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
0x7e,0x00,0xd1,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_lt_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd1,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_lt_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb9,0xd4,0x01,0x05,0x02,0x00]
0x7e,0x00,0xb9,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_lt_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb9,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_lt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xff,0x03,0x00]
0x7e,0x00,0xb9,0xd4,0xff,0xff,0x03,0x00
+# GFX11: v_cmpx_lt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_lt_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb9,0xd4,0x01,0x04,0x00,0x00]
0x7e,0x00,0xb9,0xd4,0x01,0x04,0x00,0x00
+# GFX11: v_cmpx_lt_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb9,0xd4,0x01,0x04,0x00,0x00]
-# GFX11: v_cmpx_lt_u16_e64 s105, s105 ; encoding: [0x7e,0x00,0xb9,0xd4,0x69,0xd2,0x00,0x00]
0x7e,0x00,0xb9,0xd4,0x69,0xd2,0x00,0x00
+# GFX11: v_cmpx_lt_u16_e64 s105, s105 ; encoding: [0x7e,0x00,0xb9,0xd4,0x69,0xd2,0x00,0x00]
-# GFX11: v_cmpx_lt_u16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xb9,0xd4,0x6a,0xf6,0x00,0x00]
0x7e,0x00,0xb9,0xd4,0x6a,0xf6,0x00,0x00
+# GFX11: v_cmpx_lt_u16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xb9,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX11: v_cmpx_lt_u16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xb9,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
0x7e,0x00,0xb9,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_lt_u16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xb9,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_lt_u16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xb9,0xd4,0x7b,0xfa,0x01,0x00]
0x7e,0x00,0xb9,0xd4,0x7b,0xfa,0x01,0x00
+# GFX11: v_cmpx_lt_u16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xb9,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX11: v_cmpx_lt_u16_e64 m0, 0x3800
0x7e,0x00,0xb9,0xd4,0x7d,0xe0,0x01,0x00
+# GFX11: v_cmpx_lt_u16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xb9,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# GFX11: v_cmpx_lt_u16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xb9,0xd4,0x7e,0x82,0x01,0x00]
0x7e,0x00,0xb9,0xd4,0x7e,0x82,0x01,0x00
+# GFX11: v_cmpx_lt_u16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xb9,0xd4,0x7e,0x82,0x01,0x00]
-# GFX11: v_cmpx_lt_u16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xb9,0xd4,0x7f,0xf8,0x00,0x00]
0x7e,0x00,0xb9,0xd4,0x7f,0xf8,0x00,0x00
+# GFX11: v_cmpx_lt_u16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xb9,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX11: v_cmpx_lt_u16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xb9,0xd4,0x7c,0xfc,0x00,0x00]
0x7e,0x00,0xb9,0xd4,0x7c,0xfc,0x00,0x00
+# GFX11: v_cmpx_lt_u16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xb9,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX11: v_cmpx_lt_u16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xb9,0xd4,0xc1,0xfe,0x00,0x00]
0x7e,0x00,0xb9,0xd4,0xc1,0xfe,0x00,0x00
+# GFX11: v_cmpx_lt_u16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xb9,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX11: v_cmpx_lt_u16_e64 0x3800, m0
0x7e,0x00,0xb9,0xd4,0xf0,0xfa,0x00,0x00 +# GFX11: v_cmpx_lt_u16_e64 0x3800, m0 ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] -# GFX11: v_cmpx_lt_u16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xb9,0xd4,0xfd,0xd4,0x00,0x00] 0x7e,0x00,0xb9,0xd4,0xfd,0xd4,0x00,0x00 +# GFX11: v_cmpx_lt_u16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xb9,0xd4,0xfd,0xd4,0x00,0x00] -# GFX11: v_cmpx_lt_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0xb9,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_lt_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_lt_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc9,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xc9,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_lt_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc9,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_lt_u32_e64 v255, v255 ; encoding: [0x7e,0x00,0xc9,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xc9,0xd4,0xff,0xff,0x03,0x00 +# GFX11: v_cmpx_lt_u32_e64 v255, v255 ; encoding: [0x7e,0x00,0xc9,0xd4,0xff,0xff,0x03,0x00] -# GFX11: v_cmpx_lt_u32_e64 s1, s2 ; encoding: [0x7e,0x00,0xc9,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0xc9,0xd4,0x01,0x04,0x00,0x00 +# GFX11: v_cmpx_lt_u32_e64 s1, s2 ; encoding: [0x7e,0x00,0xc9,0xd4,0x01,0x04,0x00,0x00] -# GFX11: v_cmpx_lt_u32_e64 s105, s105 ; encoding: [0x7e,0x00,0xc9,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0xc9,0xd4,0x69,0xd2,0x00,0x00 +# GFX11: v_cmpx_lt_u32_e64 s105, s105 ; encoding: [0x7e,0x00,0xc9,0xd4,0x69,0xd2,0x00,0x00] -# GFX11: v_cmpx_lt_u32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xc9,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0xc9,0xd4,0x6a,0xf6,0x00,0x00 +# GFX11: v_cmpx_lt_u32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xc9,0xd4,0x6a,0xf6,0x00,0x00] -# GFX11: v_cmpx_lt_u32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xc9,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xc9,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_lt_u32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xc9,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_lt_u32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xc9,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0xc9,0xd4,0x7b,0xfa,0x01,0x00 +# GFX11: v_cmpx_lt_u32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xc9,0xd4,0x7b,0xfa,0x01,0x00] -# GFX11: v_cmpx_lt_u32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xc9,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0xc9,0xd4,0x7d,0xe0,0x01,0x00 +# GFX11: v_cmpx_lt_u32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xc9,0xd4,0x7d,0xe0,0x01,0x00] -# GFX11: v_cmpx_lt_u32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xc9,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0xc9,0xd4,0x7e,0x82,0x01,0x00 +# GFX11: v_cmpx_lt_u32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xc9,0xd4,0x7e,0x82,0x01,0x00] -# GFX11: v_cmpx_lt_u32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xc9,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x00,0xc9,0xd4,0x7f,0xf8,0x00,0x00 +# GFX11: v_cmpx_lt_u32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xc9,0xd4,0x7f,0xf8,0x00,0x00] -# GFX11: v_cmpx_lt_u32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xc9,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0xc9,0xd4,0x7c,0xfc,0x00,0x00 +# GFX11: v_cmpx_lt_u32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xc9,0xd4,0x7c,0xfc,0x00,0x00] -# GFX11: v_cmpx_lt_u32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xc9,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0xc9,0xd4,0xc1,0xfe,0x00,0x00 +# GFX11: v_cmpx_lt_u32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xc9,0xd4,0xc1,0xfe,0x00,0x00] -# GFX11: v_cmpx_lt_u32_e64 0.5, m0 ; encoding: 
[0x7e,0x00,0xc9,0xd4,0xf0,0xfa,0x00,0x00] 0x7e,0x00,0xc9,0xd4,0xf0,0xfa,0x00,0x00 +# GFX11: v_cmpx_lt_u32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xc9,0xd4,0xf0,0xfa,0x00,0x00] -# GFX11: v_cmpx_lt_u32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xc9,0xd4,0xfd,0xd4,0x00,0x00] 0x7e,0x00,0xc9,0xd4,0xfd,0xd4,0x00,0x00 +# GFX11: v_cmpx_lt_u32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xc9,0xd4,0xfd,0xd4,0x00,0x00] -# GFX11: v_cmpx_lt_u32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xc9,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xc9,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_lt_u32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xc9,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_lt_u64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xd9,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xd9,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_lt_u64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xd9,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_lt_u64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xd9,0xd4,0xfe,0xfd,0x03,0x00] 0x7e,0x00,0xd9,0xd4,0xfe,0xfd,0x03,0x00 +# GFX11: v_cmpx_lt_u64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xd9,0xd4,0xfe,0xfd,0x03,0x00] -# GFX11: v_cmpx_lt_u64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xd9,0xd4,0x02,0x08,0x00,0x00] 0x7e,0x00,0xd9,0xd4,0x02,0x08,0x00,0x00 +# GFX11: v_cmpx_lt_u64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xd9,0xd4,0x02,0x08,0x00,0x00] -# GFX11: v_cmpx_lt_u64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xd9,0xd4,0x68,0xd0,0x00,0x00] 0x7e,0x00,0xd9,0xd4,0x68,0xd0,0x00,0x00 +# GFX11: v_cmpx_lt_u64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xd9,0xd4,0x68,0xd0,0x00,0x00] -# GFX11: v_cmpx_lt_u64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xd9,0xd4,0x6a,0xf4,0x00,0x00] 0x7e,0x00,0xd9,0xd4,0x6a,0xf4,0x00,0x00 +# GFX11: v_cmpx_lt_u64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xd9,0xd4,0x6a,0xf4,0x00,0x00] -# GFX11: v_cmpx_lt_u64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xd9,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xd9,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_lt_u64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xd9,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_lt_u64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xd9,0xd4,0x7e,0xfa,0x01,0x00] 0x7e,0x00,0xd9,0xd4,0x7e,0xfa,0x01,0x00 +# GFX11: v_cmpx_lt_u64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xd9,0xd4,0x7e,0xfa,0x01,0x00] -# GFX11: v_cmpx_lt_u64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xd9,0xd4,0x7c,0xe0,0x01,0x00] 0x7e,0x00,0xd9,0xd4,0x7c,0xe0,0x01,0x00 +# GFX11: v_cmpx_lt_u64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xd9,0xd4,0x7c,0xe0,0x01,0x00] -# GFX11: v_cmpx_lt_u64_e64 -1, -1 ; encoding: [0x7e,0x00,0xd9,0xd4,0xc1,0x82,0x01,0x00] 0x7e,0x00,0xd9,0xd4,0xc1,0x82,0x01,0x00 +# GFX11: v_cmpx_lt_u64_e64 -1, -1 ; encoding: [0x7e,0x00,0xd9,0xd4,0xc1,0x82,0x01,0x00] -# GFX11: v_cmpx_lt_u64_e64 0.5, null ; encoding: [0x7e,0x00,0xd9,0xd4,0xf0,0xf8,0x00,0x00] 0x7e,0x00,0xd9,0xd4,0xf0,0xf8,0x00,0x00 +# GFX11: v_cmpx_lt_u64_e64 0.5, null ; encoding: [0x7e,0x00,0xd9,0xd4,0xf0,0xf8,0x00,0x00] -# GFX11: v_cmpx_lt_u64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xd9,0xd4,0xfd,0xfc,0x00,0x00] 0x7e,0x00,0xd9,0xd4,0xfd,0xfc,0x00,0x00 +# GFX11: v_cmpx_lt_u64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xd9,0xd4,0xfd,0xfc,0x00,0x00] -# GFX11: v_cmpx_lt_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd9,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xd9,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX11: 
v_cmpx_lt_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd9,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_ne_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb5,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xb5,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_ne_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb5,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_ne_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xb5,0xd4,0xff,0xff,0x03,0x00 +# GFX11: v_cmpx_ne_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xff,0x03,0x00] -# GFX11: v_cmpx_ne_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb5,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0xb5,0xd4,0x01,0x04,0x00,0x00 +# GFX11: v_cmpx_ne_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb5,0xd4,0x01,0x04,0x00,0x00] -# GFX11: v_cmpx_ne_i16_e64 s105, s105 ; encoding: [0x7e,0x00,0xb5,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0xb5,0xd4,0x69,0xd2,0x00,0x00 +# GFX11: v_cmpx_ne_i16_e64 s105, s105 ; encoding: [0x7e,0x00,0xb5,0xd4,0x69,0xd2,0x00,0x00] -# GFX11: v_cmpx_ne_i16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xb5,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0xb5,0xd4,0x6a,0xf6,0x00,0x00 +# GFX11: v_cmpx_ne_i16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xb5,0xd4,0x6a,0xf6,0x00,0x00] -# GFX11: v_cmpx_ne_i16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xb5,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0xb5,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_ne_i16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xb5,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_ne_i16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xb5,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0xb5,0xd4,0x7b,0xfa,0x01,0x00 +# GFX11: v_cmpx_ne_i16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xb5,0xd4,0x7b,0xfa,0x01,0x00] -# GFX11: v_cmpx_ne_i16_e64 m0, 0x3800 0x7e,0x00,0xb5,0xd4,0x7d,0xe0,0x01,0x00 +# GFX11: v_cmpx_ne_i16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xb5,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] -# GFX11: v_cmpx_ne_i16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xb5,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0xb5,0xd4,0x7e,0x82,0x01,0x00 +# GFX11: v_cmpx_ne_i16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xb5,0xd4,0x7e,0x82,0x01,0x00] -# GFX11: v_cmpx_ne_i16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xb5,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x00,0xb5,0xd4,0x7f,0xf8,0x00,0x00 +# GFX11: v_cmpx_ne_i16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xb5,0xd4,0x7f,0xf8,0x00,0x00] -# GFX11: v_cmpx_ne_i16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xb5,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0xb5,0xd4,0x7c,0xfc,0x00,0x00 +# GFX11: v_cmpx_ne_i16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xb5,0xd4,0x7c,0xfc,0x00,0x00] -# GFX11: v_cmpx_ne_i16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xb5,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0xb5,0xd4,0xc1,0xfe,0x00,0x00 +# GFX11: v_cmpx_ne_i16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xb5,0xd4,0xc1,0xfe,0x00,0x00] -# GFX11: v_cmpx_ne_i16_e64 0x3800, m0 0x7e,0x00,0xb5,0xd4,0xf0,0xfa,0x00,0x00 +# GFX11: v_cmpx_ne_i16_e64 0x3800, m0 ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] -# GFX11: v_cmpx_ne_i16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xb5,0xd4,0xfd,0xd4,0x00,0x00] 0x7e,0x00,0xb5,0xd4,0xfd,0xd4,0x00,0x00 +# GFX11: v_cmpx_ne_i16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xb5,0xd4,0xfd,0xd4,0x00,0x00] -# GFX11: v_cmpx_ne_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0xb5,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_ne_i16_e64 0xfe0b, vcc_hi ; encoding: 
[0x7e,0x00,0xb5,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_ne_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc5,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xc5,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_ne_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc5,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_ne_i32_e64 v255, v255 ; encoding: [0x7e,0x00,0xc5,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xc5,0xd4,0xff,0xff,0x03,0x00 +# GFX11: v_cmpx_ne_i32_e64 v255, v255 ; encoding: [0x7e,0x00,0xc5,0xd4,0xff,0xff,0x03,0x00] -# GFX11: v_cmpx_ne_i32_e64 s1, s2 ; encoding: [0x7e,0x00,0xc5,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0xc5,0xd4,0x01,0x04,0x00,0x00 +# GFX11: v_cmpx_ne_i32_e64 s1, s2 ; encoding: [0x7e,0x00,0xc5,0xd4,0x01,0x04,0x00,0x00] -# GFX11: v_cmpx_ne_i32_e64 s105, s105 ; encoding: [0x7e,0x00,0xc5,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0xc5,0xd4,0x69,0xd2,0x00,0x00 +# GFX11: v_cmpx_ne_i32_e64 s105, s105 ; encoding: [0x7e,0x00,0xc5,0xd4,0x69,0xd2,0x00,0x00] -# GFX11: v_cmpx_ne_i32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xc5,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0xc5,0xd4,0x6a,0xf6,0x00,0x00 +# GFX11: v_cmpx_ne_i32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xc5,0xd4,0x6a,0xf6,0x00,0x00] -# GFX11: v_cmpx_ne_i32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xc5,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xc5,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_ne_i32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xc5,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_ne_i32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xc5,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0xc5,0xd4,0x7b,0xfa,0x01,0x00 +# GFX11: v_cmpx_ne_i32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xc5,0xd4,0x7b,0xfa,0x01,0x00] -# GFX11: v_cmpx_ne_i32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xc5,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0xc5,0xd4,0x7d,0xe0,0x01,0x00 +# GFX11: v_cmpx_ne_i32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xc5,0xd4,0x7d,0xe0,0x01,0x00] -# GFX11: v_cmpx_ne_i32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xc5,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0xc5,0xd4,0x7e,0x82,0x01,0x00 +# GFX11: v_cmpx_ne_i32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xc5,0xd4,0x7e,0x82,0x01,0x00] -# GFX11: v_cmpx_ne_i32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xc5,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x00,0xc5,0xd4,0x7f,0xf8,0x00,0x00 +# GFX11: v_cmpx_ne_i32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xc5,0xd4,0x7f,0xf8,0x00,0x00] -# GFX11: v_cmpx_ne_i32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xc5,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0xc5,0xd4,0x7c,0xfc,0x00,0x00 +# GFX11: v_cmpx_ne_i32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xc5,0xd4,0x7c,0xfc,0x00,0x00] -# GFX11: v_cmpx_ne_i32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xc5,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0xc5,0xd4,0xc1,0xfe,0x00,0x00 +# GFX11: v_cmpx_ne_i32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xc5,0xd4,0xc1,0xfe,0x00,0x00] -# GFX11: v_cmpx_ne_i32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xc5,0xd4,0xf0,0xfa,0x00,0x00] 0x7e,0x00,0xc5,0xd4,0xf0,0xfa,0x00,0x00 +# GFX11: v_cmpx_ne_i32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xc5,0xd4,0xf0,0xfa,0x00,0x00] -# GFX11: v_cmpx_ne_i32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xc5,0xd4,0xfd,0xd4,0x00,0x00] 0x7e,0x00,0xc5,0xd4,0xfd,0xd4,0x00,0x00 +# GFX11: v_cmpx_ne_i32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xc5,0xd4,0xfd,0xd4,0x00,0x00] -# GFX11: v_cmpx_ne_i32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xc5,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xc5,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_ne_i32_e64 0xaf123456, vcc_hi ; 
encoding: [0x7e,0x00,0xc5,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_ne_i64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xd5,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xd5,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_ne_i64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xd5,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_ne_i64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xd5,0xd4,0xfe,0xfd,0x03,0x00] 0x7e,0x00,0xd5,0xd4,0xfe,0xfd,0x03,0x00 +# GFX11: v_cmpx_ne_i64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xd5,0xd4,0xfe,0xfd,0x03,0x00] -# GFX11: v_cmpx_ne_i64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xd5,0xd4,0x02,0x08,0x00,0x00] 0x7e,0x00,0xd5,0xd4,0x02,0x08,0x00,0x00 +# GFX11: v_cmpx_ne_i64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xd5,0xd4,0x02,0x08,0x00,0x00] -# GFX11: v_cmpx_ne_i64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xd5,0xd4,0x68,0xd0,0x00,0x00] 0x7e,0x00,0xd5,0xd4,0x68,0xd0,0x00,0x00 +# GFX11: v_cmpx_ne_i64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xd5,0xd4,0x68,0xd0,0x00,0x00] -# GFX11: v_cmpx_ne_i64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xd5,0xd4,0x6a,0xf4,0x00,0x00] 0x7e,0x00,0xd5,0xd4,0x6a,0xf4,0x00,0x00 +# GFX11: v_cmpx_ne_i64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xd5,0xd4,0x6a,0xf4,0x00,0x00] -# GFX11: v_cmpx_ne_i64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xd5,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xd5,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_ne_i64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xd5,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_ne_i64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xd5,0xd4,0x7e,0xfa,0x01,0x00] 0x7e,0x00,0xd5,0xd4,0x7e,0xfa,0x01,0x00 +# GFX11: v_cmpx_ne_i64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xd5,0xd4,0x7e,0xfa,0x01,0x00] -# GFX11: v_cmpx_ne_i64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xd5,0xd4,0x7c,0xe0,0x01,0x00] 0x7e,0x00,0xd5,0xd4,0x7c,0xe0,0x01,0x00 +# GFX11: v_cmpx_ne_i64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xd5,0xd4,0x7c,0xe0,0x01,0x00] -# GFX11: v_cmpx_ne_i64_e64 -1, -1 ; encoding: [0x7e,0x00,0xd5,0xd4,0xc1,0x82,0x01,0x00] 0x7e,0x00,0xd5,0xd4,0xc1,0x82,0x01,0x00 +# GFX11: v_cmpx_ne_i64_e64 -1, -1 ; encoding: [0x7e,0x00,0xd5,0xd4,0xc1,0x82,0x01,0x00] -# GFX11: v_cmpx_ne_i64_e64 0.5, null ; encoding: [0x7e,0x00,0xd5,0xd4,0xf0,0xf8,0x00,0x00] 0x7e,0x00,0xd5,0xd4,0xf0,0xf8,0x00,0x00 +# GFX11: v_cmpx_ne_i64_e64 0.5, null ; encoding: [0x7e,0x00,0xd5,0xd4,0xf0,0xf8,0x00,0x00] -# GFX11: v_cmpx_ne_i64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xd5,0xd4,0xfd,0xfc,0x00,0x00] 0x7e,0x00,0xd5,0xd4,0xfd,0xfc,0x00,0x00 +# GFX11: v_cmpx_ne_i64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xd5,0xd4,0xfd,0xfc,0x00,0x00] -# GFX11: v_cmpx_ne_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd5,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xd5,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_ne_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd5,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_ne_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbd,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xbd,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_ne_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbd,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_ne_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xbd,0xd4,0xff,0xff,0x03,0x00 +# GFX11: v_cmpx_ne_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xff,0x03,0x00] -# GFX11: v_cmpx_ne_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbd,0xd4,0x01,0x04,0x00,0x00] 
0x7e,0x00,0xbd,0xd4,0x01,0x04,0x00,0x00
+# GFX11: v_cmpx_ne_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbd,0xd4,0x01,0x04,0x00,0x00]
-# GFX11: v_cmpx_ne_u16_e64 s105, s105 ; encoding: [0x7e,0x00,0xbd,0xd4,0x69,0xd2,0x00,0x00]
0x7e,0x00,0xbd,0xd4,0x69,0xd2,0x00,0x00
+# GFX11: v_cmpx_ne_u16_e64 s105, s105 ; encoding: [0x7e,0x00,0xbd,0xd4,0x69,0xd2,0x00,0x00]
-# GFX11: v_cmpx_ne_u16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xbd,0xd4,0x6a,0xf6,0x00,0x00]
0x7e,0x00,0xbd,0xd4,0x6a,0xf6,0x00,0x00
+# GFX11: v_cmpx_ne_u16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xbd,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX11: v_cmpx_ne_u16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xbd,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
0x7e,0x00,0xbd,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_ne_u16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xbd,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_ne_u16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xbd,0xd4,0x7b,0xfa,0x01,0x00]
0x7e,0x00,0xbd,0xd4,0x7b,0xfa,0x01,0x00
+# GFX11: v_cmpx_ne_u16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xbd,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX11: v_cmpx_ne_u16_e64 m0, 0x3800
0x7e,0x00,0xbd,0xd4,0x7d,0xe0,0x01,0x00
+# GFX11: v_cmpx_ne_u16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xbd,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# GFX11: v_cmpx_ne_u16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xbd,0xd4,0x7e,0x82,0x01,0x00]
0x7e,0x00,0xbd,0xd4,0x7e,0x82,0x01,0x00
+# GFX11: v_cmpx_ne_u16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xbd,0xd4,0x7e,0x82,0x01,0x00]
-# GFX11: v_cmpx_ne_u16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xbd,0xd4,0x7f,0xf8,0x00,0x00]
0x7e,0x00,0xbd,0xd4,0x7f,0xf8,0x00,0x00
+# GFX11: v_cmpx_ne_u16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xbd,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX11: v_cmpx_ne_u16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xbd,0xd4,0x7c,0xfc,0x00,0x00]
0x7e,0x00,0xbd,0xd4,0x7c,0xfc,0x00,0x00
+# GFX11: v_cmpx_ne_u16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xbd,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX11: v_cmpx_ne_u16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xbd,0xd4,0xc1,0xfe,0x00,0x00]
0x7e,0x00,0xbd,0xd4,0xc1,0xfe,0x00,0x00
+# GFX11: v_cmpx_ne_u16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xbd,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX11: v_cmpx_ne_u16_e64 0x3800, m0
0x7e,0x00,0xbd,0xd4,0xf0,0xfa,0x00,0x00
+# GFX11: v_cmpx_ne_u16_e64 0x3800, m0 ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# GFX11: v_cmpx_ne_u16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xbd,0xd4,0xfd,0xd4,0x00,0x00]
0x7e,0x00,0xbd,0xd4,0xfd,0xd4,0x00,0x00
+# GFX11: v_cmpx_ne_u16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xbd,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX11: v_cmpx_ne_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
0x7e,0x00,0xbd,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_ne_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_ne_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcd,0xd4,0x01,0x05,0x02,0x00]
0x7e,0x00,0xcd,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_ne_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcd,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_ne_u32_e64 v255, v255 ; encoding: [0x7e,0x00,0xcd,0xd4,0xff,0xff,0x03,0x00]
0x7e,0x00,0xcd,0xd4,0xff,0xff,0x03,0x00
+# GFX11: v_cmpx_ne_u32_e64 v255, v255 ; encoding: [0x7e,0x00,0xcd,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_ne_u32_e64 s1, s2 ; encoding: [0x7e,0x00,0xcd,0xd4,0x01,0x04,0x00,0x00]
0x7e,0x00,0xcd,0xd4,0x01,0x04,0x00,0x00
+# GFX11: v_cmpx_ne_u32_e64 s1, s2 ; encoding: [0x7e,0x00,0xcd,0xd4,0x01,0x04,0x00,0x00]
-# GFX11: v_cmpx_ne_u32_e64 s105, s105 ; encoding: [0x7e,0x00,0xcd,0xd4,0x69,0xd2,0x00,0x00]
0x7e,0x00,0xcd,0xd4,0x69,0xd2,0x00,0x00
+# GFX11: v_cmpx_ne_u32_e64 s105, s105 ; encoding: [0x7e,0x00,0xcd,0xd4,0x69,0xd2,0x00,0x00]
-# GFX11: v_cmpx_ne_u32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xcd,0xd4,0x6a,0xf6,0x00,0x00]
0x7e,0x00,0xcd,0xd4,0x6a,0xf6,0x00,0x00
+# GFX11: v_cmpx_ne_u32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xcd,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX11: v_cmpx_ne_u32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xcd,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
0x7e,0x00,0xcd,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_ne_u32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xcd,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_ne_u32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xcd,0xd4,0x7b,0xfa,0x01,0x00]
0x7e,0x00,0xcd,0xd4,0x7b,0xfa,0x01,0x00
+# GFX11: v_cmpx_ne_u32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xcd,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX11: v_cmpx_ne_u32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xcd,0xd4,0x7d,0xe0,0x01,0x00]
0x7e,0x00,0xcd,0xd4,0x7d,0xe0,0x01,0x00
+# GFX11: v_cmpx_ne_u32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xcd,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX11: v_cmpx_ne_u32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xcd,0xd4,0x7e,0x82,0x01,0x00]
0x7e,0x00,0xcd,0xd4,0x7e,0x82,0x01,0x00
+# GFX11: v_cmpx_ne_u32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xcd,0xd4,0x7e,0x82,0x01,0x00]
-# GFX11: v_cmpx_ne_u32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xcd,0xd4,0x7f,0xf8,0x00,0x00]
0x7e,0x00,0xcd,0xd4,0x7f,0xf8,0x00,0x00
+# GFX11: v_cmpx_ne_u32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xcd,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX11: v_cmpx_ne_u32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xcd,0xd4,0x7c,0xfc,0x00,0x00]
0x7e,0x00,0xcd,0xd4,0x7c,0xfc,0x00,0x00
+# GFX11: v_cmpx_ne_u32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xcd,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX11: v_cmpx_ne_u32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xcd,0xd4,0xc1,0xfe,0x00,0x00]
0x7e,0x00,0xcd,0xd4,0xc1,0xfe,0x00,0x00
+# GFX11: v_cmpx_ne_u32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xcd,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX11: v_cmpx_ne_u32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xcd,0xd4,0xf0,0xfa,0x00,0x00]
0x7e,0x00,0xcd,0xd4,0xf0,0xfa,0x00,0x00
+# GFX11: v_cmpx_ne_u32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xcd,0xd4,0xf0,0xfa,0x00,0x00]
-# GFX11: v_cmpx_ne_u32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xcd,0xd4,0xfd,0xd4,0x00,0x00]
0x7e,0x00,0xcd,0xd4,0xfd,0xd4,0x00,0x00
+# GFX11: v_cmpx_ne_u32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xcd,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX11: v_cmpx_ne_u32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xcd,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
0x7e,0x00,0xcd,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_ne_u32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xcd,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_ne_u64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xdd,0xd4,0x01,0x05,0x02,0x00]
0x7e,0x00,0xdd,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_ne_u64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xdd,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_ne_u64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xdd,0xd4,0xfe,0xfd,0x03,0x00]
0x7e,0x00,0xdd,0xd4,0xfe,0xfd,0x03,0x00
+# GFX11: v_cmpx_ne_u64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xdd,0xd4,0xfe,0xfd,0x03,0x00]
-# GFX11: v_cmpx_ne_u64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xdd,0xd4,0x02,0x08,0x00,0x00]
0x7e,0x00,0xdd,0xd4,0x02,0x08,0x00,0x00 +# GFX11: v_cmpx_ne_u64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xdd,0xd4,0x02,0x08,0x00,0x00] -# GFX11: v_cmpx_ne_u64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xdd,0xd4,0x68,0xd0,0x00,0x00] 0x7e,0x00,0xdd,0xd4,0x68,0xd0,0x00,0x00 +# GFX11: v_cmpx_ne_u64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xdd,0xd4,0x68,0xd0,0x00,0x00] -# GFX11: v_cmpx_ne_u64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xdd,0xd4,0x6a,0xf4,0x00,0x00] 0x7e,0x00,0xdd,0xd4,0x6a,0xf4,0x00,0x00 +# GFX11: v_cmpx_ne_u64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xdd,0xd4,0x6a,0xf4,0x00,0x00] -# GFX11: v_cmpx_ne_u64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xdd,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xdd,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_ne_u64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xdd,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_ne_u64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xdd,0xd4,0x7e,0xfa,0x01,0x00] 0x7e,0x00,0xdd,0xd4,0x7e,0xfa,0x01,0x00 +# GFX11: v_cmpx_ne_u64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xdd,0xd4,0x7e,0xfa,0x01,0x00] -# GFX11: v_cmpx_ne_u64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xdd,0xd4,0x7c,0xe0,0x01,0x00] 0x7e,0x00,0xdd,0xd4,0x7c,0xe0,0x01,0x00 +# GFX11: v_cmpx_ne_u64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xdd,0xd4,0x7c,0xe0,0x01,0x00] -# GFX11: v_cmpx_ne_u64_e64 -1, -1 ; encoding: [0x7e,0x00,0xdd,0xd4,0xc1,0x82,0x01,0x00] 0x7e,0x00,0xdd,0xd4,0xc1,0x82,0x01,0x00 +# GFX11: v_cmpx_ne_u64_e64 -1, -1 ; encoding: [0x7e,0x00,0xdd,0xd4,0xc1,0x82,0x01,0x00] -# GFX11: v_cmpx_ne_u64_e64 0.5, null ; encoding: [0x7e,0x00,0xdd,0xd4,0xf0,0xf8,0x00,0x00] 0x7e,0x00,0xdd,0xd4,0xf0,0xf8,0x00,0x00 +# GFX11: v_cmpx_ne_u64_e64 0.5, null ; encoding: [0x7e,0x00,0xdd,0xd4,0xf0,0xf8,0x00,0x00] -# GFX11: v_cmpx_ne_u64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xdd,0xd4,0xfd,0xfc,0x00,0x00] 0x7e,0x00,0xdd,0xd4,0xfd,0xfc,0x00,0x00 +# GFX11: v_cmpx_ne_u64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xdd,0xd4,0xfd,0xfc,0x00,0x00] -# GFX11: v_cmpx_ne_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdd,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xdd,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_ne_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdd,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_neq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_neq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_neq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00 +# GFX11: v_cmpx_neq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00] -# GFX11: v_cmpx_neq_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0x8d,0xd4,0x01,0x04,0x00,0x00 +# GFX11: v_cmpx_neq_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x04,0x00,0x00] -# GFX11: v_cmpx_neq_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x8d,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0x8d,0xd4,0x69,0xd2,0x00,0x00 +# GFX11: v_cmpx_neq_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x8d,0xd4,0x69,0xd2,0x00,0x00] -# GFX11: v_cmpx_neq_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x8d,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0x8d,0xd4,0x6a,0xf6,0x00,0x00 +# GFX11: v_cmpx_neq_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x8d,0xd4,0x6a,0xf6,0x00,0x00] -# GFX11: v_cmpx_neq_f16_e64 vcc_hi, 0xfe0b ; encoding: 
[0x7e,0x00,0x8d,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0x8d,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_neq_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x8d,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_neq_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x8d,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0x8d,0xd4,0x7b,0xfa,0x01,0x00 +# GFX11: v_cmpx_neq_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x8d,0xd4,0x7b,0xfa,0x01,0x00] -# GFX11: v_cmpx_neq_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x8d,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0x8d,0xd4,0x7d,0xe0,0x01,0x00 +# GFX11: v_cmpx_neq_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x8d,0xd4,0x7d,0xe0,0x01,0x00] -# GFX11: v_cmpx_neq_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x8d,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0x8d,0xd4,0x7e,0x82,0x01,0x00 +# GFX11: v_cmpx_neq_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x8d,0xd4,0x7e,0x82,0x01,0x00] -# GFX11: v_cmpx_neq_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x8d,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x01,0x8d,0xd4,0x7f,0xf8,0x00,0x00 +# GFX11: v_cmpx_neq_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x8d,0xd4,0x7f,0xf8,0x00,0x00] -# GFX11: v_cmpx_neq_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x8d,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0x8d,0xd4,0x7c,0xfc,0x00,0x00 +# GFX11: v_cmpx_neq_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x8d,0xd4,0x7c,0xfc,0x00,0x00] -# GFX11: v_cmpx_neq_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x8d,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0x8d,0xd4,0xc1,0xfe,0x00,0x00 +# GFX11: v_cmpx_neq_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x8d,0xd4,0xc1,0xfe,0x00,0x00] -# GFX11: v_cmpx_neq_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x8d,0xd4,0xf0,0xfa,0x00,0x40] 0x7e,0x00,0x8d,0xd4,0xf0,0xfa,0x00,0x40 +# GFX11: v_cmpx_neq_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x8d,0xd4,0xf0,0xfa,0x00,0x40] -# GFX11: v_cmpx_neq_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x8d,0xd4,0xfd,0xd4,0x00,0x20] 0x7e,0x02,0x8d,0xd4,0xfd,0xd4,0x00,0x20 +# GFX11: v_cmpx_neq_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x8d,0xd4,0xfd,0xd4,0x00,0x20] -# GFX11: v_cmpx_neq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8d,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x83,0x8d,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_neq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8d,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_neq_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9d,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x9d,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_neq_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9d,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_neq_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x9d,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x9d,0xd4,0xff,0xff,0x03,0x00 +# GFX11: v_cmpx_neq_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x9d,0xd4,0xff,0xff,0x03,0x00] -# GFX11: v_cmpx_neq_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x9d,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0x9d,0xd4,0x01,0x04,0x00,0x00 +# GFX11: v_cmpx_neq_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x9d,0xd4,0x01,0x04,0x00,0x00] -# GFX11: v_cmpx_neq_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x9d,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0x9d,0xd4,0x69,0xd2,0x00,0x00 +# GFX11: v_cmpx_neq_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x9d,0xd4,0x69,0xd2,0x00,0x00] -# GFX11: v_cmpx_neq_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x9d,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0x9d,0xd4,0x6a,0xf6,0x00,0x00 +# GFX11: v_cmpx_neq_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x9d,0xd4,0x6a,0xf6,0x00,0x00] -# 
GFX11: v_cmpx_neq_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x9d,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0x9d,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_neq_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x9d,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_neq_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x9d,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0x9d,0xd4,0x7b,0xfa,0x01,0x00
+# GFX11: v_cmpx_neq_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x9d,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX11: v_cmpx_neq_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x9d,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0x9d,0xd4,0x7d,0xe0,0x01,0x00
+# GFX11: v_cmpx_neq_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x9d,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX11: v_cmpx_neq_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x9d,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0x9d,0xd4,0x7e,0x82,0x01,0x00
+# GFX11: v_cmpx_neq_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x9d,0xd4,0x7e,0x82,0x01,0x00]
-# GFX11: v_cmpx_neq_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x9d,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x9d,0xd4,0x7f,0xf8,0x00,0x00
+# GFX11: v_cmpx_neq_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x9d,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX11: v_cmpx_neq_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x9d,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x9d,0xd4,0x7c,0xfc,0x00,0x00
+# GFX11: v_cmpx_neq_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x9d,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX11: v_cmpx_neq_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x9d,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x9d,0xd4,0xc1,0xfe,0x00,0x00
+# GFX11: v_cmpx_neq_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x9d,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX11: v_cmpx_neq_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x9d,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x9d,0xd4,0xf0,0xfa,0x00,0x40
+# GFX11: v_cmpx_neq_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x9d,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX11: v_cmpx_neq_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x9d,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x9d,0xd4,0xfd,0xd4,0x00,0x20
+# GFX11: v_cmpx_neq_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x9d,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX11: v_cmpx_neq_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x9d,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
 0x7e,0x83,0x9d,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_neq_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x9d,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_neq_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xad,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xad,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_neq_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xad,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_neq_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xad,0xd4,0xfe,0xfd,0x03,0x00]
 0x7e,0x00,0xad,0xd4,0xfe,0xfd,0x03,0x00
+# GFX11: v_cmpx_neq_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xad,0xd4,0xfe,0xfd,0x03,0x00]
-# GFX11: v_cmpx_neq_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xad,0xd4,0x02,0x08,0x00,0x00]
 0x7e,0x00,0xad,0xd4,0x02,0x08,0x00,0x00
+# GFX11: v_cmpx_neq_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xad,0xd4,0x02,0x08,0x00,0x00]
-# GFX11: v_cmpx_neq_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xad,0xd4,0x68,0xd0,0x00,0x00]
 0x7e,0x00,0xad,0xd4,0x68,0xd0,0x00,0x00
+# GFX11: v_cmpx_neq_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xad,0xd4,0x68,0xd0,0x00,0x00]
-# GFX11: v_cmpx_neq_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xad,0xd4,0x6a,0xf4,0x00,0x00]
 0x7e,0x00,0xad,0xd4,0x6a,0xf4,0x00,0x00
+# GFX11: v_cmpx_neq_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xad,0xd4,0x6a,0xf4,0x00,0x00]
-# GFX11: v_cmpx_neq_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xad,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xad,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_neq_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xad,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_neq_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xad,0xd4,0x7e,0xfa,0x01,0x20]
 0x7e,0x01,0xad,0xd4,0x7e,0xfa,0x01,0x20
+# GFX11: v_cmpx_neq_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xad,0xd4,0x7e,0xfa,0x01,0x20]
-# GFX11: v_cmpx_neq_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xad,0xd4,0x7c,0xe0,0x01,0x00]
 0x7e,0x00,0xad,0xd4,0x7c,0xe0,0x01,0x00
+# GFX11: v_cmpx_neq_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xad,0xd4,0x7c,0xe0,0x01,0x00]
-# GFX11: v_cmpx_neq_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xad,0xd4,0xc1,0x82,0x01,0x00]
 0x7e,0x00,0xad,0xd4,0xc1,0x82,0x01,0x00
+# GFX11: v_cmpx_neq_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xad,0xd4,0xc1,0x82,0x01,0x00]
-# GFX11: v_cmpx_neq_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xad,0xd4,0xf0,0xf8,0x00,0x00]
 0x7e,0x00,0xad,0xd4,0xf0,0xf8,0x00,0x00
+# GFX11: v_cmpx_neq_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xad,0xd4,0xf0,0xf8,0x00,0x00]
-# GFX11: v_cmpx_neq_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xad,0xd4,0xfd,0xfc,0x00,0x60]
 0x7e,0x03,0xad,0xd4,0xfd,0xfc,0x00,0x60
+# GFX11: v_cmpx_neq_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xad,0xd4,0xfd,0xfc,0x00,0x60]
-# GFX11: v_cmpx_neq_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xad,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
 0x7e,0x82,0xad,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_neq_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xad,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_nge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_nge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_nge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00
+# GFX11: v_cmpx_nge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_nge_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0x89,0xd4,0x01,0x04,0x00,0x00
+# GFX11: v_cmpx_nge_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x04,0x00,0x00]
-# GFX11: v_cmpx_nge_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x89,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0x89,0xd4,0x69,0xd2,0x00,0x00
+# GFX11: v_cmpx_nge_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x89,0xd4,0x69,0xd2,0x00,0x00]
-# GFX11: v_cmpx_nge_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x89,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0x89,0xd4,0x6a,0xf6,0x00,0x00
+# GFX11: v_cmpx_nge_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x89,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX11: v_cmpx_nge_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x89,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x7e,0x00,0x89,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_nge_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x89,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_nge_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x89,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0x89,0xd4,0x7b,0xfa,0x01,0x00
+# GFX11: v_cmpx_nge_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x89,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX11: v_cmpx_nge_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x89,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0x89,0xd4,0x7d,0xe0,0x01,0x00
+# GFX11: v_cmpx_nge_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x89,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX11: v_cmpx_nge_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x89,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0x89,0xd4,0x7e,0x82,0x01,0x00
+# GFX11: v_cmpx_nge_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x89,0xd4,0x7e,0x82,0x01,0x00]
-# GFX11: v_cmpx_nge_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x89,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x89,0xd4,0x7f,0xf8,0x00,0x00
+# GFX11: v_cmpx_nge_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x89,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX11: v_cmpx_nge_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x89,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x89,0xd4,0x7c,0xfc,0x00,0x00
+# GFX11: v_cmpx_nge_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x89,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX11: v_cmpx_nge_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x89,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x89,0xd4,0xc1,0xfe,0x00,0x00
+# GFX11: v_cmpx_nge_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x89,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX11: v_cmpx_nge_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x89,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x89,0xd4,0xf0,0xfa,0x00,0x40
+# GFX11: v_cmpx_nge_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x89,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX11: v_cmpx_nge_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x89,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x89,0xd4,0xfd,0xd4,0x00,0x20
+# GFX11: v_cmpx_nge_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x89,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX11: v_cmpx_nge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x89,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
 0x7e,0x83,0x89,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_nge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x89,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_nge_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x99,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0x99,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_nge_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x99,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_nge_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x99,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0x99,0xd4,0xff,0xff,0x03,0x00
+# GFX11: v_cmpx_nge_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x99,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_nge_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x99,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0x99,0xd4,0x01,0x04,0x00,0x00
+# GFX11: v_cmpx_nge_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x99,0xd4,0x01,0x04,0x00,0x00]
-# GFX11: v_cmpx_nge_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x99,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0x99,0xd4,0x69,0xd2,0x00,0x00
+# GFX11: v_cmpx_nge_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x99,0xd4,0x69,0xd2,0x00,0x00]
-# GFX11: v_cmpx_nge_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x99,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0x99,0xd4,0x6a,0xf6,0x00,0x00
+# GFX11: v_cmpx_nge_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x99,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX11: v_cmpx_nge_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x99,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0x99,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_nge_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x99,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_nge_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x99,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0x99,0xd4,0x7b,0xfa,0x01,0x00
+# GFX11: v_cmpx_nge_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x99,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX11: v_cmpx_nge_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x99,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0x99,0xd4,0x7d,0xe0,0x01,0x00
+# GFX11: v_cmpx_nge_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x99,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX11: v_cmpx_nge_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x99,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0x99,0xd4,0x7e,0x82,0x01,0x00
+# GFX11: v_cmpx_nge_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x99,0xd4,0x7e,0x82,0x01,0x00]
-# GFX11: v_cmpx_nge_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x99,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x99,0xd4,0x7f,0xf8,0x00,0x00
+# GFX11: v_cmpx_nge_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x99,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX11: v_cmpx_nge_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x99,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x99,0xd4,0x7c,0xfc,0x00,0x00
+# GFX11: v_cmpx_nge_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x99,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX11: v_cmpx_nge_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x99,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x99,0xd4,0xc1,0xfe,0x00,0x00
+# GFX11: v_cmpx_nge_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x99,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX11: v_cmpx_nge_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x99,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x99,0xd4,0xf0,0xfa,0x00,0x40
+# GFX11: v_cmpx_nge_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x99,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX11: v_cmpx_nge_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x99,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x99,0xd4,0xfd,0xd4,0x00,0x20
+# GFX11: v_cmpx_nge_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x99,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX11: v_cmpx_nge_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x99,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
 0x7e,0x83,0x99,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_nge_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x99,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_nge_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa9,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xa9,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_nge_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa9,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_nge_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa9,0xd4,0xfe,0xfd,0x03,0x00]
 0x7e,0x00,0xa9,0xd4,0xfe,0xfd,0x03,0x00
+# GFX11: v_cmpx_nge_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa9,0xd4,0xfe,0xfd,0x03,0x00]
-# GFX11: v_cmpx_nge_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa9,0xd4,0x02,0x08,0x00,0x00]
 0x7e,0x00,0xa9,0xd4,0x02,0x08,0x00,0x00
+# GFX11: v_cmpx_nge_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa9,0xd4,0x02,0x08,0x00,0x00]
-# GFX11: v_cmpx_nge_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa9,0xd4,0x68,0xd0,0x00,0x00]
 0x7e,0x00,0xa9,0xd4,0x68,0xd0,0x00,0x00
+# GFX11: v_cmpx_nge_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa9,0xd4,0x68,0xd0,0x00,0x00]
-# GFX11: v_cmpx_nge_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa9,0xd4,0x6a,0xf4,0x00,0x00]
 0x7e,0x00,0xa9,0xd4,0x6a,0xf4,0x00,0x00
+# GFX11: v_cmpx_nge_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa9,0xd4,0x6a,0xf4,0x00,0x00]
-# GFX11: v_cmpx_nge_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa9,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xa9,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_nge_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa9,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_nge_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa9,0xd4,0x7e,0xfa,0x01,0x20]
 0x7e,0x01,0xa9,0xd4,0x7e,0xfa,0x01,0x20
+# GFX11: v_cmpx_nge_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa9,0xd4,0x7e,0xfa,0x01,0x20]
-# GFX11: v_cmpx_nge_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa9,0xd4,0x7c,0xe0,0x01,0x00]
 0x7e,0x00,0xa9,0xd4,0x7c,0xe0,0x01,0x00
+# GFX11: v_cmpx_nge_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa9,0xd4,0x7c,0xe0,0x01,0x00]
-# GFX11: v_cmpx_nge_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa9,0xd4,0xc1,0x82,0x01,0x00]
 0x7e,0x00,0xa9,0xd4,0xc1,0x82,0x01,0x00
+# GFX11: v_cmpx_nge_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa9,0xd4,0xc1,0x82,0x01,0x00]
-# GFX11: v_cmpx_nge_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa9,0xd4,0xf0,0xf8,0x00,0x00]
 0x7e,0x00,0xa9,0xd4,0xf0,0xf8,0x00,0x00
+# GFX11: v_cmpx_nge_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa9,0xd4,0xf0,0xf8,0x00,0x00]
-# GFX11: v_cmpx_nge_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa9,0xd4,0xfd,0xfc,0x00,0x60]
 0x7e,0x03,0xa9,0xd4,0xfd,0xfc,0x00,0x60
+# GFX11: v_cmpx_nge_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa9,0xd4,0xfd,0xfc,0x00,0x60]
-# GFX11: v_cmpx_nge_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa9,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
 0x7e,0x82,0xa9,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_nge_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa9,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_ngt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_ngt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_ngt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00
+# GFX11: v_cmpx_ngt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_ngt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0x8b,0xd4,0x01,0x04,0x00,0x00
+# GFX11: v_cmpx_ngt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x04,0x00,0x00]
-# GFX11: v_cmpx_ngt_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x8b,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0x8b,0xd4,0x69,0xd2,0x00,0x00
+# GFX11: v_cmpx_ngt_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x8b,0xd4,0x69,0xd2,0x00,0x00]
-# GFX11: v_cmpx_ngt_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x8b,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0x8b,0xd4,0x6a,0xf6,0x00,0x00
+# GFX11: v_cmpx_ngt_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x8b,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX11: v_cmpx_ngt_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x8b,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x7e,0x00,0x8b,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_ngt_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x8b,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_ngt_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x8b,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0x8b,0xd4,0x7b,0xfa,0x01,0x00
+# GFX11: v_cmpx_ngt_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x8b,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX11: v_cmpx_ngt_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x8b,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0x8b,0xd4,0x7d,0xe0,0x01,0x00
+# GFX11: v_cmpx_ngt_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x8b,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX11: v_cmpx_ngt_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x8b,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0x8b,0xd4,0x7e,0x82,0x01,0x00
+# GFX11: v_cmpx_ngt_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x8b,0xd4,0x7e,0x82,0x01,0x00]
-# GFX11: v_cmpx_ngt_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x8b,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x8b,0xd4,0x7f,0xf8,0x00,0x00
+# GFX11: v_cmpx_ngt_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x8b,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX11: v_cmpx_ngt_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x8b,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x8b,0xd4,0x7c,0xfc,0x00,0x00
+# GFX11: v_cmpx_ngt_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x8b,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX11: v_cmpx_ngt_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x8b,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x8b,0xd4,0xc1,0xfe,0x00,0x00
+# GFX11: v_cmpx_ngt_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x8b,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX11: v_cmpx_ngt_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x8b,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x8b,0xd4,0xf0,0xfa,0x00,0x40
+# GFX11: v_cmpx_ngt_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x8b,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX11: v_cmpx_ngt_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x8b,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x8b,0xd4,0xfd,0xd4,0x00,0x20
+# GFX11: v_cmpx_ngt_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x8b,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX11: v_cmpx_ngt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8b,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
 0x7e,0x83,0x8b,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_ngt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8b,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_ngt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9b,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0x9b,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_ngt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9b,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_ngt_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x9b,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0x9b,0xd4,0xff,0xff,0x03,0x00
+# GFX11: v_cmpx_ngt_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x9b,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_ngt_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x9b,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0x9b,0xd4,0x01,0x04,0x00,0x00
+# GFX11: v_cmpx_ngt_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x9b,0xd4,0x01,0x04,0x00,0x00]
-# GFX11: v_cmpx_ngt_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x9b,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0x9b,0xd4,0x69,0xd2,0x00,0x00
+# GFX11: v_cmpx_ngt_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x9b,0xd4,0x69,0xd2,0x00,0x00]
-# GFX11: v_cmpx_ngt_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x9b,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0x9b,0xd4,0x6a,0xf6,0x00,0x00
+# GFX11: v_cmpx_ngt_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x9b,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX11: v_cmpx_ngt_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x9b,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0x9b,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_ngt_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x9b,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_ngt_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x9b,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0x9b,0xd4,0x7b,0xfa,0x01,0x00
+# GFX11: v_cmpx_ngt_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x9b,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX11: v_cmpx_ngt_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x9b,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0x9b,0xd4,0x7d,0xe0,0x01,0x00
+# GFX11: v_cmpx_ngt_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x9b,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX11: v_cmpx_ngt_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x9b,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0x9b,0xd4,0x7e,0x82,0x01,0x00
+# GFX11: v_cmpx_ngt_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x9b,0xd4,0x7e,0x82,0x01,0x00]
-# GFX11: v_cmpx_ngt_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x9b,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x9b,0xd4,0x7f,0xf8,0x00,0x00
+# GFX11: v_cmpx_ngt_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x9b,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX11: v_cmpx_ngt_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x9b,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x9b,0xd4,0x7c,0xfc,0x00,0x00
+# GFX11: v_cmpx_ngt_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x9b,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX11: v_cmpx_ngt_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x9b,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x9b,0xd4,0xc1,0xfe,0x00,0x00
+# GFX11: v_cmpx_ngt_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x9b,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX11: v_cmpx_ngt_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x9b,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x9b,0xd4,0xf0,0xfa,0x00,0x40
+# GFX11: v_cmpx_ngt_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x9b,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX11: v_cmpx_ngt_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x9b,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x9b,0xd4,0xfd,0xd4,0x00,0x20
+# GFX11: v_cmpx_ngt_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x9b,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX11: v_cmpx_ngt_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x9b,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
 0x7e,0x83,0x9b,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_ngt_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x9b,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_ngt_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xab,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xab,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_ngt_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xab,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_ngt_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xab,0xd4,0xfe,0xfd,0x03,0x00]
 0x7e,0x00,0xab,0xd4,0xfe,0xfd,0x03,0x00
+# GFX11: v_cmpx_ngt_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xab,0xd4,0xfe,0xfd,0x03,0x00]
-# GFX11: v_cmpx_ngt_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xab,0xd4,0x02,0x08,0x00,0x00]
 0x7e,0x00,0xab,0xd4,0x02,0x08,0x00,0x00
+# GFX11: v_cmpx_ngt_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xab,0xd4,0x02,0x08,0x00,0x00]
-# GFX11: v_cmpx_ngt_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xab,0xd4,0x68,0xd0,0x00,0x00]
 0x7e,0x00,0xab,0xd4,0x68,0xd0,0x00,0x00
+# GFX11: v_cmpx_ngt_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xab,0xd4,0x68,0xd0,0x00,0x00]
-# GFX11: v_cmpx_ngt_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xab,0xd4,0x6a,0xf4,0x00,0x00]
 0x7e,0x00,0xab,0xd4,0x6a,0xf4,0x00,0x00
+# GFX11: v_cmpx_ngt_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xab,0xd4,0x6a,0xf4,0x00,0x00]
-# GFX11: v_cmpx_ngt_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xab,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xab,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_ngt_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xab,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_ngt_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xab,0xd4,0x7e,0xfa,0x01,0x20]
 0x7e,0x01,0xab,0xd4,0x7e,0xfa,0x01,0x20
+# GFX11: v_cmpx_ngt_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xab,0xd4,0x7e,0xfa,0x01,0x20]
-# GFX11: v_cmpx_ngt_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xab,0xd4,0x7c,0xe0,0x01,0x00]
 0x7e,0x00,0xab,0xd4,0x7c,0xe0,0x01,0x00
+# GFX11: v_cmpx_ngt_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xab,0xd4,0x7c,0xe0,0x01,0x00]
-# GFX11: v_cmpx_ngt_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xab,0xd4,0xc1,0x82,0x01,0x00]
 0x7e,0x00,0xab,0xd4,0xc1,0x82,0x01,0x00
+# GFX11: v_cmpx_ngt_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xab,0xd4,0xc1,0x82,0x01,0x00]
-# GFX11: v_cmpx_ngt_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xab,0xd4,0xf0,0xf8,0x00,0x00]
 0x7e,0x00,0xab,0xd4,0xf0,0xf8,0x00,0x00
+# GFX11: v_cmpx_ngt_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xab,0xd4,0xf0,0xf8,0x00,0x00]
-# GFX11: v_cmpx_ngt_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xab,0xd4,0xfd,0xfc,0x00,0x60]
 0x7e,0x03,0xab,0xd4,0xfd,0xfc,0x00,0x60
+# GFX11: v_cmpx_ngt_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xab,0xd4,0xfd,0xfc,0x00,0x60]
-# GFX11: v_cmpx_ngt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xab,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
 0x7e,0x82,0xab,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_ngt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xab,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_nle_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_nle_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_nle_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00
+# GFX11: v_cmpx_nle_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_nle_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0x8c,0xd4,0x01,0x04,0x00,0x00
+# GFX11: v_cmpx_nle_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x04,0x00,0x00]
-# GFX11: v_cmpx_nle_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x8c,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0x8c,0xd4,0x69,0xd2,0x00,0x00
+# GFX11: v_cmpx_nle_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x8c,0xd4,0x69,0xd2,0x00,0x00]
-# GFX11: v_cmpx_nle_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x8c,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0x8c,0xd4,0x6a,0xf6,0x00,0x00
+# GFX11: v_cmpx_nle_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x8c,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX11: v_cmpx_nle_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x8c,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x7e,0x00,0x8c,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_nle_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x8c,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_nle_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x8c,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0x8c,0xd4,0x7b,0xfa,0x01,0x00
+# GFX11: v_cmpx_nle_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x8c,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX11: v_cmpx_nle_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x8c,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0x8c,0xd4,0x7d,0xe0,0x01,0x00
+# GFX11: v_cmpx_nle_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x8c,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX11: v_cmpx_nle_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x8c,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0x8c,0xd4,0x7e,0x82,0x01,0x00
+# GFX11: v_cmpx_nle_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x8c,0xd4,0x7e,0x82,0x01,0x00]
-# GFX11: v_cmpx_nle_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x8c,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x8c,0xd4,0x7f,0xf8,0x00,0x00
+# GFX11: v_cmpx_nle_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x8c,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX11: v_cmpx_nle_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x8c,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x8c,0xd4,0x7c,0xfc,0x00,0x00
+# GFX11: v_cmpx_nle_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x8c,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX11: v_cmpx_nle_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x8c,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x8c,0xd4,0xc1,0xfe,0x00,0x00
+# GFX11: v_cmpx_nle_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x8c,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX11: v_cmpx_nle_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x8c,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x8c,0xd4,0xf0,0xfa,0x00,0x40
+# GFX11: v_cmpx_nle_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x8c,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX11: v_cmpx_nle_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x8c,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x8c,0xd4,0xfd,0xd4,0x00,0x20
+# GFX11: v_cmpx_nle_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x8c,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX11: v_cmpx_nle_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8c,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
 0x7e,0x83,0x8c,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_nle_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8c,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_nle_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9c,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0x9c,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_nle_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9c,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_nle_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x9c,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0x9c,0xd4,0xff,0xff,0x03,0x00
+# GFX11: v_cmpx_nle_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x9c,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_nle_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x9c,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0x9c,0xd4,0x01,0x04,0x00,0x00
+# GFX11: v_cmpx_nle_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x9c,0xd4,0x01,0x04,0x00,0x00]
-# GFX11: v_cmpx_nle_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x9c,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0x9c,0xd4,0x69,0xd2,0x00,0x00
+# GFX11: v_cmpx_nle_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x9c,0xd4,0x69,0xd2,0x00,0x00]
-# GFX11: v_cmpx_nle_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x9c,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0x9c,0xd4,0x6a,0xf6,0x00,0x00
+# GFX11: v_cmpx_nle_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x9c,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX11: v_cmpx_nle_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x9c,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0x9c,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_nle_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x9c,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_nle_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x9c,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0x9c,0xd4,0x7b,0xfa,0x01,0x00
+# GFX11: v_cmpx_nle_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x9c,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX11: v_cmpx_nle_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x9c,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0x9c,0xd4,0x7d,0xe0,0x01,0x00
+# GFX11: v_cmpx_nle_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x9c,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX11: v_cmpx_nle_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x9c,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0x9c,0xd4,0x7e,0x82,0x01,0x00
+# GFX11: v_cmpx_nle_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x9c,0xd4,0x7e,0x82,0x01,0x00]
-# GFX11: v_cmpx_nle_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x9c,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x9c,0xd4,0x7f,0xf8,0x00,0x00
+# GFX11: v_cmpx_nle_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x9c,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX11: v_cmpx_nle_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x9c,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x9c,0xd4,0x7c,0xfc,0x00,0x00
+# GFX11: v_cmpx_nle_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x9c,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX11: v_cmpx_nle_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x9c,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x9c,0xd4,0xc1,0xfe,0x00,0x00
+# GFX11: v_cmpx_nle_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x9c,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX11: v_cmpx_nle_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x9c,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x9c,0xd4,0xf0,0xfa,0x00,0x40
+# GFX11: v_cmpx_nle_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x9c,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX11: v_cmpx_nle_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x9c,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x9c,0xd4,0xfd,0xd4,0x00,0x20
+# GFX11: v_cmpx_nle_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x9c,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX11: v_cmpx_nle_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x9c,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
 0x7e,0x83,0x9c,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_nle_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x9c,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_nle_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xac,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xac,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_nle_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xac,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_nle_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xac,0xd4,0xfe,0xfd,0x03,0x00]
 0x7e,0x00,0xac,0xd4,0xfe,0xfd,0x03,0x00
+# GFX11: v_cmpx_nle_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xac,0xd4,0xfe,0xfd,0x03,0x00]
-# GFX11: v_cmpx_nle_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xac,0xd4,0x02,0x08,0x00,0x00]
 0x7e,0x00,0xac,0xd4,0x02,0x08,0x00,0x00
+# GFX11: v_cmpx_nle_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xac,0xd4,0x02,0x08,0x00,0x00]
-# GFX11: v_cmpx_nle_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xac,0xd4,0x68,0xd0,0x00,0x00]
 0x7e,0x00,0xac,0xd4,0x68,0xd0,0x00,0x00
+# GFX11: v_cmpx_nle_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xac,0xd4,0x68,0xd0,0x00,0x00]
-# GFX11: v_cmpx_nle_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xac,0xd4,0x6a,0xf4,0x00,0x00]
 0x7e,0x00,0xac,0xd4,0x6a,0xf4,0x00,0x00
+# GFX11: v_cmpx_nle_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xac,0xd4,0x6a,0xf4,0x00,0x00]
-# GFX11: v_cmpx_nle_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xac,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xac,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_nle_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xac,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_nle_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xac,0xd4,0x7e,0xfa,0x01,0x20]
 0x7e,0x01,0xac,0xd4,0x7e,0xfa,0x01,0x20
+# GFX11: v_cmpx_nle_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xac,0xd4,0x7e,0xfa,0x01,0x20]
-# GFX11: v_cmpx_nle_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xac,0xd4,0x7c,0xe0,0x01,0x00]
 0x7e,0x00,0xac,0xd4,0x7c,0xe0,0x01,0x00
+# GFX11: v_cmpx_nle_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xac,0xd4,0x7c,0xe0,0x01,0x00]
-# GFX11: v_cmpx_nle_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xac,0xd4,0xc1,0x82,0x01,0x00]
 0x7e,0x00,0xac,0xd4,0xc1,0x82,0x01,0x00
+# GFX11: v_cmpx_nle_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xac,0xd4,0xc1,0x82,0x01,0x00]
-# GFX11: v_cmpx_nle_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xac,0xd4,0xf0,0xf8,0x00,0x00]
 0x7e,0x00,0xac,0xd4,0xf0,0xf8,0x00,0x00
+# GFX11: v_cmpx_nle_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xac,0xd4,0xf0,0xf8,0x00,0x00]
-# GFX11: v_cmpx_nle_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xac,0xd4,0xfd,0xfc,0x00,0x60]
 0x7e,0x03,0xac,0xd4,0xfd,0xfc,0x00,0x60
+# GFX11: v_cmpx_nle_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xac,0xd4,0xfd,0xfc,0x00,0x60]
-# GFX11: v_cmpx_nle_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xac,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
 0x7e,0x82,0xac,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_nle_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xac,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_nlg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_nlg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_nlg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00
+# GFX11: v_cmpx_nlg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_nlg_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0x8a,0xd4,0x01,0x04,0x00,0x00
+# GFX11: v_cmpx_nlg_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x04,0x00,0x00]
-# GFX11: v_cmpx_nlg_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x8a,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0x8a,0xd4,0x69,0xd2,0x00,0x00
+# GFX11: v_cmpx_nlg_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x8a,0xd4,0x69,0xd2,0x00,0x00]
-# GFX11: v_cmpx_nlg_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x8a,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0x8a,0xd4,0x6a,0xf6,0x00,0x00
+# GFX11: v_cmpx_nlg_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x8a,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX11: v_cmpx_nlg_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x8a,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x7e,0x00,0x8a,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_nlg_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x8a,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_nlg_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x8a,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0x8a,0xd4,0x7b,0xfa,0x01,0x00
+# GFX11: v_cmpx_nlg_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x8a,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX11: v_cmpx_nlg_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x8a,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0x8a,0xd4,0x7d,0xe0,0x01,0x00
+# GFX11: v_cmpx_nlg_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x8a,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX11: v_cmpx_nlg_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x8a,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0x8a,0xd4,0x7e,0x82,0x01,0x00
+# GFX11: v_cmpx_nlg_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x8a,0xd4,0x7e,0x82,0x01,0x00]
-# GFX11: v_cmpx_nlg_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x8a,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x8a,0xd4,0x7f,0xf8,0x00,0x00
+# GFX11: v_cmpx_nlg_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x8a,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX11: v_cmpx_nlg_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x8a,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x8a,0xd4,0x7c,0xfc,0x00,0x00
+# GFX11: v_cmpx_nlg_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x8a,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX11: v_cmpx_nlg_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x8a,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x8a,0xd4,0xc1,0xfe,0x00,0x00
+# GFX11: v_cmpx_nlg_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x8a,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX11: v_cmpx_nlg_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x8a,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x8a,0xd4,0xf0,0xfa,0x00,0x40
+# GFX11: v_cmpx_nlg_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x8a,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX11: v_cmpx_nlg_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x8a,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x8a,0xd4,0xfd,0xd4,0x00,0x20
+# GFX11: v_cmpx_nlg_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x8a,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX11: v_cmpx_nlg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8a,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
 0x7e,0x83,0x8a,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_nlg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8a,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_nlg_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9a,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0x9a,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_nlg_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9a,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_nlg_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x9a,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0x9a,0xd4,0xff,0xff,0x03,0x00
+# GFX11: v_cmpx_nlg_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x9a,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_nlg_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x9a,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0x9a,0xd4,0x01,0x04,0x00,0x00
+# GFX11: v_cmpx_nlg_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x9a,0xd4,0x01,0x04,0x00,0x00]
-# GFX11: v_cmpx_nlg_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x9a,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0x9a,0xd4,0x69,0xd2,0x00,0x00
+# GFX11: v_cmpx_nlg_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x9a,0xd4,0x69,0xd2,0x00,0x00]
-# GFX11: v_cmpx_nlg_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x9a,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0x9a,0xd4,0x6a,0xf6,0x00,0x00
+# GFX11: v_cmpx_nlg_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x9a,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX11: v_cmpx_nlg_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x9a,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0x9a,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_nlg_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x9a,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_nlg_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x9a,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0x9a,0xd4,0x7b,0xfa,0x01,0x00
+# GFX11: v_cmpx_nlg_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x9a,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX11: v_cmpx_nlg_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x9a,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0x9a,0xd4,0x7d,0xe0,0x01,0x00
+# GFX11: v_cmpx_nlg_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x9a,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX11: v_cmpx_nlg_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x9a,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0x9a,0xd4,0x7e,0x82,0x01,0x00
+# GFX11: v_cmpx_nlg_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x9a,0xd4,0x7e,0x82,0x01,0x00]
-# GFX11: v_cmpx_nlg_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x9a,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x9a,0xd4,0x7f,0xf8,0x00,0x00
+# GFX11: v_cmpx_nlg_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x9a,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX11: v_cmpx_nlg_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x9a,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x9a,0xd4,0x7c,0xfc,0x00,0x00
+# GFX11: v_cmpx_nlg_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x9a,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX11: v_cmpx_nlg_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x9a,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x9a,0xd4,0xc1,0xfe,0x00,0x00
+# GFX11: v_cmpx_nlg_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x9a,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX11: v_cmpx_nlg_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x9a,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x9a,0xd4,0xf0,0xfa,0x00,0x40
+# GFX11: v_cmpx_nlg_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x9a,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX11: v_cmpx_nlg_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x9a,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x9a,0xd4,0xfd,0xd4,0x00,0x20
+# GFX11: v_cmpx_nlg_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x9a,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX11: v_cmpx_nlg_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x9a,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
 0x7e,0x83,0x9a,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_nlg_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x9a,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_nlg_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xaa,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xaa,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_nlg_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xaa,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_nlg_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xaa,0xd4,0xfe,0xfd,0x03,0x00]
 0x7e,0x00,0xaa,0xd4,0xfe,0xfd,0x03,0x00
+# GFX11: v_cmpx_nlg_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xaa,0xd4,0xfe,0xfd,0x03,0x00]
-# GFX11: v_cmpx_nlg_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xaa,0xd4,0x02,0x08,0x00,0x00]
 0x7e,0x00,0xaa,0xd4,0x02,0x08,0x00,0x00
+# GFX11: v_cmpx_nlg_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xaa,0xd4,0x02,0x08,0x00,0x00]
-# GFX11: v_cmpx_nlg_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xaa,0xd4,0x68,0xd0,0x00,0x00]
 0x7e,0x00,0xaa,0xd4,0x68,0xd0,0x00,0x00
+# GFX11: v_cmpx_nlg_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xaa,0xd4,0x68,0xd0,0x00,0x00]
-# GFX11: v_cmpx_nlg_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xaa,0xd4,0x6a,0xf4,0x00,0x00]
 0x7e,0x00,0xaa,0xd4,0x6a,0xf4,0x00,0x00
+# GFX11: v_cmpx_nlg_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xaa,0xd4,0x6a,0xf4,0x00,0x00]
-# GFX11: v_cmpx_nlg_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xaa,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xaa,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_nlg_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xaa,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_nlg_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xaa,0xd4,0x7e,0xfa,0x01,0x20]
 0x7e,0x01,0xaa,0xd4,0x7e,0xfa,0x01,0x20
+# GFX11: v_cmpx_nlg_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xaa,0xd4,0x7e,0xfa,0x01,0x20]
-# GFX11: v_cmpx_nlg_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xaa,0xd4,0x7c,0xe0,0x01,0x00]
 0x7e,0x00,0xaa,0xd4,0x7c,0xe0,0x01,0x00
+# GFX11: v_cmpx_nlg_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xaa,0xd4,0x7c,0xe0,0x01,0x00]
-# GFX11: v_cmpx_nlg_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xaa,0xd4,0xc1,0x82,0x01,0x00]
 0x7e,0x00,0xaa,0xd4,0xc1,0x82,0x01,0x00
+# GFX11: v_cmpx_nlg_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xaa,0xd4,0xc1,0x82,0x01,0x00]
-# GFX11: v_cmpx_nlg_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xaa,0xd4,0xf0,0xf8,0x00,0x00]
 0x7e,0x00,0xaa,0xd4,0xf0,0xf8,0x00,0x00
+# GFX11: v_cmpx_nlg_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xaa,0xd4,0xf0,0xf8,0x00,0x00]
-# GFX11: v_cmpx_nlg_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xaa,0xd4,0xfd,0xfc,0x00,0x60]
 0x7e,0x03,0xaa,0xd4,0xfd,0xfc,0x00,0x60
+# GFX11: v_cmpx_nlg_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xaa,0xd4,0xfd,0xfc,0x00,0x60]
-# GFX11: v_cmpx_nlg_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xaa,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
 0x7e,0x82,0xaa,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_nlg_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xaa,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_nlt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_nlt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_nlt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00
+# GFX11: v_cmpx_nlt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_nlt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0x8e,0xd4,0x01,0x04,0x00,0x00
+# GFX11: v_cmpx_nlt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x04,0x00,0x00]
-# GFX11: v_cmpx_nlt_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x8e,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0x8e,0xd4,0x69,0xd2,0x00,0x00
+# GFX11: v_cmpx_nlt_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x8e,0xd4,0x69,0xd2,0x00,0x00]
-# GFX11: v_cmpx_nlt_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x8e,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0x8e,0xd4,0x6a,0xf6,0x00,0x00
+# GFX11: v_cmpx_nlt_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x8e,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX11: v_cmpx_nlt_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x8e,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x7e,0x00,0x8e,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_nlt_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x8e,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_nlt_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x8e,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0x8e,0xd4,0x7b,0xfa,0x01,0x00
+# GFX11: v_cmpx_nlt_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x8e,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX11: v_cmpx_nlt_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x8e,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0x8e,0xd4,0x7d,0xe0,0x01,0x00
+# GFX11: v_cmpx_nlt_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x8e,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX11: v_cmpx_nlt_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x8e,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0x8e,0xd4,0x7e,0x82,0x01,0x00
+# GFX11: v_cmpx_nlt_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x8e,0xd4,0x7e,0x82,0x01,0x00]
-# GFX11: v_cmpx_nlt_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x8e,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x8e,0xd4,0x7f,0xf8,0x00,0x00
+# GFX11: v_cmpx_nlt_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x8e,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX11: v_cmpx_nlt_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x8e,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x8e,0xd4,0x7c,0xfc,0x00,0x00
+# GFX11: v_cmpx_nlt_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x8e,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX11: v_cmpx_nlt_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x8e,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x8e,0xd4,0xc1,0xfe,0x00,0x00
+# GFX11: v_cmpx_nlt_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x8e,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX11: v_cmpx_nlt_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x8e,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x8e,0xd4,0xf0,0xfa,0x00,0x40
+# GFX11: v_cmpx_nlt_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x8e,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX11: v_cmpx_nlt_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x8e,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x8e,0xd4,0xfd,0xd4,0x00,0x20
+# GFX11: v_cmpx_nlt_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x8e,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX11: v_cmpx_nlt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8e,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
 0x7e,0x83,0x8e,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_nlt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8e,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_nlt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9e,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0x9e,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_nlt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9e,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_nlt_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x9e,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0x9e,0xd4,0xff,0xff,0x03,0x00
+# GFX11: v_cmpx_nlt_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x9e,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_nlt_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x9e,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0x9e,0xd4,0x01,0x04,0x00,0x00
+# GFX11: v_cmpx_nlt_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x9e,0xd4,0x01,0x04,0x00,0x00]
-# GFX11: v_cmpx_nlt_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x9e,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0x9e,0xd4,0x69,0xd2,0x00,0x00
+# GFX11: v_cmpx_nlt_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x9e,0xd4,0x69,0xd2,0x00,0x00]
-# GFX11: v_cmpx_nlt_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x9e,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0x9e,0xd4,0x6a,0xf6,0x00,0x00
+# GFX11: v_cmpx_nlt_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x9e,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX11: v_cmpx_nlt_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x9e,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0x9e,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_nlt_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x9e,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_nlt_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x9e,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0x9e,0xd4,0x7b,0xfa,0x01,0x00
+# GFX11: v_cmpx_nlt_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x9e,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX11: v_cmpx_nlt_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x9e,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0x9e,0xd4,0x7d,0xe0,0x01,0x00
+# GFX11: v_cmpx_nlt_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x9e,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX11: v_cmpx_nlt_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x9e,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0x9e,0xd4,0x7e,0x82,0x01,0x00
+# GFX11: v_cmpx_nlt_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x9e,0xd4,0x7e,0x82,0x01,0x00]
-# GFX11: v_cmpx_nlt_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x9e,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x9e,0xd4,0x7f,0xf8,0x00,0x00
+# GFX11: v_cmpx_nlt_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x9e,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX11: v_cmpx_nlt_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x9e,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x9e,0xd4,0x7c,0xfc,0x00,0x00
+# GFX11: v_cmpx_nlt_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x9e,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX11: v_cmpx_nlt_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x9e,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x9e,0xd4,0xc1,0xfe,0x00,0x00
+# GFX11: v_cmpx_nlt_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x9e,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX11: v_cmpx_nlt_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x9e,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x9e,0xd4,0xf0,0xfa,0x00,0x40
+# GFX11: v_cmpx_nlt_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x9e,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX11: v_cmpx_nlt_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x9e,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x9e,0xd4,0xfd,0xd4,0x00,0x20
+# GFX11: v_cmpx_nlt_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x9e,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX11: v_cmpx_nlt_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x9e,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
 0x7e,0x83,0x9e,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_nlt_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x9e,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_nlt_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xae,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xae,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_nlt_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xae,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_nlt_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xae,0xd4,0xfe,0xfd,0x03,0x00]
 0x7e,0x00,0xae,0xd4,0xfe,0xfd,0x03,0x00
+# GFX11: v_cmpx_nlt_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xae,0xd4,0xfe,0xfd,0x03,0x00]
-# GFX11: v_cmpx_nlt_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xae,0xd4,0x02,0x08,0x00,0x00]
 0x7e,0x00,0xae,0xd4,0x02,0x08,0x00,0x00
+# GFX11: v_cmpx_nlt_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xae,0xd4,0x02,0x08,0x00,0x00]
-# GFX11: v_cmpx_nlt_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xae,0xd4,0x68,0xd0,0x00,0x00]
 0x7e,0x00,0xae,0xd4,0x68,0xd0,0x00,0x00
+# GFX11: v_cmpx_nlt_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xae,0xd4,0x68,0xd0,0x00,0x00]
-# GFX11: v_cmpx_nlt_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xae,0xd4,0x6a,0xf4,0x00,0x00]
 0x7e,0x00,0xae,0xd4,0x6a,0xf4,0x00,0x00
+# GFX11: v_cmpx_nlt_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xae,0xd4,0x6a,0xf4,0x00,0x00]
-# GFX11: v_cmpx_nlt_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xae,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xae,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_nlt_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xae,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_nlt_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xae,0xd4,0x7e,0xfa,0x01,0x20]
 0x7e,0x01,0xae,0xd4,0x7e,0xfa,0x01,0x20
+# GFX11: v_cmpx_nlt_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xae,0xd4,0x7e,0xfa,0x01,0x20]
-# GFX11: v_cmpx_nlt_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xae,0xd4,0x7c,0xe0,0x01,0x00]
 0x7e,0x00,0xae,0xd4,0x7c,0xe0,0x01,0x00
+# GFX11: v_cmpx_nlt_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xae,0xd4,0x7c,0xe0,0x01,0x00]
-# GFX11: v_cmpx_nlt_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xae,0xd4,0xc1,0x82,0x01,0x00]
 0x7e,0x00,0xae,0xd4,0xc1,0x82,0x01,0x00
+# GFX11: v_cmpx_nlt_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xae,0xd4,0xc1,0x82,0x01,0x00]
-# GFX11: v_cmpx_nlt_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xae,0xd4,0xf0,0xf8,0x00,0x00]
 0x7e,0x00,0xae,0xd4,0xf0,0xf8,0x00,0x00
+# GFX11: v_cmpx_nlt_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xae,0xd4,0xf0,0xf8,0x00,0x00]
-# GFX11: v_cmpx_nlt_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xae,0xd4,0xfd,0xfc,0x00,0x60]
 0x7e,0x03,0xae,0xd4,0xfd,0xfc,0x00,0x60
+# GFX11: v_cmpx_nlt_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xae,0xd4,0xfd,0xfc,0x00,0x60]
-# GFX11: v_cmpx_nlt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xae,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
 0x7e,0x82,0xae,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_nlt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xae,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_o_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_o_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_o_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00
+# GFX11: v_cmpx_o_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_o_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0x87,0xd4,0x01,0x04,0x00,0x00
+# GFX11: v_cmpx_o_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x04,0x00,0x00]
-# GFX11: v_cmpx_o_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x87,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0x87,0xd4,0x69,0xd2,0x00,0x00
+# GFX11: v_cmpx_o_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x87,0xd4,0x69,0xd2,0x00,0x00]
-# GFX11: v_cmpx_o_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x87,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0x87,0xd4,0x6a,0xf6,0x00,0x00
+# GFX11: v_cmpx_o_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x87,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX11: v_cmpx_o_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x87,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x7e,0x00,0x87,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_o_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x87,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_o_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x87,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0x87,0xd4,0x7b,0xfa,0x01,0x00
+# GFX11: v_cmpx_o_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x87,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX11: v_cmpx_o_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x87,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0x87,0xd4,0x7d,0xe0,0x01,0x00
+# GFX11: v_cmpx_o_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x87,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX11: v_cmpx_o_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x87,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0x87,0xd4,0x7e,0x82,0x01,0x00
+# GFX11: v_cmpx_o_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x87,0xd4,0x7e,0x82,0x01,0x00]
-# GFX11: v_cmpx_o_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x87,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x87,0xd4,0x7f,0xf8,0x00,0x00
+# GFX11: v_cmpx_o_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x87,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX11: v_cmpx_o_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x87,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x87,0xd4,0x7c,0xfc,0x00,0x00
+# GFX11: v_cmpx_o_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x87,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX11: v_cmpx_o_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x87,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x87,0xd4,0xc1,0xfe,0x00,0x00
+# GFX11: v_cmpx_o_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x87,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX11: v_cmpx_o_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x87,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x87,0xd4,0xf0,0xfa,0x00,0x40
+# GFX11: v_cmpx_o_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x87,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX11: v_cmpx_o_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x87,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x87,0xd4,0xfd,0xd4,0x00,0x20
+# GFX11: v_cmpx_o_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x87,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX11: v_cmpx_o_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x87,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
 0x7e,0x83,0x87,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_o_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x87,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_o_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x97,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0x97,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_o_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x97,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_o_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x97,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0x97,0xd4,0xff,0xff,0x03,0x00
+# GFX11: v_cmpx_o_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x97,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_o_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x97,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0x97,0xd4,0x01,0x04,0x00,0x00
+# GFX11: v_cmpx_o_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x97,0xd4,0x01,0x04,0x00,0x00]
-# GFX11: v_cmpx_o_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x97,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0x97,0xd4,0x69,0xd2,0x00,0x00
+# GFX11: v_cmpx_o_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x97,0xd4,0x69,0xd2,0x00,0x00]
-# GFX11: v_cmpx_o_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x97,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0x97,0xd4,0x6a,0xf6,0x00,0x00
+# GFX11: v_cmpx_o_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x97,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX11: v_cmpx_o_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x97,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0x97,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_o_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x97,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_o_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x97,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0x97,0xd4,0x7b,0xfa,0x01,0x00
+# GFX11: v_cmpx_o_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x97,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX11: v_cmpx_o_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x97,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0x97,0xd4,0x7d,0xe0,0x01,0x00
+# GFX11: v_cmpx_o_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x97,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX11: v_cmpx_o_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x97,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0x97,0xd4,0x7e,0x82,0x01,0x00
+# GFX11: v_cmpx_o_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x97,0xd4,0x7e,0x82,0x01,0x00]
-# GFX11: v_cmpx_o_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x97,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x97,0xd4,0x7f,0xf8,0x00,0x00
+# GFX11: v_cmpx_o_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x97,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX11: v_cmpx_o_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x97,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x97,0xd4,0x7c,0xfc,0x00,0x00
+# GFX11: v_cmpx_o_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x97,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX11: v_cmpx_o_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x97,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x97,0xd4,0xc1,0xfe,0x00,0x00
+# GFX11: v_cmpx_o_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x97,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX11: v_cmpx_o_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x97,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x97,0xd4,0xf0,0xfa,0x00,0x40
+# GFX11: v_cmpx_o_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x97,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX11: v_cmpx_o_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x97,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x97,0xd4,0xfd,0xd4,0x00,0x20
+# GFX11: v_cmpx_o_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x97,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX11: v_cmpx_o_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x97,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
 0x7e,0x83,0x97,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_o_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x97,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_o_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa7,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xa7,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_o_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa7,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_o_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa7,0xd4,0xfe,0xfd,0x03,0x00]
 0x7e,0x00,0xa7,0xd4,0xfe,0xfd,0x03,0x00
+# GFX11: v_cmpx_o_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa7,0xd4,0xfe,0xfd,0x03,0x00]
-# GFX11: v_cmpx_o_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa7,0xd4,0x02,0x08,0x00,0x00]
 0x7e,0x00,0xa7,0xd4,0x02,0x08,0x00,0x00
+# GFX11: v_cmpx_o_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa7,0xd4,0x02,0x08,0x00,0x00]
-# GFX11: v_cmpx_o_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa7,0xd4,0x68,0xd0,0x00,0x00]
 0x7e,0x00,0xa7,0xd4,0x68,0xd0,0x00,0x00
+# GFX11: v_cmpx_o_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa7,0xd4,0x68,0xd0,0x00,0x00]
-# GFX11: v_cmpx_o_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa7,0xd4,0x6a,0xf4,0x00,0x00]
 0x7e,0x00,0xa7,0xd4,0x6a,0xf4,0x00,0x00
+# GFX11: v_cmpx_o_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa7,0xd4,0x6a,0xf4,0x00,0x00]
-# GFX11: v_cmpx_o_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa7,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xa7,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_o_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa7,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_o_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa7,0xd4,0x7e,0xfa,0x01,0x20]
 0x7e,0x01,0xa7,0xd4,0x7e,0xfa,0x01,0x20
+# GFX11: v_cmpx_o_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa7,0xd4,0x7e,0xfa,0x01,0x20]
-# GFX11: v_cmpx_o_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa7,0xd4,0x7c,0xe0,0x01,0x00]
 0x7e,0x00,0xa7,0xd4,0x7c,0xe0,0x01,0x00
+# GFX11: v_cmpx_o_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa7,0xd4,0x7c,0xe0,0x01,0x00]
-# GFX11: v_cmpx_o_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa7,0xd4,0xc1,0x82,0x01,0x00]
 0x7e,0x00,0xa7,0xd4,0xc1,0x82,0x01,0x00
+# GFX11: v_cmpx_o_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa7,0xd4,0xc1,0x82,0x01,0x00]
-# GFX11: v_cmpx_o_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa7,0xd4,0xf0,0xf8,0x00,0x00]
 0x7e,0x00,0xa7,0xd4,0xf0,0xf8,0x00,0x00
+# GFX11: v_cmpx_o_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa7,0xd4,0xf0,0xf8,0x00,0x00]
-# GFX11: v_cmpx_o_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa7,0xd4,0xfd,0xfc,0x00,0x60]
 0x7e,0x03,0xa7,0xd4,0xfd,0xfc,0x00,0x60
+# GFX11: v_cmpx_o_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa7,0xd4,0xfd,0xfc,0x00,0x60]
-# GFX11: v_cmpx_o_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa7,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
 0x7e,0x82,0xa7,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_o_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa7,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_t_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8f,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0x8f,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_t_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8f,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_t_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8f,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0x8f,0xd4,0xff,0xff,0x03,0x00
+# GFX11: v_cmpx_t_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8f,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_t_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8f,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0x8f,0xd4,0x01,0x04,0x00,0x00
+# GFX11: v_cmpx_t_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8f,0xd4,0x01,0x04,0x00,0x00]
-# GFX11: v_cmpx_t_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x8f,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0x8f,0xd4,0x69,0xd2,0x00,0x00
+# GFX11: v_cmpx_t_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x8f,0xd4,0x69,0xd2,0x00,0x00]
-# GFX11: v_cmpx_t_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x8f,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0x8f,0xd4,0x6a,0xf6,0x00,0x00
+# GFX11: v_cmpx_t_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x8f,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX11: v_cmpx_t_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x8f,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x7e,0x00,0x8f,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_t_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x8f,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_t_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x8f,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0x8f,0xd4,0x7b,0xfa,0x01,0x00
+# GFX11: v_cmpx_t_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x8f,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX11: v_cmpx_t_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x8f,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0x8f,0xd4,0x7d,0xe0,0x01,0x00
+# GFX11: v_cmpx_t_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x8f,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX11: v_cmpx_t_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x8f,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0x8f,0xd4,0x7e,0x82,0x01,0x00
+# GFX11: v_cmpx_t_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x8f,0xd4,0x7e,0x82,0x01,0x00]
-# GFX11: v_cmpx_t_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x8f,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x8f,0xd4,0x7f,0xf8,0x00,0x00
+# GFX11: v_cmpx_t_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x8f,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX11: v_cmpx_t_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x8f,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x8f,0xd4,0x7c,0xfc,0x00,0x00
+# GFX11: v_cmpx_t_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x8f,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX11: v_cmpx_t_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x8f,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x8f,0xd4,0xc1,0xfe,0x00,0x00
+# GFX11: v_cmpx_t_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x8f,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX11: v_cmpx_t_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x8f,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x8f,0xd4,0xf0,0xfa,0x00,0x40
+# GFX11: v_cmpx_t_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x8f,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX11: v_cmpx_t_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x8f,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x8f,0xd4,0xfd,0xd4,0x00,0x20
+# GFX11: v_cmpx_t_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x8f,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX11: v_cmpx_t_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8f,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
 0x7e,0x83,0x8f,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_t_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8f,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_t_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9f,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0x9f,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_t_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9f,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_t_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x9f,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0x9f,0xd4,0xff,0xff,0x03,0x00
+# GFX11: v_cmpx_t_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x9f,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_t_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x9f,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0x9f,0xd4,0x01,0x04,0x00,0x00
+# GFX11: v_cmpx_t_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x9f,0xd4,0x01,0x04,0x00,0x00]
-# GFX11: v_cmpx_t_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x9f,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0x9f,0xd4,0x69,0xd2,0x00,0x00
+# GFX11: v_cmpx_t_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x9f,0xd4,0x69,0xd2,0x00,0x00]
-# GFX11: v_cmpx_t_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x9f,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0x9f,0xd4,0x6a,0xf6,0x00,0x00
+# GFX11: v_cmpx_t_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x9f,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX11: v_cmpx_t_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x9f,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0x9f,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_t_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x9f,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_t_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x9f,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0x9f,0xd4,0x7b,0xfa,0x01,0x00
+# GFX11: v_cmpx_t_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x9f,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX11: v_cmpx_t_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x9f,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0x9f,0xd4,0x7d,0xe0,0x01,0x00
+# GFX11: v_cmpx_t_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x9f,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX11: v_cmpx_t_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x9f,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0x9f,0xd4,0x7e,0x82,0x01,0x00
+# GFX11: v_cmpx_t_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x9f,0xd4,0x7e,0x82,0x01,0x00]
-# GFX11: v_cmpx_t_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x9f,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x9f,0xd4,0x7f,0xf8,0x00,0x00
+# GFX11: v_cmpx_t_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x9f,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX11: v_cmpx_t_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x9f,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x9f,0xd4,0x7c,0xfc,0x00,0x00
+# GFX11: v_cmpx_t_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x9f,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX11: v_cmpx_t_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x9f,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x9f,0xd4,0xc1,0xfe,0x00,0x00
+# GFX11: v_cmpx_t_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x9f,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX11: v_cmpx_t_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x9f,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x9f,0xd4,0xf0,0xfa,0x00,0x40
+# GFX11: v_cmpx_t_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x9f,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX11: v_cmpx_t_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x9f,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x9f,0xd4,0xfd,0xd4,0x00,0x20
+# GFX11: v_cmpx_t_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x9f,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX11: v_cmpx_t_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x9f,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
 0x7e,0x83,0x9f,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_t_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x9f,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_t_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xaf,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xaf,0xd4,0x01,0x05,0x02,0x00
+# GFX11: v_cmpx_t_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xaf,0xd4,0x01,0x05,0x02,0x00]
-# GFX11: v_cmpx_t_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xaf,0xd4,0xfe,0xfd,0x03,0x00]
 0x7e,0x00,0xaf,0xd4,0xfe,0xfd,0x03,0x00
+# GFX11: v_cmpx_t_f64_e64 v[254:255], v[254:255] ; encoding:
[0x7e,0x00,0xaf,0xd4,0xfe,0xfd,0x03,0x00] -# GFX11: v_cmpx_t_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xaf,0xd4,0x02,0x08,0x00,0x00] 0x7e,0x00,0xaf,0xd4,0x02,0x08,0x00,0x00 +# GFX11: v_cmpx_t_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xaf,0xd4,0x02,0x08,0x00,0x00] -# GFX11: v_cmpx_t_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xaf,0xd4,0x68,0xd0,0x00,0x00] 0x7e,0x00,0xaf,0xd4,0x68,0xd0,0x00,0x00 +# GFX11: v_cmpx_t_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xaf,0xd4,0x68,0xd0,0x00,0x00] -# GFX11: v_cmpx_t_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xaf,0xd4,0x6a,0xf4,0x00,0x00] 0x7e,0x00,0xaf,0xd4,0x6a,0xf4,0x00,0x00 +# GFX11: v_cmpx_t_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xaf,0xd4,0x6a,0xf4,0x00,0x00] -# GFX11: v_cmpx_t_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xaf,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xaf,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_t_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xaf,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_t_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xaf,0xd4,0x7e,0xfa,0x01,0x20] 0x7e,0x01,0xaf,0xd4,0x7e,0xfa,0x01,0x20 +# GFX11: v_cmpx_t_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xaf,0xd4,0x7e,0xfa,0x01,0x20] -# GFX11: v_cmpx_t_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xaf,0xd4,0x7c,0xe0,0x01,0x00] 0x7e,0x00,0xaf,0xd4,0x7c,0xe0,0x01,0x00 +# GFX11: v_cmpx_t_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xaf,0xd4,0x7c,0xe0,0x01,0x00] -# GFX11: v_cmpx_t_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xaf,0xd4,0xc1,0x82,0x01,0x00] 0x7e,0x00,0xaf,0xd4,0xc1,0x82,0x01,0x00 +# GFX11: v_cmpx_t_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xaf,0xd4,0xc1,0x82,0x01,0x00] -# GFX11: v_cmpx_t_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xaf,0xd4,0xf0,0xf8,0x00,0x00] 0x7e,0x00,0xaf,0xd4,0xf0,0xf8,0x00,0x00 +# GFX11: v_cmpx_t_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xaf,0xd4,0xf0,0xf8,0x00,0x00] -# GFX11: v_cmpx_t_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xaf,0xd4,0xfd,0xfc,0x00,0x60] 0x7e,0x03,0xaf,0xd4,0xfd,0xfc,0x00,0x60 +# GFX11: v_cmpx_t_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xaf,0xd4,0xfd,0xfc,0x00,0x60] -# GFX11: v_cmpx_t_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xaf,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7e,0x82,0xaf,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_t_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xaf,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_t_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc7,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xc7,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_t_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc7,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_t_i32_e64 v255, v255 ; encoding: [0x7e,0x00,0xc7,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xc7,0xd4,0xff,0xff,0x03,0x00 +# GFX11: v_cmpx_t_i32_e64 v255, v255 ; encoding: [0x7e,0x00,0xc7,0xd4,0xff,0xff,0x03,0x00] -# GFX11: v_cmpx_t_i32_e64 s1, s2 ; encoding: [0x7e,0x00,0xc7,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0xc7,0xd4,0x01,0x04,0x00,0x00 +# GFX11: v_cmpx_t_i32_e64 s1, s2 ; encoding: [0x7e,0x00,0xc7,0xd4,0x01,0x04,0x00,0x00] -# GFX11: v_cmpx_t_i32_e64 s105, s105 ; encoding: [0x7e,0x00,0xc7,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0xc7,0xd4,0x69,0xd2,0x00,0x00 +# GFX11: v_cmpx_t_i32_e64 s105, s105 ; encoding: [0x7e,0x00,0xc7,0xd4,0x69,0xd2,0x00,0x00] -# GFX11: v_cmpx_t_i32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xc7,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0xc7,0xd4,0x6a,0xf6,0x00,0x00 +# GFX11: 
v_cmpx_t_i32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xc7,0xd4,0x6a,0xf6,0x00,0x00] -# GFX11: v_cmpx_t_i32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xc7,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xc7,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_t_i32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xc7,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_t_i32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xc7,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0xc7,0xd4,0x7b,0xfa,0x01,0x00 +# GFX11: v_cmpx_t_i32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xc7,0xd4,0x7b,0xfa,0x01,0x00] -# GFX11: v_cmpx_t_i32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xc7,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0xc7,0xd4,0x7d,0xe0,0x01,0x00 +# GFX11: v_cmpx_t_i32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xc7,0xd4,0x7d,0xe0,0x01,0x00] -# GFX11: v_cmpx_t_i32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xc7,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0xc7,0xd4,0x7e,0x82,0x01,0x00 +# GFX11: v_cmpx_t_i32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xc7,0xd4,0x7e,0x82,0x01,0x00] -# GFX11: v_cmpx_t_i32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xc7,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x00,0xc7,0xd4,0x7f,0xf8,0x00,0x00 +# GFX11: v_cmpx_t_i32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xc7,0xd4,0x7f,0xf8,0x00,0x00] -# GFX11: v_cmpx_t_i32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xc7,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0xc7,0xd4,0x7c,0xfc,0x00,0x00 +# GFX11: v_cmpx_t_i32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xc7,0xd4,0x7c,0xfc,0x00,0x00] -# GFX11: v_cmpx_t_i32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xc7,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0xc7,0xd4,0xc1,0xfe,0x00,0x00 +# GFX11: v_cmpx_t_i32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xc7,0xd4,0xc1,0xfe,0x00,0x00] -# GFX11: v_cmpx_t_i32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xc7,0xd4,0xf0,0xfa,0x00,0x00] 0x7e,0x00,0xc7,0xd4,0xf0,0xfa,0x00,0x00 +# GFX11: v_cmpx_t_i32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xc7,0xd4,0xf0,0xfa,0x00,0x00] -# GFX11: v_cmpx_t_i32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xc7,0xd4,0xfd,0xd4,0x00,0x00] 0x7e,0x00,0xc7,0xd4,0xfd,0xd4,0x00,0x00 +# GFX11: v_cmpx_t_i32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xc7,0xd4,0xfd,0xd4,0x00,0x00] -# GFX11: v_cmpx_t_i32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xc7,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xc7,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_t_i32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xc7,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_t_i64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xd7,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xd7,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_t_i64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xd7,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_t_i64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xd7,0xd4,0xfe,0xfd,0x03,0x00] 0x7e,0x00,0xd7,0xd4,0xfe,0xfd,0x03,0x00 +# GFX11: v_cmpx_t_i64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xd7,0xd4,0xfe,0xfd,0x03,0x00] -# GFX11: v_cmpx_t_i64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xd7,0xd4,0x02,0x08,0x00,0x00] 0x7e,0x00,0xd7,0xd4,0x02,0x08,0x00,0x00 +# GFX11: v_cmpx_t_i64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xd7,0xd4,0x02,0x08,0x00,0x00] -# GFX11: v_cmpx_t_i64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xd7,0xd4,0x68,0xd0,0x00,0x00] 0x7e,0x00,0xd7,0xd4,0x68,0xd0,0x00,0x00 +# GFX11: v_cmpx_t_i64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xd7,0xd4,0x68,0xd0,0x00,0x00] -# GFX11: v_cmpx_t_i64_e64 vcc, ttmp[14:15] ; encoding: 
[0x7e,0x00,0xd7,0xd4,0x6a,0xf4,0x00,0x00] 0x7e,0x00,0xd7,0xd4,0x6a,0xf4,0x00,0x00 +# GFX11: v_cmpx_t_i64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xd7,0xd4,0x6a,0xf4,0x00,0x00] -# GFX11: v_cmpx_t_i64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xd7,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xd7,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_t_i64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xd7,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_t_i64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xd7,0xd4,0x7e,0xfa,0x01,0x00] 0x7e,0x00,0xd7,0xd4,0x7e,0xfa,0x01,0x00 +# GFX11: v_cmpx_t_i64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xd7,0xd4,0x7e,0xfa,0x01,0x00] -# GFX11: v_cmpx_t_i64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xd7,0xd4,0x7c,0xe0,0x01,0x00] 0x7e,0x00,0xd7,0xd4,0x7c,0xe0,0x01,0x00 +# GFX11: v_cmpx_t_i64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xd7,0xd4,0x7c,0xe0,0x01,0x00] -# GFX11: v_cmpx_t_i64_e64 -1, -1 ; encoding: [0x7e,0x00,0xd7,0xd4,0xc1,0x82,0x01,0x00] 0x7e,0x00,0xd7,0xd4,0xc1,0x82,0x01,0x00 +# GFX11: v_cmpx_t_i64_e64 -1, -1 ; encoding: [0x7e,0x00,0xd7,0xd4,0xc1,0x82,0x01,0x00] -# GFX11: v_cmpx_t_i64_e64 0.5, null ; encoding: [0x7e,0x00,0xd7,0xd4,0xf0,0xf8,0x00,0x00] 0x7e,0x00,0xd7,0xd4,0xf0,0xf8,0x00,0x00 +# GFX11: v_cmpx_t_i64_e64 0.5, null ; encoding: [0x7e,0x00,0xd7,0xd4,0xf0,0xf8,0x00,0x00] -# GFX11: v_cmpx_t_i64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xd7,0xd4,0xfd,0xfc,0x00,0x00] 0x7e,0x00,0xd7,0xd4,0xfd,0xfc,0x00,0x00 +# GFX11: v_cmpx_t_i64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xd7,0xd4,0xfd,0xfc,0x00,0x00] -# GFX11: v_cmpx_t_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd7,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xd7,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_t_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd7,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_t_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcf,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xcf,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_t_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcf,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_t_u32_e64 v255, v255 ; encoding: [0x7e,0x00,0xcf,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xcf,0xd4,0xff,0xff,0x03,0x00 +# GFX11: v_cmpx_t_u32_e64 v255, v255 ; encoding: [0x7e,0x00,0xcf,0xd4,0xff,0xff,0x03,0x00] -# GFX11: v_cmpx_t_u32_e64 s1, s2 ; encoding: [0x7e,0x00,0xcf,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0xcf,0xd4,0x01,0x04,0x00,0x00 +# GFX11: v_cmpx_t_u32_e64 s1, s2 ; encoding: [0x7e,0x00,0xcf,0xd4,0x01,0x04,0x00,0x00] -# GFX11: v_cmpx_t_u32_e64 s105, s105 ; encoding: [0x7e,0x00,0xcf,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0xcf,0xd4,0x69,0xd2,0x00,0x00 +# GFX11: v_cmpx_t_u32_e64 s105, s105 ; encoding: [0x7e,0x00,0xcf,0xd4,0x69,0xd2,0x00,0x00] -# GFX11: v_cmpx_t_u32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xcf,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0xcf,0xd4,0x6a,0xf6,0x00,0x00 +# GFX11: v_cmpx_t_u32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xcf,0xd4,0x6a,0xf6,0x00,0x00] -# GFX11: v_cmpx_t_u32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xcf,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xcf,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_t_u32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xcf,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_t_u32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xcf,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0xcf,0xd4,0x7b,0xfa,0x01,0x00 +# GFX11: v_cmpx_t_u32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xcf,0xd4,0x7b,0xfa,0x01,0x00] -# 
GFX11: v_cmpx_t_u32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xcf,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0xcf,0xd4,0x7d,0xe0,0x01,0x00 +# GFX11: v_cmpx_t_u32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xcf,0xd4,0x7d,0xe0,0x01,0x00] -# GFX11: v_cmpx_t_u32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xcf,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0xcf,0xd4,0x7e,0x82,0x01,0x00 +# GFX11: v_cmpx_t_u32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xcf,0xd4,0x7e,0x82,0x01,0x00] -# GFX11: v_cmpx_t_u32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xcf,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x00,0xcf,0xd4,0x7f,0xf8,0x00,0x00 +# GFX11: v_cmpx_t_u32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xcf,0xd4,0x7f,0xf8,0x00,0x00] -# GFX11: v_cmpx_t_u32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xcf,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0xcf,0xd4,0x7c,0xfc,0x00,0x00 +# GFX11: v_cmpx_t_u32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xcf,0xd4,0x7c,0xfc,0x00,0x00] -# GFX11: v_cmpx_t_u32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xcf,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0xcf,0xd4,0xc1,0xfe,0x00,0x00 +# GFX11: v_cmpx_t_u32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xcf,0xd4,0xc1,0xfe,0x00,0x00] -# GFX11: v_cmpx_t_u32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xcf,0xd4,0xf0,0xfa,0x00,0x00] 0x7e,0x00,0xcf,0xd4,0xf0,0xfa,0x00,0x00 +# GFX11: v_cmpx_t_u32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xcf,0xd4,0xf0,0xfa,0x00,0x00] -# GFX11: v_cmpx_t_u32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xcf,0xd4,0xfd,0xd4,0x00,0x00] 0x7e,0x00,0xcf,0xd4,0xfd,0xd4,0x00,0x00 +# GFX11: v_cmpx_t_u32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xcf,0xd4,0xfd,0xd4,0x00,0x00] -# GFX11: v_cmpx_t_u32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xcf,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xcf,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_t_u32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xcf,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_t_u64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xdf,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xdf,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_t_u64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xdf,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_t_u64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xdf,0xd4,0xfe,0xfd,0x03,0x00] 0x7e,0x00,0xdf,0xd4,0xfe,0xfd,0x03,0x00 +# GFX11: v_cmpx_t_u64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xdf,0xd4,0xfe,0xfd,0x03,0x00] -# GFX11: v_cmpx_t_u64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xdf,0xd4,0x02,0x08,0x00,0x00] 0x7e,0x00,0xdf,0xd4,0x02,0x08,0x00,0x00 +# GFX11: v_cmpx_t_u64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xdf,0xd4,0x02,0x08,0x00,0x00] -# GFX11: v_cmpx_t_u64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xdf,0xd4,0x68,0xd0,0x00,0x00] 0x7e,0x00,0xdf,0xd4,0x68,0xd0,0x00,0x00 +# GFX11: v_cmpx_t_u64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xdf,0xd4,0x68,0xd0,0x00,0x00] -# GFX11: v_cmpx_t_u64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xdf,0xd4,0x6a,0xf4,0x00,0x00] 0x7e,0x00,0xdf,0xd4,0x6a,0xf4,0x00,0x00 +# GFX11: v_cmpx_t_u64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xdf,0xd4,0x6a,0xf4,0x00,0x00] -# GFX11: v_cmpx_t_u64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xdf,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xdf,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_t_u64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xdf,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_t_u64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xdf,0xd4,0x7e,0xfa,0x01,0x00] 0x7e,0x00,0xdf,0xd4,0x7e,0xfa,0x01,0x00 +# GFX11: v_cmpx_t_u64_e64 exec, 
src_scc ; encoding: [0x7e,0x00,0xdf,0xd4,0x7e,0xfa,0x01,0x00] -# GFX11: v_cmpx_t_u64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xdf,0xd4,0x7c,0xe0,0x01,0x00] 0x7e,0x00,0xdf,0xd4,0x7c,0xe0,0x01,0x00 +# GFX11: v_cmpx_t_u64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xdf,0xd4,0x7c,0xe0,0x01,0x00] -# GFX11: v_cmpx_t_u64_e64 -1, -1 ; encoding: [0x7e,0x00,0xdf,0xd4,0xc1,0x82,0x01,0x00] 0x7e,0x00,0xdf,0xd4,0xc1,0x82,0x01,0x00 +# GFX11: v_cmpx_t_u64_e64 -1, -1 ; encoding: [0x7e,0x00,0xdf,0xd4,0xc1,0x82,0x01,0x00] -# GFX11: v_cmpx_t_u64_e64 0.5, null ; encoding: [0x7e,0x00,0xdf,0xd4,0xf0,0xf8,0x00,0x00] 0x7e,0x00,0xdf,0xd4,0xf0,0xf8,0x00,0x00 +# GFX11: v_cmpx_t_u64_e64 0.5, null ; encoding: [0x7e,0x00,0xdf,0xd4,0xf0,0xf8,0x00,0x00] -# GFX11: v_cmpx_t_u64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xdf,0xd4,0xfd,0xfc,0x00,0x00] 0x7e,0x00,0xdf,0xd4,0xfd,0xfc,0x00,0x00 +# GFX11: v_cmpx_t_u64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xdf,0xd4,0xfd,0xfc,0x00,0x00] -# GFX11: v_cmpx_t_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdf,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xdf,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_t_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdf,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_u_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_u_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_u_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00 +# GFX11: v_cmpx_u_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00] -# GFX11: v_cmpx_u_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0x88,0xd4,0x01,0x04,0x00,0x00 +# GFX11: v_cmpx_u_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x04,0x00,0x00] -# GFX11: v_cmpx_u_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x88,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0x88,0xd4,0x69,0xd2,0x00,0x00 +# GFX11: v_cmpx_u_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x88,0xd4,0x69,0xd2,0x00,0x00] -# GFX11: v_cmpx_u_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x88,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0x88,0xd4,0x6a,0xf6,0x00,0x00 +# GFX11: v_cmpx_u_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x88,0xd4,0x6a,0xf6,0x00,0x00] -# GFX11: v_cmpx_u_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x88,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0x88,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_u_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x88,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_u_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x88,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0x88,0xd4,0x7b,0xfa,0x01,0x00 +# GFX11: v_cmpx_u_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x88,0xd4,0x7b,0xfa,0x01,0x00] -# GFX11: v_cmpx_u_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x88,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0x88,0xd4,0x7d,0xe0,0x01,0x00 +# GFX11: v_cmpx_u_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x88,0xd4,0x7d,0xe0,0x01,0x00] -# GFX11: v_cmpx_u_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x88,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0x88,0xd4,0x7e,0x82,0x01,0x00 +# GFX11: v_cmpx_u_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x88,0xd4,0x7e,0x82,0x01,0x00] -# GFX11: v_cmpx_u_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x88,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x01,0x88,0xd4,0x7f,0xf8,0x00,0x00 +# GFX11: v_cmpx_u_f16_e64 |exec_hi|, null ; encoding: 
[0x7e,0x01,0x88,0xd4,0x7f,0xf8,0x00,0x00] -# GFX11: v_cmpx_u_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x88,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0x88,0xd4,0x7c,0xfc,0x00,0x00 +# GFX11: v_cmpx_u_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x88,0xd4,0x7c,0xfc,0x00,0x00] -# GFX11: v_cmpx_u_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x88,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0x88,0xd4,0xc1,0xfe,0x00,0x00 +# GFX11: v_cmpx_u_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x88,0xd4,0xc1,0xfe,0x00,0x00] -# GFX11: v_cmpx_u_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x88,0xd4,0xf0,0xfa,0x00,0x40] 0x7e,0x00,0x88,0xd4,0xf0,0xfa,0x00,0x40 +# GFX11: v_cmpx_u_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x88,0xd4,0xf0,0xfa,0x00,0x40] -# GFX11: v_cmpx_u_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x88,0xd4,0xfd,0xd4,0x00,0x20] 0x7e,0x02,0x88,0xd4,0xfd,0xd4,0x00,0x20 +# GFX11: v_cmpx_u_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x88,0xd4,0xfd,0xd4,0x00,0x20] -# GFX11: v_cmpx_u_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x88,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x83,0x88,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_u_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x88,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_u_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x98,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x98,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_u_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x98,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_u_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x98,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x98,0xd4,0xff,0xff,0x03,0x00 +# GFX11: v_cmpx_u_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x98,0xd4,0xff,0xff,0x03,0x00] -# GFX11: v_cmpx_u_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x98,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0x98,0xd4,0x01,0x04,0x00,0x00 +# GFX11: v_cmpx_u_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x98,0xd4,0x01,0x04,0x00,0x00] -# GFX11: v_cmpx_u_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x98,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0x98,0xd4,0x69,0xd2,0x00,0x00 +# GFX11: v_cmpx_u_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x98,0xd4,0x69,0xd2,0x00,0x00] -# GFX11: v_cmpx_u_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x98,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0x98,0xd4,0x6a,0xf6,0x00,0x00 +# GFX11: v_cmpx_u_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x98,0xd4,0x6a,0xf6,0x00,0x00] -# GFX11: v_cmpx_u_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x98,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x98,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_u_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x98,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_u_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x98,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0x98,0xd4,0x7b,0xfa,0x01,0x00 +# GFX11: v_cmpx_u_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x98,0xd4,0x7b,0xfa,0x01,0x00] -# GFX11: v_cmpx_u_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x98,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0x98,0xd4,0x7d,0xe0,0x01,0x00 +# GFX11: v_cmpx_u_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x98,0xd4,0x7d,0xe0,0x01,0x00] -# GFX11: v_cmpx_u_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x98,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0x98,0xd4,0x7e,0x82,0x01,0x00 +# GFX11: v_cmpx_u_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x98,0xd4,0x7e,0x82,0x01,0x00] -# GFX11: v_cmpx_u_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x98,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x01,0x98,0xd4,0x7f,0xf8,0x00,0x00 +# GFX11: v_cmpx_u_f32_e64 |exec_hi|, null ; 
encoding: [0x7e,0x01,0x98,0xd4,0x7f,0xf8,0x00,0x00] -# GFX11: v_cmpx_u_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x98,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0x98,0xd4,0x7c,0xfc,0x00,0x00 +# GFX11: v_cmpx_u_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x98,0xd4,0x7c,0xfc,0x00,0x00] -# GFX11: v_cmpx_u_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x98,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0x98,0xd4,0xc1,0xfe,0x00,0x00 +# GFX11: v_cmpx_u_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x98,0xd4,0xc1,0xfe,0x00,0x00] -# GFX11: v_cmpx_u_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x98,0xd4,0xf0,0xfa,0x00,0x40] 0x7e,0x00,0x98,0xd4,0xf0,0xfa,0x00,0x40 +# GFX11: v_cmpx_u_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x98,0xd4,0xf0,0xfa,0x00,0x40] -# GFX11: v_cmpx_u_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x98,0xd4,0xfd,0xd4,0x00,0x20] 0x7e,0x02,0x98,0xd4,0xfd,0xd4,0x00,0x20 +# GFX11: v_cmpx_u_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x98,0xd4,0xfd,0xd4,0x00,0x20] -# GFX11: v_cmpx_u_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x98,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] 0x7e,0x83,0x98,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_u_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x98,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_u_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa8,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xa8,0xd4,0x01,0x05,0x02,0x00 +# GFX11: v_cmpx_u_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa8,0xd4,0x01,0x05,0x02,0x00] -# GFX11: v_cmpx_u_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa8,0xd4,0xfe,0xfd,0x03,0x00] 0x7e,0x00,0xa8,0xd4,0xfe,0xfd,0x03,0x00 +# GFX11: v_cmpx_u_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa8,0xd4,0xfe,0xfd,0x03,0x00] -# GFX11: v_cmpx_u_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa8,0xd4,0x02,0x08,0x00,0x00] 0x7e,0x00,0xa8,0xd4,0x02,0x08,0x00,0x00 +# GFX11: v_cmpx_u_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa8,0xd4,0x02,0x08,0x00,0x00] -# GFX11: v_cmpx_u_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa8,0xd4,0x68,0xd0,0x00,0x00] 0x7e,0x00,0xa8,0xd4,0x68,0xd0,0x00,0x00 +# GFX11: v_cmpx_u_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa8,0xd4,0x68,0xd0,0x00,0x00] -# GFX11: v_cmpx_u_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa8,0xd4,0x6a,0xf4,0x00,0x00] 0x7e,0x00,0xa8,0xd4,0x6a,0xf4,0x00,0x00 +# GFX11: v_cmpx_u_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa8,0xd4,0x6a,0xf4,0x00,0x00] -# GFX11: v_cmpx_u_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa8,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xa8,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_u_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa8,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_u_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa8,0xd4,0x7e,0xfa,0x01,0x20] 0x7e,0x01,0xa8,0xd4,0x7e,0xfa,0x01,0x20 +# GFX11: v_cmpx_u_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa8,0xd4,0x7e,0xfa,0x01,0x20] -# GFX11: v_cmpx_u_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa8,0xd4,0x7c,0xe0,0x01,0x00] 0x7e,0x00,0xa8,0xd4,0x7c,0xe0,0x01,0x00 +# GFX11: v_cmpx_u_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa8,0xd4,0x7c,0xe0,0x01,0x00] -# GFX11: v_cmpx_u_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa8,0xd4,0xc1,0x82,0x01,0x00] 0x7e,0x00,0xa8,0xd4,0xc1,0x82,0x01,0x00 +# GFX11: v_cmpx_u_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa8,0xd4,0xc1,0x82,0x01,0x00] -# GFX11: v_cmpx_u_f64_e64 0.5, null ; encoding: 
[0x7e,0x00,0xa8,0xd4,0xf0,0xf8,0x00,0x00]
 0x7e,0x00,0xa8,0xd4,0xf0,0xf8,0x00,0x00
+# GFX11: v_cmpx_u_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa8,0xd4,0xf0,0xf8,0x00,0x00]
-# GFX11: v_cmpx_u_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa8,0xd4,0xfd,0xfc,0x00,0x60]
 0x7e,0x03,0xa8,0xd4,0xfd,0xfc,0x00,0x60
+# GFX11: v_cmpx_u_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa8,0xd4,0xfd,0xfc,0x00,0x60]
-# GFX11: v_cmpx_u_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa8,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
 0x7e,0x82,0xa8,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_u_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa8,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopc.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopc.txt
index b87c7bb9b268f..046c8f07e16fa 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopc.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopc.txt
@@ -1,5308 +1,5309 @@
+# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W32
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W64
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W32
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W64
-# W32: v_cmp_class_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0xfa,0x7c]
-# W64: v_cmp_class_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0xfa,0x7c]
 0x01,0x05,0xfa,0x7c
+# W32: v_cmp_class_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0xfa,0x7c]
+# W64: v_cmp_class_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0xfa,0x7c]
-# W32: v_cmp_class_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0xfa,0x7c]
-# W64: v_cmp_class_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0xfa,0x7c]
 0x7f,0x05,0xfa,0x7c
+# W32: v_cmp_class_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0xfa,0x7c]
+# W64: v_cmp_class_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0xfa,0x7c]
-# W32: v_cmp_class_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0xfa,0x7c]
-# W64: v_cmp_class_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0xfa,0x7c]
 0x01,0x04,0xfa,0x7c
+# W32: v_cmp_class_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0xfa,0x7c]
+# W64: v_cmp_class_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0xfa,0x7c]
-# W32: v_cmp_class_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0xfa,0x7c]
-# W64: v_cmp_class_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0xfa,0x7c]
 0x69,0x04,0xfa,0x7c
+# W32: v_cmp_class_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0xfa,0x7c]
+# W64: v_cmp_class_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0xfa,0x7c]
-# W32: v_cmp_class_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0xfa,0x7c]
-# W64: v_cmp_class_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0xfa,0x7c]
 0x6a,0x04,0xfa,0x7c
+# W32: v_cmp_class_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0xfa,0x7c]
+# W64: v_cmp_class_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0xfa,0x7c]
-# W32: v_cmp_class_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0xfa,0x7c]
-# W64: v_cmp_class_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0xfa,0x7c]
 0x6b,0x04,0xfa,0x7c
+# W32: v_cmp_class_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0xfa,0x7c]
+# W64: v_cmp_class_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0xfa,0x7c]
-# W32: v_cmp_class_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0xfa,0x7c]
-# W64: v_cmp_class_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0xfa,0x7c]
 0x7b,0x04,0xfa,0x7c
+# W32: v_cmp_class_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0xfa,0x7c]
+# W64: v_cmp_class_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0xfa,0x7c]
-# W32: v_cmp_class_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0xfa,0x7c]
-# W64: v_cmp_class_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0xfa,0x7c]
 0x7d,0x04,0xfa,0x7c
+# W32: v_cmp_class_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0xfa,0x7c]
+# W64: v_cmp_class_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0xfa,0x7c]
-# W32: v_cmp_class_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0xfa,0x7c]
-# W64: v_cmp_class_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0xfa,0x7c]
 0x7e,0x04,0xfa,0x7c
+# W32: v_cmp_class_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0xfa,0x7c]
+# W64: v_cmp_class_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0xfa,0x7c]
-# W32: v_cmp_class_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0xfa,0x7c]
-# W64: v_cmp_class_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0xfa,0x7c]
 0x7f,0x04,0xfa,0x7c
+# W32: v_cmp_class_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0xfa,0x7c]
+# W64: v_cmp_class_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0xfa,0x7c]
-# W32: v_cmp_class_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0xfa,0x7c]
-# W64: v_cmp_class_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0xfa,0x7c]
 0x7c,0x04,0xfa,0x7c
+# W32: v_cmp_class_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0xfa,0x7c]
+# W64: v_cmp_class_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0xfa,0x7c]
-# W32: v_cmp_class_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0xfa,0x7c]
-# W64: v_cmp_class_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0xfa,0x7c]
 0xc1,0x04,0xfa,0x7c
+# W32: v_cmp_class_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0xfa,0x7c]
+# W64: v_cmp_class_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0xfa,0x7c]
-# W32: v_cmp_class_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0xfa,0x7c]
-# W64: v_cmp_class_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0xfa,0x7c]
 0xf0,0x04,0xfa,0x7c
+# W32: v_cmp_class_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0xfa,0x7c]
+# W64: v_cmp_class_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0xfa,0x7c]
-# W32: v_cmp_class_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0xfa,0x7c]
-# W64: v_cmp_class_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0xfa,0x7c]
 0xfd,0x04,0xfa,0x7c
+# W32: v_cmp_class_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0xfa,0x7c]
+# W64: v_cmp_class_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0xfa,0x7c]
-# W32: v_cmp_class_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfa,0x7c,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_class_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfa,0x7c,0x0b,0xfe,0x00,0x00]
 0xff,0xfe,0xfa,0x7c,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_class_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfa,0x7c,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_class_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfa,0x7c,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_class_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0xfc,0x7c]
-# W64: v_cmp_class_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0xfc,0x7c]
 0x01,0x05,0xfc,0x7c
+# W32: v_cmp_class_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0xfc,0x7c]
+# W64: v_cmp_class_f32_e32 vcc, v1, v2 ;
encoding: [0x01,0x05,0xfc,0x7c] -# W32: v_cmp_class_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0xfc,0x7c] -# W64: v_cmp_class_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0xfc,0x7c] 0xff,0x05,0xfc,0x7c +# W32: v_cmp_class_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0xfc,0x7c] +# W64: v_cmp_class_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0xfc,0x7c] -# W32: v_cmp_class_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0xfc,0x7c] -# W64: v_cmp_class_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0xfc,0x7c] 0x01,0x04,0xfc,0x7c +# W32: v_cmp_class_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0xfc,0x7c] +# W64: v_cmp_class_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0xfc,0x7c] -# W32: v_cmp_class_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0xfc,0x7c] -# W64: v_cmp_class_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0xfc,0x7c] 0x69,0x04,0xfc,0x7c +# W32: v_cmp_class_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0xfc,0x7c] +# W64: v_cmp_class_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0xfc,0x7c] -# W32: v_cmp_class_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0xfc,0x7c] -# W64: v_cmp_class_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0xfc,0x7c] 0x6a,0x04,0xfc,0x7c +# W32: v_cmp_class_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0xfc,0x7c] +# W64: v_cmp_class_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0xfc,0x7c] -# W32: v_cmp_class_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0xfc,0x7c] -# W64: v_cmp_class_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0xfc,0x7c] 0x6b,0x04,0xfc,0x7c +# W32: v_cmp_class_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0xfc,0x7c] +# W64: v_cmp_class_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0xfc,0x7c] -# W32: v_cmp_class_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0xfc,0x7c] -# W64: v_cmp_class_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0xfc,0x7c] 0x7b,0x04,0xfc,0x7c +# W32: v_cmp_class_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0xfc,0x7c] +# W64: v_cmp_class_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0xfc,0x7c] -# W32: v_cmp_class_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0xfc,0x7c] -# W64: v_cmp_class_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0xfc,0x7c] 0x7d,0x04,0xfc,0x7c +# W32: v_cmp_class_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0xfc,0x7c] +# W64: v_cmp_class_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0xfc,0x7c] -# W32: v_cmp_class_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0xfc,0x7c] -# W64: v_cmp_class_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0xfc,0x7c] 0x7e,0x04,0xfc,0x7c +# W32: v_cmp_class_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0xfc,0x7c] +# W64: v_cmp_class_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0xfc,0x7c] -# W32: v_cmp_class_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0xfc,0x7c] -# W64: v_cmp_class_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0xfc,0x7c] 0x7f,0x04,0xfc,0x7c +# W32: v_cmp_class_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0xfc,0x7c] +# W64: v_cmp_class_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0xfc,0x7c] -# W32: v_cmp_class_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0xfc,0x7c] -# W64: v_cmp_class_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0xfc,0x7c] 0x7c,0x04,0xfc,0x7c +# W32: v_cmp_class_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0xfc,0x7c] +# W64: v_cmp_class_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0xfc,0x7c] -# W32: v_cmp_class_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0xfc,0x7c] -# W64: v_cmp_class_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0xfc,0x7c] 0xc1,0x04,0xfc,0x7c +# W32: 
v_cmp_class_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0xfc,0x7c] +# W64: v_cmp_class_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0xfc,0x7c] -# W32: v_cmp_class_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0xfc,0x7c] -# W64: v_cmp_class_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0xfc,0x7c] 0xf0,0x04,0xfc,0x7c +# W32: v_cmp_class_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0xfc,0x7c] +# W64: v_cmp_class_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0xfc,0x7c] -# W32: v_cmp_class_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0xfc,0x7c] -# W64: v_cmp_class_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0xfc,0x7c] 0xfd,0x04,0xfc,0x7c +# W32: v_cmp_class_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0xfc,0x7c] +# W64: v_cmp_class_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0xfc,0x7c] +0xff,0xfe,0xfd,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_class_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xfd,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_class_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xfd,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfe,0xfd,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_class_f64_e32 vcc_lo, v[1:2], v2 ; encoding: [0x01,0x05,0xfe,0x7c] -# W64: v_cmp_class_f64_e32 vcc, v[1:2], v2 ; encoding: [0x01,0x05,0xfe,0x7c] 0x01,0x05,0xfe,0x7c +# W32: v_cmp_class_f64_e32 vcc_lo, v[1:2], v2 ; encoding: [0x01,0x05,0xfe,0x7c] +# W64: v_cmp_class_f64_e32 vcc, v[1:2], v2 ; encoding: [0x01,0x05,0xfe,0x7c] -# W32: v_cmp_class_f64_e32 vcc_lo, v[254:255], v2 ; encoding: [0xfe,0x05,0xfe,0x7c] -# W64: v_cmp_class_f64_e32 vcc, v[254:255], v2 ; encoding: [0xfe,0x05,0xfe,0x7c] 0xfe,0x05,0xfe,0x7c +# W32: v_cmp_class_f64_e32 vcc_lo, v[254:255], v2 ; encoding: [0xfe,0x05,0xfe,0x7c] +# W64: v_cmp_class_f64_e32 vcc, v[254:255], v2 ; encoding: [0xfe,0x05,0xfe,0x7c] -# W32: v_cmp_class_f64_e32 vcc_lo, s[2:3], v2 ; encoding: [0x02,0x04,0xfe,0x7c] -# W64: v_cmp_class_f64_e32 vcc, s[2:3], v2 ; encoding: [0x02,0x04,0xfe,0x7c] 0x02,0x04,0xfe,0x7c +# W32: v_cmp_class_f64_e32 vcc_lo, s[2:3], v2 ; encoding: [0x02,0x04,0xfe,0x7c] +# W64: v_cmp_class_f64_e32 vcc, s[2:3], v2 ; encoding: [0x02,0x04,0xfe,0x7c] -# W32: v_cmp_class_f64_e32 vcc_lo, s[104:105], v2 ; encoding: [0x68,0x04,0xfe,0x7c] -# W64: v_cmp_class_f64_e32 vcc, s[104:105], v2 ; encoding: [0x68,0x04,0xfe,0x7c] 0x68,0x04,0xfe,0x7c +# W32: v_cmp_class_f64_e32 vcc_lo, s[104:105], v2 ; encoding: [0x68,0x04,0xfe,0x7c] +# W64: v_cmp_class_f64_e32 vcc, s[104:105], v2 ; encoding: [0x68,0x04,0xfe,0x7c] -# W32: v_cmp_class_f64_e32 vcc_lo, vcc, v2 ; encoding: [0x6a,0x04,0xfe,0x7c] -# W64: v_cmp_class_f64_e32 vcc, vcc, v2 ; encoding: [0x6a,0x04,0xfe,0x7c] 0x6a,0x04,0xfe,0x7c +# W32: v_cmp_class_f64_e32 vcc_lo, vcc, v2 ; encoding: [0x6a,0x04,0xfe,0x7c] +# W64: v_cmp_class_f64_e32 vcc, vcc, v2 ; encoding: [0x6a,0x04,0xfe,0x7c] -# W32: v_cmp_class_f64_e32 vcc_lo, ttmp[14:15], v2 ; encoding: [0x7a,0x04,0xfe,0x7c] -# W64: v_cmp_class_f64_e32 vcc, ttmp[14:15], v2 ; encoding: [0x7a,0x04,0xfe,0x7c] 0x7a,0x04,0xfe,0x7c +# W32: v_cmp_class_f64_e32 vcc_lo, ttmp[14:15], v2 ; encoding: [0x7a,0x04,0xfe,0x7c] +# W64: v_cmp_class_f64_e32 vcc, ttmp[14:15], v2 ; encoding: [0x7a,0x04,0xfe,0x7c] -# W32: v_cmp_class_f64_e32 vcc_lo, exec, v2 ; encoding: [0x7e,0x04,0xfe,0x7c] -# W64: v_cmp_class_f64_e32 vcc, exec, v2 ; encoding: [0x7e,0x04,0xfe,0x7c] 0x7e,0x04,0xfe,0x7c +# W32: v_cmp_class_f64_e32 vcc_lo, exec, v2 ; encoding: [0x7e,0x04,0xfe,0x7c] +# W64: v_cmp_class_f64_e32 vcc, exec, v2 ; encoding: [0x7e,0x04,0xfe,0x7c] -# W32: v_cmp_class_f64_e32 vcc_lo, null, v2 ; 
encoding: [0x7c,0x04,0xfe,0x7c] -# W64: v_cmp_class_f64_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0xfe,0x7c] 0x7c,0x04,0xfe,0x7c +# W32: v_cmp_class_f64_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0xfe,0x7c] +# W64: v_cmp_class_f64_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0xfe,0x7c] -# W32: v_cmp_class_f64_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0xfe,0x7c] -# W64: v_cmp_class_f64_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0xfe,0x7c] 0xc1,0x04,0xfe,0x7c +# W32: v_cmp_class_f64_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0xfe,0x7c] +# W64: v_cmp_class_f64_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0xfe,0x7c] -# W32: v_cmp_class_f64_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0xfe,0x7c] -# W64: v_cmp_class_f64_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0xfe,0x7c] 0xf0,0x04,0xfe,0x7c +# W32: v_cmp_class_f64_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0xfe,0x7c] +# W64: v_cmp_class_f64_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0xfe,0x7c] -# W32: v_cmp_class_f64_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0xfe,0x7c] -# W64: v_cmp_class_f64_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0xfe,0x7c] 0xfd,0x04,0xfe,0x7c +# W32: v_cmp_class_f64_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0xfe,0x7c] +# W64: v_cmp_class_f64_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0xfe,0x7c] +0xff,0xfe,0xff,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_class_f64_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_class_f64_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfe,0xff,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_eq_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x04,0x7c] -# W64: v_cmp_eq_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x04,0x7c] 0x01,0x05,0x04,0x7c +# W32: v_cmp_eq_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x04,0x7c] +# W64: v_cmp_eq_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x04,0x7c] -# W32: v_cmp_eq_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x04,0x7c] -# W64: v_cmp_eq_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x04,0x7c] 0x7f,0x05,0x04,0x7c +# W32: v_cmp_eq_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x04,0x7c] +# W64: v_cmp_eq_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x04,0x7c] -# W32: v_cmp_eq_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x04,0x7c] -# W64: v_cmp_eq_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x04,0x7c] 0x01,0x04,0x04,0x7c +# W32: v_cmp_eq_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x04,0x7c] +# W64: v_cmp_eq_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x04,0x7c] -# W32: v_cmp_eq_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x04,0x7c] -# W64: v_cmp_eq_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x04,0x7c] 0x69,0x04,0x04,0x7c +# W32: v_cmp_eq_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x04,0x7c] +# W64: v_cmp_eq_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x04,0x7c] -# W32: v_cmp_eq_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x04,0x7c] -# W64: v_cmp_eq_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x04,0x7c] 0x6a,0x04,0x04,0x7c +# W32: v_cmp_eq_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x04,0x7c] +# W64: v_cmp_eq_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x04,0x7c] -# W32: v_cmp_eq_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x04,0x7c] -# W64: v_cmp_eq_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x04,0x7c] 0x6b,0x04,0x04,0x7c +# W32: v_cmp_eq_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x04,0x7c] +# W64: v_cmp_eq_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x04,0x7c] -# W32: v_cmp_eq_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x04,0x7c] -# 
W64: v_cmp_eq_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x04,0x7c] 0x7b,0x04,0x04,0x7c +# W32: v_cmp_eq_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x04,0x7c] +# W64: v_cmp_eq_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x04,0x7c] -# W32: v_cmp_eq_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x04,0x7c] -# W64: v_cmp_eq_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x04,0x7c] 0x7d,0x04,0x04,0x7c +# W32: v_cmp_eq_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x04,0x7c] +# W64: v_cmp_eq_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x04,0x7c] -# W32: v_cmp_eq_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x04,0x7c] -# W64: v_cmp_eq_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x04,0x7c] 0x7e,0x04,0x04,0x7c +# W32: v_cmp_eq_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x04,0x7c] +# W64: v_cmp_eq_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x04,0x7c] -# W32: v_cmp_eq_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x04,0x7c] -# W64: v_cmp_eq_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x04,0x7c] 0x7f,0x04,0x04,0x7c +# W32: v_cmp_eq_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x04,0x7c] +# W64: v_cmp_eq_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x04,0x7c] -# W32: v_cmp_eq_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x04,0x7c] -# W64: v_cmp_eq_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x04,0x7c] 0x7c,0x04,0x04,0x7c +# W32: v_cmp_eq_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x04,0x7c] +# W64: v_cmp_eq_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x04,0x7c] -# W32: v_cmp_eq_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x04,0x7c] -# W64: v_cmp_eq_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x04,0x7c] 0xc1,0x04,0x04,0x7c +# W32: v_cmp_eq_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x04,0x7c] +# W64: v_cmp_eq_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x04,0x7c] -# W32: v_cmp_eq_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x04,0x7c] -# W64: v_cmp_eq_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x04,0x7c] 0xf0,0x04,0x04,0x7c +# W32: v_cmp_eq_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x04,0x7c] +# W64: v_cmp_eq_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x04,0x7c] -# W32: v_cmp_eq_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x04,0x7c] -# W64: v_cmp_eq_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x04,0x7c] 0xfd,0x04,0x04,0x7c +# W32: v_cmp_eq_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x04,0x7c] +# W64: v_cmp_eq_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x04,0x7c] -# W32: v_cmp_eq_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x04,0x7c,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_eq_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x04,0x7c,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x04,0x7c,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_eq_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x04,0x7c,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_eq_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x04,0x7c,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_eq_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x24,0x7c] -# W64: v_cmp_eq_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x24,0x7c] 0x01,0x05,0x24,0x7c +# W32: v_cmp_eq_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x24,0x7c] +# W64: v_cmp_eq_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x24,0x7c] -# W32: v_cmp_eq_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x24,0x7c] -# W64: v_cmp_eq_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x24,0x7c] 0xff,0x05,0x24,0x7c +# W32: v_cmp_eq_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x24,0x7c] +# W64: v_cmp_eq_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x24,0x7c] -# 
W32: v_cmp_eq_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x24,0x7c] -# W64: v_cmp_eq_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x24,0x7c] 0x01,0x04,0x24,0x7c +# W32: v_cmp_eq_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x24,0x7c] +# W64: v_cmp_eq_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x24,0x7c] -# W32: v_cmp_eq_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x24,0x7c] -# W64: v_cmp_eq_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x24,0x7c] 0x69,0x04,0x24,0x7c +# W32: v_cmp_eq_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x24,0x7c] +# W64: v_cmp_eq_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x24,0x7c] -# W32: v_cmp_eq_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x24,0x7c] -# W64: v_cmp_eq_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x24,0x7c] 0x6a,0x04,0x24,0x7c +# W32: v_cmp_eq_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x24,0x7c] +# W64: v_cmp_eq_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x24,0x7c] -# W32: v_cmp_eq_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x24,0x7c] -# W64: v_cmp_eq_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x24,0x7c] 0x6b,0x04,0x24,0x7c +# W32: v_cmp_eq_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x24,0x7c] +# W64: v_cmp_eq_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x24,0x7c] -# W32: v_cmp_eq_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x24,0x7c] -# W64: v_cmp_eq_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x24,0x7c] 0x7b,0x04,0x24,0x7c +# W32: v_cmp_eq_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x24,0x7c] +# W64: v_cmp_eq_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x24,0x7c] -# W32: v_cmp_eq_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x24,0x7c] -# W64: v_cmp_eq_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x24,0x7c] 0x7d,0x04,0x24,0x7c +# W32: v_cmp_eq_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x24,0x7c] +# W64: v_cmp_eq_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x24,0x7c] -# W32: v_cmp_eq_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x24,0x7c] -# W64: v_cmp_eq_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x24,0x7c] 0x7e,0x04,0x24,0x7c +# W32: v_cmp_eq_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x24,0x7c] +# W64: v_cmp_eq_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x24,0x7c] -# W32: v_cmp_eq_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x24,0x7c] -# W64: v_cmp_eq_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x24,0x7c] 0x7f,0x04,0x24,0x7c +# W32: v_cmp_eq_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x24,0x7c] +# W64: v_cmp_eq_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x24,0x7c] -# W32: v_cmp_eq_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x24,0x7c] -# W64: v_cmp_eq_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x24,0x7c] 0x7c,0x04,0x24,0x7c +# W32: v_cmp_eq_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x24,0x7c] +# W64: v_cmp_eq_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x24,0x7c] -# W32: v_cmp_eq_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x24,0x7c] -# W64: v_cmp_eq_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x24,0x7c] 0xc1,0x04,0x24,0x7c +# W32: v_cmp_eq_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x24,0x7c] +# W64: v_cmp_eq_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x24,0x7c] -# W32: v_cmp_eq_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x24,0x7c] -# W64: v_cmp_eq_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x24,0x7c] 0xf0,0x04,0x24,0x7c +# W32: v_cmp_eq_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x24,0x7c] +# W64: v_cmp_eq_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x24,0x7c] -# W32: v_cmp_eq_f32_e32 vcc_lo, 
src_scc, v2 ; encoding: [0xfd,0x04,0x24,0x7c] -# W64: v_cmp_eq_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x24,0x7c] 0xfd,0x04,0x24,0x7c +# W32: v_cmp_eq_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x24,0x7c] +# W64: v_cmp_eq_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x24,0x7c] -# W32: v_cmp_eq_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x25,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_eq_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x25,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x25,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_eq_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x25,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_eq_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x25,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_eq_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x44,0x7c] -# W64: v_cmp_eq_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x44,0x7c] 0x01,0x05,0x44,0x7c +# W32: v_cmp_eq_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x44,0x7c] +# W64: v_cmp_eq_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x44,0x7c] -# W32: v_cmp_eq_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x44,0x7c] -# W64: v_cmp_eq_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x44,0x7c] 0xfe,0x05,0x44,0x7c +# W32: v_cmp_eq_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x44,0x7c] +# W64: v_cmp_eq_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x44,0x7c] -# W32: v_cmp_eq_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x44,0x7c] -# W64: v_cmp_eq_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x44,0x7c] 0x02,0x04,0x44,0x7c +# W32: v_cmp_eq_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x44,0x7c] +# W64: v_cmp_eq_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x44,0x7c] -# W32: v_cmp_eq_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x44,0x7c] -# W64: v_cmp_eq_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x44,0x7c] 0x68,0x04,0x44,0x7c +# W32: v_cmp_eq_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x44,0x7c] +# W64: v_cmp_eq_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x44,0x7c] -# W32: v_cmp_eq_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x44,0x7c] -# W64: v_cmp_eq_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x44,0x7c] 0x6a,0x04,0x44,0x7c +# W32: v_cmp_eq_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x44,0x7c] +# W64: v_cmp_eq_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x44,0x7c] +0x7a,0x04,0x44,0x7c # W32: v_cmp_eq_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x44,0x7c] # W64: v_cmp_eq_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x44,0x7c] -0x7a,0x04,0x44,0x7c -# W32: v_cmp_eq_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x44,0x7c] -# W64: v_cmp_eq_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x44,0x7c] 0x7e,0x04,0x44,0x7c +# W32: v_cmp_eq_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x44,0x7c] +# W64: v_cmp_eq_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x44,0x7c] -# W32: v_cmp_eq_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x44,0x7c] -# W64: v_cmp_eq_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x44,0x7c] 0x7c,0x04,0x44,0x7c +# W32: v_cmp_eq_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x44,0x7c] +# W64: v_cmp_eq_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x44,0x7c] -# W32: v_cmp_eq_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x44,0x7c] -# W64: v_cmp_eq_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x44,0x7c] 0xc1,0x04,0x44,0x7c +# W32: v_cmp_eq_f64_e32 
vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x44,0x7c] +# W64: v_cmp_eq_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x44,0x7c] -# W32: v_cmp_eq_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x44,0x7c] -# W64: v_cmp_eq_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x44,0x7c] 0xf0,0x04,0x44,0x7c +# W32: v_cmp_eq_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x44,0x7c] +# W64: v_cmp_eq_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x44,0x7c] -# W32: v_cmp_eq_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x44,0x7c] -# W64: v_cmp_eq_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x44,0x7c] 0xfd,0x04,0x44,0x7c +# W32: v_cmp_eq_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x44,0x7c] +# W64: v_cmp_eq_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x44,0x7c] +0xff,0xfc,0x45,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_eq_f64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x45,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_eq_f64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x45,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0x45,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_eq_i16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x64,0x7c] -# W64: v_cmp_eq_i16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x64,0x7c] 0x01,0x05,0x64,0x7c +# W32: v_cmp_eq_i16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x64,0x7c] +# W64: v_cmp_eq_i16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x64,0x7c] -# W32: v_cmp_eq_i16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x64,0x7c] -# W64: v_cmp_eq_i16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x64,0x7c] 0x7f,0x05,0x64,0x7c +# W32: v_cmp_eq_i16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x64,0x7c] +# W64: v_cmp_eq_i16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x64,0x7c] -# W32: v_cmp_eq_i16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x64,0x7c] -# W64: v_cmp_eq_i16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x64,0x7c] 0x01,0x04,0x64,0x7c +# W32: v_cmp_eq_i16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x64,0x7c] +# W64: v_cmp_eq_i16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x64,0x7c] -# W32: v_cmp_eq_i16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x64,0x7c] -# W64: v_cmp_eq_i16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x64,0x7c] 0x69,0x04,0x64,0x7c +# W32: v_cmp_eq_i16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x64,0x7c] +# W64: v_cmp_eq_i16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x64,0x7c] -# W32: v_cmp_eq_i16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x64,0x7c] -# W64: v_cmp_eq_i16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x64,0x7c] 0x6a,0x04,0x64,0x7c +# W32: v_cmp_eq_i16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x64,0x7c] +# W64: v_cmp_eq_i16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x64,0x7c] -# W32: v_cmp_eq_i16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x64,0x7c] -# W64: v_cmp_eq_i16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x64,0x7c] 0x6b,0x04,0x64,0x7c +# W32: v_cmp_eq_i16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x64,0x7c] +# W64: v_cmp_eq_i16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x64,0x7c] -# W32: v_cmp_eq_i16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x64,0x7c] -# W64: v_cmp_eq_i16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x64,0x7c] 0x7b,0x04,0x64,0x7c +# W32: v_cmp_eq_i16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x64,0x7c] +# W64: v_cmp_eq_i16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x64,0x7c] -# W32: v_cmp_eq_i16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x64,0x7c] -# W64: v_cmp_eq_i16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x64,0x7c] 0x7d,0x04,0x64,0x7c +# W32: v_cmp_eq_i16_e32 vcc_lo, m0, v2 ; encoding: 
[0x7d,0x04,0x64,0x7c] +# W64: v_cmp_eq_i16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x64,0x7c] -# W32: v_cmp_eq_i16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x64,0x7c] -# W64: v_cmp_eq_i16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x64,0x7c] 0x7e,0x04,0x64,0x7c +# W32: v_cmp_eq_i16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x64,0x7c] +# W64: v_cmp_eq_i16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x64,0x7c] -# W32: v_cmp_eq_i16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x64,0x7c] -# W64: v_cmp_eq_i16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x64,0x7c] 0x7f,0x04,0x64,0x7c +# W32: v_cmp_eq_i16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x64,0x7c] +# W64: v_cmp_eq_i16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x64,0x7c] -# W32: v_cmp_eq_i16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x64,0x7c] -# W64: v_cmp_eq_i16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x64,0x7c] 0x7c,0x04,0x64,0x7c +# W32: v_cmp_eq_i16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x64,0x7c] +# W64: v_cmp_eq_i16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x64,0x7c] -# W32: v_cmp_eq_i16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x64,0x7c] -# W64: v_cmp_eq_i16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x64,0x7c] 0xc1,0x04,0x64,0x7c +# W32: v_cmp_eq_i16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x64,0x7c] +# W64: v_cmp_eq_i16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x64,0x7c] -# W32: v_cmp_eq_i16_e32 vcc_lo, 0x3800, v2 -# W64: v_cmp_eq_i16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x64,0x7c,0x00,0x38,0x00,0x00] 0xf0,0x04,0x64,0x7c +# W32: v_cmp_eq_i16_e32 vcc_lo, 0x3800, v2 ; encoding: [0xff,0x04,0x64,0x7c,0x00,0x38,0x00,0x00] +# W64: v_cmp_eq_i16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x64,0x7c,0x00,0x38,0x00,0x00] -# W32: v_cmp_eq_i16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x64,0x7c] -# W64: v_cmp_eq_i16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x64,0x7c] 0xfd,0x04,0x64,0x7c +# W32: v_cmp_eq_i16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x64,0x7c] +# W64: v_cmp_eq_i16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x64,0x7c] -# W32: v_cmp_eq_i16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x64,0x7c,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_eq_i16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x64,0x7c,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x64,0x7c,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_eq_i16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x64,0x7c,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_eq_i16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x64,0x7c,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_eq_i32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x84,0x7c] -# W64: v_cmp_eq_i32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x84,0x7c] 0x01,0x05,0x84,0x7c +# W32: v_cmp_eq_i32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x84,0x7c] +# W64: v_cmp_eq_i32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x84,0x7c] -# W32: v_cmp_eq_i32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x84,0x7c] -# W64: v_cmp_eq_i32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x84,0x7c] 0xff,0x05,0x84,0x7c +# W32: v_cmp_eq_i32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x84,0x7c] +# W64: v_cmp_eq_i32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x84,0x7c] -# W32: v_cmp_eq_i32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x84,0x7c] -# W64: v_cmp_eq_i32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x84,0x7c] 0x01,0x04,0x84,0x7c +# W32: v_cmp_eq_i32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x84,0x7c] +# W64: v_cmp_eq_i32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x84,0x7c] -# W32: v_cmp_eq_i32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x84,0x7c] -# W64: v_cmp_eq_i32_e32 vcc, s105, v2 ; 
encoding: [0x69,0x04,0x84,0x7c] 0x69,0x04,0x84,0x7c +# W32: v_cmp_eq_i32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x84,0x7c] +# W64: v_cmp_eq_i32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x84,0x7c] -# W32: v_cmp_eq_i32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x84,0x7c] -# W64: v_cmp_eq_i32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x84,0x7c] 0x6a,0x04,0x84,0x7c +# W32: v_cmp_eq_i32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x84,0x7c] +# W64: v_cmp_eq_i32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x84,0x7c] -# W32: v_cmp_eq_i32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x84,0x7c] -# W64: v_cmp_eq_i32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x84,0x7c] 0x6b,0x04,0x84,0x7c +# W32: v_cmp_eq_i32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x84,0x7c] +# W64: v_cmp_eq_i32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x84,0x7c] -# W32: v_cmp_eq_i32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x84,0x7c] -# W64: v_cmp_eq_i32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x84,0x7c] 0x7b,0x04,0x84,0x7c +# W32: v_cmp_eq_i32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x84,0x7c] +# W64: v_cmp_eq_i32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x84,0x7c] -# W32: v_cmp_eq_i32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x84,0x7c] -# W64: v_cmp_eq_i32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x84,0x7c] 0x7d,0x04,0x84,0x7c +# W32: v_cmp_eq_i32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x84,0x7c] +# W64: v_cmp_eq_i32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x84,0x7c] -# W32: v_cmp_eq_i32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x84,0x7c] -# W64: v_cmp_eq_i32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x84,0x7c] 0x7e,0x04,0x84,0x7c +# W32: v_cmp_eq_i32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x84,0x7c] +# W64: v_cmp_eq_i32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x84,0x7c] -# W32: v_cmp_eq_i32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x84,0x7c] -# W64: v_cmp_eq_i32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x84,0x7c] 0x7f,0x04,0x84,0x7c +# W32: v_cmp_eq_i32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x84,0x7c] +# W64: v_cmp_eq_i32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x84,0x7c] -# W32: v_cmp_eq_i32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x84,0x7c] -# W64: v_cmp_eq_i32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x84,0x7c] 0x7c,0x04,0x84,0x7c +# W32: v_cmp_eq_i32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x84,0x7c] +# W64: v_cmp_eq_i32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x84,0x7c] -# W32: v_cmp_eq_i32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x84,0x7c] -# W64: v_cmp_eq_i32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x84,0x7c] 0xc1,0x04,0x84,0x7c +# W32: v_cmp_eq_i32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x84,0x7c] +# W64: v_cmp_eq_i32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x84,0x7c] -# W32: v_cmp_eq_i32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x84,0x7c] -# W64: v_cmp_eq_i32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x84,0x7c] 0xf0,0x04,0x84,0x7c +# W32: v_cmp_eq_i32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x84,0x7c] +# W64: v_cmp_eq_i32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x84,0x7c] -# W32: v_cmp_eq_i32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x84,0x7c] -# W64: v_cmp_eq_i32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x84,0x7c] 0xfd,0x04,0x84,0x7c +# W32: v_cmp_eq_i32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x84,0x7c] +# W64: v_cmp_eq_i32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x84,0x7c] -# W32: v_cmp_eq_i32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x85,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_eq_i32_e32 vcc, 
0xaf123456, v255 ; encoding: [0xff,0xfe,0x85,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x85,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_eq_i32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x85,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_eq_i32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x85,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_eq_i64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa4,0x7c] -# W64: v_cmp_eq_i64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa4,0x7c] 0x01,0x05,0xa4,0x7c +# W32: v_cmp_eq_i64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa4,0x7c] +# W64: v_cmp_eq_i64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa4,0x7c] -# W32: v_cmp_eq_i64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa4,0x7c] -# W64: v_cmp_eq_i64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa4,0x7c] 0xfe,0x05,0xa4,0x7c +# W32: v_cmp_eq_i64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa4,0x7c] +# W64: v_cmp_eq_i64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa4,0x7c] -# W32: v_cmp_eq_i64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa4,0x7c] -# W64: v_cmp_eq_i64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa4,0x7c] 0x02,0x04,0xa4,0x7c +# W32: v_cmp_eq_i64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa4,0x7c] +# W64: v_cmp_eq_i64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa4,0x7c] -# W32: v_cmp_eq_i64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa4,0x7c] -# W64: v_cmp_eq_i64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa4,0x7c] 0x68,0x04,0xa4,0x7c +# W32: v_cmp_eq_i64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa4,0x7c] +# W64: v_cmp_eq_i64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa4,0x7c] -# W32: v_cmp_eq_i64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa4,0x7c] -# W64: v_cmp_eq_i64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa4,0x7c] 0x6a,0x04,0xa4,0x7c +# W32: v_cmp_eq_i64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa4,0x7c] +# W64: v_cmp_eq_i64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa4,0x7c] +0x7a,0x04,0xa4,0x7c # W32: v_cmp_eq_i64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa4,0x7c] # W64: v_cmp_eq_i64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa4,0x7c] -0x7a,0x04,0xa4,0x7c -# W32: v_cmp_eq_i64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xa4,0x7c] -# W64: v_cmp_eq_i64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xa4,0x7c] 0x7e,0x04,0xa4,0x7c +# W32: v_cmp_eq_i64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xa4,0x7c] +# W64: v_cmp_eq_i64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xa4,0x7c] -# W32: v_cmp_eq_i64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xa4,0x7c] -# W64: v_cmp_eq_i64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xa4,0x7c] 0x7c,0x04,0xa4,0x7c +# W32: v_cmp_eq_i64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xa4,0x7c] +# W64: v_cmp_eq_i64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xa4,0x7c] -# W32: v_cmp_eq_i64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xa4,0x7c] -# W64: v_cmp_eq_i64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xa4,0x7c] 0xc1,0x04,0xa4,0x7c +# W32: v_cmp_eq_i64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xa4,0x7c] +# W64: v_cmp_eq_i64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xa4,0x7c] -# W32: v_cmp_eq_i64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa4,0x7c] -# W64: v_cmp_eq_i64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa4,0x7c] 0xf0,0x04,0xa4,0x7c +# W32: v_cmp_eq_i64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa4,0x7c] +# W64: v_cmp_eq_i64_e32 vcc, 0.5, v[2:3] ; 
encoding: [0xf0,0x04,0xa4,0x7c] -# W32: v_cmp_eq_i64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa4,0x7c] -# W64: v_cmp_eq_i64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa4,0x7c] 0xfd,0x04,0xa4,0x7c +# W32: v_cmp_eq_i64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa4,0x7c] +# W64: v_cmp_eq_i64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa4,0x7c] +0xff,0xfc,0xa5,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_eq_i64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa5,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_eq_i64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa5,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0xa5,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_eq_u16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x74,0x7c] -# W64: v_cmp_eq_u16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x74,0x7c] 0x01,0x05,0x74,0x7c +# W32: v_cmp_eq_u16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x74,0x7c] +# W64: v_cmp_eq_u16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x74,0x7c] -# W32: v_cmp_eq_u16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x74,0x7c] -# W64: v_cmp_eq_u16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x74,0x7c] 0x7f,0x05,0x74,0x7c +# W32: v_cmp_eq_u16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x74,0x7c] +# W64: v_cmp_eq_u16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x74,0x7c] -# W32: v_cmp_eq_u16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x74,0x7c] -# W64: v_cmp_eq_u16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x74,0x7c] 0x01,0x04,0x74,0x7c +# W32: v_cmp_eq_u16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x74,0x7c] +# W64: v_cmp_eq_u16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x74,0x7c] -# W32: v_cmp_eq_u16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x74,0x7c] -# W64: v_cmp_eq_u16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x74,0x7c] 0x69,0x04,0x74,0x7c +# W32: v_cmp_eq_u16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x74,0x7c] +# W64: v_cmp_eq_u16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x74,0x7c] -# W32: v_cmp_eq_u16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x74,0x7c] -# W64: v_cmp_eq_u16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x74,0x7c] 0x6a,0x04,0x74,0x7c +# W32: v_cmp_eq_u16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x74,0x7c] +# W64: v_cmp_eq_u16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x74,0x7c] -# W32: v_cmp_eq_u16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x74,0x7c] -# W64: v_cmp_eq_u16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x74,0x7c] 0x6b,0x04,0x74,0x7c +# W32: v_cmp_eq_u16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x74,0x7c] +# W64: v_cmp_eq_u16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x74,0x7c] -# W32: v_cmp_eq_u16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x74,0x7c] -# W64: v_cmp_eq_u16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x74,0x7c] 0x7b,0x04,0x74,0x7c +# W32: v_cmp_eq_u16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x74,0x7c] +# W64: v_cmp_eq_u16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x74,0x7c] -# W32: v_cmp_eq_u16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x74,0x7c] -# W64: v_cmp_eq_u16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x74,0x7c] 0x7d,0x04,0x74,0x7c +# W32: v_cmp_eq_u16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x74,0x7c] +# W64: v_cmp_eq_u16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x74,0x7c] -# W32: v_cmp_eq_u16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x74,0x7c] -# W64: v_cmp_eq_u16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x74,0x7c] 0x7e,0x04,0x74,0x7c +# W32: v_cmp_eq_u16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x74,0x7c] +# W64: v_cmp_eq_u16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x74,0x7c] -# 
W32: v_cmp_eq_u16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x74,0x7c] -# W64: v_cmp_eq_u16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x74,0x7c] 0x7f,0x04,0x74,0x7c +# W32: v_cmp_eq_u16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x74,0x7c] +# W64: v_cmp_eq_u16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x74,0x7c] -# W32: v_cmp_eq_u16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x74,0x7c] -# W64: v_cmp_eq_u16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x74,0x7c] 0x7c,0x04,0x74,0x7c +# W32: v_cmp_eq_u16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x74,0x7c] +# W64: v_cmp_eq_u16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x74,0x7c] -# W32: v_cmp_eq_u16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x74,0x7c] -# W64: v_cmp_eq_u16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x74,0x7c] 0xc1,0x04,0x74,0x7c +# W32: v_cmp_eq_u16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x74,0x7c] +# W64: v_cmp_eq_u16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x74,0x7c] -# W32: v_cmp_eq_u16_e32 vcc_lo, 0x3800, v2 -# W64: v_cmp_eq_u16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x74,0x7c,0x00,0x38,0x00,0x00] 0xf0,0x04,0x74,0x7c +# W32: v_cmp_eq_u16_e32 vcc_lo, 0x3800, v2 ; encoding: [0xff,0x04,0x74,0x7c,0x00,0x38,0x00,0x00] +# W64: v_cmp_eq_u16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x74,0x7c,0x00,0x38,0x00,0x00] -# W32: v_cmp_eq_u16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x74,0x7c] -# W64: v_cmp_eq_u16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x74,0x7c] 0xfd,0x04,0x74,0x7c +# W32: v_cmp_eq_u16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x74,0x7c] +# W64: v_cmp_eq_u16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x74,0x7c] -# W32: v_cmp_eq_u16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x74,0x7c,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_eq_u16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x74,0x7c,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x74,0x7c,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_eq_u16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x74,0x7c,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_eq_u16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x74,0x7c,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x94,0x7c] -# W64: v_cmp_eq_u32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x94,0x7c] 0x01,0x05,0x94,0x7c +# W32: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x94,0x7c] +# W64: v_cmp_eq_u32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x94,0x7c] -# W32: v_cmp_eq_u32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x94,0x7c] -# W64: v_cmp_eq_u32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x94,0x7c] 0xff,0x05,0x94,0x7c +# W32: v_cmp_eq_u32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x94,0x7c] +# W64: v_cmp_eq_u32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x94,0x7c] -# W32: v_cmp_eq_u32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x94,0x7c] -# W64: v_cmp_eq_u32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x94,0x7c] 0x01,0x04,0x94,0x7c +# W32: v_cmp_eq_u32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x94,0x7c] +# W64: v_cmp_eq_u32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x94,0x7c] -# W32: v_cmp_eq_u32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x94,0x7c] -# W64: v_cmp_eq_u32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x94,0x7c] 0x69,0x04,0x94,0x7c +# W32: v_cmp_eq_u32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x94,0x7c] +# W64: v_cmp_eq_u32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x94,0x7c] -# W32: v_cmp_eq_u32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x94,0x7c] -# W64: v_cmp_eq_u32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x94,0x7c] 0x6a,0x04,0x94,0x7c +# W32: v_cmp_eq_u32_e32 vcc_lo, vcc_lo, v2 ; 
encoding: [0x6a,0x04,0x94,0x7c] +# W64: v_cmp_eq_u32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x94,0x7c] -# W32: v_cmp_eq_u32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x94,0x7c] -# W64: v_cmp_eq_u32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x94,0x7c] 0x6b,0x04,0x94,0x7c +# W32: v_cmp_eq_u32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x94,0x7c] +# W64: v_cmp_eq_u32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x94,0x7c] -# W32: v_cmp_eq_u32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x94,0x7c] -# W64: v_cmp_eq_u32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x94,0x7c] 0x7b,0x04,0x94,0x7c +# W32: v_cmp_eq_u32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x94,0x7c] +# W64: v_cmp_eq_u32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x94,0x7c] -# W32: v_cmp_eq_u32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x94,0x7c] -# W64: v_cmp_eq_u32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x94,0x7c] 0x7d,0x04,0x94,0x7c +# W32: v_cmp_eq_u32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x94,0x7c] +# W64: v_cmp_eq_u32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x94,0x7c] -# W32: v_cmp_eq_u32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x94,0x7c] -# W64: v_cmp_eq_u32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x94,0x7c] 0x7e,0x04,0x94,0x7c +# W32: v_cmp_eq_u32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x94,0x7c] +# W64: v_cmp_eq_u32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x94,0x7c] -# W32: v_cmp_eq_u32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x94,0x7c] -# W64: v_cmp_eq_u32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x94,0x7c] 0x7f,0x04,0x94,0x7c +# W32: v_cmp_eq_u32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x94,0x7c] +# W64: v_cmp_eq_u32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x94,0x7c] -# W32: v_cmp_eq_u32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x94,0x7c] -# W64: v_cmp_eq_u32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x94,0x7c] 0x7c,0x04,0x94,0x7c +# W32: v_cmp_eq_u32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x94,0x7c] +# W64: v_cmp_eq_u32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x94,0x7c] -# W32: v_cmp_eq_u32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x94,0x7c] -# W64: v_cmp_eq_u32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x94,0x7c] 0xc1,0x04,0x94,0x7c +# W32: v_cmp_eq_u32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x94,0x7c] +# W64: v_cmp_eq_u32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x94,0x7c] -# W32: v_cmp_eq_u32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x94,0x7c] -# W64: v_cmp_eq_u32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x94,0x7c] 0xf0,0x04,0x94,0x7c +# W32: v_cmp_eq_u32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x94,0x7c] +# W64: v_cmp_eq_u32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x94,0x7c] -# W32: v_cmp_eq_u32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x94,0x7c] -# W64: v_cmp_eq_u32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x94,0x7c] 0xfd,0x04,0x94,0x7c +# W32: v_cmp_eq_u32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x94,0x7c] +# W64: v_cmp_eq_u32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x94,0x7c] -# W32: v_cmp_eq_u32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x95,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_eq_u32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x95,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x95,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_eq_u32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x95,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_eq_u32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x95,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb4,0x7c] -# W64: 
v_cmp_eq_u64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb4,0x7c] 0x01,0x05,0xb4,0x7c +# W32: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb4,0x7c] +# W64: v_cmp_eq_u64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb4,0x7c] -# W32: v_cmp_eq_u64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb4,0x7c] -# W64: v_cmp_eq_u64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb4,0x7c] 0xfe,0x05,0xb4,0x7c +# W32: v_cmp_eq_u64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb4,0x7c] +# W64: v_cmp_eq_u64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb4,0x7c] -# W32: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb4,0x7c] -# W64: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb4,0x7c] 0x02,0x04,0xb4,0x7c +# W32: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb4,0x7c] +# W64: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb4,0x7c] -# W32: v_cmp_eq_u64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb4,0x7c] -# W64: v_cmp_eq_u64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb4,0x7c] 0x68,0x04,0xb4,0x7c +# W32: v_cmp_eq_u64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb4,0x7c] +# W64: v_cmp_eq_u64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb4,0x7c] -# W32: v_cmp_eq_u64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb4,0x7c] -# W64: v_cmp_eq_u64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb4,0x7c] 0x6a,0x04,0xb4,0x7c +# W32: v_cmp_eq_u64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb4,0x7c] +# W64: v_cmp_eq_u64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb4,0x7c] +0x7a,0x04,0xb4,0x7c # W32: v_cmp_eq_u64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb4,0x7c] # W64: v_cmp_eq_u64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb4,0x7c] -0x7a,0x04,0xb4,0x7c -# W32: v_cmp_eq_u64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xb4,0x7c] -# W64: v_cmp_eq_u64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xb4,0x7c] 0x7e,0x04,0xb4,0x7c +# W32: v_cmp_eq_u64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xb4,0x7c] +# W64: v_cmp_eq_u64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xb4,0x7c] -# W32: v_cmp_eq_u64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xb4,0x7c] -# W64: v_cmp_eq_u64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xb4,0x7c] 0x7c,0x04,0xb4,0x7c +# W32: v_cmp_eq_u64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xb4,0x7c] +# W64: v_cmp_eq_u64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xb4,0x7c] -# W32: v_cmp_eq_u64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xb4,0x7c] -# W64: v_cmp_eq_u64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xb4,0x7c] 0xc1,0x04,0xb4,0x7c +# W32: v_cmp_eq_u64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xb4,0x7c] +# W64: v_cmp_eq_u64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xb4,0x7c] -# W32: v_cmp_eq_u64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb4,0x7c] -# W64: v_cmp_eq_u64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb4,0x7c] 0xf0,0x04,0xb4,0x7c +# W32: v_cmp_eq_u64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb4,0x7c] +# W64: v_cmp_eq_u64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb4,0x7c] -# W32: v_cmp_eq_u64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb4,0x7c] -# W64: v_cmp_eq_u64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb4,0x7c] 0xfd,0x04,0xb4,0x7c +# W32: v_cmp_eq_u64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb4,0x7c] +# W64: v_cmp_eq_u64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb4,0x7c] 
+0xff,0xfc,0xb5,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_eq_u64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb5,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_eq_u64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb5,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0xb5,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_f_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x00,0x7c] -# W64: v_cmp_f_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x00,0x7c] 0x01,0x05,0x00,0x7c +# W32: v_cmp_f_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x00,0x7c] +# W64: v_cmp_f_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x00,0x7c] -# W32: v_cmp_f_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x00,0x7c] -# W64: v_cmp_f_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x00,0x7c] 0x7f,0x05,0x00,0x7c +# W32: v_cmp_f_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x00,0x7c] +# W64: v_cmp_f_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x00,0x7c] -# W32: v_cmp_f_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x00,0x7c] -# W64: v_cmp_f_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x00,0x7c] 0x01,0x04,0x00,0x7c +# W32: v_cmp_f_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x00,0x7c] +# W64: v_cmp_f_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x00,0x7c] -# W32: v_cmp_f_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x00,0x7c] -# W64: v_cmp_f_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x00,0x7c] 0x69,0x04,0x00,0x7c +# W32: v_cmp_f_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x00,0x7c] +# W64: v_cmp_f_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x00,0x7c] -# W32: v_cmp_f_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x00,0x7c] -# W64: v_cmp_f_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x00,0x7c] 0x6a,0x04,0x00,0x7c +# W32: v_cmp_f_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x00,0x7c] +# W64: v_cmp_f_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x00,0x7c] -# W32: v_cmp_f_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x00,0x7c] -# W64: v_cmp_f_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x00,0x7c] 0x6b,0x04,0x00,0x7c +# W32: v_cmp_f_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x00,0x7c] +# W64: v_cmp_f_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x00,0x7c] -# W32: v_cmp_f_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x00,0x7c] -# W64: v_cmp_f_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x00,0x7c] 0x7b,0x04,0x00,0x7c +# W32: v_cmp_f_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x00,0x7c] +# W64: v_cmp_f_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x00,0x7c] -# W32: v_cmp_f_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x00,0x7c] -# W64: v_cmp_f_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x00,0x7c] 0x7d,0x04,0x00,0x7c +# W32: v_cmp_f_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x00,0x7c] +# W64: v_cmp_f_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x00,0x7c] -# W32: v_cmp_f_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x00,0x7c] -# W64: v_cmp_f_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x00,0x7c] 0x7e,0x04,0x00,0x7c +# W32: v_cmp_f_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x00,0x7c] +# W64: v_cmp_f_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x00,0x7c] -# W32: v_cmp_f_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x00,0x7c] -# W64: v_cmp_f_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x00,0x7c] 0x7f,0x04,0x00,0x7c +# W32: v_cmp_f_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x00,0x7c] +# W64: v_cmp_f_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x00,0x7c] -# W32: v_cmp_f_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x00,0x7c] -# W64: 
v_cmp_f_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x00,0x7c] 0x7c,0x04,0x00,0x7c +# W32: v_cmp_f_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x00,0x7c] +# W64: v_cmp_f_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x00,0x7c] -# W32: v_cmp_f_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x00,0x7c] -# W64: v_cmp_f_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x00,0x7c] 0xc1,0x04,0x00,0x7c +# W32: v_cmp_f_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x00,0x7c] +# W64: v_cmp_f_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x00,0x7c] -# W32: v_cmp_f_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x00,0x7c] -# W64: v_cmp_f_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x00,0x7c] 0xf0,0x04,0x00,0x7c +# W32: v_cmp_f_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x00,0x7c] +# W64: v_cmp_f_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x00,0x7c] -# W32: v_cmp_f_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x00,0x7c] -# W64: v_cmp_f_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x00,0x7c] 0xfd,0x04,0x00,0x7c +# W32: v_cmp_f_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x00,0x7c] +# W64: v_cmp_f_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x00,0x7c] -# W32: v_cmp_f_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x00,0x7c,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_f_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x00,0x7c,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x00,0x7c,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_f_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x00,0x7c,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_f_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x00,0x7c,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_f_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x20,0x7c] -# W64: v_cmp_f_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x20,0x7c] 0x01,0x05,0x20,0x7c +# W32: v_cmp_f_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x20,0x7c] +# W64: v_cmp_f_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x20,0x7c] -# W32: v_cmp_f_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x20,0x7c] -# W64: v_cmp_f_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x20,0x7c] 0xff,0x05,0x20,0x7c +# W32: v_cmp_f_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x20,0x7c] +# W64: v_cmp_f_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x20,0x7c] -# W32: v_cmp_f_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x20,0x7c] -# W64: v_cmp_f_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x20,0x7c] 0x01,0x04,0x20,0x7c +# W32: v_cmp_f_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x20,0x7c] +# W64: v_cmp_f_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x20,0x7c] -# W32: v_cmp_f_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x20,0x7c] -# W64: v_cmp_f_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x20,0x7c] 0x69,0x04,0x20,0x7c +# W32: v_cmp_f_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x20,0x7c] +# W64: v_cmp_f_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x20,0x7c] -# W32: v_cmp_f_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x20,0x7c] -# W64: v_cmp_f_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x20,0x7c] 0x6a,0x04,0x20,0x7c +# W32: v_cmp_f_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x20,0x7c] +# W64: v_cmp_f_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x20,0x7c] -# W32: v_cmp_f_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x20,0x7c] -# W64: v_cmp_f_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x20,0x7c] 0x6b,0x04,0x20,0x7c +# W32: v_cmp_f_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x20,0x7c] +# W64: v_cmp_f_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x20,0x7c] -# W32: v_cmp_f_f32_e32 vcc_lo, ttmp15, v2 ; encoding: 
[0x7b,0x04,0x20,0x7c] -# W64: v_cmp_f_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x20,0x7c] 0x7b,0x04,0x20,0x7c +# W32: v_cmp_f_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x20,0x7c] +# W64: v_cmp_f_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x20,0x7c] -# W32: v_cmp_f_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x20,0x7c] -# W64: v_cmp_f_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x20,0x7c] 0x7d,0x04,0x20,0x7c +# W32: v_cmp_f_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x20,0x7c] +# W64: v_cmp_f_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x20,0x7c] -# W32: v_cmp_f_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x20,0x7c] -# W64: v_cmp_f_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x20,0x7c] 0x7e,0x04,0x20,0x7c +# W32: v_cmp_f_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x20,0x7c] +# W64: v_cmp_f_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x20,0x7c] -# W32: v_cmp_f_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x20,0x7c] -# W64: v_cmp_f_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x20,0x7c] 0x7f,0x04,0x20,0x7c +# W32: v_cmp_f_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x20,0x7c] +# W64: v_cmp_f_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x20,0x7c] -# W32: v_cmp_f_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x20,0x7c] -# W64: v_cmp_f_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x20,0x7c] 0x7c,0x04,0x20,0x7c +# W32: v_cmp_f_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x20,0x7c] +# W64: v_cmp_f_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x20,0x7c] -# W32: v_cmp_f_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x20,0x7c] -# W64: v_cmp_f_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x20,0x7c] 0xc1,0x04,0x20,0x7c +# W32: v_cmp_f_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x20,0x7c] +# W64: v_cmp_f_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x20,0x7c] -# W32: v_cmp_f_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x20,0x7c] -# W64: v_cmp_f_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x20,0x7c] 0xf0,0x04,0x20,0x7c +# W32: v_cmp_f_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x20,0x7c] +# W64: v_cmp_f_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x20,0x7c] -# W32: v_cmp_f_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x20,0x7c] -# W64: v_cmp_f_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x20,0x7c] 0xfd,0x04,0x20,0x7c +# W32: v_cmp_f_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x20,0x7c] +# W64: v_cmp_f_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x20,0x7c] -# W32: v_cmp_f_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x21,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_f_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x21,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x21,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_f_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x21,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_f_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x21,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_f_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x40,0x7c] -# W64: v_cmp_f_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x40,0x7c] 0x01,0x05,0x40,0x7c +# W32: v_cmp_f_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x40,0x7c] +# W64: v_cmp_f_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x40,0x7c] -# W32: v_cmp_f_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x40,0x7c] -# W64: v_cmp_f_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x40,0x7c] 0xfe,0x05,0x40,0x7c +# W32: v_cmp_f_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x40,0x7c] +# W64: 
v_cmp_f_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x40,0x7c] -# W32: v_cmp_f_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x40,0x7c] -# W64: v_cmp_f_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x40,0x7c] 0x02,0x04,0x40,0x7c +# W32: v_cmp_f_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x40,0x7c] +# W64: v_cmp_f_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x40,0x7c] -# W32: v_cmp_f_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x40,0x7c] -# W64: v_cmp_f_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x40,0x7c] 0x68,0x04,0x40,0x7c +# W32: v_cmp_f_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x40,0x7c] +# W64: v_cmp_f_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x40,0x7c] -# W32: v_cmp_f_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x40,0x7c] -# W64: v_cmp_f_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x40,0x7c] 0x6a,0x04,0x40,0x7c +# W32: v_cmp_f_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x40,0x7c] +# W64: v_cmp_f_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x40,0x7c] -# W32: v_cmp_f_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x40,0x7c] -# W64: v_cmp_f_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x40,0x7c] 0x7a,0x04,0x40,0x7c +# W32: v_cmp_f_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x40,0x7c] +# W64: v_cmp_f_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x40,0x7c] -# W32: v_cmp_f_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x40,0x7c] -# W64: v_cmp_f_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x40,0x7c] 0x7e,0x04,0x40,0x7c +# W32: v_cmp_f_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x40,0x7c] +# W64: v_cmp_f_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x40,0x7c] -# W32: v_cmp_f_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x40,0x7c] -# W64: v_cmp_f_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x40,0x7c] 0x7c,0x04,0x40,0x7c +# W32: v_cmp_f_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x40,0x7c] +# W64: v_cmp_f_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x40,0x7c] -# W32: v_cmp_f_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x40,0x7c] -# W64: v_cmp_f_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x40,0x7c] 0xc1,0x04,0x40,0x7c +# W32: v_cmp_f_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x40,0x7c] +# W64: v_cmp_f_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x40,0x7c] -# W32: v_cmp_f_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x40,0x7c] -# W64: v_cmp_f_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x40,0x7c] 0xf0,0x04,0x40,0x7c +# W32: v_cmp_f_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x40,0x7c] +# W64: v_cmp_f_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x40,0x7c] -# W32: v_cmp_f_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x40,0x7c] -# W64: v_cmp_f_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x40,0x7c] 0xfd,0x04,0x40,0x7c +# W32: v_cmp_f_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x40,0x7c] +# W64: v_cmp_f_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x40,0x7c] +0xff,0xfc,0x41,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_f_f64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x41,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_f_f64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x41,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0x41,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_f_i32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x80,0x7c] -# W64: v_cmp_f_i32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x80,0x7c] 
0x01,0x05,0x80,0x7c +# W32: v_cmp_f_i32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x80,0x7c] +# W64: v_cmp_f_i32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x80,0x7c] -# W32: v_cmp_f_i32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x80,0x7c] -# W64: v_cmp_f_i32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x80,0x7c] 0xff,0x05,0x80,0x7c +# W32: v_cmp_f_i32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x80,0x7c] +# W64: v_cmp_f_i32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x80,0x7c] -# W32: v_cmp_f_i32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x80,0x7c] -# W64: v_cmp_f_i32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x80,0x7c] 0x01,0x04,0x80,0x7c +# W32: v_cmp_f_i32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x80,0x7c] +# W64: v_cmp_f_i32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x80,0x7c] -# W32: v_cmp_f_i32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x80,0x7c] -# W64: v_cmp_f_i32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x80,0x7c] 0x69,0x04,0x80,0x7c +# W32: v_cmp_f_i32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x80,0x7c] +# W64: v_cmp_f_i32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x80,0x7c] -# W32: v_cmp_f_i32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x80,0x7c] -# W64: v_cmp_f_i32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x80,0x7c] 0x6a,0x04,0x80,0x7c +# W32: v_cmp_f_i32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x80,0x7c] +# W64: v_cmp_f_i32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x80,0x7c] -# W32: v_cmp_f_i32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x80,0x7c] -# W64: v_cmp_f_i32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x80,0x7c] 0x6b,0x04,0x80,0x7c +# W32: v_cmp_f_i32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x80,0x7c] +# W64: v_cmp_f_i32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x80,0x7c] -# W32: v_cmp_f_i32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x80,0x7c] -# W64: v_cmp_f_i32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x80,0x7c] 0x7b,0x04,0x80,0x7c +# W32: v_cmp_f_i32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x80,0x7c] +# W64: v_cmp_f_i32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x80,0x7c] -# W32: v_cmp_f_i32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x80,0x7c] -# W64: v_cmp_f_i32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x80,0x7c] 0x7d,0x04,0x80,0x7c +# W32: v_cmp_f_i32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x80,0x7c] +# W64: v_cmp_f_i32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x80,0x7c] -# W32: v_cmp_f_i32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x80,0x7c] -# W64: v_cmp_f_i32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x80,0x7c] 0x7e,0x04,0x80,0x7c +# W32: v_cmp_f_i32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x80,0x7c] +# W64: v_cmp_f_i32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x80,0x7c] -# W32: v_cmp_f_i32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x80,0x7c] -# W64: v_cmp_f_i32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x80,0x7c] 0x7f,0x04,0x80,0x7c +# W32: v_cmp_f_i32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x80,0x7c] +# W64: v_cmp_f_i32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x80,0x7c] -# W32: v_cmp_f_i32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x80,0x7c] -# W64: v_cmp_f_i32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x80,0x7c] 0x7c,0x04,0x80,0x7c +# W32: v_cmp_f_i32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x80,0x7c] +# W64: v_cmp_f_i32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x80,0x7c] -# W32: v_cmp_f_i32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x80,0x7c] -# W64: v_cmp_f_i32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x80,0x7c] 0xc1,0x04,0x80,0x7c +# W32: v_cmp_f_i32_e32 vcc_lo, -1, v2 ; encoding: 
[0xc1,0x04,0x80,0x7c] +# W64: v_cmp_f_i32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x80,0x7c] -# W32: v_cmp_f_i32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x80,0x7c] -# W64: v_cmp_f_i32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x80,0x7c] 0xf0,0x04,0x80,0x7c +# W32: v_cmp_f_i32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x80,0x7c] +# W64: v_cmp_f_i32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x80,0x7c] -# W32: v_cmp_f_i32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x80,0x7c] -# W64: v_cmp_f_i32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x80,0x7c] 0xfd,0x04,0x80,0x7c +# W32: v_cmp_f_i32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x80,0x7c] +# W64: v_cmp_f_i32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x80,0x7c] -# W32: v_cmp_f_i32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x81,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_f_i32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x81,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x81,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_f_i32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x81,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_f_i32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x81,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_f_i64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa0,0x7c] -# W64: v_cmp_f_i64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa0,0x7c] 0x01,0x05,0xa0,0x7c +# W32: v_cmp_f_i64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa0,0x7c] +# W64: v_cmp_f_i64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa0,0x7c] -# W32: v_cmp_f_i64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa0,0x7c] -# W64: v_cmp_f_i64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa0,0x7c] 0xfe,0x05,0xa0,0x7c +# W32: v_cmp_f_i64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa0,0x7c] +# W64: v_cmp_f_i64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa0,0x7c] -# W32: v_cmp_f_i64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa0,0x7c] -# W64: v_cmp_f_i64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa0,0x7c] 0x02,0x04,0xa0,0x7c +# W32: v_cmp_f_i64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa0,0x7c] +# W64: v_cmp_f_i64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa0,0x7c] -# W32: v_cmp_f_i64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa0,0x7c] -# W64: v_cmp_f_i64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa0,0x7c] 0x68,0x04,0xa0,0x7c +# W32: v_cmp_f_i64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa0,0x7c] +# W64: v_cmp_f_i64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa0,0x7c] -# W32: v_cmp_f_i64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa0,0x7c] -# W64: v_cmp_f_i64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa0,0x7c] 0x6a,0x04,0xa0,0x7c +# W32: v_cmp_f_i64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa0,0x7c] +# W64: v_cmp_f_i64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa0,0x7c] -# W32: v_cmp_f_i64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa0,0x7c] -# W64: v_cmp_f_i64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa0,0x7c] 0x7a,0x04,0xa0,0x7c +# W32: v_cmp_f_i64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa0,0x7c] +# W64: v_cmp_f_i64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa0,0x7c] -# W32: v_cmp_f_i64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xa0,0x7c] -# W64: v_cmp_f_i64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xa0,0x7c] 0x7e,0x04,0xa0,0x7c +# W32: v_cmp_f_i64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xa0,0x7c] +# W64: v_cmp_f_i64_e32 vcc, exec, v[2:3] ; encoding: 
[0x7e,0x04,0xa0,0x7c] -# W32: v_cmp_f_i64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xa0,0x7c] -# W64: v_cmp_f_i64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xa0,0x7c] 0x7c,0x04,0xa0,0x7c +# W32: v_cmp_f_i64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xa0,0x7c] +# W64: v_cmp_f_i64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xa0,0x7c] -# W32: v_cmp_f_i64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xa0,0x7c] -# W64: v_cmp_f_i64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xa0,0x7c] 0xc1,0x04,0xa0,0x7c +# W32: v_cmp_f_i64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xa0,0x7c] +# W64: v_cmp_f_i64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xa0,0x7c] -# W32: v_cmp_f_i64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa0,0x7c] -# W64: v_cmp_f_i64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa0,0x7c] 0xf0,0x04,0xa0,0x7c +# W32: v_cmp_f_i64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa0,0x7c] +# W64: v_cmp_f_i64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa0,0x7c] -# W32: v_cmp_f_i64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa0,0x7c] -# W64: v_cmp_f_i64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa0,0x7c] 0xfd,0x04,0xa0,0x7c +# W32: v_cmp_f_i64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa0,0x7c] +# W64: v_cmp_f_i64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa0,0x7c] +0xff,0xfc,0xa1,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_f_i64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa1,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_f_i64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa1,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0xa1,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_f_u32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x90,0x7c] -# W64: v_cmp_f_u32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x90,0x7c] 0x01,0x05,0x90,0x7c +# W32: v_cmp_f_u32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x90,0x7c] +# W64: v_cmp_f_u32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x90,0x7c] -# W32: v_cmp_f_u32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x90,0x7c] -# W64: v_cmp_f_u32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x90,0x7c] 0xff,0x05,0x90,0x7c +# W32: v_cmp_f_u32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x90,0x7c] +# W64: v_cmp_f_u32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x90,0x7c] -# W32: v_cmp_f_u32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x90,0x7c] -# W64: v_cmp_f_u32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x90,0x7c] 0x01,0x04,0x90,0x7c +# W32: v_cmp_f_u32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x90,0x7c] +# W64: v_cmp_f_u32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x90,0x7c] -# W32: v_cmp_f_u32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x90,0x7c] -# W64: v_cmp_f_u32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x90,0x7c] 0x69,0x04,0x90,0x7c +# W32: v_cmp_f_u32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x90,0x7c] +# W64: v_cmp_f_u32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x90,0x7c] -# W32: v_cmp_f_u32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x90,0x7c] -# W64: v_cmp_f_u32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x90,0x7c] 0x6a,0x04,0x90,0x7c +# W32: v_cmp_f_u32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x90,0x7c] +# W64: v_cmp_f_u32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x90,0x7c] -# W32: v_cmp_f_u32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x90,0x7c] -# W64: v_cmp_f_u32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x90,0x7c] 0x6b,0x04,0x90,0x7c +# W32: v_cmp_f_u32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x90,0x7c] +# W64: v_cmp_f_u32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x90,0x7c] -# W32: v_cmp_f_u32_e32 vcc_lo, 
ttmp15, v2 ; encoding: [0x7b,0x04,0x90,0x7c] -# W64: v_cmp_f_u32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x90,0x7c] 0x7b,0x04,0x90,0x7c +# W32: v_cmp_f_u32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x90,0x7c] +# W64: v_cmp_f_u32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x90,0x7c] -# W32: v_cmp_f_u32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x90,0x7c] -# W64: v_cmp_f_u32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x90,0x7c] 0x7d,0x04,0x90,0x7c +# W32: v_cmp_f_u32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x90,0x7c] +# W64: v_cmp_f_u32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x90,0x7c] -# W32: v_cmp_f_u32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x90,0x7c] -# W64: v_cmp_f_u32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x90,0x7c] 0x7e,0x04,0x90,0x7c +# W32: v_cmp_f_u32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x90,0x7c] +# W64: v_cmp_f_u32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x90,0x7c] -# W32: v_cmp_f_u32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x90,0x7c] -# W64: v_cmp_f_u32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x90,0x7c] 0x7f,0x04,0x90,0x7c +# W32: v_cmp_f_u32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x90,0x7c] +# W64: v_cmp_f_u32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x90,0x7c] -# W32: v_cmp_f_u32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x90,0x7c] -# W64: v_cmp_f_u32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x90,0x7c] 0x7c,0x04,0x90,0x7c +# W32: v_cmp_f_u32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x90,0x7c] +# W64: v_cmp_f_u32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x90,0x7c] -# W32: v_cmp_f_u32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x90,0x7c] -# W64: v_cmp_f_u32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x90,0x7c] 0xc1,0x04,0x90,0x7c +# W32: v_cmp_f_u32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x90,0x7c] +# W64: v_cmp_f_u32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x90,0x7c] -# W32: v_cmp_f_u32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x90,0x7c] -# W64: v_cmp_f_u32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x90,0x7c] 0xf0,0x04,0x90,0x7c +# W32: v_cmp_f_u32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x90,0x7c] +# W64: v_cmp_f_u32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x90,0x7c] -# W32: v_cmp_f_u32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x90,0x7c] -# W64: v_cmp_f_u32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x90,0x7c] 0xfd,0x04,0x90,0x7c +# W32: v_cmp_f_u32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x90,0x7c] +# W64: v_cmp_f_u32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x90,0x7c] -# W32: v_cmp_f_u32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x91,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_f_u32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x91,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x91,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_f_u32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x91,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_f_u32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x91,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_f_u64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb0,0x7c] -# W64: v_cmp_f_u64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb0,0x7c] 0x01,0x05,0xb0,0x7c +# W32: v_cmp_f_u64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb0,0x7c] +# W64: v_cmp_f_u64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb0,0x7c] -# W32: v_cmp_f_u64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb0,0x7c] -# W64: v_cmp_f_u64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb0,0x7c] 0xfe,0x05,0xb0,0x7c +# W32: v_cmp_f_u64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: 
[0xfe,0x05,0xb0,0x7c] +# W64: v_cmp_f_u64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb0,0x7c] -# W32: v_cmp_f_u64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb0,0x7c] -# W64: v_cmp_f_u64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb0,0x7c] 0x02,0x04,0xb0,0x7c +# W32: v_cmp_f_u64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb0,0x7c] +# W64: v_cmp_f_u64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb0,0x7c] -# W32: v_cmp_f_u64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb0,0x7c] -# W64: v_cmp_f_u64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb0,0x7c] 0x68,0x04,0xb0,0x7c +# W32: v_cmp_f_u64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb0,0x7c] +# W64: v_cmp_f_u64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb0,0x7c] -# W32: v_cmp_f_u64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb0,0x7c] -# W64: v_cmp_f_u64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb0,0x7c] 0x6a,0x04,0xb0,0x7c +# W32: v_cmp_f_u64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb0,0x7c] +# W64: v_cmp_f_u64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb0,0x7c] -# W32: v_cmp_f_u64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb0,0x7c] -# W64: v_cmp_f_u64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb0,0x7c] 0x7a,0x04,0xb0,0x7c +# W32: v_cmp_f_u64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb0,0x7c] +# W64: v_cmp_f_u64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb0,0x7c] -# W32: v_cmp_f_u64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xb0,0x7c] -# W64: v_cmp_f_u64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xb0,0x7c] 0x7e,0x04,0xb0,0x7c +# W32: v_cmp_f_u64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xb0,0x7c] +# W64: v_cmp_f_u64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xb0,0x7c] -# W32: v_cmp_f_u64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xb0,0x7c] -# W64: v_cmp_f_u64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xb0,0x7c] 0x7c,0x04,0xb0,0x7c +# W32: v_cmp_f_u64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xb0,0x7c] +# W64: v_cmp_f_u64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xb0,0x7c] -# W32: v_cmp_f_u64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xb0,0x7c] -# W64: v_cmp_f_u64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xb0,0x7c] 0xc1,0x04,0xb0,0x7c +# W32: v_cmp_f_u64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xb0,0x7c] +# W64: v_cmp_f_u64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xb0,0x7c] -# W32: v_cmp_f_u64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb0,0x7c] -# W64: v_cmp_f_u64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb0,0x7c] 0xf0,0x04,0xb0,0x7c +# W32: v_cmp_f_u64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb0,0x7c] +# W64: v_cmp_f_u64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb0,0x7c] -# W32: v_cmp_f_u64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb0,0x7c] -# W64: v_cmp_f_u64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb0,0x7c] 0xfd,0x04,0xb0,0x7c +# W32: v_cmp_f_u64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb0,0x7c] +# W64: v_cmp_f_u64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb0,0x7c] +0xff,0xfc,0xb1,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_f_u64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb1,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_f_u64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb1,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0xb1,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_ge_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x0c,0x7c] -# W64: v_cmp_ge_f16_e32 vcc, v1, v2 ; encoding: 
 0x01,0x05,0x0c,0x7c
+# W32: v_cmp_ge_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x0c,0x7c]
+# W64: v_cmp_ge_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x0c,0x7c]
-# W32: v_cmp_ge_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x0c,0x7c]
-# W64: v_cmp_ge_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x0c,0x7c]
 0x7f,0x05,0x0c,0x7c
+# W32: v_cmp_ge_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x0c,0x7c]
+# W64: v_cmp_ge_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x0c,0x7c]
-# W32: v_cmp_ge_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x0c,0x7c]
-# W64: v_cmp_ge_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x0c,0x7c]
 0x01,0x04,0x0c,0x7c
+# W32: v_cmp_ge_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x0c,0x7c]
+# W64: v_cmp_ge_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x0c,0x7c]
-# W32: v_cmp_ge_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x0c,0x7c]
-# W64: v_cmp_ge_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x0c,0x7c]
 0x69,0x04,0x0c,0x7c
+# W32: v_cmp_ge_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x0c,0x7c]
+# W64: v_cmp_ge_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x0c,0x7c]
-# W32: v_cmp_ge_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0c,0x7c]
-# W64: v_cmp_ge_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0c,0x7c]
 0x6a,0x04,0x0c,0x7c
+# W32: v_cmp_ge_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0c,0x7c]
+# W64: v_cmp_ge_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0c,0x7c]
-# W32: v_cmp_ge_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0c,0x7c]
-# W64: v_cmp_ge_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0c,0x7c]
 0x6b,0x04,0x0c,0x7c
+# W32: v_cmp_ge_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0c,0x7c]
+# W64: v_cmp_ge_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0c,0x7c]
-# W32: v_cmp_ge_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x0c,0x7c]
-# W64: v_cmp_ge_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x0c,0x7c]
 0x7b,0x04,0x0c,0x7c
+# W32: v_cmp_ge_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x0c,0x7c]
+# W64: v_cmp_ge_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x0c,0x7c]
-# W32: v_cmp_ge_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x0c,0x7c]
-# W64: v_cmp_ge_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x0c,0x7c]
 0x7d,0x04,0x0c,0x7c
+# W32: v_cmp_ge_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x0c,0x7c]
+# W64: v_cmp_ge_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x0c,0x7c]
-# W32: v_cmp_ge_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x0c,0x7c]
-# W64: v_cmp_ge_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x0c,0x7c]
 0x7e,0x04,0x0c,0x7c
+# W32: v_cmp_ge_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x0c,0x7c]
+# W64: v_cmp_ge_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x0c,0x7c]
-# W32: v_cmp_ge_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x0c,0x7c]
-# W64: v_cmp_ge_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x0c,0x7c]
 0x7f,0x04,0x0c,0x7c
+# W32: v_cmp_ge_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x0c,0x7c]
+# W64: v_cmp_ge_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x0c,0x7c]
-# W32: v_cmp_ge_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x0c,0x7c]
-# W64: v_cmp_ge_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x0c,0x7c]
 0x7c,0x04,0x0c,0x7c
+# W32: v_cmp_ge_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x0c,0x7c]
+# W64: v_cmp_ge_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x0c,0x7c]
-# W32: v_cmp_ge_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x0c,0x7c]
-# W64: v_cmp_ge_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x0c,0x7c]
 0xc1,0x04,0x0c,0x7c
+# W32: v_cmp_ge_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x0c,0x7c]
+# W64: v_cmp_ge_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x0c,0x7c]
-# W32: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x0c,0x7c]
-# W64: v_cmp_ge_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x0c,0x7c]
 0xf0,0x04,0x0c,0x7c
+# W32: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x0c,0x7c]
+# W64: v_cmp_ge_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x0c,0x7c]
-# W32: v_cmp_ge_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x0c,0x7c]
-# W64: v_cmp_ge_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x0c,0x7c]
 0xfd,0x04,0x0c,0x7c
+# W32: v_cmp_ge_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x0c,0x7c]
+# W64: v_cmp_ge_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x0c,0x7c]
-# W32: v_cmp_ge_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0c,0x7c,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_ge_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0c,0x7c,0x0b,0xfe,0x00,0x00]
 0xff,0xfe,0x0c,0x7c,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_ge_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0c,0x7c,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_ge_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0c,0x7c,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_ge_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x2c,0x7c]
-# W64: v_cmp_ge_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x2c,0x7c]
 0x01,0x05,0x2c,0x7c
+# W32: v_cmp_ge_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x2c,0x7c]
+# W64: v_cmp_ge_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x2c,0x7c]
-# W32: v_cmp_ge_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x2c,0x7c]
-# W64: v_cmp_ge_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x2c,0x7c]
 0xff,0x05,0x2c,0x7c
+# W32: v_cmp_ge_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x2c,0x7c]
+# W64: v_cmp_ge_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x2c,0x7c]
-# W32: v_cmp_ge_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x2c,0x7c]
-# W64: v_cmp_ge_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x2c,0x7c]
 0x01,0x04,0x2c,0x7c
+# W32: v_cmp_ge_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x2c,0x7c]
+# W64: v_cmp_ge_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x2c,0x7c]
-# W32: v_cmp_ge_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x2c,0x7c]
-# W64: v_cmp_ge_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x2c,0x7c]
 0x69,0x04,0x2c,0x7c
+# W32: v_cmp_ge_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x2c,0x7c]
+# W64: v_cmp_ge_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x2c,0x7c]
-# W32: v_cmp_ge_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x2c,0x7c]
-# W64: v_cmp_ge_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x2c,0x7c]
 0x6a,0x04,0x2c,0x7c
+# W32: v_cmp_ge_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x2c,0x7c]
+# W64: v_cmp_ge_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x2c,0x7c]
-# W32: v_cmp_ge_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x2c,0x7c]
-# W64: v_cmp_ge_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x2c,0x7c]
 0x6b,0x04,0x2c,0x7c
+# W32: v_cmp_ge_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x2c,0x7c]
+# W64: v_cmp_ge_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x2c,0x7c]
-# W32: v_cmp_ge_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x2c,0x7c]
-# W64: v_cmp_ge_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x2c,0x7c]
 0x7b,0x04,0x2c,0x7c
+# W32: v_cmp_ge_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x2c,0x7c]
+# W64: v_cmp_ge_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x2c,0x7c]
-# W32: v_cmp_ge_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x2c,0x7c]
-# W64: v_cmp_ge_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x2c,0x7c]
 0x7d,0x04,0x2c,0x7c
+# W32: v_cmp_ge_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x2c,0x7c]
+# W64: v_cmp_ge_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x2c,0x7c]
-# W32: v_cmp_ge_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x2c,0x7c]
-# W64: v_cmp_ge_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x2c,0x7c]
 0x7e,0x04,0x2c,0x7c
+# W32: v_cmp_ge_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x2c,0x7c]
+# W64: v_cmp_ge_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x2c,0x7c]
-# W32: v_cmp_ge_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x2c,0x7c]
-# W64: v_cmp_ge_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x2c,0x7c]
 0x7f,0x04,0x2c,0x7c
+# W32: v_cmp_ge_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x2c,0x7c]
+# W64: v_cmp_ge_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x2c,0x7c]
-# W32: v_cmp_ge_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x2c,0x7c]
-# W64: v_cmp_ge_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x2c,0x7c]
 0x7c,0x04,0x2c,0x7c
+# W32: v_cmp_ge_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x2c,0x7c]
+# W64: v_cmp_ge_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x2c,0x7c]
-# W32: v_cmp_ge_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x2c,0x7c]
-# W64: v_cmp_ge_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x2c,0x7c]
 0xc1,0x04,0x2c,0x7c
+# W32: v_cmp_ge_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x2c,0x7c]
+# W64: v_cmp_ge_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x2c,0x7c]
-# W32: v_cmp_ge_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x2c,0x7c]
-# W64: v_cmp_ge_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x2c,0x7c]
 0xf0,0x04,0x2c,0x7c
+# W32: v_cmp_ge_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x2c,0x7c]
+# W64: v_cmp_ge_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x2c,0x7c]
-# W32: v_cmp_ge_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x2c,0x7c]
-# W64: v_cmp_ge_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x2c,0x7c]
 0xfd,0x04,0x2c,0x7c
+# W32: v_cmp_ge_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x2c,0x7c]
+# W64: v_cmp_ge_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x2c,0x7c]
-# W32: v_cmp_ge_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2d,0x7c,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_ge_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2d,0x7c,0x56,0x34,0x12,0xaf]
 0xff,0xfe,0x2d,0x7c,0x56,0x34,0x12,0xaf
+# W32: v_cmp_ge_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2d,0x7c,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_ge_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2d,0x7c,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_ge_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4c,0x7c]
-# W64: v_cmp_ge_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4c,0x7c]
 0x01,0x05,0x4c,0x7c
+# W32: v_cmp_ge_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4c,0x7c]
+# W64: v_cmp_ge_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4c,0x7c]
-# W32: v_cmp_ge_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4c,0x7c]
-# W64: v_cmp_ge_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4c,0x7c]
 0xfe,0x05,0x4c,0x7c
+# W32: v_cmp_ge_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4c,0x7c]
+# W64: v_cmp_ge_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4c,0x7c]
-# W32: v_cmp_ge_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x4c,0x7c]
-# W64: v_cmp_ge_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x4c,0x7c]
 0x02,0x04,0x4c,0x7c
+# W32: v_cmp_ge_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x4c,0x7c]
+# W64: v_cmp_ge_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x4c,0x7c]
-# W32: v_cmp_ge_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4c,0x7c]
-# W64: v_cmp_ge_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4c,0x7c]
 0x68,0x04,0x4c,0x7c
+# W32: v_cmp_ge_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4c,0x7c]
+# W64: v_cmp_ge_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4c,0x7c]
-# W32: v_cmp_ge_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x4c,0x7c]
-# W64: v_cmp_ge_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x4c,0x7c]
 0x6a,0x04,0x4c,0x7c
+# W32: v_cmp_ge_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x4c,0x7c]
+# W64: v_cmp_ge_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x4c,0x7c]
+0x7a,0x04,0x4c,0x7c
 # W32: v_cmp_ge_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x4c,0x7c]
 # W64: v_cmp_ge_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x4c,0x7c]
-0x7a,0x04,0x4c,0x7c
-# W32: v_cmp_ge_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x4c,0x7c]
-# W64: v_cmp_ge_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x4c,0x7c]
 0x7e,0x04,0x4c,0x7c
+# W32: v_cmp_ge_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x4c,0x7c]
+# W64: v_cmp_ge_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x4c,0x7c]
-# W32: v_cmp_ge_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x4c,0x7c]
-# W64: v_cmp_ge_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x4c,0x7c]
 0x7c,0x04,0x4c,0x7c
+# W32: v_cmp_ge_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x4c,0x7c]
+# W64: v_cmp_ge_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x4c,0x7c]
-# W32: v_cmp_ge_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x4c,0x7c]
-# W64: v_cmp_ge_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x4c,0x7c]
 0xc1,0x04,0x4c,0x7c
+# W32: v_cmp_ge_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x4c,0x7c]
+# W64: v_cmp_ge_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x4c,0x7c]
-# W32: v_cmp_ge_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4c,0x7c]
-# W64: v_cmp_ge_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4c,0x7c]
 0xf0,0x04,0x4c,0x7c
+# W32: v_cmp_ge_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4c,0x7c]
+# W64: v_cmp_ge_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4c,0x7c]
-# W32: v_cmp_ge_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4c,0x7c]
-# W64: v_cmp_ge_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4c,0x7c]
 0xfd,0x04,0x4c,0x7c
+# W32: v_cmp_ge_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4c,0x7c]
+# W64: v_cmp_ge_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4c,0x7c]
+0xff,0xfc,0x4d,0x7c,0x56,0x34,0x12,0xaf
 # W32: v_cmp_ge_f64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4d,0x7c,0x56,0x34,0x12,0xaf]
 # W64: v_cmp_ge_f64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4d,0x7c,0x56,0x34,0x12,0xaf]
-0xff,0xfc,0x4d,0x7c,0x56,0x34,0x12,0xaf
-# W32: v_cmp_ge_i16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x6c,0x7c]
-# W64: v_cmp_ge_i16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x6c,0x7c]
 0x01,0x05,0x6c,0x7c
+# W32: v_cmp_ge_i16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x6c,0x7c]
+# W64: v_cmp_ge_i16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x6c,0x7c]
-# W32: v_cmp_ge_i16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x6c,0x7c]
-# W64: v_cmp_ge_i16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x6c,0x7c]
 0x7f,0x05,0x6c,0x7c
+# W32: v_cmp_ge_i16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x6c,0x7c]
+# W64: v_cmp_ge_i16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x6c,0x7c]
-# W32: v_cmp_ge_i16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x6c,0x7c]
-# W64: v_cmp_ge_i16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x6c,0x7c]
 0x01,0x04,0x6c,0x7c
+# W32: v_cmp_ge_i16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x6c,0x7c]
+# W64: v_cmp_ge_i16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x6c,0x7c]
-# W32: v_cmp_ge_i16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x6c,0x7c]
-# W64: v_cmp_ge_i16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x6c,0x7c]
 0x69,0x04,0x6c,0x7c
+# W32: v_cmp_ge_i16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x6c,0x7c]
+# W64: v_cmp_ge_i16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x6c,0x7c]
-# W32: v_cmp_ge_i16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x6c,0x7c]
-# W64: v_cmp_ge_i16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x6c,0x7c]
 0x6a,0x04,0x6c,0x7c
+# W32: v_cmp_ge_i16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x6c,0x7c]
+# W64: v_cmp_ge_i16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x6c,0x7c]
-# W32: v_cmp_ge_i16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x6c,0x7c]
-# W64: v_cmp_ge_i16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x6c,0x7c]
 0x6b,0x04,0x6c,0x7c
+# W32: v_cmp_ge_i16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x6c,0x7c]
+# W64: v_cmp_ge_i16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x6c,0x7c]
-# W32: v_cmp_ge_i16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x6c,0x7c]
-# W64: v_cmp_ge_i16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x6c,0x7c]
 0x7b,0x04,0x6c,0x7c
+# W32: v_cmp_ge_i16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x6c,0x7c]
+# W64: v_cmp_ge_i16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x6c,0x7c]
-# W32: v_cmp_ge_i16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x6c,0x7c]
-# W64: v_cmp_ge_i16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x6c,0x7c]
 0x7d,0x04,0x6c,0x7c
+# W32: v_cmp_ge_i16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x6c,0x7c]
+# W64: v_cmp_ge_i16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x6c,0x7c]
-# W32: v_cmp_ge_i16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x6c,0x7c]
-# W64: v_cmp_ge_i16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x6c,0x7c]
 0x7e,0x04,0x6c,0x7c
+# W32: v_cmp_ge_i16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x6c,0x7c]
+# W64: v_cmp_ge_i16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x6c,0x7c]
-# W32: v_cmp_ge_i16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x6c,0x7c]
-# W64: v_cmp_ge_i16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x6c,0x7c]
 0x7f,0x04,0x6c,0x7c
+# W32: v_cmp_ge_i16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x6c,0x7c]
+# W64: v_cmp_ge_i16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x6c,0x7c]
-# W32: v_cmp_ge_i16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x6c,0x7c]
-# W64: v_cmp_ge_i16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x6c,0x7c]
 0x7c,0x04,0x6c,0x7c
+# W32: v_cmp_ge_i16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x6c,0x7c]
+# W64: v_cmp_ge_i16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x6c,0x7c]
-# W32: v_cmp_ge_i16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x6c,0x7c]
-# W64: v_cmp_ge_i16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x6c,0x7c]
 0xc1,0x04,0x6c,0x7c
+# W32: v_cmp_ge_i16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x6c,0x7c]
+# W64: v_cmp_ge_i16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x6c,0x7c]
-# W32: v_cmp_ge_i16_e32 vcc_lo, 0x3800, v2
-# W64: v_cmp_ge_i16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x6c,0x7c,0x00,0x38,0x00,0x00]
 0xf0,0x04,0x6c,0x7c
+# W32: v_cmp_ge_i16_e32 vcc_lo, 0x3800, v2 ; encoding: [0xff,0x04,0x6c,0x7c,0x00,0x38,0x00,0x00]
+# W64: v_cmp_ge_i16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x6c,0x7c,0x00,0x38,0x00,0x00]
-# W32: v_cmp_ge_i16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x6c,0x7c]
-# W64: v_cmp_ge_i16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x6c,0x7c]
 0xfd,0x04,0x6c,0x7c
+# W32: v_cmp_ge_i16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x6c,0x7c]
+# W64: v_cmp_ge_i16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x6c,0x7c]
-# W32: v_cmp_ge_i16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6c,0x7c,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_ge_i16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6c,0x7c,0x0b,0xfe,0x00,0x00]
 0xff,0xfe,0x6c,0x7c,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_ge_i16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6c,0x7c,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_ge_i16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6c,0x7c,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_ge_i32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x8c,0x7c]
-# W64: v_cmp_ge_i32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x8c,0x7c]
 0x01,0x05,0x8c,0x7c
+# W32: v_cmp_ge_i32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x8c,0x7c]
+# W64: v_cmp_ge_i32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x8c,0x7c]
-# W32: v_cmp_ge_i32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x8c,0x7c]
-# W64: v_cmp_ge_i32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x8c,0x7c]
 0xff,0x05,0x8c,0x7c
+# W32: v_cmp_ge_i32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x8c,0x7c]
+# W64: v_cmp_ge_i32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x8c,0x7c]
-# W32: v_cmp_ge_i32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x8c,0x7c]
-# W64: v_cmp_ge_i32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x8c,0x7c]
 0x01,0x04,0x8c,0x7c
+# W32: v_cmp_ge_i32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x8c,0x7c]
+# W64: v_cmp_ge_i32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x8c,0x7c]
-# W32: v_cmp_ge_i32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x8c,0x7c]
-# W64: v_cmp_ge_i32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x8c,0x7c]
 0x69,0x04,0x8c,0x7c
+# W32: v_cmp_ge_i32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x8c,0x7c]
+# W64: v_cmp_ge_i32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x8c,0x7c]
-# W32: v_cmp_ge_i32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x8c,0x7c]
-# W64: v_cmp_ge_i32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x8c,0x7c]
 0x6a,0x04,0x8c,0x7c
+# W32: v_cmp_ge_i32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x8c,0x7c]
+# W64: v_cmp_ge_i32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x8c,0x7c]
-# W32: v_cmp_ge_i32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x8c,0x7c]
-# W64: v_cmp_ge_i32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x8c,0x7c]
 0x6b,0x04,0x8c,0x7c
+# W32: v_cmp_ge_i32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x8c,0x7c]
+# W64: v_cmp_ge_i32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x8c,0x7c]
-# W32: v_cmp_ge_i32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x8c,0x7c]
-# W64: v_cmp_ge_i32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x8c,0x7c]
 0x7b,0x04,0x8c,0x7c
+# W32: v_cmp_ge_i32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x8c,0x7c]
+# W64: v_cmp_ge_i32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x8c,0x7c]
-# W32: v_cmp_ge_i32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x8c,0x7c]
-# W64: v_cmp_ge_i32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x8c,0x7c]
 0x7d,0x04,0x8c,0x7c
+# W32: v_cmp_ge_i32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x8c,0x7c]
+# W64: v_cmp_ge_i32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x8c,0x7c]
-# W32: v_cmp_ge_i32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x8c,0x7c]
-# W64: v_cmp_ge_i32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x8c,0x7c]
 0x7e,0x04,0x8c,0x7c
+# W32: v_cmp_ge_i32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x8c,0x7c] +# W64: v_cmp_ge_i32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x8c,0x7c] -# W32: v_cmp_ge_i32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x8c,0x7c] -# W64: v_cmp_ge_i32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x8c,0x7c] 0x7f,0x04,0x8c,0x7c +# W32: v_cmp_ge_i32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x8c,0x7c] +# W64: v_cmp_ge_i32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x8c,0x7c] -# W32: v_cmp_ge_i32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x8c,0x7c] -# W64: v_cmp_ge_i32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x8c,0x7c] 0x7c,0x04,0x8c,0x7c +# W32: v_cmp_ge_i32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x8c,0x7c] +# W64: v_cmp_ge_i32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x8c,0x7c] -# W32: v_cmp_ge_i32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x8c,0x7c] -# W64: v_cmp_ge_i32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x8c,0x7c] 0xc1,0x04,0x8c,0x7c +# W32: v_cmp_ge_i32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x8c,0x7c] +# W64: v_cmp_ge_i32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x8c,0x7c] -# W32: v_cmp_ge_i32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x8c,0x7c] -# W64: v_cmp_ge_i32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x8c,0x7c] 0xf0,0x04,0x8c,0x7c +# W32: v_cmp_ge_i32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x8c,0x7c] +# W64: v_cmp_ge_i32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x8c,0x7c] -# W32: v_cmp_ge_i32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x8c,0x7c] -# W64: v_cmp_ge_i32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x8c,0x7c] 0xfd,0x04,0x8c,0x7c +# W32: v_cmp_ge_i32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x8c,0x7c] +# W64: v_cmp_ge_i32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x8c,0x7c] -# W32: v_cmp_ge_i32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x8d,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_ge_i32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x8d,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x8d,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_ge_i32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x8d,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_ge_i32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x8d,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_ge_i64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xac,0x7c] -# W64: v_cmp_ge_i64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xac,0x7c] 0x01,0x05,0xac,0x7c +# W32: v_cmp_ge_i64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xac,0x7c] +# W64: v_cmp_ge_i64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xac,0x7c] -# W32: v_cmp_ge_i64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xac,0x7c] -# W64: v_cmp_ge_i64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xac,0x7c] 0xfe,0x05,0xac,0x7c +# W32: v_cmp_ge_i64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xac,0x7c] +# W64: v_cmp_ge_i64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xac,0x7c] -# W32: v_cmp_ge_i64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xac,0x7c] -# W64: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xac,0x7c] 0x02,0x04,0xac,0x7c +# W32: v_cmp_ge_i64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xac,0x7c] +# W64: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xac,0x7c] -# W32: v_cmp_ge_i64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xac,0x7c] -# W64: v_cmp_ge_i64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xac,0x7c] 0x68,0x04,0xac,0x7c +# W32: v_cmp_ge_i64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xac,0x7c] +# W64: v_cmp_ge_i64_e32 vcc, 
s[104:105], v[2:3] ; encoding: [0x68,0x04,0xac,0x7c] -# W32: v_cmp_ge_i64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xac,0x7c] -# W64: v_cmp_ge_i64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xac,0x7c] 0x6a,0x04,0xac,0x7c +# W32: v_cmp_ge_i64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xac,0x7c] +# W64: v_cmp_ge_i64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xac,0x7c] +0x7a,0x04,0xac,0x7c # W32: v_cmp_ge_i64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xac,0x7c] # W64: v_cmp_ge_i64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xac,0x7c] -0x7a,0x04,0xac,0x7c -# W32: v_cmp_ge_i64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xac,0x7c] -# W64: v_cmp_ge_i64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xac,0x7c] 0x7e,0x04,0xac,0x7c +# W32: v_cmp_ge_i64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xac,0x7c] +# W64: v_cmp_ge_i64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xac,0x7c] -# W32: v_cmp_ge_i64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xac,0x7c] -# W64: v_cmp_ge_i64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xac,0x7c] 0x7c,0x04,0xac,0x7c +# W32: v_cmp_ge_i64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xac,0x7c] +# W64: v_cmp_ge_i64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xac,0x7c] -# W32: v_cmp_ge_i64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xac,0x7c] -# W64: v_cmp_ge_i64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xac,0x7c] 0xc1,0x04,0xac,0x7c +# W32: v_cmp_ge_i64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xac,0x7c] +# W64: v_cmp_ge_i64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xac,0x7c] -# W32: v_cmp_ge_i64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xac,0x7c] -# W64: v_cmp_ge_i64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xac,0x7c] 0xf0,0x04,0xac,0x7c +# W32: v_cmp_ge_i64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xac,0x7c] +# W64: v_cmp_ge_i64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xac,0x7c] -# W32: v_cmp_ge_i64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xac,0x7c] -# W64: v_cmp_ge_i64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xac,0x7c] 0xfd,0x04,0xac,0x7c +# W32: v_cmp_ge_i64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xac,0x7c] +# W64: v_cmp_ge_i64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xac,0x7c] +0xff,0xfc,0xad,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_ge_i64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xad,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_ge_i64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xad,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0xad,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_ge_u16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x7c,0x7c] -# W64: v_cmp_ge_u16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x7c,0x7c] 0x01,0x05,0x7c,0x7c +# W32: v_cmp_ge_u16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x7c,0x7c] +# W64: v_cmp_ge_u16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x7c,0x7c] -# W32: v_cmp_ge_u16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x7c,0x7c] -# W64: v_cmp_ge_u16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x7c,0x7c] 0x7f,0x05,0x7c,0x7c +# W32: v_cmp_ge_u16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x7c,0x7c] +# W64: v_cmp_ge_u16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x7c,0x7c] -# W32: v_cmp_ge_u16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x7c,0x7c] -# W64: v_cmp_ge_u16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x7c,0x7c] 0x01,0x04,0x7c,0x7c +# W32: v_cmp_ge_u16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x7c,0x7c] +# W64: v_cmp_ge_u16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x7c,0x7c] -# W32: v_cmp_ge_u16_e32 vcc_lo, s105, v2 ; encoding: 
[0x69,0x04,0x7c,0x7c] -# W64: v_cmp_ge_u16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x7c,0x7c] 0x69,0x04,0x7c,0x7c +# W32: v_cmp_ge_u16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x7c,0x7c] +# W64: v_cmp_ge_u16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x7c,0x7c] -# W32: v_cmp_ge_u16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x7c,0x7c] -# W64: v_cmp_ge_u16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x7c,0x7c] 0x6a,0x04,0x7c,0x7c +# W32: v_cmp_ge_u16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x7c,0x7c] +# W64: v_cmp_ge_u16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x7c,0x7c] -# W32: v_cmp_ge_u16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x7c,0x7c] -# W64: v_cmp_ge_u16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x7c,0x7c] 0x6b,0x04,0x7c,0x7c +# W32: v_cmp_ge_u16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x7c,0x7c] +# W64: v_cmp_ge_u16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x7c,0x7c] -# W32: v_cmp_ge_u16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x7c,0x7c] -# W64: v_cmp_ge_u16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x7c,0x7c] 0x7b,0x04,0x7c,0x7c +# W32: v_cmp_ge_u16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x7c,0x7c] +# W64: v_cmp_ge_u16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x7c,0x7c] -# W32: v_cmp_ge_u16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x7c,0x7c] -# W64: v_cmp_ge_u16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x7c,0x7c] 0x7d,0x04,0x7c,0x7c +# W32: v_cmp_ge_u16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x7c,0x7c] +# W64: v_cmp_ge_u16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x7c,0x7c] -# W32: v_cmp_ge_u16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x7c,0x7c] -# W64: v_cmp_ge_u16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x7c,0x7c] 0x7e,0x04,0x7c,0x7c +# W32: v_cmp_ge_u16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x7c,0x7c] +# W64: v_cmp_ge_u16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x7c,0x7c] -# W32: v_cmp_ge_u16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x7c,0x7c] -# W64: v_cmp_ge_u16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x7c,0x7c] 0x7f,0x04,0x7c,0x7c +# W32: v_cmp_ge_u16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x7c,0x7c] +# W64: v_cmp_ge_u16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x7c,0x7c] -# W32: v_cmp_ge_u16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x7c,0x7c] -# W64: v_cmp_ge_u16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x7c,0x7c] 0x7c,0x04,0x7c,0x7c +# W32: v_cmp_ge_u16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x7c,0x7c] +# W64: v_cmp_ge_u16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x7c,0x7c] -# W32: v_cmp_ge_u16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x7c,0x7c] -# W64: v_cmp_ge_u16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x7c,0x7c] 0xc1,0x04,0x7c,0x7c +# W32: v_cmp_ge_u16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x7c,0x7c] +# W64: v_cmp_ge_u16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x7c,0x7c] -# W32: v_cmp_ge_u16_e32 vcc_lo, 0x3800, v2 -# W64: v_cmp_ge_u16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x7c,0x7c,0x00,0x38,0x00,0x00] 0xf0,0x04,0x7c,0x7c +# W32: v_cmp_ge_u16_e32 vcc_lo, 0x3800, v2 ; encoding: [0xff,0x04,0x7c,0x7c,0x00,0x38,0x00,0x00] +# W64: v_cmp_ge_u16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x7c,0x7c,0x00,0x38,0x00,0x00] -# W32: v_cmp_ge_u16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x7c,0x7c] -# W64: v_cmp_ge_u16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x7c,0x7c] 0xfd,0x04,0x7c,0x7c +# W32: v_cmp_ge_u16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x7c,0x7c] +# W64: v_cmp_ge_u16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x7c,0x7c] -# W32: v_cmp_ge_u16_e32 vcc_lo, 
0xfe0b, v127 ; encoding: [0xff,0xfe,0x7c,0x7c,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_ge_u16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x7c,0x7c,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x7c,0x7c,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_ge_u16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x7c,0x7c,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_ge_u16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x7c,0x7c,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_ge_u32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x9c,0x7c] -# W64: v_cmp_ge_u32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x9c,0x7c] 0x01,0x05,0x9c,0x7c +# W32: v_cmp_ge_u32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x9c,0x7c] +# W64: v_cmp_ge_u32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x9c,0x7c] -# W32: v_cmp_ge_u32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x9c,0x7c] -# W64: v_cmp_ge_u32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x9c,0x7c] 0xff,0x05,0x9c,0x7c +# W32: v_cmp_ge_u32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x9c,0x7c] +# W64: v_cmp_ge_u32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x9c,0x7c] -# W32: v_cmp_ge_u32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x9c,0x7c] -# W64: v_cmp_ge_u32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x9c,0x7c] 0x01,0x04,0x9c,0x7c +# W32: v_cmp_ge_u32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x9c,0x7c] +# W64: v_cmp_ge_u32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x9c,0x7c] -# W32: v_cmp_ge_u32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x9c,0x7c] -# W64: v_cmp_ge_u32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x9c,0x7c] 0x69,0x04,0x9c,0x7c +# W32: v_cmp_ge_u32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x9c,0x7c] +# W64: v_cmp_ge_u32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x9c,0x7c] -# W32: v_cmp_ge_u32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x9c,0x7c] -# W64: v_cmp_ge_u32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x9c,0x7c] 0x6a,0x04,0x9c,0x7c +# W32: v_cmp_ge_u32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x9c,0x7c] +# W64: v_cmp_ge_u32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x9c,0x7c] -# W32: v_cmp_ge_u32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x9c,0x7c] -# W64: v_cmp_ge_u32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x9c,0x7c] 0x6b,0x04,0x9c,0x7c +# W32: v_cmp_ge_u32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x9c,0x7c] +# W64: v_cmp_ge_u32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x9c,0x7c] -# W32: v_cmp_ge_u32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x9c,0x7c] -# W64: v_cmp_ge_u32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x9c,0x7c] 0x7b,0x04,0x9c,0x7c +# W32: v_cmp_ge_u32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x9c,0x7c] +# W64: v_cmp_ge_u32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x9c,0x7c] -# W32: v_cmp_ge_u32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x9c,0x7c] -# W64: v_cmp_ge_u32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x9c,0x7c] 0x7d,0x04,0x9c,0x7c +# W32: v_cmp_ge_u32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x9c,0x7c] +# W64: v_cmp_ge_u32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x9c,0x7c] -# W32: v_cmp_ge_u32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x9c,0x7c] -# W64: v_cmp_ge_u32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x9c,0x7c] 0x7e,0x04,0x9c,0x7c +# W32: v_cmp_ge_u32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x9c,0x7c] +# W64: v_cmp_ge_u32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x9c,0x7c] -# W32: v_cmp_ge_u32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x9c,0x7c] -# W64: v_cmp_ge_u32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x9c,0x7c] 0x7f,0x04,0x9c,0x7c +# W32: v_cmp_ge_u32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x9c,0x7c] +# W64: v_cmp_ge_u32_e32 
vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x9c,0x7c] -# W32: v_cmp_ge_u32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x9c,0x7c] -# W64: v_cmp_ge_u32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x9c,0x7c] 0x7c,0x04,0x9c,0x7c +# W32: v_cmp_ge_u32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x9c,0x7c] +# W64: v_cmp_ge_u32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x9c,0x7c] -# W32: v_cmp_ge_u32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x9c,0x7c] -# W64: v_cmp_ge_u32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x9c,0x7c] 0xc1,0x04,0x9c,0x7c +# W32: v_cmp_ge_u32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x9c,0x7c] +# W64: v_cmp_ge_u32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x9c,0x7c] -# W32: v_cmp_ge_u32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x9c,0x7c] -# W64: v_cmp_ge_u32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x9c,0x7c] 0xf0,0x04,0x9c,0x7c +# W32: v_cmp_ge_u32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x9c,0x7c] +# W64: v_cmp_ge_u32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x9c,0x7c] -# W32: v_cmp_ge_u32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x9c,0x7c] -# W64: v_cmp_ge_u32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x9c,0x7c] 0xfd,0x04,0x9c,0x7c +# W32: v_cmp_ge_u32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x9c,0x7c] +# W64: v_cmp_ge_u32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x9c,0x7c] -# W32: v_cmp_ge_u32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x9d,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_ge_u32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x9d,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x9d,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_ge_u32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x9d,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_ge_u32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x9d,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_ge_u64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xbc,0x7c] -# W64: v_cmp_ge_u64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xbc,0x7c] 0x01,0x05,0xbc,0x7c +# W32: v_cmp_ge_u64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xbc,0x7c] +# W64: v_cmp_ge_u64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xbc,0x7c] -# W32: v_cmp_ge_u64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xbc,0x7c] -# W64: v_cmp_ge_u64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xbc,0x7c] 0xfe,0x05,0xbc,0x7c +# W32: v_cmp_ge_u64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xbc,0x7c] +# W64: v_cmp_ge_u64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xbc,0x7c] -# W32: v_cmp_ge_u64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xbc,0x7c] -# W64: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xbc,0x7c] 0x02,0x04,0xbc,0x7c +# W32: v_cmp_ge_u64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xbc,0x7c] +# W64: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xbc,0x7c] -# W32: v_cmp_ge_u64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xbc,0x7c] -# W64: v_cmp_ge_u64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xbc,0x7c] 0x68,0x04,0xbc,0x7c +# W32: v_cmp_ge_u64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xbc,0x7c] +# W64: v_cmp_ge_u64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xbc,0x7c] -# W32: v_cmp_ge_u64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xbc,0x7c] -# W64: v_cmp_ge_u64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xbc,0x7c] 0x6a,0x04,0xbc,0x7c +# W32: v_cmp_ge_u64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xbc,0x7c] +# W64: v_cmp_ge_u64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xbc,0x7c] +0x7a,0x04,0xbc,0x7c # W32: v_cmp_ge_u64_e32 
vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xbc,0x7c] # W64: v_cmp_ge_u64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xbc,0x7c] -0x7a,0x04,0xbc,0x7c -# W32: v_cmp_ge_u64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xbc,0x7c] -# W64: v_cmp_ge_u64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xbc,0x7c] 0x7e,0x04,0xbc,0x7c +# W32: v_cmp_ge_u64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xbc,0x7c] +# W64: v_cmp_ge_u64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xbc,0x7c] -# W32: v_cmp_ge_u64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xbc,0x7c] -# W64: v_cmp_ge_u64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xbc,0x7c] 0x7c,0x04,0xbc,0x7c +# W32: v_cmp_ge_u64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xbc,0x7c] +# W64: v_cmp_ge_u64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xbc,0x7c] -# W32: v_cmp_ge_u64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xbc,0x7c] -# W64: v_cmp_ge_u64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xbc,0x7c] 0xc1,0x04,0xbc,0x7c +# W32: v_cmp_ge_u64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xbc,0x7c] +# W64: v_cmp_ge_u64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xbc,0x7c] -# W32: v_cmp_ge_u64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xbc,0x7c] -# W64: v_cmp_ge_u64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xbc,0x7c] 0xf0,0x04,0xbc,0x7c +# W32: v_cmp_ge_u64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xbc,0x7c] +# W64: v_cmp_ge_u64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xbc,0x7c] -# W32: v_cmp_ge_u64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xbc,0x7c] -# W64: v_cmp_ge_u64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xbc,0x7c] 0xfd,0x04,0xbc,0x7c +# W32: v_cmp_ge_u64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xbc,0x7c] +# W64: v_cmp_ge_u64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xbc,0x7c] +0xff,0xfc,0xbd,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_ge_u64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbd,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_ge_u64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbd,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0xbd,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_gt_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x08,0x7c] -# W64: v_cmp_gt_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x08,0x7c] 0x01,0x05,0x08,0x7c +# W32: v_cmp_gt_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x08,0x7c] +# W64: v_cmp_gt_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x08,0x7c] -# W32: v_cmp_gt_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x08,0x7c] -# W64: v_cmp_gt_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x08,0x7c] 0x7f,0x05,0x08,0x7c +# W32: v_cmp_gt_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x08,0x7c] +# W64: v_cmp_gt_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x08,0x7c] -# W32: v_cmp_gt_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x08,0x7c] -# W64: v_cmp_gt_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x08,0x7c] 0x01,0x04,0x08,0x7c +# W32: v_cmp_gt_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x08,0x7c] +# W64: v_cmp_gt_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x08,0x7c] -# W32: v_cmp_gt_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x08,0x7c] -# W64: v_cmp_gt_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x08,0x7c] 0x69,0x04,0x08,0x7c +# W32: v_cmp_gt_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x08,0x7c] +# W64: v_cmp_gt_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x08,0x7c] -# W32: v_cmp_gt_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x08,0x7c] -# W64: v_cmp_gt_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x08,0x7c] 
0x6a,0x04,0x08,0x7c +# W32: v_cmp_gt_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x08,0x7c] +# W64: v_cmp_gt_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x08,0x7c] -# W32: v_cmp_gt_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x08,0x7c] -# W64: v_cmp_gt_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x08,0x7c] 0x6b,0x04,0x08,0x7c +# W32: v_cmp_gt_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x08,0x7c] +# W64: v_cmp_gt_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x08,0x7c] -# W32: v_cmp_gt_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x08,0x7c] -# W64: v_cmp_gt_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x08,0x7c] 0x7b,0x04,0x08,0x7c +# W32: v_cmp_gt_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x08,0x7c] +# W64: v_cmp_gt_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x08,0x7c] -# W32: v_cmp_gt_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x08,0x7c] -# W64: v_cmp_gt_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x08,0x7c] 0x7d,0x04,0x08,0x7c +# W32: v_cmp_gt_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x08,0x7c] +# W64: v_cmp_gt_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x08,0x7c] -# W32: v_cmp_gt_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x08,0x7c] -# W64: v_cmp_gt_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x08,0x7c] 0x7e,0x04,0x08,0x7c +# W32: v_cmp_gt_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x08,0x7c] +# W64: v_cmp_gt_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x08,0x7c] -# W32: v_cmp_gt_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x08,0x7c] -# W64: v_cmp_gt_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x08,0x7c] 0x7f,0x04,0x08,0x7c +# W32: v_cmp_gt_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x08,0x7c] +# W64: v_cmp_gt_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x08,0x7c] -# W32: v_cmp_gt_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x08,0x7c] -# W64: v_cmp_gt_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x08,0x7c] 0x7c,0x04,0x08,0x7c +# W32: v_cmp_gt_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x08,0x7c] +# W64: v_cmp_gt_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x08,0x7c] -# W32: v_cmp_gt_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x08,0x7c] -# W64: v_cmp_gt_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x08,0x7c] 0xc1,0x04,0x08,0x7c +# W32: v_cmp_gt_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x08,0x7c] +# W64: v_cmp_gt_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x08,0x7c] -# W32: v_cmp_gt_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x08,0x7c] -# W64: v_cmp_gt_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x08,0x7c] 0xf0,0x04,0x08,0x7c +# W32: v_cmp_gt_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x08,0x7c] +# W64: v_cmp_gt_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x08,0x7c] -# W32: v_cmp_gt_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x08,0x7c] -# W64: v_cmp_gt_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x08,0x7c] 0xfd,0x04,0x08,0x7c +# W32: v_cmp_gt_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x08,0x7c] +# W64: v_cmp_gt_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x08,0x7c] -# W32: v_cmp_gt_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x08,0x7c,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_gt_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x08,0x7c,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x08,0x7c,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_gt_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x08,0x7c,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_gt_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x08,0x7c,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_gt_f32_e32 vcc_lo, v1, v2 ; encoding: 
[0x01,0x05,0x28,0x7c] -# W64: v_cmp_gt_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x28,0x7c] 0x01,0x05,0x28,0x7c +# W32: v_cmp_gt_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x28,0x7c] +# W64: v_cmp_gt_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x28,0x7c] -# W32: v_cmp_gt_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x28,0x7c] -# W64: v_cmp_gt_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x28,0x7c] 0xff,0x05,0x28,0x7c +# W32: v_cmp_gt_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x28,0x7c] +# W64: v_cmp_gt_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x28,0x7c] -# W32: v_cmp_gt_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x28,0x7c] -# W64: v_cmp_gt_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x28,0x7c] 0x01,0x04,0x28,0x7c +# W32: v_cmp_gt_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x28,0x7c] +# W64: v_cmp_gt_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x28,0x7c] -# W32: v_cmp_gt_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x28,0x7c] -# W64: v_cmp_gt_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x28,0x7c] 0x69,0x04,0x28,0x7c +# W32: v_cmp_gt_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x28,0x7c] +# W64: v_cmp_gt_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x28,0x7c] -# W32: v_cmp_gt_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x28,0x7c] -# W64: v_cmp_gt_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x28,0x7c] 0x6a,0x04,0x28,0x7c +# W32: v_cmp_gt_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x28,0x7c] +# W64: v_cmp_gt_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x28,0x7c] -# W32: v_cmp_gt_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x28,0x7c] -# W64: v_cmp_gt_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x28,0x7c] 0x6b,0x04,0x28,0x7c +# W32: v_cmp_gt_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x28,0x7c] +# W64: v_cmp_gt_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x28,0x7c] -# W32: v_cmp_gt_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x28,0x7c] -# W64: v_cmp_gt_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x28,0x7c] 0x7b,0x04,0x28,0x7c +# W32: v_cmp_gt_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x28,0x7c] +# W64: v_cmp_gt_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x28,0x7c] -# W32: v_cmp_gt_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x28,0x7c] -# W64: v_cmp_gt_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x28,0x7c] 0x7d,0x04,0x28,0x7c +# W32: v_cmp_gt_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x28,0x7c] +# W64: v_cmp_gt_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x28,0x7c] -# W32: v_cmp_gt_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x28,0x7c] -# W64: v_cmp_gt_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x28,0x7c] 0x7e,0x04,0x28,0x7c +# W32: v_cmp_gt_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x28,0x7c] +# W64: v_cmp_gt_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x28,0x7c] -# W32: v_cmp_gt_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x28,0x7c] -# W64: v_cmp_gt_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x28,0x7c] 0x7f,0x04,0x28,0x7c +# W32: v_cmp_gt_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x28,0x7c] +# W64: v_cmp_gt_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x28,0x7c] -# W32: v_cmp_gt_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x28,0x7c] -# W64: v_cmp_gt_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x28,0x7c] 0x7c,0x04,0x28,0x7c +# W32: v_cmp_gt_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x28,0x7c] +# W64: v_cmp_gt_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x28,0x7c] -# W32: v_cmp_gt_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x28,0x7c] -# W64: 
v_cmp_gt_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x28,0x7c] 0xc1,0x04,0x28,0x7c +# W32: v_cmp_gt_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x28,0x7c] +# W64: v_cmp_gt_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x28,0x7c] -# W32: v_cmp_gt_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x28,0x7c] -# W64: v_cmp_gt_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x28,0x7c] 0xf0,0x04,0x28,0x7c +# W32: v_cmp_gt_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x28,0x7c] +# W64: v_cmp_gt_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x28,0x7c] -# W32: v_cmp_gt_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x28,0x7c] -# W64: v_cmp_gt_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x28,0x7c] 0xfd,0x04,0x28,0x7c +# W32: v_cmp_gt_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x28,0x7c] +# W64: v_cmp_gt_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x28,0x7c] -# W32: v_cmp_gt_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x29,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_gt_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x29,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x29,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_gt_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x29,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_gt_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x29,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_gt_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x48,0x7c] -# W64: v_cmp_gt_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x48,0x7c] 0x01,0x05,0x48,0x7c +# W32: v_cmp_gt_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x48,0x7c] +# W64: v_cmp_gt_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x48,0x7c] -# W32: v_cmp_gt_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x48,0x7c] -# W64: v_cmp_gt_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x48,0x7c] 0xfe,0x05,0x48,0x7c +# W32: v_cmp_gt_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x48,0x7c] +# W64: v_cmp_gt_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x48,0x7c] -# W32: v_cmp_gt_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x48,0x7c] -# W64: v_cmp_gt_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x48,0x7c] 0x02,0x04,0x48,0x7c +# W32: v_cmp_gt_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x48,0x7c] +# W64: v_cmp_gt_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x48,0x7c] -# W32: v_cmp_gt_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x48,0x7c] -# W64: v_cmp_gt_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x48,0x7c] 0x68,0x04,0x48,0x7c +# W32: v_cmp_gt_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x48,0x7c] +# W64: v_cmp_gt_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x48,0x7c] -# W32: v_cmp_gt_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x48,0x7c] -# W64: v_cmp_gt_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x48,0x7c] 0x6a,0x04,0x48,0x7c +# W32: v_cmp_gt_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x48,0x7c] +# W64: v_cmp_gt_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x48,0x7c] +0x7a,0x04,0x48,0x7c # W32: v_cmp_gt_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x48,0x7c] # W64: v_cmp_gt_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x48,0x7c] -0x7a,0x04,0x48,0x7c -# W32: v_cmp_gt_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x48,0x7c] -# W64: v_cmp_gt_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x48,0x7c] 0x7e,0x04,0x48,0x7c +# W32: v_cmp_gt_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x48,0x7c] +# W64: v_cmp_gt_f64_e32 vcc, 
exec, v[2:3] ; encoding: [0x7e,0x04,0x48,0x7c]

-# W32: v_cmp_gt_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x48,0x7c]
-# W64: v_cmp_gt_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x48,0x7c]
0x7c,0x04,0x48,0x7c
+# W32: v_cmp_gt_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x48,0x7c]
+# W64: v_cmp_gt_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x48,0x7c]

-# W32: v_cmp_gt_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x48,0x7c]
-# W64: v_cmp_gt_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x48,0x7c]
0xc1,0x04,0x48,0x7c
+# W32: v_cmp_gt_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x48,0x7c]
+# W64: v_cmp_gt_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x48,0x7c]

-# W32: v_cmp_gt_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x48,0x7c]
-# W64: v_cmp_gt_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x48,0x7c]
0xf0,0x04,0x48,0x7c
+# W32: v_cmp_gt_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x48,0x7c]
+# W64: v_cmp_gt_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x48,0x7c]

-# W32: v_cmp_gt_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x48,0x7c]
-# W64: v_cmp_gt_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x48,0x7c]
0xfd,0x04,0x48,0x7c
+# W32: v_cmp_gt_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x48,0x7c]
+# W64: v_cmp_gt_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x48,0x7c]

+0xff,0xfc,0x49,0x7c,0x56,0x34,0x12,0xaf
# W32: v_cmp_gt_f64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x49,0x7c,0x56,0x34,0x12,0xaf]
# W64: v_cmp_gt_f64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x49,0x7c,0x56,0x34,0x12,0xaf]
-0xff,0xfc,0x49,0x7c,0x56,0x34,0x12,0xaf

-# W32: v_cmp_gt_i16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x68,0x7c]
-# W64: v_cmp_gt_i16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x68,0x7c]
0x01,0x05,0x68,0x7c
+# W32: v_cmp_gt_i16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x68,0x7c]
+# W64: v_cmp_gt_i16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x68,0x7c]

-# W32: v_cmp_gt_i16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x68,0x7c]
-# W64: v_cmp_gt_i16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x68,0x7c]
0x7f,0x05,0x68,0x7c
+# W32: v_cmp_gt_i16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x68,0x7c]
+# W64: v_cmp_gt_i16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x68,0x7c]

-# W32: v_cmp_gt_i16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x68,0x7c]
-# W64: v_cmp_gt_i16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x68,0x7c]
0x01,0x04,0x68,0x7c
+# W32: v_cmp_gt_i16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x68,0x7c]
+# W64: v_cmp_gt_i16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x68,0x7c]

-# W32: v_cmp_gt_i16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x68,0x7c]
-# W64: v_cmp_gt_i16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x68,0x7c]
0x69,0x04,0x68,0x7c
+# W32: v_cmp_gt_i16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x68,0x7c]
+# W64: v_cmp_gt_i16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x68,0x7c]

-# W32: v_cmp_gt_i16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x68,0x7c]
-# W64: v_cmp_gt_i16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x68,0x7c]
0x6a,0x04,0x68,0x7c
+# W32: v_cmp_gt_i16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x68,0x7c]
+# W64: v_cmp_gt_i16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x68,0x7c]

-# W32: v_cmp_gt_i16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x68,0x7c]
-# W64: v_cmp_gt_i16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x68,0x7c]
0x6b,0x04,0x68,0x7c
+# W32: v_cmp_gt_i16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x68,0x7c]
+# W64: v_cmp_gt_i16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x68,0x7c]

-# W32: v_cmp_gt_i16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x68,0x7c]
-# W64: v_cmp_gt_i16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x68,0x7c]
0x7b,0x04,0x68,0x7c
+# W32: v_cmp_gt_i16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x68,0x7c]
+# W64: v_cmp_gt_i16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x68,0x7c]

-# W32: v_cmp_gt_i16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x68,0x7c]
-# W64: v_cmp_gt_i16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x68,0x7c]
0x7d,0x04,0x68,0x7c
+# W32: v_cmp_gt_i16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x68,0x7c]
+# W64: v_cmp_gt_i16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x68,0x7c]

-# W32: v_cmp_gt_i16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x68,0x7c]
-# W64: v_cmp_gt_i16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x68,0x7c]
0x7e,0x04,0x68,0x7c
+# W32: v_cmp_gt_i16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x68,0x7c]
+# W64: v_cmp_gt_i16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x68,0x7c]

-# W32: v_cmp_gt_i16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x68,0x7c]
-# W64: v_cmp_gt_i16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x68,0x7c]
0x7f,0x04,0x68,0x7c
+# W32: v_cmp_gt_i16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x68,0x7c]
+# W64: v_cmp_gt_i16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x68,0x7c]

-# W32: v_cmp_gt_i16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x68,0x7c]
-# W64: v_cmp_gt_i16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x68,0x7c]
0x7c,0x04,0x68,0x7c
+# W32: v_cmp_gt_i16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x68,0x7c]
+# W64: v_cmp_gt_i16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x68,0x7c]

-# W32: v_cmp_gt_i16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x68,0x7c]
-# W64: v_cmp_gt_i16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x68,0x7c]
0xc1,0x04,0x68,0x7c
+# W32: v_cmp_gt_i16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x68,0x7c]
+# W64: v_cmp_gt_i16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x68,0x7c]

-# W32: v_cmp_gt_i16_e32 vcc_lo, 0x3800, v2
-# W64: v_cmp_gt_i16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x68,0x7c,0x00,0x38,0x00,0x00]
0xf0,0x04,0x68,0x7c
+# W32: v_cmp_gt_i16_e32 vcc_lo, 0x3800, v2 ; encoding: [0xff,0x04,0x68,0x7c,0x00,0x38,0x00,0x00]
+# W64: v_cmp_gt_i16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x68,0x7c,0x00,0x38,0x00,0x00]

-# W32: v_cmp_gt_i16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x68,0x7c]
-# W64: v_cmp_gt_i16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x68,0x7c]
0xfd,0x04,0x68,0x7c
+# W32: v_cmp_gt_i16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x68,0x7c]
+# W64: v_cmp_gt_i16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x68,0x7c]

-# W32: v_cmp_gt_i16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x68,0x7c,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_gt_i16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x68,0x7c,0x0b,0xfe,0x00,0x00]
0xff,0xfe,0x68,0x7c,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_gt_i16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x68,0x7c,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_gt_i16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x68,0x7c,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_gt_i32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x88,0x7c]
-# W64: v_cmp_gt_i32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x88,0x7c]
0x01,0x05,0x88,0x7c
+# W32: v_cmp_gt_i32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x88,0x7c]
+# W64: v_cmp_gt_i32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x88,0x7c]

-# W32: v_cmp_gt_i32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x88,0x7c]
-# W64: v_cmp_gt_i32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x88,0x7c]
0xff,0x05,0x88,0x7c
+# W32: v_cmp_gt_i32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x88,0x7c]
+# W64: v_cmp_gt_i32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x88,0x7c]

-# W32: v_cmp_gt_i32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x88,0x7c]
-# W64: v_cmp_gt_i32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x88,0x7c]
0x01,0x04,0x88,0x7c
+# W32: v_cmp_gt_i32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x88,0x7c]
+# W64: v_cmp_gt_i32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x88,0x7c]

-# W32: v_cmp_gt_i32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x88,0x7c]
-# W64: v_cmp_gt_i32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x88,0x7c]
0x69,0x04,0x88,0x7c
+# W32: v_cmp_gt_i32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x88,0x7c]
+# W64: v_cmp_gt_i32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x88,0x7c]

-# W32: v_cmp_gt_i32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x88,0x7c]
-# W64: v_cmp_gt_i32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x88,0x7c]
0x6a,0x04,0x88,0x7c
+# W32: v_cmp_gt_i32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x88,0x7c]
+# W64: v_cmp_gt_i32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x88,0x7c]

-# W32: v_cmp_gt_i32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x88,0x7c]
-# W64: v_cmp_gt_i32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x88,0x7c]
0x6b,0x04,0x88,0x7c
+# W32: v_cmp_gt_i32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x88,0x7c]
+# W64: v_cmp_gt_i32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x88,0x7c]

-# W32: v_cmp_gt_i32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x88,0x7c]
-# W64: v_cmp_gt_i32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x88,0x7c]
0x7b,0x04,0x88,0x7c
+# W32: v_cmp_gt_i32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x88,0x7c]
+# W64: v_cmp_gt_i32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x88,0x7c]

-# W32: v_cmp_gt_i32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x88,0x7c]
-# W64: v_cmp_gt_i32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x88,0x7c]
0x7d,0x04,0x88,0x7c
+# W32: v_cmp_gt_i32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x88,0x7c]
+# W64: v_cmp_gt_i32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x88,0x7c]

-# W32: v_cmp_gt_i32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x88,0x7c]
-# W64: v_cmp_gt_i32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x88,0x7c]
0x7e,0x04,0x88,0x7c
+# W32: v_cmp_gt_i32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x88,0x7c]
+# W64: v_cmp_gt_i32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x88,0x7c]

-# W32: v_cmp_gt_i32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x88,0x7c]
-# W64: v_cmp_gt_i32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x88,0x7c]
0x7f,0x04,0x88,0x7c
+# W32: v_cmp_gt_i32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x88,0x7c]
+# W64: v_cmp_gt_i32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x88,0x7c]

-# W32: v_cmp_gt_i32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x88,0x7c]
-# W64: v_cmp_gt_i32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x88,0x7c]
0x7c,0x04,0x88,0x7c
+# W32: v_cmp_gt_i32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x88,0x7c]
+# W64: v_cmp_gt_i32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x88,0x7c]

-# W32: v_cmp_gt_i32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x88,0x7c]
-# W64: v_cmp_gt_i32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x88,0x7c]
0xc1,0x04,0x88,0x7c
+# W32: v_cmp_gt_i32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x88,0x7c]
+# W64: v_cmp_gt_i32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x88,0x7c]

-# W32: v_cmp_gt_i32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x88,0x7c]
-# W64: v_cmp_gt_i32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x88,0x7c]
0xf0,0x04,0x88,0x7c
+# W32: v_cmp_gt_i32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x88,0x7c]
+# W64: v_cmp_gt_i32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x88,0x7c]

-# W32: v_cmp_gt_i32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x88,0x7c]
-# W64: v_cmp_gt_i32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x88,0x7c]
0xfd,0x04,0x88,0x7c
+# W32: v_cmp_gt_i32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x88,0x7c]
+# W64: v_cmp_gt_i32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x88,0x7c]

-# W32: v_cmp_gt_i32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x89,0x7c,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_gt_i32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x89,0x7c,0x56,0x34,0x12,0xaf]
0xff,0xfe,0x89,0x7c,0x56,0x34,0x12,0xaf
+# W32: v_cmp_gt_i32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x89,0x7c,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_gt_i32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x89,0x7c,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_gt_i64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa8,0x7c]
-# W64: v_cmp_gt_i64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa8,0x7c]
0x01,0x05,0xa8,0x7c
+# W32: v_cmp_gt_i64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa8,0x7c]
+# W64: v_cmp_gt_i64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa8,0x7c]

-# W32: v_cmp_gt_i64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa8,0x7c]
-# W64: v_cmp_gt_i64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa8,0x7c]
0xfe,0x05,0xa8,0x7c
+# W32: v_cmp_gt_i64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa8,0x7c]
+# W64: v_cmp_gt_i64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa8,0x7c]

-# W32: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa8,0x7c]
-# W64: v_cmp_gt_i64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa8,0x7c]
0x02,0x04,0xa8,0x7c
+# W32: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa8,0x7c]
+# W64: v_cmp_gt_i64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa8,0x7c]

-# W32: v_cmp_gt_i64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa8,0x7c]
-# W64: v_cmp_gt_i64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa8,0x7c]
0x68,0x04,0xa8,0x7c
+# W32: v_cmp_gt_i64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa8,0x7c]
+# W64: v_cmp_gt_i64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa8,0x7c]

-# W32: v_cmp_gt_i64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa8,0x7c]
-# W64: v_cmp_gt_i64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa8,0x7c]
0x6a,0x04,0xa8,0x7c
+# W32: v_cmp_gt_i64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa8,0x7c]
+# W64: v_cmp_gt_i64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa8,0x7c]

+0x7a,0x04,0xa8,0x7c
# W32: v_cmp_gt_i64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa8,0x7c]
# W64: v_cmp_gt_i64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa8,0x7c]
-0x7a,0x04,0xa8,0x7c

-# W32: v_cmp_gt_i64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xa8,0x7c]
-# W64: v_cmp_gt_i64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xa8,0x7c]
0x7e,0x04,0xa8,0x7c
+# W32: v_cmp_gt_i64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xa8,0x7c]
+# W64: v_cmp_gt_i64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xa8,0x7c]

-# W32: v_cmp_gt_i64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xa8,0x7c]
-# W64: v_cmp_gt_i64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xa8,0x7c]
0x7c,0x04,0xa8,0x7c
+# W32: v_cmp_gt_i64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xa8,0x7c]
+# W64: v_cmp_gt_i64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xa8,0x7c]

-# W32: v_cmp_gt_i64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xa8,0x7c]
-# W64: v_cmp_gt_i64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xa8,0x7c]
0xc1,0x04,0xa8,0x7c
+# W32: v_cmp_gt_i64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xa8,0x7c]
+# W64: v_cmp_gt_i64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xa8,0x7c]

-# W32: v_cmp_gt_i64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa8,0x7c]
-# W64: v_cmp_gt_i64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa8,0x7c]
0xf0,0x04,0xa8,0x7c
+# W32: v_cmp_gt_i64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa8,0x7c]
+# W64: v_cmp_gt_i64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa8,0x7c]

-# W32: v_cmp_gt_i64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa8,0x7c]
-# W64: v_cmp_gt_i64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa8,0x7c]
0xfd,0x04,0xa8,0x7c
+# W32: v_cmp_gt_i64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa8,0x7c]
+# W64: v_cmp_gt_i64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa8,0x7c]

+0xff,0xfc,0xa9,0x7c,0x56,0x34,0x12,0xaf
# W32: v_cmp_gt_i64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa9,0x7c,0x56,0x34,0x12,0xaf]
# W64: v_cmp_gt_i64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa9,0x7c,0x56,0x34,0x12,0xaf]
-0xff,0xfc,0xa9,0x7c,0x56,0x34,0x12,0xaf

-# W32: v_cmp_gt_u16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x78,0x7c]
-# W64: v_cmp_gt_u16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x78,0x7c]
0x01,0x05,0x78,0x7c
+# W32: v_cmp_gt_u16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x78,0x7c]
+# W64: v_cmp_gt_u16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x78,0x7c]

-# W32: v_cmp_gt_u16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x78,0x7c]
-# W64: v_cmp_gt_u16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x78,0x7c]
0x7f,0x05,0x78,0x7c
+# W32: v_cmp_gt_u16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x78,0x7c]
+# W64: v_cmp_gt_u16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x78,0x7c]

-# W32: v_cmp_gt_u16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x78,0x7c]
-# W64: v_cmp_gt_u16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x78,0x7c]
0x01,0x04,0x78,0x7c
+# W32: v_cmp_gt_u16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x78,0x7c]
+# W64: v_cmp_gt_u16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x78,0x7c]

-# W32: v_cmp_gt_u16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x78,0x7c]
-# W64: v_cmp_gt_u16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x78,0x7c]
0x69,0x04,0x78,0x7c
+# W32: v_cmp_gt_u16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x78,0x7c]
+# W64: v_cmp_gt_u16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x78,0x7c]

-# W32: v_cmp_gt_u16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x78,0x7c]
-# W64: v_cmp_gt_u16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x78,0x7c]
0x6a,0x04,0x78,0x7c
+# W32: v_cmp_gt_u16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x78,0x7c]
+# W64: v_cmp_gt_u16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x78,0x7c]

-# W32: v_cmp_gt_u16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x78,0x7c]
-# W64: v_cmp_gt_u16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x78,0x7c]
0x6b,0x04,0x78,0x7c
+# W32: v_cmp_gt_u16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x78,0x7c]
+# W64: v_cmp_gt_u16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x78,0x7c]

-# W32: v_cmp_gt_u16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x78,0x7c]
-# W64: v_cmp_gt_u16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x78,0x7c]
0x7b,0x04,0x78,0x7c
+# W32: v_cmp_gt_u16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x78,0x7c]
+# W64: v_cmp_gt_u16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x78,0x7c]

-# W32: v_cmp_gt_u16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x78,0x7c]
-# W64: v_cmp_gt_u16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x78,0x7c]
0x7d,0x04,0x78,0x7c
+# W32: v_cmp_gt_u16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x78,0x7c]
+# W64: v_cmp_gt_u16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x78,0x7c]

-# W32: v_cmp_gt_u16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x78,0x7c]
-# W64: v_cmp_gt_u16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x78,0x7c]
0x7e,0x04,0x78,0x7c
+# W32: v_cmp_gt_u16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x78,0x7c]
+# W64: v_cmp_gt_u16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x78,0x7c]

-# W32: v_cmp_gt_u16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x78,0x7c]
-# W64: v_cmp_gt_u16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x78,0x7c]
0x7f,0x04,0x78,0x7c
+# W32: v_cmp_gt_u16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x78,0x7c]
+# W64: v_cmp_gt_u16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x78,0x7c]

-# W32: v_cmp_gt_u16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x78,0x7c]
-# W64: v_cmp_gt_u16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x78,0x7c]
0x7c,0x04,0x78,0x7c
+# W32: v_cmp_gt_u16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x78,0x7c]
+# W64: v_cmp_gt_u16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x78,0x7c]

-# W32: v_cmp_gt_u16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x78,0x7c]
-# W64: v_cmp_gt_u16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x78,0x7c]
0xc1,0x04,0x78,0x7c
+# W32: v_cmp_gt_u16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x78,0x7c]
+# W64: v_cmp_gt_u16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x78,0x7c]

-# W32: v_cmp_gt_u16_e32 vcc_lo, 0x3800, v2
-# W64: v_cmp_gt_u16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x78,0x7c,0x00,0x38,0x00,0x00]
0xf0,0x04,0x78,0x7c
+# W32: v_cmp_gt_u16_e32 vcc_lo, 0x3800, v2 ; encoding: [0xff,0x04,0x78,0x7c,0x00,0x38,0x00,0x00]
+# W64: v_cmp_gt_u16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x78,0x7c,0x00,0x38,0x00,0x00]

-# W32: v_cmp_gt_u16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x78,0x7c]
-# W64: v_cmp_gt_u16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x78,0x7c]
0xfd,0x04,0x78,0x7c
+# W32: v_cmp_gt_u16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x78,0x7c]
+# W64: v_cmp_gt_u16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x78,0x7c]

-# W32: v_cmp_gt_u16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x78,0x7c,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_gt_u16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x78,0x7c,0x0b,0xfe,0x00,0x00]
0xff,0xfe,0x78,0x7c,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_gt_u16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x78,0x7c,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_gt_u16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x78,0x7c,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_gt_u32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x98,0x7c]
-# W64: v_cmp_gt_u32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x98,0x7c]
0x01,0x05,0x98,0x7c
+# W32: v_cmp_gt_u32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x98,0x7c]
+# W64: v_cmp_gt_u32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x98,0x7c]

-# W32: v_cmp_gt_u32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x98,0x7c]
-# W64: v_cmp_gt_u32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x98,0x7c]
0xff,0x05,0x98,0x7c
+# W32: v_cmp_gt_u32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x98,0x7c]
+# W64: v_cmp_gt_u32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x98,0x7c]

-# W32: v_cmp_gt_u32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x98,0x7c]
-# W64: v_cmp_gt_u32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x98,0x7c]
0x01,0x04,0x98,0x7c
+# W32: v_cmp_gt_u32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x98,0x7c]
+# W64: v_cmp_gt_u32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x98,0x7c]

-# W32: v_cmp_gt_u32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x98,0x7c]
-# W64: v_cmp_gt_u32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x98,0x7c]
0x69,0x04,0x98,0x7c
+# W32: v_cmp_gt_u32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x98,0x7c]
+# W64: v_cmp_gt_u32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x98,0x7c]

-# W32: v_cmp_gt_u32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x98,0x7c]
-# W64: v_cmp_gt_u32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x98,0x7c]
0x6a,0x04,0x98,0x7c
+# W32: v_cmp_gt_u32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x98,0x7c]
+# W64: v_cmp_gt_u32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x98,0x7c]

-# W32: v_cmp_gt_u32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x98,0x7c]
-# W64: v_cmp_gt_u32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x98,0x7c]
0x6b,0x04,0x98,0x7c
+# W32: v_cmp_gt_u32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x98,0x7c]
+# W64: v_cmp_gt_u32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x98,0x7c]

-# W32: v_cmp_gt_u32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x98,0x7c]
-# W64: v_cmp_gt_u32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x98,0x7c]
0x7b,0x04,0x98,0x7c
+# W32: v_cmp_gt_u32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x98,0x7c]
+# W64: v_cmp_gt_u32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x98,0x7c]

-# W32: v_cmp_gt_u32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x98,0x7c]
-# W64: v_cmp_gt_u32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x98,0x7c]
0x7d,0x04,0x98,0x7c
+# W32: v_cmp_gt_u32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x98,0x7c]
+# W64: v_cmp_gt_u32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x98,0x7c]

-# W32: v_cmp_gt_u32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x98,0x7c]
-# W64: v_cmp_gt_u32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x98,0x7c]
0x7e,0x04,0x98,0x7c
+# W32: v_cmp_gt_u32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x98,0x7c]
+# W64: v_cmp_gt_u32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x98,0x7c]

-# W32: v_cmp_gt_u32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x98,0x7c]
-# W64: v_cmp_gt_u32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x98,0x7c]
0x7f,0x04,0x98,0x7c
+# W32: v_cmp_gt_u32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x98,0x7c]
+# W64: v_cmp_gt_u32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x98,0x7c]

-# W32: v_cmp_gt_u32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x98,0x7c]
-# W64: v_cmp_gt_u32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x98,0x7c]
0x7c,0x04,0x98,0x7c
+# W32: v_cmp_gt_u32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x98,0x7c]
+# W64: v_cmp_gt_u32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x98,0x7c]

-# W32: v_cmp_gt_u32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x98,0x7c]
-# W64: v_cmp_gt_u32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x98,0x7c]
0xc1,0x04,0x98,0x7c
+# W32: v_cmp_gt_u32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x98,0x7c]
+# W64: v_cmp_gt_u32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x98,0x7c]

-# W32: v_cmp_gt_u32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x98,0x7c]
-# W64: v_cmp_gt_u32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x98,0x7c]
0xf0,0x04,0x98,0x7c
+# W32: v_cmp_gt_u32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x98,0x7c]
+# W64: v_cmp_gt_u32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x98,0x7c]

-# W32: v_cmp_gt_u32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x98,0x7c]
-# W64: v_cmp_gt_u32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x98,0x7c]
0xfd,0x04,0x98,0x7c
+# W32: v_cmp_gt_u32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x98,0x7c]
+# W64: v_cmp_gt_u32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x98,0x7c]

-# W32: v_cmp_gt_u32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x99,0x7c,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_gt_u32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x99,0x7c,0x56,0x34,0x12,0xaf]
0xff,0xfe,0x99,0x7c,0x56,0x34,0x12,0xaf
+# W32: v_cmp_gt_u32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x99,0x7c,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_gt_u32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x99,0x7c,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_gt_u64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb8,0x7c]
-# W64: v_cmp_gt_u64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb8,0x7c]
0x01,0x05,0xb8,0x7c
+# W32: v_cmp_gt_u64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb8,0x7c]
+# W64: v_cmp_gt_u64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb8,0x7c]

-# W32: v_cmp_gt_u64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb8,0x7c]
-# W64: v_cmp_gt_u64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb8,0x7c]
0xfe,0x05,0xb8,0x7c
+# W32: v_cmp_gt_u64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb8,0x7c]
+# W64: v_cmp_gt_u64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb8,0x7c]

-# W32: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb8,0x7c]
-# W64: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb8,0x7c]
0x02,0x04,0xb8,0x7c
+# W32: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb8,0x7c]
+# W64: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb8,0x7c]

-# W32: v_cmp_gt_u64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb8,0x7c]
-# W64: v_cmp_gt_u64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb8,0x7c]
0x68,0x04,0xb8,0x7c
+# W32: v_cmp_gt_u64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb8,0x7c]
+# W64: v_cmp_gt_u64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb8,0x7c]

-# W32: v_cmp_gt_u64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb8,0x7c]
-# W64: v_cmp_gt_u64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb8,0x7c]
0x6a,0x04,0xb8,0x7c
+# W32: v_cmp_gt_u64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb8,0x7c]
+# W64: v_cmp_gt_u64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb8,0x7c]

+0x7a,0x04,0xb8,0x7c
# W32: v_cmp_gt_u64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb8,0x7c]
# W64: v_cmp_gt_u64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb8,0x7c]
-0x7a,0x04,0xb8,0x7c

-# W32: v_cmp_gt_u64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xb8,0x7c]
-# W64: v_cmp_gt_u64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xb8,0x7c]
0x7e,0x04,0xb8,0x7c
+# W32: v_cmp_gt_u64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xb8,0x7c]
+# W64: v_cmp_gt_u64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xb8,0x7c]

-# W32: v_cmp_gt_u64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xb8,0x7c]
-# W64: v_cmp_gt_u64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xb8,0x7c]
0x7c,0x04,0xb8,0x7c
+# W32: v_cmp_gt_u64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xb8,0x7c]
+# W64: v_cmp_gt_u64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xb8,0x7c]

-# W32: v_cmp_gt_u64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xb8,0x7c]
-# W64: v_cmp_gt_u64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xb8,0x7c]
0xc1,0x04,0xb8,0x7c
+# W32: v_cmp_gt_u64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xb8,0x7c]
+# W64: v_cmp_gt_u64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xb8,0x7c]

-# W32: v_cmp_gt_u64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb8,0x7c]
-# W64: v_cmp_gt_u64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb8,0x7c]
0xf0,0x04,0xb8,0x7c
+# W32: v_cmp_gt_u64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb8,0x7c]
+# W64: v_cmp_gt_u64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb8,0x7c]

-# W32: v_cmp_gt_u64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb8,0x7c]
-# W64: v_cmp_gt_u64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb8,0x7c]
0xfd,0x04,0xb8,0x7c
+# W32: v_cmp_gt_u64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb8,0x7c]
+# W64: v_cmp_gt_u64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb8,0x7c]

+0xff,0xfc,0xb9,0x7c,0x56,0x34,0x12,0xaf
# W32: v_cmp_gt_u64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb9,0x7c,0x56,0x34,0x12,0xaf]
# W64: v_cmp_gt_u64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb9,0x7c,0x56,0x34,0x12,0xaf]
-0xff,0xfc,0xb9,0x7c,0x56,0x34,0x12,0xaf

-# W32: v_cmp_le_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x06,0x7c]
-# W64: v_cmp_le_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x06,0x7c]
0x01,0x05,0x06,0x7c
+# W32: v_cmp_le_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x06,0x7c]
+# W64: v_cmp_le_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x06,0x7c]

-# W32: v_cmp_le_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x06,0x7c]
-# W64: v_cmp_le_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x06,0x7c]
0x7f,0x05,0x06,0x7c
+# W32: v_cmp_le_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x06,0x7c]
+# W64: v_cmp_le_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x06,0x7c]

-# W32: v_cmp_le_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x06,0x7c]
-# W64: v_cmp_le_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x06,0x7c]
0x01,0x04,0x06,0x7c
+# W32: v_cmp_le_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x06,0x7c]
+# W64: v_cmp_le_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x06,0x7c]

-# W32: v_cmp_le_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x06,0x7c]
-# W64: v_cmp_le_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x06,0x7c]
0x69,0x04,0x06,0x7c
+# W32: v_cmp_le_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x06,0x7c]
+# W64: v_cmp_le_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x06,0x7c]

-# W32: v_cmp_le_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x06,0x7c]
-# W64: v_cmp_le_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x06,0x7c]
0x6a,0x04,0x06,0x7c
+# W32: v_cmp_le_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x06,0x7c]
+# W64: v_cmp_le_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x06,0x7c]

-# W32: v_cmp_le_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x06,0x7c]
-# W64: v_cmp_le_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x06,0x7c]
0x6b,0x04,0x06,0x7c
+# W32: v_cmp_le_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x06,0x7c]
+# W64: v_cmp_le_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x06,0x7c]

-# W32: v_cmp_le_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x06,0x7c]
-# W64: v_cmp_le_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x06,0x7c]
0x7b,0x04,0x06,0x7c
+# W32: v_cmp_le_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x06,0x7c]
+# W64: v_cmp_le_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x06,0x7c]

-# W32: v_cmp_le_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x06,0x7c]
-# W64: v_cmp_le_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x06,0x7c]
0x7d,0x04,0x06,0x7c
+# W32: v_cmp_le_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x06,0x7c]
+# W64: v_cmp_le_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x06,0x7c]

-# W32: v_cmp_le_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x06,0x7c]
-# W64: v_cmp_le_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x06,0x7c]
0x7e,0x04,0x06,0x7c
+# W32: v_cmp_le_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x06,0x7c]
+# W64: v_cmp_le_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x06,0x7c]

-# W32: v_cmp_le_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x06,0x7c]
-# W64: v_cmp_le_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x06,0x7c]
0x7f,0x04,0x06,0x7c
+# W32: v_cmp_le_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x06,0x7c]
+# W64: v_cmp_le_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x06,0x7c]

-# W32: v_cmp_le_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x06,0x7c]
-# W64: v_cmp_le_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x06,0x7c]
0x7c,0x04,0x06,0x7c
+# W32: v_cmp_le_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x06,0x7c]
+# W64: v_cmp_le_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x06,0x7c]

-# W32: v_cmp_le_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x06,0x7c]
-# W64: v_cmp_le_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x06,0x7c]
0xc1,0x04,0x06,0x7c
+# W32: v_cmp_le_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x06,0x7c]
+# W64: v_cmp_le_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x06,0x7c]

-# W32: v_cmp_le_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x06,0x7c]
-# W64: v_cmp_le_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x06,0x7c]
0xf0,0x04,0x06,0x7c
+# W32: v_cmp_le_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x06,0x7c]
+# W64: v_cmp_le_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x06,0x7c]

-# W32: v_cmp_le_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x06,0x7c]
-# W64: v_cmp_le_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x06,0x7c]
0xfd,0x04,0x06,0x7c
+# W32: v_cmp_le_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x06,0x7c]
+# W64: v_cmp_le_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x06,0x7c]

-# W32: v_cmp_le_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x06,0x7c,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_le_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x06,0x7c,0x0b,0xfe,0x00,0x00]
0xff,0xfe,0x06,0x7c,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_le_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x06,0x7c,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_le_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x06,0x7c,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_le_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x26,0x7c]
-# W64: v_cmp_le_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x26,0x7c]
0x01,0x05,0x26,0x7c
+# W32: v_cmp_le_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x26,0x7c]
+# W64: v_cmp_le_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x26,0x7c]

-# W32: v_cmp_le_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x26,0x7c]
-# W64: v_cmp_le_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x26,0x7c]
0xff,0x05,0x26,0x7c
+# W32: v_cmp_le_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x26,0x7c]
+# W64: v_cmp_le_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x26,0x7c]

-# W32: v_cmp_le_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x26,0x7c]
-# W64: v_cmp_le_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x26,0x7c]
0x01,0x04,0x26,0x7c
+# W32: v_cmp_le_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x26,0x7c]
+# W64: v_cmp_le_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x26,0x7c]

-# W32: v_cmp_le_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x26,0x7c]
-# W64: v_cmp_le_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x26,0x7c]
0x69,0x04,0x26,0x7c
+# W32: v_cmp_le_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x26,0x7c]
+# W64: v_cmp_le_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x26,0x7c]

-# W32: v_cmp_le_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x26,0x7c]
-# W64: v_cmp_le_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x26,0x7c]
0x6a,0x04,0x26,0x7c
+# W32: v_cmp_le_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x26,0x7c]
+# W64: v_cmp_le_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x26,0x7c]

-# W32: v_cmp_le_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x26,0x7c]
-# W64: v_cmp_le_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x26,0x7c]
0x6b,0x04,0x26,0x7c
+# W32: v_cmp_le_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x26,0x7c]
+# W64: v_cmp_le_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x26,0x7c]

-# W32: v_cmp_le_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x26,0x7c]
-# W64: v_cmp_le_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x26,0x7c]
0x7b,0x04,0x26,0x7c
+# W32: v_cmp_le_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x26,0x7c]
+# W64: v_cmp_le_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x26,0x7c]

-# W32: v_cmp_le_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x26,0x7c]
-# W64: v_cmp_le_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x26,0x7c]
0x7d,0x04,0x26,0x7c
+# W32: v_cmp_le_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x26,0x7c]
+# W64: v_cmp_le_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x26,0x7c]

-# W32: v_cmp_le_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x26,0x7c]
-# W64: v_cmp_le_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x26,0x7c]
0x7e,0x04,0x26,0x7c
+# W32: v_cmp_le_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x26,0x7c]
+# W64: v_cmp_le_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x26,0x7c]

-# W32: v_cmp_le_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x26,0x7c]
-# W64: v_cmp_le_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x26,0x7c]
0x7f,0x04,0x26,0x7c
+# W32: v_cmp_le_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x26,0x7c]
+# W64: v_cmp_le_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x26,0x7c]

-# W32: v_cmp_le_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x26,0x7c]
-# W64: v_cmp_le_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x26,0x7c]
0x7c,0x04,0x26,0x7c
+# W32: v_cmp_le_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x26,0x7c]
+# W64: v_cmp_le_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x26,0x7c]

-# W32: v_cmp_le_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x26,0x7c]
-# W64: v_cmp_le_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x26,0x7c]
0xc1,0x04,0x26,0x7c
+# W32: v_cmp_le_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x26,0x7c]
+# W64: v_cmp_le_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x26,0x7c]

-# W32: v_cmp_le_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x26,0x7c]
-# W64: v_cmp_le_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x26,0x7c]
0xf0,0x04,0x26,0x7c
+# W32: v_cmp_le_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x26,0x7c]
+# W64: v_cmp_le_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x26,0x7c]

-# W32: v_cmp_le_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x26,0x7c]
-# W64: v_cmp_le_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x26,0x7c]
0xfd,0x04,0x26,0x7c
+# W32: v_cmp_le_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x26,0x7c]
+# W64: v_cmp_le_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x26,0x7c]

-# W32: v_cmp_le_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x27,0x7c,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_le_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x27,0x7c,0x56,0x34,0x12,0xaf]
0xff,0xfe,0x27,0x7c,0x56,0x34,0x12,0xaf
+# W32: v_cmp_le_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x27,0x7c,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_le_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x27,0x7c,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_le_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x46,0x7c]
-# W64: v_cmp_le_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x46,0x7c]
0x01,0x05,0x46,0x7c
+# W32: v_cmp_le_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x46,0x7c]
+# W64: v_cmp_le_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x46,0x7c]

-# W32: v_cmp_le_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x46,0x7c]
-# W64: v_cmp_le_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x46,0x7c]
0xfe,0x05,0x46,0x7c
+# W32: v_cmp_le_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x46,0x7c]
+# W64: v_cmp_le_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x46,0x7c]

-# W32: v_cmp_le_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x46,0x7c]
-# W64: v_cmp_le_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x46,0x7c]
0x02,0x04,0x46,0x7c
+# W32: v_cmp_le_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x46,0x7c]
+# W64: v_cmp_le_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x46,0x7c]

-# W32: v_cmp_le_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x46,0x7c]
-# W64: v_cmp_le_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x46,0x7c]
0x68,0x04,0x46,0x7c
+# W32: v_cmp_le_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x46,0x7c]
+# W64: v_cmp_le_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x46,0x7c]

-# W32: v_cmp_le_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x46,0x7c]
-# W64: v_cmp_le_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x46,0x7c]
0x6a,0x04,0x46,0x7c
+# W32: v_cmp_le_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x46,0x7c]
+# W64: v_cmp_le_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x46,0x7c]

+0x7a,0x04,0x46,0x7c
# W32: v_cmp_le_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x46,0x7c]
# W64: v_cmp_le_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x46,0x7c]
-0x7a,0x04,0x46,0x7c

-# W32: v_cmp_le_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x46,0x7c]
-# W64: v_cmp_le_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x46,0x7c]
0x7e,0x04,0x46,0x7c
+# W32: v_cmp_le_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x46,0x7c]
+# W64: v_cmp_le_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x46,0x7c]

-# W32: v_cmp_le_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x46,0x7c]
-# W64: v_cmp_le_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x46,0x7c]
0x7c,0x04,0x46,0x7c
+# W32: v_cmp_le_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x46,0x7c]
+# W64: v_cmp_le_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x46,0x7c]

-# W32: v_cmp_le_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x46,0x7c]
-# W64: v_cmp_le_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x46,0x7c]
0xc1,0x04,0x46,0x7c
+# W32: v_cmp_le_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x46,0x7c]
+# W64: v_cmp_le_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x46,0x7c]

-# W32: v_cmp_le_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x46,0x7c]
-# W64: v_cmp_le_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x46,0x7c]
0xf0,0x04,0x46,0x7c
+# W32: v_cmp_le_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x46,0x7c]
+# W64: v_cmp_le_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x46,0x7c]

-# W32: v_cmp_le_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x46,0x7c]
-# W64: v_cmp_le_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x46,0x7c]
0xfd,0x04,0x46,0x7c
+# W32: v_cmp_le_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x46,0x7c]
+# W64: v_cmp_le_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x46,0x7c]

+0xff,0xfc,0x47,0x7c,0x56,0x34,0x12,0xaf
# W32: v_cmp_le_f64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x47,0x7c,0x56,0x34,0x12,0xaf]
# W64: v_cmp_le_f64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x47,0x7c,0x56,0x34,0x12,0xaf]
-0xff,0xfc,0x47,0x7c,0x56,0x34,0x12,0xaf

-# W32: v_cmp_le_i16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x66,0x7c]
-# W64: v_cmp_le_i16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x66,0x7c]
0x01,0x05,0x66,0x7c
+# W32: v_cmp_le_i16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x66,0x7c]
+# W64: v_cmp_le_i16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x66,0x7c]

-# W32: v_cmp_le_i16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x66,0x7c]
-# W64: v_cmp_le_i16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x66,0x7c]
0x7f,0x05,0x66,0x7c
+# W32: v_cmp_le_i16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x66,0x7c]
+# W64: v_cmp_le_i16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x66,0x7c]

-# W32: v_cmp_le_i16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x66,0x7c]
-# W64: v_cmp_le_i16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x66,0x7c]
0x01,0x04,0x66,0x7c
+# W32: v_cmp_le_i16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x66,0x7c]
+# W64: v_cmp_le_i16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x66,0x7c]

-# W32: v_cmp_le_i16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x66,0x7c]
-# W64: v_cmp_le_i16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x66,0x7c]
0x69,0x04,0x66,0x7c
+# W32: v_cmp_le_i16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x66,0x7c]
+# W64: v_cmp_le_i16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x66,0x7c]

-# W32: v_cmp_le_i16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x66,0x7c]
-# W64: v_cmp_le_i16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x66,0x7c]
0x6a,0x04,0x66,0x7c
+# W32: v_cmp_le_i16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x66,0x7c]
+# W64: v_cmp_le_i16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x66,0x7c]

-# W32: v_cmp_le_i16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x66,0x7c]
-# W64: v_cmp_le_i16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x66,0x7c]
0x6b,0x04,0x66,0x7c
+# W32: v_cmp_le_i16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x66,0x7c]
+# W64: v_cmp_le_i16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x66,0x7c]

-# W32: v_cmp_le_i16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x66,0x7c]
-# W64: v_cmp_le_i16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x66,0x7c]
0x7b,0x04,0x66,0x7c
+# W32: v_cmp_le_i16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x66,0x7c]
+# W64: v_cmp_le_i16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x66,0x7c]

-# W32: v_cmp_le_i16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x66,0x7c]
-# W64: v_cmp_le_i16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x66,0x7c]
0x7d,0x04,0x66,0x7c
+# W32: v_cmp_le_i16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x66,0x7c]
+# W64: v_cmp_le_i16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x66,0x7c]

-# W32: v_cmp_le_i16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x66,0x7c]
-# W64: v_cmp_le_i16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x66,0x7c]
0x7e,0x04,0x66,0x7c
+# W32: v_cmp_le_i16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x66,0x7c]
+# W64: v_cmp_le_i16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x66,0x7c]

-# W32: v_cmp_le_i16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x66,0x7c]
-# W64: v_cmp_le_i16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x66,0x7c]
0x7f,0x04,0x66,0x7c
+# W32: v_cmp_le_i16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x66,0x7c]
+# W64: v_cmp_le_i16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x66,0x7c]

-# W32: v_cmp_le_i16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x66,0x7c]
-# W64: v_cmp_le_i16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x66,0x7c]
0x7c,0x04,0x66,0x7c
+# W32: v_cmp_le_i16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x66,0x7c]
+# W64: v_cmp_le_i16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x66,0x7c]

-# W32: v_cmp_le_i16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x66,0x7c]
-# W64: v_cmp_le_i16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x66,0x7c]
0xc1,0x04,0x66,0x7c
+# W32: v_cmp_le_i16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x66,0x7c]
+# W64: v_cmp_le_i16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x66,0x7c]

-# W32: v_cmp_le_i16_e32 vcc_lo, 0x3800, v2
-# W64: v_cmp_le_i16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x66,0x7c,0x00,0x38,0x00,0x00]
0xf0,0x04,0x66,0x7c
+# W32: v_cmp_le_i16_e32 vcc_lo, 0x3800, v2 ; encoding: [0xff,0x04,0x66,0x7c,0x00,0x38,0x00,0x00]
+# W64: v_cmp_le_i16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x66,0x7c,0x00,0x38,0x00,0x00]

-# W32: v_cmp_le_i16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x66,0x7c]
-# W64: v_cmp_le_i16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x66,0x7c]
0xfd,0x04,0x66,0x7c
+# W32: v_cmp_le_i16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x66,0x7c]
+# W64: v_cmp_le_i16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x66,0x7c]

-# W32: v_cmp_le_i16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x66,0x7c,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_le_i16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x66,0x7c,0x0b,0xfe,0x00,0x00]
0xff,0xfe,0x66,0x7c,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_le_i16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x66,0x7c,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_le_i16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x66,0x7c,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_le_i32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x86,0x7c]
-# W64: v_cmp_le_i32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x86,0x7c]
0x01,0x05,0x86,0x7c
+# W32: v_cmp_le_i32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x86,0x7c]
+# W64: v_cmp_le_i32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x86,0x7c]

-# W32: v_cmp_le_i32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x86,0x7c]
-# W64: v_cmp_le_i32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x86,0x7c]
0xff,0x05,0x86,0x7c
+# W32: v_cmp_le_i32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x86,0x7c]
+# W64: v_cmp_le_i32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x86,0x7c]

-# W32: v_cmp_le_i32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x86,0x7c]
-# W64: v_cmp_le_i32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x86,0x7c]
0x01,0x04,0x86,0x7c
+# W32: v_cmp_le_i32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x86,0x7c]
+# W64: v_cmp_le_i32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x86,0x7c]

-# W32: v_cmp_le_i32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x86,0x7c]
-# W64: v_cmp_le_i32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x86,0x7c]
0x69,0x04,0x86,0x7c
+# W32: v_cmp_le_i32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x86,0x7c]
+# W64: v_cmp_le_i32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x86,0x7c]

-# W32: v_cmp_le_i32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x86,0x7c]
-# W64: v_cmp_le_i32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x86,0x7c]
0x6a,0x04,0x86,0x7c
+# W32: v_cmp_le_i32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x86,0x7c]
+# W64: v_cmp_le_i32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x86,0x7c]

-# W32: v_cmp_le_i32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x86,0x7c]
-# W64: v_cmp_le_i32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x86,0x7c]
0x6b,0x04,0x86,0x7c
+# W32: v_cmp_le_i32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x86,0x7c]
+# W64: v_cmp_le_i32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x86,0x7c]

-# W32: v_cmp_le_i32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x86,0x7c]
-# W64: v_cmp_le_i32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x86,0x7c]
0x7b,0x04,0x86,0x7c
+# W32: v_cmp_le_i32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x86,0x7c]
+# W64: v_cmp_le_i32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x86,0x7c]

-# W32: v_cmp_le_i32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x86,0x7c]
-# W64: v_cmp_le_i32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x86,0x7c]
0x7d,0x04,0x86,0x7c
+# W32: v_cmp_le_i32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x86,0x7c]
+# W64: v_cmp_le_i32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x86,0x7c]

-# W32: v_cmp_le_i32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x86,0x7c]
-# W64: v_cmp_le_i32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x86,0x7c]
0x7e,0x04,0x86,0x7c
+# W32: v_cmp_le_i32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x86,0x7c]
+# W64: v_cmp_le_i32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x86,0x7c]

-# W32: v_cmp_le_i32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x86,0x7c]
-# W64: v_cmp_le_i32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x86,0x7c]
0x7f,0x04,0x86,0x7c
+# W32: v_cmp_le_i32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x86,0x7c]
+# W64: v_cmp_le_i32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x86,0x7c]

-# W32: v_cmp_le_i32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x86,0x7c]
-# W64: v_cmp_le_i32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x86,0x7c]
0x7c,0x04,0x86,0x7c
+# W32: v_cmp_le_i32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x86,0x7c]
+# W64: v_cmp_le_i32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x86,0x7c]

-# W32: v_cmp_le_i32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x86,0x7c]
-# W64: v_cmp_le_i32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x86,0x7c]
0xc1,0x04,0x86,0x7c
+# W32: v_cmp_le_i32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x86,0x7c]
+# W64: v_cmp_le_i32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x86,0x7c]

-# W32: v_cmp_le_i32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x86,0x7c]
-# W64: v_cmp_le_i32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x86,0x7c]
0xf0,0x04,0x86,0x7c
+# W32: v_cmp_le_i32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x86,0x7c]
+# W64: v_cmp_le_i32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x86,0x7c]

-# W32: v_cmp_le_i32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x86,0x7c]
-# W64: v_cmp_le_i32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x86,0x7c]
0xfd,0x04,0x86,0x7c
+# W32: v_cmp_le_i32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x86,0x7c]
+# W64: v_cmp_le_i32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x86,0x7c]

-# W32: v_cmp_le_i32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x87,0x7c,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_le_i32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x87,0x7c,0x56,0x34,0x12,0xaf]
0xff,0xfe,0x87,0x7c,0x56,0x34,0x12,0xaf
+# W32: v_cmp_le_i32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x87,0x7c,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_le_i32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x87,0x7c,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_le_i64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa6,0x7c]
-# W64: v_cmp_le_i64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa6,0x7c]
0x01,0x05,0xa6,0x7c
+# W32: v_cmp_le_i64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa6,0x7c]
+# W64: v_cmp_le_i64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa6,0x7c]

-# W32: v_cmp_le_i64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa6,0x7c]
-# W64: v_cmp_le_i64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa6,0x7c]
0xfe,0x05,0xa6,0x7c
+# W32: v_cmp_le_i64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa6,0x7c]
+# W64: v_cmp_le_i64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa6,0x7c]

-# W32: v_cmp_le_i64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa6,0x7c]
-# W64: v_cmp_le_i64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa6,0x7c]
0x02,0x04,0xa6,0x7c
+# W32: v_cmp_le_i64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa6,0x7c]
+# W64: v_cmp_le_i64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa6,0x7c]

-# W32: v_cmp_le_i64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa6,0x7c]
-# W64: v_cmp_le_i64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa6,0x7c]
0x68,0x04,0xa6,0x7c
+# W32: v_cmp_le_i64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa6,0x7c]
+# W64: v_cmp_le_i64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa6,0x7c]

-# W32: v_cmp_le_i64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa6,0x7c]
-# W64: v_cmp_le_i64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa6,0x7c]
0x6a,0x04,0xa6,0x7c
+# W32: v_cmp_le_i64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa6,0x7c]
+# W64: v_cmp_le_i64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa6,0x7c]

+0x7a,0x04,0xa6,0x7c
# W32: v_cmp_le_i64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa6,0x7c]
# W64: v_cmp_le_i64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa6,0x7c]
-0x7a,0x04,0xa6,0x7c

-# W32: v_cmp_le_i64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xa6,0x7c]
-# W64: v_cmp_le_i64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xa6,0x7c]
0x7e,0x04,0xa6,0x7c
+# W32: v_cmp_le_i64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xa6,0x7c]
+# W64: v_cmp_le_i64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xa6,0x7c]

-# W32: v_cmp_le_i64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xa6,0x7c]
-# W64: v_cmp_le_i64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xa6,0x7c]
0x7c,0x04,0xa6,0x7c
+# W32: v_cmp_le_i64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xa6,0x7c]
+# W64: v_cmp_le_i64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xa6,0x7c]

-# W32: v_cmp_le_i64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xa6,0x7c]
-# W64: v_cmp_le_i64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xa6,0x7c]
0xc1,0x04,0xa6,0x7c
+# W32: v_cmp_le_i64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xa6,0x7c]
+# W64: v_cmp_le_i64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xa6,0x7c]

-# W32: v_cmp_le_i64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa6,0x7c]
-# W64: v_cmp_le_i64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa6,0x7c]
0xf0,0x04,0xa6,0x7c
+# W32: v_cmp_le_i64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa6,0x7c]
+# W64: v_cmp_le_i64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa6,0x7c]

-# W32: v_cmp_le_i64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa6,0x7c]
-# W64: v_cmp_le_i64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa6,0x7c]
0xfd,0x04,0xa6,0x7c
+# W32: v_cmp_le_i64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa6,0x7c]
+# W64: v_cmp_le_i64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa6,0x7c]

+0xff,0xfc,0xa7,0x7c,0x56,0x34,0x12,0xaf
# W32: v_cmp_le_i64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa7,0x7c,0x56,0x34,0x12,0xaf]
# W64: v_cmp_le_i64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa7,0x7c,0x56,0x34,0x12,0xaf]
-0xff,0xfc,0xa7,0x7c,0x56,0x34,0x12,0xaf

-# W32: v_cmp_le_u16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x76,0x7c]
-# W64: v_cmp_le_u16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x76,0x7c]
0x01,0x05,0x76,0x7c
+# W32: v_cmp_le_u16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x76,0x7c]
+# W64: v_cmp_le_u16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x76,0x7c]

-# W32: v_cmp_le_u16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x76,0x7c]
-# W64: v_cmp_le_u16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x76,0x7c]
0x7f,0x05,0x76,0x7c
+# W32: v_cmp_le_u16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x76,0x7c]
+# W64: v_cmp_le_u16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x76,0x7c]

-# W32: v_cmp_le_u16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x76,0x7c]
-# W64: v_cmp_le_u16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x76,0x7c]
0x01,0x04,0x76,0x7c
+# W32: v_cmp_le_u16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x76,0x7c]
+# W64: v_cmp_le_u16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x76,0x7c]

-# W32: v_cmp_le_u16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x76,0x7c]
-# W64: v_cmp_le_u16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x76,0x7c]
0x69,0x04,0x76,0x7c
+# W32: v_cmp_le_u16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x76,0x7c]
+# W64: v_cmp_le_u16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x76,0x7c]

-# W32: v_cmp_le_u16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x76,0x7c]
-# W64: v_cmp_le_u16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x76,0x7c]
0x6a,0x04,0x76,0x7c
+# W32: v_cmp_le_u16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x76,0x7c]
+# W64: v_cmp_le_u16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x76,0x7c]

-# W32: v_cmp_le_u16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x76,0x7c]
-# W64: v_cmp_le_u16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x76,0x7c]
0x6b,0x04,0x76,0x7c
+# W32: v_cmp_le_u16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x76,0x7c]
+# W64: v_cmp_le_u16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x76,0x7c]

-# W32: v_cmp_le_u16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x76,0x7c]
-# W64: v_cmp_le_u16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x76,0x7c]
0x7b,0x04,0x76,0x7c
+# W32: v_cmp_le_u16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x76,0x7c]
+# W64: v_cmp_le_u16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x76,0x7c]

-# W32: v_cmp_le_u16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x76,0x7c]
-# W64: v_cmp_le_u16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x76,0x7c]
0x7d,0x04,0x76,0x7c
+# W32: v_cmp_le_u16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x76,0x7c]
+# W64: v_cmp_le_u16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x76,0x7c]

-# W32: v_cmp_le_u16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x76,0x7c]
-# W64: v_cmp_le_u16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x76,0x7c]
0x7e,0x04,0x76,0x7c
+# W32: v_cmp_le_u16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x76,0x7c]
+# W64: v_cmp_le_u16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x76,0x7c]

-# W32: v_cmp_le_u16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x76,0x7c]
-# W64: v_cmp_le_u16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x76,0x7c]
0x7f,0x04,0x76,0x7c
+# W32: v_cmp_le_u16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x76,0x7c]
+# W64: v_cmp_le_u16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x76,0x7c]

-# W32: v_cmp_le_u16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x76,0x7c]
-# W64: v_cmp_le_u16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x76,0x7c]
0x7c,0x04,0x76,0x7c
+# W32: v_cmp_le_u16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x76,0x7c]
+# W64: v_cmp_le_u16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x76,0x7c]

-# W32: v_cmp_le_u16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x76,0x7c]
-# W64: v_cmp_le_u16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x76,0x7c]
0xc1,0x04,0x76,0x7c
+# W32: v_cmp_le_u16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x76,0x7c]
+# W64: v_cmp_le_u16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x76,0x7c]

-# W32: v_cmp_le_u16_e32 vcc_lo, 0x3800, v2
-# W64: v_cmp_le_u16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x76,0x7c,0x00,0x38,0x00,0x00]
0xf0,0x04,0x76,0x7c
+# W32: v_cmp_le_u16_e32 vcc_lo, 0x3800, v2 ; encoding: [0xff,0x04,0x76,0x7c,0x00,0x38,0x00,0x00]
+# W64: v_cmp_le_u16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x76,0x7c,0x00,0x38,0x00,0x00]

-# W32: v_cmp_le_u16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x76,0x7c]
-# W64: v_cmp_le_u16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x76,0x7c]
0xfd,0x04,0x76,0x7c
+# W32: v_cmp_le_u16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x76,0x7c]
+# W64: v_cmp_le_u16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x76,0x7c]

-# W32: v_cmp_le_u16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x76,0x7c,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_le_u16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x76,0x7c,0x0b,0xfe,0x00,0x00]
0xff,0xfe,0x76,0x7c,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_le_u16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x76,0x7c,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_le_u16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x76,0x7c,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_le_u32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x96,0x7c]
-# W64: v_cmp_le_u32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x96,0x7c]
0x01,0x05,0x96,0x7c
+# W32: v_cmp_le_u32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x96,0x7c]
+# W64: v_cmp_le_u32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x96,0x7c]

-# W32: v_cmp_le_u32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x96,0x7c]
-# W64: v_cmp_le_u32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x96,0x7c]
0xff,0x05,0x96,0x7c
+# W32: v_cmp_le_u32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x96,0x7c]
+# W64: v_cmp_le_u32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x96,0x7c]

-# W32: v_cmp_le_u32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x96,0x7c]
-# W64: v_cmp_le_u32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x96,0x7c]
0x01,0x04,0x96,0x7c
+# W32: v_cmp_le_u32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x96,0x7c]
+# W64: v_cmp_le_u32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x96,0x7c]

-# W32: v_cmp_le_u32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x96,0x7c]
-# W64: v_cmp_le_u32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x96,0x7c]
0x69,0x04,0x96,0x7c
+# W32: v_cmp_le_u32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x96,0x7c]
+# W64: v_cmp_le_u32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x96,0x7c]

-# W32: v_cmp_le_u32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x96,0x7c]
-# W64: v_cmp_le_u32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x96,0x7c]
0x6a,0x04,0x96,0x7c
+# W32: v_cmp_le_u32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x96,0x7c]
+# W64: v_cmp_le_u32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x96,0x7c]

-# W32: v_cmp_le_u32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x96,0x7c]
-# W64: v_cmp_le_u32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x96,0x7c]
0x6b,0x04,0x96,0x7c
+# W32: v_cmp_le_u32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x96,0x7c]
+# W64: v_cmp_le_u32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x96,0x7c]

-# W32: v_cmp_le_u32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x96,0x7c]
-# W64: v_cmp_le_u32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x96,0x7c]
0x7b,0x04,0x96,0x7c
+# W32: v_cmp_le_u32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x96,0x7c]
+# W64: v_cmp_le_u32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x96,0x7c]

-# W32: v_cmp_le_u32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x96,0x7c]
-# W64: v_cmp_le_u32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x96,0x7c]
0x7d,0x04,0x96,0x7c
+# W32: v_cmp_le_u32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x96,0x7c]
+# W64: v_cmp_le_u32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x96,0x7c]

-# W32: v_cmp_le_u32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x96,0x7c]
-# W64: v_cmp_le_u32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x96,0x7c]
0x7e,0x04,0x96,0x7c
+# W32: v_cmp_le_u32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x96,0x7c]
+# W64: v_cmp_le_u32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x96,0x7c]

-# W32: v_cmp_le_u32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x96,0x7c]
-# W64: v_cmp_le_u32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x96,0x7c]
0x7f,0x04,0x96,0x7c
+# W32: v_cmp_le_u32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x96,0x7c]
+# W64: v_cmp_le_u32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x96,0x7c]

-# W32: v_cmp_le_u32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x96,0x7c]
-# W64: v_cmp_le_u32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x96,0x7c]
0x7c,0x04,0x96,0x7c
+# W32: v_cmp_le_u32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x96,0x7c]
+# W64: v_cmp_le_u32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x96,0x7c]

-# W32: v_cmp_le_u32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x96,0x7c]
-# W64: v_cmp_le_u32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x96,0x7c]
0xc1,0x04,0x96,0x7c
+# W32: v_cmp_le_u32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x96,0x7c]
+# W64: v_cmp_le_u32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x96,0x7c]

-# W32: v_cmp_le_u32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x96,0x7c]
-# W64: v_cmp_le_u32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x96,0x7c]
0xf0,0x04,0x96,0x7c
+# W32: v_cmp_le_u32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x96,0x7c]
+# W64: v_cmp_le_u32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x96,0x7c]

-# W32: v_cmp_le_u32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x96,0x7c]
-# W64: v_cmp_le_u32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x96,0x7c]
0xfd,0x04,0x96,0x7c
+# W32: v_cmp_le_u32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x96,0x7c]
+# W64: v_cmp_le_u32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x96,0x7c]

-# W32: v_cmp_le_u32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x97,0x7c,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_le_u32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x97,0x7c,0x56,0x34,0x12,0xaf]
0xff,0xfe,0x97,0x7c,0x56,0x34,0x12,0xaf
+# W32: v_cmp_le_u32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x97,0x7c,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_le_u32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x97,0x7c,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_le_u64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb6,0x7c]
-# W64: v_cmp_le_u64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb6,0x7c]
0x01,0x05,0xb6,0x7c
+# W32: v_cmp_le_u64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb6,0x7c]
+# W64: v_cmp_le_u64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb6,0x7c]

-# W32: v_cmp_le_u64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb6,0x7c]
-# W64: v_cmp_le_u64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb6,0x7c]
0xfe,0x05,0xb6,0x7c
+# W32: v_cmp_le_u64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb6,0x7c]
+# W64: v_cmp_le_u64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb6,0x7c]

-# W32: v_cmp_le_u64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb6,0x7c]
-# W64: v_cmp_le_u64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb6,0x7c]
0x02,0x04,0xb6,0x7c
+# W32: v_cmp_le_u64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb6,0x7c]
+# W64: v_cmp_le_u64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb6,0x7c]

-# W32: v_cmp_le_u64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb6,0x7c]
-# W64: v_cmp_le_u64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb6,0x7c]
0x68,0x04,0xb6,0x7c
+# W32: v_cmp_le_u64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb6,0x7c]
+# W64: v_cmp_le_u64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb6,0x7c]

-# W32: v_cmp_le_u64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb6,0x7c]
-# W64: v_cmp_le_u64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb6,0x7c]
0x6a,0x04,0xb6,0x7c
+# W32: v_cmp_le_u64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb6,0x7c]
+# W64: v_cmp_le_u64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb6,0x7c]

+0x7a,0x04,0xb6,0x7c
# W32: v_cmp_le_u64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb6,0x7c]
# W64: v_cmp_le_u64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb6,0x7c]
-0x7a,0x04,0xb6,0x7c

-# W32: v_cmp_le_u64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xb6,0x7c]
-# W64: v_cmp_le_u64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xb6,0x7c]
0x7e,0x04,0xb6,0x7c
+# W32: v_cmp_le_u64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xb6,0x7c]
+# W64: v_cmp_le_u64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xb6,0x7c]

-# W32: v_cmp_le_u64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xb6,0x7c]
-# W64: v_cmp_le_u64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xb6,0x7c]
0x7c,0x04,0xb6,0x7c
+# W32: v_cmp_le_u64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xb6,0x7c]
+# W64: v_cmp_le_u64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xb6,0x7c]

-# W32: v_cmp_le_u64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xb6,0x7c]
-# W64: v_cmp_le_u64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xb6,0x7c]
0xc1,0x04,0xb6,0x7c
+# W32: v_cmp_le_u64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xb6,0x7c]
+# W64: v_cmp_le_u64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xb6,0x7c]

-# W32: v_cmp_le_u64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb6,0x7c]
-# W64: v_cmp_le_u64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb6,0x7c]
0xf0,0x04,0xb6,0x7c
+# W32: v_cmp_le_u64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb6,0x7c]
+# W64: v_cmp_le_u64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb6,0x7c]

-# W32: v_cmp_le_u64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb6,0x7c]
-# W64: v_cmp_le_u64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb6,0x7c]
0xfd,0x04,0xb6,0x7c
+# W32: v_cmp_le_u64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb6,0x7c]
+# W64: v_cmp_le_u64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb6,0x7c]

+0xff,0xfc,0xb7,0x7c,0x56,0x34,0x12,0xaf
# W32: v_cmp_le_u64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb7,0x7c,0x56,0x34,0x12,0xaf]
# W64: v_cmp_le_u64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb7,0x7c,0x56,0x34,0x12,0xaf]
-0xff,0xfc,0xb7,0x7c,0x56,0x34,0x12,0xaf

-# W32: v_cmp_lg_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x0a,0x7c]
-# W64: v_cmp_lg_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x0a,0x7c]
0x01,0x05,0x0a,0x7c
+# W32: v_cmp_lg_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x0a,0x7c]
+# W64: v_cmp_lg_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x0a,0x7c]

-# W32: v_cmp_lg_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x0a,0x7c]
-# W64: v_cmp_lg_f16_e32 vcc, v127, v2 ; encoding:
[0x7f,0x05,0x0a,0x7c] 0x7f,0x05,0x0a,0x7c +# W32: v_cmp_lg_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x0a,0x7c] +# W64: v_cmp_lg_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x0a,0x7c] -# W32: v_cmp_lg_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x0a,0x7c] -# W64: v_cmp_lg_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x0a,0x7c] 0x01,0x04,0x0a,0x7c +# W32: v_cmp_lg_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x0a,0x7c] +# W64: v_cmp_lg_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x0a,0x7c] -# W32: v_cmp_lg_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x0a,0x7c] -# W64: v_cmp_lg_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x0a,0x7c] 0x69,0x04,0x0a,0x7c +# W32: v_cmp_lg_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x0a,0x7c] +# W64: v_cmp_lg_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x0a,0x7c] -# W32: v_cmp_lg_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x7c] -# W64: v_cmp_lg_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x7c] 0x6a,0x04,0x0a,0x7c +# W32: v_cmp_lg_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x7c] +# W64: v_cmp_lg_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x7c] -# W32: v_cmp_lg_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x7c] -# W64: v_cmp_lg_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x7c] 0x6b,0x04,0x0a,0x7c +# W32: v_cmp_lg_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x7c] +# W64: v_cmp_lg_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x7c] -# W32: v_cmp_lg_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x7c] -# W64: v_cmp_lg_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x7c] 0x7b,0x04,0x0a,0x7c +# W32: v_cmp_lg_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x7c] +# W64: v_cmp_lg_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x7c] -# W32: v_cmp_lg_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x7c] -# W64: v_cmp_lg_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x7c] 0x7d,0x04,0x0a,0x7c +# W32: v_cmp_lg_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x7c] +# W64: v_cmp_lg_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x7c] -# W32: v_cmp_lg_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x7c] -# W64: v_cmp_lg_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x7c] 0x7e,0x04,0x0a,0x7c +# W32: v_cmp_lg_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x7c] +# W64: v_cmp_lg_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x7c] -# W32: v_cmp_lg_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x7c] -# W64: v_cmp_lg_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x7c] 0x7f,0x04,0x0a,0x7c +# W32: v_cmp_lg_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x7c] +# W64: v_cmp_lg_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x7c] -# W32: v_cmp_lg_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x0a,0x7c] -# W64: v_cmp_lg_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x0a,0x7c] 0x7c,0x04,0x0a,0x7c +# W32: v_cmp_lg_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x0a,0x7c] +# W64: v_cmp_lg_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x0a,0x7c] -# W32: v_cmp_lg_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x7c] -# W64: v_cmp_lg_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x7c] 0xc1,0x04,0x0a,0x7c +# W32: v_cmp_lg_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x7c] +# W64: v_cmp_lg_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x7c] -# W32: v_cmp_lg_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x7c] -# W64: v_cmp_lg_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x7c] 
0xf0,0x04,0x0a,0x7c +# W32: v_cmp_lg_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x7c] +# W64: v_cmp_lg_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x7c] -# W32: v_cmp_lg_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x7c] -# W64: v_cmp_lg_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x7c] 0xfd,0x04,0x0a,0x7c +# W32: v_cmp_lg_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x7c] +# W64: v_cmp_lg_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x7c] -# W32: v_cmp_lg_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0a,0x7c,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_lg_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0a,0x7c,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x0a,0x7c,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_lg_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0a,0x7c,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_lg_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0a,0x7c,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_lg_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x2a,0x7c] -# W64: v_cmp_lg_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x2a,0x7c] 0x01,0x05,0x2a,0x7c +# W32: v_cmp_lg_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x2a,0x7c] +# W64: v_cmp_lg_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x2a,0x7c] -# W32: v_cmp_lg_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x2a,0x7c] -# W64: v_cmp_lg_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x2a,0x7c] 0xff,0x05,0x2a,0x7c +# W32: v_cmp_lg_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x2a,0x7c] +# W64: v_cmp_lg_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x2a,0x7c] -# W32: v_cmp_lg_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x2a,0x7c] -# W64: v_cmp_lg_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x2a,0x7c] 0x01,0x04,0x2a,0x7c +# W32: v_cmp_lg_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x2a,0x7c] +# W64: v_cmp_lg_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x2a,0x7c] -# W32: v_cmp_lg_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x2a,0x7c] -# W64: v_cmp_lg_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x2a,0x7c] 0x69,0x04,0x2a,0x7c +# W32: v_cmp_lg_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x2a,0x7c] +# W64: v_cmp_lg_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x2a,0x7c] -# W32: v_cmp_lg_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x2a,0x7c] -# W64: v_cmp_lg_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x2a,0x7c] 0x6a,0x04,0x2a,0x7c +# W32: v_cmp_lg_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x2a,0x7c] +# W64: v_cmp_lg_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x2a,0x7c] -# W32: v_cmp_lg_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x2a,0x7c] -# W64: v_cmp_lg_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x2a,0x7c] 0x6b,0x04,0x2a,0x7c +# W32: v_cmp_lg_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x2a,0x7c] +# W64: v_cmp_lg_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x2a,0x7c] -# W32: v_cmp_lg_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x2a,0x7c] -# W64: v_cmp_lg_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x2a,0x7c] 0x7b,0x04,0x2a,0x7c +# W32: v_cmp_lg_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x2a,0x7c] +# W64: v_cmp_lg_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x2a,0x7c] -# W32: v_cmp_lg_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x2a,0x7c] -# W64: v_cmp_lg_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x2a,0x7c] 0x7d,0x04,0x2a,0x7c +# W32: v_cmp_lg_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x2a,0x7c] +# W64: v_cmp_lg_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x2a,0x7c] -# W32: v_cmp_lg_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x2a,0x7c] 
-# W64: v_cmp_lg_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x2a,0x7c] 0x7e,0x04,0x2a,0x7c +# W32: v_cmp_lg_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x2a,0x7c] +# W64: v_cmp_lg_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x2a,0x7c] -# W32: v_cmp_lg_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x2a,0x7c] -# W64: v_cmp_lg_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x2a,0x7c] 0x7f,0x04,0x2a,0x7c +# W32: v_cmp_lg_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x2a,0x7c] +# W64: v_cmp_lg_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x2a,0x7c] -# W32: v_cmp_lg_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x2a,0x7c] -# W64: v_cmp_lg_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x2a,0x7c] 0x7c,0x04,0x2a,0x7c +# W32: v_cmp_lg_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x2a,0x7c] +# W64: v_cmp_lg_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x2a,0x7c] -# W32: v_cmp_lg_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x2a,0x7c] -# W64: v_cmp_lg_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x2a,0x7c] 0xc1,0x04,0x2a,0x7c +# W32: v_cmp_lg_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x2a,0x7c] +# W64: v_cmp_lg_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x2a,0x7c] -# W32: v_cmp_lg_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x2a,0x7c] -# W64: v_cmp_lg_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x2a,0x7c] 0xf0,0x04,0x2a,0x7c +# W32: v_cmp_lg_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x2a,0x7c] +# W64: v_cmp_lg_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x2a,0x7c] -# W32: v_cmp_lg_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x2a,0x7c] -# W64: v_cmp_lg_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x2a,0x7c] 0xfd,0x04,0x2a,0x7c +# W32: v_cmp_lg_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x2a,0x7c] +# W64: v_cmp_lg_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x2a,0x7c] -# W32: v_cmp_lg_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2b,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_lg_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2b,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x2b,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_lg_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2b,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_lg_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2b,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_lg_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4a,0x7c] -# W64: v_cmp_lg_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4a,0x7c] 0x01,0x05,0x4a,0x7c +# W32: v_cmp_lg_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4a,0x7c] +# W64: v_cmp_lg_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4a,0x7c] -# W32: v_cmp_lg_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4a,0x7c] -# W64: v_cmp_lg_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4a,0x7c] 0xfe,0x05,0x4a,0x7c +# W32: v_cmp_lg_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4a,0x7c] +# W64: v_cmp_lg_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4a,0x7c] -# W32: v_cmp_lg_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x4a,0x7c] -# W64: v_cmp_lg_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x4a,0x7c] 0x02,0x04,0x4a,0x7c +# W32: v_cmp_lg_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x4a,0x7c] +# W64: v_cmp_lg_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x4a,0x7c] -# W32: v_cmp_lg_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4a,0x7c] -# W64: v_cmp_lg_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4a,0x7c] 0x68,0x04,0x4a,0x7c +# W32: 
v_cmp_lg_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4a,0x7c] +# W64: v_cmp_lg_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4a,0x7c] -# W32: v_cmp_lg_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x4a,0x7c] -# W64: v_cmp_lg_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x4a,0x7c] 0x6a,0x04,0x4a,0x7c +# W32: v_cmp_lg_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x4a,0x7c] +# W64: v_cmp_lg_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x4a,0x7c] +0x7a,0x04,0x4a,0x7c # W32: v_cmp_lg_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x4a,0x7c] # W64: v_cmp_lg_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x4a,0x7c] -0x7a,0x04,0x4a,0x7c -# W32: v_cmp_lg_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x4a,0x7c] -# W64: v_cmp_lg_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x4a,0x7c] 0x7e,0x04,0x4a,0x7c +# W32: v_cmp_lg_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x4a,0x7c] +# W64: v_cmp_lg_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x4a,0x7c] -# W32: v_cmp_lg_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x4a,0x7c] -# W64: v_cmp_lg_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x4a,0x7c] 0x7c,0x04,0x4a,0x7c +# W32: v_cmp_lg_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x4a,0x7c] +# W64: v_cmp_lg_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x4a,0x7c] -# W32: v_cmp_lg_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x4a,0x7c] -# W64: v_cmp_lg_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x4a,0x7c] 0xc1,0x04,0x4a,0x7c +# W32: v_cmp_lg_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x4a,0x7c] +# W64: v_cmp_lg_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x4a,0x7c] -# W32: v_cmp_lg_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4a,0x7c] -# W64: v_cmp_lg_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4a,0x7c] 0xf0,0x04,0x4a,0x7c +# W32: v_cmp_lg_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4a,0x7c] +# W64: v_cmp_lg_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4a,0x7c] -# W32: v_cmp_lg_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4a,0x7c] -# W64: v_cmp_lg_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4a,0x7c] 0xfd,0x04,0x4a,0x7c +# W32: v_cmp_lg_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4a,0x7c] +# W64: v_cmp_lg_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4a,0x7c] +0xff,0xfc,0x4b,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_lg_f64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4b,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_lg_f64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4b,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0x4b,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_lt_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x02,0x7c] -# W64: v_cmp_lt_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x02,0x7c] 0x01,0x05,0x02,0x7c +# W32: v_cmp_lt_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x02,0x7c] +# W64: v_cmp_lt_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x02,0x7c] -# W32: v_cmp_lt_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x02,0x7c] -# W64: v_cmp_lt_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x02,0x7c] 0x7f,0x05,0x02,0x7c +# W32: v_cmp_lt_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x02,0x7c] +# W64: v_cmp_lt_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x02,0x7c] -# W32: v_cmp_lt_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x02,0x7c] -# W64: v_cmp_lt_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x02,0x7c] 0x01,0x04,0x02,0x7c +# W32: v_cmp_lt_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x02,0x7c] +# W64: 
v_cmp_lt_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x02,0x7c] -# W32: v_cmp_lt_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x02,0x7c] -# W64: v_cmp_lt_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x02,0x7c] 0x69,0x04,0x02,0x7c +# W32: v_cmp_lt_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x02,0x7c] +# W64: v_cmp_lt_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x02,0x7c] -# W32: v_cmp_lt_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x02,0x7c] -# W64: v_cmp_lt_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x02,0x7c] 0x6a,0x04,0x02,0x7c +# W32: v_cmp_lt_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x02,0x7c] +# W64: v_cmp_lt_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x02,0x7c] -# W32: v_cmp_lt_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x02,0x7c] -# W64: v_cmp_lt_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x02,0x7c] 0x6b,0x04,0x02,0x7c +# W32: v_cmp_lt_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x02,0x7c] +# W64: v_cmp_lt_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x02,0x7c] -# W32: v_cmp_lt_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x02,0x7c] -# W64: v_cmp_lt_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x02,0x7c] 0x7b,0x04,0x02,0x7c +# W32: v_cmp_lt_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x02,0x7c] +# W64: v_cmp_lt_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x02,0x7c] -# W32: v_cmp_lt_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x02,0x7c] -# W64: v_cmp_lt_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x02,0x7c] 0x7d,0x04,0x02,0x7c +# W32: v_cmp_lt_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x02,0x7c] +# W64: v_cmp_lt_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x02,0x7c] -# W32: v_cmp_lt_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x02,0x7c] -# W64: v_cmp_lt_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x02,0x7c] 0x7e,0x04,0x02,0x7c +# W32: v_cmp_lt_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x02,0x7c] +# W64: v_cmp_lt_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x02,0x7c] -# W32: v_cmp_lt_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x02,0x7c] -# W64: v_cmp_lt_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x02,0x7c] 0x7f,0x04,0x02,0x7c +# W32: v_cmp_lt_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x02,0x7c] +# W64: v_cmp_lt_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x02,0x7c] -# W32: v_cmp_lt_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x02,0x7c] -# W64: v_cmp_lt_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x02,0x7c] 0x7c,0x04,0x02,0x7c +# W32: v_cmp_lt_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x02,0x7c] +# W64: v_cmp_lt_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x02,0x7c] -# W32: v_cmp_lt_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x02,0x7c] -# W64: v_cmp_lt_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x02,0x7c] 0xc1,0x04,0x02,0x7c +# W32: v_cmp_lt_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x02,0x7c] +# W64: v_cmp_lt_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x02,0x7c] -# W32: v_cmp_lt_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x02,0x7c] -# W64: v_cmp_lt_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x02,0x7c] 0xf0,0x04,0x02,0x7c +# W32: v_cmp_lt_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x02,0x7c] +# W64: v_cmp_lt_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x02,0x7c] -# W32: v_cmp_lt_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x02,0x7c] -# W64: v_cmp_lt_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x02,0x7c] 0xfd,0x04,0x02,0x7c +# W32: v_cmp_lt_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x02,0x7c] +# W64: v_cmp_lt_f16_e32 vcc, 
src_scc, v2 ; encoding: [0xfd,0x04,0x02,0x7c] -# W32: v_cmp_lt_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x02,0x7c,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_lt_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x02,0x7c,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x02,0x7c,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_lt_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x02,0x7c,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_lt_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x02,0x7c,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_lt_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x22,0x7c] -# W64: v_cmp_lt_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x22,0x7c] 0x01,0x05,0x22,0x7c +# W32: v_cmp_lt_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x22,0x7c] +# W64: v_cmp_lt_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x22,0x7c] -# W32: v_cmp_lt_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x22,0x7c] -# W64: v_cmp_lt_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x22,0x7c] 0xff,0x05,0x22,0x7c +# W32: v_cmp_lt_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x22,0x7c] +# W64: v_cmp_lt_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x22,0x7c] -# W32: v_cmp_lt_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x22,0x7c] -# W64: v_cmp_lt_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x22,0x7c] 0x01,0x04,0x22,0x7c +# W32: v_cmp_lt_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x22,0x7c] +# W64: v_cmp_lt_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x22,0x7c] -# W32: v_cmp_lt_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x22,0x7c] -# W64: v_cmp_lt_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x22,0x7c] 0x69,0x04,0x22,0x7c +# W32: v_cmp_lt_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x22,0x7c] +# W64: v_cmp_lt_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x22,0x7c] -# W32: v_cmp_lt_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x22,0x7c] -# W64: v_cmp_lt_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x22,0x7c] 0x6a,0x04,0x22,0x7c +# W32: v_cmp_lt_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x22,0x7c] +# W64: v_cmp_lt_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x22,0x7c] -# W32: v_cmp_lt_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x22,0x7c] -# W64: v_cmp_lt_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x22,0x7c] 0x6b,0x04,0x22,0x7c +# W32: v_cmp_lt_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x22,0x7c] +# W64: v_cmp_lt_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x22,0x7c] -# W32: v_cmp_lt_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x22,0x7c] -# W64: v_cmp_lt_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x22,0x7c] 0x7b,0x04,0x22,0x7c +# W32: v_cmp_lt_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x22,0x7c] +# W64: v_cmp_lt_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x22,0x7c] -# W32: v_cmp_lt_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x22,0x7c] -# W64: v_cmp_lt_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x22,0x7c] 0x7d,0x04,0x22,0x7c +# W32: v_cmp_lt_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x22,0x7c] +# W64: v_cmp_lt_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x22,0x7c] -# W32: v_cmp_lt_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x22,0x7c] -# W64: v_cmp_lt_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x22,0x7c] 0x7e,0x04,0x22,0x7c +# W32: v_cmp_lt_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x22,0x7c] +# W64: v_cmp_lt_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x22,0x7c] -# W32: v_cmp_lt_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x22,0x7c] -# W64: v_cmp_lt_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x22,0x7c] 0x7f,0x04,0x22,0x7c +# W32: v_cmp_lt_f32_e32 
vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x22,0x7c] +# W64: v_cmp_lt_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x22,0x7c] -# W32: v_cmp_lt_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x22,0x7c] -# W64: v_cmp_lt_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x22,0x7c] 0x7c,0x04,0x22,0x7c +# W32: v_cmp_lt_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x22,0x7c] +# W64: v_cmp_lt_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x22,0x7c] -# W32: v_cmp_lt_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x22,0x7c] -# W64: v_cmp_lt_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x22,0x7c] 0xc1,0x04,0x22,0x7c +# W32: v_cmp_lt_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x22,0x7c] +# W64: v_cmp_lt_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x22,0x7c] -# W32: v_cmp_lt_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x22,0x7c] -# W64: v_cmp_lt_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x22,0x7c] 0xf0,0x04,0x22,0x7c +# W32: v_cmp_lt_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x22,0x7c] +# W64: v_cmp_lt_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x22,0x7c] -# W32: v_cmp_lt_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x22,0x7c] -# W64: v_cmp_lt_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x22,0x7c] 0xfd,0x04,0x22,0x7c +# W32: v_cmp_lt_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x22,0x7c] +# W64: v_cmp_lt_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x22,0x7c] -# W32: v_cmp_lt_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x23,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_lt_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x23,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x23,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_lt_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x23,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_lt_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x23,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_lt_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x42,0x7c] -# W64: v_cmp_lt_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x42,0x7c] 0x01,0x05,0x42,0x7c +# W32: v_cmp_lt_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x42,0x7c] +# W64: v_cmp_lt_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x42,0x7c] -# W32: v_cmp_lt_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x42,0x7c] -# W64: v_cmp_lt_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x42,0x7c] 0xfe,0x05,0x42,0x7c +# W32: v_cmp_lt_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x42,0x7c] +# W64: v_cmp_lt_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x42,0x7c] -# W32: v_cmp_lt_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x42,0x7c] -# W64: v_cmp_lt_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x42,0x7c] 0x02,0x04,0x42,0x7c +# W32: v_cmp_lt_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x42,0x7c] +# W64: v_cmp_lt_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x42,0x7c] -# W32: v_cmp_lt_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x42,0x7c] -# W64: v_cmp_lt_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x42,0x7c] 0x68,0x04,0x42,0x7c +# W32: v_cmp_lt_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x42,0x7c] +# W64: v_cmp_lt_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x42,0x7c] -# W32: v_cmp_lt_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x42,0x7c] -# W64: v_cmp_lt_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x42,0x7c] 0x6a,0x04,0x42,0x7c +# W32: v_cmp_lt_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x42,0x7c] +# W64: v_cmp_lt_f64_e32 vcc, vcc, v[2:3] ; 
encoding: [0x6a,0x04,0x42,0x7c] +0x7a,0x04,0x42,0x7c # W32: v_cmp_lt_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x42,0x7c] # W64: v_cmp_lt_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x42,0x7c] -0x7a,0x04,0x42,0x7c -# W32: v_cmp_lt_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x42,0x7c] -# W64: v_cmp_lt_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x42,0x7c] 0x7e,0x04,0x42,0x7c +# W32: v_cmp_lt_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x42,0x7c] +# W64: v_cmp_lt_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x42,0x7c] -# W32: v_cmp_lt_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x42,0x7c] -# W64: v_cmp_lt_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x42,0x7c] 0x7c,0x04,0x42,0x7c +# W32: v_cmp_lt_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x42,0x7c] +# W64: v_cmp_lt_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x42,0x7c] -# W32: v_cmp_lt_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x42,0x7c] -# W64: v_cmp_lt_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x42,0x7c] 0xc1,0x04,0x42,0x7c +# W32: v_cmp_lt_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x42,0x7c] +# W64: v_cmp_lt_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x42,0x7c] -# W32: v_cmp_lt_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x42,0x7c] -# W64: v_cmp_lt_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x42,0x7c] 0xf0,0x04,0x42,0x7c +# W32: v_cmp_lt_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x42,0x7c] +# W64: v_cmp_lt_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x42,0x7c] -# W32: v_cmp_lt_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x42,0x7c] -# W64: v_cmp_lt_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x42,0x7c] 0xfd,0x04,0x42,0x7c +# W32: v_cmp_lt_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x42,0x7c] +# W64: v_cmp_lt_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x42,0x7c] +0xff,0xfc,0x43,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_lt_f64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x43,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_lt_f64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x43,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0x43,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_lt_i16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x62,0x7c] -# W64: v_cmp_lt_i16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x62,0x7c] 0x01,0x05,0x62,0x7c +# W32: v_cmp_lt_i16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x62,0x7c] +# W64: v_cmp_lt_i16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x62,0x7c] -# W32: v_cmp_lt_i16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x62,0x7c] -# W64: v_cmp_lt_i16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x62,0x7c] 0x7f,0x05,0x62,0x7c +# W32: v_cmp_lt_i16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x62,0x7c] +# W64: v_cmp_lt_i16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x62,0x7c] -# W32: v_cmp_lt_i16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x62,0x7c] -# W64: v_cmp_lt_i16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x62,0x7c] 0x01,0x04,0x62,0x7c +# W32: v_cmp_lt_i16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x62,0x7c] +# W64: v_cmp_lt_i16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x62,0x7c] -# W32: v_cmp_lt_i16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x62,0x7c] -# W64: v_cmp_lt_i16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x62,0x7c] 0x69,0x04,0x62,0x7c +# W32: v_cmp_lt_i16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x62,0x7c] +# W64: v_cmp_lt_i16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x62,0x7c] -# W32: v_cmp_lt_i16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x62,0x7c] -# W64: 
v_cmp_lt_i16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x62,0x7c] 0x6a,0x04,0x62,0x7c +# W32: v_cmp_lt_i16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x62,0x7c] +# W64: v_cmp_lt_i16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x62,0x7c] -# W32: v_cmp_lt_i16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x62,0x7c] -# W64: v_cmp_lt_i16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x62,0x7c] 0x6b,0x04,0x62,0x7c +# W32: v_cmp_lt_i16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x62,0x7c] +# W64: v_cmp_lt_i16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x62,0x7c] -# W32: v_cmp_lt_i16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x62,0x7c] -# W64: v_cmp_lt_i16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x62,0x7c] 0x7b,0x04,0x62,0x7c +# W32: v_cmp_lt_i16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x62,0x7c] +# W64: v_cmp_lt_i16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x62,0x7c] -# W32: v_cmp_lt_i16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x62,0x7c] -# W64: v_cmp_lt_i16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x62,0x7c] 0x7d,0x04,0x62,0x7c +# W32: v_cmp_lt_i16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x62,0x7c] +# W64: v_cmp_lt_i16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x62,0x7c] -# W32: v_cmp_lt_i16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x62,0x7c] -# W64: v_cmp_lt_i16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x62,0x7c] 0x7e,0x04,0x62,0x7c +# W32: v_cmp_lt_i16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x62,0x7c] +# W64: v_cmp_lt_i16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x62,0x7c] -# W32: v_cmp_lt_i16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x62,0x7c] -# W64: v_cmp_lt_i16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x62,0x7c] 0x7f,0x04,0x62,0x7c +# W32: v_cmp_lt_i16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x62,0x7c] +# W64: v_cmp_lt_i16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x62,0x7c] -# W32: v_cmp_lt_i16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x62,0x7c] -# W64: v_cmp_lt_i16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x62,0x7c] 0x7c,0x04,0x62,0x7c +# W32: v_cmp_lt_i16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x62,0x7c] +# W64: v_cmp_lt_i16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x62,0x7c] -# W32: v_cmp_lt_i16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x62,0x7c] -# W64: v_cmp_lt_i16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x62,0x7c] 0xc1,0x04,0x62,0x7c +# W32: v_cmp_lt_i16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x62,0x7c] +# W64: v_cmp_lt_i16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x62,0x7c] -# W32: v_cmp_lt_i16_e32 vcc_lo, 0x3800, v2 -# W64: v_cmp_lt_i16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x62,0x7c,0x00,0x38,0x00,0x00] 0xf0,0x04,0x62,0x7c +# W32: v_cmp_lt_i16_e32 vcc_lo, 0x3800, v2 ; encoding: [0xff,0x04,0x62,0x7c,0x00,0x38,0x00,0x00] +# W64: v_cmp_lt_i16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x62,0x7c,0x00,0x38,0x00,0x00] -# W32: v_cmp_lt_i16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x62,0x7c] -# W64: v_cmp_lt_i16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x62,0x7c] 0xfd,0x04,0x62,0x7c +# W32: v_cmp_lt_i16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x62,0x7c] +# W64: v_cmp_lt_i16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x62,0x7c] -# W32: v_cmp_lt_i16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x62,0x7c,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_lt_i16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x62,0x7c,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x62,0x7c,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_lt_i16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x62,0x7c,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_lt_i16_e32 vcc, 0xfe0b, v127 ; 
encoding: [0xff,0xfe,0x62,0x7c,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_lt_i32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x82,0x7c] -# W64: v_cmp_lt_i32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x82,0x7c] 0x01,0x05,0x82,0x7c +# W32: v_cmp_lt_i32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x82,0x7c] +# W64: v_cmp_lt_i32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x82,0x7c] -# W32: v_cmp_lt_i32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x82,0x7c] -# W64: v_cmp_lt_i32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x82,0x7c] 0xff,0x05,0x82,0x7c +# W32: v_cmp_lt_i32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x82,0x7c] +# W64: v_cmp_lt_i32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x82,0x7c] -# W32: v_cmp_lt_i32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x82,0x7c] -# W64: v_cmp_lt_i32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x82,0x7c] 0x01,0x04,0x82,0x7c +# W32: v_cmp_lt_i32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x82,0x7c] +# W64: v_cmp_lt_i32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x82,0x7c] -# W32: v_cmp_lt_i32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x82,0x7c] -# W64: v_cmp_lt_i32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x82,0x7c] 0x69,0x04,0x82,0x7c +# W32: v_cmp_lt_i32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x82,0x7c] +# W64: v_cmp_lt_i32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x82,0x7c] -# W32: v_cmp_lt_i32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x82,0x7c] -# W64: v_cmp_lt_i32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x82,0x7c] 0x6a,0x04,0x82,0x7c +# W32: v_cmp_lt_i32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x82,0x7c] +# W64: v_cmp_lt_i32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x82,0x7c] -# W32: v_cmp_lt_i32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x82,0x7c] -# W64: v_cmp_lt_i32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x82,0x7c] 0x6b,0x04,0x82,0x7c +# W32: v_cmp_lt_i32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x82,0x7c] +# W64: v_cmp_lt_i32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x82,0x7c] -# W32: v_cmp_lt_i32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x82,0x7c] -# W64: v_cmp_lt_i32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x82,0x7c] 0x7b,0x04,0x82,0x7c +# W32: v_cmp_lt_i32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x82,0x7c] +# W64: v_cmp_lt_i32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x82,0x7c] -# W32: v_cmp_lt_i32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x82,0x7c] -# W64: v_cmp_lt_i32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x82,0x7c] 0x7d,0x04,0x82,0x7c +# W32: v_cmp_lt_i32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x82,0x7c] +# W64: v_cmp_lt_i32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x82,0x7c] -# W32: v_cmp_lt_i32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x82,0x7c] -# W64: v_cmp_lt_i32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x82,0x7c] 0x7e,0x04,0x82,0x7c +# W32: v_cmp_lt_i32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x82,0x7c] +# W64: v_cmp_lt_i32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x82,0x7c] -# W32: v_cmp_lt_i32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x82,0x7c] -# W64: v_cmp_lt_i32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x82,0x7c] 0x7f,0x04,0x82,0x7c +# W32: v_cmp_lt_i32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x82,0x7c] +# W64: v_cmp_lt_i32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x82,0x7c] -# W32: v_cmp_lt_i32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x82,0x7c] -# W64: v_cmp_lt_i32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x82,0x7c] 0x7c,0x04,0x82,0x7c +# W32: v_cmp_lt_i32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x82,0x7c] +# W64: v_cmp_lt_i32_e32 vcc, null, v2 ; encoding: 
[0x7c,0x04,0x82,0x7c] -# W32: v_cmp_lt_i32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x82,0x7c] -# W64: v_cmp_lt_i32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x82,0x7c] 0xc1,0x04,0x82,0x7c +# W32: v_cmp_lt_i32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x82,0x7c] +# W64: v_cmp_lt_i32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x82,0x7c] -# W32: v_cmp_lt_i32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x82,0x7c] -# W64: v_cmp_lt_i32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x82,0x7c] 0xf0,0x04,0x82,0x7c +# W32: v_cmp_lt_i32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x82,0x7c] +# W64: v_cmp_lt_i32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x82,0x7c] -# W32: v_cmp_lt_i32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x82,0x7c] -# W64: v_cmp_lt_i32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x82,0x7c] 0xfd,0x04,0x82,0x7c +# W32: v_cmp_lt_i32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x82,0x7c] +# W64: v_cmp_lt_i32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x82,0x7c] -# W32: v_cmp_lt_i32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x83,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_lt_i32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x83,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x83,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_lt_i32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x83,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_lt_i32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x83,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_lt_i64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa2,0x7c] -# W64: v_cmp_lt_i64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa2,0x7c] 0x01,0x05,0xa2,0x7c +# W32: v_cmp_lt_i64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa2,0x7c] +# W64: v_cmp_lt_i64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa2,0x7c] -# W32: v_cmp_lt_i64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa2,0x7c] -# W64: v_cmp_lt_i64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa2,0x7c] 0xfe,0x05,0xa2,0x7c +# W32: v_cmp_lt_i64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa2,0x7c] +# W64: v_cmp_lt_i64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa2,0x7c] -# W32: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa2,0x7c] -# W64: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa2,0x7c] 0x02,0x04,0xa2,0x7c +# W32: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa2,0x7c] +# W64: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa2,0x7c] -# W32: v_cmp_lt_i64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa2,0x7c] -# W64: v_cmp_lt_i64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa2,0x7c] 0x68,0x04,0xa2,0x7c +# W32: v_cmp_lt_i64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa2,0x7c] +# W64: v_cmp_lt_i64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa2,0x7c] -# W32: v_cmp_lt_i64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa2,0x7c] -# W64: v_cmp_lt_i64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa2,0x7c] 0x6a,0x04,0xa2,0x7c +# W32: v_cmp_lt_i64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa2,0x7c] +# W64: v_cmp_lt_i64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa2,0x7c] +0x7a,0x04,0xa2,0x7c # W32: v_cmp_lt_i64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa2,0x7c] # W64: v_cmp_lt_i64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa2,0x7c] -0x7a,0x04,0xa2,0x7c -# W32: v_cmp_lt_i64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xa2,0x7c] -# W64: v_cmp_lt_i64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xa2,0x7c] 0x7e,0x04,0xa2,0x7c +# W32: 
v_cmp_lt_i64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xa2,0x7c] +# W64: v_cmp_lt_i64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xa2,0x7c] -# W32: v_cmp_lt_i64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xa2,0x7c] -# W64: v_cmp_lt_i64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xa2,0x7c] 0x7c,0x04,0xa2,0x7c +# W32: v_cmp_lt_i64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xa2,0x7c] +# W64: v_cmp_lt_i64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xa2,0x7c] -# W32: v_cmp_lt_i64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xa2,0x7c] -# W64: v_cmp_lt_i64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xa2,0x7c] 0xc1,0x04,0xa2,0x7c +# W32: v_cmp_lt_i64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xa2,0x7c] +# W64: v_cmp_lt_i64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xa2,0x7c] -# W32: v_cmp_lt_i64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa2,0x7c] -# W64: v_cmp_lt_i64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa2,0x7c] 0xf0,0x04,0xa2,0x7c +# W32: v_cmp_lt_i64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa2,0x7c] +# W64: v_cmp_lt_i64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa2,0x7c] -# W32: v_cmp_lt_i64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa2,0x7c] -# W64: v_cmp_lt_i64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa2,0x7c] 0xfd,0x04,0xa2,0x7c +# W32: v_cmp_lt_i64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa2,0x7c] +# W64: v_cmp_lt_i64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa2,0x7c] +0xff,0xfc,0xa3,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_lt_i64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa3,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_lt_i64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa3,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0xa3,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_lt_u16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x72,0x7c] -# W64: v_cmp_lt_u16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x72,0x7c] 0x01,0x05,0x72,0x7c +# W32: v_cmp_lt_u16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x72,0x7c] +# W64: v_cmp_lt_u16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x72,0x7c] -# W32: v_cmp_lt_u16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x72,0x7c] -# W64: v_cmp_lt_u16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x72,0x7c] 0x7f,0x05,0x72,0x7c +# W32: v_cmp_lt_u16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x72,0x7c] +# W64: v_cmp_lt_u16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x72,0x7c] -# W32: v_cmp_lt_u16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x72,0x7c] -# W64: v_cmp_lt_u16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x72,0x7c] 0x01,0x04,0x72,0x7c +# W32: v_cmp_lt_u16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x72,0x7c] +# W64: v_cmp_lt_u16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x72,0x7c] -# W32: v_cmp_lt_u16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x72,0x7c] -# W64: v_cmp_lt_u16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x72,0x7c] 0x69,0x04,0x72,0x7c +# W32: v_cmp_lt_u16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x72,0x7c] +# W64: v_cmp_lt_u16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x72,0x7c] -# W32: v_cmp_lt_u16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x72,0x7c] -# W64: v_cmp_lt_u16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x72,0x7c] 0x6a,0x04,0x72,0x7c +# W32: v_cmp_lt_u16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x72,0x7c] +# W64: v_cmp_lt_u16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x72,0x7c] -# W32: v_cmp_lt_u16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x72,0x7c] -# W64: v_cmp_lt_u16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x72,0x7c] 0x6b,0x04,0x72,0x7c +# W32: 
v_cmp_lt_u16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x72,0x7c] +# W64: v_cmp_lt_u16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x72,0x7c] -# W32: v_cmp_lt_u16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x72,0x7c] -# W64: v_cmp_lt_u16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x72,0x7c] 0x7b,0x04,0x72,0x7c +# W32: v_cmp_lt_u16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x72,0x7c] +# W64: v_cmp_lt_u16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x72,0x7c] -# W32: v_cmp_lt_u16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x72,0x7c] -# W64: v_cmp_lt_u16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x72,0x7c] 0x7d,0x04,0x72,0x7c +# W32: v_cmp_lt_u16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x72,0x7c] +# W64: v_cmp_lt_u16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x72,0x7c] -# W32: v_cmp_lt_u16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x72,0x7c] -# W64: v_cmp_lt_u16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x72,0x7c] 0x7e,0x04,0x72,0x7c +# W32: v_cmp_lt_u16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x72,0x7c] +# W64: v_cmp_lt_u16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x72,0x7c] -# W32: v_cmp_lt_u16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x72,0x7c] -# W64: v_cmp_lt_u16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x72,0x7c] 0x7f,0x04,0x72,0x7c +# W32: v_cmp_lt_u16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x72,0x7c] +# W64: v_cmp_lt_u16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x72,0x7c] -# W32: v_cmp_lt_u16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x72,0x7c] -# W64: v_cmp_lt_u16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x72,0x7c] 0x7c,0x04,0x72,0x7c +# W32: v_cmp_lt_u16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x72,0x7c] +# W64: v_cmp_lt_u16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x72,0x7c] -# W32: v_cmp_lt_u16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x72,0x7c] -# W64: v_cmp_lt_u16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x72,0x7c] 0xc1,0x04,0x72,0x7c +# W32: v_cmp_lt_u16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x72,0x7c] +# W64: v_cmp_lt_u16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x72,0x7c] -# W32: v_cmp_lt_u16_e32 vcc_lo, 0x3800, v2 -# W64: v_cmp_lt_u16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x72,0x7c,0x00,0x38,0x00,0x00] 0xf0,0x04,0x72,0x7c +# W32: v_cmp_lt_u16_e32 vcc_lo, 0x3800, v2 ; encoding: [0xff,0x04,0x72,0x7c,0x00,0x38,0x00,0x00] +# W64: v_cmp_lt_u16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x72,0x7c,0x00,0x38,0x00,0x00] -# W32: v_cmp_lt_u16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x72,0x7c] -# W64: v_cmp_lt_u16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x72,0x7c] 0xfd,0x04,0x72,0x7c +# W32: v_cmp_lt_u16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x72,0x7c] +# W64: v_cmp_lt_u16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x72,0x7c] -# W32: v_cmp_lt_u16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x72,0x7c,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_lt_u16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x72,0x7c,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x72,0x7c,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_lt_u16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x72,0x7c,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_lt_u16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x72,0x7c,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_lt_u32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x92,0x7c] -# W64: v_cmp_lt_u32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x92,0x7c] 0x01,0x05,0x92,0x7c +# W32: v_cmp_lt_u32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x92,0x7c] +# W64: v_cmp_lt_u32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x92,0x7c] -# W32: v_cmp_lt_u32_e32 vcc_lo, v255, v2 ; encoding: 
[0xff,0x05,0x92,0x7c] -# W64: v_cmp_lt_u32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x92,0x7c] 0xff,0x05,0x92,0x7c +# W32: v_cmp_lt_u32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x92,0x7c] +# W64: v_cmp_lt_u32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x92,0x7c] -# W32: v_cmp_lt_u32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x92,0x7c] -# W64: v_cmp_lt_u32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x92,0x7c] 0x01,0x04,0x92,0x7c +# W32: v_cmp_lt_u32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x92,0x7c] +# W64: v_cmp_lt_u32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x92,0x7c] -# W32: v_cmp_lt_u32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x92,0x7c] -# W64: v_cmp_lt_u32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x92,0x7c] 0x69,0x04,0x92,0x7c +# W32: v_cmp_lt_u32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x92,0x7c] +# W64: v_cmp_lt_u32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x92,0x7c] -# W32: v_cmp_lt_u32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x92,0x7c] -# W64: v_cmp_lt_u32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x92,0x7c] 0x6a,0x04,0x92,0x7c +# W32: v_cmp_lt_u32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x92,0x7c] +# W64: v_cmp_lt_u32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x92,0x7c] -# W32: v_cmp_lt_u32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x92,0x7c] -# W64: v_cmp_lt_u32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x92,0x7c] 0x6b,0x04,0x92,0x7c +# W32: v_cmp_lt_u32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x92,0x7c] +# W64: v_cmp_lt_u32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x92,0x7c] -# W32: v_cmp_lt_u32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x92,0x7c] -# W64: v_cmp_lt_u32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x92,0x7c] 0x7b,0x04,0x92,0x7c +# W32: v_cmp_lt_u32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x92,0x7c] +# W64: v_cmp_lt_u32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x92,0x7c] -# W32: v_cmp_lt_u32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x92,0x7c] -# W64: v_cmp_lt_u32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x92,0x7c] 0x7d,0x04,0x92,0x7c +# W32: v_cmp_lt_u32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x92,0x7c] +# W64: v_cmp_lt_u32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x92,0x7c] -# W32: v_cmp_lt_u32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x92,0x7c] -# W64: v_cmp_lt_u32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x92,0x7c] 0x7e,0x04,0x92,0x7c +# W32: v_cmp_lt_u32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x92,0x7c] +# W64: v_cmp_lt_u32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x92,0x7c] -# W32: v_cmp_lt_u32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x92,0x7c] -# W64: v_cmp_lt_u32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x92,0x7c] 0x7f,0x04,0x92,0x7c +# W32: v_cmp_lt_u32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x92,0x7c] +# W64: v_cmp_lt_u32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x92,0x7c] -# W32: v_cmp_lt_u32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x92,0x7c] -# W64: v_cmp_lt_u32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x92,0x7c] 0x7c,0x04,0x92,0x7c +# W32: v_cmp_lt_u32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x92,0x7c] +# W64: v_cmp_lt_u32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x92,0x7c] -# W32: v_cmp_lt_u32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x92,0x7c] -# W64: v_cmp_lt_u32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x92,0x7c] 0xc1,0x04,0x92,0x7c +# W32: v_cmp_lt_u32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x92,0x7c] +# W64: v_cmp_lt_u32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x92,0x7c] -# W32: v_cmp_lt_u32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x92,0x7c] -# W64: 
v_cmp_lt_u32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x92,0x7c] 0xf0,0x04,0x92,0x7c +# W32: v_cmp_lt_u32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x92,0x7c] +# W64: v_cmp_lt_u32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x92,0x7c] -# W32: v_cmp_lt_u32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x92,0x7c] -# W64: v_cmp_lt_u32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x92,0x7c] 0xfd,0x04,0x92,0x7c +# W32: v_cmp_lt_u32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x92,0x7c] +# W64: v_cmp_lt_u32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x92,0x7c] -# W32: v_cmp_lt_u32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x93,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_lt_u32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x93,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x93,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_lt_u32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x93,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_lt_u32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x93,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_lt_u64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb2,0x7c] -# W64: v_cmp_lt_u64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb2,0x7c] 0x01,0x05,0xb2,0x7c +# W32: v_cmp_lt_u64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb2,0x7c] +# W64: v_cmp_lt_u64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb2,0x7c] -# W32: v_cmp_lt_u64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb2,0x7c] -# W64: v_cmp_lt_u64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb2,0x7c] 0xfe,0x05,0xb2,0x7c +# W32: v_cmp_lt_u64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb2,0x7c] +# W64: v_cmp_lt_u64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb2,0x7c] -# W32: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb2,0x7c] -# W64: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb2,0x7c] 0x02,0x04,0xb2,0x7c +# W32: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb2,0x7c] +# W64: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb2,0x7c] -# W32: v_cmp_lt_u64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb2,0x7c] -# W64: v_cmp_lt_u64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb2,0x7c] 0x68,0x04,0xb2,0x7c +# W32: v_cmp_lt_u64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb2,0x7c] +# W64: v_cmp_lt_u64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb2,0x7c] -# W32: v_cmp_lt_u64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb2,0x7c] -# W64: v_cmp_lt_u64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb2,0x7c] 0x6a,0x04,0xb2,0x7c +# W32: v_cmp_lt_u64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb2,0x7c] +# W64: v_cmp_lt_u64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb2,0x7c] +0x7a,0x04,0xb2,0x7c # W32: v_cmp_lt_u64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb2,0x7c] # W64: v_cmp_lt_u64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb2,0x7c] -0x7a,0x04,0xb2,0x7c -# W32: v_cmp_lt_u64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xb2,0x7c] -# W64: v_cmp_lt_u64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xb2,0x7c] 0x7e,0x04,0xb2,0x7c +# W32: v_cmp_lt_u64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xb2,0x7c] +# W64: v_cmp_lt_u64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xb2,0x7c] -# W32: v_cmp_lt_u64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xb2,0x7c] -# W64: v_cmp_lt_u64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xb2,0x7c] 0x7c,0x04,0xb2,0x7c +# W32: v_cmp_lt_u64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xb2,0x7c] +# W64: 
v_cmp_lt_u64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xb2,0x7c] -# W32: v_cmp_lt_u64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xb2,0x7c] -# W64: v_cmp_lt_u64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xb2,0x7c] 0xc1,0x04,0xb2,0x7c +# W32: v_cmp_lt_u64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xb2,0x7c] +# W64: v_cmp_lt_u64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xb2,0x7c] -# W32: v_cmp_lt_u64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb2,0x7c] -# W64: v_cmp_lt_u64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb2,0x7c] 0xf0,0x04,0xb2,0x7c +# W32: v_cmp_lt_u64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb2,0x7c] +# W64: v_cmp_lt_u64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb2,0x7c] -# W32: v_cmp_lt_u64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb2,0x7c] -# W64: v_cmp_lt_u64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb2,0x7c] 0xfd,0x04,0xb2,0x7c +# W32: v_cmp_lt_u64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb2,0x7c] +# W64: v_cmp_lt_u64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb2,0x7c] +0xff,0xfc,0xb3,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_lt_u64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb3,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_lt_u64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb3,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0xb3,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_ne_i16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x6a,0x7c] -# W64: v_cmp_ne_i16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x6a,0x7c] 0x01,0x05,0x6a,0x7c +# W32: v_cmp_ne_i16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x6a,0x7c] +# W64: v_cmp_ne_i16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x6a,0x7c] -# W32: v_cmp_ne_i16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x6a,0x7c] -# W64: v_cmp_ne_i16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x6a,0x7c] 0x7f,0x05,0x6a,0x7c +# W32: v_cmp_ne_i16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x6a,0x7c] +# W64: v_cmp_ne_i16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x6a,0x7c] -# W32: v_cmp_ne_i16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x6a,0x7c] -# W64: v_cmp_ne_i16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x6a,0x7c] 0x01,0x04,0x6a,0x7c +# W32: v_cmp_ne_i16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x6a,0x7c] +# W64: v_cmp_ne_i16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x6a,0x7c] -# W32: v_cmp_ne_i16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x6a,0x7c] -# W64: v_cmp_ne_i16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x6a,0x7c] 0x69,0x04,0x6a,0x7c +# W32: v_cmp_ne_i16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x6a,0x7c] +# W64: v_cmp_ne_i16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x6a,0x7c] -# W32: v_cmp_ne_i16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x6a,0x7c] -# W64: v_cmp_ne_i16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x6a,0x7c] 0x6a,0x04,0x6a,0x7c +# W32: v_cmp_ne_i16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x6a,0x7c] +# W64: v_cmp_ne_i16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x6a,0x7c] -# W32: v_cmp_ne_i16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x6a,0x7c] -# W64: v_cmp_ne_i16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x6a,0x7c] 0x6b,0x04,0x6a,0x7c +# W32: v_cmp_ne_i16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x6a,0x7c] +# W64: v_cmp_ne_i16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x6a,0x7c] -# W32: v_cmp_ne_i16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x6a,0x7c] -# W64: v_cmp_ne_i16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x6a,0x7c] 0x7b,0x04,0x6a,0x7c +# W32: v_cmp_ne_i16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x6a,0x7c] +# W64: v_cmp_ne_i16_e32 
vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x6a,0x7c] -# W32: v_cmp_ne_i16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x6a,0x7c] -# W64: v_cmp_ne_i16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x6a,0x7c] 0x7d,0x04,0x6a,0x7c +# W32: v_cmp_ne_i16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x6a,0x7c] +# W64: v_cmp_ne_i16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x6a,0x7c] -# W32: v_cmp_ne_i16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x6a,0x7c] -# W64: v_cmp_ne_i16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x6a,0x7c] 0x7e,0x04,0x6a,0x7c +# W32: v_cmp_ne_i16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x6a,0x7c] +# W64: v_cmp_ne_i16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x6a,0x7c] -# W32: v_cmp_ne_i16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x6a,0x7c] -# W64: v_cmp_ne_i16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x6a,0x7c] 0x7f,0x04,0x6a,0x7c +# W32: v_cmp_ne_i16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x6a,0x7c] +# W64: v_cmp_ne_i16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x6a,0x7c] -# W32: v_cmp_ne_i16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x6a,0x7c] -# W64: v_cmp_ne_i16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x6a,0x7c] 0x7c,0x04,0x6a,0x7c +# W32: v_cmp_ne_i16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x6a,0x7c] +# W64: v_cmp_ne_i16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x6a,0x7c] -# W32: v_cmp_ne_i16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x6a,0x7c] -# W64: v_cmp_ne_i16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x6a,0x7c] 0xc1,0x04,0x6a,0x7c +# W32: v_cmp_ne_i16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x6a,0x7c] +# W64: v_cmp_ne_i16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x6a,0x7c] -# W32: v_cmp_ne_i16_e32 vcc_lo, 0x3800, v2 -# W64: v_cmp_ne_i16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x6a,0x7c,0x00,0x38,0x00,0x00] 0xf0,0x04,0x6a,0x7c +# W32: v_cmp_ne_i16_e32 vcc_lo, 0x3800, v2 ; encoding: [0xff,0x04,0x6a,0x7c,0x00,0x38,0x00,0x00] +# W64: v_cmp_ne_i16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x6a,0x7c,0x00,0x38,0x00,0x00] -# W32: v_cmp_ne_i16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x6a,0x7c] -# W64: v_cmp_ne_i16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x6a,0x7c] 0xfd,0x04,0x6a,0x7c +# W32: v_cmp_ne_i16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x6a,0x7c] +# W64: v_cmp_ne_i16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x6a,0x7c] -# W32: v_cmp_ne_i16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6a,0x7c,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_ne_i16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6a,0x7c,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x6a,0x7c,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_ne_i16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6a,0x7c,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_ne_i16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6a,0x7c,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_ne_i32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x8a,0x7c] -# W64: v_cmp_ne_i32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x8a,0x7c] 0x01,0x05,0x8a,0x7c +# W32: v_cmp_ne_i32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x8a,0x7c] +# W64: v_cmp_ne_i32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x8a,0x7c] -# W32: v_cmp_ne_i32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x8a,0x7c] -# W64: v_cmp_ne_i32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x8a,0x7c] 0xff,0x05,0x8a,0x7c +# W32: v_cmp_ne_i32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x8a,0x7c] +# W64: v_cmp_ne_i32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x8a,0x7c] -# W32: v_cmp_ne_i32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x8a,0x7c] -# W64: v_cmp_ne_i32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x8a,0x7c] 
0x01,0x04,0x8a,0x7c +# W32: v_cmp_ne_i32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x8a,0x7c] +# W64: v_cmp_ne_i32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x8a,0x7c] -# W32: v_cmp_ne_i32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x8a,0x7c] -# W64: v_cmp_ne_i32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x8a,0x7c] 0x69,0x04,0x8a,0x7c +# W32: v_cmp_ne_i32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x8a,0x7c] +# W64: v_cmp_ne_i32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x8a,0x7c] -# W32: v_cmp_ne_i32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x8a,0x7c] -# W64: v_cmp_ne_i32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x8a,0x7c] 0x6a,0x04,0x8a,0x7c +# W32: v_cmp_ne_i32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x8a,0x7c] +# W64: v_cmp_ne_i32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x8a,0x7c] -# W32: v_cmp_ne_i32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x8a,0x7c] -# W64: v_cmp_ne_i32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x8a,0x7c] 0x6b,0x04,0x8a,0x7c +# W32: v_cmp_ne_i32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x8a,0x7c] +# W64: v_cmp_ne_i32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x8a,0x7c] -# W32: v_cmp_ne_i32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x8a,0x7c] -# W64: v_cmp_ne_i32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x8a,0x7c] 0x7b,0x04,0x8a,0x7c +# W32: v_cmp_ne_i32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x8a,0x7c] +# W64: v_cmp_ne_i32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x8a,0x7c] -# W32: v_cmp_ne_i32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x8a,0x7c] -# W64: v_cmp_ne_i32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x8a,0x7c] 0x7d,0x04,0x8a,0x7c +# W32: v_cmp_ne_i32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x8a,0x7c] +# W64: v_cmp_ne_i32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x8a,0x7c] -# W32: v_cmp_ne_i32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x8a,0x7c] -# W64: v_cmp_ne_i32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x8a,0x7c] 0x7e,0x04,0x8a,0x7c +# W32: v_cmp_ne_i32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x8a,0x7c] +# W64: v_cmp_ne_i32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x8a,0x7c] -# W32: v_cmp_ne_i32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x8a,0x7c] -# W64: v_cmp_ne_i32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x8a,0x7c] 0x7f,0x04,0x8a,0x7c +# W32: v_cmp_ne_i32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x8a,0x7c] +# W64: v_cmp_ne_i32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x8a,0x7c] -# W32: v_cmp_ne_i32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x8a,0x7c] -# W64: v_cmp_ne_i32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x8a,0x7c] 0x7c,0x04,0x8a,0x7c +# W32: v_cmp_ne_i32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x8a,0x7c] +# W64: v_cmp_ne_i32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x8a,0x7c] -# W32: v_cmp_ne_i32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x8a,0x7c] -# W64: v_cmp_ne_i32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x8a,0x7c] 0xc1,0x04,0x8a,0x7c +# W32: v_cmp_ne_i32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x8a,0x7c] +# W64: v_cmp_ne_i32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x8a,0x7c] -# W32: v_cmp_ne_i32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x8a,0x7c] -# W64: v_cmp_ne_i32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x8a,0x7c] 0xf0,0x04,0x8a,0x7c +# W32: v_cmp_ne_i32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x8a,0x7c] +# W64: v_cmp_ne_i32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x8a,0x7c] -# W32: v_cmp_ne_i32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x8a,0x7c] -# W64: v_cmp_ne_i32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x8a,0x7c] 0xfd,0x04,0x8a,0x7c +# 
W32: v_cmp_ne_i32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x8a,0x7c] +# W64: v_cmp_ne_i32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x8a,0x7c] -# W32: v_cmp_ne_i32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x8b,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_ne_i32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x8b,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x8b,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_ne_i32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x8b,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_ne_i32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x8b,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_ne_i64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xaa,0x7c] -# W64: v_cmp_ne_i64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xaa,0x7c] 0x01,0x05,0xaa,0x7c +# W32: v_cmp_ne_i64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xaa,0x7c] +# W64: v_cmp_ne_i64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xaa,0x7c] -# W32: v_cmp_ne_i64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xaa,0x7c] -# W64: v_cmp_ne_i64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xaa,0x7c] 0xfe,0x05,0xaa,0x7c +# W32: v_cmp_ne_i64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xaa,0x7c] +# W64: v_cmp_ne_i64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xaa,0x7c] -# W32: v_cmp_ne_i64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xaa,0x7c] -# W64: v_cmp_ne_i64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xaa,0x7c] 0x02,0x04,0xaa,0x7c +# W32: v_cmp_ne_i64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xaa,0x7c] +# W64: v_cmp_ne_i64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xaa,0x7c] -# W32: v_cmp_ne_i64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xaa,0x7c] -# W64: v_cmp_ne_i64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xaa,0x7c] 0x68,0x04,0xaa,0x7c +# W32: v_cmp_ne_i64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xaa,0x7c] +# W64: v_cmp_ne_i64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xaa,0x7c] -# W32: v_cmp_ne_i64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xaa,0x7c] -# W64: v_cmp_ne_i64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xaa,0x7c] 0x6a,0x04,0xaa,0x7c +# W32: v_cmp_ne_i64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xaa,0x7c] +# W64: v_cmp_ne_i64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xaa,0x7c] +0x7a,0x04,0xaa,0x7c # W32: v_cmp_ne_i64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xaa,0x7c] # W64: v_cmp_ne_i64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xaa,0x7c] -0x7a,0x04,0xaa,0x7c -# W32: v_cmp_ne_i64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xaa,0x7c] -# W64: v_cmp_ne_i64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xaa,0x7c] 0x7e,0x04,0xaa,0x7c +# W32: v_cmp_ne_i64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xaa,0x7c] +# W64: v_cmp_ne_i64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xaa,0x7c] -# W32: v_cmp_ne_i64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xaa,0x7c] -# W64: v_cmp_ne_i64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xaa,0x7c] 0x7c,0x04,0xaa,0x7c +# W32: v_cmp_ne_i64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xaa,0x7c] +# W64: v_cmp_ne_i64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xaa,0x7c] -# W32: v_cmp_ne_i64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xaa,0x7c] -# W64: v_cmp_ne_i64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xaa,0x7c] 0xc1,0x04,0xaa,0x7c +# W32: v_cmp_ne_i64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xaa,0x7c] +# W64: v_cmp_ne_i64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xaa,0x7c] -# W32: 
v_cmp_ne_i64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xaa,0x7c] -# W64: v_cmp_ne_i64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xaa,0x7c] 0xf0,0x04,0xaa,0x7c +# W32: v_cmp_ne_i64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xaa,0x7c] +# W64: v_cmp_ne_i64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xaa,0x7c] -# W32: v_cmp_ne_i64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xaa,0x7c] -# W64: v_cmp_ne_i64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xaa,0x7c] 0xfd,0x04,0xaa,0x7c +# W32: v_cmp_ne_i64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xaa,0x7c] +# W64: v_cmp_ne_i64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xaa,0x7c] +0xff,0xfc,0xab,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_ne_i64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xab,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_ne_i64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xab,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0xab,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_ne_u16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x7a,0x7c] -# W64: v_cmp_ne_u16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x7a,0x7c] 0x01,0x05,0x7a,0x7c +# W32: v_cmp_ne_u16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x7a,0x7c] +# W64: v_cmp_ne_u16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x7a,0x7c] -# W32: v_cmp_ne_u16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x7a,0x7c] -# W64: v_cmp_ne_u16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x7a,0x7c] 0x7f,0x05,0x7a,0x7c +# W32: v_cmp_ne_u16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x7a,0x7c] +# W64: v_cmp_ne_u16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x7a,0x7c] -# W32: v_cmp_ne_u16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x7a,0x7c] -# W64: v_cmp_ne_u16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x7a,0x7c] 0x01,0x04,0x7a,0x7c +# W32: v_cmp_ne_u16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x7a,0x7c] +# W64: v_cmp_ne_u16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x7a,0x7c] -# W32: v_cmp_ne_u16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x7a,0x7c] -# W64: v_cmp_ne_u16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x7a,0x7c] 0x69,0x04,0x7a,0x7c +# W32: v_cmp_ne_u16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x7a,0x7c] +# W64: v_cmp_ne_u16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x7a,0x7c] -# W32: v_cmp_ne_u16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x7a,0x7c] -# W64: v_cmp_ne_u16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x7a,0x7c] 0x6a,0x04,0x7a,0x7c +# W32: v_cmp_ne_u16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x7a,0x7c] +# W64: v_cmp_ne_u16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x7a,0x7c] -# W32: v_cmp_ne_u16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x7a,0x7c] -# W64: v_cmp_ne_u16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x7a,0x7c] 0x6b,0x04,0x7a,0x7c +# W32: v_cmp_ne_u16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x7a,0x7c] +# W64: v_cmp_ne_u16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x7a,0x7c] -# W32: v_cmp_ne_u16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x7a,0x7c] -# W64: v_cmp_ne_u16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x7a,0x7c] 0x7b,0x04,0x7a,0x7c +# W32: v_cmp_ne_u16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x7a,0x7c] +# W64: v_cmp_ne_u16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x7a,0x7c] -# W32: v_cmp_ne_u16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x7a,0x7c] -# W64: v_cmp_ne_u16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x7a,0x7c] 0x7d,0x04,0x7a,0x7c +# W32: v_cmp_ne_u16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x7a,0x7c] +# W64: v_cmp_ne_u16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x7a,0x7c] -# W32: v_cmp_ne_u16_e32 vcc_lo, exec_lo, v2 
; encoding: [0x7e,0x04,0x7a,0x7c] -# W64: v_cmp_ne_u16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x7a,0x7c] 0x7e,0x04,0x7a,0x7c +# W32: v_cmp_ne_u16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x7a,0x7c] +# W64: v_cmp_ne_u16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x7a,0x7c] -# W32: v_cmp_ne_u16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x7a,0x7c] -# W64: v_cmp_ne_u16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x7a,0x7c] 0x7f,0x04,0x7a,0x7c +# W32: v_cmp_ne_u16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x7a,0x7c] +# W64: v_cmp_ne_u16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x7a,0x7c] -# W32: v_cmp_ne_u16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x7a,0x7c] -# W64: v_cmp_ne_u16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x7a,0x7c] 0x7c,0x04,0x7a,0x7c +# W32: v_cmp_ne_u16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x7a,0x7c] +# W64: v_cmp_ne_u16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x7a,0x7c] -# W32: v_cmp_ne_u16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x7a,0x7c] -# W64: v_cmp_ne_u16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x7a,0x7c] 0xc1,0x04,0x7a,0x7c +# W32: v_cmp_ne_u16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x7a,0x7c] +# W64: v_cmp_ne_u16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x7a,0x7c] -# W32: v_cmp_ne_u16_e32 vcc_lo, 0x3800, v2 -# W64: v_cmp_ne_u16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x7a,0x7c,0x00,0x38,0x00,0x00] 0xf0,0x04,0x7a,0x7c +# W32: v_cmp_ne_u16_e32 vcc_lo, 0x3800, v2 ; encoding: [0xff,0x04,0x7a,0x7c,0x00,0x38,0x00,0x00] +# W64: v_cmp_ne_u16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x7a,0x7c,0x00,0x38,0x00,0x00] -# W32: v_cmp_ne_u16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x7a,0x7c] -# W64: v_cmp_ne_u16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x7a,0x7c] 0xfd,0x04,0x7a,0x7c +# W32: v_cmp_ne_u16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x7a,0x7c] +# W64: v_cmp_ne_u16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x7a,0x7c] -# W32: v_cmp_ne_u16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x7a,0x7c,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_ne_u16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x7a,0x7c,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x7a,0x7c,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_ne_u16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x7a,0x7c,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_ne_u16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x7a,0x7c,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_ne_u32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x9a,0x7c] -# W64: v_cmp_ne_u32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x9a,0x7c] 0x01,0x05,0x9a,0x7c +# W32: v_cmp_ne_u32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x9a,0x7c] +# W64: v_cmp_ne_u32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x9a,0x7c] -# W32: v_cmp_ne_u32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x9a,0x7c] -# W64: v_cmp_ne_u32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x9a,0x7c] 0xff,0x05,0x9a,0x7c +# W32: v_cmp_ne_u32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x9a,0x7c] +# W64: v_cmp_ne_u32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x9a,0x7c] -# W32: v_cmp_ne_u32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x9a,0x7c] -# W64: v_cmp_ne_u32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x9a,0x7c] 0x01,0x04,0x9a,0x7c +# W32: v_cmp_ne_u32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x9a,0x7c] +# W64: v_cmp_ne_u32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x9a,0x7c] -# W32: v_cmp_ne_u32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x9a,0x7c] -# W64: v_cmp_ne_u32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x9a,0x7c] 0x69,0x04,0x9a,0x7c +# W32: v_cmp_ne_u32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x9a,0x7c] +# W64: 
v_cmp_ne_u32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x9a,0x7c] -# W32: v_cmp_ne_u32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x9a,0x7c] -# W64: v_cmp_ne_u32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x9a,0x7c] 0x6a,0x04,0x9a,0x7c +# W32: v_cmp_ne_u32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x9a,0x7c] +# W64: v_cmp_ne_u32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x9a,0x7c] -# W32: v_cmp_ne_u32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x9a,0x7c] -# W64: v_cmp_ne_u32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x9a,0x7c] 0x6b,0x04,0x9a,0x7c +# W32: v_cmp_ne_u32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x9a,0x7c] +# W64: v_cmp_ne_u32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x9a,0x7c] -# W32: v_cmp_ne_u32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x9a,0x7c] -# W64: v_cmp_ne_u32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x9a,0x7c] 0x7b,0x04,0x9a,0x7c +# W32: v_cmp_ne_u32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x9a,0x7c] +# W64: v_cmp_ne_u32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x9a,0x7c] -# W32: v_cmp_ne_u32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x9a,0x7c] -# W64: v_cmp_ne_u32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x9a,0x7c] 0x7d,0x04,0x9a,0x7c +# W32: v_cmp_ne_u32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x9a,0x7c] +# W64: v_cmp_ne_u32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x9a,0x7c] -# W32: v_cmp_ne_u32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x9a,0x7c] -# W64: v_cmp_ne_u32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x9a,0x7c] 0x7e,0x04,0x9a,0x7c +# W32: v_cmp_ne_u32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x9a,0x7c] +# W64: v_cmp_ne_u32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x9a,0x7c] -# W32: v_cmp_ne_u32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x9a,0x7c] -# W64: v_cmp_ne_u32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x9a,0x7c] 0x7f,0x04,0x9a,0x7c +# W32: v_cmp_ne_u32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x9a,0x7c] +# W64: v_cmp_ne_u32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x9a,0x7c] -# W32: v_cmp_ne_u32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x9a,0x7c] -# W64: v_cmp_ne_u32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x9a,0x7c] 0x7c,0x04,0x9a,0x7c +# W32: v_cmp_ne_u32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x9a,0x7c] +# W64: v_cmp_ne_u32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x9a,0x7c] -# W32: v_cmp_ne_u32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x9a,0x7c] -# W64: v_cmp_ne_u32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x9a,0x7c] 0xc1,0x04,0x9a,0x7c +# W32: v_cmp_ne_u32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x9a,0x7c] +# W64: v_cmp_ne_u32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x9a,0x7c] -# W32: v_cmp_ne_u32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x9a,0x7c] -# W64: v_cmp_ne_u32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x9a,0x7c] 0xf0,0x04,0x9a,0x7c +# W32: v_cmp_ne_u32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x9a,0x7c] +# W64: v_cmp_ne_u32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x9a,0x7c] -# W32: v_cmp_ne_u32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x9a,0x7c] -# W64: v_cmp_ne_u32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x9a,0x7c] 0xfd,0x04,0x9a,0x7c +# W32: v_cmp_ne_u32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x9a,0x7c] +# W64: v_cmp_ne_u32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x9a,0x7c] -# W32: v_cmp_ne_u32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x9b,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_ne_u32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x9b,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x9b,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_ne_u32_e32 
vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x9b,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_ne_u32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x9b,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_ne_u64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xba,0x7c] -# W64: v_cmp_ne_u64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xba,0x7c] 0x01,0x05,0xba,0x7c +# W32: v_cmp_ne_u64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xba,0x7c] +# W64: v_cmp_ne_u64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xba,0x7c] -# W32: v_cmp_ne_u64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xba,0x7c] -# W64: v_cmp_ne_u64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xba,0x7c] 0xfe,0x05,0xba,0x7c +# W32: v_cmp_ne_u64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xba,0x7c] +# W64: v_cmp_ne_u64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xba,0x7c] -# W32: v_cmp_ne_u64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xba,0x7c] -# W64: v_cmp_ne_u64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xba,0x7c] 0x02,0x04,0xba,0x7c +# W32: v_cmp_ne_u64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xba,0x7c] +# W64: v_cmp_ne_u64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xba,0x7c] -# W32: v_cmp_ne_u64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xba,0x7c] -# W64: v_cmp_ne_u64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xba,0x7c] 0x68,0x04,0xba,0x7c +# W32: v_cmp_ne_u64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xba,0x7c] +# W64: v_cmp_ne_u64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xba,0x7c] -# W32: v_cmp_ne_u64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xba,0x7c] -# W64: v_cmp_ne_u64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xba,0x7c] 0x6a,0x04,0xba,0x7c +# W32: v_cmp_ne_u64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xba,0x7c] +# W64: v_cmp_ne_u64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xba,0x7c] +0x7a,0x04,0xba,0x7c # W32: v_cmp_ne_u64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xba,0x7c] # W64: v_cmp_ne_u64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xba,0x7c] -0x7a,0x04,0xba,0x7c -# W32: v_cmp_ne_u64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xba,0x7c] -# W64: v_cmp_ne_u64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xba,0x7c] 0x7e,0x04,0xba,0x7c +# W32: v_cmp_ne_u64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xba,0x7c] +# W64: v_cmp_ne_u64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xba,0x7c] -# W32: v_cmp_ne_u64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xba,0x7c] -# W64: v_cmp_ne_u64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xba,0x7c] 0x7c,0x04,0xba,0x7c +# W32: v_cmp_ne_u64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xba,0x7c] +# W64: v_cmp_ne_u64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xba,0x7c] -# W32: v_cmp_ne_u64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xba,0x7c] -# W64: v_cmp_ne_u64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xba,0x7c] 0xc1,0x04,0xba,0x7c +# W32: v_cmp_ne_u64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xba,0x7c] +# W64: v_cmp_ne_u64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xba,0x7c] -# W32: v_cmp_ne_u64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xba,0x7c] -# W64: v_cmp_ne_u64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xba,0x7c] 0xf0,0x04,0xba,0x7c +# W32: v_cmp_ne_u64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xba,0x7c] +# W64: v_cmp_ne_u64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xba,0x7c] -# W32: v_cmp_ne_u64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xba,0x7c] -# W64: v_cmp_ne_u64_e32 
vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xba,0x7c] 0xfd,0x04,0xba,0x7c +# W32: v_cmp_ne_u64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xba,0x7c] +# W64: v_cmp_ne_u64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xba,0x7c] +0xff,0xfc,0xbb,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_ne_u64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbb,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_ne_u64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbb,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0xbb,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_neq_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x1a,0x7c] -# W64: v_cmp_neq_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x1a,0x7c] 0x01,0x05,0x1a,0x7c +# W32: v_cmp_neq_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x1a,0x7c] +# W64: v_cmp_neq_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x1a,0x7c] -# W32: v_cmp_neq_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x1a,0x7c] -# W64: v_cmp_neq_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x1a,0x7c] 0x7f,0x05,0x1a,0x7c +# W32: v_cmp_neq_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x1a,0x7c] +# W64: v_cmp_neq_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x1a,0x7c] -# W32: v_cmp_neq_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x1a,0x7c] -# W64: v_cmp_neq_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x1a,0x7c] 0x01,0x04,0x1a,0x7c +# W32: v_cmp_neq_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x1a,0x7c] +# W64: v_cmp_neq_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x1a,0x7c] -# W32: v_cmp_neq_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x1a,0x7c] -# W64: v_cmp_neq_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x1a,0x7c] 0x69,0x04,0x1a,0x7c +# W32: v_cmp_neq_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x1a,0x7c] +# W64: v_cmp_neq_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x1a,0x7c] -# W32: v_cmp_neq_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x1a,0x7c] -# W64: v_cmp_neq_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x1a,0x7c] 0x6a,0x04,0x1a,0x7c +# W32: v_cmp_neq_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x1a,0x7c] +# W64: v_cmp_neq_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x1a,0x7c] -# W32: v_cmp_neq_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x1a,0x7c] -# W64: v_cmp_neq_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x1a,0x7c] 0x6b,0x04,0x1a,0x7c +# W32: v_cmp_neq_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x1a,0x7c] +# W64: v_cmp_neq_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x1a,0x7c] -# W32: v_cmp_neq_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x1a,0x7c] -# W64: v_cmp_neq_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x1a,0x7c] 0x7b,0x04,0x1a,0x7c +# W32: v_cmp_neq_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x1a,0x7c] +# W64: v_cmp_neq_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x1a,0x7c] -# W32: v_cmp_neq_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x1a,0x7c] -# W64: v_cmp_neq_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x1a,0x7c] 0x7d,0x04,0x1a,0x7c +# W32: v_cmp_neq_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x1a,0x7c] +# W64: v_cmp_neq_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x1a,0x7c] -# W32: v_cmp_neq_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x1a,0x7c] -# W64: v_cmp_neq_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x1a,0x7c] 0x7e,0x04,0x1a,0x7c +# W32: v_cmp_neq_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x1a,0x7c] +# W64: v_cmp_neq_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x1a,0x7c] -# W32: v_cmp_neq_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x1a,0x7c] -# W64: v_cmp_neq_f16_e32 
vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x1a,0x7c] 0x7f,0x04,0x1a,0x7c +# W32: v_cmp_neq_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x1a,0x7c] +# W64: v_cmp_neq_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x1a,0x7c] -# W32: v_cmp_neq_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x1a,0x7c] -# W64: v_cmp_neq_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x1a,0x7c] 0x7c,0x04,0x1a,0x7c +# W32: v_cmp_neq_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x1a,0x7c] +# W64: v_cmp_neq_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x1a,0x7c] -# W32: v_cmp_neq_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x1a,0x7c] -# W64: v_cmp_neq_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x1a,0x7c] 0xc1,0x04,0x1a,0x7c +# W32: v_cmp_neq_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x1a,0x7c] +# W64: v_cmp_neq_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x1a,0x7c] -# W32: v_cmp_neq_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x1a,0x7c] -# W64: v_cmp_neq_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x1a,0x7c] 0xf0,0x04,0x1a,0x7c +# W32: v_cmp_neq_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x1a,0x7c] +# W64: v_cmp_neq_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x1a,0x7c] -# W32: v_cmp_neq_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x1a,0x7c] -# W64: v_cmp_neq_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x1a,0x7c] 0xfd,0x04,0x1a,0x7c +# W32: v_cmp_neq_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x1a,0x7c] +# W64: v_cmp_neq_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x1a,0x7c] -# W32: v_cmp_neq_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1a,0x7c,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_neq_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1a,0x7c,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x1a,0x7c,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_neq_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1a,0x7c,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_neq_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1a,0x7c,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_neq_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x3a,0x7c] -# W64: v_cmp_neq_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x3a,0x7c] 0x01,0x05,0x3a,0x7c +# W32: v_cmp_neq_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x3a,0x7c] +# W64: v_cmp_neq_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x3a,0x7c] -# W32: v_cmp_neq_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x3a,0x7c] -# W64: v_cmp_neq_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x3a,0x7c] 0xff,0x05,0x3a,0x7c +# W32: v_cmp_neq_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x3a,0x7c] +# W64: v_cmp_neq_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x3a,0x7c] -# W32: v_cmp_neq_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x3a,0x7c] -# W64: v_cmp_neq_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x3a,0x7c] 0x01,0x04,0x3a,0x7c +# W32: v_cmp_neq_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x3a,0x7c] +# W64: v_cmp_neq_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x3a,0x7c] -# W32: v_cmp_neq_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x3a,0x7c] -# W64: v_cmp_neq_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x3a,0x7c] 0x69,0x04,0x3a,0x7c +# W32: v_cmp_neq_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x3a,0x7c] +# W64: v_cmp_neq_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x3a,0x7c] -# W32: v_cmp_neq_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x3a,0x7c] -# W64: v_cmp_neq_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x3a,0x7c] 0x6a,0x04,0x3a,0x7c +# W32: v_cmp_neq_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x3a,0x7c] +# W64: v_cmp_neq_f32_e32 vcc, vcc_lo, v2 ; encoding: 
[0x6a,0x04,0x3a,0x7c] -# W32: v_cmp_neq_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x3a,0x7c] -# W64: v_cmp_neq_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x3a,0x7c] 0x6b,0x04,0x3a,0x7c +# W32: v_cmp_neq_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x3a,0x7c] +# W64: v_cmp_neq_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x3a,0x7c] -# W32: v_cmp_neq_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x3a,0x7c] -# W64: v_cmp_neq_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x3a,0x7c] 0x7b,0x04,0x3a,0x7c +# W32: v_cmp_neq_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x3a,0x7c] +# W64: v_cmp_neq_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x3a,0x7c] -# W32: v_cmp_neq_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x3a,0x7c] -# W64: v_cmp_neq_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x3a,0x7c] 0x7d,0x04,0x3a,0x7c +# W32: v_cmp_neq_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x3a,0x7c] +# W64: v_cmp_neq_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x3a,0x7c] -# W32: v_cmp_neq_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x3a,0x7c] -# W64: v_cmp_neq_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x3a,0x7c] 0x7e,0x04,0x3a,0x7c +# W32: v_cmp_neq_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x3a,0x7c] +# W64: v_cmp_neq_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x3a,0x7c] -# W32: v_cmp_neq_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x3a,0x7c] -# W64: v_cmp_neq_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x3a,0x7c] 0x7f,0x04,0x3a,0x7c +# W32: v_cmp_neq_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x3a,0x7c] +# W64: v_cmp_neq_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x3a,0x7c] -# W32: v_cmp_neq_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x3a,0x7c] -# W64: v_cmp_neq_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x3a,0x7c] 0x7c,0x04,0x3a,0x7c +# W32: v_cmp_neq_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x3a,0x7c] +# W64: v_cmp_neq_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x3a,0x7c] -# W32: v_cmp_neq_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x3a,0x7c] -# W64: v_cmp_neq_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x3a,0x7c] 0xc1,0x04,0x3a,0x7c +# W32: v_cmp_neq_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x3a,0x7c] +# W64: v_cmp_neq_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x3a,0x7c] -# W32: v_cmp_neq_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x3a,0x7c] -# W64: v_cmp_neq_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x3a,0x7c] 0xf0,0x04,0x3a,0x7c +# W32: v_cmp_neq_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x3a,0x7c] +# W64: v_cmp_neq_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x3a,0x7c] -# W32: v_cmp_neq_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x3a,0x7c] -# W64: v_cmp_neq_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x3a,0x7c] 0xfd,0x04,0x3a,0x7c +# W32: v_cmp_neq_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x3a,0x7c] +# W64: v_cmp_neq_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x3a,0x7c] -# W32: v_cmp_neq_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x3b,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_neq_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x3b,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x3b,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_neq_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x3b,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_neq_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x3b,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_neq_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x5a,0x7c] -# W64: v_cmp_neq_f64_e32 vcc, v[1:2], v[2:3] ; encoding: 
[0x01,0x05,0x5a,0x7c] 0x01,0x05,0x5a,0x7c +# W32: v_cmp_neq_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x5a,0x7c] +# W64: v_cmp_neq_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x5a,0x7c] +0xfe,0x05,0x5a,0x7c # W32: v_cmp_neq_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x5a,0x7c] # W64: v_cmp_neq_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x5a,0x7c] -0xfe,0x05,0x5a,0x7c -# W32: v_cmp_neq_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x5a,0x7c] -# W64: v_cmp_neq_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x5a,0x7c] 0x02,0x04,0x5a,0x7c +# W32: v_cmp_neq_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x5a,0x7c] +# W64: v_cmp_neq_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x5a,0x7c] +0x68,0x04,0x5a,0x7c # W32: v_cmp_neq_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x5a,0x7c] # W64: v_cmp_neq_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x5a,0x7c] -0x68,0x04,0x5a,0x7c -# W32: v_cmp_neq_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x5a,0x7c] -# W64: v_cmp_neq_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x5a,0x7c] 0x6a,0x04,0x5a,0x7c +# W32: v_cmp_neq_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x5a,0x7c] +# W64: v_cmp_neq_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x5a,0x7c] +0x7a,0x04,0x5a,0x7c # W32: v_cmp_neq_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x5a,0x7c] # W64: v_cmp_neq_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x5a,0x7c] -0x7a,0x04,0x5a,0x7c -# W32: v_cmp_neq_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x5a,0x7c] -# W64: v_cmp_neq_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x5a,0x7c] 0x7e,0x04,0x5a,0x7c +# W32: v_cmp_neq_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x5a,0x7c] +# W64: v_cmp_neq_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x5a,0x7c] -# W32: v_cmp_neq_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x5a,0x7c] -# W64: v_cmp_neq_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x5a,0x7c] 0x7c,0x04,0x5a,0x7c +# W32: v_cmp_neq_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x5a,0x7c] +# W64: v_cmp_neq_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x5a,0x7c] -# W32: v_cmp_neq_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x5a,0x7c] -# W64: v_cmp_neq_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x5a,0x7c] 0xc1,0x04,0x5a,0x7c +# W32: v_cmp_neq_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x5a,0x7c] +# W64: v_cmp_neq_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x5a,0x7c] -# W32: v_cmp_neq_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x5a,0x7c] -# W64: v_cmp_neq_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x5a,0x7c] 0xf0,0x04,0x5a,0x7c +# W32: v_cmp_neq_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x5a,0x7c] +# W64: v_cmp_neq_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x5a,0x7c] -# W32: v_cmp_neq_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x5a,0x7c] -# W64: v_cmp_neq_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x5a,0x7c] 0xfd,0x04,0x5a,0x7c +# W32: v_cmp_neq_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x5a,0x7c] +# W64: v_cmp_neq_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x5a,0x7c] +0xff,0xfc,0x5b,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_neq_f64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5b,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_neq_f64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5b,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0x5b,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_nge_f16_e32 vcc_lo, v1, v2 ; 
encoding: [0x01,0x05,0x12,0x7c] -# W64: v_cmp_nge_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x12,0x7c] 0x01,0x05,0x12,0x7c +# W32: v_cmp_nge_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x12,0x7c] +# W64: v_cmp_nge_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x12,0x7c] -# W32: v_cmp_nge_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x12,0x7c] -# W64: v_cmp_nge_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x12,0x7c] 0x7f,0x05,0x12,0x7c +# W32: v_cmp_nge_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x12,0x7c] +# W64: v_cmp_nge_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x12,0x7c] -# W32: v_cmp_nge_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x12,0x7c] -# W64: v_cmp_nge_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x12,0x7c] 0x01,0x04,0x12,0x7c +# W32: v_cmp_nge_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x12,0x7c] +# W64: v_cmp_nge_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x12,0x7c] -# W32: v_cmp_nge_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x12,0x7c] -# W64: v_cmp_nge_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x12,0x7c] 0x69,0x04,0x12,0x7c +# W32: v_cmp_nge_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x12,0x7c] +# W64: v_cmp_nge_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x12,0x7c] -# W32: v_cmp_nge_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x12,0x7c] -# W64: v_cmp_nge_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x12,0x7c] 0x6a,0x04,0x12,0x7c +# W32: v_cmp_nge_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x12,0x7c] +# W64: v_cmp_nge_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x12,0x7c] -# W32: v_cmp_nge_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x12,0x7c] -# W64: v_cmp_nge_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x12,0x7c] 0x6b,0x04,0x12,0x7c +# W32: v_cmp_nge_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x12,0x7c] +# W64: v_cmp_nge_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x12,0x7c] -# W32: v_cmp_nge_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x12,0x7c] -# W64: v_cmp_nge_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x12,0x7c] 0x7b,0x04,0x12,0x7c +# W32: v_cmp_nge_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x12,0x7c] +# W64: v_cmp_nge_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x12,0x7c] -# W32: v_cmp_nge_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x12,0x7c] -# W64: v_cmp_nge_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x12,0x7c] 0x7d,0x04,0x12,0x7c +# W32: v_cmp_nge_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x12,0x7c] +# W64: v_cmp_nge_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x12,0x7c] -# W32: v_cmp_nge_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x12,0x7c] -# W64: v_cmp_nge_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x12,0x7c] 0x7e,0x04,0x12,0x7c +# W32: v_cmp_nge_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x12,0x7c] +# W64: v_cmp_nge_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x12,0x7c] -# W32: v_cmp_nge_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x12,0x7c] -# W64: v_cmp_nge_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x12,0x7c] 0x7f,0x04,0x12,0x7c +# W32: v_cmp_nge_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x12,0x7c] +# W64: v_cmp_nge_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x12,0x7c] -# W32: v_cmp_nge_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x12,0x7c] -# W64: v_cmp_nge_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x12,0x7c] 0x7c,0x04,0x12,0x7c +# W32: v_cmp_nge_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x12,0x7c] +# W64: v_cmp_nge_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x12,0x7c] -# W32: v_cmp_nge_f16_e32 
vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x12,0x7c] -# W64: v_cmp_nge_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x12,0x7c] 0xc1,0x04,0x12,0x7c +# W32: v_cmp_nge_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x12,0x7c] +# W64: v_cmp_nge_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x12,0x7c] -# W32: v_cmp_nge_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x12,0x7c] -# W64: v_cmp_nge_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x12,0x7c] 0xf0,0x04,0x12,0x7c +# W32: v_cmp_nge_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x12,0x7c] +# W64: v_cmp_nge_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x12,0x7c] -# W32: v_cmp_nge_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x12,0x7c] -# W64: v_cmp_nge_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x12,0x7c] 0xfd,0x04,0x12,0x7c +# W32: v_cmp_nge_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x12,0x7c] +# W64: v_cmp_nge_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x12,0x7c] -# W32: v_cmp_nge_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x12,0x7c,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_nge_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x12,0x7c,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x12,0x7c,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_nge_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x12,0x7c,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_nge_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x12,0x7c,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_nge_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x32,0x7c] -# W64: v_cmp_nge_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x32,0x7c] 0x01,0x05,0x32,0x7c +# W32: v_cmp_nge_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x32,0x7c] +# W64: v_cmp_nge_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x32,0x7c] -# W32: v_cmp_nge_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x32,0x7c] -# W64: v_cmp_nge_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x32,0x7c] 0xff,0x05,0x32,0x7c +# W32: v_cmp_nge_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x32,0x7c] +# W64: v_cmp_nge_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x32,0x7c] -# W32: v_cmp_nge_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x32,0x7c] -# W64: v_cmp_nge_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x32,0x7c] 0x01,0x04,0x32,0x7c +# W32: v_cmp_nge_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x32,0x7c] +# W64: v_cmp_nge_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x32,0x7c] -# W32: v_cmp_nge_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x32,0x7c] -# W64: v_cmp_nge_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x32,0x7c] 0x69,0x04,0x32,0x7c +# W32: v_cmp_nge_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x32,0x7c] +# W64: v_cmp_nge_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x32,0x7c] -# W32: v_cmp_nge_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x32,0x7c] -# W64: v_cmp_nge_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x32,0x7c] 0x6a,0x04,0x32,0x7c +# W32: v_cmp_nge_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x32,0x7c] +# W64: v_cmp_nge_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x32,0x7c] -# W32: v_cmp_nge_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x32,0x7c] -# W64: v_cmp_nge_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x32,0x7c] 0x6b,0x04,0x32,0x7c +# W32: v_cmp_nge_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x32,0x7c] +# W64: v_cmp_nge_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x32,0x7c] -# W32: v_cmp_nge_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x32,0x7c] -# W64: v_cmp_nge_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x32,0x7c] 0x7b,0x04,0x32,0x7c +# W32: v_cmp_nge_f32_e32 vcc_lo, ttmp15, v2 ; encoding: 
[0x7b,0x04,0x32,0x7c] +# W64: v_cmp_nge_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x32,0x7c] -# W32: v_cmp_nge_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x32,0x7c] -# W64: v_cmp_nge_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x32,0x7c] 0x7d,0x04,0x32,0x7c +# W32: v_cmp_nge_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x32,0x7c] +# W64: v_cmp_nge_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x32,0x7c] -# W32: v_cmp_nge_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x32,0x7c] -# W64: v_cmp_nge_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x32,0x7c] 0x7e,0x04,0x32,0x7c +# W32: v_cmp_nge_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x32,0x7c] +# W64: v_cmp_nge_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x32,0x7c] -# W32: v_cmp_nge_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x32,0x7c] -# W64: v_cmp_nge_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x32,0x7c] 0x7f,0x04,0x32,0x7c +# W32: v_cmp_nge_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x32,0x7c] +# W64: v_cmp_nge_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x32,0x7c] -# W32: v_cmp_nge_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x32,0x7c] -# W64: v_cmp_nge_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x32,0x7c] 0x7c,0x04,0x32,0x7c +# W32: v_cmp_nge_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x32,0x7c] +# W64: v_cmp_nge_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x32,0x7c] -# W32: v_cmp_nge_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x32,0x7c] -# W64: v_cmp_nge_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x32,0x7c] 0xc1,0x04,0x32,0x7c +# W32: v_cmp_nge_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x32,0x7c] +# W64: v_cmp_nge_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x32,0x7c] -# W32: v_cmp_nge_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x32,0x7c] -# W64: v_cmp_nge_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x32,0x7c] 0xf0,0x04,0x32,0x7c +# W32: v_cmp_nge_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x32,0x7c] +# W64: v_cmp_nge_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x32,0x7c] -# W32: v_cmp_nge_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x32,0x7c] -# W64: v_cmp_nge_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x32,0x7c] 0xfd,0x04,0x32,0x7c +# W32: v_cmp_nge_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x32,0x7c] +# W64: v_cmp_nge_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x32,0x7c] -# W32: v_cmp_nge_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x33,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_nge_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x33,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x33,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_nge_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x33,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_nge_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x33,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_nge_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x52,0x7c] -# W64: v_cmp_nge_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x52,0x7c] 0x01,0x05,0x52,0x7c +# W32: v_cmp_nge_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x52,0x7c] +# W64: v_cmp_nge_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x52,0x7c] +0xfe,0x05,0x52,0x7c # W32: v_cmp_nge_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x52,0x7c] # W64: v_cmp_nge_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x52,0x7c] -0xfe,0x05,0x52,0x7c -# W32: v_cmp_nge_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x52,0x7c] -# W64: v_cmp_nge_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x52,0x7c] 
0x02,0x04,0x52,0x7c +# W32: v_cmp_nge_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x52,0x7c] +# W64: v_cmp_nge_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x52,0x7c] +0x68,0x04,0x52,0x7c # W32: v_cmp_nge_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x52,0x7c] # W64: v_cmp_nge_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x52,0x7c] -0x68,0x04,0x52,0x7c -# W32: v_cmp_nge_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x52,0x7c] -# W64: v_cmp_nge_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x52,0x7c] 0x6a,0x04,0x52,0x7c +# W32: v_cmp_nge_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x52,0x7c] +# W64: v_cmp_nge_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x52,0x7c] +0x7a,0x04,0x52,0x7c # W32: v_cmp_nge_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x52,0x7c] # W64: v_cmp_nge_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x52,0x7c] -0x7a,0x04,0x52,0x7c -# W32: v_cmp_nge_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x52,0x7c] -# W64: v_cmp_nge_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x52,0x7c] 0x7e,0x04,0x52,0x7c +# W32: v_cmp_nge_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x52,0x7c] +# W64: v_cmp_nge_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x52,0x7c] -# W32: v_cmp_nge_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x52,0x7c] -# W64: v_cmp_nge_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x52,0x7c] 0x7c,0x04,0x52,0x7c +# W32: v_cmp_nge_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x52,0x7c] +# W64: v_cmp_nge_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x52,0x7c] -# W32: v_cmp_nge_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x52,0x7c] -# W64: v_cmp_nge_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x52,0x7c] 0xc1,0x04,0x52,0x7c +# W32: v_cmp_nge_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x52,0x7c] +# W64: v_cmp_nge_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x52,0x7c] -# W32: v_cmp_nge_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x52,0x7c] -# W64: v_cmp_nge_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x52,0x7c] 0xf0,0x04,0x52,0x7c +# W32: v_cmp_nge_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x52,0x7c] +# W64: v_cmp_nge_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x52,0x7c] -# W32: v_cmp_nge_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x52,0x7c] -# W64: v_cmp_nge_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x52,0x7c] 0xfd,0x04,0x52,0x7c +# W32: v_cmp_nge_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x52,0x7c] +# W64: v_cmp_nge_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x52,0x7c] +0xff,0xfc,0x53,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_nge_f64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x53,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_nge_f64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x53,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0x53,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_ngt_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x16,0x7c] -# W64: v_cmp_ngt_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x16,0x7c] 0x01,0x05,0x16,0x7c +# W32: v_cmp_ngt_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x16,0x7c] +# W64: v_cmp_ngt_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x16,0x7c] -# W32: v_cmp_ngt_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x16,0x7c] -# W64: v_cmp_ngt_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x16,0x7c] 0x7f,0x05,0x16,0x7c +# W32: v_cmp_ngt_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x16,0x7c] +# W64: v_cmp_ngt_f16_e32 vcc, v127, v2 ; encoding: 
[0x7f,0x05,0x16,0x7c] -# W32: v_cmp_ngt_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x16,0x7c] -# W64: v_cmp_ngt_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x16,0x7c] 0x01,0x04,0x16,0x7c +# W32: v_cmp_ngt_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x16,0x7c] +# W64: v_cmp_ngt_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x16,0x7c] -# W32: v_cmp_ngt_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x16,0x7c] -# W64: v_cmp_ngt_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x16,0x7c] 0x69,0x04,0x16,0x7c +# W32: v_cmp_ngt_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x16,0x7c] +# W64: v_cmp_ngt_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x16,0x7c] -# W32: v_cmp_ngt_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x16,0x7c] -# W64: v_cmp_ngt_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x16,0x7c] 0x6a,0x04,0x16,0x7c +# W32: v_cmp_ngt_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x16,0x7c] +# W64: v_cmp_ngt_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x16,0x7c] -# W32: v_cmp_ngt_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x16,0x7c] -# W64: v_cmp_ngt_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x16,0x7c] 0x6b,0x04,0x16,0x7c +# W32: v_cmp_ngt_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x16,0x7c] +# W64: v_cmp_ngt_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x16,0x7c] -# W32: v_cmp_ngt_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x16,0x7c] -# W64: v_cmp_ngt_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x16,0x7c] 0x7b,0x04,0x16,0x7c +# W32: v_cmp_ngt_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x16,0x7c] +# W64: v_cmp_ngt_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x16,0x7c] -# W32: v_cmp_ngt_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x16,0x7c] -# W64: v_cmp_ngt_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x16,0x7c] 0x7d,0x04,0x16,0x7c +# W32: v_cmp_ngt_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x16,0x7c] +# W64: v_cmp_ngt_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x16,0x7c] -# W32: v_cmp_ngt_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x16,0x7c] -# W64: v_cmp_ngt_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x16,0x7c] 0x7e,0x04,0x16,0x7c +# W32: v_cmp_ngt_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x16,0x7c] +# W64: v_cmp_ngt_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x16,0x7c] -# W32: v_cmp_ngt_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x16,0x7c] -# W64: v_cmp_ngt_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x16,0x7c] 0x7f,0x04,0x16,0x7c +# W32: v_cmp_ngt_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x16,0x7c] +# W64: v_cmp_ngt_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x16,0x7c] -# W32: v_cmp_ngt_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x16,0x7c] -# W64: v_cmp_ngt_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x16,0x7c] 0x7c,0x04,0x16,0x7c +# W32: v_cmp_ngt_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x16,0x7c] +# W64: v_cmp_ngt_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x16,0x7c] -# W32: v_cmp_ngt_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x16,0x7c] -# W64: v_cmp_ngt_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x16,0x7c] 0xc1,0x04,0x16,0x7c +# W32: v_cmp_ngt_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x16,0x7c] +# W64: v_cmp_ngt_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x16,0x7c] -# W32: v_cmp_ngt_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x16,0x7c] -# W64: v_cmp_ngt_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x16,0x7c] 0xf0,0x04,0x16,0x7c +# W32: v_cmp_ngt_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x16,0x7c] +# W64: v_cmp_ngt_f16_e32 vcc, 0.5, v2 ; 
encoding: [0xf0,0x04,0x16,0x7c] -# W32: v_cmp_ngt_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x16,0x7c] -# W64: v_cmp_ngt_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x16,0x7c] 0xfd,0x04,0x16,0x7c +# W32: v_cmp_ngt_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x16,0x7c] +# W64: v_cmp_ngt_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x16,0x7c] -# W32: v_cmp_ngt_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x16,0x7c,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_ngt_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x16,0x7c,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x16,0x7c,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_ngt_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x16,0x7c,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_ngt_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x16,0x7c,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_ngt_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x36,0x7c] -# W64: v_cmp_ngt_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x36,0x7c] 0x01,0x05,0x36,0x7c +# W32: v_cmp_ngt_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x36,0x7c] +# W64: v_cmp_ngt_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x36,0x7c] -# W32: v_cmp_ngt_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x36,0x7c] -# W64: v_cmp_ngt_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x36,0x7c] 0xff,0x05,0x36,0x7c +# W32: v_cmp_ngt_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x36,0x7c] +# W64: v_cmp_ngt_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x36,0x7c] -# W32: v_cmp_ngt_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x36,0x7c] -# W64: v_cmp_ngt_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x36,0x7c] 0x01,0x04,0x36,0x7c +# W32: v_cmp_ngt_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x36,0x7c] +# W64: v_cmp_ngt_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x36,0x7c] -# W32: v_cmp_ngt_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x36,0x7c] -# W64: v_cmp_ngt_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x36,0x7c] 0x69,0x04,0x36,0x7c +# W32: v_cmp_ngt_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x36,0x7c] +# W64: v_cmp_ngt_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x36,0x7c] -# W32: v_cmp_ngt_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x36,0x7c] -# W64: v_cmp_ngt_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x36,0x7c] 0x6a,0x04,0x36,0x7c +# W32: v_cmp_ngt_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x36,0x7c] +# W64: v_cmp_ngt_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x36,0x7c] -# W32: v_cmp_ngt_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x36,0x7c] -# W64: v_cmp_ngt_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x36,0x7c] 0x6b,0x04,0x36,0x7c +# W32: v_cmp_ngt_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x36,0x7c] +# W64: v_cmp_ngt_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x36,0x7c] -# W32: v_cmp_ngt_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x36,0x7c] -# W64: v_cmp_ngt_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x36,0x7c] 0x7b,0x04,0x36,0x7c +# W32: v_cmp_ngt_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x36,0x7c] +# W64: v_cmp_ngt_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x36,0x7c] -# W32: v_cmp_ngt_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x36,0x7c] -# W64: v_cmp_ngt_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x36,0x7c] 0x7d,0x04,0x36,0x7c +# W32: v_cmp_ngt_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x36,0x7c] +# W64: v_cmp_ngt_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x36,0x7c] -# W32: v_cmp_ngt_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x36,0x7c] -# W64: v_cmp_ngt_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x36,0x7c] 
 0x7e,0x04,0x36,0x7c
+# W32: v_cmp_ngt_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x36,0x7c]
+# W64: v_cmp_ngt_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x36,0x7c]

-# W32: v_cmp_ngt_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x36,0x7c]
-# W64: v_cmp_ngt_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x36,0x7c]
 0x7f,0x04,0x36,0x7c
+# W32: v_cmp_ngt_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x36,0x7c]
+# W64: v_cmp_ngt_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x36,0x7c]

-# W32: v_cmp_ngt_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x36,0x7c]
-# W64: v_cmp_ngt_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x36,0x7c]
 0x7c,0x04,0x36,0x7c
+# W32: v_cmp_ngt_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x36,0x7c]
+# W64: v_cmp_ngt_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x36,0x7c]

-# W32: v_cmp_ngt_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x36,0x7c]
-# W64: v_cmp_ngt_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x36,0x7c]
 0xc1,0x04,0x36,0x7c
+# W32: v_cmp_ngt_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x36,0x7c]
+# W64: v_cmp_ngt_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x36,0x7c]

-# W32: v_cmp_ngt_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x36,0x7c]
-# W64: v_cmp_ngt_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x36,0x7c]
 0xf0,0x04,0x36,0x7c
+# W32: v_cmp_ngt_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x36,0x7c]
+# W64: v_cmp_ngt_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x36,0x7c]

-# W32: v_cmp_ngt_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x36,0x7c]
-# W64: v_cmp_ngt_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x36,0x7c]
 0xfd,0x04,0x36,0x7c
+# W32: v_cmp_ngt_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x36,0x7c]
+# W64: v_cmp_ngt_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x36,0x7c]

-# W32: v_cmp_ngt_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x37,0x7c,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_ngt_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x37,0x7c,0x56,0x34,0x12,0xaf]
 0xff,0xfe,0x37,0x7c,0x56,0x34,0x12,0xaf
+# W32: v_cmp_ngt_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x37,0x7c,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_ngt_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x37,0x7c,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_ngt_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x56,0x7c]
-# W64: v_cmp_ngt_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x56,0x7c]
 0x01,0x05,0x56,0x7c
+# W32: v_cmp_ngt_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x56,0x7c]
+# W64: v_cmp_ngt_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x56,0x7c]

+0xfe,0x05,0x56,0x7c
 # W32: v_cmp_ngt_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x56,0x7c]
 # W64: v_cmp_ngt_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x56,0x7c]
-0xfe,0x05,0x56,0x7c

-# W32: v_cmp_ngt_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x56,0x7c]
-# W64: v_cmp_ngt_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x56,0x7c]
 0x02,0x04,0x56,0x7c
+# W32: v_cmp_ngt_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x56,0x7c]
+# W64: v_cmp_ngt_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x56,0x7c]

+0x68,0x04,0x56,0x7c
 # W32: v_cmp_ngt_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x56,0x7c]
 # W64: v_cmp_ngt_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x56,0x7c]
-0x68,0x04,0x56,0x7c

-# W32: v_cmp_ngt_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x56,0x7c]
-# W64: v_cmp_ngt_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x56,0x7c]
 0x6a,0x04,0x56,0x7c
+# W32: v_cmp_ngt_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x56,0x7c]
+# W64: v_cmp_ngt_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x56,0x7c]

+0x7a,0x04,0x56,0x7c
 # W32: v_cmp_ngt_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x56,0x7c]
 # W64: v_cmp_ngt_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x56,0x7c]
-0x7a,0x04,0x56,0x7c

-# W32: v_cmp_ngt_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x56,0x7c]
-# W64: v_cmp_ngt_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x56,0x7c]
 0x7e,0x04,0x56,0x7c
+# W32: v_cmp_ngt_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x56,0x7c]
+# W64: v_cmp_ngt_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x56,0x7c]

-# W32: v_cmp_ngt_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x56,0x7c]
-# W64: v_cmp_ngt_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x56,0x7c]
 0x7c,0x04,0x56,0x7c
+# W32: v_cmp_ngt_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x56,0x7c]
+# W64: v_cmp_ngt_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x56,0x7c]

-# W32: v_cmp_ngt_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x56,0x7c]
-# W64: v_cmp_ngt_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x56,0x7c]
 0xc1,0x04,0x56,0x7c
+# W32: v_cmp_ngt_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x56,0x7c]
+# W64: v_cmp_ngt_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x56,0x7c]

-# W32: v_cmp_ngt_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x56,0x7c]
-# W64: v_cmp_ngt_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x56,0x7c]
 0xf0,0x04,0x56,0x7c
+# W32: v_cmp_ngt_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x56,0x7c]
+# W64: v_cmp_ngt_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x56,0x7c]

-# W32: v_cmp_ngt_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x56,0x7c]
-# W64: v_cmp_ngt_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x56,0x7c]
 0xfd,0x04,0x56,0x7c
+# W32: v_cmp_ngt_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x56,0x7c]
+# W64: v_cmp_ngt_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x56,0x7c]

+0xff,0xfc,0x57,0x7c,0x56,0x34,0x12,0xaf
 # W32: v_cmp_ngt_f64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x57,0x7c,0x56,0x34,0x12,0xaf]
 # W64: v_cmp_ngt_f64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x57,0x7c,0x56,0x34,0x12,0xaf]
-0xff,0xfc,0x57,0x7c,0x56,0x34,0x12,0xaf

-# W32: v_cmp_nle_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x18,0x7c]
-# W64: v_cmp_nle_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x18,0x7c]
 0x01,0x05,0x18,0x7c
+# W32: v_cmp_nle_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x18,0x7c]
+# W64: v_cmp_nle_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x18,0x7c]

-# W32: v_cmp_nle_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x18,0x7c]
-# W64: v_cmp_nle_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x18,0x7c]
 0x7f,0x05,0x18,0x7c
+# W32: v_cmp_nle_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x18,0x7c]
+# W64: v_cmp_nle_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x18,0x7c]

-# W32: v_cmp_nle_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x18,0x7c]
-# W64: v_cmp_nle_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x18,0x7c]
 0x01,0x04,0x18,0x7c
+# W32: v_cmp_nle_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x18,0x7c]
+# W64: v_cmp_nle_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x18,0x7c]

-# W32: v_cmp_nle_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x18,0x7c]
-# W64: v_cmp_nle_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x18,0x7c]
 0x69,0x04,0x18,0x7c
+# W32: v_cmp_nle_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x18,0x7c]
+# W64: v_cmp_nle_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x18,0x7c]
-# W32: v_cmp_nle_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x18,0x7c]
-# W64: v_cmp_nle_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x18,0x7c]
 0x6a,0x04,0x18,0x7c
+# W32: v_cmp_nle_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x18,0x7c]
+# W64: v_cmp_nle_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x18,0x7c]

-# W32: v_cmp_nle_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x18,0x7c]
-# W64: v_cmp_nle_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x18,0x7c]
 0x6b,0x04,0x18,0x7c
+# W32: v_cmp_nle_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x18,0x7c]
+# W64: v_cmp_nle_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x18,0x7c]

-# W32: v_cmp_nle_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x18,0x7c]
-# W64: v_cmp_nle_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x18,0x7c]
 0x7b,0x04,0x18,0x7c
+# W32: v_cmp_nle_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x18,0x7c]
+# W64: v_cmp_nle_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x18,0x7c]

-# W32: v_cmp_nle_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x18,0x7c]
-# W64: v_cmp_nle_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x18,0x7c]
 0x7d,0x04,0x18,0x7c
+# W32: v_cmp_nle_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x18,0x7c]
+# W64: v_cmp_nle_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x18,0x7c]

-# W32: v_cmp_nle_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x18,0x7c]
-# W64: v_cmp_nle_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x18,0x7c]
 0x7e,0x04,0x18,0x7c
+# W32: v_cmp_nle_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x18,0x7c]
+# W64: v_cmp_nle_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x18,0x7c]

-# W32: v_cmp_nle_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x18,0x7c]
-# W64: v_cmp_nle_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x18,0x7c]
 0x7f,0x04,0x18,0x7c
+# W32: v_cmp_nle_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x18,0x7c]
+# W64: v_cmp_nle_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x18,0x7c]

-# W32: v_cmp_nle_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x18,0x7c]
-# W64: v_cmp_nle_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x18,0x7c]
 0x7c,0x04,0x18,0x7c
+# W32: v_cmp_nle_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x18,0x7c]
+# W64: v_cmp_nle_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x18,0x7c]

-# W32: v_cmp_nle_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x18,0x7c]
-# W64: v_cmp_nle_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x18,0x7c]
 0xc1,0x04,0x18,0x7c
+# W32: v_cmp_nle_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x18,0x7c]
+# W64: v_cmp_nle_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x18,0x7c]

-# W32: v_cmp_nle_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x18,0x7c]
-# W64: v_cmp_nle_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x18,0x7c]
 0xf0,0x04,0x18,0x7c
+# W32: v_cmp_nle_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x18,0x7c]
+# W64: v_cmp_nle_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x18,0x7c]

-# W32: v_cmp_nle_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x18,0x7c]
-# W64: v_cmp_nle_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x18,0x7c]
 0xfd,0x04,0x18,0x7c
+# W32: v_cmp_nle_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x18,0x7c]
+# W64: v_cmp_nle_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x18,0x7c]

-# W32: v_cmp_nle_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x18,0x7c,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_nle_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x18,0x7c,0x0b,0xfe,0x00,0x00]
 0xff,0xfe,0x18,0x7c,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_nle_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x18,0x7c,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_nle_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x18,0x7c,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_nle_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x38,0x7c]
-# W64: v_cmp_nle_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x38,0x7c]
 0x01,0x05,0x38,0x7c
+# W32: v_cmp_nle_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x38,0x7c]
+# W64: v_cmp_nle_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x38,0x7c]

-# W32: v_cmp_nle_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x38,0x7c]
-# W64: v_cmp_nle_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x38,0x7c]
 0xff,0x05,0x38,0x7c
+# W32: v_cmp_nle_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x38,0x7c]
+# W64: v_cmp_nle_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x38,0x7c]

-# W32: v_cmp_nle_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x38,0x7c]
-# W64: v_cmp_nle_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x38,0x7c]
 0x01,0x04,0x38,0x7c
+# W32: v_cmp_nle_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x38,0x7c]
+# W64: v_cmp_nle_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x38,0x7c]

-# W32: v_cmp_nle_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x38,0x7c]
-# W64: v_cmp_nle_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x38,0x7c]
 0x69,0x04,0x38,0x7c
+# W32: v_cmp_nle_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x38,0x7c]
+# W64: v_cmp_nle_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x38,0x7c]

-# W32: v_cmp_nle_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x38,0x7c]
-# W64: v_cmp_nle_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x38,0x7c]
 0x6a,0x04,0x38,0x7c
+# W32: v_cmp_nle_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x38,0x7c]
+# W64: v_cmp_nle_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x38,0x7c]

-# W32: v_cmp_nle_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x38,0x7c]
-# W64: v_cmp_nle_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x38,0x7c]
 0x6b,0x04,0x38,0x7c
+# W32: v_cmp_nle_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x38,0x7c]
+# W64: v_cmp_nle_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x38,0x7c]

-# W32: v_cmp_nle_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x38,0x7c]
-# W64: v_cmp_nle_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x38,0x7c]
 0x7b,0x04,0x38,0x7c
+# W32: v_cmp_nle_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x38,0x7c]
+# W64: v_cmp_nle_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x38,0x7c]

-# W32: v_cmp_nle_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x38,0x7c]
-# W64: v_cmp_nle_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x38,0x7c]
 0x7d,0x04,0x38,0x7c
+# W32: v_cmp_nle_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x38,0x7c]
+# W64: v_cmp_nle_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x38,0x7c]

-# W32: v_cmp_nle_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x38,0x7c]
-# W64: v_cmp_nle_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x38,0x7c]
 0x7e,0x04,0x38,0x7c
+# W32: v_cmp_nle_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x38,0x7c]
+# W64: v_cmp_nle_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x38,0x7c]

-# W32: v_cmp_nle_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x38,0x7c]
-# W64: v_cmp_nle_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x38,0x7c]
 0x7f,0x04,0x38,0x7c
+# W32: v_cmp_nle_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x38,0x7c]
+# W64: v_cmp_nle_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x38,0x7c]

-# W32: v_cmp_nle_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x38,0x7c]
-# W64: v_cmp_nle_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x38,0x7c]
 0x7c,0x04,0x38,0x7c
+# W32: v_cmp_nle_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x38,0x7c]
+# W64: v_cmp_nle_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x38,0x7c]

-# W32: v_cmp_nle_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x38,0x7c]
-# W64: v_cmp_nle_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x38,0x7c]
 0xc1,0x04,0x38,0x7c
+# W32: v_cmp_nle_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x38,0x7c]
+# W64: v_cmp_nle_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x38,0x7c]

-# W32: v_cmp_nle_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x38,0x7c]
-# W64: v_cmp_nle_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x38,0x7c]
 0xf0,0x04,0x38,0x7c
+# W32: v_cmp_nle_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x38,0x7c]
+# W64: v_cmp_nle_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x38,0x7c]

-# W32: v_cmp_nle_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x38,0x7c]
-# W64: v_cmp_nle_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x38,0x7c]
 0xfd,0x04,0x38,0x7c
+# W32: v_cmp_nle_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x38,0x7c]
+# W64: v_cmp_nle_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x38,0x7c]

-# W32: v_cmp_nle_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x39,0x7c,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_nle_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x39,0x7c,0x56,0x34,0x12,0xaf]
 0xff,0xfe,0x39,0x7c,0x56,0x34,0x12,0xaf
+# W32: v_cmp_nle_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x39,0x7c,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_nle_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x39,0x7c,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_nle_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x58,0x7c]
-# W64: v_cmp_nle_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x58,0x7c]
 0x01,0x05,0x58,0x7c
+# W32: v_cmp_nle_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x58,0x7c]
+# W64: v_cmp_nle_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x58,0x7c]

+0xfe,0x05,0x58,0x7c
 # W32: v_cmp_nle_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x58,0x7c]
 # W64: v_cmp_nle_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x58,0x7c]
-0xfe,0x05,0x58,0x7c

-# W32: v_cmp_nle_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x58,0x7c]
-# W64: v_cmp_nle_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x58,0x7c]
 0x02,0x04,0x58,0x7c
+# W32: v_cmp_nle_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x58,0x7c]
+# W64: v_cmp_nle_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x58,0x7c]

+0x68,0x04,0x58,0x7c
 # W32: v_cmp_nle_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x58,0x7c]
 # W64: v_cmp_nle_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x58,0x7c]
-0x68,0x04,0x58,0x7c

-# W32: v_cmp_nle_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x58,0x7c]
-# W64: v_cmp_nle_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x58,0x7c]
 0x6a,0x04,0x58,0x7c
+# W32: v_cmp_nle_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x58,0x7c]
+# W64: v_cmp_nle_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x58,0x7c]

+0x7a,0x04,0x58,0x7c
 # W32: v_cmp_nle_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x58,0x7c]
 # W64: v_cmp_nle_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x58,0x7c]
-0x7a,0x04,0x58,0x7c

-# W32: v_cmp_nle_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x58,0x7c]
-# W64: v_cmp_nle_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x58,0x7c]
 0x7e,0x04,0x58,0x7c
+# W32: v_cmp_nle_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x58,0x7c]
+# W64: v_cmp_nle_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x58,0x7c]

-# W32: v_cmp_nle_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x58,0x7c]
-# W64: v_cmp_nle_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x58,0x7c]
 0x7c,0x04,0x58,0x7c
+# W32: v_cmp_nle_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x58,0x7c]
+# W64: v_cmp_nle_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x58,0x7c]

-# W32: v_cmp_nle_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x58,0x7c]
-# W64: v_cmp_nle_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x58,0x7c]
 0xc1,0x04,0x58,0x7c
+# W32: v_cmp_nle_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x58,0x7c]
+# W64: v_cmp_nle_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x58,0x7c]

-# W32: v_cmp_nle_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x58,0x7c]
-# W64: v_cmp_nle_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x58,0x7c]
 0xf0,0x04,0x58,0x7c
+# W32: v_cmp_nle_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x58,0x7c]
+# W64: v_cmp_nle_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x58,0x7c]

-# W32: v_cmp_nle_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x58,0x7c]
-# W64: v_cmp_nle_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x58,0x7c]
 0xfd,0x04,0x58,0x7c
+# W32: v_cmp_nle_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x58,0x7c]
+# W64: v_cmp_nle_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x58,0x7c]

+0xff,0xfc,0x59,0x7c,0x56,0x34,0x12,0xaf
 # W32: v_cmp_nle_f64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x59,0x7c,0x56,0x34,0x12,0xaf]
 # W64: v_cmp_nle_f64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x59,0x7c,0x56,0x34,0x12,0xaf]
-0xff,0xfc,0x59,0x7c,0x56,0x34,0x12,0xaf

-# W32: v_cmp_nlg_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x14,0x7c]
-# W64: v_cmp_nlg_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x14,0x7c]
 0x01,0x05,0x14,0x7c
+# W32: v_cmp_nlg_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x14,0x7c]
+# W64: v_cmp_nlg_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x14,0x7c]

-# W32: v_cmp_nlg_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x14,0x7c]
-# W64: v_cmp_nlg_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x14,0x7c]
 0x7f,0x05,0x14,0x7c
+# W32: v_cmp_nlg_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x14,0x7c]
+# W64: v_cmp_nlg_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x14,0x7c]

-# W32: v_cmp_nlg_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x14,0x7c]
-# W64: v_cmp_nlg_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x14,0x7c]
 0x01,0x04,0x14,0x7c
+# W32: v_cmp_nlg_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x14,0x7c]
+# W64: v_cmp_nlg_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x14,0x7c]

-# W32: v_cmp_nlg_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x14,0x7c]
-# W64: v_cmp_nlg_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x14,0x7c]
 0x69,0x04,0x14,0x7c
+# W32: v_cmp_nlg_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x14,0x7c]
+# W64: v_cmp_nlg_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x14,0x7c]

-# W32: v_cmp_nlg_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x14,0x7c]
-# W64: v_cmp_nlg_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x14,0x7c]
 0x6a,0x04,0x14,0x7c
+# W32: v_cmp_nlg_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x14,0x7c]
+# W64: v_cmp_nlg_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x14,0x7c]

-# W32: v_cmp_nlg_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x14,0x7c]
-# W64: v_cmp_nlg_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x14,0x7c]
 0x6b,0x04,0x14,0x7c
+# W32: v_cmp_nlg_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x14,0x7c]
+# W64: v_cmp_nlg_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x14,0x7c]

-# W32: v_cmp_nlg_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x14,0x7c]
-# W64: v_cmp_nlg_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x14,0x7c]
 0x7b,0x04,0x14,0x7c
+# W32: v_cmp_nlg_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x14,0x7c]
+# W64: v_cmp_nlg_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x14,0x7c]

-# W32: v_cmp_nlg_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x14,0x7c]
-# W64: v_cmp_nlg_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x14,0x7c]
 0x7d,0x04,0x14,0x7c
+# W32: v_cmp_nlg_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x14,0x7c]
+# W64: v_cmp_nlg_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x14,0x7c]

-# W32: v_cmp_nlg_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x14,0x7c]
-# W64: v_cmp_nlg_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x14,0x7c]
 0x7e,0x04,0x14,0x7c
+# W32: v_cmp_nlg_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x14,0x7c]
+# W64: v_cmp_nlg_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x14,0x7c]

-# W32: v_cmp_nlg_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x14,0x7c]
-# W64: v_cmp_nlg_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x14,0x7c]
 0x7f,0x04,0x14,0x7c
+# W32: v_cmp_nlg_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x14,0x7c]
+# W64: v_cmp_nlg_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x14,0x7c]

-# W32: v_cmp_nlg_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x14,0x7c]
-# W64: v_cmp_nlg_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x14,0x7c]
 0x7c,0x04,0x14,0x7c
+# W32: v_cmp_nlg_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x14,0x7c]
+# W64: v_cmp_nlg_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x14,0x7c]

-# W32: v_cmp_nlg_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x14,0x7c]
-# W64: v_cmp_nlg_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x14,0x7c]
 0xc1,0x04,0x14,0x7c
+# W32: v_cmp_nlg_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x14,0x7c]
+# W64: v_cmp_nlg_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x14,0x7c]

-# W32: v_cmp_nlg_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x14,0x7c]
-# W64: v_cmp_nlg_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x14,0x7c]
 0xf0,0x04,0x14,0x7c
+# W32: v_cmp_nlg_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x14,0x7c]
+# W64: v_cmp_nlg_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x14,0x7c]

-# W32: v_cmp_nlg_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x14,0x7c]
-# W64: v_cmp_nlg_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x14,0x7c]
 0xfd,0x04,0x14,0x7c
+# W32: v_cmp_nlg_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x14,0x7c]
+# W64: v_cmp_nlg_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x14,0x7c]

-# W32: v_cmp_nlg_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x14,0x7c,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_nlg_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x14,0x7c,0x0b,0xfe,0x00,0x00]
 0xff,0xfe,0x14,0x7c,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_nlg_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x14,0x7c,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_nlg_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x14,0x7c,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_nlg_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x34,0x7c]
-# W64: v_cmp_nlg_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x34,0x7c]
 0x01,0x05,0x34,0x7c
+# W32: v_cmp_nlg_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x34,0x7c]
+# W64: v_cmp_nlg_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x34,0x7c]

-# W32: v_cmp_nlg_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x34,0x7c]
-# W64: v_cmp_nlg_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x34,0x7c]
 0xff,0x05,0x34,0x7c
+# W32: v_cmp_nlg_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x34,0x7c]
+# W64: v_cmp_nlg_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x34,0x7c]

-# W32: v_cmp_nlg_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x34,0x7c]
-# W64: v_cmp_nlg_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x34,0x7c]
 0x01,0x04,0x34,0x7c
+# W32: v_cmp_nlg_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x34,0x7c]
+# W64: v_cmp_nlg_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x34,0x7c]

-# W32: v_cmp_nlg_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x34,0x7c]
-# W64: v_cmp_nlg_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x34,0x7c]
 0x69,0x04,0x34,0x7c
+# W32: v_cmp_nlg_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x34,0x7c]
+# W64: v_cmp_nlg_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x34,0x7c]

-# W32: v_cmp_nlg_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x34,0x7c]
-# W64: v_cmp_nlg_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x34,0x7c]
 0x6a,0x04,0x34,0x7c
+# W32: v_cmp_nlg_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x34,0x7c]
+# W64: v_cmp_nlg_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x34,0x7c]

-# W32: v_cmp_nlg_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x34,0x7c]
-# W64: v_cmp_nlg_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x34,0x7c]
 0x6b,0x04,0x34,0x7c
+# W32: v_cmp_nlg_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x34,0x7c]
+# W64: v_cmp_nlg_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x34,0x7c]

-# W32: v_cmp_nlg_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x34,0x7c]
-# W64: v_cmp_nlg_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x34,0x7c]
 0x7b,0x04,0x34,0x7c
+# W32: v_cmp_nlg_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x34,0x7c]
+# W64: v_cmp_nlg_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x34,0x7c]

-# W32: v_cmp_nlg_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x34,0x7c]
-# W64: v_cmp_nlg_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x34,0x7c]
 0x7d,0x04,0x34,0x7c
+# W32: v_cmp_nlg_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x34,0x7c]
+# W64: v_cmp_nlg_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x34,0x7c]

-# W32: v_cmp_nlg_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x34,0x7c]
-# W64: v_cmp_nlg_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x34,0x7c]
 0x7e,0x04,0x34,0x7c
+# W32: v_cmp_nlg_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x34,0x7c]
+# W64: v_cmp_nlg_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x34,0x7c]

-# W32: v_cmp_nlg_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x34,0x7c]
-# W64: v_cmp_nlg_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x34,0x7c]
 0x7f,0x04,0x34,0x7c
+# W32: v_cmp_nlg_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x34,0x7c]
+# W64: v_cmp_nlg_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x34,0x7c]

-# W32: v_cmp_nlg_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x34,0x7c]
-# W64: v_cmp_nlg_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x34,0x7c]
 0x7c,0x04,0x34,0x7c
+# W32: v_cmp_nlg_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x34,0x7c]
+# W64: v_cmp_nlg_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x34,0x7c]

-# W32: v_cmp_nlg_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x34,0x7c]
-# W64: v_cmp_nlg_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x34,0x7c]
 0xc1,0x04,0x34,0x7c
+# W32: v_cmp_nlg_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x34,0x7c]
+# W64: v_cmp_nlg_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x34,0x7c]

-# W32: v_cmp_nlg_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x34,0x7c]
-# W64: v_cmp_nlg_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x34,0x7c]
 0xf0,0x04,0x34,0x7c
+# W32: v_cmp_nlg_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x34,0x7c]
+# W64: v_cmp_nlg_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x34,0x7c]

-# W32: v_cmp_nlg_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x34,0x7c]
-# W64: v_cmp_nlg_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x34,0x7c]
 0xfd,0x04,0x34,0x7c
+# W32: v_cmp_nlg_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x34,0x7c]
+# W64: v_cmp_nlg_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x34,0x7c]

-# W32: v_cmp_nlg_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x35,0x7c,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_nlg_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x35,0x7c,0x56,0x34,0x12,0xaf]
 0xff,0xfe,0x35,0x7c,0x56,0x34,0x12,0xaf
+# W32: v_cmp_nlg_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x35,0x7c,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_nlg_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x35,0x7c,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_nlg_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x54,0x7c]
-# W64: v_cmp_nlg_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x54,0x7c]
 0x01,0x05,0x54,0x7c
+# W32: v_cmp_nlg_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x54,0x7c]
+# W64: v_cmp_nlg_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x54,0x7c]

+0xfe,0x05,0x54,0x7c
 # W32: v_cmp_nlg_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x54,0x7c]
 # W64: v_cmp_nlg_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x54,0x7c]
-0xfe,0x05,0x54,0x7c

-# W32: v_cmp_nlg_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x54,0x7c]
-# W64: v_cmp_nlg_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x54,0x7c]
 0x02,0x04,0x54,0x7c
+# W32: v_cmp_nlg_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x54,0x7c]
+# W64: v_cmp_nlg_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x54,0x7c]

+0x68,0x04,0x54,0x7c
 # W32: v_cmp_nlg_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x54,0x7c]
 # W64: v_cmp_nlg_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x54,0x7c]
-0x68,0x04,0x54,0x7c

-# W32: v_cmp_nlg_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x54,0x7c]
-# W64: v_cmp_nlg_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x54,0x7c]
 0x6a,0x04,0x54,0x7c
+# W32: v_cmp_nlg_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x54,0x7c]
+# W64: v_cmp_nlg_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x54,0x7c]

+0x7a,0x04,0x54,0x7c
 # W32: v_cmp_nlg_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x54,0x7c]
 # W64: v_cmp_nlg_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x54,0x7c]
-0x7a,0x04,0x54,0x7c

-# W32: v_cmp_nlg_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x54,0x7c]
-# W64: v_cmp_nlg_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x54,0x7c]
 0x7e,0x04,0x54,0x7c
+# W32: v_cmp_nlg_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x54,0x7c]
+# W64: v_cmp_nlg_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x54,0x7c]

-# W32: v_cmp_nlg_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x54,0x7c]
-# W64: v_cmp_nlg_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x54,0x7c]
 0x7c,0x04,0x54,0x7c
+# W32: v_cmp_nlg_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x54,0x7c]
+# W64: v_cmp_nlg_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x54,0x7c]

-# W32: v_cmp_nlg_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x54,0x7c]
-# W64: v_cmp_nlg_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x54,0x7c]
 0xc1,0x04,0x54,0x7c
+# W32: v_cmp_nlg_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x54,0x7c]
+# W64: v_cmp_nlg_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x54,0x7c]

-# W32: v_cmp_nlg_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x54,0x7c]
-# W64: v_cmp_nlg_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x54,0x7c]
 0xf0,0x04,0x54,0x7c
+# W32: v_cmp_nlg_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x54,0x7c]
+# W64: v_cmp_nlg_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x54,0x7c]

-# W32: v_cmp_nlg_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x54,0x7c]
-# W64: v_cmp_nlg_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x54,0x7c]
 0xfd,0x04,0x54,0x7c
+# W32: v_cmp_nlg_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x54,0x7c]
+# W64: v_cmp_nlg_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x54,0x7c]

+0xff,0xfc,0x55,0x7c,0x56,0x34,0x12,0xaf
 # W32: v_cmp_nlg_f64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x55,0x7c,0x56,0x34,0x12,0xaf]
 # W64: v_cmp_nlg_f64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x55,0x7c,0x56,0x34,0x12,0xaf]
-0xff,0xfc,0x55,0x7c,0x56,0x34,0x12,0xaf

-# W32: v_cmp_nlt_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x1c,0x7c]
-# W64: v_cmp_nlt_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x1c,0x7c]
 0x01,0x05,0x1c,0x7c
+# W32: v_cmp_nlt_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x1c,0x7c]
+# W64: v_cmp_nlt_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x1c,0x7c]

-# W32: v_cmp_nlt_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x1c,0x7c]
-# W64: v_cmp_nlt_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x1c,0x7c]
 0x7f,0x05,0x1c,0x7c
+# W32: v_cmp_nlt_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x1c,0x7c]
+# W64: v_cmp_nlt_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x1c,0x7c]

-# W32: v_cmp_nlt_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x1c,0x7c]
-# W64: v_cmp_nlt_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x1c,0x7c]
 0x01,0x04,0x1c,0x7c
+# W32: v_cmp_nlt_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x1c,0x7c]
+# W64: v_cmp_nlt_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x1c,0x7c]

-# W32: v_cmp_nlt_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x1c,0x7c]
-# W64: v_cmp_nlt_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x1c,0x7c]
 0x69,0x04,0x1c,0x7c
+# W32: v_cmp_nlt_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x1c,0x7c]
+# W64: v_cmp_nlt_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x1c,0x7c]

-# W32: v_cmp_nlt_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x1c,0x7c]
-# W64: v_cmp_nlt_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x1c,0x7c]
 0x6a,0x04,0x1c,0x7c
+# W32: v_cmp_nlt_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x1c,0x7c]
+# W64: v_cmp_nlt_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x1c,0x7c]

-# W32: v_cmp_nlt_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x1c,0x7c]
-# W64: v_cmp_nlt_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x1c,0x7c]
 0x6b,0x04,0x1c,0x7c
+# W32: v_cmp_nlt_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x1c,0x7c]
+# W64: v_cmp_nlt_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x1c,0x7c]

-# W32: v_cmp_nlt_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x1c,0x7c]
-# W64: v_cmp_nlt_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x1c,0x7c]
 0x7b,0x04,0x1c,0x7c
+# W32: v_cmp_nlt_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x1c,0x7c]
+# W64: v_cmp_nlt_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x1c,0x7c]

-# W32: v_cmp_nlt_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x1c,0x7c]
-# W64: v_cmp_nlt_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x1c,0x7c]
 0x7d,0x04,0x1c,0x7c
+# W32: v_cmp_nlt_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x1c,0x7c]
+# W64: v_cmp_nlt_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x1c,0x7c]

-# W32: v_cmp_nlt_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x1c,0x7c]
-# W64: v_cmp_nlt_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x1c,0x7c]
 0x7e,0x04,0x1c,0x7c
+# W32: v_cmp_nlt_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x1c,0x7c]
+# W64: v_cmp_nlt_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x1c,0x7c]

-# W32: v_cmp_nlt_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x1c,0x7c]
-# W64: v_cmp_nlt_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x1c,0x7c]
 0x7f,0x04,0x1c,0x7c
+# W32: v_cmp_nlt_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x1c,0x7c]
+# W64: v_cmp_nlt_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x1c,0x7c]

-# W32: v_cmp_nlt_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x1c,0x7c]
-# W64: v_cmp_nlt_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x1c,0x7c]
 0x7c,0x04,0x1c,0x7c
+# W32: v_cmp_nlt_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x1c,0x7c]
+# W64: v_cmp_nlt_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x1c,0x7c]

-# W32: v_cmp_nlt_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x1c,0x7c]
-# W64: v_cmp_nlt_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x1c,0x7c]
 0xc1,0x04,0x1c,0x7c
+# W32: v_cmp_nlt_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x1c,0x7c]
+# W64: v_cmp_nlt_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x1c,0x7c]

-# W32: v_cmp_nlt_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x1c,0x7c]
-# W64: v_cmp_nlt_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x1c,0x7c]
 0xf0,0x04,0x1c,0x7c
+# W32: v_cmp_nlt_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x1c,0x7c]
+# W64: v_cmp_nlt_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x1c,0x7c]

-# W32: v_cmp_nlt_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x1c,0x7c]
-# W64: v_cmp_nlt_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x1c,0x7c]
 0xfd,0x04,0x1c,0x7c
+# W32: v_cmp_nlt_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x1c,0x7c]
+# W64: v_cmp_nlt_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x1c,0x7c]

-# W32: v_cmp_nlt_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1c,0x7c,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_nlt_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1c,0x7c,0x0b,0xfe,0x00,0x00]
 0xff,0xfe,0x1c,0x7c,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_nlt_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1c,0x7c,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_nlt_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1c,0x7c,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_nlt_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x3c,0x7c]
-# W64: v_cmp_nlt_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x3c,0x7c]
 0x01,0x05,0x3c,0x7c
+# W32: v_cmp_nlt_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x3c,0x7c]
+# W64: v_cmp_nlt_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x3c,0x7c]

-# W32: v_cmp_nlt_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x3c,0x7c]
-# W64: v_cmp_nlt_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x3c,0x7c]
 0xff,0x05,0x3c,0x7c
+# W32: v_cmp_nlt_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x3c,0x7c]
+# W64: v_cmp_nlt_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x3c,0x7c]

-# W32: v_cmp_nlt_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x3c,0x7c]
-# W64: v_cmp_nlt_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x3c,0x7c]
 0x01,0x04,0x3c,0x7c
+# W32: v_cmp_nlt_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x3c,0x7c]
+# W64: v_cmp_nlt_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x3c,0x7c]
-# W32: v_cmp_nlt_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x3c,0x7c]
-# W64: v_cmp_nlt_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x3c,0x7c]
 0x69,0x04,0x3c,0x7c
+# W32: v_cmp_nlt_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x3c,0x7c]
+# W64: v_cmp_nlt_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x3c,0x7c]

-# W32: v_cmp_nlt_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x3c,0x7c]
-# W64: v_cmp_nlt_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x3c,0x7c]
 0x6a,0x04,0x3c,0x7c
+# W32: v_cmp_nlt_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x3c,0x7c]
+# W64: v_cmp_nlt_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x3c,0x7c]

-# W32: v_cmp_nlt_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x3c,0x7c]
-# W64: v_cmp_nlt_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x3c,0x7c]
 0x6b,0x04,0x3c,0x7c
+# W32: v_cmp_nlt_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x3c,0x7c]
+# W64: v_cmp_nlt_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x3c,0x7c]

-# W32: v_cmp_nlt_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x3c,0x7c]
-# W64: v_cmp_nlt_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x3c,0x7c]
 0x7b,0x04,0x3c,0x7c
+# W32: v_cmp_nlt_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x3c,0x7c]
+# W64: v_cmp_nlt_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x3c,0x7c]

-# W32: v_cmp_nlt_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x3c,0x7c]
-# W64: v_cmp_nlt_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x3c,0x7c]
 0x7d,0x04,0x3c,0x7c
+# W32: v_cmp_nlt_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x3c,0x7c]
+# W64: v_cmp_nlt_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x3c,0x7c]

-# W32: v_cmp_nlt_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x3c,0x7c]
-# W64: v_cmp_nlt_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x3c,0x7c]
 0x7e,0x04,0x3c,0x7c
+# W32: v_cmp_nlt_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x3c,0x7c]
+# W64: v_cmp_nlt_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x3c,0x7c]

-# W32: v_cmp_nlt_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x3c,0x7c]
-# W64: v_cmp_nlt_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x3c,0x7c]
 0x7f,0x04,0x3c,0x7c
+# W32: v_cmp_nlt_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x3c,0x7c]
+# W64: v_cmp_nlt_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x3c,0x7c]

-# W32: v_cmp_nlt_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x3c,0x7c]
-# W64: v_cmp_nlt_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x3c,0x7c]
 0x7c,0x04,0x3c,0x7c
+# W32: v_cmp_nlt_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x3c,0x7c]
+# W64: v_cmp_nlt_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x3c,0x7c]

-# W32: v_cmp_nlt_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x3c,0x7c]
-# W64: v_cmp_nlt_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x3c,0x7c]
 0xc1,0x04,0x3c,0x7c
+# W32: v_cmp_nlt_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x3c,0x7c]
+# W64: v_cmp_nlt_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x3c,0x7c]

-# W32: v_cmp_nlt_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x3c,0x7c]
-# W64: v_cmp_nlt_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x3c,0x7c]
 0xf0,0x04,0x3c,0x7c
+# W32: v_cmp_nlt_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x3c,0x7c]
+# W64: v_cmp_nlt_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x3c,0x7c]

-# W32: v_cmp_nlt_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x3c,0x7c]
-# W64: v_cmp_nlt_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x3c,0x7c]
 0xfd,0x04,0x3c,0x7c
+# W32: v_cmp_nlt_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x3c,0x7c]
+# W64: v_cmp_nlt_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x3c,0x7c]
-# W32: v_cmp_nlt_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x3d,0x7c,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_nlt_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x3d,0x7c,0x56,0x34,0x12,0xaf]
 0xff,0xfe,0x3d,0x7c,0x56,0x34,0x12,0xaf
+# W32: v_cmp_nlt_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x3d,0x7c,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_nlt_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x3d,0x7c,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_nlt_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x5c,0x7c]
-# W64: v_cmp_nlt_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x5c,0x7c]
 0x01,0x05,0x5c,0x7c
+# W32: v_cmp_nlt_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x5c,0x7c]
+# W64: v_cmp_nlt_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x5c,0x7c]

+0xfe,0x05,0x5c,0x7c
 # W32: v_cmp_nlt_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x5c,0x7c]
 # W64: v_cmp_nlt_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x5c,0x7c]
-0xfe,0x05,0x5c,0x7c

-# W32: v_cmp_nlt_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x5c,0x7c]
-# W64: v_cmp_nlt_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x5c,0x7c]
 0x02,0x04,0x5c,0x7c
+# W32: v_cmp_nlt_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x5c,0x7c]
+# W64: v_cmp_nlt_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x5c,0x7c]

+0x68,0x04,0x5c,0x7c
 # W32: v_cmp_nlt_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x5c,0x7c]
 # W64: v_cmp_nlt_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x5c,0x7c]
-0x68,0x04,0x5c,0x7c

-# W32: v_cmp_nlt_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x5c,0x7c]
-# W64: v_cmp_nlt_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x5c,0x7c]
 0x6a,0x04,0x5c,0x7c
+# W32: v_cmp_nlt_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x5c,0x7c]
+# W64: v_cmp_nlt_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x5c,0x7c]

+0x7a,0x04,0x5c,0x7c
 # W32: v_cmp_nlt_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x5c,0x7c]
 # W64: v_cmp_nlt_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x5c,0x7c]
-0x7a,0x04,0x5c,0x7c

-# W32: v_cmp_nlt_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x5c,0x7c]
-# W64: v_cmp_nlt_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x5c,0x7c]
 0x7e,0x04,0x5c,0x7c
+# W32: v_cmp_nlt_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x5c,0x7c]
+# W64: v_cmp_nlt_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x5c,0x7c]

-# W32: v_cmp_nlt_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x5c,0x7c]
-# W64: v_cmp_nlt_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x5c,0x7c]
 0x7c,0x04,0x5c,0x7c
+# W32: v_cmp_nlt_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x5c,0x7c]
+# W64: v_cmp_nlt_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x5c,0x7c]

-# W32: v_cmp_nlt_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x5c,0x7c]
-# W64: v_cmp_nlt_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x5c,0x7c]
 0xc1,0x04,0x5c,0x7c
+# W32: v_cmp_nlt_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x5c,0x7c]
+# W64: v_cmp_nlt_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x5c,0x7c]

-# W32: v_cmp_nlt_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x5c,0x7c]
-# W64: v_cmp_nlt_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x5c,0x7c]
 0xf0,0x04,0x5c,0x7c
+# W32: v_cmp_nlt_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x5c,0x7c]
+# W64: v_cmp_nlt_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x5c,0x7c]

-# W32: v_cmp_nlt_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x5c,0x7c]
-# W64: v_cmp_nlt_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x5c,0x7c]
 0xfd,0x04,0x5c,0x7c
+# W32: v_cmp_nlt_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x5c,0x7c]
+# W64: v_cmp_nlt_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x5c,0x7c]

+0xff,0xfc,0x5d,0x7c,0x56,0x34,0x12,0xaf
 # W32: v_cmp_nlt_f64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5d,0x7c,0x56,0x34,0x12,0xaf]
 # W64: v_cmp_nlt_f64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5d,0x7c,0x56,0x34,0x12,0xaf]
-0xff,0xfc,0x5d,0x7c,0x56,0x34,0x12,0xaf

-# W32: v_cmp_o_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x0e,0x7c]
-# W64: v_cmp_o_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x0e,0x7c]
 0x01,0x05,0x0e,0x7c
+# W32: v_cmp_o_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x0e,0x7c]
+# W64: v_cmp_o_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x0e,0x7c]

-# W32: v_cmp_o_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x0e,0x7c]
-# W64: v_cmp_o_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x0e,0x7c]
 0x7f,0x05,0x0e,0x7c
+# W32: v_cmp_o_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x0e,0x7c]
+# W64: v_cmp_o_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x0e,0x7c]

-# W32: v_cmp_o_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x0e,0x7c]
-# W64: v_cmp_o_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x0e,0x7c]
 0x01,0x04,0x0e,0x7c
+# W32: v_cmp_o_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x0e,0x7c]
+# W64: v_cmp_o_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x0e,0x7c]

-# W32: v_cmp_o_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x0e,0x7c]
-# W64: v_cmp_o_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x0e,0x7c]
 0x69,0x04,0x0e,0x7c
+# W32: v_cmp_o_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x0e,0x7c]
+# W64: v_cmp_o_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x0e,0x7c]

-# W32: v_cmp_o_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0e,0x7c]
-# W64: v_cmp_o_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0e,0x7c]
 0x6a,0x04,0x0e,0x7c
+# W32: v_cmp_o_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0e,0x7c]
+# W64: v_cmp_o_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0e,0x7c]

-# W32: v_cmp_o_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0e,0x7c]
-# W64: v_cmp_o_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0e,0x7c]
 0x6b,0x04,0x0e,0x7c
+# W32: v_cmp_o_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0e,0x7c]
+# W64: v_cmp_o_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0e,0x7c]

-# W32: v_cmp_o_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x0e,0x7c]
-# W64: v_cmp_o_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x0e,0x7c]
 0x7b,0x04,0x0e,0x7c
+# W32: v_cmp_o_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x0e,0x7c]
+# W64: v_cmp_o_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x0e,0x7c]

-# W32: v_cmp_o_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x0e,0x7c]
-# W64: v_cmp_o_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x0e,0x7c]
 0x7d,0x04,0x0e,0x7c
+# W32: v_cmp_o_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x0e,0x7c]
+# W64: v_cmp_o_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x0e,0x7c]

-# W32: v_cmp_o_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x0e,0x7c]
-# W64: v_cmp_o_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x0e,0x7c]
 0x7e,0x04,0x0e,0x7c
+# W32: v_cmp_o_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x0e,0x7c]
+# W64: v_cmp_o_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x0e,0x7c]

-# W32: v_cmp_o_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x0e,0x7c]
-# W64: v_cmp_o_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x0e,0x7c]
 0x7f,0x04,0x0e,0x7c
+# W32: v_cmp_o_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x0e,0x7c]
+# W64: v_cmp_o_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x0e,0x7c]

-# W32: v_cmp_o_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x0e,0x7c]
-# W64: v_cmp_o_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x0e,0x7c]
 0x7c,0x04,0x0e,0x7c
+# W32: v_cmp_o_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x0e,0x7c]
+# W64: v_cmp_o_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x0e,0x7c]

-# W32: v_cmp_o_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x0e,0x7c]
-# W64: v_cmp_o_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x0e,0x7c]
 0xc1,0x04,0x0e,0x7c
+# W32: v_cmp_o_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x0e,0x7c]
+# W64: v_cmp_o_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x0e,0x7c]

-# W32: v_cmp_o_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x0e,0x7c]
-# W64: v_cmp_o_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x0e,0x7c]
 0xf0,0x04,0x0e,0x7c
+# W32: v_cmp_o_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x0e,0x7c]
+# W64: v_cmp_o_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x0e,0x7c]

-# W32: v_cmp_o_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x0e,0x7c]
-# W64: v_cmp_o_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x0e,0x7c]
 0xfd,0x04,0x0e,0x7c
+# W32: v_cmp_o_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x0e,0x7c]
+# W64: v_cmp_o_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x0e,0x7c]

-# W32: v_cmp_o_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0e,0x7c,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_o_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0e,0x7c,0x0b,0xfe,0x00,0x00]
 0xff,0xfe,0x0e,0x7c,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_o_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0e,0x7c,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_o_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0e,0x7c,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_o_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x2e,0x7c]
-# W64: v_cmp_o_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x2e,0x7c]
 0x01,0x05,0x2e,0x7c
+# W32: v_cmp_o_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x2e,0x7c]
+# W64: v_cmp_o_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x2e,0x7c]

-# W32: v_cmp_o_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x2e,0x7c]
-# W64: v_cmp_o_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x2e,0x7c]
 0xff,0x05,0x2e,0x7c
+# W32: v_cmp_o_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x2e,0x7c]
+# W64: v_cmp_o_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x2e,0x7c]

-# W32: v_cmp_o_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x2e,0x7c]
-# W64: v_cmp_o_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x2e,0x7c]
 0x01,0x04,0x2e,0x7c
+# W32: v_cmp_o_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x2e,0x7c]
+# W64: v_cmp_o_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x2e,0x7c]

-# W32: v_cmp_o_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x2e,0x7c]
-# W64: v_cmp_o_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x2e,0x7c]
 0x69,0x04,0x2e,0x7c
+# W32: v_cmp_o_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x2e,0x7c]
+# W64: v_cmp_o_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x2e,0x7c]

-# W32: v_cmp_o_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x2e,0x7c]
-# W64: v_cmp_o_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x2e,0x7c]
 0x6a,0x04,0x2e,0x7c
+# W32: v_cmp_o_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x2e,0x7c]
+# W64: v_cmp_o_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x2e,0x7c]

-# W32: v_cmp_o_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x2e,0x7c]
-# W64: v_cmp_o_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x2e,0x7c]
 0x6b,0x04,0x2e,0x7c
+# W32: v_cmp_o_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x2e,0x7c]
+# W64: v_cmp_o_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x2e,0x7c]

-# W32: v_cmp_o_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x2e,0x7c]
-# W64: v_cmp_o_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x2e,0x7c]
 0x7b,0x04,0x2e,0x7c
+# W32: v_cmp_o_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x2e,0x7c]
+# W64: v_cmp_o_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x2e,0x7c]

-# W32: v_cmp_o_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x2e,0x7c]
-# W64: v_cmp_o_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x2e,0x7c]
 0x7d,0x04,0x2e,0x7c
+# W32: v_cmp_o_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x2e,0x7c]
+# W64: v_cmp_o_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x2e,0x7c]

-# W32: v_cmp_o_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x2e,0x7c]
-# W64: v_cmp_o_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x2e,0x7c]
 0x7e,0x04,0x2e,0x7c
+# W32: v_cmp_o_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x2e,0x7c]
+# W64: v_cmp_o_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x2e,0x7c]

-# W32: v_cmp_o_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x2e,0x7c]
-# W64: v_cmp_o_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x2e,0x7c]
 0x7f,0x04,0x2e,0x7c
+# W32: v_cmp_o_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x2e,0x7c]
+# W64: v_cmp_o_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x2e,0x7c]

-# W32: v_cmp_o_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x2e,0x7c]
-# W64: v_cmp_o_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x2e,0x7c]
 0x7c,0x04,0x2e,0x7c
+# W32: v_cmp_o_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x2e,0x7c]
+# W64: v_cmp_o_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x2e,0x7c]

-# W32: v_cmp_o_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x2e,0x7c]
-# W64: v_cmp_o_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x2e,0x7c]
 0xc1,0x04,0x2e,0x7c
+# W32: v_cmp_o_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x2e,0x7c]
+# W64: v_cmp_o_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x2e,0x7c]

-# W32: v_cmp_o_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x2e,0x7c]
-# W64: v_cmp_o_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x2e,0x7c]
 0xf0,0x04,0x2e,0x7c
+# W32: v_cmp_o_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x2e,0x7c]
+# W64: v_cmp_o_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x2e,0x7c]

-# W32: v_cmp_o_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x2e,0x7c]
-# W64: v_cmp_o_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x2e,0x7c]
 0xfd,0x04,0x2e,0x7c
+# W32: v_cmp_o_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x2e,0x7c]
+# W64: v_cmp_o_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x2e,0x7c]

-# W32: v_cmp_o_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2f,0x7c,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_o_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2f,0x7c,0x56,0x34,0x12,0xaf]
 0xff,0xfe,0x2f,0x7c,0x56,0x34,0x12,0xaf
+# W32: v_cmp_o_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2f,0x7c,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_o_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2f,0x7c,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_o_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4e,0x7c]
-# W64: v_cmp_o_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4e,0x7c]
 0x01,0x05,0x4e,0x7c
+# W32: v_cmp_o_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4e,0x7c]
+# W64: v_cmp_o_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4e,0x7c]
-# W32: v_cmp_o_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4e,0x7c]
-# W64: v_cmp_o_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4e,0x7c]
 0xfe,0x05,0x4e,0x7c
+# W32: v_cmp_o_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4e,0x7c]
+# W64: v_cmp_o_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4e,0x7c]

-# W32: v_cmp_o_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x4e,0x7c]
-# W64: v_cmp_o_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x4e,0x7c]
 0x02,0x04,0x4e,0x7c
+# W32: v_cmp_o_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x4e,0x7c]
+# W64: v_cmp_o_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x4e,0x7c]

-# W32: v_cmp_o_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4e,0x7c]
-# W64: v_cmp_o_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4e,0x7c]
 0x68,0x04,0x4e,0x7c
+# W32: v_cmp_o_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4e,0x7c]
+# W64: v_cmp_o_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4e,0x7c]

-# W32: v_cmp_o_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x4e,0x7c]
-# W64: v_cmp_o_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x4e,0x7c]
 0x6a,0x04,0x4e,0x7c
+# W32: v_cmp_o_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x4e,0x7c]
+# W64: v_cmp_o_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x4e,0x7c]

-# W32: v_cmp_o_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x4e,0x7c]
-# W64: v_cmp_o_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x4e,0x7c]
 0x7a,0x04,0x4e,0x7c
+# W32: v_cmp_o_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x4e,0x7c]
+# W64: v_cmp_o_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x4e,0x7c]

-# W32: v_cmp_o_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x4e,0x7c]
-# W64: v_cmp_o_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x4e,0x7c]
 0x7e,0x04,0x4e,0x7c
+# W32: v_cmp_o_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x4e,0x7c]
+# W64: v_cmp_o_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x4e,0x7c]

-# W32: v_cmp_o_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x4e,0x7c]
-# W64: v_cmp_o_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x4e,0x7c]
 0x7c,0x04,0x4e,0x7c
+# W32: v_cmp_o_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x4e,0x7c]
+# W64: v_cmp_o_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x4e,0x7c]

-# W32: v_cmp_o_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x4e,0x7c]
-# W64: v_cmp_o_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x4e,0x7c]
 0xc1,0x04,0x4e,0x7c
+# W32: v_cmp_o_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x4e,0x7c]
+# W64: v_cmp_o_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x4e,0x7c]

-# W32: v_cmp_o_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4e,0x7c]
-# W64: v_cmp_o_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4e,0x7c]
 0xf0,0x04,0x4e,0x7c
+# W32: v_cmp_o_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4e,0x7c]
+# W64: v_cmp_o_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4e,0x7c]

-# W32: v_cmp_o_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4e,0x7c]
-# W64: v_cmp_o_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4e,0x7c]
 0xfd,0x04,0x4e,0x7c
+# W32: v_cmp_o_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4e,0x7c]
+# W64: v_cmp_o_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4e,0x7c]

+0xff,0xfc,0x4f,0x7c,0x56,0x34,0x12,0xaf
 # W32: v_cmp_o_f64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4f,0x7c,0x56,0x34,0x12,0xaf]
 # W64: v_cmp_o_f64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4f,0x7c,0x56,0x34,0x12,0xaf]
W64: v_cmp_o_f64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4f,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0x4f,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_t_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x1e,0x7c] -# W64: v_cmp_t_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x1e,0x7c] 0x01,0x05,0x1e,0x7c +# W32: v_cmp_t_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x1e,0x7c] +# W64: v_cmp_t_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x1e,0x7c] -# W32: v_cmp_t_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x1e,0x7c] -# W64: v_cmp_t_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x1e,0x7c] 0x7f,0x05,0x1e,0x7c +# W32: v_cmp_t_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x1e,0x7c] +# W64: v_cmp_t_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x1e,0x7c] -# W32: v_cmp_t_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x1e,0x7c] -# W64: v_cmp_t_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x1e,0x7c] 0x01,0x04,0x1e,0x7c +# W32: v_cmp_t_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x1e,0x7c] +# W64: v_cmp_t_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x1e,0x7c] -# W32: v_cmp_t_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x1e,0x7c] -# W64: v_cmp_t_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x1e,0x7c] 0x69,0x04,0x1e,0x7c +# W32: v_cmp_t_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x1e,0x7c] +# W64: v_cmp_t_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x1e,0x7c] -# W32: v_cmp_t_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x1e,0x7c] -# W64: v_cmp_t_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x1e,0x7c] 0x6a,0x04,0x1e,0x7c +# W32: v_cmp_t_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x1e,0x7c] +# W64: v_cmp_t_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x1e,0x7c] -# W32: v_cmp_t_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x1e,0x7c] -# W64: v_cmp_t_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x1e,0x7c] 0x6b,0x04,0x1e,0x7c +# W32: v_cmp_t_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x1e,0x7c] +# W64: v_cmp_t_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x1e,0x7c] -# W32: v_cmp_t_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x1e,0x7c] -# W64: v_cmp_t_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x1e,0x7c] 0x7b,0x04,0x1e,0x7c +# W32: v_cmp_t_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x1e,0x7c] +# W64: v_cmp_t_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x1e,0x7c] -# W32: v_cmp_t_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x1e,0x7c] -# W64: v_cmp_t_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x1e,0x7c] 0x7d,0x04,0x1e,0x7c +# W32: v_cmp_t_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x1e,0x7c] +# W64: v_cmp_t_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x1e,0x7c] -# W32: v_cmp_t_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x1e,0x7c] -# W64: v_cmp_t_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x1e,0x7c] 0x7e,0x04,0x1e,0x7c +# W32: v_cmp_t_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x1e,0x7c] +# W64: v_cmp_t_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x1e,0x7c] -# W32: v_cmp_t_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x1e,0x7c] -# W64: v_cmp_t_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x1e,0x7c] 0x7f,0x04,0x1e,0x7c +# W32: v_cmp_t_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x1e,0x7c] +# W64: v_cmp_t_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x1e,0x7c] -# W32: v_cmp_t_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x1e,0x7c] -# W64: v_cmp_t_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x1e,0x7c] 0x7c,0x04,0x1e,0x7c +# W32: v_cmp_t_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x1e,0x7c] +# 
W64: v_cmp_t_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x1e,0x7c] -# W32: v_cmp_t_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x1e,0x7c] -# W64: v_cmp_t_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x1e,0x7c] 0xc1,0x04,0x1e,0x7c +# W32: v_cmp_t_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x1e,0x7c] +# W64: v_cmp_t_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x1e,0x7c] -# W32: v_cmp_t_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x1e,0x7c] -# W64: v_cmp_t_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x1e,0x7c] 0xf0,0x04,0x1e,0x7c +# W32: v_cmp_t_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x1e,0x7c] +# W64: v_cmp_t_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x1e,0x7c] -# W32: v_cmp_t_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x1e,0x7c] -# W64: v_cmp_t_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x1e,0x7c] 0xfd,0x04,0x1e,0x7c +# W32: v_cmp_t_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x1e,0x7c] +# W64: v_cmp_t_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x1e,0x7c] -# W32: v_cmp_t_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1e,0x7c,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_t_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1e,0x7c,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x1e,0x7c,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_t_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1e,0x7c,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_t_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1e,0x7c,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_t_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x3e,0x7c] -# W64: v_cmp_t_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x3e,0x7c] 0x01,0x05,0x3e,0x7c +# W32: v_cmp_t_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x3e,0x7c] +# W64: v_cmp_t_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x3e,0x7c] -# W32: v_cmp_t_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x3e,0x7c] -# W64: v_cmp_t_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x3e,0x7c] 0xff,0x05,0x3e,0x7c +# W32: v_cmp_t_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x3e,0x7c] +# W64: v_cmp_t_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x3e,0x7c] -# W32: v_cmp_t_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x3e,0x7c] -# W64: v_cmp_t_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x3e,0x7c] 0x01,0x04,0x3e,0x7c +# W32: v_cmp_t_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x3e,0x7c] +# W64: v_cmp_t_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x3e,0x7c] -# W32: v_cmp_t_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x3e,0x7c] -# W64: v_cmp_t_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x3e,0x7c] 0x69,0x04,0x3e,0x7c +# W32: v_cmp_t_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x3e,0x7c] +# W64: v_cmp_t_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x3e,0x7c] -# W32: v_cmp_t_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x3e,0x7c] -# W64: v_cmp_t_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x3e,0x7c] 0x6a,0x04,0x3e,0x7c +# W32: v_cmp_t_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x3e,0x7c] +# W64: v_cmp_t_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x3e,0x7c] -# W32: v_cmp_t_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x3e,0x7c] -# W64: v_cmp_t_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x3e,0x7c] 0x6b,0x04,0x3e,0x7c +# W32: v_cmp_t_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x3e,0x7c] +# W64: v_cmp_t_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x3e,0x7c] -# W32: v_cmp_t_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x3e,0x7c] -# W64: v_cmp_t_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x3e,0x7c] 0x7b,0x04,0x3e,0x7c +# W32: v_cmp_t_f32_e32 vcc_lo, ttmp15, v2 ; encoding: 
[0x7b,0x04,0x3e,0x7c] +# W64: v_cmp_t_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x3e,0x7c] -# W32: v_cmp_t_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x3e,0x7c] -# W64: v_cmp_t_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x3e,0x7c] 0x7d,0x04,0x3e,0x7c +# W32: v_cmp_t_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x3e,0x7c] +# W64: v_cmp_t_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x3e,0x7c] -# W32: v_cmp_t_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x3e,0x7c] -# W64: v_cmp_t_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x3e,0x7c] 0x7e,0x04,0x3e,0x7c +# W32: v_cmp_t_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x3e,0x7c] +# W64: v_cmp_t_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x3e,0x7c] -# W32: v_cmp_t_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x3e,0x7c] -# W64: v_cmp_t_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x3e,0x7c] 0x7f,0x04,0x3e,0x7c +# W32: v_cmp_t_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x3e,0x7c] +# W64: v_cmp_t_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x3e,0x7c] -# W32: v_cmp_t_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x3e,0x7c] -# W64: v_cmp_t_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x3e,0x7c] 0x7c,0x04,0x3e,0x7c +# W32: v_cmp_t_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x3e,0x7c] +# W64: v_cmp_t_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x3e,0x7c] -# W32: v_cmp_t_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x3e,0x7c] -# W64: v_cmp_t_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x3e,0x7c] 0xc1,0x04,0x3e,0x7c +# W32: v_cmp_t_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x3e,0x7c] +# W64: v_cmp_t_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x3e,0x7c] -# W32: v_cmp_t_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x3e,0x7c] -# W64: v_cmp_t_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x3e,0x7c] 0xf0,0x04,0x3e,0x7c +# W32: v_cmp_t_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x3e,0x7c] +# W64: v_cmp_t_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x3e,0x7c] -# W32: v_cmp_t_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x3e,0x7c] -# W64: v_cmp_t_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x3e,0x7c] 0xfd,0x04,0x3e,0x7c +# W32: v_cmp_t_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x3e,0x7c] +# W64: v_cmp_t_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x3e,0x7c] -# W32: v_cmp_t_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x3f,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_t_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x3f,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x3f,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_t_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x3f,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_t_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x3f,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_t_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x5e,0x7c] -# W64: v_cmp_t_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x5e,0x7c] 0x01,0x05,0x5e,0x7c +# W32: v_cmp_t_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x5e,0x7c] +# W64: v_cmp_t_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x5e,0x7c] -# W32: v_cmp_t_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x5e,0x7c] -# W64: v_cmp_t_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x5e,0x7c] 0xfe,0x05,0x5e,0x7c +# W32: v_cmp_t_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x5e,0x7c] +# W64: v_cmp_t_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x5e,0x7c] -# W32: v_cmp_t_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x5e,0x7c] -# W64: v_cmp_t_f64_e32 
vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x5e,0x7c] 0x02,0x04,0x5e,0x7c +# W32: v_cmp_t_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x5e,0x7c] +# W64: v_cmp_t_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x5e,0x7c] -# W32: v_cmp_t_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x5e,0x7c] -# W64: v_cmp_t_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x5e,0x7c] 0x68,0x04,0x5e,0x7c +# W32: v_cmp_t_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x5e,0x7c] +# W64: v_cmp_t_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x5e,0x7c] -# W32: v_cmp_t_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x5e,0x7c] -# W64: v_cmp_t_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x5e,0x7c] 0x6a,0x04,0x5e,0x7c +# W32: v_cmp_t_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x5e,0x7c] +# W64: v_cmp_t_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x5e,0x7c] -# W32: v_cmp_t_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x5e,0x7c] -# W64: v_cmp_t_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x5e,0x7c] 0x7a,0x04,0x5e,0x7c +# W32: v_cmp_t_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x5e,0x7c] +# W64: v_cmp_t_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x5e,0x7c] -# W32: v_cmp_t_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x5e,0x7c] -# W64: v_cmp_t_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x5e,0x7c] 0x7e,0x04,0x5e,0x7c +# W32: v_cmp_t_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x5e,0x7c] +# W64: v_cmp_t_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x5e,0x7c] -# W32: v_cmp_t_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x5e,0x7c] -# W64: v_cmp_t_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x5e,0x7c] 0x7c,0x04,0x5e,0x7c +# W32: v_cmp_t_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x5e,0x7c] +# W64: v_cmp_t_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x5e,0x7c] -# W32: v_cmp_t_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x5e,0x7c] -# W64: v_cmp_t_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x5e,0x7c] 0xc1,0x04,0x5e,0x7c +# W32: v_cmp_t_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x5e,0x7c] +# W64: v_cmp_t_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x5e,0x7c] -# W32: v_cmp_t_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x5e,0x7c] -# W64: v_cmp_t_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x5e,0x7c] 0xf0,0x04,0x5e,0x7c +# W32: v_cmp_t_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x5e,0x7c] +# W64: v_cmp_t_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x5e,0x7c] -# W32: v_cmp_t_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x5e,0x7c] -# W64: v_cmp_t_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x5e,0x7c] 0xfd,0x04,0x5e,0x7c +# W32: v_cmp_t_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x5e,0x7c] +# W64: v_cmp_t_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x5e,0x7c] +0xff,0xfc,0x5f,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_t_f64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5f,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_t_f64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5f,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0x5f,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_t_i32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x8e,0x7c] -# W64: v_cmp_t_i32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x8e,0x7c] 0x01,0x05,0x8e,0x7c +# W32: v_cmp_t_i32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x8e,0x7c] +# W64: v_cmp_t_i32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x8e,0x7c] -# W32: v_cmp_t_i32_e32 vcc_lo, 
v255, v2 ; encoding: [0xff,0x05,0x8e,0x7c] -# W64: v_cmp_t_i32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x8e,0x7c] 0xff,0x05,0x8e,0x7c +# W32: v_cmp_t_i32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x8e,0x7c] +# W64: v_cmp_t_i32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x8e,0x7c] -# W32: v_cmp_t_i32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x8e,0x7c] -# W64: v_cmp_t_i32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x8e,0x7c] 0x01,0x04,0x8e,0x7c +# W32: v_cmp_t_i32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x8e,0x7c] +# W64: v_cmp_t_i32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x8e,0x7c] -# W32: v_cmp_t_i32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x8e,0x7c] -# W64: v_cmp_t_i32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x8e,0x7c] 0x69,0x04,0x8e,0x7c +# W32: v_cmp_t_i32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x8e,0x7c] +# W64: v_cmp_t_i32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x8e,0x7c] -# W32: v_cmp_t_i32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x8e,0x7c] -# W64: v_cmp_t_i32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x8e,0x7c] 0x6a,0x04,0x8e,0x7c +# W32: v_cmp_t_i32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x8e,0x7c] +# W64: v_cmp_t_i32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x8e,0x7c] -# W32: v_cmp_t_i32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x8e,0x7c] -# W64: v_cmp_t_i32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x8e,0x7c] 0x6b,0x04,0x8e,0x7c +# W32: v_cmp_t_i32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x8e,0x7c] +# W64: v_cmp_t_i32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x8e,0x7c] -# W32: v_cmp_t_i32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x8e,0x7c] -# W64: v_cmp_t_i32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x8e,0x7c] 0x7b,0x04,0x8e,0x7c +# W32: v_cmp_t_i32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x8e,0x7c] +# W64: v_cmp_t_i32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x8e,0x7c] -# W32: v_cmp_t_i32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x8e,0x7c] -# W64: v_cmp_t_i32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x8e,0x7c] 0x7d,0x04,0x8e,0x7c +# W32: v_cmp_t_i32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x8e,0x7c] +# W64: v_cmp_t_i32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x8e,0x7c] -# W32: v_cmp_t_i32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x8e,0x7c] -# W64: v_cmp_t_i32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x8e,0x7c] 0x7e,0x04,0x8e,0x7c +# W32: v_cmp_t_i32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x8e,0x7c] +# W64: v_cmp_t_i32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x8e,0x7c] -# W32: v_cmp_t_i32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x8e,0x7c] -# W64: v_cmp_t_i32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x8e,0x7c] 0x7f,0x04,0x8e,0x7c +# W32: v_cmp_t_i32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x8e,0x7c] +# W64: v_cmp_t_i32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x8e,0x7c] -# W32: v_cmp_t_i32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x8e,0x7c] -# W64: v_cmp_t_i32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x8e,0x7c] 0x7c,0x04,0x8e,0x7c +# W32: v_cmp_t_i32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x8e,0x7c] +# W64: v_cmp_t_i32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x8e,0x7c] -# W32: v_cmp_t_i32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x8e,0x7c] -# W64: v_cmp_t_i32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x8e,0x7c] 0xc1,0x04,0x8e,0x7c +# W32: v_cmp_t_i32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x8e,0x7c] +# W64: v_cmp_t_i32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x8e,0x7c] -# W32: v_cmp_t_i32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x8e,0x7c] -# W64: v_cmp_t_i32_e32 vcc, 0.5, 
v2 ; encoding: [0xf0,0x04,0x8e,0x7c] 0xf0,0x04,0x8e,0x7c +# W32: v_cmp_t_i32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x8e,0x7c] +# W64: v_cmp_t_i32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x8e,0x7c] -# W32: v_cmp_t_i32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x8e,0x7c] -# W64: v_cmp_t_i32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x8e,0x7c] 0xfd,0x04,0x8e,0x7c +# W32: v_cmp_t_i32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x8e,0x7c] +# W64: v_cmp_t_i32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x8e,0x7c] -# W32: v_cmp_t_i32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x8f,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_t_i32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x8f,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x8f,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_t_i32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x8f,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_t_i32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x8f,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_t_i64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xae,0x7c] -# W64: v_cmp_t_i64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xae,0x7c] 0x01,0x05,0xae,0x7c +# W32: v_cmp_t_i64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xae,0x7c] +# W64: v_cmp_t_i64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xae,0x7c] -# W32: v_cmp_t_i64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xae,0x7c] -# W64: v_cmp_t_i64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xae,0x7c] 0xfe,0x05,0xae,0x7c +# W32: v_cmp_t_i64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xae,0x7c] +# W64: v_cmp_t_i64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xae,0x7c] -# W32: v_cmp_t_i64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xae,0x7c] -# W64: v_cmp_t_i64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xae,0x7c] 0x02,0x04,0xae,0x7c +# W32: v_cmp_t_i64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xae,0x7c] +# W64: v_cmp_t_i64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xae,0x7c] -# W32: v_cmp_t_i64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xae,0x7c] -# W64: v_cmp_t_i64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xae,0x7c] 0x68,0x04,0xae,0x7c +# W32: v_cmp_t_i64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xae,0x7c] +# W64: v_cmp_t_i64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xae,0x7c] -# W32: v_cmp_t_i64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xae,0x7c] -# W64: v_cmp_t_i64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xae,0x7c] 0x6a,0x04,0xae,0x7c +# W32: v_cmp_t_i64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xae,0x7c] +# W64: v_cmp_t_i64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xae,0x7c] -# W32: v_cmp_t_i64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xae,0x7c] -# W64: v_cmp_t_i64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xae,0x7c] 0x7a,0x04,0xae,0x7c +# W32: v_cmp_t_i64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xae,0x7c] +# W64: v_cmp_t_i64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xae,0x7c] -# W32: v_cmp_t_i64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xae,0x7c] -# W64: v_cmp_t_i64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xae,0x7c] 0x7e,0x04,0xae,0x7c +# W32: v_cmp_t_i64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xae,0x7c] +# W64: v_cmp_t_i64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xae,0x7c] -# W32: v_cmp_t_i64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xae,0x7c] -# W64: v_cmp_t_i64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xae,0x7c] 0x7c,0x04,0xae,0x7c +# W32: 
v_cmp_t_i64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xae,0x7c] +# W64: v_cmp_t_i64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xae,0x7c] -# W32: v_cmp_t_i64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xae,0x7c] -# W64: v_cmp_t_i64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xae,0x7c] 0xc1,0x04,0xae,0x7c +# W32: v_cmp_t_i64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xae,0x7c] +# W64: v_cmp_t_i64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xae,0x7c] -# W32: v_cmp_t_i64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xae,0x7c] -# W64: v_cmp_t_i64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xae,0x7c] 0xf0,0x04,0xae,0x7c +# W32: v_cmp_t_i64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xae,0x7c] +# W64: v_cmp_t_i64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xae,0x7c] -# W32: v_cmp_t_i64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xae,0x7c] -# W64: v_cmp_t_i64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xae,0x7c] 0xfd,0x04,0xae,0x7c +# W32: v_cmp_t_i64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xae,0x7c] +# W64: v_cmp_t_i64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xae,0x7c] +0xff,0xfc,0xaf,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_t_i64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xaf,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_t_i64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xaf,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0xaf,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_t_u32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x9e,0x7c] -# W64: v_cmp_t_u32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x9e,0x7c] 0x01,0x05,0x9e,0x7c +# W32: v_cmp_t_u32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x9e,0x7c] +# W64: v_cmp_t_u32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x9e,0x7c] -# W32: v_cmp_t_u32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x9e,0x7c] -# W64: v_cmp_t_u32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x9e,0x7c] 0xff,0x05,0x9e,0x7c +# W32: v_cmp_t_u32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x9e,0x7c] +# W64: v_cmp_t_u32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x9e,0x7c] -# W32: v_cmp_t_u32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x9e,0x7c] -# W64: v_cmp_t_u32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x9e,0x7c] 0x01,0x04,0x9e,0x7c +# W32: v_cmp_t_u32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x9e,0x7c] +# W64: v_cmp_t_u32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x9e,0x7c] -# W32: v_cmp_t_u32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x9e,0x7c] -# W64: v_cmp_t_u32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x9e,0x7c] 0x69,0x04,0x9e,0x7c +# W32: v_cmp_t_u32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x9e,0x7c] +# W64: v_cmp_t_u32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x9e,0x7c] -# W32: v_cmp_t_u32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x9e,0x7c] -# W64: v_cmp_t_u32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x9e,0x7c] 0x6a,0x04,0x9e,0x7c +# W32: v_cmp_t_u32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x9e,0x7c] +# W64: v_cmp_t_u32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x9e,0x7c] -# W32: v_cmp_t_u32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x9e,0x7c] -# W64: v_cmp_t_u32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x9e,0x7c] 0x6b,0x04,0x9e,0x7c +# W32: v_cmp_t_u32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x9e,0x7c] +# W64: v_cmp_t_u32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x9e,0x7c] -# W32: v_cmp_t_u32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x9e,0x7c] -# W64: v_cmp_t_u32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x9e,0x7c] 0x7b,0x04,0x9e,0x7c +# W32: v_cmp_t_u32_e32 vcc_lo, ttmp15, v2 ; encoding: 
[0x7b,0x04,0x9e,0x7c] +# W64: v_cmp_t_u32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x9e,0x7c] -# W32: v_cmp_t_u32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x9e,0x7c] -# W64: v_cmp_t_u32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x9e,0x7c] 0x7d,0x04,0x9e,0x7c +# W32: v_cmp_t_u32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x9e,0x7c] +# W64: v_cmp_t_u32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x9e,0x7c] -# W32: v_cmp_t_u32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x9e,0x7c] -# W64: v_cmp_t_u32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x9e,0x7c] 0x7e,0x04,0x9e,0x7c +# W32: v_cmp_t_u32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x9e,0x7c] +# W64: v_cmp_t_u32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x9e,0x7c] -# W32: v_cmp_t_u32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x9e,0x7c] -# W64: v_cmp_t_u32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x9e,0x7c] 0x7f,0x04,0x9e,0x7c +# W32: v_cmp_t_u32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x9e,0x7c] +# W64: v_cmp_t_u32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x9e,0x7c] -# W32: v_cmp_t_u32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x9e,0x7c] -# W64: v_cmp_t_u32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x9e,0x7c] 0x7c,0x04,0x9e,0x7c +# W32: v_cmp_t_u32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x9e,0x7c] +# W64: v_cmp_t_u32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x9e,0x7c] -# W32: v_cmp_t_u32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x9e,0x7c] -# W64: v_cmp_t_u32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x9e,0x7c] 0xc1,0x04,0x9e,0x7c +# W32: v_cmp_t_u32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x9e,0x7c] +# W64: v_cmp_t_u32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x9e,0x7c] -# W32: v_cmp_t_u32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x9e,0x7c] -# W64: v_cmp_t_u32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x9e,0x7c] 0xf0,0x04,0x9e,0x7c +# W32: v_cmp_t_u32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x9e,0x7c] +# W64: v_cmp_t_u32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x9e,0x7c] -# W32: v_cmp_t_u32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x9e,0x7c] -# W64: v_cmp_t_u32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x9e,0x7c] 0xfd,0x04,0x9e,0x7c +# W32: v_cmp_t_u32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x9e,0x7c] +# W64: v_cmp_t_u32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x9e,0x7c] -# W32: v_cmp_t_u32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x9f,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_t_u32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x9f,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x9f,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_t_u32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x9f,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_t_u32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x9f,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_t_u64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xbe,0x7c] -# W64: v_cmp_t_u64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xbe,0x7c] 0x01,0x05,0xbe,0x7c +# W32: v_cmp_t_u64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xbe,0x7c] +# W64: v_cmp_t_u64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xbe,0x7c] -# W32: v_cmp_t_u64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xbe,0x7c] -# W64: v_cmp_t_u64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xbe,0x7c] 0xfe,0x05,0xbe,0x7c +# W32: v_cmp_t_u64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xbe,0x7c] +# W64: v_cmp_t_u64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xbe,0x7c] -# W32: v_cmp_t_u64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xbe,0x7c] -# W64: v_cmp_t_u64_e32 
vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xbe,0x7c] 0x02,0x04,0xbe,0x7c +# W32: v_cmp_t_u64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xbe,0x7c] +# W64: v_cmp_t_u64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xbe,0x7c] -# W32: v_cmp_t_u64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xbe,0x7c] -# W64: v_cmp_t_u64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xbe,0x7c] 0x68,0x04,0xbe,0x7c +# W32: v_cmp_t_u64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xbe,0x7c] +# W64: v_cmp_t_u64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xbe,0x7c] -# W32: v_cmp_t_u64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xbe,0x7c] -# W64: v_cmp_t_u64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xbe,0x7c] 0x6a,0x04,0xbe,0x7c +# W32: v_cmp_t_u64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xbe,0x7c] +# W64: v_cmp_t_u64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xbe,0x7c] -# W32: v_cmp_t_u64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xbe,0x7c] -# W64: v_cmp_t_u64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xbe,0x7c] 0x7a,0x04,0xbe,0x7c +# W32: v_cmp_t_u64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xbe,0x7c] +# W64: v_cmp_t_u64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xbe,0x7c] -# W32: v_cmp_t_u64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xbe,0x7c] -# W64: v_cmp_t_u64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xbe,0x7c] 0x7e,0x04,0xbe,0x7c +# W32: v_cmp_t_u64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xbe,0x7c] +# W64: v_cmp_t_u64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xbe,0x7c] -# W32: v_cmp_t_u64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xbe,0x7c] -# W64: v_cmp_t_u64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xbe,0x7c] 0x7c,0x04,0xbe,0x7c +# W32: v_cmp_t_u64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xbe,0x7c] +# W64: v_cmp_t_u64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xbe,0x7c] -# W32: v_cmp_t_u64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xbe,0x7c] -# W64: v_cmp_t_u64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xbe,0x7c] 0xc1,0x04,0xbe,0x7c +# W32: v_cmp_t_u64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xbe,0x7c] +# W64: v_cmp_t_u64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xbe,0x7c] -# W32: v_cmp_t_u64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xbe,0x7c] -# W64: v_cmp_t_u64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xbe,0x7c] 0xf0,0x04,0xbe,0x7c +# W32: v_cmp_t_u64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xbe,0x7c] +# W64: v_cmp_t_u64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xbe,0x7c] -# W32: v_cmp_t_u64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xbe,0x7c] -# W64: v_cmp_t_u64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xbe,0x7c] 0xfd,0x04,0xbe,0x7c +# W32: v_cmp_t_u64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xbe,0x7c] +# W64: v_cmp_t_u64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xbe,0x7c] +0xff,0xfc,0xbf,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_t_u64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbf,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_t_u64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbf,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0xbf,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_u_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x10,0x7c] -# W64: v_cmp_u_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x10,0x7c] 0x01,0x05,0x10,0x7c +# W32: v_cmp_u_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x10,0x7c] +# W64: v_cmp_u_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x10,0x7c] -# W32: v_cmp_u_f16_e32 vcc_lo, 
v127, v2 ; encoding: [0x7f,0x05,0x10,0x7c] -# W64: v_cmp_u_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x10,0x7c] 0x7f,0x05,0x10,0x7c +# W32: v_cmp_u_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x10,0x7c] +# W64: v_cmp_u_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x10,0x7c] -# W32: v_cmp_u_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x10,0x7c] -# W64: v_cmp_u_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x10,0x7c] 0x01,0x04,0x10,0x7c +# W32: v_cmp_u_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x10,0x7c] +# W64: v_cmp_u_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x10,0x7c] -# W32: v_cmp_u_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x10,0x7c] -# W64: v_cmp_u_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x10,0x7c] 0x69,0x04,0x10,0x7c +# W32: v_cmp_u_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x10,0x7c] +# W64: v_cmp_u_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x10,0x7c] -# W32: v_cmp_u_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x10,0x7c] -# W64: v_cmp_u_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x10,0x7c] 0x6a,0x04,0x10,0x7c +# W32: v_cmp_u_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x10,0x7c] +# W64: v_cmp_u_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x10,0x7c] -# W32: v_cmp_u_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x10,0x7c] -# W64: v_cmp_u_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x10,0x7c] 0x6b,0x04,0x10,0x7c +# W32: v_cmp_u_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x10,0x7c] +# W64: v_cmp_u_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x10,0x7c] -# W32: v_cmp_u_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x10,0x7c] -# W64: v_cmp_u_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x10,0x7c] 0x7b,0x04,0x10,0x7c +# W32: v_cmp_u_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x10,0x7c] +# W64: v_cmp_u_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x10,0x7c] -# W32: v_cmp_u_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x10,0x7c] -# W64: v_cmp_u_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x10,0x7c] 0x7d,0x04,0x10,0x7c +# W32: v_cmp_u_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x10,0x7c] +# W64: v_cmp_u_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x10,0x7c] -# W32: v_cmp_u_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x10,0x7c] -# W64: v_cmp_u_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x10,0x7c] 0x7e,0x04,0x10,0x7c +# W32: v_cmp_u_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x10,0x7c] +# W64: v_cmp_u_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x10,0x7c] -# W32: v_cmp_u_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x10,0x7c] -# W64: v_cmp_u_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x10,0x7c] 0x7f,0x04,0x10,0x7c +# W32: v_cmp_u_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x10,0x7c] +# W64: v_cmp_u_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x10,0x7c] -# W32: v_cmp_u_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x10,0x7c] -# W64: v_cmp_u_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x10,0x7c] 0x7c,0x04,0x10,0x7c +# W32: v_cmp_u_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x10,0x7c] +# W64: v_cmp_u_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x10,0x7c] -# W32: v_cmp_u_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x10,0x7c] -# W64: v_cmp_u_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x10,0x7c] 0xc1,0x04,0x10,0x7c +# W32: v_cmp_u_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x10,0x7c] +# W64: v_cmp_u_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x10,0x7c] -# W32: v_cmp_u_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x10,0x7c] -# W64: v_cmp_u_f16_e32 vcc, 0.5, 
v2 ; encoding: [0xf0,0x04,0x10,0x7c] 0xf0,0x04,0x10,0x7c +# W32: v_cmp_u_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x10,0x7c] +# W64: v_cmp_u_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x10,0x7c] -# W32: v_cmp_u_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x10,0x7c] -# W64: v_cmp_u_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x10,0x7c] 0xfd,0x04,0x10,0x7c +# W32: v_cmp_u_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x10,0x7c] +# W64: v_cmp_u_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x10,0x7c] -# W32: v_cmp_u_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x10,0x7c,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_u_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x10,0x7c,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x10,0x7c,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_u_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x10,0x7c,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_u_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x10,0x7c,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_u_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x30,0x7c] -# W64: v_cmp_u_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x30,0x7c] 0x01,0x05,0x30,0x7c +# W32: v_cmp_u_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x30,0x7c] +# W64: v_cmp_u_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x30,0x7c] -# W32: v_cmp_u_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x30,0x7c] -# W64: v_cmp_u_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x30,0x7c] 0xff,0x05,0x30,0x7c +# W32: v_cmp_u_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x30,0x7c] +# W64: v_cmp_u_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x30,0x7c] -# W32: v_cmp_u_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x30,0x7c] -# W64: v_cmp_u_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x30,0x7c] 0x01,0x04,0x30,0x7c +# W32: v_cmp_u_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x30,0x7c] +# W64: v_cmp_u_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x30,0x7c] -# W32: v_cmp_u_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x30,0x7c] -# W64: v_cmp_u_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x30,0x7c] 0x69,0x04,0x30,0x7c +# W32: v_cmp_u_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x30,0x7c] +# W64: v_cmp_u_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x30,0x7c] -# W32: v_cmp_u_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x30,0x7c] -# W64: v_cmp_u_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x30,0x7c] 0x6a,0x04,0x30,0x7c +# W32: v_cmp_u_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x30,0x7c] +# W64: v_cmp_u_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x30,0x7c] -# W32: v_cmp_u_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x30,0x7c] -# W64: v_cmp_u_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x30,0x7c] 0x6b,0x04,0x30,0x7c +# W32: v_cmp_u_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x30,0x7c] +# W64: v_cmp_u_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x30,0x7c] -# W32: v_cmp_u_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x30,0x7c] -# W64: v_cmp_u_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x30,0x7c] 0x7b,0x04,0x30,0x7c +# W32: v_cmp_u_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x30,0x7c] +# W64: v_cmp_u_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x30,0x7c] -# W32: v_cmp_u_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x30,0x7c] -# W64: v_cmp_u_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x30,0x7c] 0x7d,0x04,0x30,0x7c +# W32: v_cmp_u_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x30,0x7c] +# W64: v_cmp_u_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x30,0x7c] -# W32: v_cmp_u_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x30,0x7c] -# 
W64: v_cmp_u_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x30,0x7c] 0x7e,0x04,0x30,0x7c +# W32: v_cmp_u_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x30,0x7c] +# W64: v_cmp_u_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x30,0x7c] -# W32: v_cmp_u_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x30,0x7c] -# W64: v_cmp_u_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x30,0x7c] 0x7f,0x04,0x30,0x7c +# W32: v_cmp_u_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x30,0x7c] +# W64: v_cmp_u_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x30,0x7c] -# W32: v_cmp_u_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x30,0x7c] -# W64: v_cmp_u_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x30,0x7c] 0x7c,0x04,0x30,0x7c +# W32: v_cmp_u_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x30,0x7c] +# W64: v_cmp_u_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x30,0x7c] -# W32: v_cmp_u_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x30,0x7c] -# W64: v_cmp_u_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x30,0x7c] 0xc1,0x04,0x30,0x7c +# W32: v_cmp_u_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x30,0x7c] +# W64: v_cmp_u_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x30,0x7c] -# W32: v_cmp_u_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x30,0x7c] -# W64: v_cmp_u_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x30,0x7c] 0xf0,0x04,0x30,0x7c +# W32: v_cmp_u_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x30,0x7c] +# W64: v_cmp_u_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x30,0x7c] -# W32: v_cmp_u_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x30,0x7c] -# W64: v_cmp_u_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x30,0x7c] 0xfd,0x04,0x30,0x7c +# W32: v_cmp_u_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x30,0x7c] +# W64: v_cmp_u_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x30,0x7c] -# W32: v_cmp_u_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x31,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_u_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x31,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x31,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_u_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x31,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_u_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x31,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_u_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x50,0x7c] -# W64: v_cmp_u_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x50,0x7c] 0x01,0x05,0x50,0x7c +# W32: v_cmp_u_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x50,0x7c] +# W64: v_cmp_u_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x50,0x7c] -# W32: v_cmp_u_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x50,0x7c] -# W64: v_cmp_u_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x50,0x7c] 0xfe,0x05,0x50,0x7c +# W32: v_cmp_u_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x50,0x7c] +# W64: v_cmp_u_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x50,0x7c] -# W32: v_cmp_u_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x50,0x7c] -# W64: v_cmp_u_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x50,0x7c] 0x02,0x04,0x50,0x7c +# W32: v_cmp_u_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x50,0x7c] +# W64: v_cmp_u_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x50,0x7c] -# W32: v_cmp_u_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x50,0x7c] -# W64: v_cmp_u_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x50,0x7c] 0x68,0x04,0x50,0x7c +# W32: v_cmp_u_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: 
[0x68,0x04,0x50,0x7c]
+# W64: v_cmp_u_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x50,0x7c]
-# W32: v_cmp_u_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x50,0x7c]
-# W64: v_cmp_u_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x50,0x7c]
0x6a,0x04,0x50,0x7c
+# W32: v_cmp_u_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x50,0x7c]
+# W64: v_cmp_u_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x50,0x7c]
-# W32: v_cmp_u_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x50,0x7c]
-# W64: v_cmp_u_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x50,0x7c]
0x7a,0x04,0x50,0x7c
+# W32: v_cmp_u_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x50,0x7c]
+# W64: v_cmp_u_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x50,0x7c]
-# W32: v_cmp_u_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x50,0x7c]
-# W64: v_cmp_u_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x50,0x7c]
0x7e,0x04,0x50,0x7c
+# W32: v_cmp_u_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x50,0x7c]
+# W64: v_cmp_u_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x50,0x7c]
-# W32: v_cmp_u_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x50,0x7c]
-# W64: v_cmp_u_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x50,0x7c]
0x7c,0x04,0x50,0x7c
+# W32: v_cmp_u_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x50,0x7c]
+# W64: v_cmp_u_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x50,0x7c]
-# W32: v_cmp_u_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x50,0x7c]
-# W64: v_cmp_u_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x50,0x7c]
0xc1,0x04,0x50,0x7c
+# W32: v_cmp_u_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x50,0x7c]
+# W64: v_cmp_u_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x50,0x7c]
-# W32: v_cmp_u_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x50,0x7c]
-# W64: v_cmp_u_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x50,0x7c]
0xf0,0x04,0x50,0x7c
+# W32: v_cmp_u_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x50,0x7c]
+# W64: v_cmp_u_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x50,0x7c]
-# W32: v_cmp_u_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x50,0x7c]
-# W64: v_cmp_u_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x50,0x7c]
0xfd,0x04,0x50,0x7c
+# W32: v_cmp_u_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x50,0x7c]
+# W64: v_cmp_u_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x50,0x7c]
+0xff,0xfc,0x51,0x7c,0x56,0x34,0x12,0xaf
# W32: v_cmp_u_f64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x51,0x7c,0x56,0x34,0x12,0xaf]
# W64: v_cmp_u_f64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x51,0x7c,0x56,0x34,0x12,0xaf]
-0xff,0xfc,0x51,0x7c,0x56,0x34,0x12,0xaf
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopc_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopc_dpp16.txt
index 40735cef0c536..a5f52e62fda44 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopc_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopc_dpp16.txt
@@ -1,3476 +1,3477 @@
+# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefixes=W32
# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefixes=W64
# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefixes=W32
# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefixes=W64
+0xfa,0x04,0xfa,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_class_f16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_class_f16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0xfa,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0xfa,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_class_f16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_class_f16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0xfa,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0xfa,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_class_f16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_class_f16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0xfa,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0xfa,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_class_f16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_class_f16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0xfa,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0xfa,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_class_f16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_class_f16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0xfa,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0xfa,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_class_f16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_class_f16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0xfa,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0xfa,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_class_f16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_class_f16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0xfa,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0xfa,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_class_f16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_class_f16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0xfa,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0xfa,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_class_f16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_class_f16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0xfa,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0xfa,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_class_f16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_class_f16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0xfa,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0xfa,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_class_f16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_class_f16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0xfa,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0xfa,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_class_f16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_class_f16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0xfa,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0xfa,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_class_f16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_class_f16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0xfa,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0xfa,0x7c,0x7f,0x6f,0x3d,0x30
# W32: v_cmp_class_f16 vcc_lo, -|v127|, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfa,0x7c,0x7f,0x6f,0x3d,0x30]
# W64: v_cmp_class_f16 vcc, -|v127|, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfa,0x7c,0x7f,0x6f,0x3d,0x30]
-0xfa,0xfe,0xfa,0x7c,0x7f,0x6f,0x3d,0x30
+0xfa,0x04,0xfc,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_class_f32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_class_f32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0xfc,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0xfc,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_class_f32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_class_f32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0xfc,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0xfc,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_class_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_class_f32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0xfc,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0xfc,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_class_f32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_class_f32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0xfc,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0xfc,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_class_f32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_class_f32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0xfc,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0xfc,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_class_f32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_class_f32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0xfc,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0xfc,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_class_f32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_class_f32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0xfc,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0xfc,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_class_f32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_class_f32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0xfc,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0xfc,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_class_f32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_class_f32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0xfc,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0xfc,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_class_f32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_class_f32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0xfc,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0xfc,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_class_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_class_f32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0xfc,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0xfc,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_class_f32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_class_f32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0xfc,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0xfc,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_class_f32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_class_f32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0xfc,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0xfd,0x7c,0xff,0x6f,0x3d,0x30
# W32: v_cmp_class_f32 vcc_lo, -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfd,0x7c,0xff,0x6f,0x3d,0x30]
# W64: v_cmp_class_f32 vcc, -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfd,0x7c,0xff,0x6f,0x3d,0x30]
-0xfa,0xfe,0xfd,0x7c,0xff,0x6f,0x3d,0x30
+0xfa,0x04,0x04,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_eq_f16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_eq_f16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x04,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x04,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_eq_f16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_eq_f16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x04,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x04,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_eq_f16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_eq_f16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x04,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x04,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_eq_f16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_eq_f16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x04,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x04,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_eq_f16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_eq_f16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x04,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x04,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_eq_f16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_eq_f16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x04,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x04,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_eq_f16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_eq_f16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x04,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x04,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_eq_f16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_eq_f16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x04,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x04,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_eq_f16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_eq_f16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x04,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x04,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_eq_f16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_eq_f16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x04,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x04,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_eq_f16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_eq_f16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x04,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x04,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_eq_f16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_eq_f16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x04,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x04,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_eq_f16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_eq_f16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x04,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x04,0x7c,0x7f,0x6f,0xfd,0x30
# W32: v_cmp_eq_f16 vcc_lo, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x04,0x7c,0x7f,0x6f,0xfd,0x30]
# W64: v_cmp_eq_f16 vcc, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x04,0x7c,0x7f,0x6f,0xfd,0x30]
-0xfa,0xfe,0x04,0x7c,0x7f,0x6f,0xfd,0x30
+0xfa,0x04,0x24,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_eq_f32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_eq_f32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x24,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x24,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_eq_f32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_eq_f32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x24,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x24,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_eq_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_eq_f32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x24,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x24,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_eq_f32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_eq_f32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x24,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x24,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_eq_f32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_eq_f32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x24,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x24,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_eq_f32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_eq_f32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x24,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x24,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_eq_f32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_eq_f32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x24,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x24,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_eq_f32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_eq_f32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x24,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x24,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_eq_f32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_eq_f32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x24,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x24,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_eq_f32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_eq_f32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x24,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x24,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_eq_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_eq_f32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x24,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x24,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_eq_f32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_eq_f32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x24,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x24,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_eq_f32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_eq_f32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x24,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x25,0x7c,0xff,0x6f,0xfd,0x30 # W32: v_cmp_eq_f32 vcc_lo, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x25,0x7c,0xff,0x6f,0xfd,0x30] # W64: v_cmp_eq_f32 vcc, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x25,0x7c,0xff,0x6f,0xfd,0x30] -0xfa,0xfe,0x25,0x7c,0xff,0x6f,0xfd,0x30 +0xfa,0x04,0x64,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_eq_i16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_eq_i16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x64,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x64,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_eq_i16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_eq_i16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x64,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x64,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_eq_i16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_eq_i16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x64,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x64,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_eq_i16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_eq_i16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x64,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x64,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_eq_i16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_eq_i16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x64,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x64,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_eq_i16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x64,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_eq_i16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x64,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x64,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_eq_i16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_eq_i16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x64,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x64,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_eq_i16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_eq_i16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x64,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x64,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_eq_i16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_eq_i16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x64,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x64,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_eq_i16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_eq_i16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x64,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x64,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_eq_i16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_eq_i16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x64,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x64,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_eq_i16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_eq_i16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x64,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x64,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_eq_i16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_eq_i16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x64,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x64,0x7c,0x7f,0x6f,0x0d,0x30 # W32: v_cmp_eq_i16 vcc_lo, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x64,0x7c,0x7f,0x6f,0x0d,0x30] # W64: v_cmp_eq_i16 vcc, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x64,0x7c,0x7f,0x6f,0x0d,0x30] -0xfa,0xfe,0x64,0x7c,0x7f,0x6f,0x0d,0x30 +0xfa,0x04,0x84,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_eq_i32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_eq_i32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x84,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x84,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_eq_i32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_eq_i32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x84,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x84,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x84,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_eq_i32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_eq_i32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x84,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x84,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_eq_i32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_eq_i32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x84,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x84,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_eq_i32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_eq_i32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x84,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x84,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_eq_i32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_eq_i32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x84,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x84,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_eq_i32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_eq_i32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x84,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x84,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_eq_i32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_eq_i32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x84,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x84,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_eq_i32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_eq_i32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x84,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x84,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_eq_i32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_eq_i32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x84,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x84,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_eq_i32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_eq_i32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x84,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x84,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_eq_i32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_eq_i32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x84,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x84,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_eq_i32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0xfa,0x04,0x84,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_eq_i32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x84,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x85,0x7c,0xff,0x6f,0x0d,0x30 # W32: v_cmp_eq_i32 vcc_lo, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x85,0x7c,0xff,0x6f,0x0d,0x30] # W64: v_cmp_eq_i32 vcc, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x85,0x7c,0xff,0x6f,0x0d,0x30] -0xfa,0xfe,0x85,0x7c,0xff,0x6f,0x0d,0x30 +0xfa,0x04,0x74,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_eq_u16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_eq_u16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x74,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x74,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_eq_u16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_eq_u16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x74,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x74,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_eq_u16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_eq_u16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x74,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x74,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_eq_u16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_eq_u16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x74,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x74,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_eq_u16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_eq_u16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x74,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x74,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_eq_u16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_eq_u16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x74,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x74,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_eq_u16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_eq_u16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x74,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x74,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_eq_u16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_eq_u16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x74,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x74,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_eq_u16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_eq_u16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x74,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x74,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x74,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_eq_u16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_eq_u16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x74,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x74,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_eq_u16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_eq_u16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x74,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x74,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_eq_u16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_eq_u16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x74,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x74,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_eq_u16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_eq_u16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x74,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x74,0x7c,0x7f,0x6f,0x0d,0x30 # W32: v_cmp_eq_u16 vcc_lo, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x74,0x7c,0x7f,0x6f,0x0d,0x30] # W64: v_cmp_eq_u16 vcc, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x74,0x7c,0x7f,0x6f,0x0d,0x30] -0xfa,0xfe,0x74,0x7c,0x7f,0x6f,0x0d,0x30 +0xfa,0x04,0x94,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_eq_u32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_eq_u32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x94,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x94,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_eq_u32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_eq_u32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x94,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x94,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_eq_u32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_eq_u32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x94,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x94,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_eq_u32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_eq_u32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x94,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x94,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_eq_u32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_eq_u32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x94,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x94,0x7c,0x01,0x0f,0x01,0xff 
# W32: v_cmp_eq_u32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_eq_u32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x94,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x94,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_eq_u32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_eq_u32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x94,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x94,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_eq_u32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_eq_u32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x94,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x94,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_eq_u32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_eq_u32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x94,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x94,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_eq_u32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_eq_u32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x94,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x94,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_eq_u32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_eq_u32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x94,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x94,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_eq_u32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_eq_u32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x94,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x94,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_eq_u32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_eq_u32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x94,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x95,0x7c,0xff,0x6f,0x0d,0x30 # W32: v_cmp_eq_u32 vcc_lo, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x95,0x7c,0xff,0x6f,0x0d,0x30] # W64: v_cmp_eq_u32 vcc, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x95,0x7c,0xff,0x6f,0x0d,0x30] -0xfa,0xfe,0x95,0x7c,0xff,0x6f,0x0d,0x30 +0xfa,0x04,0x00,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_f_f16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_f_f16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x00,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x00,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_f_f16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_f_f16 
vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x00,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x00,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_f_f16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_f_f16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x00,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x00,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_f_f16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_f_f16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x00,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x00,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_f_f16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_f_f16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x00,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x00,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_f_f16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_f_f16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x00,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x00,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_f_f16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_f_f16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x00,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x00,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_f_f16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_f_f16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x00,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x00,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_f_f16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_f_f16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x00,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x00,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_f_f16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_f_f16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x00,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x00,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_f_f16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_f_f16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x00,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x00,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_f_f16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x00,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_f_f16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x00,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x00,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x00,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_f_f16 vcc_lo, v1, v2 
row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x00,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_f_f16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x00,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x00,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x00,0x7c,0x7f,0x6f,0xfd,0x30 # W32: v_cmp_f_f16 vcc_lo, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x00,0x7c,0x7f,0x6f,0xfd,0x30] # W64: v_cmp_f_f16 vcc, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x00,0x7c,0x7f,0x6f,0xfd,0x30] -0xfa,0xfe,0x00,0x7c,0x7f,0x6f,0xfd,0x30 +0xfa,0x04,0x20,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_f_f32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_f_f32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x20,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x20,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_f_f32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_f_f32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x20,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x20,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_f_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_f_f32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x20,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x20,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_f_f32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_f_f32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x20,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x20,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_f_f32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_f_f32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x20,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x20,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_f_f32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_f_f32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x20,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x20,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_f_f32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_f_f32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x20,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x20,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_f_f32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_f_f32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x20,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x20,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_f_f32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_f_f32 vcc, v1, v2 row_ror:1 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x20,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x20,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_f_f32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_f_f32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x20,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x20,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_f_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_f_f32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x20,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x20,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_f_f32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x20,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_f_f32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x20,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x20,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x20,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_f_f32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x20,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_f_f32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x20,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x20,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x21,0x7c,0xff,0x6f,0xfd,0x30 # W32: v_cmp_f_f32 vcc_lo, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x21,0x7c,0xff,0x6f,0xfd,0x30] # W64: v_cmp_f_f32 vcc, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x21,0x7c,0xff,0x6f,0xfd,0x30] -0xfa,0xfe,0x21,0x7c,0xff,0x6f,0xfd,0x30 +0xfa,0x04,0x80,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_f_i32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_f_i32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x80,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x80,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_f_i32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_f_i32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x80,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x80,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_f_i32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_f_i32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x80,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x80,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_f_i32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_f_i32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x80,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x80,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_f_i32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_f_i32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x80,0x7c,0x01,0x01,0x01,0xff 
+0xfa,0x04,0x80,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_f_i32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_f_i32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x80,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x80,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_f_i32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_f_i32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x80,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x80,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_f_i32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_f_i32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x80,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x80,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_f_i32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_f_i32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x80,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x80,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_f_i32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_f_i32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x80,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x80,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_f_i32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_f_i32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x80,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x80,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_f_i32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x80,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_f_i32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x80,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x80,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x80,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_f_i32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x80,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_f_i32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x80,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x80,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x81,0x7c,0xff,0x6f,0x0d,0x30 # W32: v_cmp_f_i32 vcc_lo, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x81,0x7c,0xff,0x6f,0x0d,0x30] # W64: v_cmp_f_i32 vcc, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x81,0x7c,0xff,0x6f,0x0d,0x30] -0xfa,0xfe,0x81,0x7c,0xff,0x6f,0x0d,0x30 +0xfa,0x04,0x90,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_f_u32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_f_u32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x90,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x90,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_f_u32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x90,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_f_u32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x90,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x90,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_f_u32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_f_u32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x90,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x90,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_f_u32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_f_u32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x90,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x90,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_f_u32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_f_u32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x90,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x90,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_f_u32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_f_u32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x90,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x90,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_f_u32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_f_u32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x90,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x90,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_f_u32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_f_u32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x90,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x90,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_f_u32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_f_u32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x90,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x90,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_f_u32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_f_u32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x90,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x90,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_f_u32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_f_u32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x90,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x90,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_f_u32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x90,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_f_u32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x90,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x90,0x7c,0x01,0x5f,0x01,0x01 
+0xfa,0x04,0x90,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_f_u32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x90,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_f_u32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x90,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x90,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x91,0x7c,0xff,0x6f,0x0d,0x30 # W32: v_cmp_f_u32 vcc_lo, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x91,0x7c,0xff,0x6f,0x0d,0x30] # W64: v_cmp_f_u32 vcc, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x91,0x7c,0xff,0x6f,0x0d,0x30] -0xfa,0xfe,0x91,0x7c,0xff,0x6f,0x0d,0x30 +0xfa,0x04,0x0c,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_ge_f16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_ge_f16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x0c,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x0c,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_ge_f16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_ge_f16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x0c,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x0c,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_ge_f16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_ge_f16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x0c,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x0c,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_ge_f16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_ge_f16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x0c,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x0c,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_ge_f16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_ge_f16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x0c,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x0c,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_ge_f16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_ge_f16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x0c,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x0c,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_ge_f16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_ge_f16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x0c,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x0c,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_ge_f16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_ge_f16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x0c,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x0c,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_ge_f16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0c,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_ge_f16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x0c,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x0c,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_ge_f16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_ge_f16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x0c,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x0c,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_ge_f16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_ge_f16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x0c,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x0c,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_ge_f16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_ge_f16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x0c,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x0c,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_ge_f16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_ge_f16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x0c,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x0c,0x7c,0x7f,0x6f,0xfd,0x30 # W32: v_cmp_ge_f16 vcc_lo, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0c,0x7c,0x7f,0x6f,0xfd,0x30] # W64: v_cmp_ge_f16 vcc, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0c,0x7c,0x7f,0x6f,0xfd,0x30] -0xfa,0xfe,0x0c,0x7c,0x7f,0x6f,0xfd,0x30 +0xfa,0x04,0x2c,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_ge_f32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_ge_f32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x2c,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x2c,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_ge_f32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_ge_f32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x2c,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x2c,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_ge_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_ge_f32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x2c,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x2c,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_ge_f32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_ge_f32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x2c,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x2c,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_ge_f32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_ge_f32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x2c,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x2c,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_ge_f32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_ge_f32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x2c,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x2c,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_ge_f32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_ge_f32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x2c,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x2c,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_ge_f32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_ge_f32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x2c,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x2c,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_ge_f32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_ge_f32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x2c,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x2c,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_ge_f32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_ge_f32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x2c,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x2c,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_ge_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_ge_f32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x2c,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x2c,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_ge_f32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_ge_f32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x2c,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x2c,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_ge_f32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_ge_f32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x2c,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x2d,0x7c,0xff,0x6f,0xfd,0x30 # W32: v_cmp_ge_f32 vcc_lo, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x2d,0x7c,0xff,0x6f,0xfd,0x30] # W64: v_cmp_ge_f32 vcc, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x2d,0x7c,0xff,0x6f,0xfd,0x30] -0xfa,0xfe,0x2d,0x7c,0xff,0x6f,0xfd,0x30 +0xfa,0x04,0x6c,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_ge_i16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_ge_i16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x6c,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x6c,0x7c,0x01,0xe4,0x00,0xff # W32: 
v_cmp_ge_i16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_ge_i16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x6c,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x6c,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_ge_i16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_ge_i16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x6c,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x6c,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_ge_i16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_ge_i16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x6c,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x6c,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_ge_i16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_ge_i16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x6c,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x6c,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_ge_i16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_ge_i16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x6c,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x6c,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_ge_i16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_ge_i16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x6c,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x6c,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_ge_i16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_ge_i16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x6c,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x6c,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_ge_i16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_ge_i16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x6c,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x6c,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_ge_i16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_ge_i16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x6c,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x6c,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_ge_i16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_ge_i16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x6c,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x6c,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_ge_i16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_ge_i16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; 
encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x6c,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x6c,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_ge_i16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_ge_i16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x6c,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x6c,0x7c,0x7f,0x6f,0x0d,0x30 # W32: v_cmp_ge_i16 vcc_lo, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6c,0x7c,0x7f,0x6f,0x0d,0x30] # W64: v_cmp_ge_i16 vcc, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6c,0x7c,0x7f,0x6f,0x0d,0x30] -0xfa,0xfe,0x6c,0x7c,0x7f,0x6f,0x0d,0x30 +0xfa,0x04,0x8c,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_ge_i32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_ge_i32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x8c,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x8c,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_ge_i32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_ge_i32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x8c,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x8c,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_ge_i32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_ge_i32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x8c,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x8c,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_ge_i32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_ge_i32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x8c,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x8c,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_ge_i32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_ge_i32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x8c,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x8c,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_ge_i32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_ge_i32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x8c,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x8c,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_ge_i32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_ge_i32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x8c,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x8c,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_ge_i32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_ge_i32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x8c,0x7c,0x01,0x1f,0x01,0xff 
+0xfa,0x04,0x8c,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_ge_i32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_ge_i32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x8c,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x8c,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_ge_i32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_ge_i32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x8c,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x8c,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_ge_i32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_ge_i32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x8c,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x8c,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_ge_i32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_ge_i32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x8c,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x8c,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_ge_i32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_ge_i32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x8c,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x8d,0x7c,0xff,0x6f,0x0d,0x30 # W32: v_cmp_ge_i32 vcc_lo, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x8d,0x7c,0xff,0x6f,0x0d,0x30] # W64: v_cmp_ge_i32 vcc, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x8d,0x7c,0xff,0x6f,0x0d,0x30] -0xfa,0xfe,0x8d,0x7c,0xff,0x6f,0x0d,0x30 +0xfa,0x04,0x7c,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_ge_u16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_ge_u16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x7c,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x7c,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_ge_u16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_ge_u16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x7c,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x7c,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_ge_u16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_ge_u16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x7c,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x7c,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_ge_u16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_ge_u16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x7c,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x7c,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_ge_u16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x7c,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_ge_u16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x7c,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x7c,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_ge_u16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_ge_u16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x7c,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x7c,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_ge_u16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_ge_u16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x7c,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x7c,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_ge_u16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_ge_u16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x7c,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x7c,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_ge_u16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_ge_u16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x7c,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x7c,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_ge_u16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_ge_u16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x7c,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x7c,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_ge_u16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_ge_u16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x7c,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x7c,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_ge_u16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_ge_u16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x7c,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x7c,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_ge_u16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_ge_u16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x7c,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x7c,0x7c,0x7f,0x6f,0x0d,0x30 # W32: v_cmp_ge_u16 vcc_lo, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x7c,0x7c,0x7f,0x6f,0x0d,0x30] # W64: v_cmp_ge_u16 vcc, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x7c,0x7c,0x7f,0x6f,0x0d,0x30] -0xfa,0xfe,0x7c,0x7c,0x7f,0x6f,0x0d,0x30 +0xfa,0x04,0x9c,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_ge_u32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_ge_u32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x9c,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x9c,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x9c,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_ge_u32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_ge_u32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x9c,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x9c,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_ge_u32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_ge_u32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x9c,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x9c,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_ge_u32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_ge_u32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x9c,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x9c,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_ge_u32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_ge_u32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x9c,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x9c,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_ge_u32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_ge_u32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x9c,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x9c,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_ge_u32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_ge_u32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x9c,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x9c,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_ge_u32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_ge_u32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x9c,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x9c,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_ge_u32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_ge_u32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x9c,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x9c,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_ge_u32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_ge_u32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x9c,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x9c,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_ge_u32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_ge_u32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x9c,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x9c,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_ge_u32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; 
encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_ge_u32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x9c,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x9c,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_ge_u32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_ge_u32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x9c,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x9d,0x7c,0xff,0x6f,0x0d,0x30 # W32: v_cmp_ge_u32 vcc_lo, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x9d,0x7c,0xff,0x6f,0x0d,0x30] # W64: v_cmp_ge_u32 vcc, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x9d,0x7c,0xff,0x6f,0x0d,0x30] -0xfa,0xfe,0x9d,0x7c,0xff,0x6f,0x0d,0x30 +0xfa,0x04,0x08,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_gt_f16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_gt_f16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x08,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x08,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_gt_f16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_gt_f16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x08,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x08,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_gt_f16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_gt_f16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x08,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x08,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_gt_f16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_gt_f16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x08,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x08,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_gt_f16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_gt_f16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x08,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x08,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_gt_f16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_gt_f16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x08,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x08,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_gt_f16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_gt_f16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x08,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x08,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_gt_f16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_gt_f16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0x04,0x08,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x08,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x08,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_gt_f16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_gt_f16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x08,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x08,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_gt_f16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_gt_f16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x08,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x08,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_gt_f16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_gt_f16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x08,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x08,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_gt_f16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_gt_f16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x08,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x08,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_gt_f16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_gt_f16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x08,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x08,0x7c,0x7f,0x6f,0xfd,0x30 # W32: v_cmp_gt_f16 vcc_lo, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x08,0x7c,0x7f,0x6f,0xfd,0x30] # W64: v_cmp_gt_f16 vcc, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x08,0x7c,0x7f,0x6f,0xfd,0x30] -0xfa,0xfe,0x08,0x7c,0x7f,0x6f,0xfd,0x30 +0xfa,0x04,0x28,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_gt_f32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_gt_f32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x28,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x28,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_gt_f32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_gt_f32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x28,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x28,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_gt_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_gt_f32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x28,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x28,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_gt_f32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_gt_f32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x28,0x7c,0x01,0x41,0x01,0xff 
+0xfa,0x04,0x28,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_gt_f32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_gt_f32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x28,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x28,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_gt_f32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_gt_f32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x28,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x28,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_gt_f32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_gt_f32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x28,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x28,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_gt_f32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_gt_f32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x28,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x28,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_gt_f32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_gt_f32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x28,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x28,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_gt_f32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_gt_f32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x28,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x28,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_gt_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_gt_f32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x28,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x28,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_gt_f32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_gt_f32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x28,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x28,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_gt_f32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_gt_f32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x28,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x29,0x7c,0xff,0x6f,0xfd,0x30 # W32: v_cmp_gt_f32 vcc_lo, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x29,0x7c,0xff,0x6f,0xfd,0x30] # W64: v_cmp_gt_f32 vcc, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x29,0x7c,0xff,0x6f,0xfd,0x30] -0xfa,0xfe,0x29,0x7c,0xff,0x6f,0xfd,0x30 +0xfa,0x04,0x68,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_gt_i16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x68,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_gt_i16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x68,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x68,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_gt_i16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_gt_i16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x68,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x68,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_gt_i16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_gt_i16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x68,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x68,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_gt_i16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_gt_i16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x68,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x68,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_gt_i16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_gt_i16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x68,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x68,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_gt_i16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_gt_i16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x68,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x68,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_gt_i16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_gt_i16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x68,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x68,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_gt_i16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_gt_i16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x68,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x68,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_gt_i16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_gt_i16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x68,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x68,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_gt_i16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_gt_i16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x68,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x68,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_gt_i16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_gt_i16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x50,0x01,0xff] 
-0xfa,0x04,0x68,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x68,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_gt_i16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_gt_i16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x68,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x68,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_gt_i16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_gt_i16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x68,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x68,0x7c,0x7f,0x6f,0x0d,0x30 # W32: v_cmp_gt_i16 vcc_lo, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x68,0x7c,0x7f,0x6f,0x0d,0x30] # W64: v_cmp_gt_i16 vcc, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x68,0x7c,0x7f,0x6f,0x0d,0x30] -0xfa,0xfe,0x68,0x7c,0x7f,0x6f,0x0d,0x30 +0xfa,0x04,0x88,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_gt_i32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_gt_i32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x88,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x88,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_gt_i32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_gt_i32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x88,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x88,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_gt_i32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_gt_i32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x88,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x88,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_gt_i32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_gt_i32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x88,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x88,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_gt_i32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_gt_i32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x88,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x88,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_gt_i32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_gt_i32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x88,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x88,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_gt_i32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_gt_i32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x88,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x88,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_gt_i32 vcc_lo, v1, v2 row_shr:15 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_gt_i32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x88,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x88,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_gt_i32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_gt_i32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x88,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x88,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_gt_i32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_gt_i32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x88,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x88,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_gt_i32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_gt_i32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x88,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x88,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_gt_i32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_gt_i32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x88,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x88,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_gt_i32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_gt_i32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x88,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x89,0x7c,0xff,0x6f,0x0d,0x30 # W32: v_cmp_gt_i32 vcc_lo, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x89,0x7c,0xff,0x6f,0x0d,0x30] # W64: v_cmp_gt_i32 vcc, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x89,0x7c,0xff,0x6f,0x0d,0x30] -0xfa,0xfe,0x89,0x7c,0xff,0x6f,0x0d,0x30 +0xfa,0x04,0x78,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_gt_u16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_gt_u16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x78,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x78,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_gt_u16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_gt_u16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x78,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x78,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_gt_u16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_gt_u16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x78,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x78,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_gt_u16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_gt_u16 vcc, v1, v2 row_half_mirror 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x78,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x78,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_gt_u16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_gt_u16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x78,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x78,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_gt_u16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_gt_u16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x78,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x78,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_gt_u16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_gt_u16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x78,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x78,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_gt_u16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_gt_u16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x78,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x78,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_gt_u16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_gt_u16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x78,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x78,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_gt_u16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_gt_u16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x78,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x78,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_gt_u16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_gt_u16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x78,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x78,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_gt_u16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_gt_u16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x78,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x78,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_gt_u16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_gt_u16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x78,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x78,0x7c,0x7f,0x6f,0x0d,0x30 # W32: v_cmp_gt_u16 vcc_lo, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x78,0x7c,0x7f,0x6f,0x0d,0x30] # W64: v_cmp_gt_u16 vcc, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x78,0x7c,0x7f,0x6f,0x0d,0x30] -0xfa,0xfe,0x78,0x7c,0x7f,0x6f,0x0d,0x30 +0xfa,0x04,0x98,0x7c,0x01,0x1b,0x00,0xff # W32: 
v_cmp_gt_u32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_gt_u32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x98,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x98,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_gt_u32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_gt_u32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x98,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x98,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_gt_u32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_gt_u32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x98,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x98,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_gt_u32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_gt_u32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x98,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x98,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_gt_u32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_gt_u32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x98,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x98,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_gt_u32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_gt_u32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x98,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x98,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_gt_u32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_gt_u32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x98,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x98,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_gt_u32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_gt_u32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x98,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x98,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_gt_u32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_gt_u32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x98,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x98,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_gt_u32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_gt_u32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x98,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x98,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_gt_u32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_gt_u32 vcc, v1, v2 row_share:0 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x98,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x98,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_gt_u32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_gt_u32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x98,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x98,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_gt_u32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_gt_u32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x98,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x99,0x7c,0xff,0x6f,0x0d,0x30 # W32: v_cmp_gt_u32 vcc_lo, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x99,0x7c,0xff,0x6f,0x0d,0x30] # W64: v_cmp_gt_u32 vcc, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x99,0x7c,0xff,0x6f,0x0d,0x30] -0xfa,0xfe,0x99,0x7c,0xff,0x6f,0x0d,0x30 +0xfa,0x04,0x06,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_le_f16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_le_f16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x06,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x06,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_le_f16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_le_f16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x06,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x06,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_le_f16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_le_f16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x06,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x06,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_le_f16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_le_f16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x06,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x06,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_le_f16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_le_f16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x06,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x06,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_le_f16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_le_f16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x06,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x06,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_le_f16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_le_f16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x06,0x7c,0x01,0x11,0x01,0xff 
+0xfa,0x04,0x06,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_le_f16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_le_f16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x06,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x06,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_le_f16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_le_f16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x06,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x06,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_le_f16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_le_f16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x06,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x06,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_le_f16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_le_f16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x06,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x06,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_le_f16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_le_f16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x06,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x06,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_le_f16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_le_f16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x06,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x06,0x7c,0x7f,0x6f,0xfd,0x30 # W32: v_cmp_le_f16 vcc_lo, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x06,0x7c,0x7f,0x6f,0xfd,0x30] # W64: v_cmp_le_f16 vcc, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x06,0x7c,0x7f,0x6f,0xfd,0x30] -0xfa,0xfe,0x06,0x7c,0x7f,0x6f,0xfd,0x30 +0xfa,0x04,0x26,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_le_f32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_le_f32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x26,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x26,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_le_f32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_le_f32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x26,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x26,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_le_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_le_f32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x26,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x26,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_le_f32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0x04,0x26,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_le_f32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x26,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x26,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_le_f32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_le_f32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x26,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x26,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_le_f32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_le_f32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x26,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x26,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_le_f32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_le_f32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x26,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x26,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_le_f32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_le_f32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x26,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x26,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_le_f32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_le_f32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x26,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x26,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_le_f32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_le_f32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x26,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x26,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_le_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_le_f32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x26,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x26,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_le_f32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_le_f32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x26,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x26,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_le_f32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_le_f32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x26,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x27,0x7c,0xff,0x6f,0xfd,0x30 # W32: v_cmp_le_f32 vcc_lo, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x27,0x7c,0xff,0x6f,0xfd,0x30] # W64: v_cmp_le_f32 vcc, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0xfa,0xfe,0x27,0x7c,0xff,0x6f,0xfd,0x30] -0xfa,0xfe,0x27,0x7c,0xff,0x6f,0xfd,0x30 +0xfa,0x04,0x66,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_le_i16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_le_i16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x66,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x66,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_le_i16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_le_i16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x66,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x66,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_le_i16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_le_i16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x66,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x66,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_le_i16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_le_i16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x66,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x66,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_le_i16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_le_i16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x66,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x66,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_le_i16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_le_i16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x66,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x66,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_le_i16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_le_i16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x66,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x66,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_le_i16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_le_i16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x66,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x66,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_le_i16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_le_i16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x66,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x66,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_le_i16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_le_i16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x66,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x66,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_le_i16 vcc_lo, v1, v2 row_share:0 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_le_i16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x66,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x66,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_le_i16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_le_i16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x66,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x66,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_le_i16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_le_i16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x66,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x66,0x7c,0x7f,0x6f,0x0d,0x30 # W32: v_cmp_le_i16 vcc_lo, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x66,0x7c,0x7f,0x6f,0x0d,0x30] # W64: v_cmp_le_i16 vcc, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x66,0x7c,0x7f,0x6f,0x0d,0x30] -0xfa,0xfe,0x66,0x7c,0x7f,0x6f,0x0d,0x30 +0xfa,0x04,0x86,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_le_i32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_le_i32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x86,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x86,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_le_i32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_le_i32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x86,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x86,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_le_i32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_le_i32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x86,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x86,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_le_i32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_le_i32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x86,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x86,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_le_i32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_le_i32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x86,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x86,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_le_i32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_le_i32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x86,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x86,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_le_i32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_le_i32 vcc, v1, v2 row_shr:1 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x86,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x86,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_le_i32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_le_i32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x86,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x86,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_le_i32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_le_i32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x86,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x86,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_le_i32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_le_i32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x86,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x86,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_le_i32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_le_i32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x86,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x86,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_le_i32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_le_i32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x86,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x86,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_le_i32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_le_i32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x86,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x87,0x7c,0xff,0x6f,0x0d,0x30 # W32: v_cmp_le_i32 vcc_lo, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x87,0x7c,0xff,0x6f,0x0d,0x30] # W64: v_cmp_le_i32 vcc, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x87,0x7c,0xff,0x6f,0x0d,0x30] -0xfa,0xfe,0x87,0x7c,0xff,0x6f,0x0d,0x30 +0xfa,0x04,0x76,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_le_u16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_le_u16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x76,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x76,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_le_u16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_le_u16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x76,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x76,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_le_u16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_le_u16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x76,0x7c,0x01,0x40,0x01,0xff 
+0xfa,0x04,0x76,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_le_u16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_le_u16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x76,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x76,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_le_u16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_le_u16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x76,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x76,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_le_u16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_le_u16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x76,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x76,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_le_u16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_le_u16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x76,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x76,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_le_u16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_le_u16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x76,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x76,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_le_u16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_le_u16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x76,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x76,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_le_u16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_le_u16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x76,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x76,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_le_u16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_le_u16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x76,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x76,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_le_u16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_le_u16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x76,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x76,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_le_u16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_le_u16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x76,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x76,0x7c,0x7f,0x6f,0x0d,0x30 # W32: v_cmp_le_u16 vcc_lo, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x76,0x7c,0x7f,0x6f,0x0d,0x30] # W64: v_cmp_le_u16 
vcc, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x76,0x7c,0x7f,0x6f,0x0d,0x30] -0xfa,0xfe,0x76,0x7c,0x7f,0x6f,0x0d,0x30 +0xfa,0x04,0x96,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_le_u32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_le_u32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x96,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x96,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_le_u32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_le_u32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x96,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x96,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_le_u32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_le_u32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x96,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x96,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_le_u32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_le_u32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x96,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x96,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_le_u32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_le_u32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x96,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x96,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_le_u32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_le_u32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x96,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x96,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_le_u32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_le_u32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x96,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x96,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_le_u32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_le_u32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x96,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x96,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_le_u32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_le_u32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x96,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x96,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_le_u32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_le_u32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x96,0x7c,0x01,0x2f,0x01,0xff 
+0xfa,0x04,0x96,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_le_u32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_le_u32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x96,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x96,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_le_u32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_le_u32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x96,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x96,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_le_u32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_le_u32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x96,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x97,0x7c,0xff,0x6f,0x0d,0x30 # W32: v_cmp_le_u32 vcc_lo, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x97,0x7c,0xff,0x6f,0x0d,0x30] # W64: v_cmp_le_u32 vcc, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x97,0x7c,0xff,0x6f,0x0d,0x30] -0xfa,0xfe,0x97,0x7c,0xff,0x6f,0x0d,0x30 +0xfa,0x04,0x0a,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_lg_f16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_lg_f16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x0a,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x0a,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_lg_f16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_lg_f16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x0a,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x0a,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_lg_f16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_lg_f16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x0a,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x0a,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_lg_f16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_lg_f16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x0a,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x0a,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_lg_f16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_lg_f16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x0a,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x0a,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_lg_f16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_lg_f16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x0a,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x0a,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_lg_f16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_lg_f16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x0a,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x0a,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_lg_f16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_lg_f16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x0a,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x0a,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_lg_f16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_lg_f16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x0a,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x0a,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_lg_f16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_lg_f16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x0a,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x0a,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_lg_f16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_lg_f16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x0a,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x0a,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_lg_f16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_lg_f16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x0a,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x0a,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_lg_f16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_lg_f16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x0a,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x0a,0x7c,0x7f,0x6f,0xfd,0x30 # W32: v_cmp_lg_f16 vcc_lo, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0a,0x7c,0x7f,0x6f,0xfd,0x30] # W64: v_cmp_lg_f16 vcc, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0a,0x7c,0x7f,0x6f,0xfd,0x30] -0xfa,0xfe,0x0a,0x7c,0x7f,0x6f,0xfd,0x30 +0xfa,0x04,0x2a,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_lg_f32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_lg_f32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x2a,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x2a,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_lg_f32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_lg_f32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x2a,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x2a,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_lg_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_lg_f32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x2a,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x2a,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x2a,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_lg_f32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_lg_f32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x2a,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x2a,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_lg_f32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_lg_f32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x2a,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x2a,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_lg_f32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_lg_f32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x2a,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x2a,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_lg_f32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_lg_f32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x2a,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x2a,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_lg_f32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_lg_f32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x2a,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x2a,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_lg_f32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_lg_f32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x2a,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x2a,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_lg_f32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_lg_f32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x2a,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x2a,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_lg_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_lg_f32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x2a,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x2a,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_lg_f32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_lg_f32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x2a,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x2a,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_lg_f32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_lg_f32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x2a,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x2b,0x7c,0xff,0x6f,0xfd,0x30 # W32: v_cmp_lg_f32 vcc_lo, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 
bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x2b,0x7c,0xff,0x6f,0xfd,0x30] # W64: v_cmp_lg_f32 vcc, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x2b,0x7c,0xff,0x6f,0xfd,0x30] -0xfa,0xfe,0x2b,0x7c,0xff,0x6f,0xfd,0x30 +0xfa,0x04,0x02,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_lt_f16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_lt_f16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x02,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x02,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_lt_f16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_lt_f16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x02,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x02,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_lt_f16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_lt_f16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x02,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x02,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_lt_f16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_lt_f16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x02,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x02,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_lt_f16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_lt_f16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x02,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x02,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_lt_f16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_lt_f16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x02,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x02,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_lt_f16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_lt_f16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x02,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x02,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_lt_f16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_lt_f16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x02,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x02,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_lt_f16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_lt_f16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x02,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x02,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_lt_f16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_lt_f16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x02,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x02,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x02,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_lt_f16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_lt_f16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x02,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x02,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_lt_f16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_lt_f16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x02,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x02,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_lt_f16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_lt_f16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x02,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x02,0x7c,0x7f,0x6f,0xfd,0x30 # W32: v_cmp_lt_f16 vcc_lo, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x02,0x7c,0x7f,0x6f,0xfd,0x30] # W64: v_cmp_lt_f16 vcc, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x02,0x7c,0x7f,0x6f,0xfd,0x30] -0xfa,0xfe,0x02,0x7c,0x7f,0x6f,0xfd,0x30 +0xfa,0x04,0x22,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_lt_f32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_lt_f32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x22,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x22,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_lt_f32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_lt_f32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x22,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x22,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_lt_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_lt_f32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x22,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x22,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_lt_f32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_lt_f32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x22,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x22,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_lt_f32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_lt_f32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x22,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x22,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_lt_f32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_lt_f32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x22,0x7c,0x01,0x0f,0x01,0xff 
+0xfa,0x04,0x22,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_lt_f32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_lt_f32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x22,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x22,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_lt_f32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_lt_f32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x22,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x22,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_lt_f32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_lt_f32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x22,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x22,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_lt_f32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_lt_f32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x22,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x22,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_lt_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_lt_f32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x22,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x22,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_lt_f32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_lt_f32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x22,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x22,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_lt_f32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_lt_f32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x22,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x23,0x7c,0xff,0x6f,0xfd,0x30 # W32: v_cmp_lt_f32 vcc_lo, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x23,0x7c,0xff,0x6f,0xfd,0x30] # W64: v_cmp_lt_f32 vcc, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x23,0x7c,0xff,0x6f,0xfd,0x30] -0xfa,0xfe,0x23,0x7c,0xff,0x6f,0xfd,0x30 +0xfa,0x04,0x62,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_lt_i16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_lt_i16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x62,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x62,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_lt_i16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_lt_i16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x62,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x62,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_lt_i16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0x04,0x62,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_lt_i16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x62,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x62,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_lt_i16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_lt_i16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x62,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x62,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_lt_i16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_lt_i16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x62,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x62,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_lt_i16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_lt_i16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x62,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x62,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_lt_i16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_lt_i16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x62,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x62,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_lt_i16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_lt_i16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x62,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x62,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_lt_i16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_lt_i16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x62,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x62,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_lt_i16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_lt_i16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x62,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x62,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_lt_i16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_lt_i16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x62,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x62,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_lt_i16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_lt_i16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x62,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x62,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_lt_i16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_lt_i16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x60,0x01,0x13] 
-0xfa,0x04,0x62,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x62,0x7c,0x7f,0x6f,0x0d,0x30 # W32: v_cmp_lt_i16 vcc_lo, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x62,0x7c,0x7f,0x6f,0x0d,0x30] # W64: v_cmp_lt_i16 vcc, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x62,0x7c,0x7f,0x6f,0x0d,0x30] -0xfa,0xfe,0x62,0x7c,0x7f,0x6f,0x0d,0x30 +0xfa,0x04,0x82,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_lt_i32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_lt_i32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x82,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x82,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_lt_i32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_lt_i32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x82,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x82,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_lt_i32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_lt_i32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x82,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x82,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_lt_i32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_lt_i32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x82,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x82,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_lt_i32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_lt_i32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x82,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x82,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_lt_i32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_lt_i32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x82,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x82,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_lt_i32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_lt_i32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x82,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x82,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_lt_i32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_lt_i32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x82,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x82,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_lt_i32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_lt_i32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x82,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x82,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_lt_i32 vcc_lo, v1, v2 row_ror:15 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_lt_i32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x82,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x82,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_lt_i32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_lt_i32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x82,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x82,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_lt_i32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_lt_i32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x82,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x82,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_lt_i32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_lt_i32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x82,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x83,0x7c,0xff,0x6f,0x0d,0x30 # W32: v_cmp_lt_i32 vcc_lo, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x83,0x7c,0xff,0x6f,0x0d,0x30] # W64: v_cmp_lt_i32 vcc, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x83,0x7c,0xff,0x6f,0x0d,0x30] -0xfa,0xfe,0x83,0x7c,0xff,0x6f,0x0d,0x30 +0xfa,0x04,0x72,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_lt_u16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_lt_u16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x72,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x72,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_lt_u16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_lt_u16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x72,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x72,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_lt_u16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_lt_u16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x72,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x72,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_lt_u16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_lt_u16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x72,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x72,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_lt_u16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_lt_u16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x72,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x72,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_lt_u16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_lt_u16 vcc, v1, v2 row_shl:15 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x72,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x72,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_lt_u16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_lt_u16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x72,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x72,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_lt_u16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_lt_u16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x72,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x72,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_lt_u16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_lt_u16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x72,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x72,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_lt_u16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_lt_u16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x72,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x72,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_lt_u16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_lt_u16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x72,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x72,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_lt_u16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_lt_u16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x72,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x72,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_lt_u16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_lt_u16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x72,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x72,0x7c,0x7f,0x6f,0x0d,0x30 # W32: v_cmp_lt_u16 vcc_lo, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x72,0x7c,0x7f,0x6f,0x0d,0x30] # W64: v_cmp_lt_u16 vcc, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x72,0x7c,0x7f,0x6f,0x0d,0x30] -0xfa,0xfe,0x72,0x7c,0x7f,0x6f,0x0d,0x30 +0xfa,0x04,0x92,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_lt_u32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_lt_u32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x92,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x92,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_lt_u32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_lt_u32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x92,0x7c,0x01,0xe4,0x00,0xff 
+0xfa,0x04,0x92,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_lt_u32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_lt_u32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x92,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x92,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_lt_u32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_lt_u32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x92,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x92,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_lt_u32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_lt_u32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x92,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x92,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_lt_u32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_lt_u32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x92,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x92,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_lt_u32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_lt_u32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x92,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x92,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_lt_u32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_lt_u32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x92,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x92,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_lt_u32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_lt_u32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x92,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x92,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_lt_u32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_lt_u32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x92,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x92,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_lt_u32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_lt_u32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x92,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x92,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_lt_u32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_lt_u32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x92,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x92,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_lt_u32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_lt_u32 vcc, v1, v2 row_xmask:0 
row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x92,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x93,0x7c,0xff,0x6f,0x0d,0x30 # W32: v_cmp_lt_u32 vcc_lo, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x93,0x7c,0xff,0x6f,0x0d,0x30] # W64: v_cmp_lt_u32 vcc, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x93,0x7c,0xff,0x6f,0x0d,0x30] -0xfa,0xfe,0x93,0x7c,0xff,0x6f,0x0d,0x30 +0xfa,0x04,0x6a,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_ne_i16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_ne_i16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x6a,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x6a,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_ne_i16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_ne_i16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x6a,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x6a,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_ne_i16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_ne_i16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x6a,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x6a,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_ne_i16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_ne_i16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x6a,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x6a,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_ne_i16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_ne_i16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x6a,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x6a,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_ne_i16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_ne_i16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x6a,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x6a,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_ne_i16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_ne_i16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x6a,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x6a,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_ne_i16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_ne_i16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x6a,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x6a,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_ne_i16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_ne_i16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x6a,0x7c,0x01,0x21,0x01,0xff 
+0xfa,0x04,0x6a,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_ne_i16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_ne_i16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x6a,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x6a,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_ne_i16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_ne_i16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x6a,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x6a,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_ne_i16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_ne_i16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x6a,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x6a,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_ne_i16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_ne_i16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x6a,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x6a,0x7c,0x7f,0x6f,0x0d,0x30 # W32: v_cmp_ne_i16 vcc_lo, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6a,0x7c,0x7f,0x6f,0x0d,0x30] # W64: v_cmp_ne_i16 vcc, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6a,0x7c,0x7f,0x6f,0x0d,0x30] -0xfa,0xfe,0x6a,0x7c,0x7f,0x6f,0x0d,0x30 +0xfa,0x04,0x8a,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_ne_i32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_ne_i32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x8a,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x8a,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_ne_i32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_ne_i32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x8a,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x8a,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_ne_i32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_ne_i32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x8a,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x8a,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_ne_i32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_ne_i32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x8a,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x8a,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_ne_i32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_ne_i32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x8a,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x8a,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_ne_i32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ne_i32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x8a,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x8a,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_ne_i32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_ne_i32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x8a,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x8a,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_ne_i32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_ne_i32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x8a,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x8a,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_ne_i32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_ne_i32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x8a,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x8a,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_ne_i32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_ne_i32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x8a,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x8a,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_ne_i32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_ne_i32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x8a,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x8a,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_ne_i32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_ne_i32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x8a,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x8a,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_ne_i32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_ne_i32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x8a,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x8b,0x7c,0xff,0x6f,0x0d,0x30
# W32: v_cmp_ne_i32 vcc_lo, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x8b,0x7c,0xff,0x6f,0x0d,0x30]
# W64: v_cmp_ne_i32 vcc, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x8b,0x7c,0xff,0x6f,0x0d,0x30]
-0xfa,0xfe,0x8b,0x7c,0xff,0x6f,0x0d,0x30
+0xfa,0x04,0x7a,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_ne_u16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_ne_u16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x7a,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x7a,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_ne_u16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_ne_u16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x7a,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x7a,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_ne_u16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_ne_u16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x7a,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x7a,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_ne_u16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_ne_u16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x7a,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x7a,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_ne_u16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_ne_u16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x7a,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x7a,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_ne_u16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ne_u16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x7a,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x7a,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_ne_u16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_ne_u16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x7a,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x7a,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_ne_u16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_ne_u16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x7a,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x7a,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_ne_u16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_ne_u16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x7a,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x7a,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_ne_u16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_ne_u16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x7a,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x7a,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_ne_u16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_ne_u16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x7a,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x7a,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_ne_u16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_ne_u16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x7a,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x7a,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_ne_u16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_ne_u16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x7a,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x7a,0x7c,0x7f,0x6f,0x0d,0x30
# W32: v_cmp_ne_u16 vcc_lo, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x7a,0x7c,0x7f,0x6f,0x0d,0x30]
# W64: v_cmp_ne_u16 vcc, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x7a,0x7c,0x7f,0x6f,0x0d,0x30]
-0xfa,0xfe,0x7a,0x7c,0x7f,0x6f,0x0d,0x30
+0xfa,0x04,0x9a,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_ne_u32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_ne_u32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x9a,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x9a,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_ne_u32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_ne_u32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x9a,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x9a,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_ne_u32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_ne_u32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x9a,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x9a,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_ne_u32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_ne_u32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x9a,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x9a,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_ne_u32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_ne_u32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x9a,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x9a,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_ne_u32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ne_u32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x9a,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x9a,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_ne_u32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_ne_u32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x9a,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x9a,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_ne_u32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_ne_u32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x9a,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x9a,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_ne_u32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_ne_u32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x9a,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x9a,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_ne_u32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_ne_u32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x9a,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x9a,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_ne_u32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_ne_u32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x9a,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x9a,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_ne_u32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_ne_u32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x9a,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x9a,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_ne_u32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_ne_u32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x9a,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x9b,0x7c,0xff,0x6f,0x0d,0x30
# W32: v_cmp_ne_u32 vcc_lo, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x9b,0x7c,0xff,0x6f,0x0d,0x30]
# W64: v_cmp_ne_u32 vcc, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x9b,0x7c,0xff,0x6f,0x0d,0x30]
-0xfa,0xfe,0x9b,0x7c,0xff,0x6f,0x0d,0x30
+0xfa,0x04,0x1a,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_neq_f16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_neq_f16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x1a,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x1a,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_neq_f16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_neq_f16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x1a,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x1a,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_neq_f16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_neq_f16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x1a,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x1a,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_neq_f16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_neq_f16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x1a,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x1a,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_neq_f16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_neq_f16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x1a,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x1a,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_neq_f16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_neq_f16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x1a,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x1a,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_neq_f16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_neq_f16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x1a,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x1a,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_neq_f16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_neq_f16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x1a,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x1a,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_neq_f16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_neq_f16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x1a,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x1a,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_neq_f16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_neq_f16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x1a,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x1a,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_neq_f16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_neq_f16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x1a,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x1a,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_neq_f16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_neq_f16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x1a,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x1a,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_neq_f16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_neq_f16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x1a,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x1a,0x7c,0x7f,0x6f,0xfd,0x30
# W32: v_cmp_neq_f16 vcc_lo, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1a,0x7c,0x7f,0x6f,0xfd,0x30]
# W64: v_cmp_neq_f16 vcc, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1a,0x7c,0x7f,0x6f,0xfd,0x30]
-0xfa,0xfe,0x1a,0x7c,0x7f,0x6f,0xfd,0x30
+0xfa,0x04,0x3a,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_neq_f32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_neq_f32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x3a,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x3a,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_neq_f32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_neq_f32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x3a,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x3a,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_neq_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_neq_f32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x3a,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x3a,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_neq_f32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_neq_f32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x3a,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x3a,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_neq_f32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_neq_f32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x3a,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x3a,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_neq_f32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_neq_f32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x3a,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x3a,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_neq_f32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_neq_f32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x3a,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x3a,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_neq_f32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_neq_f32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x3a,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x3a,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_neq_f32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_neq_f32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x3a,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x3a,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_neq_f32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_neq_f32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x3a,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x3a,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_neq_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_neq_f32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x3a,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x3a,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_neq_f32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_neq_f32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x3a,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x3a,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_neq_f32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_neq_f32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x3a,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x3b,0x7c,0xff,0x6f,0xfd,0x30
# W32: v_cmp_neq_f32 vcc_lo, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x3b,0x7c,0xff,0x6f,0xfd,0x30]
# W64: v_cmp_neq_f32 vcc, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x3b,0x7c,0xff,0x6f,0xfd,0x30]
-0xfa,0xfe,0x3b,0x7c,0xff,0x6f,0xfd,0x30
+0xfa,0x04,0x12,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_nge_f16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_nge_f16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x12,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x12,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_nge_f16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_nge_f16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x12,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x12,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_nge_f16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_nge_f16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x12,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x12,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_nge_f16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_nge_f16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x12,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x12,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_nge_f16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_nge_f16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x12,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x12,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_nge_f16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_nge_f16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x12,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x12,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_nge_f16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_nge_f16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x12,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x12,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_nge_f16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_nge_f16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x12,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x12,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_nge_f16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_nge_f16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x12,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x12,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_nge_f16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_nge_f16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x12,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x12,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_nge_f16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_nge_f16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x12,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x12,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_nge_f16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_nge_f16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x12,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x12,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_nge_f16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_nge_f16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x12,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x12,0x7c,0x7f,0x6f,0xfd,0x30
# W32: v_cmp_nge_f16 vcc_lo, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x12,0x7c,0x7f,0x6f,0xfd,0x30]
# W64: v_cmp_nge_f16 vcc, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x12,0x7c,0x7f,0x6f,0xfd,0x30]
-0xfa,0xfe,0x12,0x7c,0x7f,0x6f,0xfd,0x30
+0xfa,0x04,0x32,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_nge_f32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_nge_f32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x32,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x32,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_nge_f32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_nge_f32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x32,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x32,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_nge_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_nge_f32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x32,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x32,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_nge_f32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_nge_f32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x32,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x32,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_nge_f32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_nge_f32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x32,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x32,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_nge_f32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_nge_f32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x32,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x32,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_nge_f32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_nge_f32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x32,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x32,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_nge_f32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_nge_f32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x32,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x32,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_nge_f32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_nge_f32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x32,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x32,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_nge_f32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_nge_f32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x32,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x32,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_nge_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_nge_f32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x32,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x32,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_nge_f32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_nge_f32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x32,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x32,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_nge_f32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_nge_f32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x32,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x33,0x7c,0xff,0x6f,0xfd,0x30
# W32: v_cmp_nge_f32 vcc_lo, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x33,0x7c,0xff,0x6f,0xfd,0x30]
# W64: v_cmp_nge_f32 vcc, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x33,0x7c,0xff,0x6f,0xfd,0x30]
-0xfa,0xfe,0x33,0x7c,0xff,0x6f,0xfd,0x30
+0xfa,0x04,0x16,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_ngt_f16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_ngt_f16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x16,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x16,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_ngt_f16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_ngt_f16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x16,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x16,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_ngt_f16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_ngt_f16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x16,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x16,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_ngt_f16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_ngt_f16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x16,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x16,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_ngt_f16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_ngt_f16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x16,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x16,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_ngt_f16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ngt_f16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x16,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x16,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_ngt_f16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_ngt_f16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x16,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x16,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_ngt_f16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_ngt_f16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x16,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x16,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_ngt_f16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_ngt_f16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x16,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x16,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_ngt_f16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_ngt_f16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x16,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x16,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_ngt_f16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_ngt_f16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x16,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x16,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_ngt_f16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_ngt_f16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x16,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x16,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_ngt_f16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_ngt_f16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x16,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x16,0x7c,0x7f,0x6f,0xfd,0x30
# W32: v_cmp_ngt_f16 vcc_lo, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x16,0x7c,0x7f,0x6f,0xfd,0x30]
# W64: v_cmp_ngt_f16 vcc, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x16,0x7c,0x7f,0x6f,0xfd,0x30]
-0xfa,0xfe,0x16,0x7c,0x7f,0x6f,0xfd,0x30
+0xfa,0x04,0x36,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_ngt_f32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_ngt_f32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x36,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x36,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_ngt_f32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_ngt_f32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x36,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x36,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_ngt_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_ngt_f32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x36,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x36,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_ngt_f32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_ngt_f32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x36,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x36,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_ngt_f32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_ngt_f32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x36,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x36,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_ngt_f32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ngt_f32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x36,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x36,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_ngt_f32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_ngt_f32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x36,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x36,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_ngt_f32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_ngt_f32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x36,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x36,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_ngt_f32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_ngt_f32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x36,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x36,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_ngt_f32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_ngt_f32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x36,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x36,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_ngt_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_ngt_f32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x36,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x36,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_ngt_f32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_ngt_f32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x36,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x36,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_ngt_f32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_ngt_f32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x36,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x37,0x7c,0xff,0x6f,0xfd,0x30
# W32: v_cmp_ngt_f32 vcc_lo, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x37,0x7c,0xff,0x6f,0xfd,0x30]
# W64: v_cmp_ngt_f32 vcc, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x37,0x7c,0xff,0x6f,0xfd,0x30]
-0xfa,0xfe,0x37,0x7c,0xff,0x6f,0xfd,0x30
+0xfa,0x04,0x18,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_nle_f16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_nle_f16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x18,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x18,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_nle_f16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_nle_f16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x18,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x18,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_nle_f16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_nle_f16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x18,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x18,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_nle_f16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_nle_f16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x18,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x18,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_nle_f16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_nle_f16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x18,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x18,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_nle_f16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_nle_f16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x18,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x18,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_nle_f16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_nle_f16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x18,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x18,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_nle_f16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_nle_f16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x18,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x18,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_nle_f16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_nle_f16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x18,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x18,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_nle_f16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_nle_f16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x18,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x18,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_nle_f16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_nle_f16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x18,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x18,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_nle_f16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_nle_f16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x18,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x18,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_nle_f16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_nle_f16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x18,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x18,0x7c,0x7f,0x6f,0xfd,0x30
# W32: v_cmp_nle_f16 vcc_lo, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x18,0x7c,0x7f,0x6f,0xfd,0x30]
# W64: v_cmp_nle_f16 vcc, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x18,0x7c,0x7f,0x6f,0xfd,0x30]
-0xfa,0xfe,0x18,0x7c,0x7f,0x6f,0xfd,0x30
+0xfa,0x04,0x38,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_nle_f32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_nle_f32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x38,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x38,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_nle_f32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_nle_f32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x38,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x38,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_nle_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_nle_f32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x38,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x38,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_nle_f32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_nle_f32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x38,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x38,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_nle_f32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_nle_f32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x38,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x38,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_nle_f32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_nle_f32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x38,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x38,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_nle_f32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_nle_f32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x38,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x38,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_nle_f32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_nle_f32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x38,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x38,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_nle_f32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_nle_f32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x38,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x38,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_nle_f32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_nle_f32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x38,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x38,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_nle_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_nle_f32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x38,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x38,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_nle_f32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_nle_f32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x38,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x38,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_nle_f32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_nle_f32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x38,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x39,0x7c,0xff,0x6f,0xfd,0x30
# W32: v_cmp_nle_f32 vcc_lo, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x39,0x7c,0xff,0x6f,0xfd,0x30]
# W64: v_cmp_nle_f32 vcc, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x39,0x7c,0xff,0x6f,0xfd,0x30]
-0xfa,0xfe,0x39,0x7c,0xff,0x6f,0xfd,0x30
+0xfa,0x04,0x14,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_nlg_f16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_nlg_f16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x14,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x14,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_nlg_f16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_nlg_f16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x14,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x14,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_nlg_f16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_nlg_f16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x14,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x14,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_nlg_f16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_nlg_f16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x14,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x14,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_nlg_f16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_nlg_f16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x14,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x14,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_nlg_f16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_nlg_f16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x14,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x14,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_nlg_f16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_nlg_f16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x14,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x14,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_nlg_f16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_nlg_f16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x14,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x14,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_nlg_f16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_nlg_f16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x14,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x14,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_nlg_f16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_nlg_f16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x14,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x14,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_nlg_f16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_nlg_f16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x14,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x14,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_nlg_f16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_nlg_f16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x14,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x14,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_nlg_f16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_nlg_f16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x14,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x14,0x7c,0x7f,0x6f,0xfd,0x30
# W32: v_cmp_nlg_f16 vcc_lo, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x14,0x7c,0x7f,0x6f,0xfd,0x30]
# W64: v_cmp_nlg_f16 vcc, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x14,0x7c,0x7f,0x6f,0xfd,0x30]
-0xfa,0xfe,0x14,0x7c,0x7f,0x6f,0xfd,0x30
+0xfa,0x04,0x34,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_nlg_f32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_nlg_f32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x34,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x34,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_nlg_f32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_nlg_f32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x34,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x34,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_nlg_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_nlg_f32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x34,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x34,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_nlg_f32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_nlg_f32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x34,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x34,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_nlg_f32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_nlg_f32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x34,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x34,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_nlg_f32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_nlg_f32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x34,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x34,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_nlg_f32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_nlg_f32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x34,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x34,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_nlg_f32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_nlg_f32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x34,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x34,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_nlg_f32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_nlg_f32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x34,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x34,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_nlg_f32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_nlg_f32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x34,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x34,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_nlg_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_nlg_f32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x34,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x34,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_nlg_f32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_nlg_f32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x34,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x34,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_nlg_f32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_nlg_f32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x34,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x35,0x7c,0xff,0x6f,0xfd,0x30
# W32: v_cmp_nlg_f32 vcc_lo, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x35,0x7c,0xff,0x6f,0xfd,0x30]
# W64: v_cmp_nlg_f32 vcc, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x35,0x7c,0xff,0x6f,0xfd,0x30]
-0xfa,0xfe,0x35,0x7c,0xff,0x6f,0xfd,0x30
+0xfa,0x04,0x1c,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_nlt_f16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_nlt_f16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x1c,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x1c,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_nlt_f16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_nlt_f16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x1c,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x1c,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_nlt_f16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_nlt_f16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x1c,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x1c,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_nlt_f16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_nlt_f16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x1c,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x1c,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_nlt_f16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_nlt_f16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x1c,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x1c,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_nlt_f16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_nlt_f16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x1c,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x1c,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_nlt_f16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_nlt_f16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x1c,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x1c,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_nlt_f16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_nlt_f16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x1c,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x1c,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_nlt_f16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_nlt_f16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x1c,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x1c,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_nlt_f16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_nlt_f16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x1c,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x1c,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_nlt_f16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_nlt_f16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x1c,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x1c,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_nlt_f16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_nlt_f16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x1c,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x1c,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_nlt_f16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_nlt_f16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x1c,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x1c,0x7c,0x7f,0x6f,0xfd,0x30
# W32: v_cmp_nlt_f16 vcc_lo, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1c,0x7c,0x7f,0x6f,0xfd,0x30]
# W64: v_cmp_nlt_f16 vcc, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1c,0x7c,0x7f,0x6f,0xfd,0x30]
-0xfa,0xfe,0x1c,0x7c,0x7f,0x6f,0xfd,0x30
+0xfa,0x04,0x3c,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_nlt_f32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_nlt_f32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x3c,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x3c,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_nlt_f32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_nlt_f32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x3c,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x3c,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_nlt_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_nlt_f32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x3c,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x3c,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_nlt_f32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_nlt_f32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x3c,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x3c,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_nlt_f32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_nlt_f32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x3c,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x3c,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_nlt_f32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_nlt_f32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x3c,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x3c,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_nlt_f32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_nlt_f32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x3c,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x3c,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_nlt_f32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_nlt_f32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x3c,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x3c,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_nlt_f32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_nlt_f32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x3c,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x3c,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_nlt_f32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_nlt_f32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x3c,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x3c,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_nlt_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_nlt_f32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x3c,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x3c,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_nlt_f32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_nlt_f32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x3c,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x3c,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_nlt_f32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_nlt_f32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x3c,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x3d,0x7c,0xff,0x6f,0xfd,0x30
# W32: v_cmp_nlt_f32 vcc_lo, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x3d,0x7c,0xff,0x6f,0xfd,0x30]
# W64: v_cmp_nlt_f32 vcc, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x3d,0x7c,0xff,0x6f,0xfd,0x30]
-0xfa,0xfe,0x3d,0x7c,0xff,0x6f,0xfd,0x30
+0xfa,0x04,0x0e,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_o_f16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_o_f16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x0e,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x0e,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_o_f16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_o_f16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x0e,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x0e,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_o_f16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf
; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_o_f16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x0e,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x0e,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_o_f16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_o_f16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x0e,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x0e,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_o_f16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_o_f16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x0e,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x0e,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_o_f16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_o_f16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x0e,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x0e,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_o_f16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_o_f16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x0e,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x0e,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_o_f16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_o_f16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x0e,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x0e,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_o_f16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_o_f16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x0e,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x0e,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_o_f16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_o_f16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x0e,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x0e,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_o_f16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_o_f16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x0e,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x0e,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_o_f16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_o_f16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x0e,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x0e,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_o_f16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_o_f16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x0e,0x7c,0x01,0x60,0x01,0x13 
+0xfa,0xfe,0x0e,0x7c,0x7f,0x6f,0xfd,0x30 # W32: v_cmp_o_f16 vcc_lo, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0e,0x7c,0x7f,0x6f,0xfd,0x30] # W64: v_cmp_o_f16 vcc, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0e,0x7c,0x7f,0x6f,0xfd,0x30] -0xfa,0xfe,0x0e,0x7c,0x7f,0x6f,0xfd,0x30 +0xfa,0x04,0x2e,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_o_f32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_o_f32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x2e,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x2e,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_o_f32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_o_f32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x2e,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x2e,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_o_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_o_f32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x2e,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x2e,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_o_f32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_o_f32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x2e,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x2e,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_o_f32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_o_f32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x2e,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x2e,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_o_f32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_o_f32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x2e,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x2e,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_o_f32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_o_f32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x2e,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x2e,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_o_f32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_o_f32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x2e,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x2e,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_o_f32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_o_f32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x2e,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x2e,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_o_f32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x2e,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_o_f32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x2e,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x2e,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_o_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_o_f32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x2e,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x2e,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_o_f32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_o_f32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x2e,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x2e,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_o_f32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_o_f32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x2e,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x2f,0x7c,0xff,0x6f,0xfd,0x30 # W32: v_cmp_o_f32 vcc_lo, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x2f,0x7c,0xff,0x6f,0xfd,0x30] # W64: v_cmp_o_f32 vcc, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x2f,0x7c,0xff,0x6f,0xfd,0x30] -0xfa,0xfe,0x2f,0x7c,0xff,0x6f,0xfd,0x30 +0xfa,0x04,0x1e,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_t_f16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_t_f16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x1e,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x1e,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_t_f16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_t_f16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x1e,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x1e,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_t_f16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_t_f16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x1e,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x1e,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_t_f16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_t_f16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x1e,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x1e,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_t_f16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_t_f16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x1e,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x1e,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_t_f16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_t_f16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x1e,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x1e,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x1e,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_t_f16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_t_f16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x1e,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x1e,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_t_f16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_t_f16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x1e,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x1e,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_t_f16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_t_f16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x1e,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x1e,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_t_f16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_t_f16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x1e,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x1e,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_t_f16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_t_f16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x1e,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x1e,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_t_f16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1e,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_t_f16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1e,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x1e,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x1e,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_t_f16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1e,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_t_f16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1e,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x1e,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x1e,0x7c,0x7f,0x6f,0xfd,0x30 # W32: v_cmp_t_f16 vcc_lo, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1e,0x7c,0x7f,0x6f,0xfd,0x30] # W64: v_cmp_t_f16 vcc, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1e,0x7c,0x7f,0x6f,0xfd,0x30] -0xfa,0xfe,0x1e,0x7c,0x7f,0x6f,0xfd,0x30 +0xfa,0x04,0x3e,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_t_f32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_t_f32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x3e,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x3e,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_t_f32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_t_f32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x3e,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x3e,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_t_f32 
vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_t_f32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x3e,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x3e,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_t_f32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_t_f32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x3e,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x3e,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_t_f32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_t_f32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x3e,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x3e,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_t_f32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_t_f32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x3e,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x3e,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_t_f32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_t_f32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x3e,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x3e,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_t_f32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_t_f32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x3e,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x3e,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_t_f32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_t_f32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x3e,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x3e,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_t_f32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_t_f32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x3e,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x3e,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_t_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_t_f32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x3e,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x3e,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_t_f32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x3e,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_t_f32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x3e,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x3e,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x3e,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_t_f32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x3e,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_t_f32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x3e,0x7c,0x01,0x60,0x01,0x13] 
-0xfa,0x04,0x3e,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x3f,0x7c,0xff,0x6f,0xfd,0x30 # W32: v_cmp_t_f32 vcc_lo, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x3f,0x7c,0xff,0x6f,0xfd,0x30] # W64: v_cmp_t_f32 vcc, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x3f,0x7c,0xff,0x6f,0xfd,0x30] -0xfa,0xfe,0x3f,0x7c,0xff,0x6f,0xfd,0x30 +0xfa,0x04,0x8e,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_t_i32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_t_i32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x8e,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x8e,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_t_i32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_t_i32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x8e,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x8e,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_t_i32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_t_i32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x8e,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x8e,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_t_i32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_t_i32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x8e,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x8e,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_t_i32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_t_i32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x8e,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x8e,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_t_i32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_t_i32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x8e,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x8e,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_t_i32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_t_i32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x8e,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x8e,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_t_i32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_t_i32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x8e,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x8e,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_t_i32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_t_i32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x8e,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x8e,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_t_i32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_t_i32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x8e,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x8e,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_t_i32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_t_i32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x8e,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x8e,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_t_i32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x8e,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_t_i32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x8e,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x8e,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x8e,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_t_i32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x8e,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_t_i32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x8e,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x8e,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x8f,0x7c,0xff,0x6f,0x0d,0x30 # W32: v_cmp_t_i32 vcc_lo, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x8f,0x7c,0xff,0x6f,0x0d,0x30] # W64: v_cmp_t_i32 vcc, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x8f,0x7c,0xff,0x6f,0x0d,0x30] -0xfa,0xfe,0x8f,0x7c,0xff,0x6f,0x0d,0x30 +0xfa,0x04,0x9e,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_t_u32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_t_u32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x9e,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x9e,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_t_u32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_t_u32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x9e,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x9e,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_t_u32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_t_u32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x9e,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x9e,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_t_u32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_t_u32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x9e,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x9e,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_t_u32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_t_u32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x9e,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x9e,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_t_u32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_t_u32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0x04,0x9e,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x9e,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x9e,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_t_u32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_t_u32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x9e,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x9e,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_t_u32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_t_u32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x9e,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x9e,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_t_u32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_t_u32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x9e,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x9e,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_t_u32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_t_u32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x9e,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x9e,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_t_u32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_t_u32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x9e,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x9e,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_t_u32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x9e,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_t_u32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x9e,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x9e,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x9e,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_t_u32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x9e,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_t_u32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x9e,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x9e,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x9f,0x7c,0xff,0x6f,0x0d,0x30 # W32: v_cmp_t_u32 vcc_lo, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x9f,0x7c,0xff,0x6f,0x0d,0x30] # W64: v_cmp_t_u32 vcc, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x9f,0x7c,0xff,0x6f,0x0d,0x30] -0xfa,0xfe,0x9f,0x7c,0xff,0x6f,0x0d,0x30 +0xfa,0x04,0x10,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_u_f16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_u_f16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x10,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x10,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_u_f16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_u_f16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x10,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x10,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_u_f16 
vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_u_f16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x10,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x10,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_u_f16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_u_f16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x10,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x10,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_u_f16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_u_f16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x10,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x10,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_u_f16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_u_f16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x10,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x10,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_u_f16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_u_f16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x10,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x10,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_u_f16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_u_f16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x10,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x10,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_u_f16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_u_f16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x10,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x10,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_u_f16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_u_f16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x10,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x10,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_u_f16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_u_f16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x10,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x10,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_u_f16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_u_f16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x10,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x10,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_u_f16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_u_f16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x60,0x01,0x13] 
-0xfa,0x04,0x10,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x10,0x7c,0x7f,0x6f,0xfd,0x30
 # W32: v_cmp_u_f16 vcc_lo, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x10,0x7c,0x7f,0x6f,0xfd,0x30]
 # W64: v_cmp_u_f16 vcc, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x10,0x7c,0x7f,0x6f,0xfd,0x30]
-0xfa,0xfe,0x10,0x7c,0x7f,0x6f,0xfd,0x30
+0xfa,0x04,0x30,0x7c,0x01,0x1b,0x00,0xff
 # W32: v_cmp_u_f32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x1b,0x00,0xff]
 # W64: v_cmp_u_f32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x30,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x30,0x7c,0x01,0xe4,0x00,0xff
 # W32: v_cmp_u_f32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0xe4,0x00,0xff]
 # W64: v_cmp_u_f32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x30,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x30,0x7c,0x01,0x40,0x01,0xff
 # W32: v_cmp_u_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x40,0x01,0xff]
 # W64: v_cmp_u_f32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x30,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x30,0x7c,0x01,0x41,0x01,0xff
 # W32: v_cmp_u_f32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x41,0x01,0xff]
 # W64: v_cmp_u_f32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x30,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x30,0x7c,0x01,0x01,0x01,0xff
 # W32: v_cmp_u_f32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x01,0x01,0xff]
 # W64: v_cmp_u_f32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x30,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x30,0x7c,0x01,0x0f,0x01,0xff
 # W32: v_cmp_u_f32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x0f,0x01,0xff]
 # W64: v_cmp_u_f32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x30,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x30,0x7c,0x01,0x11,0x01,0xff
 # W32: v_cmp_u_f32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x11,0x01,0xff]
 # W64: v_cmp_u_f32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x30,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x30,0x7c,0x01,0x1f,0x01,0xff
 # W32: v_cmp_u_f32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x1f,0x01,0xff]
 # W64: v_cmp_u_f32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x30,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x30,0x7c,0x01,0x21,0x01,0xff
 # W32: v_cmp_u_f32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x21,0x01,0xff]
 # W64: v_cmp_u_f32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x30,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x30,0x7c,0x01,0x2f,0x01,0xff
 # W32: v_cmp_u_f32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x2f,0x01,0xff]
 # W64: v_cmp_u_f32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x30,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x30,0x7c,0x01,0x50,0x01,0xff
 # W32: v_cmp_u_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x50,0x01,0xff]
 # W64: v_cmp_u_f32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x30,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x30,0x7c,0x01,0x5f,0x01,0x01
 # W32: v_cmp_u_f32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x5f,0x01,0x01]
 # W64: v_cmp_u_f32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x30,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x30,0x7c,0x01,0x60,0x01,0x13
 # W32: v_cmp_u_f32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x60,0x01,0x13]
 # W64: v_cmp_u_f32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x30,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x31,0x7c,0xff,0x6f,0xfd,0x30
 # W32: v_cmp_u_f32 vcc_lo, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x31,0x7c,0xff,0x6f,0xfd,0x30]
 # W64: v_cmp_u_f32 vcc, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x31,0x7c,0xff,0x6f,0xfd,0x30]
-0xfa,0xfe,0x31,0x7c,0xff,0x6f,0xfd,0x30
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopc_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopc_dpp8.txt
index 00d5106cc90a5..261ad14af9010 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopc_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopc_dpp8.txt
@@ -1,500 +1,501 @@
+# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W32
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W64
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W32
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W64
+0xe9,0x04,0xfa,0x7c,0x01,0x77,0x39,0x05
 # W32: v_cmp_class_f16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfa,0x7c,0x01,0x77,0x39,0x05]
 # W64: v_cmp_class_f16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfa,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0xfa,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0xfa,0x7c,0x7f,0x00,0x00,0x00
 # W32: v_cmp_class_f16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfa,0x7c,0x7f,0x00,0x00,0x00]
 # W64: v_cmp_class_f16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfa,0x7c,0x7f,0x00,0x00,0x00]
-0xea,0xfe,0xfa,0x7c,0x7f,0x00,0x00,0x00
+0xe9,0x04,0xfc,0x7c,0x01,0x77,0x39,0x05
 # W32: v_cmp_class_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfc,0x7c,0x01,0x77,0x39,0x05]
 # W64: v_cmp_class_f32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfc,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0xfc,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0xfd,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_class_f32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfd,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_class_f32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfd,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0xfd,0x7c,0xff,0x00,0x00,0x00 +0xe9,0x04,0x04,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_eq_f16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x04,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_eq_f16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x04,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x04,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x04,0x7c,0x7f,0x00,0x00,0x00 # W32: v_cmp_eq_f16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x04,0x7c,0x7f,0x00,0x00,0x00] # W64: v_cmp_eq_f16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x04,0x7c,0x7f,0x00,0x00,0x00] -0xea,0xfe,0x04,0x7c,0x7f,0x00,0x00,0x00 +0xe9,0x04,0x24,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_eq_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x24,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_eq_f32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x24,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x24,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x25,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_eq_f32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x25,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_eq_f32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x25,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0x25,0x7c,0xff,0x00,0x00,0x00 +0xe9,0x04,0x64,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_eq_i16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x64,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_eq_i16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x64,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x64,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x64,0x7c,0x7f,0x00,0x00,0x00 # W32: v_cmp_eq_i16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x64,0x7c,0x7f,0x00,0x00,0x00] # W64: v_cmp_eq_i16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x64,0x7c,0x7f,0x00,0x00,0x00] -0xea,0xfe,0x64,0x7c,0x7f,0x00,0x00,0x00 +0xe9,0x04,0x84,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_eq_i32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x84,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_eq_i32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x84,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x84,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x85,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_eq_i32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x85,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_eq_i32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x85,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0x85,0x7c,0xff,0x00,0x00,0x00 +0xe9,0x04,0x74,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_eq_u16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x74,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_eq_u16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x74,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x74,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x74,0x7c,0x7f,0x00,0x00,0x00 # W32: v_cmp_eq_u16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x74,0x7c,0x7f,0x00,0x00,0x00] # W64: v_cmp_eq_u16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x74,0x7c,0x7f,0x00,0x00,0x00] -0xea,0xfe,0x74,0x7c,0x7f,0x00,0x00,0x00 +0xe9,0x04,0x94,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_eq_u32 
vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x94,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_eq_u32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x94,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x94,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x95,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_eq_u32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x95,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_eq_u32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x95,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0x95,0x7c,0xff,0x00,0x00,0x00 +0xe9,0x04,0x00,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_f_f16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x00,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_f_f16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x00,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x00,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x00,0x7c,0x7f,0x00,0x00,0x00 # W32: v_cmp_f_f16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x00,0x7c,0x7f,0x00,0x00,0x00] # W64: v_cmp_f_f16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x00,0x7c,0x7f,0x00,0x00,0x00] -0xea,0xfe,0x00,0x7c,0x7f,0x00,0x00,0x00 +0xe9,0x04,0x20,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_f_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x20,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_f_f32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x20,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x20,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x21,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_f_f32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x21,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_f_f32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x21,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0x21,0x7c,0xff,0x00,0x00,0x00 +0xe9,0x04,0x80,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_f_i32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x80,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_f_i32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x80,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x80,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x81,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_f_i32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x81,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_f_i32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x81,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0x81,0x7c,0xff,0x00,0x00,0x00 +0xe9,0x04,0x90,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_f_u32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x90,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_f_u32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x90,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x90,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x91,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_f_u32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x91,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_f_u32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x91,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0x91,0x7c,0xff,0x00,0x00,0x00 +0xe9,0x04,0x0c,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_f16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0c,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_ge_f16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0c,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x0c,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x0c,0x7c,0x7f,0x00,0x00,0x00 # W32: v_cmp_ge_f16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0c,0x7c,0x7f,0x00,0x00,0x00] # W64: v_cmp_ge_f16 vcc, v127, v127 
dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0c,0x7c,0x7f,0x00,0x00,0x00] -0xea,0xfe,0x0c,0x7c,0x7f,0x00,0x00,0x00 +0xe9,0x04,0x2c,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2c,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_ge_f32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2c,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x2c,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x2d,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_ge_f32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x2d,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_ge_f32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x2d,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0x2d,0x7c,0xff,0x00,0x00,0x00 +0xe9,0x04,0x6c,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_i16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6c,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_ge_i16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6c,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x6c,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x6c,0x7c,0x7f,0x00,0x00,0x00 # W32: v_cmp_ge_i16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6c,0x7c,0x7f,0x00,0x00,0x00] # W64: v_cmp_ge_i16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6c,0x7c,0x7f,0x00,0x00,0x00] -0xea,0xfe,0x6c,0x7c,0x7f,0x00,0x00,0x00 +0xe9,0x04,0x8c,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_i32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x8c,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_ge_i32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x8c,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x8c,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x8d,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_ge_i32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x8d,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_ge_i32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x8d,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0x8d,0x7c,0xff,0x00,0x00,0x00 +0xe9,0x04,0x7c,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_u16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7c,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_ge_u16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7c,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x7c,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x7c,0x7c,0x7f,0x00,0x00,0x00 # W32: v_cmp_ge_u16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7c,0x7c,0x7f,0x00,0x00,0x00] # W64: v_cmp_ge_u16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7c,0x7c,0x7f,0x00,0x00,0x00] -0xea,0xfe,0x7c,0x7c,0x7f,0x00,0x00,0x00 +0xe9,0x04,0x9c,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_u32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x9c,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_ge_u32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x9c,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x9c,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x9d,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_ge_u32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x9d,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_ge_u32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x9d,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0x9d,0x7c,0xff,0x00,0x00,0x00 +0xe9,0x04,0x08,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_gt_f16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x08,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_gt_f16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x08,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x08,0x7c,0x01,0x77,0x39,0x05 
+0xea,0xfe,0x08,0x7c,0x7f,0x00,0x00,0x00
# W32: v_cmp_gt_f16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x08,0x7c,0x7f,0x00,0x00,0x00]
# W64: v_cmp_gt_f16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x08,0x7c,0x7f,0x00,0x00,0x00]
-0xea,0xfe,0x08,0x7c,0x7f,0x00,0x00,0x00
+0xe9,0x04,0x28,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_gt_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x28,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_f32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x28,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x28,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x29,0x7c,0xff,0x00,0x00,0x00
# W32: v_cmp_gt_f32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x29,0x7c,0xff,0x00,0x00,0x00]
# W64: v_cmp_gt_f32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x29,0x7c,0xff,0x00,0x00,0x00]
-0xea,0xfe,0x29,0x7c,0xff,0x00,0x00,0x00
+0xe9,0x04,0x68,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_gt_i16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x68,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_i16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x68,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x68,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x68,0x7c,0x7f,0x00,0x00,0x00
# W32: v_cmp_gt_i16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x68,0x7c,0x7f,0x00,0x00,0x00]
# W64: v_cmp_gt_i16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x68,0x7c,0x7f,0x00,0x00,0x00]
-0xea,0xfe,0x68,0x7c,0x7f,0x00,0x00,0x00
+0xe9,0x04,0x88,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_gt_i32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x88,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_i32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x88,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x88,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x89,0x7c,0xff,0x00,0x00,0x00
# W32: v_cmp_gt_i32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x89,0x7c,0xff,0x00,0x00,0x00]
# W64: v_cmp_gt_i32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x89,0x7c,0xff,0x00,0x00,0x00]
-0xea,0xfe,0x89,0x7c,0xff,0x00,0x00,0x00
+0xe9,0x04,0x78,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_gt_u16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x78,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_u16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x78,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x78,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x78,0x7c,0x7f,0x00,0x00,0x00
# W32: v_cmp_gt_u16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x78,0x7c,0x7f,0x00,0x00,0x00]
# W64: v_cmp_gt_u16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x78,0x7c,0x7f,0x00,0x00,0x00]
-0xea,0xfe,0x78,0x7c,0x7f,0x00,0x00,0x00
+0xe9,0x04,0x98,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_gt_u32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x98,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_u32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x98,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x98,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x99,0x7c,0xff,0x00,0x00,0x00
# W32: v_cmp_gt_u32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x99,0x7c,0xff,0x00,0x00,0x00]
# W64: v_cmp_gt_u32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x99,0x7c,0xff,0x00,0x00,0x00]
-0xea,0xfe,0x99,0x7c,0xff,0x00,0x00,0x00
+0xe9,0x04,0x06,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_le_f16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x06,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_f16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x06,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x06,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x06,0x7c,0x7f,0x00,0x00,0x00
# W32: v_cmp_le_f16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x06,0x7c,0x7f,0x00,0x00,0x00]
# W64: v_cmp_le_f16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x06,0x7c,0x7f,0x00,0x00,0x00]
-0xea,0xfe,0x06,0x7c,0x7f,0x00,0x00,0x00
+0xe9,0x04,0x26,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_le_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x26,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_f32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x26,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x26,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x27,0x7c,0xff,0x00,0x00,0x00
# W32: v_cmp_le_f32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x27,0x7c,0xff,0x00,0x00,0x00]
# W64: v_cmp_le_f32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x27,0x7c,0xff,0x00,0x00,0x00]
-0xea,0xfe,0x27,0x7c,0xff,0x00,0x00,0x00
+0xe9,0x04,0x66,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_le_i16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x66,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_i16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x66,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x66,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x66,0x7c,0x7f,0x00,0x00,0x00
# W32: v_cmp_le_i16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x66,0x7c,0x7f,0x00,0x00,0x00]
# W64: v_cmp_le_i16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x66,0x7c,0x7f,0x00,0x00,0x00]
-0xea,0xfe,0x66,0x7c,0x7f,0x00,0x00,0x00
+0xe9,0x04,0x86,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_le_i32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x86,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_i32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x86,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x86,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x87,0x7c,0xff,0x00,0x00,0x00
# W32: v_cmp_le_i32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x87,0x7c,0xff,0x00,0x00,0x00]
# W64: v_cmp_le_i32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x87,0x7c,0xff,0x00,0x00,0x00]
-0xea,0xfe,0x87,0x7c,0xff,0x00,0x00,0x00
+0xe9,0x04,0x76,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_le_u16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x76,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_u16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x76,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x76,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x76,0x7c,0x7f,0x00,0x00,0x00
# W32: v_cmp_le_u16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x76,0x7c,0x7f,0x00,0x00,0x00]
# W64: v_cmp_le_u16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x76,0x7c,0x7f,0x00,0x00,0x00]
-0xea,0xfe,0x76,0x7c,0x7f,0x00,0x00,0x00
+0xe9,0x04,0x96,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_le_u32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x96,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_u32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x96,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x96,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x97,0x7c,0xff,0x00,0x00,0x00
# W32: v_cmp_le_u32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x97,0x7c,0xff,0x00,0x00,0x00]
# W64: v_cmp_le_u32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x97,0x7c,0xff,0x00,0x00,0x00]
-0xea,0xfe,0x97,0x7c,0xff,0x00,0x00,0x00
+0xe9,0x04,0x0a,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_lg_f16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_lg_f16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x0a,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x0a,0x7c,0x7f,0x00,0x00,0x00
# W32: v_cmp_lg_f16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0a,0x7c,0x7f,0x00,0x00,0x00]
# W64: v_cmp_lg_f16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0a,0x7c,0x7f,0x00,0x00,0x00]
-0xea,0xfe,0x0a,0x7c,0x7f,0x00,0x00,0x00
+0xe9,0x04,0x2a,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_lg_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2a,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_lg_f32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2a,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x2a,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x2b,0x7c,0xff,0x00,0x00,0x00
# W32: v_cmp_lg_f32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x2b,0x7c,0xff,0x00,0x00,0x00]
# W64: v_cmp_lg_f32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x2b,0x7c,0xff,0x00,0x00,0x00]
-0xea,0xfe,0x2b,0x7c,0xff,0x00,0x00,0x00
+0xe9,0x04,0x02,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_lt_f16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x02,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_f16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x02,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x02,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x02,0x7c,0x7f,0x00,0x00,0x00
# W32: v_cmp_lt_f16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x02,0x7c,0x7f,0x00,0x00,0x00]
# W64: v_cmp_lt_f16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x02,0x7c,0x7f,0x00,0x00,0x00]
-0xea,0xfe,0x02,0x7c,0x7f,0x00,0x00,0x00
+0xe9,0x04,0x22,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_lt_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x22,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_f32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x22,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x22,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x23,0x7c,0xff,0x00,0x00,0x00
# W32: v_cmp_lt_f32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x23,0x7c,0xff,0x00,0x00,0x00]
# W64: v_cmp_lt_f32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x23,0x7c,0xff,0x00,0x00,0x00]
-0xea,0xfe,0x23,0x7c,0xff,0x00,0x00,0x00
+0xe9,0x04,0x62,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_lt_i16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x62,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_i16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x62,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x62,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x62,0x7c,0x7f,0x00,0x00,0x00
# W32: v_cmp_lt_i16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x62,0x7c,0x7f,0x00,0x00,0x00]
# W64: v_cmp_lt_i16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x62,0x7c,0x7f,0x00,0x00,0x00]
-0xea,0xfe,0x62,0x7c,0x7f,0x00,0x00,0x00
+0xe9,0x04,0x82,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_lt_i32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x82,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_i32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x82,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x82,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x83,0x7c,0xff,0x00,0x00,0x00
# W32: v_cmp_lt_i32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x83,0x7c,0xff,0x00,0x00,0x00]
# W64: v_cmp_lt_i32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x83,0x7c,0xff,0x00,0x00,0x00]
-0xea,0xfe,0x83,0x7c,0xff,0x00,0x00,0x00
+0xe9,0x04,0x72,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_lt_u16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x72,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_u16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x72,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x72,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x72,0x7c,0x7f,0x00,0x00,0x00
# W32: v_cmp_lt_u16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x72,0x7c,0x7f,0x00,0x00,0x00]
# W64: v_cmp_lt_u16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x72,0x7c,0x7f,0x00,0x00,0x00]
-0xea,0xfe,0x72,0x7c,0x7f,0x00,0x00,0x00
+0xe9,0x04,0x92,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_lt_u32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x92,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_u32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x92,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x92,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x93,0x7c,0xff,0x00,0x00,0x00
# W32: v_cmp_lt_u32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x93,0x7c,0xff,0x00,0x00,0x00]
# W64: v_cmp_lt_u32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x93,0x7c,0xff,0x00,0x00,0x00]
-0xea,0xfe,0x93,0x7c,0xff,0x00,0x00,0x00
+0xe9,0x04,0x6a,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_ne_i16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6a,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_ne_i16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6a,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x6a,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x6a,0x7c,0x7f,0x00,0x00,0x00
# W32: v_cmp_ne_i16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6a,0x7c,0x7f,0x00,0x00,0x00]
# W64: v_cmp_ne_i16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6a,0x7c,0x7f,0x00,0x00,0x00]
-0xea,0xfe,0x6a,0x7c,0x7f,0x00,0x00,0x00
+0xe9,0x04,0x8a,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_ne_i32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x8a,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_ne_i32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x8a,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x8a,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x8b,0x7c,0xff,0x00,0x00,0x00
# W32: v_cmp_ne_i32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x8b,0x7c,0xff,0x00,0x00,0x00]
# W64: v_cmp_ne_i32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x8b,0x7c,0xff,0x00,0x00,0x00]
-0xea,0xfe,0x8b,0x7c,0xff,0x00,0x00,0x00
+0xe9,0x04,0x7a,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_ne_u16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7a,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_ne_u16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7a,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x7a,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x7a,0x7c,0x7f,0x00,0x00,0x00
# W32: v_cmp_ne_u16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7a,0x7c,0x7f,0x00,0x00,0x00]
# W64: v_cmp_ne_u16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7a,0x7c,0x7f,0x00,0x00,0x00]
-0xea,0xfe,0x7a,0x7c,0x7f,0x00,0x00,0x00
+0xe9,0x04,0x9a,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_ne_u32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x9a,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_ne_u32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x9a,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x9a,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x9b,0x7c,0xff,0x00,0x00,0x00
# W32: v_cmp_ne_u32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x9b,0x7c,0xff,0x00,0x00,0x00]
# W64: v_cmp_ne_u32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x9b,0x7c,0xff,0x00,0x00,0x00]
-0xea,0xfe,0x9b,0x7c,0xff,0x00,0x00,0x00
+0xe9,0x04,0x1a,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_neq_f16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1a,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_neq_f16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1a,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x1a,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x1a,0x7c,0x7f,0x00,0x00,0x00
# W32: v_cmp_neq_f16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1a,0x7c,0x7f,0x00,0x00,0x00]
# W64: v_cmp_neq_f16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1a,0x7c,0x7f,0x00,0x00,0x00]
-0xea,0xfe,0x1a,0x7c,0x7f,0x00,0x00,0x00
+0xe9,0x04,0x3a,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_neq_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3a,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_neq_f32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3a,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x3a,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x3b,0x7c,0xff,0x00,0x00,0x00
# W32: v_cmp_neq_f32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x3b,0x7c,0xff,0x00,0x00,0x00]
# W64: v_cmp_neq_f32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x3b,0x7c,0xff,0x00,0x00,0x00]
-0xea,0xfe,0x3b,0x7c,0xff,0x00,0x00,0x00
+0xe9,0x04,0x12,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_nge_f16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x12,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_nge_f16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x12,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x12,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x12,0x7c,0x7f,0x00,0x00,0x00
# W32: v_cmp_nge_f16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x12,0x7c,0x7f,0x00,0x00,0x00]
# W64: v_cmp_nge_f16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x12,0x7c,0x7f,0x00,0x00,0x00]
-0xea,0xfe,0x12,0x7c,0x7f,0x00,0x00,0x00
+0xe9,0x04,0x32,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_nge_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x32,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_nge_f32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x32,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x32,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x33,0x7c,0xff,0x00,0x00,0x00
# W32: v_cmp_nge_f32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x33,0x7c,0xff,0x00,0x00,0x00]
# W64: v_cmp_nge_f32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x33,0x7c,0xff,0x00,0x00,0x00]
-0xea,0xfe,0x33,0x7c,0xff,0x00,0x00,0x00
+0xe9,0x04,0x16,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_ngt_f16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x16,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_ngt_f16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x16,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x16,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x16,0x7c,0x7f,0x00,0x00,0x00
# W32: v_cmp_ngt_f16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x16,0x7c,0x7f,0x00,0x00,0x00]
# W64: v_cmp_ngt_f16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x16,0x7c,0x7f,0x00,0x00,0x00]
-0xea,0xfe,0x16,0x7c,0x7f,0x00,0x00,0x00
+0xe9,0x04,0x36,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_ngt_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x36,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_ngt_f32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x36,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x36,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x37,0x7c,0xff,0x00,0x00,0x00
# W32: v_cmp_ngt_f32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x37,0x7c,0xff,0x00,0x00,0x00]
# W64: v_cmp_ngt_f32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x37,0x7c,0xff,0x00,0x00,0x00]
-0xea,0xfe,0x37,0x7c,0xff,0x00,0x00,0x00
+0xe9,0x04,0x18,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_nle_f16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x18,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_nle_f16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x18,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x18,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x18,0x7c,0x7f,0x00,0x00,0x00
# W32: v_cmp_nle_f16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x18,0x7c,0x7f,0x00,0x00,0x00]
# W64: v_cmp_nle_f16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x18,0x7c,0x7f,0x00,0x00,0x00]
-0xea,0xfe,0x18,0x7c,0x7f,0x00,0x00,0x00
+0xe9,0x04,0x38,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_nle_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x38,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_nle_f32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x38,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x38,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x39,0x7c,0xff,0x00,0x00,0x00
# W32: v_cmp_nle_f32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x39,0x7c,0xff,0x00,0x00,0x00]
# W64: v_cmp_nle_f32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x39,0x7c,0xff,0x00,0x00,0x00]
-0xea,0xfe,0x39,0x7c,0xff,0x00,0x00,0x00
+0xe9,0x04,0x14,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_nlg_f16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x14,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_nlg_f16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x14,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x14,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x14,0x7c,0x7f,0x00,0x00,0x00
# W32: v_cmp_nlg_f16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x14,0x7c,0x7f,0x00,0x00,0x00]
# W64: v_cmp_nlg_f16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x14,0x7c,0x7f,0x00,0x00,0x00]
-0xea,0xfe,0x14,0x7c,0x7f,0x00,0x00,0x00
+0xe9,0x04,0x34,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_nlg_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x34,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_nlg_f32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x34,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x34,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x35,0x7c,0xff,0x00,0x00,0x00
# W32: v_cmp_nlg_f32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x35,0x7c,0xff,0x00,0x00,0x00]
# W64: v_cmp_nlg_f32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x35,0x7c,0xff,0x00,0x00,0x00]
-0xea,0xfe,0x35,0x7c,0xff,0x00,0x00,0x00
+0xe9,0x04,0x1c,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_nlt_f16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1c,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_nlt_f16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1c,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x1c,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x1c,0x7c,0x7f,0x00,0x00,0x00
# W32: v_cmp_nlt_f16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1c,0x7c,0x7f,0x00,0x00,0x00]
# W64: v_cmp_nlt_f16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1c,0x7c,0x7f,0x00,0x00,0x00]
-0xea,0xfe,0x1c,0x7c,0x7f,0x00,0x00,0x00
+0xe9,0x04,0x3c,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_nlt_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3c,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_nlt_f32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3c,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x3c,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x3d,0x7c,0xff,0x00,0x00,0x00
# W32: v_cmp_nlt_f32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x3d,0x7c,0xff,0x00,0x00,0x00]
# W64: v_cmp_nlt_f32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x3d,0x7c,0xff,0x00,0x00,0x00]
-0xea,0xfe,0x3d,0x7c,0xff,0x00,0x00,0x00
+0xe9,0x04,0x0e,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_o_f16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0e,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_o_f16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0e,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x0e,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x0e,0x7c,0x7f,0x00,0x00,0x00
# W32: v_cmp_o_f16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0e,0x7c,0x7f,0x00,0x00,0x00]
# W64: v_cmp_o_f16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0e,0x7c,0x7f,0x00,0x00,0x00]
-0xea,0xfe,0x0e,0x7c,0x7f,0x00,0x00,0x00
+0xe9,0x04,0x2e,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_o_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2e,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_o_f32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2e,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x2e,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x2f,0x7c,0xff,0x00,0x00,0x00
# W32: v_cmp_o_f32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x2f,0x7c,0xff,0x00,0x00,0x00]
# W64: v_cmp_o_f32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x2f,0x7c,0xff,0x00,0x00,0x00]
-0xea,0xfe,0x2f,0x7c,0xff,0x00,0x00,0x00
+0xe9,0x04,0x1e,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_t_f16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1e,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_t_f16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1e,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x1e,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x1e,0x7c,0x7f,0x00,0x00,0x00
# W32: v_cmp_t_f16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1e,0x7c,0x7f,0x00,0x00,0x00]
# W64: v_cmp_t_f16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1e,0x7c,0x7f,0x00,0x00,0x00]
-0xea,0xfe,0x1e,0x7c,0x7f,0x00,0x00,0x00
+0xe9,0x04,0x3e,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_t_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3e,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_t_f32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3e,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x3e,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x3f,0x7c,0xff,0x00,0x00,0x00
# W32: v_cmp_t_f32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x3f,0x7c,0xff,0x00,0x00,0x00]
# W64: v_cmp_t_f32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x3f,0x7c,0xff,0x00,0x00,0x00]
-0xea,0xfe,0x3f,0x7c,0xff,0x00,0x00,0x00
+0xe9,0x04,0x8e,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_t_i32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x8e,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_t_i32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x8e,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x8e,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x8f,0x7c,0xff,0x00,0x00,0x00
# W32: v_cmp_t_i32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x8f,0x7c,0xff,0x00,0x00,0x00]
# W64: v_cmp_t_i32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x8f,0x7c,0xff,0x00,0x00,0x00]
-0xea,0xfe,0x8f,0x7c,0xff,0x00,0x00,0x00
+0xe9,0x04,0x9e,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_t_u32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x9e,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_t_u32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x9e,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x9e,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x9f,0x7c,0xff,0x00,0x00,0x00
# W32: v_cmp_t_u32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x9f,0x7c,0xff,0x00,0x00,0x00]
# W64: v_cmp_t_u32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x9f,0x7c,0xff,0x00,0x00,0x00]
-0xea,0xfe,0x9f,0x7c,0xff,0x00,0x00,0x00
+0xe9,0x04,0x10,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_u_f16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x10,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_u_f16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x10,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x10,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x10,0x7c,0x7f,0x00,0x00,0x00
# W32: v_cmp_u_f16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x10,0x7c,0x7f,0x00,0x00,0x00]
# W64: v_cmp_u_f16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x10,0x7c,0x7f,0x00,0x00,0x00]
-0xea,0xfe,0x10,0x7c,0x7f,0x00,0x00,0x00
+0xe9,0x04,0x30,0x7c,0x01,0x77,0x39,0x05
# W32: v_cmp_u_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x30,0x7c,0x01,0x77,0x39,0x05]
# W64: v_cmp_u_f32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x30,0x7c,0x01,0x77,0x39,0x05]
-0xe9,0x04,0x30,0x7c,0x01,0x77,0x39,0x05
+0xea,0xfe,0x31,0x7c,0xff,0x00,0x00,0x00
# W32: v_cmp_u_f32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x31,0x7c,0xff,0x00,0x00,0x00]
# W64: v_cmp_u_f32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x31,0x7c,0xff,0x00,0x00,0x00]
-0xea,0xfe,0x31,0x7c,0xff,0x00,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt
index c6019b7fdfa75..e0b5c16c27d2f 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt
@@ -1,3982 +1,3983 @@
+# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11
# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11
# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11
# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11
-# GFX11: v_cmpx_class_f16_e32 v1, v2 ; encoding: [0x01,0x05,0xfa,0x7d]
0x01,0x05,0xfa,0x7d
+# GFX11: v_cmpx_class_f16_e32 v1, v2 ; encoding: [0x01,0x05,0xfa,0x7d]
-# GFX11: v_cmpx_class_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0xfa,0x7d]
0x7f,0x05,0xfa,0x7d
+# GFX11: v_cmpx_class_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0xfa,0x7d]
-# GFX11: v_cmpx_class_f16_e32 s1, v2 ; encoding: [0x01,0x04,0xfa,0x7d]
0x01,0x04,0xfa,0x7d
+# GFX11: v_cmpx_class_f16_e32 s1, v2 ; encoding: [0x01,0x04,0xfa,0x7d]
-# GFX11: v_cmpx_class_f16_e32 s105, v2 ; encoding: [0x69,0x04,0xfa,0x7d]
0x69,0x04,0xfa,0x7d
+# GFX11: v_cmpx_class_f16_e32 s105, v2 ; encoding: [0x69,0x04,0xfa,0x7d]
-# GFX11: v_cmpx_class_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0xfa,0x7d]
0x6a,0x04,0xfa,0x7d
+# GFX11: v_cmpx_class_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0xfa,0x7d]
-# GFX11: v_cmpx_class_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0xfa,0x7d]
0x6b,0x04,0xfa,0x7d
+# GFX11: v_cmpx_class_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0xfa,0x7d]
-# GFX11: v_cmpx_class_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0xfa,0x7d]
0x7b,0x04,0xfa,0x7d
+# GFX11: v_cmpx_class_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0xfa,0x7d]
-# GFX11: v_cmpx_class_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0xfa,0x7d]
0x7d,0x04,0xfa,0x7d
+# GFX11: v_cmpx_class_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0xfa,0x7d]
-# GFX11: v_cmpx_class_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0xfa,0x7d]
0x7e,0x04,0xfa,0x7d
+# GFX11: v_cmpx_class_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0xfa,0x7d]
-# GFX11: v_cmpx_class_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0xfa,0x7d]
0x7f,0x04,0xfa,0x7d
+# GFX11: v_cmpx_class_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0xfa,0x7d]
-# GFX11: v_cmpx_class_f16_e32 null, v2 ; encoding: [0x7c,0x04,0xfa,0x7d]
0x7c,0x04,0xfa,0x7d
+# GFX11: v_cmpx_class_f16_e32 null, v2 ; encoding: [0x7c,0x04,0xfa,0x7d]
-# GFX11: v_cmpx_class_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0xfa,0x7d]
0xc1,0x04,0xfa,0x7d
+# GFX11: v_cmpx_class_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0xfa,0x7d]
-# GFX11: v_cmpx_class_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0xfa,0x7d]
0xf0,0x04,0xfa,0x7d
+# GFX11: v_cmpx_class_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0xfa,0x7d]
-# GFX11: v_cmpx_class_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0xfa,0x7d]
0xfd,0x04,0xfa,0x7d
+# GFX11: v_cmpx_class_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0xfa,0x7d]
-# GFX11: v_cmpx_class_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfa,0x7d,0x0b,0xfe,0x00,0x00]
0xff,0xfe,0xfa,0x7d,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_class_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfa,0x7d,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_class_f32_e32 v1, v2 ; encoding: [0x01,0x05,0xfc,0x7d]
0x01,0x05,0xfc,0x7d
+# GFX11: v_cmpx_class_f32_e32 v1, v2 ; encoding: [0x01,0x05,0xfc,0x7d]
-# GFX11: v_cmpx_class_f32_e32 v255, v2 ; encoding: [0xff,0x05,0xfc,0x7d]
0xff,0x05,0xfc,0x7d
+# GFX11: v_cmpx_class_f32_e32 v255, v2 ; encoding: [0xff,0x05,0xfc,0x7d]
-# GFX11: v_cmpx_class_f32_e32 s1, v2 ; encoding: [0x01,0x04,0xfc,0x7d]
0x01,0x04,0xfc,0x7d
+# GFX11: v_cmpx_class_f32_e32 s1, v2 ; encoding: [0x01,0x04,0xfc,0x7d]
-# GFX11: v_cmpx_class_f32_e32 s105, v2 ; encoding: [0x69,0x04,0xfc,0x7d]
0x69,0x04,0xfc,0x7d
+# GFX11: v_cmpx_class_f32_e32 s105, v2 ; encoding: [0x69,0x04,0xfc,0x7d]
-# GFX11: v_cmpx_class_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0xfc,0x7d]
0x6a,0x04,0xfc,0x7d
+# GFX11: v_cmpx_class_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0xfc,0x7d]
-# GFX11: v_cmpx_class_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0xfc,0x7d]
0x6b,0x04,0xfc,0x7d
+# GFX11: v_cmpx_class_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0xfc,0x7d]
-# GFX11: v_cmpx_class_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0xfc,0x7d]
0x7b,0x04,0xfc,0x7d
+# GFX11: v_cmpx_class_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0xfc,0x7d]
-# GFX11: v_cmpx_class_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0xfc,0x7d]
0x7d,0x04,0xfc,0x7d
+# GFX11: v_cmpx_class_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0xfc,0x7d]
-# GFX11: v_cmpx_class_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0xfc,0x7d]
0x7e,0x04,0xfc,0x7d
+# GFX11: v_cmpx_class_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0xfc,0x7d]
-# GFX11: v_cmpx_class_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0xfc,0x7d]
0x7f,0x04,0xfc,0x7d
+# GFX11: v_cmpx_class_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0xfc,0x7d]
-# GFX11: v_cmpx_class_f32_e32 null, v2 ; encoding: [0x7c,0x04,0xfc,0x7d]
0x7c,0x04,0xfc,0x7d
+# GFX11: v_cmpx_class_f32_e32 null, v2 ; encoding: [0x7c,0x04,0xfc,0x7d]
-# GFX11: v_cmpx_class_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0xfc,0x7d]
0xc1,0x04,0xfc,0x7d
+# GFX11: v_cmpx_class_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0xfc,0x7d]
-# GFX11: v_cmpx_class_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0xfc,0x7d]
0xf0,0x04,0xfc,0x7d
+# GFX11: v_cmpx_class_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0xfc,0x7d]
-# GFX11: v_cmpx_class_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0xfc,0x7d]
0xfd,0x04,0xfc,0x7d
+# GFX11: v_cmpx_class_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0xfc,0x7d]
-# GFX11: v_cmpx_class_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0xfd,0x7d,0x56,0x34,0x12,0xaf]
0xff,0xfe,0xfd,0x7d,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_class_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0xfd,0x7d,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_class_f64_e32 v[1:2], v2 ; encoding: [0x01,0x05,0xfe,0x7d]
0x01,0x05,0xfe,0x7d
+# GFX11: v_cmpx_class_f64_e32 v[1:2], v2 ; encoding: [0x01,0x05,0xfe,0x7d]
-# GFX11: v_cmpx_class_f64_e32 v[254:255], v2 ; encoding: [0xfe,0x05,0xfe,0x7d]
0xfe,0x05,0xfe,0x7d
+# GFX11: v_cmpx_class_f64_e32 v[254:255], v2 ; encoding: [0xfe,0x05,0xfe,0x7d]
-# GFX11: v_cmpx_class_f64_e32 s[2:3], v2 ; encoding: [0x02,0x04,0xfe,0x7d]
0x02,0x04,0xfe,0x7d
+# GFX11: v_cmpx_class_f64_e32 s[2:3], v2 ; encoding: [0x02,0x04,0xfe,0x7d]
-# GFX11: v_cmpx_class_f64_e32 s[104:105], v2 ; encoding: [0x68,0x04,0xfe,0x7d]
0x68,0x04,0xfe,0x7d
+# GFX11: v_cmpx_class_f64_e32 s[104:105], v2 ; encoding: [0x68,0x04,0xfe,0x7d]
-# GFX11: v_cmpx_class_f64_e32 vcc, v2 ; encoding: [0x6a,0x04,0xfe,0x7d]
0x6a,0x04,0xfe,0x7d
+# GFX11: v_cmpx_class_f64_e32 vcc, v2 ; encoding: [0x6a,0x04,0xfe,0x7d]
-# GFX11: v_cmpx_class_f64_e32 ttmp[14:15], v2 ; encoding: [0x7a,0x04,0xfe,0x7d]
0x7a,0x04,0xfe,0x7d
+# GFX11: v_cmpx_class_f64_e32 ttmp[14:15], v2 ; encoding: [0x7a,0x04,0xfe,0x7d]
-# GFX11: v_cmpx_class_f64_e32 exec, v2 ; encoding: [0x7e,0x04,0xfe,0x7d]
0x7e,0x04,0xfe,0x7d
+# GFX11: v_cmpx_class_f64_e32 exec, v2 ; encoding: [0x7e,0x04,0xfe,0x7d]
-# GFX11: v_cmpx_class_f64_e32 null, v2 ; encoding: [0x7c,0x04,0xfe,0x7d]
0x7c,0x04,0xfe,0x7d
+# GFX11: v_cmpx_class_f64_e32 null, v2 ; encoding: [0x7c,0x04,0xfe,0x7d]
-# GFX11: v_cmpx_class_f64_e32 -1, v2 ; encoding: [0xc1,0x04,0xfe,0x7d]
0xc1,0x04,0xfe,0x7d
+# GFX11: v_cmpx_class_f64_e32 -1, v2 ; encoding: [0xc1,0x04,0xfe,0x7d]
-# GFX11: v_cmpx_class_f64_e32 0.5, v2 ; encoding: [0xf0,0x04,0xfe,0x7d]
0xf0,0x04,0xfe,0x7d
+# GFX11: v_cmpx_class_f64_e32 0.5, v2 ; encoding: [0xf0,0x04,0xfe,0x7d]
-# GFX11: v_cmpx_class_f64_e32 src_scc, v2 ; encoding: [0xfd,0x04,0xfe,0x7d]
0xfd,0x04,0xfe,0x7d
+# GFX11: v_cmpx_class_f64_e32 src_scc, v2 ; encoding: [0xfd,0x04,0xfe,0x7d]
-# GFX11: v_cmpx_class_f64_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x7d,0x56,0x34,0x12,0xaf]
0xff,0xfe,0xff,0x7d,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_class_f64_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x7d,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_eq_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x04,0x7d]
0x01,0x05,0x04,0x7d
+# GFX11: v_cmpx_eq_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x04,0x7d]
-# GFX11: v_cmpx_eq_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x04,0x7d]
0x7f,0x05,0x04,0x7d
+# GFX11: v_cmpx_eq_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x04,0x7d]
-# GFX11: v_cmpx_eq_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x04,0x7d]
0x01,0x04,0x04,0x7d
+# GFX11: v_cmpx_eq_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x04,0x7d]
-# GFX11: v_cmpx_eq_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x04,0x7d]
0x69,0x04,0x04,0x7d
+# GFX11: v_cmpx_eq_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x04,0x7d]
-# GFX11: v_cmpx_eq_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x04,0x7d]
0x6a,0x04,0x04,0x7d
+# GFX11: v_cmpx_eq_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x04,0x7d]
-# GFX11: v_cmpx_eq_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x04,0x7d]
0x6b,0x04,0x04,0x7d
+# GFX11: v_cmpx_eq_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x04,0x7d]
-# GFX11: v_cmpx_eq_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x04,0x7d]
0x7b,0x04,0x04,0x7d
+# GFX11: v_cmpx_eq_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x04,0x7d]
-# GFX11: v_cmpx_eq_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x04,0x7d]
0x7d,0x04,0x04,0x7d
+# GFX11: v_cmpx_eq_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x04,0x7d]
-# GFX11: v_cmpx_eq_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x04,0x7d]
0x7e,0x04,0x04,0x7d
+# GFX11: v_cmpx_eq_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x04,0x7d]
-# GFX11: v_cmpx_eq_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x04,0x7d]
0x7f,0x04,0x04,0x7d
+# GFX11: v_cmpx_eq_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x04,0x7d]
-# GFX11: v_cmpx_eq_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x04,0x7d]
0x7c,0x04,0x04,0x7d
+# GFX11: v_cmpx_eq_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x04,0x7d]
-# GFX11: v_cmpx_eq_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x04,0x7d]
0xc1,0x04,0x04,0x7d
+# GFX11: v_cmpx_eq_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x04,0x7d]
-# GFX11: v_cmpx_eq_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x04,0x7d]
0xf0,0x04,0x04,0x7d
+# GFX11: v_cmpx_eq_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x04,0x7d]
-# GFX11: v_cmpx_eq_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x04,0x7d]
0xfd,0x04,0x04,0x7d
+# GFX11: v_cmpx_eq_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x04,0x7d]
-# GFX11: v_cmpx_eq_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x04,0x7d,0x0b,0xfe,0x00,0x00]
0xff,0xfe,0x04,0x7d,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_eq_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x04,0x7d,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_eq_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x24,0x7d]
0x01,0x05,0x24,0x7d
+# GFX11: v_cmpx_eq_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x24,0x7d]
-# GFX11: v_cmpx_eq_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x24,0x7d]
0xff,0x05,0x24,0x7d
+# GFX11: v_cmpx_eq_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x24,0x7d]
-# GFX11: v_cmpx_eq_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x24,0x7d]
0x01,0x04,0x24,0x7d
+# GFX11: v_cmpx_eq_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x24,0x7d]
-# GFX11: v_cmpx_eq_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x24,0x7d]
0x69,0x04,0x24,0x7d
+# GFX11: v_cmpx_eq_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x24,0x7d]
-# GFX11: v_cmpx_eq_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x24,0x7d]
0x6a,0x04,0x24,0x7d
+# GFX11: v_cmpx_eq_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x24,0x7d]
-# GFX11: v_cmpx_eq_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x24,0x7d]
0x6b,0x04,0x24,0x7d
+# GFX11: v_cmpx_eq_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x24,0x7d]
-# GFX11: v_cmpx_eq_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x24,0x7d]
0x7b,0x04,0x24,0x7d
+# GFX11: v_cmpx_eq_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x24,0x7d]
-# GFX11: v_cmpx_eq_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x24,0x7d]
0x7d,0x04,0x24,0x7d
+# GFX11: v_cmpx_eq_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x24,0x7d]
-# GFX11: v_cmpx_eq_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x24,0x7d]
0x7e,0x04,0x24,0x7d
+# GFX11: v_cmpx_eq_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x24,0x7d]
-# GFX11: v_cmpx_eq_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x24,0x7d]
0x7f,0x04,0x24,0x7d
+# GFX11: v_cmpx_eq_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x24,0x7d]
-# GFX11: v_cmpx_eq_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x24,0x7d]
0x7c,0x04,0x24,0x7d
+# GFX11: v_cmpx_eq_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x24,0x7d]
-# GFX11: v_cmpx_eq_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x24,0x7d]
0xc1,0x04,0x24,0x7d
+# GFX11: v_cmpx_eq_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x24,0x7d]
-# GFX11: v_cmpx_eq_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x24,0x7d]
0xf0,0x04,0x24,0x7d
+# GFX11: v_cmpx_eq_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x24,0x7d]
-# GFX11: v_cmpx_eq_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x24,0x7d]
0xfd,0x04,0x24,0x7d
+# GFX11: v_cmpx_eq_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x24,0x7d]
-# GFX11: v_cmpx_eq_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x25,0x7d,0x56,0x34,0x12,0xaf]
0xff,0xfe,0x25,0x7d,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_eq_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x25,0x7d,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_eq_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x44,0x7d]
0x01,0x05,0x44,0x7d
+# GFX11: v_cmpx_eq_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x44,0x7d]
-# GFX11: v_cmpx_eq_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x44,0x7d]
0xfe,0x05,0x44,0x7d
+# GFX11: v_cmpx_eq_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x44,0x7d]
-# GFX11: v_cmpx_eq_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x44,0x7d]
0x02,0x04,0x44,0x7d
+# GFX11: v_cmpx_eq_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x44,0x7d]
-# GFX11: v_cmpx_eq_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x44,0x7d]
0x68,0x04,0x44,0x7d
+# GFX11: v_cmpx_eq_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x44,0x7d]
-# GFX11: v_cmpx_eq_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x44,0x7d]
0x6a,0x04,0x44,0x7d
+# GFX11: v_cmpx_eq_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x44,0x7d]
-# GFX11: v_cmpx_eq_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x44,0x7d]
0x7a,0x04,0x44,0x7d
+# GFX11: v_cmpx_eq_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x44,0x7d]
-# GFX11: v_cmpx_eq_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x44,0x7d]
0x7e,0x04,0x44,0x7d
+# GFX11: v_cmpx_eq_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x44,0x7d]
-# GFX11: v_cmpx_eq_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x44,0x7d]
0x7c,0x04,0x44,0x7d
+# GFX11: v_cmpx_eq_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x44,0x7d]
-# GFX11: v_cmpx_eq_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x44,0x7d]
0xc1,0x04,0x44,0x7d
+# GFX11: v_cmpx_eq_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x44,0x7d]
-# GFX11: v_cmpx_eq_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x44,0x7d]
0xf0,0x04,0x44,0x7d
+# GFX11: v_cmpx_eq_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x44,0x7d]
-# GFX11: v_cmpx_eq_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x44,0x7d]
0xfd,0x04,0x44,0x7d
+# GFX11: v_cmpx_eq_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x44,0x7d]
-# GFX11: v_cmpx_eq_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x45,0x7d,0x56,0x34,0x12,0xaf]
0xff,0xfc,0x45,0x7d,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_eq_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x45,0x7d,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_eq_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x64,0x7d]
0x01,0x05,0x64,0x7d
+# GFX11: v_cmpx_eq_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x64,0x7d]
-# GFX11: v_cmpx_eq_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x64,0x7d]
0x7f,0x05,0x64,0x7d
+# GFX11: v_cmpx_eq_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x64,0x7d]
-# GFX11: v_cmpx_eq_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x64,0x7d]
0x01,0x04,0x64,0x7d
+# GFX11: v_cmpx_eq_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x64,0x7d]
-# GFX11: v_cmpx_eq_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x64,0x7d]
0x69,0x04,0x64,0x7d
+# GFX11: v_cmpx_eq_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x64,0x7d]
-# GFX11: v_cmpx_eq_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x64,0x7d]
0x6a,0x04,0x64,0x7d
+# GFX11: v_cmpx_eq_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x64,0x7d]
-# GFX11: v_cmpx_eq_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x64,0x7d]
0x6b,0x04,0x64,0x7d
+# GFX11: v_cmpx_eq_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x64,0x7d]
-# GFX11: v_cmpx_eq_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x64,0x7d]
0x7b,0x04,0x64,0x7d
+# GFX11: v_cmpx_eq_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x64,0x7d]
-# GFX11: v_cmpx_eq_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x64,0x7d]
0x7d,0x04,0x64,0x7d
+# GFX11: v_cmpx_eq_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x64,0x7d]
-# GFX11: v_cmpx_eq_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x64,0x7d]
0x7e,0x04,0x64,0x7d
+# GFX11: v_cmpx_eq_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x64,0x7d]
-# GFX11: v_cmpx_eq_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x64,0x7d]
0x7f,0x04,0x64,0x7d
+# GFX11: v_cmpx_eq_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x64,0x7d]
-# GFX11: v_cmpx_eq_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x64,0x7d]
0x7c,0x04,0x64,0x7d
+# GFX11: v_cmpx_eq_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x64,0x7d]
-# GFX11: v_cmpx_eq_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x64,0x7d]
0xc1,0x04,0x64,0x7d
+# GFX11: v_cmpx_eq_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x64,0x7d]
-# GFX11: v_cmpx_eq_i16_e32 0x3800, v2
0xf0,0x04,0x64,0x7d
+# GFX11: v_cmpx_eq_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x64,0x7d,0x00,0x38,0x00,0x00]
-# GFX11: v_cmpx_eq_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x64,0x7d]
0xfd,0x04,0x64,0x7d
+# GFX11: v_cmpx_eq_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x64,0x7d]
-# GFX11: v_cmpx_eq_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x64,0x7d,0x0b,0xfe,0x00,0x00]
0xff,0xfe,0x64,0x7d,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_eq_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x64,0x7d,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_eq_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x84,0x7d]
0x01,0x05,0x84,0x7d
+# GFX11: v_cmpx_eq_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x84,0x7d]
-# GFX11: v_cmpx_eq_i32_e32 v255, v2 ; encoding: [0xff,0x05,0x84,0x7d]
0xff,0x05,0x84,0x7d
+# GFX11: v_cmpx_eq_i32_e32 v255, v2 ; encoding: [0xff,0x05,0x84,0x7d]
-# GFX11: v_cmpx_eq_i32_e32 s1, v2 ; encoding: [0x01,0x04,0x84,0x7d]
0x01,0x04,0x84,0x7d
+# GFX11: v_cmpx_eq_i32_e32 s1, v2 ; encoding: [0x01,0x04,0x84,0x7d]
-# GFX11: v_cmpx_eq_i32_e32 s105, v2 ; encoding: [0x69,0x04,0x84,0x7d]
0x69,0x04,0x84,0x7d
+# GFX11: v_cmpx_eq_i32_e32 s105, v2 ; encoding: [0x69,0x04,0x84,0x7d]
-# GFX11: v_cmpx_eq_i32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x84,0x7d]
0x6a,0x04,0x84,0x7d
+# GFX11: v_cmpx_eq_i32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x84,0x7d]
-# GFX11: v_cmpx_eq_i32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x84,0x7d]
0x6b,0x04,0x84,0x7d
+# GFX11: v_cmpx_eq_i32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x84,0x7d]
-# GFX11: v_cmpx_eq_i32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x84,0x7d]
0x7b,0x04,0x84,0x7d
+# GFX11: v_cmpx_eq_i32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x84,0x7d]
-# GFX11: v_cmpx_eq_i32_e32 m0, v2 ; encoding: [0x7d,0x04,0x84,0x7d]
0x7d,0x04,0x84,0x7d
+# GFX11: v_cmpx_eq_i32_e32 m0, v2 ; encoding: [0x7d,0x04,0x84,0x7d]
-# GFX11: v_cmpx_eq_i32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x84,0x7d]
0x7e,0x04,0x84,0x7d
+# GFX11: v_cmpx_eq_i32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x84,0x7d]
-# GFX11: v_cmpx_eq_i32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x84,0x7d]
0x7f,0x04,0x84,0x7d
+# GFX11: v_cmpx_eq_i32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x84,0x7d]
-# GFX11: v_cmpx_eq_i32_e32 null, v2 ; encoding: [0x7c,0x04,0x84,0x7d]
0x7c,0x04,0x84,0x7d
+# GFX11: v_cmpx_eq_i32_e32 null, v2 ; encoding: [0x7c,0x04,0x84,0x7d]
-# GFX11: v_cmpx_eq_i32_e32 -1, v2 ; encoding: [0xc1,0x04,0x84,0x7d]
0xc1,0x04,0x84,0x7d
+# GFX11: v_cmpx_eq_i32_e32 -1, v2 ; encoding: [0xc1,0x04,0x84,0x7d]
-# GFX11: v_cmpx_eq_i32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x84,0x7d]
0xf0,0x04,0x84,0x7d
+# GFX11: v_cmpx_eq_i32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x84,0x7d]
-# GFX11: v_cmpx_eq_i32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x84,0x7d]
0xfd,0x04,0x84,0x7d
+# GFX11: v_cmpx_eq_i32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x84,0x7d]
-# GFX11: v_cmpx_eq_i32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x85,0x7d,0x56,0x34,0x12,0xaf]
0xff,0xfe,0x85,0x7d,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_eq_i32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x85,0x7d,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_eq_i64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa4,0x7d]
0x01,0x05,0xa4,0x7d
+# GFX11: v_cmpx_eq_i64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa4,0x7d]
-# GFX11: v_cmpx_eq_i64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa4,0x7d]
0xfe,0x05,0xa4,0x7d
+# GFX11: v_cmpx_eq_i64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa4,0x7d]
-# GFX11: v_cmpx_eq_i64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa4,0x7d]
0x02,0x04,0xa4,0x7d
+# GFX11: v_cmpx_eq_i64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa4,0x7d]
-# GFX11: v_cmpx_eq_i64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa4,0x7d]
0x68,0x04,0xa4,0x7d
+# GFX11: v_cmpx_eq_i64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa4,0x7d]
-# GFX11: v_cmpx_eq_i64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xa4,0x7d]
0x6a,0x04,0xa4,0x7d
+# GFX11: v_cmpx_eq_i64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xa4,0x7d]
-# GFX11: v_cmpx_eq_i64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa4,0x7d]
0x7a,0x04,0xa4,0x7d
+# GFX11: v_cmpx_eq_i64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa4,0x7d]
-# GFX11: v_cmpx_eq_i64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xa4,0x7d]
0x7e,0x04,0xa4,0x7d
+# GFX11: v_cmpx_eq_i64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xa4,0x7d]
-# GFX11: v_cmpx_eq_i64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xa4,0x7d]
0x7c,0x04,0xa4,0x7d
+# GFX11: v_cmpx_eq_i64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xa4,0x7d]
-# GFX11: v_cmpx_eq_i64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xa4,0x7d]
0xc1,0x04,0xa4,0x7d
+# GFX11: v_cmpx_eq_i64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xa4,0x7d]
-# GFX11: v_cmpx_eq_i64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa4,0x7d]
0xf0,0x04,0xa4,0x7d
+# GFX11: v_cmpx_eq_i64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa4,0x7d]
-# GFX11: v_cmpx_eq_i64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa4,0x7d]
0xfd,0x04,0xa4,0x7d
+# GFX11: v_cmpx_eq_i64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa4,0x7d]
-# GFX11: v_cmpx_eq_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa5,0x7d,0x56,0x34,0x12,0xaf]
0xff,0xfc,0xa5,0x7d,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_eq_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa5,0x7d,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_eq_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x74,0x7d]
0x01,0x05,0x74,0x7d
+# GFX11: v_cmpx_eq_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x74,0x7d]
-# GFX11: v_cmpx_eq_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x74,0x7d]
0x7f,0x05,0x74,0x7d
+# GFX11: v_cmpx_eq_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x74,0x7d]
-# GFX11: v_cmpx_eq_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x74,0x7d]
0x01,0x04,0x74,0x7d
+# GFX11: v_cmpx_eq_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x74,0x7d]
-# GFX11: v_cmpx_eq_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x74,0x7d]
0x69,0x04,0x74,0x7d
+# GFX11: v_cmpx_eq_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x74,0x7d]
-# GFX11: v_cmpx_eq_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x74,0x7d]
0x6a,0x04,0x74,0x7d
+# GFX11: v_cmpx_eq_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x74,0x7d]
-# GFX11: v_cmpx_eq_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x74,0x7d]
0x6b,0x04,0x74,0x7d
+# GFX11: v_cmpx_eq_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x74,0x7d]
-# GFX11: v_cmpx_eq_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x74,0x7d]
0x7b,0x04,0x74,0x7d
+# GFX11: v_cmpx_eq_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x74,0x7d]
-# GFX11: v_cmpx_eq_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x74,0x7d]
0x7d,0x04,0x74,0x7d
+# GFX11: v_cmpx_eq_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x74,0x7d]
-# GFX11: v_cmpx_eq_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x74,0x7d]
0x7e,0x04,0x74,0x7d
+# GFX11: v_cmpx_eq_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x74,0x7d]
-# GFX11: v_cmpx_eq_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x74,0x7d]
0x7f,0x04,0x74,0x7d
+# GFX11: v_cmpx_eq_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x74,0x7d]
-# GFX11: v_cmpx_eq_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x74,0x7d]
0x7c,0x04,0x74,0x7d
+# GFX11: v_cmpx_eq_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x74,0x7d]
-# GFX11: v_cmpx_eq_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x74,0x7d]
0xc1,0x04,0x74,0x7d
+# GFX11: v_cmpx_eq_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x74,0x7d]
-# GFX11: v_cmpx_eq_u16_e32 0x3800, v2
0xf0,0x04,0x74,0x7d
+# GFX11: v_cmpx_eq_u16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x74,0x7d,0x00,0x38,0x00,0x00]
-# GFX11: v_cmpx_eq_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x74,0x7d]
0xfd,0x04,0x74,0x7d
+# GFX11: v_cmpx_eq_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x74,0x7d]
-# GFX11: v_cmpx_eq_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x74,0x7d,0x0b,0xfe,0x00,0x00]
0xff,0xfe,0x74,0x7d,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_eq_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x74,0x7d,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_eq_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x94,0x7d]
0x01,0x05,0x94,0x7d
+# GFX11: v_cmpx_eq_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x94,0x7d]
-# GFX11: v_cmpx_eq_u32_e32 v255, v2 ; encoding: [0xff,0x05,0x94,0x7d]
0xff,0x05,0x94,0x7d
+# GFX11: v_cmpx_eq_u32_e32 v255, v2 ; encoding: [0xff,0x05,0x94,0x7d]
-# GFX11: v_cmpx_eq_u32_e32 s1, v2 ; encoding: [0x01,0x04,0x94,0x7d]
0x01,0x04,0x94,0x7d
+# GFX11: v_cmpx_eq_u32_e32 s1, v2 ; encoding: [0x01,0x04,0x94,0x7d]
-# GFX11: v_cmpx_eq_u32_e32 s105, v2 ; encoding: [0x69,0x04,0x94,0x7d]
0x69,0x04,0x94,0x7d
+# GFX11: v_cmpx_eq_u32_e32 s105, v2 ; encoding: [0x69,0x04,0x94,0x7d]
-# GFX11: v_cmpx_eq_u32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x94,0x7d]
0x6a,0x04,0x94,0x7d
+# GFX11: v_cmpx_eq_u32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x94,0x7d]
-# GFX11: v_cmpx_eq_u32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x94,0x7d]
0x6b,0x04,0x94,0x7d
+# GFX11: v_cmpx_eq_u32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x94,0x7d]
-# GFX11: v_cmpx_eq_u32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x94,0x7d]
0x7b,0x04,0x94,0x7d
+# GFX11: v_cmpx_eq_u32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x94,0x7d]
-# GFX11: v_cmpx_eq_u32_e32 m0, v2 ; encoding: [0x7d,0x04,0x94,0x7d]
0x7d,0x04,0x94,0x7d
+# GFX11: v_cmpx_eq_u32_e32 m0, v2 ; encoding: [0x7d,0x04,0x94,0x7d]
-# GFX11: v_cmpx_eq_u32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x94,0x7d]
0x7e,0x04,0x94,0x7d
+# GFX11: v_cmpx_eq_u32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x94,0x7d]
-# GFX11: v_cmpx_eq_u32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x94,0x7d]
0x7f,0x04,0x94,0x7d
+# GFX11: v_cmpx_eq_u32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x94,0x7d]
-# GFX11: v_cmpx_eq_u32_e32 null, v2 ; encoding: [0x7c,0x04,0x94,0x7d]
0x7c,0x04,0x94,0x7d
+# GFX11: v_cmpx_eq_u32_e32 null, v2 ; encoding: [0x7c,0x04,0x94,0x7d]
-# GFX11: v_cmpx_eq_u32_e32 -1, v2 ; encoding: [0xc1,0x04,0x94,0x7d]
0xc1,0x04,0x94,0x7d
+# GFX11: v_cmpx_eq_u32_e32 -1, v2 ; encoding: [0xc1,0x04,0x94,0x7d]
-# GFX11: v_cmpx_eq_u32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x94,0x7d]
0xf0,0x04,0x94,0x7d
+# GFX11: v_cmpx_eq_u32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x94,0x7d]
-# GFX11: v_cmpx_eq_u32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x94,0x7d]
0xfd,0x04,0x94,0x7d
+# GFX11: v_cmpx_eq_u32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x94,0x7d]
-# GFX11: v_cmpx_eq_u32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x95,0x7d,0x56,0x34,0x12,0xaf]
0xff,0xfe,0x95,0x7d,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_eq_u32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x95,0x7d,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_eq_u64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb4,0x7d]
0x01,0x05,0xb4,0x7d
+# GFX11: v_cmpx_eq_u64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb4,0x7d]
-# GFX11: v_cmpx_eq_u64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb4,0x7d]
0xfe,0x05,0xb4,0x7d
+# GFX11: v_cmpx_eq_u64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb4,0x7d]
-# GFX11: v_cmpx_eq_u64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb4,0x7d]
0x02,0x04,0xb4,0x7d
+# GFX11: v_cmpx_eq_u64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb4,0x7d]
-# GFX11: v_cmpx_eq_u64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb4,0x7d]
0x68,0x04,0xb4,0x7d
+# GFX11: v_cmpx_eq_u64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb4,0x7d]
-# GFX11: v_cmpx_eq_u64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xb4,0x7d]
0x6a,0x04,0xb4,0x7d
+# GFX11: v_cmpx_eq_u64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xb4,0x7d]
-# GFX11: v_cmpx_eq_u64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb4,0x7d]
0x7a,0x04,0xb4,0x7d
+# GFX11: v_cmpx_eq_u64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb4,0x7d]
-# GFX11: v_cmpx_eq_u64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xb4,0x7d]
0x7e,0x04,0xb4,0x7d
+# GFX11: v_cmpx_eq_u64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xb4,0x7d]
-# GFX11: v_cmpx_eq_u64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xb4,0x7d]
0x7c,0x04,0xb4,0x7d
+# GFX11: v_cmpx_eq_u64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xb4,0x7d]
-# GFX11: v_cmpx_eq_u64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xb4,0x7d]
0xc1,0x04,0xb4,0x7d
+# GFX11: v_cmpx_eq_u64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xb4,0x7d]
-# GFX11: v_cmpx_eq_u64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb4,0x7d]
0xf0,0x04,0xb4,0x7d
+# GFX11: v_cmpx_eq_u64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb4,0x7d]
-# GFX11: v_cmpx_eq_u64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb4,0x7d]
0xfd,0x04,0xb4,0x7d
+# GFX11: v_cmpx_eq_u64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb4,0x7d]
-# GFX11: v_cmpx_eq_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb5,0x7d,0x56,0x34,0x12,0xaf]
0xff,0xfc,0xb5,0x7d,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_eq_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb5,0x7d,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_f_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x00,0x7d]
0x01,0x05,0x00,0x7d
+# GFX11: v_cmpx_f_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x00,0x7d]
-# GFX11: v_cmpx_f_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x00,0x7d]
0x7f,0x05,0x00,0x7d
+# GFX11: v_cmpx_f_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x00,0x7d]
-# GFX11: v_cmpx_f_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x00,0x7d]
0x01,0x04,0x00,0x7d
+# GFX11: v_cmpx_f_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x00,0x7d]
-# GFX11: v_cmpx_f_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x00,0x7d]
0x69,0x04,0x00,0x7d
+# GFX11: v_cmpx_f_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x00,0x7d]
-# GFX11: v_cmpx_f_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x00,0x7d]
0x6a,0x04,0x00,0x7d
+# GFX11: v_cmpx_f_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x00,0x7d]
-# GFX11: v_cmpx_f_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x00,0x7d]
0x6b,0x04,0x00,0x7d
+# GFX11: v_cmpx_f_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x00,0x7d]
-# GFX11: v_cmpx_f_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x00,0x7d]
0x7b,0x04,0x00,0x7d
+# GFX11: v_cmpx_f_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x00,0x7d]
-# GFX11: v_cmpx_f_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x00,0x7d]
0x7d,0x04,0x00,0x7d
+# GFX11: v_cmpx_f_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x00,0x7d]
-# GFX11: v_cmpx_f_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x00,0x7d]
0x7e,0x04,0x00,0x7d
+# GFX11: v_cmpx_f_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x00,0x7d]
-# GFX11: v_cmpx_f_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x00,0x7d]
0x7f,0x04,0x00,0x7d
+# GFX11: v_cmpx_f_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x00,0x7d]
-# GFX11: v_cmpx_f_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x00,0x7d]
0x7c,0x04,0x00,0x7d
+# GFX11: v_cmpx_f_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x00,0x7d]
-# GFX11: v_cmpx_f_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x00,0x7d]
0xc1,0x04,0x00,0x7d
+# GFX11: v_cmpx_f_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x00,0x7d]
-# GFX11: v_cmpx_f_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x00,0x7d]
0xf0,0x04,0x00,0x7d
+# GFX11: v_cmpx_f_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x00,0x7d]
-# GFX11: v_cmpx_f_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x00,0x7d]
0xfd,0x04,0x00,0x7d
+# GFX11: v_cmpx_f_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x00,0x7d]
-# GFX11: v_cmpx_f_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x00,0x7d,0x0b,0xfe,0x00,0x00]
0xff,0xfe,0x00,0x7d,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_f_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x00,0x7d,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_f_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x20,0x7d]
0x01,0x05,0x20,0x7d
+# GFX11: v_cmpx_f_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x20,0x7d]
-# GFX11: v_cmpx_f_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x20,0x7d]
0xff,0x05,0x20,0x7d
+# GFX11: v_cmpx_f_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x20,0x7d]
-# GFX11: v_cmpx_f_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x20,0x7d]
0x01,0x04,0x20,0x7d
+# GFX11: v_cmpx_f_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x20,0x7d]
-# GFX11: v_cmpx_f_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x20,0x7d]
0x69,0x04,0x20,0x7d
+# GFX11: v_cmpx_f_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x20,0x7d]
-# GFX11: v_cmpx_f_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x20,0x7d]
0x6a,0x04,0x20,0x7d
+# GFX11: v_cmpx_f_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x20,0x7d]
-# GFX11: v_cmpx_f_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x20,0x7d]
0x6b,0x04,0x20,0x7d
+# GFX11: v_cmpx_f_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x20,0x7d]
-# GFX11: v_cmpx_f_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x20,0x7d]
0x7b,0x04,0x20,0x7d
+# GFX11: v_cmpx_f_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x20,0x7d]
-# GFX11: v_cmpx_f_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x20,0x7d]
0x7d,0x04,0x20,0x7d
+# GFX11: v_cmpx_f_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x20,0x7d]
-# GFX11: v_cmpx_f_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x20,0x7d]
0x7e,0x04,0x20,0x7d
+# GFX11: v_cmpx_f_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x20,0x7d]
-# GFX11: v_cmpx_f_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x20,0x7d]
0x7f,0x04,0x20,0x7d
+# GFX11: v_cmpx_f_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x20,0x7d]
-# GFX11: v_cmpx_f_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x20,0x7d]
0x7c,0x04,0x20,0x7d
+# GFX11: v_cmpx_f_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x20,0x7d]
-# GFX11: v_cmpx_f_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x20,0x7d]
0xc1,0x04,0x20,0x7d
+# GFX11: v_cmpx_f_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x20,0x7d]
-# GFX11: v_cmpx_f_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x20,0x7d]
0xf0,0x04,0x20,0x7d
+# GFX11: v_cmpx_f_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x20,0x7d]
-# GFX11: v_cmpx_f_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x20,0x7d]
0xfd,0x04,0x20,0x7d
+# GFX11: v_cmpx_f_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x20,0x7d]
-# GFX11: v_cmpx_f_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x21,0x7d,0x56,0x34,0x12,0xaf]
0xff,0xfe,0x21,0x7d,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_f_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x21,0x7d,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_f_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x40,0x7d]
0x01,0x05,0x40,0x7d
+# GFX11: v_cmpx_f_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x40,0x7d]
-# GFX11: v_cmpx_f_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x40,0x7d]
0xfe,0x05,0x40,0x7d
+# GFX11: v_cmpx_f_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x40,0x7d]
-# GFX11: v_cmpx_f_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x40,0x7d]
0x02,0x04,0x40,0x7d
+# GFX11: v_cmpx_f_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x40,0x7d]
-# GFX11: v_cmpx_f_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x40,0x7d]
0x68,0x04,0x40,0x7d
+# GFX11: v_cmpx_f_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x40,0x7d]
-# GFX11: v_cmpx_f_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x40,0x7d]
0x6a,0x04,0x40,0x7d
+# GFX11: v_cmpx_f_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x40,0x7d]
-# GFX11: v_cmpx_f_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x40,0x7d]
0x7a,0x04,0x40,0x7d
+# GFX11: v_cmpx_f_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x40,0x7d]
-# GFX11: v_cmpx_f_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x40,0x7d]
0x7e,0x04,0x40,0x7d
+# GFX11: v_cmpx_f_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x40,0x7d]
-# GFX11: v_cmpx_f_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x40,0x7d]
0x7c,0x04,0x40,0x7d
+# GFX11: v_cmpx_f_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x40,0x7d]
-# GFX11: v_cmpx_f_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x40,0x7d]
0xc1,0x04,0x40,0x7d
+# GFX11: v_cmpx_f_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x40,0x7d]
-# GFX11: v_cmpx_f_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x40,0x7d]
0xf0,0x04,0x40,0x7d
+# GFX11: v_cmpx_f_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x40,0x7d]
-# GFX11: v_cmpx_f_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x40,0x7d]
0xfd,0x04,0x40,0x7d
+# GFX11: v_cmpx_f_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x40,0x7d]
-# GFX11: v_cmpx_f_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x41,0x7d,0x56,0x34,0x12,0xaf]
0xff,0xfc,0x41,0x7d,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_f_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x41,0x7d,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_f_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x80,0x7d]
0x01,0x05,0x80,0x7d
+# GFX11: v_cmpx_f_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x80,0x7d]
-# GFX11: v_cmpx_f_i32_e32 v255, v2 ; encoding: [0xff,0x05,0x80,0x7d]
0xff,0x05,0x80,0x7d
+# GFX11: v_cmpx_f_i32_e32 v255, v2 ; encoding: [0xff,0x05,0x80,0x7d]
-# GFX11: v_cmpx_f_i32_e32 s1, v2 ; encoding: [0x01,0x04,0x80,0x7d]
0x01,0x04,0x80,0x7d
+# GFX11: v_cmpx_f_i32_e32 s1, v2 ; encoding: [0x01,0x04,0x80,0x7d]
-# GFX11: v_cmpx_f_i32_e32 s105, v2 ; encoding: [0x69,0x04,0x80,0x7d]
0x69,0x04,0x80,0x7d
+# GFX11: v_cmpx_f_i32_e32 s105, v2 ; encoding: [0x69,0x04,0x80,0x7d]
-# GFX11: v_cmpx_f_i32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x80,0x7d]
0x6a,0x04,0x80,0x7d
+# GFX11: v_cmpx_f_i32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x80,0x7d]
-# GFX11: v_cmpx_f_i32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x80,0x7d]
0x6b,0x04,0x80,0x7d
+# GFX11: v_cmpx_f_i32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x80,0x7d]
-# GFX11: v_cmpx_f_i32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x80,0x7d]
0x7b,0x04,0x80,0x7d
+# GFX11: v_cmpx_f_i32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x80,0x7d]
-# GFX11: v_cmpx_f_i32_e32 m0, v2 ; encoding: [0x7d,0x04,0x80,0x7d]
0x7d,0x04,0x80,0x7d
+# GFX11: v_cmpx_f_i32_e32 m0, v2 ; encoding: [0x7d,0x04,0x80,0x7d]
-# GFX11: v_cmpx_f_i32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x80,0x7d]
0x7e,0x04,0x80,0x7d
+# GFX11: v_cmpx_f_i32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x80,0x7d]
-# GFX11: v_cmpx_f_i32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x80,0x7d]
0x7f,0x04,0x80,0x7d
+# GFX11: v_cmpx_f_i32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x80,0x7d]
-# GFX11: v_cmpx_f_i32_e32 null, v2 ; encoding: [0x7c,0x04,0x80,0x7d]
0x7c,0x04,0x80,0x7d
+# GFX11: v_cmpx_f_i32_e32 null, v2 ; encoding: [0x7c,0x04,0x80,0x7d]
-# GFX11: v_cmpx_f_i32_e32 -1, v2 ; encoding: [0xc1,0x04,0x80,0x7d]
0xc1,0x04,0x80,0x7d
+# GFX11: v_cmpx_f_i32_e32 -1, v2 ; encoding: [0xc1,0x04,0x80,0x7d]
-# GFX11: v_cmpx_f_i32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x80,0x7d]
0xf0,0x04,0x80,0x7d
+# GFX11: v_cmpx_f_i32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x80,0x7d]
-# GFX11: v_cmpx_f_i32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x80,0x7d]
0xfd,0x04,0x80,0x7d
+# GFX11: v_cmpx_f_i32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x80,0x7d]
-# GFX11: v_cmpx_f_i32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x81,0x7d,0x56,0x34,0x12,0xaf]
0xff,0xfe,0x81,0x7d,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_f_i32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x81,0x7d,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_f_i64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa0,0x7d]
0x01,0x05,0xa0,0x7d
+# GFX11: v_cmpx_f_i64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa0,0x7d]
-# GFX11: v_cmpx_f_i64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa0,0x7d]
0xfe,0x05,0xa0,0x7d
+# GFX11: v_cmpx_f_i64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa0,0x7d]
-# GFX11: v_cmpx_f_i64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa0,0x7d]
0x02,0x04,0xa0,0x7d
+# GFX11: v_cmpx_f_i64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa0,0x7d]
-# GFX11: v_cmpx_f_i64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa0,0x7d]
0x68,0x04,0xa0,0x7d
+# GFX11: v_cmpx_f_i64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa0,0x7d]
-# GFX11: v_cmpx_f_i64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xa0,0x7d]
0x6a,0x04,0xa0,0x7d
+# GFX11: v_cmpx_f_i64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xa0,0x7d]
-# GFX11: v_cmpx_f_i64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa0,0x7d]
0x7a,0x04,0xa0,0x7d
+# GFX11: v_cmpx_f_i64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa0,0x7d]
-# GFX11: v_cmpx_f_i64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xa0,0x7d]
0x7e,0x04,0xa0,0x7d
+# GFX11: v_cmpx_f_i64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xa0,0x7d]
-# GFX11: v_cmpx_f_i64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xa0,0x7d]
0x7c,0x04,0xa0,0x7d
+# GFX11: v_cmpx_f_i64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xa0,0x7d]
-# GFX11: v_cmpx_f_i64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xa0,0x7d]
0xc1,0x04,0xa0,0x7d
+# GFX11: v_cmpx_f_i64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xa0,0x7d]
-# GFX11: v_cmpx_f_i64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa0,0x7d]
0xf0,0x04,0xa0,0x7d
+# GFX11: v_cmpx_f_i64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa0,0x7d]
-# GFX11: v_cmpx_f_i64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa0,0x7d]
0xfd,0x04,0xa0,0x7d
+# GFX11: v_cmpx_f_i64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa0,0x7d]
-# GFX11: v_cmpx_f_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa1,0x7d,0x56,0x34,0x12,0xaf]
0xff,0xfc,0xa1,0x7d,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_f_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa1,0x7d,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_f_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x90,0x7d]
0x01,0x05,0x90,0x7d
+# GFX11: v_cmpx_f_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x90,0x7d]
-# GFX11: v_cmpx_f_u32_e32 v255, v2 ; encoding: [0xff,0x05,0x90,0x7d]
0xff,0x05,0x90,0x7d
+# GFX11: v_cmpx_f_u32_e32 v255, v2 ; encoding: [0xff,0x05,0x90,0x7d]
-# GFX11: v_cmpx_f_u32_e32 s1, v2 ; encoding: [0x01,0x04,0x90,0x7d]
0x01,0x04,0x90,0x7d
+# GFX11: v_cmpx_f_u32_e32 s1, v2 ; encoding: [0x01,0x04,0x90,0x7d]
-# GFX11: v_cmpx_f_u32_e32 s105, v2 ; encoding: [0x69,0x04,0x90,0x7d]
0x69,0x04,0x90,0x7d
+# GFX11: v_cmpx_f_u32_e32 s105, v2 ; encoding: [0x69,0x04,0x90,0x7d]
-# GFX11: v_cmpx_f_u32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x90,0x7d]
0x6a,0x04,0x90,0x7d
+# GFX11: v_cmpx_f_u32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x90,0x7d]
-# GFX11: v_cmpx_f_u32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x90,0x7d]
0x6b,0x04,0x90,0x7d
+# GFX11: v_cmpx_f_u32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x90,0x7d]
-# GFX11: v_cmpx_f_u32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x90,0x7d]
0x7b,0x04,0x90,0x7d
+# GFX11: v_cmpx_f_u32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x90,0x7d]
-# GFX11: v_cmpx_f_u32_e32 m0, v2 ; encoding: [0x7d,0x04,0x90,0x7d]
0x7d,0x04,0x90,0x7d
+# GFX11: v_cmpx_f_u32_e32 m0, v2 ; encoding: [0x7d,0x04,0x90,0x7d]
-# GFX11: v_cmpx_f_u32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x90,0x7d]
0x7e,0x04,0x90,0x7d
+# GFX11: v_cmpx_f_u32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x90,0x7d]
-# GFX11: v_cmpx_f_u32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x90,0x7d]
0x7f,0x04,0x90,0x7d
+# GFX11: v_cmpx_f_u32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x90,0x7d]
-# GFX11: v_cmpx_f_u32_e32 null, v2 ; encoding: [0x7c,0x04,0x90,0x7d]
0x7c,0x04,0x90,0x7d
+# GFX11: v_cmpx_f_u32_e32 null, v2 ; encoding: [0x7c,0x04,0x90,0x7d]
-# GFX11: v_cmpx_f_u32_e32 -1, v2 ; encoding: [0xc1,0x04,0x90,0x7d]
0xc1,0x04,0x90,0x7d
+# GFX11: v_cmpx_f_u32_e32 -1, v2 ; encoding: [0xc1,0x04,0x90,0x7d]
-# GFX11: v_cmpx_f_u32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x90,0x7d]
0xf0,0x04,0x90,0x7d
+# GFX11:
v_cmpx_f_u32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x90,0x7d] 0xfd,0x04,0x90,0x7d +# GFX11: v_cmpx_f_u32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x90,0x7d] -# GFX11: v_cmpx_f_u32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x91,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x91,0x7d,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_f_u32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x91,0x7d,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_f_u64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb0,0x7d] 0x01,0x05,0xb0,0x7d +# GFX11: v_cmpx_f_u64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb0,0x7d] -# GFX11: v_cmpx_f_u64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb0,0x7d] 0xfe,0x05,0xb0,0x7d +# GFX11: v_cmpx_f_u64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb0,0x7d] -# GFX11: v_cmpx_f_u64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb0,0x7d] 0x02,0x04,0xb0,0x7d +# GFX11: v_cmpx_f_u64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb0,0x7d] -# GFX11: v_cmpx_f_u64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb0,0x7d] 0x68,0x04,0xb0,0x7d +# GFX11: v_cmpx_f_u64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb0,0x7d] -# GFX11: v_cmpx_f_u64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xb0,0x7d] 0x6a,0x04,0xb0,0x7d +# GFX11: v_cmpx_f_u64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xb0,0x7d] -# GFX11: v_cmpx_f_u64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb0,0x7d] 0x7a,0x04,0xb0,0x7d +# GFX11: v_cmpx_f_u64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb0,0x7d] -# GFX11: v_cmpx_f_u64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xb0,0x7d] 0x7e,0x04,0xb0,0x7d +# GFX11: v_cmpx_f_u64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xb0,0x7d] -# GFX11: v_cmpx_f_u64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xb0,0x7d] 0x7c,0x04,0xb0,0x7d +# GFX11: v_cmpx_f_u64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xb0,0x7d] -# GFX11: v_cmpx_f_u64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xb0,0x7d] 0xc1,0x04,0xb0,0x7d +# GFX11: v_cmpx_f_u64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xb0,0x7d] -# GFX11: v_cmpx_f_u64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb0,0x7d] 0xf0,0x04,0xb0,0x7d +# GFX11: v_cmpx_f_u64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb0,0x7d] -# GFX11: v_cmpx_f_u64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb0,0x7d] 0xfd,0x04,0xb0,0x7d +# GFX11: v_cmpx_f_u64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb0,0x7d] -# GFX11: v_cmpx_f_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb1,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfc,0xb1,0x7d,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_f_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb1,0x7d,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_ge_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0c,0x7d] 0x01,0x05,0x0c,0x7d +# GFX11: v_cmpx_ge_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0c,0x7d] -# GFX11: v_cmpx_ge_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0c,0x7d] 0x7f,0x05,0x0c,0x7d +# GFX11: v_cmpx_ge_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0c,0x7d] -# GFX11: v_cmpx_ge_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0c,0x7d] 0x01,0x04,0x0c,0x7d +# GFX11: v_cmpx_ge_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0c,0x7d] -# GFX11: v_cmpx_ge_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0c,0x7d] 0x69,0x04,0x0c,0x7d +# GFX11: v_cmpx_ge_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0c,0x7d] -# GFX11: v_cmpx_ge_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0c,0x7d] 0x6a,0x04,0x0c,0x7d +# GFX11: v_cmpx_ge_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0c,0x7d] -# GFX11: v_cmpx_ge_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0c,0x7d] 0x6b,0x04,0x0c,0x7d +# GFX11: v_cmpx_ge_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0c,0x7d] -# GFX11: v_cmpx_ge_f16_e32 ttmp15, v2 ; encoding: 
[0x7b,0x04,0x0c,0x7d] 0x7b,0x04,0x0c,0x7d +# GFX11: v_cmpx_ge_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0c,0x7d] -# GFX11: v_cmpx_ge_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0c,0x7d] 0x7d,0x04,0x0c,0x7d +# GFX11: v_cmpx_ge_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0c,0x7d] -# GFX11: v_cmpx_ge_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0c,0x7d] 0x7e,0x04,0x0c,0x7d +# GFX11: v_cmpx_ge_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0c,0x7d] -# GFX11: v_cmpx_ge_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0c,0x7d] 0x7f,0x04,0x0c,0x7d +# GFX11: v_cmpx_ge_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0c,0x7d] -# GFX11: v_cmpx_ge_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0c,0x7d] 0x7c,0x04,0x0c,0x7d +# GFX11: v_cmpx_ge_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0c,0x7d] -# GFX11: v_cmpx_ge_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0c,0x7d] 0xc1,0x04,0x0c,0x7d +# GFX11: v_cmpx_ge_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0c,0x7d] -# GFX11: v_cmpx_ge_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0c,0x7d] 0xf0,0x04,0x0c,0x7d +# GFX11: v_cmpx_ge_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0c,0x7d] -# GFX11: v_cmpx_ge_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0c,0x7d] 0xfd,0x04,0x0c,0x7d +# GFX11: v_cmpx_ge_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0c,0x7d] -# GFX11: v_cmpx_ge_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0c,0x7d,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x0c,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_ge_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0c,0x7d,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_ge_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2c,0x7d] 0x01,0x05,0x2c,0x7d +# GFX11: v_cmpx_ge_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2c,0x7d] -# GFX11: v_cmpx_ge_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x2c,0x7d] 0xff,0x05,0x2c,0x7d +# GFX11: v_cmpx_ge_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x2c,0x7d] -# GFX11: v_cmpx_ge_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x2c,0x7d] 0x01,0x04,0x2c,0x7d +# GFX11: v_cmpx_ge_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x2c,0x7d] -# GFX11: v_cmpx_ge_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x2c,0x7d] 0x69,0x04,0x2c,0x7d +# GFX11: v_cmpx_ge_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x2c,0x7d] -# GFX11: v_cmpx_ge_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x2c,0x7d] 0x6a,0x04,0x2c,0x7d +# GFX11: v_cmpx_ge_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x2c,0x7d] -# GFX11: v_cmpx_ge_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x2c,0x7d] 0x6b,0x04,0x2c,0x7d +# GFX11: v_cmpx_ge_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x2c,0x7d] -# GFX11: v_cmpx_ge_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x2c,0x7d] 0x7b,0x04,0x2c,0x7d +# GFX11: v_cmpx_ge_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x2c,0x7d] -# GFX11: v_cmpx_ge_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x2c,0x7d] 0x7d,0x04,0x2c,0x7d +# GFX11: v_cmpx_ge_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x2c,0x7d] -# GFX11: v_cmpx_ge_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x2c,0x7d] 0x7e,0x04,0x2c,0x7d +# GFX11: v_cmpx_ge_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x2c,0x7d] -# GFX11: v_cmpx_ge_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x2c,0x7d] 0x7f,0x04,0x2c,0x7d +# GFX11: v_cmpx_ge_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x2c,0x7d] -# GFX11: v_cmpx_ge_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x2c,0x7d] 0x7c,0x04,0x2c,0x7d +# GFX11: v_cmpx_ge_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x2c,0x7d] -# GFX11: v_cmpx_ge_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x2c,0x7d] 0xc1,0x04,0x2c,0x7d +# GFX11: v_cmpx_ge_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x2c,0x7d] -# GFX11: v_cmpx_ge_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x2c,0x7d] 0xf0,0x04,0x2c,0x7d +# GFX11: 
v_cmpx_ge_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x2c,0x7d] -# GFX11: v_cmpx_ge_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x2c,0x7d] 0xfd,0x04,0x2c,0x7d +# GFX11: v_cmpx_ge_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x2c,0x7d] -# GFX11: v_cmpx_ge_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2d,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x2d,0x7d,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_ge_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2d,0x7d,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_ge_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4c,0x7d] 0x01,0x05,0x4c,0x7d +# GFX11: v_cmpx_ge_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4c,0x7d] -# GFX11: v_cmpx_ge_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4c,0x7d] 0xfe,0x05,0x4c,0x7d +# GFX11: v_cmpx_ge_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4c,0x7d] -# GFX11: v_cmpx_ge_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x4c,0x7d] 0x02,0x04,0x4c,0x7d +# GFX11: v_cmpx_ge_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x4c,0x7d] -# GFX11: v_cmpx_ge_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4c,0x7d] 0x68,0x04,0x4c,0x7d +# GFX11: v_cmpx_ge_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4c,0x7d] -# GFX11: v_cmpx_ge_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x4c,0x7d] 0x6a,0x04,0x4c,0x7d +# GFX11: v_cmpx_ge_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x4c,0x7d] -# GFX11: v_cmpx_ge_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x4c,0x7d] 0x7a,0x04,0x4c,0x7d +# GFX11: v_cmpx_ge_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x4c,0x7d] -# GFX11: v_cmpx_ge_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x4c,0x7d] 0x7e,0x04,0x4c,0x7d +# GFX11: v_cmpx_ge_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x4c,0x7d] -# GFX11: v_cmpx_ge_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x4c,0x7d] 0x7c,0x04,0x4c,0x7d +# GFX11: v_cmpx_ge_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x4c,0x7d] -# GFX11: v_cmpx_ge_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x4c,0x7d] 0xc1,0x04,0x4c,0x7d +# GFX11: v_cmpx_ge_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x4c,0x7d] -# GFX11: v_cmpx_ge_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4c,0x7d] 0xf0,0x04,0x4c,0x7d +# GFX11: v_cmpx_ge_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4c,0x7d] -# GFX11: v_cmpx_ge_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4c,0x7d] 0xfd,0x04,0x4c,0x7d +# GFX11: v_cmpx_ge_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4c,0x7d] -# GFX11: v_cmpx_ge_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4d,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfc,0x4d,0x7d,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_ge_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4d,0x7d,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_ge_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x6c,0x7d] 0x01,0x05,0x6c,0x7d +# GFX11: v_cmpx_ge_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x6c,0x7d] -# GFX11: v_cmpx_ge_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x6c,0x7d] 0x7f,0x05,0x6c,0x7d +# GFX11: v_cmpx_ge_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x6c,0x7d] -# GFX11: v_cmpx_ge_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x6c,0x7d] 0x01,0x04,0x6c,0x7d +# GFX11: v_cmpx_ge_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x6c,0x7d] -# GFX11: v_cmpx_ge_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x6c,0x7d] 0x69,0x04,0x6c,0x7d +# GFX11: v_cmpx_ge_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x6c,0x7d] -# GFX11: v_cmpx_ge_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x6c,0x7d] 0x6a,0x04,0x6c,0x7d +# GFX11: v_cmpx_ge_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x6c,0x7d] -# GFX11: v_cmpx_ge_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x6c,0x7d] 0x6b,0x04,0x6c,0x7d +# GFX11: 
v_cmpx_ge_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x6c,0x7d] -# GFX11: v_cmpx_ge_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x6c,0x7d] 0x7b,0x04,0x6c,0x7d +# GFX11: v_cmpx_ge_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x6c,0x7d] -# GFX11: v_cmpx_ge_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x6c,0x7d] 0x7d,0x04,0x6c,0x7d +# GFX11: v_cmpx_ge_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x6c,0x7d] -# GFX11: v_cmpx_ge_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x6c,0x7d] 0x7e,0x04,0x6c,0x7d +# GFX11: v_cmpx_ge_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x6c,0x7d] -# GFX11: v_cmpx_ge_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x6c,0x7d] 0x7f,0x04,0x6c,0x7d +# GFX11: v_cmpx_ge_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x6c,0x7d] -# GFX11: v_cmpx_ge_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x6c,0x7d] 0x7c,0x04,0x6c,0x7d +# GFX11: v_cmpx_ge_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x6c,0x7d] -# GFX11: v_cmpx_ge_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x6c,0x7d] 0xc1,0x04,0x6c,0x7d +# GFX11: v_cmpx_ge_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x6c,0x7d] -# GFX11: v_cmpx_ge_i16_e32 0x3800, v2 0xf0,0x04,0x6c,0x7d +# GFX11: v_cmpx_ge_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x6c,0x7d,0x00,0x38,0x00,0x00] -# GFX11: v_cmpx_ge_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x6c,0x7d] 0xfd,0x04,0x6c,0x7d +# GFX11: v_cmpx_ge_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x6c,0x7d] -# GFX11: v_cmpx_ge_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6c,0x7d,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x6c,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_ge_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6c,0x7d,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_ge_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x8c,0x7d] 0x01,0x05,0x8c,0x7d +# GFX11: v_cmpx_ge_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x8c,0x7d] -# GFX11: v_cmpx_ge_i32_e32 v255, v2 ; encoding: [0xff,0x05,0x8c,0x7d] 0xff,0x05,0x8c,0x7d +# GFX11: v_cmpx_ge_i32_e32 v255, v2 ; encoding: [0xff,0x05,0x8c,0x7d] -# GFX11: v_cmpx_ge_i32_e32 s1, v2 ; encoding: [0x01,0x04,0x8c,0x7d] 0x01,0x04,0x8c,0x7d +# GFX11: v_cmpx_ge_i32_e32 s1, v2 ; encoding: [0x01,0x04,0x8c,0x7d] -# GFX11: v_cmpx_ge_i32_e32 s105, v2 ; encoding: [0x69,0x04,0x8c,0x7d] 0x69,0x04,0x8c,0x7d +# GFX11: v_cmpx_ge_i32_e32 s105, v2 ; encoding: [0x69,0x04,0x8c,0x7d] -# GFX11: v_cmpx_ge_i32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x8c,0x7d] 0x6a,0x04,0x8c,0x7d +# GFX11: v_cmpx_ge_i32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x8c,0x7d] -# GFX11: v_cmpx_ge_i32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x8c,0x7d] 0x6b,0x04,0x8c,0x7d +# GFX11: v_cmpx_ge_i32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x8c,0x7d] -# GFX11: v_cmpx_ge_i32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x8c,0x7d] 0x7b,0x04,0x8c,0x7d +# GFX11: v_cmpx_ge_i32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x8c,0x7d] -# GFX11: v_cmpx_ge_i32_e32 m0, v2 ; encoding: [0x7d,0x04,0x8c,0x7d] 0x7d,0x04,0x8c,0x7d +# GFX11: v_cmpx_ge_i32_e32 m0, v2 ; encoding: [0x7d,0x04,0x8c,0x7d] -# GFX11: v_cmpx_ge_i32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x8c,0x7d] 0x7e,0x04,0x8c,0x7d +# GFX11: v_cmpx_ge_i32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x8c,0x7d] -# GFX11: v_cmpx_ge_i32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x8c,0x7d] 0x7f,0x04,0x8c,0x7d +# GFX11: v_cmpx_ge_i32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x8c,0x7d] -# GFX11: v_cmpx_ge_i32_e32 null, v2 ; encoding: [0x7c,0x04,0x8c,0x7d] 0x7c,0x04,0x8c,0x7d +# GFX11: v_cmpx_ge_i32_e32 null, v2 ; encoding: [0x7c,0x04,0x8c,0x7d] -# GFX11: v_cmpx_ge_i32_e32 -1, v2 ; encoding: [0xc1,0x04,0x8c,0x7d] 0xc1,0x04,0x8c,0x7d +# GFX11: v_cmpx_ge_i32_e32 -1, v2 ; encoding: [0xc1,0x04,0x8c,0x7d] -# GFX11: 
v_cmpx_ge_i32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x8c,0x7d] 0xf0,0x04,0x8c,0x7d +# GFX11: v_cmpx_ge_i32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x8c,0x7d] -# GFX11: v_cmpx_ge_i32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x8c,0x7d] 0xfd,0x04,0x8c,0x7d +# GFX11: v_cmpx_ge_i32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x8c,0x7d] -# GFX11: v_cmpx_ge_i32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x8d,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x8d,0x7d,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_ge_i32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x8d,0x7d,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_ge_i64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xac,0x7d] 0x01,0x05,0xac,0x7d +# GFX11: v_cmpx_ge_i64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xac,0x7d] -# GFX11: v_cmpx_ge_i64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xac,0x7d] 0xfe,0x05,0xac,0x7d +# GFX11: v_cmpx_ge_i64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xac,0x7d] -# GFX11: v_cmpx_ge_i64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xac,0x7d] 0x02,0x04,0xac,0x7d +# GFX11: v_cmpx_ge_i64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xac,0x7d] -# GFX11: v_cmpx_ge_i64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xac,0x7d] 0x68,0x04,0xac,0x7d +# GFX11: v_cmpx_ge_i64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xac,0x7d] -# GFX11: v_cmpx_ge_i64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xac,0x7d] 0x6a,0x04,0xac,0x7d +# GFX11: v_cmpx_ge_i64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xac,0x7d] -# GFX11: v_cmpx_ge_i64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xac,0x7d] 0x7a,0x04,0xac,0x7d +# GFX11: v_cmpx_ge_i64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xac,0x7d] -# GFX11: v_cmpx_ge_i64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xac,0x7d] 0x7e,0x04,0xac,0x7d +# GFX11: v_cmpx_ge_i64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xac,0x7d] -# GFX11: v_cmpx_ge_i64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xac,0x7d] 0x7c,0x04,0xac,0x7d +# GFX11: v_cmpx_ge_i64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xac,0x7d] -# GFX11: v_cmpx_ge_i64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xac,0x7d] 0xc1,0x04,0xac,0x7d +# GFX11: v_cmpx_ge_i64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xac,0x7d] -# GFX11: v_cmpx_ge_i64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xac,0x7d] 0xf0,0x04,0xac,0x7d +# GFX11: v_cmpx_ge_i64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xac,0x7d] -# GFX11: v_cmpx_ge_i64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xac,0x7d] 0xfd,0x04,0xac,0x7d +# GFX11: v_cmpx_ge_i64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xac,0x7d] -# GFX11: v_cmpx_ge_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xad,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfc,0xad,0x7d,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_ge_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xad,0x7d,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_ge_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x7c,0x7d] 0x01,0x05,0x7c,0x7d +# GFX11: v_cmpx_ge_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x7c,0x7d] -# GFX11: v_cmpx_ge_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x7c,0x7d] 0x7f,0x05,0x7c,0x7d +# GFX11: v_cmpx_ge_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x7c,0x7d] -# GFX11: v_cmpx_ge_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x7c,0x7d] 0x01,0x04,0x7c,0x7d +# GFX11: v_cmpx_ge_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x7c,0x7d] -# GFX11: v_cmpx_ge_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x7c,0x7d] 0x69,0x04,0x7c,0x7d +# GFX11: v_cmpx_ge_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x7c,0x7d] -# GFX11: v_cmpx_ge_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x7c,0x7d] 0x6a,0x04,0x7c,0x7d +# GFX11: v_cmpx_ge_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x7c,0x7d] -# GFX11: v_cmpx_ge_u16_e32 
vcc_hi, v2 ; encoding: [0x6b,0x04,0x7c,0x7d] 0x6b,0x04,0x7c,0x7d +# GFX11: v_cmpx_ge_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x7c,0x7d] -# GFX11: v_cmpx_ge_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x7c,0x7d] 0x7b,0x04,0x7c,0x7d +# GFX11: v_cmpx_ge_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x7c,0x7d] -# GFX11: v_cmpx_ge_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x7c,0x7d] 0x7d,0x04,0x7c,0x7d +# GFX11: v_cmpx_ge_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x7c,0x7d] -# GFX11: v_cmpx_ge_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x7c,0x7d] 0x7e,0x04,0x7c,0x7d +# GFX11: v_cmpx_ge_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x7c,0x7d] -# GFX11: v_cmpx_ge_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x7c,0x7d] 0x7f,0x04,0x7c,0x7d +# GFX11: v_cmpx_ge_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x7c,0x7d] -# GFX11: v_cmpx_ge_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x7c,0x7d] 0x7c,0x04,0x7c,0x7d +# GFX11: v_cmpx_ge_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x7c,0x7d] -# GFX11: v_cmpx_ge_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x7c,0x7d] 0xc1,0x04,0x7c,0x7d +# GFX11: v_cmpx_ge_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x7c,0x7d] -# GFX11: v_cmpx_ge_u16_e32 0x3800, v2 0xf0,0x04,0x7c,0x7d +# GFX11: v_cmpx_ge_u16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x7c,0x7d,0x00,0x38,0x00,0x00] -# GFX11: v_cmpx_ge_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x7c,0x7d] 0xfd,0x04,0x7c,0x7d +# GFX11: v_cmpx_ge_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x7c,0x7d] -# GFX11: v_cmpx_ge_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x7c,0x7d,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x7c,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_ge_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x7c,0x7d,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_ge_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x9c,0x7d] 0x01,0x05,0x9c,0x7d +# GFX11: v_cmpx_ge_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x9c,0x7d] -# GFX11: v_cmpx_ge_u32_e32 v255, v2 ; encoding: [0xff,0x05,0x9c,0x7d] 0xff,0x05,0x9c,0x7d +# GFX11: v_cmpx_ge_u32_e32 v255, v2 ; encoding: [0xff,0x05,0x9c,0x7d] -# GFX11: v_cmpx_ge_u32_e32 s1, v2 ; encoding: [0x01,0x04,0x9c,0x7d] 0x01,0x04,0x9c,0x7d +# GFX11: v_cmpx_ge_u32_e32 s1, v2 ; encoding: [0x01,0x04,0x9c,0x7d] -# GFX11: v_cmpx_ge_u32_e32 s105, v2 ; encoding: [0x69,0x04,0x9c,0x7d] 0x69,0x04,0x9c,0x7d +# GFX11: v_cmpx_ge_u32_e32 s105, v2 ; encoding: [0x69,0x04,0x9c,0x7d] -# GFX11: v_cmpx_ge_u32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x9c,0x7d] 0x6a,0x04,0x9c,0x7d +# GFX11: v_cmpx_ge_u32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x9c,0x7d] -# GFX11: v_cmpx_ge_u32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x9c,0x7d] 0x6b,0x04,0x9c,0x7d +# GFX11: v_cmpx_ge_u32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x9c,0x7d] -# GFX11: v_cmpx_ge_u32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x9c,0x7d] 0x7b,0x04,0x9c,0x7d +# GFX11: v_cmpx_ge_u32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x9c,0x7d] -# GFX11: v_cmpx_ge_u32_e32 m0, v2 ; encoding: [0x7d,0x04,0x9c,0x7d] 0x7d,0x04,0x9c,0x7d +# GFX11: v_cmpx_ge_u32_e32 m0, v2 ; encoding: [0x7d,0x04,0x9c,0x7d] -# GFX11: v_cmpx_ge_u32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x9c,0x7d] 0x7e,0x04,0x9c,0x7d +# GFX11: v_cmpx_ge_u32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x9c,0x7d] -# GFX11: v_cmpx_ge_u32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x9c,0x7d] 0x7f,0x04,0x9c,0x7d +# GFX11: v_cmpx_ge_u32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x9c,0x7d] -# GFX11: v_cmpx_ge_u32_e32 null, v2 ; encoding: [0x7c,0x04,0x9c,0x7d] 0x7c,0x04,0x9c,0x7d +# GFX11: v_cmpx_ge_u32_e32 null, v2 ; encoding: [0x7c,0x04,0x9c,0x7d] -# GFX11: v_cmpx_ge_u32_e32 -1, v2 ; encoding: [0xc1,0x04,0x9c,0x7d] 0xc1,0x04,0x9c,0x7d +# 
GFX11: v_cmpx_ge_u32_e32 -1, v2 ; encoding: [0xc1,0x04,0x9c,0x7d] -# GFX11: v_cmpx_ge_u32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x9c,0x7d] 0xf0,0x04,0x9c,0x7d +# GFX11: v_cmpx_ge_u32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x9c,0x7d] -# GFX11: v_cmpx_ge_u32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x9c,0x7d] 0xfd,0x04,0x9c,0x7d +# GFX11: v_cmpx_ge_u32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x9c,0x7d] -# GFX11: v_cmpx_ge_u32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x9d,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x9d,0x7d,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_ge_u32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x9d,0x7d,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_ge_u64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xbc,0x7d] 0x01,0x05,0xbc,0x7d +# GFX11: v_cmpx_ge_u64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xbc,0x7d] -# GFX11: v_cmpx_ge_u64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xbc,0x7d] 0xfe,0x05,0xbc,0x7d +# GFX11: v_cmpx_ge_u64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xbc,0x7d] -# GFX11: v_cmpx_ge_u64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xbc,0x7d] 0x02,0x04,0xbc,0x7d +# GFX11: v_cmpx_ge_u64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xbc,0x7d] -# GFX11: v_cmpx_ge_u64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xbc,0x7d] 0x68,0x04,0xbc,0x7d +# GFX11: v_cmpx_ge_u64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xbc,0x7d] -# GFX11: v_cmpx_ge_u64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xbc,0x7d] 0x6a,0x04,0xbc,0x7d +# GFX11: v_cmpx_ge_u64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xbc,0x7d] -# GFX11: v_cmpx_ge_u64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xbc,0x7d] 0x7a,0x04,0xbc,0x7d +# GFX11: v_cmpx_ge_u64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xbc,0x7d] -# GFX11: v_cmpx_ge_u64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xbc,0x7d] 0x7e,0x04,0xbc,0x7d +# GFX11: v_cmpx_ge_u64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xbc,0x7d] -# GFX11: v_cmpx_ge_u64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xbc,0x7d] 0x7c,0x04,0xbc,0x7d +# GFX11: v_cmpx_ge_u64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xbc,0x7d] -# GFX11: v_cmpx_ge_u64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xbc,0x7d] 0xc1,0x04,0xbc,0x7d +# GFX11: v_cmpx_ge_u64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xbc,0x7d] -# GFX11: v_cmpx_ge_u64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xbc,0x7d] 0xf0,0x04,0xbc,0x7d +# GFX11: v_cmpx_ge_u64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xbc,0x7d] -# GFX11: v_cmpx_ge_u64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xbc,0x7d] 0xfd,0x04,0xbc,0x7d +# GFX11: v_cmpx_ge_u64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xbc,0x7d] -# GFX11: v_cmpx_ge_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbd,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfc,0xbd,0x7d,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_ge_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbd,0x7d,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_gt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x08,0x7d] 0x01,0x05,0x08,0x7d +# GFX11: v_cmpx_gt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x08,0x7d] -# GFX11: v_cmpx_gt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x08,0x7d] 0x7f,0x05,0x08,0x7d +# GFX11: v_cmpx_gt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x08,0x7d] -# GFX11: v_cmpx_gt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x08,0x7d] 0x01,0x04,0x08,0x7d +# GFX11: v_cmpx_gt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x08,0x7d] -# GFX11: v_cmpx_gt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x08,0x7d] 0x69,0x04,0x08,0x7d +# GFX11: v_cmpx_gt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x08,0x7d] -# GFX11: v_cmpx_gt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x08,0x7d] 0x6a,0x04,0x08,0x7d +# GFX11: 
v_cmpx_gt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x08,0x7d] -# GFX11: v_cmpx_gt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x08,0x7d] 0x6b,0x04,0x08,0x7d +# GFX11: v_cmpx_gt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x08,0x7d] -# GFX11: v_cmpx_gt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x08,0x7d] 0x7b,0x04,0x08,0x7d +# GFX11: v_cmpx_gt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x08,0x7d] -# GFX11: v_cmpx_gt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x08,0x7d] 0x7d,0x04,0x08,0x7d +# GFX11: v_cmpx_gt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x08,0x7d] -# GFX11: v_cmpx_gt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x08,0x7d] 0x7e,0x04,0x08,0x7d +# GFX11: v_cmpx_gt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x08,0x7d] -# GFX11: v_cmpx_gt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x08,0x7d] 0x7f,0x04,0x08,0x7d +# GFX11: v_cmpx_gt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x08,0x7d] -# GFX11: v_cmpx_gt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x08,0x7d] 0x7c,0x04,0x08,0x7d +# GFX11: v_cmpx_gt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x08,0x7d] -# GFX11: v_cmpx_gt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x08,0x7d] 0xc1,0x04,0x08,0x7d +# GFX11: v_cmpx_gt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x08,0x7d] -# GFX11: v_cmpx_gt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x08,0x7d] 0xf0,0x04,0x08,0x7d +# GFX11: v_cmpx_gt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x08,0x7d] -# GFX11: v_cmpx_gt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x08,0x7d] 0xfd,0x04,0x08,0x7d +# GFX11: v_cmpx_gt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x08,0x7d] -# GFX11: v_cmpx_gt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x08,0x7d,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x08,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_gt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x08,0x7d,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_gt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x28,0x7d] 0x01,0x05,0x28,0x7d +# GFX11: v_cmpx_gt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x28,0x7d] -# GFX11: v_cmpx_gt_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x28,0x7d] 0xff,0x05,0x28,0x7d +# GFX11: v_cmpx_gt_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x28,0x7d] -# GFX11: v_cmpx_gt_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x28,0x7d] 0x01,0x04,0x28,0x7d +# GFX11: v_cmpx_gt_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x28,0x7d] -# GFX11: v_cmpx_gt_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x28,0x7d] 0x69,0x04,0x28,0x7d +# GFX11: v_cmpx_gt_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x28,0x7d] -# GFX11: v_cmpx_gt_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x28,0x7d] 0x6a,0x04,0x28,0x7d +# GFX11: v_cmpx_gt_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x28,0x7d] -# GFX11: v_cmpx_gt_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x28,0x7d] 0x6b,0x04,0x28,0x7d +# GFX11: v_cmpx_gt_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x28,0x7d] -# GFX11: v_cmpx_gt_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x28,0x7d] 0x7b,0x04,0x28,0x7d +# GFX11: v_cmpx_gt_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x28,0x7d] -# GFX11: v_cmpx_gt_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x28,0x7d] 0x7d,0x04,0x28,0x7d +# GFX11: v_cmpx_gt_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x28,0x7d] -# GFX11: v_cmpx_gt_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x28,0x7d] 0x7e,0x04,0x28,0x7d +# GFX11: v_cmpx_gt_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x28,0x7d] -# GFX11: v_cmpx_gt_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x28,0x7d] 0x7f,0x04,0x28,0x7d +# GFX11: v_cmpx_gt_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x28,0x7d] -# GFX11: v_cmpx_gt_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x28,0x7d] 0x7c,0x04,0x28,0x7d +# GFX11: v_cmpx_gt_f32_e32 null, v2 ; encoding: 
[0x7c,0x04,0x28,0x7d] -# GFX11: v_cmpx_gt_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x28,0x7d] 0xc1,0x04,0x28,0x7d +# GFX11: v_cmpx_gt_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x28,0x7d] -# GFX11: v_cmpx_gt_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x28,0x7d] 0xf0,0x04,0x28,0x7d +# GFX11: v_cmpx_gt_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x28,0x7d] -# GFX11: v_cmpx_gt_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x28,0x7d] 0xfd,0x04,0x28,0x7d +# GFX11: v_cmpx_gt_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x28,0x7d] -# GFX11: v_cmpx_gt_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x29,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x29,0x7d,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_gt_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x29,0x7d,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_gt_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x48,0x7d] 0x01,0x05,0x48,0x7d +# GFX11: v_cmpx_gt_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x48,0x7d] -# GFX11: v_cmpx_gt_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x48,0x7d] 0xfe,0x05,0x48,0x7d +# GFX11: v_cmpx_gt_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x48,0x7d] -# GFX11: v_cmpx_gt_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x48,0x7d] 0x02,0x04,0x48,0x7d +# GFX11: v_cmpx_gt_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x48,0x7d] -# GFX11: v_cmpx_gt_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x48,0x7d] 0x68,0x04,0x48,0x7d +# GFX11: v_cmpx_gt_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x48,0x7d] -# GFX11: v_cmpx_gt_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x48,0x7d] 0x6a,0x04,0x48,0x7d +# GFX11: v_cmpx_gt_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x48,0x7d] -# GFX11: v_cmpx_gt_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x48,0x7d] 0x7a,0x04,0x48,0x7d +# GFX11: v_cmpx_gt_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x48,0x7d] -# GFX11: v_cmpx_gt_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x48,0x7d] 0x7e,0x04,0x48,0x7d +# GFX11: v_cmpx_gt_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x48,0x7d] -# GFX11: v_cmpx_gt_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x48,0x7d] 0x7c,0x04,0x48,0x7d +# GFX11: v_cmpx_gt_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x48,0x7d] -# GFX11: v_cmpx_gt_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x48,0x7d] 0xc1,0x04,0x48,0x7d +# GFX11: v_cmpx_gt_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x48,0x7d] -# GFX11: v_cmpx_gt_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x48,0x7d] 0xf0,0x04,0x48,0x7d +# GFX11: v_cmpx_gt_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x48,0x7d] -# GFX11: v_cmpx_gt_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x48,0x7d] 0xfd,0x04,0x48,0x7d +# GFX11: v_cmpx_gt_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x48,0x7d] -# GFX11: v_cmpx_gt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x49,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfc,0x49,0x7d,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_gt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x49,0x7d,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_gt_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x68,0x7d] 0x01,0x05,0x68,0x7d +# GFX11: v_cmpx_gt_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x68,0x7d] -# GFX11: v_cmpx_gt_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x68,0x7d] 0x7f,0x05,0x68,0x7d +# GFX11: v_cmpx_gt_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x68,0x7d] -# GFX11: v_cmpx_gt_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x68,0x7d] 0x01,0x04,0x68,0x7d +# GFX11: v_cmpx_gt_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x68,0x7d] -# GFX11: v_cmpx_gt_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x68,0x7d] 0x69,0x04,0x68,0x7d +# GFX11: v_cmpx_gt_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x68,0x7d] -# 
GFX11: v_cmpx_gt_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x68,0x7d] 0x6a,0x04,0x68,0x7d +# GFX11: v_cmpx_gt_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x68,0x7d] -# GFX11: v_cmpx_gt_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x68,0x7d] 0x6b,0x04,0x68,0x7d +# GFX11: v_cmpx_gt_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x68,0x7d] -# GFX11: v_cmpx_gt_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x68,0x7d] 0x7b,0x04,0x68,0x7d +# GFX11: v_cmpx_gt_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x68,0x7d] -# GFX11: v_cmpx_gt_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x68,0x7d] 0x7d,0x04,0x68,0x7d +# GFX11: v_cmpx_gt_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x68,0x7d] -# GFX11: v_cmpx_gt_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x68,0x7d] 0x7e,0x04,0x68,0x7d +# GFX11: v_cmpx_gt_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x68,0x7d] -# GFX11: v_cmpx_gt_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x68,0x7d] 0x7f,0x04,0x68,0x7d +# GFX11: v_cmpx_gt_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x68,0x7d] -# GFX11: v_cmpx_gt_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x68,0x7d] 0x7c,0x04,0x68,0x7d +# GFX11: v_cmpx_gt_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x68,0x7d] -# GFX11: v_cmpx_gt_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x68,0x7d] 0xc1,0x04,0x68,0x7d +# GFX11: v_cmpx_gt_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x68,0x7d] -# GFX11: v_cmpx_gt_i16_e32 0x3800, v2 0xf0,0x04,0x68,0x7d +# GFX11: v_cmpx_gt_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x68,0x7d,0x00,0x38,0x00,0x00] -# GFX11: v_cmpx_gt_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x68,0x7d] 0xfd,0x04,0x68,0x7d +# GFX11: v_cmpx_gt_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x68,0x7d] -# GFX11: v_cmpx_gt_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x68,0x7d,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x68,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_gt_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x68,0x7d,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_gt_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x88,0x7d] 0x01,0x05,0x88,0x7d +# GFX11: v_cmpx_gt_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x88,0x7d] -# GFX11: v_cmpx_gt_i32_e32 v255, v2 ; encoding: [0xff,0x05,0x88,0x7d] 0xff,0x05,0x88,0x7d +# GFX11: v_cmpx_gt_i32_e32 v255, v2 ; encoding: [0xff,0x05,0x88,0x7d] -# GFX11: v_cmpx_gt_i32_e32 s1, v2 ; encoding: [0x01,0x04,0x88,0x7d] 0x01,0x04,0x88,0x7d +# GFX11: v_cmpx_gt_i32_e32 s1, v2 ; encoding: [0x01,0x04,0x88,0x7d] -# GFX11: v_cmpx_gt_i32_e32 s105, v2 ; encoding: [0x69,0x04,0x88,0x7d] 0x69,0x04,0x88,0x7d +# GFX11: v_cmpx_gt_i32_e32 s105, v2 ; encoding: [0x69,0x04,0x88,0x7d] -# GFX11: v_cmpx_gt_i32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x88,0x7d] 0x6a,0x04,0x88,0x7d +# GFX11: v_cmpx_gt_i32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x88,0x7d] -# GFX11: v_cmpx_gt_i32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x88,0x7d] 0x6b,0x04,0x88,0x7d +# GFX11: v_cmpx_gt_i32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x88,0x7d] -# GFX11: v_cmpx_gt_i32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x88,0x7d] 0x7b,0x04,0x88,0x7d +# GFX11: v_cmpx_gt_i32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x88,0x7d] -# GFX11: v_cmpx_gt_i32_e32 m0, v2 ; encoding: [0x7d,0x04,0x88,0x7d] 0x7d,0x04,0x88,0x7d +# GFX11: v_cmpx_gt_i32_e32 m0, v2 ; encoding: [0x7d,0x04,0x88,0x7d] -# GFX11: v_cmpx_gt_i32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x88,0x7d] 0x7e,0x04,0x88,0x7d +# GFX11: v_cmpx_gt_i32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x88,0x7d] -# GFX11: v_cmpx_gt_i32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x88,0x7d] 0x7f,0x04,0x88,0x7d +# GFX11: v_cmpx_gt_i32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x88,0x7d] -# GFX11: v_cmpx_gt_i32_e32 null, v2 ; encoding: 
[0x7c,0x04,0x88,0x7d] 0x7c,0x04,0x88,0x7d +# GFX11: v_cmpx_gt_i32_e32 null, v2 ; encoding: [0x7c,0x04,0x88,0x7d] -# GFX11: v_cmpx_gt_i32_e32 -1, v2 ; encoding: [0xc1,0x04,0x88,0x7d] 0xc1,0x04,0x88,0x7d +# GFX11: v_cmpx_gt_i32_e32 -1, v2 ; encoding: [0xc1,0x04,0x88,0x7d] -# GFX11: v_cmpx_gt_i32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x88,0x7d] 0xf0,0x04,0x88,0x7d +# GFX11: v_cmpx_gt_i32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x88,0x7d] -# GFX11: v_cmpx_gt_i32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x88,0x7d] 0xfd,0x04,0x88,0x7d +# GFX11: v_cmpx_gt_i32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x88,0x7d] -# GFX11: v_cmpx_gt_i32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x89,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x89,0x7d,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_gt_i32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x89,0x7d,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_gt_i64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa8,0x7d] 0x01,0x05,0xa8,0x7d +# GFX11: v_cmpx_gt_i64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa8,0x7d] -# GFX11: v_cmpx_gt_i64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa8,0x7d] 0xfe,0x05,0xa8,0x7d +# GFX11: v_cmpx_gt_i64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa8,0x7d] -# GFX11: v_cmpx_gt_i64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa8,0x7d] 0x02,0x04,0xa8,0x7d +# GFX11: v_cmpx_gt_i64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa8,0x7d] -# GFX11: v_cmpx_gt_i64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa8,0x7d] 0x68,0x04,0xa8,0x7d +# GFX11: v_cmpx_gt_i64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa8,0x7d] -# GFX11: v_cmpx_gt_i64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xa8,0x7d] 0x6a,0x04,0xa8,0x7d +# GFX11: v_cmpx_gt_i64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xa8,0x7d] -# GFX11: v_cmpx_gt_i64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa8,0x7d] 0x7a,0x04,0xa8,0x7d +# GFX11: v_cmpx_gt_i64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa8,0x7d] -# GFX11: v_cmpx_gt_i64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xa8,0x7d] 0x7e,0x04,0xa8,0x7d +# GFX11: v_cmpx_gt_i64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xa8,0x7d] -# GFX11: v_cmpx_gt_i64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xa8,0x7d] 0x7c,0x04,0xa8,0x7d +# GFX11: v_cmpx_gt_i64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xa8,0x7d] -# GFX11: v_cmpx_gt_i64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xa8,0x7d] 0xc1,0x04,0xa8,0x7d +# GFX11: v_cmpx_gt_i64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xa8,0x7d] -# GFX11: v_cmpx_gt_i64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa8,0x7d] 0xf0,0x04,0xa8,0x7d +# GFX11: v_cmpx_gt_i64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa8,0x7d] -# GFX11: v_cmpx_gt_i64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa8,0x7d] 0xfd,0x04,0xa8,0x7d +# GFX11: v_cmpx_gt_i64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa8,0x7d] -# GFX11: v_cmpx_gt_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa9,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfc,0xa9,0x7d,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_gt_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa9,0x7d,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_gt_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x78,0x7d] 0x01,0x05,0x78,0x7d +# GFX11: v_cmpx_gt_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x78,0x7d] -# GFX11: v_cmpx_gt_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x78,0x7d] 0x7f,0x05,0x78,0x7d +# GFX11: v_cmpx_gt_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x78,0x7d] -# GFX11: v_cmpx_gt_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x78,0x7d] 0x01,0x04,0x78,0x7d +# GFX11: v_cmpx_gt_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x78,0x7d] -# GFX11: v_cmpx_gt_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x78,0x7d] 
0x69,0x04,0x78,0x7d +# GFX11: v_cmpx_gt_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x78,0x7d] -# GFX11: v_cmpx_gt_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x78,0x7d] 0x6a,0x04,0x78,0x7d +# GFX11: v_cmpx_gt_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x78,0x7d] -# GFX11: v_cmpx_gt_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x78,0x7d] 0x6b,0x04,0x78,0x7d +# GFX11: v_cmpx_gt_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x78,0x7d] -# GFX11: v_cmpx_gt_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x78,0x7d] 0x7b,0x04,0x78,0x7d +# GFX11: v_cmpx_gt_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x78,0x7d] -# GFX11: v_cmpx_gt_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x78,0x7d] 0x7d,0x04,0x78,0x7d +# GFX11: v_cmpx_gt_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x78,0x7d] -# GFX11: v_cmpx_gt_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x78,0x7d] 0x7e,0x04,0x78,0x7d +# GFX11: v_cmpx_gt_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x78,0x7d] -# GFX11: v_cmpx_gt_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x78,0x7d] 0x7f,0x04,0x78,0x7d +# GFX11: v_cmpx_gt_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x78,0x7d] -# GFX11: v_cmpx_gt_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x78,0x7d] 0x7c,0x04,0x78,0x7d +# GFX11: v_cmpx_gt_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x78,0x7d] -# GFX11: v_cmpx_gt_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x78,0x7d] 0xc1,0x04,0x78,0x7d +# GFX11: v_cmpx_gt_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x78,0x7d] -# GFX11: v_cmpx_gt_u16_e32 0x3800, v2 0xf0,0x04,0x78,0x7d +# GFX11: v_cmpx_gt_u16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x78,0x7d,0x00,0x38,0x00,0x00] -# GFX11: v_cmpx_gt_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x78,0x7d] 0xfd,0x04,0x78,0x7d +# GFX11: v_cmpx_gt_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x78,0x7d] -# GFX11: v_cmpx_gt_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x78,0x7d,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x78,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_gt_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x78,0x7d,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_gt_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x98,0x7d] 0x01,0x05,0x98,0x7d +# GFX11: v_cmpx_gt_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x98,0x7d] -# GFX11: v_cmpx_gt_u32_e32 v255, v2 ; encoding: [0xff,0x05,0x98,0x7d] 0xff,0x05,0x98,0x7d +# GFX11: v_cmpx_gt_u32_e32 v255, v2 ; encoding: [0xff,0x05,0x98,0x7d] -# GFX11: v_cmpx_gt_u32_e32 s1, v2 ; encoding: [0x01,0x04,0x98,0x7d] 0x01,0x04,0x98,0x7d +# GFX11: v_cmpx_gt_u32_e32 s1, v2 ; encoding: [0x01,0x04,0x98,0x7d] -# GFX11: v_cmpx_gt_u32_e32 s105, v2 ; encoding: [0x69,0x04,0x98,0x7d] 0x69,0x04,0x98,0x7d +# GFX11: v_cmpx_gt_u32_e32 s105, v2 ; encoding: [0x69,0x04,0x98,0x7d] -# GFX11: v_cmpx_gt_u32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x98,0x7d] 0x6a,0x04,0x98,0x7d +# GFX11: v_cmpx_gt_u32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x98,0x7d] -# GFX11: v_cmpx_gt_u32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x98,0x7d] 0x6b,0x04,0x98,0x7d +# GFX11: v_cmpx_gt_u32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x98,0x7d] -# GFX11: v_cmpx_gt_u32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x98,0x7d] 0x7b,0x04,0x98,0x7d +# GFX11: v_cmpx_gt_u32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x98,0x7d] -# GFX11: v_cmpx_gt_u32_e32 m0, v2 ; encoding: [0x7d,0x04,0x98,0x7d] 0x7d,0x04,0x98,0x7d +# GFX11: v_cmpx_gt_u32_e32 m0, v2 ; encoding: [0x7d,0x04,0x98,0x7d] -# GFX11: v_cmpx_gt_u32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x98,0x7d] 0x7e,0x04,0x98,0x7d +# GFX11: v_cmpx_gt_u32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x98,0x7d] -# GFX11: v_cmpx_gt_u32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x98,0x7d] 0x7f,0x04,0x98,0x7d +# GFX11: v_cmpx_gt_u32_e32 exec_hi, v2 ; 
encoding: [0x7f,0x04,0x98,0x7d] -# GFX11: v_cmpx_gt_u32_e32 null, v2 ; encoding: [0x7c,0x04,0x98,0x7d] 0x7c,0x04,0x98,0x7d +# GFX11: v_cmpx_gt_u32_e32 null, v2 ; encoding: [0x7c,0x04,0x98,0x7d] -# GFX11: v_cmpx_gt_u32_e32 -1, v2 ; encoding: [0xc1,0x04,0x98,0x7d] 0xc1,0x04,0x98,0x7d +# GFX11: v_cmpx_gt_u32_e32 -1, v2 ; encoding: [0xc1,0x04,0x98,0x7d] -# GFX11: v_cmpx_gt_u32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x98,0x7d] 0xf0,0x04,0x98,0x7d +# GFX11: v_cmpx_gt_u32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x98,0x7d] -# GFX11: v_cmpx_gt_u32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x98,0x7d] 0xfd,0x04,0x98,0x7d +# GFX11: v_cmpx_gt_u32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x98,0x7d] -# GFX11: v_cmpx_gt_u32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x99,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x99,0x7d,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_gt_u32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x99,0x7d,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_gt_u64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb8,0x7d] 0x01,0x05,0xb8,0x7d +# GFX11: v_cmpx_gt_u64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb8,0x7d] -# GFX11: v_cmpx_gt_u64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb8,0x7d] 0xfe,0x05,0xb8,0x7d +# GFX11: v_cmpx_gt_u64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb8,0x7d] -# GFX11: v_cmpx_gt_u64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb8,0x7d] 0x02,0x04,0xb8,0x7d +# GFX11: v_cmpx_gt_u64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb8,0x7d] -# GFX11: v_cmpx_gt_u64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb8,0x7d] 0x68,0x04,0xb8,0x7d +# GFX11: v_cmpx_gt_u64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb8,0x7d] -# GFX11: v_cmpx_gt_u64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xb8,0x7d] 0x6a,0x04,0xb8,0x7d +# GFX11: v_cmpx_gt_u64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xb8,0x7d] -# GFX11: v_cmpx_gt_u64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb8,0x7d] 0x7a,0x04,0xb8,0x7d +# GFX11: v_cmpx_gt_u64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb8,0x7d] -# GFX11: v_cmpx_gt_u64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xb8,0x7d] 0x7e,0x04,0xb8,0x7d +# GFX11: v_cmpx_gt_u64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xb8,0x7d] -# GFX11: v_cmpx_gt_u64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xb8,0x7d] 0x7c,0x04,0xb8,0x7d +# GFX11: v_cmpx_gt_u64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xb8,0x7d] -# GFX11: v_cmpx_gt_u64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xb8,0x7d] 0xc1,0x04,0xb8,0x7d +# GFX11: v_cmpx_gt_u64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xb8,0x7d] -# GFX11: v_cmpx_gt_u64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb8,0x7d] 0xf0,0x04,0xb8,0x7d +# GFX11: v_cmpx_gt_u64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb8,0x7d] -# GFX11: v_cmpx_gt_u64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb8,0x7d] 0xfd,0x04,0xb8,0x7d +# GFX11: v_cmpx_gt_u64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb8,0x7d] -# GFX11: v_cmpx_gt_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb9,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfc,0xb9,0x7d,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_gt_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb9,0x7d,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_le_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x06,0x7d] 0x01,0x05,0x06,0x7d +# GFX11: v_cmpx_le_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x06,0x7d] -# GFX11: v_cmpx_le_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x06,0x7d] 0x7f,0x05,0x06,0x7d +# GFX11: v_cmpx_le_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x06,0x7d] -# GFX11: v_cmpx_le_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x06,0x7d] 0x01,0x04,0x06,0x7d +# GFX11: v_cmpx_le_f16_e32 s1, v2 ; encoding: 
[0x01,0x04,0x06,0x7d] -# GFX11: v_cmpx_le_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x06,0x7d] 0x69,0x04,0x06,0x7d +# GFX11: v_cmpx_le_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x06,0x7d] -# GFX11: v_cmpx_le_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x06,0x7d] 0x6a,0x04,0x06,0x7d +# GFX11: v_cmpx_le_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x06,0x7d] -# GFX11: v_cmpx_le_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x06,0x7d] 0x6b,0x04,0x06,0x7d +# GFX11: v_cmpx_le_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x06,0x7d] -# GFX11: v_cmpx_le_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x06,0x7d] 0x7b,0x04,0x06,0x7d +# GFX11: v_cmpx_le_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x06,0x7d] -# GFX11: v_cmpx_le_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x06,0x7d] 0x7d,0x04,0x06,0x7d +# GFX11: v_cmpx_le_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x06,0x7d] -# GFX11: v_cmpx_le_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x06,0x7d] 0x7e,0x04,0x06,0x7d +# GFX11: v_cmpx_le_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x06,0x7d] -# GFX11: v_cmpx_le_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x06,0x7d] 0x7f,0x04,0x06,0x7d +# GFX11: v_cmpx_le_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x06,0x7d] -# GFX11: v_cmpx_le_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x06,0x7d] 0x7c,0x04,0x06,0x7d +# GFX11: v_cmpx_le_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x06,0x7d] -# GFX11: v_cmpx_le_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x06,0x7d] 0xc1,0x04,0x06,0x7d +# GFX11: v_cmpx_le_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x06,0x7d] -# GFX11: v_cmpx_le_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x06,0x7d] 0xf0,0x04,0x06,0x7d +# GFX11: v_cmpx_le_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x06,0x7d] -# GFX11: v_cmpx_le_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x06,0x7d] 0xfd,0x04,0x06,0x7d +# GFX11: v_cmpx_le_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x06,0x7d] -# GFX11: v_cmpx_le_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x06,0x7d,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x06,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_le_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x06,0x7d,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_le_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x26,0x7d] 0x01,0x05,0x26,0x7d +# GFX11: v_cmpx_le_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x26,0x7d] -# GFX11: v_cmpx_le_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x26,0x7d] 0xff,0x05,0x26,0x7d +# GFX11: v_cmpx_le_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x26,0x7d] -# GFX11: v_cmpx_le_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x26,0x7d] 0x01,0x04,0x26,0x7d +# GFX11: v_cmpx_le_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x26,0x7d] -# GFX11: v_cmpx_le_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x26,0x7d] 0x69,0x04,0x26,0x7d +# GFX11: v_cmpx_le_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x26,0x7d] -# GFX11: v_cmpx_le_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x26,0x7d] 0x6a,0x04,0x26,0x7d +# GFX11: v_cmpx_le_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x26,0x7d] -# GFX11: v_cmpx_le_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x26,0x7d] 0x6b,0x04,0x26,0x7d +# GFX11: v_cmpx_le_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x26,0x7d] -# GFX11: v_cmpx_le_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x26,0x7d] 0x7b,0x04,0x26,0x7d +# GFX11: v_cmpx_le_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x26,0x7d] -# GFX11: v_cmpx_le_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x26,0x7d] 0x7d,0x04,0x26,0x7d +# GFX11: v_cmpx_le_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x26,0x7d] -# GFX11: v_cmpx_le_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x26,0x7d] 0x7e,0x04,0x26,0x7d +# GFX11: v_cmpx_le_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x26,0x7d] -# GFX11: v_cmpx_le_f32_e32 exec_hi, 
v2 ; encoding: [0x7f,0x04,0x26,0x7d] 0x7f,0x04,0x26,0x7d +# GFX11: v_cmpx_le_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x26,0x7d] -# GFX11: v_cmpx_le_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x26,0x7d] 0x7c,0x04,0x26,0x7d +# GFX11: v_cmpx_le_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x26,0x7d] -# GFX11: v_cmpx_le_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x26,0x7d] 0xc1,0x04,0x26,0x7d +# GFX11: v_cmpx_le_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x26,0x7d] -# GFX11: v_cmpx_le_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x26,0x7d] 0xf0,0x04,0x26,0x7d +# GFX11: v_cmpx_le_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x26,0x7d] -# GFX11: v_cmpx_le_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x26,0x7d] 0xfd,0x04,0x26,0x7d +# GFX11: v_cmpx_le_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x26,0x7d] -# GFX11: v_cmpx_le_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x27,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x27,0x7d,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_le_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x27,0x7d,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_le_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x46,0x7d] 0x01,0x05,0x46,0x7d +# GFX11: v_cmpx_le_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x46,0x7d] -# GFX11: v_cmpx_le_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x46,0x7d] 0xfe,0x05,0x46,0x7d +# GFX11: v_cmpx_le_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x46,0x7d] -# GFX11: v_cmpx_le_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x46,0x7d] 0x02,0x04,0x46,0x7d +# GFX11: v_cmpx_le_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x46,0x7d] -# GFX11: v_cmpx_le_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x46,0x7d] 0x68,0x04,0x46,0x7d +# GFX11: v_cmpx_le_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x46,0x7d] -# GFX11: v_cmpx_le_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x46,0x7d] 0x6a,0x04,0x46,0x7d +# GFX11: v_cmpx_le_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x46,0x7d] -# GFX11: v_cmpx_le_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x46,0x7d] 0x7a,0x04,0x46,0x7d +# GFX11: v_cmpx_le_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x46,0x7d] -# GFX11: v_cmpx_le_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x46,0x7d] 0x7e,0x04,0x46,0x7d +# GFX11: v_cmpx_le_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x46,0x7d] -# GFX11: v_cmpx_le_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x46,0x7d] 0x7c,0x04,0x46,0x7d +# GFX11: v_cmpx_le_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x46,0x7d] -# GFX11: v_cmpx_le_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x46,0x7d] 0xc1,0x04,0x46,0x7d +# GFX11: v_cmpx_le_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x46,0x7d] -# GFX11: v_cmpx_le_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x46,0x7d] 0xf0,0x04,0x46,0x7d +# GFX11: v_cmpx_le_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x46,0x7d] -# GFX11: v_cmpx_le_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x46,0x7d] 0xfd,0x04,0x46,0x7d +# GFX11: v_cmpx_le_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x46,0x7d] -# GFX11: v_cmpx_le_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x47,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfc,0x47,0x7d,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_le_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x47,0x7d,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_le_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x66,0x7d] 0x01,0x05,0x66,0x7d +# GFX11: v_cmpx_le_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x66,0x7d] -# GFX11: v_cmpx_le_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x66,0x7d] 0x7f,0x05,0x66,0x7d +# GFX11: v_cmpx_le_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x66,0x7d] -# GFX11: v_cmpx_le_i16_e32 s1, v2 ; encoding: 
[0x01,0x04,0x66,0x7d] 0x01,0x04,0x66,0x7d +# GFX11: v_cmpx_le_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x66,0x7d] -# GFX11: v_cmpx_le_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x66,0x7d] 0x69,0x04,0x66,0x7d +# GFX11: v_cmpx_le_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x66,0x7d] -# GFX11: v_cmpx_le_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x66,0x7d] 0x6a,0x04,0x66,0x7d +# GFX11: v_cmpx_le_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x66,0x7d] -# GFX11: v_cmpx_le_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x66,0x7d] 0x6b,0x04,0x66,0x7d +# GFX11: v_cmpx_le_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x66,0x7d] -# GFX11: v_cmpx_le_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x66,0x7d] 0x7b,0x04,0x66,0x7d +# GFX11: v_cmpx_le_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x66,0x7d] -# GFX11: v_cmpx_le_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x66,0x7d] 0x7d,0x04,0x66,0x7d +# GFX11: v_cmpx_le_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x66,0x7d] -# GFX11: v_cmpx_le_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x66,0x7d] 0x7e,0x04,0x66,0x7d +# GFX11: v_cmpx_le_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x66,0x7d] -# GFX11: v_cmpx_le_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x66,0x7d] 0x7f,0x04,0x66,0x7d +# GFX11: v_cmpx_le_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x66,0x7d] -# GFX11: v_cmpx_le_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x66,0x7d] 0x7c,0x04,0x66,0x7d +# GFX11: v_cmpx_le_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x66,0x7d] -# GFX11: v_cmpx_le_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x66,0x7d] 0xc1,0x04,0x66,0x7d +# GFX11: v_cmpx_le_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x66,0x7d] -# GFX11: v_cmpx_le_i16_e32 0x3800, v2 0xf0,0x04,0x66,0x7d +# GFX11: v_cmpx_le_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x66,0x7d,0x00,0x38,0x00,0x00] -# GFX11: v_cmpx_le_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x66,0x7d] 0xfd,0x04,0x66,0x7d +# GFX11: v_cmpx_le_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x66,0x7d] -# GFX11: v_cmpx_le_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x66,0x7d,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x66,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_le_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x66,0x7d,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_le_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x86,0x7d] 0x01,0x05,0x86,0x7d +# GFX11: v_cmpx_le_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x86,0x7d] -# GFX11: v_cmpx_le_i32_e32 v255, v2 ; encoding: [0xff,0x05,0x86,0x7d] 0xff,0x05,0x86,0x7d +# GFX11: v_cmpx_le_i32_e32 v255, v2 ; encoding: [0xff,0x05,0x86,0x7d] -# GFX11: v_cmpx_le_i32_e32 s1, v2 ; encoding: [0x01,0x04,0x86,0x7d] 0x01,0x04,0x86,0x7d +# GFX11: v_cmpx_le_i32_e32 s1, v2 ; encoding: [0x01,0x04,0x86,0x7d] -# GFX11: v_cmpx_le_i32_e32 s105, v2 ; encoding: [0x69,0x04,0x86,0x7d] 0x69,0x04,0x86,0x7d +# GFX11: v_cmpx_le_i32_e32 s105, v2 ; encoding: [0x69,0x04,0x86,0x7d] -# GFX11: v_cmpx_le_i32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x86,0x7d] 0x6a,0x04,0x86,0x7d +# GFX11: v_cmpx_le_i32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x86,0x7d] -# GFX11: v_cmpx_le_i32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x86,0x7d] 0x6b,0x04,0x86,0x7d +# GFX11: v_cmpx_le_i32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x86,0x7d] -# GFX11: v_cmpx_le_i32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x86,0x7d] 0x7b,0x04,0x86,0x7d +# GFX11: v_cmpx_le_i32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x86,0x7d] -# GFX11: v_cmpx_le_i32_e32 m0, v2 ; encoding: [0x7d,0x04,0x86,0x7d] 0x7d,0x04,0x86,0x7d +# GFX11: v_cmpx_le_i32_e32 m0, v2 ; encoding: [0x7d,0x04,0x86,0x7d] -# GFX11: v_cmpx_le_i32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x86,0x7d] 0x7e,0x04,0x86,0x7d +# GFX11: v_cmpx_le_i32_e32 
exec_lo, v2 ; encoding: [0x7e,0x04,0x86,0x7d] -# GFX11: v_cmpx_le_i32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x86,0x7d] 0x7f,0x04,0x86,0x7d +# GFX11: v_cmpx_le_i32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x86,0x7d] -# GFX11: v_cmpx_le_i32_e32 null, v2 ; encoding: [0x7c,0x04,0x86,0x7d] 0x7c,0x04,0x86,0x7d +# GFX11: v_cmpx_le_i32_e32 null, v2 ; encoding: [0x7c,0x04,0x86,0x7d] -# GFX11: v_cmpx_le_i32_e32 -1, v2 ; encoding: [0xc1,0x04,0x86,0x7d] 0xc1,0x04,0x86,0x7d +# GFX11: v_cmpx_le_i32_e32 -1, v2 ; encoding: [0xc1,0x04,0x86,0x7d] -# GFX11: v_cmpx_le_i32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x86,0x7d] 0xf0,0x04,0x86,0x7d +# GFX11: v_cmpx_le_i32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x86,0x7d] -# GFX11: v_cmpx_le_i32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x86,0x7d] 0xfd,0x04,0x86,0x7d +# GFX11: v_cmpx_le_i32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x86,0x7d] -# GFX11: v_cmpx_le_i32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x87,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x87,0x7d,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_le_i32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x87,0x7d,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_le_i64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa6,0x7d] 0x01,0x05,0xa6,0x7d +# GFX11: v_cmpx_le_i64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa6,0x7d] -# GFX11: v_cmpx_le_i64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa6,0x7d] 0xfe,0x05,0xa6,0x7d +# GFX11: v_cmpx_le_i64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa6,0x7d] -# GFX11: v_cmpx_le_i64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa6,0x7d] 0x02,0x04,0xa6,0x7d +# GFX11: v_cmpx_le_i64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa6,0x7d] -# GFX11: v_cmpx_le_i64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa6,0x7d] 0x68,0x04,0xa6,0x7d +# GFX11: v_cmpx_le_i64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa6,0x7d] -# GFX11: v_cmpx_le_i64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xa6,0x7d] 0x6a,0x04,0xa6,0x7d +# GFX11: v_cmpx_le_i64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xa6,0x7d] -# GFX11: v_cmpx_le_i64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa6,0x7d] 0x7a,0x04,0xa6,0x7d +# GFX11: v_cmpx_le_i64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa6,0x7d] -# GFX11: v_cmpx_le_i64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xa6,0x7d] 0x7e,0x04,0xa6,0x7d +# GFX11: v_cmpx_le_i64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xa6,0x7d] -# GFX11: v_cmpx_le_i64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xa6,0x7d] 0x7c,0x04,0xa6,0x7d +# GFX11: v_cmpx_le_i64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xa6,0x7d] -# GFX11: v_cmpx_le_i64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xa6,0x7d] 0xc1,0x04,0xa6,0x7d +# GFX11: v_cmpx_le_i64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xa6,0x7d] -# GFX11: v_cmpx_le_i64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa6,0x7d] 0xf0,0x04,0xa6,0x7d +# GFX11: v_cmpx_le_i64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa6,0x7d] -# GFX11: v_cmpx_le_i64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa6,0x7d] 0xfd,0x04,0xa6,0x7d +# GFX11: v_cmpx_le_i64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa6,0x7d] -# GFX11: v_cmpx_le_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa7,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfc,0xa7,0x7d,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_le_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa7,0x7d,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_le_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x76,0x7d] 0x01,0x05,0x76,0x7d +# GFX11: v_cmpx_le_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x76,0x7d] -# GFX11: v_cmpx_le_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x76,0x7d] 0x7f,0x05,0x76,0x7d +# GFX11: v_cmpx_le_u16_e32 v127, v2 ; 
encoding: [0x7f,0x05,0x76,0x7d] -# GFX11: v_cmpx_le_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x76,0x7d] 0x01,0x04,0x76,0x7d +# GFX11: v_cmpx_le_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x76,0x7d] -# GFX11: v_cmpx_le_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x76,0x7d] 0x69,0x04,0x76,0x7d +# GFX11: v_cmpx_le_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x76,0x7d] -# GFX11: v_cmpx_le_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x76,0x7d] 0x6a,0x04,0x76,0x7d +# GFX11: v_cmpx_le_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x76,0x7d] -# GFX11: v_cmpx_le_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x76,0x7d] 0x6b,0x04,0x76,0x7d +# GFX11: v_cmpx_le_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x76,0x7d] -# GFX11: v_cmpx_le_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x76,0x7d] 0x7b,0x04,0x76,0x7d +# GFX11: v_cmpx_le_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x76,0x7d] -# GFX11: v_cmpx_le_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x76,0x7d] 0x7d,0x04,0x76,0x7d +# GFX11: v_cmpx_le_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x76,0x7d] -# GFX11: v_cmpx_le_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x76,0x7d] 0x7e,0x04,0x76,0x7d +# GFX11: v_cmpx_le_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x76,0x7d] -# GFX11: v_cmpx_le_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x76,0x7d] 0x7f,0x04,0x76,0x7d +# GFX11: v_cmpx_le_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x76,0x7d] -# GFX11: v_cmpx_le_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x76,0x7d] 0x7c,0x04,0x76,0x7d +# GFX11: v_cmpx_le_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x76,0x7d] -# GFX11: v_cmpx_le_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x76,0x7d] 0xc1,0x04,0x76,0x7d +# GFX11: v_cmpx_le_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x76,0x7d] -# GFX11: v_cmpx_le_u16_e32 0x3800, v2 0xf0,0x04,0x76,0x7d +# GFX11: v_cmpx_le_u16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x76,0x7d,0x00,0x38,0x00,0x00] -# GFX11: v_cmpx_le_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x76,0x7d] 0xfd,0x04,0x76,0x7d +# GFX11: v_cmpx_le_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x76,0x7d] -# GFX11: v_cmpx_le_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x76,0x7d,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x76,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_le_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x76,0x7d,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_le_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x96,0x7d] 0x01,0x05,0x96,0x7d +# GFX11: v_cmpx_le_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x96,0x7d] -# GFX11: v_cmpx_le_u32_e32 v255, v2 ; encoding: [0xff,0x05,0x96,0x7d] 0xff,0x05,0x96,0x7d +# GFX11: v_cmpx_le_u32_e32 v255, v2 ; encoding: [0xff,0x05,0x96,0x7d] -# GFX11: v_cmpx_le_u32_e32 s1, v2 ; encoding: [0x01,0x04,0x96,0x7d] 0x01,0x04,0x96,0x7d +# GFX11: v_cmpx_le_u32_e32 s1, v2 ; encoding: [0x01,0x04,0x96,0x7d] -# GFX11: v_cmpx_le_u32_e32 s105, v2 ; encoding: [0x69,0x04,0x96,0x7d] 0x69,0x04,0x96,0x7d +# GFX11: v_cmpx_le_u32_e32 s105, v2 ; encoding: [0x69,0x04,0x96,0x7d] -# GFX11: v_cmpx_le_u32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x96,0x7d] 0x6a,0x04,0x96,0x7d +# GFX11: v_cmpx_le_u32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x96,0x7d] -# GFX11: v_cmpx_le_u32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x96,0x7d] 0x6b,0x04,0x96,0x7d +# GFX11: v_cmpx_le_u32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x96,0x7d] -# GFX11: v_cmpx_le_u32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x96,0x7d] 0x7b,0x04,0x96,0x7d +# GFX11: v_cmpx_le_u32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x96,0x7d] -# GFX11: v_cmpx_le_u32_e32 m0, v2 ; encoding: [0x7d,0x04,0x96,0x7d] 0x7d,0x04,0x96,0x7d +# GFX11: v_cmpx_le_u32_e32 m0, v2 ; encoding: [0x7d,0x04,0x96,0x7d] -# GFX11: v_cmpx_le_u32_e32 exec_lo, v2 ; 
encoding: [0x7e,0x04,0x96,0x7d] 0x7e,0x04,0x96,0x7d +# GFX11: v_cmpx_le_u32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x96,0x7d] -# GFX11: v_cmpx_le_u32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x96,0x7d] 0x7f,0x04,0x96,0x7d +# GFX11: v_cmpx_le_u32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x96,0x7d] -# GFX11: v_cmpx_le_u32_e32 null, v2 ; encoding: [0x7c,0x04,0x96,0x7d] 0x7c,0x04,0x96,0x7d +# GFX11: v_cmpx_le_u32_e32 null, v2 ; encoding: [0x7c,0x04,0x96,0x7d] -# GFX11: v_cmpx_le_u32_e32 -1, v2 ; encoding: [0xc1,0x04,0x96,0x7d] 0xc1,0x04,0x96,0x7d +# GFX11: v_cmpx_le_u32_e32 -1, v2 ; encoding: [0xc1,0x04,0x96,0x7d] -# GFX11: v_cmpx_le_u32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x96,0x7d] 0xf0,0x04,0x96,0x7d +# GFX11: v_cmpx_le_u32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x96,0x7d] -# GFX11: v_cmpx_le_u32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x96,0x7d] 0xfd,0x04,0x96,0x7d +# GFX11: v_cmpx_le_u32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x96,0x7d] -# GFX11: v_cmpx_le_u32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x97,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x97,0x7d,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_le_u32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x97,0x7d,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_le_u64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb6,0x7d] 0x01,0x05,0xb6,0x7d +# GFX11: v_cmpx_le_u64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb6,0x7d] -# GFX11: v_cmpx_le_u64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb6,0x7d] 0xfe,0x05,0xb6,0x7d +# GFX11: v_cmpx_le_u64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb6,0x7d] -# GFX11: v_cmpx_le_u64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb6,0x7d] 0x02,0x04,0xb6,0x7d +# GFX11: v_cmpx_le_u64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb6,0x7d] -# GFX11: v_cmpx_le_u64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb6,0x7d] 0x68,0x04,0xb6,0x7d +# GFX11: v_cmpx_le_u64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb6,0x7d] -# GFX11: v_cmpx_le_u64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xb6,0x7d] 0x6a,0x04,0xb6,0x7d +# GFX11: v_cmpx_le_u64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xb6,0x7d] -# GFX11: v_cmpx_le_u64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb6,0x7d] 0x7a,0x04,0xb6,0x7d +# GFX11: v_cmpx_le_u64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb6,0x7d] -# GFX11: v_cmpx_le_u64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xb6,0x7d] 0x7e,0x04,0xb6,0x7d +# GFX11: v_cmpx_le_u64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xb6,0x7d] -# GFX11: v_cmpx_le_u64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xb6,0x7d] 0x7c,0x04,0xb6,0x7d +# GFX11: v_cmpx_le_u64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xb6,0x7d] -# GFX11: v_cmpx_le_u64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xb6,0x7d] 0xc1,0x04,0xb6,0x7d +# GFX11: v_cmpx_le_u64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xb6,0x7d] -# GFX11: v_cmpx_le_u64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb6,0x7d] 0xf0,0x04,0xb6,0x7d +# GFX11: v_cmpx_le_u64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb6,0x7d] -# GFX11: v_cmpx_le_u64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb6,0x7d] 0xfd,0x04,0xb6,0x7d +# GFX11: v_cmpx_le_u64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb6,0x7d] -# GFX11: v_cmpx_le_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb7,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfc,0xb7,0x7d,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_le_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb7,0x7d,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_lg_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0a,0x7d] 0x01,0x05,0x0a,0x7d +# GFX11: v_cmpx_lg_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0a,0x7d] -# GFX11: v_cmpx_lg_f16_e32 v127, v2 ; encoding: 
[0x7f,0x05,0x0a,0x7d] 0x7f,0x05,0x0a,0x7d +# GFX11: v_cmpx_lg_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0a,0x7d] -# GFX11: v_cmpx_lg_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0a,0x7d] 0x01,0x04,0x0a,0x7d +# GFX11: v_cmpx_lg_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0a,0x7d] -# GFX11: v_cmpx_lg_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0a,0x7d] 0x69,0x04,0x0a,0x7d +# GFX11: v_cmpx_lg_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0a,0x7d] -# GFX11: v_cmpx_lg_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x7d] 0x6a,0x04,0x0a,0x7d +# GFX11: v_cmpx_lg_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x7d] -# GFX11: v_cmpx_lg_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x7d] 0x6b,0x04,0x0a,0x7d +# GFX11: v_cmpx_lg_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x7d] -# GFX11: v_cmpx_lg_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x7d] 0x7b,0x04,0x0a,0x7d +# GFX11: v_cmpx_lg_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x7d] -# GFX11: v_cmpx_lg_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0a,0x7d] 0x7d,0x04,0x0a,0x7d +# GFX11: v_cmpx_lg_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0a,0x7d] -# GFX11: v_cmpx_lg_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x7d] 0x7e,0x04,0x0a,0x7d +# GFX11: v_cmpx_lg_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x7d] -# GFX11: v_cmpx_lg_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x7d] 0x7f,0x04,0x0a,0x7d +# GFX11: v_cmpx_lg_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x7d] -# GFX11: v_cmpx_lg_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0a,0x7d] 0x7c,0x04,0x0a,0x7d +# GFX11: v_cmpx_lg_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0a,0x7d] -# GFX11: v_cmpx_lg_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0a,0x7d] 0xc1,0x04,0x0a,0x7d +# GFX11: v_cmpx_lg_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0a,0x7d] -# GFX11: v_cmpx_lg_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x7d] 0xf0,0x04,0x0a,0x7d +# GFX11: v_cmpx_lg_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x7d] -# GFX11: v_cmpx_lg_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x7d] 0xfd,0x04,0x0a,0x7d +# GFX11: v_cmpx_lg_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x7d] -# GFX11: v_cmpx_lg_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0a,0x7d,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x0a,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_lg_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0a,0x7d,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_lg_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2a,0x7d] 0x01,0x05,0x2a,0x7d +# GFX11: v_cmpx_lg_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2a,0x7d] -# GFX11: v_cmpx_lg_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x2a,0x7d] 0xff,0x05,0x2a,0x7d +# GFX11: v_cmpx_lg_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x2a,0x7d] -# GFX11: v_cmpx_lg_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x2a,0x7d] 0x01,0x04,0x2a,0x7d +# GFX11: v_cmpx_lg_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x2a,0x7d] -# GFX11: v_cmpx_lg_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x2a,0x7d] 0x69,0x04,0x2a,0x7d +# GFX11: v_cmpx_lg_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x2a,0x7d] -# GFX11: v_cmpx_lg_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x2a,0x7d] 0x6a,0x04,0x2a,0x7d +# GFX11: v_cmpx_lg_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x2a,0x7d] -# GFX11: v_cmpx_lg_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x2a,0x7d] 0x6b,0x04,0x2a,0x7d +# GFX11: v_cmpx_lg_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x2a,0x7d] -# GFX11: v_cmpx_lg_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x2a,0x7d] 0x7b,0x04,0x2a,0x7d +# GFX11: v_cmpx_lg_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x2a,0x7d] -# GFX11: v_cmpx_lg_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x2a,0x7d] 0x7d,0x04,0x2a,0x7d +# GFX11: 
v_cmpx_lg_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x2a,0x7d] -# GFX11: v_cmpx_lg_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x2a,0x7d] 0x7e,0x04,0x2a,0x7d +# GFX11: v_cmpx_lg_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x2a,0x7d] -# GFX11: v_cmpx_lg_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x2a,0x7d] 0x7f,0x04,0x2a,0x7d +# GFX11: v_cmpx_lg_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x2a,0x7d] -# GFX11: v_cmpx_lg_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x2a,0x7d] 0x7c,0x04,0x2a,0x7d +# GFX11: v_cmpx_lg_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x2a,0x7d] -# GFX11: v_cmpx_lg_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x2a,0x7d] 0xc1,0x04,0x2a,0x7d +# GFX11: v_cmpx_lg_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x2a,0x7d] -# GFX11: v_cmpx_lg_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x2a,0x7d] 0xf0,0x04,0x2a,0x7d +# GFX11: v_cmpx_lg_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x2a,0x7d] -# GFX11: v_cmpx_lg_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x2a,0x7d] 0xfd,0x04,0x2a,0x7d +# GFX11: v_cmpx_lg_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x2a,0x7d] -# GFX11: v_cmpx_lg_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2b,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x2b,0x7d,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_lg_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2b,0x7d,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_lg_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4a,0x7d] 0x01,0x05,0x4a,0x7d +# GFX11: v_cmpx_lg_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4a,0x7d] -# GFX11: v_cmpx_lg_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4a,0x7d] 0xfe,0x05,0x4a,0x7d +# GFX11: v_cmpx_lg_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4a,0x7d] -# GFX11: v_cmpx_lg_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x4a,0x7d] 0x02,0x04,0x4a,0x7d +# GFX11: v_cmpx_lg_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x4a,0x7d] -# GFX11: v_cmpx_lg_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4a,0x7d] 0x68,0x04,0x4a,0x7d +# GFX11: v_cmpx_lg_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4a,0x7d] -# GFX11: v_cmpx_lg_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x4a,0x7d] 0x6a,0x04,0x4a,0x7d +# GFX11: v_cmpx_lg_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x4a,0x7d] -# GFX11: v_cmpx_lg_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x4a,0x7d] 0x7a,0x04,0x4a,0x7d +# GFX11: v_cmpx_lg_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x4a,0x7d] -# GFX11: v_cmpx_lg_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x4a,0x7d] 0x7e,0x04,0x4a,0x7d +# GFX11: v_cmpx_lg_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x4a,0x7d] -# GFX11: v_cmpx_lg_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x4a,0x7d] 0x7c,0x04,0x4a,0x7d +# GFX11: v_cmpx_lg_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x4a,0x7d] -# GFX11: v_cmpx_lg_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x4a,0x7d] 0xc1,0x04,0x4a,0x7d +# GFX11: v_cmpx_lg_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x4a,0x7d] -# GFX11: v_cmpx_lg_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4a,0x7d] 0xf0,0x04,0x4a,0x7d +# GFX11: v_cmpx_lg_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4a,0x7d] -# GFX11: v_cmpx_lg_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4a,0x7d] 0xfd,0x04,0x4a,0x7d +# GFX11: v_cmpx_lg_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4a,0x7d] -# GFX11: v_cmpx_lg_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4b,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfc,0x4b,0x7d,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_lg_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4b,0x7d,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_lt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x02,0x7d] 0x01,0x05,0x02,0x7d +# GFX11: 
v_cmpx_lt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x02,0x7d] -# GFX11: v_cmpx_lt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x02,0x7d] 0x7f,0x05,0x02,0x7d +# GFX11: v_cmpx_lt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x02,0x7d] -# GFX11: v_cmpx_lt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x02,0x7d] 0x01,0x04,0x02,0x7d +# GFX11: v_cmpx_lt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x02,0x7d] -# GFX11: v_cmpx_lt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x02,0x7d] 0x69,0x04,0x02,0x7d +# GFX11: v_cmpx_lt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x02,0x7d] -# GFX11: v_cmpx_lt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x02,0x7d] 0x6a,0x04,0x02,0x7d +# GFX11: v_cmpx_lt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x02,0x7d] -# GFX11: v_cmpx_lt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x02,0x7d] 0x6b,0x04,0x02,0x7d +# GFX11: v_cmpx_lt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x02,0x7d] -# GFX11: v_cmpx_lt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x02,0x7d] 0x7b,0x04,0x02,0x7d +# GFX11: v_cmpx_lt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x02,0x7d] -# GFX11: v_cmpx_lt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x02,0x7d] 0x7d,0x04,0x02,0x7d +# GFX11: v_cmpx_lt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x02,0x7d] -# GFX11: v_cmpx_lt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x02,0x7d] 0x7e,0x04,0x02,0x7d +# GFX11: v_cmpx_lt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x02,0x7d] -# GFX11: v_cmpx_lt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x02,0x7d] 0x7f,0x04,0x02,0x7d +# GFX11: v_cmpx_lt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x02,0x7d] -# GFX11: v_cmpx_lt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x02,0x7d] 0x7c,0x04,0x02,0x7d +# GFX11: v_cmpx_lt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x02,0x7d] -# GFX11: v_cmpx_lt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x02,0x7d] 0xc1,0x04,0x02,0x7d +# GFX11: v_cmpx_lt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x02,0x7d] -# GFX11: v_cmpx_lt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x02,0x7d] 0xf0,0x04,0x02,0x7d +# GFX11: v_cmpx_lt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x02,0x7d] -# GFX11: v_cmpx_lt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x02,0x7d] 0xfd,0x04,0x02,0x7d +# GFX11: v_cmpx_lt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x02,0x7d] -# GFX11: v_cmpx_lt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x02,0x7d,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x02,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_lt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x02,0x7d,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_lt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x22,0x7d] 0x01,0x05,0x22,0x7d +# GFX11: v_cmpx_lt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x22,0x7d] -# GFX11: v_cmpx_lt_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x22,0x7d] 0xff,0x05,0x22,0x7d +# GFX11: v_cmpx_lt_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x22,0x7d] -# GFX11: v_cmpx_lt_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x22,0x7d] 0x01,0x04,0x22,0x7d +# GFX11: v_cmpx_lt_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x22,0x7d] -# GFX11: v_cmpx_lt_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x22,0x7d] 0x69,0x04,0x22,0x7d +# GFX11: v_cmpx_lt_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x22,0x7d] -# GFX11: v_cmpx_lt_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x22,0x7d] 0x6a,0x04,0x22,0x7d +# GFX11: v_cmpx_lt_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x22,0x7d] -# GFX11: v_cmpx_lt_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x22,0x7d] 0x6b,0x04,0x22,0x7d +# GFX11: v_cmpx_lt_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x22,0x7d] -# GFX11: v_cmpx_lt_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x22,0x7d] 0x7b,0x04,0x22,0x7d +# GFX11: v_cmpx_lt_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x22,0x7d] -# 
GFX11: v_cmpx_lt_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x22,0x7d] 0x7d,0x04,0x22,0x7d +# GFX11: v_cmpx_lt_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x22,0x7d] -# GFX11: v_cmpx_lt_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x22,0x7d] 0x7e,0x04,0x22,0x7d +# GFX11: v_cmpx_lt_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x22,0x7d] -# GFX11: v_cmpx_lt_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x22,0x7d] 0x7f,0x04,0x22,0x7d +# GFX11: v_cmpx_lt_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x22,0x7d] -# GFX11: v_cmpx_lt_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x22,0x7d] 0x7c,0x04,0x22,0x7d +# GFX11: v_cmpx_lt_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x22,0x7d] -# GFX11: v_cmpx_lt_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x22,0x7d] 0xc1,0x04,0x22,0x7d +# GFX11: v_cmpx_lt_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x22,0x7d] -# GFX11: v_cmpx_lt_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x22,0x7d] 0xf0,0x04,0x22,0x7d +# GFX11: v_cmpx_lt_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x22,0x7d] -# GFX11: v_cmpx_lt_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x22,0x7d] 0xfd,0x04,0x22,0x7d +# GFX11: v_cmpx_lt_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x22,0x7d] -# GFX11: v_cmpx_lt_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x23,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x23,0x7d,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_lt_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x23,0x7d,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_lt_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x42,0x7d] 0x01,0x05,0x42,0x7d +# GFX11: v_cmpx_lt_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x42,0x7d] -# GFX11: v_cmpx_lt_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x42,0x7d] 0xfe,0x05,0x42,0x7d +# GFX11: v_cmpx_lt_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x42,0x7d] -# GFX11: v_cmpx_lt_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x42,0x7d] 0x02,0x04,0x42,0x7d +# GFX11: v_cmpx_lt_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x42,0x7d] -# GFX11: v_cmpx_lt_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x42,0x7d] 0x68,0x04,0x42,0x7d +# GFX11: v_cmpx_lt_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x42,0x7d] -# GFX11: v_cmpx_lt_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x42,0x7d] 0x6a,0x04,0x42,0x7d +# GFX11: v_cmpx_lt_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x42,0x7d] -# GFX11: v_cmpx_lt_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x42,0x7d] 0x7a,0x04,0x42,0x7d +# GFX11: v_cmpx_lt_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x42,0x7d] -# GFX11: v_cmpx_lt_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x42,0x7d] 0x7e,0x04,0x42,0x7d +# GFX11: v_cmpx_lt_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x42,0x7d] -# GFX11: v_cmpx_lt_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x42,0x7d] 0x7c,0x04,0x42,0x7d +# GFX11: v_cmpx_lt_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x42,0x7d] -# GFX11: v_cmpx_lt_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x42,0x7d] 0xc1,0x04,0x42,0x7d +# GFX11: v_cmpx_lt_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x42,0x7d] -# GFX11: v_cmpx_lt_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x42,0x7d] 0xf0,0x04,0x42,0x7d +# GFX11: v_cmpx_lt_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x42,0x7d] -# GFX11: v_cmpx_lt_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x42,0x7d] 0xfd,0x04,0x42,0x7d +# GFX11: v_cmpx_lt_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x42,0x7d] -# GFX11: v_cmpx_lt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x43,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfc,0x43,0x7d,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_lt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x43,0x7d,0x56,0x34,0x12,0xaf] -# GFX11: 
v_cmpx_lt_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x62,0x7d] 0x01,0x05,0x62,0x7d +# GFX11: v_cmpx_lt_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x62,0x7d] -# GFX11: v_cmpx_lt_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x62,0x7d] 0x7f,0x05,0x62,0x7d +# GFX11: v_cmpx_lt_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x62,0x7d] -# GFX11: v_cmpx_lt_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x62,0x7d] 0x01,0x04,0x62,0x7d +# GFX11: v_cmpx_lt_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x62,0x7d] -# GFX11: v_cmpx_lt_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x62,0x7d] 0x69,0x04,0x62,0x7d +# GFX11: v_cmpx_lt_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x62,0x7d] -# GFX11: v_cmpx_lt_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x62,0x7d] 0x6a,0x04,0x62,0x7d +# GFX11: v_cmpx_lt_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x62,0x7d] -# GFX11: v_cmpx_lt_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x62,0x7d] 0x6b,0x04,0x62,0x7d +# GFX11: v_cmpx_lt_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x62,0x7d] -# GFX11: v_cmpx_lt_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x62,0x7d] 0x7b,0x04,0x62,0x7d +# GFX11: v_cmpx_lt_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x62,0x7d] -# GFX11: v_cmpx_lt_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x62,0x7d] 0x7d,0x04,0x62,0x7d +# GFX11: v_cmpx_lt_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x62,0x7d] -# GFX11: v_cmpx_lt_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x62,0x7d] 0x7e,0x04,0x62,0x7d +# GFX11: v_cmpx_lt_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x62,0x7d] -# GFX11: v_cmpx_lt_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x62,0x7d] 0x7f,0x04,0x62,0x7d +# GFX11: v_cmpx_lt_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x62,0x7d] -# GFX11: v_cmpx_lt_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x62,0x7d] 0x7c,0x04,0x62,0x7d +# GFX11: v_cmpx_lt_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x62,0x7d] -# GFX11: v_cmpx_lt_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x62,0x7d] 0xc1,0x04,0x62,0x7d +# GFX11: v_cmpx_lt_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x62,0x7d] -# GFX11: v_cmpx_lt_i16_e32 0x3800, v2 0xf0,0x04,0x62,0x7d +# GFX11: v_cmpx_lt_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x62,0x7d,0x00,0x38,0x00,0x00] -# GFX11: v_cmpx_lt_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x62,0x7d] 0xfd,0x04,0x62,0x7d +# GFX11: v_cmpx_lt_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x62,0x7d] -# GFX11: v_cmpx_lt_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x62,0x7d,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x62,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_lt_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x62,0x7d,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_lt_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x82,0x7d] 0x01,0x05,0x82,0x7d +# GFX11: v_cmpx_lt_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x82,0x7d] -# GFX11: v_cmpx_lt_i32_e32 v255, v2 ; encoding: [0xff,0x05,0x82,0x7d] 0xff,0x05,0x82,0x7d +# GFX11: v_cmpx_lt_i32_e32 v255, v2 ; encoding: [0xff,0x05,0x82,0x7d] -# GFX11: v_cmpx_lt_i32_e32 s1, v2 ; encoding: [0x01,0x04,0x82,0x7d] 0x01,0x04,0x82,0x7d +# GFX11: v_cmpx_lt_i32_e32 s1, v2 ; encoding: [0x01,0x04,0x82,0x7d] -# GFX11: v_cmpx_lt_i32_e32 s105, v2 ; encoding: [0x69,0x04,0x82,0x7d] 0x69,0x04,0x82,0x7d +# GFX11: v_cmpx_lt_i32_e32 s105, v2 ; encoding: [0x69,0x04,0x82,0x7d] -# GFX11: v_cmpx_lt_i32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x82,0x7d] 0x6a,0x04,0x82,0x7d +# GFX11: v_cmpx_lt_i32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x82,0x7d] -# GFX11: v_cmpx_lt_i32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x82,0x7d] 0x6b,0x04,0x82,0x7d +# GFX11: v_cmpx_lt_i32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x82,0x7d] -# GFX11: v_cmpx_lt_i32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x82,0x7d] 
0x7b,0x04,0x82,0x7d +# GFX11: v_cmpx_lt_i32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x82,0x7d] -# GFX11: v_cmpx_lt_i32_e32 m0, v2 ; encoding: [0x7d,0x04,0x82,0x7d] 0x7d,0x04,0x82,0x7d +# GFX11: v_cmpx_lt_i32_e32 m0, v2 ; encoding: [0x7d,0x04,0x82,0x7d] -# GFX11: v_cmpx_lt_i32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x82,0x7d] 0x7e,0x04,0x82,0x7d +# GFX11: v_cmpx_lt_i32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x82,0x7d] -# GFX11: v_cmpx_lt_i32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x82,0x7d] 0x7f,0x04,0x82,0x7d +# GFX11: v_cmpx_lt_i32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x82,0x7d] -# GFX11: v_cmpx_lt_i32_e32 null, v2 ; encoding: [0x7c,0x04,0x82,0x7d] 0x7c,0x04,0x82,0x7d +# GFX11: v_cmpx_lt_i32_e32 null, v2 ; encoding: [0x7c,0x04,0x82,0x7d] -# GFX11: v_cmpx_lt_i32_e32 -1, v2 ; encoding: [0xc1,0x04,0x82,0x7d] 0xc1,0x04,0x82,0x7d +# GFX11: v_cmpx_lt_i32_e32 -1, v2 ; encoding: [0xc1,0x04,0x82,0x7d] -# GFX11: v_cmpx_lt_i32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x82,0x7d] 0xf0,0x04,0x82,0x7d +# GFX11: v_cmpx_lt_i32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x82,0x7d] -# GFX11: v_cmpx_lt_i32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x82,0x7d] 0xfd,0x04,0x82,0x7d +# GFX11: v_cmpx_lt_i32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x82,0x7d] -# GFX11: v_cmpx_lt_i32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x83,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x83,0x7d,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_lt_i32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x83,0x7d,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_lt_i64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa2,0x7d] 0x01,0x05,0xa2,0x7d +# GFX11: v_cmpx_lt_i64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa2,0x7d] -# GFX11: v_cmpx_lt_i64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa2,0x7d] 0xfe,0x05,0xa2,0x7d +# GFX11: v_cmpx_lt_i64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa2,0x7d] -# GFX11: v_cmpx_lt_i64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa2,0x7d] 0x02,0x04,0xa2,0x7d +# GFX11: v_cmpx_lt_i64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa2,0x7d] -# GFX11: v_cmpx_lt_i64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa2,0x7d] 0x68,0x04,0xa2,0x7d +# GFX11: v_cmpx_lt_i64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa2,0x7d] -# GFX11: v_cmpx_lt_i64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xa2,0x7d] 0x6a,0x04,0xa2,0x7d +# GFX11: v_cmpx_lt_i64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xa2,0x7d] -# GFX11: v_cmpx_lt_i64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa2,0x7d] 0x7a,0x04,0xa2,0x7d +# GFX11: v_cmpx_lt_i64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa2,0x7d] -# GFX11: v_cmpx_lt_i64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xa2,0x7d] 0x7e,0x04,0xa2,0x7d +# GFX11: v_cmpx_lt_i64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xa2,0x7d] -# GFX11: v_cmpx_lt_i64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xa2,0x7d] 0x7c,0x04,0xa2,0x7d +# GFX11: v_cmpx_lt_i64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xa2,0x7d] -# GFX11: v_cmpx_lt_i64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xa2,0x7d] 0xc1,0x04,0xa2,0x7d +# GFX11: v_cmpx_lt_i64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xa2,0x7d] -# GFX11: v_cmpx_lt_i64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa2,0x7d] 0xf0,0x04,0xa2,0x7d +# GFX11: v_cmpx_lt_i64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa2,0x7d] -# GFX11: v_cmpx_lt_i64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa2,0x7d] 0xfd,0x04,0xa2,0x7d +# GFX11: v_cmpx_lt_i64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa2,0x7d] -# GFX11: v_cmpx_lt_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa3,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfc,0xa3,0x7d,0x56,0x34,0x12,0xaf +# GFX11: 
v_cmpx_lt_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa3,0x7d,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_lt_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x72,0x7d] 0x01,0x05,0x72,0x7d +# GFX11: v_cmpx_lt_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x72,0x7d] -# GFX11: v_cmpx_lt_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x72,0x7d] 0x7f,0x05,0x72,0x7d +# GFX11: v_cmpx_lt_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x72,0x7d] -# GFX11: v_cmpx_lt_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x72,0x7d] 0x01,0x04,0x72,0x7d +# GFX11: v_cmpx_lt_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x72,0x7d] -# GFX11: v_cmpx_lt_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x72,0x7d] 0x69,0x04,0x72,0x7d +# GFX11: v_cmpx_lt_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x72,0x7d] -# GFX11: v_cmpx_lt_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x72,0x7d] 0x6a,0x04,0x72,0x7d +# GFX11: v_cmpx_lt_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x72,0x7d] -# GFX11: v_cmpx_lt_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x72,0x7d] 0x6b,0x04,0x72,0x7d +# GFX11: v_cmpx_lt_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x72,0x7d] -# GFX11: v_cmpx_lt_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x72,0x7d] 0x7b,0x04,0x72,0x7d +# GFX11: v_cmpx_lt_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x72,0x7d] -# GFX11: v_cmpx_lt_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x72,0x7d] 0x7d,0x04,0x72,0x7d +# GFX11: v_cmpx_lt_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x72,0x7d] -# GFX11: v_cmpx_lt_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x72,0x7d] 0x7e,0x04,0x72,0x7d +# GFX11: v_cmpx_lt_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x72,0x7d] -# GFX11: v_cmpx_lt_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x72,0x7d] 0x7f,0x04,0x72,0x7d +# GFX11: v_cmpx_lt_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x72,0x7d] -# GFX11: v_cmpx_lt_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x72,0x7d] 0x7c,0x04,0x72,0x7d +# GFX11: v_cmpx_lt_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x72,0x7d] -# GFX11: v_cmpx_lt_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x72,0x7d] 0xc1,0x04,0x72,0x7d +# GFX11: v_cmpx_lt_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x72,0x7d] -# GFX11: v_cmpx_lt_u16_e32 0x3800, v2 0xf0,0x04,0x72,0x7d +# GFX11: v_cmpx_lt_u16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x72,0x7d,0x00,0x38,0x00,0x00] -# GFX11: v_cmpx_lt_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x72,0x7d] 0xfd,0x04,0x72,0x7d +# GFX11: v_cmpx_lt_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x72,0x7d] -# GFX11: v_cmpx_lt_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x72,0x7d,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x72,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_lt_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x72,0x7d,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_lt_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x92,0x7d] 0x01,0x05,0x92,0x7d +# GFX11: v_cmpx_lt_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x92,0x7d] -# GFX11: v_cmpx_lt_u32_e32 v255, v2 ; encoding: [0xff,0x05,0x92,0x7d] 0xff,0x05,0x92,0x7d +# GFX11: v_cmpx_lt_u32_e32 v255, v2 ; encoding: [0xff,0x05,0x92,0x7d] -# GFX11: v_cmpx_lt_u32_e32 s1, v2 ; encoding: [0x01,0x04,0x92,0x7d] 0x01,0x04,0x92,0x7d +# GFX11: v_cmpx_lt_u32_e32 s1, v2 ; encoding: [0x01,0x04,0x92,0x7d] -# GFX11: v_cmpx_lt_u32_e32 s105, v2 ; encoding: [0x69,0x04,0x92,0x7d] 0x69,0x04,0x92,0x7d +# GFX11: v_cmpx_lt_u32_e32 s105, v2 ; encoding: [0x69,0x04,0x92,0x7d] -# GFX11: v_cmpx_lt_u32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x92,0x7d] 0x6a,0x04,0x92,0x7d +# GFX11: v_cmpx_lt_u32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x92,0x7d] -# GFX11: v_cmpx_lt_u32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x92,0x7d] 0x6b,0x04,0x92,0x7d +# GFX11: v_cmpx_lt_u32_e32 vcc_hi, v2 ; encoding: 
[0x6b,0x04,0x92,0x7d] -# GFX11: v_cmpx_lt_u32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x92,0x7d] 0x7b,0x04,0x92,0x7d +# GFX11: v_cmpx_lt_u32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x92,0x7d] -# GFX11: v_cmpx_lt_u32_e32 m0, v2 ; encoding: [0x7d,0x04,0x92,0x7d] 0x7d,0x04,0x92,0x7d +# GFX11: v_cmpx_lt_u32_e32 m0, v2 ; encoding: [0x7d,0x04,0x92,0x7d] -# GFX11: v_cmpx_lt_u32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x92,0x7d] 0x7e,0x04,0x92,0x7d +# GFX11: v_cmpx_lt_u32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x92,0x7d] -# GFX11: v_cmpx_lt_u32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x92,0x7d] 0x7f,0x04,0x92,0x7d +# GFX11: v_cmpx_lt_u32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x92,0x7d] -# GFX11: v_cmpx_lt_u32_e32 null, v2 ; encoding: [0x7c,0x04,0x92,0x7d] 0x7c,0x04,0x92,0x7d +# GFX11: v_cmpx_lt_u32_e32 null, v2 ; encoding: [0x7c,0x04,0x92,0x7d] -# GFX11: v_cmpx_lt_u32_e32 -1, v2 ; encoding: [0xc1,0x04,0x92,0x7d] 0xc1,0x04,0x92,0x7d +# GFX11: v_cmpx_lt_u32_e32 -1, v2 ; encoding: [0xc1,0x04,0x92,0x7d] -# GFX11: v_cmpx_lt_u32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x92,0x7d] 0xf0,0x04,0x92,0x7d +# GFX11: v_cmpx_lt_u32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x92,0x7d] -# GFX11: v_cmpx_lt_u32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x92,0x7d] 0xfd,0x04,0x92,0x7d +# GFX11: v_cmpx_lt_u32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x92,0x7d] -# GFX11: v_cmpx_lt_u32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x93,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x93,0x7d,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_lt_u32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x93,0x7d,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_lt_u64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb2,0x7d] 0x01,0x05,0xb2,0x7d +# GFX11: v_cmpx_lt_u64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb2,0x7d] -# GFX11: v_cmpx_lt_u64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb2,0x7d] 0xfe,0x05,0xb2,0x7d +# GFX11: v_cmpx_lt_u64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb2,0x7d] -# GFX11: v_cmpx_lt_u64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb2,0x7d] 0x02,0x04,0xb2,0x7d +# GFX11: v_cmpx_lt_u64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb2,0x7d] -# GFX11: v_cmpx_lt_u64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb2,0x7d] 0x68,0x04,0xb2,0x7d +# GFX11: v_cmpx_lt_u64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb2,0x7d] -# GFX11: v_cmpx_lt_u64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xb2,0x7d] 0x6a,0x04,0xb2,0x7d +# GFX11: v_cmpx_lt_u64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xb2,0x7d] -# GFX11: v_cmpx_lt_u64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb2,0x7d] 0x7a,0x04,0xb2,0x7d +# GFX11: v_cmpx_lt_u64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb2,0x7d] -# GFX11: v_cmpx_lt_u64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xb2,0x7d] 0x7e,0x04,0xb2,0x7d +# GFX11: v_cmpx_lt_u64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xb2,0x7d] -# GFX11: v_cmpx_lt_u64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xb2,0x7d] 0x7c,0x04,0xb2,0x7d +# GFX11: v_cmpx_lt_u64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xb2,0x7d] -# GFX11: v_cmpx_lt_u64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xb2,0x7d] 0xc1,0x04,0xb2,0x7d +# GFX11: v_cmpx_lt_u64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xb2,0x7d] -# GFX11: v_cmpx_lt_u64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb2,0x7d] 0xf0,0x04,0xb2,0x7d +# GFX11: v_cmpx_lt_u64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb2,0x7d] -# GFX11: v_cmpx_lt_u64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb2,0x7d] 0xfd,0x04,0xb2,0x7d +# GFX11: v_cmpx_lt_u64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb2,0x7d] -# GFX11: v_cmpx_lt_u64_e32 0xaf123456, v[254:255] ; encoding: 
[0xff,0xfc,0xb3,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfc,0xb3,0x7d,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_lt_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb3,0x7d,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_ne_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x6a,0x7d] 0x01,0x05,0x6a,0x7d +# GFX11: v_cmpx_ne_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x6a,0x7d] -# GFX11: v_cmpx_ne_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x6a,0x7d] 0x7f,0x05,0x6a,0x7d +# GFX11: v_cmpx_ne_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x6a,0x7d] -# GFX11: v_cmpx_ne_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x6a,0x7d] 0x01,0x04,0x6a,0x7d +# GFX11: v_cmpx_ne_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x6a,0x7d] -# GFX11: v_cmpx_ne_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x6a,0x7d] 0x69,0x04,0x6a,0x7d +# GFX11: v_cmpx_ne_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x6a,0x7d] -# GFX11: v_cmpx_ne_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x6a,0x7d] 0x6a,0x04,0x6a,0x7d +# GFX11: v_cmpx_ne_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x6a,0x7d] -# GFX11: v_cmpx_ne_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x6a,0x7d] 0x6b,0x04,0x6a,0x7d +# GFX11: v_cmpx_ne_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x6a,0x7d] -# GFX11: v_cmpx_ne_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x6a,0x7d] 0x7b,0x04,0x6a,0x7d +# GFX11: v_cmpx_ne_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x6a,0x7d] -# GFX11: v_cmpx_ne_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x6a,0x7d] 0x7d,0x04,0x6a,0x7d +# GFX11: v_cmpx_ne_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x6a,0x7d] -# GFX11: v_cmpx_ne_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x6a,0x7d] 0x7e,0x04,0x6a,0x7d +# GFX11: v_cmpx_ne_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x6a,0x7d] -# GFX11: v_cmpx_ne_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x6a,0x7d] 0x7f,0x04,0x6a,0x7d +# GFX11: v_cmpx_ne_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x6a,0x7d] -# GFX11: v_cmpx_ne_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x6a,0x7d] 0x7c,0x04,0x6a,0x7d +# GFX11: v_cmpx_ne_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x6a,0x7d] -# GFX11: v_cmpx_ne_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x6a,0x7d] 0xc1,0x04,0x6a,0x7d +# GFX11: v_cmpx_ne_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x6a,0x7d] -# GFX11: v_cmpx_ne_i16_e32 0x3800, v2 0xf0,0x04,0x6a,0x7d +# GFX11: v_cmpx_ne_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x6a,0x7d,0x00,0x38,0x00,0x00] -# GFX11: v_cmpx_ne_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x6a,0x7d] 0xfd,0x04,0x6a,0x7d +# GFX11: v_cmpx_ne_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x6a,0x7d] -# GFX11: v_cmpx_ne_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6a,0x7d,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x6a,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_ne_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6a,0x7d,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_ne_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x8a,0x7d] 0x01,0x05,0x8a,0x7d +# GFX11: v_cmpx_ne_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x8a,0x7d] -# GFX11: v_cmpx_ne_i32_e32 v255, v2 ; encoding: [0xff,0x05,0x8a,0x7d] 0xff,0x05,0x8a,0x7d +# GFX11: v_cmpx_ne_i32_e32 v255, v2 ; encoding: [0xff,0x05,0x8a,0x7d] -# GFX11: v_cmpx_ne_i32_e32 s1, v2 ; encoding: [0x01,0x04,0x8a,0x7d] 0x01,0x04,0x8a,0x7d +# GFX11: v_cmpx_ne_i32_e32 s1, v2 ; encoding: [0x01,0x04,0x8a,0x7d] -# GFX11: v_cmpx_ne_i32_e32 s105, v2 ; encoding: [0x69,0x04,0x8a,0x7d] 0x69,0x04,0x8a,0x7d +# GFX11: v_cmpx_ne_i32_e32 s105, v2 ; encoding: [0x69,0x04,0x8a,0x7d] -# GFX11: v_cmpx_ne_i32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x8a,0x7d] 0x6a,0x04,0x8a,0x7d +# GFX11: v_cmpx_ne_i32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x8a,0x7d] -# GFX11: v_cmpx_ne_i32_e32 vcc_hi, v2 ; encoding: 
[0x6b,0x04,0x8a,0x7d] 0x6b,0x04,0x8a,0x7d +# GFX11: v_cmpx_ne_i32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x8a,0x7d] -# GFX11: v_cmpx_ne_i32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x8a,0x7d] 0x7b,0x04,0x8a,0x7d +# GFX11: v_cmpx_ne_i32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x8a,0x7d] -# GFX11: v_cmpx_ne_i32_e32 m0, v2 ; encoding: [0x7d,0x04,0x8a,0x7d] 0x7d,0x04,0x8a,0x7d +# GFX11: v_cmpx_ne_i32_e32 m0, v2 ; encoding: [0x7d,0x04,0x8a,0x7d] -# GFX11: v_cmpx_ne_i32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x8a,0x7d] 0x7e,0x04,0x8a,0x7d +# GFX11: v_cmpx_ne_i32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x8a,0x7d] -# GFX11: v_cmpx_ne_i32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x8a,0x7d] 0x7f,0x04,0x8a,0x7d +# GFX11: v_cmpx_ne_i32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x8a,0x7d] -# GFX11: v_cmpx_ne_i32_e32 null, v2 ; encoding: [0x7c,0x04,0x8a,0x7d] 0x7c,0x04,0x8a,0x7d +# GFX11: v_cmpx_ne_i32_e32 null, v2 ; encoding: [0x7c,0x04,0x8a,0x7d] -# GFX11: v_cmpx_ne_i32_e32 -1, v2 ; encoding: [0xc1,0x04,0x8a,0x7d] 0xc1,0x04,0x8a,0x7d +# GFX11: v_cmpx_ne_i32_e32 -1, v2 ; encoding: [0xc1,0x04,0x8a,0x7d] -# GFX11: v_cmpx_ne_i32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x8a,0x7d] 0xf0,0x04,0x8a,0x7d +# GFX11: v_cmpx_ne_i32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x8a,0x7d] -# GFX11: v_cmpx_ne_i32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x8a,0x7d] 0xfd,0x04,0x8a,0x7d +# GFX11: v_cmpx_ne_i32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x8a,0x7d] -# GFX11: v_cmpx_ne_i32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x8b,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x8b,0x7d,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_ne_i32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x8b,0x7d,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_ne_i64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xaa,0x7d] 0x01,0x05,0xaa,0x7d +# GFX11: v_cmpx_ne_i64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xaa,0x7d] -# GFX11: v_cmpx_ne_i64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xaa,0x7d] 0xfe,0x05,0xaa,0x7d +# GFX11: v_cmpx_ne_i64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xaa,0x7d] -# GFX11: v_cmpx_ne_i64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xaa,0x7d] 0x02,0x04,0xaa,0x7d +# GFX11: v_cmpx_ne_i64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xaa,0x7d] -# GFX11: v_cmpx_ne_i64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xaa,0x7d] 0x68,0x04,0xaa,0x7d +# GFX11: v_cmpx_ne_i64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xaa,0x7d] -# GFX11: v_cmpx_ne_i64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xaa,0x7d] 0x6a,0x04,0xaa,0x7d +# GFX11: v_cmpx_ne_i64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xaa,0x7d] -# GFX11: v_cmpx_ne_i64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xaa,0x7d] 0x7a,0x04,0xaa,0x7d +# GFX11: v_cmpx_ne_i64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xaa,0x7d] -# GFX11: v_cmpx_ne_i64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xaa,0x7d] 0x7e,0x04,0xaa,0x7d +# GFX11: v_cmpx_ne_i64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xaa,0x7d] -# GFX11: v_cmpx_ne_i64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xaa,0x7d] 0x7c,0x04,0xaa,0x7d +# GFX11: v_cmpx_ne_i64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xaa,0x7d] -# GFX11: v_cmpx_ne_i64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xaa,0x7d] 0xc1,0x04,0xaa,0x7d +# GFX11: v_cmpx_ne_i64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xaa,0x7d] -# GFX11: v_cmpx_ne_i64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xaa,0x7d] 0xf0,0x04,0xaa,0x7d +# GFX11: v_cmpx_ne_i64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xaa,0x7d] -# GFX11: v_cmpx_ne_i64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xaa,0x7d] 0xfd,0x04,0xaa,0x7d +# GFX11: v_cmpx_ne_i64_e32 src_scc, v[2:3] ; encoding: 
[0xfd,0x04,0xaa,0x7d] -# GFX11: v_cmpx_ne_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xab,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfc,0xab,0x7d,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_ne_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xab,0x7d,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_ne_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x7a,0x7d] 0x01,0x05,0x7a,0x7d +# GFX11: v_cmpx_ne_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x7a,0x7d] -# GFX11: v_cmpx_ne_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x7a,0x7d] 0x7f,0x05,0x7a,0x7d +# GFX11: v_cmpx_ne_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x7a,0x7d] -# GFX11: v_cmpx_ne_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x7a,0x7d] 0x01,0x04,0x7a,0x7d +# GFX11: v_cmpx_ne_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x7a,0x7d] -# GFX11: v_cmpx_ne_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x7a,0x7d] 0x69,0x04,0x7a,0x7d +# GFX11: v_cmpx_ne_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x7a,0x7d] -# GFX11: v_cmpx_ne_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x7a,0x7d] 0x6a,0x04,0x7a,0x7d +# GFX11: v_cmpx_ne_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x7a,0x7d] -# GFX11: v_cmpx_ne_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x7a,0x7d] 0x6b,0x04,0x7a,0x7d +# GFX11: v_cmpx_ne_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x7a,0x7d] -# GFX11: v_cmpx_ne_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x7a,0x7d] 0x7b,0x04,0x7a,0x7d +# GFX11: v_cmpx_ne_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x7a,0x7d] -# GFX11: v_cmpx_ne_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x7a,0x7d] 0x7d,0x04,0x7a,0x7d +# GFX11: v_cmpx_ne_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x7a,0x7d] -# GFX11: v_cmpx_ne_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x7a,0x7d] 0x7e,0x04,0x7a,0x7d +# GFX11: v_cmpx_ne_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x7a,0x7d] -# GFX11: v_cmpx_ne_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x7a,0x7d] 0x7f,0x04,0x7a,0x7d +# GFX11: v_cmpx_ne_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x7a,0x7d] -# GFX11: v_cmpx_ne_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x7a,0x7d] 0x7c,0x04,0x7a,0x7d +# GFX11: v_cmpx_ne_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x7a,0x7d] -# GFX11: v_cmpx_ne_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x7a,0x7d] 0xc1,0x04,0x7a,0x7d +# GFX11: v_cmpx_ne_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x7a,0x7d] -# GFX11: v_cmpx_ne_u16_e32 0x3800, v2 0xf0,0x04,0x7a,0x7d +# GFX11: v_cmpx_ne_u16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x7a,0x7d,0x00,0x38,0x00,0x00] -# GFX11: v_cmpx_ne_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x7a,0x7d] 0xfd,0x04,0x7a,0x7d +# GFX11: v_cmpx_ne_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x7a,0x7d] -# GFX11: v_cmpx_ne_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x7a,0x7d,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x7a,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11: v_cmpx_ne_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x7a,0x7d,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_ne_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x9a,0x7d] 0x01,0x05,0x9a,0x7d +# GFX11: v_cmpx_ne_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x9a,0x7d] -# GFX11: v_cmpx_ne_u32_e32 v255, v2 ; encoding: [0xff,0x05,0x9a,0x7d] 0xff,0x05,0x9a,0x7d +# GFX11: v_cmpx_ne_u32_e32 v255, v2 ; encoding: [0xff,0x05,0x9a,0x7d] -# GFX11: v_cmpx_ne_u32_e32 s1, v2 ; encoding: [0x01,0x04,0x9a,0x7d] 0x01,0x04,0x9a,0x7d +# GFX11: v_cmpx_ne_u32_e32 s1, v2 ; encoding: [0x01,0x04,0x9a,0x7d] -# GFX11: v_cmpx_ne_u32_e32 s105, v2 ; encoding: [0x69,0x04,0x9a,0x7d] 0x69,0x04,0x9a,0x7d +# GFX11: v_cmpx_ne_u32_e32 s105, v2 ; encoding: [0x69,0x04,0x9a,0x7d] -# GFX11: v_cmpx_ne_u32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x9a,0x7d] 0x6a,0x04,0x9a,0x7d +# GFX11: v_cmpx_ne_u32_e32 vcc_lo, v2 ; 
encoding: [0x6a,0x04,0x9a,0x7d] -# GFX11: v_cmpx_ne_u32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x9a,0x7d] 0x6b,0x04,0x9a,0x7d +# GFX11: v_cmpx_ne_u32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x9a,0x7d] -# GFX11: v_cmpx_ne_u32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x9a,0x7d] 0x7b,0x04,0x9a,0x7d +# GFX11: v_cmpx_ne_u32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x9a,0x7d] -# GFX11: v_cmpx_ne_u32_e32 m0, v2 ; encoding: [0x7d,0x04,0x9a,0x7d] 0x7d,0x04,0x9a,0x7d +# GFX11: v_cmpx_ne_u32_e32 m0, v2 ; encoding: [0x7d,0x04,0x9a,0x7d] -# GFX11: v_cmpx_ne_u32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x9a,0x7d] 0x7e,0x04,0x9a,0x7d +# GFX11: v_cmpx_ne_u32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x9a,0x7d] -# GFX11: v_cmpx_ne_u32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x9a,0x7d] 0x7f,0x04,0x9a,0x7d +# GFX11: v_cmpx_ne_u32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x9a,0x7d] -# GFX11: v_cmpx_ne_u32_e32 null, v2 ; encoding: [0x7c,0x04,0x9a,0x7d] 0x7c,0x04,0x9a,0x7d +# GFX11: v_cmpx_ne_u32_e32 null, v2 ; encoding: [0x7c,0x04,0x9a,0x7d] -# GFX11: v_cmpx_ne_u32_e32 -1, v2 ; encoding: [0xc1,0x04,0x9a,0x7d] 0xc1,0x04,0x9a,0x7d +# GFX11: v_cmpx_ne_u32_e32 -1, v2 ; encoding: [0xc1,0x04,0x9a,0x7d] -# GFX11: v_cmpx_ne_u32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x9a,0x7d] 0xf0,0x04,0x9a,0x7d +# GFX11: v_cmpx_ne_u32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x9a,0x7d] -# GFX11: v_cmpx_ne_u32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x9a,0x7d] 0xfd,0x04,0x9a,0x7d +# GFX11: v_cmpx_ne_u32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x9a,0x7d] -# GFX11: v_cmpx_ne_u32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x9b,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x9b,0x7d,0x56,0x34,0x12,0xaf +# GFX11: v_cmpx_ne_u32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x9b,0x7d,0x56,0x34,0x12,0xaf] -# GFX11: v_cmpx_ne_u64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xba,0x7d] 0x01,0x05,0xba,0x7d +# GFX11: v_cmpx_ne_u64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xba,0x7d] -# GFX11: v_cmpx_ne_u64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xba,0x7d] 0xfe,0x05,0xba,0x7d +# GFX11: v_cmpx_ne_u64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xba,0x7d] -# GFX11: v_cmpx_ne_u64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xba,0x7d] 0x02,0x04,0xba,0x7d +# GFX11: v_cmpx_ne_u64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xba,0x7d] -# GFX11: v_cmpx_ne_u64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xba,0x7d] 0x68,0x04,0xba,0x7d +# GFX11: v_cmpx_ne_u64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xba,0x7d] -# GFX11: v_cmpx_ne_u64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xba,0x7d] 0x6a,0x04,0xba,0x7d +# GFX11: v_cmpx_ne_u64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xba,0x7d] -# GFX11: v_cmpx_ne_u64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xba,0x7d] 0x7a,0x04,0xba,0x7d +# GFX11: v_cmpx_ne_u64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xba,0x7d] -# GFX11: v_cmpx_ne_u64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xba,0x7d] 0x7e,0x04,0xba,0x7d +# GFX11: v_cmpx_ne_u64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xba,0x7d] -# GFX11: v_cmpx_ne_u64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xba,0x7d] 0x7c,0x04,0xba,0x7d +# GFX11: v_cmpx_ne_u64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xba,0x7d] -# GFX11: v_cmpx_ne_u64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xba,0x7d] 0xc1,0x04,0xba,0x7d +# GFX11: v_cmpx_ne_u64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xba,0x7d] -# GFX11: v_cmpx_ne_u64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xba,0x7d] 0xf0,0x04,0xba,0x7d +# GFX11: v_cmpx_ne_u64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xba,0x7d] -# GFX11: v_cmpx_ne_u64_e32 src_scc, v[2:3] ; encoding: 
[0xfd,0x04,0xba,0x7d]
 0xfd,0x04,0xba,0x7d
+# GFX11: v_cmpx_ne_u64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xba,0x7d]
-# GFX11: v_cmpx_ne_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbb,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfc,0xbb,0x7d,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_ne_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbb,0x7d,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_neq_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1a,0x7d]
 0x01,0x05,0x1a,0x7d
+# GFX11: v_cmpx_neq_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1a,0x7d]
-# GFX11: v_cmpx_neq_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1a,0x7d]
 0x7f,0x05,0x1a,0x7d
+# GFX11: v_cmpx_neq_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1a,0x7d]
-# GFX11: v_cmpx_neq_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1a,0x7d]
 0x01,0x04,0x1a,0x7d
+# GFX11: v_cmpx_neq_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1a,0x7d]
-# GFX11: v_cmpx_neq_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1a,0x7d]
 0x69,0x04,0x1a,0x7d
+# GFX11: v_cmpx_neq_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1a,0x7d]
-# GFX11: v_cmpx_neq_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1a,0x7d]
 0x6a,0x04,0x1a,0x7d
+# GFX11: v_cmpx_neq_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1a,0x7d]
-# GFX11: v_cmpx_neq_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1a,0x7d]
 0x6b,0x04,0x1a,0x7d
+# GFX11: v_cmpx_neq_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1a,0x7d]
-# GFX11: v_cmpx_neq_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1a,0x7d]
 0x7b,0x04,0x1a,0x7d
+# GFX11: v_cmpx_neq_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1a,0x7d]
-# GFX11: v_cmpx_neq_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1a,0x7d]
 0x7d,0x04,0x1a,0x7d
+# GFX11: v_cmpx_neq_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1a,0x7d]
-# GFX11: v_cmpx_neq_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1a,0x7d]
 0x7e,0x04,0x1a,0x7d
+# GFX11: v_cmpx_neq_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1a,0x7d]
-# GFX11: v_cmpx_neq_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1a,0x7d]
 0x7f,0x04,0x1a,0x7d
+# GFX11: v_cmpx_neq_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1a,0x7d]
-# GFX11: v_cmpx_neq_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1a,0x7d]
 0x7c,0x04,0x1a,0x7d
+# GFX11: v_cmpx_neq_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1a,0x7d]
-# GFX11: v_cmpx_neq_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1a,0x7d]
 0xc1,0x04,0x1a,0x7d
+# GFX11: v_cmpx_neq_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1a,0x7d]
-# GFX11: v_cmpx_neq_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1a,0x7d]
 0xf0,0x04,0x1a,0x7d
+# GFX11: v_cmpx_neq_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1a,0x7d]
-# GFX11: v_cmpx_neq_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1a,0x7d]
 0xfd,0x04,0x1a,0x7d
+# GFX11: v_cmpx_neq_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1a,0x7d]
-# GFX11: v_cmpx_neq_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1a,0x7d,0x0b,0xfe,0x00,0x00]
 0xff,0xfe,0x1a,0x7d,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_neq_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1a,0x7d,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_neq_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x3a,0x7d]
 0x01,0x05,0x3a,0x7d
+# GFX11: v_cmpx_neq_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x3a,0x7d]
-# GFX11: v_cmpx_neq_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x3a,0x7d]
 0xff,0x05,0x3a,0x7d
+# GFX11: v_cmpx_neq_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x3a,0x7d]
-# GFX11: v_cmpx_neq_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x3a,0x7d]
 0x01,0x04,0x3a,0x7d
+# GFX11: v_cmpx_neq_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x3a,0x7d]
-# GFX11: v_cmpx_neq_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x3a,0x7d]
 0x69,0x04,0x3a,0x7d
+# GFX11: v_cmpx_neq_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x3a,0x7d]
-# GFX11: v_cmpx_neq_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x3a,0x7d]
 0x6a,0x04,0x3a,0x7d
+# GFX11: v_cmpx_neq_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x3a,0x7d]
-# GFX11: v_cmpx_neq_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x3a,0x7d]
 0x6b,0x04,0x3a,0x7d
+# GFX11: v_cmpx_neq_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x3a,0x7d]
-# GFX11: v_cmpx_neq_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x3a,0x7d]
 0x7b,0x04,0x3a,0x7d
+# GFX11: v_cmpx_neq_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x3a,0x7d]
-# GFX11: v_cmpx_neq_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x3a,0x7d]
 0x7d,0x04,0x3a,0x7d
+# GFX11: v_cmpx_neq_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x3a,0x7d]
-# GFX11: v_cmpx_neq_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x3a,0x7d]
 0x7e,0x04,0x3a,0x7d
+# GFX11: v_cmpx_neq_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x3a,0x7d]
-# GFX11: v_cmpx_neq_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x3a,0x7d]
 0x7f,0x04,0x3a,0x7d
+# GFX11: v_cmpx_neq_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x3a,0x7d]
-# GFX11: v_cmpx_neq_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x3a,0x7d]
 0x7c,0x04,0x3a,0x7d
+# GFX11: v_cmpx_neq_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x3a,0x7d]
-# GFX11: v_cmpx_neq_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x3a,0x7d]
 0xc1,0x04,0x3a,0x7d
+# GFX11: v_cmpx_neq_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x3a,0x7d]
-# GFX11: v_cmpx_neq_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x3a,0x7d]
 0xf0,0x04,0x3a,0x7d
+# GFX11: v_cmpx_neq_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x3a,0x7d]
-# GFX11: v_cmpx_neq_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x3a,0x7d]
 0xfd,0x04,0x3a,0x7d
+# GFX11: v_cmpx_neq_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x3a,0x7d]
-# GFX11: v_cmpx_neq_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x3b,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfe,0x3b,0x7d,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_neq_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x3b,0x7d,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_neq_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x5a,0x7d]
 0x01,0x05,0x5a,0x7d
+# GFX11: v_cmpx_neq_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x5a,0x7d]
-# GFX11: v_cmpx_neq_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x5a,0x7d]
 0xfe,0x05,0x5a,0x7d
+# GFX11: v_cmpx_neq_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x5a,0x7d]
-# GFX11: v_cmpx_neq_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x5a,0x7d]
 0x02,0x04,0x5a,0x7d
+# GFX11: v_cmpx_neq_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x5a,0x7d]
-# GFX11: v_cmpx_neq_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x5a,0x7d]
 0x68,0x04,0x5a,0x7d
+# GFX11: v_cmpx_neq_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x5a,0x7d]
-# GFX11: v_cmpx_neq_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x5a,0x7d]
 0x6a,0x04,0x5a,0x7d
+# GFX11: v_cmpx_neq_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x5a,0x7d]
-# GFX11: v_cmpx_neq_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x5a,0x7d]
 0x7a,0x04,0x5a,0x7d
+# GFX11: v_cmpx_neq_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x5a,0x7d]
-# GFX11: v_cmpx_neq_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x5a,0x7d]
 0x7e,0x04,0x5a,0x7d
+# GFX11: v_cmpx_neq_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x5a,0x7d]
-# GFX11: v_cmpx_neq_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x5a,0x7d]
 0x7c,0x04,0x5a,0x7d
+# GFX11: v_cmpx_neq_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x5a,0x7d]
-# GFX11: v_cmpx_neq_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x5a,0x7d]
 0xc1,0x04,0x5a,0x7d
+# GFX11: v_cmpx_neq_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x5a,0x7d]
-# GFX11: v_cmpx_neq_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x5a,0x7d]
 0xf0,0x04,0x5a,0x7d
+# GFX11: v_cmpx_neq_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x5a,0x7d]
-# GFX11: v_cmpx_neq_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x5a,0x7d]
 0xfd,0x04,0x5a,0x7d
+# GFX11: v_cmpx_neq_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x5a,0x7d]
-# GFX11: v_cmpx_neq_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5b,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfc,0x5b,0x7d,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_neq_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5b,0x7d,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_nge_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x12,0x7d]
 0x01,0x05,0x12,0x7d
+# GFX11: v_cmpx_nge_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x12,0x7d]
-# GFX11: v_cmpx_nge_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x12,0x7d]
 0x7f,0x05,0x12,0x7d
+# GFX11: v_cmpx_nge_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x12,0x7d]
-# GFX11: v_cmpx_nge_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x12,0x7d]
 0x01,0x04,0x12,0x7d
+# GFX11: v_cmpx_nge_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x12,0x7d]
-# GFX11: v_cmpx_nge_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x12,0x7d]
 0x69,0x04,0x12,0x7d
+# GFX11: v_cmpx_nge_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x12,0x7d]
-# GFX11: v_cmpx_nge_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x12,0x7d]
 0x6a,0x04,0x12,0x7d
+# GFX11: v_cmpx_nge_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x12,0x7d]
-# GFX11: v_cmpx_nge_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x12,0x7d]
 0x6b,0x04,0x12,0x7d
+# GFX11: v_cmpx_nge_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x12,0x7d]
-# GFX11: v_cmpx_nge_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x12,0x7d]
 0x7b,0x04,0x12,0x7d
+# GFX11: v_cmpx_nge_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x12,0x7d]
-# GFX11: v_cmpx_nge_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x12,0x7d]
 0x7d,0x04,0x12,0x7d
+# GFX11: v_cmpx_nge_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x12,0x7d]
-# GFX11: v_cmpx_nge_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x12,0x7d]
 0x7e,0x04,0x12,0x7d
+# GFX11: v_cmpx_nge_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x12,0x7d]
-# GFX11: v_cmpx_nge_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x12,0x7d]
 0x7f,0x04,0x12,0x7d
+# GFX11: v_cmpx_nge_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x12,0x7d]
-# GFX11: v_cmpx_nge_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x12,0x7d]
 0x7c,0x04,0x12,0x7d
+# GFX11: v_cmpx_nge_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x12,0x7d]
-# GFX11: v_cmpx_nge_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x12,0x7d]
 0xc1,0x04,0x12,0x7d
+# GFX11: v_cmpx_nge_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x12,0x7d]
-# GFX11: v_cmpx_nge_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x12,0x7d]
 0xf0,0x04,0x12,0x7d
+# GFX11: v_cmpx_nge_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x12,0x7d]
-# GFX11: v_cmpx_nge_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x12,0x7d]
 0xfd,0x04,0x12,0x7d
+# GFX11: v_cmpx_nge_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x12,0x7d]
-# GFX11: v_cmpx_nge_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x12,0x7d,0x0b,0xfe,0x00,0x00]
 0xff,0xfe,0x12,0x7d,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_nge_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x12,0x7d,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_nge_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x32,0x7d]
 0x01,0x05,0x32,0x7d
+# GFX11: v_cmpx_nge_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x32,0x7d]
-# GFX11: v_cmpx_nge_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x32,0x7d]
 0xff,0x05,0x32,0x7d
+# GFX11: v_cmpx_nge_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x32,0x7d]
-# GFX11: v_cmpx_nge_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x32,0x7d]
 0x01,0x04,0x32,0x7d
+# GFX11: v_cmpx_nge_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x32,0x7d]
-# GFX11: v_cmpx_nge_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x32,0x7d]
 0x69,0x04,0x32,0x7d
+# GFX11: v_cmpx_nge_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x32,0x7d]
-# GFX11: v_cmpx_nge_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x32,0x7d]
 0x6a,0x04,0x32,0x7d
+# GFX11: v_cmpx_nge_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x32,0x7d]
-# GFX11: v_cmpx_nge_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x32,0x7d]
 0x6b,0x04,0x32,0x7d
+# GFX11: v_cmpx_nge_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x32,0x7d]
-# GFX11: v_cmpx_nge_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x32,0x7d]
 0x7b,0x04,0x32,0x7d
+# GFX11: v_cmpx_nge_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x32,0x7d]
-# GFX11: v_cmpx_nge_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x32,0x7d]
 0x7d,0x04,0x32,0x7d
+# GFX11: v_cmpx_nge_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x32,0x7d]
-# GFX11: v_cmpx_nge_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x32,0x7d]
 0x7e,0x04,0x32,0x7d
+# GFX11: v_cmpx_nge_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x32,0x7d]
-# GFX11: v_cmpx_nge_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x32,0x7d]
 0x7f,0x04,0x32,0x7d
+# GFX11: v_cmpx_nge_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x32,0x7d]
-# GFX11: v_cmpx_nge_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x32,0x7d]
 0x7c,0x04,0x32,0x7d
+# GFX11: v_cmpx_nge_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x32,0x7d]
-# GFX11: v_cmpx_nge_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x32,0x7d]
 0xc1,0x04,0x32,0x7d
+# GFX11: v_cmpx_nge_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x32,0x7d]
-# GFX11: v_cmpx_nge_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x32,0x7d]
 0xf0,0x04,0x32,0x7d
+# GFX11: v_cmpx_nge_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x32,0x7d]
-# GFX11: v_cmpx_nge_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x32,0x7d]
 0xfd,0x04,0x32,0x7d
+# GFX11: v_cmpx_nge_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x32,0x7d]
-# GFX11: v_cmpx_nge_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x33,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfe,0x33,0x7d,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_nge_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x33,0x7d,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_nge_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x52,0x7d]
 0x01,0x05,0x52,0x7d
+# GFX11: v_cmpx_nge_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x52,0x7d]
-# GFX11: v_cmpx_nge_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x52,0x7d]
 0xfe,0x05,0x52,0x7d
+# GFX11: v_cmpx_nge_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x52,0x7d]
-# GFX11: v_cmpx_nge_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x52,0x7d]
 0x02,0x04,0x52,0x7d
+# GFX11: v_cmpx_nge_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x52,0x7d]
-# GFX11: v_cmpx_nge_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x52,0x7d]
 0x68,0x04,0x52,0x7d
+# GFX11: v_cmpx_nge_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x52,0x7d]
-# GFX11: v_cmpx_nge_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x52,0x7d]
 0x6a,0x04,0x52,0x7d
+# GFX11: v_cmpx_nge_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x52,0x7d]
-# GFX11: v_cmpx_nge_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x52,0x7d]
 0x7a,0x04,0x52,0x7d
+# GFX11: v_cmpx_nge_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x52,0x7d]
-# GFX11: v_cmpx_nge_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x52,0x7d]
 0x7e,0x04,0x52,0x7d
+# GFX11: v_cmpx_nge_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x52,0x7d]
-# GFX11: v_cmpx_nge_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x52,0x7d]
 0x7c,0x04,0x52,0x7d
+# GFX11: v_cmpx_nge_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x52,0x7d]
-# GFX11: v_cmpx_nge_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x52,0x7d]
 0xc1,0x04,0x52,0x7d
+# GFX11: v_cmpx_nge_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x52,0x7d]
-# GFX11: v_cmpx_nge_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x52,0x7d]
 0xf0,0x04,0x52,0x7d
+# GFX11: v_cmpx_nge_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x52,0x7d]
-# GFX11: v_cmpx_nge_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x52,0x7d]
 0xfd,0x04,0x52,0x7d
+# GFX11: v_cmpx_nge_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x52,0x7d]
-# GFX11: v_cmpx_nge_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x53,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfc,0x53,0x7d,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_nge_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x53,0x7d,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_ngt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x16,0x7d]
 0x01,0x05,0x16,0x7d
+# GFX11: v_cmpx_ngt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x16,0x7d]
-# GFX11: v_cmpx_ngt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x16,0x7d]
 0x7f,0x05,0x16,0x7d
+# GFX11: v_cmpx_ngt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x16,0x7d]
-# GFX11: v_cmpx_ngt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x16,0x7d]
 0x01,0x04,0x16,0x7d
+# GFX11: v_cmpx_ngt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x16,0x7d]
-# GFX11: v_cmpx_ngt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x16,0x7d]
 0x69,0x04,0x16,0x7d
+# GFX11: v_cmpx_ngt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x16,0x7d]
-# GFX11: v_cmpx_ngt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x16,0x7d]
 0x6a,0x04,0x16,0x7d
+# GFX11: v_cmpx_ngt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x16,0x7d]
-# GFX11: v_cmpx_ngt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x16,0x7d]
 0x6b,0x04,0x16,0x7d
+# GFX11: v_cmpx_ngt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x16,0x7d]
-# GFX11: v_cmpx_ngt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x16,0x7d]
 0x7b,0x04,0x16,0x7d
+# GFX11: v_cmpx_ngt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x16,0x7d]
-# GFX11: v_cmpx_ngt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x16,0x7d]
 0x7d,0x04,0x16,0x7d
+# GFX11: v_cmpx_ngt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x16,0x7d]
-# GFX11: v_cmpx_ngt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x16,0x7d]
 0x7e,0x04,0x16,0x7d
+# GFX11: v_cmpx_ngt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x16,0x7d]
-# GFX11: v_cmpx_ngt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x16,0x7d]
 0x7f,0x04,0x16,0x7d
+# GFX11: v_cmpx_ngt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x16,0x7d]
-# GFX11: v_cmpx_ngt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x16,0x7d]
 0x7c,0x04,0x16,0x7d
+# GFX11: v_cmpx_ngt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x16,0x7d]
-# GFX11: v_cmpx_ngt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x16,0x7d]
 0xc1,0x04,0x16,0x7d
+# GFX11: v_cmpx_ngt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x16,0x7d]
-# GFX11: v_cmpx_ngt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x16,0x7d]
 0xf0,0x04,0x16,0x7d
+# GFX11: v_cmpx_ngt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x16,0x7d]
-# GFX11: v_cmpx_ngt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x16,0x7d]
 0xfd,0x04,0x16,0x7d
+# GFX11: v_cmpx_ngt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x16,0x7d]
-# GFX11: v_cmpx_ngt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x16,0x7d,0x0b,0xfe,0x00,0x00]
 0xff,0xfe,0x16,0x7d,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_ngt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x16,0x7d,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_ngt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x36,0x7d]
 0x01,0x05,0x36,0x7d
+# GFX11: v_cmpx_ngt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x36,0x7d]
-# GFX11: v_cmpx_ngt_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x36,0x7d]
 0xff,0x05,0x36,0x7d
+# GFX11: v_cmpx_ngt_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x36,0x7d]
-# GFX11: v_cmpx_ngt_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x36,0x7d]
 0x01,0x04,0x36,0x7d
+# GFX11: v_cmpx_ngt_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x36,0x7d]
-# GFX11: v_cmpx_ngt_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x36,0x7d]
 0x69,0x04,0x36,0x7d
+# GFX11: v_cmpx_ngt_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x36,0x7d]
-# GFX11: v_cmpx_ngt_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x36,0x7d]
 0x6a,0x04,0x36,0x7d
+# GFX11: v_cmpx_ngt_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x36,0x7d]
-# GFX11: v_cmpx_ngt_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x36,0x7d]
 0x6b,0x04,0x36,0x7d
+# GFX11: v_cmpx_ngt_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x36,0x7d]
-# GFX11: v_cmpx_ngt_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x36,0x7d]
 0x7b,0x04,0x36,0x7d
+# GFX11: v_cmpx_ngt_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x36,0x7d]
-# GFX11: v_cmpx_ngt_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x36,0x7d]
 0x7d,0x04,0x36,0x7d
+# GFX11: v_cmpx_ngt_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x36,0x7d]
-# GFX11: v_cmpx_ngt_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x36,0x7d]
 0x7e,0x04,0x36,0x7d
+# GFX11: v_cmpx_ngt_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x36,0x7d]
-# GFX11: v_cmpx_ngt_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x36,0x7d]
 0x7f,0x04,0x36,0x7d
+# GFX11: v_cmpx_ngt_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x36,0x7d]
-# GFX11: v_cmpx_ngt_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x36,0x7d]
 0x7c,0x04,0x36,0x7d
+# GFX11: v_cmpx_ngt_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x36,0x7d]
-# GFX11: v_cmpx_ngt_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x36,0x7d]
 0xc1,0x04,0x36,0x7d
+# GFX11: v_cmpx_ngt_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x36,0x7d]
-# GFX11: v_cmpx_ngt_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x36,0x7d]
 0xf0,0x04,0x36,0x7d
+# GFX11: v_cmpx_ngt_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x36,0x7d]
-# GFX11: v_cmpx_ngt_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x36,0x7d]
 0xfd,0x04,0x36,0x7d
+# GFX11: v_cmpx_ngt_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x36,0x7d]
-# GFX11: v_cmpx_ngt_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x37,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfe,0x37,0x7d,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_ngt_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x37,0x7d,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_ngt_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x56,0x7d]
 0x01,0x05,0x56,0x7d
+# GFX11: v_cmpx_ngt_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x56,0x7d]
-# GFX11: v_cmpx_ngt_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x56,0x7d]
 0xfe,0x05,0x56,0x7d
+# GFX11: v_cmpx_ngt_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x56,0x7d]
-# GFX11: v_cmpx_ngt_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x56,0x7d]
 0x02,0x04,0x56,0x7d
+# GFX11: v_cmpx_ngt_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x56,0x7d]
-# GFX11: v_cmpx_ngt_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x56,0x7d]
 0x68,0x04,0x56,0x7d
+# GFX11: v_cmpx_ngt_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x56,0x7d]
-# GFX11: v_cmpx_ngt_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x56,0x7d]
 0x6a,0x04,0x56,0x7d
+# GFX11: v_cmpx_ngt_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x56,0x7d]
-# GFX11: v_cmpx_ngt_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x56,0x7d]
 0x7a,0x04,0x56,0x7d
+# GFX11: v_cmpx_ngt_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x56,0x7d]
-# GFX11: v_cmpx_ngt_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x56,0x7d]
 0x7e,0x04,0x56,0x7d
+# GFX11: v_cmpx_ngt_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x56,0x7d]
-# GFX11: v_cmpx_ngt_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x56,0x7d]
 0x7c,0x04,0x56,0x7d
+# GFX11: v_cmpx_ngt_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x56,0x7d]
-# GFX11: v_cmpx_ngt_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x56,0x7d]
 0xc1,0x04,0x56,0x7d
+# GFX11: v_cmpx_ngt_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x56,0x7d]
-# GFX11: v_cmpx_ngt_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x56,0x7d]
 0xf0,0x04,0x56,0x7d
+# GFX11: v_cmpx_ngt_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x56,0x7d]
-# GFX11: v_cmpx_ngt_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x56,0x7d]
 0xfd,0x04,0x56,0x7d
+# GFX11: v_cmpx_ngt_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x56,0x7d]
-# GFX11: v_cmpx_ngt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x57,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfc,0x57,0x7d,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_ngt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x57,0x7d,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_nle_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x18,0x7d]
 0x01,0x05,0x18,0x7d
+# GFX11: v_cmpx_nle_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x18,0x7d]
-# GFX11: v_cmpx_nle_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x18,0x7d]
 0x7f,0x05,0x18,0x7d
+# GFX11: v_cmpx_nle_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x18,0x7d]
-# GFX11: v_cmpx_nle_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x18,0x7d]
 0x01,0x04,0x18,0x7d
+# GFX11: v_cmpx_nle_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x18,0x7d]
-# GFX11: v_cmpx_nle_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x18,0x7d]
 0x69,0x04,0x18,0x7d
+# GFX11: v_cmpx_nle_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x18,0x7d]
-# GFX11: v_cmpx_nle_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x18,0x7d]
 0x6a,0x04,0x18,0x7d
+# GFX11: v_cmpx_nle_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x18,0x7d]
-# GFX11: v_cmpx_nle_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x18,0x7d]
 0x6b,0x04,0x18,0x7d
+# GFX11: v_cmpx_nle_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x18,0x7d]
-# GFX11: v_cmpx_nle_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x18,0x7d]
 0x7b,0x04,0x18,0x7d
+# GFX11: v_cmpx_nle_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x18,0x7d]
-# GFX11: v_cmpx_nle_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x18,0x7d]
 0x7d,0x04,0x18,0x7d
+# GFX11: v_cmpx_nle_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x18,0x7d]
-# GFX11: v_cmpx_nle_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x18,0x7d]
 0x7e,0x04,0x18,0x7d
+# GFX11: v_cmpx_nle_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x18,0x7d]
-# GFX11: v_cmpx_nle_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x18,0x7d]
 0x7f,0x04,0x18,0x7d
+# GFX11: v_cmpx_nle_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x18,0x7d]
-# GFX11: v_cmpx_nle_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x18,0x7d]
 0x7c,0x04,0x18,0x7d
+# GFX11: v_cmpx_nle_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x18,0x7d]
-# GFX11: v_cmpx_nle_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x18,0x7d]
 0xc1,0x04,0x18,0x7d
+# GFX11: v_cmpx_nle_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x18,0x7d]
-# GFX11: v_cmpx_nle_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x18,0x7d]
 0xf0,0x04,0x18,0x7d
+# GFX11: v_cmpx_nle_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x18,0x7d]
-# GFX11: v_cmpx_nle_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x18,0x7d]
 0xfd,0x04,0x18,0x7d
+# GFX11: v_cmpx_nle_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x18,0x7d]
-# GFX11: v_cmpx_nle_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x18,0x7d,0x0b,0xfe,0x00,0x00]
 0xff,0xfe,0x18,0x7d,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_nle_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x18,0x7d,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_nle_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x38,0x7d]
 0x01,0x05,0x38,0x7d
+# GFX11: v_cmpx_nle_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x38,0x7d]
-# GFX11: v_cmpx_nle_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x38,0x7d]
 0xff,0x05,0x38,0x7d
+# GFX11: v_cmpx_nle_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x38,0x7d]
-# GFX11: v_cmpx_nle_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x38,0x7d]
 0x01,0x04,0x38,0x7d
+# GFX11: v_cmpx_nle_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x38,0x7d]
-# GFX11: v_cmpx_nle_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x38,0x7d]
 0x69,0x04,0x38,0x7d
+# GFX11: v_cmpx_nle_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x38,0x7d]
-# GFX11: v_cmpx_nle_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x38,0x7d]
 0x6a,0x04,0x38,0x7d
+# GFX11: v_cmpx_nle_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x38,0x7d]
-# GFX11: v_cmpx_nle_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x38,0x7d]
 0x6b,0x04,0x38,0x7d
+# GFX11: v_cmpx_nle_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x38,0x7d]
-# GFX11: v_cmpx_nle_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x38,0x7d]
 0x7b,0x04,0x38,0x7d
+# GFX11: v_cmpx_nle_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x38,0x7d]
-# GFX11: v_cmpx_nle_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x38,0x7d]
 0x7d,0x04,0x38,0x7d
+# GFX11: v_cmpx_nle_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x38,0x7d]
-# GFX11: v_cmpx_nle_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x38,0x7d]
 0x7e,0x04,0x38,0x7d
+# GFX11: v_cmpx_nle_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x38,0x7d]
-# GFX11: v_cmpx_nle_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x38,0x7d]
 0x7f,0x04,0x38,0x7d
+# GFX11: v_cmpx_nle_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x38,0x7d]
-# GFX11: v_cmpx_nle_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x38,0x7d]
 0x7c,0x04,0x38,0x7d
+# GFX11: v_cmpx_nle_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x38,0x7d]
-# GFX11: v_cmpx_nle_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x38,0x7d]
 0xc1,0x04,0x38,0x7d
+# GFX11: v_cmpx_nle_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x38,0x7d]
-# GFX11: v_cmpx_nle_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x38,0x7d]
 0xf0,0x04,0x38,0x7d
+# GFX11: v_cmpx_nle_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x38,0x7d]
-# GFX11: v_cmpx_nle_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x38,0x7d]
 0xfd,0x04,0x38,0x7d
+# GFX11: v_cmpx_nle_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x38,0x7d]
-# GFX11: v_cmpx_nle_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x39,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfe,0x39,0x7d,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_nle_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x39,0x7d,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_nle_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x58,0x7d]
 0x01,0x05,0x58,0x7d
+# GFX11: v_cmpx_nle_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x58,0x7d]
-# GFX11: v_cmpx_nle_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x58,0x7d]
 0xfe,0x05,0x58,0x7d
+# GFX11: v_cmpx_nle_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x58,0x7d]
-# GFX11: v_cmpx_nle_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x58,0x7d]
 0x02,0x04,0x58,0x7d
+# GFX11: v_cmpx_nle_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x58,0x7d]
-# GFX11: v_cmpx_nle_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x58,0x7d]
 0x68,0x04,0x58,0x7d
+# GFX11: v_cmpx_nle_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x58,0x7d]
-# GFX11: v_cmpx_nle_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x58,0x7d]
 0x6a,0x04,0x58,0x7d
+# GFX11: v_cmpx_nle_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x58,0x7d]
-# GFX11: v_cmpx_nle_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x58,0x7d]
 0x7a,0x04,0x58,0x7d
+# GFX11: v_cmpx_nle_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x58,0x7d]
-# GFX11: v_cmpx_nle_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x58,0x7d]
 0x7e,0x04,0x58,0x7d
+# GFX11: v_cmpx_nle_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x58,0x7d]
-# GFX11: v_cmpx_nle_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x58,0x7d]
 0x7c,0x04,0x58,0x7d
+# GFX11: v_cmpx_nle_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x58,0x7d]
-# GFX11: v_cmpx_nle_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x58,0x7d]
 0xc1,0x04,0x58,0x7d
+# GFX11: v_cmpx_nle_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x58,0x7d]
-# GFX11: v_cmpx_nle_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x58,0x7d]
 0xf0,0x04,0x58,0x7d
+# GFX11: v_cmpx_nle_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x58,0x7d]
-# GFX11: v_cmpx_nle_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x58,0x7d]
 0xfd,0x04,0x58,0x7d
+# GFX11: v_cmpx_nle_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x58,0x7d]
-# GFX11: v_cmpx_nle_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x59,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfc,0x59,0x7d,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_nle_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x59,0x7d,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_nlg_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x14,0x7d]
 0x01,0x05,0x14,0x7d
+# GFX11: v_cmpx_nlg_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x14,0x7d]
-# GFX11: v_cmpx_nlg_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x14,0x7d]
 0x7f,0x05,0x14,0x7d
+# GFX11: v_cmpx_nlg_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x14,0x7d]
-# GFX11: v_cmpx_nlg_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x14,0x7d]
 0x01,0x04,0x14,0x7d
+# GFX11: v_cmpx_nlg_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x14,0x7d]
-# GFX11: v_cmpx_nlg_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x14,0x7d]
 0x69,0x04,0x14,0x7d
+# GFX11: v_cmpx_nlg_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x14,0x7d]
-# GFX11: v_cmpx_nlg_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x14,0x7d]
 0x6a,0x04,0x14,0x7d
+# GFX11: v_cmpx_nlg_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x14,0x7d]
-# GFX11: v_cmpx_nlg_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x14,0x7d]
 0x6b,0x04,0x14,0x7d
+# GFX11: v_cmpx_nlg_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x14,0x7d]
-# GFX11: v_cmpx_nlg_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x14,0x7d]
 0x7b,0x04,0x14,0x7d
+# GFX11: v_cmpx_nlg_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x14,0x7d]
-# GFX11: v_cmpx_nlg_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x14,0x7d]
 0x7d,0x04,0x14,0x7d
+# GFX11: v_cmpx_nlg_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x14,0x7d]
-# GFX11: v_cmpx_nlg_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x14,0x7d]
 0x7e,0x04,0x14,0x7d
+# GFX11: v_cmpx_nlg_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x14,0x7d]
-# GFX11: v_cmpx_nlg_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x14,0x7d]
 0x7f,0x04,0x14,0x7d
+# GFX11: v_cmpx_nlg_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x14,0x7d]
-# GFX11: v_cmpx_nlg_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x14,0x7d]
 0x7c,0x04,0x14,0x7d
+# GFX11: v_cmpx_nlg_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x14,0x7d]
-# GFX11: v_cmpx_nlg_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x14,0x7d]
 0xc1,0x04,0x14,0x7d
+# GFX11: v_cmpx_nlg_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x14,0x7d]
-# GFX11: v_cmpx_nlg_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x14,0x7d]
 0xf0,0x04,0x14,0x7d
+# GFX11: v_cmpx_nlg_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x14,0x7d]
-# GFX11: v_cmpx_nlg_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x14,0x7d]
 0xfd,0x04,0x14,0x7d
+# GFX11: v_cmpx_nlg_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x14,0x7d]
-# GFX11: v_cmpx_nlg_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x14,0x7d,0x0b,0xfe,0x00,0x00]
 0xff,0xfe,0x14,0x7d,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_nlg_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x14,0x7d,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_nlg_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x34,0x7d]
 0x01,0x05,0x34,0x7d
+# GFX11: v_cmpx_nlg_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x34,0x7d]
-# GFX11: v_cmpx_nlg_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x34,0x7d]
 0xff,0x05,0x34,0x7d
+# GFX11: v_cmpx_nlg_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x34,0x7d]
-# GFX11: v_cmpx_nlg_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x34,0x7d]
 0x01,0x04,0x34,0x7d
+# GFX11: v_cmpx_nlg_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x34,0x7d]
-# GFX11: v_cmpx_nlg_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x34,0x7d]
 0x69,0x04,0x34,0x7d
+# GFX11: v_cmpx_nlg_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x34,0x7d]
-# GFX11: v_cmpx_nlg_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x34,0x7d]
 0x6a,0x04,0x34,0x7d
+# GFX11: v_cmpx_nlg_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x34,0x7d]
-# GFX11: v_cmpx_nlg_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x34,0x7d]
 0x6b,0x04,0x34,0x7d
+# GFX11: v_cmpx_nlg_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x34,0x7d]
-# GFX11: v_cmpx_nlg_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x34,0x7d]
 0x7b,0x04,0x34,0x7d
+# GFX11: v_cmpx_nlg_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x34,0x7d]
-# GFX11: v_cmpx_nlg_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x34,0x7d]
 0x7d,0x04,0x34,0x7d
+# GFX11: v_cmpx_nlg_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x34,0x7d]
-# GFX11: v_cmpx_nlg_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x34,0x7d]
 0x7e,0x04,0x34,0x7d
+# GFX11: v_cmpx_nlg_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x34,0x7d]
-# GFX11: v_cmpx_nlg_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x34,0x7d]
 0x7f,0x04,0x34,0x7d
+# GFX11: v_cmpx_nlg_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x34,0x7d]
-# GFX11: v_cmpx_nlg_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x34,0x7d]
 0x7c,0x04,0x34,0x7d
+# GFX11: v_cmpx_nlg_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x34,0x7d]
-# GFX11: v_cmpx_nlg_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x34,0x7d]
 0xc1,0x04,0x34,0x7d
+# GFX11: v_cmpx_nlg_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x34,0x7d]
-# GFX11: v_cmpx_nlg_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x34,0x7d]
 0xf0,0x04,0x34,0x7d
+# GFX11: v_cmpx_nlg_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x34,0x7d]
-# GFX11: v_cmpx_nlg_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x34,0x7d]
 0xfd,0x04,0x34,0x7d
+# GFX11: v_cmpx_nlg_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x34,0x7d]
-# GFX11: v_cmpx_nlg_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x35,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfe,0x35,0x7d,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_nlg_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x35,0x7d,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_nlg_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x54,0x7d]
 0x01,0x05,0x54,0x7d
+# GFX11: v_cmpx_nlg_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x54,0x7d]
-# GFX11: v_cmpx_nlg_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x54,0x7d]
 0xfe,0x05,0x54,0x7d
+# GFX11: v_cmpx_nlg_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x54,0x7d]
-# GFX11: v_cmpx_nlg_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x54,0x7d]
 0x02,0x04,0x54,0x7d
+# GFX11: v_cmpx_nlg_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x54,0x7d]
-# GFX11: v_cmpx_nlg_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x54,0x7d]
 0x68,0x04,0x54,0x7d
+# GFX11: v_cmpx_nlg_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x54,0x7d]
-# GFX11: v_cmpx_nlg_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x54,0x7d]
 0x6a,0x04,0x54,0x7d
+# GFX11: v_cmpx_nlg_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x54,0x7d]
-# GFX11: v_cmpx_nlg_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x54,0x7d]
 0x7a,0x04,0x54,0x7d
+# GFX11: v_cmpx_nlg_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x54,0x7d]
-# GFX11: v_cmpx_nlg_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x54,0x7d]
 0x7e,0x04,0x54,0x7d
+# GFX11: v_cmpx_nlg_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x54,0x7d]
-# GFX11: v_cmpx_nlg_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x54,0x7d]
 0x7c,0x04,0x54,0x7d
+# GFX11: v_cmpx_nlg_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x54,0x7d]
-# GFX11: v_cmpx_nlg_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x54,0x7d]
 0xc1,0x04,0x54,0x7d
+# GFX11: v_cmpx_nlg_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x54,0x7d]
-# GFX11: v_cmpx_nlg_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x54,0x7d]
 0xf0,0x04,0x54,0x7d
+# GFX11: v_cmpx_nlg_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x54,0x7d]
-# GFX11: v_cmpx_nlg_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x54,0x7d]
 0xfd,0x04,0x54,0x7d
+# GFX11: v_cmpx_nlg_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x54,0x7d]
-# GFX11: v_cmpx_nlg_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x55,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfc,0x55,0x7d,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_nlg_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x55,0x7d,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_nlt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1c,0x7d]
 0x01,0x05,0x1c,0x7d
+# GFX11: v_cmpx_nlt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1c,0x7d]
-# GFX11: v_cmpx_nlt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1c,0x7d]
 0x7f,0x05,0x1c,0x7d
+# GFX11: v_cmpx_nlt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1c,0x7d]
-# GFX11: v_cmpx_nlt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1c,0x7d]
 0x01,0x04,0x1c,0x7d
+# GFX11: v_cmpx_nlt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1c,0x7d]
-# GFX11: v_cmpx_nlt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1c,0x7d]
 0x69,0x04,0x1c,0x7d
+# GFX11: v_cmpx_nlt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1c,0x7d]
-# GFX11: v_cmpx_nlt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1c,0x7d]
 0x6a,0x04,0x1c,0x7d
+# GFX11: v_cmpx_nlt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1c,0x7d]
-# GFX11: v_cmpx_nlt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1c,0x7d]
 0x6b,0x04,0x1c,0x7d
+# GFX11: v_cmpx_nlt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1c,0x7d]
-# GFX11: v_cmpx_nlt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1c,0x7d]
 0x7b,0x04,0x1c,0x7d
+# GFX11: v_cmpx_nlt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1c,0x7d]
-# GFX11: v_cmpx_nlt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1c,0x7d]
 0x7d,0x04,0x1c,0x7d
+# GFX11: v_cmpx_nlt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1c,0x7d]
-# GFX11: v_cmpx_nlt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1c,0x7d]
 0x7e,0x04,0x1c,0x7d
+# GFX11: v_cmpx_nlt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1c,0x7d]
-# GFX11: v_cmpx_nlt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1c,0x7d]
 0x7f,0x04,0x1c,0x7d
+# GFX11: v_cmpx_nlt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1c,0x7d]
-# GFX11: v_cmpx_nlt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1c,0x7d]
 0x7c,0x04,0x1c,0x7d
+# GFX11: v_cmpx_nlt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1c,0x7d]
-# GFX11: v_cmpx_nlt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1c,0x7d]
 0xc1,0x04,0x1c,0x7d
+# GFX11: v_cmpx_nlt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1c,0x7d]
-# GFX11: v_cmpx_nlt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1c,0x7d]
 0xf0,0x04,0x1c,0x7d
+# GFX11: v_cmpx_nlt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1c,0x7d]
-# GFX11: v_cmpx_nlt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1c,0x7d]
 0xfd,0x04,0x1c,0x7d
+# GFX11: v_cmpx_nlt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1c,0x7d]
-# GFX11: v_cmpx_nlt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1c,0x7d,0x0b,0xfe,0x00,0x00]
 0xff,0xfe,0x1c,0x7d,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_nlt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1c,0x7d,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_nlt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x3c,0x7d]
 0x01,0x05,0x3c,0x7d
+# GFX11: v_cmpx_nlt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x3c,0x7d]
-# GFX11: v_cmpx_nlt_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x3c,0x7d]
 0xff,0x05,0x3c,0x7d
+# GFX11: v_cmpx_nlt_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x3c,0x7d]
-# GFX11: v_cmpx_nlt_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x3c,0x7d]
 0x01,0x04,0x3c,0x7d
+# GFX11: v_cmpx_nlt_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x3c,0x7d]
-# GFX11: v_cmpx_nlt_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x3c,0x7d]
 0x69,0x04,0x3c,0x7d
+# GFX11: v_cmpx_nlt_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x3c,0x7d]
-# GFX11: v_cmpx_nlt_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x3c,0x7d]
 0x6a,0x04,0x3c,0x7d
+# GFX11: v_cmpx_nlt_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x3c,0x7d]
-# GFX11: v_cmpx_nlt_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x3c,0x7d]
 0x6b,0x04,0x3c,0x7d
+# GFX11: v_cmpx_nlt_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x3c,0x7d]
-# GFX11: v_cmpx_nlt_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x3c,0x7d]
 0x7b,0x04,0x3c,0x7d
+# GFX11: v_cmpx_nlt_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x3c,0x7d]
-# GFX11: v_cmpx_nlt_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x3c,0x7d]
 0x7d,0x04,0x3c,0x7d
+# GFX11: v_cmpx_nlt_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x3c,0x7d]
-# GFX11: v_cmpx_nlt_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x3c,0x7d]
 0x7e,0x04,0x3c,0x7d
+# GFX11: v_cmpx_nlt_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x3c,0x7d]
-# GFX11: v_cmpx_nlt_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x3c,0x7d]
 0x7f,0x04,0x3c,0x7d
+# GFX11: v_cmpx_nlt_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x3c,0x7d]
-# GFX11: v_cmpx_nlt_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x3c,0x7d]
 0x7c,0x04,0x3c,0x7d
+# GFX11: v_cmpx_nlt_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x3c,0x7d]
-# GFX11: v_cmpx_nlt_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x3c,0x7d]
 0xc1,0x04,0x3c,0x7d
+# GFX11: v_cmpx_nlt_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x3c,0x7d]
-# GFX11: v_cmpx_nlt_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x3c,0x7d]
 0xf0,0x04,0x3c,0x7d
+# GFX11: v_cmpx_nlt_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x3c,0x7d]
-# GFX11: v_cmpx_nlt_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x3c,0x7d]
 0xfd,0x04,0x3c,0x7d
+# GFX11: v_cmpx_nlt_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x3c,0x7d]
-# GFX11: v_cmpx_nlt_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x3d,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfe,0x3d,0x7d,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_nlt_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x3d,0x7d,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_nlt_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x5c,0x7d]
 0x01,0x05,0x5c,0x7d
+# GFX11: v_cmpx_nlt_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x5c,0x7d]
-# GFX11: v_cmpx_nlt_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x5c,0x7d]
 0xfe,0x05,0x5c,0x7d
+# GFX11: v_cmpx_nlt_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x5c,0x7d]
-# GFX11: v_cmpx_nlt_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x5c,0x7d]
 0x02,0x04,0x5c,0x7d
+# GFX11: v_cmpx_nlt_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x5c,0x7d]
-# GFX11: v_cmpx_nlt_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x5c,0x7d]
 0x68,0x04,0x5c,0x7d
+# GFX11: v_cmpx_nlt_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x5c,0x7d]
-# GFX11: v_cmpx_nlt_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x5c,0x7d]
 0x6a,0x04,0x5c,0x7d
+# GFX11: v_cmpx_nlt_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x5c,0x7d]
-# GFX11: v_cmpx_nlt_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x5c,0x7d]
 0x7a,0x04,0x5c,0x7d
+# GFX11: v_cmpx_nlt_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x5c,0x7d]
-# GFX11: v_cmpx_nlt_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x5c,0x7d]
 0x7e,0x04,0x5c,0x7d
+# GFX11: v_cmpx_nlt_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x5c,0x7d]
-# GFX11: v_cmpx_nlt_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x5c,0x7d]
 0x7c,0x04,0x5c,0x7d
+# GFX11: v_cmpx_nlt_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x5c,0x7d]
-# GFX11: v_cmpx_nlt_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x5c,0x7d]
 0xc1,0x04,0x5c,0x7d
+# GFX11: v_cmpx_nlt_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x5c,0x7d]
-# GFX11: v_cmpx_nlt_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x5c,0x7d]
 0xf0,0x04,0x5c,0x7d
+# GFX11: v_cmpx_nlt_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x5c,0x7d]
-# GFX11: v_cmpx_nlt_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x5c,0x7d]
 0xfd,0x04,0x5c,0x7d
+# GFX11: v_cmpx_nlt_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x5c,0x7d]
-# GFX11: v_cmpx_nlt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5d,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfc,0x5d,0x7d,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_nlt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5d,0x7d,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_o_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0e,0x7d]
 0x01,0x05,0x0e,0x7d
+# GFX11: v_cmpx_o_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0e,0x7d]
-# GFX11: v_cmpx_o_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0e,0x7d]
 0x7f,0x05,0x0e,0x7d
+# GFX11: v_cmpx_o_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0e,0x7d]
-# GFX11: v_cmpx_o_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0e,0x7d]
 0x01,0x04,0x0e,0x7d
+# GFX11: v_cmpx_o_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0e,0x7d]
-# GFX11: v_cmpx_o_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0e,0x7d]
 0x69,0x04,0x0e,0x7d
+# GFX11: v_cmpx_o_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0e,0x7d]
-# GFX11: v_cmpx_o_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0e,0x7d]
 0x6a,0x04,0x0e,0x7d
+# GFX11: v_cmpx_o_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0e,0x7d]
-# GFX11: v_cmpx_o_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0e,0x7d]
 0x6b,0x04,0x0e,0x7d
+# GFX11: v_cmpx_o_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0e,0x7d]
-# GFX11: v_cmpx_o_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0e,0x7d]
 0x7b,0x04,0x0e,0x7d
+# GFX11: v_cmpx_o_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0e,0x7d]
-# GFX11: v_cmpx_o_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0e,0x7d]
 0x7d,0x04,0x0e,0x7d
+# GFX11: v_cmpx_o_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0e,0x7d]
-# GFX11: v_cmpx_o_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0e,0x7d]
 0x7e,0x04,0x0e,0x7d
+# GFX11: v_cmpx_o_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0e,0x7d]
-# GFX11: v_cmpx_o_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0e,0x7d]
 0x7f,0x04,0x0e,0x7d
+# GFX11: v_cmpx_o_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0e,0x7d]
-# GFX11: v_cmpx_o_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0e,0x7d]
 0x7c,0x04,0x0e,0x7d
+# GFX11: v_cmpx_o_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0e,0x7d]
-# GFX11: v_cmpx_o_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0e,0x7d]
 0xc1,0x04,0x0e,0x7d
+# GFX11: v_cmpx_o_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0e,0x7d]
-# GFX11: v_cmpx_o_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0e,0x7d]
 0xf0,0x04,0x0e,0x7d
+# GFX11: v_cmpx_o_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0e,0x7d]
-# GFX11: v_cmpx_o_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0e,0x7d]
 0xfd,0x04,0x0e,0x7d
+# GFX11: v_cmpx_o_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0e,0x7d]
-# GFX11: v_cmpx_o_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0e,0x7d,0x0b,0xfe,0x00,0x00]
 0xff,0xfe,0x0e,0x7d,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_o_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0e,0x7d,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_o_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2e,0x7d]
 0x01,0x05,0x2e,0x7d
+# GFX11: v_cmpx_o_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2e,0x7d]
-# GFX11: v_cmpx_o_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x2e,0x7d]
 0xff,0x05,0x2e,0x7d
+# GFX11: v_cmpx_o_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x2e,0x7d]
-# GFX11: v_cmpx_o_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x2e,0x7d]
 0x01,0x04,0x2e,0x7d
+# GFX11: v_cmpx_o_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x2e,0x7d]
-# GFX11: v_cmpx_o_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x2e,0x7d]
 0x69,0x04,0x2e,0x7d
+# GFX11: v_cmpx_o_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x2e,0x7d]
-# GFX11: v_cmpx_o_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x2e,0x7d]
 0x6a,0x04,0x2e,0x7d
+# GFX11: v_cmpx_o_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x2e,0x7d]
-# GFX11: v_cmpx_o_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x2e,0x7d]
 0x6b,0x04,0x2e,0x7d
+# GFX11: v_cmpx_o_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x2e,0x7d]
-# GFX11: v_cmpx_o_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x2e,0x7d]
 0x7b,0x04,0x2e,0x7d
+# GFX11: v_cmpx_o_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x2e,0x7d]
-# GFX11: v_cmpx_o_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x2e,0x7d]
 0x7d,0x04,0x2e,0x7d
+# GFX11: v_cmpx_o_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x2e,0x7d]
-# GFX11: v_cmpx_o_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x2e,0x7d]
 0x7e,0x04,0x2e,0x7d
+# GFX11: v_cmpx_o_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x2e,0x7d]
-# GFX11: v_cmpx_o_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x2e,0x7d]
 0x7f,0x04,0x2e,0x7d
+# GFX11: v_cmpx_o_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x2e,0x7d]
-# GFX11: v_cmpx_o_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x2e,0x7d]
 0x7c,0x04,0x2e,0x7d
+# GFX11: v_cmpx_o_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x2e,0x7d]
-# GFX11: v_cmpx_o_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x2e,0x7d]
 0xc1,0x04,0x2e,0x7d
+# GFX11: v_cmpx_o_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x2e,0x7d]
-# GFX11: v_cmpx_o_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x2e,0x7d]
 0xf0,0x04,0x2e,0x7d
+# GFX11: v_cmpx_o_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x2e,0x7d]
-# GFX11: v_cmpx_o_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x2e,0x7d]
 0xfd,0x04,0x2e,0x7d
+# GFX11: v_cmpx_o_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x2e,0x7d]
-# GFX11: v_cmpx_o_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2f,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfe,0x2f,0x7d,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_o_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2f,0x7d,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_o_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4e,0x7d]
 0x01,0x05,0x4e,0x7d
+# GFX11: v_cmpx_o_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4e,0x7d]
-# GFX11: v_cmpx_o_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4e,0x7d]
 0xfe,0x05,0x4e,0x7d
+# GFX11: v_cmpx_o_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4e,0x7d]
-# GFX11: v_cmpx_o_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x4e,0x7d]
 0x02,0x04,0x4e,0x7d
+# GFX11: v_cmpx_o_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x4e,0x7d]
-# GFX11: v_cmpx_o_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4e,0x7d]
 0x68,0x04,0x4e,0x7d
+# GFX11: v_cmpx_o_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4e,0x7d]
-# GFX11: v_cmpx_o_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x4e,0x7d]
 0x6a,0x04,0x4e,0x7d
+# GFX11: v_cmpx_o_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x4e,0x7d]
-# GFX11: v_cmpx_o_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x4e,0x7d]
 0x7a,0x04,0x4e,0x7d
+# GFX11: v_cmpx_o_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x4e,0x7d]
-# GFX11: v_cmpx_o_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x4e,0x7d]
 0x7e,0x04,0x4e,0x7d
+# GFX11: v_cmpx_o_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x4e,0x7d]
-# GFX11: v_cmpx_o_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x4e,0x7d]
 0x7c,0x04,0x4e,0x7d
+# GFX11: v_cmpx_o_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x4e,0x7d]
-# GFX11: v_cmpx_o_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x4e,0x7d]
 0xc1,0x04,0x4e,0x7d
+# GFX11: v_cmpx_o_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x4e,0x7d]
-# GFX11: v_cmpx_o_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4e,0x7d]
 0xf0,0x04,0x4e,0x7d
+# GFX11: v_cmpx_o_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4e,0x7d]
-# GFX11: v_cmpx_o_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4e,0x7d]
 0xfd,0x04,0x4e,0x7d
+# GFX11: v_cmpx_o_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4e,0x7d]
-# GFX11: v_cmpx_o_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4f,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfc,0x4f,0x7d,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_o_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4f,0x7d,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_t_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1e,0x7d]
 0x01,0x05,0x1e,0x7d
+# GFX11: v_cmpx_t_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1e,0x7d]
-# GFX11: v_cmpx_t_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1e,0x7d]
 0x7f,0x05,0x1e,0x7d
+# GFX11: v_cmpx_t_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1e,0x7d]
-# GFX11: v_cmpx_t_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1e,0x7d]
 0x01,0x04,0x1e,0x7d
+# GFX11: v_cmpx_t_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1e,0x7d]
-# GFX11: v_cmpx_t_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1e,0x7d]
 0x69,0x04,0x1e,0x7d
+# GFX11: v_cmpx_t_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1e,0x7d]
-# GFX11: v_cmpx_t_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1e,0x7d]
 0x6a,0x04,0x1e,0x7d
+# GFX11: v_cmpx_t_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1e,0x7d]
-# GFX11: v_cmpx_t_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1e,0x7d]
 0x6b,0x04,0x1e,0x7d
+# GFX11: v_cmpx_t_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1e,0x7d]
-# GFX11: v_cmpx_t_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1e,0x7d]
 0x7b,0x04,0x1e,0x7d
+# GFX11: v_cmpx_t_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1e,0x7d]
-# GFX11: v_cmpx_t_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1e,0x7d]
 0x7d,0x04,0x1e,0x7d
+# GFX11: v_cmpx_t_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1e,0x7d]
-# GFX11: v_cmpx_t_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1e,0x7d]
 0x7e,0x04,0x1e,0x7d
+# GFX11: v_cmpx_t_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1e,0x7d]
-# GFX11: v_cmpx_t_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1e,0x7d]
 0x7f,0x04,0x1e,0x7d
+# GFX11: v_cmpx_t_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1e,0x7d]
-# GFX11: v_cmpx_t_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1e,0x7d]
 0x7c,0x04,0x1e,0x7d
+# GFX11: v_cmpx_t_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1e,0x7d]
-# GFX11: v_cmpx_t_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1e,0x7d]
 0xc1,0x04,0x1e,0x7d
+# GFX11: v_cmpx_t_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1e,0x7d]
-# GFX11: v_cmpx_t_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1e,0x7d]
 0xf0,0x04,0x1e,0x7d
+# GFX11: v_cmpx_t_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1e,0x7d]
-# GFX11: v_cmpx_t_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1e,0x7d]
 0xfd,0x04,0x1e,0x7d
+# GFX11: v_cmpx_t_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1e,0x7d]
-# GFX11: v_cmpx_t_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1e,0x7d,0x0b,0xfe,0x00,0x00]
 0xff,0xfe,0x1e,0x7d,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_t_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1e,0x7d,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_t_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x3e,0x7d]
 0x01,0x05,0x3e,0x7d
+# GFX11: v_cmpx_t_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x3e,0x7d]
-# GFX11: v_cmpx_t_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x3e,0x7d]
 0xff,0x05,0x3e,0x7d
+# GFX11: v_cmpx_t_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x3e,0x7d]
-# GFX11: v_cmpx_t_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x3e,0x7d]
 0x01,0x04,0x3e,0x7d
+# GFX11: v_cmpx_t_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x3e,0x7d]
-# GFX11: v_cmpx_t_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x3e,0x7d]
 0x69,0x04,0x3e,0x7d
+# GFX11: v_cmpx_t_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x3e,0x7d]
-# GFX11: v_cmpx_t_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x3e,0x7d]
 0x6a,0x04,0x3e,0x7d
+# GFX11: v_cmpx_t_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x3e,0x7d]
-# GFX11: v_cmpx_t_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x3e,0x7d]
 0x6b,0x04,0x3e,0x7d
+# GFX11: v_cmpx_t_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x3e,0x7d]
-# GFX11: v_cmpx_t_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x3e,0x7d]
 0x7b,0x04,0x3e,0x7d
+# GFX11: v_cmpx_t_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x3e,0x7d]
-# GFX11: v_cmpx_t_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x3e,0x7d]
 0x7d,0x04,0x3e,0x7d
+# GFX11: v_cmpx_t_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x3e,0x7d]
-# GFX11: v_cmpx_t_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x3e,0x7d]
 0x7e,0x04,0x3e,0x7d
+# GFX11: v_cmpx_t_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x3e,0x7d]
-# GFX11: v_cmpx_t_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x3e,0x7d]
 0x7f,0x04,0x3e,0x7d
+# GFX11: v_cmpx_t_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x3e,0x7d]
-# GFX11: v_cmpx_t_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x3e,0x7d]
 0x7c,0x04,0x3e,0x7d
+# GFX11: v_cmpx_t_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x3e,0x7d]
-# GFX11: v_cmpx_t_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x3e,0x7d]
 0xc1,0x04,0x3e,0x7d
+# GFX11: v_cmpx_t_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x3e,0x7d]
-# GFX11: v_cmpx_t_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x3e,0x7d]
 0xf0,0x04,0x3e,0x7d
+# GFX11: v_cmpx_t_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x3e,0x7d]
-# GFX11: v_cmpx_t_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x3e,0x7d]
 0xfd,0x04,0x3e,0x7d
+# GFX11: v_cmpx_t_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x3e,0x7d]
-# GFX11: v_cmpx_t_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x3f,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfe,0x3f,0x7d,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_t_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x3f,0x7d,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_t_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x5e,0x7d]
 0x01,0x05,0x5e,0x7d
+# GFX11: v_cmpx_t_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x5e,0x7d]
-# GFX11: v_cmpx_t_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x5e,0x7d]
 0xfe,0x05,0x5e,0x7d
+# GFX11: v_cmpx_t_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x5e,0x7d]
-# GFX11: v_cmpx_t_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x5e,0x7d]
 0x02,0x04,0x5e,0x7d
+# GFX11: v_cmpx_t_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x5e,0x7d]
-# GFX11: v_cmpx_t_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x5e,0x7d]
 0x68,0x04,0x5e,0x7d
+# GFX11: v_cmpx_t_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x5e,0x7d]
-# GFX11: v_cmpx_t_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x5e,0x7d]
 0x6a,0x04,0x5e,0x7d
+# GFX11: v_cmpx_t_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x5e,0x7d]
-# GFX11: v_cmpx_t_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x5e,0x7d]
 0x7a,0x04,0x5e,0x7d
+# GFX11: v_cmpx_t_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x5e,0x7d]
-# GFX11: v_cmpx_t_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x5e,0x7d]
 0x7e,0x04,0x5e,0x7d
+# GFX11: v_cmpx_t_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x5e,0x7d]
-# GFX11: v_cmpx_t_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x5e,0x7d]
 0x7c,0x04,0x5e,0x7d
+# GFX11: v_cmpx_t_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x5e,0x7d]
-# GFX11: v_cmpx_t_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x5e,0x7d]
 0xc1,0x04,0x5e,0x7d
+# GFX11: v_cmpx_t_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x5e,0x7d]
-# GFX11: v_cmpx_t_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x5e,0x7d]
 0xf0,0x04,0x5e,0x7d
+# GFX11: v_cmpx_t_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x5e,0x7d]
-# GFX11: v_cmpx_t_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x5e,0x7d]
 0xfd,0x04,0x5e,0x7d
+# GFX11: v_cmpx_t_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x5e,0x7d]
-# GFX11: v_cmpx_t_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5f,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfc,0x5f,0x7d,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_t_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5f,0x7d,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_t_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x8e,0x7d]
 0x01,0x05,0x8e,0x7d
+# GFX11: v_cmpx_t_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x8e,0x7d]
-# GFX11: v_cmpx_t_i32_e32 v255, v2 ; encoding: [0xff,0x05,0x8e,0x7d]
 0xff,0x05,0x8e,0x7d
+# GFX11: v_cmpx_t_i32_e32 v255, v2 ; encoding: [0xff,0x05,0x8e,0x7d]
-# GFX11: v_cmpx_t_i32_e32 s1, v2 ; encoding: [0x01,0x04,0x8e,0x7d]
 0x01,0x04,0x8e,0x7d
+# GFX11: v_cmpx_t_i32_e32 s1, v2 ; encoding: [0x01,0x04,0x8e,0x7d]
-# GFX11: v_cmpx_t_i32_e32 s105, v2 ; encoding: [0x69,0x04,0x8e,0x7d]
 0x69,0x04,0x8e,0x7d
+# GFX11: v_cmpx_t_i32_e32 s105, v2 ; encoding: [0x69,0x04,0x8e,0x7d]
-# GFX11: v_cmpx_t_i32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x8e,0x7d]
 0x6a,0x04,0x8e,0x7d
+# GFX11: v_cmpx_t_i32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x8e,0x7d]
-# GFX11: v_cmpx_t_i32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x8e,0x7d]
 0x6b,0x04,0x8e,0x7d
+# GFX11: v_cmpx_t_i32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x8e,0x7d]
-# GFX11: v_cmpx_t_i32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x8e,0x7d]
 0x7b,0x04,0x8e,0x7d
+# GFX11: v_cmpx_t_i32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x8e,0x7d]
-# GFX11: v_cmpx_t_i32_e32 m0, v2 ; encoding: [0x7d,0x04,0x8e,0x7d]
 0x7d,0x04,0x8e,0x7d
+# GFX11: v_cmpx_t_i32_e32 m0, v2 ; encoding: [0x7d,0x04,0x8e,0x7d]
-# GFX11: v_cmpx_t_i32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x8e,0x7d]
 0x7e,0x04,0x8e,0x7d
+# GFX11: v_cmpx_t_i32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x8e,0x7d]
-# GFX11: v_cmpx_t_i32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x8e,0x7d]
 0x7f,0x04,0x8e,0x7d
+# GFX11: v_cmpx_t_i32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x8e,0x7d]
-# GFX11: v_cmpx_t_i32_e32 null, v2 ; encoding: [0x7c,0x04,0x8e,0x7d]
 0x7c,0x04,0x8e,0x7d
+# GFX11: v_cmpx_t_i32_e32 null, v2 ; encoding: [0x7c,0x04,0x8e,0x7d]
-# GFX11: v_cmpx_t_i32_e32 -1, v2 ; encoding: [0xc1,0x04,0x8e,0x7d]
 0xc1,0x04,0x8e,0x7d
+# GFX11: v_cmpx_t_i32_e32 -1, v2 ; encoding: [0xc1,0x04,0x8e,0x7d]
-# GFX11: v_cmpx_t_i32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x8e,0x7d]
 0xf0,0x04,0x8e,0x7d
+# GFX11: v_cmpx_t_i32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x8e,0x7d]
-# GFX11: v_cmpx_t_i32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x8e,0x7d]
 0xfd,0x04,0x8e,0x7d
+# GFX11: v_cmpx_t_i32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x8e,0x7d]
-# GFX11: v_cmpx_t_i32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x8f,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfe,0x8f,0x7d,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_t_i32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x8f,0x7d,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_t_i64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xae,0x7d]
 0x01,0x05,0xae,0x7d
+# GFX11: v_cmpx_t_i64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xae,0x7d]
-# GFX11: v_cmpx_t_i64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xae,0x7d]
 0xfe,0x05,0xae,0x7d
+# GFX11: v_cmpx_t_i64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xae,0x7d]
-# GFX11: v_cmpx_t_i64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xae,0x7d]
 0x02,0x04,0xae,0x7d
+# GFX11: v_cmpx_t_i64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xae,0x7d]
-# GFX11: v_cmpx_t_i64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xae,0x7d]
 0x68,0x04,0xae,0x7d
+# GFX11: v_cmpx_t_i64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xae,0x7d]
-# GFX11: v_cmpx_t_i64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xae,0x7d]
 0x6a,0x04,0xae,0x7d
+# GFX11: v_cmpx_t_i64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xae,0x7d]
-# GFX11: v_cmpx_t_i64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xae,0x7d]
 0x7a,0x04,0xae,0x7d
+# GFX11: v_cmpx_t_i64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xae,0x7d]
-# GFX11: v_cmpx_t_i64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xae,0x7d]
 0x7e,0x04,0xae,0x7d
+# GFX11: v_cmpx_t_i64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xae,0x7d]
-# GFX11: v_cmpx_t_i64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xae,0x7d]
 0x7c,0x04,0xae,0x7d
+# GFX11: v_cmpx_t_i64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xae,0x7d]
-# GFX11: v_cmpx_t_i64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xae,0x7d]
 0xc1,0x04,0xae,0x7d
+# GFX11: v_cmpx_t_i64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xae,0x7d]
-# GFX11: v_cmpx_t_i64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xae,0x7d]
 0xf0,0x04,0xae,0x7d
+# GFX11: v_cmpx_t_i64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xae,0x7d]
-# GFX11: v_cmpx_t_i64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xae,0x7d]
 0xfd,0x04,0xae,0x7d
+# GFX11: v_cmpx_t_i64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xae,0x7d]
-# GFX11: v_cmpx_t_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xaf,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfc,0xaf,0x7d,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_t_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xaf,0x7d,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_t_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x9e,0x7d]
 0x01,0x05,0x9e,0x7d
+# GFX11: v_cmpx_t_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x9e,0x7d]
-# GFX11: v_cmpx_t_u32_e32 v255, v2 ; encoding: [0xff,0x05,0x9e,0x7d]
 0xff,0x05,0x9e,0x7d
+# GFX11: v_cmpx_t_u32_e32 v255, v2 ; encoding: [0xff,0x05,0x9e,0x7d]
-# GFX11: v_cmpx_t_u32_e32 s1, v2 ; encoding: [0x01,0x04,0x9e,0x7d]
 0x01,0x04,0x9e,0x7d
+# GFX11: v_cmpx_t_u32_e32 s1, v2 ; encoding: [0x01,0x04,0x9e,0x7d]
-# GFX11: v_cmpx_t_u32_e32 s105, v2 ; encoding: [0x69,0x04,0x9e,0x7d]
 0x69,0x04,0x9e,0x7d
+# GFX11: v_cmpx_t_u32_e32 s105, v2 ; encoding: [0x69,0x04,0x9e,0x7d]
-# GFX11: v_cmpx_t_u32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x9e,0x7d]
 0x6a,0x04,0x9e,0x7d
+# GFX11: v_cmpx_t_u32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x9e,0x7d]
-# GFX11: v_cmpx_t_u32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x9e,0x7d]
 0x6b,0x04,0x9e,0x7d
+# GFX11: v_cmpx_t_u32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x9e,0x7d]
-# GFX11: v_cmpx_t_u32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x9e,0x7d]
 0x7b,0x04,0x9e,0x7d
+# GFX11: v_cmpx_t_u32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x9e,0x7d]
-# GFX11: v_cmpx_t_u32_e32 m0, v2 ; encoding: [0x7d,0x04,0x9e,0x7d]
 0x7d,0x04,0x9e,0x7d
+# GFX11: v_cmpx_t_u32_e32 m0, v2 ; encoding: [0x7d,0x04,0x9e,0x7d]
-# GFX11: v_cmpx_t_u32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x9e,0x7d]
 0x7e,0x04,0x9e,0x7d
+# GFX11: v_cmpx_t_u32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x9e,0x7d]
-# GFX11: v_cmpx_t_u32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x9e,0x7d]
 0x7f,0x04,0x9e,0x7d
+# GFX11: v_cmpx_t_u32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x9e,0x7d]
-# GFX11: v_cmpx_t_u32_e32 null, v2 ; encoding: [0x7c,0x04,0x9e,0x7d]
 0x7c,0x04,0x9e,0x7d
+# GFX11: v_cmpx_t_u32_e32 null, v2 ; encoding: [0x7c,0x04,0x9e,0x7d]
-# GFX11: v_cmpx_t_u32_e32 -1, v2 ; encoding: [0xc1,0x04,0x9e,0x7d]
 0xc1,0x04,0x9e,0x7d
+# GFX11: v_cmpx_t_u32_e32 -1, v2 ; encoding: [0xc1,0x04,0x9e,0x7d]
-# GFX11: v_cmpx_t_u32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x9e,0x7d]
 0xf0,0x04,0x9e,0x7d
+# GFX11: v_cmpx_t_u32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x9e,0x7d]
-# GFX11: v_cmpx_t_u32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x9e,0x7d]
 0xfd,0x04,0x9e,0x7d
+# GFX11: v_cmpx_t_u32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x9e,0x7d]
-# GFX11: v_cmpx_t_u32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x9f,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfe,0x9f,0x7d,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_t_u32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x9f,0x7d,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_t_u64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xbe,0x7d]
 0x01,0x05,0xbe,0x7d
+# GFX11: v_cmpx_t_u64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xbe,0x7d]
-# GFX11: v_cmpx_t_u64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xbe,0x7d]
 0xfe,0x05,0xbe,0x7d
+# GFX11: v_cmpx_t_u64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xbe,0x7d]
-# GFX11: v_cmpx_t_u64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xbe,0x7d]
 0x02,0x04,0xbe,0x7d
+# GFX11: v_cmpx_t_u64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xbe,0x7d]
-# GFX11: v_cmpx_t_u64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xbe,0x7d]
 0x68,0x04,0xbe,0x7d
+# GFX11: v_cmpx_t_u64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xbe,0x7d]
-# GFX11: v_cmpx_t_u64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xbe,0x7d]
 0x6a,0x04,0xbe,0x7d
+# GFX11: v_cmpx_t_u64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xbe,0x7d]
-# GFX11: v_cmpx_t_u64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xbe,0x7d]
 0x7a,0x04,0xbe,0x7d
+# GFX11: v_cmpx_t_u64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xbe,0x7d]
-# GFX11: v_cmpx_t_u64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xbe,0x7d]
 0x7e,0x04,0xbe,0x7d
+# GFX11: v_cmpx_t_u64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xbe,0x7d]
-# GFX11: v_cmpx_t_u64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xbe,0x7d]
 0x7c,0x04,0xbe,0x7d
+# GFX11: v_cmpx_t_u64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xbe,0x7d]
-# GFX11: v_cmpx_t_u64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xbe,0x7d]
 0xc1,0x04,0xbe,0x7d
+# GFX11: v_cmpx_t_u64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xbe,0x7d]
-# GFX11: v_cmpx_t_u64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xbe,0x7d]
 0xf0,0x04,0xbe,0x7d
+# GFX11: v_cmpx_t_u64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xbe,0x7d]
-# GFX11: v_cmpx_t_u64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xbe,0x7d]
 0xfd,0x04,0xbe,0x7d
+# GFX11: v_cmpx_t_u64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xbe,0x7d]
-# GFX11: v_cmpx_t_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbf,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfc,0xbf,0x7d,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_t_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbf,0x7d,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_u_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x10,0x7d]
 0x01,0x05,0x10,0x7d
+# GFX11: v_cmpx_u_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x10,0x7d]
-# GFX11: v_cmpx_u_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x10,0x7d]
 0x7f,0x05,0x10,0x7d
+# GFX11: v_cmpx_u_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x10,0x7d]
-# GFX11: v_cmpx_u_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x10,0x7d]
 0x01,0x04,0x10,0x7d
+# GFX11: v_cmpx_u_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x10,0x7d]
-# GFX11: v_cmpx_u_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x10,0x7d]
 0x69,0x04,0x10,0x7d
+# GFX11: v_cmpx_u_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x10,0x7d]
-# GFX11: v_cmpx_u_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x10,0x7d]
 0x6a,0x04,0x10,0x7d
+# GFX11: v_cmpx_u_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x10,0x7d]
-# GFX11: v_cmpx_u_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x10,0x7d]
 0x6b,0x04,0x10,0x7d
+# GFX11: v_cmpx_u_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x10,0x7d]
-# GFX11: v_cmpx_u_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x10,0x7d]
 0x7b,0x04,0x10,0x7d
+# GFX11: v_cmpx_u_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x10,0x7d]
-# GFX11: v_cmpx_u_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x10,0x7d]
 0x7d,0x04,0x10,0x7d
+# GFX11: v_cmpx_u_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x10,0x7d]
-# GFX11: v_cmpx_u_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x10,0x7d]
 0x7e,0x04,0x10,0x7d
+# GFX11: v_cmpx_u_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x10,0x7d]
-# GFX11: v_cmpx_u_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x10,0x7d]
 0x7f,0x04,0x10,0x7d
+# GFX11: v_cmpx_u_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x10,0x7d]
-# GFX11: v_cmpx_u_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x10,0x7d]
 0x7c,0x04,0x10,0x7d
+# GFX11: v_cmpx_u_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x10,0x7d]
-# GFX11: v_cmpx_u_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x10,0x7d]
 0xc1,0x04,0x10,0x7d
+# GFX11: v_cmpx_u_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x10,0x7d]
-# GFX11: v_cmpx_u_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x10,0x7d]
 0xf0,0x04,0x10,0x7d
+# GFX11: v_cmpx_u_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x10,0x7d]
-# GFX11: v_cmpx_u_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x10,0x7d]
 0xfd,0x04,0x10,0x7d
+# GFX11: v_cmpx_u_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x10,0x7d]
-# GFX11: v_cmpx_u_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x10,0x7d,0x0b,0xfe,0x00,0x00]
 0xff,0xfe,0x10,0x7d,0x0b,0xfe,0x00,0x00
+# GFX11: v_cmpx_u_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x10,0x7d,0x0b,0xfe,0x00,0x00]
-# GFX11: v_cmpx_u_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x30,0x7d]
 0x01,0x05,0x30,0x7d
+# GFX11: v_cmpx_u_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x30,0x7d]
-# GFX11: v_cmpx_u_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x30,0x7d]
 0xff,0x05,0x30,0x7d
+# GFX11: v_cmpx_u_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x30,0x7d]
-# GFX11: v_cmpx_u_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x30,0x7d]
 0x01,0x04,0x30,0x7d
+# GFX11: v_cmpx_u_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x30,0x7d]
-# GFX11: v_cmpx_u_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x30,0x7d]
 0x69,0x04,0x30,0x7d
+# GFX11: v_cmpx_u_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x30,0x7d]
-# GFX11: v_cmpx_u_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x30,0x7d]
 0x6a,0x04,0x30,0x7d
+# GFX11: v_cmpx_u_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x30,0x7d]
-# GFX11: v_cmpx_u_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x30,0x7d]
 0x6b,0x04,0x30,0x7d
+# GFX11: v_cmpx_u_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x30,0x7d]
-# GFX11: v_cmpx_u_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x30,0x7d]
 0x7b,0x04,0x30,0x7d
+# GFX11: v_cmpx_u_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x30,0x7d]
-# GFX11: v_cmpx_u_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x30,0x7d]
 0x7d,0x04,0x30,0x7d
+# GFX11: v_cmpx_u_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x30,0x7d]
-# GFX11: v_cmpx_u_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x30,0x7d]
 0x7e,0x04,0x30,0x7d
+# GFX11: v_cmpx_u_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x30,0x7d]
-# GFX11: v_cmpx_u_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x30,0x7d]
 0x7f,0x04,0x30,0x7d
+# GFX11: v_cmpx_u_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x30,0x7d]
-# GFX11: v_cmpx_u_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x30,0x7d]
 0x7c,0x04,0x30,0x7d
+# GFX11: v_cmpx_u_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x30,0x7d]
-# GFX11: v_cmpx_u_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x30,0x7d]
 0xc1,0x04,0x30,0x7d
+# GFX11: v_cmpx_u_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x30,0x7d]
-# GFX11: v_cmpx_u_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x30,0x7d]
 0xf0,0x04,0x30,0x7d
+# GFX11: v_cmpx_u_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x30,0x7d]
-# GFX11: v_cmpx_u_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x30,0x7d]
 0xfd,0x04,0x30,0x7d
+# GFX11: v_cmpx_u_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x30,0x7d]
-# GFX11: v_cmpx_u_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x31,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfe,0x31,0x7d,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_u_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x31,0x7d,0x56,0x34,0x12,0xaf]
-# GFX11: v_cmpx_u_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x50,0x7d]
 0x01,0x05,0x50,0x7d
+# GFX11: v_cmpx_u_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x50,0x7d]
-# GFX11: v_cmpx_u_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x50,0x7d]
 0xfe,0x05,0x50,0x7d
+# GFX11: v_cmpx_u_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x50,0x7d]
-# GFX11: v_cmpx_u_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x50,0x7d]
 0x02,0x04,0x50,0x7d
+# GFX11: v_cmpx_u_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x50,0x7d]
-# GFX11: v_cmpx_u_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x50,0x7d]
 0x68,0x04,0x50,0x7d
+# GFX11: v_cmpx_u_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x50,0x7d]
-# GFX11: v_cmpx_u_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x50,0x7d]
 0x6a,0x04,0x50,0x7d
+# GFX11: v_cmpx_u_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x50,0x7d]
-# GFX11: v_cmpx_u_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x50,0x7d]
 0x7a,0x04,0x50,0x7d
+# GFX11: v_cmpx_u_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x50,0x7d]
-# GFX11: v_cmpx_u_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x50,0x7d]
 0x7e,0x04,0x50,0x7d
+# GFX11: v_cmpx_u_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x50,0x7d]
-# GFX11: v_cmpx_u_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x50,0x7d]
 0x7c,0x04,0x50,0x7d
+# GFX11: v_cmpx_u_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x50,0x7d]
-# GFX11: v_cmpx_u_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x50,0x7d]
 0xc1,0x04,0x50,0x7d
+# GFX11: v_cmpx_u_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x50,0x7d]
-# GFX11: v_cmpx_u_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x50,0x7d]
 0xf0,0x04,0x50,0x7d
+# GFX11: v_cmpx_u_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x50,0x7d]
-# GFX11: v_cmpx_u_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x50,0x7d]
 0xfd,0x04,0x50,0x7d
+# GFX11: v_cmpx_u_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x50,0x7d]
-# GFX11: v_cmpx_u_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x51,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfc,0x51,0x7d,0x56,0x34,0x12,0xaf
+# GFX11: v_cmpx_u_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x51,0x7d,0x56,0x34,0x12,0xaf] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt index d3f92d0358188..1d7e82c8bf96f 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt @@ -1,2608 +1,2609 @@ +# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11 -# GFX11: v_cmpx_class_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_class_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_class_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_class_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_class_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_class_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_class_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_class_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_class_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_class_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_class_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_class_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_class_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_class_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_class_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_class_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_class_f16 v1, v2 
row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_class_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_class_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_class_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_class_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_class_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_class_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0xfa,0x7d,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_class_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_class_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0xfa,0x7d,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_class_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_class_f16 -|v127|, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfa,0x7d,0x7f,0x6f,0x3d,0x30] 0xfa,0xfe,0xfa,0x7d,0x7f,0x6f,0x3d,0x30 +# GFX11: v_cmpx_class_f16 -|v127|, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfa,0x7d,0x7f,0x6f,0x3d,0x30] -# GFX11: v_cmpx_class_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0xfc,0x7d,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_class_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_class_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0xfc,0x7d,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_class_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_class_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0xfc,0x7d,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_class_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_class_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0xfc,0x7d,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_class_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_class_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0xfc,0x7d,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_class_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_class_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0xfc,0x7d,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_class_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0xfc,0x7d,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_class_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0xfc,0x7d,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_class_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_class_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0xfc,0x7d,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_class_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_class_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0xfc,0x7d,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_class_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_class_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0xfc,0x7d,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_class_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_class_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0xfc,0x7d,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_class_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_class_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0xfc,0x7d,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_class_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_class_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0xfc,0x7d,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_class_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_class_f32 -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfd,0x7d,0xff,0x6f,0x3d,0x30] 0xfa,0xfe,0xfd,0x7d,0xff,0x6f,0x3d,0x30 +# GFX11: v_cmpx_class_f32 -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfd,0x7d,0xff,0x6f,0x3d,0x30] -# GFX11: v_cmpx_eq_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_eq_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_eq_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x04,0x7d,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_eq_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_eq_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_eq_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_eq_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_eq_f16 v1, v2 row_half_mirror 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_eq_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_eq_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_eq_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_eq_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_eq_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_eq_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_eq_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_eq_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_eq_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_eq_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_eq_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_eq_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_eq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_eq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_eq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x04,0x7d,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_eq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_eq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x04,0x7d,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_eq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_eq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x04,0x7d,0x7f,0x6f,0xfd,0x30] 0xfa,0xfe,0x04,0x7d,0x7f,0x6f,0xfd,0x30 +# GFX11: v_cmpx_eq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x04,0x7d,0x7f,0x6f,0xfd,0x30] -# GFX11: v_cmpx_eq_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x24,0x7d,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_eq_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_eq_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x24,0x7d,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_eq_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_eq_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x24,0x7d,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_eq_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_eq_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x24,0x7d,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_eq_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_eq_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x24,0x7d,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_eq_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_eq_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x24,0x7d,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_eq_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_eq_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x24,0x7d,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_eq_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_eq_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x24,0x7d,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_eq_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_eq_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x24,0x7d,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_eq_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_eq_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x24,0x7d,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_eq_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_eq_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x24,0x7d,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_eq_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_eq_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x24,0x7d,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_eq_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_eq_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x24,0x7d,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_eq_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_eq_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x25,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0xfe,0x25,0x7d,0xff,0x6f,0xfd,0x30 +# GFX11: v_cmpx_eq_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0xfa,0xfe,0x25,0x7d,0xff,0x6f,0xfd,0x30] -# GFX11: v_cmpx_eq_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_eq_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_eq_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x64,0x7d,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_eq_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_eq_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_eq_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_eq_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_eq_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_eq_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_eq_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_eq_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_eq_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_eq_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_eq_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_eq_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_eq_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_eq_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_eq_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_eq_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_eq_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_eq_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_eq_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_eq_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x64,0x7d,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_eq_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_eq_i16 v1, v2 
row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x64,0x7d,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_eq_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_eq_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x64,0x7d,0x7f,0x6f,0x0d,0x30] 0xfa,0xfe,0x64,0x7d,0x7f,0x6f,0x0d,0x30 +# GFX11: v_cmpx_eq_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x64,0x7d,0x7f,0x6f,0x0d,0x30] -# GFX11: v_cmpx_eq_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x84,0x7d,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_eq_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_eq_i32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x84,0x7d,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_eq_i32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_eq_i32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x84,0x7d,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_eq_i32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_eq_i32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x84,0x7d,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_eq_i32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_eq_i32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x84,0x7d,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_eq_i32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_eq_i32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x84,0x7d,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_eq_i32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_eq_i32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x84,0x7d,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_eq_i32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_eq_i32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x84,0x7d,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_eq_i32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_eq_i32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x84,0x7d,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_eq_i32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_eq_i32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x84,0x7d,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_eq_i32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_eq_i32 v1, v2 row_share:0 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x84,0x7d,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_eq_i32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_eq_i32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x84,0x7d,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_eq_i32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_eq_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x84,0x7d,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_eq_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_eq_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x85,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0x85,0x7d,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmpx_eq_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x85,0x7d,0xff,0x6f,0x0d,0x30] -# GFX11: v_cmpx_eq_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_eq_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_eq_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x74,0x7d,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_eq_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_eq_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_eq_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_eq_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_eq_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_eq_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_eq_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_eq_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_eq_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_eq_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_eq_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_eq_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_eq_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_eq_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x74,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_eq_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_eq_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_eq_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_eq_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_eq_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_eq_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x74,0x7d,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_eq_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_eq_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x74,0x7d,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_eq_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_eq_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x74,0x7d,0x7f,0x6f,0x0d,0x30] 0xfa,0xfe,0x74,0x7d,0x7f,0x6f,0x0d,0x30 +# GFX11: v_cmpx_eq_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x74,0x7d,0x7f,0x6f,0x0d,0x30] -# GFX11: v_cmpx_eq_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x94,0x7d,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_eq_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_eq_u32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x94,0x7d,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_eq_u32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_eq_u32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x94,0x7d,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_eq_u32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_eq_u32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x94,0x7d,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_eq_u32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_eq_u32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x94,0x7d,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_eq_u32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_eq_u32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x94,0x7d,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_eq_u32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_eq_u32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x94,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x94,0x7d,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_eq_u32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_eq_u32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x94,0x7d,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_eq_u32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_eq_u32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x94,0x7d,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_eq_u32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_eq_u32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x94,0x7d,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_eq_u32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_eq_u32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x94,0x7d,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_eq_u32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_eq_u32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x94,0x7d,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_eq_u32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_eq_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x94,0x7d,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_eq_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_eq_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x95,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0x95,0x7d,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmpx_eq_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x95,0x7d,0xff,0x6f,0x0d,0x30] -# GFX11: v_cmpx_f_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x00,0x7d,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_f_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_f_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x00,0x7d,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_f_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_f_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x00,0x7d,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_f_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_f_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x00,0x7d,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_f_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_f_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x00,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x00,0x7d,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_f_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_f_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x00,0x7d,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_f_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_f_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x00,0x7d,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_f_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_f_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x00,0x7d,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_f_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_f_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x00,0x7d,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_f_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_f_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x00,0x7d,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_f_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_f_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x00,0x7d,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_f_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_f_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x00,0x7d,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_f_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_f_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x00,0x7d,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_f_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_f_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x00,0x7d,0x7f,0x6f,0xfd,0x30] 0xfa,0xfe,0x00,0x7d,0x7f,0x6f,0xfd,0x30 +# GFX11: v_cmpx_f_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x00,0x7d,0x7f,0x6f,0xfd,0x30] -# GFX11: v_cmpx_f_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x20,0x7d,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_f_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7d,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_f_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x20,0x7d,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_f_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7d,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_f_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7d,0x01,0x40,0x01,0xff] 
0xfa,0x04,0x20,0x7d,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_f_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7d,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_f_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x20,0x7d,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_f_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7d,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_f_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x20,0x7d,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_f_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7d,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_f_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x20,0x7d,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_f_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7d,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_f_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x20,0x7d,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_f_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7d,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_f_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x20,0x7d,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_f_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7d,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_f_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x20,0x7d,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_f_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7d,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_f_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x20,0x7d,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_f_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7d,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_f_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x20,0x7d,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_f_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7d,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_f_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x20,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x20,0x7d,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_f_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x20,0x7d,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_f_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x20,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x20,0x7d,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_f_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x20,0x7d,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_f_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x21,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0xfe,0x21,0x7d,0xff,0x6f,0xfd,0x30 +# GFX11: v_cmpx_f_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x21,0x7d,0xff,0x6f,0xfd,0x30] -# GFX11: v_cmpx_f_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x80,0x7d,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_f_i32 
v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7d,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_f_i32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x80,0x7d,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_f_i32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7d,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_f_i32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x80,0x7d,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_f_i32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7d,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_f_i32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x80,0x7d,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_f_i32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7d,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_f_i32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x80,0x7d,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_f_i32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7d,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_f_i32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x80,0x7d,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_f_i32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7d,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_f_i32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x80,0x7d,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_f_i32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7d,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_f_i32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x80,0x7d,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_f_i32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7d,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_f_i32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x80,0x7d,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_f_i32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7d,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_f_i32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x80,0x7d,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_f_i32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7d,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_f_i32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x80,0x7d,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_f_i32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x80,0x7d,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_f_i32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x80,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x80,0x7d,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_f_i32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x80,0x7d,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_f_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x80,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x80,0x7d,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_f_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x80,0x7d,0x01,0x60,0x01,0x13] -# 
GFX11: v_cmpx_f_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x81,0x7d,0xff,0x6f,0x0d,0x30]
 0xfa,0xfe,0x81,0x7d,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_f_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x81,0x7d,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_f_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x90,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_f_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_f_u32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x90,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_f_u32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_f_u32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x90,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_f_u32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_f_u32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x90,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_f_u32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_f_u32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x90,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_f_u32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_f_u32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x90,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_f_u32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_f_u32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x90,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_f_u32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_f_u32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x90,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_f_u32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_f_u32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x90,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_f_u32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_f_u32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x90,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_f_u32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_f_u32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x90,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_f_u32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x90,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_f_u32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x90,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x90,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_f_u32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x90,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_f_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x90,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x90,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_f_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x90,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_f_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x91,0x7d,0xff,0x6f,0x0d,0x30]
 0xfa,0xfe,0x91,0x7d,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_f_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x91,0x7d,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_ge_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x0c,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_ge_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_ge_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x0c,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_ge_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_ge_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x0c,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_ge_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_ge_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x0c,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_ge_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_ge_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x0c,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_ge_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_ge_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x0c,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_ge_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_ge_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x0c,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_ge_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_ge_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x0c,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_ge_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_ge_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x0c,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_ge_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_ge_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x0c,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_ge_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_ge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x0c,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_ge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_ge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x0c,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_ge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_ge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x0c,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_ge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_ge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0c,0x7d,0x7f,0x6f,0xfd,0x30]
 0xfa,0xfe,0x0c,0x7d,0x7f,0x6f,0xfd,0x30
+# GFX11: v_cmpx_ge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0c,0x7d,0x7f,0x6f,0xfd,0x30]
-# GFX11: v_cmpx_ge_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x2c,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_ge_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_ge_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x2c,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_ge_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_ge_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x2c,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_ge_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_ge_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x2c,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_ge_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_ge_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x2c,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_ge_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_ge_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x2c,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_ge_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_ge_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x2c,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_ge_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_ge_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x2c,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_ge_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_ge_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x2c,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_ge_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_ge_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x2c,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_ge_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_ge_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x2c,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_ge_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_ge_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x2c,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_ge_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_ge_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x2c,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_ge_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_ge_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x2d,0x7d,0xff,0x6f,0xfd,0x30]
 0xfa,0xfe,0x2d,0x7d,0xff,0x6f,0xfd,0x30
+# GFX11: v_cmpx_ge_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x2d,0x7d,0xff,0x6f,0xfd,0x30]
-# GFX11: v_cmpx_ge_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x6c,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_ge_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_ge_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x6c,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_ge_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_ge_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x6c,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_ge_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_ge_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x6c,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_ge_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_ge_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x6c,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_ge_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_ge_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x6c,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_ge_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_ge_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x6c,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_ge_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_ge_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x6c,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_ge_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_ge_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x6c,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_ge_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_ge_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x6c,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_ge_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_ge_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x6c,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_ge_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_ge_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x6c,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_ge_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_ge_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x6c,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_ge_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_ge_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6c,0x7d,0x7f,0x6f,0x0d,0x30]
 0xfa,0xfe,0x6c,0x7d,0x7f,0x6f,0x0d,0x30
+# GFX11: v_cmpx_ge_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6c,0x7d,0x7f,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_ge_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x8c,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_ge_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_ge_i32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x8c,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_ge_i32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_ge_i32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x8c,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_ge_i32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_ge_i32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x8c,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_ge_i32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_ge_i32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x8c,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_ge_i32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_ge_i32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x8c,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_ge_i32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_ge_i32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x8c,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_ge_i32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_ge_i32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x8c,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_ge_i32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_ge_i32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x8c,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_ge_i32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_ge_i32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x8c,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_ge_i32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_ge_i32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x8c,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_ge_i32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_ge_i32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x8c,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_ge_i32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_ge_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x8c,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_ge_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_ge_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x8d,0x7d,0xff,0x6f,0x0d,0x30]
 0xfa,0xfe,0x8d,0x7d,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_ge_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x8d,0x7d,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_ge_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x7c,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_ge_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_ge_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x7c,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_ge_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_ge_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x7c,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_ge_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_ge_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x7c,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_ge_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_ge_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x7c,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_ge_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_ge_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x7c,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_ge_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_ge_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x7c,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_ge_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_ge_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x7c,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_ge_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_ge_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x7c,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_ge_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_ge_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x7c,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_ge_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_ge_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x7c,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_ge_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_ge_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x7c,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_ge_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_ge_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x7c,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_ge_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_ge_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x7c,0x7d,0x7f,0x6f,0x0d,0x30]
 0xfa,0xfe,0x7c,0x7d,0x7f,0x6f,0x0d,0x30
+# GFX11: v_cmpx_ge_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x7c,0x7d,0x7f,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_ge_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x9c,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_ge_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_ge_u32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x9c,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_ge_u32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_ge_u32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x9c,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_ge_u32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_ge_u32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x9c,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_ge_u32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_ge_u32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x9c,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_ge_u32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_ge_u32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x9c,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_ge_u32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_ge_u32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x9c,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_ge_u32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_ge_u32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x9c,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_ge_u32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_ge_u32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x9c,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_ge_u32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_ge_u32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x9c,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_ge_u32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_ge_u32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x9c,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_ge_u32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_ge_u32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x9c,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_ge_u32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_ge_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x9c,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_ge_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_ge_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x9d,0x7d,0xff,0x6f,0x0d,0x30]
 0xfa,0xfe,0x9d,0x7d,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_ge_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x9d,0x7d,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_gt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x08,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_gt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_gt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x08,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_gt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_gt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x08,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_gt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_gt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x08,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_gt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_gt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x08,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_gt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_gt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x08,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_gt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_gt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x08,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_gt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_gt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x08,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_gt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_gt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x08,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_gt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_gt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x08,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_gt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_gt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x08,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_gt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_gt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x08,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_gt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_gt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x08,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_gt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_gt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x08,0x7d,0x7f,0x6f,0xfd,0x30]
 0xfa,0xfe,0x08,0x7d,0x7f,0x6f,0xfd,0x30
+# GFX11: v_cmpx_gt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x08,0x7d,0x7f,0x6f,0xfd,0x30]
-# GFX11: v_cmpx_gt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x28,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_gt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_gt_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x28,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_gt_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_gt_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x28,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_gt_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_gt_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x28,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_gt_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_gt_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x28,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_gt_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_gt_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x28,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_gt_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_gt_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x28,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_gt_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_gt_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x28,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_gt_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_gt_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x28,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_gt_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_gt_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x28,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_gt_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_gt_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x28,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_gt_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_gt_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x28,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_gt_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_gt_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x28,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_gt_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_gt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x29,0x7d,0xff,0x6f,0xfd,0x30]
 0xfa,0xfe,0x29,0x7d,0xff,0x6f,0xfd,0x30
+# GFX11: v_cmpx_gt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x29,0x7d,0xff,0x6f,0xfd,0x30]
-# GFX11: v_cmpx_gt_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x68,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_gt_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_gt_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x68,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_gt_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_gt_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x68,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_gt_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_gt_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x68,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_gt_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_gt_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x68,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_gt_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_gt_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x68,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_gt_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_gt_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x68,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_gt_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_gt_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x68,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_gt_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_gt_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x68,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_gt_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_gt_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x68,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_gt_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_gt_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x68,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_gt_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_gt_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x68,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_gt_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_gt_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x68,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_gt_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_gt_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x68,0x7d,0x7f,0x6f,0x0d,0x30]
 0xfa,0xfe,0x68,0x7d,0x7f,0x6f,0x0d,0x30
+# GFX11: v_cmpx_gt_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x68,0x7d,0x7f,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_gt_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x88,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_gt_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_gt_i32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x88,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_gt_i32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_gt_i32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x88,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_gt_i32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_gt_i32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x88,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_gt_i32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_gt_i32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x88,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_gt_i32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_gt_i32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x88,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_gt_i32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_gt_i32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x88,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_gt_i32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_gt_i32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x88,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_gt_i32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_gt_i32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x88,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_gt_i32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_gt_i32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x88,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_gt_i32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_gt_i32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x88,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_gt_i32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_gt_i32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x88,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_gt_i32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_gt_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x88,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_gt_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_gt_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x89,0x7d,0xff,0x6f,0x0d,0x30]
 0xfa,0xfe,0x89,0x7d,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_gt_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x89,0x7d,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_gt_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x78,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_gt_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_gt_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x78,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_gt_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_gt_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x78,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_gt_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_gt_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x78,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_gt_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_gt_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x78,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_gt_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_gt_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x78,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_gt_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_gt_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x78,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_gt_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_gt_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x78,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_gt_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_gt_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x78,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_gt_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_gt_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x78,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_gt_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_gt_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x78,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_gt_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_gt_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x78,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_gt_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_gt_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x78,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_gt_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_gt_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x78,0x7d,0x7f,0x6f,0x0d,0x30]
 0xfa,0xfe,0x78,0x7d,0x7f,0x6f,0x0d,0x30
+# GFX11: v_cmpx_gt_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x78,0x7d,0x7f,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_gt_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x98,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_gt_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_gt_u32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x98,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_gt_u32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_gt_u32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x98,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_gt_u32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_gt_u32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x98,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_gt_u32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_gt_u32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x98,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_gt_u32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_gt_u32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x98,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_gt_u32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_gt_u32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x98,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_gt_u32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_gt_u32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x98,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_gt_u32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_gt_u32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x98,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_gt_u32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_gt_u32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x98,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_gt_u32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_gt_u32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x98,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_gt_u32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_gt_u32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x98,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_gt_u32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_gt_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x98,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_gt_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_gt_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x99,0x7d,0xff,0x6f,0x0d,0x30]
 0xfa,0xfe,0x99,0x7d,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_gt_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x99,0x7d,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_le_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x06,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_le_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_le_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x06,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_le_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_le_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x06,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_le_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_le_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x06,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_le_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_le_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x06,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_le_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_le_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x06,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_le_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_le_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x06,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_le_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_le_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x06,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_le_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_le_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x06,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_le_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_le_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x06,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_le_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_le_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x06,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_le_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_le_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x06,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_le_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_le_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x06,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_le_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_le_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x06,0x7d,0x7f,0x6f,0xfd,0x30]
 0xfa,0xfe,0x06,0x7d,0x7f,0x6f,0xfd,0x30
+# GFX11: v_cmpx_le_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x06,0x7d,0x7f,0x6f,0xfd,0x30]
-# GFX11: v_cmpx_le_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x26,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_le_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_le_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x26,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_le_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_le_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x26,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_le_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_le_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x26,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_le_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_le_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x26,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_le_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_le_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x26,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_le_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_le_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x26,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_le_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_le_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x26,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_le_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_le_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x26,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_le_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_le_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x26,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_le_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_le_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x26,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_le_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_le_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x26,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_le_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_le_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x26,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_le_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_le_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x27,0x7d,0xff,0x6f,0xfd,0x30]
 0xfa,0xfe,0x27,0x7d,0xff,0x6f,0xfd,0x30
+# GFX11: v_cmpx_le_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x27,0x7d,0xff,0x6f,0xfd,0x30]
-# GFX11: v_cmpx_le_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x66,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_le_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_le_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x66,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_le_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_le_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x66,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_le_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_le_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x66,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_le_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_le_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x66,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_le_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_le_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x66,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_le_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_le_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x66,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_le_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_le_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x66,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_le_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_le_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x66,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_le_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_le_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x66,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_le_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_le_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x66,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_le_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_le_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x66,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_le_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_le_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x66,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_le_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_le_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x66,0x7d,0x7f,0x6f,0x0d,0x30]
 0xfa,0xfe,0x66,0x7d,0x7f,0x6f,0x0d,0x30
+# GFX11: v_cmpx_le_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x66,0x7d,0x7f,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_le_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x86,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_le_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_le_i32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x86,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_le_i32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_le_i32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x86,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_le_i32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_le_i32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x86,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_le_i32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_le_i32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x86,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_le_i32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_le_i32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x86,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_le_i32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_le_i32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x86,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_le_i32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_le_i32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x86,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_le_i32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_le_i32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x86,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_le_i32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_le_i32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x86,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_le_i32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_le_i32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x86,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_le_i32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_le_i32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x86,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_le_i32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_le_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x86,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_le_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_le_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x87,0x7d,0xff,0x6f,0x0d,0x30]
 0xfa,0xfe,0x87,0x7d,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_le_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x87,0x7d,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_le_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x76,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_le_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_le_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x76,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_le_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_le_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x76,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_le_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_le_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x76,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_le_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_le_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x76,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_le_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_le_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x76,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_le_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_le_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x76,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_le_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_le_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x76,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_le_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_le_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x76,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_le_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_le_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x76,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_le_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_le_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x76,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_le_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_le_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x76,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_le_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_le_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x76,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_le_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_le_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x76,0x7d,0x7f,0x6f,0x0d,0x30]
 0xfa,0xfe,0x76,0x7d,0x7f,0x6f,0x0d,0x30
+# GFX11: v_cmpx_le_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x76,0x7d,0x7f,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_le_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x1b,0x00,0xff]
0xfa,0x04,0x96,0x7d,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_le_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_le_u32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x96,0x7d,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_le_u32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_le_u32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x96,0x7d,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_le_u32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_le_u32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x96,0x7d,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_le_u32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_le_u32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x96,0x7d,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_le_u32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_le_u32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x96,0x7d,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_le_u32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_le_u32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x96,0x7d,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_le_u32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_le_u32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x96,0x7d,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_le_u32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_le_u32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x96,0x7d,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_le_u32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_le_u32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x96,0x7d,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_le_u32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_le_u32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x96,0x7d,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_le_u32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_le_u32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x96,0x7d,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_le_u32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_le_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x96,0x7d,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_le_u32 v1, v2 row_xmask:0 
row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_le_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x97,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0x97,0x7d,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmpx_le_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x97,0x7d,0xff,0x6f,0x0d,0x30] -# GFX11: v_cmpx_lg_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_lg_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_lg_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_lg_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_lg_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_lg_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_lg_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_lg_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_lg_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_lg_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_lg_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_lg_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_lg_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_lg_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_lg_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_lg_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_lg_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_lg_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_lg_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_lg_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_lg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_lg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_lg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x7d,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_lg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_lg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x7d,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_lg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_lg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0a,0x7d,0x7f,0x6f,0xfd,0x30] 0xfa,0xfe,0x0a,0x7d,0x7f,0x6f,0xfd,0x30 +# GFX11: v_cmpx_lg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0a,0x7d,0x7f,0x6f,0xfd,0x30] -# GFX11: v_cmpx_lg_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x2a,0x7d,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_lg_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_lg_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x2a,0x7d,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_lg_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_lg_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x2a,0x7d,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_lg_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_lg_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x2a,0x7d,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_lg_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_lg_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x2a,0x7d,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_lg_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_lg_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x2a,0x7d,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_lg_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_lg_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x2a,0x7d,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_lg_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_lg_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x2a,0x7d,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_lg_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_lg_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x2a,0x7d,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_lg_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x2a,0x7d,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_lg_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x2a,0x7d,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_lg_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_lg_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x2a,0x7d,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_lg_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_lg_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x2a,0x7d,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_lg_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_lg_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x2a,0x7d,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_lg_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_lg_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x2b,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0xfe,0x2b,0x7d,0xff,0x6f,0xfd,0x30 +# GFX11: v_cmpx_lg_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x2b,0x7d,0xff,0x6f,0xfd,0x30] -# GFX11: v_cmpx_lt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x02,0x7d,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_lt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_lt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x02,0x7d,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_lt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_lt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x02,0x7d,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_lt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_lt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x02,0x7d,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_lt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_lt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x02,0x7d,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_lt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_lt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x02,0x7d,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_lt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_lt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x02,0x7d,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_lt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x02,0x7d,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_lt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x02,0x7d,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_lt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_lt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x02,0x7d,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_lt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_lt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x02,0x7d,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_lt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_lt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x02,0x7d,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_lt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_lt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x02,0x7d,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_lt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_lt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x02,0x7d,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_lt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_lt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x02,0x7d,0x7f,0x6f,0xfd,0x30] 0xfa,0xfe,0x02,0x7d,0x7f,0x6f,0xfd,0x30 +# GFX11: v_cmpx_lt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x02,0x7d,0x7f,0x6f,0xfd,0x30] -# GFX11: v_cmpx_lt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x22,0x7d,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_lt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_lt_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x22,0x7d,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_lt_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_lt_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x22,0x7d,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_lt_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_lt_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x22,0x7d,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_lt_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_lt_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x22,0x7d,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_lt_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x22,0x7d,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_lt_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x22,0x7d,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_lt_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_lt_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x22,0x7d,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_lt_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_lt_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x22,0x7d,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_lt_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_lt_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x22,0x7d,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_lt_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_lt_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x22,0x7d,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_lt_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_lt_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x22,0x7d,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_lt_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_lt_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x22,0x7d,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_lt_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_lt_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x22,0x7d,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_lt_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_lt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x23,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0xfe,0x23,0x7d,0xff,0x6f,0xfd,0x30 +# GFX11: v_cmpx_lt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x23,0x7d,0xff,0x6f,0xfd,0x30] -# GFX11: v_cmpx_lt_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_lt_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_lt_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x62,0x7d,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_lt_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_lt_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_lt_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x62,0x7d,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_lt_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_lt_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_lt_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_lt_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_lt_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_lt_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_lt_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_lt_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_lt_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_lt_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_lt_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_lt_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_lt_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_lt_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_lt_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_lt_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_lt_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x62,0x7d,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_lt_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_lt_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x62,0x7d,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_lt_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_lt_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x62,0x7d,0x7f,0x6f,0x0d,0x30] 0xfa,0xfe,0x62,0x7d,0x7f,0x6f,0x0d,0x30 +# GFX11: v_cmpx_lt_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x62,0x7d,0x7f,0x6f,0x0d,0x30] -# GFX11: v_cmpx_lt_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x82,0x7d,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_lt_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x1b,0x00,0xff] -# 
GFX11: v_cmpx_lt_i32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x82,0x7d,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_lt_i32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_lt_i32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x82,0x7d,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_lt_i32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_lt_i32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x82,0x7d,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_lt_i32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_lt_i32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x82,0x7d,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_lt_i32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_lt_i32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x82,0x7d,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_lt_i32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_lt_i32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x82,0x7d,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_lt_i32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_lt_i32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x82,0x7d,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_lt_i32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_lt_i32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x82,0x7d,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_lt_i32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_lt_i32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x82,0x7d,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_lt_i32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_lt_i32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x82,0x7d,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_lt_i32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_lt_i32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x82,0x7d,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_lt_i32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_lt_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x82,0x7d,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_lt_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_lt_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; 
encoding: [0xfa,0xfe,0x83,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0x83,0x7d,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmpx_lt_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x83,0x7d,0xff,0x6f,0x0d,0x30] -# GFX11: v_cmpx_lt_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_lt_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_lt_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x72,0x7d,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_lt_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_lt_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_lt_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_lt_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_lt_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_lt_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_lt_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_lt_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_lt_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_lt_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_lt_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_lt_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_lt_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_lt_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_lt_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_lt_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_lt_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_lt_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_lt_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_lt_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x5f,0x01,0x01] 
0xfa,0x04,0x72,0x7d,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_lt_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_lt_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x72,0x7d,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_lt_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_lt_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x72,0x7d,0x7f,0x6f,0x0d,0x30] 0xfa,0xfe,0x72,0x7d,0x7f,0x6f,0x0d,0x30 +# GFX11: v_cmpx_lt_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x72,0x7d,0x7f,0x6f,0x0d,0x30] -# GFX11: v_cmpx_lt_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x92,0x7d,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_lt_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_lt_u32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x92,0x7d,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_lt_u32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_lt_u32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x92,0x7d,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_lt_u32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_lt_u32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x92,0x7d,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_lt_u32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_lt_u32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x92,0x7d,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_lt_u32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_lt_u32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x92,0x7d,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_lt_u32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_lt_u32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x92,0x7d,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_lt_u32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_lt_u32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x92,0x7d,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_lt_u32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_lt_u32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x92,0x7d,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_lt_u32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_lt_u32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x2f,0x01,0xff] 
0xfa,0x04,0x92,0x7d,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_lt_u32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_lt_u32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x92,0x7d,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_lt_u32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_lt_u32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x92,0x7d,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_lt_u32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_lt_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x92,0x7d,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_lt_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_lt_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x93,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0x93,0x7d,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmpx_lt_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x93,0x7d,0xff,0x6f,0x0d,0x30] -# GFX11: v_cmpx_ne_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_ne_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_ne_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_ne_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_ne_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_ne_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_ne_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_ne_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_ne_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_ne_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_ne_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_ne_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_ne_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_ne_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_ne_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x1f,0x01,0xff] 
0xfa,0x04,0x6a,0x7d,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_ne_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_ne_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_ne_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_ne_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_ne_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_ne_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_ne_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_ne_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x6a,0x7d,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_ne_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_ne_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x6a,0x7d,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_ne_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_ne_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6a,0x7d,0x7f,0x6f,0x0d,0x30] 0xfa,0xfe,0x6a,0x7d,0x7f,0x6f,0x0d,0x30 +# GFX11: v_cmpx_ne_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6a,0x7d,0x7f,0x6f,0x0d,0x30] -# GFX11: v_cmpx_ne_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x8a,0x7d,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_ne_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_ne_i32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x8a,0x7d,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_ne_i32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_ne_i32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x8a,0x7d,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_ne_i32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_ne_i32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x8a,0x7d,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_ne_i32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_ne_i32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x8a,0x7d,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_ne_i32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_ne_i32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x0f,0x01,0xff] 
0xfa,0x04,0x8a,0x7d,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_ne_i32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_ne_i32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x8a,0x7d,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_ne_i32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_ne_i32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x8a,0x7d,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_ne_i32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_ne_i32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x8a,0x7d,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_ne_i32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_ne_i32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x8a,0x7d,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_ne_i32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_ne_i32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x8a,0x7d,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_ne_i32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_ne_i32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x8a,0x7d,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_ne_i32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_ne_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x8a,0x7d,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_ne_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_ne_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x8b,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0x8b,0x7d,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmpx_ne_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x8b,0x7d,0xff,0x6f,0x0d,0x30] -# GFX11: v_cmpx_ne_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x7a,0x7d,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_ne_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_ne_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x7a,0x7d,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_ne_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_ne_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x7a,0x7d,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_ne_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_ne_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x41,0x01,0xff] 
0xfa,0x04,0x7a,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_ne_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_ne_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x01,0x01,0xff]
0xfa,0x04,0x7a,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_ne_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_ne_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x0f,0x01,0xff]
0xfa,0x04,0x7a,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_ne_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_ne_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x11,0x01,0xff]
0xfa,0x04,0x7a,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_ne_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_ne_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x1f,0x01,0xff]
0xfa,0x04,0x7a,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_ne_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_ne_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x21,0x01,0xff]
0xfa,0x04,0x7a,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_ne_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_ne_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x2f,0x01,0xff]
0xfa,0x04,0x7a,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_ne_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_ne_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x50,0x01,0xff]
0xfa,0x04,0x7a,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_ne_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_ne_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x5f,0x01,0x01]
0xfa,0x04,0x7a,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_ne_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_ne_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x60,0x01,0x13]
0xfa,0x04,0x7a,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_ne_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_ne_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x7a,0x7d,0x7f,0x6f,0x0d,0x30]
0xfa,0xfe,0x7a,0x7d,0x7f,0x6f,0x0d,0x30
+# GFX11: v_cmpx_ne_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x7a,0x7d,0x7f,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_ne_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x1b,0x00,0xff]
0xfa,0x04,0x9a,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_ne_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_ne_u32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0xe4,0x00,0xff]
0xfa,0x04,0x9a,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_ne_u32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_ne_u32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x40,0x01,0xff]
0xfa,0x04,0x9a,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_ne_u32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_ne_u32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x41,0x01,0xff]
0xfa,0x04,0x9a,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_ne_u32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_ne_u32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x01,0x01,0xff]
0xfa,0x04,0x9a,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_ne_u32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_ne_u32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x0f,0x01,0xff]
0xfa,0x04,0x9a,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_ne_u32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_ne_u32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x11,0x01,0xff]
0xfa,0x04,0x9a,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_ne_u32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_ne_u32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x1f,0x01,0xff]
0xfa,0x04,0x9a,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_ne_u32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_ne_u32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x21,0x01,0xff]
0xfa,0x04,0x9a,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_ne_u32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_ne_u32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x2f,0x01,0xff]
0xfa,0x04,0x9a,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_ne_u32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_ne_u32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x50,0x01,0xff]
0xfa,0x04,0x9a,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_ne_u32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_ne_u32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x5f,0x01,0x01]
0xfa,0x04,0x9a,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_ne_u32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_ne_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x60,0x01,0x13]
0xfa,0x04,0x9a,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_ne_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_ne_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x9b,0x7d,0xff,0x6f,0x0d,0x30]
0xfa,0xfe,0x9b,0x7d,0xff,0x6f,0x0d,0x30
+# GFX11: v_cmpx_ne_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x9b,0x7d,0xff,0x6f,0x0d,0x30]
-# GFX11: v_cmpx_neq_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1b,0x00,0xff]
0xfa,0x04,0x1a,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_neq_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_neq_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0xe4,0x00,0xff]
0xfa,0x04,0x1a,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_neq_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_neq_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x40,0x01,0xff]
0xfa,0x04,0x1a,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_neq_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_neq_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x41,0x01,0xff]
0xfa,0x04,0x1a,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_neq_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_neq_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x01,0x01,0xff]
0xfa,0x04,0x1a,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_neq_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_neq_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x0f,0x01,0xff]
0xfa,0x04,0x1a,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_neq_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_neq_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x11,0x01,0xff]
0xfa,0x04,0x1a,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_neq_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_neq_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1f,0x01,0xff]
0xfa,0x04,0x1a,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_neq_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_neq_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x21,0x01,0xff]
0xfa,0x04,0x1a,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_neq_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_neq_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x2f,0x01,0xff]
0xfa,0x04,0x1a,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_neq_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_neq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x50,0x01,0xff]
0xfa,0x04,0x1a,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_neq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_neq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x5f,0x01,0x01]
0xfa,0x04,0x1a,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_neq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_neq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x60,0x01,0x13]
0xfa,0x04,0x1a,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_neq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_neq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1a,0x7d,0x7f,0x6f,0xfd,0x30]
0xfa,0xfe,0x1a,0x7d,0x7f,0x6f,0xfd,0x30
+# GFX11: v_cmpx_neq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1a,0x7d,0x7f,0x6f,0xfd,0x30]
-# GFX11: v_cmpx_neq_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x1b,0x00,0xff]
0xfa,0x04,0x3a,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_neq_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_neq_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0xe4,0x00,0xff]
0xfa,0x04,0x3a,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_neq_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_neq_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x40,0x01,0xff]
0xfa,0x04,0x3a,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_neq_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_neq_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x41,0x01,0xff]
0xfa,0x04,0x3a,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_neq_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_neq_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x01,0x01,0xff]
0xfa,0x04,0x3a,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_neq_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_neq_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x0f,0x01,0xff]
0xfa,0x04,0x3a,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_neq_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_neq_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x11,0x01,0xff]
0xfa,0x04,0x3a,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_neq_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_neq_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x1f,0x01,0xff]
0xfa,0x04,0x3a,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_neq_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_neq_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x21,0x01,0xff]
0xfa,0x04,0x3a,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_neq_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_neq_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x2f,0x01,0xff]
0xfa,0x04,0x3a,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_neq_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_neq_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x50,0x01,0xff]
0xfa,0x04,0x3a,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_neq_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_neq_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x5f,0x01,0x01]
0xfa,0x04,0x3a,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_neq_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_neq_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x60,0x01,0x13]
0xfa,0x04,0x3a,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_neq_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_neq_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x3b,0x7d,0xff,0x6f,0xfd,0x30]
0xfa,0xfe,0x3b,0x7d,0xff,0x6f,0xfd,0x30
+# GFX11: v_cmpx_neq_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x3b,0x7d,0xff,0x6f,0xfd,0x30]
-# GFX11: v_cmpx_nge_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1b,0x00,0xff]
0xfa,0x04,0x12,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_nge_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_nge_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0xe4,0x00,0xff]
0xfa,0x04,0x12,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_nge_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_nge_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x40,0x01,0xff]
0xfa,0x04,0x12,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_nge_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_nge_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x41,0x01,0xff]
0xfa,0x04,0x12,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_nge_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_nge_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x01,0x01,0xff]
0xfa,0x04,0x12,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_nge_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_nge_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x0f,0x01,0xff]
0xfa,0x04,0x12,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_nge_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_nge_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x11,0x01,0xff]
0xfa,0x04,0x12,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_nge_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_nge_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1f,0x01,0xff]
0xfa,0x04,0x12,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_nge_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_nge_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x21,0x01,0xff]
0xfa,0x04,0x12,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_nge_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_nge_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x2f,0x01,0xff]
0xfa,0x04,0x12,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_nge_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_nge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x50,0x01,0xff]
0xfa,0x04,0x12,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_nge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_nge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x5f,0x01,0x01]
0xfa,0x04,0x12,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_nge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_nge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x60,0x01,0x13]
0xfa,0x04,0x12,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_nge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_nge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x12,0x7d,0x7f,0x6f,0xfd,0x30]
0xfa,0xfe,0x12,0x7d,0x7f,0x6f,0xfd,0x30
+# GFX11: v_cmpx_nge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x12,0x7d,0x7f,0x6f,0xfd,0x30]
-# GFX11: v_cmpx_nge_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x1b,0x00,0xff]
0xfa,0x04,0x32,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_nge_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_nge_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0xe4,0x00,0xff]
0xfa,0x04,0x32,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_nge_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_nge_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x40,0x01,0xff]
0xfa,0x04,0x32,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_nge_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_nge_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x41,0x01,0xff]
0xfa,0x04,0x32,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_nge_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_nge_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x01,0x01,0xff]
0xfa,0x04,0x32,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_nge_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_nge_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x0f,0x01,0xff]
0xfa,0x04,0x32,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_nge_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_nge_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x11,0x01,0xff]
0xfa,0x04,0x32,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_nge_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_nge_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x1f,0x01,0xff]
0xfa,0x04,0x32,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_nge_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_nge_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x21,0x01,0xff]
0xfa,0x04,0x32,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_nge_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_nge_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x2f,0x01,0xff]
0xfa,0x04,0x32,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_nge_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_nge_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x50,0x01,0xff]
0xfa,0x04,0x32,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_nge_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_nge_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x5f,0x01,0x01]
0xfa,0x04,0x32,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_nge_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_nge_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x60,0x01,0x13]
0xfa,0x04,0x32,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_nge_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_nge_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x33,0x7d,0xff,0x6f,0xfd,0x30]
0xfa,0xfe,0x33,0x7d,0xff,0x6f,0xfd,0x30
+# GFX11: v_cmpx_nge_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x33,0x7d,0xff,0x6f,0xfd,0x30]
-# GFX11: v_cmpx_ngt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1b,0x00,0xff]
0xfa,0x04,0x16,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_ngt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_ngt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0xe4,0x00,0xff]
0xfa,0x04,0x16,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_ngt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_ngt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x40,0x01,0xff]
0xfa,0x04,0x16,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_ngt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_ngt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x41,0x01,0xff]
0xfa,0x04,0x16,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_ngt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_ngt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x01,0x01,0xff]
0xfa,0x04,0x16,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_ngt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_ngt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x0f,0x01,0xff]
0xfa,0x04,0x16,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_ngt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_ngt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x11,0x01,0xff]
0xfa,0x04,0x16,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_ngt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_ngt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1f,0x01,0xff]
0xfa,0x04,0x16,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_ngt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_ngt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x21,0x01,0xff]
0xfa,0x04,0x16,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_ngt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_ngt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x2f,0x01,0xff]
0xfa,0x04,0x16,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_ngt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_ngt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x50,0x01,0xff]
0xfa,0x04,0x16,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_ngt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_ngt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x5f,0x01,0x01]
0xfa,0x04,0x16,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_ngt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_ngt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x60,0x01,0x13]
0xfa,0x04,0x16,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_ngt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_ngt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x16,0x7d,0x7f,0x6f,0xfd,0x30]
0xfa,0xfe,0x16,0x7d,0x7f,0x6f,0xfd,0x30
+# GFX11: v_cmpx_ngt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x16,0x7d,0x7f,0x6f,0xfd,0x30]
-# GFX11: v_cmpx_ngt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x1b,0x00,0xff]
0xfa,0x04,0x36,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_ngt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_ngt_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0xe4,0x00,0xff]
0xfa,0x04,0x36,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_ngt_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_ngt_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x40,0x01,0xff]
0xfa,0x04,0x36,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_ngt_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_ngt_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x41,0x01,0xff]
0xfa,0x04,0x36,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_ngt_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_ngt_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x01,0x01,0xff]
0xfa,0x04,0x36,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_ngt_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_ngt_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x0f,0x01,0xff]
0xfa,0x04,0x36,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_ngt_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_ngt_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x11,0x01,0xff]
0xfa,0x04,0x36,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_ngt_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_ngt_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x1f,0x01,0xff]
0xfa,0x04,0x36,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_ngt_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_ngt_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x21,0x01,0xff]
0xfa,0x04,0x36,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_ngt_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_ngt_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x2f,0x01,0xff]
0xfa,0x04,0x36,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_ngt_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_ngt_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x50,0x01,0xff]
0xfa,0x04,0x36,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_ngt_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_ngt_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x5f,0x01,0x01]
0xfa,0x04,0x36,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_ngt_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_ngt_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x60,0x01,0x13]
0xfa,0x04,0x36,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_ngt_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_ngt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x37,0x7d,0xff,0x6f,0xfd,0x30]
0xfa,0xfe,0x37,0x7d,0xff,0x6f,0xfd,0x30
+# GFX11: v_cmpx_ngt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x37,0x7d,0xff,0x6f,0xfd,0x30]
-# GFX11: v_cmpx_nle_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1b,0x00,0xff]
0xfa,0x04,0x18,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_nle_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_nle_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0xe4,0x00,0xff]
0xfa,0x04,0x18,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_nle_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_nle_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x40,0x01,0xff]
0xfa,0x04,0x18,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_nle_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_nle_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x41,0x01,0xff]
0xfa,0x04,0x18,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_nle_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_nle_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x01,0x01,0xff]
0xfa,0x04,0x18,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_nle_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_nle_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x0f,0x01,0xff]
0xfa,0x04,0x18,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_nle_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_nle_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x11,0x01,0xff]
0xfa,0x04,0x18,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_nle_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_nle_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1f,0x01,0xff]
0xfa,0x04,0x18,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_nle_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_nle_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x21,0x01,0xff]
0xfa,0x04,0x18,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_nle_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_nle_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x2f,0x01,0xff]
0xfa,0x04,0x18,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_nle_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_nle_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x50,0x01,0xff]
0xfa,0x04,0x18,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_nle_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_nle_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x5f,0x01,0x01]
0xfa,0x04,0x18,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_nle_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_nle_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x60,0x01,0x13]
0xfa,0x04,0x18,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_nle_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_nle_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x18,0x7d,0x7f,0x6f,0xfd,0x30]
0xfa,0xfe,0x18,0x7d,0x7f,0x6f,0xfd,0x30
+# GFX11: v_cmpx_nle_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x18,0x7d,0x7f,0x6f,0xfd,0x30]
-# GFX11: v_cmpx_nle_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x1b,0x00,0xff]
0xfa,0x04,0x38,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_nle_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_nle_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0xe4,0x00,0xff]
0xfa,0x04,0x38,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_nle_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_nle_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x40,0x01,0xff]
0xfa,0x04,0x38,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_nle_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_nle_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x41,0x01,0xff]
0xfa,0x04,0x38,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_nle_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_nle_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x01,0x01,0xff]
0xfa,0x04,0x38,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_nle_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_nle_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x0f,0x01,0xff]
0xfa,0x04,0x38,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_nle_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_nle_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x11,0x01,0xff]
0xfa,0x04,0x38,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_nle_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_nle_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x1f,0x01,0xff]
0xfa,0x04,0x38,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_nle_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_nle_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x21,0x01,0xff]
0xfa,0x04,0x38,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_nle_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_nle_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x2f,0x01,0xff]
0xfa,0x04,0x38,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_nle_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_nle_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x50,0x01,0xff]
0xfa,0x04,0x38,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_nle_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_nle_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x5f,0x01,0x01]
0xfa,0x04,0x38,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_nle_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_nle_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x60,0x01,0x13]
0xfa,0x04,0x38,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_nle_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_nle_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x39,0x7d,0xff,0x6f,0xfd,0x30]
0xfa,0xfe,0x39,0x7d,0xff,0x6f,0xfd,0x30
+# GFX11: v_cmpx_nle_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x39,0x7d,0xff,0x6f,0xfd,0x30]
-# GFX11: v_cmpx_nlg_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1b,0x00,0xff]
0xfa,0x04,0x14,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_nlg_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_nlg_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0xe4,0x00,0xff]
0xfa,0x04,0x14,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_nlg_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_nlg_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x40,0x01,0xff]
0xfa,0x04,0x14,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_nlg_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_nlg_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x41,0x01,0xff]
0xfa,0x04,0x14,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_nlg_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_nlg_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x01,0x01,0xff]
0xfa,0x04,0x14,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_nlg_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_nlg_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x0f,0x01,0xff]
0xfa,0x04,0x14,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_nlg_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_nlg_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x11,0x01,0xff]
0xfa,0x04,0x14,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_nlg_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_nlg_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1f,0x01,0xff]
0xfa,0x04,0x14,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_nlg_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_nlg_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x21,0x01,0xff]
0xfa,0x04,0x14,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_nlg_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_nlg_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x2f,0x01,0xff]
0xfa,0x04,0x14,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_nlg_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_nlg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x50,0x01,0xff]
0xfa,0x04,0x14,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_nlg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_nlg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x5f,0x01,0x01]
0xfa,0x04,0x14,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_nlg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_nlg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x60,0x01,0x13]
0xfa,0x04,0x14,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_nlg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_nlg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x14,0x7d,0x7f,0x6f,0xfd,0x30]
0xfa,0xfe,0x14,0x7d,0x7f,0x6f,0xfd,0x30
+# GFX11: v_cmpx_nlg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x14,0x7d,0x7f,0x6f,0xfd,0x30]
-# GFX11: v_cmpx_nlg_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x1b,0x00,0xff]
0xfa,0x04,0x34,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_nlg_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_nlg_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0xe4,0x00,0xff]
0xfa,0x04,0x34,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_nlg_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_nlg_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x40,0x01,0xff]
0xfa,0x04,0x34,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_nlg_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_nlg_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x41,0x01,0xff]
0xfa,0x04,0x34,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_nlg_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_nlg_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x01,0x01,0xff]
0xfa,0x04,0x34,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_nlg_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_nlg_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x0f,0x01,0xff]
0xfa,0x04,0x34,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_nlg_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_nlg_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x11,0x01,0xff]
0xfa,0x04,0x34,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_nlg_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_nlg_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x1f,0x01,0xff]
0xfa,0x04,0x34,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_nlg_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_nlg_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x21,0x01,0xff]
0xfa,0x04,0x34,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_nlg_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_nlg_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x2f,0x01,0xff]
0xfa,0x04,0x34,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_nlg_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_nlg_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x50,0x01,0xff]
0xfa,0x04,0x34,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_nlg_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_nlg_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x5f,0x01,0x01]
0xfa,0x04,0x34,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_nlg_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_nlg_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x60,0x01,0x13]
0xfa,0x04,0x34,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_nlg_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_nlg_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x35,0x7d,0xff,0x6f,0xfd,0x30]
0xfa,0xfe,0x35,0x7d,0xff,0x6f,0xfd,0x30
+# GFX11: v_cmpx_nlg_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x35,0x7d,0xff,0x6f,0xfd,0x30]
-# GFX11: v_cmpx_nlt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1b,0x00,0xff]
0xfa,0x04,0x1c,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_nlt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_nlt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0xe4,0x00,0xff]
0xfa,0x04,0x1c,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_nlt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_nlt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x40,0x01,0xff]
0xfa,0x04,0x1c,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_nlt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_nlt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x41,0x01,0xff]
0xfa,0x04,0x1c,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_nlt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_nlt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x01,0x01,0xff]
0xfa,0x04,0x1c,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_nlt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_nlt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x0f,0x01,0xff]
0xfa,0x04,0x1c,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_nlt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_nlt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x11,0x01,0xff]
0xfa,0x04,0x1c,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_nlt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_nlt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1f,0x01,0xff]
0xfa,0x04,0x1c,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_nlt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_nlt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x21,0x01,0xff]
0xfa,0x04,0x1c,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_nlt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_nlt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x2f,0x01,0xff]
0xfa,0x04,0x1c,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_nlt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_nlt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x50,0x01,0xff]
0xfa,0x04,0x1c,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_nlt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_nlt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x5f,0x01,0x01]
0xfa,0x04,0x1c,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_nlt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_nlt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x60,0x01,0x13]
0xfa,0x04,0x1c,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_nlt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_nlt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1c,0x7d,0x7f,0x6f,0xfd,0x30]
0xfa,0xfe,0x1c,0x7d,0x7f,0x6f,0xfd,0x30
+# GFX11: v_cmpx_nlt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1c,0x7d,0x7f,0x6f,0xfd,0x30]
-# GFX11: v_cmpx_nlt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x1b,0x00,0xff]
0xfa,0x04,0x3c,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_nlt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_nlt_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0xe4,0x00,0xff]
0xfa,0x04,0x3c,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_nlt_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_nlt_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x40,0x01,0xff]
0xfa,0x04,0x3c,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_nlt_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_nlt_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x41,0x01,0xff]
0xfa,0x04,0x3c,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_nlt_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_nlt_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x01,0x01,0xff]
0xfa,0x04,0x3c,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_nlt_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_nlt_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x0f,0x01,0xff]
0xfa,0x04,0x3c,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_nlt_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_nlt_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x11,0x01,0xff]
0xfa,0x04,0x3c,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_nlt_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_nlt_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x1f,0x01,0xff]
0xfa,0x04,0x3c,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_nlt_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_nlt_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x21,0x01,0xff]
0xfa,0x04,0x3c,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_nlt_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_nlt_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x2f,0x01,0xff]
0xfa,0x04,0x3c,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_nlt_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_nlt_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x50,0x01,0xff]
0xfa,0x04,0x3c,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_nlt_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_nlt_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x5f,0x01,0x01]
0xfa,0x04,0x3c,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_nlt_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_nlt_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x60,0x01,0x13]
0xfa,0x04,0x3c,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_nlt_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_nlt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x3d,0x7d,0xff,0x6f,0xfd,0x30]
0xfa,0xfe,0x3d,0x7d,0xff,0x6f,0xfd,0x30
+# GFX11: v_cmpx_nlt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x3d,0x7d,0xff,0x6f,0xfd,0x30]
-# GFX11: v_cmpx_o_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1b,0x00,0xff]
0xfa,0x04,0x0e,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_o_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_o_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0xe4,0x00,0xff]
0xfa,0x04,0x0e,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_o_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_o_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x40,0x01,0xff]
0xfa,0x04,0x0e,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_o_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_o_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x41,0x01,0xff]
0xfa,0x04,0x0e,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_o_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_o_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x01,0x01,0xff]
0xfa,0x04,0x0e,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_o_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_o_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x0f,0x01,0xff]
0xfa,0x04,0x0e,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_o_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_o_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x11,0x01,0xff]
0xfa,0x04,0x0e,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_o_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_o_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1f,0x01,0xff]
0xfa,0x04,0x0e,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_o_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_o_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x21,0x01,0xff]
0xfa,0x04,0x0e,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_o_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_o_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x2f,0x01,0xff]
0xfa,0x04,0x0e,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_o_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_o_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x50,0x01,0xff]
0xfa,0x04,0x0e,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_o_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_o_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x5f,0x01,0x01]
0xfa,0x04,0x0e,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_o_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_o_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x60,0x01,0x13]
0xfa,0x04,0x0e,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_o_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_o_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0e,0x7d,0x7f,0x6f,0xfd,0x30]
0xfa,0xfe,0x0e,0x7d,0x7f,0x6f,0xfd,0x30
+# GFX11: v_cmpx_o_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0e,0x7d,0x7f,0x6f,0xfd,0x30]
-# GFX11: v_cmpx_o_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x1b,0x00,0xff]
0xfa,0x04,0x2e,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_o_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_o_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0xe4,0x00,0xff]
0xfa,0x04,0x2e,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_o_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_o_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x40,0x01,0xff]
0xfa,0x04,0x2e,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_o_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_o_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x41,0x01,0xff]
0xfa,0x04,0x2e,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_o_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_o_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x01,0x01,0xff]
0xfa,0x04,0x2e,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_o_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_o_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x0f,0x01,0xff]
0xfa,0x04,0x2e,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_o_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_o_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x11,0x01,0xff]
0xfa,0x04,0x2e,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_o_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_o_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x1f,0x01,0xff]
0xfa,0x04,0x2e,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_o_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_o_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x21,0x01,0xff]
0xfa,0x04,0x2e,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_o_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_o_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x2f,0x01,0xff]
0xfa,0x04,0x2e,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_o_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_o_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x50,0x01,0xff]
0xfa,0x04,0x2e,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_o_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_o_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x5f,0x01,0x01]
0xfa,0x04,0x2e,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_o_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_o_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x60,0x01,0x13]
0xfa,0x04,0x2e,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_o_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_o_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x2f,0x7d,0xff,0x6f,0xfd,0x30]
0xfa,0xfe,0x2f,0x7d,0xff,0x6f,0xfd,0x30
+# GFX11: v_cmpx_o_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x2f,0x7d,0xff,0x6f,0xfd,0x30]
-# GFX11: v_cmpx_t_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x1b,0x00,0xff]
0xfa,0x04,0x1e,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_t_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_t_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0xe4,0x00,0xff]
0xfa,0x04,0x1e,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_t_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_t_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x40,0x01,0xff]
0xfa,0x04,0x1e,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_t_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_t_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x41,0x01,0xff]
0xfa,0x04,0x1e,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_t_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_t_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x01,0x01,0xff]
0xfa,0x04,0x1e,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_t_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_t_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x0f,0x01,0xff]
0xfa,0x04,0x1e,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_t_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_t_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x11,0x01,0xff]
0xfa,0x04,0x1e,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_t_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_t_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x1f,0x01,0xff]
0xfa,0x04,0x1e,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_t_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_t_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x21,0x01,0xff]
0xfa,0x04,0x1e,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_t_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_t_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x2f,0x01,0xff]
0xfa,0x04,0x1e,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_t_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_t_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x50,0x01,0xff]
0xfa,0x04,0x1e,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_t_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_t_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x5f,0x01,0x01]
0xfa,0x04,0x1e,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_t_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_t_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x60,0x01,0x13]
0xfa,0x04,0x1e,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_t_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_t_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1e,0x7d,0x7f,0x6f,0xfd,0x30]
0xfa,0xfe,0x1e,0x7d,0x7f,0x6f,0xfd,0x30
+# GFX11: v_cmpx_t_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1e,0x7d,0x7f,0x6f,0xfd,0x30]
-# GFX11: v_cmpx_t_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7d,0x01,0x1b,0x00,0xff]
0xfa,0x04,0x3e,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_t_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_t_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7d,0x01,0xe4,0x00,0xff]
0xfa,0x04,0x3e,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_t_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_t_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7d,0x01,0x40,0x01,0xff]
0xfa,0x04,0x3e,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_t_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_t_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7d,0x01,0x41,0x01,0xff]
0xfa,0x04,0x3e,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_t_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_t_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7d,0x01,0x01,0x01,0xff]
0xfa,0x04,0x3e,0x7d,0x01,0x01,0x01,0xff
+# GFX11: v_cmpx_t_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7d,0x01,0x01,0x01,0xff]
-# GFX11: v_cmpx_t_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7d,0x01,0x0f,0x01,0xff]
0xfa,0x04,0x3e,0x7d,0x01,0x0f,0x01,0xff
+# GFX11: v_cmpx_t_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7d,0x01,0x0f,0x01,0xff]
-# GFX11: v_cmpx_t_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7d,0x01,0x11,0x01,0xff]
0xfa,0x04,0x3e,0x7d,0x01,0x11,0x01,0xff
+# GFX11: v_cmpx_t_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7d,0x01,0x11,0x01,0xff]
-# GFX11: v_cmpx_t_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7d,0x01,0x1f,0x01,0xff]
0xfa,0x04,0x3e,0x7d,0x01,0x1f,0x01,0xff
+# GFX11: v_cmpx_t_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7d,0x01,0x1f,0x01,0xff]
-# GFX11: v_cmpx_t_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7d,0x01,0x21,0x01,0xff]
0xfa,0x04,0x3e,0x7d,0x01,0x21,0x01,0xff
+# GFX11: v_cmpx_t_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7d,0x01,0x21,0x01,0xff]
-# GFX11: v_cmpx_t_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7d,0x01,0x2f,0x01,0xff]
0xfa,0x04,0x3e,0x7d,0x01,0x2f,0x01,0xff
+# GFX11: v_cmpx_t_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7d,0x01,0x2f,0x01,0xff]
-# GFX11: v_cmpx_t_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7d,0x01,0x50,0x01,0xff]
0xfa,0x04,0x3e,0x7d,0x01,0x50,0x01,0xff
+# GFX11: v_cmpx_t_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7d,0x01,0x50,0x01,0xff]
-# GFX11: v_cmpx_t_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x3e,0x7d,0x01,0x5f,0x01,0x01]
0xfa,0x04,0x3e,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_t_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x3e,0x7d,0x01,0x5f,0x01,0x01]
-# GFX11: v_cmpx_t_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x3e,0x7d,0x01,0x60,0x01,0x13]
0xfa,0x04,0x3e,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_t_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x3e,0x7d,0x01,0x60,0x01,0x13]
-# GFX11: v_cmpx_t_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x3f,0x7d,0xff,0x6f,0xfd,0x30]
0xfa,0xfe,0x3f,0x7d,0xff,0x6f,0xfd,0x30
+# GFX11: v_cmpx_t_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x3f,0x7d,0xff,0x6f,0xfd,0x30]
-# GFX11: v_cmpx_t_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7d,0x01,0x1b,0x00,0xff]
0xfa,0x04,0x8e,0x7d,0x01,0x1b,0x00,0xff
+# GFX11: v_cmpx_t_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7d,0x01,0x1b,0x00,0xff]
-# GFX11: v_cmpx_t_i32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7d,0x01,0xe4,0x00,0xff]
0xfa,0x04,0x8e,0x7d,0x01,0xe4,0x00,0xff
+# GFX11: v_cmpx_t_i32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7d,0x01,0xe4,0x00,0xff]
-# GFX11: v_cmpx_t_i32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7d,0x01,0x40,0x01,0xff]
0xfa,0x04,0x8e,0x7d,0x01,0x40,0x01,0xff
+# GFX11: v_cmpx_t_i32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7d,0x01,0x40,0x01,0xff]
-# GFX11: v_cmpx_t_i32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7d,0x01,0x41,0x01,0xff]
0xfa,0x04,0x8e,0x7d,0x01,0x41,0x01,0xff
+# GFX11: v_cmpx_t_i32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7d,0x01,0x41,0x01,0xff]
-# GFX11: v_cmpx_t_i32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding:
[0xfa,0x04,0x8e,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x8e,0x7d,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_t_i32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7d,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_t_i32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x8e,0x7d,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_t_i32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7d,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_t_i32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x8e,0x7d,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_t_i32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7d,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_t_i32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x8e,0x7d,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_t_i32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7d,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_t_i32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x8e,0x7d,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_t_i32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7d,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_t_i32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x8e,0x7d,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_t_i32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7d,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_t_i32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x8e,0x7d,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_t_i32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8e,0x7d,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_t_i32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x8e,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x8e,0x7d,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_t_i32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x8e,0x7d,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_t_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x8e,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x8e,0x7d,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_t_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x8e,0x7d,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_t_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x8f,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0x8f,0x7d,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmpx_t_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x8f,0x7d,0xff,0x6f,0x0d,0x30] -# GFX11: v_cmpx_t_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x9e,0x7d,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_t_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7d,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_t_u32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x9e,0x7d,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_t_u32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7d,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_t_u32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7d,0x01,0x40,0x01,0xff] 
0xfa,0x04,0x9e,0x7d,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_t_u32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7d,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_t_u32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x9e,0x7d,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_t_u32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7d,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_t_u32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x9e,0x7d,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_t_u32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7d,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_t_u32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x9e,0x7d,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_t_u32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7d,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_t_u32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x9e,0x7d,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_t_u32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7d,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_t_u32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x9e,0x7d,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_t_u32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7d,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_t_u32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x9e,0x7d,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_t_u32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7d,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_t_u32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x9e,0x7d,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_t_u32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7d,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_t_u32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x9e,0x7d,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_t_u32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9e,0x7d,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_t_u32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x9e,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x9e,0x7d,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_t_u32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x9e,0x7d,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_t_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x9e,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x9e,0x7d,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_t_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x9e,0x7d,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_t_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x9f,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0x9f,0x7d,0xff,0x6f,0x0d,0x30 +# GFX11: v_cmpx_t_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x9f,0x7d,0xff,0x6f,0x0d,0x30] -# GFX11: v_cmpx_u_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_u_f16 v1, v2 
quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_u_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x10,0x7d,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_u_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_u_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_u_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_u_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_u_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_u_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_u_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_u_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_u_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_u_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_u_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_u_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_u_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_u_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_u_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_u_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_u_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_u_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_u_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_u_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x10,0x7d,0x01,0x5f,0x01,0x01 +# GFX11: v_cmpx_u_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_u_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x10,0x7d,0x01,0x60,0x01,0x13 +# GFX11: v_cmpx_u_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x60,0x01,0x13] -# GFX11: 
v_cmpx_u_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x10,0x7d,0x7f,0x6f,0xfd,0x30] 0xfa,0xfe,0x10,0x7d,0x7f,0x6f,0xfd,0x30 +# GFX11: v_cmpx_u_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x10,0x7d,0x7f,0x6f,0xfd,0x30] -# GFX11: v_cmpx_u_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x30,0x7d,0x01,0x1b,0x00,0xff +# GFX11: v_cmpx_u_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x1b,0x00,0xff] -# GFX11: v_cmpx_u_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x30,0x7d,0x01,0xe4,0x00,0xff +# GFX11: v_cmpx_u_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0xe4,0x00,0xff] -# GFX11: v_cmpx_u_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x30,0x7d,0x01,0x40,0x01,0xff +# GFX11: v_cmpx_u_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x40,0x01,0xff] -# GFX11: v_cmpx_u_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x30,0x7d,0x01,0x41,0x01,0xff +# GFX11: v_cmpx_u_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x41,0x01,0xff] -# GFX11: v_cmpx_u_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x30,0x7d,0x01,0x01,0x01,0xff +# GFX11: v_cmpx_u_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x01,0x01,0xff] -# GFX11: v_cmpx_u_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x30,0x7d,0x01,0x0f,0x01,0xff +# GFX11: v_cmpx_u_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x0f,0x01,0xff] -# GFX11: v_cmpx_u_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x30,0x7d,0x01,0x11,0x01,0xff +# GFX11: v_cmpx_u_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x11,0x01,0xff] -# GFX11: v_cmpx_u_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x30,0x7d,0x01,0x1f,0x01,0xff +# GFX11: v_cmpx_u_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x1f,0x01,0xff] -# GFX11: v_cmpx_u_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x30,0x7d,0x01,0x21,0x01,0xff +# GFX11: v_cmpx_u_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x21,0x01,0xff] -# GFX11: v_cmpx_u_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x30,0x7d,0x01,0x2f,0x01,0xff +# GFX11: v_cmpx_u_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x2f,0x01,0xff] -# GFX11: v_cmpx_u_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x30,0x7d,0x01,0x50,0x01,0xff +# GFX11: v_cmpx_u_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x50,0x01,0xff] -# GFX11: v_cmpx_u_f32 v1, v2 row_share:15 
row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x30,0x7d,0x01,0x5f,0x01,0x01
+# GFX11: v_cmpx_u_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x5f,0x01,0x01]

-# GFX11: v_cmpx_u_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x30,0x7d,0x01,0x60,0x01,0x13
+# GFX11: v_cmpx_u_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x60,0x01,0x13]

-# GFX11: v_cmpx_u_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x31,0x7d,0xff,0x6f,0xfd,0x30]
 0xfa,0xfe,0x31,0x7d,0xff,0x6f,0xfd,0x30
+# GFX11: v_cmpx_u_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x31,0x7d,0xff,0x6f,0xfd,0x30]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt
index 3c5b243e497f1..a6d8ec95d6d63 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt
@@ -1,376 +1,377 @@
+# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11

-# GFX11: v_cmpx_class_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05]
 0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_class_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05]

-# GFX11: v_cmpx_class_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfa,0x7d,0x7f,0x00,0x00,0x00]
 0xea,0xfe,0xfa,0x7d,0x7f,0x00,0x00,0x00
+# GFX11: v_cmpx_class_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfa,0x7d,0x7f,0x00,0x00,0x00]

-# GFX11: v_cmpx_class_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfc,0x7d,0x01,0x77,0x39,0x05]
 0xe9,0x04,0xfc,0x7d,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_class_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfc,0x7d,0x01,0x77,0x39,0x05]

-# GFX11: v_cmpx_class_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfd,0x7d,0xff,0x00,0x00,0x00]
 0xea,0xfe,0xfd,0x7d,0xff,0x00,0x00,0x00
+# GFX11: v_cmpx_class_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfd,0x7d,0xff,0x00,0x00,0x00]

-# GFX11: v_cmpx_eq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x04,0x7d,0x01,0x77,0x39,0x05]
 0xe9,0x04,0x04,0x7d,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_eq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x04,0x7d,0x01,0x77,0x39,0x05]

-# GFX11: v_cmpx_eq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x04,0x7d,0x7f,0x00,0x00,0x00]
 0xea,0xfe,0x04,0x7d,0x7f,0x00,0x00,0x00
+# GFX11: v_cmpx_eq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x04,0x7d,0x7f,0x00,0x00,0x00]

-# GFX11: v_cmpx_eq_f32 v1, v2
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x24,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x24,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_eq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x24,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_eq_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x25,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x25,0x7d,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_eq_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x25,0x7d,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_eq_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x64,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x64,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_eq_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x64,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_eq_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x64,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x64,0x7d,0x7f,0x00,0x00,0x00 +# GFX11: v_cmpx_eq_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x64,0x7d,0x7f,0x00,0x00,0x00] -# GFX11: v_cmpx_eq_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x84,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x84,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_eq_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x84,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_eq_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x85,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x85,0x7d,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_eq_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x85,0x7d,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_eq_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x74,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x74,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_eq_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x74,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_eq_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x74,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x74,0x7d,0x7f,0x00,0x00,0x00 +# GFX11: v_cmpx_eq_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x74,0x7d,0x7f,0x00,0x00,0x00] -# GFX11: v_cmpx_eq_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x94,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x94,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_eq_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x94,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_eq_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x95,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x95,0x7d,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_eq_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x95,0x7d,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_f_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x00,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x00,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_f_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x00,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_f_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x00,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x00,0x7d,0x7f,0x00,0x00,0x00 +# GFX11: v_cmpx_f_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x00,0x7d,0x7f,0x00,0x00,0x00] -# GFX11: v_cmpx_f_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x20,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x20,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_f_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x20,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_f_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x21,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x21,0x7d,0xff,0x00,0x00,0x00 
+# GFX11: v_cmpx_f_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x21,0x7d,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_f_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x80,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x80,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_f_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x80,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_f_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x81,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x81,0x7d,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_f_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x81,0x7d,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_f_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x90,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x90,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_f_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x90,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_f_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x91,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x91,0x7d,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_f_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x91,0x7d,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_ge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_ge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0c,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x0c,0x7d,0x7f,0x00,0x00,0x00 +# GFX11: v_cmpx_ge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0c,0x7d,0x7f,0x00,0x00,0x00] -# GFX11: v_cmpx_ge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2c,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x2c,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_ge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2c,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ge_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x2d,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x2d,0x7d,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_ge_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x2d,0x7d,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_ge_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6c,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x6c,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_ge_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6c,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ge_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6c,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x6c,0x7d,0x7f,0x00,0x00,0x00 +# GFX11: v_cmpx_ge_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6c,0x7d,0x7f,0x00,0x00,0x00] -# GFX11: v_cmpx_ge_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x8c,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x8c,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_ge_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x8c,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ge_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x8d,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x8d,0x7d,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_ge_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x8d,0x7d,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_ge_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7c,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x7c,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_ge_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7c,0x7d,0x01,0x77,0x39,0x05] -# GFX11: 
v_cmpx_ge_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7c,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x7c,0x7d,0x7f,0x00,0x00,0x00 +# GFX11: v_cmpx_ge_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7c,0x7d,0x7f,0x00,0x00,0x00] -# GFX11: v_cmpx_ge_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x9c,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x9c,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_ge_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x9c,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ge_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x9d,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x9d,0x7d,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_ge_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x9d,0x7d,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_gt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x08,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x08,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_gt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x08,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_gt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x08,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x08,0x7d,0x7f,0x00,0x00,0x00 +# GFX11: v_cmpx_gt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x08,0x7d,0x7f,0x00,0x00,0x00] -# GFX11: v_cmpx_gt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x28,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x28,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_gt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x28,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_gt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x29,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x29,0x7d,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_gt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x29,0x7d,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_gt_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x68,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x68,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_gt_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x68,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_gt_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x68,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x68,0x7d,0x7f,0x00,0x00,0x00 +# GFX11: v_cmpx_gt_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x68,0x7d,0x7f,0x00,0x00,0x00] -# GFX11: v_cmpx_gt_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x88,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x88,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_gt_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x88,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_gt_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x89,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x89,0x7d,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_gt_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x89,0x7d,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_gt_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x78,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x78,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_gt_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x78,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_gt_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x78,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x78,0x7d,0x7f,0x00,0x00,0x00 +# GFX11: v_cmpx_gt_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x78,0x7d,0x7f,0x00,0x00,0x00] -# GFX11: v_cmpx_gt_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x98,0x7d,0x01,0x77,0x39,0x05] 
0xe9,0x04,0x98,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_gt_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x98,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_gt_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x99,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x99,0x7d,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_gt_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x99,0x7d,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_le_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x06,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x06,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_le_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x06,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_le_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x06,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x06,0x7d,0x7f,0x00,0x00,0x00 +# GFX11: v_cmpx_le_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x06,0x7d,0x7f,0x00,0x00,0x00] -# GFX11: v_cmpx_le_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x26,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x26,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_le_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x26,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_le_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x27,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x27,0x7d,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_le_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x27,0x7d,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_le_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x66,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x66,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_le_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x66,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_le_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x66,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x66,0x7d,0x7f,0x00,0x00,0x00 +# GFX11: v_cmpx_le_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x66,0x7d,0x7f,0x00,0x00,0x00] -# GFX11: v_cmpx_le_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x86,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x86,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_le_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x86,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_le_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x87,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x87,0x7d,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_le_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x87,0x7d,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_le_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x76,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x76,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_le_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x76,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_le_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x76,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x76,0x7d,0x7f,0x00,0x00,0x00 +# GFX11: v_cmpx_le_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x76,0x7d,0x7f,0x00,0x00,0x00] -# GFX11: v_cmpx_le_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x96,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x96,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_le_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x96,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_le_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x97,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x97,0x7d,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_le_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; 
encoding: [0xea,0xfe,0x97,0x7d,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_lg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_lg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_lg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0a,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x0a,0x7d,0x7f,0x00,0x00,0x00 +# GFX11: v_cmpx_lg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0a,0x7d,0x7f,0x00,0x00,0x00] -# GFX11: v_cmpx_lg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2a,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x2a,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_lg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2a,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_lg_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x2b,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x2b,0x7d,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_lg_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x2b,0x7d,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_lt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x02,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x02,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_lt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x02,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_lt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x02,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x02,0x7d,0x7f,0x00,0x00,0x00 +# GFX11: v_cmpx_lt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x02,0x7d,0x7f,0x00,0x00,0x00] -# GFX11: v_cmpx_lt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x22,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x22,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_lt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x22,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_lt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x23,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x23,0x7d,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_lt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x23,0x7d,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_lt_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x62,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x62,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_lt_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x62,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_lt_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x62,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x62,0x7d,0x7f,0x00,0x00,0x00 +# GFX11: v_cmpx_lt_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x62,0x7d,0x7f,0x00,0x00,0x00] -# GFX11: v_cmpx_lt_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x82,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x82,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_lt_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x82,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_lt_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x83,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x83,0x7d,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_lt_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x83,0x7d,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_lt_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x72,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x72,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_lt_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x72,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_lt_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; 
encoding: [0xea,0xfe,0x72,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x72,0x7d,0x7f,0x00,0x00,0x00 +# GFX11: v_cmpx_lt_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x72,0x7d,0x7f,0x00,0x00,0x00] -# GFX11: v_cmpx_lt_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x92,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x92,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_lt_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x92,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_lt_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x93,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x93,0x7d,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_lt_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x93,0x7d,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_ne_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6a,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x6a,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_ne_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6a,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ne_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6a,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x6a,0x7d,0x7f,0x00,0x00,0x00 +# GFX11: v_cmpx_ne_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6a,0x7d,0x7f,0x00,0x00,0x00] -# GFX11: v_cmpx_ne_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x8a,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x8a,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_ne_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x8a,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ne_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x8b,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x8b,0x7d,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_ne_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x8b,0x7d,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_ne_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7a,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x7a,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_ne_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7a,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ne_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7a,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x7a,0x7d,0x7f,0x00,0x00,0x00 +# GFX11: v_cmpx_ne_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7a,0x7d,0x7f,0x00,0x00,0x00] -# GFX11: v_cmpx_ne_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x9a,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x9a,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_ne_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x9a,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ne_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x9b,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x9b,0x7d,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_ne_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x9b,0x7d,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_neq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_neq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_neq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1a,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x1a,0x7d,0x7f,0x00,0x00,0x00 +# GFX11: v_cmpx_neq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1a,0x7d,0x7f,0x00,0x00,0x00] -# GFX11: v_cmpx_neq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3a,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x3a,0x7d,0x01,0x77,0x39,0x05 +# GFX11: 
v_cmpx_neq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3a,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_neq_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x3b,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x3b,0x7d,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_neq_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x3b,0x7d,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_nge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x12,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x12,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_nge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x12,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_nge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x12,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x12,0x7d,0x7f,0x00,0x00,0x00 +# GFX11: v_cmpx_nge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x12,0x7d,0x7f,0x00,0x00,0x00] -# GFX11: v_cmpx_nge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x32,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x32,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_nge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x32,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_nge_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x33,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x33,0x7d,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_nge_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x33,0x7d,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_ngt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x16,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x16,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_ngt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x16,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ngt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x16,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x16,0x7d,0x7f,0x00,0x00,0x00 +# GFX11: v_cmpx_ngt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x16,0x7d,0x7f,0x00,0x00,0x00] -# GFX11: v_cmpx_ngt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x36,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x36,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_ngt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x36,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ngt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x37,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x37,0x7d,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_ngt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x37,0x7d,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_nle_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x18,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x18,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_nle_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x18,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_nle_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x18,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x18,0x7d,0x7f,0x00,0x00,0x00 +# GFX11: v_cmpx_nle_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x18,0x7d,0x7f,0x00,0x00,0x00] -# GFX11: v_cmpx_nle_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x38,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x38,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_nle_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x38,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_nle_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x39,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x39,0x7d,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_nle_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xea,0xfe,0x39,0x7d,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_nlg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x14,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x14,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_nlg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x14,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_nlg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x14,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x14,0x7d,0x7f,0x00,0x00,0x00 +# GFX11: v_cmpx_nlg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x14,0x7d,0x7f,0x00,0x00,0x00] -# GFX11: v_cmpx_nlg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x34,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x34,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_nlg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x34,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_nlg_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x35,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x35,0x7d,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_nlg_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x35,0x7d,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_nlt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_nlt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_nlt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1c,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x1c,0x7d,0x7f,0x00,0x00,0x00 +# GFX11: v_cmpx_nlt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1c,0x7d,0x7f,0x00,0x00,0x00] -# GFX11: v_cmpx_nlt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3c,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x3c,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_nlt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3c,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_nlt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x3d,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x3d,0x7d,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_nlt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x3d,0x7d,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_o_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_o_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_o_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0e,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x0e,0x7d,0x7f,0x00,0x00,0x00 +# GFX11: v_cmpx_o_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0e,0x7d,0x7f,0x00,0x00,0x00] -# GFX11: v_cmpx_o_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2e,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x2e,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_o_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2e,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_o_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x2f,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x2f,0x7d,0xff,0x00,0x00,0x00 +# GFX11: v_cmpx_o_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x2f,0x7d,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_t_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1e,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x1e,0x7d,0x01,0x77,0x39,0x05 +# GFX11: v_cmpx_t_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1e,0x7d,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_t_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; 
encoding: [0xea,0xfe,0x1e,0x7d,0x7f,0x00,0x00,0x00]
 0xea,0xfe,0x1e,0x7d,0x7f,0x00,0x00,0x00
+# GFX11: v_cmpx_t_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1e,0x7d,0x7f,0x00,0x00,0x00]

-# GFX11: v_cmpx_t_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3e,0x7d,0x01,0x77,0x39,0x05]
 0xe9,0x04,0x3e,0x7d,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_t_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3e,0x7d,0x01,0x77,0x39,0x05]

-# GFX11: v_cmpx_t_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x3f,0x7d,0xff,0x00,0x00,0x00]
 0xea,0xfe,0x3f,0x7d,0xff,0x00,0x00,0x00
+# GFX11: v_cmpx_t_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x3f,0x7d,0xff,0x00,0x00,0x00]

-# GFX11: v_cmpx_t_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x8e,0x7d,0x01,0x77,0x39,0x05]
 0xe9,0x04,0x8e,0x7d,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_t_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x8e,0x7d,0x01,0x77,0x39,0x05]

-# GFX11: v_cmpx_t_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x8f,0x7d,0xff,0x00,0x00,0x00]
 0xea,0xfe,0x8f,0x7d,0xff,0x00,0x00,0x00
+# GFX11: v_cmpx_t_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x8f,0x7d,0xff,0x00,0x00,0x00]

-# GFX11: v_cmpx_t_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x9e,0x7d,0x01,0x77,0x39,0x05]
 0xe9,0x04,0x9e,0x7d,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_t_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x9e,0x7d,0x01,0x77,0x39,0x05]

-# GFX11: v_cmpx_t_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x9f,0x7d,0xff,0x00,0x00,0x00]
 0xea,0xfe,0x9f,0x7d,0xff,0x00,0x00,0x00
+# GFX11: v_cmpx_t_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x9f,0x7d,0xff,0x00,0x00,0x00]

-# GFX11: v_cmpx_u_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x10,0x7d,0x01,0x77,0x39,0x05]
 0xe9,0x04,0x10,0x7d,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_u_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x10,0x7d,0x01,0x77,0x39,0x05]

-# GFX11: v_cmpx_u_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x10,0x7d,0x7f,0x00,0x00,0x00]
 0xea,0xfe,0x10,0x7d,0x7f,0x00,0x00,0x00
+# GFX11: v_cmpx_u_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x10,0x7d,0x7f,0x00,0x00,0x00]

-# GFX11: v_cmpx_u_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x30,0x7d,0x01,0x77,0x39,0x05]
 0xe9,0x04,0x30,0x7d,0x01,0x77,0x39,0x05
+# GFX11: v_cmpx_u_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x30,0x7d,0x01,0x77,0x39,0x05]

-# GFX11: v_cmpx_u_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x31,0x7d,0xff,0x00,0x00,0x00]
 0xea,0xfe,0x31,0x7d,0xff,0x00,0x00,0x00
+# GFX11: v_cmpx_u_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x31,0x7d,0xff,0x00,0x00,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c.txt
index 057e81c6fe27f..62d235e1d8206 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c.txt
@@ -1,4469 +1,4472 @@
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W32 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s
+# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W32 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W32 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s

-# W32: v_cmp_class_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x7d,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_class_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x7d,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x7d,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_class_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x7d,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_class_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x7d,0xd4,0x01,0x05,0x02,0x00]

-# W32: v_cmp_class_f16_e64 s10, v255, v2 ; encoding: [0x0a,0x00,0x7d,0xd4,0xff,0x05,0x02,0x00]
-# W64: v_cmp_class_f16_e64 s[10:11], v255, v2 ; encoding: [0x0a,0x00,0x7d,0xd4,0xff,0x05,0x02,0x00]
 0x0a,0x00,0x7d,0xd4,0xff,0x05,0x02,0x00
+# W32: v_cmp_class_f16_e64 s10, v255, v2 ; encoding: [0x0a,0x00,0x7d,0xd4,0xff,0x05,0x02,0x00]
+# W64: v_cmp_class_f16_e64 s[10:11], v255, v2 ; encoding: [0x0a,0x00,0x7d,0xd4,0xff,0x05,0x02,0x00]

-# W32: v_cmp_class_f16_e64 s10, s1, v2 ; encoding: [0x0a,0x00,0x7d,0xd4,0x01,0x04,0x02,0x00]
-# W64: v_cmp_class_f16_e64 s[10:11], s1, v2 ; encoding: [0x0a,0x00,0x7d,0xd4,0x01,0x04,0x02,0x00]
 0x0a,0x00,0x7d,0xd4,0x01,0x04,0x02,0x00
+# W32: v_cmp_class_f16_e64 s10, s1, v2 ; encoding: [0x0a,0x00,0x7d,0xd4,0x01,0x04,0x02,0x00]
+# W64: v_cmp_class_f16_e64 s[10:11], s1, v2 ; encoding: [0x0a,0x00,0x7d,0xd4,0x01,0x04,0x02,0x00]

-# W32: v_cmp_class_f16_e64 s10, s105, v255 ; encoding: [0x0a,0x00,0x7d,0xd4,0x69,0xfe,0x03,0x00]
-# W64: v_cmp_class_f16_e64 s[10:11], s105, v255 ; encoding: [0x0a,0x00,0x7d,0xd4,0x69,0xfe,0x03,0x00]
 0x0a,0x00,0x7d,0xd4,0x69,0xfe,0x03,0x00
+# W32: v_cmp_class_f16_e64 s10, s105, v255 ; encoding: [0x0a,0x00,0x7d,0xd4,0x69,0xfe,0x03,0x00]
+# W64: v_cmp_class_f16_e64 s[10:11], s105, v255 ; encoding: [0x0a,0x00,0x7d,0xd4,0x69,0xfe,0x03,0x00]

-# W32: v_cmp_class_f16_e64 s10, vcc_lo, s2 ; encoding: [0x0a,0x00,0x7d,0xd4,0x6a,0x04,0x00,0x00]
-# W64: v_cmp_class_f16_e64 s[10:11], vcc_lo, s2 ; encoding: [0x0a,0x00,0x7d,0xd4,0x6a,0x04,0x00,0x00]
 0x0a,0x00,0x7d,0xd4,0x6a,0x04,0x00,0x00
+# W32: v_cmp_class_f16_e64 s10, vcc_lo, s2 ; encoding: [0x0a,0x00,0x7d,0xd4,0x6a,0x04,0x00,0x00]
+# W64: v_cmp_class_f16_e64 s[10:11], vcc_lo, s2 ; encoding: [0x0a,0x00,0x7d,0xd4,0x6a,0x04,0x00,0x00]

-# W32: v_cmp_class_f16_e64 s10, vcc_hi, s105 ; encoding: [0x0a,0x00,0x7d,0xd4,0x6b,0xd2,0x00,0x00]
-# W64: v_cmp_class_f16_e64 s[10:11], vcc_hi, s105 ; encoding: [0x0a,0x00,0x7d,0xd4,0x6b,0xd2,0x00,0x00]
 0x0a,0x00,0x7d,0xd4,0x6b,0xd2,0x00,0x00
+# W32: v_cmp_class_f16_e64 s10, vcc_hi, s105 ; encoding: [0x0a,0x00,0x7d,0xd4,0x6b,0xd2,0x00,0x00]
+# W64: v_cmp_class_f16_e64 s[10:11], vcc_hi, s105 ; encoding: [0x0a,0x00,0x7d,0xd4,0x6b,0xd2,0x00,0x00]

-# W32: v_cmp_class_f16_e64 s10, ttmp15, ttmp15 ; encoding: [0x0a,0x00,0x7d,0xd4,0x7b,0xf6,0x00,0x00]
-# W64: v_cmp_class_f16_e64 s[10:11], ttmp15, ttmp15 ; encoding: [0x0a,0x00,0x7d,0xd4,0x7b,0xf6,0x00,0x00]
 0x0a,0x00,0x7d,0xd4,0x7b,0xf6,0x00,0x00
+# W32: v_cmp_class_f16_e64 s10, ttmp15, ttmp15 ; encoding: [0x0a,0x00,0x7d,0xd4,0x7b,0xf6,0x00,0x00]
+# W64: v_cmp_class_f16_e64 s[10:11], ttmp15, ttmp15 ; encoding: [0x0a,0x00,0x7d,0xd4,0x7b,0xf6,0x00,0x00]

-# W32: v_cmp_class_f16_e64 s10, m0, src_scc ; encoding: [0x0a,0x00,0x7d,0xd4,0x7d,0xfa,0x01,0x00]
-# W64: v_cmp_class_f16_e64 s[10:11], m0, src_scc ; encoding: [0x0a,0x00,0x7d,0xd4,0x7d,0xfa,0x01,0x00]
 0x0a,0x00,0x7d,0xd4,0x7d,0xfa,0x01,0x00
+# W32: v_cmp_class_f16_e64 s10, m0, src_scc ; encoding: [0x0a,0x00,0x7d,0xd4,0x7d,0xfa,0x01,0x00]
+# W64: v_cmp_class_f16_e64 s[10:11], m0, src_scc ; encoding: [0x0a,0x00,0x7d,0xd4,0x7d,0xfa,0x01,0x00]

-# W32: v_cmp_class_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x7d,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_class_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x7d,0xd4,0x7e,0x82,0x01,0x00]
 0x0a,0x00,0x7d,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_class_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x7d,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_class_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x7d,0xd4,0x7e,0x82,0x01,0x00]

-# W32: v_cmp_class_f16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x7d,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_class_f16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x7d,0xd4,0x7f,0xf8,0x00,0x00]
 0x0a,0x00,0x7d,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_class_f16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x7d,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_class_f16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x7d,0xd4,0x7f,0xf8,0x00,0x00]

-# W32: v_cmp_class_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x7d,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_class_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x7d,0xd4,0x7c,0xfc,0x00,0x00]
 0x0a,0x00,0x7d,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_class_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x7d,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_class_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x7d,0xd4,0x7c,0xfc,0x00,0x00]

-# W32: v_cmp_class_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x7d,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_class_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x7d,0xd4,0xc1,0xfe,0x00,0x00]
 0x68,0x00,0x7d,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_class_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x7d,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_class_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x7d,0xd4,0xc1,0xfe,0x00,0x00]

-# W32: v_cmp_class_f16_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x7d,0xd4,0xf0,0xfa,0x00,0x00]
-# W64: v_cmp_class_f16_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x7d,0xd4,0xf0,0xfa,0x00,0x00]
 0x6a,0x00,0x7d,0xd4,0xf0,0xfa,0x00,0x00
+# W32: v_cmp_class_f16_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x7d,0xd4,0xf0,0xfa,0x00,0x00]
+# W64: v_cmp_class_f16_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x7d,0xd4,0xf0,0xfa,0x00,0x00]

+0x7a,0x00,0x7d,0xd4,0xfd,0xd4,0x00,0x00
 # W32: v_cmp_class_f16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x7d,0xd4,0xfd,0xd4,0x00,0x00]
 # W64: v_cmp_class_f16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x7d,0xd4,0xfd,0xd4,0x00,0x00]
-0x7a,0x00,0x7d,0xd4,0xfd,0xd4,0x00,0x00

-# GFX12: v_cmp_class_f16_e64 null, -|0xfe0b|, vcc_hi ; encoding: [0x7c,0x01,0x7d,0xd4,0xff,0xd6,0x00,0x20,0x0b,0xfe,0x00,0x00]
 0x7c,0x01,0x7d,0xd4,0xff,0xd6,0x00,0x20,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmp_class_f16_e64 null, -|0xfe0b|, vcc_hi ; encoding: [0x7c,0x01,0x7d,0xd4,0xff,0xd6,0x00,0x20,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_class_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x7e,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_class_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x7e,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x7e,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_class_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x7e,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_class_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x7e,0xd4,0x01,0x05,0x02,0x00]

-# W32: v_cmp_class_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x7e,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_class_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x7e,0xd4,0xff,0xff,0x03,0x00]
 0x0a,0x00,0x7e,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_class_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x7e,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_class_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x7e,0xd4,0xff,0xff,0x03,0x00]

-# W32: v_cmp_class_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x7e,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_class_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x7e,0xd4,0x01,0x04,0x00,0x00]
 0x0a,0x00,0x7e,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_class_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x7e,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_class_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x7e,0xd4,0x01,0x04,0x00,0x00]

-# W32: v_cmp_class_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x7e,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_class_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x7e,0xd4,0x69,0xd2,0x00,0x00]
 0x0a,0x00,0x7e,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_class_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x7e,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_class_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x7e,0xd4,0x69,0xd2,0x00,0x00]

-# W32: v_cmp_class_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x7e,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_class_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x7e,0xd4,0x6a,0xf6,0x00,0x00]
 0x0a,0x00,0x7e,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_class_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x7e,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_class_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x7e,0xd4,0x6a,0xf6,0x00,0x00]

+0x0a,0x00,0x7e,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
 # W32: v_cmp_class_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x7e,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 # W64: v_cmp_class_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x7e,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x7e,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf

-# W32: v_cmp_class_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x7e,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_class_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x7e,0xd4,0x7b,0xfa,0x01,0x00]
 0x0a,0x00,0x7e,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_class_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x7e,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_class_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x7e,0xd4,0x7b,0xfa,0x01,0x00]

-# W32: v_cmp_class_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x7e,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_class_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x7e,0xd4,0x7d,0xe0,0x01,0x00]
 0x0a,0x00,0x7e,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_class_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x7e,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_class_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x7e,0xd4,0x7d,0xe0,0x01,0x00]

-# W32: v_cmp_class_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x7e,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_class_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x7e,0xd4,0x7e,0x82,0x01,0x00]
 0x0a,0x00,0x7e,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_class_f32_e64 s10, exec_lo,
-1 ; encoding: [0x0a,0x00,0x7e,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_class_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x7e,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_class_f32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x7e,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_class_f32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x7e,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x00,0x7e,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_class_f32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x7e,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_class_f32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x7e,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_class_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x7e,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_class_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x7e,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x7e,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_class_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x7e,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_class_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x7e,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_class_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x7e,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_class_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x7e,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x7e,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_class_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x7e,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_class_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x7e,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_class_f32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x7e,0xd4,0xf0,0xfa,0x00,0x00] -# W64: v_cmp_class_f32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x7e,0xd4,0xf0,0xfa,0x00,0x00] 0x6a,0x00,0x7e,0xd4,0xf0,0xfa,0x00,0x00 +# W32: v_cmp_class_f32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x7e,0xd4,0xf0,0xfa,0x00,0x00] +# W64: v_cmp_class_f32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x7e,0xd4,0xf0,0xfa,0x00,0x00] +0x7a,0x00,0x7e,0xd4,0xfd,0xd4,0x00,0x00 # W32: v_cmp_class_f32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x7e,0xd4,0xfd,0xd4,0x00,0x00] # W64: v_cmp_class_f32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x7e,0xd4,0xfd,0xd4,0x00,0x00] -0x7a,0x00,0x7e,0xd4,0xfd,0xd4,0x00,0x00 -# GFX12: v_cmp_class_f32_e64 null, -|0xaf123456|, vcc_hi ; encoding: [0x7c,0x01,0x7e,0xd4,0xff,0xd6,0x00,0x20,0x56,0x34,0x12,0xaf] 0x7c,0x01,0x7e,0xd4,0xff,0xd6,0x00,0x20,0x56,0x34,0x12,0xaf +# GFX12: v_cmp_class_f32_e64 null, -|0xaf123456|, vcc_hi ; encoding: [0x7c,0x01,0x7e,0xd4,0xff,0xd6,0x00,0x20,0x56,0x34,0x12,0xaf] -# W32: v_cmp_class_f64_e64 s10, v[1:2], v2 ; encoding: [0x0a,0x00,0x7f,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_class_f64_e64 s[10:11], v[1:2], v2 ; encoding: [0x0a,0x00,0x7f,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x7f,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_class_f64_e64 s10, v[1:2], v2 ; encoding: [0x0a,0x00,0x7f,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_class_f64_e64 s[10:11], v[1:2], v2 ; encoding: [0x0a,0x00,0x7f,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_class_f64_e64 s10, v[1:2], v255 ; encoding: [0x0a,0x00,0x7f,0xd4,0x01,0xff,0x03,0x00] -# W64: v_cmp_class_f64_e64 s[10:11], v[1:2], v255 ; encoding: [0x0a,0x00,0x7f,0xd4,0x01,0xff,0x03,0x00] 0x0a,0x00,0x7f,0xd4,0x01,0xff,0x03,0x00 +# W32: v_cmp_class_f64_e64 s10, v[1:2], v255 ; encoding: [0x0a,0x00,0x7f,0xd4,0x01,0xff,0x03,0x00] +# W64: v_cmp_class_f64_e64 s[10:11], v[1:2], v255 ; encoding: [0x0a,0x00,0x7f,0xd4,0x01,0xff,0x03,0x00] -# W32: v_cmp_class_f64_e64 s10, v[1:2], s2 ; encoding: 
[0x0a,0x00,0x7f,0xd4,0x01,0x05,0x00,0x00] -# W64: v_cmp_class_f64_e64 s[10:11], v[1:2], s2 ; encoding: [0x0a,0x00,0x7f,0xd4,0x01,0x05,0x00,0x00] 0x0a,0x00,0x7f,0xd4,0x01,0x05,0x00,0x00 +# W32: v_cmp_class_f64_e64 s10, v[1:2], s2 ; encoding: [0x0a,0x00,0x7f,0xd4,0x01,0x05,0x00,0x00] +# W64: v_cmp_class_f64_e64 s[10:11], v[1:2], s2 ; encoding: [0x0a,0x00,0x7f,0xd4,0x01,0x05,0x00,0x00] -# W32: v_cmp_class_f64_e64 s10, v[1:2], s105 ; encoding: [0x0a,0x00,0x7f,0xd4,0x01,0xd3,0x00,0x00] -# W64: v_cmp_class_f64_e64 s[10:11], v[1:2], s105 ; encoding: [0x0a,0x00,0x7f,0xd4,0x01,0xd3,0x00,0x00] 0x0a,0x00,0x7f,0xd4,0x01,0xd3,0x00,0x00 +# W32: v_cmp_class_f64_e64 s10, v[1:2], s105 ; encoding: [0x0a,0x00,0x7f,0xd4,0x01,0xd3,0x00,0x00] +# W64: v_cmp_class_f64_e64 s[10:11], v[1:2], s105 ; encoding: [0x0a,0x00,0x7f,0xd4,0x01,0xd3,0x00,0x00] +0x0a,0x00,0x7f,0xd4,0xfe,0xf7,0x00,0x00 # W32: v_cmp_class_f64_e64 s10, v[254:255], ttmp15 ; encoding: [0x0a,0x00,0x7f,0xd4,0xfe,0xf7,0x00,0x00] # W64: v_cmp_class_f64_e64 s[10:11], v[254:255], ttmp15 ; encoding: [0x0a,0x00,0x7f,0xd4,0xfe,0xf7,0x00,0x00] -0x0a,0x00,0x7f,0xd4,0xfe,0xf7,0x00,0x00 -# W32: v_cmp_class_f64_e64 s10, s[2:3], vcc_hi ; encoding: [0x0a,0x00,0x7f,0xd4,0x02,0xd6,0x00,0x00] -# W64: v_cmp_class_f64_e64 s[10:11], s[2:3], vcc_hi ; encoding: [0x0a,0x00,0x7f,0xd4,0x02,0xd6,0x00,0x00] 0x0a,0x00,0x7f,0xd4,0x02,0xd6,0x00,0x00 +# W32: v_cmp_class_f64_e64 s10, s[2:3], vcc_hi ; encoding: [0x0a,0x00,0x7f,0xd4,0x02,0xd6,0x00,0x00] +# W64: v_cmp_class_f64_e64 s[10:11], s[2:3], vcc_hi ; encoding: [0x0a,0x00,0x7f,0xd4,0x02,0xd6,0x00,0x00] +0x0a,0x00,0x7f,0xd4,0x68,0xd4,0x00,0x00 # W32: v_cmp_class_f64_e64 s10, s[104:105], vcc_lo ; encoding: [0x0a,0x00,0x7f,0xd4,0x68,0xd4,0x00,0x00] # W64: v_cmp_class_f64_e64 s[10:11], s[104:105], vcc_lo ; encoding: [0x0a,0x00,0x7f,0xd4,0x68,0xd4,0x00,0x00] -0x0a,0x00,0x7f,0xd4,0x68,0xd4,0x00,0x00 -# W32: v_cmp_class_f64_e64 s10, vcc, m0 ; encoding: [0x0a,0x00,0x7f,0xd4,0x6a,0xfa,0x00,0x00] -# W64: v_cmp_class_f64_e64 s[10:11], vcc, m0 ; encoding: [0x0a,0x00,0x7f,0xd4,0x6a,0xfa,0x00,0x00] 0x0a,0x00,0x7f,0xd4,0x6a,0xfa,0x00,0x00 +# W32: v_cmp_class_f64_e64 s10, vcc, m0 ; encoding: [0x0a,0x00,0x7f,0xd4,0x6a,0xfa,0x00,0x00] +# W64: v_cmp_class_f64_e64 s[10:11], vcc, m0 ; encoding: [0x0a,0x00,0x7f,0xd4,0x6a,0xfa,0x00,0x00] +0x0a,0x00,0x7f,0xd4,0x7a,0xfe,0x00,0x00 # W32: v_cmp_class_f64_e64 s10, ttmp[14:15], exec_hi ; encoding: [0x0a,0x00,0x7f,0xd4,0x7a,0xfe,0x00,0x00] # W64: v_cmp_class_f64_e64 s[10:11], ttmp[14:15], exec_hi ; encoding: [0x0a,0x00,0x7f,0xd4,0x7a,0xfe,0x00,0x00] -0x0a,0x00,0x7f,0xd4,0x7a,0xfe,0x00,0x00 -# W32: v_cmp_class_f64_e64 s10, exec, exec_lo ; encoding: [0x0a,0x00,0x7f,0xd4,0x7e,0xfc,0x00,0x00] -# W64: v_cmp_class_f64_e64 s[10:11], exec, exec_lo ; encoding: [0x0a,0x00,0x7f,0xd4,0x7e,0xfc,0x00,0x00] 0x0a,0x00,0x7f,0xd4,0x7e,0xfc,0x00,0x00 +# W32: v_cmp_class_f64_e64 s10, exec, exec_lo ; encoding: [0x0a,0x00,0x7f,0xd4,0x7e,0xfc,0x00,0x00] +# W64: v_cmp_class_f64_e64 s[10:11], exec, exec_lo ; encoding: [0x0a,0x00,0x7f,0xd4,0x7e,0xfc,0x00,0x00] -# W32: v_cmp_class_f64_e64 s10, null, null ; encoding: [0x0a,0x00,0x7f,0xd4,0x7c,0xf8,0x00,0x00] -# W64: v_cmp_class_f64_e64 s[10:11], null, null ; encoding: [0x0a,0x00,0x7f,0xd4,0x7c,0xf8,0x00,0x00] 0x0a,0x00,0x7f,0xd4,0x7c,0xf8,0x00,0x00 +# W32: v_cmp_class_f64_e64 s10, null, null ; encoding: [0x0a,0x00,0x7f,0xd4,0x7c,0xf8,0x00,0x00] +# W64: v_cmp_class_f64_e64 s[10:11], null, null ; encoding: [0x0a,0x00,0x7f,0xd4,0x7c,0xf8,0x00,0x00] -# W32: v_cmp_class_f64_e64 s104, -1, 
-1 ; encoding: [0x68,0x00,0x7f,0xd4,0xc1,0x82,0x01,0x00] -# W64: v_cmp_class_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x7f,0xd4,0xc1,0x82,0x01,0x00] 0x68,0x00,0x7f,0xd4,0xc1,0x82,0x01,0x00 +# W32: v_cmp_class_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x7f,0xd4,0xc1,0x82,0x01,0x00] +# W64: v_cmp_class_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x7f,0xd4,0xc1,0x82,0x01,0x00] -# W32: v_cmp_class_f64_e64 vcc_lo, 0.5, 0.5 ; encoding: [0x6a,0x00,0x7f,0xd4,0xf0,0xe0,0x01,0x00] -# W64: v_cmp_class_f64_e64 vcc, 0.5, 0.5 ; encoding: [0x6a,0x00,0x7f,0xd4,0xf0,0xe0,0x01,0x00] 0x6a,0x00,0x7f,0xd4,0xf0,0xe0,0x01,0x00 +# W32: v_cmp_class_f64_e64 vcc_lo, 0.5, 0.5 ; encoding: [0x6a,0x00,0x7f,0xd4,0xf0,0xe0,0x01,0x00] +# W64: v_cmp_class_f64_e64 vcc, 0.5, 0.5 ; encoding: [0x6a,0x00,0x7f,0xd4,0xf0,0xe0,0x01,0x00] +0x7a,0x01,0x7f,0xd4,0xfd,0xfa,0x01,0x20 # W32: v_cmp_class_f64_e64 ttmp14, -|src_scc|, src_scc ; encoding: [0x7a,0x01,0x7f,0xd4,0xfd,0xfa,0x01,0x20] # W64: v_cmp_class_f64_e64 ttmp[14:15], -|src_scc|, src_scc ; encoding: [0x7a,0x01,0x7f,0xd4,0xfd,0xfa,0x01,0x20] -0x7a,0x01,0x7f,0xd4,0xfd,0xfa,0x01,0x20 -# GFX12: v_cmp_class_f64_e64 null, 0xaf123456, 0xaf123456 ; encoding: [0x7c,0x00,0x7f,0xd4,0xff,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7c,0x00,0x7f,0xd4,0xff,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmp_class_f64_e64 null, 0xaf123456, 0xaf123456 ; encoding: [0x7c,0x00,0x7f,0xd4,0xff,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_eq_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x02,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_eq_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x02,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x02,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_eq_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x02,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_eq_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x02,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_eq_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x02,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_eq_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x02,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x02,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_eq_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x02,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_eq_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x02,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_eq_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x02,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_eq_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x02,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x02,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_eq_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x02,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_eq_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x02,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_eq_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x02,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_eq_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x02,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x02,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_eq_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x02,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_eq_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x02,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_eq_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x02,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_eq_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x02,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x02,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_eq_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x02,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_eq_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: 
[0x0a,0x00,0x02,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_eq_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x02,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_eq_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x02,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x0a,0x00,0x02,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_eq_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x02,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_eq_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x02,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_eq_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x02,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_eq_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x02,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x02,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_eq_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x02,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_eq_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x02,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_eq_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x02,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_eq_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x02,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x02,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_eq_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x02,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_eq_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x02,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_eq_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x02,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_eq_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x02,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x02,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_eq_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x02,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_eq_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x02,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_eq_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x02,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_eq_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x02,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x01,0x02,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_eq_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x02,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_eq_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x02,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_eq_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x02,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_eq_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x02,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x02,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_eq_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x02,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_eq_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x02,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_eq_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x02,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_eq_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x02,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x02,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_eq_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x02,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_eq_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x02,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_eq_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x02,0xd4,0xf0,0xfa,0x00,0x40] -# W64: v_cmp_eq_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x02,0xd4,0xf0,0xfa,0x00,0x40] 0x6a,0x00,0x02,0xd4,0xf0,0xfa,0x00,0x40 +# W32: v_cmp_eq_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x02,0xd4,0xf0,0xfa,0x00,0x40] +# W64: 
v_cmp_eq_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x02,0xd4,0xf0,0xfa,0x00,0x40] +0x7a,0x02,0x02,0xd4,0xfd,0xd4,0x00,0x20 # W32: v_cmp_eq_f16_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x02,0xd4,0xfd,0xd4,0x00,0x20] # W64: v_cmp_eq_f16_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x02,0xd4,0xfd,0xd4,0x00,0x20] -0x7a,0x02,0x02,0xd4,0xfd,0xd4,0x00,0x20 -# GFX12: v_cmp_eq_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x02,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7c,0x83,0x02,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmp_eq_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x02,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_eq_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x12,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_eq_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x12,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x12,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_eq_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x12,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_eq_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x12,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_eq_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x12,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_eq_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x12,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x12,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_eq_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x12,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_eq_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x12,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_eq_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x12,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_eq_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x12,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x12,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_eq_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x12,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_eq_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x12,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_eq_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x12,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_eq_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x12,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x12,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_eq_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x12,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_eq_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x12,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_eq_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x12,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_eq_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x12,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x12,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_eq_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x12,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_eq_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x12,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_eq_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x12,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W64: v_cmp_eq_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x12,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x0a,0x00,0x12,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# W32: v_cmp_eq_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x12,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] +# W64: v_cmp_eq_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x12,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_eq_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x12,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_eq_f32_e64 s[10:11], ttmp15, 
src_scc ; encoding: [0x0a,0x00,0x12,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x12,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_eq_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x12,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_eq_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x12,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_eq_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x12,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_eq_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x12,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x12,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_eq_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x12,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_eq_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x12,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_eq_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x12,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_eq_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x12,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x12,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_eq_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x12,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_eq_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x12,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_eq_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x12,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_eq_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x12,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x01,0x12,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_eq_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x12,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_eq_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x12,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_eq_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x12,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_eq_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x12,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x12,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_eq_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x12,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_eq_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x12,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_eq_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x12,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_eq_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x12,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x12,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_eq_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x12,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_eq_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x12,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_eq_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x12,0xd4,0xf0,0xfa,0x00,0x40] -# W64: v_cmp_eq_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x12,0xd4,0xf0,0xfa,0x00,0x40] 0x6a,0x00,0x12,0xd4,0xf0,0xfa,0x00,0x40 +# W32: v_cmp_eq_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x12,0xd4,0xf0,0xfa,0x00,0x40] +# W64: v_cmp_eq_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x12,0xd4,0xf0,0xfa,0x00,0x40] +0x7a,0x02,0x12,0xd4,0xfd,0xd4,0x00,0x20 # W32: v_cmp_eq_f32_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x12,0xd4,0xfd,0xd4,0x00,0x20] # W64: v_cmp_eq_f32_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x12,0xd4,0xfd,0xd4,0x00,0x20] -0x7a,0x02,0x12,0xd4,0xfd,0xd4,0x00,0x20 -# GFX12: v_cmp_eq_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x12,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] 0x7c,0x83,0x12,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf +# GFX12: v_cmp_eq_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x12,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] -# W32: 
v_cmp_eq_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x22,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_eq_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x22,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x22,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_eq_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x22,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_eq_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x22,0xd4,0x01,0x05,0x02,0x00] +0x0a,0x00,0x22,0xd4,0xfe,0xfd,0x03,0x00 # W32: v_cmp_eq_f64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x22,0xd4,0xfe,0xfd,0x03,0x00] # W64: v_cmp_eq_f64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x22,0xd4,0xfe,0xfd,0x03,0x00] -0x0a,0x00,0x22,0xd4,0xfe,0xfd,0x03,0x00 -# W32: v_cmp_eq_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x22,0xd4,0x02,0x08,0x00,0x00] -# W64: v_cmp_eq_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x22,0xd4,0x02,0x08,0x00,0x00] 0x0a,0x00,0x22,0xd4,0x02,0x08,0x00,0x00 +# W32: v_cmp_eq_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x22,0xd4,0x02,0x08,0x00,0x00] +# W64: v_cmp_eq_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x22,0xd4,0x02,0x08,0x00,0x00] +0x0a,0x00,0x22,0xd4,0x68,0xd0,0x00,0x00 # W32: v_cmp_eq_f64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x22,0xd4,0x68,0xd0,0x00,0x00] # W64: v_cmp_eq_f64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x22,0xd4,0x68,0xd0,0x00,0x00] -0x0a,0x00,0x22,0xd4,0x68,0xd0,0x00,0x00 -# W32: v_cmp_eq_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x22,0xd4,0x6a,0xf4,0x00,0x00] -# W64: v_cmp_eq_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x22,0xd4,0x6a,0xf4,0x00,0x00] 0x0a,0x00,0x22,0xd4,0x6a,0xf4,0x00,0x00 +# W32: v_cmp_eq_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x22,0xd4,0x6a,0xf4,0x00,0x00] +# W64: v_cmp_eq_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x22,0xd4,0x6a,0xf4,0x00,0x00] +0x0a,0x00,0x22,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_eq_f64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x22,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: v_cmp_eq_f64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x22,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x22,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_eq_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x22,0xd4,0x7e,0xfa,0x01,0x20] -# W64: v_cmp_eq_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x22,0xd4,0x7e,0xfa,0x01,0x20] 0x0a,0x01,0x22,0xd4,0x7e,0xfa,0x01,0x20 +# W32: v_cmp_eq_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x22,0xd4,0x7e,0xfa,0x01,0x20] +# W64: v_cmp_eq_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x22,0xd4,0x7e,0xfa,0x01,0x20] -# W32: v_cmp_eq_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x22,0xd4,0x7c,0xe0,0x01,0x00] -# W64: v_cmp_eq_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x22,0xd4,0x7c,0xe0,0x01,0x00] 0x0a,0x00,0x22,0xd4,0x7c,0xe0,0x01,0x00 +# W32: v_cmp_eq_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x22,0xd4,0x7c,0xe0,0x01,0x00] +# W64: v_cmp_eq_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x22,0xd4,0x7c,0xe0,0x01,0x00] -# W32: v_cmp_eq_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x22,0xd4,0xc1,0x82,0x01,0x00] -# W64: v_cmp_eq_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x22,0xd4,0xc1,0x82,0x01,0x00] 0x68,0x00,0x22,0xd4,0xc1,0x82,0x01,0x00 +# W32: v_cmp_eq_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x22,0xd4,0xc1,0x82,0x01,0x00] +# W64: v_cmp_eq_f64_e64 
s[104:105], -1, -1 ; encoding: [0x68,0x00,0x22,0xd4,0xc1,0x82,0x01,0x00] -# W32: v_cmp_eq_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x22,0xd4,0xf0,0xf8,0x00,0x00] -# W64: v_cmp_eq_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x22,0xd4,0xf0,0xf8,0x00,0x00] 0x6a,0x00,0x22,0xd4,0xf0,0xf8,0x00,0x00 +# W32: v_cmp_eq_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x22,0xd4,0xf0,0xf8,0x00,0x00] +# W64: v_cmp_eq_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x22,0xd4,0xf0,0xf8,0x00,0x00] +0x7a,0x03,0x22,0xd4,0xfd,0xfc,0x00,0x60 # W32: v_cmp_eq_f64_e64 ttmp14, -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x22,0xd4,0xfd,0xfc,0x00,0x60] # W64: v_cmp_eq_f64_e64 ttmp[14:15], -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x22,0xd4,0xfd,0xfc,0x00,0x60] -0x7a,0x03,0x22,0xd4,0xfd,0xfc,0x00,0x60 -# GFX12: v_cmp_eq_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x22,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7c,0x82,0x22,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf +# GFX12: v_cmp_eq_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x22,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -# W32: v_cmp_eq_i16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x32,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_eq_i16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x32,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x32,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_eq_i16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x32,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_eq_i16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x32,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_eq_i16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x32,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_eq_i16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x32,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x32,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_eq_i16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x32,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_eq_i16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x32,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_eq_i16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x32,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_eq_i16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x32,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x32,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_eq_i16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x32,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_eq_i16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x32,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_eq_i16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x32,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_eq_i16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x32,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x32,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_eq_i16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x32,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_eq_i16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x32,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_eq_i16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x32,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_eq_i16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x32,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x32,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_eq_i16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x32,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_eq_i16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x32,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_eq_i16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x32,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_eq_i16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x32,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x0a,0x00,0x32,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 
+# W32: v_cmp_eq_i16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x32,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_eq_i16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x32,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_eq_i16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x32,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_eq_i16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x32,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x32,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_eq_i16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x32,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_eq_i16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x32,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_eq_i16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x32,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] -# W64: v_cmp_eq_i16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x32,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] 0x0a,0x00,0x32,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_eq_i16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x32,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64: v_cmp_eq_i16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x32,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] -# W32: v_cmp_eq_i16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x32,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_eq_i16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x32,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x32,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_eq_i16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x32,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_eq_i16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x32,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_eq_i16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x32,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_eq_i16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x32,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x00,0x32,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_eq_i16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x32,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_eq_i16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x32,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_eq_i16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x32,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_eq_i16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x32,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x32,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_eq_i16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x32,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_eq_i16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x32,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_eq_i16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x32,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_eq_i16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x32,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x32,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_eq_i16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x32,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_eq_i16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x32,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_eq_i16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x32,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] -# W64: v_cmp_eq_i16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x32,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] 0x6a,0x00,0x32,0xd4,0xf0,0xfa,0x00,0x00 +# W32: v_cmp_eq_i16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x32,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64: v_cmp_eq_i16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x32,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] -# W32: v_cmp_eq_i16_e64 ttmp14, src_scc, vcc_lo ; encoding: 
[0x7a,0x00,0x32,0xd4,0xfd,0xd4,0x00,0x00] -# W64: v_cmp_eq_i16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x32,0xd4,0xfd,0xd4,0x00,0x00] 0x7a,0x00,0x32,0xd4,0xfd,0xd4,0x00,0x00 +# W32: v_cmp_eq_i16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x32,0xd4,0xfd,0xd4,0x00,0x00] +# W64: v_cmp_eq_i16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x32,0xd4,0xfd,0xd4,0x00,0x00] -# GFX12: v_cmp_eq_i16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x32,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7c,0x00,0x32,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmp_eq_i16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x32,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_eq_i32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x42,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_eq_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x42,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x42,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_eq_i32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x42,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_eq_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x42,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_eq_i32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x42,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_eq_i32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x42,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x42,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_eq_i32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x42,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_eq_i32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x42,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_eq_i32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x42,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_eq_i32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x42,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x42,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_eq_i32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x42,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_eq_i32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x42,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_eq_i32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x42,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_eq_i32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x42,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x42,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_eq_i32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x42,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_eq_i32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x42,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_eq_i32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x42,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_eq_i32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x42,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x42,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_eq_i32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x42,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_eq_i32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x42,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_eq_i32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x42,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W64: v_cmp_eq_i32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x42,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x0a,0x00,0x42,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# W32: v_cmp_eq_i32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x42,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] +# W64: v_cmp_eq_i32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x42,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_eq_i32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x42,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_eq_i32_e64 s[10:11], ttmp15, src_scc ; 
encoding: [0x0a,0x00,0x42,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x42,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_eq_i32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x42,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_eq_i32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x42,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_eq_i32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x42,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_eq_i32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x42,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x42,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_eq_i32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x42,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_eq_i32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x42,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_eq_i32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x42,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_eq_i32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x42,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x42,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_eq_i32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x42,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_eq_i32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x42,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_eq_i32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x42,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_eq_i32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x42,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x00,0x42,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_eq_i32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x42,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_eq_i32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x42,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_eq_i32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x42,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_eq_i32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x42,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x42,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_eq_i32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x42,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_eq_i32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x42,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_eq_i32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x42,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_eq_i32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x42,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x42,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_eq_i32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x42,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_eq_i32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x42,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_eq_i32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x42,0xd4,0xf0,0xfa,0x00,0x00] -# W64: v_cmp_eq_i32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x42,0xd4,0xf0,0xfa,0x00,0x00] 0x6a,0x00,0x42,0xd4,0xf0,0xfa,0x00,0x00 +# W32: v_cmp_eq_i32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x42,0xd4,0xf0,0xfa,0x00,0x00] +# W64: v_cmp_eq_i32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x42,0xd4,0xf0,0xfa,0x00,0x00] -# W32: v_cmp_eq_i32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x42,0xd4,0xfd,0xd4,0x00,0x00] -# W64: v_cmp_eq_i32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x42,0xd4,0xfd,0xd4,0x00,0x00] 0x7a,0x00,0x42,0xd4,0xfd,0xd4,0x00,0x00 +# W32: v_cmp_eq_i32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x42,0xd4,0xfd,0xd4,0x00,0x00] +# W64: v_cmp_eq_i32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x42,0xd4,0xfd,0xd4,0x00,0x00] -# GFX12: v_cmp_eq_i32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x42,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7c,0x00,0x42,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX12: 
v_cmp_eq_i32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x42,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_eq_i64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x52,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_eq_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x52,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x52,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_eq_i64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x52,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_eq_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x52,0xd4,0x01,0x05,0x02,0x00] +0x0a,0x00,0x52,0xd4,0xfe,0xfd,0x03,0x00 # W32: v_cmp_eq_i64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x52,0xd4,0xfe,0xfd,0x03,0x00] # W64: v_cmp_eq_i64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x52,0xd4,0xfe,0xfd,0x03,0x00] -0x0a,0x00,0x52,0xd4,0xfe,0xfd,0x03,0x00 -# W32: v_cmp_eq_i64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x52,0xd4,0x02,0x08,0x00,0x00] -# W64: v_cmp_eq_i64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x52,0xd4,0x02,0x08,0x00,0x00] 0x0a,0x00,0x52,0xd4,0x02,0x08,0x00,0x00 +# W32: v_cmp_eq_i64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x52,0xd4,0x02,0x08,0x00,0x00] +# W64: v_cmp_eq_i64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x52,0xd4,0x02,0x08,0x00,0x00] +0x0a,0x00,0x52,0xd4,0x68,0xd0,0x00,0x00 # W32: v_cmp_eq_i64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x52,0xd4,0x68,0xd0,0x00,0x00] # W64: v_cmp_eq_i64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x52,0xd4,0x68,0xd0,0x00,0x00] -0x0a,0x00,0x52,0xd4,0x68,0xd0,0x00,0x00 -# W32: v_cmp_eq_i64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x52,0xd4,0x6a,0xf4,0x00,0x00] -# W64: v_cmp_eq_i64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x52,0xd4,0x6a,0xf4,0x00,0x00] 0x0a,0x00,0x52,0xd4,0x6a,0xf4,0x00,0x00 +# W32: v_cmp_eq_i64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x52,0xd4,0x6a,0xf4,0x00,0x00] +# W64: v_cmp_eq_i64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x52,0xd4,0x6a,0xf4,0x00,0x00] +0x0a,0x00,0x52,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_eq_i64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x52,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: v_cmp_eq_i64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x52,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x52,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_eq_i64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x52,0xd4,0x7e,0xfa,0x01,0x00] -# W64: v_cmp_eq_i64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x52,0xd4,0x7e,0xfa,0x01,0x00] 0x0a,0x00,0x52,0xd4,0x7e,0xfa,0x01,0x00 +# W32: v_cmp_eq_i64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x52,0xd4,0x7e,0xfa,0x01,0x00] +# W64: v_cmp_eq_i64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x52,0xd4,0x7e,0xfa,0x01,0x00] -# W32: v_cmp_eq_i64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x52,0xd4,0x7c,0xe0,0x01,0x00] -# W64: v_cmp_eq_i64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x52,0xd4,0x7c,0xe0,0x01,0x00] 0x0a,0x00,0x52,0xd4,0x7c,0xe0,0x01,0x00 +# W32: v_cmp_eq_i64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x52,0xd4,0x7c,0xe0,0x01,0x00] +# W64: v_cmp_eq_i64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x52,0xd4,0x7c,0xe0,0x01,0x00] -# W32: v_cmp_eq_i64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x52,0xd4,0xc1,0x82,0x01,0x00] -# W64: v_cmp_eq_i64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x52,0xd4,0xc1,0x82,0x01,0x00] 0x68,0x00,0x52,0xd4,0xc1,0x82,0x01,0x00 +# W32: 
v_cmp_eq_i64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x52,0xd4,0xc1,0x82,0x01,0x00] +# W64: v_cmp_eq_i64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x52,0xd4,0xc1,0x82,0x01,0x00] -# W32: v_cmp_eq_i64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x52,0xd4,0xf0,0xf8,0x00,0x00] -# W64: v_cmp_eq_i64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x52,0xd4,0xf0,0xf8,0x00,0x00] 0x6a,0x00,0x52,0xd4,0xf0,0xf8,0x00,0x00 +# W32: v_cmp_eq_i64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x52,0xd4,0xf0,0xf8,0x00,0x00] +# W64: v_cmp_eq_i64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x52,0xd4,0xf0,0xf8,0x00,0x00] -# W32: v_cmp_eq_i64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x52,0xd4,0xfd,0xfc,0x00,0x00] -# W64: v_cmp_eq_i64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x52,0xd4,0xfd,0xfc,0x00,0x00] 0x7a,0x00,0x52,0xd4,0xfd,0xfc,0x00,0x00 +# W32: v_cmp_eq_i64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x52,0xd4,0xfd,0xfc,0x00,0x00] +# W64: v_cmp_eq_i64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x52,0xd4,0xfd,0xfc,0x00,0x00] -# GFX12: v_cmp_eq_i64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x52,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7c,0x00,0x52,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmp_eq_i64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x52,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_eq_u16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x3a,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_eq_u16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x3a,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x3a,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_eq_u16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x3a,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_eq_u16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x3a,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_eq_u16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x3a,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_eq_u16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x3a,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x3a,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_eq_u16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x3a,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_eq_u16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x3a,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_eq_u16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x3a,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_eq_u16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x3a,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x3a,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_eq_u16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x3a,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_eq_u16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x3a,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_eq_u16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x3a,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_eq_u16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x3a,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x3a,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_eq_u16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x3a,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_eq_u16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x3a,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_eq_u16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3a,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_eq_u16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3a,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x3a,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_eq_u16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3a,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_eq_u16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3a,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_eq_u16_e64 s10, vcc_hi, 0xfe0b ; encoding: 
[0x0a,0x00,0x3a,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_eq_u16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3a,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x0a,0x00,0x3a,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_eq_u16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3a,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_eq_u16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3a,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_eq_u16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x3a,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_eq_u16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x3a,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x3a,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_eq_u16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x3a,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_eq_u16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x3a,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_eq_u16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x3a,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] -# W64: v_cmp_eq_u16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x3a,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] 0x0a,0x00,0x3a,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_eq_u16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x3a,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64: v_cmp_eq_u16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x3a,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] -# W32: v_cmp_eq_u16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x3a,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_eq_u16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x3a,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x3a,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_eq_u16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x3a,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_eq_u16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x3a,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_eq_u16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x3a,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_eq_u16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x3a,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x00,0x3a,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_eq_u16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x3a,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_eq_u16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x3a,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_eq_u16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x3a,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_eq_u16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x3a,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x3a,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_eq_u16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x3a,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_eq_u16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x3a,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_eq_u16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x3a,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_eq_u16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x3a,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x3a,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_eq_u16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x3a,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_eq_u16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x3a,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_eq_u16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x3a,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] -# W64: v_cmp_eq_u16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x3a,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] 0x6a,0x00,0x3a,0xd4,0xf0,0xfa,0x00,0x00 +# W32: v_cmp_eq_u16_e64 vcc_lo, 0x3800, m0 ; encoding: 
[0x6a,0x00,0x3a,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64: v_cmp_eq_u16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x3a,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# W32: v_cmp_eq_u16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3a,0xd4,0xfd,0xd4,0x00,0x00]
-# W64: v_cmp_eq_u16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3a,0xd4,0xfd,0xd4,0x00,0x00]
 0x7a,0x00,0x3a,0xd4,0xfd,0xd4,0x00,0x00
+# W32: v_cmp_eq_u16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3a,0xd4,0xfd,0xd4,0x00,0x00]
+# W64: v_cmp_eq_u16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3a,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX12: v_cmp_eq_u16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x3a,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 0x7c,0x00,0x3a,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmp_eq_u16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x3a,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_eq_u32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x4a,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_eq_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x4a,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x4a,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_eq_u32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x4a,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_eq_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x4a,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_eq_u32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x4a,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_eq_u32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x4a,0xd4,0xff,0xff,0x03,0x00]
 0x0a,0x00,0x4a,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_eq_u32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x4a,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_eq_u32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x4a,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_eq_u32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x4a,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_eq_u32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x4a,0xd4,0x01,0x04,0x00,0x00]
 0x0a,0x00,0x4a,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_eq_u32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x4a,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_eq_u32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x4a,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_eq_u32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x4a,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_eq_u32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x4a,0xd4,0x69,0xd2,0x00,0x00]
 0x0a,0x00,0x4a,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_eq_u32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x4a,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_eq_u32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x4a,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_eq_u32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4a,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_eq_u32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4a,0xd4,0x6a,0xf6,0x00,0x00]
 0x0a,0x00,0x4a,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_eq_u32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4a,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_eq_u32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4a,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_eq_u32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4a,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_eq_u32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4a,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x0a,0x00,0x4a,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# W32: v_cmp_eq_u32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4a,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_eq_u32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4a,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_eq_u32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x4a,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_eq_u32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x4a,0xd4,0x7b,0xfa,0x01,0x00]
 0x0a,0x00,0x4a,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_eq_u32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x4a,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_eq_u32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x4a,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_eq_u32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x4a,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_eq_u32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x4a,0xd4,0x7d,0xe0,0x01,0x00]
 0x0a,0x00,0x4a,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_eq_u32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x4a,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_eq_u32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x4a,0xd4,0x7d,0xe0,0x01,0x00]
-# W32: v_cmp_eq_u32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x4a,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_eq_u32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x4a,0xd4,0x7e,0x82,0x01,0x00]
 0x0a,0x00,0x4a,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_eq_u32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x4a,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_eq_u32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x4a,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_eq_u32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x4a,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_eq_u32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x4a,0xd4,0x7f,0xf8,0x00,0x00]
 0x0a,0x00,0x4a,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_eq_u32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x4a,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_eq_u32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x4a,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_eq_u32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x4a,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_eq_u32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x4a,0xd4,0x7c,0xfc,0x00,0x00]
 0x0a,0x00,0x4a,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_eq_u32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x4a,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_eq_u32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x4a,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_eq_u32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x4a,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_eq_u32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x4a,0xd4,0xc1,0xfe,0x00,0x00]
 0x68,0x00,0x4a,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_eq_u32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x4a,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_eq_u32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x4a,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_eq_u32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x4a,0xd4,0xf0,0xfa,0x00,0x00]
-# W64: v_cmp_eq_u32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x4a,0xd4,0xf0,0xfa,0x00,0x00]
 0x6a,0x00,0x4a,0xd4,0xf0,0xfa,0x00,0x00
+# W32: v_cmp_eq_u32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x4a,0xd4,0xf0,0xfa,0x00,0x00]
+# W64: v_cmp_eq_u32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x4a,0xd4,0xf0,0xfa,0x00,0x00]
-# W32: v_cmp_eq_u32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4a,0xd4,0xfd,0xd4,0x00,0x00]
-# W64: v_cmp_eq_u32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4a,0xd4,0xfd,0xd4,0x00,0x00]
 0x7a,0x00,0x4a,0xd4,0xfd,0xd4,0x00,0x00
+# W32: v_cmp_eq_u32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4a,0xd4,0xfd,0xd4,0x00,0x00]
+# W64: v_cmp_eq_u32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4a,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX12: v_cmp_eq_u32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x4a,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7c,0x00,0x4a,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmp_eq_u32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x4a,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_eq_u64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5a,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_eq_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5a,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x5a,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_eq_u64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5a,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_eq_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5a,0xd4,0x01,0x05,0x02,0x00]
+0x0a,0x00,0x5a,0xd4,0xfe,0xfd,0x03,0x00
 # W32: v_cmp_eq_u64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x5a,0xd4,0xfe,0xfd,0x03,0x00]
 # W64: v_cmp_eq_u64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x5a,0xd4,0xfe,0xfd,0x03,0x00]
-0x0a,0x00,0x5a,0xd4,0xfe,0xfd,0x03,0x00
-# W32: v_cmp_eq_u64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5a,0xd4,0x02,0x08,0x00,0x00]
-# W64: v_cmp_eq_u64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5a,0xd4,0x02,0x08,0x00,0x00]
 0x0a,0x00,0x5a,0xd4,0x02,0x08,0x00,0x00
+# W32: v_cmp_eq_u64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5a,0xd4,0x02,0x08,0x00,0x00]
+# W64: v_cmp_eq_u64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5a,0xd4,0x02,0x08,0x00,0x00]
+0x0a,0x00,0x5a,0xd4,0x68,0xd0,0x00,0x00
 # W32: v_cmp_eq_u64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x5a,0xd4,0x68,0xd0,0x00,0x00]
 # W64: v_cmp_eq_u64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x5a,0xd4,0x68,0xd0,0x00,0x00]
-0x0a,0x00,0x5a,0xd4,0x68,0xd0,0x00,0x00
-# W32: v_cmp_eq_u64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5a,0xd4,0x6a,0xf4,0x00,0x00]
-# W64: v_cmp_eq_u64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5a,0xd4,0x6a,0xf4,0x00,0x00]
 0x0a,0x00,0x5a,0xd4,0x6a,0xf4,0x00,0x00
+# W32: v_cmp_eq_u64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5a,0xd4,0x6a,0xf4,0x00,0x00]
+# W64: v_cmp_eq_u64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5a,0xd4,0x6a,0xf4,0x00,0x00]
+0x0a,0x00,0x5a,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
 # W32: v_cmp_eq_u64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x5a,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 # W64: v_cmp_eq_u64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x5a,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x5a,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
-# W32: v_cmp_eq_u64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x5a,0xd4,0x7e,0xfa,0x01,0x00]
-# W64: v_cmp_eq_u64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x5a,0xd4,0x7e,0xfa,0x01,0x00]
 0x0a,0x00,0x5a,0xd4,0x7e,0xfa,0x01,0x00
+# W32: v_cmp_eq_u64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x5a,0xd4,0x7e,0xfa,0x01,0x00]
+# W64: v_cmp_eq_u64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x5a,0xd4,0x7e,0xfa,0x01,0x00]
-# W32: v_cmp_eq_u64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x5a,0xd4,0x7c,0xe0,0x01,0x00]
-# W64: v_cmp_eq_u64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x5a,0xd4,0x7c,0xe0,0x01,0x00]
 0x0a,0x00,0x5a,0xd4,0x7c,0xe0,0x01,0x00
+# W32: v_cmp_eq_u64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x5a,0xd4,0x7c,0xe0,0x01,0x00]
+# W64: v_cmp_eq_u64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x5a,0xd4,0x7c,0xe0,0x01,0x00]
-# W32: v_cmp_eq_u64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x5a,0xd4,0xc1,0x82,0x01,0x00]
-# W64: v_cmp_eq_u64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x5a,0xd4,0xc1,0x82,0x01,0x00]
 0x68,0x00,0x5a,0xd4,0xc1,0x82,0x01,0x00
+# W32: v_cmp_eq_u64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x5a,0xd4,0xc1,0x82,0x01,0x00]
+# W64: v_cmp_eq_u64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x5a,0xd4,0xc1,0x82,0x01,0x00]
-# W32: v_cmp_eq_u64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x5a,0xd4,0xf0,0xf8,0x00,0x00]
-# W64: v_cmp_eq_u64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x5a,0xd4,0xf0,0xf8,0x00,0x00]
 0x6a,0x00,0x5a,0xd4,0xf0,0xf8,0x00,0x00
+# W32: v_cmp_eq_u64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x5a,0xd4,0xf0,0xf8,0x00,0x00]
+# W64: v_cmp_eq_u64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x5a,0xd4,0xf0,0xf8,0x00,0x00]
-# W32: v_cmp_eq_u64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x5a,0xd4,0xfd,0xfc,0x00,0x00]
-# W64: v_cmp_eq_u64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x5a,0xd4,0xfd,0xfc,0x00,0x00]
 0x7a,0x00,0x5a,0xd4,0xfd,0xfc,0x00,0x00
+# W32: v_cmp_eq_u64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x5a,0xd4,0xfd,0xfc,0x00,0x00]
+# W64: v_cmp_eq_u64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x5a,0xd4,0xfd,0xfc,0x00,0x00]
-# GFX12: v_cmp_eq_u64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x5a,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7c,0x00,0x5a,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmp_eq_u64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x5a,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_ge_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x06,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_ge_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x06,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x06,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_ge_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x06,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_ge_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x06,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_ge_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x06,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_ge_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x06,0xd4,0xff,0xff,0x03,0x00]
 0x0a,0x00,0x06,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_ge_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x06,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_ge_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x06,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_ge_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x06,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_ge_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x06,0xd4,0x01,0x04,0x00,0x00]
 0x0a,0x00,0x06,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_ge_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x06,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_ge_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x06,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_ge_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x06,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_ge_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x06,0xd4,0x69,0xd2,0x00,0x00]
 0x0a,0x00,0x06,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_ge_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x06,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_ge_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x06,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_ge_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x06,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_ge_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x06,0xd4,0x6a,0xf6,0x00,0x00]
 0x0a,0x00,0x06,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_ge_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x06,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_ge_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x06,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_ge_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x06,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_ge_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x06,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x0a,0x00,0x06,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_ge_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x06,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_ge_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x06,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_ge_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x06,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_ge_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x06,0xd4,0x7b,0xfa,0x01,0x00]
 0x0a,0x00,0x06,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_ge_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x06,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_ge_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x06,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_ge_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x06,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_ge_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x06,0xd4,0x7d,0xe0,0x01,0x00]
 0x0a,0x00,0x06,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_ge_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x06,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_ge_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x06,0xd4,0x7d,0xe0,0x01,0x00]
-# W32: v_cmp_ge_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x06,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_ge_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x06,0xd4,0x7e,0x82,0x01,0x00]
 0x0a,0x00,0x06,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_ge_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x06,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_ge_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x06,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_ge_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x06,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_ge_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x06,0xd4,0x7f,0xf8,0x00,0x00]
 0x0a,0x01,0x06,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_ge_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x06,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_ge_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x06,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_ge_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x06,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_ge_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x06,0xd4,0x7c,0xfc,0x00,0x00]
 0x0a,0x00,0x06,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_ge_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x06,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_ge_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x06,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_ge_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x06,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_ge_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x06,0xd4,0xc1,0xfe,0x00,0x00]
 0x68,0x00,0x06,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_ge_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x06,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_ge_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x06,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_ge_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x06,0xd4,0xf0,0xfa,0x00,0x40]
-# W64: v_cmp_ge_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x06,0xd4,0xf0,0xfa,0x00,0x40]
 0x6a,0x00,0x06,0xd4,0xf0,0xfa,0x00,0x40
+# W32: v_cmp_ge_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x06,0xd4,0xf0,0xfa,0x00,0x40]
+# W64: v_cmp_ge_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x06,0xd4,0xf0,0xfa,0x00,0x40]
+0x7a,0x02,0x06,0xd4,0xfd,0xd4,0x00,0x20
 # W32: v_cmp_ge_f16_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x06,0xd4,0xfd,0xd4,0x00,0x20]
 # W64: v_cmp_ge_f16_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x06,0xd4,0xfd,0xd4,0x00,0x20]
-0x7a,0x02,0x06,0xd4,0xfd,0xd4,0x00,0x20
-# GFX12: v_cmp_ge_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x06,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
 0x7c,0x83,0x06,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmp_ge_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x06,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_ge_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x16,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_ge_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x16,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x16,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_ge_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x16,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_ge_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x16,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_ge_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x16,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_ge_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x16,0xd4,0xff,0xff,0x03,0x00]
 0x0a,0x00,0x16,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_ge_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x16,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_ge_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x16,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_ge_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x16,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_ge_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x16,0xd4,0x01,0x04,0x00,0x00]
 0x0a,0x00,0x16,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_ge_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x16,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_ge_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x16,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_ge_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x16,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_ge_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x16,0xd4,0x69,0xd2,0x00,0x00]
 0x0a,0x00,0x16,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_ge_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x16,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_ge_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x16,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_ge_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x16,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_ge_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x16,0xd4,0x6a,0xf6,0x00,0x00]
 0x0a,0x00,0x16,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_ge_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x16,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_ge_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x16,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_ge_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x16,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_ge_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x16,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x0a,0x00,0x16,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# W32: v_cmp_ge_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x16,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_ge_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x16,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_ge_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x16,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_ge_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x16,0xd4,0x7b,0xfa,0x01,0x00]
 0x0a,0x00,0x16,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_ge_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x16,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_ge_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x16,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_ge_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x16,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_ge_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x16,0xd4,0x7d,0xe0,0x01,0x00]
 0x0a,0x00,0x16,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_ge_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x16,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_ge_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x16,0xd4,0x7d,0xe0,0x01,0x00]
-# W32: v_cmp_ge_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x16,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_ge_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x16,0xd4,0x7e,0x82,0x01,0x00]
 0x0a,0x00,0x16,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_ge_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x16,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_ge_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x16,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_ge_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x16,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_ge_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x16,0xd4,0x7f,0xf8,0x00,0x00]
 0x0a,0x01,0x16,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_ge_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x16,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_ge_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x16,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_ge_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x16,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_ge_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x16,0xd4,0x7c,0xfc,0x00,0x00]
 0x0a,0x00,0x16,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_ge_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x16,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_ge_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x16,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_ge_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x16,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_ge_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x16,0xd4,0xc1,0xfe,0x00,0x00]
 0x68,0x00,0x16,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_ge_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x16,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_ge_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x16,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_ge_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x16,0xd4,0xf0,0xfa,0x00,0x40]
-# W64: v_cmp_ge_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x16,0xd4,0xf0,0xfa,0x00,0x40]
 0x6a,0x00,0x16,0xd4,0xf0,0xfa,0x00,0x40
+# W32: v_cmp_ge_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x16,0xd4,0xf0,0xfa,0x00,0x40]
+# W64: v_cmp_ge_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x16,0xd4,0xf0,0xfa,0x00,0x40]
+0x7a,0x02,0x16,0xd4,0xfd,0xd4,0x00,0x20
 # W32: v_cmp_ge_f32_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x16,0xd4,0xfd,0xd4,0x00,0x20]
 # W64: v_cmp_ge_f32_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x16,0xd4,0xfd,0xd4,0x00,0x20]
-0x7a,0x02,0x16,0xd4,0xfd,0xd4,0x00,0x20
-# GFX12: v_cmp_ge_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x16,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
 0x7c,0x83,0x16,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf
+# GFX12: v_cmp_ge_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x16,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_ge_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x26,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_ge_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x26,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x26,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_ge_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x26,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_ge_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x26,0xd4,0x01,0x05,0x02,0x00]
+0x0a,0x00,0x26,0xd4,0xfe,0xfd,0x03,0x00
 # W32: v_cmp_ge_f64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x26,0xd4,0xfe,0xfd,0x03,0x00]
 # W64: v_cmp_ge_f64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x26,0xd4,0xfe,0xfd,0x03,0x00]
-0x0a,0x00,0x26,0xd4,0xfe,0xfd,0x03,0x00
-# W32: v_cmp_ge_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x26,0xd4,0x02,0x08,0x00,0x00]
-# W64: v_cmp_ge_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x26,0xd4,0x02,0x08,0x00,0x00]
 0x0a,0x00,0x26,0xd4,0x02,0x08,0x00,0x00
+# W32: v_cmp_ge_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x26,0xd4,0x02,0x08,0x00,0x00]
+# W64: v_cmp_ge_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x26,0xd4,0x02,0x08,0x00,0x00]
+0x0a,0x00,0x26,0xd4,0x68,0xd0,0x00,0x00
 # W32: v_cmp_ge_f64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x26,0xd4,0x68,0xd0,0x00,0x00]
 # W64: v_cmp_ge_f64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x26,0xd4,0x68,0xd0,0x00,0x00]
-0x0a,0x00,0x26,0xd4,0x68,0xd0,0x00,0x00
-# W32: v_cmp_ge_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x26,0xd4,0x6a,0xf4,0x00,0x00]
-# W64: v_cmp_ge_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x26,0xd4,0x6a,0xf4,0x00,0x00]
 0x0a,0x00,0x26,0xd4,0x6a,0xf4,0x00,0x00
+# W32: v_cmp_ge_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x26,0xd4,0x6a,0xf4,0x00,0x00]
+# W64: v_cmp_ge_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x26,0xd4,0x6a,0xf4,0x00,0x00]
+0x0a,0x00,0x26,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
 # W32: v_cmp_ge_f64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x26,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 # W64: v_cmp_ge_f64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x26,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x26,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
-# W32: v_cmp_ge_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x26,0xd4,0x7e,0xfa,0x01,0x20]
-# W64: v_cmp_ge_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x26,0xd4,0x7e,0xfa,0x01,0x20]
 0x0a,0x01,0x26,0xd4,0x7e,0xfa,0x01,0x20
+# W32: v_cmp_ge_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x26,0xd4,0x7e,0xfa,0x01,0x20]
+# W64: v_cmp_ge_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x26,0xd4,0x7e,0xfa,0x01,0x20]
-# W32: v_cmp_ge_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x26,0xd4,0x7c,0xe0,0x01,0x00]
-# W64: v_cmp_ge_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x26,0xd4,0x7c,0xe0,0x01,0x00]
 0x0a,0x00,0x26,0xd4,0x7c,0xe0,0x01,0x00
+# W32: v_cmp_ge_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x26,0xd4,0x7c,0xe0,0x01,0x00]
+# W64: v_cmp_ge_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x26,0xd4,0x7c,0xe0,0x01,0x00]
-# W32: v_cmp_ge_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x26,0xd4,0xc1,0x82,0x01,0x00]
-# W64: v_cmp_ge_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x26,0xd4,0xc1,0x82,0x01,0x00]
 0x68,0x00,0x26,0xd4,0xc1,0x82,0x01,0x00
+# W32: v_cmp_ge_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x26,0xd4,0xc1,0x82,0x01,0x00]
+# W64: v_cmp_ge_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x26,0xd4,0xc1,0x82,0x01,0x00]
-# W32: v_cmp_ge_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x26,0xd4,0xf0,0xf8,0x00,0x00]
-# W64: v_cmp_ge_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x26,0xd4,0xf0,0xf8,0x00,0x00]
 0x6a,0x00,0x26,0xd4,0xf0,0xf8,0x00,0x00
+# W32: v_cmp_ge_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x26,0xd4,0xf0,0xf8,0x00,0x00]
+# W64: v_cmp_ge_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x26,0xd4,0xf0,0xf8,0x00,0x00]
+0x7a,0x03,0x26,0xd4,0xfd,0xfc,0x00,0x60
 # W32: v_cmp_ge_f64_e64 ttmp14, -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x26,0xd4,0xfd,0xfc,0x00,0x60]
 # W64: v_cmp_ge_f64_e64 ttmp[14:15], -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x26,0xd4,0xfd,0xfc,0x00,0x60]
-0x7a,0x03,0x26,0xd4,0xfd,0xfc,0x00,0x60
-# GFX12: v_cmp_ge_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x26,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
 0x7c,0x82,0x26,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf
+# GFX12: v_cmp_ge_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x26,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_ge_i16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x36,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_ge_i16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x36,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x36,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_ge_i16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x36,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_ge_i16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x36,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_ge_i16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x36,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_ge_i16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x36,0xd4,0xff,0xff,0x03,0x00]
 0x0a,0x00,0x36,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_ge_i16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x36,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_ge_i16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x36,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_ge_i16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x36,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_ge_i16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x36,0xd4,0x01,0x04,0x00,0x00]
 0x0a,0x00,0x36,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_ge_i16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x36,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_ge_i16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x36,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_ge_i16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x36,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_ge_i16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x36,0xd4,0x69,0xd2,0x00,0x00]
 0x0a,0x00,0x36,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_ge_i16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x36,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_ge_i16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x36,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_ge_i16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x36,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_ge_i16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x36,0xd4,0x6a,0xf6,0x00,0x00]
 0x0a,0x00,0x36,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_ge_i16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x36,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_ge_i16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x36,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_ge_i16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x36,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_ge_i16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x36,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x0a,0x00,0x36,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_ge_i16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x36,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_ge_i16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x36,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_ge_i16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x36,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_ge_i16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x36,0xd4,0x7b,0xfa,0x01,0x00]
 0x0a,0x00,0x36,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_ge_i16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x36,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_ge_i16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x36,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_ge_i16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x36,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# W64: v_cmp_ge_i16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x36,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
 0x0a,0x00,0x36,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_ge_i16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x36,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64: v_cmp_ge_i16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x36,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# W32: v_cmp_ge_i16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x36,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_ge_i16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x36,0xd4,0x7e,0x82,0x01,0x00]
 0x0a,0x00,0x36,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_ge_i16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x36,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_ge_i16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x36,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_ge_i16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x36,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_ge_i16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x36,0xd4,0x7f,0xf8,0x00,0x00]
 0x0a,0x00,0x36,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_ge_i16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x36,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_ge_i16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x36,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_ge_i16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x36,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_ge_i16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x36,0xd4,0x7c,0xfc,0x00,0x00]
 0x0a,0x00,0x36,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_ge_i16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x36,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_ge_i16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x36,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_ge_i16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x36,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_ge_i16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x36,0xd4,0xc1,0xfe,0x00,0x00]
 0x68,0x00,0x36,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_ge_i16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x36,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_ge_i16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x36,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_ge_i16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x36,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# W64: v_cmp_ge_i16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x36,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
 0x6a,0x00,0x36,0xd4,0xf0,0xfa,0x00,0x00
+# W32: v_cmp_ge_i16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x36,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64: v_cmp_ge_i16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x36,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# W32: v_cmp_ge_i16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x36,0xd4,0xfd,0xd4,0x00,0x00]
-# W64: v_cmp_ge_i16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x36,0xd4,0xfd,0xd4,0x00,0x00]
 0x7a,0x00,0x36,0xd4,0xfd,0xd4,0x00,0x00
+# W32: v_cmp_ge_i16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x36,0xd4,0xfd,0xd4,0x00,0x00]
+# W64: v_cmp_ge_i16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x36,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX12: v_cmp_ge_i16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x36,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 0x7c,0x00,0x36,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmp_ge_i16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x36,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_ge_i32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x46,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_ge_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x46,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x46,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_ge_i32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x46,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_ge_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x46,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_ge_i32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x46,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_ge_i32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x46,0xd4,0xff,0xff,0x03,0x00]
 0x0a,0x00,0x46,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_ge_i32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x46,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_ge_i32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x46,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_ge_i32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x46,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_ge_i32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x46,0xd4,0x01,0x04,0x00,0x00]
 0x0a,0x00,0x46,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_ge_i32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x46,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_ge_i32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x46,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_ge_i32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x46,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_ge_i32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x46,0xd4,0x69,0xd2,0x00,0x00]
 0x0a,0x00,0x46,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_ge_i32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x46,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_ge_i32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x46,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_ge_i32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x46,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_ge_i32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x46,0xd4,0x6a,0xf6,0x00,0x00]
 0x0a,0x00,0x46,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_ge_i32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x46,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_ge_i32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x46,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_ge_i32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x46,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_ge_i32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x46,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x0a,0x00,0x46,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# W32: v_cmp_ge_i32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x46,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_ge_i32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x46,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_ge_i32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x46,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_ge_i32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x46,0xd4,0x7b,0xfa,0x01,0x00]
 0x0a,0x00,0x46,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_ge_i32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x46,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_ge_i32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x46,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_ge_i32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x46,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_ge_i32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x46,0xd4,0x7d,0xe0,0x01,0x00]
 0x0a,0x00,0x46,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_ge_i32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x46,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_ge_i32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x46,0xd4,0x7d,0xe0,0x01,0x00]
-# W32: v_cmp_ge_i32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x46,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_ge_i32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x46,0xd4,0x7e,0x82,0x01,0x00]
 0x0a,0x00,0x46,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_ge_i32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x46,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_ge_i32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x46,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_ge_i32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x46,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_ge_i32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x46,0xd4,0x7f,0xf8,0x00,0x00]
 0x0a,0x00,0x46,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_ge_i32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x46,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_ge_i32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x46,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_ge_i32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x46,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_ge_i32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x46,0xd4,0x7c,0xfc,0x00,0x00]
 0x0a,0x00,0x46,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_ge_i32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x46,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_ge_i32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x46,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_ge_i32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x46,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_ge_i32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x46,0xd4,0xc1,0xfe,0x00,0x00]
 0x68,0x00,0x46,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_ge_i32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x46,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_ge_i32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x46,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_ge_i32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x46,0xd4,0xf0,0xfa,0x00,0x00]
-# W64: v_cmp_ge_i32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x46,0xd4,0xf0,0xfa,0x00,0x00]
 0x6a,0x00,0x46,0xd4,0xf0,0xfa,0x00,0x00
+# W32: v_cmp_ge_i32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x46,0xd4,0xf0,0xfa,0x00,0x00]
+# W64: v_cmp_ge_i32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x46,0xd4,0xf0,0xfa,0x00,0x00]
-# W32: v_cmp_ge_i32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x46,0xd4,0xfd,0xd4,0x00,0x00]
-# W64: v_cmp_ge_i32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x46,0xd4,0xfd,0xd4,0x00,0x00]
 0x7a,0x00,0x46,0xd4,0xfd,0xd4,0x00,0x00
+# W32: v_cmp_ge_i32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x46,0xd4,0xfd,0xd4,0x00,0x00]
+# W64: v_cmp_ge_i32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x46,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX12: v_cmp_ge_i32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x46,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7c,0x00,0x46,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmp_ge_i32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x46,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_ge_i64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x56,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_ge_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x56,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x56,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_ge_i64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x56,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_ge_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x56,0xd4,0x01,0x05,0x02,0x00]
+0x0a,0x00,0x56,0xd4,0xfe,0xfd,0x03,0x00
 # W32: v_cmp_ge_i64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x56,0xd4,0xfe,0xfd,0x03,0x00]
 # W64: v_cmp_ge_i64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x56,0xd4,0xfe,0xfd,0x03,0x00]
-0x0a,0x00,0x56,0xd4,0xfe,0xfd,0x03,0x00
-# W32: v_cmp_ge_i64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x56,0xd4,0x02,0x08,0x00,0x00]
-# W64: v_cmp_ge_i64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x56,0xd4,0x02,0x08,0x00,0x00]
 0x0a,0x00,0x56,0xd4,0x02,0x08,0x00,0x00
+# W32: v_cmp_ge_i64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x56,0xd4,0x02,0x08,0x00,0x00]
+# W64: v_cmp_ge_i64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x56,0xd4,0x02,0x08,0x00,0x00]
+0x0a,0x00,0x56,0xd4,0x68,0xd0,0x00,0x00
 # W32: v_cmp_ge_i64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x56,0xd4,0x68,0xd0,0x00,0x00]
 # W64: v_cmp_ge_i64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x56,0xd4,0x68,0xd0,0x00,0x00]
-0x0a,0x00,0x56,0xd4,0x68,0xd0,0x00,0x00
-# W32: v_cmp_ge_i64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x56,0xd4,0x6a,0xf4,0x00,0x00]
-# W64: v_cmp_ge_i64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x56,0xd4,0x6a,0xf4,0x00,0x00]
 0x0a,0x00,0x56,0xd4,0x6a,0xf4,0x00,0x00
+# W32: v_cmp_ge_i64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x56,0xd4,0x6a,0xf4,0x00,0x00]
+# W64: v_cmp_ge_i64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x56,0xd4,0x6a,0xf4,0x00,0x00]
+0x0a,0x00,0x56,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
 # W32: v_cmp_ge_i64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x56,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 # W64: v_cmp_ge_i64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x56,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x56,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
-# W32: v_cmp_ge_i64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x56,0xd4,0x7e,0xfa,0x01,0x00]
-# W64: v_cmp_ge_i64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x56,0xd4,0x7e,0xfa,0x01,0x00]
 0x0a,0x00,0x56,0xd4,0x7e,0xfa,0x01,0x00
+# W32: v_cmp_ge_i64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x56,0xd4,0x7e,0xfa,0x01,0x00]
+# W64: v_cmp_ge_i64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x56,0xd4,0x7e,0xfa,0x01,0x00]
-# W32: v_cmp_ge_i64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x56,0xd4,0x7c,0xe0,0x01,0x00]
-# W64: v_cmp_ge_i64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x56,0xd4,0x7c,0xe0,0x01,0x00]
 0x0a,0x00,0x56,0xd4,0x7c,0xe0,0x01,0x00
+# W32: v_cmp_ge_i64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x56,0xd4,0x7c,0xe0,0x01,0x00]
+# W64: v_cmp_ge_i64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x56,0xd4,0x7c,0xe0,0x01,0x00]
-# W32: v_cmp_ge_i64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x56,0xd4,0xc1,0x82,0x01,0x00]
-# W64: v_cmp_ge_i64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x56,0xd4,0xc1,0x82,0x01,0x00]
 0x68,0x00,0x56,0xd4,0xc1,0x82,0x01,0x00
+# W32: v_cmp_ge_i64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x56,0xd4,0xc1,0x82,0x01,0x00]
+# W64: v_cmp_ge_i64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x56,0xd4,0xc1,0x82,0x01,0x00]
-# W32: v_cmp_ge_i64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x56,0xd4,0xf0,0xf8,0x00,0x00]
-# W64: v_cmp_ge_i64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x56,0xd4,0xf0,0xf8,0x00,0x00]
 0x6a,0x00,0x56,0xd4,0xf0,0xf8,0x00,0x00
+# W32: v_cmp_ge_i64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x56,0xd4,0xf0,0xf8,0x00,0x00]
+# W64: v_cmp_ge_i64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x56,0xd4,0xf0,0xf8,0x00,0x00]
-# W32: v_cmp_ge_i64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x56,0xd4,0xfd,0xfc,0x00,0x00]
-# W64: v_cmp_ge_i64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x56,0xd4,0xfd,0xfc,0x00,0x00]
 0x7a,0x00,0x56,0xd4,0xfd,0xfc,0x00,0x00
+# W32: v_cmp_ge_i64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x56,0xd4,0xfd,0xfc,0x00,0x00]
+# W64: v_cmp_ge_i64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x56,0xd4,0xfd,0xfc,0x00,0x00]
-# GFX12: v_cmp_ge_i64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x56,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7c,0x00,0x56,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmp_ge_i64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x56,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_ge_u16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x3e,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_ge_u16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x3e,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x3e,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_ge_u16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x3e,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_ge_u16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x3e,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_ge_u16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x3e,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_ge_u16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x3e,0xd4,0xff,0xff,0x03,0x00]
 0x0a,0x00,0x3e,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_ge_u16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x3e,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_ge_u16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x3e,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_ge_u16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x3e,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_ge_u16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x3e,0xd4,0x01,0x04,0x00,0x00]
 0x0a,0x00,0x3e,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_ge_u16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x3e,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_ge_u16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x3e,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_ge_u16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x3e,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_ge_u16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x3e,0xd4,0x69,0xd2,0x00,0x00]
 0x0a,0x00,0x3e,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_ge_u16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x3e,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_ge_u16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x3e,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_ge_u16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3e,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_ge_u16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3e,0xd4,0x6a,0xf6,0x00,0x00]
 0x0a,0x00,0x3e,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_ge_u16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3e,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_ge_u16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3e,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_ge_u16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3e,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_ge_u16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3e,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x0a,0x00,0x3e,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_ge_u16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3e,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_ge_u16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3e,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_ge_u16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x3e,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_ge_u16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x3e,0xd4,0x7b,0xfa,0x01,0x00]
 0x0a,0x00,0x3e,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_ge_u16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x3e,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_ge_u16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x3e,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_ge_u16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x3e,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# W64: v_cmp_ge_u16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x3e,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
 0x0a,0x00,0x3e,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_ge_u16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x3e,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64: v_cmp_ge_u16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x3e,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# W32: v_cmp_ge_u16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x3e,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_ge_u16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x3e,0xd4,0x7e,0x82,0x01,0x00]
 0x0a,0x00,0x3e,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_ge_u16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x3e,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_ge_u16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x3e,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_ge_u16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x3e,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_ge_u16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x3e,0xd4,0x7f,0xf8,0x00,0x00]
 0x0a,0x00,0x3e,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_ge_u16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x3e,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_ge_u16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x3e,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_ge_u16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x3e,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_ge_u16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x3e,0xd4,0x7c,0xfc,0x00,0x00]
 0x0a,0x00,0x3e,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_ge_u16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x3e,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_ge_u16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x3e,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_ge_u16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x3e,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_ge_u16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x3e,0xd4,0xc1,0xfe,0x00,0x00]
 0x68,0x00,0x3e,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_ge_u16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x3e,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_ge_u16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x3e,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_ge_u16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x3e,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# W64: v_cmp_ge_u16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x3e,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
 0x6a,0x00,0x3e,0xd4,0xf0,0xfa,0x00,0x00
+# W32: v_cmp_ge_u16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x3e,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64: v_cmp_ge_u16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x3e,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# W32: v_cmp_ge_u16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3e,0xd4,0xfd,0xd4,0x00,0x00]
-# W64: v_cmp_ge_u16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3e,0xd4,0xfd,0xd4,0x00,0x00]
 0x7a,0x00,0x3e,0xd4,0xfd,0xd4,0x00,0x00
+# W32: v_cmp_ge_u16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3e,0xd4,0xfd,0xd4,0x00,0x00]
+# W64: v_cmp_ge_u16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3e,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX12: v_cmp_ge_u16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x3e,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 0x7c,0x00,0x3e,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmp_ge_u16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x3e,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_ge_u32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x4e,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_ge_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x4e,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x4e,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_ge_u32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x4e,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_ge_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x4e,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_ge_u32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x4e,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_ge_u32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x4e,0xd4,0xff,0xff,0x03,0x00]
 0x0a,0x00,0x4e,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_ge_u32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x4e,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_ge_u32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x4e,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_ge_u32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x4e,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_ge_u32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x4e,0xd4,0x01,0x04,0x00,0x00]
 0x0a,0x00,0x4e,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_ge_u32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x4e,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_ge_u32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x4e,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_ge_u32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x4e,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_ge_u32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x4e,0xd4,0x69,0xd2,0x00,0x00]
 0x0a,0x00,0x4e,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_ge_u32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x4e,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_ge_u32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x4e,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_ge_u32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4e,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_ge_u32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4e,0xd4,0x6a,0xf6,0x00,0x00]
 0x0a,0x00,0x4e,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_ge_u32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4e,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_ge_u32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4e,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_ge_u32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4e,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_ge_u32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4e,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x0a,0x00,0x4e,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# W32: v_cmp_ge_u32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4e,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_ge_u32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4e,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_ge_u32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x4e,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_ge_u32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x4e,0xd4,0x7b,0xfa,0x01,0x00]
 0x0a,0x00,0x4e,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_ge_u32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x4e,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_ge_u32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x4e,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_ge_u32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x4e,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_ge_u32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x4e,0xd4,0x7d,0xe0,0x01,0x00]
 0x0a,0x00,0x4e,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_ge_u32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x4e,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_ge_u32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x4e,0xd4,0x7d,0xe0,0x01,0x00]
-# W32: v_cmp_ge_u32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x4e,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_ge_u32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x4e,0xd4,0x7e,0x82,0x01,0x00]
 0x0a,0x00,0x4e,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_ge_u32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x4e,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_ge_u32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x4e,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_ge_u32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x4e,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_ge_u32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x4e,0xd4,0x7f,0xf8,0x00,0x00]
 0x0a,0x00,0x4e,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_ge_u32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x4e,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_ge_u32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x4e,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_ge_u32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x4e,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_ge_u32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x4e,0xd4,0x7c,0xfc,0x00,0x00]
 0x0a,0x00,0x4e,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_ge_u32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x4e,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_ge_u32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x4e,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_ge_u32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x4e,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_ge_u32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x4e,0xd4,0xc1,0xfe,0x00,0x00]
 0x68,0x00,0x4e,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_ge_u32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x4e,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_ge_u32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x4e,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_ge_u32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x4e,0xd4,0xf0,0xfa,0x00,0x00]
-# W64: v_cmp_ge_u32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x4e,0xd4,0xf0,0xfa,0x00,0x00]
 0x6a,0x00,0x4e,0xd4,0xf0,0xfa,0x00,0x00
+# W32: v_cmp_ge_u32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x4e,0xd4,0xf0,0xfa,0x00,0x00]
+# W64: v_cmp_ge_u32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x4e,0xd4,0xf0,0xfa,0x00,0x00]
-# W32: v_cmp_ge_u32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4e,0xd4,0xfd,0xd4,0x00,0x00]
-# W64: v_cmp_ge_u32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4e,0xd4,0xfd,0xd4,0x00,0x00]
 0x7a,0x00,0x4e,0xd4,0xfd,0xd4,0x00,0x00
+# W32: v_cmp_ge_u32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4e,0xd4,0xfd,0xd4,0x00,0x00]
+# W64: v_cmp_ge_u32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4e,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX12: v_cmp_ge_u32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x4e,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7c,0x00,0x4e,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmp_ge_u32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x4e,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_ge_u64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5e,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_ge_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5e,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x5e,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_ge_u64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5e,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_ge_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5e,0xd4,0x01,0x05,0x02,0x00]
+0x0a,0x00,0x5e,0xd4,0xfe,0xfd,0x03,0x00
 # W32: v_cmp_ge_u64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x5e,0xd4,0xfe,0xfd,0x03,0x00]
 # W64: v_cmp_ge_u64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x5e,0xd4,0xfe,0xfd,0x03,0x00]
-0x0a,0x00,0x5e,0xd4,0xfe,0xfd,0x03,0x00
-# W32: v_cmp_ge_u64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5e,0xd4,0x02,0x08,0x00,0x00]
-# W64: v_cmp_ge_u64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5e,0xd4,0x02,0x08,0x00,0x00]
 0x0a,0x00,0x5e,0xd4,0x02,0x08,0x00,0x00
+# W32: v_cmp_ge_u64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5e,0xd4,0x02,0x08,0x00,0x00]
+# W64: v_cmp_ge_u64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5e,0xd4,0x02,0x08,0x00,0x00]
+0x0a,0x00,0x5e,0xd4,0x68,0xd0,0x00,0x00
 # W32: v_cmp_ge_u64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x5e,0xd4,0x68,0xd0,0x00,0x00]
 # W64: v_cmp_ge_u64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x5e,0xd4,0x68,0xd0,0x00,0x00]
-0x0a,0x00,0x5e,0xd4,0x68,0xd0,0x00,0x00
-# W32: v_cmp_ge_u64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5e,0xd4,0x6a,0xf4,0x00,0x00]
-# W64: v_cmp_ge_u64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5e,0xd4,0x6a,0xf4,0x00,0x00]
 0x0a,0x00,0x5e,0xd4,0x6a,0xf4,0x00,0x00
+# W32: v_cmp_ge_u64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5e,0xd4,0x6a,0xf4,0x00,0x00]
+# W64: v_cmp_ge_u64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5e,0xd4,0x6a,0xf4,0x00,0x00]
+0x0a,0x00,0x5e,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
 # W32: v_cmp_ge_u64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x5e,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 # W64: v_cmp_ge_u64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x5e,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x5e,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
-# W32: v_cmp_ge_u64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x5e,0xd4,0x7e,0xfa,0x01,0x00]
-# W64: v_cmp_ge_u64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x5e,0xd4,0x7e,0xfa,0x01,0x00]
 0x0a,0x00,0x5e,0xd4,0x7e,0xfa,0x01,0x00
+# W32: v_cmp_ge_u64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x5e,0xd4,0x7e,0xfa,0x01,0x00]
+# W64: v_cmp_ge_u64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x5e,0xd4,0x7e,0xfa,0x01,0x00]
-# W32: v_cmp_ge_u64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x5e,0xd4,0x7c,0xe0,0x01,0x00]
-# W64: v_cmp_ge_u64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x5e,0xd4,0x7c,0xe0,0x01,0x00]
 0x0a,0x00,0x5e,0xd4,0x7c,0xe0,0x01,0x00
+# W32: v_cmp_ge_u64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x5e,0xd4,0x7c,0xe0,0x01,0x00]
+# W64: v_cmp_ge_u64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x5e,0xd4,0x7c,0xe0,0x01,0x00]
-# W32: v_cmp_ge_u64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x5e,0xd4,0xc1,0x82,0x01,0x00]
-# W64: v_cmp_ge_u64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x5e,0xd4,0xc1,0x82,0x01,0x00]
 0x68,0x00,0x5e,0xd4,0xc1,0x82,0x01,0x00
+# W32: v_cmp_ge_u64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x5e,0xd4,0xc1,0x82,0x01,0x00]
+# W64: v_cmp_ge_u64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x5e,0xd4,0xc1,0x82,0x01,0x00]
-# W32: v_cmp_ge_u64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x5e,0xd4,0xf0,0xf8,0x00,0x00]
-# W64: v_cmp_ge_u64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x5e,0xd4,0xf0,0xf8,0x00,0x00]
 0x6a,0x00,0x5e,0xd4,0xf0,0xf8,0x00,0x00
+# W32: v_cmp_ge_u64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x5e,0xd4,0xf0,0xf8,0x00,0x00]
+# W64: v_cmp_ge_u64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x5e,0xd4,0xf0,0xf8,0x00,0x00]
-# W32: v_cmp_ge_u64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x5e,0xd4,0xfd,0xfc,0x00,0x00]
-# W64: v_cmp_ge_u64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x5e,0xd4,0xfd,0xfc,0x00,0x00]
 0x7a,0x00,0x5e,0xd4,0xfd,0xfc,0x00,0x00
+# W32: v_cmp_ge_u64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x5e,0xd4,0xfd,0xfc,0x00,0x00]
+# W64: v_cmp_ge_u64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x5e,0xd4,0xfd,0xfc,0x00,0x00]
-# GFX12: v_cmp_ge_u64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x5e,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7c,0x00,0x5e,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmp_ge_u64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x5e,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_gt_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x04,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_gt_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x04,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x04,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_gt_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x04,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_gt_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x04,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_gt_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x04,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_gt_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x04,0xd4,0xff,0xff,0x03,0x00]
 0x0a,0x00,0x04,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_gt_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x04,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_gt_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x04,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_gt_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x04,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_gt_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x04,0xd4,0x01,0x04,0x00,0x00]
 0x0a,0x00,0x04,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_gt_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x04,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_gt_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x04,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_gt_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x04,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_gt_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x04,0xd4,0x69,0xd2,0x00,0x00]
 0x0a,0x00,0x04,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_gt_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x04,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_gt_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x04,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_gt_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x04,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_gt_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x04,0xd4,0x6a,0xf6,0x00,0x00]
 0x0a,0x00,0x04,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_gt_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x04,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_gt_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x04,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_gt_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x04,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_gt_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x04,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x0a,0x00,0x04,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_gt_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x04,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_gt_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x04,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_gt_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x04,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_gt_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x04,0xd4,0x7b,0xfa,0x01,0x00]
 0x0a,0x00,0x04,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_gt_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x04,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_gt_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x04,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_gt_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x04,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_gt_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x04,0xd4,0x7d,0xe0,0x01,0x00]
 0x0a,0x00,0x04,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_gt_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x04,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_gt_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x04,0xd4,0x7d,0xe0,0x01,0x00]
-# W32: v_cmp_gt_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x04,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_gt_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x04,0xd4,0x7e,0x82,0x01,0x00]
 0x0a,0x00,0x04,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_gt_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x04,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_gt_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x04,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_gt_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x04,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_gt_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x04,0xd4,0x7f,0xf8,0x00,0x00]
 0x0a,0x01,0x04,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_gt_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x04,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_gt_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x04,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_gt_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x04,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_gt_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x04,0xd4,0x7c,0xfc,0x00,0x00]
 0x0a,0x00,0x04,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_gt_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x04,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_gt_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x04,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_gt_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x04,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_gt_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x04,0xd4,0xc1,0xfe,0x00,0x00]
 0x68,0x00,0x04,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_gt_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x04,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_gt_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x04,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_gt_f16_e64 vcc_lo,
0.5, -m0 ; encoding: [0x6a,0x00,0x04,0xd4,0xf0,0xfa,0x00,0x40] -# W64: v_cmp_gt_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x04,0xd4,0xf0,0xfa,0x00,0x40] 0x6a,0x00,0x04,0xd4,0xf0,0xfa,0x00,0x40 +# W32: v_cmp_gt_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x04,0xd4,0xf0,0xfa,0x00,0x40] +# W64: v_cmp_gt_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x04,0xd4,0xf0,0xfa,0x00,0x40] +0x7a,0x02,0x04,0xd4,0xfd,0xd4,0x00,0x20 # W32: v_cmp_gt_f16_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x04,0xd4,0xfd,0xd4,0x00,0x20] # W64: v_cmp_gt_f16_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x04,0xd4,0xfd,0xd4,0x00,0x20] -0x7a,0x02,0x04,0xd4,0xfd,0xd4,0x00,0x20 -# GFX12: v_cmp_gt_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x04,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7c,0x83,0x04,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmp_gt_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x04,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_gt_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x14,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_gt_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x14,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x14,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_gt_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x14,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_gt_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x14,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_gt_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x14,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_gt_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x14,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x14,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_gt_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x14,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_gt_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x14,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_gt_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x14,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_gt_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x14,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x14,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_gt_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x14,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_gt_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x14,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_gt_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x14,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_gt_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x14,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x14,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_gt_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x14,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_gt_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x14,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_gt_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x14,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_gt_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x14,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x14,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_gt_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x14,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_gt_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x14,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_gt_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x14,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W64: v_cmp_gt_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x14,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x0a,0x00,0x14,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# W32: v_cmp_gt_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: 
[0x0a,0x00,0x14,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] +# W64: v_cmp_gt_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x14,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_gt_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x14,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_gt_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x14,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x14,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_gt_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x14,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_gt_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x14,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_gt_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x14,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_gt_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x14,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x14,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_gt_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x14,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_gt_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x14,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_gt_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x14,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_gt_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x14,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x14,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_gt_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x14,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_gt_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x14,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_gt_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x14,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_gt_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x14,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x01,0x14,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_gt_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x14,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_gt_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x14,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_gt_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x14,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_gt_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x14,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x14,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_gt_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x14,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_gt_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x14,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_gt_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x14,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_gt_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x14,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x14,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_gt_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x14,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_gt_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x14,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_gt_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x14,0xd4,0xf0,0xfa,0x00,0x40] -# W64: v_cmp_gt_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x14,0xd4,0xf0,0xfa,0x00,0x40] 0x6a,0x00,0x14,0xd4,0xf0,0xfa,0x00,0x40 +# W32: v_cmp_gt_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x14,0xd4,0xf0,0xfa,0x00,0x40] +# W64: v_cmp_gt_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x14,0xd4,0xf0,0xfa,0x00,0x40] +0x7a,0x02,0x14,0xd4,0xfd,0xd4,0x00,0x20 # W32: v_cmp_gt_f32_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x14,0xd4,0xfd,0xd4,0x00,0x20] # W64: v_cmp_gt_f32_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x14,0xd4,0xfd,0xd4,0x00,0x20] -0x7a,0x02,0x14,0xd4,0xfd,0xd4,0x00,0x20 -# GFX12: 
v_cmp_gt_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x14,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] 0x7c,0x83,0x14,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf +# GFX12: v_cmp_gt_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x14,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] -# W32: v_cmp_gt_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x24,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_gt_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x24,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x24,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_gt_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x24,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_gt_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x24,0xd4,0x01,0x05,0x02,0x00] +0x0a,0x00,0x24,0xd4,0xfe,0xfd,0x03,0x00 # W32: v_cmp_gt_f64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x24,0xd4,0xfe,0xfd,0x03,0x00] # W64: v_cmp_gt_f64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x24,0xd4,0xfe,0xfd,0x03,0x00] -0x0a,0x00,0x24,0xd4,0xfe,0xfd,0x03,0x00 -# W32: v_cmp_gt_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x24,0xd4,0x02,0x08,0x00,0x00] -# W64: v_cmp_gt_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x24,0xd4,0x02,0x08,0x00,0x00] 0x0a,0x00,0x24,0xd4,0x02,0x08,0x00,0x00 +# W32: v_cmp_gt_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x24,0xd4,0x02,0x08,0x00,0x00] +# W64: v_cmp_gt_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x24,0xd4,0x02,0x08,0x00,0x00] +0x0a,0x00,0x24,0xd4,0x68,0xd0,0x00,0x00 # W32: v_cmp_gt_f64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x24,0xd4,0x68,0xd0,0x00,0x00] # W64: v_cmp_gt_f64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x24,0xd4,0x68,0xd0,0x00,0x00] -0x0a,0x00,0x24,0xd4,0x68,0xd0,0x00,0x00 -# W32: v_cmp_gt_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x24,0xd4,0x6a,0xf4,0x00,0x00] -# W64: v_cmp_gt_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x24,0xd4,0x6a,0xf4,0x00,0x00] 0x0a,0x00,0x24,0xd4,0x6a,0xf4,0x00,0x00 +# W32: v_cmp_gt_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x24,0xd4,0x6a,0xf4,0x00,0x00] +# W64: v_cmp_gt_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x24,0xd4,0x6a,0xf4,0x00,0x00] +0x0a,0x00,0x24,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_gt_f64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x24,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: v_cmp_gt_f64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x24,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x24,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_gt_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x24,0xd4,0x7e,0xfa,0x01,0x20] -# W64: v_cmp_gt_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x24,0xd4,0x7e,0xfa,0x01,0x20] 0x0a,0x01,0x24,0xd4,0x7e,0xfa,0x01,0x20 +# W32: v_cmp_gt_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x24,0xd4,0x7e,0xfa,0x01,0x20] +# W64: v_cmp_gt_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x24,0xd4,0x7e,0xfa,0x01,0x20] -# W32: v_cmp_gt_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x24,0xd4,0x7c,0xe0,0x01,0x00] -# W64: v_cmp_gt_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x24,0xd4,0x7c,0xe0,0x01,0x00] 0x0a,0x00,0x24,0xd4,0x7c,0xe0,0x01,0x00 +# W32: v_cmp_gt_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x24,0xd4,0x7c,0xe0,0x01,0x00] +# W64: v_cmp_gt_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x24,0xd4,0x7c,0xe0,0x01,0x00] -# W32: 
v_cmp_gt_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x24,0xd4,0xc1,0x82,0x01,0x00] -# W64: v_cmp_gt_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x24,0xd4,0xc1,0x82,0x01,0x00] 0x68,0x00,0x24,0xd4,0xc1,0x82,0x01,0x00 +# W32: v_cmp_gt_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x24,0xd4,0xc1,0x82,0x01,0x00] +# W64: v_cmp_gt_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x24,0xd4,0xc1,0x82,0x01,0x00] -# W32: v_cmp_gt_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x24,0xd4,0xf0,0xf8,0x00,0x00] -# W64: v_cmp_gt_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x24,0xd4,0xf0,0xf8,0x00,0x00] 0x6a,0x00,0x24,0xd4,0xf0,0xf8,0x00,0x00 +# W32: v_cmp_gt_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x24,0xd4,0xf0,0xf8,0x00,0x00] +# W64: v_cmp_gt_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x24,0xd4,0xf0,0xf8,0x00,0x00] +0x7a,0x03,0x24,0xd4,0xfd,0xfc,0x00,0x60 # W32: v_cmp_gt_f64_e64 ttmp14, -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x24,0xd4,0xfd,0xfc,0x00,0x60] # W64: v_cmp_gt_f64_e64 ttmp[14:15], -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x24,0xd4,0xfd,0xfc,0x00,0x60] -0x7a,0x03,0x24,0xd4,0xfd,0xfc,0x00,0x60 -# GFX12: v_cmp_gt_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x24,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7c,0x82,0x24,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf +# GFX12: v_cmp_gt_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x24,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -# W32: v_cmp_gt_i16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x34,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_gt_i16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x34,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x34,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_gt_i16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x34,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_gt_i16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x34,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_gt_i16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x34,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_gt_i16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x34,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x34,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_gt_i16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x34,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_gt_i16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x34,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_gt_i16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x34,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_gt_i16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x34,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x34,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_gt_i16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x34,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_gt_i16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x34,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_gt_i16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x34,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_gt_i16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x34,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x34,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_gt_i16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x34,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_gt_i16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x34,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_gt_i16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x34,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_gt_i16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x34,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x34,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_gt_i16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x34,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_gt_i16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: 
[0x0a,0x00,0x34,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_gt_i16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x34,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_gt_i16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x34,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x0a,0x00,0x34,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_gt_i16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x34,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_gt_i16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x34,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_gt_i16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x34,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_gt_i16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x34,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x34,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_gt_i16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x34,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_gt_i16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x34,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_gt_i16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x34,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] -# W64: v_cmp_gt_i16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x34,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] 0x0a,0x00,0x34,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_gt_i16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x34,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64: v_cmp_gt_i16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x34,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] -# W32: v_cmp_gt_i16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x34,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_gt_i16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x34,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x34,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_gt_i16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x34,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_gt_i16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x34,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_gt_i16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x34,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_gt_i16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x34,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x00,0x34,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_gt_i16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x34,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_gt_i16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x34,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_gt_i16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x34,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_gt_i16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x34,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x34,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_gt_i16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x34,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_gt_i16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x34,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_gt_i16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x34,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_gt_i16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x34,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x34,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_gt_i16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x34,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_gt_i16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x34,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_gt_i16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x34,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] -# W64: v_cmp_gt_i16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x34,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] 
0x6a,0x00,0x34,0xd4,0xf0,0xfa,0x00,0x00 +# W32: v_cmp_gt_i16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x34,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64: v_cmp_gt_i16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x34,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] -# W32: v_cmp_gt_i16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x34,0xd4,0xfd,0xd4,0x00,0x00] -# W64: v_cmp_gt_i16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x34,0xd4,0xfd,0xd4,0x00,0x00] 0x7a,0x00,0x34,0xd4,0xfd,0xd4,0x00,0x00 +# W32: v_cmp_gt_i16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x34,0xd4,0xfd,0xd4,0x00,0x00] +# W64: v_cmp_gt_i16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x34,0xd4,0xfd,0xd4,0x00,0x00] -# GFX12: v_cmp_gt_i16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x34,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7c,0x00,0x34,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmp_gt_i16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x34,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_gt_i32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x44,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_gt_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x44,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x44,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_gt_i32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x44,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_gt_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x44,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_gt_i32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x44,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_gt_i32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x44,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x44,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_gt_i32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x44,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_gt_i32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x44,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_gt_i32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x44,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_gt_i32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x44,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x44,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_gt_i32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x44,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_gt_i32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x44,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_gt_i32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x44,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_gt_i32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x44,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x44,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_gt_i32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x44,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_gt_i32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x44,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_gt_i32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x44,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_gt_i32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x44,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x44,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_gt_i32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x44,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_gt_i32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x44,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_gt_i32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x44,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W64: v_cmp_gt_i32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x44,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x0a,0x00,0x44,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# W32: v_cmp_gt_i32_e64 s10, vcc_hi, 0xaf123456 ; encoding: 
[0x0a,0x00,0x44,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] +# W64: v_cmp_gt_i32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x44,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_gt_i32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x44,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_gt_i32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x44,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x44,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_gt_i32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x44,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_gt_i32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x44,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_gt_i32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x44,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_gt_i32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x44,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x44,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_gt_i32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x44,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_gt_i32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x44,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_gt_i32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x44,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_gt_i32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x44,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x44,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_gt_i32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x44,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_gt_i32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x44,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_gt_i32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x44,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_gt_i32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x44,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x00,0x44,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_gt_i32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x44,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_gt_i32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x44,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_gt_i32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x44,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_gt_i32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x44,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x44,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_gt_i32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x44,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_gt_i32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x44,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_gt_i32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x44,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_gt_i32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x44,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x44,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_gt_i32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x44,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_gt_i32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x44,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_gt_i32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x44,0xd4,0xf0,0xfa,0x00,0x00] -# W64: v_cmp_gt_i32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x44,0xd4,0xf0,0xfa,0x00,0x00] 0x6a,0x00,0x44,0xd4,0xf0,0xfa,0x00,0x00 +# W32: v_cmp_gt_i32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x44,0xd4,0xf0,0xfa,0x00,0x00] +# W64: v_cmp_gt_i32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x44,0xd4,0xf0,0xfa,0x00,0x00] -# W32: v_cmp_gt_i32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x44,0xd4,0xfd,0xd4,0x00,0x00] -# W64: v_cmp_gt_i32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x44,0xd4,0xfd,0xd4,0x00,0x00] 0x7a,0x00,0x44,0xd4,0xfd,0xd4,0x00,0x00 +# W32: v_cmp_gt_i32_e64 ttmp14, src_scc, vcc_lo ; encoding: 
[0x7a,0x00,0x44,0xd4,0xfd,0xd4,0x00,0x00] +# W64: v_cmp_gt_i32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x44,0xd4,0xfd,0xd4,0x00,0x00] -# GFX12: v_cmp_gt_i32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x44,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7c,0x00,0x44,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmp_gt_i32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x44,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_gt_i64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x54,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_gt_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x54,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x54,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_gt_i64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x54,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_gt_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x54,0xd4,0x01,0x05,0x02,0x00] +0x0a,0x00,0x54,0xd4,0xfe,0xfd,0x03,0x00 # W32: v_cmp_gt_i64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x54,0xd4,0xfe,0xfd,0x03,0x00] # W64: v_cmp_gt_i64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x54,0xd4,0xfe,0xfd,0x03,0x00] -0x0a,0x00,0x54,0xd4,0xfe,0xfd,0x03,0x00 -# W32: v_cmp_gt_i64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x54,0xd4,0x02,0x08,0x00,0x00] -# W64: v_cmp_gt_i64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x54,0xd4,0x02,0x08,0x00,0x00] 0x0a,0x00,0x54,0xd4,0x02,0x08,0x00,0x00 +# W32: v_cmp_gt_i64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x54,0xd4,0x02,0x08,0x00,0x00] +# W64: v_cmp_gt_i64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x54,0xd4,0x02,0x08,0x00,0x00] +0x0a,0x00,0x54,0xd4,0x68,0xd0,0x00,0x00 # W32: v_cmp_gt_i64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x54,0xd4,0x68,0xd0,0x00,0x00] # W64: v_cmp_gt_i64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x54,0xd4,0x68,0xd0,0x00,0x00] -0x0a,0x00,0x54,0xd4,0x68,0xd0,0x00,0x00 -# W32: v_cmp_gt_i64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x54,0xd4,0x6a,0xf4,0x00,0x00] -# W64: v_cmp_gt_i64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x54,0xd4,0x6a,0xf4,0x00,0x00] 0x0a,0x00,0x54,0xd4,0x6a,0xf4,0x00,0x00 +# W32: v_cmp_gt_i64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x54,0xd4,0x6a,0xf4,0x00,0x00] +# W64: v_cmp_gt_i64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x54,0xd4,0x6a,0xf4,0x00,0x00] +0x0a,0x00,0x54,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_gt_i64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x54,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: v_cmp_gt_i64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x54,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x54,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_gt_i64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x54,0xd4,0x7e,0xfa,0x01,0x00] -# W64: v_cmp_gt_i64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x54,0xd4,0x7e,0xfa,0x01,0x00] 0x0a,0x00,0x54,0xd4,0x7e,0xfa,0x01,0x00 +# W32: v_cmp_gt_i64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x54,0xd4,0x7e,0xfa,0x01,0x00] +# W64: v_cmp_gt_i64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x54,0xd4,0x7e,0xfa,0x01,0x00] -# W32: v_cmp_gt_i64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x54,0xd4,0x7c,0xe0,0x01,0x00] -# W64: v_cmp_gt_i64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x54,0xd4,0x7c,0xe0,0x01,0x00] 0x0a,0x00,0x54,0xd4,0x7c,0xe0,0x01,0x00 +# W32: v_cmp_gt_i64_e64 s10, null, 0.5 ; encoding: 
[0x0a,0x00,0x54,0xd4,0x7c,0xe0,0x01,0x00] +# W64: v_cmp_gt_i64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x54,0xd4,0x7c,0xe0,0x01,0x00] -# W32: v_cmp_gt_i64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x54,0xd4,0xc1,0x82,0x01,0x00] -# W64: v_cmp_gt_i64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x54,0xd4,0xc1,0x82,0x01,0x00] 0x68,0x00,0x54,0xd4,0xc1,0x82,0x01,0x00 +# W32: v_cmp_gt_i64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x54,0xd4,0xc1,0x82,0x01,0x00] +# W64: v_cmp_gt_i64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x54,0xd4,0xc1,0x82,0x01,0x00] -# W32: v_cmp_gt_i64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x54,0xd4,0xf0,0xf8,0x00,0x00] -# W64: v_cmp_gt_i64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x54,0xd4,0xf0,0xf8,0x00,0x00] 0x6a,0x00,0x54,0xd4,0xf0,0xf8,0x00,0x00 +# W32: v_cmp_gt_i64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x54,0xd4,0xf0,0xf8,0x00,0x00] +# W64: v_cmp_gt_i64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x54,0xd4,0xf0,0xf8,0x00,0x00] -# W32: v_cmp_gt_i64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x54,0xd4,0xfd,0xfc,0x00,0x00] -# W64: v_cmp_gt_i64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x54,0xd4,0xfd,0xfc,0x00,0x00] 0x7a,0x00,0x54,0xd4,0xfd,0xfc,0x00,0x00 +# W32: v_cmp_gt_i64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x54,0xd4,0xfd,0xfc,0x00,0x00] +# W64: v_cmp_gt_i64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x54,0xd4,0xfd,0xfc,0x00,0x00] -# GFX12: v_cmp_gt_i64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x54,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7c,0x00,0x54,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmp_gt_i64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x54,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_gt_u16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x3c,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_gt_u16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x3c,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x3c,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_gt_u16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x3c,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_gt_u16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x3c,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_gt_u16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x3c,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_gt_u16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x3c,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x3c,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_gt_u16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x3c,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_gt_u16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x3c,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_gt_u16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x3c,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_gt_u16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x3c,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x3c,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_gt_u16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x3c,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_gt_u16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x3c,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_gt_u16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x3c,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_gt_u16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x3c,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x3c,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_gt_u16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x3c,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_gt_u16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x3c,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_gt_u16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3c,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_gt_u16_e64 
s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3c,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x3c,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_gt_u16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3c,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_gt_u16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3c,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_gt_u16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3c,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_gt_u16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3c,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x0a,0x00,0x3c,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_gt_u16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3c,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_gt_u16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3c,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_gt_u16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x3c,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_gt_u16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x3c,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x3c,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_gt_u16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x3c,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_gt_u16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x3c,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_gt_u16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x3c,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] -# W64: v_cmp_gt_u16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x3c,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] 0x0a,0x00,0x3c,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_gt_u16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x3c,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64: v_cmp_gt_u16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x3c,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] -# W32: v_cmp_gt_u16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x3c,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_gt_u16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x3c,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x3c,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_gt_u16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x3c,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_gt_u16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x3c,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_gt_u16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x3c,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_gt_u16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x3c,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x00,0x3c,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_gt_u16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x3c,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_gt_u16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x3c,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_gt_u16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x3c,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_gt_u16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x3c,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x3c,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_gt_u16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x3c,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_gt_u16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x3c,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_gt_u16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x3c,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_gt_u16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x3c,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x3c,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_gt_u16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x3c,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_gt_u16_e64 s[104:105], -1, exec_hi ; encoding: 
[0x68,0x00,0x3c,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_gt_u16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x3c,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] -# W64: v_cmp_gt_u16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x3c,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] 0x6a,0x00,0x3c,0xd4,0xf0,0xfa,0x00,0x00 +# W32: v_cmp_gt_u16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x3c,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64: v_cmp_gt_u16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x3c,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] -# W32: v_cmp_gt_u16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3c,0xd4,0xfd,0xd4,0x00,0x00] -# W64: v_cmp_gt_u16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3c,0xd4,0xfd,0xd4,0x00,0x00] 0x7a,0x00,0x3c,0xd4,0xfd,0xd4,0x00,0x00 +# W32: v_cmp_gt_u16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3c,0xd4,0xfd,0xd4,0x00,0x00] +# W64: v_cmp_gt_u16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3c,0xd4,0xfd,0xd4,0x00,0x00] -# GFX12: v_cmp_gt_u16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x3c,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7c,0x00,0x3c,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmp_gt_u16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x3c,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_gt_u32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x4c,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_gt_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x4c,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x4c,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_gt_u32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x4c,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_gt_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x4c,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_gt_u32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x4c,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_gt_u32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x4c,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x4c,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_gt_u32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x4c,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_gt_u32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x4c,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_gt_u32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x4c,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_gt_u32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x4c,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x4c,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_gt_u32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x4c,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_gt_u32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x4c,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_gt_u32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x4c,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_gt_u32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x4c,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x4c,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_gt_u32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x4c,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_gt_u32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x4c,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_gt_u32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4c,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_gt_u32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4c,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x4c,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_gt_u32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4c,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_gt_u32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4c,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_gt_u32_e64 s10, vcc_hi, 0xaf123456 ; encoding: 
[0x0a,0x00,0x4c,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W64: v_cmp_gt_u32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4c,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x0a,0x00,0x4c,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# W32: v_cmp_gt_u32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4c,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] +# W64: v_cmp_gt_u32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4c,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_gt_u32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x4c,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_gt_u32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x4c,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x4c,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_gt_u32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x4c,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_gt_u32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x4c,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_gt_u32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x4c,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_gt_u32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x4c,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x4c,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_gt_u32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x4c,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_gt_u32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x4c,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_gt_u32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x4c,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_gt_u32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x4c,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x4c,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_gt_u32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x4c,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_gt_u32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x4c,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_gt_u32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x4c,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_gt_u32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x4c,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x00,0x4c,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_gt_u32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x4c,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_gt_u32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x4c,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_gt_u32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x4c,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_gt_u32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x4c,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x4c,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_gt_u32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x4c,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_gt_u32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x4c,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_gt_u32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x4c,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_gt_u32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x4c,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x4c,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_gt_u32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x4c,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_gt_u32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x4c,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_gt_u32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x4c,0xd4,0xf0,0xfa,0x00,0x00] -# W64: v_cmp_gt_u32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x4c,0xd4,0xf0,0xfa,0x00,0x00] 0x6a,0x00,0x4c,0xd4,0xf0,0xfa,0x00,0x00 +# W32: v_cmp_gt_u32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x4c,0xd4,0xf0,0xfa,0x00,0x00] +# W64: v_cmp_gt_u32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x4c,0xd4,0xf0,0xfa,0x00,0x00] -# W32: 
v_cmp_gt_u32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4c,0xd4,0xfd,0xd4,0x00,0x00] -# W64: v_cmp_gt_u32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4c,0xd4,0xfd,0xd4,0x00,0x00] 0x7a,0x00,0x4c,0xd4,0xfd,0xd4,0x00,0x00 +# W32: v_cmp_gt_u32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4c,0xd4,0xfd,0xd4,0x00,0x00] +# W64: v_cmp_gt_u32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4c,0xd4,0xfd,0xd4,0x00,0x00] -# GFX12: v_cmp_gt_u32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x4c,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7c,0x00,0x4c,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmp_gt_u32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x4c,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_gt_u64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5c,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_gt_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5c,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x5c,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_gt_u64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5c,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_gt_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5c,0xd4,0x01,0x05,0x02,0x00] +0x0a,0x00,0x5c,0xd4,0xfe,0xfd,0x03,0x00 # W32: v_cmp_gt_u64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x5c,0xd4,0xfe,0xfd,0x03,0x00] # W64: v_cmp_gt_u64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x5c,0xd4,0xfe,0xfd,0x03,0x00] -0x0a,0x00,0x5c,0xd4,0xfe,0xfd,0x03,0x00 -# W32: v_cmp_gt_u64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5c,0xd4,0x02,0x08,0x00,0x00] -# W64: v_cmp_gt_u64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5c,0xd4,0x02,0x08,0x00,0x00] 0x0a,0x00,0x5c,0xd4,0x02,0x08,0x00,0x00 +# W32: v_cmp_gt_u64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5c,0xd4,0x02,0x08,0x00,0x00] +# W64: v_cmp_gt_u64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5c,0xd4,0x02,0x08,0x00,0x00] +0x0a,0x00,0x5c,0xd4,0x68,0xd0,0x00,0x00 # W32: v_cmp_gt_u64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x5c,0xd4,0x68,0xd0,0x00,0x00] # W64: v_cmp_gt_u64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x5c,0xd4,0x68,0xd0,0x00,0x00] -0x0a,0x00,0x5c,0xd4,0x68,0xd0,0x00,0x00 -# W32: v_cmp_gt_u64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5c,0xd4,0x6a,0xf4,0x00,0x00] -# W64: v_cmp_gt_u64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5c,0xd4,0x6a,0xf4,0x00,0x00] 0x0a,0x00,0x5c,0xd4,0x6a,0xf4,0x00,0x00 +# W32: v_cmp_gt_u64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5c,0xd4,0x6a,0xf4,0x00,0x00] +# W64: v_cmp_gt_u64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5c,0xd4,0x6a,0xf4,0x00,0x00] +0x0a,0x00,0x5c,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_gt_u64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x5c,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: v_cmp_gt_u64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x5c,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x5c,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_gt_u64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x5c,0xd4,0x7e,0xfa,0x01,0x00] -# W64: v_cmp_gt_u64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x5c,0xd4,0x7e,0xfa,0x01,0x00] 0x0a,0x00,0x5c,0xd4,0x7e,0xfa,0x01,0x00 +# W32: v_cmp_gt_u64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x5c,0xd4,0x7e,0xfa,0x01,0x00] +# W64: v_cmp_gt_u64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x5c,0xd4,0x7e,0xfa,0x01,0x00] -# W32: 
v_cmp_gt_u64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x5c,0xd4,0x7c,0xe0,0x01,0x00]
-# W64: v_cmp_gt_u64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x5c,0xd4,0x7c,0xe0,0x01,0x00]
 0x0a,0x00,0x5c,0xd4,0x7c,0xe0,0x01,0x00
+# W32: v_cmp_gt_u64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x5c,0xd4,0x7c,0xe0,0x01,0x00]
+# W64: v_cmp_gt_u64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x5c,0xd4,0x7c,0xe0,0x01,0x00]

-# W32: v_cmp_gt_u64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x5c,0xd4,0xc1,0x82,0x01,0x00]
-# W64: v_cmp_gt_u64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x5c,0xd4,0xc1,0x82,0x01,0x00]
 0x68,0x00,0x5c,0xd4,0xc1,0x82,0x01,0x00
+# W32: v_cmp_gt_u64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x5c,0xd4,0xc1,0x82,0x01,0x00]
+# W64: v_cmp_gt_u64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x5c,0xd4,0xc1,0x82,0x01,0x00]

-# W32: v_cmp_gt_u64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x5c,0xd4,0xf0,0xf8,0x00,0x00]
-# W64: v_cmp_gt_u64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x5c,0xd4,0xf0,0xf8,0x00,0x00]
 0x6a,0x00,0x5c,0xd4,0xf0,0xf8,0x00,0x00
+# W32: v_cmp_gt_u64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x5c,0xd4,0xf0,0xf8,0x00,0x00]
+# W64: v_cmp_gt_u64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x5c,0xd4,0xf0,0xf8,0x00,0x00]

-# W32: v_cmp_gt_u64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x5c,0xd4,0xfd,0xfc,0x00,0x00]
-# W64: v_cmp_gt_u64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x5c,0xd4,0xfd,0xfc,0x00,0x00]
 0x7a,0x00,0x5c,0xd4,0xfd,0xfc,0x00,0x00
+# W32: v_cmp_gt_u64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x5c,0xd4,0xfd,0xfc,0x00,0x00]
+# W64: v_cmp_gt_u64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x5c,0xd4,0xfd,0xfc,0x00,0x00]

-# GFX12: v_cmp_gt_u64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x5c,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7c,0x00,0x5c,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmp_gt_u64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x5c,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_le_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x03,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_le_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x03,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x03,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_le_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x03,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_le_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x03,0xd4,0x01,0x05,0x02,0x00]

-# W32: v_cmp_le_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x03,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_le_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x03,0xd4,0xff,0xff,0x03,0x00]
 0x0a,0x00,0x03,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_le_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x03,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_le_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x03,0xd4,0xff,0xff,0x03,0x00]

-# W32: v_cmp_le_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x03,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_le_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x03,0xd4,0x01,0x04,0x00,0x00]
 0x0a,0x00,0x03,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_le_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x03,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_le_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x03,0xd4,0x01,0x04,0x00,0x00]

-# W32: v_cmp_le_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x03,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_le_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x03,0xd4,0x69,0xd2,0x00,0x00]
 0x0a,0x00,0x03,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_le_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x03,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_le_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x03,0xd4,0x69,0xd2,0x00,0x00]

-# W32: v_cmp_le_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x03,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_le_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x03,0xd4,0x6a,0xf6,0x00,0x00]
 0x0a,0x00,0x03,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_le_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x03,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_le_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x03,0xd4,0x6a,0xf6,0x00,0x00]

-# W32: v_cmp_le_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x03,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_le_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x03,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x0a,0x00,0x03,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_le_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x03,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_le_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x03,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_le_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x03,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_le_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x03,0xd4,0x7b,0xfa,0x01,0x00]
 0x0a,0x00,0x03,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_le_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x03,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_le_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x03,0xd4,0x7b,0xfa,0x01,0x00]

-# W32: v_cmp_le_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x03,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_le_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x03,0xd4,0x7d,0xe0,0x01,0x00]
 0x0a,0x00,0x03,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_le_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x03,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_le_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x03,0xd4,0x7d,0xe0,0x01,0x00]

-# W32: v_cmp_le_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x03,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_le_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x03,0xd4,0x7e,0x82,0x01,0x00]
 0x0a,0x00,0x03,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_le_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x03,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_le_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x03,0xd4,0x7e,0x82,0x01,0x00]

-# W32: v_cmp_le_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x03,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_le_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x03,0xd4,0x7f,0xf8,0x00,0x00]
 0x0a,0x01,0x03,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_le_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x03,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_le_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x03,0xd4,0x7f,0xf8,0x00,0x00]

-# W32: v_cmp_le_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x03,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_le_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x03,0xd4,0x7c,0xfc,0x00,0x00]
 0x0a,0x00,0x03,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_le_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x03,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_le_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x03,0xd4,0x7c,0xfc,0x00,0x00]

-# W32: v_cmp_le_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x03,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_le_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x03,0xd4,0xc1,0xfe,0x00,0x00]
 0x68,0x00,0x03,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_le_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x03,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_le_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x03,0xd4,0xc1,0xfe,0x00,0x00]

-# W32: v_cmp_le_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x03,0xd4,0xf0,0xfa,0x00,0x40]
-# W64: v_cmp_le_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x03,0xd4,0xf0,0xfa,0x00,0x40]
 0x6a,0x00,0x03,0xd4,0xf0,0xfa,0x00,0x40
+# W32: v_cmp_le_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x03,0xd4,0xf0,0xfa,0x00,0x40]
+# W64: v_cmp_le_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x03,0xd4,0xf0,0xfa,0x00,0x40]

+0x7a,0x02,0x03,0xd4,0xfd,0xd4,0x00,0x20
 # W32: v_cmp_le_f16_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x03,0xd4,0xfd,0xd4,0x00,0x20]
 # W64: v_cmp_le_f16_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x03,0xd4,0xfd,0xd4,0x00,0x20]
-0x7a,0x02,0x03,0xd4,0xfd,0xd4,0x00,0x20

-# GFX12: v_cmp_le_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x03,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
 0x7c,0x83,0x03,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmp_le_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x03,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_le_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x13,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_le_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x13,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x13,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_le_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x13,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_le_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x13,0xd4,0x01,0x05,0x02,0x00]

-# W32: v_cmp_le_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x13,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_le_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x13,0xd4,0xff,0xff,0x03,0x00]
 0x0a,0x00,0x13,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_le_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x13,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_le_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x13,0xd4,0xff,0xff,0x03,0x00]

-# W32: v_cmp_le_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x13,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_le_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x13,0xd4,0x01,0x04,0x00,0x00]
 0x0a,0x00,0x13,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_le_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x13,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_le_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x13,0xd4,0x01,0x04,0x00,0x00]

-# W32: v_cmp_le_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x13,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_le_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x13,0xd4,0x69,0xd2,0x00,0x00]
 0x0a,0x00,0x13,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_le_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x13,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_le_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x13,0xd4,0x69,0xd2,0x00,0x00]

-# W32: v_cmp_le_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x13,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_le_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x13,0xd4,0x6a,0xf6,0x00,0x00]
 0x0a,0x00,0x13,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_le_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x13,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_le_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x13,0xd4,0x6a,0xf6,0x00,0x00]

-# W32: v_cmp_le_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x13,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_le_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x13,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x0a,0x00,0x13,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# W32: v_cmp_le_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x13,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_le_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x13,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_le_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x13,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_le_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x13,0xd4,0x7b,0xfa,0x01,0x00]
 0x0a,0x00,0x13,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_le_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x13,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_le_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x13,0xd4,0x7b,0xfa,0x01,0x00]

-# W32: v_cmp_le_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x13,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_le_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x13,0xd4,0x7d,0xe0,0x01,0x00]
 0x0a,0x00,0x13,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_le_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x13,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_le_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x13,0xd4,0x7d,0xe0,0x01,0x00]

-# W32: v_cmp_le_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x13,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_le_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x13,0xd4,0x7e,0x82,0x01,0x00]
 0x0a,0x00,0x13,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_le_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x13,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_le_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x13,0xd4,0x7e,0x82,0x01,0x00]

-# W32: v_cmp_le_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x13,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_le_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x13,0xd4,0x7f,0xf8,0x00,0x00]
 0x0a,0x01,0x13,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_le_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x13,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_le_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x13,0xd4,0x7f,0xf8,0x00,0x00]

-# W32: v_cmp_le_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x13,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_le_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x13,0xd4,0x7c,0xfc,0x00,0x00]
 0x0a,0x00,0x13,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_le_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x13,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_le_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x13,0xd4,0x7c,0xfc,0x00,0x00]

-# W32: v_cmp_le_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x13,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_le_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x13,0xd4,0xc1,0xfe,0x00,0x00]
 0x68,0x00,0x13,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_le_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x13,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_le_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x13,0xd4,0xc1,0xfe,0x00,0x00]

-# W32: v_cmp_le_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x13,0xd4,0xf0,0xfa,0x00,0x40]
-# W64: v_cmp_le_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x13,0xd4,0xf0,0xfa,0x00,0x40]
 0x6a,0x00,0x13,0xd4,0xf0,0xfa,0x00,0x40
+# W32: v_cmp_le_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x13,0xd4,0xf0,0xfa,0x00,0x40]
+# W64: v_cmp_le_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x13,0xd4,0xf0,0xfa,0x00,0x40]

+0x7a,0x02,0x13,0xd4,0xfd,0xd4,0x00,0x20
 # W32: v_cmp_le_f32_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x13,0xd4,0xfd,0xd4,0x00,0x20]
 # W64: v_cmp_le_f32_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x13,0xd4,0xfd,0xd4,0x00,0x20]
-0x7a,0x02,0x13,0xd4,0xfd,0xd4,0x00,0x20

-# GFX12: v_cmp_le_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x13,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
 0x7c,0x83,0x13,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf
+# GFX12: v_cmp_le_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x13,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_le_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x23,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_le_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x23,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x23,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_le_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x23,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_le_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x23,0xd4,0x01,0x05,0x02,0x00]

+0x0a,0x00,0x23,0xd4,0xfe,0xfd,0x03,0x00
 # W32: v_cmp_le_f64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x23,0xd4,0xfe,0xfd,0x03,0x00]
 # W64: v_cmp_le_f64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x23,0xd4,0xfe,0xfd,0x03,0x00]
-0x0a,0x00,0x23,0xd4,0xfe,0xfd,0x03,0x00

-# W32: v_cmp_le_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x23,0xd4,0x02,0x08,0x00,0x00]
-# W64: v_cmp_le_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x23,0xd4,0x02,0x08,0x00,0x00]
 0x0a,0x00,0x23,0xd4,0x02,0x08,0x00,0x00
+# W32: v_cmp_le_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x23,0xd4,0x02,0x08,0x00,0x00]
+# W64: v_cmp_le_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x23,0xd4,0x02,0x08,0x00,0x00]

+0x0a,0x00,0x23,0xd4,0x68,0xd0,0x00,0x00
 # W32: v_cmp_le_f64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x23,0xd4,0x68,0xd0,0x00,0x00]
 # W64: v_cmp_le_f64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x23,0xd4,0x68,0xd0,0x00,0x00]
-0x0a,0x00,0x23,0xd4,0x68,0xd0,0x00,0x00

-# W32: v_cmp_le_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x23,0xd4,0x6a,0xf4,0x00,0x00]
-# W64: v_cmp_le_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x23,0xd4,0x6a,0xf4,0x00,0x00]
 0x0a,0x00,0x23,0xd4,0x6a,0xf4,0x00,0x00
+# W32: v_cmp_le_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x23,0xd4,0x6a,0xf4,0x00,0x00]
+# W64: v_cmp_le_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x23,0xd4,0x6a,0xf4,0x00,0x00]

+0x0a,0x00,0x23,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
 # W32: v_cmp_le_f64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x23,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 # W64: v_cmp_le_f64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x23,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x23,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf

-# W32: v_cmp_le_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x23,0xd4,0x7e,0xfa,0x01,0x20]
-# W64: v_cmp_le_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x23,0xd4,0x7e,0xfa,0x01,0x20]
 0x0a,0x01,0x23,0xd4,0x7e,0xfa,0x01,0x20
+# W32: v_cmp_le_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x23,0xd4,0x7e,0xfa,0x01,0x20]
+# W64: v_cmp_le_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x23,0xd4,0x7e,0xfa,0x01,0x20]

-# W32: v_cmp_le_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x23,0xd4,0x7c,0xe0,0x01,0x00]
-# W64: v_cmp_le_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x23,0xd4,0x7c,0xe0,0x01,0x00]
 0x0a,0x00,0x23,0xd4,0x7c,0xe0,0x01,0x00
+# W32: v_cmp_le_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x23,0xd4,0x7c,0xe0,0x01,0x00]
+# W64: v_cmp_le_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x23,0xd4,0x7c,0xe0,0x01,0x00]

-# W32: v_cmp_le_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x23,0xd4,0xc1,0x82,0x01,0x00]
-# W64: v_cmp_le_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x23,0xd4,0xc1,0x82,0x01,0x00]
 0x68,0x00,0x23,0xd4,0xc1,0x82,0x01,0x00
+# W32: v_cmp_le_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x23,0xd4,0xc1,0x82,0x01,0x00]
+# W64: v_cmp_le_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x23,0xd4,0xc1,0x82,0x01,0x00]

-# W32: v_cmp_le_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x23,0xd4,0xf0,0xf8,0x00,0x00]
-# W64: v_cmp_le_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x23,0xd4,0xf0,0xf8,0x00,0x00]
 0x6a,0x00,0x23,0xd4,0xf0,0xf8,0x00,0x00
+# W32: v_cmp_le_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x23,0xd4,0xf0,0xf8,0x00,0x00]
+# W64: v_cmp_le_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x23,0xd4,0xf0,0xf8,0x00,0x00]

+0x7a,0x03,0x23,0xd4,0xfd,0xfc,0x00,0x60
 # W32: v_cmp_le_f64_e64 ttmp14, -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x23,0xd4,0xfd,0xfc,0x00,0x60]
 # W64: v_cmp_le_f64_e64 ttmp[14:15], -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x23,0xd4,0xfd,0xfc,0x00,0x60]
-0x7a,0x03,0x23,0xd4,0xfd,0xfc,0x00,0x60

-# GFX12: v_cmp_le_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x23,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
 0x7c,0x82,0x23,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf
+# GFX12: v_cmp_le_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x23,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_le_i16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x33,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_le_i16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x33,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x33,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_le_i16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x33,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_le_i16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x33,0xd4,0x01,0x05,0x02,0x00]

-# W32: v_cmp_le_i16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x33,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_le_i16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x33,0xd4,0xff,0xff,0x03,0x00]
 0x0a,0x00,0x33,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_le_i16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x33,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_le_i16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x33,0xd4,0xff,0xff,0x03,0x00]

-# W32: v_cmp_le_i16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x33,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_le_i16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x33,0xd4,0x01,0x04,0x00,0x00]
 0x0a,0x00,0x33,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_le_i16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x33,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_le_i16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x33,0xd4,0x01,0x04,0x00,0x00]

-# W32: v_cmp_le_i16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x33,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_le_i16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x33,0xd4,0x69,0xd2,0x00,0x00]
 0x0a,0x00,0x33,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_le_i16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x33,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_le_i16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x33,0xd4,0x69,0xd2,0x00,0x00]

-# W32: v_cmp_le_i16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x33,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_le_i16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x33,0xd4,0x6a,0xf6,0x00,0x00]
 0x0a,0x00,0x33,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_le_i16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x33,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_le_i16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x33,0xd4,0x6a,0xf6,0x00,0x00]

-# W32: v_cmp_le_i16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x33,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_le_i16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x33,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x0a,0x00,0x33,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_le_i16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x33,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_le_i16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x33,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_le_i16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x33,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_le_i16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x33,0xd4,0x7b,0xfa,0x01,0x00]
 0x0a,0x00,0x33,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_le_i16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x33,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_le_i16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x33,0xd4,0x7b,0xfa,0x01,0x00]

-# W32: v_cmp_le_i16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x33,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# W64: v_cmp_le_i16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x33,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
 0x0a,0x00,0x33,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_le_i16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x33,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64: v_cmp_le_i16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x33,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]

-# W32: v_cmp_le_i16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x33,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_le_i16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x33,0xd4,0x7e,0x82,0x01,0x00]
 0x0a,0x00,0x33,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_le_i16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x33,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_le_i16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x33,0xd4,0x7e,0x82,0x01,0x00]

-# W32: v_cmp_le_i16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x33,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_le_i16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x33,0xd4,0x7f,0xf8,0x00,0x00]
 0x0a,0x00,0x33,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_le_i16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x33,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_le_i16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x33,0xd4,0x7f,0xf8,0x00,0x00]

-# W32: v_cmp_le_i16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x33,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_le_i16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x33,0xd4,0x7c,0xfc,0x00,0x00]
 0x0a,0x00,0x33,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_le_i16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x33,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_le_i16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x33,0xd4,0x7c,0xfc,0x00,0x00]

-# W32: v_cmp_le_i16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x33,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_le_i16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x33,0xd4,0xc1,0xfe,0x00,0x00]
 0x68,0x00,0x33,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_le_i16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x33,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_le_i16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x33,0xd4,0xc1,0xfe,0x00,0x00]

-# W32: v_cmp_le_i16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x33,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# W64: v_cmp_le_i16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x33,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
 0x6a,0x00,0x33,0xd4,0xf0,0xfa,0x00,0x00
+# W32: v_cmp_le_i16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x33,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64: v_cmp_le_i16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x33,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]

-# W32: v_cmp_le_i16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x33,0xd4,0xfd,0xd4,0x00,0x00]
-# W64: v_cmp_le_i16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x33,0xd4,0xfd,0xd4,0x00,0x00]
 0x7a,0x00,0x33,0xd4,0xfd,0xd4,0x00,0x00
+# W32: v_cmp_le_i16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x33,0xd4,0xfd,0xd4,0x00,0x00]
+# W64: v_cmp_le_i16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x33,0xd4,0xfd,0xd4,0x00,0x00]

-# GFX12: v_cmp_le_i16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x33,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 0x7c,0x00,0x33,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmp_le_i16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x33,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_le_i32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x43,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_le_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x43,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x43,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_le_i32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x43,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_le_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x43,0xd4,0x01,0x05,0x02,0x00]

-# W32: v_cmp_le_i32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x43,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_le_i32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x43,0xd4,0xff,0xff,0x03,0x00]
 0x0a,0x00,0x43,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_le_i32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x43,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_le_i32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x43,0xd4,0xff,0xff,0x03,0x00]

-# W32: v_cmp_le_i32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x43,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_le_i32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x43,0xd4,0x01,0x04,0x00,0x00]
 0x0a,0x00,0x43,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_le_i32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x43,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_le_i32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x43,0xd4,0x01,0x04,0x00,0x00]

-# W32: v_cmp_le_i32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x43,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_le_i32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x43,0xd4,0x69,0xd2,0x00,0x00]
 0x0a,0x00,0x43,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_le_i32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x43,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_le_i32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x43,0xd4,0x69,0xd2,0x00,0x00]

-# W32: v_cmp_le_i32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x43,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_le_i32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x43,0xd4,0x6a,0xf6,0x00,0x00]
 0x0a,0x00,0x43,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_le_i32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x43,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_le_i32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x43,0xd4,0x6a,0xf6,0x00,0x00]

-# W32: v_cmp_le_i32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x43,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_le_i32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x43,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x0a,0x00,0x43,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# W32: v_cmp_le_i32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x43,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_le_i32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x43,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_le_i32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x43,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_le_i32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x43,0xd4,0x7b,0xfa,0x01,0x00]
 0x0a,0x00,0x43,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_le_i32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x43,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_le_i32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x43,0xd4,0x7b,0xfa,0x01,0x00]

-# W32: v_cmp_le_i32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x43,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_le_i32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x43,0xd4,0x7d,0xe0,0x01,0x00]
 0x0a,0x00,0x43,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_le_i32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x43,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_le_i32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x43,0xd4,0x7d,0xe0,0x01,0x00]

-# W32: v_cmp_le_i32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x43,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_le_i32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x43,0xd4,0x7e,0x82,0x01,0x00]
 0x0a,0x00,0x43,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_le_i32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x43,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_le_i32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x43,0xd4,0x7e,0x82,0x01,0x00]

-# W32: v_cmp_le_i32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x43,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_le_i32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x43,0xd4,0x7f,0xf8,0x00,0x00]
 0x0a,0x00,0x43,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_le_i32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x43,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_le_i32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x43,0xd4,0x7f,0xf8,0x00,0x00]

-# W32: v_cmp_le_i32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x43,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_le_i32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x43,0xd4,0x7c,0xfc,0x00,0x00]
 0x0a,0x00,0x43,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_le_i32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x43,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_le_i32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x43,0xd4,0x7c,0xfc,0x00,0x00]

-# W32: v_cmp_le_i32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x43,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_le_i32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x43,0xd4,0xc1,0xfe,0x00,0x00]
 0x68,0x00,0x43,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_le_i32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x43,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_le_i32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x43,0xd4,0xc1,0xfe,0x00,0x00]

-# W32: v_cmp_le_i32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x43,0xd4,0xf0,0xfa,0x00,0x00]
-# W64: v_cmp_le_i32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x43,0xd4,0xf0,0xfa,0x00,0x00]
 0x6a,0x00,0x43,0xd4,0xf0,0xfa,0x00,0x00
+# W32: v_cmp_le_i32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x43,0xd4,0xf0,0xfa,0x00,0x00]
+# W64: v_cmp_le_i32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x43,0xd4,0xf0,0xfa,0x00,0x00]

-# W32: v_cmp_le_i32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x43,0xd4,0xfd,0xd4,0x00,0x00]
-# W64: v_cmp_le_i32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x43,0xd4,0xfd,0xd4,0x00,0x00]
 0x7a,0x00,0x43,0xd4,0xfd,0xd4,0x00,0x00
+# W32: v_cmp_le_i32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x43,0xd4,0xfd,0xd4,0x00,0x00]
+# W64: v_cmp_le_i32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x43,0xd4,0xfd,0xd4,0x00,0x00]

-# GFX12: v_cmp_le_i32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x43,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7c,0x00,0x43,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmp_le_i32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x43,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_le_i64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x53,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_le_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x53,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x53,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_le_i64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x53,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_le_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x53,0xd4,0x01,0x05,0x02,0x00]

+0x0a,0x00,0x53,0xd4,0xfe,0xfd,0x03,0x00
 # W32: v_cmp_le_i64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x53,0xd4,0xfe,0xfd,0x03,0x00]
 # W64: v_cmp_le_i64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x53,0xd4,0xfe,0xfd,0x03,0x00]
-0x0a,0x00,0x53,0xd4,0xfe,0xfd,0x03,0x00

-# W32: v_cmp_le_i64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x53,0xd4,0x02,0x08,0x00,0x00]
-# W64: v_cmp_le_i64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x53,0xd4,0x02,0x08,0x00,0x00]
 0x0a,0x00,0x53,0xd4,0x02,0x08,0x00,0x00
+# W32: v_cmp_le_i64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x53,0xd4,0x02,0x08,0x00,0x00]
+# W64: v_cmp_le_i64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x53,0xd4,0x02,0x08,0x00,0x00]

+0x0a,0x00,0x53,0xd4,0x68,0xd0,0x00,0x00
 # W32: v_cmp_le_i64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x53,0xd4,0x68,0xd0,0x00,0x00]
 # W64: v_cmp_le_i64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x53,0xd4,0x68,0xd0,0x00,0x00]
-0x0a,0x00,0x53,0xd4,0x68,0xd0,0x00,0x00

-# W32: v_cmp_le_i64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x53,0xd4,0x6a,0xf4,0x00,0x00]
-# W64: v_cmp_le_i64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x53,0xd4,0x6a,0xf4,0x00,0x00]
 0x0a,0x00,0x53,0xd4,0x6a,0xf4,0x00,0x00
+# W32: v_cmp_le_i64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x53,0xd4,0x6a,0xf4,0x00,0x00]
+# W64: v_cmp_le_i64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x53,0xd4,0x6a,0xf4,0x00,0x00]

+0x0a,0x00,0x53,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
 # W32: v_cmp_le_i64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x53,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 # W64: v_cmp_le_i64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x53,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x53,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf

-# W32: v_cmp_le_i64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x53,0xd4,0x7e,0xfa,0x01,0x00]
-# W64: v_cmp_le_i64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x53,0xd4,0x7e,0xfa,0x01,0x00]
 0x0a,0x00,0x53,0xd4,0x7e,0xfa,0x01,0x00
+# W32: v_cmp_le_i64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x53,0xd4,0x7e,0xfa,0x01,0x00]
+# W64: v_cmp_le_i64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x53,0xd4,0x7e,0xfa,0x01,0x00]

-# W32: v_cmp_le_i64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x53,0xd4,0x7c,0xe0,0x01,0x00]
-# W64: v_cmp_le_i64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x53,0xd4,0x7c,0xe0,0x01,0x00]
 0x0a,0x00,0x53,0xd4,0x7c,0xe0,0x01,0x00
+# W32: v_cmp_le_i64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x53,0xd4,0x7c,0xe0,0x01,0x00]
+# W64: v_cmp_le_i64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x53,0xd4,0x7c,0xe0,0x01,0x00]

-# W32: v_cmp_le_i64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x53,0xd4,0xc1,0x82,0x01,0x00]
-# W64: v_cmp_le_i64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x53,0xd4,0xc1,0x82,0x01,0x00]
 0x68,0x00,0x53,0xd4,0xc1,0x82,0x01,0x00
+# W32: v_cmp_le_i64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x53,0xd4,0xc1,0x82,0x01,0x00]
+# W64: v_cmp_le_i64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x53,0xd4,0xc1,0x82,0x01,0x00]

-# W32: v_cmp_le_i64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x53,0xd4,0xf0,0xf8,0x00,0x00]
-# W64: v_cmp_le_i64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x53,0xd4,0xf0,0xf8,0x00,0x00]
 0x6a,0x00,0x53,0xd4,0xf0,0xf8,0x00,0x00
+# W32: v_cmp_le_i64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x53,0xd4,0xf0,0xf8,0x00,0x00]
+# W64: v_cmp_le_i64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x53,0xd4,0xf0,0xf8,0x00,0x00]

-# W32: v_cmp_le_i64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x53,0xd4,0xfd,0xfc,0x00,0x00]
-# W64: v_cmp_le_i64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x53,0xd4,0xfd,0xfc,0x00,0x00]
 0x7a,0x00,0x53,0xd4,0xfd,0xfc,0x00,0x00
+# W32: v_cmp_le_i64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x53,0xd4,0xfd,0xfc,0x00,0x00]
+# W64: v_cmp_le_i64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x53,0xd4,0xfd,0xfc,0x00,0x00]

-# GFX12: v_cmp_le_i64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x53,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7c,0x00,0x53,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmp_le_i64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x53,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_le_u16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x3b,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_le_u16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x3b,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x3b,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_le_u16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x3b,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_le_u16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x3b,0xd4,0x01,0x05,0x02,0x00]

-# W32: v_cmp_le_u16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x3b,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_le_u16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x3b,0xd4,0xff,0xff,0x03,0x00]
 0x0a,0x00,0x3b,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_le_u16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x3b,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_le_u16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x3b,0xd4,0xff,0xff,0x03,0x00]

-# W32: v_cmp_le_u16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x3b,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_le_u16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x3b,0xd4,0x01,0x04,0x00,0x00]
 0x0a,0x00,0x3b,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_le_u16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x3b,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_le_u16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x3b,0xd4,0x01,0x04,0x00,0x00]

-# W32: v_cmp_le_u16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x3b,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_le_u16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x3b,0xd4,0x69,0xd2,0x00,0x00]
 0x0a,0x00,0x3b,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_le_u16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x3b,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_le_u16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x3b,0xd4,0x69,0xd2,0x00,0x00]

-# W32: v_cmp_le_u16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3b,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_le_u16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3b,0xd4,0x6a,0xf6,0x00,0x00]
 0x0a,0x00,0x3b,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_le_u16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3b,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_le_u16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3b,0xd4,0x6a,0xf6,0x00,0x00]

-# W32: v_cmp_le_u16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3b,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_le_u16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3b,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x0a,0x00,0x3b,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_le_u16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3b,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_le_u16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3b,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_le_u16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x3b,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_le_u16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x3b,0xd4,0x7b,0xfa,0x01,0x00]
 0x0a,0x00,0x3b,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_le_u16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x3b,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_le_u16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x3b,0xd4,0x7b,0xfa,0x01,0x00]

-# W32: v_cmp_le_u16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x3b,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# W64: v_cmp_le_u16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x3b,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
 0x0a,0x00,0x3b,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_le_u16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x3b,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64: v_cmp_le_u16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x3b,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]

-# W32: v_cmp_le_u16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x3b,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_le_u16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x3b,0xd4,0x7e,0x82,0x01,0x00]
 0x0a,0x00,0x3b,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_le_u16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x3b,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_le_u16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x3b,0xd4,0x7e,0x82,0x01,0x00]

-# W32: v_cmp_le_u16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x3b,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_le_u16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x3b,0xd4,0x7f,0xf8,0x00,0x00]
 0x0a,0x00,0x3b,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_le_u16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x3b,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_le_u16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x3b,0xd4,0x7f,0xf8,0x00,0x00]

-# W32: v_cmp_le_u16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x3b,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_le_u16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x3b,0xd4,0x7c,0xfc,0x00,0x00]
 0x0a,0x00,0x3b,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_le_u16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x3b,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_le_u16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x3b,0xd4,0x7c,0xfc,0x00,0x00]

-# W32: v_cmp_le_u16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x3b,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_le_u16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x3b,0xd4,0xc1,0xfe,0x00,0x00]
 0x68,0x00,0x3b,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_le_u16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x3b,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_le_u16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x3b,0xd4,0xc1,0xfe,0x00,0x00]

-# W32: v_cmp_le_u16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x3b,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# W64: v_cmp_le_u16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x3b,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
 0x6a,0x00,0x3b,0xd4,0xf0,0xfa,0x00,0x00
+# W32: v_cmp_le_u16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x3b,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64: v_cmp_le_u16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x3b,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]

-# W32: v_cmp_le_u16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3b,0xd4,0xfd,0xd4,0x00,0x00]
-# W64: v_cmp_le_u16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3b,0xd4,0xfd,0xd4,0x00,0x00]
 0x7a,0x00,0x3b,0xd4,0xfd,0xd4,0x00,0x00
+# W32: v_cmp_le_u16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3b,0xd4,0xfd,0xd4,0x00,0x00]
+# W64: v_cmp_le_u16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3b,0xd4,0xfd,0xd4,0x00,0x00]

-# GFX12: v_cmp_le_u16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x3b,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 0x7c,0x00,0x3b,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmp_le_u16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x3b,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_le_u32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x4b,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_le_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x4b,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x4b,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_le_u32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x4b,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_le_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x4b,0xd4,0x01,0x05,0x02,0x00]

-# W32: v_cmp_le_u32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x4b,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_le_u32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x4b,0xd4,0xff,0xff,0x03,0x00]
 0x0a,0x00,0x4b,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_le_u32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x4b,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_le_u32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x4b,0xd4,0xff,0xff,0x03,0x00]

-# W32: v_cmp_le_u32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x4b,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_le_u32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x4b,0xd4,0x01,0x04,0x00,0x00]
 0x0a,0x00,0x4b,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_le_u32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x4b,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_le_u32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x4b,0xd4,0x01,0x04,0x00,0x00]

-# W32: v_cmp_le_u32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x4b,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_le_u32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x4b,0xd4,0x69,0xd2,0x00,0x00]
 0x0a,0x00,0x4b,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_le_u32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x4b,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_le_u32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x4b,0xd4,0x69,0xd2,0x00,0x00]

-# W32: v_cmp_le_u32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4b,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_le_u32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4b,0xd4,0x6a,0xf6,0x00,0x00]
 0x0a,0x00,0x4b,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_le_u32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4b,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_le_u32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4b,0xd4,0x6a,0xf6,0x00,0x00]

-# W32: v_cmp_le_u32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4b,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_le_u32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4b,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x0a,0x00,0x4b,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# W32: v_cmp_le_u32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4b,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_le_u32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4b,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_le_u32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x4b,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_le_u32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x4b,0xd4,0x7b,0xfa,0x01,0x00]
 0x0a,0x00,0x4b,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_le_u32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x4b,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_le_u32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x4b,0xd4,0x7b,0xfa,0x01,0x00]

-# W32: v_cmp_le_u32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x4b,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_le_u32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x4b,0xd4,0x7d,0xe0,0x01,0x00]
 0x0a,0x00,0x4b,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_le_u32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x4b,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_le_u32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x4b,0xd4,0x7d,0xe0,0x01,0x00]

-# W32: v_cmp_le_u32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x4b,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_le_u32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x4b,0xd4,0x7e,0x82,0x01,0x00]
 0x0a,0x00,0x4b,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_le_u32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x4b,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_le_u32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x4b,0xd4,0x7e,0x82,0x01,0x00]

-# W32: v_cmp_le_u32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x4b,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_le_u32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x4b,0xd4,0x7f,0xf8,0x00,0x00]
 0x0a,0x00,0x4b,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_le_u32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x4b,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_le_u32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x4b,0xd4,0x7f,0xf8,0x00,0x00]

-# W32: v_cmp_le_u32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x4b,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_le_u32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x4b,0xd4,0x7c,0xfc,0x00,0x00]
 0x0a,0x00,0x4b,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_le_u32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x4b,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_le_u32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x4b,0xd4,0x7c,0xfc,0x00,0x00]

-# W32: v_cmp_le_u32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x4b,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_le_u32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x4b,0xd4,0xc1,0xfe,0x00,0x00]
 0x68,0x00,0x4b,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_le_u32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x4b,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_le_u32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x4b,0xd4,0xc1,0xfe,0x00,0x00]

-# W32: v_cmp_le_u32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x4b,0xd4,0xf0,0xfa,0x00,0x00]
-# W64: v_cmp_le_u32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x4b,0xd4,0xf0,0xfa,0x00,0x00]
 0x6a,0x00,0x4b,0xd4,0xf0,0xfa,0x00,0x00
+# W32: v_cmp_le_u32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x4b,0xd4,0xf0,0xfa,0x00,0x00]
+# W64: v_cmp_le_u32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x4b,0xd4,0xf0,0xfa,0x00,0x00]

-# W32: v_cmp_le_u32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4b,0xd4,0xfd,0xd4,0x00,0x00]
-# W64: v_cmp_le_u32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4b,0xd4,0xfd,0xd4,0x00,0x00]
 0x7a,0x00,0x4b,0xd4,0xfd,0xd4,0x00,0x00
+# W32: v_cmp_le_u32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4b,0xd4,0xfd,0xd4,0x00,0x00]
+# W64: v_cmp_le_u32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4b,0xd4,0xfd,0xd4,0x00,0x00]

-# GFX12: v_cmp_le_u32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x4b,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7c,0x00,0x4b,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmp_le_u32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x4b,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_le_u64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5b,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_le_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5b,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x5b,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_le_u64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5b,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_le_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5b,0xd4,0x01,0x05,0x02,0x00]

+0x0a,0x00,0x5b,0xd4,0xfe,0xfd,0x03,0x00
 # W32: v_cmp_le_u64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x5b,0xd4,0xfe,0xfd,0x03,0x00]
 # W64: v_cmp_le_u64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x5b,0xd4,0xfe,0xfd,0x03,0x00]
-0x0a,0x00,0x5b,0xd4,0xfe,0xfd,0x03,0x00

-# W32: v_cmp_le_u64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5b,0xd4,0x02,0x08,0x00,0x00]
-# W64: v_cmp_le_u64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5b,0xd4,0x02,0x08,0x00,0x00]
 0x0a,0x00,0x5b,0xd4,0x02,0x08,0x00,0x00
+# W32: v_cmp_le_u64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5b,0xd4,0x02,0x08,0x00,0x00]
+# W64: v_cmp_le_u64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5b,0xd4,0x02,0x08,0x00,0x00]

+0x0a,0x00,0x5b,0xd4,0x68,0xd0,0x00,0x00
 # W32: v_cmp_le_u64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x5b,0xd4,0x68,0xd0,0x00,0x00]
 # W64: v_cmp_le_u64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x5b,0xd4,0x68,0xd0,0x00,0x00]
-0x0a,0x00,0x5b,0xd4,0x68,0xd0,0x00,0x00

-# W32: v_cmp_le_u64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5b,0xd4,0x6a,0xf4,0x00,0x00]
-# W64: v_cmp_le_u64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5b,0xd4,0x6a,0xf4,0x00,0x00]
 0x0a,0x00,0x5b,0xd4,0x6a,0xf4,0x00,0x00
+# W32: v_cmp_le_u64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5b,0xd4,0x6a,0xf4,0x00,0x00]
+# W64: v_cmp_le_u64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5b,0xd4,0x6a,0xf4,0x00,0x00]

+0x0a,0x00,0x5b,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
 # W32: v_cmp_le_u64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x5b,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 # W64: v_cmp_le_u64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x5b,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x5b,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf

-# W32: v_cmp_le_u64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x5b,0xd4,0x7e,0xfa,0x01,0x00]
-# W64: v_cmp_le_u64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x5b,0xd4,0x7e,0xfa,0x01,0x00]
 0x0a,0x00,0x5b,0xd4,0x7e,0xfa,0x01,0x00
+# W32: v_cmp_le_u64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x5b,0xd4,0x7e,0xfa,0x01,0x00]
+# W64: v_cmp_le_u64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x5b,0xd4,0x7e,0xfa,0x01,0x00]

-# W32: v_cmp_le_u64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x5b,0xd4,0x7c,0xe0,0x01,0x00]
-# W64: v_cmp_le_u64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x5b,0xd4,0x7c,0xe0,0x01,0x00]
 0x0a,0x00,0x5b,0xd4,0x7c,0xe0,0x01,0x00
+# W32: v_cmp_le_u64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x5b,0xd4,0x7c,0xe0,0x01,0x00]
+# W64: v_cmp_le_u64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x5b,0xd4,0x7c,0xe0,0x01,0x00]

-# W32: v_cmp_le_u64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x5b,0xd4,0xc1,0x82,0x01,0x00]
-# W64: v_cmp_le_u64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x5b,0xd4,0xc1,0x82,0x01,0x00]
 0x68,0x00,0x5b,0xd4,0xc1,0x82,0x01,0x00
+# W32: v_cmp_le_u64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x5b,0xd4,0xc1,0x82,0x01,0x00]
+# W64: v_cmp_le_u64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x5b,0xd4,0xc1,0x82,0x01,0x00]

-# W32: v_cmp_le_u64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x5b,0xd4,0xf0,0xf8,0x00,0x00]
-# W64: v_cmp_le_u64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x5b,0xd4,0xf0,0xf8,0x00,0x00]
 0x6a,0x00,0x5b,0xd4,0xf0,0xf8,0x00,0x00
+# W32: v_cmp_le_u64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x5b,0xd4,0xf0,0xf8,0x00,0x00]
+# W64: v_cmp_le_u64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x5b,0xd4,0xf0,0xf8,0x00,0x00]

-# W32: v_cmp_le_u64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x5b,0xd4,0xfd,0xfc,0x00,0x00]
-# W64: v_cmp_le_u64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x5b,0xd4,0xfd,0xfc,0x00,0x00]
 0x7a,0x00,0x5b,0xd4,0xfd,0xfc,0x00,0x00
+# W32: v_cmp_le_u64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x5b,0xd4,0xfd,0xfc,0x00,0x00]
+# W64: v_cmp_le_u64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x5b,0xd4,0xfd,0xfc,0x00,0x00]

-# GFX12: v_cmp_le_u64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x5b,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7c,0x00,0x5b,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmp_le_u64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x5b,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_lg_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x05,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_lg_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x05,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x05,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_lg_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x05,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_lg_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x05,0xd4,0x01,0x05,0x02,0x00]

-# W32: v_cmp_lg_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x05,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_lg_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x05,0xd4,0xff,0xff,0x03,0x00]
 0x0a,0x00,0x05,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_lg_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x05,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_lg_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x05,0xd4,0xff,0xff,0x03,0x00]

-# W32: v_cmp_lg_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x05,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_lg_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x05,0xd4,0x01,0x04,0x00,0x00]
 0x0a,0x00,0x05,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_lg_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x05,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_lg_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x05,0xd4,0x01,0x04,0x00,0x00]

-# W32: v_cmp_lg_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x05,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_lg_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x05,0xd4,0x69,0xd2,0x00,0x00]
 0x0a,0x00,0x05,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_lg_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x05,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_lg_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x05,0xd4,0x69,0xd2,0x00,0x00]

-# W32: v_cmp_lg_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x05,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_lg_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x05,0xd4,0x6a,0xf6,0x00,0x00]
 0x0a,0x00,0x05,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_lg_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x05,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_lg_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x05,0xd4,0x6a,0xf6,0x00,0x00]

-# W32: v_cmp_lg_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x05,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_lg_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x05,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x0a,0x00,0x05,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_lg_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x05,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_lg_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x05,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_lg_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x05,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_lg_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x05,0xd4,0x7b,0xfa,0x01,0x00]
 0x0a,0x00,0x05,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_lg_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x05,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_lg_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x05,0xd4,0x7b,0xfa,0x01,0x00]

-# W32: v_cmp_lg_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x05,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_lg_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x05,0xd4,0x7d,0xe0,0x01,0x00]
 0x0a,0x00,0x05,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_lg_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x05,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_lg_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x05,0xd4,0x7d,0xe0,0x01,0x00]

-# W32: v_cmp_lg_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x05,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_lg_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x05,0xd4,0x7e,0x82,0x01,0x00]
 0x0a,0x00,0x05,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_lg_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x05,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_lg_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x05,0xd4,0x7e,0x82,0x01,0x00]

-# W32: v_cmp_lg_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x05,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_lg_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x05,0xd4,0x7f,0xf8,0x00,0x00]
 0x0a,0x01,0x05,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_lg_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x05,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_lg_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x05,0xd4,0x7f,0xf8,0x00,0x00]

-# W32: v_cmp_lg_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x05,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_lg_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x05,0xd4,0x7c,0xfc,0x00,0x00]
 0x0a,0x00,0x05,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_lg_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x05,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_lg_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x05,0xd4,0x7c,0xfc,0x00,0x00]

-# W32: v_cmp_lg_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x05,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_lg_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x05,0xd4,0xc1,0xfe,0x00,0x00]
 0x68,0x00,0x05,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_lg_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x05,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_lg_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x05,0xd4,0xc1,0xfe,0x00,0x00]

-# W32: v_cmp_lg_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x05,0xd4,0xf0,0xfa,0x00,0x40]
-# W64: v_cmp_lg_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x05,0xd4,0xf0,0xfa,0x00,0x40]
 0x6a,0x00,0x05,0xd4,0xf0,0xfa,0x00,0x40
+# W32: v_cmp_lg_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x05,0xd4,0xf0,0xfa,0x00,0x40]
+# W64: v_cmp_lg_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x05,0xd4,0xf0,0xfa,0x00,0x40]

+0x7a,0x02,0x05,0xd4,0xfd,0xd4,0x00,0x20
 # W32: v_cmp_lg_f16_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x05,0xd4,0xfd,0xd4,0x00,0x20]
 # W64: v_cmp_lg_f16_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x05,0xd4,0xfd,0xd4,0x00,0x20]
-0x7a,0x02,0x05,0xd4,0xfd,0xd4,0x00,0x20

-# GFX12: v_cmp_lg_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x05,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
 0x7c,0x83,0x05,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmp_lg_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x05,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_lg_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x15,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_lg_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x15,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x15,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_lg_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x15,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_lg_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x15,0xd4,0x01,0x05,0x02,0x00]

-# W32: v_cmp_lg_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x15,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_lg_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x15,0xd4,0xff,0xff,0x03,0x00]
 0x0a,0x00,0x15,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_lg_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x15,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_lg_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x15,0xd4,0xff,0xff,0x03,0x00]

-# W32: v_cmp_lg_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x15,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_lg_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x15,0xd4,0x01,0x04,0x00,0x00]
 0x0a,0x00,0x15,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_lg_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x15,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_lg_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x15,0xd4,0x01,0x04,0x00,0x00]

-# W32: v_cmp_lg_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x15,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_lg_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x15,0xd4,0x69,0xd2,0x00,0x00]
 0x0a,0x00,0x15,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_lg_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x15,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_lg_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x15,0xd4,0x69,0xd2,0x00,0x00]

-# W32: v_cmp_lg_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x15,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_lg_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x15,0xd4,0x6a,0xf6,0x00,0x00]
 0x0a,0x00,0x15,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_lg_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x15,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_lg_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x15,0xd4,0x6a,0xf6,0x00,0x00]

-# W32: v_cmp_lg_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x15,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_lg_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x15,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x0a,0x00,0x15,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# W32: v_cmp_lg_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x15,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_lg_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x15,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_lg_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x15,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_lg_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x15,0xd4,0x7b,0xfa,0x01,0x00]
 0x0a,0x00,0x15,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_lg_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x15,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_lg_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x15,0xd4,0x7b,0xfa,0x01,0x00]

-# W32: v_cmp_lg_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x15,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_lg_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x15,0xd4,0x7d,0xe0,0x01,0x00]
 0x0a,0x00,0x15,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_lg_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x15,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_lg_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x15,0xd4,0x7d,0xe0,0x01,0x00]

-# W32: v_cmp_lg_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x15,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_lg_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x15,0xd4,0x7e,0x82,0x01,0x00]
 0x0a,0x00,0x15,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_lg_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x15,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_lg_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x15,0xd4,0x7e,0x82,0x01,0x00]

-# W32: v_cmp_lg_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x15,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_lg_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x15,0xd4,0x7f,0xf8,0x00,0x00]
 0x0a,0x01,0x15,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_lg_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x15,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_lg_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x15,0xd4,0x7f,0xf8,0x00,0x00]

-# W32: v_cmp_lg_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x15,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_lg_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x15,0xd4,0x7c,0xfc,0x00,0x00]
 0x0a,0x00,0x15,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_lg_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x15,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_lg_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x15,0xd4,0x7c,0xfc,0x00,0x00]

-# W32: v_cmp_lg_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x15,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_lg_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x15,0xd4,0xc1,0xfe,0x00,0x00]
 0x68,0x00,0x15,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_lg_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x15,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_lg_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x15,0xd4,0xc1,0xfe,0x00,0x00]

-# W32: v_cmp_lg_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x15,0xd4,0xf0,0xfa,0x00,0x40]
-# W64: v_cmp_lg_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x15,0xd4,0xf0,0xfa,0x00,0x40]
 0x6a,0x00,0x15,0xd4,0xf0,0xfa,0x00,0x40
+# W32: v_cmp_lg_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x15,0xd4,0xf0,0xfa,0x00,0x40]
+# W64: v_cmp_lg_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x15,0xd4,0xf0,0xfa,0x00,0x40]

+0x7a,0x02,0x15,0xd4,0xfd,0xd4,0x00,0x20
 # W32: v_cmp_lg_f32_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x15,0xd4,0xfd,0xd4,0x00,0x20]
 # W64: v_cmp_lg_f32_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x15,0xd4,0xfd,0xd4,0x00,0x20]
-0x7a,0x02,0x15,0xd4,0xfd,0xd4,0x00,0x20

-# GFX12: v_cmp_lg_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x15,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
 0x7c,0x83,0x15,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf
+# GFX12: v_cmp_lg_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x15,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_lg_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x25,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_lg_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x25,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x25,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_lg_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x25,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_lg_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x25,0xd4,0x01,0x05,0x02,0x00]

+0x0a,0x00,0x25,0xd4,0xfe,0xfd,0x03,0x00
 # W32: v_cmp_lg_f64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x25,0xd4,0xfe,0xfd,0x03,0x00]
 # W64: v_cmp_lg_f64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x25,0xd4,0xfe,0xfd,0x03,0x00]
-0x0a,0x00,0x25,0xd4,0xfe,0xfd,0x03,0x00

-# W32: v_cmp_lg_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x25,0xd4,0x02,0x08,0x00,0x00]
-# W64: v_cmp_lg_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x25,0xd4,0x02,0x08,0x00,0x00]
 0x0a,0x00,0x25,0xd4,0x02,0x08,0x00,0x00
+# W32: v_cmp_lg_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x25,0xd4,0x02,0x08,0x00,0x00]
+# W64: v_cmp_lg_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x25,0xd4,0x02,0x08,0x00,0x00]

+0x0a,0x00,0x25,0xd4,0x68,0xd0,0x00,0x00
 # W32: v_cmp_lg_f64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x25,0xd4,0x68,0xd0,0x00,0x00]
 # W64: v_cmp_lg_f64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x25,0xd4,0x68,0xd0,0x00,0x00]
-0x0a,0x00,0x25,0xd4,0x68,0xd0,0x00,0x00

-# W32: v_cmp_lg_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x25,0xd4,0x6a,0xf4,0x00,0x00]
-# W64: v_cmp_lg_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x25,0xd4,0x6a,0xf4,0x00,0x00]
 0x0a,0x00,0x25,0xd4,0x6a,0xf4,0x00,0x00
+# W32: v_cmp_lg_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x25,0xd4,0x6a,0xf4,0x00,0x00]
+# W64: v_cmp_lg_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x25,0xd4,0x6a,0xf4,0x00,0x00]

+0x0a,0x00,0x25,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
 # W32: v_cmp_lg_f64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x25,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 # W64: v_cmp_lg_f64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x25,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x25,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf

-# W32: v_cmp_lg_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x25,0xd4,0x7e,0xfa,0x01,0x20]
-# W64: v_cmp_lg_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x25,0xd4,0x7e,0xfa,0x01,0x20]
 0x0a,0x01,0x25,0xd4,0x7e,0xfa,0x01,0x20
+# W32: v_cmp_lg_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x25,0xd4,0x7e,0xfa,0x01,0x20]
+# W64: v_cmp_lg_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x25,0xd4,0x7e,0xfa,0x01,0x20]

-# W32: v_cmp_lg_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x25,0xd4,0x7c,0xe0,0x01,0x00]
-# W64: v_cmp_lg_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x25,0xd4,0x7c,0xe0,0x01,0x00]
 0x0a,0x00,0x25,0xd4,0x7c,0xe0,0x01,0x00
+# W32: v_cmp_lg_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x25,0xd4,0x7c,0xe0,0x01,0x00]
+# W64: v_cmp_lg_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x25,0xd4,0x7c,0xe0,0x01,0x00]

-# W32: v_cmp_lg_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x25,0xd4,0xc1,0x82,0x01,0x00]
-# W64: v_cmp_lg_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x25,0xd4,0xc1,0x82,0x01,0x00]
 0x68,0x00,0x25,0xd4,0xc1,0x82,0x01,0x00
+# W32: v_cmp_lg_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x25,0xd4,0xc1,0x82,0x01,0x00]
+# W64: v_cmp_lg_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x25,0xd4,0xc1,0x82,0x01,0x00]

-# W32: v_cmp_lg_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x25,0xd4,0xf0,0xf8,0x00,0x00]
-# W64: v_cmp_lg_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x25,0xd4,0xf0,0xf8,0x00,0x00]
 0x6a,0x00,0x25,0xd4,0xf0,0xf8,0x00,0x00
+# W32: v_cmp_lg_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x25,0xd4,0xf0,0xf8,0x00,0x00]
+# W64: v_cmp_lg_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x25,0xd4,0xf0,0xf8,0x00,0x00]

+0x7a,0x03,0x25,0xd4,0xfd,0xfc,0x00,0x60
 # W32: v_cmp_lg_f64_e64 ttmp14, -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x25,0xd4,0xfd,0xfc,0x00,0x60]
 # W64: v_cmp_lg_f64_e64 ttmp[14:15], -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x25,0xd4,0xfd,0xfc,0x00,0x60]
-0x7a,0x03,0x25,0xd4,0xfd,0xfc,0x00,0x60

-# GFX12: v_cmp_lg_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x25,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
 0x7c,0x82,0x25,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf
+# GFX12: v_cmp_lg_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x25,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_lt_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x01,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_lt_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x01,0xd4,0x01,0x05,0x02,0x00]
 0x0a,0x00,0x01,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_lt_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x01,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_lt_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x01,0xd4,0x01,0x05,0x02,0x00]

-# W32: v_cmp_lt_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x01,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_lt_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x01,0xd4,0xff,0xff,0x03,0x00]
 0x0a,0x00,0x01,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_lt_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x01,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_lt_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x01,0xd4,0xff,0xff,0x03,0x00]

-# W32: v_cmp_lt_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x01,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_lt_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x01,0xd4,0x01,0x04,0x00,0x00]
 0x0a,0x00,0x01,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_lt_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x01,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_lt_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x01,0xd4,0x01,0x04,0x00,0x00]

-# W32: v_cmp_lt_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x01,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_lt_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x01,0xd4,0x69,0xd2,0x00,0x00]
 0x0a,0x00,0x01,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_lt_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x01,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_lt_f16_e64
s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x01,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_lt_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x01,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_lt_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x01,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x01,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_lt_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x01,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_lt_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x01,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_lt_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x01,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_lt_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x01,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x0a,0x00,0x01,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_lt_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x01,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_lt_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x01,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_lt_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x01,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_lt_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x01,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x01,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_lt_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x01,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_lt_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x01,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_lt_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x01,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_lt_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x01,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x01,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_lt_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x01,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_lt_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x01,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_lt_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x01,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_lt_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x01,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x01,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_lt_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x01,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_lt_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x01,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_lt_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x01,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_lt_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x01,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x01,0x01,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_lt_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x01,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_lt_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x01,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_lt_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x01,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_lt_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x01,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x01,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_lt_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x01,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_lt_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x01,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_lt_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x01,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_lt_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x01,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x01,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_lt_f16_e64 s104, -1, exec_hi ; 
encoding: [0x68,0x00,0x01,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_lt_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x01,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_lt_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x01,0xd4,0xf0,0xfa,0x00,0x40] -# W64: v_cmp_lt_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x01,0xd4,0xf0,0xfa,0x00,0x40] 0x6a,0x00,0x01,0xd4,0xf0,0xfa,0x00,0x40 +# W32: v_cmp_lt_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x01,0xd4,0xf0,0xfa,0x00,0x40] +# W64: v_cmp_lt_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x01,0xd4,0xf0,0xfa,0x00,0x40] +0x7a,0x02,0x01,0xd4,0xfd,0xd4,0x00,0x20 # W32: v_cmp_lt_f16_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x01,0xd4,0xfd,0xd4,0x00,0x20] # W64: v_cmp_lt_f16_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x01,0xd4,0xfd,0xd4,0x00,0x20] -0x7a,0x02,0x01,0xd4,0xfd,0xd4,0x00,0x20 -# GFX12: v_cmp_lt_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x01,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7c,0x83,0x01,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmp_lt_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x01,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_lt_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x11,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_lt_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x11,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x11,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_lt_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x11,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_lt_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x11,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_lt_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x11,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_lt_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x11,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x11,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_lt_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x11,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_lt_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x11,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_lt_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x11,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_lt_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x11,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x11,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_lt_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x11,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_lt_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x11,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_lt_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x11,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_lt_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x11,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x11,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_lt_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x11,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_lt_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x11,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_lt_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x11,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_lt_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x11,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x11,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_lt_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x11,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_lt_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x11,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_lt_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x11,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W64: v_cmp_lt_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: 
[0x0a,0x00,0x11,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x0a,0x00,0x11,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# W32: v_cmp_lt_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x11,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] +# W64: v_cmp_lt_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x11,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_lt_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x11,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_lt_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x11,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x11,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_lt_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x11,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_lt_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x11,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_lt_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x11,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_lt_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x11,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x11,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_lt_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x11,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_lt_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x11,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_lt_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x11,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_lt_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x11,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x11,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_lt_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x11,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_lt_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x11,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_lt_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x11,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_lt_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x11,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x01,0x11,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_lt_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x11,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_lt_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x11,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_lt_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x11,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_lt_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x11,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x11,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_lt_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x11,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_lt_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x11,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_lt_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x11,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_lt_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x11,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x11,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_lt_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x11,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_lt_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x11,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_lt_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x11,0xd4,0xf0,0xfa,0x00,0x40] -# W64: v_cmp_lt_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x11,0xd4,0xf0,0xfa,0x00,0x40] 0x6a,0x00,0x11,0xd4,0xf0,0xfa,0x00,0x40 +# W32: v_cmp_lt_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x11,0xd4,0xf0,0xfa,0x00,0x40] +# W64: v_cmp_lt_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x11,0xd4,0xf0,0xfa,0x00,0x40] +0x7a,0x02,0x11,0xd4,0xfd,0xd4,0x00,0x20 # W32: v_cmp_lt_f32_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: 
[0x7a,0x02,0x11,0xd4,0xfd,0xd4,0x00,0x20] # W64: v_cmp_lt_f32_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x11,0xd4,0xfd,0xd4,0x00,0x20] -0x7a,0x02,0x11,0xd4,0xfd,0xd4,0x00,0x20 -# GFX12: v_cmp_lt_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x11,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] 0x7c,0x83,0x11,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf +# GFX12: v_cmp_lt_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x11,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] -# W32: v_cmp_lt_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x21,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_lt_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x21,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x21,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_lt_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x21,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_lt_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x21,0xd4,0x01,0x05,0x02,0x00] +0x0a,0x00,0x21,0xd4,0xfe,0xfd,0x03,0x00 # W32: v_cmp_lt_f64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x21,0xd4,0xfe,0xfd,0x03,0x00] # W64: v_cmp_lt_f64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x21,0xd4,0xfe,0xfd,0x03,0x00] -0x0a,0x00,0x21,0xd4,0xfe,0xfd,0x03,0x00 -# W32: v_cmp_lt_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x21,0xd4,0x02,0x08,0x00,0x00] -# W64: v_cmp_lt_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x21,0xd4,0x02,0x08,0x00,0x00] 0x0a,0x00,0x21,0xd4,0x02,0x08,0x00,0x00 +# W32: v_cmp_lt_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x21,0xd4,0x02,0x08,0x00,0x00] +# W64: v_cmp_lt_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x21,0xd4,0x02,0x08,0x00,0x00] +0x0a,0x00,0x21,0xd4,0x68,0xd0,0x00,0x00 # W32: v_cmp_lt_f64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x21,0xd4,0x68,0xd0,0x00,0x00] # W64: v_cmp_lt_f64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x21,0xd4,0x68,0xd0,0x00,0x00] -0x0a,0x00,0x21,0xd4,0x68,0xd0,0x00,0x00 -# W32: v_cmp_lt_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x21,0xd4,0x6a,0xf4,0x00,0x00] -# W64: v_cmp_lt_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x21,0xd4,0x6a,0xf4,0x00,0x00] 0x0a,0x00,0x21,0xd4,0x6a,0xf4,0x00,0x00 +# W32: v_cmp_lt_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x21,0xd4,0x6a,0xf4,0x00,0x00] +# W64: v_cmp_lt_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x21,0xd4,0x6a,0xf4,0x00,0x00] +0x0a,0x00,0x21,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_lt_f64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x21,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: v_cmp_lt_f64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x21,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x21,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_lt_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x21,0xd4,0x7e,0xfa,0x01,0x20] -# W64: v_cmp_lt_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x21,0xd4,0x7e,0xfa,0x01,0x20] 0x0a,0x01,0x21,0xd4,0x7e,0xfa,0x01,0x20 +# W32: v_cmp_lt_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x21,0xd4,0x7e,0xfa,0x01,0x20] +# W64: v_cmp_lt_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x21,0xd4,0x7e,0xfa,0x01,0x20] -# W32: v_cmp_lt_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x21,0xd4,0x7c,0xe0,0x01,0x00] -# W64: v_cmp_lt_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x21,0xd4,0x7c,0xe0,0x01,0x00] 0x0a,0x00,0x21,0xd4,0x7c,0xe0,0x01,0x00 +# W32: 
v_cmp_lt_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x21,0xd4,0x7c,0xe0,0x01,0x00] +# W64: v_cmp_lt_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x21,0xd4,0x7c,0xe0,0x01,0x00] -# W32: v_cmp_lt_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x21,0xd4,0xc1,0x82,0x01,0x00] -# W64: v_cmp_lt_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x21,0xd4,0xc1,0x82,0x01,0x00] 0x68,0x00,0x21,0xd4,0xc1,0x82,0x01,0x00 +# W32: v_cmp_lt_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x21,0xd4,0xc1,0x82,0x01,0x00] +# W64: v_cmp_lt_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x21,0xd4,0xc1,0x82,0x01,0x00] -# W32: v_cmp_lt_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x21,0xd4,0xf0,0xf8,0x00,0x00] -# W64: v_cmp_lt_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x21,0xd4,0xf0,0xf8,0x00,0x00] 0x6a,0x00,0x21,0xd4,0xf0,0xf8,0x00,0x00 +# W32: v_cmp_lt_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x21,0xd4,0xf0,0xf8,0x00,0x00] +# W64: v_cmp_lt_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x21,0xd4,0xf0,0xf8,0x00,0x00] +0x7a,0x03,0x21,0xd4,0xfd,0xfc,0x00,0x60 # W32: v_cmp_lt_f64_e64 ttmp14, -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x21,0xd4,0xfd,0xfc,0x00,0x60] # W64: v_cmp_lt_f64_e64 ttmp[14:15], -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x21,0xd4,0xfd,0xfc,0x00,0x60] -0x7a,0x03,0x21,0xd4,0xfd,0xfc,0x00,0x60 -# GFX12: v_cmp_lt_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x21,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7c,0x82,0x21,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf +# GFX12: v_cmp_lt_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x21,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -# W32: v_cmp_lt_i16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x31,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_lt_i16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x31,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x31,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_lt_i16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x31,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_lt_i16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x31,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_lt_i16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x31,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_lt_i16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x31,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x31,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_lt_i16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x31,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_lt_i16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x31,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_lt_i16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x31,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_lt_i16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x31,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x31,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_lt_i16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x31,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_lt_i16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x31,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_lt_i16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x31,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_lt_i16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x31,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x31,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_lt_i16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x31,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_lt_i16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x31,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_lt_i16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x31,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_lt_i16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x31,0xd4,0x6a,0xf6,0x00,0x00] 
0x0a,0x00,0x31,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_lt_i16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x31,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_lt_i16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x31,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_lt_i16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x31,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_lt_i16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x31,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
0x0a,0x00,0x31,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_lt_i16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x31,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_lt_i16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x31,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_lt_i16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x31,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_lt_i16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x31,0xd4,0x7b,0xfa,0x01,0x00]
0x0a,0x00,0x31,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_lt_i16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x31,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_lt_i16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x31,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_lt_i16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x31,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# W64: v_cmp_lt_i16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x31,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
0x0a,0x00,0x31,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_lt_i16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x31,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64: v_cmp_lt_i16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x31,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# W32: v_cmp_lt_i16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x31,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_lt_i16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x31,0xd4,0x7e,0x82,0x01,0x00]
0x0a,0x00,0x31,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_lt_i16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x31,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_lt_i16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x31,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_lt_i16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x31,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_lt_i16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x31,0xd4,0x7f,0xf8,0x00,0x00]
0x0a,0x00,0x31,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_lt_i16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x31,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_lt_i16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x31,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_lt_i16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x31,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_lt_i16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x31,0xd4,0x7c,0xfc,0x00,0x00]
0x0a,0x00,0x31,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_lt_i16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x31,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_lt_i16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x31,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_lt_i16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x31,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_lt_i16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x31,0xd4,0xc1,0xfe,0x00,0x00]
0x68,0x00,0x31,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_lt_i16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x31,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_lt_i16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x31,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_lt_i16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x31,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# W64: v_cmp_lt_i16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x31,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
0x6a,0x00,0x31,0xd4,0xf0,0xfa,0x00,0x00
+# W32: v_cmp_lt_i16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x31,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64: v_cmp_lt_i16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x31,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# W32: v_cmp_lt_i16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x31,0xd4,0xfd,0xd4,0x00,0x00]
-# W64: v_cmp_lt_i16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x31,0xd4,0xfd,0xd4,0x00,0x00]
0x7a,0x00,0x31,0xd4,0xfd,0xd4,0x00,0x00
+# W32: v_cmp_lt_i16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x31,0xd4,0xfd,0xd4,0x00,0x00]
+# W64: v_cmp_lt_i16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x31,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX12: v_cmp_lt_i16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x31,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
0x7c,0x00,0x31,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmp_lt_i16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x31,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_lt_i32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x41,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_lt_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x41,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x41,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_lt_i32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x41,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_lt_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x41,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_lt_i32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x41,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_lt_i32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x41,0xd4,0xff,0xff,0x03,0x00]
0x0a,0x00,0x41,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_lt_i32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x41,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_lt_i32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x41,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_lt_i32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x41,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_lt_i32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x41,0xd4,0x01,0x04,0x00,0x00]
0x0a,0x00,0x41,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_lt_i32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x41,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_lt_i32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x41,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_lt_i32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x41,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_lt_i32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x41,0xd4,0x69,0xd2,0x00,0x00]
0x0a,0x00,0x41,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_lt_i32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x41,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_lt_i32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x41,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_lt_i32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x41,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_lt_i32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x41,0xd4,0x6a,0xf6,0x00,0x00]
0x0a,0x00,0x41,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_lt_i32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x41,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_lt_i32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x41,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_lt_i32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x41,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_lt_i32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x41,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
0x0a,0x00,0x41,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# W32: v_cmp_lt_i32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x41,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_lt_i32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x41,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_lt_i32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x41,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_lt_i32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x41,0xd4,0x7b,0xfa,0x01,0x00]
0x0a,0x00,0x41,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_lt_i32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x41,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_lt_i32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x41,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_lt_i32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x41,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_lt_i32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x41,0xd4,0x7d,0xe0,0x01,0x00]
0x0a,0x00,0x41,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_lt_i32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x41,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_lt_i32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x41,0xd4,0x7d,0xe0,0x01,0x00]
-# W32: v_cmp_lt_i32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x41,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_lt_i32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x41,0xd4,0x7e,0x82,0x01,0x00]
0x0a,0x00,0x41,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_lt_i32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x41,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_lt_i32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x41,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_lt_i32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x41,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_lt_i32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x41,0xd4,0x7f,0xf8,0x00,0x00]
0x0a,0x00,0x41,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_lt_i32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x41,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_lt_i32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x41,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_lt_i32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x41,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_lt_i32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x41,0xd4,0x7c,0xfc,0x00,0x00]
0x0a,0x00,0x41,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_lt_i32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x41,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_lt_i32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x41,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_lt_i32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x41,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_lt_i32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x41,0xd4,0xc1,0xfe,0x00,0x00]
0x68,0x00,0x41,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_lt_i32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x41,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_lt_i32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x41,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_lt_i32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x41,0xd4,0xf0,0xfa,0x00,0x00]
-# W64: v_cmp_lt_i32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x41,0xd4,0xf0,0xfa,0x00,0x00]
0x6a,0x00,0x41,0xd4,0xf0,0xfa,0x00,0x00
+# W32: v_cmp_lt_i32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x41,0xd4,0xf0,0xfa,0x00,0x00]
+# W64: v_cmp_lt_i32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x41,0xd4,0xf0,0xfa,0x00,0x00]
-# W32: v_cmp_lt_i32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x41,0xd4,0xfd,0xd4,0x00,0x00]
-# W64: v_cmp_lt_i32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x41,0xd4,0xfd,0xd4,0x00,0x00]
0x7a,0x00,0x41,0xd4,0xfd,0xd4,0x00,0x00
+# W32: v_cmp_lt_i32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x41,0xd4,0xfd,0xd4,0x00,0x00]
+# W64: v_cmp_lt_i32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x41,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX12: v_cmp_lt_i32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x41,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
0x7c,0x00,0x41,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmp_lt_i32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x41,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_lt_i64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x51,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_lt_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x51,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x51,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_lt_i64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x51,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_lt_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x51,0xd4,0x01,0x05,0x02,0x00]
+0x0a,0x00,0x51,0xd4,0xfe,0xfd,0x03,0x00
# W32: v_cmp_lt_i64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x51,0xd4,0xfe,0xfd,0x03,0x00]
# W64: v_cmp_lt_i64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x51,0xd4,0xfe,0xfd,0x03,0x00]
-0x0a,0x00,0x51,0xd4,0xfe,0xfd,0x03,0x00
-# W32: v_cmp_lt_i64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x51,0xd4,0x02,0x08,0x00,0x00]
-# W64: v_cmp_lt_i64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x51,0xd4,0x02,0x08,0x00,0x00]
0x0a,0x00,0x51,0xd4,0x02,0x08,0x00,0x00
+# W32: v_cmp_lt_i64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x51,0xd4,0x02,0x08,0x00,0x00]
+# W64: v_cmp_lt_i64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x51,0xd4,0x02,0x08,0x00,0x00]
+0x0a,0x00,0x51,0xd4,0x68,0xd0,0x00,0x00
# W32: v_cmp_lt_i64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x51,0xd4,0x68,0xd0,0x00,0x00]
# W64: v_cmp_lt_i64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x51,0xd4,0x68,0xd0,0x00,0x00]
-0x0a,0x00,0x51,0xd4,0x68,0xd0,0x00,0x00
-# W32: v_cmp_lt_i64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x51,0xd4,0x6a,0xf4,0x00,0x00]
-# W64: v_cmp_lt_i64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x51,0xd4,0x6a,0xf4,0x00,0x00]
0x0a,0x00,0x51,0xd4,0x6a,0xf4,0x00,0x00
+# W32: v_cmp_lt_i64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x51,0xd4,0x6a,0xf4,0x00,0x00]
+# W64: v_cmp_lt_i64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x51,0xd4,0x6a,0xf4,0x00,0x00]
+0x0a,0x00,0x51,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
# W32: v_cmp_lt_i64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x51,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
# W64: v_cmp_lt_i64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x51,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x51,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
-# W32: v_cmp_lt_i64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x51,0xd4,0x7e,0xfa,0x01,0x00]
-# W64: v_cmp_lt_i64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x51,0xd4,0x7e,0xfa,0x01,0x00]
0x0a,0x00,0x51,0xd4,0x7e,0xfa,0x01,0x00
+# W32: v_cmp_lt_i64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x51,0xd4,0x7e,0xfa,0x01,0x00]
+# W64: v_cmp_lt_i64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x51,0xd4,0x7e,0xfa,0x01,0x00]
-# W32: v_cmp_lt_i64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x51,0xd4,0x7c,0xe0,0x01,0x00]
-# W64: v_cmp_lt_i64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x51,0xd4,0x7c,0xe0,0x01,0x00]
0x0a,0x00,0x51,0xd4,0x7c,0xe0,0x01,0x00
+# W32: v_cmp_lt_i64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x51,0xd4,0x7c,0xe0,0x01,0x00]
+# W64: v_cmp_lt_i64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x51,0xd4,0x7c,0xe0,0x01,0x00]
-# W32: v_cmp_lt_i64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x51,0xd4,0xc1,0x82,0x01,0x00]
-# W64: v_cmp_lt_i64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x51,0xd4,0xc1,0x82,0x01,0x00]
0x68,0x00,0x51,0xd4,0xc1,0x82,0x01,0x00
+# W32: v_cmp_lt_i64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x51,0xd4,0xc1,0x82,0x01,0x00]
+# W64: v_cmp_lt_i64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x51,0xd4,0xc1,0x82,0x01,0x00]
-# W32: v_cmp_lt_i64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x51,0xd4,0xf0,0xf8,0x00,0x00]
-# W64: v_cmp_lt_i64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x51,0xd4,0xf0,0xf8,0x00,0x00]
0x6a,0x00,0x51,0xd4,0xf0,0xf8,0x00,0x00
+# W32: v_cmp_lt_i64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x51,0xd4,0xf0,0xf8,0x00,0x00]
+# W64: v_cmp_lt_i64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x51,0xd4,0xf0,0xf8,0x00,0x00]
-# W32: v_cmp_lt_i64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x51,0xd4,0xfd,0xfc,0x00,0x00]
-# W64: v_cmp_lt_i64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x51,0xd4,0xfd,0xfc,0x00,0x00]
0x7a,0x00,0x51,0xd4,0xfd,0xfc,0x00,0x00
+# W32: v_cmp_lt_i64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x51,0xd4,0xfd,0xfc,0x00,0x00]
+# W64: v_cmp_lt_i64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x51,0xd4,0xfd,0xfc,0x00,0x00]
-# GFX12: v_cmp_lt_i64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x51,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
0x7c,0x00,0x51,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmp_lt_i64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x51,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_lt_u16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x39,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_lt_u16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x39,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x39,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_lt_u16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x39,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_lt_u16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x39,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_lt_u16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x39,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_lt_u16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x39,0xd4,0xff,0xff,0x03,0x00]
0x0a,0x00,0x39,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_lt_u16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x39,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_lt_u16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x39,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_lt_u16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x39,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_lt_u16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x39,0xd4,0x01,0x04,0x00,0x00]
0x0a,0x00,0x39,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_lt_u16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x39,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_lt_u16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x39,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_lt_u16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x39,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_lt_u16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x39,0xd4,0x69,0xd2,0x00,0x00]
0x0a,0x00,0x39,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_lt_u16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x39,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_lt_u16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x39,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_lt_u16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x39,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_lt_u16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x39,0xd4,0x6a,0xf6,0x00,0x00]
0x0a,0x00,0x39,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_lt_u16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x39,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_lt_u16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x39,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_lt_u16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x39,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_lt_u16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x39,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
0x0a,0x00,0x39,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_lt_u16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x39,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_lt_u16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x39,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_lt_u16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x39,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_lt_u16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x39,0xd4,0x7b,0xfa,0x01,0x00]
0x0a,0x00,0x39,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_lt_u16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x39,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_lt_u16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x39,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_lt_u16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x39,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# W64: v_cmp_lt_u16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x39,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
0x0a,0x00,0x39,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_lt_u16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x39,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64: v_cmp_lt_u16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x39,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# W32: v_cmp_lt_u16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x39,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_lt_u16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x39,0xd4,0x7e,0x82,0x01,0x00]
0x0a,0x00,0x39,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_lt_u16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x39,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_lt_u16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x39,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_lt_u16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x39,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_lt_u16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x39,0xd4,0x7f,0xf8,0x00,0x00]
0x0a,0x00,0x39,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_lt_u16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x39,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_lt_u16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x39,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_lt_u16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x39,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_lt_u16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x39,0xd4,0x7c,0xfc,0x00,0x00]
0x0a,0x00,0x39,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_lt_u16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x39,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_lt_u16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x39,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_lt_u16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x39,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_lt_u16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x39,0xd4,0xc1,0xfe,0x00,0x00]
0x68,0x00,0x39,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_lt_u16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x39,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_lt_u16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x39,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_lt_u16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x39,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# W64: v_cmp_lt_u16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x39,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
0x6a,0x00,0x39,0xd4,0xf0,0xfa,0x00,0x00
+# W32: v_cmp_lt_u16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x39,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64: v_cmp_lt_u16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x39,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# W32: v_cmp_lt_u16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x39,0xd4,0xfd,0xd4,0x00,0x00]
-# W64: v_cmp_lt_u16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x39,0xd4,0xfd,0xd4,0x00,0x00]
0x7a,0x00,0x39,0xd4,0xfd,0xd4,0x00,0x00
+# W32: v_cmp_lt_u16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x39,0xd4,0xfd,0xd4,0x00,0x00]
+# W64: v_cmp_lt_u16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x39,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX12: v_cmp_lt_u16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x39,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
0x7c,0x00,0x39,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmp_lt_u16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x39,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_lt_u32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x49,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_lt_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x49,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x49,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_lt_u32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x49,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_lt_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x49,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_lt_u32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x49,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_lt_u32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x49,0xd4,0xff,0xff,0x03,0x00]
0x0a,0x00,0x49,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_lt_u32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x49,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_lt_u32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x49,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_lt_u32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x49,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_lt_u32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x49,0xd4,0x01,0x04,0x00,0x00]
0x0a,0x00,0x49,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_lt_u32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x49,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_lt_u32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x49,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_lt_u32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x49,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_lt_u32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x49,0xd4,0x69,0xd2,0x00,0x00]
0x0a,0x00,0x49,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_lt_u32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x49,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_lt_u32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x49,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_lt_u32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x49,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_lt_u32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x49,0xd4,0x6a,0xf6,0x00,0x00]
0x0a,0x00,0x49,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_lt_u32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x49,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_lt_u32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x49,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_lt_u32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x49,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_lt_u32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x49,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
0x0a,0x00,0x49,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# W32: v_cmp_lt_u32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x49,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_lt_u32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x49,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_lt_u32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x49,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_lt_u32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x49,0xd4,0x7b,0xfa,0x01,0x00]
0x0a,0x00,0x49,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_lt_u32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x49,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_lt_u32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x49,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_lt_u32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x49,0xd4,0x7d,0xe0,0x01,0x00]
-# W64: v_cmp_lt_u32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x49,0xd4,0x7d,0xe0,0x01,0x00]
0x0a,0x00,0x49,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_lt_u32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x49,0xd4,0x7d,0xe0,0x01,0x00]
+# W64: v_cmp_lt_u32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x49,0xd4,0x7d,0xe0,0x01,0x00]
-# W32: v_cmp_lt_u32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x49,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_lt_u32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x49,0xd4,0x7e,0x82,0x01,0x00]
0x0a,0x00,0x49,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_lt_u32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x49,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_lt_u32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x49,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_lt_u32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x49,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_lt_u32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x49,0xd4,0x7f,0xf8,0x00,0x00]
0x0a,0x00,0x49,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_lt_u32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x49,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_lt_u32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x49,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_lt_u32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x49,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_lt_u32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x49,0xd4,0x7c,0xfc,0x00,0x00]
0x0a,0x00,0x49,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_lt_u32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x49,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_lt_u32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x49,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_lt_u32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x49,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_lt_u32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x49,0xd4,0xc1,0xfe,0x00,0x00]
0x68,0x00,0x49,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_lt_u32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x49,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_lt_u32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x49,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_lt_u32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x49,0xd4,0xf0,0xfa,0x00,0x00]
-# W64: v_cmp_lt_u32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x49,0xd4,0xf0,0xfa,0x00,0x00]
0x6a,0x00,0x49,0xd4,0xf0,0xfa,0x00,0x00
+# W32: v_cmp_lt_u32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x49,0xd4,0xf0,0xfa,0x00,0x00]
+# W64: v_cmp_lt_u32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x49,0xd4,0xf0,0xfa,0x00,0x00]
-# W32: v_cmp_lt_u32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x49,0xd4,0xfd,0xd4,0x00,0x00]
-# W64: v_cmp_lt_u32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x49,0xd4,0xfd,0xd4,0x00,0x00]
0x7a,0x00,0x49,0xd4,0xfd,0xd4,0x00,0x00
+# W32: v_cmp_lt_u32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x49,0xd4,0xfd,0xd4,0x00,0x00]
+# W64: v_cmp_lt_u32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x49,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX12: v_cmp_lt_u32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x49,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
0x7c,0x00,0x49,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmp_lt_u32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x49,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_lt_u64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x59,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_lt_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x59,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x59,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_lt_u64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x59,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_lt_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x59,0xd4,0x01,0x05,0x02,0x00]
+0x0a,0x00,0x59,0xd4,0xfe,0xfd,0x03,0x00
# W32: v_cmp_lt_u64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x59,0xd4,0xfe,0xfd,0x03,0x00]
# W64: v_cmp_lt_u64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x59,0xd4,0xfe,0xfd,0x03,0x00]
-0x0a,0x00,0x59,0xd4,0xfe,0xfd,0x03,0x00
-# W32: v_cmp_lt_u64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x59,0xd4,0x02,0x08,0x00,0x00]
-# W64: v_cmp_lt_u64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x59,0xd4,0x02,0x08,0x00,0x00]
0x0a,0x00,0x59,0xd4,0x02,0x08,0x00,0x00
+# W32: v_cmp_lt_u64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x59,0xd4,0x02,0x08,0x00,0x00]
+# W64: v_cmp_lt_u64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x59,0xd4,0x02,0x08,0x00,0x00]
+0x0a,0x00,0x59,0xd4,0x68,0xd0,0x00,0x00
# W32: v_cmp_lt_u64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x59,0xd4,0x68,0xd0,0x00,0x00]
# W64: v_cmp_lt_u64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x59,0xd4,0x68,0xd0,0x00,0x00]
-0x0a,0x00,0x59,0xd4,0x68,0xd0,0x00,0x00
-# W32: v_cmp_lt_u64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x59,0xd4,0x6a,0xf4,0x00,0x00]
-# W64: v_cmp_lt_u64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x59,0xd4,0x6a,0xf4,0x00,0x00]
0x0a,0x00,0x59,0xd4,0x6a,0xf4,0x00,0x00
+# W32: v_cmp_lt_u64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x59,0xd4,0x6a,0xf4,0x00,0x00]
+# W64: v_cmp_lt_u64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x59,0xd4,0x6a,0xf4,0x00,0x00]
+0x0a,0x00,0x59,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
# W32: v_cmp_lt_u64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x59,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
# W64: v_cmp_lt_u64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x59,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-0x0a,0x00,0x59,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
-# W32: v_cmp_lt_u64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x59,0xd4,0x7e,0xfa,0x01,0x00]
-# W64: v_cmp_lt_u64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x59,0xd4,0x7e,0xfa,0x01,0x00]
0x0a,0x00,0x59,0xd4,0x7e,0xfa,0x01,0x00
+# W32: v_cmp_lt_u64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x59,0xd4,0x7e,0xfa,0x01,0x00]
+# W64: v_cmp_lt_u64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x59,0xd4,0x7e,0xfa,0x01,0x00]
-# W32: v_cmp_lt_u64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x59,0xd4,0x7c,0xe0,0x01,0x00]
-# W64: v_cmp_lt_u64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x59,0xd4,0x7c,0xe0,0x01,0x00]
0x0a,0x00,0x59,0xd4,0x7c,0xe0,0x01,0x00
+# W32: v_cmp_lt_u64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x59,0xd4,0x7c,0xe0,0x01,0x00]
+# W64: v_cmp_lt_u64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x59,0xd4,0x7c,0xe0,0x01,0x00]
-# W32: v_cmp_lt_u64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x59,0xd4,0xc1,0x82,0x01,0x00]
-# W64: v_cmp_lt_u64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x59,0xd4,0xc1,0x82,0x01,0x00]
0x68,0x00,0x59,0xd4,0xc1,0x82,0x01,0x00
+# W32: v_cmp_lt_u64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x59,0xd4,0xc1,0x82,0x01,0x00]
+# W64: v_cmp_lt_u64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x59,0xd4,0xc1,0x82,0x01,0x00]
-# W32: v_cmp_lt_u64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x59,0xd4,0xf0,0xf8,0x00,0x00]
-# W64: v_cmp_lt_u64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x59,0xd4,0xf0,0xf8,0x00,0x00]
0x6a,0x00,0x59,0xd4,0xf0,0xf8,0x00,0x00
+# W32: v_cmp_lt_u64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x59,0xd4,0xf0,0xf8,0x00,0x00]
+# W64: v_cmp_lt_u64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x59,0xd4,0xf0,0xf8,0x00,0x00]
-# W32: v_cmp_lt_u64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x59,0xd4,0xfd,0xfc,0x00,0x00]
-# W64: v_cmp_lt_u64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x59,0xd4,0xfd,0xfc,0x00,0x00]
0x7a,0x00,0x59,0xd4,0xfd,0xfc,0x00,0x00
+# W32: v_cmp_lt_u64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x59,0xd4,0xfd,0xfc,0x00,0x00]
+# W64: v_cmp_lt_u64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x59,0xd4,0xfd,0xfc,0x00,0x00]
-# GFX12: v_cmp_lt_u64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x59,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
0x7c,0x00,0x59,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmp_lt_u64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x59,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_ne_i16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x35,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_ne_i16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x35,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x35,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_ne_i16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x35,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_ne_i16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x35,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_ne_i16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x35,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_ne_i16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x35,0xd4,0xff,0xff,0x03,0x00]
0x0a,0x00,0x35,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_ne_i16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x35,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_ne_i16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x35,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_ne_i16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x35,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_ne_i16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x35,0xd4,0x01,0x04,0x00,0x00]
0x0a,0x00,0x35,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_ne_i16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x35,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_ne_i16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x35,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_ne_i16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x35,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_ne_i16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x35,0xd4,0x69,0xd2,0x00,0x00]
0x0a,0x00,0x35,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_ne_i16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x35,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_ne_i16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x35,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_ne_i16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x35,0xd4,0x6a,0xf6,0x00,0x00]
-# W64: v_cmp_ne_i16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x35,0xd4,0x6a,0xf6,0x00,0x00]
0x0a,0x00,0x35,0xd4,0x6a,0xf6,0x00,0x00
+# W32: v_cmp_ne_i16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x35,0xd4,0x6a,0xf6,0x00,0x00]
+# W64: v_cmp_ne_i16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x35,0xd4,0x6a,0xf6,0x00,0x00]
-# W32: v_cmp_ne_i16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x35,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_ne_i16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x35,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
0x0a,0x00,0x35,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_ne_i16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x35,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_ne_i16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x35,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_ne_i16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x35,0xd4,0x7b,0xfa,0x01,0x00]
-# W64: v_cmp_ne_i16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x35,0xd4,0x7b,0xfa,0x01,0x00]
0x0a,0x00,0x35,0xd4,0x7b,0xfa,0x01,0x00
+# W32: v_cmp_ne_i16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x35,0xd4,0x7b,0xfa,0x01,0x00]
+# W64: v_cmp_ne_i16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x35,0xd4,0x7b,0xfa,0x01,0x00]
-# W32: v_cmp_ne_i16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x35,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# W64: v_cmp_ne_i16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x35,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
0x0a,0x00,0x35,0xd4,0x7d,0xe0,0x01,0x00
+# W32: v_cmp_ne_i16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x35,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64: v_cmp_ne_i16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x35,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# W32: v_cmp_ne_i16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x35,0xd4,0x7e,0x82,0x01,0x00]
-# W64: v_cmp_ne_i16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x35,0xd4,0x7e,0x82,0x01,0x00]
0x0a,0x00,0x35,0xd4,0x7e,0x82,0x01,0x00
+# W32: v_cmp_ne_i16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x35,0xd4,0x7e,0x82,0x01,0x00]
+# W64: v_cmp_ne_i16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x35,0xd4,0x7e,0x82,0x01,0x00]
-# W32: v_cmp_ne_i16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x35,0xd4,0x7f,0xf8,0x00,0x00]
-# W64: v_cmp_ne_i16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x35,0xd4,0x7f,0xf8,0x00,0x00]
0x0a,0x00,0x35,0xd4,0x7f,0xf8,0x00,0x00
+# W32: v_cmp_ne_i16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x35,0xd4,0x7f,0xf8,0x00,0x00]
+# W64: v_cmp_ne_i16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x35,0xd4,0x7f,0xf8,0x00,0x00]
-# W32: v_cmp_ne_i16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x35,0xd4,0x7c,0xfc,0x00,0x00]
-# W64: v_cmp_ne_i16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x35,0xd4,0x7c,0xfc,0x00,0x00]
0x0a,0x00,0x35,0xd4,0x7c,0xfc,0x00,0x00
+# W32: v_cmp_ne_i16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x35,0xd4,0x7c,0xfc,0x00,0x00]
+# W64: v_cmp_ne_i16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x35,0xd4,0x7c,0xfc,0x00,0x00]
-# W32: v_cmp_ne_i16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x35,0xd4,0xc1,0xfe,0x00,0x00]
-# W64: v_cmp_ne_i16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x35,0xd4,0xc1,0xfe,0x00,0x00]
0x68,0x00,0x35,0xd4,0xc1,0xfe,0x00,0x00
+# W32: v_cmp_ne_i16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x35,0xd4,0xc1,0xfe,0x00,0x00]
+# W64: v_cmp_ne_i16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x35,0xd4,0xc1,0xfe,0x00,0x00]
-# W32: v_cmp_ne_i16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x35,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# W64: v_cmp_ne_i16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x35,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
0x6a,0x00,0x35,0xd4,0xf0,0xfa,0x00,0x00
+# W32: v_cmp_ne_i16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x35,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64: v_cmp_ne_i16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x35,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# W32: v_cmp_ne_i16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x35,0xd4,0xfd,0xd4,0x00,0x00]
-# W64: v_cmp_ne_i16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x35,0xd4,0xfd,0xd4,0x00,0x00]
0x7a,0x00,0x35,0xd4,0xfd,0xd4,0x00,0x00
+# W32: v_cmp_ne_i16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x35,0xd4,0xfd,0xd4,0x00,0x00]
+# W64: v_cmp_ne_i16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x35,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX12: v_cmp_ne_i16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x35,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
0x7c,0x00,0x35,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmp_ne_i16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x35,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_ne_i32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x45,0xd4,0x01,0x05,0x02,0x00]
-# W64: v_cmp_ne_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x45,0xd4,0x01,0x05,0x02,0x00]
0x0a,0x00,0x45,0xd4,0x01,0x05,0x02,0x00
+# W32: v_cmp_ne_i32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x45,0xd4,0x01,0x05,0x02,0x00]
+# W64: v_cmp_ne_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x45,0xd4,0x01,0x05,0x02,0x00]
-# W32: v_cmp_ne_i32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x45,0xd4,0xff,0xff,0x03,0x00]
-# W64: v_cmp_ne_i32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x45,0xd4,0xff,0xff,0x03,0x00]
0x0a,0x00,0x45,0xd4,0xff,0xff,0x03,0x00
+# W32: v_cmp_ne_i32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x45,0xd4,0xff,0xff,0x03,0x00]
+# W64: v_cmp_ne_i32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x45,0xd4,0xff,0xff,0x03,0x00]
-# W32: v_cmp_ne_i32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x45,0xd4,0x01,0x04,0x00,0x00]
-# W64: v_cmp_ne_i32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x45,0xd4,0x01,0x04,0x00,0x00]
0x0a,0x00,0x45,0xd4,0x01,0x04,0x00,0x00
+# W32: v_cmp_ne_i32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x45,0xd4,0x01,0x04,0x00,0x00]
+# W64: v_cmp_ne_i32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x45,0xd4,0x01,0x04,0x00,0x00]
-# W32: v_cmp_ne_i32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x45,0xd4,0x69,0xd2,0x00,0x00]
-# W64: v_cmp_ne_i32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x45,0xd4,0x69,0xd2,0x00,0x00]
0x0a,0x00,0x45,0xd4,0x69,0xd2,0x00,0x00
+# W32: v_cmp_ne_i32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x45,0xd4,0x69,0xd2,0x00,0x00]
+# W64: v_cmp_ne_i32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x45,0xd4,0x69,0xd2,0x00,0x00]
-# W32: v_cmp_ne_i32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x45,0xd4,0x6a,0xf6,0x00,0x00]
-# W64:
v_cmp_ne_i32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x45,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x45,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_ne_i32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x45,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_ne_i32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x45,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_ne_i32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x45,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W64: v_cmp_ne_i32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x45,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x0a,0x00,0x45,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# W32: v_cmp_ne_i32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x45,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] +# W64: v_cmp_ne_i32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x45,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_ne_i32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x45,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_ne_i32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x45,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x45,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_ne_i32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x45,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_ne_i32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x45,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_ne_i32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x45,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_ne_i32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x45,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x45,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_ne_i32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x45,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_ne_i32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x45,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_ne_i32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x45,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_ne_i32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x45,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x45,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_ne_i32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x45,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_ne_i32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x45,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_ne_i32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x45,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_ne_i32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x45,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x00,0x45,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_ne_i32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x45,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_ne_i32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x45,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_ne_i32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x45,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_ne_i32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x45,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x45,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_ne_i32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x45,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_ne_i32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x45,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_ne_i32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x45,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_ne_i32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x45,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x45,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_ne_i32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x45,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_ne_i32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x45,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_ne_i32_e64 
vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x45,0xd4,0xf0,0xfa,0x00,0x00] -# W64: v_cmp_ne_i32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x45,0xd4,0xf0,0xfa,0x00,0x00] 0x6a,0x00,0x45,0xd4,0xf0,0xfa,0x00,0x00 +# W32: v_cmp_ne_i32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x45,0xd4,0xf0,0xfa,0x00,0x00] +# W64: v_cmp_ne_i32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x45,0xd4,0xf0,0xfa,0x00,0x00] -# W32: v_cmp_ne_i32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x45,0xd4,0xfd,0xd4,0x00,0x00] -# W64: v_cmp_ne_i32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x45,0xd4,0xfd,0xd4,0x00,0x00] 0x7a,0x00,0x45,0xd4,0xfd,0xd4,0x00,0x00 +# W32: v_cmp_ne_i32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x45,0xd4,0xfd,0xd4,0x00,0x00] +# W64: v_cmp_ne_i32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x45,0xd4,0xfd,0xd4,0x00,0x00] -# GFX12: v_cmp_ne_i32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x45,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7c,0x00,0x45,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmp_ne_i32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x45,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_ne_i64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x55,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_ne_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x55,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x55,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_ne_i64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x55,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_ne_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x55,0xd4,0x01,0x05,0x02,0x00] +0x0a,0x00,0x55,0xd4,0xfe,0xfd,0x03,0x00 # W32: v_cmp_ne_i64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x55,0xd4,0xfe,0xfd,0x03,0x00] # W64: v_cmp_ne_i64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x55,0xd4,0xfe,0xfd,0x03,0x00] -0x0a,0x00,0x55,0xd4,0xfe,0xfd,0x03,0x00 -# W32: v_cmp_ne_i64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x55,0xd4,0x02,0x08,0x00,0x00] -# W64: v_cmp_ne_i64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x55,0xd4,0x02,0x08,0x00,0x00] 0x0a,0x00,0x55,0xd4,0x02,0x08,0x00,0x00 +# W32: v_cmp_ne_i64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x55,0xd4,0x02,0x08,0x00,0x00] +# W64: v_cmp_ne_i64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x55,0xd4,0x02,0x08,0x00,0x00] +0x0a,0x00,0x55,0xd4,0x68,0xd0,0x00,0x00 # W32: v_cmp_ne_i64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x55,0xd4,0x68,0xd0,0x00,0x00] # W64: v_cmp_ne_i64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x55,0xd4,0x68,0xd0,0x00,0x00] -0x0a,0x00,0x55,0xd4,0x68,0xd0,0x00,0x00 -# W32: v_cmp_ne_i64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x55,0xd4,0x6a,0xf4,0x00,0x00] -# W64: v_cmp_ne_i64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x55,0xd4,0x6a,0xf4,0x00,0x00] 0x0a,0x00,0x55,0xd4,0x6a,0xf4,0x00,0x00 +# W32: v_cmp_ne_i64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x55,0xd4,0x6a,0xf4,0x00,0x00] +# W64: v_cmp_ne_i64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x55,0xd4,0x6a,0xf4,0x00,0x00] +0x0a,0x00,0x55,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_ne_i64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x55,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: v_cmp_ne_i64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x55,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x55,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_ne_i64_e64 s10, exec, src_scc ; encoding: 
[0x0a,0x00,0x55,0xd4,0x7e,0xfa,0x01,0x00] -# W64: v_cmp_ne_i64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x55,0xd4,0x7e,0xfa,0x01,0x00] 0x0a,0x00,0x55,0xd4,0x7e,0xfa,0x01,0x00 +# W32: v_cmp_ne_i64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x55,0xd4,0x7e,0xfa,0x01,0x00] +# W64: v_cmp_ne_i64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x55,0xd4,0x7e,0xfa,0x01,0x00] -# W32: v_cmp_ne_i64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x55,0xd4,0x7c,0xe0,0x01,0x00] -# W64: v_cmp_ne_i64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x55,0xd4,0x7c,0xe0,0x01,0x00] 0x0a,0x00,0x55,0xd4,0x7c,0xe0,0x01,0x00 +# W32: v_cmp_ne_i64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x55,0xd4,0x7c,0xe0,0x01,0x00] +# W64: v_cmp_ne_i64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x55,0xd4,0x7c,0xe0,0x01,0x00] -# W32: v_cmp_ne_i64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x55,0xd4,0xc1,0x82,0x01,0x00] -# W64: v_cmp_ne_i64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x55,0xd4,0xc1,0x82,0x01,0x00] 0x68,0x00,0x55,0xd4,0xc1,0x82,0x01,0x00 +# W32: v_cmp_ne_i64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x55,0xd4,0xc1,0x82,0x01,0x00] +# W64: v_cmp_ne_i64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x55,0xd4,0xc1,0x82,0x01,0x00] -# W32: v_cmp_ne_i64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x55,0xd4,0xf0,0xf8,0x00,0x00] -# W64: v_cmp_ne_i64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x55,0xd4,0xf0,0xf8,0x00,0x00] 0x6a,0x00,0x55,0xd4,0xf0,0xf8,0x00,0x00 +# W32: v_cmp_ne_i64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x55,0xd4,0xf0,0xf8,0x00,0x00] +# W64: v_cmp_ne_i64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x55,0xd4,0xf0,0xf8,0x00,0x00] -# W32: v_cmp_ne_i64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x55,0xd4,0xfd,0xfc,0x00,0x00] -# W64: v_cmp_ne_i64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x55,0xd4,0xfd,0xfc,0x00,0x00] 0x7a,0x00,0x55,0xd4,0xfd,0xfc,0x00,0x00 +# W32: v_cmp_ne_i64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x55,0xd4,0xfd,0xfc,0x00,0x00] +# W64: v_cmp_ne_i64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x55,0xd4,0xfd,0xfc,0x00,0x00] -# GFX12: v_cmp_ne_i64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x55,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7c,0x00,0x55,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmp_ne_i64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x55,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_ne_u16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x3d,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_ne_u16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x3d,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x3d,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_ne_u16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x3d,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_ne_u16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x3d,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_ne_u16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x3d,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_ne_u16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x3d,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x3d,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_ne_u16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x3d,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_ne_u16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x3d,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_ne_u16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x3d,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_ne_u16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x3d,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x3d,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_ne_u16_e64 s10, s1, s2 ; encoding: 
[0x0a,0x00,0x3d,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_ne_u16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x3d,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_ne_u16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x3d,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_ne_u16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x3d,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x3d,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_ne_u16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x3d,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_ne_u16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x3d,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_ne_u16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3d,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_ne_u16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3d,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x3d,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_ne_u16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3d,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_ne_u16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x3d,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_ne_u16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3d,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_ne_u16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3d,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x0a,0x00,0x3d,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_ne_u16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3d,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_ne_u16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x3d,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_ne_u16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x3d,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_ne_u16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x3d,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x3d,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_ne_u16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x3d,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_ne_u16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x3d,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_ne_u16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x3d,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] -# W64: v_cmp_ne_u16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x3d,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] 0x0a,0x00,0x3d,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_ne_u16_e64 s10, m0, 0x3800 ; encoding: [0x0a,0x00,0x3d,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64: v_cmp_ne_u16_e64 s[10:11], m0, 0x3800 ; encoding: [0x0a,0x00,0x3d,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] -# W32: v_cmp_ne_u16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x3d,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_ne_u16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x3d,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x3d,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_ne_u16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x3d,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_ne_u16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x3d,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_ne_u16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x3d,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_ne_u16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x3d,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x00,0x3d,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_ne_u16_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x3d,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_ne_u16_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x3d,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_ne_u16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x3d,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_ne_u16_e64 s[10:11], null, exec_lo ; encoding: 
[0x0a,0x00,0x3d,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x3d,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_ne_u16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x3d,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_ne_u16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x3d,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_ne_u16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x3d,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_ne_u16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x3d,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x3d,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_ne_u16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x3d,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_ne_u16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x3d,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_ne_u16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x3d,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] -# W64: v_cmp_ne_u16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x3d,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] 0x6a,0x00,0x3d,0xd4,0xf0,0xfa,0x00,0x00 +# W32: v_cmp_ne_u16_e64 vcc_lo, 0x3800, m0 ; encoding: [0x6a,0x00,0x3d,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64: v_cmp_ne_u16_e64 vcc, 0x3800, m0 ; encoding: [0x6a,0x00,0x3d,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] -# W32: v_cmp_ne_u16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3d,0xd4,0xfd,0xd4,0x00,0x00] -# W64: v_cmp_ne_u16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3d,0xd4,0xfd,0xd4,0x00,0x00] 0x7a,0x00,0x3d,0xd4,0xfd,0xd4,0x00,0x00 +# W32: v_cmp_ne_u16_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3d,0xd4,0xfd,0xd4,0x00,0x00] +# W64: v_cmp_ne_u16_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x3d,0xd4,0xfd,0xd4,0x00,0x00] -# GFX12: v_cmp_ne_u16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x3d,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7c,0x00,0x3d,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmp_ne_u16_e64 null, 0xfe0b, vcc_hi ; encoding: [0x7c,0x00,0x3d,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_ne_u32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x4d,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_ne_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x4d,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x4d,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_ne_u32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x4d,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_ne_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x4d,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_ne_u32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x4d,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_ne_u32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x4d,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x4d,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_ne_u32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x4d,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_ne_u32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x4d,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_ne_u32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x4d,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_ne_u32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x4d,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x4d,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_ne_u32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x4d,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_ne_u32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x4d,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_ne_u32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x4d,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_ne_u32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x4d,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x4d,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_ne_u32_e64 s10, s105, s105 ; encoding: 
[0x0a,0x00,0x4d,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_ne_u32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x4d,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_ne_u32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4d,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_ne_u32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4d,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x4d,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_ne_u32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4d,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_ne_u32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x4d,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_ne_u32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4d,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W64: v_cmp_ne_u32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4d,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x0a,0x00,0x4d,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# W32: v_cmp_ne_u32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4d,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] +# W64: v_cmp_ne_u32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x4d,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_ne_u32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x4d,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_ne_u32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x4d,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x4d,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_ne_u32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x4d,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_ne_u32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x4d,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_ne_u32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x4d,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_ne_u32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x4d,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x4d,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_ne_u32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x4d,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_ne_u32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x4d,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_ne_u32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x4d,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_ne_u32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x4d,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x4d,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_ne_u32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x4d,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_ne_u32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x4d,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_ne_u32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x4d,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_ne_u32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x4d,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x00,0x4d,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_ne_u32_e64 s10, exec_hi, null ; encoding: [0x0a,0x00,0x4d,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_ne_u32_e64 s[10:11], exec_hi, null ; encoding: [0x0a,0x00,0x4d,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_ne_u32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x4d,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_ne_u32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x4d,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x4d,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_ne_u32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x4d,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_ne_u32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x4d,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_ne_u32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x4d,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_ne_u32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x4d,0xd4,0xc1,0xfe,0x00,0x00] 
0x68,0x00,0x4d,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_ne_u32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x4d,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_ne_u32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x4d,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_ne_u32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x4d,0xd4,0xf0,0xfa,0x00,0x00] -# W64: v_cmp_ne_u32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x4d,0xd4,0xf0,0xfa,0x00,0x00] 0x6a,0x00,0x4d,0xd4,0xf0,0xfa,0x00,0x00 +# W32: v_cmp_ne_u32_e64 vcc_lo, 0.5, m0 ; encoding: [0x6a,0x00,0x4d,0xd4,0xf0,0xfa,0x00,0x00] +# W64: v_cmp_ne_u32_e64 vcc, 0.5, m0 ; encoding: [0x6a,0x00,0x4d,0xd4,0xf0,0xfa,0x00,0x00] -# W32: v_cmp_ne_u32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4d,0xd4,0xfd,0xd4,0x00,0x00] -# W64: v_cmp_ne_u32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4d,0xd4,0xfd,0xd4,0x00,0x00] 0x7a,0x00,0x4d,0xd4,0xfd,0xd4,0x00,0x00 +# W32: v_cmp_ne_u32_e64 ttmp14, src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4d,0xd4,0xfd,0xd4,0x00,0x00] +# W64: v_cmp_ne_u32_e64 ttmp[14:15], src_scc, vcc_lo ; encoding: [0x7a,0x00,0x4d,0xd4,0xfd,0xd4,0x00,0x00] -# GFX12: v_cmp_ne_u32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x4d,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7c,0x00,0x4d,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmp_ne_u32_e64 null, 0xaf123456, vcc_hi ; encoding: [0x7c,0x00,0x4d,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_ne_u64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5d,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_ne_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5d,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x5d,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_ne_u64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5d,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_ne_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x5d,0xd4,0x01,0x05,0x02,0x00] +0x0a,0x00,0x5d,0xd4,0xfe,0xfd,0x03,0x00 # W32: v_cmp_ne_u64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x5d,0xd4,0xfe,0xfd,0x03,0x00] # W64: v_cmp_ne_u64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x5d,0xd4,0xfe,0xfd,0x03,0x00] -0x0a,0x00,0x5d,0xd4,0xfe,0xfd,0x03,0x00 -# W32: v_cmp_ne_u64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5d,0xd4,0x02,0x08,0x00,0x00] -# W64: v_cmp_ne_u64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5d,0xd4,0x02,0x08,0x00,0x00] 0x0a,0x00,0x5d,0xd4,0x02,0x08,0x00,0x00 +# W32: v_cmp_ne_u64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5d,0xd4,0x02,0x08,0x00,0x00] +# W64: v_cmp_ne_u64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x5d,0xd4,0x02,0x08,0x00,0x00] +0x0a,0x00,0x5d,0xd4,0x68,0xd0,0x00,0x00 # W32: v_cmp_ne_u64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x5d,0xd4,0x68,0xd0,0x00,0x00] # W64: v_cmp_ne_u64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x5d,0xd4,0x68,0xd0,0x00,0x00] -0x0a,0x00,0x5d,0xd4,0x68,0xd0,0x00,0x00 -# W32: v_cmp_ne_u64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5d,0xd4,0x6a,0xf4,0x00,0x00] -# W64: v_cmp_ne_u64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5d,0xd4,0x6a,0xf4,0x00,0x00] 0x0a,0x00,0x5d,0xd4,0x6a,0xf4,0x00,0x00 +# W32: v_cmp_ne_u64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5d,0xd4,0x6a,0xf4,0x00,0x00] +# W64: v_cmp_ne_u64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x5d,0xd4,0x6a,0xf4,0x00,0x00] +0x0a,0x00,0x5d,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_ne_u64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: 
[0x0a,0x00,0x5d,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: v_cmp_ne_u64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x5d,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x5d,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_ne_u64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x5d,0xd4,0x7e,0xfa,0x01,0x00] -# W64: v_cmp_ne_u64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x5d,0xd4,0x7e,0xfa,0x01,0x00] 0x0a,0x00,0x5d,0xd4,0x7e,0xfa,0x01,0x00 +# W32: v_cmp_ne_u64_e64 s10, exec, src_scc ; encoding: [0x0a,0x00,0x5d,0xd4,0x7e,0xfa,0x01,0x00] +# W64: v_cmp_ne_u64_e64 s[10:11], exec, src_scc ; encoding: [0x0a,0x00,0x5d,0xd4,0x7e,0xfa,0x01,0x00] -# W32: v_cmp_ne_u64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x5d,0xd4,0x7c,0xe0,0x01,0x00] -# W64: v_cmp_ne_u64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x5d,0xd4,0x7c,0xe0,0x01,0x00] 0x0a,0x00,0x5d,0xd4,0x7c,0xe0,0x01,0x00 +# W32: v_cmp_ne_u64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x5d,0xd4,0x7c,0xe0,0x01,0x00] +# W64: v_cmp_ne_u64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x5d,0xd4,0x7c,0xe0,0x01,0x00] -# W32: v_cmp_ne_u64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x5d,0xd4,0xc1,0x82,0x01,0x00] -# W64: v_cmp_ne_u64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x5d,0xd4,0xc1,0x82,0x01,0x00] 0x68,0x00,0x5d,0xd4,0xc1,0x82,0x01,0x00 +# W32: v_cmp_ne_u64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x5d,0xd4,0xc1,0x82,0x01,0x00] +# W64: v_cmp_ne_u64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x5d,0xd4,0xc1,0x82,0x01,0x00] -# W32: v_cmp_ne_u64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x5d,0xd4,0xf0,0xf8,0x00,0x00] -# W64: v_cmp_ne_u64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x5d,0xd4,0xf0,0xf8,0x00,0x00] 0x6a,0x00,0x5d,0xd4,0xf0,0xf8,0x00,0x00 +# W32: v_cmp_ne_u64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x5d,0xd4,0xf0,0xf8,0x00,0x00] +# W64: v_cmp_ne_u64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x5d,0xd4,0xf0,0xf8,0x00,0x00] -# W32: v_cmp_ne_u64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x5d,0xd4,0xfd,0xfc,0x00,0x00] -# W64: v_cmp_ne_u64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x5d,0xd4,0xfd,0xfc,0x00,0x00] 0x7a,0x00,0x5d,0xd4,0xfd,0xfc,0x00,0x00 +# W32: v_cmp_ne_u64_e64 ttmp14, src_scc, exec ; encoding: [0x7a,0x00,0x5d,0xd4,0xfd,0xfc,0x00,0x00] +# W64: v_cmp_ne_u64_e64 ttmp[14:15], src_scc, exec ; encoding: [0x7a,0x00,0x5d,0xd4,0xfd,0xfc,0x00,0x00] -# GFX12: v_cmp_ne_u64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x5d,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7c,0x00,0x5d,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmp_ne_u64_e64 null, 0xaf123456, vcc ; encoding: [0x7c,0x00,0x5d,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_neq_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x0d,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_neq_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x0d,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x0d,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_neq_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x0d,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_neq_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x0d,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_neq_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x0d,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_neq_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x0d,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x0d,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_neq_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x0d,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_neq_f16_e64 s[10:11], v255, v255 ; encoding: 
[0x0a,0x00,0x0d,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_neq_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x0d,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_neq_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x0d,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x0d,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_neq_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x0d,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_neq_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x0d,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_neq_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x0d,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_neq_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x0d,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x0d,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_neq_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x0d,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_neq_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x0d,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_neq_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0d,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_neq_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0d,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x0d,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_neq_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0d,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_neq_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0d,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_neq_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0d,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_neq_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0d,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x0a,0x00,0x0d,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_neq_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0d,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_neq_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0d,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_neq_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x0d,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_neq_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x0d,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x0d,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_neq_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x0d,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_neq_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x0d,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_neq_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x0d,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_neq_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x0d,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x0d,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_neq_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x0d,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_neq_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x0d,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_neq_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x0d,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_neq_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x0d,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x0d,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_neq_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x0d,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_neq_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x0d,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_neq_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x0d,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_neq_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x0d,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x01,0x0d,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_neq_f16_e64 s10, |exec_hi|, null ; encoding: 
[0x0a,0x01,0x0d,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_neq_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x0d,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_neq_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x0d,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_neq_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x0d,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x0d,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_neq_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x0d,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_neq_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x0d,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_neq_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x0d,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_neq_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x0d,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x0d,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_neq_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x0d,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_neq_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x0d,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_neq_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x0d,0xd4,0xf0,0xfa,0x00,0x40] -# W64: v_cmp_neq_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x0d,0xd4,0xf0,0xfa,0x00,0x40] 0x6a,0x00,0x0d,0xd4,0xf0,0xfa,0x00,0x40 +# W32: v_cmp_neq_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x0d,0xd4,0xf0,0xfa,0x00,0x40] +# W64: v_cmp_neq_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x0d,0xd4,0xf0,0xfa,0x00,0x40] +0x7a,0x02,0x0d,0xd4,0xfd,0xd4,0x00,0x20 # W32: v_cmp_neq_f16_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x0d,0xd4,0xfd,0xd4,0x00,0x20] # W64: v_cmp_neq_f16_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x0d,0xd4,0xfd,0xd4,0x00,0x20] -0x7a,0x02,0x0d,0xd4,0xfd,0xd4,0x00,0x20 -# GFX12: v_cmp_neq_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x0d,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7c,0x83,0x0d,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmp_neq_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x0d,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_neq_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x1d,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_neq_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x1d,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x1d,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_neq_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x1d,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_neq_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x1d,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_neq_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x1d,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_neq_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x1d,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x1d,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_neq_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x1d,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_neq_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x1d,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_neq_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x1d,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_neq_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x1d,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x1d,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_neq_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x1d,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_neq_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x1d,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_neq_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x1d,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_neq_f32_e64 s[10:11], s105, s105 ; encoding: 
[0x0a,0x00,0x1d,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x1d,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_neq_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x1d,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_neq_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x1d,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_neq_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1d,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_neq_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1d,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x1d,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_neq_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1d,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_neq_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1d,0xd4,0x6a,0xf6,0x00,0x00] +0x0a,0x00,0x1d,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_neq_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x1d,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: v_cmp_neq_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x1d,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x1d,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_neq_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x1d,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_neq_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x1d,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x1d,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_neq_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x1d,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_neq_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x1d,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_neq_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x1d,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_neq_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x1d,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x1d,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_neq_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x1d,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_neq_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x1d,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_neq_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x1d,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_neq_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x1d,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x1d,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_neq_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x1d,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_neq_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x1d,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_neq_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x1d,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_neq_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x1d,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x01,0x1d,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_neq_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x1d,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_neq_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x1d,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_neq_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x1d,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_neq_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x1d,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x1d,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_neq_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x1d,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_neq_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x1d,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_neq_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x1d,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_neq_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x1d,0xd4,0xc1,0xfe,0x00,0x00] 
0x68,0x00,0x1d,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_neq_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x1d,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_neq_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x1d,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_neq_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x1d,0xd4,0xf0,0xfa,0x00,0x40] -# W64: v_cmp_neq_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x1d,0xd4,0xf0,0xfa,0x00,0x40] 0x6a,0x00,0x1d,0xd4,0xf0,0xfa,0x00,0x40 +# W32: v_cmp_neq_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x1d,0xd4,0xf0,0xfa,0x00,0x40] +# W64: v_cmp_neq_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x1d,0xd4,0xf0,0xfa,0x00,0x40] +0x7a,0x02,0x1d,0xd4,0xfd,0xd4,0x00,0x20 # W32: v_cmp_neq_f32_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x1d,0xd4,0xfd,0xd4,0x00,0x20] # W64: v_cmp_neq_f32_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x1d,0xd4,0xfd,0xd4,0x00,0x20] -0x7a,0x02,0x1d,0xd4,0xfd,0xd4,0x00,0x20 -# GFX12: v_cmp_neq_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x1d,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] 0x7c,0x83,0x1d,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf +# GFX12: v_cmp_neq_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x1d,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] -# W32: v_cmp_neq_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2d,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_neq_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2d,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x2d,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_neq_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2d,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_neq_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2d,0xd4,0x01,0x05,0x02,0x00] +0x0a,0x00,0x2d,0xd4,0xfe,0xfd,0x03,0x00 # W32: v_cmp_neq_f64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x2d,0xd4,0xfe,0xfd,0x03,0x00] # W64: v_cmp_neq_f64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x2d,0xd4,0xfe,0xfd,0x03,0x00] -0x0a,0x00,0x2d,0xd4,0xfe,0xfd,0x03,0x00 -# W32: v_cmp_neq_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2d,0xd4,0x02,0x08,0x00,0x00] -# W64: v_cmp_neq_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2d,0xd4,0x02,0x08,0x00,0x00] 0x0a,0x00,0x2d,0xd4,0x02,0x08,0x00,0x00 +# W32: v_cmp_neq_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2d,0xd4,0x02,0x08,0x00,0x00] +# W64: v_cmp_neq_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2d,0xd4,0x02,0x08,0x00,0x00] +0x0a,0x00,0x2d,0xd4,0x68,0xd0,0x00,0x00 # W32: v_cmp_neq_f64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x2d,0xd4,0x68,0xd0,0x00,0x00] # W64: v_cmp_neq_f64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x2d,0xd4,0x68,0xd0,0x00,0x00] -0x0a,0x00,0x2d,0xd4,0x68,0xd0,0x00,0x00 -# W32: v_cmp_neq_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2d,0xd4,0x6a,0xf4,0x00,0x00] -# W64: v_cmp_neq_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2d,0xd4,0x6a,0xf4,0x00,0x00] 0x0a,0x00,0x2d,0xd4,0x6a,0xf4,0x00,0x00 +# W32: v_cmp_neq_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2d,0xd4,0x6a,0xf4,0x00,0x00] +# W64: v_cmp_neq_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2d,0xd4,0x6a,0xf4,0x00,0x00] +0x0a,0x00,0x2d,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_neq_f64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x2d,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: v_cmp_neq_f64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: 
[0x0a,0x00,0x2d,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x2d,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_neq_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x2d,0xd4,0x7e,0xfa,0x01,0x20] -# W64: v_cmp_neq_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x2d,0xd4,0x7e,0xfa,0x01,0x20] 0x0a,0x01,0x2d,0xd4,0x7e,0xfa,0x01,0x20 +# W32: v_cmp_neq_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x2d,0xd4,0x7e,0xfa,0x01,0x20] +# W64: v_cmp_neq_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x2d,0xd4,0x7e,0xfa,0x01,0x20] -# W32: v_cmp_neq_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x2d,0xd4,0x7c,0xe0,0x01,0x00] -# W64: v_cmp_neq_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x2d,0xd4,0x7c,0xe0,0x01,0x00] 0x0a,0x00,0x2d,0xd4,0x7c,0xe0,0x01,0x00 +# W32: v_cmp_neq_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x2d,0xd4,0x7c,0xe0,0x01,0x00] +# W64: v_cmp_neq_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x2d,0xd4,0x7c,0xe0,0x01,0x00] -# W32: v_cmp_neq_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x2d,0xd4,0xc1,0x82,0x01,0x00] -# W64: v_cmp_neq_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x2d,0xd4,0xc1,0x82,0x01,0x00] 0x68,0x00,0x2d,0xd4,0xc1,0x82,0x01,0x00 +# W32: v_cmp_neq_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x2d,0xd4,0xc1,0x82,0x01,0x00] +# W64: v_cmp_neq_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x2d,0xd4,0xc1,0x82,0x01,0x00] -# W32: v_cmp_neq_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x2d,0xd4,0xf0,0xf8,0x00,0x00] -# W64: v_cmp_neq_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x2d,0xd4,0xf0,0xf8,0x00,0x00] 0x6a,0x00,0x2d,0xd4,0xf0,0xf8,0x00,0x00 +# W32: v_cmp_neq_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x2d,0xd4,0xf0,0xf8,0x00,0x00] +# W64: v_cmp_neq_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x2d,0xd4,0xf0,0xf8,0x00,0x00] +0x7a,0x03,0x2d,0xd4,0xfd,0xfc,0x00,0x60 # W32: v_cmp_neq_f64_e64 ttmp14, -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x2d,0xd4,0xfd,0xfc,0x00,0x60] # W64: v_cmp_neq_f64_e64 ttmp[14:15], -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x2d,0xd4,0xfd,0xfc,0x00,0x60] -0x7a,0x03,0x2d,0xd4,0xfd,0xfc,0x00,0x60 -# GFX12: v_cmp_neq_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x2d,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7c,0x82,0x2d,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf +# GFX12: v_cmp_neq_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x2d,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -# W32: v_cmp_nge_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x09,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_nge_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x09,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x09,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_nge_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x09,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_nge_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x09,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_nge_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x09,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_nge_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x09,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x09,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_nge_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x09,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_nge_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x09,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_nge_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x09,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_nge_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x09,0xd4,0x01,0x04,0x00,0x00] 
0x0a,0x00,0x09,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_nge_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x09,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_nge_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x09,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_nge_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x09,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_nge_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x09,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x09,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_nge_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x09,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_nge_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x09,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_nge_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x09,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_nge_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x09,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x09,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_nge_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x09,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_nge_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x09,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_nge_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x09,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_nge_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x09,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x0a,0x00,0x09,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_nge_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x09,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_nge_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x09,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_nge_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x09,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_nge_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x09,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x09,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_nge_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x09,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_nge_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x09,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_nge_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x09,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_nge_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x09,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x09,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_nge_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x09,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_nge_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x09,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_nge_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x09,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_nge_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x09,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x09,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_nge_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x09,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_nge_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x09,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_nge_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x09,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_nge_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x09,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x01,0x09,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_nge_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x09,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_nge_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x09,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_nge_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x09,0xd4,0x7c,0xfc,0x00,0x00] -# W64: 
v_cmp_nge_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x09,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x09,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_nge_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x09,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_nge_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x09,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_nge_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x09,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_nge_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x09,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x09,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_nge_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x09,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_nge_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x09,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_nge_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x09,0xd4,0xf0,0xfa,0x00,0x40] -# W64: v_cmp_nge_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x09,0xd4,0xf0,0xfa,0x00,0x40] 0x6a,0x00,0x09,0xd4,0xf0,0xfa,0x00,0x40 +# W32: v_cmp_nge_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x09,0xd4,0xf0,0xfa,0x00,0x40] +# W64: v_cmp_nge_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x09,0xd4,0xf0,0xfa,0x00,0x40] +0x7a,0x02,0x09,0xd4,0xfd,0xd4,0x00,0x20 # W32: v_cmp_nge_f16_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x09,0xd4,0xfd,0xd4,0x00,0x20] # W64: v_cmp_nge_f16_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x09,0xd4,0xfd,0xd4,0x00,0x20] -0x7a,0x02,0x09,0xd4,0xfd,0xd4,0x00,0x20 -# GFX12: v_cmp_nge_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x09,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7c,0x83,0x09,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmp_nge_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x09,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_nge_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x19,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_nge_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x19,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x19,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_nge_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x19,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_nge_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x19,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_nge_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x19,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_nge_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x19,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x19,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_nge_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x19,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_nge_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x19,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_nge_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x19,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_nge_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x19,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x19,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_nge_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x19,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_nge_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x19,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_nge_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x19,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_nge_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x19,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x19,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_nge_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x19,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_nge_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x19,0xd4,0x69,0xd2,0x00,0x00] -# W32: 
v_cmp_nge_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x19,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_nge_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x19,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x19,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_nge_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x19,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_nge_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x19,0xd4,0x6a,0xf6,0x00,0x00] +0x0a,0x00,0x19,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_nge_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x19,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: v_cmp_nge_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x19,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x19,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_nge_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x19,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_nge_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x19,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x19,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_nge_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x19,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_nge_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x19,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_nge_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x19,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_nge_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x19,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x19,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_nge_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x19,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_nge_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x19,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_nge_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x19,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_nge_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x19,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x19,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_nge_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x19,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_nge_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x19,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_nge_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x19,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_nge_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x19,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x01,0x19,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_nge_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x19,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_nge_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x19,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_nge_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x19,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_nge_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x19,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x19,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_nge_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x19,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_nge_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x19,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_nge_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x19,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_nge_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x19,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x19,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_nge_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x19,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_nge_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x19,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_nge_f32_e64 vcc_lo, 0.5, -m0 ; encoding: 
[0x6a,0x00,0x19,0xd4,0xf0,0xfa,0x00,0x40] -# W64: v_cmp_nge_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x19,0xd4,0xf0,0xfa,0x00,0x40] 0x6a,0x00,0x19,0xd4,0xf0,0xfa,0x00,0x40 +# W32: v_cmp_nge_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x19,0xd4,0xf0,0xfa,0x00,0x40] +# W64: v_cmp_nge_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x19,0xd4,0xf0,0xfa,0x00,0x40] +0x7a,0x02,0x19,0xd4,0xfd,0xd4,0x00,0x20 # W32: v_cmp_nge_f32_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x19,0xd4,0xfd,0xd4,0x00,0x20] # W64: v_cmp_nge_f32_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x19,0xd4,0xfd,0xd4,0x00,0x20] -0x7a,0x02,0x19,0xd4,0xfd,0xd4,0x00,0x20 -# GFX12: v_cmp_nge_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x19,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] 0x7c,0x83,0x19,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf +# GFX12: v_cmp_nge_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x19,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] -# W32: v_cmp_nge_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x29,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_nge_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x29,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x29,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_nge_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x29,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_nge_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x29,0xd4,0x01,0x05,0x02,0x00] +0x0a,0x00,0x29,0xd4,0xfe,0xfd,0x03,0x00 # W32: v_cmp_nge_f64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x29,0xd4,0xfe,0xfd,0x03,0x00] # W64: v_cmp_nge_f64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x29,0xd4,0xfe,0xfd,0x03,0x00] -0x0a,0x00,0x29,0xd4,0xfe,0xfd,0x03,0x00 -# W32: v_cmp_nge_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x29,0xd4,0x02,0x08,0x00,0x00] -# W64: v_cmp_nge_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x29,0xd4,0x02,0x08,0x00,0x00] 0x0a,0x00,0x29,0xd4,0x02,0x08,0x00,0x00 +# W32: v_cmp_nge_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x29,0xd4,0x02,0x08,0x00,0x00] +# W64: v_cmp_nge_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x29,0xd4,0x02,0x08,0x00,0x00] +0x0a,0x00,0x29,0xd4,0x68,0xd0,0x00,0x00 # W32: v_cmp_nge_f64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x29,0xd4,0x68,0xd0,0x00,0x00] # W64: v_cmp_nge_f64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x29,0xd4,0x68,0xd0,0x00,0x00] -0x0a,0x00,0x29,0xd4,0x68,0xd0,0x00,0x00 -# W32: v_cmp_nge_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x29,0xd4,0x6a,0xf4,0x00,0x00] -# W64: v_cmp_nge_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x29,0xd4,0x6a,0xf4,0x00,0x00] 0x0a,0x00,0x29,0xd4,0x6a,0xf4,0x00,0x00 +# W32: v_cmp_nge_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x29,0xd4,0x6a,0xf4,0x00,0x00] +# W64: v_cmp_nge_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x29,0xd4,0x6a,0xf4,0x00,0x00] +0x0a,0x00,0x29,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_nge_f64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x29,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: v_cmp_nge_f64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x29,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x29,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_nge_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x29,0xd4,0x7e,0xfa,0x01,0x20] -# W64: v_cmp_nge_f64_e64 s[10:11], -|exec|, src_scc ; encoding: 
[0x0a,0x01,0x29,0xd4,0x7e,0xfa,0x01,0x20] 0x0a,0x01,0x29,0xd4,0x7e,0xfa,0x01,0x20 +# W32: v_cmp_nge_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x29,0xd4,0x7e,0xfa,0x01,0x20] +# W64: v_cmp_nge_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x29,0xd4,0x7e,0xfa,0x01,0x20] -# W32: v_cmp_nge_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x29,0xd4,0x7c,0xe0,0x01,0x00] -# W64: v_cmp_nge_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x29,0xd4,0x7c,0xe0,0x01,0x00] 0x0a,0x00,0x29,0xd4,0x7c,0xe0,0x01,0x00 +# W32: v_cmp_nge_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x29,0xd4,0x7c,0xe0,0x01,0x00] +# W64: v_cmp_nge_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x29,0xd4,0x7c,0xe0,0x01,0x00] -# W32: v_cmp_nge_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x29,0xd4,0xc1,0x82,0x01,0x00] -# W64: v_cmp_nge_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x29,0xd4,0xc1,0x82,0x01,0x00] 0x68,0x00,0x29,0xd4,0xc1,0x82,0x01,0x00 +# W32: v_cmp_nge_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x29,0xd4,0xc1,0x82,0x01,0x00] +# W64: v_cmp_nge_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x29,0xd4,0xc1,0x82,0x01,0x00] -# W32: v_cmp_nge_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x29,0xd4,0xf0,0xf8,0x00,0x00] -# W64: v_cmp_nge_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x29,0xd4,0xf0,0xf8,0x00,0x00] 0x6a,0x00,0x29,0xd4,0xf0,0xf8,0x00,0x00 +# W32: v_cmp_nge_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x29,0xd4,0xf0,0xf8,0x00,0x00] +# W64: v_cmp_nge_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x29,0xd4,0xf0,0xf8,0x00,0x00] +0x7a,0x03,0x29,0xd4,0xfd,0xfc,0x00,0x60 # W32: v_cmp_nge_f64_e64 ttmp14, -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x29,0xd4,0xfd,0xfc,0x00,0x60] # W64: v_cmp_nge_f64_e64 ttmp[14:15], -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x29,0xd4,0xfd,0xfc,0x00,0x60] -0x7a,0x03,0x29,0xd4,0xfd,0xfc,0x00,0x60 -# GFX12: v_cmp_nge_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x29,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7c,0x82,0x29,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf +# GFX12: v_cmp_nge_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x29,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -# W32: v_cmp_ngt_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x0b,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_ngt_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x0b,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x0b,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_ngt_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x0b,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_ngt_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x0b,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_ngt_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x0b,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_ngt_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x0b,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x0b,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_ngt_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x0b,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_ngt_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x0b,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_ngt_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x0b,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_ngt_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x0b,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x0b,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_ngt_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x0b,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_ngt_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x0b,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_ngt_f16_e64 s10, s105, s105 ; encoding: 
[0x0a,0x00,0x0b,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_ngt_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x0b,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x0b,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_ngt_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x0b,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_ngt_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x0b,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_ngt_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0b,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_ngt_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0b,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x0b,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_ngt_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0b,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_ngt_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0b,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_ngt_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0b,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_ngt_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0b,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x0a,0x00,0x0b,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_ngt_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0b,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_ngt_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0b,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_ngt_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x0b,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_ngt_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x0b,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x0b,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_ngt_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x0b,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_ngt_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x0b,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_ngt_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x0b,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_ngt_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x0b,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x0b,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_ngt_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x0b,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_ngt_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x0b,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_ngt_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x0b,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_ngt_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x0b,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x0b,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_ngt_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x0b,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_ngt_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x0b,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_ngt_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x0b,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_ngt_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x0b,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x01,0x0b,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_ngt_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x0b,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_ngt_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x0b,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_ngt_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x0b,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_ngt_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x0b,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x0b,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_ngt_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x0b,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_ngt_f16_e64 s[10:11], null, exec_lo ; encoding: 
[0x0a,0x00,0x0b,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_ngt_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x0b,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_ngt_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x0b,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x0b,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_ngt_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x0b,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_ngt_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x0b,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_ngt_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x0b,0xd4,0xf0,0xfa,0x00,0x40] -# W64: v_cmp_ngt_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x0b,0xd4,0xf0,0xfa,0x00,0x40] 0x6a,0x00,0x0b,0xd4,0xf0,0xfa,0x00,0x40 +# W32: v_cmp_ngt_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x0b,0xd4,0xf0,0xfa,0x00,0x40] +# W64: v_cmp_ngt_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x0b,0xd4,0xf0,0xfa,0x00,0x40] +0x7a,0x02,0x0b,0xd4,0xfd,0xd4,0x00,0x20 # W32: v_cmp_ngt_f16_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x0b,0xd4,0xfd,0xd4,0x00,0x20] # W64: v_cmp_ngt_f16_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x0b,0xd4,0xfd,0xd4,0x00,0x20] -0x7a,0x02,0x0b,0xd4,0xfd,0xd4,0x00,0x20 -# GFX12: v_cmp_ngt_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x0b,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7c,0x83,0x0b,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmp_ngt_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x0b,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_ngt_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x1b,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_ngt_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x1b,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x1b,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_ngt_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x1b,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_ngt_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x1b,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_ngt_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x1b,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_ngt_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x1b,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x1b,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_ngt_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x1b,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_ngt_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x1b,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_ngt_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x1b,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_ngt_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x1b,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x1b,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_ngt_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x1b,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_ngt_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x1b,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_ngt_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x1b,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_ngt_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x1b,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x1b,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_ngt_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x1b,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_ngt_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x1b,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_ngt_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1b,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_ngt_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1b,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x1b,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_ngt_f32_e64 s10, vcc_lo, ttmp15 ; encoding: 
[0x0a,0x00,0x1b,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_ngt_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1b,0xd4,0x6a,0xf6,0x00,0x00] +0x0a,0x00,0x1b,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_ngt_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x1b,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: v_cmp_ngt_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x1b,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x1b,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_ngt_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x1b,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_ngt_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x1b,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x1b,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_ngt_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x1b,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_ngt_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x1b,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_ngt_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x1b,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_ngt_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x1b,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x1b,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_ngt_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x1b,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_ngt_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x1b,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_ngt_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x1b,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_ngt_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x1b,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x1b,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_ngt_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x1b,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_ngt_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x1b,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_ngt_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x1b,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_ngt_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x1b,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x01,0x1b,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_ngt_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x1b,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_ngt_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x1b,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_ngt_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x1b,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_ngt_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x1b,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x1b,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_ngt_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x1b,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_ngt_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x1b,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_ngt_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x1b,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_ngt_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x1b,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x1b,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_ngt_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x1b,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_ngt_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x1b,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_ngt_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x1b,0xd4,0xf0,0xfa,0x00,0x40] -# W64: v_cmp_ngt_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x1b,0xd4,0xf0,0xfa,0x00,0x40] 0x6a,0x00,0x1b,0xd4,0xf0,0xfa,0x00,0x40 +# W32: v_cmp_ngt_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x1b,0xd4,0xf0,0xfa,0x00,0x40] +# W64: v_cmp_ngt_f32_e64 vcc, 0.5, -m0 ; 
encoding: [0x6a,0x00,0x1b,0xd4,0xf0,0xfa,0x00,0x40] +0x7a,0x02,0x1b,0xd4,0xfd,0xd4,0x00,0x20 # W32: v_cmp_ngt_f32_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x1b,0xd4,0xfd,0xd4,0x00,0x20] # W64: v_cmp_ngt_f32_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x1b,0xd4,0xfd,0xd4,0x00,0x20] -0x7a,0x02,0x1b,0xd4,0xfd,0xd4,0x00,0x20 -# GFX12: v_cmp_ngt_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x1b,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] 0x7c,0x83,0x1b,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf +# GFX12: v_cmp_ngt_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x1b,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] -# W32: v_cmp_ngt_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2b,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_ngt_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2b,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x2b,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_ngt_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2b,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_ngt_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2b,0xd4,0x01,0x05,0x02,0x00] +0x0a,0x00,0x2b,0xd4,0xfe,0xfd,0x03,0x00 # W32: v_cmp_ngt_f64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x2b,0xd4,0xfe,0xfd,0x03,0x00] # W64: v_cmp_ngt_f64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x2b,0xd4,0xfe,0xfd,0x03,0x00] -0x0a,0x00,0x2b,0xd4,0xfe,0xfd,0x03,0x00 -# W32: v_cmp_ngt_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2b,0xd4,0x02,0x08,0x00,0x00] -# W64: v_cmp_ngt_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2b,0xd4,0x02,0x08,0x00,0x00] 0x0a,0x00,0x2b,0xd4,0x02,0x08,0x00,0x00 +# W32: v_cmp_ngt_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2b,0xd4,0x02,0x08,0x00,0x00] +# W64: v_cmp_ngt_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2b,0xd4,0x02,0x08,0x00,0x00] +0x0a,0x00,0x2b,0xd4,0x68,0xd0,0x00,0x00 # W32: v_cmp_ngt_f64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x2b,0xd4,0x68,0xd0,0x00,0x00] # W64: v_cmp_ngt_f64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x2b,0xd4,0x68,0xd0,0x00,0x00] -0x0a,0x00,0x2b,0xd4,0x68,0xd0,0x00,0x00 -# W32: v_cmp_ngt_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2b,0xd4,0x6a,0xf4,0x00,0x00] -# W64: v_cmp_ngt_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2b,0xd4,0x6a,0xf4,0x00,0x00] 0x0a,0x00,0x2b,0xd4,0x6a,0xf4,0x00,0x00 +# W32: v_cmp_ngt_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2b,0xd4,0x6a,0xf4,0x00,0x00] +# W64: v_cmp_ngt_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2b,0xd4,0x6a,0xf4,0x00,0x00] +0x0a,0x00,0x2b,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_ngt_f64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x2b,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: v_cmp_ngt_f64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x2b,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x2b,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_ngt_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x2b,0xd4,0x7e,0xfa,0x01,0x20] -# W64: v_cmp_ngt_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x2b,0xd4,0x7e,0xfa,0x01,0x20] 0x0a,0x01,0x2b,0xd4,0x7e,0xfa,0x01,0x20 +# W32: v_cmp_ngt_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x2b,0xd4,0x7e,0xfa,0x01,0x20] +# W64: v_cmp_ngt_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x2b,0xd4,0x7e,0xfa,0x01,0x20] -# W32: v_cmp_ngt_f64_e64 s10, null, 0.5 ; encoding: 
[0x0a,0x00,0x2b,0xd4,0x7c,0xe0,0x01,0x00] -# W64: v_cmp_ngt_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x2b,0xd4,0x7c,0xe0,0x01,0x00] 0x0a,0x00,0x2b,0xd4,0x7c,0xe0,0x01,0x00 +# W32: v_cmp_ngt_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x2b,0xd4,0x7c,0xe0,0x01,0x00] +# W64: v_cmp_ngt_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x2b,0xd4,0x7c,0xe0,0x01,0x00] -# W32: v_cmp_ngt_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x2b,0xd4,0xc1,0x82,0x01,0x00] -# W64: v_cmp_ngt_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x2b,0xd4,0xc1,0x82,0x01,0x00] 0x68,0x00,0x2b,0xd4,0xc1,0x82,0x01,0x00 +# W32: v_cmp_ngt_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x2b,0xd4,0xc1,0x82,0x01,0x00] +# W64: v_cmp_ngt_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x2b,0xd4,0xc1,0x82,0x01,0x00] -# W32: v_cmp_ngt_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x2b,0xd4,0xf0,0xf8,0x00,0x00] -# W64: v_cmp_ngt_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x2b,0xd4,0xf0,0xf8,0x00,0x00] 0x6a,0x00,0x2b,0xd4,0xf0,0xf8,0x00,0x00 +# W32: v_cmp_ngt_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x2b,0xd4,0xf0,0xf8,0x00,0x00] +# W64: v_cmp_ngt_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x2b,0xd4,0xf0,0xf8,0x00,0x00] +0x7a,0x03,0x2b,0xd4,0xfd,0xfc,0x00,0x60 # W32: v_cmp_ngt_f64_e64 ttmp14, -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x2b,0xd4,0xfd,0xfc,0x00,0x60] # W64: v_cmp_ngt_f64_e64 ttmp[14:15], -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x2b,0xd4,0xfd,0xfc,0x00,0x60] -0x7a,0x03,0x2b,0xd4,0xfd,0xfc,0x00,0x60 -# GFX12: v_cmp_ngt_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x2b,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7c,0x82,0x2b,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf +# GFX12: v_cmp_ngt_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x2b,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -# W32: v_cmp_nle_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x0c,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_nle_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x0c,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x0c,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_nle_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x0c,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_nle_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x0c,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_nle_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x0c,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_nle_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x0c,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x0c,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_nle_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x0c,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_nle_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x0c,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_nle_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x0c,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_nle_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x0c,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x0c,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_nle_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x0c,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_nle_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x0c,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_nle_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x0c,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_nle_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x0c,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x0c,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_nle_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x0c,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_nle_f16_e64 s[10:11], s105, s105 ; encoding: 
[0x0a,0x00,0x0c,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_nle_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0c,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_nle_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0c,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x0c,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_nle_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0c,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_nle_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0c,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_nle_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0c,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_nle_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0c,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x0a,0x00,0x0c,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_nle_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0c,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_nle_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0c,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_nle_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x0c,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_nle_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x0c,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x0c,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_nle_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x0c,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_nle_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x0c,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_nle_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x0c,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_nle_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x0c,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x0c,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_nle_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x0c,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_nle_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x0c,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_nle_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x0c,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_nle_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x0c,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x0c,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_nle_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x0c,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_nle_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x0c,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_nle_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x0c,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_nle_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x0c,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x01,0x0c,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_nle_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x0c,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_nle_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x0c,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_nle_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x0c,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_nle_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x0c,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x0c,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_nle_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x0c,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_nle_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x0c,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_nle_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x0c,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_nle_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x0c,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x0c,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_nle_f16_e64 s104, -1, exec_hi ; encoding: 
[0x68,0x00,0x0c,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_nle_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x0c,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_nle_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x0c,0xd4,0xf0,0xfa,0x00,0x40] -# W64: v_cmp_nle_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x0c,0xd4,0xf0,0xfa,0x00,0x40] 0x6a,0x00,0x0c,0xd4,0xf0,0xfa,0x00,0x40 +# W32: v_cmp_nle_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x0c,0xd4,0xf0,0xfa,0x00,0x40] +# W64: v_cmp_nle_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x0c,0xd4,0xf0,0xfa,0x00,0x40] +0x7a,0x02,0x0c,0xd4,0xfd,0xd4,0x00,0x20 # W32: v_cmp_nle_f16_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x0c,0xd4,0xfd,0xd4,0x00,0x20] # W64: v_cmp_nle_f16_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x0c,0xd4,0xfd,0xd4,0x00,0x20] -0x7a,0x02,0x0c,0xd4,0xfd,0xd4,0x00,0x20 -# GFX12: v_cmp_nle_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x0c,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7c,0x83,0x0c,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmp_nle_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x0c,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_nle_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x1c,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_nle_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x1c,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x1c,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_nle_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x1c,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_nle_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x1c,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_nle_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x1c,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_nle_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x1c,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x1c,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_nle_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x1c,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_nle_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x1c,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_nle_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x1c,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_nle_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x1c,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x1c,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_nle_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x1c,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_nle_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x1c,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_nle_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x1c,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_nle_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x1c,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x1c,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_nle_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x1c,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_nle_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x1c,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_nle_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1c,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_nle_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1c,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x1c,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_nle_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1c,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_nle_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1c,0xd4,0x6a,0xf6,0x00,0x00] +0x0a,0x00,0x1c,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_nle_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x1c,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: 
v_cmp_nle_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x1c,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x1c,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_nle_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x1c,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_nle_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x1c,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x1c,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_nle_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x1c,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_nle_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x1c,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_nle_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x1c,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_nle_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x1c,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x1c,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_nle_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x1c,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_nle_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x1c,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_nle_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x1c,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_nle_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x1c,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x1c,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_nle_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x1c,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_nle_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x1c,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_nle_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x1c,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_nle_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x1c,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x01,0x1c,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_nle_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x1c,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_nle_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x1c,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_nle_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x1c,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_nle_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x1c,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x1c,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_nle_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x1c,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_nle_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x1c,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_nle_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x1c,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_nle_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x1c,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x1c,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_nle_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x1c,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_nle_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x1c,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_nle_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x1c,0xd4,0xf0,0xfa,0x00,0x40] -# W64: v_cmp_nle_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x1c,0xd4,0xf0,0xfa,0x00,0x40] 0x6a,0x00,0x1c,0xd4,0xf0,0xfa,0x00,0x40 +# W32: v_cmp_nle_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x1c,0xd4,0xf0,0xfa,0x00,0x40] +# W64: v_cmp_nle_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x1c,0xd4,0xf0,0xfa,0x00,0x40] +0x7a,0x02,0x1c,0xd4,0xfd,0xd4,0x00,0x20 # W32: v_cmp_nle_f32_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x1c,0xd4,0xfd,0xd4,0x00,0x20] # W64: v_cmp_nle_f32_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x1c,0xd4,0xfd,0xd4,0x00,0x20] 
-0x7a,0x02,0x1c,0xd4,0xfd,0xd4,0x00,0x20 -# GFX12: v_cmp_nle_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x1c,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] 0x7c,0x83,0x1c,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf +# GFX12: v_cmp_nle_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x1c,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] -# W32: v_cmp_nle_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2c,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_nle_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2c,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x2c,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_nle_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2c,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_nle_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2c,0xd4,0x01,0x05,0x02,0x00] +0x0a,0x00,0x2c,0xd4,0xfe,0xfd,0x03,0x00 # W32: v_cmp_nle_f64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x2c,0xd4,0xfe,0xfd,0x03,0x00] # W64: v_cmp_nle_f64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x2c,0xd4,0xfe,0xfd,0x03,0x00] -0x0a,0x00,0x2c,0xd4,0xfe,0xfd,0x03,0x00 -# W32: v_cmp_nle_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2c,0xd4,0x02,0x08,0x00,0x00] -# W64: v_cmp_nle_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2c,0xd4,0x02,0x08,0x00,0x00] 0x0a,0x00,0x2c,0xd4,0x02,0x08,0x00,0x00 +# W32: v_cmp_nle_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2c,0xd4,0x02,0x08,0x00,0x00] +# W64: v_cmp_nle_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2c,0xd4,0x02,0x08,0x00,0x00] +0x0a,0x00,0x2c,0xd4,0x68,0xd0,0x00,0x00 # W32: v_cmp_nle_f64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x2c,0xd4,0x68,0xd0,0x00,0x00] # W64: v_cmp_nle_f64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x2c,0xd4,0x68,0xd0,0x00,0x00] -0x0a,0x00,0x2c,0xd4,0x68,0xd0,0x00,0x00 -# W32: v_cmp_nle_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2c,0xd4,0x6a,0xf4,0x00,0x00] -# W64: v_cmp_nle_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2c,0xd4,0x6a,0xf4,0x00,0x00] 0x0a,0x00,0x2c,0xd4,0x6a,0xf4,0x00,0x00 +# W32: v_cmp_nle_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2c,0xd4,0x6a,0xf4,0x00,0x00] +# W64: v_cmp_nle_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2c,0xd4,0x6a,0xf4,0x00,0x00] +0x0a,0x00,0x2c,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_nle_f64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x2c,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: v_cmp_nle_f64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x2c,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x2c,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_nle_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x2c,0xd4,0x7e,0xfa,0x01,0x20] -# W64: v_cmp_nle_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x2c,0xd4,0x7e,0xfa,0x01,0x20] 0x0a,0x01,0x2c,0xd4,0x7e,0xfa,0x01,0x20 +# W32: v_cmp_nle_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x2c,0xd4,0x7e,0xfa,0x01,0x20] +# W64: v_cmp_nle_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x2c,0xd4,0x7e,0xfa,0x01,0x20] -# W32: v_cmp_nle_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x2c,0xd4,0x7c,0xe0,0x01,0x00] -# W64: v_cmp_nle_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x2c,0xd4,0x7c,0xe0,0x01,0x00] 0x0a,0x00,0x2c,0xd4,0x7c,0xe0,0x01,0x00 +# W32: v_cmp_nle_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x2c,0xd4,0x7c,0xe0,0x01,0x00] +# W64: v_cmp_nle_f64_e64 s[10:11], null, 
0.5 ; encoding: [0x0a,0x00,0x2c,0xd4,0x7c,0xe0,0x01,0x00] -# W32: v_cmp_nle_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x2c,0xd4,0xc1,0x82,0x01,0x00] -# W64: v_cmp_nle_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x2c,0xd4,0xc1,0x82,0x01,0x00] 0x68,0x00,0x2c,0xd4,0xc1,0x82,0x01,0x00 +# W32: v_cmp_nle_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x2c,0xd4,0xc1,0x82,0x01,0x00] +# W64: v_cmp_nle_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x2c,0xd4,0xc1,0x82,0x01,0x00] -# W32: v_cmp_nle_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x2c,0xd4,0xf0,0xf8,0x00,0x00] -# W64: v_cmp_nle_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x2c,0xd4,0xf0,0xf8,0x00,0x00] 0x6a,0x00,0x2c,0xd4,0xf0,0xf8,0x00,0x00 +# W32: v_cmp_nle_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x2c,0xd4,0xf0,0xf8,0x00,0x00] +# W64: v_cmp_nle_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x2c,0xd4,0xf0,0xf8,0x00,0x00] +0x7a,0x03,0x2c,0xd4,0xfd,0xfc,0x00,0x60 # W32: v_cmp_nle_f64_e64 ttmp14, -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x2c,0xd4,0xfd,0xfc,0x00,0x60] # W64: v_cmp_nle_f64_e64 ttmp[14:15], -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x2c,0xd4,0xfd,0xfc,0x00,0x60] -0x7a,0x03,0x2c,0xd4,0xfd,0xfc,0x00,0x60 -# GFX12: v_cmp_nle_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x2c,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7c,0x82,0x2c,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf +# GFX12: v_cmp_nle_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x2c,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -# W32: v_cmp_nlg_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x0a,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_nlg_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x0a,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x0a,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_nlg_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x0a,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_nlg_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x0a,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_nlg_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x0a,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_nlg_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x0a,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x0a,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_nlg_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x0a,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_nlg_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x0a,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_nlg_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x0a,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_nlg_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x0a,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x0a,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_nlg_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x0a,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_nlg_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x0a,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_nlg_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x0a,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_nlg_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x0a,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x0a,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_nlg_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x0a,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_nlg_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x0a,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_nlg_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0a,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_nlg_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0a,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x0a,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_nlg_f16_e64 s10, vcc_lo, ttmp15 ; encoding: 
[0x0a,0x00,0x0a,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_nlg_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0a,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_nlg_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0a,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_nlg_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0a,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x0a,0x00,0x0a,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_nlg_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0a,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_nlg_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0a,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_nlg_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x0a,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_nlg_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x0a,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x0a,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_nlg_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x0a,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_nlg_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x0a,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_nlg_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x0a,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_nlg_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x0a,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x0a,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_nlg_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x0a,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_nlg_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x0a,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_nlg_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x0a,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_nlg_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x0a,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x0a,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_nlg_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x0a,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_nlg_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x0a,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_nlg_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x0a,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_nlg_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x0a,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x01,0x0a,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_nlg_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x0a,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_nlg_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x0a,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_nlg_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x0a,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_nlg_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x0a,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x0a,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_nlg_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x0a,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_nlg_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x0a,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_nlg_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x0a,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_nlg_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x0a,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x0a,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_nlg_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x0a,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_nlg_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x0a,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_nlg_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x0a,0xd4,0xf0,0xfa,0x00,0x40] -# W64: v_cmp_nlg_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x0a,0xd4,0xf0,0xfa,0x00,0x40] 
0x6a,0x00,0x0a,0xd4,0xf0,0xfa,0x00,0x40 +# W32: v_cmp_nlg_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x0a,0xd4,0xf0,0xfa,0x00,0x40] +# W64: v_cmp_nlg_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x0a,0xd4,0xf0,0xfa,0x00,0x40] +0x7a,0x02,0x0a,0xd4,0xfd,0xd4,0x00,0x20 # W32: v_cmp_nlg_f16_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x0a,0xd4,0xfd,0xd4,0x00,0x20] # W64: v_cmp_nlg_f16_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x0a,0xd4,0xfd,0xd4,0x00,0x20] -0x7a,0x02,0x0a,0xd4,0xfd,0xd4,0x00,0x20 -# GFX12: v_cmp_nlg_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x0a,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7c,0x83,0x0a,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmp_nlg_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x0a,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_nlg_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x1a,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_nlg_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x1a,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x1a,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_nlg_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x1a,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_nlg_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x1a,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_nlg_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x1a,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_nlg_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x1a,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x1a,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_nlg_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x1a,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_nlg_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x1a,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_nlg_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x1a,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_nlg_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x1a,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x1a,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_nlg_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x1a,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_nlg_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x1a,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_nlg_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x1a,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_nlg_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x1a,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x1a,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_nlg_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x1a,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_nlg_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x1a,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_nlg_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1a,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_nlg_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1a,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x1a,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_nlg_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1a,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_nlg_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1a,0xd4,0x6a,0xf6,0x00,0x00] +0x0a,0x00,0x1a,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_nlg_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x1a,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: v_cmp_nlg_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x1a,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x1a,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_nlg_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x1a,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_nlg_f32_e64 s[10:11], ttmp15, src_scc ; encoding: 
[0x0a,0x00,0x1a,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x1a,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_nlg_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x1a,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_nlg_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x1a,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_nlg_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x1a,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_nlg_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x1a,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x1a,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_nlg_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x1a,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_nlg_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x1a,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_nlg_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x1a,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_nlg_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x1a,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x1a,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_nlg_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x1a,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_nlg_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x1a,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_nlg_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x1a,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_nlg_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x1a,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x01,0x1a,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_nlg_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x1a,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_nlg_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x1a,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_nlg_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x1a,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_nlg_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x1a,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x1a,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_nlg_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x1a,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_nlg_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x1a,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_nlg_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x1a,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_nlg_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x1a,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x1a,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_nlg_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x1a,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_nlg_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x1a,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_nlg_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x1a,0xd4,0xf0,0xfa,0x00,0x40] -# W64: v_cmp_nlg_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x1a,0xd4,0xf0,0xfa,0x00,0x40] 0x6a,0x00,0x1a,0xd4,0xf0,0xfa,0x00,0x40 +# W32: v_cmp_nlg_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x1a,0xd4,0xf0,0xfa,0x00,0x40] +# W64: v_cmp_nlg_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x1a,0xd4,0xf0,0xfa,0x00,0x40] +0x7a,0x02,0x1a,0xd4,0xfd,0xd4,0x00,0x20 # W32: v_cmp_nlg_f32_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x1a,0xd4,0xfd,0xd4,0x00,0x20] # W64: v_cmp_nlg_f32_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x1a,0xd4,0xfd,0xd4,0x00,0x20] -0x7a,0x02,0x1a,0xd4,0xfd,0xd4,0x00,0x20 -# GFX12: v_cmp_nlg_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x1a,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] 0x7c,0x83,0x1a,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf +# GFX12: v_cmp_nlg_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x1a,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] -# 
W32: v_cmp_nlg_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2a,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_nlg_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2a,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x2a,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_nlg_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2a,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_nlg_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2a,0xd4,0x01,0x05,0x02,0x00] +0x0a,0x00,0x2a,0xd4,0xfe,0xfd,0x03,0x00 # W32: v_cmp_nlg_f64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x2a,0xd4,0xfe,0xfd,0x03,0x00] # W64: v_cmp_nlg_f64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x2a,0xd4,0xfe,0xfd,0x03,0x00] -0x0a,0x00,0x2a,0xd4,0xfe,0xfd,0x03,0x00 -# W32: v_cmp_nlg_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2a,0xd4,0x02,0x08,0x00,0x00] -# W64: v_cmp_nlg_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2a,0xd4,0x02,0x08,0x00,0x00] 0x0a,0x00,0x2a,0xd4,0x02,0x08,0x00,0x00 +# W32: v_cmp_nlg_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2a,0xd4,0x02,0x08,0x00,0x00] +# W64: v_cmp_nlg_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2a,0xd4,0x02,0x08,0x00,0x00] +0x0a,0x00,0x2a,0xd4,0x68,0xd0,0x00,0x00 # W32: v_cmp_nlg_f64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x2a,0xd4,0x68,0xd0,0x00,0x00] # W64: v_cmp_nlg_f64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x2a,0xd4,0x68,0xd0,0x00,0x00] -0x0a,0x00,0x2a,0xd4,0x68,0xd0,0x00,0x00 -# W32: v_cmp_nlg_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2a,0xd4,0x6a,0xf4,0x00,0x00] -# W64: v_cmp_nlg_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2a,0xd4,0x6a,0xf4,0x00,0x00] 0x0a,0x00,0x2a,0xd4,0x6a,0xf4,0x00,0x00 +# W32: v_cmp_nlg_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2a,0xd4,0x6a,0xf4,0x00,0x00] +# W64: v_cmp_nlg_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2a,0xd4,0x6a,0xf4,0x00,0x00] +0x0a,0x00,0x2a,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_nlg_f64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x2a,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: v_cmp_nlg_f64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x2a,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x2a,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_nlg_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x2a,0xd4,0x7e,0xfa,0x01,0x20] -# W64: v_cmp_nlg_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x2a,0xd4,0x7e,0xfa,0x01,0x20] 0x0a,0x01,0x2a,0xd4,0x7e,0xfa,0x01,0x20 +# W32: v_cmp_nlg_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x2a,0xd4,0x7e,0xfa,0x01,0x20] +# W64: v_cmp_nlg_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x2a,0xd4,0x7e,0xfa,0x01,0x20] -# W32: v_cmp_nlg_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x2a,0xd4,0x7c,0xe0,0x01,0x00] -# W64: v_cmp_nlg_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x2a,0xd4,0x7c,0xe0,0x01,0x00] 0x0a,0x00,0x2a,0xd4,0x7c,0xe0,0x01,0x00 +# W32: v_cmp_nlg_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x2a,0xd4,0x7c,0xe0,0x01,0x00] +# W64: v_cmp_nlg_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x2a,0xd4,0x7c,0xe0,0x01,0x00] -# W32: v_cmp_nlg_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x2a,0xd4,0xc1,0x82,0x01,0x00] -# W64: v_cmp_nlg_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x2a,0xd4,0xc1,0x82,0x01,0x00] 0x68,0x00,0x2a,0xd4,0xc1,0x82,0x01,0x00 +# W32: v_cmp_nlg_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x2a,0xd4,0xc1,0x82,0x01,0x00] 
+# W64: v_cmp_nlg_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x2a,0xd4,0xc1,0x82,0x01,0x00] -# W32: v_cmp_nlg_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x2a,0xd4,0xf0,0xf8,0x00,0x00] -# W64: v_cmp_nlg_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x2a,0xd4,0xf0,0xf8,0x00,0x00] 0x6a,0x00,0x2a,0xd4,0xf0,0xf8,0x00,0x00 +# W32: v_cmp_nlg_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x2a,0xd4,0xf0,0xf8,0x00,0x00] +# W64: v_cmp_nlg_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x2a,0xd4,0xf0,0xf8,0x00,0x00] +0x7a,0x03,0x2a,0xd4,0xfd,0xfc,0x00,0x60 # W32: v_cmp_nlg_f64_e64 ttmp14, -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x2a,0xd4,0xfd,0xfc,0x00,0x60] # W64: v_cmp_nlg_f64_e64 ttmp[14:15], -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x2a,0xd4,0xfd,0xfc,0x00,0x60] -0x7a,0x03,0x2a,0xd4,0xfd,0xfc,0x00,0x60 -# GFX12: v_cmp_nlg_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x2a,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7c,0x82,0x2a,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf +# GFX12: v_cmp_nlg_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x2a,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -# W32: v_cmp_nlt_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x0e,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_nlt_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x0e,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x0e,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_nlt_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x0e,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_nlt_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x0e,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_nlt_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x0e,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_nlt_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x0e,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x0e,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_nlt_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x0e,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_nlt_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x0e,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_nlt_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x0e,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_nlt_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x0e,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x0e,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_nlt_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x0e,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_nlt_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x0e,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_nlt_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x0e,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_nlt_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x0e,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x0e,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_nlt_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x0e,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_nlt_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x0e,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_nlt_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0e,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_nlt_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0e,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x0e,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_nlt_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0e,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_nlt_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x0e,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_nlt_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0e,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_nlt_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0e,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 
0x0a,0x00,0x0e,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_nlt_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0e,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_nlt_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x0e,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_nlt_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x0e,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_nlt_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x0e,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x0e,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_nlt_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x0e,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_nlt_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x0e,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_nlt_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x0e,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_nlt_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x0e,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x0e,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_nlt_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x0e,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_nlt_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x0e,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_nlt_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x0e,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_nlt_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x0e,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x0e,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_nlt_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x0e,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_nlt_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x0e,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_nlt_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x0e,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_nlt_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x0e,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x01,0x0e,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_nlt_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x0e,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_nlt_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x0e,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_nlt_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x0e,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_nlt_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x0e,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x0e,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_nlt_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x0e,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_nlt_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x0e,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_nlt_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x0e,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_nlt_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x0e,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x0e,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_nlt_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x0e,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_nlt_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x0e,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_nlt_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x0e,0xd4,0xf0,0xfa,0x00,0x40] -# W64: v_cmp_nlt_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x0e,0xd4,0xf0,0xfa,0x00,0x40] 0x6a,0x00,0x0e,0xd4,0xf0,0xfa,0x00,0x40 +# W32: v_cmp_nlt_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x0e,0xd4,0xf0,0xfa,0x00,0x40] +# W64: v_cmp_nlt_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x0e,0xd4,0xf0,0xfa,0x00,0x40] +0x7a,0x02,0x0e,0xd4,0xfd,0xd4,0x00,0x20 # W32: v_cmp_nlt_f16_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x0e,0xd4,0xfd,0xd4,0x00,0x20] # W64: 
v_cmp_nlt_f16_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x0e,0xd4,0xfd,0xd4,0x00,0x20] -0x7a,0x02,0x0e,0xd4,0xfd,0xd4,0x00,0x20 -# GFX12: v_cmp_nlt_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x0e,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7c,0x83,0x0e,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmp_nlt_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x0e,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_nlt_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x1e,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_nlt_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x1e,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x1e,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_nlt_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x1e,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_nlt_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x1e,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_nlt_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x1e,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_nlt_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x1e,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x1e,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_nlt_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x1e,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_nlt_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x1e,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_nlt_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x1e,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_nlt_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x1e,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x1e,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_nlt_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x1e,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_nlt_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x1e,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_nlt_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x1e,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_nlt_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x1e,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x1e,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_nlt_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x1e,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_nlt_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x1e,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_nlt_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1e,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_nlt_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1e,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x1e,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_nlt_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1e,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_nlt_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x1e,0xd4,0x6a,0xf6,0x00,0x00] +0x0a,0x00,0x1e,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_nlt_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x1e,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: v_cmp_nlt_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x1e,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x1e,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_nlt_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x1e,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_nlt_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x1e,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x1e,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_nlt_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x1e,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_nlt_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x1e,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_nlt_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x1e,0xd4,0x7d,0xe0,0x01,0x00] -# 
W64: v_cmp_nlt_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x1e,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x1e,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_nlt_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x1e,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_nlt_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x1e,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_nlt_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x1e,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_nlt_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x1e,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x1e,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_nlt_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x1e,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_nlt_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x1e,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_nlt_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x1e,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_nlt_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x1e,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x01,0x1e,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_nlt_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x1e,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_nlt_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x1e,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_nlt_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x1e,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_nlt_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x1e,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x1e,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_nlt_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x1e,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_nlt_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x1e,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_nlt_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x1e,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_nlt_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x1e,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x1e,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_nlt_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x1e,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_nlt_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x1e,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_nlt_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x1e,0xd4,0xf0,0xfa,0x00,0x40] -# W64: v_cmp_nlt_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x1e,0xd4,0xf0,0xfa,0x00,0x40] 0x6a,0x00,0x1e,0xd4,0xf0,0xfa,0x00,0x40 +# W32: v_cmp_nlt_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x1e,0xd4,0xf0,0xfa,0x00,0x40] +# W64: v_cmp_nlt_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x1e,0xd4,0xf0,0xfa,0x00,0x40] +0x7a,0x02,0x1e,0xd4,0xfd,0xd4,0x00,0x20 # W32: v_cmp_nlt_f32_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x1e,0xd4,0xfd,0xd4,0x00,0x20] # W64: v_cmp_nlt_f32_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x1e,0xd4,0xfd,0xd4,0x00,0x20] -0x7a,0x02,0x1e,0xd4,0xfd,0xd4,0x00,0x20 -# GFX12: v_cmp_nlt_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x1e,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] 0x7c,0x83,0x1e,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf +# GFX12: v_cmp_nlt_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x1e,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] -# W32: v_cmp_nlt_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2e,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_nlt_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2e,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x2e,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_nlt_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2e,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_nlt_f64_e64 s[10:11], 
v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x2e,0xd4,0x01,0x05,0x02,0x00] +0x0a,0x00,0x2e,0xd4,0xfe,0xfd,0x03,0x00 # W32: v_cmp_nlt_f64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x2e,0xd4,0xfe,0xfd,0x03,0x00] # W64: v_cmp_nlt_f64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x2e,0xd4,0xfe,0xfd,0x03,0x00] -0x0a,0x00,0x2e,0xd4,0xfe,0xfd,0x03,0x00 -# W32: v_cmp_nlt_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2e,0xd4,0x02,0x08,0x00,0x00] -# W64: v_cmp_nlt_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2e,0xd4,0x02,0x08,0x00,0x00] 0x0a,0x00,0x2e,0xd4,0x02,0x08,0x00,0x00 +# W32: v_cmp_nlt_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2e,0xd4,0x02,0x08,0x00,0x00] +# W64: v_cmp_nlt_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x2e,0xd4,0x02,0x08,0x00,0x00] +0x0a,0x00,0x2e,0xd4,0x68,0xd0,0x00,0x00 # W32: v_cmp_nlt_f64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x2e,0xd4,0x68,0xd0,0x00,0x00] # W64: v_cmp_nlt_f64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x2e,0xd4,0x68,0xd0,0x00,0x00] -0x0a,0x00,0x2e,0xd4,0x68,0xd0,0x00,0x00 -# W32: v_cmp_nlt_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2e,0xd4,0x6a,0xf4,0x00,0x00] -# W64: v_cmp_nlt_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2e,0xd4,0x6a,0xf4,0x00,0x00] 0x0a,0x00,0x2e,0xd4,0x6a,0xf4,0x00,0x00 +# W32: v_cmp_nlt_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2e,0xd4,0x6a,0xf4,0x00,0x00] +# W64: v_cmp_nlt_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x2e,0xd4,0x6a,0xf4,0x00,0x00] +0x0a,0x00,0x2e,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_nlt_f64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x2e,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: v_cmp_nlt_f64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x2e,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x2e,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_nlt_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x2e,0xd4,0x7e,0xfa,0x01,0x20] -# W64: v_cmp_nlt_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x2e,0xd4,0x7e,0xfa,0x01,0x20] 0x0a,0x01,0x2e,0xd4,0x7e,0xfa,0x01,0x20 +# W32: v_cmp_nlt_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x2e,0xd4,0x7e,0xfa,0x01,0x20] +# W64: v_cmp_nlt_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x2e,0xd4,0x7e,0xfa,0x01,0x20] -# W32: v_cmp_nlt_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x2e,0xd4,0x7c,0xe0,0x01,0x00] -# W64: v_cmp_nlt_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x2e,0xd4,0x7c,0xe0,0x01,0x00] 0x0a,0x00,0x2e,0xd4,0x7c,0xe0,0x01,0x00 +# W32: v_cmp_nlt_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x2e,0xd4,0x7c,0xe0,0x01,0x00] +# W64: v_cmp_nlt_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x2e,0xd4,0x7c,0xe0,0x01,0x00] -# W32: v_cmp_nlt_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x2e,0xd4,0xc1,0x82,0x01,0x00] -# W64: v_cmp_nlt_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x2e,0xd4,0xc1,0x82,0x01,0x00] 0x68,0x00,0x2e,0xd4,0xc1,0x82,0x01,0x00 +# W32: v_cmp_nlt_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x2e,0xd4,0xc1,0x82,0x01,0x00] +# W64: v_cmp_nlt_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x2e,0xd4,0xc1,0x82,0x01,0x00] -# W32: v_cmp_nlt_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x2e,0xd4,0xf0,0xf8,0x00,0x00] -# W64: v_cmp_nlt_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x2e,0xd4,0xf0,0xf8,0x00,0x00] 0x6a,0x00,0x2e,0xd4,0xf0,0xf8,0x00,0x00 +# W32: v_cmp_nlt_f64_e64 vcc_lo, 0.5, null ; 
encoding: [0x6a,0x00,0x2e,0xd4,0xf0,0xf8,0x00,0x00] +# W64: v_cmp_nlt_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x2e,0xd4,0xf0,0xf8,0x00,0x00] +0x7a,0x03,0x2e,0xd4,0xfd,0xfc,0x00,0x60 # W32: v_cmp_nlt_f64_e64 ttmp14, -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x2e,0xd4,0xfd,0xfc,0x00,0x60] # W64: v_cmp_nlt_f64_e64 ttmp[14:15], -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x2e,0xd4,0xfd,0xfc,0x00,0x60] -0x7a,0x03,0x2e,0xd4,0xfd,0xfc,0x00,0x60 -# GFX12: v_cmp_nlt_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x2e,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7c,0x82,0x2e,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf +# GFX12: v_cmp_nlt_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x2e,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -# W32: v_cmp_o_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x07,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_o_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x07,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x07,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_o_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x07,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_o_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x07,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_o_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x07,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_o_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x07,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x07,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_o_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x07,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_o_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x07,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_o_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x07,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_o_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x07,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x07,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_o_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x07,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_o_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x07,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_o_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x07,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_o_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x07,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x07,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_o_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x07,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_o_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x07,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_o_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x07,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_o_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x07,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x07,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_o_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x07,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_o_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x07,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_o_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x07,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_o_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x07,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x0a,0x00,0x07,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_o_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x07,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_o_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x07,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_o_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x07,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_o_f16_e64 
s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x07,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x07,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_o_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x07,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_o_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x07,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_o_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x07,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_o_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x07,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x07,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_o_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x07,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_o_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x07,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_o_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x07,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_o_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x07,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x07,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_o_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x07,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_o_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x07,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_o_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x07,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_o_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x07,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x01,0x07,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_o_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x07,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_o_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x07,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_o_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x07,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_o_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x07,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x07,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_o_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x07,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_o_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x07,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_o_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x07,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_o_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x07,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x07,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_o_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x07,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_o_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x07,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_o_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x07,0xd4,0xf0,0xfa,0x00,0x40] -# W64: v_cmp_o_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x07,0xd4,0xf0,0xfa,0x00,0x40] 0x6a,0x00,0x07,0xd4,0xf0,0xfa,0x00,0x40 +# W32: v_cmp_o_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x07,0xd4,0xf0,0xfa,0x00,0x40] +# W64: v_cmp_o_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x07,0xd4,0xf0,0xfa,0x00,0x40] +0x7a,0x02,0x07,0xd4,0xfd,0xd4,0x00,0x20 # W32: v_cmp_o_f16_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x07,0xd4,0xfd,0xd4,0x00,0x20] # W64: v_cmp_o_f16_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x07,0xd4,0xfd,0xd4,0x00,0x20] -0x7a,0x02,0x07,0xd4,0xfd,0xd4,0x00,0x20 -# GFX12: v_cmp_o_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x07,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7c,0x83,0x07,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmp_o_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x07,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_o_f32_e64 s10, v1, 
v2 ; encoding: [0x0a,0x00,0x17,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_o_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x17,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x17,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_o_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x17,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_o_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x17,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_o_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x17,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_o_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x17,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x17,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_o_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x17,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_o_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x17,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_o_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x17,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_o_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x17,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x17,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_o_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x17,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_o_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x17,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_o_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x17,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_o_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x17,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x17,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_o_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x17,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_o_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x17,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_o_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x17,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_o_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x17,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x17,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_o_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x17,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_o_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x17,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_o_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x17,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W64: v_cmp_o_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x17,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x0a,0x00,0x17,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# W32: v_cmp_o_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x17,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] +# W64: v_cmp_o_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x17,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_o_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x17,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_o_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x17,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x17,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_o_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x17,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_o_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x17,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_o_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x17,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_o_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x17,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x17,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_o_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x17,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_o_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x17,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_o_f32_e64 s10, exec_lo, -1 ; encoding: 
[0x0a,0x00,0x17,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_o_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x17,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x17,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_o_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x17,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_o_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x17,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_o_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x17,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_o_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x17,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x01,0x17,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_o_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x17,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_o_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x17,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_o_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x17,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_o_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x17,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x17,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_o_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x17,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_o_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x17,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_o_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x17,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_o_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x17,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x17,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_o_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x17,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_o_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x17,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_o_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x17,0xd4,0xf0,0xfa,0x00,0x40] -# W64: v_cmp_o_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x17,0xd4,0xf0,0xfa,0x00,0x40] 0x6a,0x00,0x17,0xd4,0xf0,0xfa,0x00,0x40 +# W32: v_cmp_o_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x17,0xd4,0xf0,0xfa,0x00,0x40] +# W64: v_cmp_o_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x17,0xd4,0xf0,0xfa,0x00,0x40] +0x7a,0x02,0x17,0xd4,0xfd,0xd4,0x00,0x20 # W32: v_cmp_o_f32_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x17,0xd4,0xfd,0xd4,0x00,0x20] # W64: v_cmp_o_f32_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x17,0xd4,0xfd,0xd4,0x00,0x20] -0x7a,0x02,0x17,0xd4,0xfd,0xd4,0x00,0x20 -# GFX12: v_cmp_o_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x17,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] 0x7c,0x83,0x17,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf +# GFX12: v_cmp_o_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x17,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] -# W32: v_cmp_o_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x27,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_o_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x27,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x27,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_o_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x27,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_o_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x27,0xd4,0x01,0x05,0x02,0x00] +0x0a,0x00,0x27,0xd4,0xfe,0xfd,0x03,0x00 # W32: v_cmp_o_f64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x27,0xd4,0xfe,0xfd,0x03,0x00] # W64: v_cmp_o_f64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x27,0xd4,0xfe,0xfd,0x03,0x00] -0x0a,0x00,0x27,0xd4,0xfe,0xfd,0x03,0x00 -# W32: v_cmp_o_f64_e64 s10, s[2:3], s[4:5] ; encoding: 
[0x0a,0x00,0x27,0xd4,0x02,0x08,0x00,0x00] -# W64: v_cmp_o_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x27,0xd4,0x02,0x08,0x00,0x00] 0x0a,0x00,0x27,0xd4,0x02,0x08,0x00,0x00 +# W32: v_cmp_o_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x27,0xd4,0x02,0x08,0x00,0x00] +# W64: v_cmp_o_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x27,0xd4,0x02,0x08,0x00,0x00] +0x0a,0x00,0x27,0xd4,0x68,0xd0,0x00,0x00 # W32: v_cmp_o_f64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x27,0xd4,0x68,0xd0,0x00,0x00] # W64: v_cmp_o_f64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x27,0xd4,0x68,0xd0,0x00,0x00] -0x0a,0x00,0x27,0xd4,0x68,0xd0,0x00,0x00 -# W32: v_cmp_o_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x27,0xd4,0x6a,0xf4,0x00,0x00] -# W64: v_cmp_o_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x27,0xd4,0x6a,0xf4,0x00,0x00] 0x0a,0x00,0x27,0xd4,0x6a,0xf4,0x00,0x00 +# W32: v_cmp_o_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x27,0xd4,0x6a,0xf4,0x00,0x00] +# W64: v_cmp_o_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x27,0xd4,0x6a,0xf4,0x00,0x00] +0x0a,0x00,0x27,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_o_f64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x27,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: v_cmp_o_f64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x27,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x27,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_o_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x27,0xd4,0x7e,0xfa,0x01,0x20] -# W64: v_cmp_o_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x27,0xd4,0x7e,0xfa,0x01,0x20] 0x0a,0x01,0x27,0xd4,0x7e,0xfa,0x01,0x20 +# W32: v_cmp_o_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x27,0xd4,0x7e,0xfa,0x01,0x20] +# W64: v_cmp_o_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x27,0xd4,0x7e,0xfa,0x01,0x20] -# W32: v_cmp_o_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x27,0xd4,0x7c,0xe0,0x01,0x00] -# W64: v_cmp_o_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x27,0xd4,0x7c,0xe0,0x01,0x00] 0x0a,0x00,0x27,0xd4,0x7c,0xe0,0x01,0x00 +# W32: v_cmp_o_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x27,0xd4,0x7c,0xe0,0x01,0x00] +# W64: v_cmp_o_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x27,0xd4,0x7c,0xe0,0x01,0x00] -# W32: v_cmp_o_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x27,0xd4,0xc1,0x82,0x01,0x00] -# W64: v_cmp_o_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x27,0xd4,0xc1,0x82,0x01,0x00] 0x68,0x00,0x27,0xd4,0xc1,0x82,0x01,0x00 +# W32: v_cmp_o_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x27,0xd4,0xc1,0x82,0x01,0x00] +# W64: v_cmp_o_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x27,0xd4,0xc1,0x82,0x01,0x00] -# W32: v_cmp_o_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x27,0xd4,0xf0,0xf8,0x00,0x00] -# W64: v_cmp_o_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x27,0xd4,0xf0,0xf8,0x00,0x00] 0x6a,0x00,0x27,0xd4,0xf0,0xf8,0x00,0x00 +# W32: v_cmp_o_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x27,0xd4,0xf0,0xf8,0x00,0x00] +# W64: v_cmp_o_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x27,0xd4,0xf0,0xf8,0x00,0x00] +0x7a,0x03,0x27,0xd4,0xfd,0xfc,0x00,0x60 # W32: v_cmp_o_f64_e64 ttmp14, -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x27,0xd4,0xfd,0xfc,0x00,0x60] # W64: v_cmp_o_f64_e64 ttmp[14:15], -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x27,0xd4,0xfd,0xfc,0x00,0x60] -0x7a,0x03,0x27,0xd4,0xfd,0xfc,0x00,0x60 -# GFX12: v_cmp_o_f64_e64 null, 
0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x27,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7c,0x82,0x27,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf +# GFX12: v_cmp_o_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x27,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -# W32: v_cmp_u_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x08,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_u_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x08,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x08,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_u_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x08,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_u_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x08,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_u_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x08,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_u_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x08,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x08,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_u_f16_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x08,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_u_f16_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x08,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_u_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x08,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_u_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x08,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x08,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_u_f16_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x08,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_u_f16_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x08,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_u_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x08,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_u_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x08,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x08,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_u_f16_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x08,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_u_f16_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x08,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_u_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x08,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_u_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x08,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x08,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_u_f16_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x08,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_u_f16_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x08,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_u_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x08,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_u_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x08,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x0a,0x00,0x08,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_u_f16_e64 s10, vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x08,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_u_f16_e64 s[10:11], vcc_hi, 0xfe0b ; encoding: [0x0a,0x00,0x08,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_u_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x08,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_u_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x08,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x08,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_u_f16_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x08,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_u_f16_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x08,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_u_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x08,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_u_f16_e64 s[10:11], m0, 0.5 ; encoding: 
[0x0a,0x00,0x08,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x08,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_u_f16_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x08,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_u_f16_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x08,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_u_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x08,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_u_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x08,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x08,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_u_f16_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x08,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_u_f16_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x08,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_u_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x08,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_u_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x08,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x01,0x08,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_u_f16_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x08,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_u_f16_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x08,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_u_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x08,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_u_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x08,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x08,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_u_f16_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x08,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_u_f16_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x08,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_u_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x08,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_u_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x08,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x08,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_u_f16_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x08,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_u_f16_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x08,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_u_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x08,0xd4,0xf0,0xfa,0x00,0x40] -# W64: v_cmp_u_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x08,0xd4,0xf0,0xfa,0x00,0x40] 0x6a,0x00,0x08,0xd4,0xf0,0xfa,0x00,0x40 +# W32: v_cmp_u_f16_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x08,0xd4,0xf0,0xfa,0x00,0x40] +# W64: v_cmp_u_f16_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x08,0xd4,0xf0,0xfa,0x00,0x40] +0x7a,0x02,0x08,0xd4,0xfd,0xd4,0x00,0x20 # W32: v_cmp_u_f16_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x08,0xd4,0xfd,0xd4,0x00,0x20] # W64: v_cmp_u_f16_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x08,0xd4,0xfd,0xd4,0x00,0x20] -0x7a,0x02,0x08,0xd4,0xfd,0xd4,0x00,0x20 -# GFX12: v_cmp_u_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x08,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7c,0x83,0x08,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmp_u_f16_e64 null, -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x08,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_u_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x18,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_u_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x18,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x18,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_u_f32_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x18,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_u_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x18,0xd4,0x01,0x05,0x02,0x00] -# W32: v_cmp_u_f32_e64 s10, v255, v255 ; encoding: 
[0x0a,0x00,0x18,0xd4,0xff,0xff,0x03,0x00] -# W64: v_cmp_u_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x18,0xd4,0xff,0xff,0x03,0x00] 0x0a,0x00,0x18,0xd4,0xff,0xff,0x03,0x00 +# W32: v_cmp_u_f32_e64 s10, v255, v255 ; encoding: [0x0a,0x00,0x18,0xd4,0xff,0xff,0x03,0x00] +# W64: v_cmp_u_f32_e64 s[10:11], v255, v255 ; encoding: [0x0a,0x00,0x18,0xd4,0xff,0xff,0x03,0x00] -# W32: v_cmp_u_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x18,0xd4,0x01,0x04,0x00,0x00] -# W64: v_cmp_u_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x18,0xd4,0x01,0x04,0x00,0x00] 0x0a,0x00,0x18,0xd4,0x01,0x04,0x00,0x00 +# W32: v_cmp_u_f32_e64 s10, s1, s2 ; encoding: [0x0a,0x00,0x18,0xd4,0x01,0x04,0x00,0x00] +# W64: v_cmp_u_f32_e64 s[10:11], s1, s2 ; encoding: [0x0a,0x00,0x18,0xd4,0x01,0x04,0x00,0x00] -# W32: v_cmp_u_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x18,0xd4,0x69,0xd2,0x00,0x00] -# W64: v_cmp_u_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x18,0xd4,0x69,0xd2,0x00,0x00] 0x0a,0x00,0x18,0xd4,0x69,0xd2,0x00,0x00 +# W32: v_cmp_u_f32_e64 s10, s105, s105 ; encoding: [0x0a,0x00,0x18,0xd4,0x69,0xd2,0x00,0x00] +# W64: v_cmp_u_f32_e64 s[10:11], s105, s105 ; encoding: [0x0a,0x00,0x18,0xd4,0x69,0xd2,0x00,0x00] -# W32: v_cmp_u_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x18,0xd4,0x6a,0xf6,0x00,0x00] -# W64: v_cmp_u_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x18,0xd4,0x6a,0xf6,0x00,0x00] 0x0a,0x00,0x18,0xd4,0x6a,0xf6,0x00,0x00 +# W32: v_cmp_u_f32_e64 s10, vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x18,0xd4,0x6a,0xf6,0x00,0x00] +# W64: v_cmp_u_f32_e64 s[10:11], vcc_lo, ttmp15 ; encoding: [0x0a,0x00,0x18,0xd4,0x6a,0xf6,0x00,0x00] -# W32: v_cmp_u_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x18,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W64: v_cmp_u_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x18,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x0a,0x00,0x18,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# W32: v_cmp_u_f32_e64 s10, vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x18,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] +# W64: v_cmp_u_f32_e64 s[10:11], vcc_hi, 0xaf123456 ; encoding: [0x0a,0x00,0x18,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# W32: v_cmp_u_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x18,0xd4,0x7b,0xfa,0x01,0x00] -# W64: v_cmp_u_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x18,0xd4,0x7b,0xfa,0x01,0x00] 0x0a,0x00,0x18,0xd4,0x7b,0xfa,0x01,0x00 +# W32: v_cmp_u_f32_e64 s10, ttmp15, src_scc ; encoding: [0x0a,0x00,0x18,0xd4,0x7b,0xfa,0x01,0x00] +# W64: v_cmp_u_f32_e64 s[10:11], ttmp15, src_scc ; encoding: [0x0a,0x00,0x18,0xd4,0x7b,0xfa,0x01,0x00] -# W32: v_cmp_u_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x18,0xd4,0x7d,0xe0,0x01,0x00] -# W64: v_cmp_u_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x18,0xd4,0x7d,0xe0,0x01,0x00] 0x0a,0x00,0x18,0xd4,0x7d,0xe0,0x01,0x00 +# W32: v_cmp_u_f32_e64 s10, m0, 0.5 ; encoding: [0x0a,0x00,0x18,0xd4,0x7d,0xe0,0x01,0x00] +# W64: v_cmp_u_f32_e64 s[10:11], m0, 0.5 ; encoding: [0x0a,0x00,0x18,0xd4,0x7d,0xe0,0x01,0x00] -# W32: v_cmp_u_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x18,0xd4,0x7e,0x82,0x01,0x00] -# W64: v_cmp_u_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x18,0xd4,0x7e,0x82,0x01,0x00] 0x0a,0x00,0x18,0xd4,0x7e,0x82,0x01,0x00 +# W32: v_cmp_u_f32_e64 s10, exec_lo, -1 ; encoding: [0x0a,0x00,0x18,0xd4,0x7e,0x82,0x01,0x00] +# W64: v_cmp_u_f32_e64 s[10:11], exec_lo, -1 ; encoding: [0x0a,0x00,0x18,0xd4,0x7e,0x82,0x01,0x00] -# W32: v_cmp_u_f32_e64 s10, |exec_hi|, null ; 
encoding: [0x0a,0x01,0x18,0xd4,0x7f,0xf8,0x00,0x00] -# W64: v_cmp_u_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x18,0xd4,0x7f,0xf8,0x00,0x00] 0x0a,0x01,0x18,0xd4,0x7f,0xf8,0x00,0x00 +# W32: v_cmp_u_f32_e64 s10, |exec_hi|, null ; encoding: [0x0a,0x01,0x18,0xd4,0x7f,0xf8,0x00,0x00] +# W64: v_cmp_u_f32_e64 s[10:11], |exec_hi|, null ; encoding: [0x0a,0x01,0x18,0xd4,0x7f,0xf8,0x00,0x00] -# W32: v_cmp_u_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x18,0xd4,0x7c,0xfc,0x00,0x00] -# W64: v_cmp_u_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x18,0xd4,0x7c,0xfc,0x00,0x00] 0x0a,0x00,0x18,0xd4,0x7c,0xfc,0x00,0x00 +# W32: v_cmp_u_f32_e64 s10, null, exec_lo ; encoding: [0x0a,0x00,0x18,0xd4,0x7c,0xfc,0x00,0x00] +# W64: v_cmp_u_f32_e64 s[10:11], null, exec_lo ; encoding: [0x0a,0x00,0x18,0xd4,0x7c,0xfc,0x00,0x00] -# W32: v_cmp_u_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x18,0xd4,0xc1,0xfe,0x00,0x00] -# W64: v_cmp_u_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x18,0xd4,0xc1,0xfe,0x00,0x00] 0x68,0x00,0x18,0xd4,0xc1,0xfe,0x00,0x00 +# W32: v_cmp_u_f32_e64 s104, -1, exec_hi ; encoding: [0x68,0x00,0x18,0xd4,0xc1,0xfe,0x00,0x00] +# W64: v_cmp_u_f32_e64 s[104:105], -1, exec_hi ; encoding: [0x68,0x00,0x18,0xd4,0xc1,0xfe,0x00,0x00] -# W32: v_cmp_u_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x18,0xd4,0xf0,0xfa,0x00,0x40] -# W64: v_cmp_u_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x18,0xd4,0xf0,0xfa,0x00,0x40] 0x6a,0x00,0x18,0xd4,0xf0,0xfa,0x00,0x40 +# W32: v_cmp_u_f32_e64 vcc_lo, 0.5, -m0 ; encoding: [0x6a,0x00,0x18,0xd4,0xf0,0xfa,0x00,0x40] +# W64: v_cmp_u_f32_e64 vcc, 0.5, -m0 ; encoding: [0x6a,0x00,0x18,0xd4,0xf0,0xfa,0x00,0x40] +0x7a,0x02,0x18,0xd4,0xfd,0xd4,0x00,0x20 # W32: v_cmp_u_f32_e64 ttmp14, -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x18,0xd4,0xfd,0xd4,0x00,0x20] # W64: v_cmp_u_f32_e64 ttmp[14:15], -src_scc, |vcc_lo| ; encoding: [0x7a,0x02,0x18,0xd4,0xfd,0xd4,0x00,0x20] -0x7a,0x02,0x18,0xd4,0xfd,0xd4,0x00,0x20 -# GFX12: v_cmp_u_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x18,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] 0x7c,0x83,0x18,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf +# GFX12: v_cmp_u_f32_e64 null, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7c,0x83,0x18,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] -# W32: v_cmp_u_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x28,0xd4,0x01,0x05,0x02,0x00] -# W64: v_cmp_u_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x28,0xd4,0x01,0x05,0x02,0x00] 0x0a,0x00,0x28,0xd4,0x01,0x05,0x02,0x00 +# W32: v_cmp_u_f64_e64 s10, v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x28,0xd4,0x01,0x05,0x02,0x00] +# W64: v_cmp_u_f64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0x28,0xd4,0x01,0x05,0x02,0x00] +0x0a,0x00,0x28,0xd4,0xfe,0xfd,0x03,0x00 # W32: v_cmp_u_f64_e64 s10, v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x28,0xd4,0xfe,0xfd,0x03,0x00] # W64: v_cmp_u_f64_e64 s[10:11], v[254:255], v[254:255] ; encoding: [0x0a,0x00,0x28,0xd4,0xfe,0xfd,0x03,0x00] -0x0a,0x00,0x28,0xd4,0xfe,0xfd,0x03,0x00 -# W32: v_cmp_u_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x28,0xd4,0x02,0x08,0x00,0x00] -# W64: v_cmp_u_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x28,0xd4,0x02,0x08,0x00,0x00] 0x0a,0x00,0x28,0xd4,0x02,0x08,0x00,0x00 +# W32: v_cmp_u_f64_e64 s10, s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x28,0xd4,0x02,0x08,0x00,0x00] +# W64: v_cmp_u_f64_e64 s[10:11], s[2:3], s[4:5] ; encoding: [0x0a,0x00,0x28,0xd4,0x02,0x08,0x00,0x00] +0x0a,0x00,0x28,0xd4,0x68,0xd0,0x00,0x00 # W32: 
v_cmp_u_f64_e64 s10, s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x28,0xd4,0x68,0xd0,0x00,0x00] # W64: v_cmp_u_f64_e64 s[10:11], s[104:105], s[104:105] ; encoding: [0x0a,0x00,0x28,0xd4,0x68,0xd0,0x00,0x00] -0x0a,0x00,0x28,0xd4,0x68,0xd0,0x00,0x00 -# W32: v_cmp_u_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x28,0xd4,0x6a,0xf4,0x00,0x00] -# W64: v_cmp_u_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x28,0xd4,0x6a,0xf4,0x00,0x00] 0x0a,0x00,0x28,0xd4,0x6a,0xf4,0x00,0x00 +# W32: v_cmp_u_f64_e64 s10, vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x28,0xd4,0x6a,0xf4,0x00,0x00] +# W64: v_cmp_u_f64_e64 s[10:11], vcc, ttmp[14:15] ; encoding: [0x0a,0x00,0x28,0xd4,0x6a,0xf4,0x00,0x00] +0x0a,0x00,0x28,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf # W32: v_cmp_u_f64_e64 s10, ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x28,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] # W64: v_cmp_u_f64_e64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x0a,0x00,0x28,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -0x0a,0x00,0x28,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf -# W32: v_cmp_u_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x28,0xd4,0x7e,0xfa,0x01,0x20] -# W64: v_cmp_u_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x28,0xd4,0x7e,0xfa,0x01,0x20] 0x0a,0x01,0x28,0xd4,0x7e,0xfa,0x01,0x20 +# W32: v_cmp_u_f64_e64 s10, -|exec|, src_scc ; encoding: [0x0a,0x01,0x28,0xd4,0x7e,0xfa,0x01,0x20] +# W64: v_cmp_u_f64_e64 s[10:11], -|exec|, src_scc ; encoding: [0x0a,0x01,0x28,0xd4,0x7e,0xfa,0x01,0x20] -# W32: v_cmp_u_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x28,0xd4,0x7c,0xe0,0x01,0x00] -# W64: v_cmp_u_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x28,0xd4,0x7c,0xe0,0x01,0x00] 0x0a,0x00,0x28,0xd4,0x7c,0xe0,0x01,0x00 +# W32: v_cmp_u_f64_e64 s10, null, 0.5 ; encoding: [0x0a,0x00,0x28,0xd4,0x7c,0xe0,0x01,0x00] +# W64: v_cmp_u_f64_e64 s[10:11], null, 0.5 ; encoding: [0x0a,0x00,0x28,0xd4,0x7c,0xe0,0x01,0x00] -# W32: v_cmp_u_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x28,0xd4,0xc1,0x82,0x01,0x00] -# W64: v_cmp_u_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x28,0xd4,0xc1,0x82,0x01,0x00] 0x68,0x00,0x28,0xd4,0xc1,0x82,0x01,0x00 +# W32: v_cmp_u_f64_e64 s104, -1, -1 ; encoding: [0x68,0x00,0x28,0xd4,0xc1,0x82,0x01,0x00] +# W64: v_cmp_u_f64_e64 s[104:105], -1, -1 ; encoding: [0x68,0x00,0x28,0xd4,0xc1,0x82,0x01,0x00] -# W32: v_cmp_u_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x28,0xd4,0xf0,0xf8,0x00,0x00] -# W64: v_cmp_u_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x28,0xd4,0xf0,0xf8,0x00,0x00] 0x6a,0x00,0x28,0xd4,0xf0,0xf8,0x00,0x00 +# W32: v_cmp_u_f64_e64 vcc_lo, 0.5, null ; encoding: [0x6a,0x00,0x28,0xd4,0xf0,0xf8,0x00,0x00] +# W64: v_cmp_u_f64_e64 vcc, 0.5, null ; encoding: [0x6a,0x00,0x28,0xd4,0xf0,0xf8,0x00,0x00] +0x7a,0x03,0x28,0xd4,0xfd,0xfc,0x00,0x60 # W32: v_cmp_u_f64_e64 ttmp14, -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x28,0xd4,0xfd,0xfc,0x00,0x60] # W64: v_cmp_u_f64_e64 ttmp[14:15], -|src_scc|, -|exec| ; encoding: [0x7a,0x03,0x28,0xd4,0xfd,0xfc,0x00,0x60] -0x7a,0x03,0x28,0xd4,0xfd,0xfc,0x00,0x60 -# GFX12: v_cmp_u_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x28,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7c,0x82,0x28,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf +# GFX12: v_cmp_u_f64_e64 null, 0xaf123456, -|vcc| clamp ; encoding: [0x7c,0x82,0x28,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp16.txt index 
564312f579bce..8c782471db224 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp16.txt @@ -1,3195 +1,3198 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W32 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s +# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W32 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W32 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s +0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_class_f16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_class_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_class_f16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_class_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_class_f16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_class_f16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_class_f16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_class_f16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_class_f16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_class_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x7d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_class_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x0a,0x00,0x7d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_class_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x7d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_class_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_class_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_class_f16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_class_f16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_class_f16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_class_f16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_class_f16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_class_f16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_class_f16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_class_f16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_class_f16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_class_f16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_class_f16_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_class_f16_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_class_f16_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0x7a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_class_f16_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -0x7a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_cmp_class_f16_e64_dpp null, -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x01,0x7d,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30] 0x7c,0x01,0x7d,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmp_class_f16_e64_dpp null, -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x01,0x7d,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_class_f32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_class_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_class_f32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_class_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_class_f32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_class_f32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_class_f32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_class_f32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_class_f32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_class_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x7e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_class_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_class_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x7e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_class_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_class_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_class_f32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_class_f32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_class_f32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_class_f32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_class_f32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_class_f32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_class_f32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_class_f32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_class_f32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_class_f32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_class_f32_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_class_f32_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_class_f32_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_class_f32_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -0x7a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_cmp_class_f32_e64_dpp null, -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x01,0x7e,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30] 
0x7c,0x01,0x7e,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmp_class_f32_e64_dpp null, -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x01,0x7e,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_eq_f16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_eq_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_eq_f16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_eq_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_eq_f16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_eq_f16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_eq_f16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_eq_f16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_eq_f16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_eq_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x02,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_eq_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_eq_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x02,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_eq_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_eq_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_eq_f16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_eq_f16_e64_dpp s[10:11], 
v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_eq_f16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_eq_f16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_eq_f16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_eq_f16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_eq_f16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_eq_f16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_eq_f16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_eq_f16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x01,0x02,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 # W32: v_cmp_eq_f16_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x02,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] # W64: v_cmp_eq_f16_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x02,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -0x6a,0x01,0x02,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +0x7a,0x02,0x02,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 # W32: v_cmp_eq_f16_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x02,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] # W64: v_cmp_eq_f16_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x02,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -0x7a,0x02,0x02,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmp_eq_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x02,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7c,0x83,0x02,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmp_eq_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x02,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_eq_f32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_eq_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; 
encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_eq_f32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_eq_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_eq_f32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_eq_f32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_eq_f32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_eq_f32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_eq_f32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_eq_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x12,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_eq_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_eq_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x12,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_eq_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_eq_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_eq_f32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_eq_f32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_eq_f32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_eq_f32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 
-0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_eq_f32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_eq_f32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_eq_f32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_eq_f32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_eq_f32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_eq_f32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x01,0x12,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 # W32: v_cmp_eq_f32_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x12,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] # W64: v_cmp_eq_f32_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x12,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -0x6a,0x01,0x12,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +0x7a,0x02,0x12,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 # W32: v_cmp_eq_f32_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x12,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] # W64: v_cmp_eq_f32_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x12,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -0x7a,0x02,0x12,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmp_eq_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x12,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7c,0x83,0x12,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmp_eq_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x12,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_eq_i16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_eq_i16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_eq_i16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_eq_i16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 
-0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_eq_i16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_eq_i16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_eq_i16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_eq_i16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_eq_i16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_eq_i16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x32,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_eq_i16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_eq_i16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x32,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_eq_i16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_eq_i16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_eq_i16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_eq_i16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_eq_i16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_eq_i16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_eq_i16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_eq_i16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff 
+0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_eq_i16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_eq_i16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_eq_i16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_eq_i16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_eq_i16_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_eq_i16_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_eq_i16_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_eq_i16_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -0x7a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_cmp_eq_i16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x32,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7c,0x00,0x32,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmp_eq_i16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x32,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_eq_i32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_eq_i32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_eq_i32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_eq_i32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_eq_i32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_eq_i32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # 
W32: v_cmp_eq_i32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_eq_i32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_eq_i32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_eq_i32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x42,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_eq_i32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_eq_i32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x42,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_eq_i32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_eq_i32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_eq_i32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_eq_i32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_eq_i32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_eq_i32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_eq_i32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_eq_i32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_eq_i32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_eq_i32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_eq_i32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0x68,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_eq_i32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_eq_i32_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_eq_i32_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_eq_i32_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_eq_i32_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -0x7a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_cmp_eq_i32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x42,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7c,0x00,0x42,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmp_eq_i32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x42,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_eq_u16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_eq_u16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_eq_u16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_eq_u16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_eq_u16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_eq_u16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_eq_u16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_eq_u16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_eq_u16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_eq_u16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x3a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_eq_u16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_eq_u16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x3a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_eq_u16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_eq_u16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_eq_u16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_eq_u16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_eq_u16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_eq_u16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_eq_u16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_eq_u16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_eq_u16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_eq_u16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_eq_u16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_eq_u16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_eq_u16_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_eq_u16_e64_dpp vcc, v1, v2 
row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_eq_u16_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_eq_u16_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -0x7a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_cmp_eq_u16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x3a,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7c,0x00,0x3a,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmp_eq_u16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x3a,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_eq_u32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_eq_u32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_eq_u32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_eq_u32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_eq_u32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_eq_u32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_eq_u32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_eq_u32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_eq_u32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_eq_u32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x4a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_eq_u32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_eq_u32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x0a,0x00,0x4a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x4a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_eq_u32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_eq_u32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_eq_u32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_eq_u32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_eq_u32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_eq_u32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_eq_u32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_eq_u32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_eq_u32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_eq_u32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_eq_u32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_eq_u32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_eq_u32_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_eq_u32_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_eq_u32_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_eq_u32_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 
-0x7a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_cmp_eq_u32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x4a,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7c,0x00,0x4a,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmp_eq_u32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x4a,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_ge_f16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_ge_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_ge_f16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_ge_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_ge_f16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_ge_f16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_ge_f16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_ge_f16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_ge_f16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_ge_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x06,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_ge_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_ge_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x06,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_ge_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_ge_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff 
+0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_ge_f16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_ge_f16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_ge_f16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_ge_f16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_ge_f16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_ge_f16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_ge_f16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_ge_f16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_ge_f16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_ge_f16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x01,0x06,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 # W32: v_cmp_ge_f16_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x06,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] # W64: v_cmp_ge_f16_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x06,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -0x6a,0x01,0x06,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +0x7a,0x02,0x06,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 # W32: v_cmp_ge_f16_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x06,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] # W64: v_cmp_ge_f16_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x06,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -0x7a,0x02,0x06,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmp_ge_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x06,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7c,0x83,0x06,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmp_ge_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x06,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: 
v_cmp_ge_f32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_ge_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_ge_f32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_ge_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_ge_f32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_ge_f32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_ge_f32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_ge_f32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_ge_f32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_ge_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x16,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_ge_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_ge_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x16,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_ge_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_ge_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_ge_f32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_ge_f32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_ge_f32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf 
bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_ge_f32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_ge_f32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_ge_f32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_ge_f32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_ge_f32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_ge_f32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_ge_f32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x01,0x16,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 # W32: v_cmp_ge_f32_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x16,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] # W64: v_cmp_ge_f32_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x16,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -0x6a,0x01,0x16,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +0x7a,0x02,0x16,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 # W32: v_cmp_ge_f32_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x16,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] # W64: v_cmp_ge_f32_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x16,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -0x7a,0x02,0x16,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmp_ge_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x16,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7c,0x83,0x16,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmp_ge_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x16,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_ge_i16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_ge_i16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_ge_i16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_ge_i16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_ge_i16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_ge_i16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_ge_i16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_ge_i16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_ge_i16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_ge_i16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x36,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_ge_i16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_ge_i16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x36,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_ge_i16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_ge_i16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_ge_i16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_ge_i16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_ge_i16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_ge_i16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_ge_i16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_ge_i16_e64_dpp 
s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_ge_i16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_ge_i16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_ge_i16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_ge_i16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_ge_i16_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_ge_i16_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_ge_i16_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_ge_i16_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -0x7a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_cmp_ge_i16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x36,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7c,0x00,0x36,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmp_ge_i16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x36,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] -# GFX12: v_cmp_ge_i16_e64_dpp null, v255, 10 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x36,0xd4,0xfa,0x14,0x01,0x00,0xff,0x6f,0x0d,0x30] 0x7c,0x00,0x36,0xd4,0xfa,0x14,0x01,0x00,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmp_ge_i16_e64_dpp null, v255, 10 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x36,0xd4,0xfa,0x14,0x01,0x00,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_ge_i32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_ge_i32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_ge_i32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_ge_i32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_ge_i32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x46,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_ge_i32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x46,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_ge_i32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_ge_i32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_ge_i32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_ge_i32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 
-0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_ge_i32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_ge_i32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_ge_i32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_ge_i32_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_ge_i32_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_ge_i32_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_ge_i32_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -0x7a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_cmp_ge_i32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x46,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7c,0x00,0x46,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmp_ge_i32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x46,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_ge_u16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_ge_u16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_ge_u16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_ge_u16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_ge_u16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_ge_u16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff 
+0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_ge_u16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_ge_u16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_ge_u16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_ge_u16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x3e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_ge_u16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_ge_u16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x3e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_ge_u16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_ge_u16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_ge_u16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_ge_u16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_ge_u16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_ge_u16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_ge_u16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_ge_u16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_ge_u16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_ge_u16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_ge_u16_e64_dpp s104, v1, v2 
row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_ge_u16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_ge_u16_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_ge_u16_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_ge_u16_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_ge_u16_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -0x7a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_cmp_ge_u16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x3e,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7c,0x00,0x3e,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmp_ge_u16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x3e,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_ge_u32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_ge_u32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_ge_u32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_ge_u32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_ge_u32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x4e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_ge_u32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x4e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_ge_u32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_ge_u32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_ge_u32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_ge_u32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_ge_u32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_ge_u32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_ge_u32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_ge_u32_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_ge_u32_e64_dpp vcc, v1, v2 
row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_ge_u32_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_ge_u32_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -0x7a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_cmp_ge_u32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x4e,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7c,0x00,0x4e,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmp_ge_u32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x4e,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_gt_f16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_gt_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_gt_f16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_gt_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_gt_f16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_gt_f16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_gt_f16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_gt_f16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_gt_f16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_gt_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x04,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_gt_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_gt_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x0a,0x00,0x04,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x04,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_gt_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_gt_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_gt_f16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_gt_f16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_gt_f16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_gt_f16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_gt_f16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_gt_f16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_gt_f16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_gt_f16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_gt_f16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_gt_f16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x01,0x04,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 # W32: v_cmp_gt_f16_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x04,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] # W64: v_cmp_gt_f16_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x04,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -0x6a,0x01,0x04,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +0x7a,0x02,0x04,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 # W32: v_cmp_gt_f16_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x04,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] # W64: v_cmp_gt_f16_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x04,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 
-0x7a,0x02,0x04,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmp_gt_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x04,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7c,0x83,0x04,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmp_gt_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x04,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_gt_f32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_gt_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_gt_f32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_gt_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_gt_f32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_gt_f32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_gt_f32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_gt_f32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_gt_f32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_gt_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x14,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_gt_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_gt_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x14,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_gt_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_gt_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 
-0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_gt_f32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_gt_f32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_gt_f32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_gt_f32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_gt_f32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_gt_f32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_gt_f32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_gt_f32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_gt_f32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_gt_f32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x01,0x14,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 # W32: v_cmp_gt_f32_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x14,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] # W64: v_cmp_gt_f32_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x14,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -0x6a,0x01,0x14,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +0x7a,0x02,0x14,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 # W32: v_cmp_gt_f32_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x14,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] # W64: v_cmp_gt_f32_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x14,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -0x7a,0x02,0x14,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmp_gt_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x14,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7c,0x83,0x14,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmp_gt_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x14,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 
+0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_gt_i16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_gt_i16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_gt_i16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_gt_i16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_gt_i16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_gt_i16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_gt_i16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_gt_i16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_gt_i16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_gt_i16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x34,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_gt_i16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_gt_i16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x34,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_gt_i16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_gt_i16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_gt_i16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_gt_i16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # 
W32: v_cmp_gt_i16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_gt_i16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_gt_i16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_gt_i16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_gt_i16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_gt_i16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_gt_i16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_gt_i16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_gt_i16_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_gt_i16_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_gt_i16_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_gt_i16_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -0x7a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_cmp_gt_i16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x34,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7c,0x00,0x34,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmp_gt_i16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x34,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_gt_i32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_gt_i32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_gt_i32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; 
encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_gt_i32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_gt_i32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_gt_i32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_gt_i32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_gt_i32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_gt_i32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_gt_i32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x44,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_gt_i32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_gt_i32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x44,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_gt_i32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_gt_i32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_gt_i32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_gt_i32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_gt_i32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_gt_i32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_gt_i32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: 
v_cmp_gt_i32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_gt_i32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_gt_i32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_gt_i32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_gt_i32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_gt_i32_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_gt_i32_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_gt_i32_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_gt_i32_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -0x7a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_cmp_gt_i32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x44,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7c,0x00,0x44,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmp_gt_i32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x44,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_gt_u16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_gt_u16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_gt_u16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_gt_u16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_gt_u16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_gt_u16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf 
bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_gt_u16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_gt_u16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_gt_u16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_gt_u16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x3c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_gt_u16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_gt_u16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x3c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_gt_u16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_gt_u16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_gt_u16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_gt_u16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_gt_u16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_gt_u16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_gt_u16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_gt_u16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_gt_u16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_gt_u16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_gt_u16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_gt_u16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
# W32: v_cmp_gt_u16_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
# W64: v_cmp_gt_u16_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-0x6a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+0x7a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
# W32: v_cmp_gt_u16_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
# W64: v_cmp_gt_u16_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-0x7a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
-# GFX12: v_cmp_gt_u16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x3c,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0x7c,0x00,0x3c,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmp_gt_u16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x3c,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_gt_u32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_gt_u32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_gt_u32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_gt_u32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_gt_u32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_gt_u32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_gt_u32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_gt_u32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_gt_u32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_gt_u32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x4c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_gt_u32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_gt_u32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x4c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_gt_u32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_gt_u32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_gt_u32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_gt_u32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_gt_u32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_gt_u32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_gt_u32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_gt_u32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_gt_u32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_gt_u32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_gt_u32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_gt_u32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
# W32: v_cmp_gt_u32_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
# W64: v_cmp_gt_u32_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-0x6a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+0x7a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
# W32: v_cmp_gt_u32_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
# W64: v_cmp_gt_u32_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-0x7a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
-# GFX12: v_cmp_gt_u32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x4c,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0x7c,0x00,0x4c,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmp_gt_u32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x4c,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_le_f16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_le_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_le_f16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_le_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_le_f16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_le_f16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_le_f16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_le_f16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_le_f16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_le_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x03,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_le_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_le_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x03,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_le_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_le_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_le_f16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_le_f16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_le_f16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_le_f16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_le_f16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_le_f16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_le_f16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_le_f16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_le_f16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_le_f16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x01,0x03,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
# W32: v_cmp_le_f16_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x03,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
# W64: v_cmp_le_f16_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x03,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-0x6a,0x01,0x03,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+0x7a,0x02,0x03,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
# W32: v_cmp_le_f16_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x03,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
# W64: v_cmp_le_f16_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x03,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-0x7a,0x02,0x03,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX12: v_cmp_le_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x03,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0x7c,0x83,0x03,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmp_le_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x03,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_le_f32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_le_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_le_f32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_le_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_le_f32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_le_f32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_le_f32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_le_f32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_le_f32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_le_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x13,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_le_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_le_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x13,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_le_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_le_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_le_f32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_le_f32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_le_f32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_le_f32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_le_f32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_le_f32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_le_f32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_le_f32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_le_f32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_le_f32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x01,0x13,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
# W32: v_cmp_le_f32_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x13,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
# W64: v_cmp_le_f32_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x13,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-0x6a,0x01,0x13,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+0x7a,0x02,0x13,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
# W32: v_cmp_le_f32_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x13,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
# W64: v_cmp_le_f32_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x13,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-0x7a,0x02,0x13,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX12: v_cmp_le_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x13,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0x7c,0x83,0x13,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmp_le_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x13,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_le_i16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_le_i16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_le_i16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_le_i16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_le_i16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_le_i16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_le_i16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_le_i16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_le_i16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_le_i16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x33,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_le_i16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_le_i16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x33,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_le_i16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_le_i16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_le_i16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_le_i16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_le_i16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_le_i16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_le_i16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_le_i16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_le_i16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_le_i16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_le_i16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_le_i16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
# W32: v_cmp_le_i16_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
# W64: v_cmp_le_i16_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-0x6a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+0x7a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
# W32: v_cmp_le_i16_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
# W64: v_cmp_le_i16_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-0x7a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
-# GFX12: v_cmp_le_i16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x33,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0x7c,0x00,0x33,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmp_le_i16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x33,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_le_i32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_le_i32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_le_i32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_le_i32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_le_i32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_le_i32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_le_i32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_le_i32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_le_i32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_le_i32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x43,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_le_i32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_le_i32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x43,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_le_i32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_le_i32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_le_i32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_le_i32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_le_i32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_le_i32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_le_i32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_le_i32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_le_i32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_le_i32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_le_i32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_le_i32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
# W32: v_cmp_le_i32_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
# W64: v_cmp_le_i32_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-0x6a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+0x7a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
# W32: v_cmp_le_i32_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
# W64: v_cmp_le_i32_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-0x7a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
-# GFX12: v_cmp_le_i32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x43,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0x7c,0x00,0x43,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmp_le_i32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x43,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_le_u16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_le_u16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_le_u16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_le_u16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_le_u16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_le_u16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_le_u16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_le_u16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_le_u16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_le_u16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x3b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_le_u16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_le_u16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x3b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_le_u16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_le_u16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_le_u16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_le_u16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_le_u16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_le_u16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_le_u16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_le_u16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_le_u16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_le_u16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_le_u16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_le_u16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
# W32: v_cmp_le_u16_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
# W64: v_cmp_le_u16_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-0x6a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+0x7a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
# W32: v_cmp_le_u16_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
# W64: v_cmp_le_u16_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-0x7a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
-# GFX12: v_cmp_le_u16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x3b,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0x7c,0x00,0x3b,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmp_le_u16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x3b,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_le_u32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_le_u32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_le_u32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_le_u32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_le_u32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_le_u32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_le_u32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_le_u32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_le_u32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_le_u32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x4b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_le_u32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_le_u32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x4b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_le_u32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_le_u32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_le_u32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_le_u32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_le_u32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_le_u32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_le_u32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_le_u32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_le_u32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_le_u32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_le_u32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_le_u32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
# W32: v_cmp_le_u32_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
# W64: v_cmp_le_u32_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-0x6a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+0x7a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
# W32: v_cmp_le_u32_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
# W64: v_cmp_le_u32_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-0x7a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
-# GFX12: v_cmp_le_u32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x4b,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0x7c,0x00,0x4b,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmp_le_u32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x4b,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_lg_f16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_lg_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_lg_f16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_lg_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_lg_f16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_lg_f16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_lg_f16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_lg_f16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_lg_f16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_lg_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x05,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_lg_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_lg_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x05,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_lg_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_lg_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_lg_f16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_lg_f16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_lg_f16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_lg_f16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_lg_f16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_lg_f16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_lg_f16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_lg_f16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_lg_f16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_lg_f16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x01,0x05,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
# W32: v_cmp_lg_f16_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x05,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
# W64: v_cmp_lg_f16_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x05,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-0x6a,0x01,0x05,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+0x7a,0x02,0x05,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
# W32: v_cmp_lg_f16_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x05,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
# W64: v_cmp_lg_f16_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x05,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-0x7a,0x02,0x05,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX12: v_cmp_lg_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x05,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0x7c,0x83,0x05,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmp_lg_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x05,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_lg_f32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_lg_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_lg_f32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_lg_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_lg_f32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_lg_f32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_lg_f32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_lg_f32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_lg_f32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_lg_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x15,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_lg_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_lg_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x15,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_lg_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_lg_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_lg_f32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_lg_f32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_lg_f32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_lg_f32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_lg_f32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_lg_f32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_lg_f32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_lg_f32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_lg_f32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_lg_f32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x01,0x15,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
# W32: v_cmp_lg_f32_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x15,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
# W64: v_cmp_lg_f32_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x15,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-0x6a,0x01,0x15,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+0x7a,0x02,0x15,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
# W32: v_cmp_lg_f32_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x15,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
# W64: v_cmp_lg_f32_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x15,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-0x7a,0x02,0x15,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX12: v_cmp_lg_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x15,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0x7c,0x83,0x15,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmp_lg_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x15,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_lt_f16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_lt_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_lt_f16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_lt_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_lt_f16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_lt_f16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_lt_f16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_lt_f16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_lt_f16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_lt_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x01,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_lt_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_lt_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x01,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_lt_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_lt_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_lt_f16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_lt_f16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_lt_f16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_lt_f16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_lt_f16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_lt_f16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_lt_f16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_lt_f16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_lt_f16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_lt_f16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x01,0x01,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
# W32: v_cmp_lt_f16_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x01,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
# W64: v_cmp_lt_f16_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x01,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-0x6a,0x01,0x01,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+0x7a,0x02,0x01,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
# W32: v_cmp_lt_f16_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x01,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
# W64: v_cmp_lt_f16_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x01,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-0x7a,0x02,0x01,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX12: v_cmp_lt_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x01,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0x7c,0x83,0x01,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmp_lt_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x01,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_lt_f32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_lt_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_lt_f32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_lt_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_lt_f32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_lt_f32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_lt_f32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_lt_f32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_lt_f32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_lt_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x11,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_lt_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_lt_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x11,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_lt_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_lt_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_lt_f32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_lt_f32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_lt_f32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding:
[0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_lt_f32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_lt_f32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_lt_f32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_lt_f32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_lt_f32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_lt_f32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_lt_f32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x01,0x11,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 # W32: v_cmp_lt_f32_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x11,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] # W64: v_cmp_lt_f32_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x11,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -0x6a,0x01,0x11,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +0x7a,0x02,0x11,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 # W32: v_cmp_lt_f32_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x11,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] # W64: v_cmp_lt_f32_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x11,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -0x7a,0x02,0x11,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmp_lt_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x11,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7c,0x83,0x11,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmp_lt_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x11,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_lt_i16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_lt_i16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_lt_i16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_lt_i16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_lt_i16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_lt_i16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_lt_i16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_lt_i16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_lt_i16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_lt_i16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x31,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_lt_i16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_lt_i16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x31,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_lt_i16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_lt_i16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_lt_i16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_lt_i16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_lt_i16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_lt_i16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_lt_i16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_lt_i16_e64_dpp 
s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_lt_i16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_lt_i16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_lt_i16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_lt_i16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_lt_i16_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_lt_i16_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_lt_i16_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_lt_i16_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -0x7a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_cmp_lt_i16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x31,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7c,0x00,0x31,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmp_lt_i16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x31,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_lt_i32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_lt_i32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_lt_i32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_lt_i32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_lt_i32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_lt_i32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_lt_i32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_lt_i32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_lt_i32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_lt_i32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x41,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_lt_i32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_lt_i32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x41,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_lt_i32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_lt_i32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_lt_i32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_lt_i32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_lt_i32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_lt_i32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_lt_i32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_lt_i32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_lt_i32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_lt_i32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 
-0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_lt_i32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_lt_i32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_lt_i32_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_lt_i32_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_lt_i32_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_lt_i32_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -0x7a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_cmp_lt_i32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x41,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7c,0x00,0x41,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmp_lt_i32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x41,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_lt_u16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_lt_u16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_lt_u16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_lt_u16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_lt_u16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_lt_u16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_lt_u16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_lt_u16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 
-0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_lt_u16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_lt_u16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x39,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_lt_u16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_lt_u16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x39,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_lt_u16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_lt_u16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_lt_u16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_lt_u16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_lt_u16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_lt_u16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_lt_u16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_lt_u16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_lt_u16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_lt_u16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_lt_u16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_lt_u16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff 
+0x6a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_lt_u16_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_lt_u16_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_lt_u16_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_lt_u16_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -0x7a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_cmp_lt_u16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x39,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7c,0x00,0x39,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmp_lt_u16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x39,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_lt_u32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_lt_u32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_lt_u32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_lt_u32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_lt_u32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_lt_u32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_lt_u32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_lt_u32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_lt_u32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_lt_u32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x49,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff # 
W32: v_cmp_lt_u32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_lt_u32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x49,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_lt_u32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_lt_u32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_lt_u32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_lt_u32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_lt_u32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_lt_u32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_lt_u32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_lt_u32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_lt_u32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_lt_u32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_lt_u32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_lt_u32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_lt_u32_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_lt_u32_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_lt_u32_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0x7a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_lt_u32_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -0x7a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_cmp_lt_u32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x49,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7c,0x00,0x49,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmp_lt_u32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x49,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_ne_i16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_ne_i16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_ne_i16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_ne_i16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_ne_i16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_ne_i16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_ne_i16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_ne_i16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_ne_i16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_ne_i16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x35,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_ne_i16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_ne_i16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x35,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_ne_i16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: 
v_cmp_ne_i16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_ne_i16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_ne_i16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_ne_i16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_ne_i16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_ne_i16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_ne_i16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_ne_i16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_ne_i16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_ne_i16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_ne_i16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_ne_i16_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_ne_i16_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_ne_i16_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_ne_i16_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -0x7a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_cmp_ne_i16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x35,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7c,0x00,0x35,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmp_ne_i16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 
bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x35,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_ne_i32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_ne_i32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_ne_i32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_ne_i32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_ne_i32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_ne_i32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_ne_i32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_ne_i32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_ne_i32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_ne_i32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x45,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_ne_i32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_ne_i32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x45,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_ne_i32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_ne_i32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_ne_i32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_ne_i32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 
-0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_ne_i32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_ne_i32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_ne_i32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_ne_i32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_ne_i32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_ne_i32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_ne_i32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_ne_i32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # W32: v_cmp_ne_i32_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # W64: v_cmp_ne_i32_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -0x6a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +0x7a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # W32: v_cmp_ne_i32_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # W64: v_cmp_ne_i32_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -0x7a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_cmp_ne_i32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x45,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7c,0x00,0x45,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmp_ne_i32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x45,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_ne_u16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_ne_u16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff 
+0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_ne_u16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_ne_u16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_ne_u16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_ne_u16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_ne_u16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_ne_u16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_ne_u16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_ne_u16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x3d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_ne_u16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_ne_u16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x3d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_ne_u16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_ne_u16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_ne_u16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_ne_u16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_ne_u16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_ne_u16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: 
v_cmp_ne_u16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_ne_u16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_ne_u16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_ne_u16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_ne_u16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_ne_u16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
# W32: v_cmp_ne_u16_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
# W64: v_cmp_ne_u16_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-0x6a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+0x7a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
# W32: v_cmp_ne_u16_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
# W64: v_cmp_ne_u16_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-0x7a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
-# GFX12: v_cmp_ne_u16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x3d,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0x7c,0x00,0x3d,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmp_ne_u16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x3d,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_ne_u32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_ne_u32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_ne_u32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_ne_u32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_ne_u32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_ne_u32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_ne_u32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_ne_u32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_ne_u32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_ne_u32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x4d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_ne_u32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_ne_u32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x4d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_ne_u32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ne_u32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_ne_u32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_ne_u32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_ne_u32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_ne_u32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_ne_u32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_ne_u32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_ne_u32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_ne_u32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_ne_u32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_ne_u32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
# W32: v_cmp_ne_u32_e64_dpp vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
# W64: v_cmp_ne_u32_e64_dpp vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-0x6a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+0x7a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
# W32: v_cmp_ne_u32_e64_dpp ttmp14, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
# W64: v_cmp_ne_u32_e64_dpp ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-0x7a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
-# GFX12: v_cmp_ne_u32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x4d,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0x7c,0x00,0x4d,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmp_ne_u32_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x4d,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_neq_f16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_neq_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_neq_f16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_neq_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_neq_f16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_neq_f16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_neq_f16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_neq_f16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_neq_f16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_neq_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x0d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_neq_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_neq_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x0d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_neq_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_neq_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_neq_f16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_neq_f16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_neq_f16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_neq_f16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_neq_f16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_neq_f16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_neq_f16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_neq_f16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_neq_f16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_neq_f16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x01,0x0d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
# W32: v_cmp_neq_f16_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x0d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
# W64: v_cmp_neq_f16_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x0d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-0x6a,0x01,0x0d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+0x7a,0x02,0x0d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
# W32: v_cmp_neq_f16_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x0d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
# W64: v_cmp_neq_f16_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x0d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-0x7a,0x02,0x0d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX12: v_cmp_neq_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x0d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0x7c,0x83,0x0d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmp_neq_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x0d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_neq_f32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_neq_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_neq_f32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_neq_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_neq_f32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_neq_f32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_neq_f32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_neq_f32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_neq_f32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_neq_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x1d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_neq_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_neq_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x1d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_neq_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_neq_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_neq_f32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_neq_f32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_neq_f32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_neq_f32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_neq_f32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_neq_f32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_neq_f32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_neq_f32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_neq_f32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_neq_f32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x01,0x1d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
# W32: v_cmp_neq_f32_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x1d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
# W64: v_cmp_neq_f32_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x1d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-0x6a,0x01,0x1d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+0x7a,0x02,0x1d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
# W32: v_cmp_neq_f32_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x1d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
# W64: v_cmp_neq_f32_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x1d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-0x7a,0x02,0x1d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX12: v_cmp_neq_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x1d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0x7c,0x83,0x1d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmp_neq_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x1d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_nge_f16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_nge_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_nge_f16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_nge_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_nge_f16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_nge_f16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_nge_f16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_nge_f16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_nge_f16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_nge_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x09,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_nge_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_nge_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x09,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_nge_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_nge_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_nge_f16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_nge_f16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_nge_f16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_nge_f16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_nge_f16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_nge_f16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_nge_f16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_nge_f16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_nge_f16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_nge_f16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x01,0x09,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
# W32: v_cmp_nge_f16_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x09,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
# W64: v_cmp_nge_f16_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x09,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-0x6a,0x01,0x09,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+0x7a,0x02,0x09,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
# W32: v_cmp_nge_f16_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x09,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
# W64: v_cmp_nge_f16_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x09,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-0x7a,0x02,0x09,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX12: v_cmp_nge_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x09,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0x7c,0x83,0x09,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmp_nge_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x09,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_nge_f32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_nge_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_nge_f32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_nge_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_nge_f32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_nge_f32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_nge_f32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_nge_f32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_nge_f32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_nge_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x19,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_nge_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_nge_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x19,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_nge_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_nge_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_nge_f32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_nge_f32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_nge_f32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_nge_f32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_nge_f32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_nge_f32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_nge_f32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_nge_f32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_nge_f32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_nge_f32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x01,0x19,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
# W32: v_cmp_nge_f32_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x19,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
# W64: v_cmp_nge_f32_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x19,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-0x6a,0x01,0x19,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+0x7a,0x02,0x19,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
# W32: v_cmp_nge_f32_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x19,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
# W64: v_cmp_nge_f32_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x19,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-0x7a,0x02,0x19,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX12: v_cmp_nge_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x19,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0x7c,0x83,0x19,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmp_nge_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x19,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_ngt_f16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_ngt_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_ngt_f16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_ngt_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_ngt_f16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_ngt_f16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_ngt_f16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_ngt_f16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_ngt_f16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_ngt_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x0b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_ngt_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_ngt_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x0b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_ngt_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ngt_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_ngt_f16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_ngt_f16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_ngt_f16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_ngt_f16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_ngt_f16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_ngt_f16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_ngt_f16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_ngt_f16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_ngt_f16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_ngt_f16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x01,0x0b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
# W32: v_cmp_ngt_f16_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x0b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
# W64: v_cmp_ngt_f16_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x0b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-0x6a,0x01,0x0b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+0x7a,0x02,0x0b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
# W32: v_cmp_ngt_f16_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x0b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
# W64: v_cmp_ngt_f16_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x0b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-0x7a,0x02,0x0b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX12: v_cmp_ngt_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x0b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0x7c,0x83,0x0b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmp_ngt_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x0b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_ngt_f32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_ngt_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_ngt_f32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_ngt_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_ngt_f32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_ngt_f32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_ngt_f32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_ngt_f32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_ngt_f32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_ngt_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x1b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_ngt_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_ngt_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x1b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_ngt_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ngt_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_ngt_f32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_ngt_f32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_ngt_f32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_ngt_f32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_ngt_f32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_ngt_f32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_ngt_f32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_ngt_f32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_ngt_f32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_ngt_f32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x01,0x1b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
# W32: v_cmp_ngt_f32_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x1b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
# W64: v_cmp_ngt_f32_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x1b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-0x6a,0x01,0x1b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+0x7a,0x02,0x1b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
# W32: v_cmp_ngt_f32_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x1b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
# W64: v_cmp_ngt_f32_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x1b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-0x7a,0x02,0x1b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX12: v_cmp_ngt_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x1b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0x7c,0x83,0x1b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmp_ngt_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x1b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_nle_f16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_nle_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_nle_f16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_nle_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_nle_f16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_nle_f16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_nle_f16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_nle_f16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_nle_f16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_nle_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x0c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_nle_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_nle_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x0c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_nle_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_nle_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_nle_f16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_nle_f16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_nle_f16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_nle_f16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_nle_f16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_nle_f16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_nle_f16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_nle_f16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_nle_f16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_nle_f16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x01,0x0c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
# W32: v_cmp_nle_f16_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x0c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
# W64: v_cmp_nle_f16_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x0c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-0x6a,0x01,0x0c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+0x7a,0x02,0x0c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
# W32: v_cmp_nle_f16_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x0c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
# W64: v_cmp_nle_f16_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x0c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-0x7a,0x02,0x0c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX12: v_cmp_nle_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x0c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0x7c,0x83,0x0c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmp_nle_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x0c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_nle_f32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_nle_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_nle_f32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_nle_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_nle_f32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_nle_f32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_nle_f32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_nle_f32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_nle_f32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_nle_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x1c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_nle_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_nle_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x1c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_nle_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_nle_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_nle_f32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_nle_f32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_nle_f32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_nle_f32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_nle_f32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_nle_f32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_nle_f32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_nle_f32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_nle_f32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_nle_f32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x01,0x1c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
# W32: v_cmp_nle_f32_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x1c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
# W64: v_cmp_nle_f32_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x1c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-0x6a,0x01,0x1c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+0x7a,0x02,0x1c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
# W32: v_cmp_nle_f32_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x1c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
# W64: v_cmp_nle_f32_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x1c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-0x7a,0x02,0x1c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX12: v_cmp_nle_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x1c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0x7c,0x83,0x1c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmp_nle_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x1c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_nlg_f16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_nlg_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_nlg_f16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_nlg_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_nlg_f16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_nlg_f16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_nlg_f16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_nlg_f16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_nlg_f16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_nlg_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x0a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_nlg_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_nlg_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x0a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_nlg_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_nlg_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_nlg_f16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_nlg_f16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_nlg_f16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_nlg_f16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_nlg_f16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_nlg_f16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_nlg_f16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_nlg_f16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_nlg_f16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_nlg_f16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x01,0x0a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
# W32: v_cmp_nlg_f16_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x0a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
# W64: v_cmp_nlg_f16_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x0a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-0x6a,0x01,0x0a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+0x7a,0x02,0x0a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
# W32: v_cmp_nlg_f16_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x0a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
# W64: v_cmp_nlg_f16_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x0a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-0x7a,0x02,0x0a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX12: v_cmp_nlg_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x0a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0x7c,0x83,0x0a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmp_nlg_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x0a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# W32: v_cmp_nlg_f32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_nlg_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
# W32: v_cmp_nlg_f32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cmp_nlg_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
# W32: v_cmp_nlg_f32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
# W64: v_cmp_nlg_f32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
# W32: v_cmp_nlg_f32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
# W64: v_cmp_nlg_f32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_nlg_f32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_nlg_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x1a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
# W32: v_cmp_nlg_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
# W64: v_cmp_nlg_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
-0x0a,0x00,0x1a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
# W32: v_cmp_nlg_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_nlg_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
# W32: v_cmp_nlg_f32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
# W64: v_cmp_nlg_f32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# W32: v_cmp_nlg_f32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
# W64: v_cmp_nlg_f32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
# W32: v_cmp_nlg_f32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
# W64: v_cmp_nlg_f32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
# W32: v_cmp_nlg_f32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
# W64: v_cmp_nlg_f32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+0x68,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
# W32: v_cmp_nlg_f32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
# W64: v_cmp_nlg_f32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-0x68,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+0x6a,0x01,0x1a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
# W32: v_cmp_nlg_f32_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x1a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
# W64: v_cmp_nlg_f32_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x1a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-0x6a,0x01,0x1a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+0x7a,0x02,0x1a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
# W32: v_cmp_nlg_f32_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x1a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
# W64: v_cmp_nlg_f32_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x1a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-0x7a,0x02,0x1a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmp_nlg_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x1a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7c,0x83,0x1a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmp_nlg_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x1a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_nlt_f16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_nlt_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_nlt_f16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_nlt_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_nlt_f16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_nlt_f16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_nlt_f16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_nlt_f16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_nlt_f16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_nlt_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x0e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_nlt_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_nlt_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x0e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_nlt_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_nlt_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 
-0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_nlt_f16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_nlt_f16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_nlt_f16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_nlt_f16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_nlt_f16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_nlt_f16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_nlt_f16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_nlt_f16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_nlt_f16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_nlt_f16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x01,0x0e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 # W32: v_cmp_nlt_f16_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x0e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] # W64: v_cmp_nlt_f16_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x0e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -0x6a,0x01,0x0e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +0x7a,0x02,0x0e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 # W32: v_cmp_nlt_f16_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x0e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] # W64: v_cmp_nlt_f16_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x0e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -0x7a,0x02,0x0e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmp_nlt_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x0e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7c,0x83,0x0e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmp_nlt_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0x7c,0x83,0x0e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_nlt_f32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_nlt_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_nlt_f32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_nlt_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_nlt_f32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_nlt_f32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_nlt_f32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_nlt_f32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_nlt_f32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_nlt_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x1e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_nlt_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_nlt_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x1e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_nlt_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_nlt_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_nlt_f32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_nlt_f32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 
-0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_nlt_f32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_nlt_f32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_nlt_f32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_nlt_f32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_nlt_f32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_nlt_f32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_nlt_f32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_nlt_f32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x01,0x1e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 # W32: v_cmp_nlt_f32_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x1e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] # W64: v_cmp_nlt_f32_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x1e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -0x6a,0x01,0x1e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +0x7a,0x02,0x1e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 # W32: v_cmp_nlt_f32_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x1e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] # W64: v_cmp_nlt_f32_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x1e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -0x7a,0x02,0x1e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmp_nlt_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x1e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7c,0x83,0x1e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmp_nlt_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x1e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_o_f16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_o_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 
-0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_o_f16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_o_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_o_f16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_o_f16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_o_f16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_o_f16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_o_f16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_o_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x07,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_o_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_o_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x07,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_o_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_o_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_o_f16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_o_f16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_o_f16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_o_f16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff 
+0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_o_f16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_o_f16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_o_f16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_o_f16_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_o_f16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_o_f16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x01,0x07,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 # W32: v_cmp_o_f16_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x07,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] # W64: v_cmp_o_f16_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x07,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -0x6a,0x01,0x07,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +0x7a,0x02,0x07,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 # W32: v_cmp_o_f16_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x07,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] # W64: v_cmp_o_f16_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x07,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -0x7a,0x02,0x07,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmp_o_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x07,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7c,0x83,0x07,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmp_o_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x07,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_o_f32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_o_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_o_f32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_o_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff 
+0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_o_f32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_o_f32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_o_f32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_o_f32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_o_f32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_o_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x17,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_o_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_o_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x17,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_o_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_o_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_o_f32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_o_f32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_o_f32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_o_f32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_o_f32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_o_f32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_o_f32_e64_dpp s10, v1, v2 row_ror:15 
row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_o_f32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_o_f32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_o_f32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x68,0x00,0x17,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_o_f32_e64_dpp s104, v1, 2.0 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x17,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_o_f32_e64_dpp s[104:105], v1, 2.0 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x17,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x17,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x50,0x01,0xff +0x6a,0x01,0x17,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 # W32: v_cmp_o_f32_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x17,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] # W64: v_cmp_o_f32_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x17,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -0x6a,0x01,0x17,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +0x7a,0x02,0x17,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 # W32: v_cmp_o_f32_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x17,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] # W64: v_cmp_o_f32_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x17,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -0x7a,0x02,0x17,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmp_o_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x17,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7c,0x83,0x17,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmp_o_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x17,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_u_f16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_u_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_u_f16_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_u_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_u_f16_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_u_f16_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_u_f16_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_u_f16_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_u_f16_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_u_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x08,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_u_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_u_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x08,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_u_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_u_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_u_f16_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_u_f16_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_u_f16_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_u_f16_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_u_f16_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_u_f16_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_u_f16_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_u_f16_e64_dpp s[10:11], v1, v2 row_ror:15 
row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_u_f16_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_u_f16_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -0x68,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x01,0x08,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 # W32: v_cmp_u_f16_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x08,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] # W64: v_cmp_u_f16_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x08,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -0x6a,0x01,0x08,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +0x7a,0x02,0x08,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 # W32: v_cmp_u_f16_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x08,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] # W64: v_cmp_u_f16_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x08,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -0x7a,0x02,0x08,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmp_u_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x08,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7c,0x83,0x08,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmp_u_f16_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x08,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # W32: v_cmp_u_f32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_u_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff # W32: v_cmp_u_f32_e64_dpp s10, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] # W64: v_cmp_u_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff # W32: v_cmp_u_f32_e64_dpp s10, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] # W64: v_cmp_u_f32_e64_dpp s[10:11], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff # W32: v_cmp_u_f32_e64_dpp s10, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] # W64: v_cmp_u_f32_e64_dpp s[10:11], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; 
encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_u_f32_e64_dpp s10, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_u_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x18,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff # W32: v_cmp_u_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] # W64: v_cmp_u_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff] -0x0a,0x00,0x18,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff +0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff # W32: v_cmp_u_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] # W64: v_cmp_u_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff # W32: v_cmp_u_f32_e64_dpp s10, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] # W64: v_cmp_u_f32_e64_dpp s[10:11], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff # W32: v_cmp_u_f32_e64_dpp s10, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] # W64: v_cmp_u_f32_e64_dpp s[10:11], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff # W32: v_cmp_u_f32_e64_dpp s10, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] # W64: v_cmp_u_f32_e64_dpp s[10:11], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff # W32: v_cmp_u_f32_e64_dpp s10, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] # W64: v_cmp_u_f32_e64_dpp s[10:11], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +0x68,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff # W32: v_cmp_u_f32_e64_dpp s104, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] # W64: v_cmp_u_f32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 
-0x68,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +0x6a,0x01,0x18,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 # W32: v_cmp_u_f32_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x18,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] # W64: v_cmp_u_f32_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x18,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -0x6a,0x01,0x18,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +0x7a,0x02,0x18,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 # W32: v_cmp_u_f32_e64_dpp ttmp14, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x18,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] # W64: v_cmp_u_f32_e64_dpp ttmp[14:15], -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7a,0x02,0x18,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -0x7a,0x02,0x18,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmp_u_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x18,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7c,0x83,0x18,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmp_u_f32_e64_dpp null, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x83,0x18,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp8.txt index 1d2169efa5ac0..cb9f8f4799743 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp8.txt @@ -1,1260 +1,1263 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W32 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s +# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W32 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W32 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s +0x0a,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_class_f16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_class_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x0a,0x00,0x7d,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_class_f16_e64_dpp s10, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x7d,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_class_f16_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x7d,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x7d,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05 +0x0a,0x00,0x7d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 # W32: 
v_cmp_class_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x7d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_class_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x7d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x7d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_class_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_class_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_class_f16_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_class_f16_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x6a,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x7a,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_class_f16_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_class_f16_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x7a,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmp_class_f16_e64_dpp null, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x01,0x7d,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] 0x7c,0x01,0x7d,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00 +# GFX12: v_cmp_class_f16_e64_dpp null, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x01,0x7d,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] +0x0a,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_class_f32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_class_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x0a,0x00,0x7e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_class_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x7e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_class_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x7e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x7e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_class_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_class_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_class_f32_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_class_f32_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 
-0x6a,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x7a,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_class_f32_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_class_f32_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x7a,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_cmp_class_f32_e64_dpp null, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x01,0x7e,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]
0x7c,0x01,0x7e,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00
+# GFX12: v_cmp_class_f32_e64_dpp null, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x01,0x7e,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x02,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_f16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x02,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x02,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x02,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x0a,0x00,0x02,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x02,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x02,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x02,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x02,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x02,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x02,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x02,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x02,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_f16_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x02,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_f16_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x02,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x02,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x02,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_f16_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x02,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_f16_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x02,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x02,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX12: v_cmp_eq_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x02,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
0x7c,0x83,0x02,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX12: v_cmp_eq_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x02,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x12,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_f32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x12,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x12,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x12,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x0a,0x00,0x12,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x12,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x12,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x12,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x12,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x12,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x12,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x12,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x12,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_f32_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x12,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_f32_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x12,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x12,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x12,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_f32_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x12,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_f32_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x12,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x12,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX12: v_cmp_eq_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x12,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
0x7c,0x83,0x12,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX12: v_cmp_eq_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x12,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_i16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x0a,0x00,0x32,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_i16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x32,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_i16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x32,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x32,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_i16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_i16_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_i16_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x6a,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x7a,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_i16_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_i16_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x7a,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_cmp_eq_i16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x32,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7c,0x00,0x32,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX12: v_cmp_eq_i16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x32,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_i32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x0a,0x00,0x42,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_i32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x42,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_i32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x42,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x42,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_i32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_i32_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_i32_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x6a,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x7a,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_i32_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_i32_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x7a,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_cmp_eq_i32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x42,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7c,0x00,0x42,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX12: v_cmp_eq_i32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x42,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_u16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x0a,0x00,0x3a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_u16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_u16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x3a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_u16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_u16_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_u16_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x6a,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x7a,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_u16_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_u16_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x7a,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_cmp_eq_u16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x3a,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7c,0x00,0x3a,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX12: v_cmp_eq_u16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x3a,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_u32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x0a,0x00,0x4a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_u32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_u32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x4a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_u32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_u32_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_u32_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x6a,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x7a,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_eq_u32_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_u32_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x7a,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_cmp_eq_u32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x4a,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7c,0x00,0x4a,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX12: v_cmp_eq_u32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x4a,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x06,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_ge_f16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x06,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x06,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x06,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x0a,0x00,0x06,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_ge_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x06,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x06,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x06,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x06,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_ge_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x06,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x06,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x06,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x06,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
# W32: v_cmp_ge_f16_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x06,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_f16_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x06,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x06,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x06,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
# W32: v_cmp_ge_f16_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x06,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_f16_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x06,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x06,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX12: v_cmp_ge_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x06,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
0x7c,0x83,0x06,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX12: v_cmp_ge_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x06,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x16,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_ge_f32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x16,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x16,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x16,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x0a,0x00,0x16,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_ge_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x16,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x16,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x16,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x16,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_ge_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x16,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x16,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x16,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x16,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
# W32: v_cmp_ge_f32_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x16,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_f32_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x16,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x16,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x6a,0x01,0x16,0xd4,0xe9,0xea,0x01,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_ge_f32_e64_dpp vcc_lo, |v1|, -2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x16,0xd4,0xe9,0xea,0x01,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_f32_e64_dpp vcc, |v1|, -2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x16,0xd4,0xe9,0xea,0x01,0x00,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x16,0xd4,0xe9,0xea,0x01,0x00,0x01,0x77,0x39,0x05
+0x7a,0x02,0x16,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
# W32: v_cmp_ge_f32_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x16,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_f32_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x16,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x16,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX12: v_cmp_ge_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x16,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
0x7c,0x83,0x16,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX12: v_cmp_ge_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x16,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_ge_i16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x0a,0x00,0x36,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_ge_i16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x36,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_i16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x36,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x36,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_ge_i16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_ge_i16_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_i16_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x6a,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x7a,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_ge_i16_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_i16_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x7a,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_cmp_ge_i16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x36,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7c,0x00,0x36,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX12: v_cmp_ge_i16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x36,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_ge_i32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x0a,0x00,0x46,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_ge_i32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x46,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x46,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x46,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_ge_i32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_ge_i32_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_i32_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x6a,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x7a,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_ge_i32_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_i32_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x7a,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_cmp_ge_i32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x46,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7c,0x00,0x46,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX12: v_cmp_ge_i32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x46,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_ge_u16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x0a,0x00,0x3e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_ge_u16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_u16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x3e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_ge_u16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_ge_u16_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_u16_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x6a,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x7a,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_ge_u16_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_u16_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x7a,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_cmp_ge_u16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x3e,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7c,0x00,0x3e,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX12: v_cmp_ge_u16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x3e,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_ge_u32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x0a,0x00,0x4e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_ge_u32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x4e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+0x0a,0x00,0x4e,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_ge_u32_e64_dpp s10, v1, 10 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4e,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4e,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x4e,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_ge_u32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_ge_u32_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_u32_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x6a,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x7a,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_ge_u32_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_u32_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x7a,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_cmp_ge_u32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x4e,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7c,0x00,0x4e,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX12: v_cmp_ge_u32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x4e,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x04,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_gt_f16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x04,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x04,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x04,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x0a,0x00,0x04,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_gt_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x04,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x04,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x04,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x04,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_gt_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x04,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x04,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x04,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x04,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
# W32: v_cmp_gt_f16_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x04,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_f16_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x04,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x04,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x04,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
# W32: v_cmp_gt_f16_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x04,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_f16_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x04,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x04,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX12: v_cmp_gt_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x04,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
0x7c,0x83,0x04,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX12: v_cmp_gt_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x04,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x14,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_gt_f32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x14,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x14,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x14,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x0a,0x00,0x14,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_gt_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x14,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x14,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x14,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x14,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_gt_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x14,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x14,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x14,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x14,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
# W32: v_cmp_gt_f32_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x14,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_f32_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x14,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x14,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x14,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
# W32: v_cmp_gt_f32_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x14,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_f32_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x14,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x14,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX12: v_cmp_gt_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x14,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
0x7c,0x83,0x14,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX12: v_cmp_gt_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x14,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_gt_i16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x0a,0x00,0x34,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_gt_i16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x34,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_i16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x34,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x34,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_gt_i16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_gt_i16_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_i16_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x6a,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x7a,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_gt_i16_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_i16_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x7a,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_cmp_gt_i16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x34,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7c,0x00,0x34,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX12: v_cmp_gt_i16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x34,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_gt_i32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x0a,0x00,0x44,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_gt_i32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x44,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_i32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x44,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x44,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_gt_i32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_gt_i32_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_i32_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x6a,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x7a,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_gt_i32_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_i32_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x7a,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_cmp_gt_i32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x44,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7c,0x00,0x44,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX12: v_cmp_gt_i32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x44,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_gt_u16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x0a,0x00,0x3c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_gt_u16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_u16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x3c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_gt_u16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_gt_u16_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_u16_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x6a,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x7a,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_gt_u16_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_u16_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x7a,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_cmp_gt_u16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x3c,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7c,0x00,0x3c,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX12: v_cmp_gt_u16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x3c,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_gt_u32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x0a,0x00,0x4c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_gt_u32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_u32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x4c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_gt_u32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_gt_u32_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_u32_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x6a,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x7a,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_gt_u32_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_u32_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x7a,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_cmp_gt_u32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x4c,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7c,0x00,0x4c,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX12: v_cmp_gt_u32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x4c,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x03,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_le_f16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x03,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x03,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x03,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x0a,0x00,0x03,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_le_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x03,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x03,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x03,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x03,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_le_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x03,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x03,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x03,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x03,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
# W32: v_cmp_le_f16_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x03,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_f16_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x03,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x03,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x03,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
# W32: v_cmp_le_f16_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x03,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_f16_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x03,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x03,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX12: v_cmp_le_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x03,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
0x7c,0x83,0x03,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX12: v_cmp_le_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x03,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x13,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_le_f32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x13,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x13,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x13,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x0a,0x00,0x13,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_le_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x13,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x13,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x13,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x13,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_le_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x13,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x13,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x13,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x13,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
# W32: v_cmp_le_f32_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x13,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_f32_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x13,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x13,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x13,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
# W32: v_cmp_le_f32_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x13,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_f32_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x13,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x13,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX12: v_cmp_le_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x13,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
0x7c,0x83,0x13,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX12: v_cmp_le_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x13,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_le_i16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x0a,0x00,0x33,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_le_i16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x33,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_i16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x33,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x33,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_le_i16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_le_i16_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_i16_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x6a,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x7a,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_le_i16_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_i16_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x7a,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_cmp_le_i16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x33,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7c,0x00,0x33,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX12: v_cmp_le_i16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x33,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_le_i32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x0a,0x00,0x43,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_le_i32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x43,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_i32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x43,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x43,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_le_i32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_le_i32_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_i32_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x6a,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x7a,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_le_i32_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_i32_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x7a,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_cmp_le_i32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x43,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7c,0x00,0x43,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX12: v_cmp_le_i32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x43,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_le_u16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x0a,0x00,0x3b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_le_u16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_u16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x3b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_le_u16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_le_u16_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_u16_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x6a,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x7a,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_le_u16_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_u16_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x7a,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_cmp_le_u16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x3b,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7c,0x00,0x3b,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX12: v_cmp_le_u16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x3b,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_le_u32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x0a,0x00,0x4b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_le_u32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_u32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x4b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_le_u32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_le_u32_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_u32_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x6a,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x7a,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_le_u32_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_u32_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x7a,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_cmp_le_u32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x4b,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7c,0x00,0x4b,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX12: v_cmp_le_u32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x4b,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x05,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_lg_f16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x05,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lg_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x05,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x05,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x0a,0x00,0x05,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_lg_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x05,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lg_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x05,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x05,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x05,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_lg_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x05,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lg_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x05,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x05,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x05,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
# W32: v_cmp_lg_f16_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x05,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
# W64: v_cmp_lg_f16_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x05,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x05,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x05,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
# W32: v_cmp_lg_f16_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x05,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
# W64: v_cmp_lg_f16_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x05,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x05,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX12: v_cmp_lg_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x05,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
0x7c,0x83,0x05,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX12: v_cmp_lg_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x05,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x15,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_lg_f32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x15,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lg_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x15,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x15,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x0a,0x00,0x15,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_lg_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x15,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lg_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x15,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x15,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x15,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_lg_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x15,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lg_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x15,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x15,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x15,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
# W32: v_cmp_lg_f32_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x15,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
# W64: v_cmp_lg_f32_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x15,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x15,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x15,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
# W32: v_cmp_lg_f32_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x15,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
# W64: v_cmp_lg_f32_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x15,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x15,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX12: v_cmp_lg_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x15,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
0x7c,0x83,0x15,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX12: v_cmp_lg_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x15,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x01,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_lt_f16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x01,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x01,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x01,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x0a,0x00,0x01,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_lt_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x01,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x01,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x01,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x01,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_lt_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x01,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x01,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x01,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x01,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
# W32: v_cmp_lt_f16_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x01,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_f16_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x01,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x01,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x01,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
# W32: v_cmp_lt_f16_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x01,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_f16_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x01,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x01,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX12: v_cmp_lt_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x01,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
0x7c,0x83,0x01,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX12: v_cmp_lt_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x01,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x11,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_lt_f32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x11,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x11,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x11,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x0a,0x00,0x11,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_lt_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x11,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x11,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x11,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x11,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_lt_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x11,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x11,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x11,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x11,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
# W32: v_cmp_lt_f32_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x11,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_f32_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x11,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x11,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x11,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
# W32: v_cmp_lt_f32_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x11,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_f32_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x11,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x11,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX12: v_cmp_lt_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x11,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
0x7c,0x83,0x11,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX12: v_cmp_lt_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x11,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_lt_i16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x0a,0x00,0x31,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_lt_i16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x31,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_i16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x31,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x31,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_lt_i16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_lt_i16_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_i16_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x6a,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x7a,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_lt_i16_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_i16_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x7a,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_cmp_lt_i16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x31,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7c,0x00,0x31,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX12: v_cmp_lt_i16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x31,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_lt_i32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x0a,0x00,0x41,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_lt_i32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x41,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_i32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x41,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x41,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_lt_i32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_lt_i32_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_i32_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x6a,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x7a,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_lt_i32_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_i32_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x7a,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_cmp_lt_i32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x41,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7c,0x00,0x41,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX12: v_cmp_lt_i32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x41,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_lt_u16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x0a,0x00,0x39,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_lt_u16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x39,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_u16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x39,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x39,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_lt_u16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding:
[0x68,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_lt_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_lt_u16_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_lt_u16_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x6a,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x7a,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_lt_u16_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_lt_u16_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x7a,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmp_lt_u16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x39,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7c,0x00,0x39,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX12: v_cmp_lt_u16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x39,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0x0a,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_lt_u32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_lt_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x0a,0x00,0x49,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_lt_u32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x49,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_lt_u32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x49,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x49,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_lt_u32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_lt_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_lt_u32_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_lt_u32_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x6a,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x7a,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_lt_u32_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_lt_u32_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x7a,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmp_lt_u32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0x7c,0x00,0x49,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7c,0x00,0x49,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX12: v_cmp_lt_u32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x49,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0x0a,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ne_i16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ne_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x0a,0x00,0x35,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ne_i16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x35,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ne_i16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x35,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x35,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ne_i16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ne_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ne_i16_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ne_i16_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x6a,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x7a,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ne_i16_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ne_i16_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x7a,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmp_ne_i16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x35,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7c,0x00,0x35,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX12: v_cmp_ne_i16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x35,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0x0a,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ne_i32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ne_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x0a,0x00,0x45,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ne_i32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x45,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ne_i32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x45,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x45,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ne_i32_e64_dpp s104, v1, v2 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ne_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ne_i32_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ne_i32_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x6a,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x7a,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ne_i32_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ne_i32_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x7a,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmp_ne_i32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x45,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7c,0x00,0x45,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX12: v_cmp_ne_i32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x45,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0x0a,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ne_u16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ne_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x0a,0x00,0x3d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ne_u16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ne_u16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x3d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ne_u16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ne_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ne_u16_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ne_u16_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x6a,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x7a,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ne_u16_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ne_u16_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x7a,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmp_ne_u16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] 
fi:1 ; encoding: [0x7c,0x00,0x3d,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7c,0x00,0x3d,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX12: v_cmp_ne_u16_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x3d,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0x0a,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ne_u32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ne_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x0a,0x00,0x4d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ne_u32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ne_u32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x4d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ne_u32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ne_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ne_u32_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ne_u32_e64_dpp vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x6a,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x7a,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ne_u32_e64_dpp ttmp14, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ne_u32_e64_dpp ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x7a,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmp_ne_u32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x4d,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7c,0x00,0x4d,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX12: v_cmp_ne_u32_e64_dpp null, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x00,0x4d,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0x0a,0x00,0x0d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_neq_f16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_neq_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x0d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x0a,0x00,0x0d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_neq_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_neq_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x0d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x0d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: 
v_cmp_neq_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_neq_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x0d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x01,0x0d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 # W32: v_cmp_neq_f16_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x0d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] # W64: v_cmp_neq_f16_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x0d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -0x6a,0x01,0x0d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +0x7a,0x02,0x0d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 # W32: v_cmp_neq_f16_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x0d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] # W64: v_cmp_neq_f16_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x0d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -0x7a,0x02,0x0d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmp_neq_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x0d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7c,0x83,0x0d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12: v_cmp_neq_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x0d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +0x0a,0x00,0x1d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_neq_f32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_neq_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x1d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x0a,0x00,0x1d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_neq_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_neq_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x1d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x1d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_neq_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_neq_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x1d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x01,0x1d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 # W32: v_cmp_neq_f32_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x1d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] # W64: v_cmp_neq_f32_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x1d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -0x6a,0x01,0x1d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +0x7a,0x02,0x1d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 # W32: v_cmp_neq_f32_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x1d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] # W64: v_cmp_neq_f32_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x1d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 
-0x7a,0x02,0x1d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmp_neq_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x1d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7c,0x83,0x1d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12: v_cmp_neq_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x1d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +0x0a,0x00,0x09,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_nge_f16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x09,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_nge_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x09,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x09,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x0a,0x00,0x09,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_nge_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x09,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_nge_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x09,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x09,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x09,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_nge_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x09,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_nge_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x09,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x09,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x01,0x09,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 # W32: v_cmp_nge_f16_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x09,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] # W64: v_cmp_nge_f16_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x09,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -0x6a,0x01,0x09,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +0x7a,0x02,0x09,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 # W32: v_cmp_nge_f16_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x09,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] # W64: v_cmp_nge_f16_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x09,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -0x7a,0x02,0x09,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmp_nge_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x09,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7c,0x83,0x09,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12: v_cmp_nge_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x09,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +0x0a,0x00,0x19,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_nge_f32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x19,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_nge_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x19,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x19,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x0a,0x00,0x19,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_nge_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x19,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_nge_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x0a,0x00,0x19,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x19,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x19,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_nge_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x19,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_nge_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x19,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x19,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x01,0x19,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 # W32: v_cmp_nge_f32_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x19,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] # W64: v_cmp_nge_f32_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x19,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -0x6a,0x01,0x19,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +0x7a,0x02,0x19,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 # W32: v_cmp_nge_f32_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x19,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] # W64: v_cmp_nge_f32_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x19,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -0x7a,0x02,0x19,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmp_nge_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x19,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7c,0x83,0x19,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12: v_cmp_nge_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x19,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +0x0a,0x00,0x0b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ngt_f16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ngt_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x0b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x0a,0x00,0x0b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ngt_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ngt_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x0b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x0b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ngt_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ngt_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x0b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x01,0x0b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 # W32: v_cmp_ngt_f16_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x0b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] # W64: v_cmp_ngt_f16_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x0b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -0x6a,0x01,0x0b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +0x7a,0x02,0x0b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 # W32: v_cmp_ngt_f16_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7a,0x02,0x0b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] # W64: v_cmp_ngt_f16_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x0b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -0x7a,0x02,0x0b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmp_ngt_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x0b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7c,0x83,0x0b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12: v_cmp_ngt_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x0b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +0x0a,0x00,0x1b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ngt_f32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ngt_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x1b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x0a,0x00,0x1b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ngt_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ngt_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x1b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x1b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_ngt_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_ngt_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x1b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x01,0x1b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 # W32: v_cmp_ngt_f32_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x1b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] # W64: v_cmp_ngt_f32_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x1b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -0x6a,0x01,0x1b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +0x7a,0x02,0x1b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 # W32: v_cmp_ngt_f32_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x1b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] # W64: v_cmp_ngt_f32_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x1b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -0x7a,0x02,0x1b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmp_ngt_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x1b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7c,0x83,0x1b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12: v_cmp_ngt_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x1b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +0x0a,0x00,0x0c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_nle_f16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_nle_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x0c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x0a,0x00,0x0c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 # W32: 
v_cmp_nle_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_nle_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x0c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x0c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_nle_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_nle_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x0c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x01,0x0c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 # W32: v_cmp_nle_f16_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x0c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] # W64: v_cmp_nle_f16_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x0c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -0x6a,0x01,0x0c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +0x7a,0x02,0x0c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 # W32: v_cmp_nle_f16_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x0c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] # W64: v_cmp_nle_f16_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x0c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -0x7a,0x02,0x0c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmp_nle_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x0c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7c,0x83,0x0c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12: v_cmp_nle_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x0c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +0x0a,0x00,0x1c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_nle_f32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_nle_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x1c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x0a,0x00,0x1c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_nle_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_nle_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x1c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x1c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_nle_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_nle_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x1c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x01,0x1c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 # W32: v_cmp_nle_f32_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x1c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] # W64: v_cmp_nle_f32_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x1c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 
-0x6a,0x01,0x1c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +0x7a,0x02,0x1c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 # W32: v_cmp_nle_f32_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x1c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] # W64: v_cmp_nle_f32_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x1c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -0x7a,0x02,0x1c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmp_nle_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x1c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7c,0x83,0x1c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12: v_cmp_nle_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x1c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +0x0a,0x00,0x0a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_nlg_f16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_nlg_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x0a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x0a,0x00,0x0a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_nlg_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_nlg_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x0a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x0a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_nlg_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_nlg_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x0a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x01,0x0a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 # W32: v_cmp_nlg_f16_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x0a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] # W64: v_cmp_nlg_f16_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x0a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -0x6a,0x01,0x0a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +0x7a,0x02,0x0a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 # W32: v_cmp_nlg_f16_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x0a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] # W64: v_cmp_nlg_f16_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x0a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -0x7a,0x02,0x0a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmp_nlg_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x0a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7c,0x83,0x0a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12: v_cmp_nlg_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x0a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +0x0a,0x00,0x1a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_nlg_f32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_nlg_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x0a,0x00,0x1a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x1a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x0a,0x00,0x1a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_nlg_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_nlg_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x1a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x1a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_nlg_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_nlg_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x1a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x1a,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_nlg_f32_e64_dpp s104, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1a,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_nlg_f32_e64_dpp s[104:105], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1a,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x1a,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05 +0x6a,0x01,0x1a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 # W32: v_cmp_nlg_f32_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x1a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] # W64: v_cmp_nlg_f32_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x1a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -0x6a,0x01,0x1a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +0x7a,0x02,0x1a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 # W32: v_cmp_nlg_f32_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x1a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] # W64: v_cmp_nlg_f32_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x1a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -0x7a,0x02,0x1a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmp_nlg_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x1a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7c,0x83,0x1a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12: v_cmp_nlg_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x1a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +0x0a,0x00,0x0e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_nlt_f16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_nlt_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x0e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x0a,0x00,0x0e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_nlt_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_nlt_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x0e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x0e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_nlt_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x68,0x00,0x0e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_nlt_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x0e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x01,0x0e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 # W32: v_cmp_nlt_f16_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x0e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] # W64: v_cmp_nlt_f16_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x0e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -0x6a,0x01,0x0e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +0x7a,0x02,0x0e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 # W32: v_cmp_nlt_f16_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x0e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] # W64: v_cmp_nlt_f16_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x0e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -0x7a,0x02,0x0e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmp_nlt_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x0e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7c,0x83,0x0e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12: v_cmp_nlt_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x0e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +0x0a,0x00,0x1e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_nlt_f32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_nlt_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x1e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x0a,0x00,0x1e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_nlt_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_nlt_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x1e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x1e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_nlt_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_nlt_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x1e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x01,0x1e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 # W32: v_cmp_nlt_f32_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x1e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] # W64: v_cmp_nlt_f32_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x1e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -0x6a,0x01,0x1e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +0x7a,0x02,0x1e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 # W32: v_cmp_nlt_f32_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x1e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] # W64: v_cmp_nlt_f32_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x1e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -0x7a,0x02,0x1e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmp_nlt_f32_e64_dpp null, 
-|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x1e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7c,0x83,0x1e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12: v_cmp_nlt_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x1e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +0x0a,0x00,0x07,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_o_f16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x07,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_o_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x07,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x07,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x0a,0x00,0x07,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_o_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x07,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_o_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x07,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x07,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +0x68,0x00,0x07,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_o_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x07,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_o_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x07,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x68,0x00,0x07,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x6a,0x01,0x07,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 # W32: v_cmp_o_f16_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x07,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] # W64: v_cmp_o_f16_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x07,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -0x6a,0x01,0x07,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +0x7a,0x02,0x07,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 # W32: v_cmp_o_f16_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x07,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] # W64: v_cmp_o_f16_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x07,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -0x7a,0x02,0x07,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmp_o_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x07,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7c,0x83,0x07,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12: v_cmp_o_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x07,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +0x0a,0x00,0x17,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_o_f32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x17,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_o_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x17,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x17,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +0x0a,0x00,0x17,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 # W32: v_cmp_o_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x17,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_o_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x17,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -0x0a,0x00,0x17,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 
+0x68,0x00,0x17,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_o_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x17,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_o_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x17,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x17,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x17,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
# W32: v_cmp_o_f32_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x17,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
# W64: v_cmp_o_f32_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x17,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x17,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x17,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
# W32: v_cmp_o_f32_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x17,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
# W64: v_cmp_o_f32_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x17,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x17,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX12: v_cmp_o_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x17,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
0x7c,0x83,0x17,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX12: v_cmp_o_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x17,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x08,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_u_f16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x08,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_u_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x08,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x08,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x0a,0x00,0x08,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_u_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x08,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_u_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x08,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x08,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x08,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_u_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x08,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_u_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x08,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x08,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x08,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
# W32: v_cmp_u_f16_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x08,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
# W64: v_cmp_u_f16_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x08,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x08,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x08,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
# W32: v_cmp_u_f16_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x08,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
# W64: v_cmp_u_f16_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x08,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x08,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX12: v_cmp_u_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x08,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
0x7c,0x83,0x08,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX12: v_cmp_u_f16_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x08,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+0x0a,0x00,0x18,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_u_f32_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x18,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_u_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x18,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x18,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x0a,0x00,0x18,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_u_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x18,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_u_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x18,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-0x0a,0x00,0x18,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+0x68,0x00,0x18,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
# W32: v_cmp_u_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x18,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_u_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x18,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-0x68,0x00,0x18,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+0x6a,0x01,0x18,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
# W32: v_cmp_u_f32_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x18,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
# W64: v_cmp_u_f32_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x18,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-0x6a,0x01,0x18,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+0x7a,0x02,0x18,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
# W32: v_cmp_u_f32_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x18,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
# W64: v_cmp_u_f32_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x18,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-0x7a,0x02,0x18,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX12: v_cmp_u_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x18,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
0x7c,0x83,0x18,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX12: v_cmp_u_f32_e64_dpp null, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7c,0x83,0x18,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt
index f8efb82736117..46f255c2f484f 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt
@@ -1,3413 +1,3416 @@
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s
+# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s
-# GFX12: v_cmpx_class_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x05,0x02,0x00]
0x7e,0x00,0xfd,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_class_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_class_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0xff,0x05,0x02,0x00]
0x7e,0x00,0xfd,0xd4,0xff,0x05,0x02,0x00
+# GFX12: v_cmpx_class_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0xff,0x05,0x02,0x00]
-# GFX12: v_cmpx_class_f16_e64 s1, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x04,0x02,0x00]
0x7e,0x00,0xfd,0xd4,0x01,0x04,0x02,0x00
+# GFX12: v_cmpx_class_f16_e64 s1, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x04,0x02,0x00]
-# GFX12: v_cmpx_class_f16_e64 s105, v255 ; encoding: [0x7e,0x00,0xfd,0xd4,0x69,0xfe,0x03,0x00]
0x7e,0x00,0xfd,0xd4,0x69,0xfe,0x03,0x00
+# GFX12: v_cmpx_class_f16_e64 s105, v255 ; encoding: [0x7e,0x00,0xfd,0xd4,0x69,0xfe,0x03,0x00]
-# GFX12: v_cmpx_class_f16_e64 vcc_lo, s2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x6a,0x04,0x00,0x00]
0x7e,0x00,0xfd,0xd4,0x6a,0x04,0x00,0x00
+# GFX12: v_cmpx_class_f16_e64 vcc_lo, s2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x6a,0x04,0x00,0x00]
-# GFX12: v_cmpx_class_f16_e64 vcc_hi, s105 ; encoding: [0x7e,0x00,0xfd,0xd4,0x6b,0xd2,0x00,0x00]
0x7e,0x00,0xfd,0xd4,0x6b,0xd2,0x00,0x00
+# GFX12: v_cmpx_class_f16_e64 vcc_hi, s105 ; encoding: [0x7e,0x00,0xfd,0xd4,0x6b,0xd2,0x00,0x00]
-# GFX12: v_cmpx_class_f16_e64 ttmp15, ttmp15 ; encoding: [0x7e,0x00,0xfd,0xd4,0x7b,0xf6,0x00,0x00]
0x7e,0x00,0xfd,0xd4,0x7b,0xf6,0x00,0x00
+# GFX12: v_cmpx_class_f16_e64 ttmp15, ttmp15 ; encoding: [0x7e,0x00,0xfd,0xd4,0x7b,0xf6,0x00,0x00]
-# GFX12: v_cmpx_class_f16_e64 m0, src_scc ; encoding: [0x7e,0x00,0xfd,0xd4,0x7d,0xfa,0x01,0x00]
0x7e,0x00,0xfd,0xd4,0x7d,0xfa,0x01,0x00
+# GFX12: v_cmpx_class_f16_e64 m0, src_scc ; encoding: [0x7e,0x00,0xfd,0xd4,0x7d,0xfa,0x01,0x00]
-# GFX12: v_cmpx_class_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xfd,0xd4,0x7e,0x82,0x01,0x00]
0x7e,0x00,0xfd,0xd4,0x7e,0x82,0x01,0x00
+# GFX12: v_cmpx_class_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xfd,0xd4,0x7e,0x82,0x01,0x00]
-# GFX12: v_cmpx_class_f16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xfd,0xd4,0x7f,0xf8,0x00,0x00]
0x7e,0x00,0xfd,0xd4,0x7f,0xf8,0x00,0x00
+# GFX12: v_cmpx_class_f16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xfd,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX12: v_cmpx_class_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xfd,0xd4,0x7c,0xfc,0x00,0x00]
0x7e,0x00,0xfd,0xd4,0x7c,0xfc,0x00,0x00
+# GFX12: v_cmpx_class_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xfd,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX12: v_cmpx_class_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xfd,0xd4,0xc1,0xfe,0x00,0x00]
0x7e,0x00,0xfd,0xd4,0xc1,0xfe,0x00,0x00
+# GFX12: v_cmpx_class_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xfd,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX12: v_cmpx_class_f16_e64 0.5, m0 ; encoding: [0x7e,0x00,0xfd,0xd4,0xf0,0xfa,0x00,0x00]
0x7e,0x00,0xfd,0xd4,0xf0,0xfa,0x00,0x00
+# GFX12: v_cmpx_class_f16_e64 0.5, m0 ; encoding: [0x7e,0x00,0xfd,0xd4,0xf0,0xfa,0x00,0x00]
-# 
GFX12: v_cmpx_class_f16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xfd,0xd4,0xfd,0xd4,0x00,0x00] 0x7e,0x00,0xfd,0xd4,0xfd,0xd4,0x00,0x00 +# GFX12: v_cmpx_class_f16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xfd,0xd4,0xfd,0xd4,0x00,0x00] -# GFX12: v_cmpx_class_f16_e64 -|0xfe0b|, vcc_hi ; encoding: [0x7e,0x01,0xfd,0xd4,0xff,0xd6,0x00,0x20,0x0b,0xfe,0x00,0x00] 0x7e,0x01,0xfd,0xd4,0xff,0xd6,0x00,0x20,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_class_f16_e64 -|0xfe0b|, vcc_hi ; encoding: [0x7e,0x01,0xfd,0xd4,0xff,0xd6,0x00,0x20,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_class_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0xfe,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xfe,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_class_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0xfe,0xd4,0x01,0x05,0x02,0x00] -# GFX12: v_cmpx_class_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0xfe,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xfe,0xd4,0xff,0xff,0x03,0x00 +# GFX12: v_cmpx_class_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0xfe,0xd4,0xff,0xff,0x03,0x00] -# GFX12: v_cmpx_class_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0xfe,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0xfe,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_class_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0xfe,0xd4,0x01,0x04,0x00,0x00] -# GFX12: v_cmpx_class_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0xfe,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0xfe,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_class_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0xfe,0xd4,0x69,0xd2,0x00,0x00] -# GFX12: v_cmpx_class_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xfe,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0xfe,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_class_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xfe,0xd4,0x6a,0xf6,0x00,0x00] -# GFX12: v_cmpx_class_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xfe,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xfe,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_class_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xfe,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_class_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xfe,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0xfe,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_class_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xfe,0xd4,0x7b,0xfa,0x01,0x00] -# GFX12: v_cmpx_class_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xfe,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0xfe,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_class_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xfe,0xd4,0x7d,0xe0,0x01,0x00] -# GFX12: v_cmpx_class_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xfe,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0xfe,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_class_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xfe,0xd4,0x7e,0x82,0x01,0x00] -# GFX12: v_cmpx_class_f32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xfe,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x00,0xfe,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_class_f32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xfe,0xd4,0x7f,0xf8,0x00,0x00] -# GFX12: v_cmpx_class_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xfe,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0xfe,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_class_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xfe,0xd4,0x7c,0xfc,0x00,0x00] -# GFX12: v_cmpx_class_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xfe,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0xfe,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_class_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xfe,0xd4,0xc1,0xfe,0x00,0x00] -# GFX12: v_cmpx_class_f32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xfe,0xd4,0xf0,0xfa,0x00,0x00] 0x7e,0x00,0xfe,0xd4,0xf0,0xfa,0x00,0x00 +# 
GFX12: v_cmpx_class_f32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xfe,0xd4,0xf0,0xfa,0x00,0x00] -# GFX12: v_cmpx_class_f32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xfe,0xd4,0xfd,0xd4,0x00,0x00] 0x7e,0x00,0xfe,0xd4,0xfd,0xd4,0x00,0x00 +# GFX12: v_cmpx_class_f32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xfe,0xd4,0xfd,0xd4,0x00,0x00] -# GFX12: v_cmpx_class_f32_e64 -|0xaf123456|, vcc_hi ; encoding: [0x7e,0x01,0xfe,0xd4,0xff,0xd6,0x00,0x20,0x56,0x34,0x12,0xaf] 0x7e,0x01,0xfe,0xd4,0xff,0xd6,0x00,0x20,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_class_f32_e64 -|0xaf123456|, vcc_hi ; encoding: [0x7e,0x01,0xfe,0xd4,0xff,0xd6,0x00,0x20,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_class_f64_e64 v[1:2], v2 ; encoding: [0x7e,0x00,0xff,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xff,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_class_f64_e64 v[1:2], v2 ; encoding: [0x7e,0x00,0xff,0xd4,0x01,0x05,0x02,0x00] -# GFX12: v_cmpx_class_f64_e64 v[1:2], v255 ; encoding: [0x7e,0x00,0xff,0xd4,0x01,0xff,0x03,0x00] 0x7e,0x00,0xff,0xd4,0x01,0xff,0x03,0x00 +# GFX12: v_cmpx_class_f64_e64 v[1:2], v255 ; encoding: [0x7e,0x00,0xff,0xd4,0x01,0xff,0x03,0x00] -# GFX12: v_cmpx_class_f64_e64 v[1:2], s2 ; encoding: [0x7e,0x00,0xff,0xd4,0x01,0x05,0x00,0x00] 0x7e,0x00,0xff,0xd4,0x01,0x05,0x00,0x00 +# GFX12: v_cmpx_class_f64_e64 v[1:2], s2 ; encoding: [0x7e,0x00,0xff,0xd4,0x01,0x05,0x00,0x00] -# GFX12: v_cmpx_class_f64_e64 v[1:2], s105 ; encoding: [0x7e,0x00,0xff,0xd4,0x01,0xd3,0x00,0x00] 0x7e,0x00,0xff,0xd4,0x01,0xd3,0x00,0x00 +# GFX12: v_cmpx_class_f64_e64 v[1:2], s105 ; encoding: [0x7e,0x00,0xff,0xd4,0x01,0xd3,0x00,0x00] -# GFX12: v_cmpx_class_f64_e64 v[254:255], ttmp15 ; encoding: [0x7e,0x00,0xff,0xd4,0xfe,0xf7,0x00,0x00] 0x7e,0x00,0xff,0xd4,0xfe,0xf7,0x00,0x00 +# GFX12: v_cmpx_class_f64_e64 v[254:255], ttmp15 ; encoding: [0x7e,0x00,0xff,0xd4,0xfe,0xf7,0x00,0x00] -# GFX12: v_cmpx_class_f64_e64 s[2:3], vcc_hi ; encoding: [0x7e,0x00,0xff,0xd4,0x02,0xd6,0x00,0x00] 0x7e,0x00,0xff,0xd4,0x02,0xd6,0x00,0x00 +# GFX12: v_cmpx_class_f64_e64 s[2:3], vcc_hi ; encoding: [0x7e,0x00,0xff,0xd4,0x02,0xd6,0x00,0x00] -# GFX12: v_cmpx_class_f64_e64 s[104:105], vcc_lo ; encoding: [0x7e,0x00,0xff,0xd4,0x68,0xd4,0x00,0x00] 0x7e,0x00,0xff,0xd4,0x68,0xd4,0x00,0x00 +# GFX12: v_cmpx_class_f64_e64 s[104:105], vcc_lo ; encoding: [0x7e,0x00,0xff,0xd4,0x68,0xd4,0x00,0x00] -# GFX12: v_cmpx_class_f64_e64 vcc, m0 ; encoding: [0x7e,0x00,0xff,0xd4,0x6a,0xfa,0x00,0x00] 0x7e,0x00,0xff,0xd4,0x6a,0xfa,0x00,0x00 +# GFX12: v_cmpx_class_f64_e64 vcc, m0 ; encoding: [0x7e,0x00,0xff,0xd4,0x6a,0xfa,0x00,0x00] -# GFX12: v_cmpx_class_f64_e64 ttmp[14:15], exec_hi ; encoding: [0x7e,0x00,0xff,0xd4,0x7a,0xfe,0x00,0x00] 0x7e,0x00,0xff,0xd4,0x7a,0xfe,0x00,0x00 +# GFX12: v_cmpx_class_f64_e64 ttmp[14:15], exec_hi ; encoding: [0x7e,0x00,0xff,0xd4,0x7a,0xfe,0x00,0x00] -# GFX12: v_cmpx_class_f64_e64 exec, exec_lo ; encoding: [0x7e,0x00,0xff,0xd4,0x7e,0xfc,0x00,0x00] 0x7e,0x00,0xff,0xd4,0x7e,0xfc,0x00,0x00 +# GFX12: v_cmpx_class_f64_e64 exec, exec_lo ; encoding: [0x7e,0x00,0xff,0xd4,0x7e,0xfc,0x00,0x00] -# GFX12: v_cmpx_class_f64_e64 null, null ; encoding: [0x7e,0x00,0xff,0xd4,0x7c,0xf8,0x00,0x00] 0x7e,0x00,0xff,0xd4,0x7c,0xf8,0x00,0x00 +# GFX12: v_cmpx_class_f64_e64 null, null ; encoding: [0x7e,0x00,0xff,0xd4,0x7c,0xf8,0x00,0x00] -# GFX12: v_cmpx_class_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xff,0xd4,0xc1,0x82,0x01,0x00] 0x7e,0x00,0xff,0xd4,0xc1,0x82,0x01,0x00 +# GFX12: v_cmpx_class_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xff,0xd4,0xc1,0x82,0x01,0x00] -# GFX12: v_cmpx_class_f64_e64 0.5, 0.5 ; encoding: 
[0x7e,0x00,0xff,0xd4,0xf0,0xe0,0x01,0x00] 0x7e,0x00,0xff,0xd4,0xf0,0xe0,0x01,0x00 +# GFX12: v_cmpx_class_f64_e64 0.5, 0.5 ; encoding: [0x7e,0x00,0xff,0xd4,0xf0,0xe0,0x01,0x00] -# GFX12: v_cmpx_class_f64_e64 -|src_scc|, src_scc ; encoding: [0x7e,0x01,0xff,0xd4,0xfd,0xfa,0x01,0x20] 0x7e,0x01,0xff,0xd4,0xfd,0xfa,0x01,0x20 +# GFX12: v_cmpx_class_f64_e64 -|src_scc|, src_scc ; encoding: [0x7e,0x01,0xff,0xd4,0xfd,0xfa,0x01,0x20] -# GFX12: v_cmpx_class_f64_e64 0xaf123456, 0xaf123456 ; encoding: [0x7e,0x00,0xff,0xd4,0xff,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xff,0xd4,0xff,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_class_f64_e64 0xaf123456, 0xaf123456 ; encoding: [0x7e,0x00,0xff,0xd4,0xff,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_eq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_eq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00] -# GFX12: v_cmpx_eq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00 +# GFX12: v_cmpx_eq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00] -# GFX12: v_cmpx_eq_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0x82,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_eq_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x04,0x00,0x00] -# GFX12: v_cmpx_eq_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x82,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0x82,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_eq_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x82,0xd4,0x69,0xd2,0x00,0x00] -# GFX12: v_cmpx_eq_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x82,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0x82,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_eq_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x82,0xd4,0x6a,0xf6,0x00,0x00] -# GFX12: v_cmpx_eq_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x82,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0x82,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_eq_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x82,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_eq_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x82,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0x82,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_eq_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x82,0xd4,0x7b,0xfa,0x01,0x00] -# GFX12: v_cmpx_eq_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x82,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0x82,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_eq_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x82,0xd4,0x7d,0xe0,0x01,0x00] -# GFX12: v_cmpx_eq_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x82,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0x82,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_eq_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x82,0xd4,0x7e,0x82,0x01,0x00] -# GFX12: v_cmpx_eq_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x82,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x01,0x82,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_eq_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x82,0xd4,0x7f,0xf8,0x00,0x00] -# GFX12: v_cmpx_eq_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x82,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0x82,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_eq_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x82,0xd4,0x7c,0xfc,0x00,0x00] -# GFX12: v_cmpx_eq_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x82,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0x82,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_eq_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x82,0xd4,0xc1,0xfe,0x00,0x00] -# GFX12: 
v_cmpx_eq_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x82,0xd4,0xf0,0xfa,0x00,0x40] 0x7e,0x00,0x82,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_eq_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x82,0xd4,0xf0,0xfa,0x00,0x40] -# GFX12: v_cmpx_eq_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x82,0xd4,0xfd,0xd4,0x00,0x20] 0x7e,0x02,0x82,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_eq_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x82,0xd4,0xfd,0xd4,0x00,0x20] -# GFX12: v_cmpx_eq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x82,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x83,0x82,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_eq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x82,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_eq_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x92,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x92,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_eq_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x92,0xd4,0x01,0x05,0x02,0x00] -# GFX12: v_cmpx_eq_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x92,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x92,0xd4,0xff,0xff,0x03,0x00 +# GFX12: v_cmpx_eq_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x92,0xd4,0xff,0xff,0x03,0x00] -# GFX12: v_cmpx_eq_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x92,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0x92,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_eq_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x92,0xd4,0x01,0x04,0x00,0x00] -# GFX12: v_cmpx_eq_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x92,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0x92,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_eq_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x92,0xd4,0x69,0xd2,0x00,0x00] -# GFX12: v_cmpx_eq_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x92,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0x92,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_eq_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x92,0xd4,0x6a,0xf6,0x00,0x00] -# GFX12: v_cmpx_eq_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x92,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x92,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_eq_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x92,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_eq_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x92,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0x92,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_eq_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x92,0xd4,0x7b,0xfa,0x01,0x00] -# GFX12: v_cmpx_eq_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x92,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0x92,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_eq_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x92,0xd4,0x7d,0xe0,0x01,0x00] -# GFX12: v_cmpx_eq_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x92,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0x92,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_eq_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x92,0xd4,0x7e,0x82,0x01,0x00] -# GFX12: v_cmpx_eq_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x92,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x01,0x92,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_eq_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x92,0xd4,0x7f,0xf8,0x00,0x00] -# GFX12: v_cmpx_eq_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x92,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0x92,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_eq_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x92,0xd4,0x7c,0xfc,0x00,0x00] -# GFX12: v_cmpx_eq_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x92,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0x92,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_eq_f32_e64 -1, exec_hi ; encoding: 
[0x7e,0x00,0x92,0xd4,0xc1,0xfe,0x00,0x00] -# GFX12: v_cmpx_eq_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x92,0xd4,0xf0,0xfa,0x00,0x40] 0x7e,0x00,0x92,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_eq_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x92,0xd4,0xf0,0xfa,0x00,0x40] -# GFX12: v_cmpx_eq_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x92,0xd4,0xfd,0xd4,0x00,0x20] 0x7e,0x02,0x92,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_eq_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x92,0xd4,0xfd,0xd4,0x00,0x20] -# GFX12: v_cmpx_eq_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x92,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] 0x7e,0x83,0x92,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_eq_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x92,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_eq_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa2,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xa2,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_eq_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa2,0xd4,0x01,0x05,0x02,0x00] -# GFX12: v_cmpx_eq_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa2,0xd4,0xfe,0xfd,0x03,0x00] 0x7e,0x00,0xa2,0xd4,0xfe,0xfd,0x03,0x00 +# GFX12: v_cmpx_eq_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa2,0xd4,0xfe,0xfd,0x03,0x00] -# GFX12: v_cmpx_eq_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa2,0xd4,0x02,0x08,0x00,0x00] 0x7e,0x00,0xa2,0xd4,0x02,0x08,0x00,0x00 +# GFX12: v_cmpx_eq_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa2,0xd4,0x02,0x08,0x00,0x00] -# GFX12: v_cmpx_eq_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa2,0xd4,0x68,0xd0,0x00,0x00] 0x7e,0x00,0xa2,0xd4,0x68,0xd0,0x00,0x00 +# GFX12: v_cmpx_eq_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa2,0xd4,0x68,0xd0,0x00,0x00] -# GFX12: v_cmpx_eq_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa2,0xd4,0x6a,0xf4,0x00,0x00] 0x7e,0x00,0xa2,0xd4,0x6a,0xf4,0x00,0x00 +# GFX12: v_cmpx_eq_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa2,0xd4,0x6a,0xf4,0x00,0x00] -# GFX12: v_cmpx_eq_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa2,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xa2,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_eq_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa2,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_eq_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa2,0xd4,0x7e,0xfa,0x01,0x20] 0x7e,0x01,0xa2,0xd4,0x7e,0xfa,0x01,0x20 +# GFX12: v_cmpx_eq_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa2,0xd4,0x7e,0xfa,0x01,0x20] -# GFX12: v_cmpx_eq_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa2,0xd4,0x7c,0xe0,0x01,0x00] 0x7e,0x00,0xa2,0xd4,0x7c,0xe0,0x01,0x00 +# GFX12: v_cmpx_eq_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa2,0xd4,0x7c,0xe0,0x01,0x00] -# GFX12: v_cmpx_eq_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa2,0xd4,0xc1,0x82,0x01,0x00] 0x7e,0x00,0xa2,0xd4,0xc1,0x82,0x01,0x00 +# GFX12: v_cmpx_eq_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa2,0xd4,0xc1,0x82,0x01,0x00] -# GFX12: v_cmpx_eq_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa2,0xd4,0xf0,0xf8,0x00,0x00] 0x7e,0x00,0xa2,0xd4,0xf0,0xf8,0x00,0x00 +# GFX12: v_cmpx_eq_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa2,0xd4,0xf0,0xf8,0x00,0x00] -# GFX12: v_cmpx_eq_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa2,0xd4,0xfd,0xfc,0x00,0x60] 0x7e,0x03,0xa2,0xd4,0xfd,0xfc,0x00,0x60 +# GFX12: v_cmpx_eq_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa2,0xd4,0xfd,0xfc,0x00,0x60] -# GFX12: v_cmpx_eq_f64_e64 0xaf123456, -|vcc| clamp ; 
encoding: [0x7e,0x82,0xa2,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7e,0x82,0xa2,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_eq_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa2,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_eq_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb2,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xb2,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_eq_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb2,0xd4,0x01,0x05,0x02,0x00] -# GFX12: v_cmpx_eq_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xb2,0xd4,0xff,0xff,0x03,0x00 +# GFX12: v_cmpx_eq_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xff,0x03,0x00] -# GFX12: v_cmpx_eq_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb2,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0xb2,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_eq_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb2,0xd4,0x01,0x04,0x00,0x00] -# GFX12: v_cmpx_eq_i16_e64 s105, s105 ; encoding: [0x7e,0x00,0xb2,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0xb2,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_eq_i16_e64 s105, s105 ; encoding: [0x7e,0x00,0xb2,0xd4,0x69,0xd2,0x00,0x00] -# GFX12: v_cmpx_eq_i16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xb2,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0xb2,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_eq_i16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xb2,0xd4,0x6a,0xf6,0x00,0x00] -# GFX12: v_cmpx_eq_i16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xb2,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0xb2,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_eq_i16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xb2,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_eq_i16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xb2,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0xb2,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_eq_i16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xb2,0xd4,0x7b,0xfa,0x01,0x00] -# GFX12: v_cmpx_eq_i16_e64 m0, 0x3800 0x7e,0x00,0xb2,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_eq_i16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xb2,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] -# GFX12: v_cmpx_eq_i16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xb2,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0xb2,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_eq_i16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xb2,0xd4,0x7e,0x82,0x01,0x00] -# GFX12: v_cmpx_eq_i16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xb2,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x00,0xb2,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_eq_i16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xb2,0xd4,0x7f,0xf8,0x00,0x00] -# GFX12: v_cmpx_eq_i16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xb2,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0xb2,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_eq_i16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xb2,0xd4,0x7c,0xfc,0x00,0x00] -# GFX12: v_cmpx_eq_i16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xb2,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0xb2,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_eq_i16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xb2,0xd4,0xc1,0xfe,0x00,0x00] -# GFX12: v_cmpx_eq_i16_e64 0x3800, m0 0x7e,0x00,0xb2,0xd4,0xf0,0xfa,0x00,0x00 +# GFX12: v_cmpx_eq_i16_e64 0x3800, m0 ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] -# GFX12: v_cmpx_eq_i16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xb2,0xd4,0xfd,0xd4,0x00,0x00] 0x7e,0x00,0xb2,0xd4,0xfd,0xd4,0x00,0x00 +# GFX12: v_cmpx_eq_i16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xb2,0xd4,0xfd,0xd4,0x00,0x00] -# GFX12: v_cmpx_eq_i16_e64 0xfe0b, vcc_hi ; encoding: 
[0x7e,0x00,0xb2,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0xb2,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_eq_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_eq_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc2,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xc2,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_eq_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc2,0xd4,0x01,0x05,0x02,0x00] -# GFX12: v_cmpx_eq_i32_e64 v255, v255 ; encoding: [0x7e,0x00,0xc2,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xc2,0xd4,0xff,0xff,0x03,0x00 +# GFX12: v_cmpx_eq_i32_e64 v255, v255 ; encoding: [0x7e,0x00,0xc2,0xd4,0xff,0xff,0x03,0x00] -# GFX12: v_cmpx_eq_i32_e64 s1, s2 ; encoding: [0x7e,0x00,0xc2,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0xc2,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_eq_i32_e64 s1, s2 ; encoding: [0x7e,0x00,0xc2,0xd4,0x01,0x04,0x00,0x00] -# GFX12: v_cmpx_eq_i32_e64 s105, s105 ; encoding: [0x7e,0x00,0xc2,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0xc2,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_eq_i32_e64 s105, s105 ; encoding: [0x7e,0x00,0xc2,0xd4,0x69,0xd2,0x00,0x00] -# GFX12: v_cmpx_eq_i32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xc2,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0xc2,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_eq_i32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xc2,0xd4,0x6a,0xf6,0x00,0x00] -# GFX12: v_cmpx_eq_i32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xc2,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xc2,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_eq_i32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xc2,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_eq_i32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xc2,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0xc2,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_eq_i32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xc2,0xd4,0x7b,0xfa,0x01,0x00] -# GFX12: v_cmpx_eq_i32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xc2,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0xc2,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_eq_i32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xc2,0xd4,0x7d,0xe0,0x01,0x00] -# GFX12: v_cmpx_eq_i32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xc2,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0xc2,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_eq_i32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xc2,0xd4,0x7e,0x82,0x01,0x00] -# GFX12: v_cmpx_eq_i32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xc2,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x00,0xc2,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_eq_i32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xc2,0xd4,0x7f,0xf8,0x00,0x00] -# GFX12: v_cmpx_eq_i32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xc2,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0xc2,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_eq_i32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xc2,0xd4,0x7c,0xfc,0x00,0x00] -# GFX12: v_cmpx_eq_i32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xc2,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0xc2,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_eq_i32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xc2,0xd4,0xc1,0xfe,0x00,0x00] -# GFX12: v_cmpx_eq_i32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xc2,0xd4,0xf0,0xfa,0x00,0x00] 0x7e,0x00,0xc2,0xd4,0xf0,0xfa,0x00,0x00 +# GFX12: v_cmpx_eq_i32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xc2,0xd4,0xf0,0xfa,0x00,0x00] -# GFX12: v_cmpx_eq_i32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xc2,0xd4,0xfd,0xd4,0x00,0x00] 0x7e,0x00,0xc2,0xd4,0xfd,0xd4,0x00,0x00 +# GFX12: v_cmpx_eq_i32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xc2,0xd4,0xfd,0xd4,0x00,0x00] -# GFX12: v_cmpx_eq_i32_e64 0xaf123456, vcc_hi ; encoding: 
[0x7e,0x00,0xc2,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xc2,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_eq_i32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xc2,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_eq_i64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xd2,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xd2,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_eq_i64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xd2,0xd4,0x01,0x05,0x02,0x00] -# GFX12: v_cmpx_eq_i64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xd2,0xd4,0xfe,0xfd,0x03,0x00] 0x7e,0x00,0xd2,0xd4,0xfe,0xfd,0x03,0x00 +# GFX12: v_cmpx_eq_i64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xd2,0xd4,0xfe,0xfd,0x03,0x00] -# GFX12: v_cmpx_eq_i64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xd2,0xd4,0x02,0x08,0x00,0x00] 0x7e,0x00,0xd2,0xd4,0x02,0x08,0x00,0x00 +# GFX12: v_cmpx_eq_i64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xd2,0xd4,0x02,0x08,0x00,0x00] -# GFX12: v_cmpx_eq_i64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xd2,0xd4,0x68,0xd0,0x00,0x00] 0x7e,0x00,0xd2,0xd4,0x68,0xd0,0x00,0x00 +# GFX12: v_cmpx_eq_i64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xd2,0xd4,0x68,0xd0,0x00,0x00] -# GFX12: v_cmpx_eq_i64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xd2,0xd4,0x6a,0xf4,0x00,0x00] 0x7e,0x00,0xd2,0xd4,0x6a,0xf4,0x00,0x00 +# GFX12: v_cmpx_eq_i64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xd2,0xd4,0x6a,0xf4,0x00,0x00] -# GFX12: v_cmpx_eq_i64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xd2,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xd2,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_eq_i64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xd2,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_eq_i64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xd2,0xd4,0x7e,0xfa,0x01,0x00] 0x7e,0x00,0xd2,0xd4,0x7e,0xfa,0x01,0x00 +# GFX12: v_cmpx_eq_i64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xd2,0xd4,0x7e,0xfa,0x01,0x00] -# GFX12: v_cmpx_eq_i64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xd2,0xd4,0x7c,0xe0,0x01,0x00] 0x7e,0x00,0xd2,0xd4,0x7c,0xe0,0x01,0x00 +# GFX12: v_cmpx_eq_i64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xd2,0xd4,0x7c,0xe0,0x01,0x00] -# GFX12: v_cmpx_eq_i64_e64 -1, -1 ; encoding: [0x7e,0x00,0xd2,0xd4,0xc1,0x82,0x01,0x00] 0x7e,0x00,0xd2,0xd4,0xc1,0x82,0x01,0x00 +# GFX12: v_cmpx_eq_i64_e64 -1, -1 ; encoding: [0x7e,0x00,0xd2,0xd4,0xc1,0x82,0x01,0x00] -# GFX12: v_cmpx_eq_i64_e64 0.5, null ; encoding: [0x7e,0x00,0xd2,0xd4,0xf0,0xf8,0x00,0x00] 0x7e,0x00,0xd2,0xd4,0xf0,0xf8,0x00,0x00 +# GFX12: v_cmpx_eq_i64_e64 0.5, null ; encoding: [0x7e,0x00,0xd2,0xd4,0xf0,0xf8,0x00,0x00] -# GFX12: v_cmpx_eq_i64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xd2,0xd4,0xfd,0xfc,0x00,0x00] 0x7e,0x00,0xd2,0xd4,0xfd,0xfc,0x00,0x00 +# GFX12: v_cmpx_eq_i64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xd2,0xd4,0xfd,0xfc,0x00,0x00] -# GFX12: v_cmpx_eq_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd2,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xd2,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_eq_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd2,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_eq_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xba,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xba,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_eq_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xba,0xd4,0x01,0x05,0x02,0x00] -# GFX12: v_cmpx_eq_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xba,0xd4,0xff,0xff,0x03,0x00 +# GFX12: 
v_cmpx_eq_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xff,0x03,0x00] -# GFX12: v_cmpx_eq_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xba,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0xba,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_eq_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xba,0xd4,0x01,0x04,0x00,0x00] -# GFX12: v_cmpx_eq_u16_e64 s105, s105 ; encoding: [0x7e,0x00,0xba,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0xba,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_eq_u16_e64 s105, s105 ; encoding: [0x7e,0x00,0xba,0xd4,0x69,0xd2,0x00,0x00] -# GFX12: v_cmpx_eq_u16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xba,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0xba,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_eq_u16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xba,0xd4,0x6a,0xf6,0x00,0x00] -# GFX12: v_cmpx_eq_u16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xba,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0xba,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_eq_u16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xba,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_eq_u16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xba,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0xba,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_eq_u16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xba,0xd4,0x7b,0xfa,0x01,0x00] -# GFX12: v_cmpx_eq_u16_e64 m0, 0x3800 0x7e,0x00,0xba,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_eq_u16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xba,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] -# GFX12: v_cmpx_eq_u16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xba,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0xba,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_eq_u16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xba,0xd4,0x7e,0x82,0x01,0x00] -# GFX12: v_cmpx_eq_u16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xba,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x00,0xba,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_eq_u16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xba,0xd4,0x7f,0xf8,0x00,0x00] -# GFX12: v_cmpx_eq_u16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xba,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0xba,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_eq_u16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xba,0xd4,0x7c,0xfc,0x00,0x00] -# GFX12: v_cmpx_eq_u16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xba,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0xba,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_eq_u16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xba,0xd4,0xc1,0xfe,0x00,0x00] -# GFX12: v_cmpx_eq_u16_e64 0x3800, m0 0x7e,0x00,0xba,0xd4,0xf0,0xfa,0x00,0x00 +# GFX12: v_cmpx_eq_u16_e64 0x3800, m0 ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] -# GFX12: v_cmpx_eq_u16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xba,0xd4,0xfd,0xd4,0x00,0x00] 0x7e,0x00,0xba,0xd4,0xfd,0xd4,0x00,0x00 +# GFX12: v_cmpx_eq_u16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xba,0xd4,0xfd,0xd4,0x00,0x00] -# GFX12: v_cmpx_eq_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0xba,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_eq_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_eq_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xca,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xca,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_eq_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xca,0xd4,0x01,0x05,0x02,0x00] -# GFX12: v_cmpx_eq_u32_e64 v255, v255 ; encoding: [0x7e,0x00,0xca,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xca,0xd4,0xff,0xff,0x03,0x00 +# GFX12: v_cmpx_eq_u32_e64 v255, v255 ; encoding: 
[0x7e,0x00,0xca,0xd4,0xff,0xff,0x03,0x00] -# GFX12: v_cmpx_eq_u32_e64 s1, s2 ; encoding: [0x7e,0x00,0xca,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0xca,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_eq_u32_e64 s1, s2 ; encoding: [0x7e,0x00,0xca,0xd4,0x01,0x04,0x00,0x00] -# GFX12: v_cmpx_eq_u32_e64 s105, s105 ; encoding: [0x7e,0x00,0xca,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0xca,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_eq_u32_e64 s105, s105 ; encoding: [0x7e,0x00,0xca,0xd4,0x69,0xd2,0x00,0x00] -# GFX12: v_cmpx_eq_u32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xca,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0xca,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_eq_u32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xca,0xd4,0x6a,0xf6,0x00,0x00] -# GFX12: v_cmpx_eq_u32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xca,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xca,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_eq_u32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xca,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_eq_u32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xca,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0xca,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_eq_u32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xca,0xd4,0x7b,0xfa,0x01,0x00] -# GFX12: v_cmpx_eq_u32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xca,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0xca,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_eq_u32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xca,0xd4,0x7d,0xe0,0x01,0x00] -# GFX12: v_cmpx_eq_u32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xca,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0xca,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_eq_u32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xca,0xd4,0x7e,0x82,0x01,0x00] -# GFX12: v_cmpx_eq_u32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xca,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x00,0xca,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_eq_u32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xca,0xd4,0x7f,0xf8,0x00,0x00] -# GFX12: v_cmpx_eq_u32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xca,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0xca,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_eq_u32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xca,0xd4,0x7c,0xfc,0x00,0x00] -# GFX12: v_cmpx_eq_u32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xca,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0xca,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_eq_u32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xca,0xd4,0xc1,0xfe,0x00,0x00] -# GFX12: v_cmpx_eq_u32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xca,0xd4,0xf0,0xfa,0x00,0x00] 0x7e,0x00,0xca,0xd4,0xf0,0xfa,0x00,0x00 +# GFX12: v_cmpx_eq_u32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xca,0xd4,0xf0,0xfa,0x00,0x00] -# GFX12: v_cmpx_eq_u32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xca,0xd4,0xfd,0xd4,0x00,0x00] 0x7e,0x00,0xca,0xd4,0xfd,0xd4,0x00,0x00 +# GFX12: v_cmpx_eq_u32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xca,0xd4,0xfd,0xd4,0x00,0x00] -# GFX12: v_cmpx_eq_u32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xca,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xca,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_eq_u32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xca,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_eq_u64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xda,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xda,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_eq_u64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xda,0xd4,0x01,0x05,0x02,0x00] -# GFX12: v_cmpx_eq_u64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xda,0xd4,0xfe,0xfd,0x03,0x00] 0x7e,0x00,0xda,0xd4,0xfe,0xfd,0x03,0x00 +# GFX12: 
v_cmpx_eq_u64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xda,0xd4,0xfe,0xfd,0x03,0x00] -# GFX12: v_cmpx_eq_u64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xda,0xd4,0x02,0x08,0x00,0x00] 0x7e,0x00,0xda,0xd4,0x02,0x08,0x00,0x00 +# GFX12: v_cmpx_eq_u64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xda,0xd4,0x02,0x08,0x00,0x00] -# GFX12: v_cmpx_eq_u64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xda,0xd4,0x68,0xd0,0x00,0x00] 0x7e,0x00,0xda,0xd4,0x68,0xd0,0x00,0x00 +# GFX12: v_cmpx_eq_u64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xda,0xd4,0x68,0xd0,0x00,0x00] -# GFX12: v_cmpx_eq_u64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xda,0xd4,0x6a,0xf4,0x00,0x00] 0x7e,0x00,0xda,0xd4,0x6a,0xf4,0x00,0x00 +# GFX12: v_cmpx_eq_u64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xda,0xd4,0x6a,0xf4,0x00,0x00] -# GFX12: v_cmpx_eq_u64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xda,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xda,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_eq_u64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xda,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_eq_u64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xda,0xd4,0x7e,0xfa,0x01,0x00] 0x7e,0x00,0xda,0xd4,0x7e,0xfa,0x01,0x00 +# GFX12: v_cmpx_eq_u64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xda,0xd4,0x7e,0xfa,0x01,0x00] -# GFX12: v_cmpx_eq_u64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xda,0xd4,0x7c,0xe0,0x01,0x00] 0x7e,0x00,0xda,0xd4,0x7c,0xe0,0x01,0x00 +# GFX12: v_cmpx_eq_u64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xda,0xd4,0x7c,0xe0,0x01,0x00] -# GFX12: v_cmpx_eq_u64_e64 -1, -1 ; encoding: [0x7e,0x00,0xda,0xd4,0xc1,0x82,0x01,0x00] 0x7e,0x00,0xda,0xd4,0xc1,0x82,0x01,0x00 +# GFX12: v_cmpx_eq_u64_e64 -1, -1 ; encoding: [0x7e,0x00,0xda,0xd4,0xc1,0x82,0x01,0x00] -# GFX12: v_cmpx_eq_u64_e64 0.5, null ; encoding: [0x7e,0x00,0xda,0xd4,0xf0,0xf8,0x00,0x00] 0x7e,0x00,0xda,0xd4,0xf0,0xf8,0x00,0x00 +# GFX12: v_cmpx_eq_u64_e64 0.5, null ; encoding: [0x7e,0x00,0xda,0xd4,0xf0,0xf8,0x00,0x00] -# GFX12: v_cmpx_eq_u64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xda,0xd4,0xfd,0xfc,0x00,0x00] 0x7e,0x00,0xda,0xd4,0xfd,0xfc,0x00,0x00 +# GFX12: v_cmpx_eq_u64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xda,0xd4,0xfd,0xfc,0x00,0x00] -# GFX12: v_cmpx_eq_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xda,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xda,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_eq_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xda,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_ge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_ge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00] -# GFX12: v_cmpx_ge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00 +# GFX12: v_cmpx_ge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00] -# GFX12: v_cmpx_ge_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0x86,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_ge_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x04,0x00,0x00] -# GFX12: v_cmpx_ge_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x86,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0x86,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_ge_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x86,0xd4,0x69,0xd2,0x00,0x00] -# GFX12: v_cmpx_ge_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x86,0xd4,0x6a,0xf6,0x00,0x00] 
0x7e,0x00,0x86,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_ge_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x86,0xd4,0x6a,0xf6,0x00,0x00] -# GFX12: v_cmpx_ge_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x86,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0x86,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_ge_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x86,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_ge_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x86,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0x86,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_ge_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x86,0xd4,0x7b,0xfa,0x01,0x00] -# GFX12: v_cmpx_ge_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x86,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0x86,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_ge_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x86,0xd4,0x7d,0xe0,0x01,0x00] -# GFX12: v_cmpx_ge_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x86,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0x86,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_ge_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x86,0xd4,0x7e,0x82,0x01,0x00] -# GFX12: v_cmpx_ge_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x86,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x01,0x86,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_ge_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x86,0xd4,0x7f,0xf8,0x00,0x00] -# GFX12: v_cmpx_ge_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x86,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0x86,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_ge_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x86,0xd4,0x7c,0xfc,0x00,0x00] -# GFX12: v_cmpx_ge_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x86,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0x86,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_ge_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x86,0xd4,0xc1,0xfe,0x00,0x00] -# GFX12: v_cmpx_ge_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x86,0xd4,0xf0,0xfa,0x00,0x40] 0x7e,0x00,0x86,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_ge_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x86,0xd4,0xf0,0xfa,0x00,0x40] -# GFX12: v_cmpx_ge_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x86,0xd4,0xfd,0xd4,0x00,0x20] 0x7e,0x02,0x86,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_ge_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x86,0xd4,0xfd,0xd4,0x00,0x20] -# GFX12: v_cmpx_ge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x86,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x83,0x86,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_ge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x86,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_ge_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x96,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x96,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_ge_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x96,0xd4,0x01,0x05,0x02,0x00] -# GFX12: v_cmpx_ge_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x96,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x96,0xd4,0xff,0xff,0x03,0x00 +# GFX12: v_cmpx_ge_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x96,0xd4,0xff,0xff,0x03,0x00] -# GFX12: v_cmpx_ge_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x96,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0x96,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_ge_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x96,0xd4,0x01,0x04,0x00,0x00] -# GFX12: v_cmpx_ge_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x96,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0x96,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_ge_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x96,0xd4,0x69,0xd2,0x00,0x00] -# GFX12: v_cmpx_ge_f32_e64 vcc_lo, ttmp15 ; encoding: 
[0x7e,0x00,0x96,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0x96,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_ge_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x96,0xd4,0x6a,0xf6,0x00,0x00] -# GFX12: v_cmpx_ge_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x96,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x96,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_ge_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x96,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_ge_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x96,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0x96,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_ge_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x96,0xd4,0x7b,0xfa,0x01,0x00] -# GFX12: v_cmpx_ge_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x96,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0x96,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_ge_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x96,0xd4,0x7d,0xe0,0x01,0x00] -# GFX12: v_cmpx_ge_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x96,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0x96,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_ge_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x96,0xd4,0x7e,0x82,0x01,0x00] -# GFX12: v_cmpx_ge_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x96,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x01,0x96,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_ge_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x96,0xd4,0x7f,0xf8,0x00,0x00] -# GFX12: v_cmpx_ge_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x96,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0x96,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_ge_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x96,0xd4,0x7c,0xfc,0x00,0x00] -# GFX12: v_cmpx_ge_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x96,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0x96,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_ge_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x96,0xd4,0xc1,0xfe,0x00,0x00] -# GFX12: v_cmpx_ge_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x96,0xd4,0xf0,0xfa,0x00,0x40] 0x7e,0x00,0x96,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_ge_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x96,0xd4,0xf0,0xfa,0x00,0x40] -# GFX12: v_cmpx_ge_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x96,0xd4,0xfd,0xd4,0x00,0x20] 0x7e,0x02,0x96,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_ge_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x96,0xd4,0xfd,0xd4,0x00,0x20] -# GFX12: v_cmpx_ge_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x96,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] 0x7e,0x83,0x96,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_ge_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x96,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_ge_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa6,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xa6,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_ge_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa6,0xd4,0x01,0x05,0x02,0x00] -# GFX12: v_cmpx_ge_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa6,0xd4,0xfe,0xfd,0x03,0x00] 0x7e,0x00,0xa6,0xd4,0xfe,0xfd,0x03,0x00 +# GFX12: v_cmpx_ge_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa6,0xd4,0xfe,0xfd,0x03,0x00] -# GFX12: v_cmpx_ge_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa6,0xd4,0x02,0x08,0x00,0x00] 0x7e,0x00,0xa6,0xd4,0x02,0x08,0x00,0x00 +# GFX12: v_cmpx_ge_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa6,0xd4,0x02,0x08,0x00,0x00] -# GFX12: v_cmpx_ge_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa6,0xd4,0x68,0xd0,0x00,0x00] 0x7e,0x00,0xa6,0xd4,0x68,0xd0,0x00,0x00 +# GFX12: v_cmpx_ge_f64_e64 s[104:105], 
s[104:105] ; encoding: [0x7e,0x00,0xa6,0xd4,0x68,0xd0,0x00,0x00] -# GFX12: v_cmpx_ge_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa6,0xd4,0x6a,0xf4,0x00,0x00] 0x7e,0x00,0xa6,0xd4,0x6a,0xf4,0x00,0x00 +# GFX12: v_cmpx_ge_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa6,0xd4,0x6a,0xf4,0x00,0x00] -# GFX12: v_cmpx_ge_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa6,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xa6,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_ge_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa6,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_ge_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa6,0xd4,0x7e,0xfa,0x01,0x20] 0x7e,0x01,0xa6,0xd4,0x7e,0xfa,0x01,0x20 +# GFX12: v_cmpx_ge_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa6,0xd4,0x7e,0xfa,0x01,0x20] -# GFX12: v_cmpx_ge_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa6,0xd4,0x7c,0xe0,0x01,0x00] 0x7e,0x00,0xa6,0xd4,0x7c,0xe0,0x01,0x00 +# GFX12: v_cmpx_ge_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa6,0xd4,0x7c,0xe0,0x01,0x00] -# GFX12: v_cmpx_ge_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa6,0xd4,0xc1,0x82,0x01,0x00] 0x7e,0x00,0xa6,0xd4,0xc1,0x82,0x01,0x00 +# GFX12: v_cmpx_ge_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa6,0xd4,0xc1,0x82,0x01,0x00] -# GFX12: v_cmpx_ge_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa6,0xd4,0xf0,0xf8,0x00,0x00] 0x7e,0x00,0xa6,0xd4,0xf0,0xf8,0x00,0x00 +# GFX12: v_cmpx_ge_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa6,0xd4,0xf0,0xf8,0x00,0x00] -# GFX12: v_cmpx_ge_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa6,0xd4,0xfd,0xfc,0x00,0x60] 0x7e,0x03,0xa6,0xd4,0xfd,0xfc,0x00,0x60 +# GFX12: v_cmpx_ge_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa6,0xd4,0xfd,0xfc,0x00,0x60] -# GFX12: v_cmpx_ge_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa6,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7e,0x82,0xa6,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_ge_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa6,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_ge_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb6,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xb6,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_ge_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb6,0xd4,0x01,0x05,0x02,0x00] -# GFX12: v_cmpx_ge_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xb6,0xd4,0xff,0xff,0x03,0x00 +# GFX12: v_cmpx_ge_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xff,0x03,0x00] -# GFX12: v_cmpx_ge_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb6,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0xb6,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_ge_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb6,0xd4,0x01,0x04,0x00,0x00] -# GFX12: v_cmpx_ge_i16_e64 s105, s105 ; encoding: [0x7e,0x00,0xb6,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0xb6,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_ge_i16_e64 s105, s105 ; encoding: [0x7e,0x00,0xb6,0xd4,0x69,0xd2,0x00,0x00] -# GFX12: v_cmpx_ge_i16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xb6,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0xb6,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_ge_i16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xb6,0xd4,0x6a,0xf6,0x00,0x00] -# GFX12: v_cmpx_ge_i16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xb6,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0xb6,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_ge_i16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xb6,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_ge_i16_e64 ttmp15, src_scc ; encoding: 
[0x7e,0x00,0xb6,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0xb6,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_ge_i16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xb6,0xd4,0x7b,0xfa,0x01,0x00] -# GFX12: v_cmpx_ge_i16_e64 m0, 0x3800 0x7e,0x00,0xb6,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_ge_i16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xb6,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] -# GFX12: v_cmpx_ge_i16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xb6,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0xb6,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_ge_i16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xb6,0xd4,0x7e,0x82,0x01,0x00] -# GFX12: v_cmpx_ge_i16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xb6,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x00,0xb6,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_ge_i16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xb6,0xd4,0x7f,0xf8,0x00,0x00] -# GFX12: v_cmpx_ge_i16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xb6,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0xb6,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_ge_i16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xb6,0xd4,0x7c,0xfc,0x00,0x00] -# GFX12: v_cmpx_ge_i16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xb6,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0xb6,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_ge_i16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xb6,0xd4,0xc1,0xfe,0x00,0x00] -# GFX12: v_cmpx_ge_i16_e64 0x3800, m0 0x7e,0x00,0xb6,0xd4,0xf0,0xfa,0x00,0x00 +# GFX12: v_cmpx_ge_i16_e64 0x3800, m0 ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] -# GFX12: v_cmpx_ge_i16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xb6,0xd4,0xfd,0xd4,0x00,0x00] 0x7e,0x00,0xb6,0xd4,0xfd,0xd4,0x00,0x00 +# GFX12: v_cmpx_ge_i16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xb6,0xd4,0xfd,0xd4,0x00,0x00] -# GFX12: v_cmpx_ge_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0xb6,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_ge_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_ge_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc6,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xc6,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_ge_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc6,0xd4,0x01,0x05,0x02,0x00] -# GFX12: v_cmpx_ge_i32_e64 v255, v255 ; encoding: [0x7e,0x00,0xc6,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xc6,0xd4,0xff,0xff,0x03,0x00 +# GFX12: v_cmpx_ge_i32_e64 v255, v255 ; encoding: [0x7e,0x00,0xc6,0xd4,0xff,0xff,0x03,0x00] -# GFX12: v_cmpx_ge_i32_e64 s1, s2 ; encoding: [0x7e,0x00,0xc6,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0xc6,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_ge_i32_e64 s1, s2 ; encoding: [0x7e,0x00,0xc6,0xd4,0x01,0x04,0x00,0x00] -# GFX12: v_cmpx_ge_i32_e64 s105, s105 ; encoding: [0x7e,0x00,0xc6,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0xc6,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_ge_i32_e64 s105, s105 ; encoding: [0x7e,0x00,0xc6,0xd4,0x69,0xd2,0x00,0x00] -# GFX12: v_cmpx_ge_i32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xc6,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0xc6,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_ge_i32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xc6,0xd4,0x6a,0xf6,0x00,0x00] -# GFX12: v_cmpx_ge_i32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xc6,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xc6,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_ge_i32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xc6,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_ge_i32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xc6,0xd4,0x7b,0xfa,0x01,0x00] 
 0x7e,0x00,0xc6,0xd4,0x7b,0xfa,0x01,0x00
+# GFX12: v_cmpx_ge_i32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xc6,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX12: v_cmpx_ge_i32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xc6,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0xc6,0xd4,0x7d,0xe0,0x01,0x00
+# GFX12: v_cmpx_ge_i32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xc6,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX12: v_cmpx_ge_i32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xc6,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0xc6,0xd4,0x7e,0x82,0x01,0x00
+# GFX12: v_cmpx_ge_i32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xc6,0xd4,0x7e,0x82,0x01,0x00]
-# GFX12: v_cmpx_ge_i32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xc6,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x00,0xc6,0xd4,0x7f,0xf8,0x00,0x00
+# GFX12: v_cmpx_ge_i32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xc6,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX12: v_cmpx_ge_i32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xc6,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0xc6,0xd4,0x7c,0xfc,0x00,0x00
+# GFX12: v_cmpx_ge_i32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xc6,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX12: v_cmpx_ge_i32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xc6,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0xc6,0xd4,0xc1,0xfe,0x00,0x00
+# GFX12: v_cmpx_ge_i32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xc6,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX12: v_cmpx_ge_i32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xc6,0xd4,0xf0,0xfa,0x00,0x00]
 0x7e,0x00,0xc6,0xd4,0xf0,0xfa,0x00,0x00
+# GFX12: v_cmpx_ge_i32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xc6,0xd4,0xf0,0xfa,0x00,0x00]
-# GFX12: v_cmpx_ge_i32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xc6,0xd4,0xfd,0xd4,0x00,0x00]
 0x7e,0x00,0xc6,0xd4,0xfd,0xd4,0x00,0x00
+# GFX12: v_cmpx_ge_i32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xc6,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX12: v_cmpx_ge_i32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xc6,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xc6,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_ge_i32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xc6,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_ge_i64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xd6,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xd6,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_ge_i64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xd6,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_ge_i64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xd6,0xd4,0xfe,0xfd,0x03,0x00]
 0x7e,0x00,0xd6,0xd4,0xfe,0xfd,0x03,0x00
+# GFX12: v_cmpx_ge_i64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xd6,0xd4,0xfe,0xfd,0x03,0x00]
-# GFX12: v_cmpx_ge_i64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xd6,0xd4,0x02,0x08,0x00,0x00]
 0x7e,0x00,0xd6,0xd4,0x02,0x08,0x00,0x00
+# GFX12: v_cmpx_ge_i64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xd6,0xd4,0x02,0x08,0x00,0x00]
-# GFX12: v_cmpx_ge_i64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xd6,0xd4,0x68,0xd0,0x00,0x00]
 0x7e,0x00,0xd6,0xd4,0x68,0xd0,0x00,0x00
+# GFX12: v_cmpx_ge_i64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xd6,0xd4,0x68,0xd0,0x00,0x00]
-# GFX12: v_cmpx_ge_i64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xd6,0xd4,0x6a,0xf4,0x00,0x00]
 0x7e,0x00,0xd6,0xd4,0x6a,0xf4,0x00,0x00
+# GFX12: v_cmpx_ge_i64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xd6,0xd4,0x6a,0xf4,0x00,0x00]
-# GFX12: v_cmpx_ge_i64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xd6,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xd6,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_ge_i64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xd6,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_ge_i64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xd6,0xd4,0x7e,0xfa,0x01,0x00]
 0x7e,0x00,0xd6,0xd4,0x7e,0xfa,0x01,0x00
+# GFX12: v_cmpx_ge_i64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xd6,0xd4,0x7e,0xfa,0x01,0x00]
-# GFX12: v_cmpx_ge_i64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xd6,0xd4,0x7c,0xe0,0x01,0x00]
 0x7e,0x00,0xd6,0xd4,0x7c,0xe0,0x01,0x00
+# GFX12: v_cmpx_ge_i64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xd6,0xd4,0x7c,0xe0,0x01,0x00]
-# GFX12: v_cmpx_ge_i64_e64 -1, -1 ; encoding: [0x7e,0x00,0xd6,0xd4,0xc1,0x82,0x01,0x00]
 0x7e,0x00,0xd6,0xd4,0xc1,0x82,0x01,0x00
+# GFX12: v_cmpx_ge_i64_e64 -1, -1 ; encoding: [0x7e,0x00,0xd6,0xd4,0xc1,0x82,0x01,0x00]
-# GFX12: v_cmpx_ge_i64_e64 0.5, null ; encoding: [0x7e,0x00,0xd6,0xd4,0xf0,0xf8,0x00,0x00]
 0x7e,0x00,0xd6,0xd4,0xf0,0xf8,0x00,0x00
+# GFX12: v_cmpx_ge_i64_e64 0.5, null ; encoding: [0x7e,0x00,0xd6,0xd4,0xf0,0xf8,0x00,0x00]
-# GFX12: v_cmpx_ge_i64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xd6,0xd4,0xfd,0xfc,0x00,0x00]
 0x7e,0x00,0xd6,0xd4,0xfd,0xfc,0x00,0x00
+# GFX12: v_cmpx_ge_i64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xd6,0xd4,0xfd,0xfc,0x00,0x00]
-# GFX12: v_cmpx_ge_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd6,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xd6,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_ge_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd6,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_ge_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbe,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xbe,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_ge_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbe,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_ge_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0xbe,0xd4,0xff,0xff,0x03,0x00
+# GFX12: v_cmpx_ge_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xff,0x03,0x00]
-# GFX12: v_cmpx_ge_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbe,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0xbe,0xd4,0x01,0x04,0x00,0x00
+# GFX12: v_cmpx_ge_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbe,0xd4,0x01,0x04,0x00,0x00]
-# GFX12: v_cmpx_ge_u16_e64 s105, s105 ; encoding: [0x7e,0x00,0xbe,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0xbe,0xd4,0x69,0xd2,0x00,0x00
+# GFX12: v_cmpx_ge_u16_e64 s105, s105 ; encoding: [0x7e,0x00,0xbe,0xd4,0x69,0xd2,0x00,0x00]
-# GFX12: v_cmpx_ge_u16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xbe,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0xbe,0xd4,0x6a,0xf6,0x00,0x00
+# GFX12: v_cmpx_ge_u16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xbe,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX12: v_cmpx_ge_u16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xbe,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x7e,0x00,0xbe,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_ge_u16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xbe,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_ge_u16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xbe,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0xbe,0xd4,0x7b,0xfa,0x01,0x00
+# GFX12: v_cmpx_ge_u16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xbe,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX12: v_cmpx_ge_u16_e64 m0, 0x3800
 0x7e,0x00,0xbe,0xd4,0x7d,0xe0,0x01,0x00
+# GFX12: v_cmpx_ge_u16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xbe,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# GFX12: v_cmpx_ge_u16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xbe,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0xbe,0xd4,0x7e,0x82,0x01,0x00
+# GFX12: v_cmpx_ge_u16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xbe,0xd4,0x7e,0x82,0x01,0x00]
-# GFX12: v_cmpx_ge_u16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xbe,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x00,0xbe,0xd4,0x7f,0xf8,0x00,0x00
+# GFX12: v_cmpx_ge_u16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xbe,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX12: v_cmpx_ge_u16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xbe,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0xbe,0xd4,0x7c,0xfc,0x00,0x00
+# GFX12: v_cmpx_ge_u16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xbe,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX12: v_cmpx_ge_u16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xbe,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0xbe,0xd4,0xc1,0xfe,0x00,0x00
+# GFX12: v_cmpx_ge_u16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xbe,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX12: v_cmpx_ge_u16_e64 0x3800, m0
 0x7e,0x00,0xbe,0xd4,0xf0,0xfa,0x00,0x00
+# GFX12: v_cmpx_ge_u16_e64 0x3800, m0 ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# GFX12: v_cmpx_ge_u16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xbe,0xd4,0xfd,0xd4,0x00,0x00]
 0x7e,0x00,0xbe,0xd4,0xfd,0xd4,0x00,0x00
+# GFX12: v_cmpx_ge_u16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xbe,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX12: v_cmpx_ge_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 0x7e,0x00,0xbe,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_ge_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_ge_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xce,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xce,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_ge_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xce,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_ge_u32_e64 v255, v255 ; encoding: [0x7e,0x00,0xce,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0xce,0xd4,0xff,0xff,0x03,0x00
+# GFX12: v_cmpx_ge_u32_e64 v255, v255 ; encoding: [0x7e,0x00,0xce,0xd4,0xff,0xff,0x03,0x00]
-# GFX12: v_cmpx_ge_u32_e64 s1, s2 ; encoding: [0x7e,0x00,0xce,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0xce,0xd4,0x01,0x04,0x00,0x00
+# GFX12: v_cmpx_ge_u32_e64 s1, s2 ; encoding: [0x7e,0x00,0xce,0xd4,0x01,0x04,0x00,0x00]
-# GFX12: v_cmpx_ge_u32_e64 s105, s105 ; encoding: [0x7e,0x00,0xce,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0xce,0xd4,0x69,0xd2,0x00,0x00
+# GFX12: v_cmpx_ge_u32_e64 s105, s105 ; encoding: [0x7e,0x00,0xce,0xd4,0x69,0xd2,0x00,0x00]
-# GFX12: v_cmpx_ge_u32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xce,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0xce,0xd4,0x6a,0xf6,0x00,0x00
+# GFX12: v_cmpx_ge_u32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xce,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX12: v_cmpx_ge_u32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xce,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xce,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_ge_u32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xce,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_ge_u32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xce,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0xce,0xd4,0x7b,0xfa,0x01,0x00
+# GFX12: v_cmpx_ge_u32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xce,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX12: v_cmpx_ge_u32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xce,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0xce,0xd4,0x7d,0xe0,0x01,0x00
+# GFX12: v_cmpx_ge_u32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xce,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX12: v_cmpx_ge_u32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xce,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0xce,0xd4,0x7e,0x82,0x01,0x00
+# GFX12: v_cmpx_ge_u32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xce,0xd4,0x7e,0x82,0x01,0x00]
-# GFX12: v_cmpx_ge_u32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xce,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x00,0xce,0xd4,0x7f,0xf8,0x00,0x00
+# GFX12: v_cmpx_ge_u32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xce,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX12: v_cmpx_ge_u32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xce,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0xce,0xd4,0x7c,0xfc,0x00,0x00
+# GFX12: v_cmpx_ge_u32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xce,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX12: v_cmpx_ge_u32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xce,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0xce,0xd4,0xc1,0xfe,0x00,0x00
+# GFX12: v_cmpx_ge_u32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xce,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX12: v_cmpx_ge_u32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xce,0xd4,0xf0,0xfa,0x00,0x00]
 0x7e,0x00,0xce,0xd4,0xf0,0xfa,0x00,0x00
+# GFX12: v_cmpx_ge_u32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xce,0xd4,0xf0,0xfa,0x00,0x00]
-# GFX12: v_cmpx_ge_u32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xce,0xd4,0xfd,0xd4,0x00,0x00]
 0x7e,0x00,0xce,0xd4,0xfd,0xd4,0x00,0x00
+# GFX12: v_cmpx_ge_u32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xce,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX12: v_cmpx_ge_u32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xce,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xce,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_ge_u32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xce,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_ge_u64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xde,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xde,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_ge_u64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xde,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_ge_u64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xde,0xd4,0xfe,0xfd,0x03,0x00]
 0x7e,0x00,0xde,0xd4,0xfe,0xfd,0x03,0x00
+# GFX12: v_cmpx_ge_u64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xde,0xd4,0xfe,0xfd,0x03,0x00]
-# GFX12: v_cmpx_ge_u64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xde,0xd4,0x02,0x08,0x00,0x00]
 0x7e,0x00,0xde,0xd4,0x02,0x08,0x00,0x00
+# GFX12: v_cmpx_ge_u64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xde,0xd4,0x02,0x08,0x00,0x00]
-# GFX12: v_cmpx_ge_u64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xde,0xd4,0x68,0xd0,0x00,0x00]
 0x7e,0x00,0xde,0xd4,0x68,0xd0,0x00,0x00
+# GFX12: v_cmpx_ge_u64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xde,0xd4,0x68,0xd0,0x00,0x00]
-# GFX12: v_cmpx_ge_u64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xde,0xd4,0x6a,0xf4,0x00,0x00]
 0x7e,0x00,0xde,0xd4,0x6a,0xf4,0x00,0x00
+# GFX12: v_cmpx_ge_u64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xde,0xd4,0x6a,0xf4,0x00,0x00]
-# GFX12: v_cmpx_ge_u64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xde,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xde,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_ge_u64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xde,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_ge_u64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xde,0xd4,0x7e,0xfa,0x01,0x00]
 0x7e,0x00,0xde,0xd4,0x7e,0xfa,0x01,0x00
+# GFX12: v_cmpx_ge_u64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xde,0xd4,0x7e,0xfa,0x01,0x00]
-# GFX12: v_cmpx_ge_u64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xde,0xd4,0x7c,0xe0,0x01,0x00]
 0x7e,0x00,0xde,0xd4,0x7c,0xe0,0x01,0x00
+# GFX12: v_cmpx_ge_u64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xde,0xd4,0x7c,0xe0,0x01,0x00]
-# GFX12: v_cmpx_ge_u64_e64 -1, -1 ; encoding: [0x7e,0x00,0xde,0xd4,0xc1,0x82,0x01,0x00]
 0x7e,0x00,0xde,0xd4,0xc1,0x82,0x01,0x00
+# GFX12: v_cmpx_ge_u64_e64 -1, -1 ; encoding: [0x7e,0x00,0xde,0xd4,0xc1,0x82,0x01,0x00]
-# GFX12: v_cmpx_ge_u64_e64 0.5, null ; encoding: [0x7e,0x00,0xde,0xd4,0xf0,0xf8,0x00,0x00]
 0x7e,0x00,0xde,0xd4,0xf0,0xf8,0x00,0x00
+# GFX12: v_cmpx_ge_u64_e64 0.5, null ; encoding: [0x7e,0x00,0xde,0xd4,0xf0,0xf8,0x00,0x00]
-# GFX12: v_cmpx_ge_u64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xde,0xd4,0xfd,0xfc,0x00,0x00]
 0x7e,0x00,0xde,0xd4,0xfd,0xfc,0x00,0x00
+# GFX12: v_cmpx_ge_u64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xde,0xd4,0xfd,0xfc,0x00,0x00]
-# GFX12: v_cmpx_ge_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xde,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xde,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_ge_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xde,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_gt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_gt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_gt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00
+# GFX12: v_cmpx_gt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00]
-# GFX12: v_cmpx_gt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0x84,0xd4,0x01,0x04,0x00,0x00
+# GFX12: v_cmpx_gt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x04,0x00,0x00]
-# GFX12: v_cmpx_gt_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x84,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0x84,0xd4,0x69,0xd2,0x00,0x00
+# GFX12: v_cmpx_gt_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x84,0xd4,0x69,0xd2,0x00,0x00]
-# GFX12: v_cmpx_gt_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x84,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0x84,0xd4,0x6a,0xf6,0x00,0x00
+# GFX12: v_cmpx_gt_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x84,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX12: v_cmpx_gt_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x84,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x7e,0x00,0x84,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_gt_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x84,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_gt_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x84,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0x84,0xd4,0x7b,0xfa,0x01,0x00
+# GFX12: v_cmpx_gt_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x84,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX12: v_cmpx_gt_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x84,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0x84,0xd4,0x7d,0xe0,0x01,0x00
+# GFX12: v_cmpx_gt_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x84,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX12: v_cmpx_gt_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x84,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0x84,0xd4,0x7e,0x82,0x01,0x00
+# GFX12: v_cmpx_gt_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x84,0xd4,0x7e,0x82,0x01,0x00]
-# GFX12: v_cmpx_gt_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x84,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x84,0xd4,0x7f,0xf8,0x00,0x00
+# GFX12: v_cmpx_gt_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x84,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX12: v_cmpx_gt_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x84,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x84,0xd4,0x7c,0xfc,0x00,0x00
+# GFX12: v_cmpx_gt_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x84,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX12: v_cmpx_gt_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x84,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x84,0xd4,0xc1,0xfe,0x00,0x00
+# GFX12: v_cmpx_gt_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x84,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX12: v_cmpx_gt_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x84,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x84,0xd4,0xf0,0xfa,0x00,0x40
+# GFX12: v_cmpx_gt_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x84,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX12: v_cmpx_gt_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x84,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x84,0xd4,0xfd,0xd4,0x00,0x20
+# GFX12: v_cmpx_gt_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x84,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX12: v_cmpx_gt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x84,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
 0x7e,0x83,0x84,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_gt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x84,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_gt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x94,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0x94,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_gt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x94,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_gt_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x94,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0x94,0xd4,0xff,0xff,0x03,0x00
+# GFX12: v_cmpx_gt_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x94,0xd4,0xff,0xff,0x03,0x00]
-# GFX12: v_cmpx_gt_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x94,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0x94,0xd4,0x01,0x04,0x00,0x00
+# GFX12: v_cmpx_gt_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x94,0xd4,0x01,0x04,0x00,0x00]
-# GFX12: v_cmpx_gt_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x94,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0x94,0xd4,0x69,0xd2,0x00,0x00
+# GFX12: v_cmpx_gt_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x94,0xd4,0x69,0xd2,0x00,0x00]
-# GFX12: v_cmpx_gt_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x94,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0x94,0xd4,0x6a,0xf6,0x00,0x00
+# GFX12: v_cmpx_gt_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x94,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX12: v_cmpx_gt_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x94,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0x94,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_gt_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x94,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_gt_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x94,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0x94,0xd4,0x7b,0xfa,0x01,0x00
+# GFX12: v_cmpx_gt_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x94,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX12: v_cmpx_gt_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x94,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0x94,0xd4,0x7d,0xe0,0x01,0x00
+# GFX12: v_cmpx_gt_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x94,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX12: v_cmpx_gt_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x94,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0x94,0xd4,0x7e,0x82,0x01,0x00
+# GFX12: v_cmpx_gt_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x94,0xd4,0x7e,0x82,0x01,0x00]
-# GFX12: v_cmpx_gt_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x94,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x94,0xd4,0x7f,0xf8,0x00,0x00
+# GFX12: v_cmpx_gt_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x94,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX12: v_cmpx_gt_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x94,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x94,0xd4,0x7c,0xfc,0x00,0x00
+# GFX12: v_cmpx_gt_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x94,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX12: v_cmpx_gt_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x94,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x94,0xd4,0xc1,0xfe,0x00,0x00
+# GFX12: v_cmpx_gt_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x94,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX12: v_cmpx_gt_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x94,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x94,0xd4,0xf0,0xfa,0x00,0x40
+# GFX12: v_cmpx_gt_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x94,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX12: v_cmpx_gt_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x94,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x94,0xd4,0xfd,0xd4,0x00,0x20
+# GFX12: v_cmpx_gt_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x94,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX12: v_cmpx_gt_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x94,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
 0x7e,0x83,0x94,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_gt_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x94,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_gt_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa4,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xa4,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_gt_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa4,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_gt_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa4,0xd4,0xfe,0xfd,0x03,0x00]
 0x7e,0x00,0xa4,0xd4,0xfe,0xfd,0x03,0x00
+# GFX12: v_cmpx_gt_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa4,0xd4,0xfe,0xfd,0x03,0x00]
-# GFX12: v_cmpx_gt_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa4,0xd4,0x02,0x08,0x00,0x00]
 0x7e,0x00,0xa4,0xd4,0x02,0x08,0x00,0x00
+# GFX12: v_cmpx_gt_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa4,0xd4,0x02,0x08,0x00,0x00]
-# GFX12: v_cmpx_gt_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa4,0xd4,0x68,0xd0,0x00,0x00]
 0x7e,0x00,0xa4,0xd4,0x68,0xd0,0x00,0x00
+# GFX12: v_cmpx_gt_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa4,0xd4,0x68,0xd0,0x00,0x00]
-# GFX12: v_cmpx_gt_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa4,0xd4,0x6a,0xf4,0x00,0x00]
 0x7e,0x00,0xa4,0xd4,0x6a,0xf4,0x00,0x00
+# GFX12: v_cmpx_gt_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa4,0xd4,0x6a,0xf4,0x00,0x00]
-# GFX12: v_cmpx_gt_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa4,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xa4,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_gt_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa4,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_gt_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa4,0xd4,0x7e,0xfa,0x01,0x20]
 0x7e,0x01,0xa4,0xd4,0x7e,0xfa,0x01,0x20
+# GFX12: v_cmpx_gt_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa4,0xd4,0x7e,0xfa,0x01,0x20]
-# GFX12: v_cmpx_gt_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa4,0xd4,0x7c,0xe0,0x01,0x00]
 0x7e,0x00,0xa4,0xd4,0x7c,0xe0,0x01,0x00
+# GFX12: v_cmpx_gt_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa4,0xd4,0x7c,0xe0,0x01,0x00]
-# GFX12: v_cmpx_gt_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa4,0xd4,0xc1,0x82,0x01,0x00]
 0x7e,0x00,0xa4,0xd4,0xc1,0x82,0x01,0x00
+# GFX12: v_cmpx_gt_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa4,0xd4,0xc1,0x82,0x01,0x00]
-# GFX12: v_cmpx_gt_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa4,0xd4,0xf0,0xf8,0x00,0x00]
 0x7e,0x00,0xa4,0xd4,0xf0,0xf8,0x00,0x00
+# GFX12: v_cmpx_gt_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa4,0xd4,0xf0,0xf8,0x00,0x00]
-# GFX12: v_cmpx_gt_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa4,0xd4,0xfd,0xfc,0x00,0x60]
 0x7e,0x03,0xa4,0xd4,0xfd,0xfc,0x00,0x60
+# GFX12: v_cmpx_gt_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa4,0xd4,0xfd,0xfc,0x00,0x60]
-# GFX12: v_cmpx_gt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa4,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
 0x7e,0x82,0xa4,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_gt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa4,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_gt_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb4,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xb4,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_gt_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb4,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_gt_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0xb4,0xd4,0xff,0xff,0x03,0x00
+# GFX12: v_cmpx_gt_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xff,0x03,0x00]
-# GFX12: v_cmpx_gt_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb4,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0xb4,0xd4,0x01,0x04,0x00,0x00
+# GFX12: v_cmpx_gt_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb4,0xd4,0x01,0x04,0x00,0x00]
-# GFX12: v_cmpx_gt_i16_e64 s105, s105 ; encoding: [0x7e,0x00,0xb4,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0xb4,0xd4,0x69,0xd2,0x00,0x00
+# GFX12: v_cmpx_gt_i16_e64 s105, s105 ; encoding: [0x7e,0x00,0xb4,0xd4,0x69,0xd2,0x00,0x00]
-# GFX12: v_cmpx_gt_i16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xb4,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0xb4,0xd4,0x6a,0xf6,0x00,0x00
+# GFX12: v_cmpx_gt_i16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xb4,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX12: v_cmpx_gt_i16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xb4,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x7e,0x00,0xb4,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_gt_i16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xb4,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_gt_i16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xb4,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0xb4,0xd4,0x7b,0xfa,0x01,0x00
+# GFX12: v_cmpx_gt_i16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xb4,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX12: v_cmpx_gt_i16_e64 m0, 0x3800
 0x7e,0x00,0xb4,0xd4,0x7d,0xe0,0x01,0x00
+# GFX12: v_cmpx_gt_i16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xb4,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# GFX12: v_cmpx_gt_i16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xb4,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0xb4,0xd4,0x7e,0x82,0x01,0x00
+# GFX12: v_cmpx_gt_i16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xb4,0xd4,0x7e,0x82,0x01,0x00]
-# GFX12: v_cmpx_gt_i16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xb4,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x00,0xb4,0xd4,0x7f,0xf8,0x00,0x00
+# GFX12: v_cmpx_gt_i16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xb4,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX12: v_cmpx_gt_i16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xb4,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0xb4,0xd4,0x7c,0xfc,0x00,0x00
+# GFX12: v_cmpx_gt_i16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xb4,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX12: v_cmpx_gt_i16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xb4,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0xb4,0xd4,0xc1,0xfe,0x00,0x00
+# GFX12: v_cmpx_gt_i16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xb4,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX12: v_cmpx_gt_i16_e64 0x3800, m0
 0x7e,0x00,0xb4,0xd4,0xf0,0xfa,0x00,0x00
+# GFX12: v_cmpx_gt_i16_e64 0x3800, m0 ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# GFX12: v_cmpx_gt_i16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xb4,0xd4,0xfd,0xd4,0x00,0x00]
 0x7e,0x00,0xb4,0xd4,0xfd,0xd4,0x00,0x00
+# GFX12: v_cmpx_gt_i16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xb4,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX12: v_cmpx_gt_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 0x7e,0x00,0xb4,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_gt_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_gt_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc4,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xc4,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_gt_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc4,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_gt_i32_e64 v255, v255 ; encoding: [0x7e,0x00,0xc4,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0xc4,0xd4,0xff,0xff,0x03,0x00
+# GFX12: v_cmpx_gt_i32_e64 v255, v255 ; encoding: [0x7e,0x00,0xc4,0xd4,0xff,0xff,0x03,0x00]
-# GFX12: v_cmpx_gt_i32_e64 s1, s2 ; encoding: [0x7e,0x00,0xc4,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0xc4,0xd4,0x01,0x04,0x00,0x00
+# GFX12: v_cmpx_gt_i32_e64 s1, s2 ; encoding: [0x7e,0x00,0xc4,0xd4,0x01,0x04,0x00,0x00]
-# GFX12: v_cmpx_gt_i32_e64 s105, s105 ; encoding: [0x7e,0x00,0xc4,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0xc4,0xd4,0x69,0xd2,0x00,0x00
+# GFX12: v_cmpx_gt_i32_e64 s105, s105 ; encoding: [0x7e,0x00,0xc4,0xd4,0x69,0xd2,0x00,0x00]
-# GFX12: v_cmpx_gt_i32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xc4,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0xc4,0xd4,0x6a,0xf6,0x00,0x00
+# GFX12: v_cmpx_gt_i32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xc4,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX12: v_cmpx_gt_i32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xc4,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xc4,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_gt_i32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xc4,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_gt_i32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xc4,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0xc4,0xd4,0x7b,0xfa,0x01,0x00
+# GFX12: v_cmpx_gt_i32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xc4,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX12: v_cmpx_gt_i32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xc4,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0xc4,0xd4,0x7d,0xe0,0x01,0x00
+# GFX12: v_cmpx_gt_i32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xc4,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX12: v_cmpx_gt_i32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xc4,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0xc4,0xd4,0x7e,0x82,0x01,0x00
+# GFX12: v_cmpx_gt_i32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xc4,0xd4,0x7e,0x82,0x01,0x00]
-# GFX12: v_cmpx_gt_i32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xc4,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x00,0xc4,0xd4,0x7f,0xf8,0x00,0x00
+# GFX12: v_cmpx_gt_i32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xc4,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX12: v_cmpx_gt_i32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xc4,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0xc4,0xd4,0x7c,0xfc,0x00,0x00
+# GFX12: v_cmpx_gt_i32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xc4,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX12: v_cmpx_gt_i32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xc4,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0xc4,0xd4,0xc1,0xfe,0x00,0x00
+# GFX12: v_cmpx_gt_i32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xc4,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX12: v_cmpx_gt_i32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xc4,0xd4,0xf0,0xfa,0x00,0x00]
 0x7e,0x00,0xc4,0xd4,0xf0,0xfa,0x00,0x00
+# GFX12: v_cmpx_gt_i32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xc4,0xd4,0xf0,0xfa,0x00,0x00]
-# GFX12: v_cmpx_gt_i32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xc4,0xd4,0xfd,0xd4,0x00,0x00]
 0x7e,0x00,0xc4,0xd4,0xfd,0xd4,0x00,0x00
+# GFX12: v_cmpx_gt_i32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xc4,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX12: v_cmpx_gt_i32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xc4,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xc4,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_gt_i32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xc4,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_gt_i64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xd4,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xd4,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_gt_i64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xd4,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_gt_i64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xd4,0xd4,0xfe,0xfd,0x03,0x00]
 0x7e,0x00,0xd4,0xd4,0xfe,0xfd,0x03,0x00
+# GFX12: v_cmpx_gt_i64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xd4,0xd4,0xfe,0xfd,0x03,0x00]
-# GFX12: v_cmpx_gt_i64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xd4,0xd4,0x02,0x08,0x00,0x00]
 0x7e,0x00,0xd4,0xd4,0x02,0x08,0x00,0x00
+# GFX12: v_cmpx_gt_i64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xd4,0xd4,0x02,0x08,0x00,0x00]
-# GFX12: v_cmpx_gt_i64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xd4,0xd4,0x68,0xd0,0x00,0x00]
 0x7e,0x00,0xd4,0xd4,0x68,0xd0,0x00,0x00
+# GFX12: v_cmpx_gt_i64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xd4,0xd4,0x68,0xd0,0x00,0x00]
-# GFX12: v_cmpx_gt_i64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xd4,0xd4,0x6a,0xf4,0x00,0x00]
 0x7e,0x00,0xd4,0xd4,0x6a,0xf4,0x00,0x00
+# GFX12: v_cmpx_gt_i64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xd4,0xd4,0x6a,0xf4,0x00,0x00]
-# GFX12: v_cmpx_gt_i64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xd4,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xd4,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_gt_i64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xd4,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_gt_i64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xd4,0xd4,0x7e,0xfa,0x01,0x00]
 0x7e,0x00,0xd4,0xd4,0x7e,0xfa,0x01,0x00
+# GFX12: v_cmpx_gt_i64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xd4,0xd4,0x7e,0xfa,0x01,0x00]
-# GFX12: v_cmpx_gt_i64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xd4,0xd4,0x7c,0xe0,0x01,0x00]
 0x7e,0x00,0xd4,0xd4,0x7c,0xe0,0x01,0x00
+# GFX12: v_cmpx_gt_i64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xd4,0xd4,0x7c,0xe0,0x01,0x00]
-# GFX12: v_cmpx_gt_i64_e64 -1, -1 ; encoding: [0x7e,0x00,0xd4,0xd4,0xc1,0x82,0x01,0x00]
 0x7e,0x00,0xd4,0xd4,0xc1,0x82,0x01,0x00
+# GFX12: v_cmpx_gt_i64_e64 -1, -1 ; encoding: [0x7e,0x00,0xd4,0xd4,0xc1,0x82,0x01,0x00]
-# GFX12: v_cmpx_gt_i64_e64 0.5, null ; encoding: [0x7e,0x00,0xd4,0xd4,0xf0,0xf8,0x00,0x00]
 0x7e,0x00,0xd4,0xd4,0xf0,0xf8,0x00,0x00
+# GFX12: v_cmpx_gt_i64_e64 0.5, null ; encoding: [0x7e,0x00,0xd4,0xd4,0xf0,0xf8,0x00,0x00]
-# GFX12: v_cmpx_gt_i64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xd4,0xd4,0xfd,0xfc,0x00,0x00]
 0x7e,0x00,0xd4,0xd4,0xfd,0xfc,0x00,0x00
+# GFX12: v_cmpx_gt_i64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xd4,0xd4,0xfd,0xfc,0x00,0x00]
-# GFX12: v_cmpx_gt_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd4,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xd4,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_gt_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd4,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_gt_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbc,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xbc,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_gt_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbc,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_gt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0xbc,0xd4,0xff,0xff,0x03,0x00
+# GFX12: v_cmpx_gt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xff,0x03,0x00]
-# GFX12: v_cmpx_gt_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbc,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0xbc,0xd4,0x01,0x04,0x00,0x00
+# GFX12: v_cmpx_gt_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbc,0xd4,0x01,0x04,0x00,0x00]
-# GFX12: v_cmpx_gt_u16_e64 s105, s105 ; encoding: [0x7e,0x00,0xbc,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0xbc,0xd4,0x69,0xd2,0x00,0x00
+# GFX12: v_cmpx_gt_u16_e64 s105, s105 ; encoding: [0x7e,0x00,0xbc,0xd4,0x69,0xd2,0x00,0x00]
-# GFX12: v_cmpx_gt_u16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xbc,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0xbc,0xd4,0x6a,0xf6,0x00,0x00
+# GFX12: v_cmpx_gt_u16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xbc,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX12: v_cmpx_gt_u16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xbc,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x7e,0x00,0xbc,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_gt_u16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xbc,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_gt_u16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xbc,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0xbc,0xd4,0x7b,0xfa,0x01,0x00
+# GFX12: v_cmpx_gt_u16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xbc,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX12: v_cmpx_gt_u16_e64 m0, 0x3800
 0x7e,0x00,0xbc,0xd4,0x7d,0xe0,0x01,0x00
+# GFX12: v_cmpx_gt_u16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xbc,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# GFX12: v_cmpx_gt_u16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xbc,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0xbc,0xd4,0x7e,0x82,0x01,0x00
+# GFX12: v_cmpx_gt_u16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xbc,0xd4,0x7e,0x82,0x01,0x00]
-# GFX12: v_cmpx_gt_u16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xbc,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x00,0xbc,0xd4,0x7f,0xf8,0x00,0x00
+# GFX12: v_cmpx_gt_u16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xbc,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX12: v_cmpx_gt_u16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xbc,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0xbc,0xd4,0x7c,0xfc,0x00,0x00
+# GFX12: v_cmpx_gt_u16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xbc,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX12: v_cmpx_gt_u16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xbc,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0xbc,0xd4,0xc1,0xfe,0x00,0x00
+# GFX12: v_cmpx_gt_u16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xbc,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX12: v_cmpx_gt_u16_e64 0x3800, m0
 0x7e,0x00,0xbc,0xd4,0xf0,0xfa,0x00,0x00
+# GFX12: v_cmpx_gt_u16_e64 0x3800, m0 ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# GFX12: v_cmpx_gt_u16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xbc,0xd4,0xfd,0xd4,0x00,0x00]
 0x7e,0x00,0xbc,0xd4,0xfd,0xd4,0x00,0x00
+# GFX12: v_cmpx_gt_u16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xbc,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX12: v_cmpx_gt_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 0x7e,0x00,0xbc,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_gt_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_gt_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcc,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xcc,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_gt_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcc,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_gt_u32_e64 v255, v255 ; encoding: [0x7e,0x00,0xcc,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0xcc,0xd4,0xff,0xff,0x03,0x00
+# GFX12: v_cmpx_gt_u32_e64 v255, v255 ; encoding: [0x7e,0x00,0xcc,0xd4,0xff,0xff,0x03,0x00]
-# GFX12: v_cmpx_gt_u32_e64 s1, s2 ; encoding: [0x7e,0x00,0xcc,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0xcc,0xd4,0x01,0x04,0x00,0x00
+# GFX12: v_cmpx_gt_u32_e64 s1, s2 ; encoding: [0x7e,0x00,0xcc,0xd4,0x01,0x04,0x00,0x00]
-# GFX12: v_cmpx_gt_u32_e64 s105, s105 ; encoding: [0x7e,0x00,0xcc,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0xcc,0xd4,0x69,0xd2,0x00,0x00
+# GFX12: v_cmpx_gt_u32_e64 s105, s105 ; encoding: [0x7e,0x00,0xcc,0xd4,0x69,0xd2,0x00,0x00]
-# GFX12: v_cmpx_gt_u32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xcc,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0xcc,0xd4,0x6a,0xf6,0x00,0x00
+# GFX12: v_cmpx_gt_u32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xcc,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX12: v_cmpx_gt_u32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xcc,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xcc,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_gt_u32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xcc,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_gt_u32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xcc,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0xcc,0xd4,0x7b,0xfa,0x01,0x00
+# GFX12: v_cmpx_gt_u32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xcc,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX12: v_cmpx_gt_u32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xcc,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0xcc,0xd4,0x7d,0xe0,0x01,0x00
+# GFX12: v_cmpx_gt_u32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xcc,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX12: v_cmpx_gt_u32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xcc,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0xcc,0xd4,0x7e,0x82,0x01,0x00
+# GFX12: v_cmpx_gt_u32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xcc,0xd4,0x7e,0x82,0x01,0x00]
-# GFX12: v_cmpx_gt_u32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xcc,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x00,0xcc,0xd4,0x7f,0xf8,0x00,0x00
+# GFX12: v_cmpx_gt_u32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xcc,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX12: v_cmpx_gt_u32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xcc,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0xcc,0xd4,0x7c,0xfc,0x00,0x00
+# GFX12: v_cmpx_gt_u32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xcc,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX12: v_cmpx_gt_u32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xcc,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0xcc,0xd4,0xc1,0xfe,0x00,0x00
+# GFX12: v_cmpx_gt_u32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xcc,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX12: v_cmpx_gt_u32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xcc,0xd4,0xf0,0xfa,0x00,0x00]
 0x7e,0x00,0xcc,0xd4,0xf0,0xfa,0x00,0x00
+# GFX12: v_cmpx_gt_u32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xcc,0xd4,0xf0,0xfa,0x00,0x00]
-# GFX12: v_cmpx_gt_u32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xcc,0xd4,0xfd,0xd4,0x00,0x00]
 0x7e,0x00,0xcc,0xd4,0xfd,0xd4,0x00,0x00
+# GFX12: v_cmpx_gt_u32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xcc,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX12: v_cmpx_gt_u32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xcc,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xcc,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_gt_u32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xcc,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_gt_u64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xdc,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xdc,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_gt_u64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xdc,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_gt_u64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xdc,0xd4,0xfe,0xfd,0x03,0x00]
 0x7e,0x00,0xdc,0xd4,0xfe,0xfd,0x03,0x00
+# GFX12: v_cmpx_gt_u64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xdc,0xd4,0xfe,0xfd,0x03,0x00]
-# GFX12: v_cmpx_gt_u64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xdc,0xd4,0x02,0x08,0x00,0x00]
 0x7e,0x00,0xdc,0xd4,0x02,0x08,0x00,0x00
+# GFX12: v_cmpx_gt_u64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xdc,0xd4,0x02,0x08,0x00,0x00]
-# GFX12: v_cmpx_gt_u64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xdc,0xd4,0x68,0xd0,0x00,0x00]
 0x7e,0x00,0xdc,0xd4,0x68,0xd0,0x00,0x00
+# GFX12: v_cmpx_gt_u64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xdc,0xd4,0x68,0xd0,0x00,0x00]
-# GFX12: v_cmpx_gt_u64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xdc,0xd4,0x6a,0xf4,0x00,0x00]
 0x7e,0x00,0xdc,0xd4,0x6a,0xf4,0x00,0x00
+# GFX12: v_cmpx_gt_u64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xdc,0xd4,0x6a,0xf4,0x00,0x00]
-# GFX12: v_cmpx_gt_u64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xdc,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xdc,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_gt_u64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xdc,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_gt_u64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xdc,0xd4,0x7e,0xfa,0x01,0x00]
 0x7e,0x00,0xdc,0xd4,0x7e,0xfa,0x01,0x00
+# GFX12: v_cmpx_gt_u64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xdc,0xd4,0x7e,0xfa,0x01,0x00]
-# GFX12: v_cmpx_gt_u64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xdc,0xd4,0x7c,0xe0,0x01,0x00]
 0x7e,0x00,0xdc,0xd4,0x7c,0xe0,0x01,0x00
+# GFX12: v_cmpx_gt_u64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xdc,0xd4,0x7c,0xe0,0x01,0x00]
-# GFX12: v_cmpx_gt_u64_e64 -1, -1 ; encoding: [0x7e,0x00,0xdc,0xd4,0xc1,0x82,0x01,0x00]
 0x7e,0x00,0xdc,0xd4,0xc1,0x82,0x01,0x00
+# GFX12: v_cmpx_gt_u64_e64 -1, -1 ; encoding: [0x7e,0x00,0xdc,0xd4,0xc1,0x82,0x01,0x00]
-# GFX12: v_cmpx_gt_u64_e64 0.5, null ; encoding: [0x7e,0x00,0xdc,0xd4,0xf0,0xf8,0x00,0x00]
 0x7e,0x00,0xdc,0xd4,0xf0,0xf8,0x00,0x00
+# GFX12: v_cmpx_gt_u64_e64 0.5, null ; encoding: [0x7e,0x00,0xdc,0xd4,0xf0,0xf8,0x00,0x00]
-# GFX12: v_cmpx_gt_u64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xdc,0xd4,0xfd,0xfc,0x00,0x00]
 0x7e,0x00,0xdc,0xd4,0xfd,0xfc,0x00,0x00
+# GFX12: v_cmpx_gt_u64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xdc,0xd4,0xfd,0xfc,0x00,0x00]
-# GFX12: v_cmpx_gt_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdc,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xdc,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_gt_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdc,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_le_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_le_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_le_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00
+# GFX12: v_cmpx_le_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00]
-# GFX12: v_cmpx_le_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0x83,0xd4,0x01,0x04,0x00,0x00
+# GFX12: v_cmpx_le_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x04,0x00,0x00]
-# GFX12: v_cmpx_le_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x83,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0x83,0xd4,0x69,0xd2,0x00,0x00
+# GFX12: v_cmpx_le_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x83,0xd4,0x69,0xd2,0x00,0x00]
-# GFX12: v_cmpx_le_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x83,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0x83,0xd4,0x6a,0xf6,0x00,0x00
+# GFX12: v_cmpx_le_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x83,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX12: v_cmpx_le_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x83,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x7e,0x00,0x83,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_le_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x83,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_le_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x83,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0x83,0xd4,0x7b,0xfa,0x01,0x00
+# GFX12: v_cmpx_le_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x83,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX12: v_cmpx_le_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x83,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0x83,0xd4,0x7d,0xe0,0x01,0x00
+# GFX12: v_cmpx_le_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x83,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX12: v_cmpx_le_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x83,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0x83,0xd4,0x7e,0x82,0x01,0x00
+# GFX12: v_cmpx_le_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x83,0xd4,0x7e,0x82,0x01,0x00]
-# GFX12: v_cmpx_le_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x83,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x83,0xd4,0x7f,0xf8,0x00,0x00
+# GFX12: v_cmpx_le_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x83,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX12: v_cmpx_le_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x83,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x83,0xd4,0x7c,0xfc,0x00,0x00
+# GFX12: v_cmpx_le_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x83,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX12: v_cmpx_le_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x83,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x83,0xd4,0xc1,0xfe,0x00,0x00
+# GFX12: v_cmpx_le_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x83,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX12: v_cmpx_le_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x83,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x83,0xd4,0xf0,0xfa,0x00,0x40
+# GFX12: v_cmpx_le_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x83,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX12: v_cmpx_le_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x83,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x83,0xd4,0xfd,0xd4,0x00,0x20
+# GFX12: v_cmpx_le_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x83,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX12: v_cmpx_le_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x83,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
 0x7e,0x83,0x83,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_le_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x83,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_le_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x93,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0x93,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_le_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x93,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_le_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x93,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0x93,0xd4,0xff,0xff,0x03,0x00
+# GFX12: v_cmpx_le_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x93,0xd4,0xff,0xff,0x03,0x00]
-# GFX12: v_cmpx_le_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x93,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0x93,0xd4,0x01,0x04,0x00,0x00
+# GFX12: v_cmpx_le_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x93,0xd4,0x01,0x04,0x00,0x00]
-# GFX12: v_cmpx_le_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x93,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0x93,0xd4,0x69,0xd2,0x00,0x00
+# GFX12: v_cmpx_le_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x93,0xd4,0x69,0xd2,0x00,0x00]
-# GFX12: v_cmpx_le_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x93,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0x93,0xd4,0x6a,0xf6,0x00,0x00
+# GFX12: v_cmpx_le_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x93,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX12: v_cmpx_le_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x93,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0x93,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_le_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x93,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_le_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x93,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0x93,0xd4,0x7b,0xfa,0x01,0x00
+# GFX12: v_cmpx_le_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x93,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX12: v_cmpx_le_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x93,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0x93,0xd4,0x7d,0xe0,0x01,0x00
+# GFX12: v_cmpx_le_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x93,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX12: v_cmpx_le_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x93,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0x93,0xd4,0x7e,0x82,0x01,0x00
+# GFX12: v_cmpx_le_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x93,0xd4,0x7e,0x82,0x01,0x00]
-# GFX12: v_cmpx_le_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x93,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x93,0xd4,0x7f,0xf8,0x00,0x00
+# GFX12: v_cmpx_le_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x93,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX12: v_cmpx_le_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x93,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x93,0xd4,0x7c,0xfc,0x00,0x00
+# GFX12: v_cmpx_le_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x93,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX12: v_cmpx_le_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x93,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x93,0xd4,0xc1,0xfe,0x00,0x00
+# GFX12: v_cmpx_le_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x93,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX12: v_cmpx_le_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x93,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x93,0xd4,0xf0,0xfa,0x00,0x40
+# GFX12: v_cmpx_le_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x93,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX12: v_cmpx_le_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x93,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x93,0xd4,0xfd,0xd4,0x00,0x20
+# GFX12: v_cmpx_le_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x93,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX12: v_cmpx_le_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x93,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
 0x7e,0x83,0x93,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_le_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x93,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_le_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa3,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xa3,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_le_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa3,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_le_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa3,0xd4,0xfe,0xfd,0x03,0x00]
 0x7e,0x00,0xa3,0xd4,0xfe,0xfd,0x03,0x00
+# GFX12: v_cmpx_le_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa3,0xd4,0xfe,0xfd,0x03,0x00]
-# GFX12: v_cmpx_le_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa3,0xd4,0x02,0x08,0x00,0x00]
 0x7e,0x00,0xa3,0xd4,0x02,0x08,0x00,0x00
+# GFX12: v_cmpx_le_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa3,0xd4,0x02,0x08,0x00,0x00]
-# GFX12: v_cmpx_le_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa3,0xd4,0x68,0xd0,0x00,0x00]
 0x7e,0x00,0xa3,0xd4,0x68,0xd0,0x00,0x00
+# GFX12: v_cmpx_le_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa3,0xd4,0x68,0xd0,0x00,0x00]
-# GFX12: v_cmpx_le_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa3,0xd4,0x6a,0xf4,0x00,0x00]
 0x7e,0x00,0xa3,0xd4,0x6a,0xf4,0x00,0x00
+# GFX12: v_cmpx_le_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa3,0xd4,0x6a,0xf4,0x00,0x00]
-# GFX12: v_cmpx_le_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa3,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xa3,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_le_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa3,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_le_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa3,0xd4,0x7e,0xfa,0x01,0x20]
 0x7e,0x01,0xa3,0xd4,0x7e,0xfa,0x01,0x20
+# GFX12: v_cmpx_le_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa3,0xd4,0x7e,0xfa,0x01,0x20]
-# GFX12: v_cmpx_le_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa3,0xd4,0x7c,0xe0,0x01,0x00]
 0x7e,0x00,0xa3,0xd4,0x7c,0xe0,0x01,0x00
+# GFX12: v_cmpx_le_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa3,0xd4,0x7c,0xe0,0x01,0x00]
-# GFX12: v_cmpx_le_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa3,0xd4,0xc1,0x82,0x01,0x00]
 0x7e,0x00,0xa3,0xd4,0xc1,0x82,0x01,0x00
+# GFX12: v_cmpx_le_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa3,0xd4,0xc1,0x82,0x01,0x00]
-# GFX12: v_cmpx_le_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa3,0xd4,0xf0,0xf8,0x00,0x00]
 0x7e,0x00,0xa3,0xd4,0xf0,0xf8,0x00,0x00
+# GFX12: v_cmpx_le_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa3,0xd4,0xf0,0xf8,0x00,0x00]
-# GFX12: v_cmpx_le_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa3,0xd4,0xfd,0xfc,0x00,0x60]
 0x7e,0x03,0xa3,0xd4,0xfd,0xfc,0x00,0x60
+# GFX12: v_cmpx_le_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa3,0xd4,0xfd,0xfc,0x00,0x60]
-# GFX12: v_cmpx_le_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa3,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
 0x7e,0x82,0xa3,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_le_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa3,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_le_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb3,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xb3,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_le_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb3,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_le_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0xb3,0xd4,0xff,0xff,0x03,0x00
+# GFX12: v_cmpx_le_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xff,0x03,0x00]
-# GFX12: v_cmpx_le_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb3,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0xb3,0xd4,0x01,0x04,0x00,0x00
+# GFX12: v_cmpx_le_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb3,0xd4,0x01,0x04,0x00,0x00]
-# GFX12: v_cmpx_le_i16_e64 s105, s105 ; encoding: [0x7e,0x00,0xb3,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0xb3,0xd4,0x69,0xd2,0x00,0x00
+# GFX12: v_cmpx_le_i16_e64 s105, s105 ; encoding: [0x7e,0x00,0xb3,0xd4,0x69,0xd2,0x00,0x00]
-# GFX12: v_cmpx_le_i16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xb3,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0xb3,0xd4,0x6a,0xf6,0x00,0x00
+# GFX12: v_cmpx_le_i16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xb3,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX12: v_cmpx_le_i16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xb3,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x7e,0x00,0xb3,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_le_i16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xb3,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_le_i16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xb3,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0xb3,0xd4,0x7b,0xfa,0x01,0x00
+# GFX12: v_cmpx_le_i16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xb3,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX12: v_cmpx_le_i16_e64 m0, 0x3800
 0x7e,0x00,0xb3,0xd4,0x7d,0xe0,0x01,0x00
+# GFX12: v_cmpx_le_i16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xb3,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# GFX12: v_cmpx_le_i16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xb3,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0xb3,0xd4,0x7e,0x82,0x01,0x00
+# GFX12: v_cmpx_le_i16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xb3,0xd4,0x7e,0x82,0x01,0x00]
-# GFX12: v_cmpx_le_i16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xb3,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x00,0xb3,0xd4,0x7f,0xf8,0x00,0x00
+# GFX12: v_cmpx_le_i16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xb3,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX12: v_cmpx_le_i16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xb3,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0xb3,0xd4,0x7c,0xfc,0x00,0x00
+# GFX12: v_cmpx_le_i16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xb3,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX12: v_cmpx_le_i16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xb3,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0xb3,0xd4,0xc1,0xfe,0x00,0x00
+# GFX12: v_cmpx_le_i16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xb3,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX12: v_cmpx_le_i16_e64 0x3800, m0
 0x7e,0x00,0xb3,0xd4,0xf0,0xfa,0x00,0x00
+# GFX12: v_cmpx_le_i16_e64 0x3800, m0 ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# GFX12: v_cmpx_le_i16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xb3,0xd4,0xfd,0xd4,0x00,0x00]
 0x7e,0x00,0xb3,0xd4,0xfd,0xd4,0x00,0x00
+# GFX12: v_cmpx_le_i16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xb3,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX12: v_cmpx_le_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 0x7e,0x00,0xb3,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_le_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_le_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc3,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xc3,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_le_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc3,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_le_i32_e64 v255, v255 ; encoding: [0x7e,0x00,0xc3,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0xc3,0xd4,0xff,0xff,0x03,0x00
+# GFX12: v_cmpx_le_i32_e64 v255, v255 ; encoding: [0x7e,0x00,0xc3,0xd4,0xff,0xff,0x03,0x00]
-# GFX12: v_cmpx_le_i32_e64 s1, s2 ; encoding: [0x7e,0x00,0xc3,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0xc3,0xd4,0x01,0x04,0x00,0x00
+# GFX12: v_cmpx_le_i32_e64 s1, s2 ; encoding: [0x7e,0x00,0xc3,0xd4,0x01,0x04,0x00,0x00]
-# GFX12: v_cmpx_le_i32_e64 s105, s105 ; encoding: [0x7e,0x00,0xc3,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0xc3,0xd4,0x69,0xd2,0x00,0x00
+# GFX12: v_cmpx_le_i32_e64 s105, s105 ; encoding: [0x7e,0x00,0xc3,0xd4,0x69,0xd2,0x00,0x00]
-# GFX12: v_cmpx_le_i32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xc3,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0xc3,0xd4,0x6a,0xf6,0x00,0x00
+# GFX12: v_cmpx_le_i32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xc3,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX12: v_cmpx_le_i32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xc3,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xc3,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_le_i32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xc3,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_le_i32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xc3,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0xc3,0xd4,0x7b,0xfa,0x01,0x00
+# GFX12: v_cmpx_le_i32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xc3,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX12: v_cmpx_le_i32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xc3,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0xc3,0xd4,0x7d,0xe0,0x01,0x00
+# GFX12: v_cmpx_le_i32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xc3,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX12: v_cmpx_le_i32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xc3,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0xc3,0xd4,0x7e,0x82,0x01,0x00
+# GFX12: v_cmpx_le_i32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xc3,0xd4,0x7e,0x82,0x01,0x00]
-# GFX12: v_cmpx_le_i32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xc3,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x00,0xc3,0xd4,0x7f,0xf8,0x00,0x00
+# GFX12: v_cmpx_le_i32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xc3,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX12: v_cmpx_le_i32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xc3,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0xc3,0xd4,0x7c,0xfc,0x00,0x00
+# GFX12: v_cmpx_le_i32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xc3,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX12: v_cmpx_le_i32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xc3,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0xc3,0xd4,0xc1,0xfe,0x00,0x00
+# GFX12: v_cmpx_le_i32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xc3,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX12: v_cmpx_le_i32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xc3,0xd4,0xf0,0xfa,0x00,0x00]
 0x7e,0x00,0xc3,0xd4,0xf0,0xfa,0x00,0x00
+# GFX12: v_cmpx_le_i32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xc3,0xd4,0xf0,0xfa,0x00,0x00]
-# GFX12: v_cmpx_le_i32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xc3,0xd4,0xfd,0xd4,0x00,0x00]
 0x7e,0x00,0xc3,0xd4,0xfd,0xd4,0x00,0x00
+# GFX12: v_cmpx_le_i32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xc3,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX12: v_cmpx_le_i32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xc3,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xc3,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_le_i32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xc3,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_le_i64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xd3,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xd3,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_le_i64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xd3,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_le_i64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xd3,0xd4,0xfe,0xfd,0x03,0x00]
 0x7e,0x00,0xd3,0xd4,0xfe,0xfd,0x03,0x00
+# GFX12: v_cmpx_le_i64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xd3,0xd4,0xfe,0xfd,0x03,0x00]
-# GFX12: v_cmpx_le_i64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xd3,0xd4,0x02,0x08,0x00,0x00]
 0x7e,0x00,0xd3,0xd4,0x02,0x08,0x00,0x00
+# GFX12: v_cmpx_le_i64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xd3,0xd4,0x02,0x08,0x00,0x00]
-# GFX12: v_cmpx_le_i64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xd3,0xd4,0x68,0xd0,0x00,0x00]
 0x7e,0x00,0xd3,0xd4,0x68,0xd0,0x00,0x00
+# GFX12: v_cmpx_le_i64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xd3,0xd4,0x68,0xd0,0x00,0x00]
-# GFX12: v_cmpx_le_i64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xd3,0xd4,0x6a,0xf4,0x00,0x00]
 0x7e,0x00,0xd3,0xd4,0x6a,0xf4,0x00,0x00
+# GFX12: v_cmpx_le_i64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xd3,0xd4,0x6a,0xf4,0x00,0x00]
-# GFX12: v_cmpx_le_i64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xd3,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xd3,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_le_i64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xd3,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_le_i64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xd3,0xd4,0x7e,0xfa,0x01,0x00]
 0x7e,0x00,0xd3,0xd4,0x7e,0xfa,0x01,0x00
+# GFX12: v_cmpx_le_i64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xd3,0xd4,0x7e,0xfa,0x01,0x00]
-# GFX12: v_cmpx_le_i64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xd3,0xd4,0x7c,0xe0,0x01,0x00]
 0x7e,0x00,0xd3,0xd4,0x7c,0xe0,0x01,0x00
+# GFX12: v_cmpx_le_i64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xd3,0xd4,0x7c,0xe0,0x01,0x00]
-# GFX12: v_cmpx_le_i64_e64 -1, -1 ; encoding: [0x7e,0x00,0xd3,0xd4,0xc1,0x82,0x01,0x00]
 0x7e,0x00,0xd3,0xd4,0xc1,0x82,0x01,0x00
+# GFX12: v_cmpx_le_i64_e64 -1, -1 ; encoding: [0x7e,0x00,0xd3,0xd4,0xc1,0x82,0x01,0x00]
-# GFX12: v_cmpx_le_i64_e64 0.5, null ; encoding: [0x7e,0x00,0xd3,0xd4,0xf0,0xf8,0x00,0x00]
 0x7e,0x00,0xd3,0xd4,0xf0,0xf8,0x00,0x00
+# GFX12: v_cmpx_le_i64_e64 0.5, null ; encoding: [0x7e,0x00,0xd3,0xd4,0xf0,0xf8,0x00,0x00]
-# GFX12: v_cmpx_le_i64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xd3,0xd4,0xfd,0xfc,0x00,0x00]
 0x7e,0x00,0xd3,0xd4,0xfd,0xfc,0x00,0x00
+# GFX12: v_cmpx_le_i64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xd3,0xd4,0xfd,0xfc,0x00,0x00]
-# GFX12: v_cmpx_le_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd3,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xd3,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_le_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd3,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_le_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbb,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xbb,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_le_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbb,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_le_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0xbb,0xd4,0xff,0xff,0x03,0x00
+# GFX12: v_cmpx_le_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xff,0x03,0x00]
-# GFX12: v_cmpx_le_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbb,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0xbb,0xd4,0x01,0x04,0x00,0x00
+# GFX12: v_cmpx_le_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbb,0xd4,0x01,0x04,0x00,0x00]
-# GFX12: v_cmpx_le_u16_e64 s105, s105 ; encoding: [0x7e,0x00,0xbb,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0xbb,0xd4,0x69,0xd2,0x00,0x00
+# GFX12: v_cmpx_le_u16_e64 s105, s105 ; encoding: [0x7e,0x00,0xbb,0xd4,0x69,0xd2,0x00,0x00]
-# GFX12: v_cmpx_le_u16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xbb,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0xbb,0xd4,0x6a,0xf6,0x00,0x00
+# GFX12: v_cmpx_le_u16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xbb,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX12: v_cmpx_le_u16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xbb,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x7e,0x00,0xbb,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_le_u16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xbb,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_le_u16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xbb,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0xbb,0xd4,0x7b,0xfa,0x01,0x00
+# GFX12: v_cmpx_le_u16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xbb,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX12: v_cmpx_le_u16_e64 m0, 0x3800
 0x7e,0x00,0xbb,0xd4,0x7d,0xe0,0x01,0x00
+# GFX12: v_cmpx_le_u16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xbb,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# GFX12: v_cmpx_le_u16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xbb,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0xbb,0xd4,0x7e,0x82,0x01,0x00
+# GFX12: v_cmpx_le_u16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xbb,0xd4,0x7e,0x82,0x01,0x00]
-# GFX12: v_cmpx_le_u16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xbb,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x00,0xbb,0xd4,0x7f,0xf8,0x00,0x00
+# GFX12: v_cmpx_le_u16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xbb,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX12: v_cmpx_le_u16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xbb,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0xbb,0xd4,0x7c,0xfc,0x00,0x00
+# GFX12: v_cmpx_le_u16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xbb,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX12: v_cmpx_le_u16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xbb,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0xbb,0xd4,0xc1,0xfe,0x00,0x00
+# GFX12: v_cmpx_le_u16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xbb,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX12: v_cmpx_le_u16_e64 0x3800, m0
 0x7e,0x00,0xbb,0xd4,0xf0,0xfa,0x00,0x00
+# GFX12: v_cmpx_le_u16_e64 0x3800, m0 ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# GFX12: v_cmpx_le_u16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xbb,0xd4,0xfd,0xd4,0x00,0x00]
 0x7e,0x00,0xbb,0xd4,0xfd,0xd4,0x00,0x00
+# GFX12: v_cmpx_le_u16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xbb,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX12: v_cmpx_le_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 0x7e,0x00,0xbb,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_le_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_le_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcb,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xcb,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_le_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcb,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_le_u32_e64 v255, v255 ; encoding: [0x7e,0x00,0xcb,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0xcb,0xd4,0xff,0xff,0x03,0x00
+# GFX12: v_cmpx_le_u32_e64 v255, v255 ; encoding: [0x7e,0x00,0xcb,0xd4,0xff,0xff,0x03,0x00]
-# GFX12: v_cmpx_le_u32_e64 s1, s2 ; encoding: [0x7e,0x00,0xcb,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0xcb,0xd4,0x01,0x04,0x00,0x00
+# GFX12: v_cmpx_le_u32_e64 s1, s2 ; encoding: [0x7e,0x00,0xcb,0xd4,0x01,0x04,0x00,0x00]
-# GFX12: v_cmpx_le_u32_e64 s105, s105 ; encoding: [0x7e,0x00,0xcb,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0xcb,0xd4,0x69,0xd2,0x00,0x00
+# GFX12: v_cmpx_le_u32_e64 s105, s105 ; encoding: [0x7e,0x00,0xcb,0xd4,0x69,0xd2,0x00,0x00]
-# GFX12: v_cmpx_le_u32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xcb,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0xcb,0xd4,0x6a,0xf6,0x00,0x00
+# GFX12: v_cmpx_le_u32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xcb,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX12: v_cmpx_le_u32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xcb,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xcb,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_le_u32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xcb,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_le_u32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xcb,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0xcb,0xd4,0x7b,0xfa,0x01,0x00
+# GFX12: v_cmpx_le_u32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xcb,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX12: v_cmpx_le_u32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xcb,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0xcb,0xd4,0x7d,0xe0,0x01,0x00
+# GFX12: v_cmpx_le_u32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xcb,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX12: v_cmpx_le_u32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xcb,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0xcb,0xd4,0x7e,0x82,0x01,0x00
+# GFX12: v_cmpx_le_u32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xcb,0xd4,0x7e,0x82,0x01,0x00]
-# GFX12: v_cmpx_le_u32_e64 exec_hi, null ; encoding:
[0x7e,0x00,0xcb,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x00,0xcb,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_le_u32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xcb,0xd4,0x7f,0xf8,0x00,0x00] -# GFX12: v_cmpx_le_u32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xcb,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0xcb,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_le_u32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xcb,0xd4,0x7c,0xfc,0x00,0x00] -# GFX12: v_cmpx_le_u32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xcb,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0xcb,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_le_u32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xcb,0xd4,0xc1,0xfe,0x00,0x00] -# GFX12: v_cmpx_le_u32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xcb,0xd4,0xf0,0xfa,0x00,0x00] 0x7e,0x00,0xcb,0xd4,0xf0,0xfa,0x00,0x00 +# GFX12: v_cmpx_le_u32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xcb,0xd4,0xf0,0xfa,0x00,0x00] -# GFX12: v_cmpx_le_u32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xcb,0xd4,0xfd,0xd4,0x00,0x00] 0x7e,0x00,0xcb,0xd4,0xfd,0xd4,0x00,0x00 +# GFX12: v_cmpx_le_u32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xcb,0xd4,0xfd,0xd4,0x00,0x00] -# GFX12: v_cmpx_le_u32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xcb,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xcb,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_le_u32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xcb,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_le_u64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xdb,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xdb,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_le_u64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xdb,0xd4,0x01,0x05,0x02,0x00] -# GFX12: v_cmpx_le_u64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xdb,0xd4,0xfe,0xfd,0x03,0x00] 0x7e,0x00,0xdb,0xd4,0xfe,0xfd,0x03,0x00 +# GFX12: v_cmpx_le_u64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xdb,0xd4,0xfe,0xfd,0x03,0x00] -# GFX12: v_cmpx_le_u64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xdb,0xd4,0x02,0x08,0x00,0x00] 0x7e,0x00,0xdb,0xd4,0x02,0x08,0x00,0x00 +# GFX12: v_cmpx_le_u64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xdb,0xd4,0x02,0x08,0x00,0x00] -# GFX12: v_cmpx_le_u64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xdb,0xd4,0x68,0xd0,0x00,0x00] 0x7e,0x00,0xdb,0xd4,0x68,0xd0,0x00,0x00 +# GFX12: v_cmpx_le_u64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xdb,0xd4,0x68,0xd0,0x00,0x00] -# GFX12: v_cmpx_le_u64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xdb,0xd4,0x6a,0xf4,0x00,0x00] 0x7e,0x00,0xdb,0xd4,0x6a,0xf4,0x00,0x00 +# GFX12: v_cmpx_le_u64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xdb,0xd4,0x6a,0xf4,0x00,0x00] -# GFX12: v_cmpx_le_u64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xdb,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xdb,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_le_u64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xdb,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_le_u64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xdb,0xd4,0x7e,0xfa,0x01,0x00] 0x7e,0x00,0xdb,0xd4,0x7e,0xfa,0x01,0x00 +# GFX12: v_cmpx_le_u64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xdb,0xd4,0x7e,0xfa,0x01,0x00] -# GFX12: v_cmpx_le_u64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xdb,0xd4,0x7c,0xe0,0x01,0x00] 0x7e,0x00,0xdb,0xd4,0x7c,0xe0,0x01,0x00 +# GFX12: v_cmpx_le_u64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xdb,0xd4,0x7c,0xe0,0x01,0x00] -# GFX12: v_cmpx_le_u64_e64 -1, -1 ; encoding: [0x7e,0x00,0xdb,0xd4,0xc1,0x82,0x01,0x00] 0x7e,0x00,0xdb,0xd4,0xc1,0x82,0x01,0x00 +# GFX12: v_cmpx_le_u64_e64 -1, -1 ; encoding: 
[0x7e,0x00,0xdb,0xd4,0xc1,0x82,0x01,0x00] -# GFX12: v_cmpx_le_u64_e64 0.5, null ; encoding: [0x7e,0x00,0xdb,0xd4,0xf0,0xf8,0x00,0x00] 0x7e,0x00,0xdb,0xd4,0xf0,0xf8,0x00,0x00 +# GFX12: v_cmpx_le_u64_e64 0.5, null ; encoding: [0x7e,0x00,0xdb,0xd4,0xf0,0xf8,0x00,0x00] -# GFX12: v_cmpx_le_u64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xdb,0xd4,0xfd,0xfc,0x00,0x00] 0x7e,0x00,0xdb,0xd4,0xfd,0xfc,0x00,0x00 +# GFX12: v_cmpx_le_u64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xdb,0xd4,0xfd,0xfc,0x00,0x00] -# GFX12: v_cmpx_le_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdb,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xdb,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_le_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdb,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_lg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_lg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00] -# GFX12: v_cmpx_lg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00 +# GFX12: v_cmpx_lg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00] -# GFX12: v_cmpx_lg_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0x85,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_lg_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x04,0x00,0x00] -# GFX12: v_cmpx_lg_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x85,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0x85,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_lg_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x85,0xd4,0x69,0xd2,0x00,0x00] -# GFX12: v_cmpx_lg_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x85,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0x85,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_lg_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x85,0xd4,0x6a,0xf6,0x00,0x00] -# GFX12: v_cmpx_lg_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x85,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0x85,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_lg_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x85,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_lg_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x85,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0x85,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_lg_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x85,0xd4,0x7b,0xfa,0x01,0x00] -# GFX12: v_cmpx_lg_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x85,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0x85,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_lg_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x85,0xd4,0x7d,0xe0,0x01,0x00] -# GFX12: v_cmpx_lg_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x85,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0x85,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_lg_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x85,0xd4,0x7e,0x82,0x01,0x00] -# GFX12: v_cmpx_lg_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x85,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x01,0x85,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_lg_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x85,0xd4,0x7f,0xf8,0x00,0x00] -# GFX12: v_cmpx_lg_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x85,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0x85,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_lg_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x85,0xd4,0x7c,0xfc,0x00,0x00] -# GFX12: v_cmpx_lg_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x85,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0x85,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_lg_f16_e64 -1, exec_hi ; encoding: 
[0x7e,0x00,0x85,0xd4,0xc1,0xfe,0x00,0x00] -# GFX12: v_cmpx_lg_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x85,0xd4,0xf0,0xfa,0x00,0x40] 0x7e,0x00,0x85,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_lg_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x85,0xd4,0xf0,0xfa,0x00,0x40] -# GFX12: v_cmpx_lg_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x85,0xd4,0xfd,0xd4,0x00,0x20] 0x7e,0x02,0x85,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_lg_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x85,0xd4,0xfd,0xd4,0x00,0x20] -# GFX12: v_cmpx_lg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x85,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x83,0x85,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_lg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x85,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_lg_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x95,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x95,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_lg_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x95,0xd4,0x01,0x05,0x02,0x00] -# GFX12: v_cmpx_lg_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x95,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x95,0xd4,0xff,0xff,0x03,0x00 +# GFX12: v_cmpx_lg_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x95,0xd4,0xff,0xff,0x03,0x00] -# GFX12: v_cmpx_lg_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x95,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0x95,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_lg_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x95,0xd4,0x01,0x04,0x00,0x00] -# GFX12: v_cmpx_lg_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x95,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0x95,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_lg_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x95,0xd4,0x69,0xd2,0x00,0x00] -# GFX12: v_cmpx_lg_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x95,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0x95,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_lg_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x95,0xd4,0x6a,0xf6,0x00,0x00] -# GFX12: v_cmpx_lg_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x95,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x95,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_lg_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x95,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_lg_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x95,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0x95,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_lg_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x95,0xd4,0x7b,0xfa,0x01,0x00] -# GFX12: v_cmpx_lg_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x95,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0x95,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_lg_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x95,0xd4,0x7d,0xe0,0x01,0x00] -# GFX12: v_cmpx_lg_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x95,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0x95,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_lg_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x95,0xd4,0x7e,0x82,0x01,0x00] -# GFX12: v_cmpx_lg_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x95,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x01,0x95,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_lg_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x95,0xd4,0x7f,0xf8,0x00,0x00] -# GFX12: v_cmpx_lg_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x95,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0x95,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_lg_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x95,0xd4,0x7c,0xfc,0x00,0x00] -# GFX12: v_cmpx_lg_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x95,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0x95,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: 
v_cmpx_lg_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x95,0xd4,0xc1,0xfe,0x00,0x00] -# GFX12: v_cmpx_lg_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x95,0xd4,0xf0,0xfa,0x00,0x40] 0x7e,0x00,0x95,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_lg_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x95,0xd4,0xf0,0xfa,0x00,0x40] -# GFX12: v_cmpx_lg_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x95,0xd4,0xfd,0xd4,0x00,0x20] 0x7e,0x02,0x95,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_lg_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x95,0xd4,0xfd,0xd4,0x00,0x20] -# GFX12: v_cmpx_lg_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x95,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] 0x7e,0x83,0x95,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_lg_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x95,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_lg_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa5,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xa5,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_lg_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa5,0xd4,0x01,0x05,0x02,0x00] -# GFX12: v_cmpx_lg_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa5,0xd4,0xfe,0xfd,0x03,0x00] 0x7e,0x00,0xa5,0xd4,0xfe,0xfd,0x03,0x00 +# GFX12: v_cmpx_lg_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa5,0xd4,0xfe,0xfd,0x03,0x00] -# GFX12: v_cmpx_lg_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa5,0xd4,0x02,0x08,0x00,0x00] 0x7e,0x00,0xa5,0xd4,0x02,0x08,0x00,0x00 +# GFX12: v_cmpx_lg_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa5,0xd4,0x02,0x08,0x00,0x00] -# GFX12: v_cmpx_lg_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa5,0xd4,0x68,0xd0,0x00,0x00] 0x7e,0x00,0xa5,0xd4,0x68,0xd0,0x00,0x00 +# GFX12: v_cmpx_lg_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa5,0xd4,0x68,0xd0,0x00,0x00] -# GFX12: v_cmpx_lg_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa5,0xd4,0x6a,0xf4,0x00,0x00] 0x7e,0x00,0xa5,0xd4,0x6a,0xf4,0x00,0x00 +# GFX12: v_cmpx_lg_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa5,0xd4,0x6a,0xf4,0x00,0x00] -# GFX12: v_cmpx_lg_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa5,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xa5,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_lg_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa5,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_lg_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa5,0xd4,0x7e,0xfa,0x01,0x20] 0x7e,0x01,0xa5,0xd4,0x7e,0xfa,0x01,0x20 +# GFX12: v_cmpx_lg_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa5,0xd4,0x7e,0xfa,0x01,0x20] -# GFX12: v_cmpx_lg_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa5,0xd4,0x7c,0xe0,0x01,0x00] 0x7e,0x00,0xa5,0xd4,0x7c,0xe0,0x01,0x00 +# GFX12: v_cmpx_lg_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa5,0xd4,0x7c,0xe0,0x01,0x00] -# GFX12: v_cmpx_lg_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa5,0xd4,0xc1,0x82,0x01,0x00] 0x7e,0x00,0xa5,0xd4,0xc1,0x82,0x01,0x00 +# GFX12: v_cmpx_lg_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa5,0xd4,0xc1,0x82,0x01,0x00] -# GFX12: v_cmpx_lg_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa5,0xd4,0xf0,0xf8,0x00,0x00] 0x7e,0x00,0xa5,0xd4,0xf0,0xf8,0x00,0x00 +# GFX12: v_cmpx_lg_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa5,0xd4,0xf0,0xf8,0x00,0x00] -# GFX12: v_cmpx_lg_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa5,0xd4,0xfd,0xfc,0x00,0x60] 0x7e,0x03,0xa5,0xd4,0xfd,0xfc,0x00,0x60 +# GFX12: v_cmpx_lg_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa5,0xd4,0xfd,0xfc,0x00,0x60] -# GFX12: 
v_cmpx_lg_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa5,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7e,0x82,0xa5,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_lg_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa5,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_lt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x81,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x81,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_lt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x81,0xd4,0x01,0x05,0x02,0x00] -# GFX12: v_cmpx_lt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x81,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x81,0xd4,0xff,0xff,0x03,0x00 +# GFX12: v_cmpx_lt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x81,0xd4,0xff,0xff,0x03,0x00] -# GFX12: v_cmpx_lt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x81,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0x81,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_lt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x81,0xd4,0x01,0x04,0x00,0x00] -# GFX12: v_cmpx_lt_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x81,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0x81,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_lt_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x81,0xd4,0x69,0xd2,0x00,0x00] -# GFX12: v_cmpx_lt_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x81,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0x81,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_lt_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x81,0xd4,0x6a,0xf6,0x00,0x00] -# GFX12: v_cmpx_lt_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x81,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0x81,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_lt_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x81,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_lt_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x81,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0x81,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_lt_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x81,0xd4,0x7b,0xfa,0x01,0x00] -# GFX12: v_cmpx_lt_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x81,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0x81,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_lt_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x81,0xd4,0x7d,0xe0,0x01,0x00] -# GFX12: v_cmpx_lt_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x81,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0x81,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_lt_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x81,0xd4,0x7e,0x82,0x01,0x00] -# GFX12: v_cmpx_lt_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x81,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x01,0x81,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_lt_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x81,0xd4,0x7f,0xf8,0x00,0x00] -# GFX12: v_cmpx_lt_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x81,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0x81,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_lt_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x81,0xd4,0x7c,0xfc,0x00,0x00] -# GFX12: v_cmpx_lt_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x81,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0x81,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_lt_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x81,0xd4,0xc1,0xfe,0x00,0x00] -# GFX12: v_cmpx_lt_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x81,0xd4,0xf0,0xfa,0x00,0x40] 0x7e,0x00,0x81,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_lt_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x81,0xd4,0xf0,0xfa,0x00,0x40] -# GFX12: v_cmpx_lt_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x81,0xd4,0xfd,0xd4,0x00,0x20] 0x7e,0x02,0x81,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_lt_f16_e64 -src_scc, |vcc_lo| ; encoding: 
[0x7e,0x02,0x81,0xd4,0xfd,0xd4,0x00,0x20] -# GFX12: v_cmpx_lt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x81,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x83,0x81,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_lt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x81,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_lt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x91,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x91,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_lt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x91,0xd4,0x01,0x05,0x02,0x00] -# GFX12: v_cmpx_lt_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x91,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x91,0xd4,0xff,0xff,0x03,0x00 +# GFX12: v_cmpx_lt_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x91,0xd4,0xff,0xff,0x03,0x00] -# GFX12: v_cmpx_lt_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x91,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0x91,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_lt_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x91,0xd4,0x01,0x04,0x00,0x00] -# GFX12: v_cmpx_lt_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x91,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0x91,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_lt_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x91,0xd4,0x69,0xd2,0x00,0x00] -# GFX12: v_cmpx_lt_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x91,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0x91,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_lt_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x91,0xd4,0x6a,0xf6,0x00,0x00] -# GFX12: v_cmpx_lt_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x91,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x91,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_lt_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x91,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_lt_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x91,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0x91,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_lt_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x91,0xd4,0x7b,0xfa,0x01,0x00] -# GFX12: v_cmpx_lt_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x91,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0x91,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_lt_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x91,0xd4,0x7d,0xe0,0x01,0x00] -# GFX12: v_cmpx_lt_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x91,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0x91,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_lt_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x91,0xd4,0x7e,0x82,0x01,0x00] -# GFX12: v_cmpx_lt_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x91,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x01,0x91,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_lt_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x91,0xd4,0x7f,0xf8,0x00,0x00] -# GFX12: v_cmpx_lt_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x91,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0x91,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_lt_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x91,0xd4,0x7c,0xfc,0x00,0x00] -# GFX12: v_cmpx_lt_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x91,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0x91,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_lt_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x91,0xd4,0xc1,0xfe,0x00,0x00] -# GFX12: v_cmpx_lt_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x91,0xd4,0xf0,0xfa,0x00,0x40] 0x7e,0x00,0x91,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_lt_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x91,0xd4,0xf0,0xfa,0x00,0x40] -# GFX12: v_cmpx_lt_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x91,0xd4,0xfd,0xd4,0x00,0x20] 0x7e,0x02,0x91,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_lt_f32_e64 
-src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x91,0xd4,0xfd,0xd4,0x00,0x20] -# GFX12: v_cmpx_lt_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x91,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] 0x7e,0x83,0x91,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_lt_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x91,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_lt_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa1,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xa1,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_lt_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa1,0xd4,0x01,0x05,0x02,0x00] -# GFX12: v_cmpx_lt_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa1,0xd4,0xfe,0xfd,0x03,0x00] 0x7e,0x00,0xa1,0xd4,0xfe,0xfd,0x03,0x00 +# GFX12: v_cmpx_lt_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa1,0xd4,0xfe,0xfd,0x03,0x00] -# GFX12: v_cmpx_lt_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa1,0xd4,0x02,0x08,0x00,0x00] 0x7e,0x00,0xa1,0xd4,0x02,0x08,0x00,0x00 +# GFX12: v_cmpx_lt_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa1,0xd4,0x02,0x08,0x00,0x00] -# GFX12: v_cmpx_lt_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa1,0xd4,0x68,0xd0,0x00,0x00] 0x7e,0x00,0xa1,0xd4,0x68,0xd0,0x00,0x00 +# GFX12: v_cmpx_lt_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa1,0xd4,0x68,0xd0,0x00,0x00] -# GFX12: v_cmpx_lt_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa1,0xd4,0x6a,0xf4,0x00,0x00] 0x7e,0x00,0xa1,0xd4,0x6a,0xf4,0x00,0x00 +# GFX12: v_cmpx_lt_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa1,0xd4,0x6a,0xf4,0x00,0x00] -# GFX12: v_cmpx_lt_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa1,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xa1,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_lt_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa1,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_lt_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa1,0xd4,0x7e,0xfa,0x01,0x20] 0x7e,0x01,0xa1,0xd4,0x7e,0xfa,0x01,0x20 +# GFX12: v_cmpx_lt_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa1,0xd4,0x7e,0xfa,0x01,0x20] -# GFX12: v_cmpx_lt_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa1,0xd4,0x7c,0xe0,0x01,0x00] 0x7e,0x00,0xa1,0xd4,0x7c,0xe0,0x01,0x00 +# GFX12: v_cmpx_lt_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa1,0xd4,0x7c,0xe0,0x01,0x00] -# GFX12: v_cmpx_lt_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa1,0xd4,0xc1,0x82,0x01,0x00] 0x7e,0x00,0xa1,0xd4,0xc1,0x82,0x01,0x00 +# GFX12: v_cmpx_lt_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa1,0xd4,0xc1,0x82,0x01,0x00] -# GFX12: v_cmpx_lt_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa1,0xd4,0xf0,0xf8,0x00,0x00] 0x7e,0x00,0xa1,0xd4,0xf0,0xf8,0x00,0x00 +# GFX12: v_cmpx_lt_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa1,0xd4,0xf0,0xf8,0x00,0x00] -# GFX12: v_cmpx_lt_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa1,0xd4,0xfd,0xfc,0x00,0x60] 0x7e,0x03,0xa1,0xd4,0xfd,0xfc,0x00,0x60 +# GFX12: v_cmpx_lt_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa1,0xd4,0xfd,0xfc,0x00,0x60] -# GFX12: v_cmpx_lt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa1,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7e,0x82,0xa1,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_lt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa1,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_lt_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb1,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xb1,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_lt_i16_e64 v1, v2 ; encoding: 
[0x7e,0x00,0xb1,0xd4,0x01,0x05,0x02,0x00] -# GFX12: v_cmpx_lt_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xb1,0xd4,0xff,0xff,0x03,0x00 +# GFX12: v_cmpx_lt_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xff,0x03,0x00] -# GFX12: v_cmpx_lt_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb1,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0xb1,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_lt_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb1,0xd4,0x01,0x04,0x00,0x00] -# GFX12: v_cmpx_lt_i16_e64 s105, s105 ; encoding: [0x7e,0x00,0xb1,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0xb1,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_lt_i16_e64 s105, s105 ; encoding: [0x7e,0x00,0xb1,0xd4,0x69,0xd2,0x00,0x00] -# GFX12: v_cmpx_lt_i16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xb1,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0xb1,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_lt_i16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xb1,0xd4,0x6a,0xf6,0x00,0x00] -# GFX12: v_cmpx_lt_i16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xb1,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0xb1,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_lt_i16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xb1,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_lt_i16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xb1,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0xb1,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_lt_i16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xb1,0xd4,0x7b,0xfa,0x01,0x00] -# GFX12: v_cmpx_lt_i16_e64 m0, 0x3800 0x7e,0x00,0xb1,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_lt_i16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xb1,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] -# GFX12: v_cmpx_lt_i16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xb1,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0xb1,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_lt_i16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xb1,0xd4,0x7e,0x82,0x01,0x00] -# GFX12: v_cmpx_lt_i16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xb1,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x00,0xb1,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_lt_i16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xb1,0xd4,0x7f,0xf8,0x00,0x00] -# GFX12: v_cmpx_lt_i16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xb1,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0xb1,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_lt_i16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xb1,0xd4,0x7c,0xfc,0x00,0x00] -# GFX12: v_cmpx_lt_i16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xb1,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0xb1,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_lt_i16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xb1,0xd4,0xc1,0xfe,0x00,0x00] -# GFX12: v_cmpx_lt_i16_e64 0x3800, m0 0x7e,0x00,0xb1,0xd4,0xf0,0xfa,0x00,0x00 +# GFX12: v_cmpx_lt_i16_e64 0x3800, m0 ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] -# GFX12: v_cmpx_lt_i16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xb1,0xd4,0xfd,0xd4,0x00,0x00] 0x7e,0x00,0xb1,0xd4,0xfd,0xd4,0x00,0x00 +# GFX12: v_cmpx_lt_i16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xb1,0xd4,0xfd,0xd4,0x00,0x00] -# GFX12: v_cmpx_lt_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0xb1,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_lt_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_lt_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc1,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xc1,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_lt_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc1,0xd4,0x01,0x05,0x02,0x00] -# GFX12: 
v_cmpx_lt_i32_e64 v255, v255 ; encoding: [0x7e,0x00,0xc1,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xc1,0xd4,0xff,0xff,0x03,0x00 +# GFX12: v_cmpx_lt_i32_e64 v255, v255 ; encoding: [0x7e,0x00,0xc1,0xd4,0xff,0xff,0x03,0x00] -# GFX12: v_cmpx_lt_i32_e64 s1, s2 ; encoding: [0x7e,0x00,0xc1,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0xc1,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_lt_i32_e64 s1, s2 ; encoding: [0x7e,0x00,0xc1,0xd4,0x01,0x04,0x00,0x00] -# GFX12: v_cmpx_lt_i32_e64 s105, s105 ; encoding: [0x7e,0x00,0xc1,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0xc1,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_lt_i32_e64 s105, s105 ; encoding: [0x7e,0x00,0xc1,0xd4,0x69,0xd2,0x00,0x00] -# GFX12: v_cmpx_lt_i32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xc1,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0xc1,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_lt_i32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xc1,0xd4,0x6a,0xf6,0x00,0x00] -# GFX12: v_cmpx_lt_i32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xc1,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xc1,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_lt_i32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xc1,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_lt_i32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xc1,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0xc1,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_lt_i32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xc1,0xd4,0x7b,0xfa,0x01,0x00] -# GFX12: v_cmpx_lt_i32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xc1,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0xc1,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_lt_i32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xc1,0xd4,0x7d,0xe0,0x01,0x00] -# GFX12: v_cmpx_lt_i32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xc1,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0xc1,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_lt_i32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xc1,0xd4,0x7e,0x82,0x01,0x00] -# GFX12: v_cmpx_lt_i32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xc1,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x00,0xc1,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_lt_i32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xc1,0xd4,0x7f,0xf8,0x00,0x00] -# GFX12: v_cmpx_lt_i32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xc1,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0xc1,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_lt_i32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xc1,0xd4,0x7c,0xfc,0x00,0x00] -# GFX12: v_cmpx_lt_i32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xc1,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0xc1,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_lt_i32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xc1,0xd4,0xc1,0xfe,0x00,0x00] -# GFX12: v_cmpx_lt_i32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xc1,0xd4,0xf0,0xfa,0x00,0x00] 0x7e,0x00,0xc1,0xd4,0xf0,0xfa,0x00,0x00 +# GFX12: v_cmpx_lt_i32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xc1,0xd4,0xf0,0xfa,0x00,0x00] -# GFX12: v_cmpx_lt_i32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xc1,0xd4,0xfd,0xd4,0x00,0x00] 0x7e,0x00,0xc1,0xd4,0xfd,0xd4,0x00,0x00 +# GFX12: v_cmpx_lt_i32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xc1,0xd4,0xfd,0xd4,0x00,0x00] -# GFX12: v_cmpx_lt_i32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xc1,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xc1,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_lt_i32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xc1,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_lt_i64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xd1,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xd1,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_lt_i64_e64 v[1:2], v[2:3] ; encoding: 
[0x7e,0x00,0xd1,0xd4,0x01,0x05,0x02,0x00] -# GFX12: v_cmpx_lt_i64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xd1,0xd4,0xfe,0xfd,0x03,0x00] 0x7e,0x00,0xd1,0xd4,0xfe,0xfd,0x03,0x00 +# GFX12: v_cmpx_lt_i64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xd1,0xd4,0xfe,0xfd,0x03,0x00] -# GFX12: v_cmpx_lt_i64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xd1,0xd4,0x02,0x08,0x00,0x00] 0x7e,0x00,0xd1,0xd4,0x02,0x08,0x00,0x00 +# GFX12: v_cmpx_lt_i64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xd1,0xd4,0x02,0x08,0x00,0x00] -# GFX12: v_cmpx_lt_i64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xd1,0xd4,0x68,0xd0,0x00,0x00] 0x7e,0x00,0xd1,0xd4,0x68,0xd0,0x00,0x00 +# GFX12: v_cmpx_lt_i64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xd1,0xd4,0x68,0xd0,0x00,0x00] -# GFX12: v_cmpx_lt_i64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xd1,0xd4,0x6a,0xf4,0x00,0x00] 0x7e,0x00,0xd1,0xd4,0x6a,0xf4,0x00,0x00 +# GFX12: v_cmpx_lt_i64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xd1,0xd4,0x6a,0xf4,0x00,0x00] -# GFX12: v_cmpx_lt_i64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xd1,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xd1,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_lt_i64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xd1,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_lt_i64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xd1,0xd4,0x7e,0xfa,0x01,0x00] 0x7e,0x00,0xd1,0xd4,0x7e,0xfa,0x01,0x00 +# GFX12: v_cmpx_lt_i64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xd1,0xd4,0x7e,0xfa,0x01,0x00] -# GFX12: v_cmpx_lt_i64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xd1,0xd4,0x7c,0xe0,0x01,0x00] 0x7e,0x00,0xd1,0xd4,0x7c,0xe0,0x01,0x00 +# GFX12: v_cmpx_lt_i64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xd1,0xd4,0x7c,0xe0,0x01,0x00] -# GFX12: v_cmpx_lt_i64_e64 -1, -1 ; encoding: [0x7e,0x00,0xd1,0xd4,0xc1,0x82,0x01,0x00] 0x7e,0x00,0xd1,0xd4,0xc1,0x82,0x01,0x00 +# GFX12: v_cmpx_lt_i64_e64 -1, -1 ; encoding: [0x7e,0x00,0xd1,0xd4,0xc1,0x82,0x01,0x00] -# GFX12: v_cmpx_lt_i64_e64 0.5, null ; encoding: [0x7e,0x00,0xd1,0xd4,0xf0,0xf8,0x00,0x00] 0x7e,0x00,0xd1,0xd4,0xf0,0xf8,0x00,0x00 +# GFX12: v_cmpx_lt_i64_e64 0.5, null ; encoding: [0x7e,0x00,0xd1,0xd4,0xf0,0xf8,0x00,0x00] -# GFX12: v_cmpx_lt_i64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xd1,0xd4,0xfd,0xfc,0x00,0x00] 0x7e,0x00,0xd1,0xd4,0xfd,0xfc,0x00,0x00 +# GFX12: v_cmpx_lt_i64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xd1,0xd4,0xfd,0xfc,0x00,0x00] -# GFX12: v_cmpx_lt_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd1,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xd1,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_lt_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd1,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_lt_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb9,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xb9,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_lt_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb9,0xd4,0x01,0x05,0x02,0x00] -# GFX12: v_cmpx_lt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xb9,0xd4,0xff,0xff,0x03,0x00 +# GFX12: v_cmpx_lt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xff,0x03,0x00] -# GFX12: v_cmpx_lt_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb9,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0xb9,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_lt_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb9,0xd4,0x01,0x04,0x00,0x00] -# GFX12: v_cmpx_lt_u16_e64 s105, s105 ; encoding: [0x7e,0x00,0xb9,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0xb9,0xd4,0x69,0xd2,0x00,0x00 +# 
GFX12: v_cmpx_lt_u16_e64 s105, s105 ; encoding: [0x7e,0x00,0xb9,0xd4,0x69,0xd2,0x00,0x00] -# GFX12: v_cmpx_lt_u16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xb9,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0xb9,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_lt_u16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xb9,0xd4,0x6a,0xf6,0x00,0x00] -# GFX12: v_cmpx_lt_u16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xb9,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0xb9,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_lt_u16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xb9,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_lt_u16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xb9,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0xb9,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_lt_u16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xb9,0xd4,0x7b,0xfa,0x01,0x00] -# GFX12: v_cmpx_lt_u16_e64 m0, 0x3800 0x7e,0x00,0xb9,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_lt_u16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xb9,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] -# GFX12: v_cmpx_lt_u16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xb9,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0xb9,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_lt_u16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xb9,0xd4,0x7e,0x82,0x01,0x00] -# GFX12: v_cmpx_lt_u16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xb9,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x00,0xb9,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_lt_u16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xb9,0xd4,0x7f,0xf8,0x00,0x00] -# GFX12: v_cmpx_lt_u16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xb9,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0xb9,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_lt_u16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xb9,0xd4,0x7c,0xfc,0x00,0x00] -# GFX12: v_cmpx_lt_u16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xb9,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0xb9,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_lt_u16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xb9,0xd4,0xc1,0xfe,0x00,0x00] -# GFX12: v_cmpx_lt_u16_e64 0x3800, m0 0x7e,0x00,0xb9,0xd4,0xf0,0xfa,0x00,0x00 +# GFX12: v_cmpx_lt_u16_e64 0x3800, m0 ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] -# GFX12: v_cmpx_lt_u16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xb9,0xd4,0xfd,0xd4,0x00,0x00] 0x7e,0x00,0xb9,0xd4,0xfd,0xd4,0x00,0x00 +# GFX12: v_cmpx_lt_u16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xb9,0xd4,0xfd,0xd4,0x00,0x00] -# GFX12: v_cmpx_lt_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0xb9,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_lt_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_lt_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc9,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xc9,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_lt_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc9,0xd4,0x01,0x05,0x02,0x00] -# GFX12: v_cmpx_lt_u32_e64 v255, v255 ; encoding: [0x7e,0x00,0xc9,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xc9,0xd4,0xff,0xff,0x03,0x00 +# GFX12: v_cmpx_lt_u32_e64 v255, v255 ; encoding: [0x7e,0x00,0xc9,0xd4,0xff,0xff,0x03,0x00] -# GFX12: v_cmpx_lt_u32_e64 s1, s2 ; encoding: [0x7e,0x00,0xc9,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0xc9,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_lt_u32_e64 s1, s2 ; encoding: [0x7e,0x00,0xc9,0xd4,0x01,0x04,0x00,0x00] -# GFX12: v_cmpx_lt_u32_e64 s105, s105 ; encoding: [0x7e,0x00,0xc9,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0xc9,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_lt_u32_e64 s105, s105 ; encoding: 
[0x7e,0x00,0xc9,0xd4,0x69,0xd2,0x00,0x00] -# GFX12: v_cmpx_lt_u32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xc9,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0xc9,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_lt_u32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xc9,0xd4,0x6a,0xf6,0x00,0x00] -# GFX12: v_cmpx_lt_u32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xc9,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xc9,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_lt_u32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xc9,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_lt_u32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xc9,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0xc9,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_lt_u32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xc9,0xd4,0x7b,0xfa,0x01,0x00] -# GFX12: v_cmpx_lt_u32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xc9,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0xc9,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_lt_u32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xc9,0xd4,0x7d,0xe0,0x01,0x00] -# GFX12: v_cmpx_lt_u32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xc9,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0xc9,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_lt_u32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xc9,0xd4,0x7e,0x82,0x01,0x00] -# GFX12: v_cmpx_lt_u32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xc9,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x00,0xc9,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_lt_u32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xc9,0xd4,0x7f,0xf8,0x00,0x00] -# GFX12: v_cmpx_lt_u32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xc9,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0xc9,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_lt_u32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xc9,0xd4,0x7c,0xfc,0x00,0x00] -# GFX12: v_cmpx_lt_u32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xc9,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0xc9,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_lt_u32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xc9,0xd4,0xc1,0xfe,0x00,0x00] -# GFX12: v_cmpx_lt_u32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xc9,0xd4,0xf0,0xfa,0x00,0x00] 0x7e,0x00,0xc9,0xd4,0xf0,0xfa,0x00,0x00 +# GFX12: v_cmpx_lt_u32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xc9,0xd4,0xf0,0xfa,0x00,0x00] -# GFX12: v_cmpx_lt_u32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xc9,0xd4,0xfd,0xd4,0x00,0x00] 0x7e,0x00,0xc9,0xd4,0xfd,0xd4,0x00,0x00 +# GFX12: v_cmpx_lt_u32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xc9,0xd4,0xfd,0xd4,0x00,0x00] -# GFX12: v_cmpx_lt_u32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xc9,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xc9,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_lt_u32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xc9,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_lt_u64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xd9,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xd9,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_lt_u64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xd9,0xd4,0x01,0x05,0x02,0x00] -# GFX12: v_cmpx_lt_u64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xd9,0xd4,0xfe,0xfd,0x03,0x00] 0x7e,0x00,0xd9,0xd4,0xfe,0xfd,0x03,0x00 +# GFX12: v_cmpx_lt_u64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xd9,0xd4,0xfe,0xfd,0x03,0x00] -# GFX12: v_cmpx_lt_u64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xd9,0xd4,0x02,0x08,0x00,0x00] 0x7e,0x00,0xd9,0xd4,0x02,0x08,0x00,0x00 +# GFX12: v_cmpx_lt_u64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xd9,0xd4,0x02,0x08,0x00,0x00] -# GFX12: v_cmpx_lt_u64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xd9,0xd4,0x68,0xd0,0x00,0x00] 
0x7e,0x00,0xd9,0xd4,0x68,0xd0,0x00,0x00 +# GFX12: v_cmpx_lt_u64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xd9,0xd4,0x68,0xd0,0x00,0x00] -# GFX12: v_cmpx_lt_u64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xd9,0xd4,0x6a,0xf4,0x00,0x00] 0x7e,0x00,0xd9,0xd4,0x6a,0xf4,0x00,0x00 +# GFX12: v_cmpx_lt_u64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xd9,0xd4,0x6a,0xf4,0x00,0x00] -# GFX12: v_cmpx_lt_u64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xd9,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xd9,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_lt_u64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xd9,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_lt_u64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xd9,0xd4,0x7e,0xfa,0x01,0x00] 0x7e,0x00,0xd9,0xd4,0x7e,0xfa,0x01,0x00 +# GFX12: v_cmpx_lt_u64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xd9,0xd4,0x7e,0xfa,0x01,0x00] -# GFX12: v_cmpx_lt_u64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xd9,0xd4,0x7c,0xe0,0x01,0x00] 0x7e,0x00,0xd9,0xd4,0x7c,0xe0,0x01,0x00 +# GFX12: v_cmpx_lt_u64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xd9,0xd4,0x7c,0xe0,0x01,0x00] -# GFX12: v_cmpx_lt_u64_e64 -1, -1 ; encoding: [0x7e,0x00,0xd9,0xd4,0xc1,0x82,0x01,0x00] 0x7e,0x00,0xd9,0xd4,0xc1,0x82,0x01,0x00 +# GFX12: v_cmpx_lt_u64_e64 -1, -1 ; encoding: [0x7e,0x00,0xd9,0xd4,0xc1,0x82,0x01,0x00] -# GFX12: v_cmpx_lt_u64_e64 0.5, null ; encoding: [0x7e,0x00,0xd9,0xd4,0xf0,0xf8,0x00,0x00] 0x7e,0x00,0xd9,0xd4,0xf0,0xf8,0x00,0x00 +# GFX12: v_cmpx_lt_u64_e64 0.5, null ; encoding: [0x7e,0x00,0xd9,0xd4,0xf0,0xf8,0x00,0x00] -# GFX12: v_cmpx_lt_u64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xd9,0xd4,0xfd,0xfc,0x00,0x00] 0x7e,0x00,0xd9,0xd4,0xfd,0xfc,0x00,0x00 +# GFX12: v_cmpx_lt_u64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xd9,0xd4,0xfd,0xfc,0x00,0x00] -# GFX12: v_cmpx_lt_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd9,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xd9,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_lt_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd9,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_ne_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb5,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xb5,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_ne_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb5,0xd4,0x01,0x05,0x02,0x00] -# GFX12: v_cmpx_ne_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xb5,0xd4,0xff,0xff,0x03,0x00 +# GFX12: v_cmpx_ne_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xff,0x03,0x00] -# GFX12: v_cmpx_ne_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb5,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0xb5,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_ne_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb5,0xd4,0x01,0x04,0x00,0x00] -# GFX12: v_cmpx_ne_i16_e64 s105, s105 ; encoding: [0x7e,0x00,0xb5,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0xb5,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_ne_i16_e64 s105, s105 ; encoding: [0x7e,0x00,0xb5,0xd4,0x69,0xd2,0x00,0x00] -# GFX12: v_cmpx_ne_i16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xb5,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0xb5,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_ne_i16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xb5,0xd4,0x6a,0xf6,0x00,0x00] -# GFX12: v_cmpx_ne_i16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xb5,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0xb5,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_ne_i16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xb5,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# GFX12: 
v_cmpx_ne_i16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xb5,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0xb5,0xd4,0x7b,0xfa,0x01,0x00
+# GFX12: v_cmpx_ne_i16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xb5,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX12: v_cmpx_ne_i16_e64 m0, 0x3800
 0x7e,0x00,0xb5,0xd4,0x7d,0xe0,0x01,0x00
+# GFX12: v_cmpx_ne_i16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xb5,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# GFX12: v_cmpx_ne_i16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xb5,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0xb5,0xd4,0x7e,0x82,0x01,0x00
+# GFX12: v_cmpx_ne_i16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xb5,0xd4,0x7e,0x82,0x01,0x00]
-# GFX12: v_cmpx_ne_i16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xb5,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x00,0xb5,0xd4,0x7f,0xf8,0x00,0x00
+# GFX12: v_cmpx_ne_i16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xb5,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX12: v_cmpx_ne_i16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xb5,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0xb5,0xd4,0x7c,0xfc,0x00,0x00
+# GFX12: v_cmpx_ne_i16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xb5,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX12: v_cmpx_ne_i16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xb5,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0xb5,0xd4,0xc1,0xfe,0x00,0x00
+# GFX12: v_cmpx_ne_i16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xb5,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX12: v_cmpx_ne_i16_e64 0x3800, m0
 0x7e,0x00,0xb5,0xd4,0xf0,0xfa,0x00,0x00
+# GFX12: v_cmpx_ne_i16_e64 0x3800, m0 ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# GFX12: v_cmpx_ne_i16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xb5,0xd4,0xfd,0xd4,0x00,0x00]
 0x7e,0x00,0xb5,0xd4,0xfd,0xd4,0x00,0x00
+# GFX12: v_cmpx_ne_i16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xb5,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX12: v_cmpx_ne_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 0x7e,0x00,0xb5,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_ne_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_ne_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc5,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xc5,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_ne_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc5,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_ne_i32_e64 v255, v255 ; encoding: [0x7e,0x00,0xc5,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0xc5,0xd4,0xff,0xff,0x03,0x00
+# GFX12: v_cmpx_ne_i32_e64 v255, v255 ; encoding: [0x7e,0x00,0xc5,0xd4,0xff,0xff,0x03,0x00]
-# GFX12: v_cmpx_ne_i32_e64 s1, s2 ; encoding: [0x7e,0x00,0xc5,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0xc5,0xd4,0x01,0x04,0x00,0x00
+# GFX12: v_cmpx_ne_i32_e64 s1, s2 ; encoding: [0x7e,0x00,0xc5,0xd4,0x01,0x04,0x00,0x00]
-# GFX12: v_cmpx_ne_i32_e64 s105, s105 ; encoding: [0x7e,0x00,0xc5,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0xc5,0xd4,0x69,0xd2,0x00,0x00
+# GFX12: v_cmpx_ne_i32_e64 s105, s105 ; encoding: [0x7e,0x00,0xc5,0xd4,0x69,0xd2,0x00,0x00]
-# GFX12: v_cmpx_ne_i32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xc5,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0xc5,0xd4,0x6a,0xf6,0x00,0x00
+# GFX12: v_cmpx_ne_i32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xc5,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX12: v_cmpx_ne_i32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xc5,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xc5,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_ne_i32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xc5,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_ne_i32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xc5,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0xc5,0xd4,0x7b,0xfa,0x01,0x00
+# GFX12: v_cmpx_ne_i32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xc5,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX12: v_cmpx_ne_i32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xc5,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0xc5,0xd4,0x7d,0xe0,0x01,0x00
+# GFX12: v_cmpx_ne_i32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xc5,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX12: v_cmpx_ne_i32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xc5,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0xc5,0xd4,0x7e,0x82,0x01,0x00
+# GFX12: v_cmpx_ne_i32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xc5,0xd4,0x7e,0x82,0x01,0x00]
-# GFX12: v_cmpx_ne_i32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xc5,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x00,0xc5,0xd4,0x7f,0xf8,0x00,0x00
+# GFX12: v_cmpx_ne_i32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xc5,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX12: v_cmpx_ne_i32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xc5,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0xc5,0xd4,0x7c,0xfc,0x00,0x00
+# GFX12: v_cmpx_ne_i32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xc5,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX12: v_cmpx_ne_i32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xc5,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0xc5,0xd4,0xc1,0xfe,0x00,0x00
+# GFX12: v_cmpx_ne_i32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xc5,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX12: v_cmpx_ne_i32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xc5,0xd4,0xf0,0xfa,0x00,0x00]
 0x7e,0x00,0xc5,0xd4,0xf0,0xfa,0x00,0x00
+# GFX12: v_cmpx_ne_i32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xc5,0xd4,0xf0,0xfa,0x00,0x00]
-# GFX12: v_cmpx_ne_i32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xc5,0xd4,0xfd,0xd4,0x00,0x00]
 0x7e,0x00,0xc5,0xd4,0xfd,0xd4,0x00,0x00
+# GFX12: v_cmpx_ne_i32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xc5,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX12: v_cmpx_ne_i32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xc5,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xc5,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_ne_i32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xc5,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_ne_i64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xd5,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xd5,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_ne_i64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xd5,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_ne_i64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xd5,0xd4,0xfe,0xfd,0x03,0x00]
 0x7e,0x00,0xd5,0xd4,0xfe,0xfd,0x03,0x00
+# GFX12: v_cmpx_ne_i64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xd5,0xd4,0xfe,0xfd,0x03,0x00]
-# GFX12: v_cmpx_ne_i64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xd5,0xd4,0x02,0x08,0x00,0x00]
 0x7e,0x00,0xd5,0xd4,0x02,0x08,0x00,0x00
+# GFX12: v_cmpx_ne_i64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xd5,0xd4,0x02,0x08,0x00,0x00]
-# GFX12: v_cmpx_ne_i64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xd5,0xd4,0x68,0xd0,0x00,0x00]
 0x7e,0x00,0xd5,0xd4,0x68,0xd0,0x00,0x00
+# GFX12: v_cmpx_ne_i64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xd5,0xd4,0x68,0xd0,0x00,0x00]
-# GFX12: v_cmpx_ne_i64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xd5,0xd4,0x6a,0xf4,0x00,0x00]
 0x7e,0x00,0xd5,0xd4,0x6a,0xf4,0x00,0x00
+# GFX12: v_cmpx_ne_i64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xd5,0xd4,0x6a,0xf4,0x00,0x00]
-# GFX12: v_cmpx_ne_i64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xd5,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xd5,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_ne_i64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xd5,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_ne_i64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xd5,0xd4,0x7e,0xfa,0x01,0x00]
 0x7e,0x00,0xd5,0xd4,0x7e,0xfa,0x01,0x00
+# GFX12: v_cmpx_ne_i64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xd5,0xd4,0x7e,0xfa,0x01,0x00]
-# GFX12: v_cmpx_ne_i64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xd5,0xd4,0x7c,0xe0,0x01,0x00]
 0x7e,0x00,0xd5,0xd4,0x7c,0xe0,0x01,0x00
+# GFX12: v_cmpx_ne_i64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xd5,0xd4,0x7c,0xe0,0x01,0x00]
-# GFX12: v_cmpx_ne_i64_e64 -1, -1 ; encoding: [0x7e,0x00,0xd5,0xd4,0xc1,0x82,0x01,0x00]
 0x7e,0x00,0xd5,0xd4,0xc1,0x82,0x01,0x00
+# GFX12: v_cmpx_ne_i64_e64 -1, -1 ; encoding: [0x7e,0x00,0xd5,0xd4,0xc1,0x82,0x01,0x00]
-# GFX12: v_cmpx_ne_i64_e64 0.5, null ; encoding: [0x7e,0x00,0xd5,0xd4,0xf0,0xf8,0x00,0x00]
 0x7e,0x00,0xd5,0xd4,0xf0,0xf8,0x00,0x00
+# GFX12: v_cmpx_ne_i64_e64 0.5, null ; encoding: [0x7e,0x00,0xd5,0xd4,0xf0,0xf8,0x00,0x00]
-# GFX12: v_cmpx_ne_i64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xd5,0xd4,0xfd,0xfc,0x00,0x00]
 0x7e,0x00,0xd5,0xd4,0xfd,0xfc,0x00,0x00
+# GFX12: v_cmpx_ne_i64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xd5,0xd4,0xfd,0xfc,0x00,0x00]
-# GFX12: v_cmpx_ne_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd5,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xd5,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_ne_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd5,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_ne_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbd,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xbd,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_ne_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbd,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_ne_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0xbd,0xd4,0xff,0xff,0x03,0x00
+# GFX12: v_cmpx_ne_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xff,0x03,0x00]
-# GFX12: v_cmpx_ne_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbd,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0xbd,0xd4,0x01,0x04,0x00,0x00
+# GFX12: v_cmpx_ne_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbd,0xd4,0x01,0x04,0x00,0x00]
-# GFX12: v_cmpx_ne_u16_e64 s105, s105 ; encoding: [0x7e,0x00,0xbd,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0xbd,0xd4,0x69,0xd2,0x00,0x00
+# GFX12: v_cmpx_ne_u16_e64 s105, s105 ; encoding: [0x7e,0x00,0xbd,0xd4,0x69,0xd2,0x00,0x00]
-# GFX12: v_cmpx_ne_u16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xbd,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0xbd,0xd4,0x6a,0xf6,0x00,0x00
+# GFX12: v_cmpx_ne_u16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xbd,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX12: v_cmpx_ne_u16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xbd,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x7e,0x00,0xbd,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_ne_u16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xbd,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_ne_u16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xbd,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0xbd,0xd4,0x7b,0xfa,0x01,0x00
+# GFX12: v_cmpx_ne_u16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xbd,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX12: v_cmpx_ne_u16_e64 m0, 0x3800
 0x7e,0x00,0xbd,0xd4,0x7d,0xe0,0x01,0x00
+# GFX12: v_cmpx_ne_u16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xbd,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
-# GFX12: v_cmpx_ne_u16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xbd,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0xbd,0xd4,0x7e,0x82,0x01,0x00
+# GFX12: v_cmpx_ne_u16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xbd,0xd4,0x7e,0x82,0x01,0x00]
-# GFX12: v_cmpx_ne_u16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xbd,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x00,0xbd,0xd4,0x7f,0xf8,0x00,0x00
+# GFX12: v_cmpx_ne_u16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xbd,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX12: v_cmpx_ne_u16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xbd,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0xbd,0xd4,0x7c,0xfc,0x00,0x00
+# GFX12: v_cmpx_ne_u16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xbd,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX12: v_cmpx_ne_u16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xbd,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0xbd,0xd4,0xc1,0xfe,0x00,0x00
+# GFX12: v_cmpx_ne_u16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xbd,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX12: v_cmpx_ne_u16_e64 0x3800, m0
 0x7e,0x00,0xbd,0xd4,0xf0,0xfa,0x00,0x00
+# GFX12: v_cmpx_ne_u16_e64 0x3800, m0 ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
-# GFX12: v_cmpx_ne_u16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xbd,0xd4,0xfd,0xd4,0x00,0x00]
 0x7e,0x00,0xbd,0xd4,0xfd,0xd4,0x00,0x00
+# GFX12: v_cmpx_ne_u16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xbd,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX12: v_cmpx_ne_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 0x7e,0x00,0xbd,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_ne_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_ne_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcd,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xcd,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_ne_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcd,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_ne_u32_e64 v255, v255 ; encoding: [0x7e,0x00,0xcd,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0xcd,0xd4,0xff,0xff,0x03,0x00
+# GFX12: v_cmpx_ne_u32_e64 v255, v255 ; encoding: [0x7e,0x00,0xcd,0xd4,0xff,0xff,0x03,0x00]
-# GFX12: v_cmpx_ne_u32_e64 s1, s2 ; encoding: [0x7e,0x00,0xcd,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0xcd,0xd4,0x01,0x04,0x00,0x00
+# GFX12: v_cmpx_ne_u32_e64 s1, s2 ; encoding: [0x7e,0x00,0xcd,0xd4,0x01,0x04,0x00,0x00]
-# GFX12: v_cmpx_ne_u32_e64 s105, s105 ; encoding: [0x7e,0x00,0xcd,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0xcd,0xd4,0x69,0xd2,0x00,0x00
+# GFX12: v_cmpx_ne_u32_e64 s105, s105 ; encoding: [0x7e,0x00,0xcd,0xd4,0x69,0xd2,0x00,0x00]
-# GFX12: v_cmpx_ne_u32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xcd,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0xcd,0xd4,0x6a,0xf6,0x00,0x00
+# GFX12: v_cmpx_ne_u32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xcd,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX12: v_cmpx_ne_u32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xcd,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xcd,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_ne_u32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0xcd,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_ne_u32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xcd,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0xcd,0xd4,0x7b,0xfa,0x01,0x00
+# GFX12: v_cmpx_ne_u32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xcd,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX12: v_cmpx_ne_u32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xcd,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0xcd,0xd4,0x7d,0xe0,0x01,0x00
+# GFX12: v_cmpx_ne_u32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0xcd,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX12: v_cmpx_ne_u32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xcd,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0xcd,0xd4,0x7e,0x82,0x01,0x00
+# GFX12: v_cmpx_ne_u32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xcd,0xd4,0x7e,0x82,0x01,0x00]
-# GFX12: v_cmpx_ne_u32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xcd,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x00,0xcd,0xd4,0x7f,0xf8,0x00,0x00
+# GFX12: v_cmpx_ne_u32_e64 exec_hi, null ; encoding: [0x7e,0x00,0xcd,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX12: v_cmpx_ne_u32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xcd,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0xcd,0xd4,0x7c,0xfc,0x00,0x00
+# GFX12: v_cmpx_ne_u32_e64 null, exec_lo ; encoding: [0x7e,0x00,0xcd,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX12: v_cmpx_ne_u32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xcd,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0xcd,0xd4,0xc1,0xfe,0x00,0x00
+# GFX12: v_cmpx_ne_u32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xcd,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX12: v_cmpx_ne_u32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xcd,0xd4,0xf0,0xfa,0x00,0x00]
 0x7e,0x00,0xcd,0xd4,0xf0,0xfa,0x00,0x00
+# GFX12: v_cmpx_ne_u32_e64 0.5, m0 ; encoding: [0x7e,0x00,0xcd,0xd4,0xf0,0xfa,0x00,0x00]
-# GFX12: v_cmpx_ne_u32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xcd,0xd4,0xfd,0xd4,0x00,0x00]
 0x7e,0x00,0xcd,0xd4,0xfd,0xd4,0x00,0x00
+# GFX12: v_cmpx_ne_u32_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xcd,0xd4,0xfd,0xd4,0x00,0x00]
-# GFX12: v_cmpx_ne_u32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xcd,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xcd,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_ne_u32_e64 0xaf123456, vcc_hi ; encoding: [0x7e,0x00,0xcd,0xd4,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_ne_u64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xdd,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xdd,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_ne_u64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xdd,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_ne_u64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xdd,0xd4,0xfe,0xfd,0x03,0x00]
 0x7e,0x00,0xdd,0xd4,0xfe,0xfd,0x03,0x00
+# GFX12: v_cmpx_ne_u64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xdd,0xd4,0xfe,0xfd,0x03,0x00]
-# GFX12: v_cmpx_ne_u64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xdd,0xd4,0x02,0x08,0x00,0x00]
 0x7e,0x00,0xdd,0xd4,0x02,0x08,0x00,0x00
+# GFX12: v_cmpx_ne_u64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xdd,0xd4,0x02,0x08,0x00,0x00]
-# GFX12: v_cmpx_ne_u64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xdd,0xd4,0x68,0xd0,0x00,0x00]
 0x7e,0x00,0xdd,0xd4,0x68,0xd0,0x00,0x00
+# GFX12: v_cmpx_ne_u64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xdd,0xd4,0x68,0xd0,0x00,0x00]
-# GFX12: v_cmpx_ne_u64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xdd,0xd4,0x6a,0xf4,0x00,0x00]
 0x7e,0x00,0xdd,0xd4,0x6a,0xf4,0x00,0x00
+# GFX12: v_cmpx_ne_u64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xdd,0xd4,0x6a,0xf4,0x00,0x00]
-# GFX12: v_cmpx_ne_u64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xdd,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xdd,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_ne_u64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xdd,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_ne_u64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xdd,0xd4,0x7e,0xfa,0x01,0x00]
 0x7e,0x00,0xdd,0xd4,0x7e,0xfa,0x01,0x00
+# GFX12: v_cmpx_ne_u64_e64 exec, src_scc ; encoding: [0x7e,0x00,0xdd,0xd4,0x7e,0xfa,0x01,0x00]
-# GFX12: v_cmpx_ne_u64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xdd,0xd4,0x7c,0xe0,0x01,0x00]
 0x7e,0x00,0xdd,0xd4,0x7c,0xe0,0x01,0x00
+# GFX12: v_cmpx_ne_u64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xdd,0xd4,0x7c,0xe0,0x01,0x00]
-# GFX12: v_cmpx_ne_u64_e64 -1, -1 ; encoding: [0x7e,0x00,0xdd,0xd4,0xc1,0x82,0x01,0x00]
 0x7e,0x00,0xdd,0xd4,0xc1,0x82,0x01,0x00
+# GFX12: v_cmpx_ne_u64_e64 -1, -1 ; encoding: [0x7e,0x00,0xdd,0xd4,0xc1,0x82,0x01,0x00]
-# GFX12: v_cmpx_ne_u64_e64 0.5, null ; encoding: [0x7e,0x00,0xdd,0xd4,0xf0,0xf8,0x00,0x00]
 0x7e,0x00,0xdd,0xd4,0xf0,0xf8,0x00,0x00
+# GFX12: v_cmpx_ne_u64_e64 0.5, null ; encoding: [0x7e,0x00,0xdd,0xd4,0xf0,0xf8,0x00,0x00]
-# GFX12: v_cmpx_ne_u64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xdd,0xd4,0xfd,0xfc,0x00,0x00]
 0x7e,0x00,0xdd,0xd4,0xfd,0xfc,0x00,0x00
+# GFX12: v_cmpx_ne_u64_e64 src_scc, exec ; encoding: [0x7e,0x00,0xdd,0xd4,0xfd,0xfc,0x00,0x00]
-# GFX12: v_cmpx_ne_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdd,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xdd,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_ne_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdd,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_neq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_neq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_neq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00
+# GFX12: v_cmpx_neq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00]
-# GFX12: v_cmpx_neq_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0x8d,0xd4,0x01,0x04,0x00,0x00
+# GFX12: v_cmpx_neq_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x04,0x00,0x00]
-# GFX12: v_cmpx_neq_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x8d,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0x8d,0xd4,0x69,0xd2,0x00,0x00
+# GFX12: v_cmpx_neq_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x8d,0xd4,0x69,0xd2,0x00,0x00]
-# GFX12: v_cmpx_neq_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x8d,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0x8d,0xd4,0x6a,0xf6,0x00,0x00
+# GFX12: v_cmpx_neq_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x8d,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX12: v_cmpx_neq_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x8d,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x7e,0x00,0x8d,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_neq_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x8d,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_neq_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x8d,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0x8d,0xd4,0x7b,0xfa,0x01,0x00
+# GFX12: v_cmpx_neq_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x8d,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX12: v_cmpx_neq_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x8d,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0x8d,0xd4,0x7d,0xe0,0x01,0x00
+# GFX12: v_cmpx_neq_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x8d,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX12: v_cmpx_neq_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x8d,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0x8d,0xd4,0x7e,0x82,0x01,0x00
+# GFX12: v_cmpx_neq_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x8d,0xd4,0x7e,0x82,0x01,0x00]
-# GFX12: v_cmpx_neq_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x8d,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x8d,0xd4,0x7f,0xf8,0x00,0x00
+# GFX12: v_cmpx_neq_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x8d,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX12: v_cmpx_neq_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x8d,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x8d,0xd4,0x7c,0xfc,0x00,0x00
+# GFX12: v_cmpx_neq_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x8d,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX12: v_cmpx_neq_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x8d,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x8d,0xd4,0xc1,0xfe,0x00,0x00
+# GFX12: v_cmpx_neq_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x8d,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX12: v_cmpx_neq_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x8d,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x8d,0xd4,0xf0,0xfa,0x00,0x40
+# GFX12: v_cmpx_neq_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x8d,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX12: v_cmpx_neq_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x8d,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x8d,0xd4,0xfd,0xd4,0x00,0x20
+# GFX12: v_cmpx_neq_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x8d,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX12: v_cmpx_neq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8d,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
 0x7e,0x83,0x8d,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_neq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8d,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_neq_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9d,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0x9d,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_neq_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9d,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_neq_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x9d,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0x9d,0xd4,0xff,0xff,0x03,0x00
+# GFX12: v_cmpx_neq_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x9d,0xd4,0xff,0xff,0x03,0x00]
-# GFX12: v_cmpx_neq_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x9d,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0x9d,0xd4,0x01,0x04,0x00,0x00
+# GFX12: v_cmpx_neq_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x9d,0xd4,0x01,0x04,0x00,0x00]
-# GFX12: v_cmpx_neq_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x9d,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0x9d,0xd4,0x69,0xd2,0x00,0x00
+# GFX12: v_cmpx_neq_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x9d,0xd4,0x69,0xd2,0x00,0x00]
-# GFX12: v_cmpx_neq_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x9d,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0x9d,0xd4,0x6a,0xf6,0x00,0x00
+# GFX12: v_cmpx_neq_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x9d,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX12: v_cmpx_neq_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x9d,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0x9d,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_neq_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x9d,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_neq_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x9d,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0x9d,0xd4,0x7b,0xfa,0x01,0x00
+# GFX12: v_cmpx_neq_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x9d,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX12: v_cmpx_neq_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x9d,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0x9d,0xd4,0x7d,0xe0,0x01,0x00
+# GFX12: v_cmpx_neq_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x9d,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX12: v_cmpx_neq_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x9d,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0x9d,0xd4,0x7e,0x82,0x01,0x00
+# GFX12: v_cmpx_neq_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x9d,0xd4,0x7e,0x82,0x01,0x00]
-# GFX12: v_cmpx_neq_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x9d,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x9d,0xd4,0x7f,0xf8,0x00,0x00
+# GFX12: v_cmpx_neq_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x9d,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX12: v_cmpx_neq_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x9d,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x9d,0xd4,0x7c,0xfc,0x00,0x00
+# GFX12: v_cmpx_neq_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x9d,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX12: v_cmpx_neq_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x9d,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x9d,0xd4,0xc1,0xfe,0x00,0x00
+# GFX12: v_cmpx_neq_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x9d,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX12: v_cmpx_neq_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x9d,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x9d,0xd4,0xf0,0xfa,0x00,0x40
+# GFX12: v_cmpx_neq_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x9d,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX12: v_cmpx_neq_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x9d,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x9d,0xd4,0xfd,0xd4,0x00,0x20
+# GFX12: v_cmpx_neq_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x9d,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX12: v_cmpx_neq_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x9d,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
 0x7e,0x83,0x9d,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_neq_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x9d,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_neq_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xad,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xad,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_neq_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xad,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_neq_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xad,0xd4,0xfe,0xfd,0x03,0x00]
 0x7e,0x00,0xad,0xd4,0xfe,0xfd,0x03,0x00
+# GFX12: v_cmpx_neq_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xad,0xd4,0xfe,0xfd,0x03,0x00]
-# GFX12: v_cmpx_neq_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xad,0xd4,0x02,0x08,0x00,0x00]
 0x7e,0x00,0xad,0xd4,0x02,0x08,0x00,0x00
+# GFX12: v_cmpx_neq_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xad,0xd4,0x02,0x08,0x00,0x00]
-# GFX12: v_cmpx_neq_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xad,0xd4,0x68,0xd0,0x00,0x00]
 0x7e,0x00,0xad,0xd4,0x68,0xd0,0x00,0x00
+# GFX12: v_cmpx_neq_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xad,0xd4,0x68,0xd0,0x00,0x00]
-# GFX12: v_cmpx_neq_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xad,0xd4,0x6a,0xf4,0x00,0x00]
 0x7e,0x00,0xad,0xd4,0x6a,0xf4,0x00,0x00
+# GFX12: v_cmpx_neq_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xad,0xd4,0x6a,0xf4,0x00,0x00]
-# GFX12: v_cmpx_neq_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xad,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xad,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_neq_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xad,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_neq_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xad,0xd4,0x7e,0xfa,0x01,0x20]
 0x7e,0x01,0xad,0xd4,0x7e,0xfa,0x01,0x20
+# GFX12: v_cmpx_neq_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xad,0xd4,0x7e,0xfa,0x01,0x20]
-# GFX12: v_cmpx_neq_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xad,0xd4,0x7c,0xe0,0x01,0x00]
 0x7e,0x00,0xad,0xd4,0x7c,0xe0,0x01,0x00
+# GFX12: v_cmpx_neq_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xad,0xd4,0x7c,0xe0,0x01,0x00]
-# GFX12: v_cmpx_neq_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xad,0xd4,0xc1,0x82,0x01,0x00]
 0x7e,0x00,0xad,0xd4,0xc1,0x82,0x01,0x00
+# GFX12: v_cmpx_neq_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xad,0xd4,0xc1,0x82,0x01,0x00]
-# GFX12: v_cmpx_neq_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xad,0xd4,0xf0,0xf8,0x00,0x00]
 0x7e,0x00,0xad,0xd4,0xf0,0xf8,0x00,0x00
+# GFX12: v_cmpx_neq_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xad,0xd4,0xf0,0xf8,0x00,0x00]
-# GFX12: v_cmpx_neq_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xad,0xd4,0xfd,0xfc,0x00,0x60]
 0x7e,0x03,0xad,0xd4,0xfd,0xfc,0x00,0x60
+# GFX12: v_cmpx_neq_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xad,0xd4,0xfd,0xfc,0x00,0x60]
-# GFX12: v_cmpx_neq_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xad,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
 0x7e,0x82,0xad,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_neq_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xad,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_nge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_nge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_nge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00
+# GFX12: v_cmpx_nge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00]
-# GFX12: v_cmpx_nge_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0x89,0xd4,0x01,0x04,0x00,0x00
+# GFX12: v_cmpx_nge_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x04,0x00,0x00]
-# GFX12: v_cmpx_nge_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x89,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0x89,0xd4,0x69,0xd2,0x00,0x00
+# GFX12: v_cmpx_nge_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x89,0xd4,0x69,0xd2,0x00,0x00]
-# GFX12: v_cmpx_nge_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x89,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0x89,0xd4,0x6a,0xf6,0x00,0x00
+# GFX12: v_cmpx_nge_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x89,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX12: v_cmpx_nge_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x89,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x7e,0x00,0x89,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_nge_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x89,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_nge_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x89,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0x89,0xd4,0x7b,0xfa,0x01,0x00
+# GFX12: v_cmpx_nge_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x89,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX12: v_cmpx_nge_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x89,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0x89,0xd4,0x7d,0xe0,0x01,0x00
+# GFX12: v_cmpx_nge_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x89,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX12: v_cmpx_nge_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x89,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0x89,0xd4,0x7e,0x82,0x01,0x00
+# GFX12: v_cmpx_nge_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x89,0xd4,0x7e,0x82,0x01,0x00]
-# GFX12: v_cmpx_nge_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x89,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x89,0xd4,0x7f,0xf8,0x00,0x00
+# GFX12: v_cmpx_nge_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x89,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX12: v_cmpx_nge_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x89,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x89,0xd4,0x7c,0xfc,0x00,0x00
+# GFX12: v_cmpx_nge_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x89,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX12: v_cmpx_nge_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x89,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x89,0xd4,0xc1,0xfe,0x00,0x00
+# GFX12: v_cmpx_nge_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x89,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX12: v_cmpx_nge_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x89,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x89,0xd4,0xf0,0xfa,0x00,0x40
+# GFX12: v_cmpx_nge_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x89,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX12: v_cmpx_nge_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x89,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x89,0xd4,0xfd,0xd4,0x00,0x20
+# GFX12: v_cmpx_nge_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x89,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX12: v_cmpx_nge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x89,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
 0x7e,0x83,0x89,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_nge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x89,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_nge_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x99,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0x99,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_nge_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x99,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_nge_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x99,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0x99,0xd4,0xff,0xff,0x03,0x00
+# GFX12: v_cmpx_nge_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x99,0xd4,0xff,0xff,0x03,0x00]
-# GFX12: v_cmpx_nge_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x99,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0x99,0xd4,0x01,0x04,0x00,0x00
+# GFX12: v_cmpx_nge_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x99,0xd4,0x01,0x04,0x00,0x00]
-# GFX12: v_cmpx_nge_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x99,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0x99,0xd4,0x69,0xd2,0x00,0x00
+# GFX12: v_cmpx_nge_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x99,0xd4,0x69,0xd2,0x00,0x00]
-# GFX12: v_cmpx_nge_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x99,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0x99,0xd4,0x6a,0xf6,0x00,0x00
+# GFX12: v_cmpx_nge_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x99,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX12: v_cmpx_nge_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x99,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0x99,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_nge_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x99,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_nge_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x99,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0x99,0xd4,0x7b,0xfa,0x01,0x00
+# GFX12: v_cmpx_nge_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x99,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX12: v_cmpx_nge_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x99,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0x99,0xd4,0x7d,0xe0,0x01,0x00
+# GFX12: v_cmpx_nge_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x99,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX12: v_cmpx_nge_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x99,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0x99,0xd4,0x7e,0x82,0x01,0x00
+# GFX12: v_cmpx_nge_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x99,0xd4,0x7e,0x82,0x01,0x00]
-# GFX12: v_cmpx_nge_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x99,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x99,0xd4,0x7f,0xf8,0x00,0x00
+# GFX12: v_cmpx_nge_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x99,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX12: v_cmpx_nge_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x99,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x99,0xd4,0x7c,0xfc,0x00,0x00
+# GFX12: v_cmpx_nge_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x99,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX12: v_cmpx_nge_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x99,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x99,0xd4,0xc1,0xfe,0x00,0x00
+# GFX12: v_cmpx_nge_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x99,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX12: v_cmpx_nge_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x99,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x99,0xd4,0xf0,0xfa,0x00,0x40
+# GFX12: v_cmpx_nge_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x99,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX12: v_cmpx_nge_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x99,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x99,0xd4,0xfd,0xd4,0x00,0x20
+# GFX12: v_cmpx_nge_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x99,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX12: v_cmpx_nge_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x99,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
 0x7e,0x83,0x99,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_nge_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x99,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_nge_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa9,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xa9,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_nge_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa9,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_nge_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa9,0xd4,0xfe,0xfd,0x03,0x00]
 0x7e,0x00,0xa9,0xd4,0xfe,0xfd,0x03,0x00
+# GFX12: v_cmpx_nge_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa9,0xd4,0xfe,0xfd,0x03,0x00]
-# GFX12: v_cmpx_nge_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa9,0xd4,0x02,0x08,0x00,0x00]
 0x7e,0x00,0xa9,0xd4,0x02,0x08,0x00,0x00
+# GFX12: v_cmpx_nge_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa9,0xd4,0x02,0x08,0x00,0x00]
-# GFX12: v_cmpx_nge_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa9,0xd4,0x68,0xd0,0x00,0x00]
 0x7e,0x00,0xa9,0xd4,0x68,0xd0,0x00,0x00
+# GFX12: v_cmpx_nge_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa9,0xd4,0x68,0xd0,0x00,0x00]
-# GFX12: v_cmpx_nge_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa9,0xd4,0x6a,0xf4,0x00,0x00]
 0x7e,0x00,0xa9,0xd4,0x6a,0xf4,0x00,0x00
+# GFX12: v_cmpx_nge_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa9,0xd4,0x6a,0xf4,0x00,0x00]
-# GFX12: v_cmpx_nge_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa9,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xa9,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_nge_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa9,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_nge_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa9,0xd4,0x7e,0xfa,0x01,0x20]
 0x7e,0x01,0xa9,0xd4,0x7e,0xfa,0x01,0x20
+# GFX12: v_cmpx_nge_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa9,0xd4,0x7e,0xfa,0x01,0x20]
-# GFX12: v_cmpx_nge_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa9,0xd4,0x7c,0xe0,0x01,0x00]
 0x7e,0x00,0xa9,0xd4,0x7c,0xe0,0x01,0x00
+# GFX12: v_cmpx_nge_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa9,0xd4,0x7c,0xe0,0x01,0x00]
-# GFX12: v_cmpx_nge_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa9,0xd4,0xc1,0x82,0x01,0x00]
 0x7e,0x00,0xa9,0xd4,0xc1,0x82,0x01,0x00
+# GFX12: v_cmpx_nge_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa9,0xd4,0xc1,0x82,0x01,0x00]
-# GFX12: v_cmpx_nge_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa9,0xd4,0xf0,0xf8,0x00,0x00]
 0x7e,0x00,0xa9,0xd4,0xf0,0xf8,0x00,0x00
+# GFX12: v_cmpx_nge_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa9,0xd4,0xf0,0xf8,0x00,0x00]
-# GFX12: v_cmpx_nge_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa9,0xd4,0xfd,0xfc,0x00,0x60]
 0x7e,0x03,0xa9,0xd4,0xfd,0xfc,0x00,0x60
+# GFX12: v_cmpx_nge_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa9,0xd4,0xfd,0xfc,0x00,0x60]
-# GFX12: v_cmpx_nge_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa9,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
 0x7e,0x82,0xa9,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_nge_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa9,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_ngt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_ngt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_ngt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00
+# GFX12: v_cmpx_ngt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00]
-# GFX12: v_cmpx_ngt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0x8b,0xd4,0x01,0x04,0x00,0x00
+# GFX12: v_cmpx_ngt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x04,0x00,0x00]
-# GFX12: v_cmpx_ngt_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x8b,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0x8b,0xd4,0x69,0xd2,0x00,0x00
+# GFX12: v_cmpx_ngt_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x8b,0xd4,0x69,0xd2,0x00,0x00]
-# GFX12: v_cmpx_ngt_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x8b,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0x8b,0xd4,0x6a,0xf6,0x00,0x00
+# GFX12: v_cmpx_ngt_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x8b,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX12: v_cmpx_ngt_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x8b,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x7e,0x00,0x8b,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_ngt_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x8b,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_ngt_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x8b,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0x8b,0xd4,0x7b,0xfa,0x01,0x00
+# GFX12: v_cmpx_ngt_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x8b,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX12: v_cmpx_ngt_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x8b,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0x8b,0xd4,0x7d,0xe0,0x01,0x00
+# GFX12: v_cmpx_ngt_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x8b,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX12: v_cmpx_ngt_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x8b,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0x8b,0xd4,0x7e,0x82,0x01,0x00
+# GFX12: v_cmpx_ngt_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x8b,0xd4,0x7e,0x82,0x01,0x00]
-# GFX12: v_cmpx_ngt_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x8b,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x8b,0xd4,0x7f,0xf8,0x00,0x00
+# GFX12: v_cmpx_ngt_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x8b,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX12: v_cmpx_ngt_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x8b,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x8b,0xd4,0x7c,0xfc,0x00,0x00
+# GFX12: v_cmpx_ngt_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x8b,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX12: v_cmpx_ngt_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x8b,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x8b,0xd4,0xc1,0xfe,0x00,0x00
+# GFX12: v_cmpx_ngt_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x8b,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX12: v_cmpx_ngt_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x8b,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x8b,0xd4,0xf0,0xfa,0x00,0x40
+# GFX12: v_cmpx_ngt_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x8b,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX12: v_cmpx_ngt_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x8b,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x8b,0xd4,0xfd,0xd4,0x00,0x20
+# GFX12: v_cmpx_ngt_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x8b,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX12: v_cmpx_ngt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8b,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
 0x7e,0x83,0x8b,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_ngt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8b,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_ngt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9b,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0x9b,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_ngt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9b,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_ngt_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x9b,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0x9b,0xd4,0xff,0xff,0x03,0x00
+# GFX12: v_cmpx_ngt_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x9b,0xd4,0xff,0xff,0x03,0x00]
-# GFX12: v_cmpx_ngt_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x9b,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0x9b,0xd4,0x01,0x04,0x00,0x00
+# GFX12: v_cmpx_ngt_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x9b,0xd4,0x01,0x04,0x00,0x00]
-# GFX12: v_cmpx_ngt_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x9b,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0x9b,0xd4,0x69,0xd2,0x00,0x00
+# GFX12: v_cmpx_ngt_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x9b,0xd4,0x69,0xd2,0x00,0x00]
-# GFX12: v_cmpx_ngt_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x9b,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0x9b,0xd4,0x6a,0xf6,0x00,0x00
+# GFX12: v_cmpx_ngt_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x9b,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX12: v_cmpx_ngt_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x9b,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0x9b,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_ngt_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x9b,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_ngt_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x9b,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0x9b,0xd4,0x7b,0xfa,0x01,0x00
+# GFX12: v_cmpx_ngt_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x9b,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX12: v_cmpx_ngt_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x9b,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0x9b,0xd4,0x7d,0xe0,0x01,0x00
+# GFX12: v_cmpx_ngt_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x9b,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX12: v_cmpx_ngt_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x9b,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0x9b,0xd4,0x7e,0x82,0x01,0x00
+# GFX12: v_cmpx_ngt_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x9b,0xd4,0x7e,0x82,0x01,0x00]
-# GFX12: v_cmpx_ngt_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x9b,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x9b,0xd4,0x7f,0xf8,0x00,0x00
+# GFX12: v_cmpx_ngt_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x9b,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX12: v_cmpx_ngt_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x9b,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x9b,0xd4,0x7c,0xfc,0x00,0x00
+# GFX12: v_cmpx_ngt_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x9b,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX12: v_cmpx_ngt_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x9b,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x9b,0xd4,0xc1,0xfe,0x00,0x00
+# GFX12: v_cmpx_ngt_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x9b,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX12: v_cmpx_ngt_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x9b,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x9b,0xd4,0xf0,0xfa,0x00,0x40
+# GFX12: v_cmpx_ngt_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x9b,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX12: v_cmpx_ngt_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x9b,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x9b,0xd4,0xfd,0xd4,0x00,0x20
+# GFX12: v_cmpx_ngt_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x9b,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX12: v_cmpx_ngt_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x9b,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
 0x7e,0x83,0x9b,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_ngt_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x9b,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_ngt_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xab,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xab,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_ngt_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xab,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_ngt_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xab,0xd4,0xfe,0xfd,0x03,0x00]
 0x7e,0x00,0xab,0xd4,0xfe,0xfd,0x03,0x00
+# GFX12: v_cmpx_ngt_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xab,0xd4,0xfe,0xfd,0x03,0x00]
-# GFX12: v_cmpx_ngt_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xab,0xd4,0x02,0x08,0x00,0x00]
 0x7e,0x00,0xab,0xd4,0x02,0x08,0x00,0x00
+# GFX12: v_cmpx_ngt_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xab,0xd4,0x02,0x08,0x00,0x00]
-# GFX12: v_cmpx_ngt_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xab,0xd4,0x68,0xd0,0x00,0x00]
 0x7e,0x00,0xab,0xd4,0x68,0xd0,0x00,0x00
+# GFX12: v_cmpx_ngt_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xab,0xd4,0x68,0xd0,0x00,0x00]
-# GFX12: v_cmpx_ngt_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xab,0xd4,0x6a,0xf4,0x00,0x00]
 0x7e,0x00,0xab,0xd4,0x6a,0xf4,0x00,0x00
+# GFX12: v_cmpx_ngt_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xab,0xd4,0x6a,0xf4,0x00,0x00]
-# GFX12: v_cmpx_ngt_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xab,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xab,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_ngt_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xab,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_ngt_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xab,0xd4,0x7e,0xfa,0x01,0x20]
 0x7e,0x01,0xab,0xd4,0x7e,0xfa,0x01,0x20
+# GFX12: v_cmpx_ngt_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xab,0xd4,0x7e,0xfa,0x01,0x20]
-# GFX12: v_cmpx_ngt_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xab,0xd4,0x7c,0xe0,0x01,0x00]
 0x7e,0x00,0xab,0xd4,0x7c,0xe0,0x01,0x00
+# GFX12: v_cmpx_ngt_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xab,0xd4,0x7c,0xe0,0x01,0x00]
-# GFX12: v_cmpx_ngt_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xab,0xd4,0xc1,0x82,0x01,0x00]
 0x7e,0x00,0xab,0xd4,0xc1,0x82,0x01,0x00
+# GFX12: v_cmpx_ngt_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xab,0xd4,0xc1,0x82,0x01,0x00]
-# GFX12: v_cmpx_ngt_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xab,0xd4,0xf0,0xf8,0x00,0x00]
 0x7e,0x00,0xab,0xd4,0xf0,0xf8,0x00,0x00
+# GFX12: v_cmpx_ngt_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xab,0xd4,0xf0,0xf8,0x00,0x00]
-# GFX12: v_cmpx_ngt_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xab,0xd4,0xfd,0xfc,0x00,0x60]
 0x7e,0x03,0xab,0xd4,0xfd,0xfc,0x00,0x60
+# GFX12: v_cmpx_ngt_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xab,0xd4,0xfd,0xfc,0x00,0x60]
-# GFX12: v_cmpx_ngt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xab,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
 0x7e,0x82,0xab,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_ngt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xab,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_nle_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_nle_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_nle_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00
+# GFX12: v_cmpx_nle_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00]
-# GFX12: v_cmpx_nle_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0x8c,0xd4,0x01,0x04,0x00,0x00
+# GFX12: v_cmpx_nle_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x04,0x00,0x00]
-# GFX12: v_cmpx_nle_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x8c,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0x8c,0xd4,0x69,0xd2,0x00,0x00
+# GFX12: v_cmpx_nle_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x8c,0xd4,0x69,0xd2,0x00,0x00]
-# GFX12: v_cmpx_nle_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x8c,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0x8c,0xd4,0x6a,0xf6,0x00,0x00
+# GFX12: v_cmpx_nle_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x8c,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX12: v_cmpx_nle_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x8c,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x7e,0x00,0x8c,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_nle_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x8c,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_nle_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x8c,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0x8c,0xd4,0x7b,0xfa,0x01,0x00
+# GFX12: v_cmpx_nle_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x8c,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX12: v_cmpx_nle_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x8c,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0x8c,0xd4,0x7d,0xe0,0x01,0x00
+# GFX12: v_cmpx_nle_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x8c,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX12: v_cmpx_nle_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x8c,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0x8c,0xd4,0x7e,0x82,0x01,0x00
+# GFX12: v_cmpx_nle_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x8c,0xd4,0x7e,0x82,0x01,0x00]
-# GFX12: v_cmpx_nle_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x8c,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x8c,0xd4,0x7f,0xf8,0x00,0x00
+# GFX12: v_cmpx_nle_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x8c,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX12: v_cmpx_nle_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x8c,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x8c,0xd4,0x7c,0xfc,0x00,0x00
+# GFX12: v_cmpx_nle_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x8c,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX12: v_cmpx_nle_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x8c,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x8c,0xd4,0xc1,0xfe,0x00,0x00
+# GFX12: v_cmpx_nle_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x8c,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX12: v_cmpx_nle_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x8c,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x8c,0xd4,0xf0,0xfa,0x00,0x40
+# GFX12: v_cmpx_nle_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x8c,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX12: v_cmpx_nle_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x8c,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x8c,0xd4,0xfd,0xd4,0x00,0x20
+# GFX12: v_cmpx_nle_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x8c,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX12: v_cmpx_nle_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8c,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
 0x7e,0x83,0x8c,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_nle_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8c,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_nle_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9c,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0x9c,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_nle_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9c,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_nle_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x9c,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0x9c,0xd4,0xff,0xff,0x03,0x00
+# GFX12: v_cmpx_nle_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x9c,0xd4,0xff,0xff,0x03,0x00]
-# GFX12: v_cmpx_nle_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x9c,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0x9c,0xd4,0x01,0x04,0x00,0x00
+# GFX12: v_cmpx_nle_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x9c,0xd4,0x01,0x04,0x00,0x00]
-# GFX12: v_cmpx_nle_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x9c,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0x9c,0xd4,0x69,0xd2,0x00,0x00
+# GFX12: v_cmpx_nle_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x9c,0xd4,0x69,0xd2,0x00,0x00]
-# GFX12: v_cmpx_nle_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x9c,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0x9c,0xd4,0x6a,0xf6,0x00,0x00
+# GFX12: v_cmpx_nle_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x9c,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX12: v_cmpx_nle_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x9c,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0x9c,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_nle_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x9c,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_nle_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x9c,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0x9c,0xd4,0x7b,0xfa,0x01,0x00
+# GFX12: v_cmpx_nle_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x9c,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX12: v_cmpx_nle_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x9c,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0x9c,0xd4,0x7d,0xe0,0x01,0x00
+# GFX12: v_cmpx_nle_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x9c,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX12: v_cmpx_nle_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x9c,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0x9c,0xd4,0x7e,0x82,0x01,0x00
+# GFX12: v_cmpx_nle_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x9c,0xd4,0x7e,0x82,0x01,0x00]
-# GFX12: v_cmpx_nle_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x9c,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x9c,0xd4,0x7f,0xf8,0x00,0x00
+# GFX12: v_cmpx_nle_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x9c,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX12: v_cmpx_nle_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x9c,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x9c,0xd4,0x7c,0xfc,0x00,0x00
+# GFX12: v_cmpx_nle_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x9c,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX12: v_cmpx_nle_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x9c,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x9c,0xd4,0xc1,0xfe,0x00,0x00
+# GFX12: v_cmpx_nle_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x9c,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX12: v_cmpx_nle_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x9c,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x9c,0xd4,0xf0,0xfa,0x00,0x40
+# GFX12: v_cmpx_nle_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x9c,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX12: v_cmpx_nle_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x9c,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x9c,0xd4,0xfd,0xd4,0x00,0x20
+# GFX12: v_cmpx_nle_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x9c,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX12: v_cmpx_nle_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x9c,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
 0x7e,0x83,0x9c,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_nle_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x9c,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_nle_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xac,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xac,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_nle_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xac,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_nle_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xac,0xd4,0xfe,0xfd,0x03,0x00]
 0x7e,0x00,0xac,0xd4,0xfe,0xfd,0x03,0x00
+# GFX12: v_cmpx_nle_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xac,0xd4,0xfe,0xfd,0x03,0x00]
-# GFX12: v_cmpx_nle_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xac,0xd4,0x02,0x08,0x00,0x00]
 0x7e,0x00,0xac,0xd4,0x02,0x08,0x00,0x00
+# GFX12: v_cmpx_nle_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xac,0xd4,0x02,0x08,0x00,0x00]
-# GFX12: v_cmpx_nle_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xac,0xd4,0x68,0xd0,0x00,0x00]
 0x7e,0x00,0xac,0xd4,0x68,0xd0,0x00,0x00
+# GFX12: v_cmpx_nle_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xac,0xd4,0x68,0xd0,0x00,0x00]
-# GFX12: v_cmpx_nle_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xac,0xd4,0x6a,0xf4,0x00,0x00]
 0x7e,0x00,0xac,0xd4,0x6a,0xf4,0x00,0x00
+# GFX12: v_cmpx_nle_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xac,0xd4,0x6a,0xf4,0x00,0x00]
-# GFX12: v_cmpx_nle_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xac,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xac,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_nle_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xac,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_nle_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xac,0xd4,0x7e,0xfa,0x01,0x20]
 0x7e,0x01,0xac,0xd4,0x7e,0xfa,0x01,0x20
+# GFX12: v_cmpx_nle_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xac,0xd4,0x7e,0xfa,0x01,0x20]
-# GFX12: v_cmpx_nle_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xac,0xd4,0x7c,0xe0,0x01,0x00]
 0x7e,0x00,0xac,0xd4,0x7c,0xe0,0x01,0x00
+# GFX12: v_cmpx_nle_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xac,0xd4,0x7c,0xe0,0x01,0x00]
-# GFX12: v_cmpx_nle_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xac,0xd4,0xc1,0x82,0x01,0x00]
 0x7e,0x00,0xac,0xd4,0xc1,0x82,0x01,0x00
+# GFX12: v_cmpx_nle_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xac,0xd4,0xc1,0x82,0x01,0x00]
-# GFX12: v_cmpx_nle_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xac,0xd4,0xf0,0xf8,0x00,0x00]
 0x7e,0x00,0xac,0xd4,0xf0,0xf8,0x00,0x00
+# GFX12: v_cmpx_nle_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xac,0xd4,0xf0,0xf8,0x00,0x00]
-# GFX12: v_cmpx_nle_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xac,0xd4,0xfd,0xfc,0x00,0x60]
 0x7e,0x03,0xac,0xd4,0xfd,0xfc,0x00,0x60
+# GFX12: v_cmpx_nle_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xac,0xd4,0xfd,0xfc,0x00,0x60]
-# GFX12: v_cmpx_nle_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xac,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
 0x7e,0x82,0xac,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_nle_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xac,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_nlg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_nlg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_nlg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00
+# GFX12: v_cmpx_nlg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00]
-# GFX12: v_cmpx_nlg_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0x8a,0xd4,0x01,0x04,0x00,0x00
+# GFX12: v_cmpx_nlg_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x04,0x00,0x00]
-# GFX12: v_cmpx_nlg_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x8a,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0x8a,0xd4,0x69,0xd2,0x00,0x00
+# GFX12: v_cmpx_nlg_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x8a,0xd4,0x69,0xd2,0x00,0x00]
-# GFX12: v_cmpx_nlg_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x8a,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0x8a,0xd4,0x6a,0xf6,0x00,0x00
+# GFX12: v_cmpx_nlg_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x8a,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX12: v_cmpx_nlg_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x8a,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x7e,0x00,0x8a,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_nlg_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x8a,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_nlg_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x8a,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0x8a,0xd4,0x7b,0xfa,0x01,0x00
+# GFX12: v_cmpx_nlg_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x8a,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX12: v_cmpx_nlg_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x8a,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0x8a,0xd4,0x7d,0xe0,0x01,0x00
+# GFX12: v_cmpx_nlg_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x8a,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX12: v_cmpx_nlg_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x8a,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0x8a,0xd4,0x7e,0x82,0x01,0x00
+# GFX12: v_cmpx_nlg_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x8a,0xd4,0x7e,0x82,0x01,0x00]
-# GFX12: v_cmpx_nlg_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x8a,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x8a,0xd4,0x7f,0xf8,0x00,0x00
+# GFX12: v_cmpx_nlg_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x8a,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX12: v_cmpx_nlg_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x8a,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x8a,0xd4,0x7c,0xfc,0x00,0x00
+# GFX12: v_cmpx_nlg_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x8a,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX12: v_cmpx_nlg_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x8a,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x8a,0xd4,0xc1,0xfe,0x00,0x00
+# GFX12: v_cmpx_nlg_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x8a,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX12: v_cmpx_nlg_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x8a,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x8a,0xd4,0xf0,0xfa,0x00,0x40
+# GFX12: v_cmpx_nlg_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x8a,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX12: v_cmpx_nlg_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x8a,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x8a,0xd4,0xfd,0xd4,0x00,0x20
+# GFX12: v_cmpx_nlg_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x8a,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX12: v_cmpx_nlg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8a,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
 0x7e,0x83,0x8a,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_nlg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8a,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_nlg_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9a,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0x9a,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_nlg_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9a,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_nlg_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x9a,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0x9a,0xd4,0xff,0xff,0x03,0x00
+# GFX12: v_cmpx_nlg_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x9a,0xd4,0xff,0xff,0x03,0x00]
-# GFX12: v_cmpx_nlg_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x9a,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0x9a,0xd4,0x01,0x04,0x00,0x00
+# GFX12: v_cmpx_nlg_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x9a,0xd4,0x01,0x04,0x00,0x00]
-# GFX12: v_cmpx_nlg_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x9a,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0x9a,0xd4,0x69,0xd2,0x00,0x00
+# GFX12: v_cmpx_nlg_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x9a,0xd4,0x69,0xd2,0x00,0x00]
-# GFX12: v_cmpx_nlg_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x9a,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0x9a,0xd4,0x6a,0xf6,0x00,0x00
+# GFX12: v_cmpx_nlg_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x9a,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX12: v_cmpx_nlg_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x9a,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0x9a,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_nlg_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x9a,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_nlg_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x9a,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0x9a,0xd4,0x7b,0xfa,0x01,0x00
+# GFX12: v_cmpx_nlg_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x9a,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX12: v_cmpx_nlg_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x9a,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0x9a,0xd4,0x7d,0xe0,0x01,0x00
+# GFX12: v_cmpx_nlg_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x9a,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX12: v_cmpx_nlg_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x9a,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0x9a,0xd4,0x7e,0x82,0x01,0x00
+# GFX12: v_cmpx_nlg_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x9a,0xd4,0x7e,0x82,0x01,0x00]
-# GFX12: v_cmpx_nlg_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x9a,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x9a,0xd4,0x7f,0xf8,0x00,0x00
+# GFX12: v_cmpx_nlg_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x9a,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX12: v_cmpx_nlg_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x9a,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x9a,0xd4,0x7c,0xfc,0x00,0x00
+# GFX12: v_cmpx_nlg_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x9a,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX12: v_cmpx_nlg_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x9a,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x9a,0xd4,0xc1,0xfe,0x00,0x00
+# GFX12: v_cmpx_nlg_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x9a,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX12: v_cmpx_nlg_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x9a,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x9a,0xd4,0xf0,0xfa,0x00,0x40
+# GFX12: v_cmpx_nlg_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x9a,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX12: v_cmpx_nlg_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x9a,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x9a,0xd4,0xfd,0xd4,0x00,0x20
+# GFX12: v_cmpx_nlg_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x9a,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX12: v_cmpx_nlg_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x9a,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
 0x7e,0x83,0x9a,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_nlg_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x9a,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_nlg_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xaa,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xaa,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_nlg_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xaa,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_nlg_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xaa,0xd4,0xfe,0xfd,0x03,0x00]
 0x7e,0x00,0xaa,0xd4,0xfe,0xfd,0x03,0x00
+# GFX12: v_cmpx_nlg_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xaa,0xd4,0xfe,0xfd,0x03,0x00]
-# GFX12: v_cmpx_nlg_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xaa,0xd4,0x02,0x08,0x00,0x00]
 0x7e,0x00,0xaa,0xd4,0x02,0x08,0x00,0x00
+# GFX12: v_cmpx_nlg_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xaa,0xd4,0x02,0x08,0x00,0x00]
-# GFX12: v_cmpx_nlg_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xaa,0xd4,0x68,0xd0,0x00,0x00]
 0x7e,0x00,0xaa,0xd4,0x68,0xd0,0x00,0x00
+# GFX12: v_cmpx_nlg_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xaa,0xd4,0x68,0xd0,0x00,0x00]
-# GFX12: v_cmpx_nlg_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xaa,0xd4,0x6a,0xf4,0x00,0x00]
 0x7e,0x00,0xaa,0xd4,0x6a,0xf4,0x00,0x00
+# GFX12: v_cmpx_nlg_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xaa,0xd4,0x6a,0xf4,0x00,0x00]
-# GFX12: v_cmpx_nlg_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xaa,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xaa,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_nlg_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xaa,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_nlg_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xaa,0xd4,0x7e,0xfa,0x01,0x20]
 0x7e,0x01,0xaa,0xd4,0x7e,0xfa,0x01,0x20
+# GFX12: v_cmpx_nlg_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xaa,0xd4,0x7e,0xfa,0x01,0x20]
-# GFX12: v_cmpx_nlg_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xaa,0xd4,0x7c,0xe0,0x01,0x00]
 0x7e,0x00,0xaa,0xd4,0x7c,0xe0,0x01,0x00
+# GFX12: v_cmpx_nlg_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xaa,0xd4,0x7c,0xe0,0x01,0x00]
-# GFX12: v_cmpx_nlg_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xaa,0xd4,0xc1,0x82,0x01,0x00]
 0x7e,0x00,0xaa,0xd4,0xc1,0x82,0x01,0x00
+# GFX12: v_cmpx_nlg_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xaa,0xd4,0xc1,0x82,0x01,0x00]
-# GFX12: v_cmpx_nlg_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xaa,0xd4,0xf0,0xf8,0x00,0x00]
 0x7e,0x00,0xaa,0xd4,0xf0,0xf8,0x00,0x00
+# GFX12: v_cmpx_nlg_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xaa,0xd4,0xf0,0xf8,0x00,0x00]
-# GFX12: v_cmpx_nlg_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xaa,0xd4,0xfd,0xfc,0x00,0x60]
 0x7e,0x03,0xaa,0xd4,0xfd,0xfc,0x00,0x60
+# GFX12: v_cmpx_nlg_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xaa,0xd4,0xfd,0xfc,0x00,0x60]
-# GFX12: v_cmpx_nlg_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xaa,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
 0x7e,0x82,0xaa,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_nlg_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xaa,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_nlt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_nlt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_nlt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00
+# GFX12: v_cmpx_nlt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00]
-# GFX12: v_cmpx_nlt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0x8e,0xd4,0x01,0x04,0x00,0x00
+# GFX12: v_cmpx_nlt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x04,0x00,0x00]
-# GFX12: v_cmpx_nlt_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x8e,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0x8e,0xd4,0x69,0xd2,0x00,0x00
+# GFX12: v_cmpx_nlt_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x8e,0xd4,0x69,0xd2,0x00,0x00]
-# GFX12: v_cmpx_nlt_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x8e,0xd4,0x6a,0xf6,0x00,0x00]
 0x7e,0x00,0x8e,0xd4,0x6a,0xf6,0x00,0x00
+# GFX12: v_cmpx_nlt_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x8e,0xd4,0x6a,0xf6,0x00,0x00]
-# GFX12: v_cmpx_nlt_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x8e,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x7e,0x00,0x8e,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_nlt_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x8e,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_nlt_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x8e,0xd4,0x7b,0xfa,0x01,0x00]
 0x7e,0x00,0x8e,0xd4,0x7b,0xfa,0x01,0x00
+# GFX12: v_cmpx_nlt_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x8e,0xd4,0x7b,0xfa,0x01,0x00]
-# GFX12: v_cmpx_nlt_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x8e,0xd4,0x7d,0xe0,0x01,0x00]
 0x7e,0x00,0x8e,0xd4,0x7d,0xe0,0x01,0x00
+# GFX12: v_cmpx_nlt_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x8e,0xd4,0x7d,0xe0,0x01,0x00]
-# GFX12: v_cmpx_nlt_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x8e,0xd4,0x7e,0x82,0x01,0x00]
 0x7e,0x00,0x8e,0xd4,0x7e,0x82,0x01,0x00
+# GFX12: v_cmpx_nlt_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x8e,0xd4,0x7e,0x82,0x01,0x00]
-# GFX12: v_cmpx_nlt_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x8e,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x8e,0xd4,0x7f,0xf8,0x00,0x00
+# GFX12: v_cmpx_nlt_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x8e,0xd4,0x7f,0xf8,0x00,0x00]
-# GFX12: v_cmpx_nlt_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x8e,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x8e,0xd4,0x7c,0xfc,0x00,0x00
+# GFX12: v_cmpx_nlt_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x8e,0xd4,0x7c,0xfc,0x00,0x00]
-# GFX12: v_cmpx_nlt_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x8e,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x8e,0xd4,0xc1,0xfe,0x00,0x00
+# GFX12: v_cmpx_nlt_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x8e,0xd4,0xc1,0xfe,0x00,0x00]
-# GFX12: v_cmpx_nlt_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x8e,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x8e,0xd4,0xf0,0xfa,0x00,0x40
+# GFX12: v_cmpx_nlt_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x8e,0xd4,0xf0,0xfa,0x00,0x40]
-# GFX12: v_cmpx_nlt_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x8e,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x8e,0xd4,0xfd,0xd4,0x00,0x20
+# GFX12: v_cmpx_nlt_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x8e,0xd4,0xfd,0xd4,0x00,0x20]
-# GFX12: v_cmpx_nlt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8e,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
 0x7e,0x83,0x8e,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_nlt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8e,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_nlt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9e,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0x9e,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_nlt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9e,0xd4,0x01,0x05,0x02,0x00]
-# GFX12: v_cmpx_nlt_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x9e,0xd4,0xff,0xff,0x03,0x00]
 0x7e,0x00,0x9e,0xd4,0xff,0xff,0x03,0x00
+# GFX12: v_cmpx_nlt_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x9e,0xd4,0xff,0xff,0x03,0x00]
-# GFX12: v_cmpx_nlt_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x9e,0xd4,0x01,0x04,0x00,0x00]
 0x7e,0x00,0x9e,0xd4,0x01,0x04,0x00,0x00
+# GFX12: v_cmpx_nlt_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x9e,0xd4,0x01,0x04,0x00,0x00]
-# GFX12: v_cmpx_nlt_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x9e,0xd4,0x69,0xd2,0x00,0x00]
 0x7e,0x00,0x9e,0xd4,0x69,0xd2,0x00,0x00
+# GFX12: v_cmpx_nlt_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x9e,0xd4,0x69,0xd2,0x00,0x00]
-# GFX12: v_cmpx_nlt_f32_e64 vcc_lo, ttmp15
; encoding: [0x7e,0x00,0x9e,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0x9e,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_nlt_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x9e,0xd4,0x6a,0xf6,0x00,0x00] -# GFX12: v_cmpx_nlt_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x9e,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x9e,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_nlt_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x9e,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_nlt_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x9e,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0x9e,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_nlt_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x9e,0xd4,0x7b,0xfa,0x01,0x00] -# GFX12: v_cmpx_nlt_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x9e,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0x9e,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_nlt_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x9e,0xd4,0x7d,0xe0,0x01,0x00] -# GFX12: v_cmpx_nlt_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x9e,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0x9e,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_nlt_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x9e,0xd4,0x7e,0x82,0x01,0x00] -# GFX12: v_cmpx_nlt_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x9e,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x01,0x9e,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_nlt_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x9e,0xd4,0x7f,0xf8,0x00,0x00] -# GFX12: v_cmpx_nlt_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x9e,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0x9e,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_nlt_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x9e,0xd4,0x7c,0xfc,0x00,0x00] -# GFX12: v_cmpx_nlt_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x9e,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0x9e,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_nlt_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x9e,0xd4,0xc1,0xfe,0x00,0x00] -# GFX12: v_cmpx_nlt_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x9e,0xd4,0xf0,0xfa,0x00,0x40] 0x7e,0x00,0x9e,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_nlt_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x9e,0xd4,0xf0,0xfa,0x00,0x40] -# GFX12: v_cmpx_nlt_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x9e,0xd4,0xfd,0xd4,0x00,0x20] 0x7e,0x02,0x9e,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_nlt_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x9e,0xd4,0xfd,0xd4,0x00,0x20] -# GFX12: v_cmpx_nlt_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x9e,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] 0x7e,0x83,0x9e,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_nlt_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x9e,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_nlt_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xae,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xae,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_nlt_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xae,0xd4,0x01,0x05,0x02,0x00] -# GFX12: v_cmpx_nlt_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xae,0xd4,0xfe,0xfd,0x03,0x00] 0x7e,0x00,0xae,0xd4,0xfe,0xfd,0x03,0x00 +# GFX12: v_cmpx_nlt_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xae,0xd4,0xfe,0xfd,0x03,0x00] -# GFX12: v_cmpx_nlt_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xae,0xd4,0x02,0x08,0x00,0x00] 0x7e,0x00,0xae,0xd4,0x02,0x08,0x00,0x00 +# GFX12: v_cmpx_nlt_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xae,0xd4,0x02,0x08,0x00,0x00] -# GFX12: v_cmpx_nlt_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xae,0xd4,0x68,0xd0,0x00,0x00] 0x7e,0x00,0xae,0xd4,0x68,0xd0,0x00,0x00 
+# GFX12: v_cmpx_nlt_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xae,0xd4,0x68,0xd0,0x00,0x00] -# GFX12: v_cmpx_nlt_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xae,0xd4,0x6a,0xf4,0x00,0x00] 0x7e,0x00,0xae,0xd4,0x6a,0xf4,0x00,0x00 +# GFX12: v_cmpx_nlt_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xae,0xd4,0x6a,0xf4,0x00,0x00] -# GFX12: v_cmpx_nlt_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xae,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xae,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_nlt_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xae,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_nlt_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xae,0xd4,0x7e,0xfa,0x01,0x20] 0x7e,0x01,0xae,0xd4,0x7e,0xfa,0x01,0x20 +# GFX12: v_cmpx_nlt_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xae,0xd4,0x7e,0xfa,0x01,0x20] -# GFX12: v_cmpx_nlt_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xae,0xd4,0x7c,0xe0,0x01,0x00] 0x7e,0x00,0xae,0xd4,0x7c,0xe0,0x01,0x00 +# GFX12: v_cmpx_nlt_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xae,0xd4,0x7c,0xe0,0x01,0x00] -# GFX12: v_cmpx_nlt_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xae,0xd4,0xc1,0x82,0x01,0x00] 0x7e,0x00,0xae,0xd4,0xc1,0x82,0x01,0x00 +# GFX12: v_cmpx_nlt_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xae,0xd4,0xc1,0x82,0x01,0x00] -# GFX12: v_cmpx_nlt_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xae,0xd4,0xf0,0xf8,0x00,0x00] 0x7e,0x00,0xae,0xd4,0xf0,0xf8,0x00,0x00 +# GFX12: v_cmpx_nlt_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xae,0xd4,0xf0,0xf8,0x00,0x00] -# GFX12: v_cmpx_nlt_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xae,0xd4,0xfd,0xfc,0x00,0x60] 0x7e,0x03,0xae,0xd4,0xfd,0xfc,0x00,0x60 +# GFX12: v_cmpx_nlt_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xae,0xd4,0xfd,0xfc,0x00,0x60] -# GFX12: v_cmpx_nlt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xae,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7e,0x82,0xae,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_nlt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xae,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_o_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_o_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00] -# GFX12: v_cmpx_o_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00 +# GFX12: v_cmpx_o_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00] -# GFX12: v_cmpx_o_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0x87,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_o_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x04,0x00,0x00] -# GFX12: v_cmpx_o_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x87,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0x87,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_o_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x87,0xd4,0x69,0xd2,0x00,0x00] -# GFX12: v_cmpx_o_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x87,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0x87,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_o_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x87,0xd4,0x6a,0xf6,0x00,0x00] -# GFX12: v_cmpx_o_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x87,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0x87,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_o_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x87,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# GFX12: 
v_cmpx_o_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x87,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0x87,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_o_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x87,0xd4,0x7b,0xfa,0x01,0x00] -# GFX12: v_cmpx_o_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x87,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0x87,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_o_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x87,0xd4,0x7d,0xe0,0x01,0x00] -# GFX12: v_cmpx_o_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x87,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0x87,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_o_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x87,0xd4,0x7e,0x82,0x01,0x00] -# GFX12: v_cmpx_o_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x87,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x01,0x87,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_o_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x87,0xd4,0x7f,0xf8,0x00,0x00] -# GFX12: v_cmpx_o_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x87,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0x87,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_o_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x87,0xd4,0x7c,0xfc,0x00,0x00] -# GFX12: v_cmpx_o_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x87,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0x87,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_o_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x87,0xd4,0xc1,0xfe,0x00,0x00] -# GFX12: v_cmpx_o_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x87,0xd4,0xf0,0xfa,0x00,0x40] 0x7e,0x00,0x87,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_o_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x87,0xd4,0xf0,0xfa,0x00,0x40] -# GFX12: v_cmpx_o_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x87,0xd4,0xfd,0xd4,0x00,0x20] 0x7e,0x02,0x87,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_o_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x87,0xd4,0xfd,0xd4,0x00,0x20] -# GFX12: v_cmpx_o_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x87,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x83,0x87,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_o_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x87,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_o_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x97,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x97,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_o_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x97,0xd4,0x01,0x05,0x02,0x00] -# GFX12: v_cmpx_o_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x97,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x97,0xd4,0xff,0xff,0x03,0x00 +# GFX12: v_cmpx_o_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x97,0xd4,0xff,0xff,0x03,0x00] -# GFX12: v_cmpx_o_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x97,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0x97,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_o_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x97,0xd4,0x01,0x04,0x00,0x00] -# GFX12: v_cmpx_o_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x97,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0x97,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_o_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x97,0xd4,0x69,0xd2,0x00,0x00] -# GFX12: v_cmpx_o_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x97,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0x97,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_o_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x97,0xd4,0x6a,0xf6,0x00,0x00] -# GFX12: v_cmpx_o_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x97,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x97,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_o_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x97,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# 
GFX12: v_cmpx_o_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x97,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0x97,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_o_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x97,0xd4,0x7b,0xfa,0x01,0x00] -# GFX12: v_cmpx_o_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x97,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0x97,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_o_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x97,0xd4,0x7d,0xe0,0x01,0x00] -# GFX12: v_cmpx_o_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x97,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0x97,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_o_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x97,0xd4,0x7e,0x82,0x01,0x00] -# GFX12: v_cmpx_o_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x97,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x01,0x97,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_o_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x97,0xd4,0x7f,0xf8,0x00,0x00] -# GFX12: v_cmpx_o_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x97,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0x97,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_o_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x97,0xd4,0x7c,0xfc,0x00,0x00] -# GFX12: v_cmpx_o_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x97,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0x97,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_o_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x97,0xd4,0xc1,0xfe,0x00,0x00] -# GFX12: v_cmpx_o_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x97,0xd4,0xf0,0xfa,0x00,0x40] 0x7e,0x00,0x97,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_o_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x97,0xd4,0xf0,0xfa,0x00,0x40] -# GFX12: v_cmpx_o_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x97,0xd4,0xfd,0xd4,0x00,0x20] 0x7e,0x02,0x97,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_o_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x97,0xd4,0xfd,0xd4,0x00,0x20] -# GFX12: v_cmpx_o_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x97,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] 0x7e,0x83,0x97,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_o_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x97,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_o_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa7,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xa7,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_o_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa7,0xd4,0x01,0x05,0x02,0x00] -# GFX12: v_cmpx_o_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa7,0xd4,0xfe,0xfd,0x03,0x00] 0x7e,0x00,0xa7,0xd4,0xfe,0xfd,0x03,0x00 +# GFX12: v_cmpx_o_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa7,0xd4,0xfe,0xfd,0x03,0x00] -# GFX12: v_cmpx_o_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa7,0xd4,0x02,0x08,0x00,0x00] 0x7e,0x00,0xa7,0xd4,0x02,0x08,0x00,0x00 +# GFX12: v_cmpx_o_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa7,0xd4,0x02,0x08,0x00,0x00] -# GFX12: v_cmpx_o_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa7,0xd4,0x68,0xd0,0x00,0x00] 0x7e,0x00,0xa7,0xd4,0x68,0xd0,0x00,0x00 +# GFX12: v_cmpx_o_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa7,0xd4,0x68,0xd0,0x00,0x00] -# GFX12: v_cmpx_o_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa7,0xd4,0x6a,0xf4,0x00,0x00] 0x7e,0x00,0xa7,0xd4,0x6a,0xf4,0x00,0x00 +# GFX12: v_cmpx_o_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa7,0xd4,0x6a,0xf4,0x00,0x00] -# GFX12: v_cmpx_o_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa7,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xa7,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX12: 
v_cmpx_o_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa7,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_o_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa7,0xd4,0x7e,0xfa,0x01,0x20] 0x7e,0x01,0xa7,0xd4,0x7e,0xfa,0x01,0x20 +# GFX12: v_cmpx_o_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa7,0xd4,0x7e,0xfa,0x01,0x20] -# GFX12: v_cmpx_o_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa7,0xd4,0x7c,0xe0,0x01,0x00] 0x7e,0x00,0xa7,0xd4,0x7c,0xe0,0x01,0x00 +# GFX12: v_cmpx_o_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa7,0xd4,0x7c,0xe0,0x01,0x00] -# GFX12: v_cmpx_o_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa7,0xd4,0xc1,0x82,0x01,0x00] 0x7e,0x00,0xa7,0xd4,0xc1,0x82,0x01,0x00 +# GFX12: v_cmpx_o_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa7,0xd4,0xc1,0x82,0x01,0x00] -# GFX12: v_cmpx_o_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa7,0xd4,0xf0,0xf8,0x00,0x00] 0x7e,0x00,0xa7,0xd4,0xf0,0xf8,0x00,0x00 +# GFX12: v_cmpx_o_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa7,0xd4,0xf0,0xf8,0x00,0x00] -# GFX12: v_cmpx_o_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa7,0xd4,0xfd,0xfc,0x00,0x60] 0x7e,0x03,0xa7,0xd4,0xfd,0xfc,0x00,0x60 +# GFX12: v_cmpx_o_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa7,0xd4,0xfd,0xfc,0x00,0x60] -# GFX12: v_cmpx_o_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa7,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7e,0x82,0xa7,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_o_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa7,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_u_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_u_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00] -# GFX12: v_cmpx_u_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00 +# GFX12: v_cmpx_u_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00] -# GFX12: v_cmpx_u_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0x88,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_u_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x04,0x00,0x00] -# GFX12: v_cmpx_u_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x88,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0x88,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_u_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x88,0xd4,0x69,0xd2,0x00,0x00] -# GFX12: v_cmpx_u_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x88,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0x88,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_u_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x88,0xd4,0x6a,0xf6,0x00,0x00] -# GFX12: v_cmpx_u_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x88,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x00,0x88,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_u_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x88,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_u_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x88,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0x88,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_u_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x88,0xd4,0x7b,0xfa,0x01,0x00] -# GFX12: v_cmpx_u_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x88,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0x88,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_u_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x88,0xd4,0x7d,0xe0,0x01,0x00] -# GFX12: v_cmpx_u_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x88,0xd4,0x7e,0x82,0x01,0x00] 0x7e,0x00,0x88,0xd4,0x7e,0x82,0x01,0x00 
+# GFX12: v_cmpx_u_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x88,0xd4,0x7e,0x82,0x01,0x00] -# GFX12: v_cmpx_u_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x88,0xd4,0x7f,0xf8,0x00,0x00] 0x7e,0x01,0x88,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_u_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x88,0xd4,0x7f,0xf8,0x00,0x00] -# GFX12: v_cmpx_u_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x88,0xd4,0x7c,0xfc,0x00,0x00] 0x7e,0x00,0x88,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_u_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x88,0xd4,0x7c,0xfc,0x00,0x00] -# GFX12: v_cmpx_u_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x88,0xd4,0xc1,0xfe,0x00,0x00] 0x7e,0x00,0x88,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_u_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x88,0xd4,0xc1,0xfe,0x00,0x00] -# GFX12: v_cmpx_u_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x88,0xd4,0xf0,0xfa,0x00,0x40] 0x7e,0x00,0x88,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_u_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x88,0xd4,0xf0,0xfa,0x00,0x40] -# GFX12: v_cmpx_u_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x88,0xd4,0xfd,0xd4,0x00,0x20] 0x7e,0x02,0x88,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_u_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x88,0xd4,0xfd,0xd4,0x00,0x20] -# GFX12: v_cmpx_u_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x88,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x83,0x88,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_u_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x88,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_u_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x98,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x98,0xd4,0x01,0x05,0x02,0x00 +# GFX12: v_cmpx_u_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x98,0xd4,0x01,0x05,0x02,0x00] -# GFX12: v_cmpx_u_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x98,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x98,0xd4,0xff,0xff,0x03,0x00 +# GFX12: v_cmpx_u_f32_e64 v255, v255 ; encoding: [0x7e,0x00,0x98,0xd4,0xff,0xff,0x03,0x00] -# GFX12: v_cmpx_u_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x98,0xd4,0x01,0x04,0x00,0x00] 0x7e,0x00,0x98,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_u_f32_e64 s1, s2 ; encoding: [0x7e,0x00,0x98,0xd4,0x01,0x04,0x00,0x00] -# GFX12: v_cmpx_u_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x98,0xd4,0x69,0xd2,0x00,0x00] 0x7e,0x00,0x98,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_u_f32_e64 s105, s105 ; encoding: [0x7e,0x00,0x98,0xd4,0x69,0xd2,0x00,0x00] -# GFX12: v_cmpx_u_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x98,0xd4,0x6a,0xf6,0x00,0x00] 0x7e,0x00,0x98,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_u_f32_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x98,0xd4,0x6a,0xf6,0x00,0x00] -# GFX12: v_cmpx_u_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x98,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x98,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_u_f32_e64 vcc_hi, 0xaf123456 ; encoding: [0x7e,0x00,0x98,0xd4,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_u_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x98,0xd4,0x7b,0xfa,0x01,0x00] 0x7e,0x00,0x98,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_u_f32_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x98,0xd4,0x7b,0xfa,0x01,0x00] -# GFX12: v_cmpx_u_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x98,0xd4,0x7d,0xe0,0x01,0x00] 0x7e,0x00,0x98,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_u_f32_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x98,0xd4,0x7d,0xe0,0x01,0x00] -# GFX12: v_cmpx_u_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x98,0xd4,0x7e,0x82,0x01,0x00] 
 0x7e,0x00,0x98,0xd4,0x7e,0x82,0x01,0x00
+# GFX12: v_cmpx_u_f32_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x98,0xd4,0x7e,0x82,0x01,0x00]

-# GFX12: v_cmpx_u_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x98,0xd4,0x7f,0xf8,0x00,0x00]
 0x7e,0x01,0x98,0xd4,0x7f,0xf8,0x00,0x00
+# GFX12: v_cmpx_u_f32_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x98,0xd4,0x7f,0xf8,0x00,0x00]

-# GFX12: v_cmpx_u_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x98,0xd4,0x7c,0xfc,0x00,0x00]
 0x7e,0x00,0x98,0xd4,0x7c,0xfc,0x00,0x00
+# GFX12: v_cmpx_u_f32_e64 null, exec_lo ; encoding: [0x7e,0x00,0x98,0xd4,0x7c,0xfc,0x00,0x00]

-# GFX12: v_cmpx_u_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x98,0xd4,0xc1,0xfe,0x00,0x00]
 0x7e,0x00,0x98,0xd4,0xc1,0xfe,0x00,0x00
+# GFX12: v_cmpx_u_f32_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x98,0xd4,0xc1,0xfe,0x00,0x00]

-# GFX12: v_cmpx_u_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x98,0xd4,0xf0,0xfa,0x00,0x40]
 0x7e,0x00,0x98,0xd4,0xf0,0xfa,0x00,0x40
+# GFX12: v_cmpx_u_f32_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x98,0xd4,0xf0,0xfa,0x00,0x40]

-# GFX12: v_cmpx_u_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x98,0xd4,0xfd,0xd4,0x00,0x20]
 0x7e,0x02,0x98,0xd4,0xfd,0xd4,0x00,0x20
+# GFX12: v_cmpx_u_f32_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x98,0xd4,0xfd,0xd4,0x00,0x20]

-# GFX12: v_cmpx_u_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x98,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
 0x7e,0x83,0x98,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_u_f32_e64 -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x98,0xd4,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]

-# GFX12: v_cmpx_u_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa8,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xa8,0xd4,0x01,0x05,0x02,0x00
+# GFX12: v_cmpx_u_f64_e64 v[1:2], v[2:3] ; encoding: [0x7e,0x00,0xa8,0xd4,0x01,0x05,0x02,0x00]

-# GFX12: v_cmpx_u_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa8,0xd4,0xfe,0xfd,0x03,0x00]
 0x7e,0x00,0xa8,0xd4,0xfe,0xfd,0x03,0x00
+# GFX12: v_cmpx_u_f64_e64 v[254:255], v[254:255] ; encoding: [0x7e,0x00,0xa8,0xd4,0xfe,0xfd,0x03,0x00]

-# GFX12: v_cmpx_u_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa8,0xd4,0x02,0x08,0x00,0x00]
 0x7e,0x00,0xa8,0xd4,0x02,0x08,0x00,0x00
+# GFX12: v_cmpx_u_f64_e64 s[2:3], s[4:5] ; encoding: [0x7e,0x00,0xa8,0xd4,0x02,0x08,0x00,0x00]

-# GFX12: v_cmpx_u_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa8,0xd4,0x68,0xd0,0x00,0x00]
 0x7e,0x00,0xa8,0xd4,0x68,0xd0,0x00,0x00
+# GFX12: v_cmpx_u_f64_e64 s[104:105], s[104:105] ; encoding: [0x7e,0x00,0xa8,0xd4,0x68,0xd0,0x00,0x00]

-# GFX12: v_cmpx_u_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa8,0xd4,0x6a,0xf4,0x00,0x00]
 0x7e,0x00,0xa8,0xd4,0x6a,0xf4,0x00,0x00
+# GFX12: v_cmpx_u_f64_e64 vcc, ttmp[14:15] ; encoding: [0x7e,0x00,0xa8,0xd4,0x6a,0xf4,0x00,0x00]

-# GFX12: v_cmpx_u_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa8,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
 0x7e,0x00,0xa8,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_u_f64_e64 ttmp[14:15], 0xaf123456 ; encoding: [0x7e,0x00,0xa8,0xd4,0x7a,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]

-# GFX12: v_cmpx_u_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa8,0xd4,0x7e,0xfa,0x01,0x20]
 0x7e,0x01,0xa8,0xd4,0x7e,0xfa,0x01,0x20
+# GFX12: v_cmpx_u_f64_e64 -|exec|, src_scc ; encoding: [0x7e,0x01,0xa8,0xd4,0x7e,0xfa,0x01,0x20]

-# GFX12: v_cmpx_u_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa8,0xd4,0x7c,0xe0,0x01,0x00]
 0x7e,0x00,0xa8,0xd4,0x7c,0xe0,0x01,0x00
+# GFX12: v_cmpx_u_f64_e64 null, 0.5 ; encoding: [0x7e,0x00,0xa8,0xd4,0x7c,0xe0,0x01,0x00]

-# GFX12: v_cmpx_u_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa8,0xd4,0xc1,0x82,0x01,0x00]
 0x7e,0x00,0xa8,0xd4,0xc1,0x82,0x01,0x00
+# GFX12: v_cmpx_u_f64_e64 -1, -1 ; encoding: [0x7e,0x00,0xa8,0xd4,0xc1,0x82,0x01,0x00]

-# GFX12: v_cmpx_u_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa8,0xd4,0xf0,0xf8,0x00,0x00]
 0x7e,0x00,0xa8,0xd4,0xf0,0xf8,0x00,0x00
+# GFX12: v_cmpx_u_f64_e64 0.5, null ; encoding: [0x7e,0x00,0xa8,0xd4,0xf0,0xf8,0x00,0x00]

-# GFX12: v_cmpx_u_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa8,0xd4,0xfd,0xfc,0x00,0x60]
 0x7e,0x03,0xa8,0xd4,0xfd,0xfc,0x00,0x60
+# GFX12: v_cmpx_u_f64_e64 -|src_scc|, -|exec| ; encoding: [0x7e,0x03,0xa8,0xd4,0xfd,0xfc,0x00,0x60]

-# GFX12: v_cmpx_u_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa8,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
 0x7e,0x82,0xa8,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_u_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa8,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt
index ce56a87570ec3..3550b6fc5e95d 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt
@@ -1,2446 +1,2449 @@
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s
+# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s

-# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]

-# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]

-# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]

-# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

-# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

-# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

-# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

-# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

-# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

-# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

-# GFX12: v_cmpx_class_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xfd,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_class_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]

-# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

-# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding:
[0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_class_f16_e64_dpp -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x01,0xfd,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30] 0x7e,0x01,0xfd,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmpx_class_f16_e64_dpp -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x01,0xfd,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30] -# GFX12: v_cmpx_class_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_class_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_class_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_class_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_class_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_class_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_class_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_class_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_class_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_class_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_class_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_class_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_class_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_class_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_class_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; 
encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_class_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_class_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_class_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_class_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_class_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_class_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xfe,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_class_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_class_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_class_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_class_f32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_class_f32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_class_f32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_class_f32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_class_f32_e64_dpp -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x01,0xfe,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30] 0x7e,0x01,0xfe,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmpx_class_f32_e64_dpp -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x01,0xfe,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30] -# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_eq_f16_e64_dpp 
v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_eq_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x82,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_eq_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 
0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_eq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x01,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_eq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_eq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x02,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_eq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x83,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] -# GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_eq_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x92,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_eq_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_eq_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x92,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
 0x7e,0x01,0x92,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_eq_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x92,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_eq_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x92,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 0x7e,0x02,0x92,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_eq_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x92,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_eq_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x92,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0x7e,0x83,0x92,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_eq_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x92,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_eq_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xb2,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_eq_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_eq_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0x7e,0x00,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_eq_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_eq_i32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xc2,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_eq_i32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_eq_i32_e64_dpp v1, 10 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x14,0x01,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xc2,0xd4,0xfa,0x14,0x01,0x00,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_eq_i32_e64_dpp v1, 10 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x14,0x01,0x00,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_eq_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0x7e,0x00,0xc2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_eq_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_eq_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xba,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_eq_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_eq_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0x7e,0x00,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_eq_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_eq_u32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xca,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_eq_u32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_eq_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0x7e,0x00,0xca,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_eq_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x86,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ge_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_ge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
 0x7e,0x01,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_ge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_ge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 0x7e,0x02,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_ge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0x7e,0x83,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_ge_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x96,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ge_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_ge_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x96,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
 0x7e,0x01,0x96,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_ge_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x96,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_ge_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x96,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 0x7e,0x02,0x96,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_ge_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x96,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_ge_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x96,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0x7e,0x83,0x96,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_ge_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x96,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_ge_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xb6,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ge_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_ge_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0x7e,0x00,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_ge_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_ge_i32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xc6,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ge_i32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_ge_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0x7e,0x00,0xc6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_ge_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_ge_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xbe,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ge_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_ge_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0x7e,0x00,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_ge_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_ge_u32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xce,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ge_u32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_ge_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0x7e,0x00,0xce,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_ge_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_gt_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x84,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_gt_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_gt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
 0x7e,0x01,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_gt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_gt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 0x7e,0x02,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_gt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0x7e,0x83,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_gt_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x94,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_gt_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_gt_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x94,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
 0x7e,0x01,0x94,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_gt_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x94,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_gt_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x94,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 0x7e,0x02,0x94,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_gt_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x94,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_gt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x94,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0x7e,0x83,0x94,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_gt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x94,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_gt_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xb4,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_gt_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_gt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmpx_gt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] -# GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 
row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_gt_i32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xc4,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_gt_i32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_gt_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xc4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmpx_gt_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 
fi:1 ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] -# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 
0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_gt_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_gt_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_gt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmpx_gt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] -# GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_gt_u32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xcc,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_gt_u32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX12: 
v_cmpx_gt_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_gt_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xcc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmpx_gt_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] -# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf 
bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_le_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_le_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_le_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x01,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_le_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_le_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x02,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_le_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x83,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] -# GFX12: v_cmpx_le_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_le_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_le_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_le_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_le_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX12: 
v_cmpx_le_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_le_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_le_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_le_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_le_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_le_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_le_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_le_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_le_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_le_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_le_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_le_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_le_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_le_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_le_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_le_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x93,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_le_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_le_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_le_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_le_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x7e,0x01,0x93,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x01,0x93,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_le_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x93,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_le_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x93,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x02,0x93,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_le_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x93,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_le_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x93,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x83,0x93,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmpx_le_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x93,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] -# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 
row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_le_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_le_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_le_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmpx_le_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] -# GFX12: v_cmpx_le_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_le_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_le_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_le_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_le_i32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_le_i32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_le_i32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_le_i32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_le_i32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_le_i32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_le_i32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_le_i32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_le_i32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_le_i32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_le_i32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_le_i32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_le_i32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_le_i32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_le_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_le_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_le_i32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xc3,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_le_i32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] -# GFX12: 
v_cmpx_le_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_le_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_le_i32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_le_i32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_le_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_le_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_le_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xc3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmpx_le_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] -# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 
0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_le_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_le_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_le_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmpx_le_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] -# GFX12: v_cmpx_le_u32_e64_dpp 
v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_le_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_le_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_le_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_le_u32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_le_u32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_le_u32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_le_u32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_le_u32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_le_u32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_le_u32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_le_u32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_le_u32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_le_u32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_le_u32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_le_u32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_le_u32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_le_u32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_le_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_le_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_le_u32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xcb,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_le_u32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_le_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_le_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_le_u32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_le_u32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_le_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_le_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_le_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0x7e,0x00,0xcb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_le_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_lg_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x85,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_lg_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_lg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
 0x7e,0x01,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_lg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_lg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 0x7e,0x02,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_lg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0x7e,0x83,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_lg_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x95,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_lg_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_lg_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x95,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
 0x7e,0x01,0x95,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_lg_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x95,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_lg_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x95,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 0x7e,0x02,0x95,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_lg_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x95,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_lg_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x95,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0x7e,0x83,0x95,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_lg_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x95,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_lt_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x81,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_lt_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_lt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x81,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
 0x7e,0x01,0x81,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_lt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x81,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_lt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x81,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 0x7e,0x02,0x81,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_lt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x81,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_lt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x81,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0x7e,0x83,0x81,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_lt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x81,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_lt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_lt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_lt_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_lt_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_lt_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_lt_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_lt_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_lt_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_lt_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_lt_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_lt_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_lt_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_lt_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_lt_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_lt_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_lt_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_lt_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_lt_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_lt_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_lt_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_lt_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x91,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_lt_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_lt_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_lt_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_lt_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x91,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
 0x7e,0x01,0x91,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_lt_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x91,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_lt_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x91,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 0x7e,0x02,0x91,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_lt_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x91,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_lt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x91,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0x7e,0x83,0x91,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_lt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x91,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_lt_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xb1,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_lt_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_lt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0x7e,0x00,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_lt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_lt_i32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xc1,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_lt_i32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_lt_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0x7e,0x00,0xc1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_lt_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_lt_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xb9,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_lt_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_lt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0x7e,0x00,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_lt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_lt_u32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xc9,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_lt_u32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_lt_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0x7e,0x00,0xc9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_lt_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_ne_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xb5,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ne_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_ne_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0x7e,0x00,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_ne_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_ne_i32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xc5,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ne_i32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_ne_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0x7e,0x00,0xc5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_ne_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_ne_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xbd,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ne_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_ne_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0x7e,0x00,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_ne_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_ne_u32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0xcd,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ne_u32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_ne_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0x7e,0x00,0xcd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_ne_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_neq_f16_e64_dpp v1,
v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_neq_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x8d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_neq_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_neq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x01,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_neq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_neq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x02,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_neq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x83,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] -# GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# 
GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_neq_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x9d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x9d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_neq_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_neq_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x9d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x01,0x9d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_neq_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x9d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_neq_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x9d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x02,0x9d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_neq_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x9d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_neq_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x83,0x9d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmpx_neq_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX12: 
v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_nge_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_nge_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_nge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x01,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_nge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_nge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x02,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_nge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; 
encoding: [0x7e,0x83,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x83,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] -# GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf 
bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_nge_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x99,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_nge_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_nge_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x99,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x01,0x99,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_nge_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x99,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_nge_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x99,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x02,0x99,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_nge_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x99,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x99,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x83,0x99,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x99,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_half_mirror 
row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_ngt_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x01,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x7e,0x01,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_ngt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x02,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_ngt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x83,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] -# GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_ngt_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x9b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_ngt_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_ngt_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x9b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x01,0x9b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_ngt_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x9b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_ngt_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x9b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x02,0x9b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_ngt_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x9b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x83,0x9b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 
quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_nle_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_nle_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 
0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_nle_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x01,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_nle_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_nle_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x02,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_nle_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x83,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] -# GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; 
encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_nle_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x9c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_nle_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_nle_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x9c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x01,0x9c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_nle_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x9c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_nle_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x9c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x02,0x9c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_nle_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x9c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x83,0x9c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_nlg_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x8a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_nlg_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
 0x7e,0x01,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_nlg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 0x7e,0x02,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_nlg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0x7e,0x83,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_nlg_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x9a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_nlg_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_nlg_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x9a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
 0x7e,0x01,0x9a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_nlg_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x9a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_nlg_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x9a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 0x7e,0x02,0x9a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_nlg_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x9a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0x7e,0x83,0x9a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_nlt_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x8e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_nlt_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
 0x7e,0x01,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_nlt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 0x7e,0x02,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_nlt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0x7e,0x83,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_nlt_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x9e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_nlt_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_nlt_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x9e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
 0x7e,0x01,0x9e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_nlt_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x9e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_nlt_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x9e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 0x7e,0x02,0x9e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_nlt_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x9e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_nlt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0x7e,0x83,0x9e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_nlt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_o_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x87,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_o_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_o_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
 0x7e,0x01,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_o_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_o_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 0x7e,0x02,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_o_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0x7e,0x83,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_o_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_o_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_o_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_o_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_o_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_o_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_o_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_o_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_o_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_o_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_o_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_o_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_o_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_o_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_o_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_o_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_o_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_o_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_o_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_o_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_o_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x97,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_o_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_o_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_o_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_o_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x97,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
 0x7e,0x01,0x97,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_o_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x97,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_o_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x97,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 0x7e,0x02,0x97,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_o_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x97,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x97,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0x7e,0x83,0x97,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x97,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_u_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x88,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_u_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_u_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
 0x7e,0x01,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_u_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_u_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 0x7e,0x02,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_u_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0x7e,0x83,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_u_f32_e64_dpp v1, 2.0 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x98,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_u_f32_e64_dpp v1, 2.0 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_u_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
 0x7e,0x00,0x98,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_u_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_u_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x98,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
 0x7e,0x01,0x98,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_u_f32_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x98,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_u_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x98,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 0x7e,0x02,0x98,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_u_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x98,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_u_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x98,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0x7e,0x83,0x98,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_u_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x98,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 # Check that dst value does not affect disassembly
-# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x00,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 # Check that dst value does not affect disassembly
-# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0xff,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt
index 0184f667640b5..9442dcc4fb1d5 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt
@@ -1,676 +1,679 @@
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s
+# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s
-# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_class_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xfd,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_class_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_class_f16_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x01,0xfd,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]
 0x7e,0x01,0xfd,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_class_f16_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x01,0xfd,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]
-# GFX12: v_cmpx_class_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xfe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_class_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_class_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfe,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xfe,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_class_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfe,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_class_f32_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x01,0xfe,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]
 0x7e,0x01,0xfe,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_class_f32_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x01,0xfe,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]
-# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_eq_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0x82,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_eq_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_eq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
 0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_eq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_eq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x82,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 0x7e,0x02,0x82,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_eq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x82,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7e,0x83,0x82,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
-# GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x92,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0x92,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x92,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_eq_f32_e64_dpp v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x92,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0x92,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_eq_f32_e64_dpp v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x92,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_eq_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x92,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0x92,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_eq_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x92,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_eq_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x92,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
 0x7e,0x01,0x92,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_eq_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x92,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_eq_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x92,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 0x7e,0x02,0x92,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_eq_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x92,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_eq_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x92,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7e,0x83,0x92,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_eq_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x92,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
-# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_eq_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xb2,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_eq_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_eq_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0x7e,0x00,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_eq_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xc2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_eq_i32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc2,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xc2,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_eq_i32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc2,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_eq_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0x7e,0x00,0xc2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_eq_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_eq_u16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xba,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_eq_u16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_eq_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0x7e,0x00,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_eq_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-# GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xca,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xca,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xca,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_eq_u32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xca,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xca,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_eq_u32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xca,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_eq_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xca,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0x7e,0x00,0xca,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_eq_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xca,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0x86,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ge_f16_e64_dpp v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0x86,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ge_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_ge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
 0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_ge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x86,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 0x7e,0x02,0x86,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x86,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7e,0x83,0x86,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
-# GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x96,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0x96,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x96,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_ge_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x96,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0x96,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ge_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x96,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_ge_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x96,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
 0x7e,0x01,0x96,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ge_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x96,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_ge_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x96,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 0x7e,0x02,0x96,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ge_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x96,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_ge_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x96,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7e,0x83,0x96,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_ge_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x96,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
-# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_ge_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xb6,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ge_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_ge_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0x7e,0x00,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_ge_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-# GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xc6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_ge_i32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc6,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xc6,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ge_i32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc6,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_ge_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0x7e,0x00,0xc6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_ge_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_ge_u16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xbe,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ge_u16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_ge_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0x7e,0x00,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_ge_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-# GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xce,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xce,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xce,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_ge_u32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xce,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xce,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ge_u32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xce,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_ge_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xce,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0x7e,0x00,0xce,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_ge_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xce,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_gt_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0x84,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_gt_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_gt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
 0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_gt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_gt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x84,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 0x7e,0x02,0x84,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_gt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x84,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7e,0x83,0x84,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
-# GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x94,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0x94,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x94,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_gt_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x94,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0x94,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_gt_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x94,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_gt_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x94,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
 0x7e,0x01,0x94,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_gt_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x94,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_gt_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x94,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 0x7e,0x02,0x94,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_gt_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x94,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_gt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x94,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7e,0x83,0x94,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_gt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x94,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
-# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_gt_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xb4,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_gt_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_gt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0x7e,0x00,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_gt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-# GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xc4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_gt_i32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc4,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xc4,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_gt_i32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc4,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_gt_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0x7e,0x00,0xc4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_gt_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_gt_u16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xbc,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_gt_u16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_gt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0x7e,0x00,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_gt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-# GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xcc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_gt_u32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcc,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xcc,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_gt_u32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcc,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_gt_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xcc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0x7e,0x00,0xcc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_gt_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xcc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_le_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0x83,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_le_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_le_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
 0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_le_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_le_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x83,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 0x7e,0x02,0x83,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_le_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x83,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7e,0x83,0x83,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
-# GFX12: v_cmpx_le_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x93,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0x93,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_le_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x93,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_le_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x93,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0x93,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_le_f32_e64_dpp
v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x93,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_le_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x93,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x93,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_le_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x93,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_le_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x93,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x93,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_le_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x93,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_le_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x93,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x93,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_le_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x93,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_le_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xb3,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_le_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_le_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_le_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_le_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xc3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_le_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_le_i32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc3,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xc3,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_le_i32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc3,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_le_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xc3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_le_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_le_u16_e64_dpp v1, s3 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xbb,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_le_u16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_le_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_le_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_le_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xcb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_le_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_le_u32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcb,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xcb,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_le_u32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcb,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_le_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xcb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xcb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_le_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xcb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_lg_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x85,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_lg_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_lg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_lg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_lg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x85,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x85,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_lg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x85,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x85,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x95,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 
0x7e,0x00,0x95,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x95,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_lg_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x95,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x95,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_lg_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x95,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_lg_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x95,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x95,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_lg_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x95,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_lg_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x95,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x95,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_lg_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x95,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_lg_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x95,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x95,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_lg_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x95,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x81,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x81,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x81,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_lt_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x81,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x81,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_lt_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x81,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_lt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x81,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x81,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_lt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x81,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_lt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x81,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x81,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_lt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x81,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_lt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x81,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x81,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_lt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x81,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_lt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x91,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x91,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_lt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x7e,0x00,0x91,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_lt_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x91,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x91,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_lt_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x91,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_lt_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x91,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x91,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_lt_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x91,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_lt_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x91,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x91,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_lt_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x91,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_lt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x91,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x91,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_lt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x91,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_lt_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xb1,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_lt_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_lt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_lt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xc1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_lt_i32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc1,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xc1,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_lt_i32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc1,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_lt_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xc1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_lt_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_lt_u16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xb9,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_lt_u16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_lt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_lt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xc9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_lt_u32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc9,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xc9,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_lt_u32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc9,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_lt_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xc9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_lt_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_ne_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xb5,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_ne_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_ne_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_ne_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xc5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_ne_i32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc5,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xc5,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_ne_i32_e64_dpp v1, s3 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc5,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_ne_i32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc5,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xc5,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_ne_i32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc5,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_ne_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xc5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_ne_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_ne_u16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xbd,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_ne_u16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_ne_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_ne_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xcd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_ne_u32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcd,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xcd,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_ne_u32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcd,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_ne_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xcd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xcd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_ne_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xcd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_neq_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x8d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_neq_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_neq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_neq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_neq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x8d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_neq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x8d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x9d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_neq_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x9d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_neq_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_neq_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x9d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x9d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_neq_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x9d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_neq_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x9d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x9d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_neq_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x9d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_neq_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x9d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_neq_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nge_f16_e64_dpp v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x89,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nge_f16_e64_dpp v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nge_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 
0x7e,0x00,0x89,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nge_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x89,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x89,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x89,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x89,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x99,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x99,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x99,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nge_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x99,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x99,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nge_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x99,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nge_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x99,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x99,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nge_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x99,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nge_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x99,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x99,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nge_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x99,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x99,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x99,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x99,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x8b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_ngt_f16_e64_dpp v1, s3 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_ngt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x8b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_ngt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x8b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x9b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_ngt_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x9b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_ngt_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_ngt_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x9b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x9b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_ngt_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x9b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_ngt_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x9b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x9b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_ngt_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x9b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x9b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nle_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x8c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nle_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -# GFX12: 
v_cmpx_nle_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nle_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nle_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x8c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nle_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x8c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x9c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nle_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x9c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nle_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nle_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x9c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x9c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nle_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x9c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nle_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x9c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x9c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nle_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x9c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x9c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x8a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nlg_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nlg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x8a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nlg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x8a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x9a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nlg_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x9a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nlg_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nlg_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x9a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x9a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nlg_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x9a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nlg_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x9a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x9a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nlg_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x9a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x9a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nlt_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x8e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nlt_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 
0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nlt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x8e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nlt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x8e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x9e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nlt_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x9e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nlt_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nlt_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x9e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x9e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nlt_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x9e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nlt_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x9e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x9e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nlt_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x9e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nlt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x9e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_nlt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_o_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x87,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_o_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_o_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_o_f16_e64_dpp |v1|, -v2 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_o_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x87,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x87,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_o_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x87,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x87,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_o_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x97,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x97,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_o_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x97,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_o_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x97,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x97,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_o_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x97,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_o_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x97,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x97,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_o_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x97,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_o_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x97,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x02,0x97,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_o_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x97,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x97,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x83,0x97,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x97,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_u_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x88,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_u_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_u_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_u_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_u_f16_e64_dpp -v1, |v2| 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x88,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 0x7e,0x02,0x88,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_u_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x88,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]

-# GFX12: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7e,0x83,0x88,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]

-# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x98,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0x98,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x98,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]

-# GFX12: v_cmpx_u_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x98,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0x98,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_u_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x98,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]

-# GFX12: v_cmpx_u_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x98,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
 0x7e,0x01,0x98,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_u_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x98,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]

-# GFX12: v_cmpx_u_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x98,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 0x7e,0x02,0x98,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_u_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x98,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]

-# GFX12: v_cmpx_u_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x98,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0x7e,0x83,0x98,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_u_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x98,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]

 # Check that dst value does not affect disassembly
-# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x98,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x00,0x00,0x98,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x98,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]

 # Check that dst value does not affect disassembly
-# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x98,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0xff,0x00,0x98,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x98,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopc.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopc.txt
index 8fff403b502ac..3054c5e04f7a8 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopc.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopc.txt
@@ -1,4540 +1,4541 @@
+# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W32
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W64
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W32
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W64

-# W32: v_cmp_class_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0xfa,0x7c]
-# W64: v_cmp_class_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0xfa,0x7c]
 0x01,0x05,0xfa,0x7c
+# W32: v_cmp_class_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0xfa,0x7c]
+# W64: v_cmp_class_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0xfa,0x7c]

-# W32: v_cmp_class_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0xfa,0x7c]
-# W64: v_cmp_class_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0xfa,0x7c]
 0x7f,0x05,0xfa,0x7c
+# W32: v_cmp_class_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0xfa,0x7c]
+# W64: v_cmp_class_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0xfa,0x7c]

-# W32: v_cmp_class_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0xfa,0x7c]
-# W64: v_cmp_class_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0xfa,0x7c]
 0x01,0x04,0xfa,0x7c
+# W32: v_cmp_class_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0xfa,0x7c]
+# W64: v_cmp_class_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0xfa,0x7c]

-# W32: v_cmp_class_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0xfa,0x7c]
-# W64: v_cmp_class_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0xfa,0x7c]
 0x69,0x04,0xfa,0x7c
+# W32: v_cmp_class_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0xfa,0x7c]
+# W64: v_cmp_class_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0xfa,0x7c]

-# W32: v_cmp_class_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0xfa,0x7c]
-# W64: v_cmp_class_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0xfa,0x7c]
 0x6a,0x04,0xfa,0x7c
+# W32: v_cmp_class_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0xfa,0x7c]
+# W64: v_cmp_class_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0xfa,0x7c]

-# W32: v_cmp_class_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0xfa,0x7c]
-# W64: v_cmp_class_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0xfa,0x7c]
 0x6b,0x04,0xfa,0x7c
+# W32: v_cmp_class_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0xfa,0x7c]
+# W64: v_cmp_class_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0xfa,0x7c]

-# W32: v_cmp_class_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0xfa,0x7c]
-# W64: v_cmp_class_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0xfa,0x7c]
 0x7b,0x04,0xfa,0x7c
+# W32: v_cmp_class_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0xfa,0x7c]
+# W64: v_cmp_class_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0xfa,0x7c]

-# W32: v_cmp_class_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0xfa,0x7c]
-# W64: v_cmp_class_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0xfa,0x7c]
 0x7d,0x04,0xfa,0x7c
+# W32: v_cmp_class_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0xfa,0x7c]
+# W64: v_cmp_class_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0xfa,0x7c]

-# W32: v_cmp_class_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0xfa,0x7c]
-# W64: v_cmp_class_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0xfa,0x7c]
 0x7e,0x04,0xfa,0x7c
+# W32: v_cmp_class_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0xfa,0x7c]
+# W64: v_cmp_class_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0xfa,0x7c]

-# W32: v_cmp_class_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0xfa,0x7c]
-# W64: v_cmp_class_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0xfa,0x7c]
 0x7f,0x04,0xfa,0x7c
+# W32: v_cmp_class_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0xfa,0x7c]
+# W64: v_cmp_class_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0xfa,0x7c]

-# W32: v_cmp_class_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0xfa,0x7c]
-# W64: v_cmp_class_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0xfa,0x7c]
 0x7c,0x04,0xfa,0x7c
+# W32: v_cmp_class_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0xfa,0x7c]
+# W64: v_cmp_class_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0xfa,0x7c]

-# W32: v_cmp_class_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0xfa,0x7c]
-# W64: v_cmp_class_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0xfa,0x7c]
 0xc1,0x04,0xfa,0x7c
+# W32: v_cmp_class_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0xfa,0x7c]
+# W64: v_cmp_class_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0xfa,0x7c]

-# W32: v_cmp_class_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0xfa,0x7c]
-# W64: v_cmp_class_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0xfa,0x7c]
 0xf0,0x04,0xfa,0x7c
+# W32: v_cmp_class_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0xfa,0x7c]
+# W64: v_cmp_class_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0xfa,0x7c]

-# W32: v_cmp_class_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0xfa,0x7c]
-# W64: v_cmp_class_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0xfa,0x7c]
 0xfd,0x04,0xfa,0x7c
+# W32: v_cmp_class_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0xfa,0x7c]
+# W64: v_cmp_class_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0xfa,0x7c]

-# W32: v_cmp_class_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfa,0x7c,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_class_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfa,0x7c,0x0b,0xfe,0x00,0x00]
 0xff,0xfe,0xfa,0x7c,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_class_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfa,0x7c,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_class_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfa,0x7c,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_class_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0xfc,0x7c]
-# W64: v_cmp_class_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0xfc,0x7c]
 0x01,0x05,0xfc,0x7c
+# W32: v_cmp_class_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0xfc,0x7c]
+# W64: v_cmp_class_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0xfc,0x7c]

-# W32: v_cmp_class_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0xfc,0x7c]
-# W64: v_cmp_class_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0xfc,0x7c]
 0xff,0x05,0xfc,0x7c
+# W32: v_cmp_class_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0xfc,0x7c]
+# W64: v_cmp_class_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0xfc,0x7c]

-# W32: v_cmp_class_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0xfc,0x7c]
-# W64: v_cmp_class_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0xfc,0x7c]
 0x01,0x04,0xfc,0x7c
+# W32: v_cmp_class_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0xfc,0x7c]
+# W64: v_cmp_class_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0xfc,0x7c]

-# W32: v_cmp_class_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0xfc,0x7c]
-# W64: v_cmp_class_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0xfc,0x7c]
 0x69,0x04,0xfc,0x7c
+# W32: v_cmp_class_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0xfc,0x7c]
+# W64: v_cmp_class_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0xfc,0x7c]

-# W32: v_cmp_class_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0xfc,0x7c]
-# W64: v_cmp_class_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0xfc,0x7c]
 0x6a,0x04,0xfc,0x7c
+# W32: v_cmp_class_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0xfc,0x7c]
+# W64: v_cmp_class_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0xfc,0x7c]

-# W32: v_cmp_class_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0xfc,0x7c]
-# W64: v_cmp_class_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0xfc,0x7c]
 0x6b,0x04,0xfc,0x7c
+# W32: v_cmp_class_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0xfc,0x7c]
+# W64: v_cmp_class_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0xfc,0x7c]

-# W32: v_cmp_class_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0xfc,0x7c]
-# W64: v_cmp_class_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0xfc,0x7c]
 0x7b,0x04,0xfc,0x7c
+# W32: v_cmp_class_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0xfc,0x7c]
+# W64: v_cmp_class_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0xfc,0x7c]

-# W32: v_cmp_class_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0xfc,0x7c]
-# W64: v_cmp_class_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0xfc,0x7c]
 0x7d,0x04,0xfc,0x7c
+# W32: v_cmp_class_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0xfc,0x7c]
+# W64: v_cmp_class_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0xfc,0x7c]

-# W32: v_cmp_class_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0xfc,0x7c]
-# W64: v_cmp_class_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0xfc,0x7c]
 0x7e,0x04,0xfc,0x7c
+# W32: v_cmp_class_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0xfc,0x7c]
+# W64: v_cmp_class_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0xfc,0x7c]

-# W32: v_cmp_class_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0xfc,0x7c]
-# W64: v_cmp_class_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0xfc,0x7c]
 0x7f,0x04,0xfc,0x7c
+# W32: v_cmp_class_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0xfc,0x7c]
+# W64: v_cmp_class_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0xfc,0x7c]

-# W32: v_cmp_class_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0xfc,0x7c]
-# W64: v_cmp_class_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0xfc,0x7c]
 0x7c,0x04,0xfc,0x7c
+# W32: v_cmp_class_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0xfc,0x7c]
+# W64: v_cmp_class_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0xfc,0x7c]

-# W32: v_cmp_class_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0xfc,0x7c]
-# W64: v_cmp_class_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0xfc,0x7c]
 0xc1,0x04,0xfc,0x7c
+# W32: v_cmp_class_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0xfc,0x7c]
+# W64: v_cmp_class_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0xfc,0x7c]

-# W32: v_cmp_class_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0xfc,0x7c]
-# W64: v_cmp_class_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0xfc,0x7c]
 0xf0,0x04,0xfc,0x7c
+# W32: v_cmp_class_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0xfc,0x7c]
+# W64: v_cmp_class_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0xfc,0x7c]

-# W32: v_cmp_class_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0xfc,0x7c]
-# W64: v_cmp_class_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0xfc,0x7c]
 0xfd,0x04,0xfc,0x7c
+# W32: v_cmp_class_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0xfc,0x7c]
+# W64: v_cmp_class_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0xfc,0x7c]

+0xff,0xfe,0xfd,0x7c,0x56,0x34,0x12,0xaf
 # W32: v_cmp_class_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xfd,0x7c,0x56,0x34,0x12,0xaf]
 # W64: v_cmp_class_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xfd,0x7c,0x56,0x34,0x12,0xaf]
-0xff,0xfe,0xfd,0x7c,0x56,0x34,0x12,0xaf

-# W32: v_cmp_class_f64_e32 vcc_lo, v[1:2], v2 ; encoding: [0x01,0x05,0xfe,0x7c]
-# W64: v_cmp_class_f64_e32 vcc, v[1:2], v2 ; encoding: [0x01,0x05,0xfe,0x7c]
 0x01,0x05,0xfe,0x7c
+# W32: v_cmp_class_f64_e32 vcc_lo, v[1:2], v2 ; encoding: [0x01,0x05,0xfe,0x7c]
+# W64: v_cmp_class_f64_e32 vcc, v[1:2], v2 ; encoding: [0x01,0x05,0xfe,0x7c]

-# W32: v_cmp_class_f64_e32 vcc_lo, v[254:255], v2 ; encoding: [0xfe,0x05,0xfe,0x7c]
-# W64: v_cmp_class_f64_e32 vcc, v[254:255], v2 ; encoding: [0xfe,0x05,0xfe,0x7c]
 0xfe,0x05,0xfe,0x7c
+# W32: v_cmp_class_f64_e32 vcc_lo, v[254:255], v2 ; encoding: [0xfe,0x05,0xfe,0x7c]
+# W64: v_cmp_class_f64_e32 vcc, v[254:255], v2 ; encoding: [0xfe,0x05,0xfe,0x7c]

-# W32: v_cmp_class_f64_e32 vcc_lo, s[2:3], v2 ; encoding: [0x02,0x04,0xfe,0x7c]
-# W64: v_cmp_class_f64_e32 vcc, s[2:3], v2 ; encoding: [0x02,0x04,0xfe,0x7c]
 0x02,0x04,0xfe,0x7c
+# W32: v_cmp_class_f64_e32 vcc_lo, s[2:3], v2 ; encoding: [0x02,0x04,0xfe,0x7c]
+# W64: v_cmp_class_f64_e32 vcc, s[2:3], v2 ; encoding: [0x02,0x04,0xfe,0x7c]

-# W32: v_cmp_class_f64_e32 vcc_lo, s[104:105], v2 ; encoding: [0x68,0x04,0xfe,0x7c]
-# W64: v_cmp_class_f64_e32 vcc, s[104:105], v2 ; encoding: [0x68,0x04,0xfe,0x7c]
 0x68,0x04,0xfe,0x7c
+# W32: v_cmp_class_f64_e32 vcc_lo, s[104:105], v2 ; encoding: [0x68,0x04,0xfe,0x7c]
+# W64: v_cmp_class_f64_e32 vcc, s[104:105], v2 ; encoding: [0x68,0x04,0xfe,0x7c]

-# W32: v_cmp_class_f64_e32 vcc_lo, vcc, v2 ; encoding: [0x6a,0x04,0xfe,0x7c]
-# W64: v_cmp_class_f64_e32 vcc, vcc, v2 ; encoding: [0x6a,0x04,0xfe,0x7c]
 0x6a,0x04,0xfe,0x7c
+# W32: v_cmp_class_f64_e32 vcc_lo, vcc, v2 ; encoding: [0x6a,0x04,0xfe,0x7c]
+# W64: v_cmp_class_f64_e32 vcc, vcc, v2 ; encoding: [0x6a,0x04,0xfe,0x7c]

-# W32: v_cmp_class_f64_e32 vcc_lo, ttmp[14:15], v2 ; encoding: [0x7a,0x04,0xfe,0x7c]
-# W64: v_cmp_class_f64_e32 vcc, ttmp[14:15], v2 ; encoding: [0x7a,0x04,0xfe,0x7c]
 0x7a,0x04,0xfe,0x7c
+# W32: v_cmp_class_f64_e32 vcc_lo, ttmp[14:15], v2 ; encoding: [0x7a,0x04,0xfe,0x7c]
+# W64: v_cmp_class_f64_e32 vcc, ttmp[14:15], v2 ; encoding: [0x7a,0x04,0xfe,0x7c]

-# W32: v_cmp_class_f64_e32 vcc_lo, exec, v2 ; encoding: [0x7e,0x04,0xfe,0x7c]
-# W64: v_cmp_class_f64_e32 vcc, exec, v2 ; encoding: [0x7e,0x04,0xfe,0x7c]
 0x7e,0x04,0xfe,0x7c
+# W32: v_cmp_class_f64_e32 vcc_lo, exec, v2 ; encoding: [0x7e,0x04,0xfe,0x7c]
+# W64: v_cmp_class_f64_e32 vcc, exec, v2 ; encoding: [0x7e,0x04,0xfe,0x7c]

-# W32: v_cmp_class_f64_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0xfe,0x7c]
-# W64: v_cmp_class_f64_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0xfe,0x7c]
 0x7c,0x04,0xfe,0x7c
+# W32: v_cmp_class_f64_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0xfe,0x7c]
+# W64: v_cmp_class_f64_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0xfe,0x7c]

-# W32: v_cmp_class_f64_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0xfe,0x7c]
-# W64: v_cmp_class_f64_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0xfe,0x7c]
 0xc1,0x04,0xfe,0x7c
+# W32: v_cmp_class_f64_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0xfe,0x7c]
+# W64: v_cmp_class_f64_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0xfe,0x7c]

-# W32: v_cmp_class_f64_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0xfe,0x7c]
-# W64: v_cmp_class_f64_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0xfe,0x7c]
 0xf0,0x04,0xfe,0x7c
+# W32: v_cmp_class_f64_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0xfe,0x7c]
+# W64: v_cmp_class_f64_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0xfe,0x7c]

-# W32: v_cmp_class_f64_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0xfe,0x7c]
-# W64: v_cmp_class_f64_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0xfe,0x7c]
 0xfd,0x04,0xfe,0x7c
+# W32: v_cmp_class_f64_e32
vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0xfe,0x7c] +# W64: v_cmp_class_f64_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0xfe,0x7c] +0xff,0xfe,0xff,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_class_f64_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_class_f64_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfe,0xff,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_eq_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x04,0x7c] -# W64: v_cmp_eq_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x04,0x7c] 0x01,0x05,0x04,0x7c +# W32: v_cmp_eq_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x04,0x7c] +# W64: v_cmp_eq_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x04,0x7c] -# W32: v_cmp_eq_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x04,0x7c] -# W64: v_cmp_eq_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x04,0x7c] 0x7f,0x05,0x04,0x7c +# W32: v_cmp_eq_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x04,0x7c] +# W64: v_cmp_eq_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x04,0x7c] -# W32: v_cmp_eq_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x04,0x7c] -# W64: v_cmp_eq_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x04,0x7c] 0x01,0x04,0x04,0x7c +# W32: v_cmp_eq_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x04,0x7c] +# W64: v_cmp_eq_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x04,0x7c] -# W32: v_cmp_eq_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x04,0x7c] -# W64: v_cmp_eq_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x04,0x7c] 0x69,0x04,0x04,0x7c +# W32: v_cmp_eq_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x04,0x7c] +# W64: v_cmp_eq_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x04,0x7c] -# W32: v_cmp_eq_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x04,0x7c] -# W64: v_cmp_eq_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x04,0x7c] 0x6a,0x04,0x04,0x7c +# W32: v_cmp_eq_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x04,0x7c] +# W64: v_cmp_eq_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x04,0x7c] -# W32: v_cmp_eq_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x04,0x7c] -# W64: v_cmp_eq_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x04,0x7c] 0x6b,0x04,0x04,0x7c +# W32: v_cmp_eq_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x04,0x7c] +# W64: v_cmp_eq_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x04,0x7c] -# W32: v_cmp_eq_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x04,0x7c] -# W64: v_cmp_eq_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x04,0x7c] 0x7b,0x04,0x04,0x7c +# W32: v_cmp_eq_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x04,0x7c] +# W64: v_cmp_eq_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x04,0x7c] -# W32: v_cmp_eq_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x04,0x7c] -# W64: v_cmp_eq_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x04,0x7c] 0x7d,0x04,0x04,0x7c +# W32: v_cmp_eq_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x04,0x7c] +# W64: v_cmp_eq_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x04,0x7c] -# W32: v_cmp_eq_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x04,0x7c] -# W64: v_cmp_eq_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x04,0x7c] 0x7e,0x04,0x04,0x7c +# W32: v_cmp_eq_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x04,0x7c] +# W64: v_cmp_eq_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x04,0x7c] -# W32: v_cmp_eq_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x04,0x7c] -# W64: v_cmp_eq_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x04,0x7c] 0x7f,0x04,0x04,0x7c +# W32: v_cmp_eq_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x04,0x7c] +# 
W64: v_cmp_eq_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x04,0x7c] -# W32: v_cmp_eq_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x04,0x7c] -# W64: v_cmp_eq_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x04,0x7c] 0x7c,0x04,0x04,0x7c +# W32: v_cmp_eq_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x04,0x7c] +# W64: v_cmp_eq_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x04,0x7c] -# W32: v_cmp_eq_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x04,0x7c] -# W64: v_cmp_eq_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x04,0x7c] 0xc1,0x04,0x04,0x7c +# W32: v_cmp_eq_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x04,0x7c] +# W64: v_cmp_eq_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x04,0x7c] -# W32: v_cmp_eq_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x04,0x7c] -# W64: v_cmp_eq_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x04,0x7c] 0xf0,0x04,0x04,0x7c +# W32: v_cmp_eq_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x04,0x7c] +# W64: v_cmp_eq_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x04,0x7c] -# W32: v_cmp_eq_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x04,0x7c] -# W64: v_cmp_eq_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x04,0x7c] 0xfd,0x04,0x04,0x7c +# W32: v_cmp_eq_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x04,0x7c] +# W64: v_cmp_eq_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x04,0x7c] -# W32: v_cmp_eq_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x04,0x7c,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_eq_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x04,0x7c,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x04,0x7c,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_eq_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x04,0x7c,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_eq_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x04,0x7c,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_eq_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x24,0x7c] -# W64: v_cmp_eq_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x24,0x7c] 0x01,0x05,0x24,0x7c +# W32: v_cmp_eq_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x24,0x7c] +# W64: v_cmp_eq_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x24,0x7c] -# W32: v_cmp_eq_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x24,0x7c] -# W64: v_cmp_eq_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x24,0x7c] 0xff,0x05,0x24,0x7c +# W32: v_cmp_eq_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x24,0x7c] +# W64: v_cmp_eq_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x24,0x7c] -# W32: v_cmp_eq_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x24,0x7c] -# W64: v_cmp_eq_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x24,0x7c] 0x01,0x04,0x24,0x7c +# W32: v_cmp_eq_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x24,0x7c] +# W64: v_cmp_eq_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x24,0x7c] -# W32: v_cmp_eq_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x24,0x7c] -# W64: v_cmp_eq_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x24,0x7c] 0x69,0x04,0x24,0x7c +# W32: v_cmp_eq_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x24,0x7c] +# W64: v_cmp_eq_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x24,0x7c] -# W32: v_cmp_eq_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x24,0x7c] -# W64: v_cmp_eq_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x24,0x7c] 0x6a,0x04,0x24,0x7c +# W32: v_cmp_eq_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x24,0x7c] +# W64: v_cmp_eq_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x24,0x7c] -# W32: v_cmp_eq_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x24,0x7c] -# W64: v_cmp_eq_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x24,0x7c] 0x6b,0x04,0x24,0x7c +# W32: 
v_cmp_eq_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x24,0x7c] +# W64: v_cmp_eq_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x24,0x7c] -# W32: v_cmp_eq_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x24,0x7c] -# W64: v_cmp_eq_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x24,0x7c] 0x7b,0x04,0x24,0x7c +# W32: v_cmp_eq_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x24,0x7c] +# W64: v_cmp_eq_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x24,0x7c] -# W32: v_cmp_eq_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x24,0x7c] -# W64: v_cmp_eq_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x24,0x7c] 0x7d,0x04,0x24,0x7c +# W32: v_cmp_eq_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x24,0x7c] +# W64: v_cmp_eq_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x24,0x7c] -# W32: v_cmp_eq_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x24,0x7c] -# W64: v_cmp_eq_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x24,0x7c] 0x7e,0x04,0x24,0x7c +# W32: v_cmp_eq_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x24,0x7c] +# W64: v_cmp_eq_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x24,0x7c] -# W32: v_cmp_eq_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x24,0x7c] -# W64: v_cmp_eq_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x24,0x7c] 0x7f,0x04,0x24,0x7c +# W32: v_cmp_eq_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x24,0x7c] +# W64: v_cmp_eq_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x24,0x7c] -# W32: v_cmp_eq_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x24,0x7c] -# W64: v_cmp_eq_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x24,0x7c] 0x7c,0x04,0x24,0x7c +# W32: v_cmp_eq_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x24,0x7c] +# W64: v_cmp_eq_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x24,0x7c] -# W32: v_cmp_eq_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x24,0x7c] -# W64: v_cmp_eq_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x24,0x7c] 0xc1,0x04,0x24,0x7c +# W32: v_cmp_eq_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x24,0x7c] +# W64: v_cmp_eq_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x24,0x7c] -# W32: v_cmp_eq_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x24,0x7c] -# W64: v_cmp_eq_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x24,0x7c] 0xf0,0x04,0x24,0x7c +# W32: v_cmp_eq_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x24,0x7c] +# W64: v_cmp_eq_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x24,0x7c] -# W32: v_cmp_eq_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x24,0x7c] -# W64: v_cmp_eq_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x24,0x7c] 0xfd,0x04,0x24,0x7c +# W32: v_cmp_eq_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x24,0x7c] +# W64: v_cmp_eq_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x24,0x7c] -# W32: v_cmp_eq_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x25,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_eq_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x25,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x25,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_eq_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x25,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_eq_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x25,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_eq_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x44,0x7c] -# W64: v_cmp_eq_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x44,0x7c] 0x01,0x05,0x44,0x7c +# W32: v_cmp_eq_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x44,0x7c] +# W64: v_cmp_eq_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x44,0x7c] -# W32: v_cmp_eq_f64_e32 vcc_lo, v[254:255], 
v[2:3] ; encoding: [0xfe,0x05,0x44,0x7c] -# W64: v_cmp_eq_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x44,0x7c] 0xfe,0x05,0x44,0x7c +# W32: v_cmp_eq_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x44,0x7c] +# W64: v_cmp_eq_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x44,0x7c] -# W32: v_cmp_eq_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x44,0x7c] -# W64: v_cmp_eq_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x44,0x7c] 0x02,0x04,0x44,0x7c +# W32: v_cmp_eq_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x44,0x7c] +# W64: v_cmp_eq_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x44,0x7c] -# W32: v_cmp_eq_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x44,0x7c] -# W64: v_cmp_eq_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x44,0x7c] 0x68,0x04,0x44,0x7c +# W32: v_cmp_eq_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x44,0x7c] +# W64: v_cmp_eq_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x44,0x7c] -# W32: v_cmp_eq_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x44,0x7c] -# W64: v_cmp_eq_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x44,0x7c] 0x6a,0x04,0x44,0x7c +# W32: v_cmp_eq_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x44,0x7c] +# W64: v_cmp_eq_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x44,0x7c] +0x7a,0x04,0x44,0x7c # W32: v_cmp_eq_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x44,0x7c] # W64: v_cmp_eq_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x44,0x7c] -0x7a,0x04,0x44,0x7c -# W32: v_cmp_eq_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x44,0x7c] -# W64: v_cmp_eq_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x44,0x7c] 0x7e,0x04,0x44,0x7c +# W32: v_cmp_eq_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x44,0x7c] +# W64: v_cmp_eq_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x44,0x7c] -# W32: v_cmp_eq_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x44,0x7c] -# W64: v_cmp_eq_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x44,0x7c] 0x7c,0x04,0x44,0x7c +# W32: v_cmp_eq_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x44,0x7c] +# W64: v_cmp_eq_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x44,0x7c] -# W32: v_cmp_eq_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x44,0x7c] -# W64: v_cmp_eq_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x44,0x7c] 0xc1,0x04,0x44,0x7c +# W32: v_cmp_eq_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x44,0x7c] +# W64: v_cmp_eq_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x44,0x7c] -# W32: v_cmp_eq_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x44,0x7c] -# W64: v_cmp_eq_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x44,0x7c] 0xf0,0x04,0x44,0x7c +# W32: v_cmp_eq_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x44,0x7c] +# W64: v_cmp_eq_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x44,0x7c] -# W32: v_cmp_eq_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x44,0x7c] -# W64: v_cmp_eq_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x44,0x7c] 0xfd,0x04,0x44,0x7c +# W32: v_cmp_eq_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x44,0x7c] +# W64: v_cmp_eq_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x44,0x7c] +0xff,0xfc,0x45,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_eq_f64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x45,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_eq_f64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x45,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0x45,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_eq_i16_e32 vcc_lo, 
v1, v2 ; encoding: [0x01,0x05,0x64,0x7c] -# W64: v_cmp_eq_i16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x64,0x7c] 0x01,0x05,0x64,0x7c +# W32: v_cmp_eq_i16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x64,0x7c] +# W64: v_cmp_eq_i16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x64,0x7c] -# W32: v_cmp_eq_i16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x64,0x7c] -# W64: v_cmp_eq_i16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x64,0x7c] 0x7f,0x05,0x64,0x7c +# W32: v_cmp_eq_i16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x64,0x7c] +# W64: v_cmp_eq_i16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x64,0x7c] -# W32: v_cmp_eq_i16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x64,0x7c] -# W64: v_cmp_eq_i16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x64,0x7c] 0x01,0x04,0x64,0x7c +# W32: v_cmp_eq_i16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x64,0x7c] +# W64: v_cmp_eq_i16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x64,0x7c] -# W32: v_cmp_eq_i16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x64,0x7c] -# W64: v_cmp_eq_i16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x64,0x7c] 0x69,0x04,0x64,0x7c +# W32: v_cmp_eq_i16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x64,0x7c] +# W64: v_cmp_eq_i16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x64,0x7c] -# W32: v_cmp_eq_i16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x64,0x7c] -# W64: v_cmp_eq_i16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x64,0x7c] 0x6a,0x04,0x64,0x7c +# W32: v_cmp_eq_i16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x64,0x7c] +# W64: v_cmp_eq_i16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x64,0x7c] -# W32: v_cmp_eq_i16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x64,0x7c] -# W64: v_cmp_eq_i16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x64,0x7c] 0x6b,0x04,0x64,0x7c +# W32: v_cmp_eq_i16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x64,0x7c] +# W64: v_cmp_eq_i16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x64,0x7c] -# W32: v_cmp_eq_i16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x64,0x7c] -# W64: v_cmp_eq_i16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x64,0x7c] 0x7b,0x04,0x64,0x7c +# W32: v_cmp_eq_i16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x64,0x7c] +# W64: v_cmp_eq_i16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x64,0x7c] -# W32: v_cmp_eq_i16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x64,0x7c] -# W64: v_cmp_eq_i16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x64,0x7c] 0x7d,0x04,0x64,0x7c +# W32: v_cmp_eq_i16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x64,0x7c] +# W64: v_cmp_eq_i16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x64,0x7c] -# W32: v_cmp_eq_i16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x64,0x7c] -# W64: v_cmp_eq_i16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x64,0x7c] 0x7e,0x04,0x64,0x7c +# W32: v_cmp_eq_i16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x64,0x7c] +# W64: v_cmp_eq_i16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x64,0x7c] -# W32: v_cmp_eq_i16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x64,0x7c] -# W64: v_cmp_eq_i16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x64,0x7c] 0x7f,0x04,0x64,0x7c +# W32: v_cmp_eq_i16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x64,0x7c] +# W64: v_cmp_eq_i16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x64,0x7c] -# W32: v_cmp_eq_i16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x64,0x7c] -# W64: v_cmp_eq_i16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x64,0x7c] 0x7c,0x04,0x64,0x7c +# W32: v_cmp_eq_i16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x64,0x7c] +# W64: v_cmp_eq_i16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x64,0x7c] -# W32: v_cmp_eq_i16_e32 vcc_lo, -1, v2 ; encoding: 
[0xc1,0x04,0x64,0x7c] -# W64: v_cmp_eq_i16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x64,0x7c] 0xc1,0x04,0x64,0x7c +# W32: v_cmp_eq_i16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x64,0x7c] +# W64: v_cmp_eq_i16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x64,0x7c] -# W32: v_cmp_eq_i16_e32 vcc_lo, 0x3800, v2 -# W64: v_cmp_eq_i16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x64,0x7c,0x00,0x38,0x00,0x00] 0xf0,0x04,0x64,0x7c +# W32: v_cmp_eq_i16_e32 vcc_lo, 0x3800, v2 ; encoding: [0xff,0x04,0x64,0x7c,0x00,0x38,0x00,0x00] +# W64: v_cmp_eq_i16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x64,0x7c,0x00,0x38,0x00,0x00] -# W32: v_cmp_eq_i16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x64,0x7c] -# W64: v_cmp_eq_i16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x64,0x7c] 0xfd,0x04,0x64,0x7c +# W32: v_cmp_eq_i16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x64,0x7c] +# W64: v_cmp_eq_i16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x64,0x7c] -# W32: v_cmp_eq_i16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x64,0x7c,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_eq_i16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x64,0x7c,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x64,0x7c,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_eq_i16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x64,0x7c,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_eq_i16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x64,0x7c,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_eq_i32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x84,0x7c] -# W64: v_cmp_eq_i32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x84,0x7c] 0x01,0x05,0x84,0x7c +# W32: v_cmp_eq_i32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x84,0x7c] +# W64: v_cmp_eq_i32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x84,0x7c] -# W32: v_cmp_eq_i32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x84,0x7c] -# W64: v_cmp_eq_i32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x84,0x7c] 0xff,0x05,0x84,0x7c +# W32: v_cmp_eq_i32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x84,0x7c] +# W64: v_cmp_eq_i32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x84,0x7c] -# W32: v_cmp_eq_i32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x84,0x7c] -# W64: v_cmp_eq_i32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x84,0x7c] 0x01,0x04,0x84,0x7c +# W32: v_cmp_eq_i32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x84,0x7c] +# W64: v_cmp_eq_i32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x84,0x7c] -# W32: v_cmp_eq_i32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x84,0x7c] -# W64: v_cmp_eq_i32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x84,0x7c] 0x69,0x04,0x84,0x7c +# W32: v_cmp_eq_i32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x84,0x7c] +# W64: v_cmp_eq_i32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x84,0x7c] -# W32: v_cmp_eq_i32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x84,0x7c] -# W64: v_cmp_eq_i32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x84,0x7c] 0x6a,0x04,0x84,0x7c +# W32: v_cmp_eq_i32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x84,0x7c] +# W64: v_cmp_eq_i32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x84,0x7c] -# W32: v_cmp_eq_i32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x84,0x7c] -# W64: v_cmp_eq_i32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x84,0x7c] 0x6b,0x04,0x84,0x7c +# W32: v_cmp_eq_i32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x84,0x7c] +# W64: v_cmp_eq_i32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x84,0x7c] -# W32: v_cmp_eq_i32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x84,0x7c] -# W64: v_cmp_eq_i32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x84,0x7c] 0x7b,0x04,0x84,0x7c +# W32: v_cmp_eq_i32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x84,0x7c] +# W64: 
v_cmp_eq_i32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x84,0x7c] -# W32: v_cmp_eq_i32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x84,0x7c] -# W64: v_cmp_eq_i32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x84,0x7c] 0x7d,0x04,0x84,0x7c +# W32: v_cmp_eq_i32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x84,0x7c] +# W64: v_cmp_eq_i32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x84,0x7c] -# W32: v_cmp_eq_i32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x84,0x7c] -# W64: v_cmp_eq_i32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x84,0x7c] 0x7e,0x04,0x84,0x7c +# W32: v_cmp_eq_i32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x84,0x7c] +# W64: v_cmp_eq_i32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x84,0x7c] -# W32: v_cmp_eq_i32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x84,0x7c] -# W64: v_cmp_eq_i32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x84,0x7c] 0x7f,0x04,0x84,0x7c +# W32: v_cmp_eq_i32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x84,0x7c] +# W64: v_cmp_eq_i32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x84,0x7c] -# W32: v_cmp_eq_i32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x84,0x7c] -# W64: v_cmp_eq_i32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x84,0x7c] 0x7c,0x04,0x84,0x7c +# W32: v_cmp_eq_i32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x84,0x7c] +# W64: v_cmp_eq_i32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x84,0x7c] -# W32: v_cmp_eq_i32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x84,0x7c] -# W64: v_cmp_eq_i32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x84,0x7c] 0xc1,0x04,0x84,0x7c +# W32: v_cmp_eq_i32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x84,0x7c] +# W64: v_cmp_eq_i32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x84,0x7c] -# W32: v_cmp_eq_i32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x84,0x7c] -# W64: v_cmp_eq_i32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x84,0x7c] 0xf0,0x04,0x84,0x7c +# W32: v_cmp_eq_i32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x84,0x7c] +# W64: v_cmp_eq_i32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x84,0x7c] -# W32: v_cmp_eq_i32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x84,0x7c] -# W64: v_cmp_eq_i32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x84,0x7c] 0xfd,0x04,0x84,0x7c +# W32: v_cmp_eq_i32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x84,0x7c] +# W64: v_cmp_eq_i32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x84,0x7c] -# W32: v_cmp_eq_i32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x85,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_eq_i32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x85,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x85,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_eq_i32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x85,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_eq_i32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x85,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_eq_i64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa4,0x7c] -# W64: v_cmp_eq_i64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa4,0x7c] 0x01,0x05,0xa4,0x7c +# W32: v_cmp_eq_i64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa4,0x7c] +# W64: v_cmp_eq_i64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa4,0x7c] -# W32: v_cmp_eq_i64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa4,0x7c] -# W64: v_cmp_eq_i64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa4,0x7c] 0xfe,0x05,0xa4,0x7c +# W32: v_cmp_eq_i64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa4,0x7c] +# W64: v_cmp_eq_i64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa4,0x7c] -# W32: v_cmp_eq_i64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa4,0x7c] -# W64: 
v_cmp_eq_i64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa4,0x7c] 0x02,0x04,0xa4,0x7c +# W32: v_cmp_eq_i64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa4,0x7c] +# W64: v_cmp_eq_i64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa4,0x7c] -# W32: v_cmp_eq_i64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa4,0x7c] -# W64: v_cmp_eq_i64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa4,0x7c] 0x68,0x04,0xa4,0x7c +# W32: v_cmp_eq_i64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa4,0x7c] +# W64: v_cmp_eq_i64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa4,0x7c] -# W32: v_cmp_eq_i64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa4,0x7c] -# W64: v_cmp_eq_i64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa4,0x7c] 0x6a,0x04,0xa4,0x7c +# W32: v_cmp_eq_i64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa4,0x7c] +# W64: v_cmp_eq_i64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa4,0x7c] +0x7a,0x04,0xa4,0x7c # W32: v_cmp_eq_i64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa4,0x7c] # W64: v_cmp_eq_i64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa4,0x7c] -0x7a,0x04,0xa4,0x7c -# W32: v_cmp_eq_i64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xa4,0x7c] -# W64: v_cmp_eq_i64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xa4,0x7c] 0x7e,0x04,0xa4,0x7c +# W32: v_cmp_eq_i64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xa4,0x7c] +# W64: v_cmp_eq_i64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xa4,0x7c] -# W32: v_cmp_eq_i64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xa4,0x7c] -# W64: v_cmp_eq_i64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xa4,0x7c] 0x7c,0x04,0xa4,0x7c +# W32: v_cmp_eq_i64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xa4,0x7c] +# W64: v_cmp_eq_i64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xa4,0x7c] -# W32: v_cmp_eq_i64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xa4,0x7c] -# W64: v_cmp_eq_i64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xa4,0x7c] 0xc1,0x04,0xa4,0x7c +# W32: v_cmp_eq_i64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xa4,0x7c] +# W64: v_cmp_eq_i64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xa4,0x7c] -# W32: v_cmp_eq_i64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa4,0x7c] -# W64: v_cmp_eq_i64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa4,0x7c] 0xf0,0x04,0xa4,0x7c +# W32: v_cmp_eq_i64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa4,0x7c] +# W64: v_cmp_eq_i64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa4,0x7c] -# W32: v_cmp_eq_i64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa4,0x7c] -# W64: v_cmp_eq_i64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa4,0x7c] 0xfd,0x04,0xa4,0x7c +# W32: v_cmp_eq_i64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa4,0x7c] +# W64: v_cmp_eq_i64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa4,0x7c] +0xff,0xfc,0xa5,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_eq_i64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa5,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_eq_i64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa5,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0xa5,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_eq_u16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x74,0x7c] -# W64: v_cmp_eq_u16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x74,0x7c] 0x01,0x05,0x74,0x7c +# W32: v_cmp_eq_u16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x74,0x7c] +# W64: v_cmp_eq_u16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x74,0x7c] -# W32: v_cmp_eq_u16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x74,0x7c] -# W64: v_cmp_eq_u16_e32 vcc, v127, v2 ; encoding: 
[0x7f,0x05,0x74,0x7c] 0x7f,0x05,0x74,0x7c +# W32: v_cmp_eq_u16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x74,0x7c] +# W64: v_cmp_eq_u16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x74,0x7c] -# W32: v_cmp_eq_u16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x74,0x7c] -# W64: v_cmp_eq_u16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x74,0x7c] 0x01,0x04,0x74,0x7c +# W32: v_cmp_eq_u16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x74,0x7c] +# W64: v_cmp_eq_u16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x74,0x7c] -# W32: v_cmp_eq_u16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x74,0x7c] -# W64: v_cmp_eq_u16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x74,0x7c] 0x69,0x04,0x74,0x7c +# W32: v_cmp_eq_u16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x74,0x7c] +# W64: v_cmp_eq_u16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x74,0x7c] -# W32: v_cmp_eq_u16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x74,0x7c] -# W64: v_cmp_eq_u16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x74,0x7c] 0x6a,0x04,0x74,0x7c +# W32: v_cmp_eq_u16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x74,0x7c] +# W64: v_cmp_eq_u16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x74,0x7c] -# W32: v_cmp_eq_u16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x74,0x7c] -# W64: v_cmp_eq_u16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x74,0x7c] 0x6b,0x04,0x74,0x7c +# W32: v_cmp_eq_u16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x74,0x7c] +# W64: v_cmp_eq_u16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x74,0x7c] -# W32: v_cmp_eq_u16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x74,0x7c] -# W64: v_cmp_eq_u16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x74,0x7c] 0x7b,0x04,0x74,0x7c +# W32: v_cmp_eq_u16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x74,0x7c] +# W64: v_cmp_eq_u16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x74,0x7c] -# W32: v_cmp_eq_u16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x74,0x7c] -# W64: v_cmp_eq_u16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x74,0x7c] 0x7d,0x04,0x74,0x7c +# W32: v_cmp_eq_u16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x74,0x7c] +# W64: v_cmp_eq_u16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x74,0x7c] -# W32: v_cmp_eq_u16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x74,0x7c] -# W64: v_cmp_eq_u16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x74,0x7c] 0x7e,0x04,0x74,0x7c +# W32: v_cmp_eq_u16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x74,0x7c] +# W64: v_cmp_eq_u16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x74,0x7c] -# W32: v_cmp_eq_u16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x74,0x7c] -# W64: v_cmp_eq_u16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x74,0x7c] 0x7f,0x04,0x74,0x7c +# W32: v_cmp_eq_u16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x74,0x7c] +# W64: v_cmp_eq_u16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x74,0x7c] -# W32: v_cmp_eq_u16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x74,0x7c] -# W64: v_cmp_eq_u16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x74,0x7c] 0x7c,0x04,0x74,0x7c +# W32: v_cmp_eq_u16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x74,0x7c] +# W64: v_cmp_eq_u16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x74,0x7c] -# W32: v_cmp_eq_u16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x74,0x7c] -# W64: v_cmp_eq_u16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x74,0x7c] 0xc1,0x04,0x74,0x7c +# W32: v_cmp_eq_u16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x74,0x7c] +# W64: v_cmp_eq_u16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x74,0x7c] -# W32: v_cmp_eq_u16_e32 vcc_lo, 0x3800, v2 -# W64: v_cmp_eq_u16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x74,0x7c,0x00,0x38,0x00,0x00] 0xf0,0x04,0x74,0x7c 
+# W32: v_cmp_eq_u16_e32 vcc_lo, 0x3800, v2 ; encoding: [0xff,0x04,0x74,0x7c,0x00,0x38,0x00,0x00] +# W64: v_cmp_eq_u16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x74,0x7c,0x00,0x38,0x00,0x00] -# W32: v_cmp_eq_u16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x74,0x7c] -# W64: v_cmp_eq_u16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x74,0x7c] 0xfd,0x04,0x74,0x7c +# W32: v_cmp_eq_u16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x74,0x7c] +# W64: v_cmp_eq_u16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x74,0x7c] -# W32: v_cmp_eq_u16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x74,0x7c,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_eq_u16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x74,0x7c,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x74,0x7c,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_eq_u16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x74,0x7c,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_eq_u16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x74,0x7c,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x94,0x7c] -# W64: v_cmp_eq_u32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x94,0x7c] 0x01,0x05,0x94,0x7c +# W32: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x94,0x7c] +# W64: v_cmp_eq_u32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x94,0x7c] -# W32: v_cmp_eq_u32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x94,0x7c] -# W64: v_cmp_eq_u32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x94,0x7c] 0xff,0x05,0x94,0x7c +# W32: v_cmp_eq_u32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x94,0x7c] +# W64: v_cmp_eq_u32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x94,0x7c] -# W32: v_cmp_eq_u32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x94,0x7c] -# W64: v_cmp_eq_u32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x94,0x7c] 0x01,0x04,0x94,0x7c +# W32: v_cmp_eq_u32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x94,0x7c] +# W64: v_cmp_eq_u32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x94,0x7c] -# W32: v_cmp_eq_u32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x94,0x7c] -# W64: v_cmp_eq_u32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x94,0x7c] 0x69,0x04,0x94,0x7c +# W32: v_cmp_eq_u32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x94,0x7c] +# W64: v_cmp_eq_u32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x94,0x7c] -# W32: v_cmp_eq_u32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x94,0x7c] -# W64: v_cmp_eq_u32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x94,0x7c] 0x6a,0x04,0x94,0x7c +# W32: v_cmp_eq_u32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x94,0x7c] +# W64: v_cmp_eq_u32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x94,0x7c] -# W32: v_cmp_eq_u32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x94,0x7c] -# W64: v_cmp_eq_u32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x94,0x7c] 0x6b,0x04,0x94,0x7c +# W32: v_cmp_eq_u32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x94,0x7c] +# W64: v_cmp_eq_u32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x94,0x7c] -# W32: v_cmp_eq_u32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x94,0x7c] -# W64: v_cmp_eq_u32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x94,0x7c] 0x7b,0x04,0x94,0x7c +# W32: v_cmp_eq_u32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x94,0x7c] +# W64: v_cmp_eq_u32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x94,0x7c] -# W32: v_cmp_eq_u32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x94,0x7c] -# W64: v_cmp_eq_u32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x94,0x7c] 0x7d,0x04,0x94,0x7c +# W32: v_cmp_eq_u32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x94,0x7c] +# W64: v_cmp_eq_u32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x94,0x7c] -# W32: v_cmp_eq_u32_e32 vcc_lo, exec_lo, v2 ; 
encoding: [0x7e,0x04,0x94,0x7c] -# W64: v_cmp_eq_u32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x94,0x7c] 0x7e,0x04,0x94,0x7c +# W32: v_cmp_eq_u32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x94,0x7c] +# W64: v_cmp_eq_u32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x94,0x7c] -# W32: v_cmp_eq_u32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x94,0x7c] -# W64: v_cmp_eq_u32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x94,0x7c] 0x7f,0x04,0x94,0x7c +# W32: v_cmp_eq_u32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x94,0x7c] +# W64: v_cmp_eq_u32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x94,0x7c] -# W32: v_cmp_eq_u32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x94,0x7c] -# W64: v_cmp_eq_u32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x94,0x7c] 0x7c,0x04,0x94,0x7c +# W32: v_cmp_eq_u32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x94,0x7c] +# W64: v_cmp_eq_u32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x94,0x7c] -# W32: v_cmp_eq_u32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x94,0x7c] -# W64: v_cmp_eq_u32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x94,0x7c] 0xc1,0x04,0x94,0x7c +# W32: v_cmp_eq_u32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x94,0x7c] +# W64: v_cmp_eq_u32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x94,0x7c] -# W32: v_cmp_eq_u32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x94,0x7c] -# W64: v_cmp_eq_u32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x94,0x7c] 0xf0,0x04,0x94,0x7c +# W32: v_cmp_eq_u32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x94,0x7c] +# W64: v_cmp_eq_u32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x94,0x7c] -# W32: v_cmp_eq_u32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x94,0x7c] -# W64: v_cmp_eq_u32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x94,0x7c] 0xfd,0x04,0x94,0x7c +# W32: v_cmp_eq_u32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x94,0x7c] +# W64: v_cmp_eq_u32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x94,0x7c] -# W32: v_cmp_eq_u32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x95,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_eq_u32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x95,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x95,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_eq_u32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x95,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_eq_u32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x95,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb4,0x7c] -# W64: v_cmp_eq_u64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb4,0x7c] 0x01,0x05,0xb4,0x7c +# W32: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb4,0x7c] +# W64: v_cmp_eq_u64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb4,0x7c] -# W32: v_cmp_eq_u64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb4,0x7c] -# W64: v_cmp_eq_u64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb4,0x7c] 0xfe,0x05,0xb4,0x7c +# W32: v_cmp_eq_u64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb4,0x7c] +# W64: v_cmp_eq_u64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb4,0x7c] -# W32: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb4,0x7c] -# W64: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb4,0x7c] 0x02,0x04,0xb4,0x7c +# W32: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb4,0x7c] +# W64: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb4,0x7c] -# W32: v_cmp_eq_u64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb4,0x7c] -# W64: v_cmp_eq_u64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb4,0x7c] 
0x68,0x04,0xb4,0x7c +# W32: v_cmp_eq_u64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb4,0x7c] +# W64: v_cmp_eq_u64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb4,0x7c] -# W32: v_cmp_eq_u64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb4,0x7c] -# W64: v_cmp_eq_u64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb4,0x7c] 0x6a,0x04,0xb4,0x7c +# W32: v_cmp_eq_u64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb4,0x7c] +# W64: v_cmp_eq_u64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb4,0x7c] +0x7a,0x04,0xb4,0x7c # W32: v_cmp_eq_u64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb4,0x7c] # W64: v_cmp_eq_u64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb4,0x7c] -0x7a,0x04,0xb4,0x7c -# W32: v_cmp_eq_u64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xb4,0x7c] -# W64: v_cmp_eq_u64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xb4,0x7c] 0x7e,0x04,0xb4,0x7c +# W32: v_cmp_eq_u64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xb4,0x7c] +# W64: v_cmp_eq_u64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xb4,0x7c] -# W32: v_cmp_eq_u64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xb4,0x7c] -# W64: v_cmp_eq_u64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xb4,0x7c] 0x7c,0x04,0xb4,0x7c +# W32: v_cmp_eq_u64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xb4,0x7c] +# W64: v_cmp_eq_u64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xb4,0x7c] -# W32: v_cmp_eq_u64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xb4,0x7c] -# W64: v_cmp_eq_u64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xb4,0x7c] 0xc1,0x04,0xb4,0x7c +# W32: v_cmp_eq_u64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xb4,0x7c] +# W64: v_cmp_eq_u64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xb4,0x7c] -# W32: v_cmp_eq_u64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb4,0x7c] -# W64: v_cmp_eq_u64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb4,0x7c] 0xf0,0x04,0xb4,0x7c +# W32: v_cmp_eq_u64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb4,0x7c] +# W64: v_cmp_eq_u64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb4,0x7c] -# W32: v_cmp_eq_u64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb4,0x7c] -# W64: v_cmp_eq_u64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb4,0x7c] 0xfd,0x04,0xb4,0x7c +# W32: v_cmp_eq_u64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb4,0x7c] +# W64: v_cmp_eq_u64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb4,0x7c] +0xff,0xfc,0xb5,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_eq_u64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb5,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_eq_u64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb5,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0xb5,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_ge_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x0c,0x7c] -# W64: v_cmp_ge_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x0c,0x7c] 0x01,0x05,0x0c,0x7c +# W32: v_cmp_ge_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x0c,0x7c] +# W64: v_cmp_ge_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x0c,0x7c] -# W32: v_cmp_ge_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x0c,0x7c] -# W64: v_cmp_ge_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x0c,0x7c] 0x7f,0x05,0x0c,0x7c +# W32: v_cmp_ge_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x0c,0x7c] +# W64: v_cmp_ge_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x0c,0x7c] -# W32: v_cmp_ge_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x0c,0x7c] -# W64: v_cmp_ge_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x0c,0x7c] 0x01,0x04,0x0c,0x7c +# W32: v_cmp_ge_f16_e32 vcc_lo, s1, v2 ; encoding: 
[0x01,0x04,0x0c,0x7c] +# W64: v_cmp_ge_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x0c,0x7c] -# W32: v_cmp_ge_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x0c,0x7c] -# W64: v_cmp_ge_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x0c,0x7c] 0x69,0x04,0x0c,0x7c +# W32: v_cmp_ge_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x0c,0x7c] +# W64: v_cmp_ge_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x0c,0x7c] -# W32: v_cmp_ge_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0c,0x7c] -# W64: v_cmp_ge_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0c,0x7c] 0x6a,0x04,0x0c,0x7c +# W32: v_cmp_ge_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0c,0x7c] +# W64: v_cmp_ge_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0c,0x7c] -# W32: v_cmp_ge_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0c,0x7c] -# W64: v_cmp_ge_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0c,0x7c] 0x6b,0x04,0x0c,0x7c +# W32: v_cmp_ge_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0c,0x7c] +# W64: v_cmp_ge_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0c,0x7c] -# W32: v_cmp_ge_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x0c,0x7c] -# W64: v_cmp_ge_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x0c,0x7c] 0x7b,0x04,0x0c,0x7c +# W32: v_cmp_ge_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x0c,0x7c] +# W64: v_cmp_ge_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x0c,0x7c] -# W32: v_cmp_ge_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x0c,0x7c] -# W64: v_cmp_ge_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x0c,0x7c] 0x7d,0x04,0x0c,0x7c +# W32: v_cmp_ge_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x0c,0x7c] +# W64: v_cmp_ge_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x0c,0x7c] -# W32: v_cmp_ge_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x0c,0x7c] -# W64: v_cmp_ge_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x0c,0x7c] 0x7e,0x04,0x0c,0x7c +# W32: v_cmp_ge_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x0c,0x7c] +# W64: v_cmp_ge_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x0c,0x7c] -# W32: v_cmp_ge_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x0c,0x7c] -# W64: v_cmp_ge_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x0c,0x7c] 0x7f,0x04,0x0c,0x7c +# W32: v_cmp_ge_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x0c,0x7c] +# W64: v_cmp_ge_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x0c,0x7c] -# W32: v_cmp_ge_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x0c,0x7c] -# W64: v_cmp_ge_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x0c,0x7c] 0x7c,0x04,0x0c,0x7c +# W32: v_cmp_ge_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x0c,0x7c] +# W64: v_cmp_ge_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x0c,0x7c] -# W32: v_cmp_ge_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x0c,0x7c] -# W64: v_cmp_ge_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x0c,0x7c] 0xc1,0x04,0x0c,0x7c +# W32: v_cmp_ge_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x0c,0x7c] +# W64: v_cmp_ge_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x0c,0x7c] -# W32: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x0c,0x7c] -# W64: v_cmp_ge_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x0c,0x7c] 0xf0,0x04,0x0c,0x7c +# W32: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x0c,0x7c] +# W64: v_cmp_ge_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x0c,0x7c] -# W32: v_cmp_ge_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x0c,0x7c] -# W64: v_cmp_ge_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x0c,0x7c] 0xfd,0x04,0x0c,0x7c +# W32: v_cmp_ge_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x0c,0x7c] 
+# W64: v_cmp_ge_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x0c,0x7c] -# W32: v_cmp_ge_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0c,0x7c,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_ge_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0c,0x7c,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x0c,0x7c,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_ge_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0c,0x7c,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_ge_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0c,0x7c,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_ge_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x2c,0x7c] -# W64: v_cmp_ge_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x2c,0x7c] 0x01,0x05,0x2c,0x7c +# W32: v_cmp_ge_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x2c,0x7c] +# W64: v_cmp_ge_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x2c,0x7c] -# W32: v_cmp_ge_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x2c,0x7c] -# W64: v_cmp_ge_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x2c,0x7c] 0xff,0x05,0x2c,0x7c +# W32: v_cmp_ge_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x2c,0x7c] +# W64: v_cmp_ge_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x2c,0x7c] -# W32: v_cmp_ge_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x2c,0x7c] -# W64: v_cmp_ge_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x2c,0x7c] 0x01,0x04,0x2c,0x7c +# W32: v_cmp_ge_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x2c,0x7c] +# W64: v_cmp_ge_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x2c,0x7c] -# W32: v_cmp_ge_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x2c,0x7c] -# W64: v_cmp_ge_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x2c,0x7c] 0x69,0x04,0x2c,0x7c +# W32: v_cmp_ge_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x2c,0x7c] +# W64: v_cmp_ge_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x2c,0x7c] -# W32: v_cmp_ge_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x2c,0x7c] -# W64: v_cmp_ge_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x2c,0x7c] 0x6a,0x04,0x2c,0x7c +# W32: v_cmp_ge_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x2c,0x7c] +# W64: v_cmp_ge_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x2c,0x7c] -# W32: v_cmp_ge_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x2c,0x7c] -# W64: v_cmp_ge_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x2c,0x7c] 0x6b,0x04,0x2c,0x7c +# W32: v_cmp_ge_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x2c,0x7c] +# W64: v_cmp_ge_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x2c,0x7c] -# W32: v_cmp_ge_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x2c,0x7c] -# W64: v_cmp_ge_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x2c,0x7c] 0x7b,0x04,0x2c,0x7c +# W32: v_cmp_ge_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x2c,0x7c] +# W64: v_cmp_ge_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x2c,0x7c] -# W32: v_cmp_ge_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x2c,0x7c] -# W64: v_cmp_ge_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x2c,0x7c] 0x7d,0x04,0x2c,0x7c +# W32: v_cmp_ge_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x2c,0x7c] +# W64: v_cmp_ge_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x2c,0x7c] -# W32: v_cmp_ge_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x2c,0x7c] -# W64: v_cmp_ge_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x2c,0x7c] 0x7e,0x04,0x2c,0x7c +# W32: v_cmp_ge_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x2c,0x7c] +# W64: v_cmp_ge_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x2c,0x7c] -# W32: v_cmp_ge_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x2c,0x7c] -# W64: v_cmp_ge_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x2c,0x7c] 
0x7f,0x04,0x2c,0x7c +# W32: v_cmp_ge_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x2c,0x7c] +# W64: v_cmp_ge_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x2c,0x7c] -# W32: v_cmp_ge_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x2c,0x7c] -# W64: v_cmp_ge_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x2c,0x7c] 0x7c,0x04,0x2c,0x7c +# W32: v_cmp_ge_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x2c,0x7c] +# W64: v_cmp_ge_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x2c,0x7c] -# W32: v_cmp_ge_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x2c,0x7c] -# W64: v_cmp_ge_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x2c,0x7c] 0xc1,0x04,0x2c,0x7c +# W32: v_cmp_ge_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x2c,0x7c] +# W64: v_cmp_ge_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x2c,0x7c] -# W32: v_cmp_ge_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x2c,0x7c] -# W64: v_cmp_ge_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x2c,0x7c] 0xf0,0x04,0x2c,0x7c +# W32: v_cmp_ge_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x2c,0x7c] +# W64: v_cmp_ge_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x2c,0x7c] -# W32: v_cmp_ge_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x2c,0x7c] -# W64: v_cmp_ge_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x2c,0x7c] 0xfd,0x04,0x2c,0x7c +# W32: v_cmp_ge_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x2c,0x7c] +# W64: v_cmp_ge_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x2c,0x7c] -# W32: v_cmp_ge_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2d,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_ge_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2d,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x2d,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_ge_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2d,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_ge_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2d,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_ge_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4c,0x7c] -# W64: v_cmp_ge_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4c,0x7c] 0x01,0x05,0x4c,0x7c +# W32: v_cmp_ge_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4c,0x7c] +# W64: v_cmp_ge_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4c,0x7c] -# W32: v_cmp_ge_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4c,0x7c] -# W64: v_cmp_ge_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4c,0x7c] 0xfe,0x05,0x4c,0x7c +# W32: v_cmp_ge_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4c,0x7c] +# W64: v_cmp_ge_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4c,0x7c] -# W32: v_cmp_ge_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x4c,0x7c] -# W64: v_cmp_ge_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x4c,0x7c] 0x02,0x04,0x4c,0x7c +# W32: v_cmp_ge_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x4c,0x7c] +# W64: v_cmp_ge_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x4c,0x7c] -# W32: v_cmp_ge_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4c,0x7c] -# W64: v_cmp_ge_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4c,0x7c] 0x68,0x04,0x4c,0x7c +# W32: v_cmp_ge_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4c,0x7c] +# W64: v_cmp_ge_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4c,0x7c] -# W32: v_cmp_ge_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x4c,0x7c] -# W64: v_cmp_ge_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x4c,0x7c] 0x6a,0x04,0x4c,0x7c +# W32: v_cmp_ge_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x4c,0x7c] +# 
W64: v_cmp_ge_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x4c,0x7c] +0x7a,0x04,0x4c,0x7c # W32: v_cmp_ge_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x4c,0x7c] # W64: v_cmp_ge_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x4c,0x7c] -0x7a,0x04,0x4c,0x7c -# W32: v_cmp_ge_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x4c,0x7c] -# W64: v_cmp_ge_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x4c,0x7c] 0x7e,0x04,0x4c,0x7c +# W32: v_cmp_ge_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x4c,0x7c] +# W64: v_cmp_ge_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x4c,0x7c] -# W32: v_cmp_ge_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x4c,0x7c] -# W64: v_cmp_ge_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x4c,0x7c] 0x7c,0x04,0x4c,0x7c +# W32: v_cmp_ge_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x4c,0x7c] +# W64: v_cmp_ge_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x4c,0x7c] -# W32: v_cmp_ge_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x4c,0x7c] -# W64: v_cmp_ge_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x4c,0x7c] 0xc1,0x04,0x4c,0x7c +# W32: v_cmp_ge_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x4c,0x7c] +# W64: v_cmp_ge_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x4c,0x7c] -# W32: v_cmp_ge_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4c,0x7c] -# W64: v_cmp_ge_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4c,0x7c] 0xf0,0x04,0x4c,0x7c +# W32: v_cmp_ge_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4c,0x7c] +# W64: v_cmp_ge_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4c,0x7c] -# W32: v_cmp_ge_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4c,0x7c] -# W64: v_cmp_ge_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4c,0x7c] 0xfd,0x04,0x4c,0x7c +# W32: v_cmp_ge_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4c,0x7c] +# W64: v_cmp_ge_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4c,0x7c] +0xff,0xfc,0x4d,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_ge_f64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4d,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_ge_f64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4d,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0x4d,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_ge_i16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x6c,0x7c] -# W64: v_cmp_ge_i16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x6c,0x7c] 0x01,0x05,0x6c,0x7c +# W32: v_cmp_ge_i16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x6c,0x7c] +# W64: v_cmp_ge_i16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x6c,0x7c] -# W32: v_cmp_ge_i16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x6c,0x7c] -# W64: v_cmp_ge_i16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x6c,0x7c] 0x7f,0x05,0x6c,0x7c +# W32: v_cmp_ge_i16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x6c,0x7c] +# W64: v_cmp_ge_i16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x6c,0x7c] -# W32: v_cmp_ge_i16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x6c,0x7c] -# W64: v_cmp_ge_i16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x6c,0x7c] 0x01,0x04,0x6c,0x7c +# W32: v_cmp_ge_i16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x6c,0x7c] +# W64: v_cmp_ge_i16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x6c,0x7c] -# W32: v_cmp_ge_i16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x6c,0x7c] -# W64: v_cmp_ge_i16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x6c,0x7c] 0x69,0x04,0x6c,0x7c +# W32: v_cmp_ge_i16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x6c,0x7c] +# W64: v_cmp_ge_i16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x6c,0x7c] -# W32: v_cmp_ge_i16_e32 vcc_lo, vcc_lo, v2 ; 
encoding: [0x6a,0x04,0x6c,0x7c] -# W64: v_cmp_ge_i16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x6c,0x7c] 0x6a,0x04,0x6c,0x7c +# W32: v_cmp_ge_i16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x6c,0x7c] +# W64: v_cmp_ge_i16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x6c,0x7c] -# W32: v_cmp_ge_i16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x6c,0x7c] -# W64: v_cmp_ge_i16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x6c,0x7c] 0x6b,0x04,0x6c,0x7c +# W32: v_cmp_ge_i16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x6c,0x7c] +# W64: v_cmp_ge_i16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x6c,0x7c] -# W32: v_cmp_ge_i16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x6c,0x7c] -# W64: v_cmp_ge_i16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x6c,0x7c] 0x7b,0x04,0x6c,0x7c +# W32: v_cmp_ge_i16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x6c,0x7c] +# W64: v_cmp_ge_i16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x6c,0x7c] -# W32: v_cmp_ge_i16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x6c,0x7c] -# W64: v_cmp_ge_i16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x6c,0x7c] 0x7d,0x04,0x6c,0x7c +# W32: v_cmp_ge_i16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x6c,0x7c] +# W64: v_cmp_ge_i16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x6c,0x7c] -# W32: v_cmp_ge_i16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x6c,0x7c] -# W64: v_cmp_ge_i16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x6c,0x7c] 0x7e,0x04,0x6c,0x7c +# W32: v_cmp_ge_i16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x6c,0x7c] +# W64: v_cmp_ge_i16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x6c,0x7c] -# W32: v_cmp_ge_i16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x6c,0x7c] -# W64: v_cmp_ge_i16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x6c,0x7c] 0x7f,0x04,0x6c,0x7c +# W32: v_cmp_ge_i16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x6c,0x7c] +# W64: v_cmp_ge_i16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x6c,0x7c] -# W32: v_cmp_ge_i16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x6c,0x7c] -# W64: v_cmp_ge_i16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x6c,0x7c] 0x7c,0x04,0x6c,0x7c +# W32: v_cmp_ge_i16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x6c,0x7c] +# W64: v_cmp_ge_i16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x6c,0x7c] -# W32: v_cmp_ge_i16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x6c,0x7c] -# W64: v_cmp_ge_i16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x6c,0x7c] 0xc1,0x04,0x6c,0x7c +# W32: v_cmp_ge_i16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x6c,0x7c] +# W64: v_cmp_ge_i16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x6c,0x7c] -# W32: v_cmp_ge_i16_e32 vcc_lo, 0x3800, v2 -# W64: v_cmp_ge_i16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x6c,0x7c,0x00,0x38,0x00,0x00] 0xf0,0x04,0x6c,0x7c +# W32: v_cmp_ge_i16_e32 vcc_lo, 0x3800, v2 ; encoding: [0xff,0x04,0x6c,0x7c,0x00,0x38,0x00,0x00] +# W64: v_cmp_ge_i16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x6c,0x7c,0x00,0x38,0x00,0x00] -# W32: v_cmp_ge_i16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x6c,0x7c] -# W64: v_cmp_ge_i16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x6c,0x7c] 0xfd,0x04,0x6c,0x7c +# W32: v_cmp_ge_i16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x6c,0x7c] +# W64: v_cmp_ge_i16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x6c,0x7c] -# W32: v_cmp_ge_i16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6c,0x7c,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_ge_i16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6c,0x7c,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x6c,0x7c,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_ge_i16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6c,0x7c,0x0b,0xfe,0x00,0x00] +# 
W64: v_cmp_ge_i16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6c,0x7c,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_ge_i32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x8c,0x7c] -# W64: v_cmp_ge_i32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x8c,0x7c] 0x01,0x05,0x8c,0x7c +# W32: v_cmp_ge_i32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x8c,0x7c] +# W64: v_cmp_ge_i32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x8c,0x7c] -# W32: v_cmp_ge_i32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x8c,0x7c] -# W64: v_cmp_ge_i32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x8c,0x7c] 0xff,0x05,0x8c,0x7c +# W32: v_cmp_ge_i32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x8c,0x7c] +# W64: v_cmp_ge_i32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x8c,0x7c] -# W32: v_cmp_ge_i32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x8c,0x7c] -# W64: v_cmp_ge_i32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x8c,0x7c] 0x01,0x04,0x8c,0x7c +# W32: v_cmp_ge_i32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x8c,0x7c] +# W64: v_cmp_ge_i32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x8c,0x7c] -# W32: v_cmp_ge_i32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x8c,0x7c] -# W64: v_cmp_ge_i32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x8c,0x7c] 0x69,0x04,0x8c,0x7c +# W32: v_cmp_ge_i32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x8c,0x7c] +# W64: v_cmp_ge_i32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x8c,0x7c] -# W32: v_cmp_ge_i32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x8c,0x7c] -# W64: v_cmp_ge_i32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x8c,0x7c] 0x6a,0x04,0x8c,0x7c +# W32: v_cmp_ge_i32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x8c,0x7c] +# W64: v_cmp_ge_i32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x8c,0x7c] -# W32: v_cmp_ge_i32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x8c,0x7c] -# W64: v_cmp_ge_i32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x8c,0x7c] 0x6b,0x04,0x8c,0x7c +# W32: v_cmp_ge_i32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x8c,0x7c] +# W64: v_cmp_ge_i32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x8c,0x7c] -# W32: v_cmp_ge_i32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x8c,0x7c] -# W64: v_cmp_ge_i32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x8c,0x7c] 0x7b,0x04,0x8c,0x7c +# W32: v_cmp_ge_i32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x8c,0x7c] +# W64: v_cmp_ge_i32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x8c,0x7c] -# W32: v_cmp_ge_i32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x8c,0x7c] -# W64: v_cmp_ge_i32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x8c,0x7c] 0x7d,0x04,0x8c,0x7c +# W32: v_cmp_ge_i32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x8c,0x7c] +# W64: v_cmp_ge_i32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x8c,0x7c] -# W32: v_cmp_ge_i32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x8c,0x7c] -# W64: v_cmp_ge_i32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x8c,0x7c] 0x7e,0x04,0x8c,0x7c +# W32: v_cmp_ge_i32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x8c,0x7c] +# W64: v_cmp_ge_i32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x8c,0x7c] -# W32: v_cmp_ge_i32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x8c,0x7c] -# W64: v_cmp_ge_i32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x8c,0x7c] 0x7f,0x04,0x8c,0x7c +# W32: v_cmp_ge_i32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x8c,0x7c] +# W64: v_cmp_ge_i32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x8c,0x7c] -# W32: v_cmp_ge_i32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x8c,0x7c] -# W64: v_cmp_ge_i32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x8c,0x7c] 0x7c,0x04,0x8c,0x7c +# W32: v_cmp_ge_i32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x8c,0x7c] +# W64: 
v_cmp_ge_i32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x8c,0x7c] -# W32: v_cmp_ge_i32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x8c,0x7c] -# W64: v_cmp_ge_i32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x8c,0x7c] 0xc1,0x04,0x8c,0x7c +# W32: v_cmp_ge_i32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x8c,0x7c] +# W64: v_cmp_ge_i32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x8c,0x7c] -# W32: v_cmp_ge_i32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x8c,0x7c] -# W64: v_cmp_ge_i32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x8c,0x7c] 0xf0,0x04,0x8c,0x7c +# W32: v_cmp_ge_i32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x8c,0x7c] +# W64: v_cmp_ge_i32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x8c,0x7c] -# W32: v_cmp_ge_i32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x8c,0x7c] -# W64: v_cmp_ge_i32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x8c,0x7c] 0xfd,0x04,0x8c,0x7c +# W32: v_cmp_ge_i32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x8c,0x7c] +# W64: v_cmp_ge_i32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x8c,0x7c] -# W32: v_cmp_ge_i32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x8d,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_ge_i32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x8d,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x8d,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_ge_i32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x8d,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_ge_i32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x8d,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_ge_i64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xac,0x7c] -# W64: v_cmp_ge_i64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xac,0x7c] 0x01,0x05,0xac,0x7c +# W32: v_cmp_ge_i64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xac,0x7c] +# W64: v_cmp_ge_i64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xac,0x7c] -# W32: v_cmp_ge_i64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xac,0x7c] -# W64: v_cmp_ge_i64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xac,0x7c] 0xfe,0x05,0xac,0x7c +# W32: v_cmp_ge_i64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xac,0x7c] +# W64: v_cmp_ge_i64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xac,0x7c] -# W32: v_cmp_ge_i64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xac,0x7c] -# W64: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xac,0x7c] 0x02,0x04,0xac,0x7c +# W32: v_cmp_ge_i64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xac,0x7c] +# W64: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xac,0x7c] -# W32: v_cmp_ge_i64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xac,0x7c] -# W64: v_cmp_ge_i64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xac,0x7c] 0x68,0x04,0xac,0x7c +# W32: v_cmp_ge_i64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xac,0x7c] +# W64: v_cmp_ge_i64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xac,0x7c] -# W32: v_cmp_ge_i64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xac,0x7c] -# W64: v_cmp_ge_i64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xac,0x7c] 0x6a,0x04,0xac,0x7c +# W32: v_cmp_ge_i64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xac,0x7c] +# W64: v_cmp_ge_i64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xac,0x7c] +0x7a,0x04,0xac,0x7c # W32: v_cmp_ge_i64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xac,0x7c] # W64: v_cmp_ge_i64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xac,0x7c] -0x7a,0x04,0xac,0x7c -# W32: v_cmp_ge_i64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xac,0x7c] -# W64: v_cmp_ge_i64_e32 vcc, exec, v[2:3] ; encoding: 
[0x7e,0x04,0xac,0x7c] 0x7e,0x04,0xac,0x7c +# W32: v_cmp_ge_i64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xac,0x7c] +# W64: v_cmp_ge_i64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xac,0x7c] -# W32: v_cmp_ge_i64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xac,0x7c] -# W64: v_cmp_ge_i64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xac,0x7c] 0x7c,0x04,0xac,0x7c +# W32: v_cmp_ge_i64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xac,0x7c] +# W64: v_cmp_ge_i64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xac,0x7c] -# W32: v_cmp_ge_i64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xac,0x7c] -# W64: v_cmp_ge_i64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xac,0x7c] 0xc1,0x04,0xac,0x7c +# W32: v_cmp_ge_i64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xac,0x7c] +# W64: v_cmp_ge_i64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xac,0x7c] -# W32: v_cmp_ge_i64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xac,0x7c] -# W64: v_cmp_ge_i64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xac,0x7c] 0xf0,0x04,0xac,0x7c +# W32: v_cmp_ge_i64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xac,0x7c] +# W64: v_cmp_ge_i64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xac,0x7c] -# W32: v_cmp_ge_i64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xac,0x7c] -# W64: v_cmp_ge_i64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xac,0x7c] 0xfd,0x04,0xac,0x7c +# W32: v_cmp_ge_i64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xac,0x7c] +# W64: v_cmp_ge_i64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xac,0x7c] +0xff,0xfc,0xad,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_ge_i64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xad,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_ge_i64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xad,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0xad,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_ge_u16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x7c,0x7c] -# W64: v_cmp_ge_u16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x7c,0x7c] 0x01,0x05,0x7c,0x7c +# W32: v_cmp_ge_u16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x7c,0x7c] +# W64: v_cmp_ge_u16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x7c,0x7c] -# W32: v_cmp_ge_u16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x7c,0x7c] -# W64: v_cmp_ge_u16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x7c,0x7c] 0x7f,0x05,0x7c,0x7c +# W32: v_cmp_ge_u16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x7c,0x7c] +# W64: v_cmp_ge_u16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x7c,0x7c] -# W32: v_cmp_ge_u16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x7c,0x7c] -# W64: v_cmp_ge_u16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x7c,0x7c] 0x01,0x04,0x7c,0x7c +# W32: v_cmp_ge_u16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x7c,0x7c] +# W64: v_cmp_ge_u16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x7c,0x7c] -# W32: v_cmp_ge_u16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x7c,0x7c] -# W64: v_cmp_ge_u16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x7c,0x7c] 0x69,0x04,0x7c,0x7c +# W32: v_cmp_ge_u16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x7c,0x7c] +# W64: v_cmp_ge_u16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x7c,0x7c] -# W32: v_cmp_ge_u16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x7c,0x7c] -# W64: v_cmp_ge_u16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x7c,0x7c] 0x6a,0x04,0x7c,0x7c +# W32: v_cmp_ge_u16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x7c,0x7c] +# W64: v_cmp_ge_u16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x7c,0x7c] -# W32: v_cmp_ge_u16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x7c,0x7c] -# W64: v_cmp_ge_u16_e32 vcc, vcc_hi, v2 ; encoding: 
[0x6b,0x04,0x7c,0x7c] 0x6b,0x04,0x7c,0x7c +# W32: v_cmp_ge_u16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x7c,0x7c] +# W64: v_cmp_ge_u16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x7c,0x7c] -# W32: v_cmp_ge_u16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x7c,0x7c] -# W64: v_cmp_ge_u16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x7c,0x7c] 0x7b,0x04,0x7c,0x7c +# W32: v_cmp_ge_u16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x7c,0x7c] +# W64: v_cmp_ge_u16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x7c,0x7c] -# W32: v_cmp_ge_u16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x7c,0x7c] -# W64: v_cmp_ge_u16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x7c,0x7c] 0x7d,0x04,0x7c,0x7c +# W32: v_cmp_ge_u16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x7c,0x7c] +# W64: v_cmp_ge_u16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x7c,0x7c] -# W32: v_cmp_ge_u16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x7c,0x7c] -# W64: v_cmp_ge_u16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x7c,0x7c] 0x7e,0x04,0x7c,0x7c +# W32: v_cmp_ge_u16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x7c,0x7c] +# W64: v_cmp_ge_u16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x7c,0x7c] -# W32: v_cmp_ge_u16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x7c,0x7c] -# W64: v_cmp_ge_u16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x7c,0x7c] 0x7f,0x04,0x7c,0x7c +# W32: v_cmp_ge_u16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x7c,0x7c] +# W64: v_cmp_ge_u16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x7c,0x7c] -# W32: v_cmp_ge_u16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x7c,0x7c] -# W64: v_cmp_ge_u16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x7c,0x7c] 0x7c,0x04,0x7c,0x7c +# W32: v_cmp_ge_u16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x7c,0x7c] +# W64: v_cmp_ge_u16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x7c,0x7c] -# W32: v_cmp_ge_u16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x7c,0x7c] -# W64: v_cmp_ge_u16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x7c,0x7c] 0xc1,0x04,0x7c,0x7c +# W32: v_cmp_ge_u16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x7c,0x7c] +# W64: v_cmp_ge_u16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x7c,0x7c] -# W32: v_cmp_ge_u16_e32 vcc_lo, 0x3800, v2 -# W64: v_cmp_ge_u16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x7c,0x7c,0x00,0x38,0x00,0x00] 0xf0,0x04,0x7c,0x7c +# W32: v_cmp_ge_u16_e32 vcc_lo, 0x3800, v2 ; encoding: [0xff,0x04,0x7c,0x7c,0x00,0x38,0x00,0x00] +# W64: v_cmp_ge_u16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x7c,0x7c,0x00,0x38,0x00,0x00] -# W32: v_cmp_ge_u16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x7c,0x7c] -# W64: v_cmp_ge_u16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x7c,0x7c] 0xfd,0x04,0x7c,0x7c +# W32: v_cmp_ge_u16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x7c,0x7c] +# W64: v_cmp_ge_u16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x7c,0x7c] -# W32: v_cmp_ge_u16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x7c,0x7c,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_ge_u16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x7c,0x7c,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x7c,0x7c,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_ge_u16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x7c,0x7c,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_ge_u16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x7c,0x7c,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_ge_u32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x9c,0x7c] -# W64: v_cmp_ge_u32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x9c,0x7c] 0x01,0x05,0x9c,0x7c +# W32: v_cmp_ge_u32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x9c,0x7c] +# W64: v_cmp_ge_u32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x9c,0x7c] -# W32: 
v_cmp_ge_u32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x9c,0x7c] -# W64: v_cmp_ge_u32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x9c,0x7c] 0xff,0x05,0x9c,0x7c +# W32: v_cmp_ge_u32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x9c,0x7c] +# W64: v_cmp_ge_u32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x9c,0x7c] -# W32: v_cmp_ge_u32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x9c,0x7c] -# W64: v_cmp_ge_u32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x9c,0x7c] 0x01,0x04,0x9c,0x7c +# W32: v_cmp_ge_u32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x9c,0x7c] +# W64: v_cmp_ge_u32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x9c,0x7c] -# W32: v_cmp_ge_u32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x9c,0x7c] -# W64: v_cmp_ge_u32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x9c,0x7c] 0x69,0x04,0x9c,0x7c +# W32: v_cmp_ge_u32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x9c,0x7c] +# W64: v_cmp_ge_u32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x9c,0x7c] -# W32: v_cmp_ge_u32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x9c,0x7c] -# W64: v_cmp_ge_u32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x9c,0x7c] 0x6a,0x04,0x9c,0x7c +# W32: v_cmp_ge_u32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x9c,0x7c] +# W64: v_cmp_ge_u32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x9c,0x7c] -# W32: v_cmp_ge_u32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x9c,0x7c] -# W64: v_cmp_ge_u32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x9c,0x7c] 0x6b,0x04,0x9c,0x7c +# W32: v_cmp_ge_u32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x9c,0x7c] +# W64: v_cmp_ge_u32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x9c,0x7c] -# W32: v_cmp_ge_u32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x9c,0x7c] -# W64: v_cmp_ge_u32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x9c,0x7c] 0x7b,0x04,0x9c,0x7c +# W32: v_cmp_ge_u32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x9c,0x7c] +# W64: v_cmp_ge_u32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x9c,0x7c] -# W32: v_cmp_ge_u32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x9c,0x7c] -# W64: v_cmp_ge_u32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x9c,0x7c] 0x7d,0x04,0x9c,0x7c +# W32: v_cmp_ge_u32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x9c,0x7c] +# W64: v_cmp_ge_u32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x9c,0x7c] -# W32: v_cmp_ge_u32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x9c,0x7c] -# W64: v_cmp_ge_u32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x9c,0x7c] 0x7e,0x04,0x9c,0x7c +# W32: v_cmp_ge_u32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x9c,0x7c] +# W64: v_cmp_ge_u32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x9c,0x7c] -# W32: v_cmp_ge_u32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x9c,0x7c] -# W64: v_cmp_ge_u32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x9c,0x7c] 0x7f,0x04,0x9c,0x7c +# W32: v_cmp_ge_u32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x9c,0x7c] +# W64: v_cmp_ge_u32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x9c,0x7c] -# W32: v_cmp_ge_u32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x9c,0x7c] -# W64: v_cmp_ge_u32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x9c,0x7c] 0x7c,0x04,0x9c,0x7c +# W32: v_cmp_ge_u32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x9c,0x7c] +# W64: v_cmp_ge_u32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x9c,0x7c] -# W32: v_cmp_ge_u32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x9c,0x7c] -# W64: v_cmp_ge_u32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x9c,0x7c] 0xc1,0x04,0x9c,0x7c +# W32: v_cmp_ge_u32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x9c,0x7c] +# W64: v_cmp_ge_u32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x9c,0x7c] -# W32: v_cmp_ge_u32_e32 vcc_lo, 0.5, v2 
; encoding: [0xf0,0x04,0x9c,0x7c] -# W64: v_cmp_ge_u32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x9c,0x7c] 0xf0,0x04,0x9c,0x7c +# W32: v_cmp_ge_u32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x9c,0x7c] +# W64: v_cmp_ge_u32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x9c,0x7c] -# W32: v_cmp_ge_u32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x9c,0x7c] -# W64: v_cmp_ge_u32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x9c,0x7c] 0xfd,0x04,0x9c,0x7c +# W32: v_cmp_ge_u32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x9c,0x7c] +# W64: v_cmp_ge_u32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x9c,0x7c] -# W32: v_cmp_ge_u32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x9d,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_ge_u32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x9d,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x9d,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_ge_u32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x9d,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_ge_u32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x9d,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_ge_u64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xbc,0x7c] -# W64: v_cmp_ge_u64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xbc,0x7c] 0x01,0x05,0xbc,0x7c +# W32: v_cmp_ge_u64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xbc,0x7c] +# W64: v_cmp_ge_u64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xbc,0x7c] -# W32: v_cmp_ge_u64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xbc,0x7c] -# W64: v_cmp_ge_u64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xbc,0x7c] 0xfe,0x05,0xbc,0x7c +# W32: v_cmp_ge_u64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xbc,0x7c] +# W64: v_cmp_ge_u64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xbc,0x7c] -# W32: v_cmp_ge_u64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xbc,0x7c] -# W64: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xbc,0x7c] 0x02,0x04,0xbc,0x7c +# W32: v_cmp_ge_u64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xbc,0x7c] +# W64: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xbc,0x7c] -# W32: v_cmp_ge_u64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xbc,0x7c] -# W64: v_cmp_ge_u64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xbc,0x7c] 0x68,0x04,0xbc,0x7c +# W32: v_cmp_ge_u64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xbc,0x7c] +# W64: v_cmp_ge_u64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xbc,0x7c] -# W32: v_cmp_ge_u64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xbc,0x7c] -# W64: v_cmp_ge_u64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xbc,0x7c] 0x6a,0x04,0xbc,0x7c +# W32: v_cmp_ge_u64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xbc,0x7c] +# W64: v_cmp_ge_u64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xbc,0x7c] +0x7a,0x04,0xbc,0x7c # W32: v_cmp_ge_u64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xbc,0x7c] # W64: v_cmp_ge_u64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xbc,0x7c] -0x7a,0x04,0xbc,0x7c -# W32: v_cmp_ge_u64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xbc,0x7c] -# W64: v_cmp_ge_u64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xbc,0x7c] 0x7e,0x04,0xbc,0x7c +# W32: v_cmp_ge_u64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xbc,0x7c] +# W64: v_cmp_ge_u64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xbc,0x7c] -# W32: v_cmp_ge_u64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xbc,0x7c] -# W64: v_cmp_ge_u64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xbc,0x7c] 0x7c,0x04,0xbc,0x7c +# W32: v_cmp_ge_u64_e32 vcc_lo, null, v[2:3] ; 
encoding: [0x7c,0x04,0xbc,0x7c] +# W64: v_cmp_ge_u64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xbc,0x7c] -# W32: v_cmp_ge_u64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xbc,0x7c] -# W64: v_cmp_ge_u64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xbc,0x7c] 0xc1,0x04,0xbc,0x7c +# W32: v_cmp_ge_u64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xbc,0x7c] +# W64: v_cmp_ge_u64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xbc,0x7c] -# W32: v_cmp_ge_u64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xbc,0x7c] -# W64: v_cmp_ge_u64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xbc,0x7c] 0xf0,0x04,0xbc,0x7c +# W32: v_cmp_ge_u64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xbc,0x7c] +# W64: v_cmp_ge_u64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xbc,0x7c] -# W32: v_cmp_ge_u64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xbc,0x7c] -# W64: v_cmp_ge_u64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xbc,0x7c] 0xfd,0x04,0xbc,0x7c +# W32: v_cmp_ge_u64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xbc,0x7c] +# W64: v_cmp_ge_u64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xbc,0x7c] +0xff,0xfc,0xbd,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_ge_u64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbd,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_ge_u64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbd,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0xbd,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_gt_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x08,0x7c] -# W64: v_cmp_gt_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x08,0x7c] 0x01,0x05,0x08,0x7c +# W32: v_cmp_gt_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x08,0x7c] +# W64: v_cmp_gt_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x08,0x7c] -# W32: v_cmp_gt_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x08,0x7c] -# W64: v_cmp_gt_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x08,0x7c] 0x7f,0x05,0x08,0x7c +# W32: v_cmp_gt_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x08,0x7c] +# W64: v_cmp_gt_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x08,0x7c] -# W32: v_cmp_gt_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x08,0x7c] -# W64: v_cmp_gt_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x08,0x7c] 0x01,0x04,0x08,0x7c +# W32: v_cmp_gt_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x08,0x7c] +# W64: v_cmp_gt_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x08,0x7c] -# W32: v_cmp_gt_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x08,0x7c] -# W64: v_cmp_gt_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x08,0x7c] 0x69,0x04,0x08,0x7c +# W32: v_cmp_gt_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x08,0x7c] +# W64: v_cmp_gt_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x08,0x7c] -# W32: v_cmp_gt_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x08,0x7c] -# W64: v_cmp_gt_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x08,0x7c] 0x6a,0x04,0x08,0x7c +# W32: v_cmp_gt_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x08,0x7c] +# W64: v_cmp_gt_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x08,0x7c] -# W32: v_cmp_gt_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x08,0x7c] -# W64: v_cmp_gt_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x08,0x7c] 0x6b,0x04,0x08,0x7c +# W32: v_cmp_gt_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x08,0x7c] +# W64: v_cmp_gt_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x08,0x7c] -# W32: v_cmp_gt_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x08,0x7c] -# W64: v_cmp_gt_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x08,0x7c] 0x7b,0x04,0x08,0x7c +# W32: v_cmp_gt_f16_e32 vcc_lo, ttmp15, v2 ; encoding: 
[0x7b,0x04,0x08,0x7c] +# W64: v_cmp_gt_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x08,0x7c] -# W32: v_cmp_gt_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x08,0x7c] -# W64: v_cmp_gt_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x08,0x7c] 0x7d,0x04,0x08,0x7c +# W32: v_cmp_gt_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x08,0x7c] +# W64: v_cmp_gt_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x08,0x7c] -# W32: v_cmp_gt_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x08,0x7c] -# W64: v_cmp_gt_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x08,0x7c] 0x7e,0x04,0x08,0x7c +# W32: v_cmp_gt_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x08,0x7c] +# W64: v_cmp_gt_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x08,0x7c] -# W32: v_cmp_gt_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x08,0x7c] -# W64: v_cmp_gt_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x08,0x7c] 0x7f,0x04,0x08,0x7c +# W32: v_cmp_gt_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x08,0x7c] +# W64: v_cmp_gt_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x08,0x7c] -# W32: v_cmp_gt_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x08,0x7c] -# W64: v_cmp_gt_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x08,0x7c] 0x7c,0x04,0x08,0x7c +# W32: v_cmp_gt_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x08,0x7c] +# W64: v_cmp_gt_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x08,0x7c] -# W32: v_cmp_gt_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x08,0x7c] -# W64: v_cmp_gt_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x08,0x7c] 0xc1,0x04,0x08,0x7c +# W32: v_cmp_gt_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x08,0x7c] +# W64: v_cmp_gt_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x08,0x7c] -# W32: v_cmp_gt_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x08,0x7c] -# W64: v_cmp_gt_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x08,0x7c] 0xf0,0x04,0x08,0x7c +# W32: v_cmp_gt_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x08,0x7c] +# W64: v_cmp_gt_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x08,0x7c] -# W32: v_cmp_gt_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x08,0x7c] -# W64: v_cmp_gt_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x08,0x7c] 0xfd,0x04,0x08,0x7c +# W32: v_cmp_gt_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x08,0x7c] +# W64: v_cmp_gt_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x08,0x7c] -# W32: v_cmp_gt_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x08,0x7c,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_gt_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x08,0x7c,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x08,0x7c,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_gt_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x08,0x7c,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_gt_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x08,0x7c,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_gt_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x28,0x7c] -# W64: v_cmp_gt_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x28,0x7c] 0x01,0x05,0x28,0x7c +# W32: v_cmp_gt_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x28,0x7c] +# W64: v_cmp_gt_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x28,0x7c] -# W32: v_cmp_gt_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x28,0x7c] -# W64: v_cmp_gt_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x28,0x7c] 0xff,0x05,0x28,0x7c +# W32: v_cmp_gt_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x28,0x7c] +# W64: v_cmp_gt_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x28,0x7c] -# W32: v_cmp_gt_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x28,0x7c] -# W64: v_cmp_gt_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x28,0x7c] 
0x01,0x04,0x28,0x7c +# W32: v_cmp_gt_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x28,0x7c] +# W64: v_cmp_gt_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x28,0x7c] -# W32: v_cmp_gt_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x28,0x7c] -# W64: v_cmp_gt_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x28,0x7c] 0x69,0x04,0x28,0x7c +# W32: v_cmp_gt_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x28,0x7c] +# W64: v_cmp_gt_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x28,0x7c] -# W32: v_cmp_gt_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x28,0x7c] -# W64: v_cmp_gt_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x28,0x7c] 0x6a,0x04,0x28,0x7c +# W32: v_cmp_gt_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x28,0x7c] +# W64: v_cmp_gt_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x28,0x7c] -# W32: v_cmp_gt_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x28,0x7c] -# W64: v_cmp_gt_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x28,0x7c] 0x6b,0x04,0x28,0x7c +# W32: v_cmp_gt_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x28,0x7c] +# W64: v_cmp_gt_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x28,0x7c] -# W32: v_cmp_gt_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x28,0x7c] -# W64: v_cmp_gt_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x28,0x7c] 0x7b,0x04,0x28,0x7c +# W32: v_cmp_gt_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x28,0x7c] +# W64: v_cmp_gt_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x28,0x7c] -# W32: v_cmp_gt_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x28,0x7c] -# W64: v_cmp_gt_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x28,0x7c] 0x7d,0x04,0x28,0x7c +# W32: v_cmp_gt_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x28,0x7c] +# W64: v_cmp_gt_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x28,0x7c] -# W32: v_cmp_gt_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x28,0x7c] -# W64: v_cmp_gt_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x28,0x7c] 0x7e,0x04,0x28,0x7c +# W32: v_cmp_gt_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x28,0x7c] +# W64: v_cmp_gt_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x28,0x7c] -# W32: v_cmp_gt_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x28,0x7c] -# W64: v_cmp_gt_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x28,0x7c] 0x7f,0x04,0x28,0x7c +# W32: v_cmp_gt_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x28,0x7c] +# W64: v_cmp_gt_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x28,0x7c] -# W32: v_cmp_gt_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x28,0x7c] -# W64: v_cmp_gt_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x28,0x7c] 0x7c,0x04,0x28,0x7c +# W32: v_cmp_gt_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x28,0x7c] +# W64: v_cmp_gt_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x28,0x7c] -# W32: v_cmp_gt_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x28,0x7c] -# W64: v_cmp_gt_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x28,0x7c] 0xc1,0x04,0x28,0x7c +# W32: v_cmp_gt_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x28,0x7c] +# W64: v_cmp_gt_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x28,0x7c] -# W32: v_cmp_gt_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x28,0x7c] -# W64: v_cmp_gt_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x28,0x7c] 0xf0,0x04,0x28,0x7c +# W32: v_cmp_gt_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x28,0x7c] +# W64: v_cmp_gt_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x28,0x7c] -# W32: v_cmp_gt_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x28,0x7c] -# W64: v_cmp_gt_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x28,0x7c] 0xfd,0x04,0x28,0x7c +# 
W32: v_cmp_gt_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x28,0x7c] +# W64: v_cmp_gt_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x28,0x7c] -# W32: v_cmp_gt_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x29,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_gt_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x29,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x29,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_gt_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x29,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_gt_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x29,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_gt_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x48,0x7c] -# W64: v_cmp_gt_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x48,0x7c] 0x01,0x05,0x48,0x7c +# W32: v_cmp_gt_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x48,0x7c] +# W64: v_cmp_gt_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x48,0x7c] -# W32: v_cmp_gt_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x48,0x7c] -# W64: v_cmp_gt_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x48,0x7c] 0xfe,0x05,0x48,0x7c +# W32: v_cmp_gt_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x48,0x7c] +# W64: v_cmp_gt_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x48,0x7c] -# W32: v_cmp_gt_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x48,0x7c] -# W64: v_cmp_gt_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x48,0x7c] 0x02,0x04,0x48,0x7c +# W32: v_cmp_gt_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x48,0x7c] +# W64: v_cmp_gt_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x48,0x7c] -# W32: v_cmp_gt_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x48,0x7c] -# W64: v_cmp_gt_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x48,0x7c] 0x68,0x04,0x48,0x7c +# W32: v_cmp_gt_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x48,0x7c] +# W64: v_cmp_gt_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x48,0x7c] -# W32: v_cmp_gt_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x48,0x7c] -# W64: v_cmp_gt_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x48,0x7c] 0x6a,0x04,0x48,0x7c +# W32: v_cmp_gt_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x48,0x7c] +# W64: v_cmp_gt_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x48,0x7c] +0x7a,0x04,0x48,0x7c # W32: v_cmp_gt_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x48,0x7c] # W64: v_cmp_gt_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x48,0x7c] -0x7a,0x04,0x48,0x7c -# W32: v_cmp_gt_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x48,0x7c] -# W64: v_cmp_gt_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x48,0x7c] 0x7e,0x04,0x48,0x7c +# W32: v_cmp_gt_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x48,0x7c] +# W64: v_cmp_gt_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x48,0x7c] -# W32: v_cmp_gt_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x48,0x7c] -# W64: v_cmp_gt_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x48,0x7c] 0x7c,0x04,0x48,0x7c +# W32: v_cmp_gt_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x48,0x7c] +# W64: v_cmp_gt_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x48,0x7c] -# W32: v_cmp_gt_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x48,0x7c] -# W64: v_cmp_gt_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x48,0x7c] 0xc1,0x04,0x48,0x7c +# W32: v_cmp_gt_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x48,0x7c] +# W64: v_cmp_gt_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x48,0x7c] -# W32: 
v_cmp_gt_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x48,0x7c] -# W64: v_cmp_gt_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x48,0x7c] 0xf0,0x04,0x48,0x7c +# W32: v_cmp_gt_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x48,0x7c] +# W64: v_cmp_gt_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x48,0x7c] -# W32: v_cmp_gt_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x48,0x7c] -# W64: v_cmp_gt_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x48,0x7c] 0xfd,0x04,0x48,0x7c +# W32: v_cmp_gt_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x48,0x7c] +# W64: v_cmp_gt_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x48,0x7c] +0xff,0xfc,0x49,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_gt_f64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x49,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_gt_f64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x49,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0x49,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_gt_i16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x68,0x7c] -# W64: v_cmp_gt_i16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x68,0x7c] 0x01,0x05,0x68,0x7c +# W32: v_cmp_gt_i16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x68,0x7c] +# W64: v_cmp_gt_i16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x68,0x7c] -# W32: v_cmp_gt_i16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x68,0x7c] -# W64: v_cmp_gt_i16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x68,0x7c] 0x7f,0x05,0x68,0x7c +# W32: v_cmp_gt_i16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x68,0x7c] +# W64: v_cmp_gt_i16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x68,0x7c] -# W32: v_cmp_gt_i16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x68,0x7c] -# W64: v_cmp_gt_i16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x68,0x7c] 0x01,0x04,0x68,0x7c +# W32: v_cmp_gt_i16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x68,0x7c] +# W64: v_cmp_gt_i16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x68,0x7c] -# W32: v_cmp_gt_i16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x68,0x7c] -# W64: v_cmp_gt_i16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x68,0x7c] 0x69,0x04,0x68,0x7c +# W32: v_cmp_gt_i16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x68,0x7c] +# W64: v_cmp_gt_i16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x68,0x7c] -# W32: v_cmp_gt_i16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x68,0x7c] -# W64: v_cmp_gt_i16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x68,0x7c] 0x6a,0x04,0x68,0x7c +# W32: v_cmp_gt_i16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x68,0x7c] +# W64: v_cmp_gt_i16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x68,0x7c] -# W32: v_cmp_gt_i16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x68,0x7c] -# W64: v_cmp_gt_i16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x68,0x7c] 0x6b,0x04,0x68,0x7c +# W32: v_cmp_gt_i16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x68,0x7c] +# W64: v_cmp_gt_i16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x68,0x7c] -# W32: v_cmp_gt_i16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x68,0x7c] -# W64: v_cmp_gt_i16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x68,0x7c] 0x7b,0x04,0x68,0x7c +# W32: v_cmp_gt_i16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x68,0x7c] +# W64: v_cmp_gt_i16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x68,0x7c] -# W32: v_cmp_gt_i16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x68,0x7c] -# W64: v_cmp_gt_i16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x68,0x7c] 0x7d,0x04,0x68,0x7c +# W32: v_cmp_gt_i16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x68,0x7c] +# W64: v_cmp_gt_i16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x68,0x7c] -# W32: v_cmp_gt_i16_e32 vcc_lo, exec_lo, v2 
; encoding: [0x7e,0x04,0x68,0x7c] -# W64: v_cmp_gt_i16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x68,0x7c] 0x7e,0x04,0x68,0x7c +# W32: v_cmp_gt_i16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x68,0x7c] +# W64: v_cmp_gt_i16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x68,0x7c] -# W32: v_cmp_gt_i16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x68,0x7c] -# W64: v_cmp_gt_i16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x68,0x7c] 0x7f,0x04,0x68,0x7c +# W32: v_cmp_gt_i16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x68,0x7c] +# W64: v_cmp_gt_i16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x68,0x7c] -# W32: v_cmp_gt_i16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x68,0x7c] -# W64: v_cmp_gt_i16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x68,0x7c] 0x7c,0x04,0x68,0x7c +# W32: v_cmp_gt_i16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x68,0x7c] +# W64: v_cmp_gt_i16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x68,0x7c] -# W32: v_cmp_gt_i16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x68,0x7c] -# W64: v_cmp_gt_i16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x68,0x7c] 0xc1,0x04,0x68,0x7c +# W32: v_cmp_gt_i16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x68,0x7c] +# W64: v_cmp_gt_i16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x68,0x7c] -# W32: v_cmp_gt_i16_e32 vcc_lo, 0x3800, v2 -# W64: v_cmp_gt_i16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x68,0x7c,0x00,0x38,0x00,0x00] 0xf0,0x04,0x68,0x7c +# W32: v_cmp_gt_i16_e32 vcc_lo, 0x3800, v2 ; encoding: [0xff,0x04,0x68,0x7c,0x00,0x38,0x00,0x00] +# W64: v_cmp_gt_i16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x68,0x7c,0x00,0x38,0x00,0x00] -# W32: v_cmp_gt_i16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x68,0x7c] -# W64: v_cmp_gt_i16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x68,0x7c] 0xfd,0x04,0x68,0x7c +# W32: v_cmp_gt_i16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x68,0x7c] +# W64: v_cmp_gt_i16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x68,0x7c] -# W32: v_cmp_gt_i16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x68,0x7c,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_gt_i16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x68,0x7c,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x68,0x7c,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_gt_i16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x68,0x7c,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_gt_i16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x68,0x7c,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_gt_i32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x88,0x7c] -# W64: v_cmp_gt_i32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x88,0x7c] 0x01,0x05,0x88,0x7c +# W32: v_cmp_gt_i32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x88,0x7c] +# W64: v_cmp_gt_i32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x88,0x7c] -# W32: v_cmp_gt_i32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x88,0x7c] -# W64: v_cmp_gt_i32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x88,0x7c] 0xff,0x05,0x88,0x7c +# W32: v_cmp_gt_i32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x88,0x7c] +# W64: v_cmp_gt_i32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x88,0x7c] -# W32: v_cmp_gt_i32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x88,0x7c] -# W64: v_cmp_gt_i32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x88,0x7c] 0x01,0x04,0x88,0x7c +# W32: v_cmp_gt_i32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x88,0x7c] +# W64: v_cmp_gt_i32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x88,0x7c] -# W32: v_cmp_gt_i32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x88,0x7c] -# W64: v_cmp_gt_i32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x88,0x7c] 0x69,0x04,0x88,0x7c +# W32: v_cmp_gt_i32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x88,0x7c] +# W64: 
v_cmp_gt_i32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x88,0x7c] -# W32: v_cmp_gt_i32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x88,0x7c] -# W64: v_cmp_gt_i32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x88,0x7c] 0x6a,0x04,0x88,0x7c +# W32: v_cmp_gt_i32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x88,0x7c] +# W64: v_cmp_gt_i32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x88,0x7c] -# W32: v_cmp_gt_i32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x88,0x7c] -# W64: v_cmp_gt_i32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x88,0x7c] 0x6b,0x04,0x88,0x7c +# W32: v_cmp_gt_i32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x88,0x7c] +# W64: v_cmp_gt_i32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x88,0x7c] -# W32: v_cmp_gt_i32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x88,0x7c] -# W64: v_cmp_gt_i32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x88,0x7c] 0x7b,0x04,0x88,0x7c +# W32: v_cmp_gt_i32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x88,0x7c] +# W64: v_cmp_gt_i32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x88,0x7c] -# W32: v_cmp_gt_i32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x88,0x7c] -# W64: v_cmp_gt_i32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x88,0x7c] 0x7d,0x04,0x88,0x7c +# W32: v_cmp_gt_i32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x88,0x7c] +# W64: v_cmp_gt_i32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x88,0x7c] -# W32: v_cmp_gt_i32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x88,0x7c] -# W64: v_cmp_gt_i32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x88,0x7c] 0x7e,0x04,0x88,0x7c +# W32: v_cmp_gt_i32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x88,0x7c] +# W64: v_cmp_gt_i32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x88,0x7c] -# W32: v_cmp_gt_i32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x88,0x7c] -# W64: v_cmp_gt_i32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x88,0x7c] 0x7f,0x04,0x88,0x7c +# W32: v_cmp_gt_i32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x88,0x7c] +# W64: v_cmp_gt_i32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x88,0x7c] -# W32: v_cmp_gt_i32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x88,0x7c] -# W64: v_cmp_gt_i32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x88,0x7c] 0x7c,0x04,0x88,0x7c +# W32: v_cmp_gt_i32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x88,0x7c] +# W64: v_cmp_gt_i32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x88,0x7c] -# W32: v_cmp_gt_i32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x88,0x7c] -# W64: v_cmp_gt_i32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x88,0x7c] 0xc1,0x04,0x88,0x7c +# W32: v_cmp_gt_i32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x88,0x7c] +# W64: v_cmp_gt_i32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x88,0x7c] -# W32: v_cmp_gt_i32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x88,0x7c] -# W64: v_cmp_gt_i32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x88,0x7c] 0xf0,0x04,0x88,0x7c +# W32: v_cmp_gt_i32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x88,0x7c] +# W64: v_cmp_gt_i32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x88,0x7c] -# W32: v_cmp_gt_i32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x88,0x7c] -# W64: v_cmp_gt_i32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x88,0x7c] 0xfd,0x04,0x88,0x7c +# W32: v_cmp_gt_i32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x88,0x7c] +# W64: v_cmp_gt_i32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x88,0x7c] -# W32: v_cmp_gt_i32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x89,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_gt_i32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x89,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x89,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_gt_i32_e32 
vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x89,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_gt_i32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x89,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_gt_i64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa8,0x7c] -# W64: v_cmp_gt_i64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa8,0x7c] 0x01,0x05,0xa8,0x7c +# W32: v_cmp_gt_i64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa8,0x7c] +# W64: v_cmp_gt_i64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa8,0x7c] -# W32: v_cmp_gt_i64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa8,0x7c] -# W64: v_cmp_gt_i64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa8,0x7c] 0xfe,0x05,0xa8,0x7c +# W32: v_cmp_gt_i64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa8,0x7c] +# W64: v_cmp_gt_i64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa8,0x7c] -# W32: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa8,0x7c] -# W64: v_cmp_gt_i64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa8,0x7c] 0x02,0x04,0xa8,0x7c +# W32: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa8,0x7c] +# W64: v_cmp_gt_i64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa8,0x7c] -# W32: v_cmp_gt_i64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa8,0x7c] -# W64: v_cmp_gt_i64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa8,0x7c] 0x68,0x04,0xa8,0x7c +# W32: v_cmp_gt_i64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa8,0x7c] +# W64: v_cmp_gt_i64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa8,0x7c] -# W32: v_cmp_gt_i64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa8,0x7c] -# W64: v_cmp_gt_i64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa8,0x7c] 0x6a,0x04,0xa8,0x7c +# W32: v_cmp_gt_i64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa8,0x7c] +# W64: v_cmp_gt_i64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa8,0x7c] +0x7a,0x04,0xa8,0x7c # W32: v_cmp_gt_i64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa8,0x7c] # W64: v_cmp_gt_i64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa8,0x7c] -0x7a,0x04,0xa8,0x7c -# W32: v_cmp_gt_i64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xa8,0x7c] -# W64: v_cmp_gt_i64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xa8,0x7c] 0x7e,0x04,0xa8,0x7c +# W32: v_cmp_gt_i64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xa8,0x7c] +# W64: v_cmp_gt_i64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xa8,0x7c] -# W32: v_cmp_gt_i64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xa8,0x7c] -# W64: v_cmp_gt_i64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xa8,0x7c] 0x7c,0x04,0xa8,0x7c +# W32: v_cmp_gt_i64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xa8,0x7c] +# W64: v_cmp_gt_i64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xa8,0x7c] -# W32: v_cmp_gt_i64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xa8,0x7c] -# W64: v_cmp_gt_i64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xa8,0x7c] 0xc1,0x04,0xa8,0x7c +# W32: v_cmp_gt_i64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xa8,0x7c] +# W64: v_cmp_gt_i64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xa8,0x7c] -# W32: v_cmp_gt_i64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa8,0x7c] -# W64: v_cmp_gt_i64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa8,0x7c] 0xf0,0x04,0xa8,0x7c +# W32: v_cmp_gt_i64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa8,0x7c] +# W64: v_cmp_gt_i64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa8,0x7c] -# W32: v_cmp_gt_i64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa8,0x7c] -# W64: v_cmp_gt_i64_e32 
vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa8,0x7c] 0xfd,0x04,0xa8,0x7c +# W32: v_cmp_gt_i64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa8,0x7c] +# W64: v_cmp_gt_i64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa8,0x7c] +0xff,0xfc,0xa9,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_gt_i64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa9,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_gt_i64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa9,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0xa9,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_gt_u16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x78,0x7c] -# W64: v_cmp_gt_u16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x78,0x7c] 0x01,0x05,0x78,0x7c +# W32: v_cmp_gt_u16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x78,0x7c] +# W64: v_cmp_gt_u16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x78,0x7c] -# W32: v_cmp_gt_u16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x78,0x7c] -# W64: v_cmp_gt_u16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x78,0x7c] 0x7f,0x05,0x78,0x7c +# W32: v_cmp_gt_u16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x78,0x7c] +# W64: v_cmp_gt_u16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x78,0x7c] -# W32: v_cmp_gt_u16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x78,0x7c] -# W64: v_cmp_gt_u16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x78,0x7c] 0x01,0x04,0x78,0x7c +# W32: v_cmp_gt_u16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x78,0x7c] +# W64: v_cmp_gt_u16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x78,0x7c] -# W32: v_cmp_gt_u16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x78,0x7c] -# W64: v_cmp_gt_u16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x78,0x7c] 0x69,0x04,0x78,0x7c +# W32: v_cmp_gt_u16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x78,0x7c] +# W64: v_cmp_gt_u16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x78,0x7c] -# W32: v_cmp_gt_u16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x78,0x7c] -# W64: v_cmp_gt_u16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x78,0x7c] 0x6a,0x04,0x78,0x7c +# W32: v_cmp_gt_u16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x78,0x7c] +# W64: v_cmp_gt_u16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x78,0x7c] -# W32: v_cmp_gt_u16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x78,0x7c] -# W64: v_cmp_gt_u16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x78,0x7c] 0x6b,0x04,0x78,0x7c +# W32: v_cmp_gt_u16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x78,0x7c] +# W64: v_cmp_gt_u16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x78,0x7c] -# W32: v_cmp_gt_u16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x78,0x7c] -# W64: v_cmp_gt_u16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x78,0x7c] 0x7b,0x04,0x78,0x7c +# W32: v_cmp_gt_u16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x78,0x7c] +# W64: v_cmp_gt_u16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x78,0x7c] -# W32: v_cmp_gt_u16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x78,0x7c] -# W64: v_cmp_gt_u16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x78,0x7c] 0x7d,0x04,0x78,0x7c +# W32: v_cmp_gt_u16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x78,0x7c] +# W64: v_cmp_gt_u16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x78,0x7c] -# W32: v_cmp_gt_u16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x78,0x7c] -# W64: v_cmp_gt_u16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x78,0x7c] 0x7e,0x04,0x78,0x7c +# W32: v_cmp_gt_u16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x78,0x7c] +# W64: v_cmp_gt_u16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x78,0x7c] -# W32: v_cmp_gt_u16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x78,0x7c] -# W64: v_cmp_gt_u16_e32 vcc, exec_hi, v2 ; encoding: 
0x7f,0x04,0x78,0x7c
+# W32: v_cmp_gt_u16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x78,0x7c]
+# W64: v_cmp_gt_u16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x78,0x7c]

-# W32: v_cmp_gt_u16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x78,0x7c]
-# W64: v_cmp_gt_u16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x78,0x7c]
0x7c,0x04,0x78,0x7c
+# W32: v_cmp_gt_u16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x78,0x7c]
+# W64: v_cmp_gt_u16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x78,0x7c]

-# W32: v_cmp_gt_u16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x78,0x7c]
-# W64: v_cmp_gt_u16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x78,0x7c]
0xc1,0x04,0x78,0x7c
+# W32: v_cmp_gt_u16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x78,0x7c]
+# W64: v_cmp_gt_u16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x78,0x7c]

-# W32: v_cmp_gt_u16_e32 vcc_lo, 0x3800, v2
-# W64: v_cmp_gt_u16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x78,0x7c,0x00,0x38,0x00,0x00]
0xf0,0x04,0x78,0x7c
+# W32: v_cmp_gt_u16_e32 vcc_lo, 0x3800, v2 ; encoding: [0xff,0x04,0x78,0x7c,0x00,0x38,0x00,0x00]
+# W64: v_cmp_gt_u16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x78,0x7c,0x00,0x38,0x00,0x00]

-# W32: v_cmp_gt_u16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x78,0x7c]
-# W64: v_cmp_gt_u16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x78,0x7c]
0xfd,0x04,0x78,0x7c
+# W32: v_cmp_gt_u16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x78,0x7c]
+# W64: v_cmp_gt_u16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x78,0x7c]

-# W32: v_cmp_gt_u16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x78,0x7c,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_gt_u16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x78,0x7c,0x0b,0xfe,0x00,0x00]
0xff,0xfe,0x78,0x7c,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_gt_u16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x78,0x7c,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_gt_u16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x78,0x7c,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_gt_u32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x98,0x7c]
-# W64: v_cmp_gt_u32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x98,0x7c]
0x01,0x05,0x98,0x7c
+# W32: v_cmp_gt_u32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x98,0x7c]
+# W64: v_cmp_gt_u32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x98,0x7c]

-# W32: v_cmp_gt_u32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x98,0x7c]
-# W64: v_cmp_gt_u32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x98,0x7c]
0xff,0x05,0x98,0x7c
+# W32: v_cmp_gt_u32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x98,0x7c]
+# W64: v_cmp_gt_u32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x98,0x7c]

-# W32: v_cmp_gt_u32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x98,0x7c]
-# W64: v_cmp_gt_u32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x98,0x7c]
0x01,0x04,0x98,0x7c
+# W32: v_cmp_gt_u32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x98,0x7c]
+# W64: v_cmp_gt_u32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x98,0x7c]

-# W32: v_cmp_gt_u32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x98,0x7c]
-# W64: v_cmp_gt_u32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x98,0x7c]
0x69,0x04,0x98,0x7c
+# W32: v_cmp_gt_u32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x98,0x7c]
+# W64: v_cmp_gt_u32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x98,0x7c]

-# W32: v_cmp_gt_u32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x98,0x7c]
-# W64: v_cmp_gt_u32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x98,0x7c]
0x6a,0x04,0x98,0x7c
+# W32: v_cmp_gt_u32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x98,0x7c]
+# W64: v_cmp_gt_u32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x98,0x7c]

-# W32: v_cmp_gt_u32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x98,0x7c]
-# W64: v_cmp_gt_u32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x98,0x7c]
0x6b,0x04,0x98,0x7c
+# W32: v_cmp_gt_u32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x98,0x7c]
+# W64: v_cmp_gt_u32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x98,0x7c]

-# W32: v_cmp_gt_u32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x98,0x7c]
-# W64: v_cmp_gt_u32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x98,0x7c]
0x7b,0x04,0x98,0x7c
+# W32: v_cmp_gt_u32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x98,0x7c]
+# W64: v_cmp_gt_u32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x98,0x7c]

-# W32: v_cmp_gt_u32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x98,0x7c]
-# W64: v_cmp_gt_u32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x98,0x7c]
0x7d,0x04,0x98,0x7c
+# W32: v_cmp_gt_u32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x98,0x7c]
+# W64: v_cmp_gt_u32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x98,0x7c]

-# W32: v_cmp_gt_u32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x98,0x7c]
-# W64: v_cmp_gt_u32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x98,0x7c]
0x7e,0x04,0x98,0x7c
+# W32: v_cmp_gt_u32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x98,0x7c]
+# W64: v_cmp_gt_u32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x98,0x7c]

-# W32: v_cmp_gt_u32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x98,0x7c]
-# W64: v_cmp_gt_u32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x98,0x7c]
0x7f,0x04,0x98,0x7c
+# W32: v_cmp_gt_u32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x98,0x7c]
+# W64: v_cmp_gt_u32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x98,0x7c]

-# W32: v_cmp_gt_u32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x98,0x7c]
-# W64: v_cmp_gt_u32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x98,0x7c]
0x7c,0x04,0x98,0x7c
+# W32: v_cmp_gt_u32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x98,0x7c]
+# W64: v_cmp_gt_u32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x98,0x7c]

-# W32: v_cmp_gt_u32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x98,0x7c]
-# W64: v_cmp_gt_u32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x98,0x7c]
0xc1,0x04,0x98,0x7c
+# W32: v_cmp_gt_u32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x98,0x7c]
+# W64: v_cmp_gt_u32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x98,0x7c]

-# W32: v_cmp_gt_u32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x98,0x7c]
-# W64: v_cmp_gt_u32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x98,0x7c]
0xf0,0x04,0x98,0x7c
+# W32: v_cmp_gt_u32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x98,0x7c]
+# W64: v_cmp_gt_u32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x98,0x7c]

-# W32: v_cmp_gt_u32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x98,0x7c]
-# W64: v_cmp_gt_u32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x98,0x7c]
0xfd,0x04,0x98,0x7c
+# W32: v_cmp_gt_u32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x98,0x7c]
+# W64: v_cmp_gt_u32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x98,0x7c]

-# W32: v_cmp_gt_u32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x99,0x7c,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_gt_u32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x99,0x7c,0x56,0x34,0x12,0xaf]
0xff,0xfe,0x99,0x7c,0x56,0x34,0x12,0xaf
+# W32: v_cmp_gt_u32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x99,0x7c,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_gt_u32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x99,0x7c,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_gt_u64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb8,0x7c]
-# W64: v_cmp_gt_u64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb8,0x7c]
0x01,0x05,0xb8,0x7c
+# W32: v_cmp_gt_u64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb8,0x7c]
+# W64: v_cmp_gt_u64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb8,0x7c]

-# W32: v_cmp_gt_u64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb8,0x7c]
-# W64: v_cmp_gt_u64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb8,0x7c]
0xfe,0x05,0xb8,0x7c
+# W32: v_cmp_gt_u64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb8,0x7c]
+# W64: v_cmp_gt_u64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb8,0x7c]

-# W32: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb8,0x7c]
-# W64: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb8,0x7c]
0x02,0x04,0xb8,0x7c
+# W32: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb8,0x7c]
+# W64: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb8,0x7c]

-# W32: v_cmp_gt_u64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb8,0x7c]
-# W64: v_cmp_gt_u64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb8,0x7c]
0x68,0x04,0xb8,0x7c
+# W32: v_cmp_gt_u64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb8,0x7c]
+# W64: v_cmp_gt_u64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb8,0x7c]

-# W32: v_cmp_gt_u64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb8,0x7c]
-# W64: v_cmp_gt_u64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb8,0x7c]
0x6a,0x04,0xb8,0x7c
+# W32: v_cmp_gt_u64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb8,0x7c]
+# W64: v_cmp_gt_u64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb8,0x7c]

+0x7a,0x04,0xb8,0x7c
# W32: v_cmp_gt_u64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb8,0x7c]
# W64: v_cmp_gt_u64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb8,0x7c]
-0x7a,0x04,0xb8,0x7c

-# W32: v_cmp_gt_u64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xb8,0x7c]
-# W64: v_cmp_gt_u64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xb8,0x7c]
0x7e,0x04,0xb8,0x7c
+# W32: v_cmp_gt_u64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xb8,0x7c]
+# W64: v_cmp_gt_u64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xb8,0x7c]

-# W32: v_cmp_gt_u64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xb8,0x7c]
-# W64: v_cmp_gt_u64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xb8,0x7c]
0x7c,0x04,0xb8,0x7c
+# W32: v_cmp_gt_u64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xb8,0x7c]
+# W64: v_cmp_gt_u64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xb8,0x7c]

-# W32: v_cmp_gt_u64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xb8,0x7c]
-# W64: v_cmp_gt_u64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xb8,0x7c]
0xc1,0x04,0xb8,0x7c
+# W32: v_cmp_gt_u64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xb8,0x7c]
+# W64: v_cmp_gt_u64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xb8,0x7c]

-# W32: v_cmp_gt_u64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb8,0x7c]
-# W64: v_cmp_gt_u64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb8,0x7c]
0xf0,0x04,0xb8,0x7c
+# W32: v_cmp_gt_u64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb8,0x7c]
+# W64: v_cmp_gt_u64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb8,0x7c]

-# W32: v_cmp_gt_u64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb8,0x7c]
-# W64: v_cmp_gt_u64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb8,0x7c]
0xfd,0x04,0xb8,0x7c
+# W32: v_cmp_gt_u64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb8,0x7c]
+# W64: v_cmp_gt_u64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb8,0x7c]

+0xff,0xfc,0xb9,0x7c,0x56,0x34,0x12,0xaf
# W32: v_cmp_gt_u64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb9,0x7c,0x56,0x34,0x12,0xaf]
# W64: v_cmp_gt_u64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb9,0x7c,0x56,0x34,0x12,0xaf]
-0xff,0xfc,0xb9,0x7c,0x56,0x34,0x12,0xaf

-# W32: v_cmp_le_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x06,0x7c]
-# W64: v_cmp_le_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x06,0x7c]
0x01,0x05,0x06,0x7c
+# W32: v_cmp_le_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x06,0x7c]
+# W64: v_cmp_le_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x06,0x7c]

-# W32: v_cmp_le_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x06,0x7c]
-# W64: v_cmp_le_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x06,0x7c]
0x7f,0x05,0x06,0x7c
+# W32: v_cmp_le_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x06,0x7c]
+# W64: v_cmp_le_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x06,0x7c]

-# W32: v_cmp_le_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x06,0x7c]
-# W64: v_cmp_le_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x06,0x7c]
0x01,0x04,0x06,0x7c
+# W32: v_cmp_le_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x06,0x7c]
+# W64: v_cmp_le_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x06,0x7c]

-# W32: v_cmp_le_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x06,0x7c]
-# W64: v_cmp_le_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x06,0x7c]
0x69,0x04,0x06,0x7c
+# W32: v_cmp_le_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x06,0x7c]
+# W64: v_cmp_le_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x06,0x7c]

-# W32: v_cmp_le_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x06,0x7c]
-# W64: v_cmp_le_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x06,0x7c]
0x6a,0x04,0x06,0x7c
+# W32: v_cmp_le_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x06,0x7c]
+# W64: v_cmp_le_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x06,0x7c]

-# W32: v_cmp_le_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x06,0x7c]
-# W64: v_cmp_le_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x06,0x7c]
0x6b,0x04,0x06,0x7c
+# W32: v_cmp_le_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x06,0x7c]
+# W64: v_cmp_le_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x06,0x7c]

-# W32: v_cmp_le_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x06,0x7c]
-# W64: v_cmp_le_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x06,0x7c]
0x7b,0x04,0x06,0x7c
+# W32: v_cmp_le_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x06,0x7c]
+# W64: v_cmp_le_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x06,0x7c]

-# W32: v_cmp_le_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x06,0x7c]
-# W64: v_cmp_le_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x06,0x7c]
0x7d,0x04,0x06,0x7c
+# W32: v_cmp_le_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x06,0x7c]
+# W64: v_cmp_le_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x06,0x7c]

-# W32: v_cmp_le_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x06,0x7c]
-# W64: v_cmp_le_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x06,0x7c]
0x7e,0x04,0x06,0x7c
+# W32: v_cmp_le_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x06,0x7c]
+# W64: v_cmp_le_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x06,0x7c]

-# W32: v_cmp_le_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x06,0x7c]
-# W64: v_cmp_le_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x06,0x7c]
0x7f,0x04,0x06,0x7c
+# W32: v_cmp_le_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x06,0x7c]
+# W64: v_cmp_le_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x06,0x7c]

-# W32: v_cmp_le_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x06,0x7c]
-# W64: v_cmp_le_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x06,0x7c]
0x7c,0x04,0x06,0x7c
+# W32: v_cmp_le_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x06,0x7c]
+# W64: v_cmp_le_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x06,0x7c]

-# W32: v_cmp_le_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x06,0x7c]
-# W64: v_cmp_le_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x06,0x7c]
0xc1,0x04,0x06,0x7c
+# W32: v_cmp_le_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x06,0x7c]
+# W64: v_cmp_le_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x06,0x7c]

-# W32: v_cmp_le_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x06,0x7c]
-# W64: v_cmp_le_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x06,0x7c]
0xf0,0x04,0x06,0x7c
+# W32: v_cmp_le_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x06,0x7c]
+# W64: v_cmp_le_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x06,0x7c]

-# W32: v_cmp_le_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x06,0x7c]
-# W64: v_cmp_le_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x06,0x7c]
0xfd,0x04,0x06,0x7c
+# W32: v_cmp_le_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x06,0x7c]
+# W64: v_cmp_le_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x06,0x7c]

-# W32: v_cmp_le_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x06,0x7c,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_le_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x06,0x7c,0x0b,0xfe,0x00,0x00]
0xff,0xfe,0x06,0x7c,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_le_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x06,0x7c,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_le_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x06,0x7c,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_le_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x26,0x7c]
-# W64: v_cmp_le_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x26,0x7c]
0x01,0x05,0x26,0x7c
+# W32: v_cmp_le_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x26,0x7c]
+# W64: v_cmp_le_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x26,0x7c]

-# W32: v_cmp_le_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x26,0x7c]
-# W64: v_cmp_le_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x26,0x7c]
0xff,0x05,0x26,0x7c
+# W32: v_cmp_le_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x26,0x7c]
+# W64: v_cmp_le_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x26,0x7c]

-# W32: v_cmp_le_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x26,0x7c]
-# W64: v_cmp_le_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x26,0x7c]
0x01,0x04,0x26,0x7c
+# W32: v_cmp_le_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x26,0x7c]
+# W64: v_cmp_le_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x26,0x7c]

-# W32: v_cmp_le_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x26,0x7c]
-# W64: v_cmp_le_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x26,0x7c]
0x69,0x04,0x26,0x7c
+# W32: v_cmp_le_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x26,0x7c]
+# W64: v_cmp_le_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x26,0x7c]

-# W32: v_cmp_le_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x26,0x7c]
-# W64: v_cmp_le_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x26,0x7c]
0x6a,0x04,0x26,0x7c
+# W32: v_cmp_le_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x26,0x7c]
+# W64: v_cmp_le_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x26,0x7c]

-# W32: v_cmp_le_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x26,0x7c]
-# W64: v_cmp_le_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x26,0x7c]
0x6b,0x04,0x26,0x7c
+# W32: v_cmp_le_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x26,0x7c]
+# W64: v_cmp_le_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x26,0x7c]

-# W32: v_cmp_le_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x26,0x7c]
-# W64: v_cmp_le_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x26,0x7c]
0x7b,0x04,0x26,0x7c
+# W32: v_cmp_le_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x26,0x7c]
+# W64: v_cmp_le_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x26,0x7c]

-# W32: v_cmp_le_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x26,0x7c]
-# W64: v_cmp_le_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x26,0x7c]
0x7d,0x04,0x26,0x7c
+# W32: v_cmp_le_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x26,0x7c]
+# W64: v_cmp_le_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x26,0x7c]

-# W32: v_cmp_le_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x26,0x7c]
-# W64: v_cmp_le_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x26,0x7c]
0x7e,0x04,0x26,0x7c
+# W32: v_cmp_le_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x26,0x7c]
+# W64: v_cmp_le_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x26,0x7c]

-# W32: v_cmp_le_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x26,0x7c]
-# W64: v_cmp_le_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x26,0x7c]
0x7f,0x04,0x26,0x7c
+# W32: v_cmp_le_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x26,0x7c]
+# W64: v_cmp_le_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x26,0x7c]

-# W32: v_cmp_le_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x26,0x7c]
-# W64: v_cmp_le_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x26,0x7c]
0x7c,0x04,0x26,0x7c
+# W32: v_cmp_le_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x26,0x7c]
+# W64: v_cmp_le_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x26,0x7c]

-# W32: v_cmp_le_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x26,0x7c]
-# W64: v_cmp_le_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x26,0x7c]
0xc1,0x04,0x26,0x7c
+# W32: v_cmp_le_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x26,0x7c]
+# W64: v_cmp_le_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x26,0x7c]

-# W32: v_cmp_le_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x26,0x7c]
-# W64: v_cmp_le_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x26,0x7c]
0xf0,0x04,0x26,0x7c
+# W32: v_cmp_le_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x26,0x7c]
+# W64: v_cmp_le_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x26,0x7c]

-# W32: v_cmp_le_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x26,0x7c]
-# W64: v_cmp_le_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x26,0x7c]
0xfd,0x04,0x26,0x7c
+# W32: v_cmp_le_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x26,0x7c]
+# W64: v_cmp_le_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x26,0x7c]

-# W32: v_cmp_le_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x27,0x7c,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_le_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x27,0x7c,0x56,0x34,0x12,0xaf]
0xff,0xfe,0x27,0x7c,0x56,0x34,0x12,0xaf
+# W32: v_cmp_le_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x27,0x7c,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_le_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x27,0x7c,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_le_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x46,0x7c]
-# W64: v_cmp_le_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x46,0x7c]
0x01,0x05,0x46,0x7c
+# W32: v_cmp_le_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x46,0x7c]
+# W64: v_cmp_le_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x46,0x7c]

-# W32: v_cmp_le_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x46,0x7c]
-# W64: v_cmp_le_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x46,0x7c]
0xfe,0x05,0x46,0x7c
+# W32: v_cmp_le_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x46,0x7c]
+# W64: v_cmp_le_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x46,0x7c]
-# W32: v_cmp_le_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x46,0x7c]
-# W64: v_cmp_le_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x46,0x7c]
0x02,0x04,0x46,0x7c
+# W32: v_cmp_le_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x46,0x7c]
+# W64: v_cmp_le_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x46,0x7c]

-# W32: v_cmp_le_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x46,0x7c]
-# W64: v_cmp_le_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x46,0x7c]
0x68,0x04,0x46,0x7c
+# W32: v_cmp_le_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x46,0x7c]
+# W64: v_cmp_le_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x46,0x7c]

-# W32: v_cmp_le_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x46,0x7c]
-# W64: v_cmp_le_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x46,0x7c]
0x6a,0x04,0x46,0x7c
+# W32: v_cmp_le_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x46,0x7c]
+# W64: v_cmp_le_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x46,0x7c]

+0x7a,0x04,0x46,0x7c
# W32: v_cmp_le_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x46,0x7c]
# W64: v_cmp_le_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x46,0x7c]
-0x7a,0x04,0x46,0x7c

-# W32: v_cmp_le_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x46,0x7c]
-# W64: v_cmp_le_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x46,0x7c]
0x7e,0x04,0x46,0x7c
+# W32: v_cmp_le_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x46,0x7c]
+# W64: v_cmp_le_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x46,0x7c]

-# W32: v_cmp_le_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x46,0x7c]
-# W64: v_cmp_le_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x46,0x7c]
0x7c,0x04,0x46,0x7c
+# W32: v_cmp_le_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x46,0x7c]
+# W64: v_cmp_le_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x46,0x7c]

-# W32: v_cmp_le_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x46,0x7c]
-# W64: v_cmp_le_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x46,0x7c]
0xc1,0x04,0x46,0x7c
+# W32: v_cmp_le_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x46,0x7c]
+# W64: v_cmp_le_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x46,0x7c]

-# W32: v_cmp_le_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x46,0x7c]
-# W64: v_cmp_le_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x46,0x7c]
0xf0,0x04,0x46,0x7c
+# W32: v_cmp_le_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x46,0x7c]
+# W64: v_cmp_le_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x46,0x7c]

-# W32: v_cmp_le_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x46,0x7c]
-# W64: v_cmp_le_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x46,0x7c]
0xfd,0x04,0x46,0x7c
+# W32: v_cmp_le_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x46,0x7c]
+# W64: v_cmp_le_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x46,0x7c]

+0xff,0xfc,0x47,0x7c,0x56,0x34,0x12,0xaf
# W32: v_cmp_le_f64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x47,0x7c,0x56,0x34,0x12,0xaf]
# W64: v_cmp_le_f64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x47,0x7c,0x56,0x34,0x12,0xaf]
-0xff,0xfc,0x47,0x7c,0x56,0x34,0x12,0xaf

-# W32: v_cmp_le_i16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x66,0x7c]
-# W64: v_cmp_le_i16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x66,0x7c]
0x01,0x05,0x66,0x7c
+# W32: v_cmp_le_i16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x66,0x7c]
+# W64: v_cmp_le_i16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x66,0x7c]

-# W32: v_cmp_le_i16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x66,0x7c]
-# W64: v_cmp_le_i16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x66,0x7c]
0x7f,0x05,0x66,0x7c
+# W32: v_cmp_le_i16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x66,0x7c]
+# W64: v_cmp_le_i16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x66,0x7c]

-# W32: v_cmp_le_i16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x66,0x7c]
-# W64: v_cmp_le_i16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x66,0x7c]
0x01,0x04,0x66,0x7c
+# W32: v_cmp_le_i16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x66,0x7c]
+# W64: v_cmp_le_i16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x66,0x7c]

-# W32: v_cmp_le_i16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x66,0x7c]
-# W64: v_cmp_le_i16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x66,0x7c]
0x69,0x04,0x66,0x7c
+# W32: v_cmp_le_i16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x66,0x7c]
+# W64: v_cmp_le_i16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x66,0x7c]

-# W32: v_cmp_le_i16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x66,0x7c]
-# W64: v_cmp_le_i16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x66,0x7c]
0x6a,0x04,0x66,0x7c
+# W32: v_cmp_le_i16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x66,0x7c]
+# W64: v_cmp_le_i16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x66,0x7c]

-# W32: v_cmp_le_i16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x66,0x7c]
-# W64: v_cmp_le_i16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x66,0x7c]
0x6b,0x04,0x66,0x7c
+# W32: v_cmp_le_i16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x66,0x7c]
+# W64: v_cmp_le_i16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x66,0x7c]

-# W32: v_cmp_le_i16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x66,0x7c]
-# W64: v_cmp_le_i16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x66,0x7c]
0x7b,0x04,0x66,0x7c
+# W32: v_cmp_le_i16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x66,0x7c]
+# W64: v_cmp_le_i16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x66,0x7c]

-# W32: v_cmp_le_i16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x66,0x7c]
-# W64: v_cmp_le_i16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x66,0x7c]
0x7d,0x04,0x66,0x7c
+# W32: v_cmp_le_i16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x66,0x7c]
+# W64: v_cmp_le_i16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x66,0x7c]

-# W32: v_cmp_le_i16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x66,0x7c]
-# W64: v_cmp_le_i16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x66,0x7c]
0x7e,0x04,0x66,0x7c
+# W32: v_cmp_le_i16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x66,0x7c]
+# W64: v_cmp_le_i16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x66,0x7c]

-# W32: v_cmp_le_i16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x66,0x7c]
-# W64: v_cmp_le_i16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x66,0x7c]
0x7f,0x04,0x66,0x7c
+# W32: v_cmp_le_i16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x66,0x7c]
+# W64: v_cmp_le_i16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x66,0x7c]

-# W32: v_cmp_le_i16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x66,0x7c]
-# W64: v_cmp_le_i16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x66,0x7c]
0x7c,0x04,0x66,0x7c
+# W32: v_cmp_le_i16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x66,0x7c]
+# W64: v_cmp_le_i16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x66,0x7c]

-# W32: v_cmp_le_i16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x66,0x7c]
-# W64: v_cmp_le_i16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x66,0x7c]
0xc1,0x04,0x66,0x7c
+# W32: v_cmp_le_i16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x66,0x7c]
+# W64: v_cmp_le_i16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x66,0x7c]

-# W32: v_cmp_le_i16_e32 vcc_lo, 0x3800, v2
-# W64: v_cmp_le_i16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x66,0x7c,0x00,0x38,0x00,0x00]
0xf0,0x04,0x66,0x7c
+# W32: v_cmp_le_i16_e32 vcc_lo, 0x3800, v2 ; encoding: [0xff,0x04,0x66,0x7c,0x00,0x38,0x00,0x00]
+# W64: v_cmp_le_i16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x66,0x7c,0x00,0x38,0x00,0x00]

-# W32: v_cmp_le_i16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x66,0x7c]
-# W64: v_cmp_le_i16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x66,0x7c]
0xfd,0x04,0x66,0x7c
+# W32: v_cmp_le_i16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x66,0x7c]
+# W64: v_cmp_le_i16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x66,0x7c]

-# W32: v_cmp_le_i16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x66,0x7c,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_le_i16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x66,0x7c,0x0b,0xfe,0x00,0x00]
0xff,0xfe,0x66,0x7c,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_le_i16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x66,0x7c,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_le_i16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x66,0x7c,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_le_i32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x86,0x7c]
-# W64: v_cmp_le_i32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x86,0x7c]
0x01,0x05,0x86,0x7c
+# W32: v_cmp_le_i32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x86,0x7c]
+# W64: v_cmp_le_i32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x86,0x7c]

-# W32: v_cmp_le_i32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x86,0x7c]
-# W64: v_cmp_le_i32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x86,0x7c]
0xff,0x05,0x86,0x7c
+# W32: v_cmp_le_i32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x86,0x7c]
+# W64: v_cmp_le_i32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x86,0x7c]

-# W32: v_cmp_le_i32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x86,0x7c]
-# W64: v_cmp_le_i32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x86,0x7c]
0x01,0x04,0x86,0x7c
+# W32: v_cmp_le_i32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x86,0x7c]
+# W64: v_cmp_le_i32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x86,0x7c]

-# W32: v_cmp_le_i32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x86,0x7c]
-# W64: v_cmp_le_i32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x86,0x7c]
0x69,0x04,0x86,0x7c
+# W32: v_cmp_le_i32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x86,0x7c]
+# W64: v_cmp_le_i32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x86,0x7c]

-# W32: v_cmp_le_i32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x86,0x7c]
-# W64: v_cmp_le_i32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x86,0x7c]
0x6a,0x04,0x86,0x7c
+# W32: v_cmp_le_i32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x86,0x7c]
+# W64: v_cmp_le_i32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x86,0x7c]

-# W32: v_cmp_le_i32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x86,0x7c]
-# W64: v_cmp_le_i32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x86,0x7c]
0x6b,0x04,0x86,0x7c
+# W32: v_cmp_le_i32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x86,0x7c]
+# W64: v_cmp_le_i32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x86,0x7c]

-# W32: v_cmp_le_i32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x86,0x7c]
-# W64: v_cmp_le_i32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x86,0x7c]
0x7b,0x04,0x86,0x7c
+# W32: v_cmp_le_i32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x86,0x7c]
+# W64: v_cmp_le_i32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x86,0x7c]

-# W32: v_cmp_le_i32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x86,0x7c]
-# W64: v_cmp_le_i32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x86,0x7c]
0x7d,0x04,0x86,0x7c
+# W32: v_cmp_le_i32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x86,0x7c]
+# W64: v_cmp_le_i32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x86,0x7c]
-# W32: v_cmp_le_i32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x86,0x7c]
-# W64: v_cmp_le_i32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x86,0x7c]
0x7e,0x04,0x86,0x7c
+# W32: v_cmp_le_i32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x86,0x7c]
+# W64: v_cmp_le_i32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x86,0x7c]

-# W32: v_cmp_le_i32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x86,0x7c]
-# W64: v_cmp_le_i32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x86,0x7c]
0x7f,0x04,0x86,0x7c
+# W32: v_cmp_le_i32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x86,0x7c]
+# W64: v_cmp_le_i32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x86,0x7c]

-# W32: v_cmp_le_i32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x86,0x7c]
-# W64: v_cmp_le_i32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x86,0x7c]
0x7c,0x04,0x86,0x7c
+# W32: v_cmp_le_i32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x86,0x7c]
+# W64: v_cmp_le_i32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x86,0x7c]

-# W32: v_cmp_le_i32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x86,0x7c]
-# W64: v_cmp_le_i32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x86,0x7c]
0xc1,0x04,0x86,0x7c
+# W32: v_cmp_le_i32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x86,0x7c]
+# W64: v_cmp_le_i32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x86,0x7c]

-# W32: v_cmp_le_i32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x86,0x7c]
-# W64: v_cmp_le_i32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x86,0x7c]
0xf0,0x04,0x86,0x7c
+# W32: v_cmp_le_i32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x86,0x7c]
+# W64: v_cmp_le_i32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x86,0x7c]

-# W32: v_cmp_le_i32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x86,0x7c]
-# W64: v_cmp_le_i32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x86,0x7c]
0xfd,0x04,0x86,0x7c
+# W32: v_cmp_le_i32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x86,0x7c]
+# W64: v_cmp_le_i32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x86,0x7c]

-# W32: v_cmp_le_i32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x87,0x7c,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_le_i32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x87,0x7c,0x56,0x34,0x12,0xaf]
0xff,0xfe,0x87,0x7c,0x56,0x34,0x12,0xaf
+# W32: v_cmp_le_i32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x87,0x7c,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_le_i32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x87,0x7c,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_le_i64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa6,0x7c]
-# W64: v_cmp_le_i64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa6,0x7c]
0x01,0x05,0xa6,0x7c
+# W32: v_cmp_le_i64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa6,0x7c]
+# W64: v_cmp_le_i64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa6,0x7c]

-# W32: v_cmp_le_i64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa6,0x7c]
-# W64: v_cmp_le_i64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa6,0x7c]
0xfe,0x05,0xa6,0x7c
+# W32: v_cmp_le_i64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa6,0x7c]
+# W64: v_cmp_le_i64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa6,0x7c]

-# W32: v_cmp_le_i64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa6,0x7c]
-# W64: v_cmp_le_i64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa6,0x7c]
0x02,0x04,0xa6,0x7c
+# W32: v_cmp_le_i64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa6,0x7c]
+# W64: v_cmp_le_i64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa6,0x7c]

-# W32: v_cmp_le_i64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa6,0x7c]
-# W64: v_cmp_le_i64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa6,0x7c]
0x68,0x04,0xa6,0x7c
+# W32: v_cmp_le_i64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa6,0x7c]
+# W64: v_cmp_le_i64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa6,0x7c]

-# W32: v_cmp_le_i64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa6,0x7c]
-# W64: v_cmp_le_i64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa6,0x7c]
0x6a,0x04,0xa6,0x7c
+# W32: v_cmp_le_i64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa6,0x7c]
+# W64: v_cmp_le_i64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa6,0x7c]

+0x7a,0x04,0xa6,0x7c
# W32: v_cmp_le_i64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa6,0x7c]
# W64: v_cmp_le_i64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa6,0x7c]
-0x7a,0x04,0xa6,0x7c

-# W32: v_cmp_le_i64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xa6,0x7c]
-# W64: v_cmp_le_i64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xa6,0x7c]
0x7e,0x04,0xa6,0x7c
+# W32: v_cmp_le_i64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xa6,0x7c]
+# W64: v_cmp_le_i64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xa6,0x7c]

-# W32: v_cmp_le_i64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xa6,0x7c]
-# W64: v_cmp_le_i64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xa6,0x7c]
0x7c,0x04,0xa6,0x7c
+# W32: v_cmp_le_i64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xa6,0x7c]
+# W64: v_cmp_le_i64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xa6,0x7c]

-# W32: v_cmp_le_i64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xa6,0x7c]
-# W64: v_cmp_le_i64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xa6,0x7c]
0xc1,0x04,0xa6,0x7c
+# W32: v_cmp_le_i64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xa6,0x7c]
+# W64: v_cmp_le_i64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xa6,0x7c]

-# W32: v_cmp_le_i64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa6,0x7c]
-# W64: v_cmp_le_i64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa6,0x7c]
0xf0,0x04,0xa6,0x7c
+# W32: v_cmp_le_i64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa6,0x7c]
+# W64: v_cmp_le_i64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa6,0x7c]

-# W32: v_cmp_le_i64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa6,0x7c]
-# W64: v_cmp_le_i64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa6,0x7c]
0xfd,0x04,0xa6,0x7c
+# W32: v_cmp_le_i64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa6,0x7c]
+# W64: v_cmp_le_i64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa6,0x7c]

+0xff,0xfc,0xa7,0x7c,0x56,0x34,0x12,0xaf
# W32: v_cmp_le_i64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa7,0x7c,0x56,0x34,0x12,0xaf]
# W64: v_cmp_le_i64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa7,0x7c,0x56,0x34,0x12,0xaf]
-0xff,0xfc,0xa7,0x7c,0x56,0x34,0x12,0xaf

-# W32: v_cmp_le_u16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x76,0x7c]
-# W64: v_cmp_le_u16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x76,0x7c]
0x01,0x05,0x76,0x7c
+# W32: v_cmp_le_u16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x76,0x7c]
+# W64: v_cmp_le_u16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x76,0x7c]

-# W32: v_cmp_le_u16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x76,0x7c]
-# W64: v_cmp_le_u16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x76,0x7c]
0x7f,0x05,0x76,0x7c
+# W32: v_cmp_le_u16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x76,0x7c]
+# W64: v_cmp_le_u16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x76,0x7c]

-# W32: v_cmp_le_u16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x76,0x7c]
-# W64: v_cmp_le_u16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x76,0x7c]
0x01,0x04,0x76,0x7c
+# W32: v_cmp_le_u16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x76,0x7c]
+# W64: v_cmp_le_u16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x76,0x7c]

-# W32: v_cmp_le_u16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x76,0x7c]
-# W64: v_cmp_le_u16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x76,0x7c]
0x69,0x04,0x76,0x7c
+# W32: v_cmp_le_u16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x76,0x7c]
+# W64: v_cmp_le_u16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x76,0x7c]

-# W32: v_cmp_le_u16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x76,0x7c]
-# W64: v_cmp_le_u16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x76,0x7c]
0x6a,0x04,0x76,0x7c
+# W32: v_cmp_le_u16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x76,0x7c]
+# W64: v_cmp_le_u16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x76,0x7c]

-# W32: v_cmp_le_u16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x76,0x7c]
-# W64: v_cmp_le_u16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x76,0x7c]
0x6b,0x04,0x76,0x7c
+# W32: v_cmp_le_u16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x76,0x7c]
+# W64: v_cmp_le_u16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x76,0x7c]

-# W32: v_cmp_le_u16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x76,0x7c]
-# W64: v_cmp_le_u16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x76,0x7c]
0x7b,0x04,0x76,0x7c
+# W32: v_cmp_le_u16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x76,0x7c]
+# W64: v_cmp_le_u16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x76,0x7c]

-# W32: v_cmp_le_u16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x76,0x7c]
-# W64: v_cmp_le_u16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x76,0x7c]
0x7d,0x04,0x76,0x7c
+# W32: v_cmp_le_u16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x76,0x7c]
+# W64: v_cmp_le_u16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x76,0x7c]

-# W32: v_cmp_le_u16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x76,0x7c]
-# W64: v_cmp_le_u16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x76,0x7c]
0x7e,0x04,0x76,0x7c
+# W32: v_cmp_le_u16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x76,0x7c]
+# W64: v_cmp_le_u16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x76,0x7c]

-# W32: v_cmp_le_u16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x76,0x7c]
-# W64: v_cmp_le_u16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x76,0x7c]
0x7f,0x04,0x76,0x7c
+# W32: v_cmp_le_u16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x76,0x7c]
+# W64: v_cmp_le_u16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x76,0x7c]

-# W32: v_cmp_le_u16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x76,0x7c]
-# W64: v_cmp_le_u16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x76,0x7c]
0x7c,0x04,0x76,0x7c
+# W32: v_cmp_le_u16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x76,0x7c]
+# W64: v_cmp_le_u16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x76,0x7c]

-# W32: v_cmp_le_u16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x76,0x7c]
-# W64: v_cmp_le_u16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x76,0x7c]
0xc1,0x04,0x76,0x7c
+# W32: v_cmp_le_u16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x76,0x7c]
+# W64: v_cmp_le_u16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x76,0x7c]

-# W32: v_cmp_le_u16_e32 vcc_lo, 0x3800, v2
-# W64: v_cmp_le_u16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x76,0x7c,0x00,0x38,0x00,0x00]
0xf0,0x04,0x76,0x7c
+# W32: v_cmp_le_u16_e32 vcc_lo, 0x3800, v2 ; encoding: [0xff,0x04,0x76,0x7c,0x00,0x38,0x00,0x00]
+# W64: v_cmp_le_u16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x76,0x7c,0x00,0x38,0x00,0x00]

-# W32: v_cmp_le_u16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x76,0x7c]
-# W64: v_cmp_le_u16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x76,0x7c]
0xfd,0x04,0x76,0x7c
+# W32: v_cmp_le_u16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x76,0x7c]
+# W64: v_cmp_le_u16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x76,0x7c]

-# W32: v_cmp_le_u16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x76,0x7c,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_le_u16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x76,0x7c,0x0b,0xfe,0x00,0x00]
0xff,0xfe,0x76,0x7c,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_le_u16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x76,0x7c,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_le_u16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x76,0x7c,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_le_u32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x96,0x7c]
-# W64: v_cmp_le_u32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x96,0x7c]
0x01,0x05,0x96,0x7c
+# W32: v_cmp_le_u32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x96,0x7c]
+# W64: v_cmp_le_u32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x96,0x7c]

-# W32: v_cmp_le_u32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x96,0x7c]
-# W64: v_cmp_le_u32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x96,0x7c]
0xff,0x05,0x96,0x7c
+# W32: v_cmp_le_u32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x96,0x7c]
+# W64: v_cmp_le_u32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x96,0x7c]

-# W32: v_cmp_le_u32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x96,0x7c]
-# W64: v_cmp_le_u32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x96,0x7c]
0x01,0x04,0x96,0x7c
+# W32: v_cmp_le_u32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x96,0x7c]
+# W64: v_cmp_le_u32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x96,0x7c]

-# W32: v_cmp_le_u32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x96,0x7c]
-# W64: v_cmp_le_u32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x96,0x7c]
0x69,0x04,0x96,0x7c
+# W32: v_cmp_le_u32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x96,0x7c]
+# W64: v_cmp_le_u32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x96,0x7c]

-# W32: v_cmp_le_u32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x96,0x7c]
-# W64: v_cmp_le_u32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x96,0x7c]
0x6a,0x04,0x96,0x7c
+# W32: v_cmp_le_u32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x96,0x7c]
+# W64: v_cmp_le_u32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x96,0x7c]

-# W32: v_cmp_le_u32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x96,0x7c]
-# W64: v_cmp_le_u32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x96,0x7c]
0x6b,0x04,0x96,0x7c
+# W32: v_cmp_le_u32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x96,0x7c]
+# W64: v_cmp_le_u32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x96,0x7c]

-# W32: v_cmp_le_u32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x96,0x7c]
-# W64: v_cmp_le_u32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x96,0x7c]
0x7b,0x04,0x96,0x7c
+# W32: v_cmp_le_u32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x96,0x7c]
+# W64: v_cmp_le_u32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x96,0x7c]

-# W32: v_cmp_le_u32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x96,0x7c]
-# W64: v_cmp_le_u32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x96,0x7c]
0x7d,0x04,0x96,0x7c
+# W32: v_cmp_le_u32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x96,0x7c]
+# W64: v_cmp_le_u32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x96,0x7c]

-# W32: v_cmp_le_u32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x96,0x7c]
-# W64: v_cmp_le_u32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x96,0x7c]
0x7e,0x04,0x96,0x7c
+# W32: v_cmp_le_u32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x96,0x7c]
+# W64: v_cmp_le_u32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x96,0x7c]

-# W32: v_cmp_le_u32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x96,0x7c]
-# W64: v_cmp_le_u32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x96,0x7c]
0x7f,0x04,0x96,0x7c
+# W32: v_cmp_le_u32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x96,0x7c]
+# W64: v_cmp_le_u32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x96,0x7c]

-# W32: v_cmp_le_u32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x96,0x7c]
-# W64: v_cmp_le_u32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x96,0x7c]
0x7c,0x04,0x96,0x7c
+# W32: v_cmp_le_u32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x96,0x7c]
+# W64: v_cmp_le_u32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x96,0x7c]

-# W32: v_cmp_le_u32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x96,0x7c]
-# W64: v_cmp_le_u32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x96,0x7c]
0xc1,0x04,0x96,0x7c
+# W32: v_cmp_le_u32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x96,0x7c]
+# W64: v_cmp_le_u32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x96,0x7c]

-# W32: v_cmp_le_u32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x96,0x7c]
-# W64: v_cmp_le_u32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x96,0x7c]
0xf0,0x04,0x96,0x7c
+# W32: v_cmp_le_u32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x96,0x7c]
+# W64: v_cmp_le_u32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x96,0x7c]

-# W32: v_cmp_le_u32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x96,0x7c]
-# W64: v_cmp_le_u32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x96,0x7c]
0xfd,0x04,0x96,0x7c
+# W32: v_cmp_le_u32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x96,0x7c]
+# W64: v_cmp_le_u32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x96,0x7c]

-# W32: v_cmp_le_u32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x97,0x7c,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_le_u32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x97,0x7c,0x56,0x34,0x12,0xaf]
0xff,0xfe,0x97,0x7c,0x56,0x34,0x12,0xaf
+# W32: v_cmp_le_u32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x97,0x7c,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_le_u32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x97,0x7c,0x56,0x34,0x12,0xaf]

-# W32: v_cmp_le_u64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb6,0x7c]
-# W64: v_cmp_le_u64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb6,0x7c]
0x01,0x05,0xb6,0x7c
+# W32: v_cmp_le_u64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb6,0x7c]
+# W64: v_cmp_le_u64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb6,0x7c]

-# W32: v_cmp_le_u64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb6,0x7c]
-# W64: v_cmp_le_u64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb6,0x7c]
0xfe,0x05,0xb6,0x7c
+# W32: v_cmp_le_u64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb6,0x7c]
+# W64: v_cmp_le_u64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb6,0x7c]

-# W32: v_cmp_le_u64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb6,0x7c]
-# W64: v_cmp_le_u64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb6,0x7c]
0x02,0x04,0xb6,0x7c
+# W32: v_cmp_le_u64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb6,0x7c]
+# W64: v_cmp_le_u64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb6,0x7c]

-# W32: v_cmp_le_u64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb6,0x7c]
-# W64: v_cmp_le_u64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb6,0x7c]
0x68,0x04,0xb6,0x7c
+# W32: v_cmp_le_u64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb6,0x7c]
+# W64: v_cmp_le_u64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb6,0x7c]

-# W32: v_cmp_le_u64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb6,0x7c]
-# W64: v_cmp_le_u64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb6,0x7c]
0x6a,0x04,0xb6,0x7c
+# W32: v_cmp_le_u64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb6,0x7c]
+# W64: v_cmp_le_u64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb6,0x7c]

+0x7a,0x04,0xb6,0x7c
# W32: v_cmp_le_u64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb6,0x7c]
# W64: v_cmp_le_u64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb6,0x7c]
-0x7a,0x04,0xb6,0x7c

-# W32: v_cmp_le_u64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xb6,0x7c]
-# W64: v_cmp_le_u64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xb6,0x7c]
0x7e,0x04,0xb6,0x7c
+# W32: v_cmp_le_u64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xb6,0x7c]
+# W64: v_cmp_le_u64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xb6,0x7c]

-# W32: v_cmp_le_u64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xb6,0x7c]
-# W64: v_cmp_le_u64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xb6,0x7c]
0x7c,0x04,0xb6,0x7c
+# W32: v_cmp_le_u64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xb6,0x7c]
+# W64: v_cmp_le_u64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xb6,0x7c]

-# W32: v_cmp_le_u64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xb6,0x7c]
-# W64: v_cmp_le_u64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xb6,0x7c]
0xc1,0x04,0xb6,0x7c
+# W32: v_cmp_le_u64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xb6,0x7c]
+# W64: v_cmp_le_u64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xb6,0x7c]

-# W32: v_cmp_le_u64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb6,0x7c]
-# W64: v_cmp_le_u64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb6,0x7c]
0xf0,0x04,0xb6,0x7c
+# W32: v_cmp_le_u64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb6,0x7c]
+# W64: v_cmp_le_u64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb6,0x7c]

-# W32: v_cmp_le_u64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb6,0x7c]
-# W64: v_cmp_le_u64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb6,0x7c]
0xfd,0x04,0xb6,0x7c
+# W32: v_cmp_le_u64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb6,0x7c]
+# W64: v_cmp_le_u64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb6,0x7c]

+0xff,0xfc,0xb7,0x7c,0x56,0x34,0x12,0xaf
# W32: v_cmp_le_u64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb7,0x7c,0x56,0x34,0x12,0xaf]
# W64: v_cmp_le_u64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb7,0x7c,0x56,0x34,0x12,0xaf]
-0xff,0xfc,0xb7,0x7c,0x56,0x34,0x12,0xaf

-# W32: v_cmp_lg_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x0a,0x7c]
-# W64: v_cmp_lg_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x0a,0x7c]
0x01,0x05,0x0a,0x7c
+# W32: v_cmp_lg_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x0a,0x7c]
+# W64: v_cmp_lg_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x0a,0x7c]

-# W32: v_cmp_lg_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x0a,0x7c]
-# W64: v_cmp_lg_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x0a,0x7c]
0x7f,0x05,0x0a,0x7c
+# W32: v_cmp_lg_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x0a,0x7c]
+# W64: v_cmp_lg_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x0a,0x7c]

-# W32: v_cmp_lg_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x0a,0x7c]
-# W64: v_cmp_lg_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x0a,0x7c]
0x01,0x04,0x0a,0x7c
+# W32: v_cmp_lg_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x0a,0x7c]
+# W64: v_cmp_lg_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x0a,0x7c]

-# W32: v_cmp_lg_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x0a,0x7c]
-# W64: v_cmp_lg_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x0a,0x7c]
0x69,0x04,0x0a,0x7c
+# W32: v_cmp_lg_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x0a,0x7c]
+# W64: v_cmp_lg_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x0a,0x7c]

-# W32: v_cmp_lg_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x7c]
-# W64: v_cmp_lg_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x7c]
0x6a,0x04,0x0a,0x7c
+# W32: v_cmp_lg_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x7c]
+# W64: v_cmp_lg_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x7c]

-# W32: v_cmp_lg_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x7c]
-# W64: v_cmp_lg_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x7c]
0x6b,0x04,0x0a,0x7c
+# W32: v_cmp_lg_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x7c]
+# W64: v_cmp_lg_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x7c]

-# W32: v_cmp_lg_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x7c]
-# W64: v_cmp_lg_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x7c]
0x7b,0x04,0x0a,0x7c
+# W32: v_cmp_lg_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x7c]
+# W64: v_cmp_lg_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x7c]

-# W32: v_cmp_lg_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x7c]
-# W64: v_cmp_lg_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x7c]
0x7d,0x04,0x0a,0x7c
+# W32: v_cmp_lg_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x7c]
+# W64: v_cmp_lg_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x7c]

-# W32: v_cmp_lg_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x7c]
-# W64: v_cmp_lg_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x7c]
0x7e,0x04,0x0a,0x7c
+# W32: v_cmp_lg_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x7c]
+# W64: v_cmp_lg_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x7c]

-# W32: v_cmp_lg_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x7c]
-# W64: v_cmp_lg_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x7c]
0x7f,0x04,0x0a,0x7c
+# W32: v_cmp_lg_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x7c]
+# W64: v_cmp_lg_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x7c]

-# W32: v_cmp_lg_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x0a,0x7c]
-# W64: v_cmp_lg_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x0a,0x7c]
0x7c,0x04,0x0a,0x7c
+# W32: v_cmp_lg_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x0a,0x7c]
+# W64: v_cmp_lg_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x0a,0x7c]

-# W32: v_cmp_lg_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x7c]
-# W64: v_cmp_lg_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x7c]
0xc1,0x04,0x0a,0x7c
+# W32: v_cmp_lg_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x7c]
+# W64: v_cmp_lg_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x7c]

-# W32: v_cmp_lg_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x7c]
-# W64: v_cmp_lg_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x7c]
0xf0,0x04,0x0a,0x7c
+# W32: v_cmp_lg_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x7c]
+# W64: v_cmp_lg_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x7c]

-# W32: v_cmp_lg_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x7c]
-# W64: v_cmp_lg_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x7c]
0xfd,0x04,0x0a,0x7c
+# W32: v_cmp_lg_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x7c]
+# W64: v_cmp_lg_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x7c]

-# W32: v_cmp_lg_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0a,0x7c,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_lg_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0a,0x7c,0x0b,0xfe,0x00,0x00]
0xff,0xfe,0x0a,0x7c,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_lg_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0a,0x7c,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_lg_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0a,0x7c,0x0b,0xfe,0x00,0x00]

-# W32: v_cmp_lg_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x2a,0x7c]
-# W64: v_cmp_lg_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x2a,0x7c]
0x01,0x05,0x2a,0x7c
+# W32: v_cmp_lg_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x2a,0x7c]
+# W64: v_cmp_lg_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x2a,0x7c]

-# W32: v_cmp_lg_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x2a,0x7c]
-# W64: v_cmp_lg_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x2a,0x7c]
0xff,0x05,0x2a,0x7c
+# W32: v_cmp_lg_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x2a,0x7c]
+# W64: v_cmp_lg_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x2a,0x7c]

-# W32: v_cmp_lg_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x2a,0x7c]
-# W64: v_cmp_lg_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x2a,0x7c]
0x01,0x04,0x2a,0x7c
+# W32: v_cmp_lg_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x2a,0x7c]
+# W64: v_cmp_lg_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x2a,0x7c]

-# W32: v_cmp_lg_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x2a,0x7c]
-# W64: v_cmp_lg_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x2a,0x7c]
0x69,0x04,0x2a,0x7c
+# W32: v_cmp_lg_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x2a,0x7c]
+# W64: v_cmp_lg_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x2a,0x7c]

-# W32: v_cmp_lg_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x2a,0x7c]
-# W64: v_cmp_lg_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x2a,0x7c]
0x6a,0x04,0x2a,0x7c
+# W32: v_cmp_lg_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x2a,0x7c]
+# W64: v_cmp_lg_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x2a,0x7c]

-# W32: v_cmp_lg_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x2a,0x7c]
-# W64: v_cmp_lg_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x2a,0x7c]
0x6b,0x04,0x2a,0x7c
+# W32: v_cmp_lg_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x2a,0x7c]
+# W64: v_cmp_lg_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x2a,0x7c]

-# W32: v_cmp_lg_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x2a,0x7c]
-# W64: v_cmp_lg_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x2a,0x7c]
0x7b,0x04,0x2a,0x7c
+# W32: v_cmp_lg_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x2a,0x7c]
+# W64: v_cmp_lg_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x2a,0x7c]

-# W32: v_cmp_lg_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x2a,0x7c]
-# W64: v_cmp_lg_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x2a,0x7c]
0x7d,0x04,0x2a,0x7c
+# W32: v_cmp_lg_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x2a,0x7c]
+# W64: v_cmp_lg_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x2a,0x7c]

-# W32: v_cmp_lg_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x2a,0x7c]
-# W64: v_cmp_lg_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x2a,0x7c]
0x7e,0x04,0x2a,0x7c
+# W32: v_cmp_lg_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x2a,0x7c]
+# W64: v_cmp_lg_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x2a,0x7c]

-# W32: v_cmp_lg_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x2a,0x7c]
-# W64: v_cmp_lg_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x2a,0x7c]
0x7f,0x04,0x2a,0x7c
+# W32: v_cmp_lg_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x2a,0x7c]
+# W64: v_cmp_lg_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x2a,0x7c]

-# W32: v_cmp_lg_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x2a,0x7c]
-# W64: v_cmp_lg_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x2a,0x7c]
0x7c,0x04,0x2a,0x7c +# W32: v_cmp_lg_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x2a,0x7c] +# W64: v_cmp_lg_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x2a,0x7c] -# W32: v_cmp_lg_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x2a,0x7c] -# W64: v_cmp_lg_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x2a,0x7c] 0xc1,0x04,0x2a,0x7c +# W32: v_cmp_lg_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x2a,0x7c] +# W64: v_cmp_lg_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x2a,0x7c] -# W32: v_cmp_lg_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x2a,0x7c] -# W64: v_cmp_lg_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x2a,0x7c] 0xf0,0x04,0x2a,0x7c +# W32: v_cmp_lg_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x2a,0x7c] +# W64: v_cmp_lg_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x2a,0x7c] -# W32: v_cmp_lg_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x2a,0x7c] -# W64: v_cmp_lg_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x2a,0x7c] 0xfd,0x04,0x2a,0x7c +# W32: v_cmp_lg_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x2a,0x7c] +# W64: v_cmp_lg_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x2a,0x7c] -# W32: v_cmp_lg_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2b,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_lg_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2b,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x2b,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_lg_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2b,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_lg_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2b,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_lg_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4a,0x7c] -# W64: v_cmp_lg_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4a,0x7c] 0x01,0x05,0x4a,0x7c +# W32: v_cmp_lg_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4a,0x7c] +# W64: v_cmp_lg_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4a,0x7c] -# W32: v_cmp_lg_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4a,0x7c] -# W64: v_cmp_lg_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4a,0x7c] 0xfe,0x05,0x4a,0x7c +# W32: v_cmp_lg_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4a,0x7c] +# W64: v_cmp_lg_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4a,0x7c] -# W32: v_cmp_lg_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x4a,0x7c] -# W64: v_cmp_lg_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x4a,0x7c] 0x02,0x04,0x4a,0x7c +# W32: v_cmp_lg_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x4a,0x7c] +# W64: v_cmp_lg_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x4a,0x7c] -# W32: v_cmp_lg_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4a,0x7c] -# W64: v_cmp_lg_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4a,0x7c] 0x68,0x04,0x4a,0x7c +# W32: v_cmp_lg_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4a,0x7c] +# W64: v_cmp_lg_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4a,0x7c] -# W32: v_cmp_lg_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x4a,0x7c] -# W64: v_cmp_lg_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x4a,0x7c] 0x6a,0x04,0x4a,0x7c +# W32: v_cmp_lg_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x4a,0x7c] +# W64: v_cmp_lg_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x4a,0x7c] +0x7a,0x04,0x4a,0x7c # W32: v_cmp_lg_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x4a,0x7c] # W64: v_cmp_lg_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x4a,0x7c] -0x7a,0x04,0x4a,0x7c -# W32: v_cmp_lg_f64_e32 vcc_lo, exec, 
v[2:3] ; encoding: [0x7e,0x04,0x4a,0x7c] -# W64: v_cmp_lg_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x4a,0x7c] 0x7e,0x04,0x4a,0x7c +# W32: v_cmp_lg_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x4a,0x7c] +# W64: v_cmp_lg_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x4a,0x7c] -# W32: v_cmp_lg_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x4a,0x7c] -# W64: v_cmp_lg_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x4a,0x7c] 0x7c,0x04,0x4a,0x7c +# W32: v_cmp_lg_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x4a,0x7c] +# W64: v_cmp_lg_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x4a,0x7c] -# W32: v_cmp_lg_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x4a,0x7c] -# W64: v_cmp_lg_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x4a,0x7c] 0xc1,0x04,0x4a,0x7c +# W32: v_cmp_lg_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x4a,0x7c] +# W64: v_cmp_lg_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x4a,0x7c] -# W32: v_cmp_lg_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4a,0x7c] -# W64: v_cmp_lg_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4a,0x7c] 0xf0,0x04,0x4a,0x7c +# W32: v_cmp_lg_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4a,0x7c] +# W64: v_cmp_lg_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4a,0x7c] -# W32: v_cmp_lg_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4a,0x7c] -# W64: v_cmp_lg_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4a,0x7c] 0xfd,0x04,0x4a,0x7c +# W32: v_cmp_lg_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4a,0x7c] +# W64: v_cmp_lg_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4a,0x7c] +0xff,0xfc,0x4b,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_lg_f64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4b,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_lg_f64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4b,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0x4b,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_lt_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x02,0x7c] -# W64: v_cmp_lt_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x02,0x7c] 0x01,0x05,0x02,0x7c +# W32: v_cmp_lt_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x02,0x7c] +# W64: v_cmp_lt_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x02,0x7c] -# W32: v_cmp_lt_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x02,0x7c] -# W64: v_cmp_lt_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x02,0x7c] 0x7f,0x05,0x02,0x7c +# W32: v_cmp_lt_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x02,0x7c] +# W64: v_cmp_lt_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x02,0x7c] -# W32: v_cmp_lt_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x02,0x7c] -# W64: v_cmp_lt_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x02,0x7c] 0x01,0x04,0x02,0x7c +# W32: v_cmp_lt_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x02,0x7c] +# W64: v_cmp_lt_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x02,0x7c] -# W32: v_cmp_lt_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x02,0x7c] -# W64: v_cmp_lt_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x02,0x7c] 0x69,0x04,0x02,0x7c +# W32: v_cmp_lt_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x02,0x7c] +# W64: v_cmp_lt_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x02,0x7c] -# W32: v_cmp_lt_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x02,0x7c] -# W64: v_cmp_lt_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x02,0x7c] 0x6a,0x04,0x02,0x7c +# W32: v_cmp_lt_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x02,0x7c] +# W64: v_cmp_lt_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x02,0x7c] -# W32: v_cmp_lt_f16_e32 vcc_lo, vcc_hi, v2 
; encoding: [0x6b,0x04,0x02,0x7c] -# W64: v_cmp_lt_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x02,0x7c] 0x6b,0x04,0x02,0x7c +# W32: v_cmp_lt_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x02,0x7c] +# W64: v_cmp_lt_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x02,0x7c] -# W32: v_cmp_lt_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x02,0x7c] -# W64: v_cmp_lt_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x02,0x7c] 0x7b,0x04,0x02,0x7c +# W32: v_cmp_lt_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x02,0x7c] +# W64: v_cmp_lt_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x02,0x7c] -# W32: v_cmp_lt_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x02,0x7c] -# W64: v_cmp_lt_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x02,0x7c] 0x7d,0x04,0x02,0x7c +# W32: v_cmp_lt_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x02,0x7c] +# W64: v_cmp_lt_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x02,0x7c] -# W32: v_cmp_lt_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x02,0x7c] -# W64: v_cmp_lt_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x02,0x7c] 0x7e,0x04,0x02,0x7c +# W32: v_cmp_lt_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x02,0x7c] +# W64: v_cmp_lt_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x02,0x7c] -# W32: v_cmp_lt_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x02,0x7c] -# W64: v_cmp_lt_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x02,0x7c] 0x7f,0x04,0x02,0x7c +# W32: v_cmp_lt_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x02,0x7c] +# W64: v_cmp_lt_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x02,0x7c] -# W32: v_cmp_lt_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x02,0x7c] -# W64: v_cmp_lt_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x02,0x7c] 0x7c,0x04,0x02,0x7c +# W32: v_cmp_lt_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x02,0x7c] +# W64: v_cmp_lt_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x02,0x7c] -# W32: v_cmp_lt_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x02,0x7c] -# W64: v_cmp_lt_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x02,0x7c] 0xc1,0x04,0x02,0x7c +# W32: v_cmp_lt_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x02,0x7c] +# W64: v_cmp_lt_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x02,0x7c] -# W32: v_cmp_lt_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x02,0x7c] -# W64: v_cmp_lt_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x02,0x7c] 0xf0,0x04,0x02,0x7c +# W32: v_cmp_lt_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x02,0x7c] +# W64: v_cmp_lt_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x02,0x7c] -# W32: v_cmp_lt_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x02,0x7c] -# W64: v_cmp_lt_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x02,0x7c] 0xfd,0x04,0x02,0x7c +# W32: v_cmp_lt_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x02,0x7c] +# W64: v_cmp_lt_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x02,0x7c] -# W32: v_cmp_lt_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x02,0x7c,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_lt_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x02,0x7c,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x02,0x7c,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_lt_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x02,0x7c,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_lt_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x02,0x7c,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_lt_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x22,0x7c] -# W64: v_cmp_lt_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x22,0x7c] 0x01,0x05,0x22,0x7c +# W32: v_cmp_lt_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x22,0x7c] +# W64: v_cmp_lt_f32_e32 vcc, 
v1, v2 ; encoding: [0x01,0x05,0x22,0x7c] -# W32: v_cmp_lt_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x22,0x7c] -# W64: v_cmp_lt_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x22,0x7c] 0xff,0x05,0x22,0x7c +# W32: v_cmp_lt_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x22,0x7c] +# W64: v_cmp_lt_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x22,0x7c] -# W32: v_cmp_lt_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x22,0x7c] -# W64: v_cmp_lt_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x22,0x7c] 0x01,0x04,0x22,0x7c +# W32: v_cmp_lt_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x22,0x7c] +# W64: v_cmp_lt_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x22,0x7c] -# W32: v_cmp_lt_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x22,0x7c] -# W64: v_cmp_lt_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x22,0x7c] 0x69,0x04,0x22,0x7c +# W32: v_cmp_lt_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x22,0x7c] +# W64: v_cmp_lt_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x22,0x7c] -# W32: v_cmp_lt_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x22,0x7c] -# W64: v_cmp_lt_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x22,0x7c] 0x6a,0x04,0x22,0x7c +# W32: v_cmp_lt_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x22,0x7c] +# W64: v_cmp_lt_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x22,0x7c] -# W32: v_cmp_lt_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x22,0x7c] -# W64: v_cmp_lt_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x22,0x7c] 0x6b,0x04,0x22,0x7c +# W32: v_cmp_lt_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x22,0x7c] +# W64: v_cmp_lt_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x22,0x7c] -# W32: v_cmp_lt_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x22,0x7c] -# W64: v_cmp_lt_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x22,0x7c] 0x7b,0x04,0x22,0x7c +# W32: v_cmp_lt_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x22,0x7c] +# W64: v_cmp_lt_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x22,0x7c] -# W32: v_cmp_lt_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x22,0x7c] -# W64: v_cmp_lt_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x22,0x7c] 0x7d,0x04,0x22,0x7c +# W32: v_cmp_lt_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x22,0x7c] +# W64: v_cmp_lt_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x22,0x7c] -# W32: v_cmp_lt_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x22,0x7c] -# W64: v_cmp_lt_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x22,0x7c] 0x7e,0x04,0x22,0x7c +# W32: v_cmp_lt_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x22,0x7c] +# W64: v_cmp_lt_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x22,0x7c] -# W32: v_cmp_lt_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x22,0x7c] -# W64: v_cmp_lt_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x22,0x7c] 0x7f,0x04,0x22,0x7c +# W32: v_cmp_lt_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x22,0x7c] +# W64: v_cmp_lt_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x22,0x7c] -# W32: v_cmp_lt_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x22,0x7c] -# W64: v_cmp_lt_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x22,0x7c] 0x7c,0x04,0x22,0x7c +# W32: v_cmp_lt_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x22,0x7c] +# W64: v_cmp_lt_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x22,0x7c] -# W32: v_cmp_lt_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x22,0x7c] -# W64: v_cmp_lt_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x22,0x7c] 0xc1,0x04,0x22,0x7c +# W32: v_cmp_lt_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x22,0x7c] +# W64: v_cmp_lt_f32_e32 vcc, -1, v2 ; encoding: 
[0xc1,0x04,0x22,0x7c] -# W32: v_cmp_lt_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x22,0x7c] -# W64: v_cmp_lt_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x22,0x7c] 0xf0,0x04,0x22,0x7c +# W32: v_cmp_lt_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x22,0x7c] +# W64: v_cmp_lt_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x22,0x7c] -# W32: v_cmp_lt_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x22,0x7c] -# W64: v_cmp_lt_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x22,0x7c] 0xfd,0x04,0x22,0x7c +# W32: v_cmp_lt_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x22,0x7c] +# W64: v_cmp_lt_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x22,0x7c] -# W32: v_cmp_lt_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x23,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_lt_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x23,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x23,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_lt_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x23,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_lt_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x23,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_lt_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x42,0x7c] -# W64: v_cmp_lt_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x42,0x7c] 0x01,0x05,0x42,0x7c +# W32: v_cmp_lt_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x42,0x7c] +# W64: v_cmp_lt_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x42,0x7c] -# W32: v_cmp_lt_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x42,0x7c] -# W64: v_cmp_lt_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x42,0x7c] 0xfe,0x05,0x42,0x7c +# W32: v_cmp_lt_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x42,0x7c] +# W64: v_cmp_lt_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x42,0x7c] -# W32: v_cmp_lt_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x42,0x7c] -# W64: v_cmp_lt_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x42,0x7c] 0x02,0x04,0x42,0x7c +# W32: v_cmp_lt_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x42,0x7c] +# W64: v_cmp_lt_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x42,0x7c] -# W32: v_cmp_lt_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x42,0x7c] -# W64: v_cmp_lt_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x42,0x7c] 0x68,0x04,0x42,0x7c +# W32: v_cmp_lt_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x42,0x7c] +# W64: v_cmp_lt_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x42,0x7c] -# W32: v_cmp_lt_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x42,0x7c] -# W64: v_cmp_lt_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x42,0x7c] 0x6a,0x04,0x42,0x7c +# W32: v_cmp_lt_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x42,0x7c] +# W64: v_cmp_lt_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x42,0x7c] +0x7a,0x04,0x42,0x7c # W32: v_cmp_lt_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x42,0x7c] # W64: v_cmp_lt_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x42,0x7c] -0x7a,0x04,0x42,0x7c -# W32: v_cmp_lt_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x42,0x7c] -# W64: v_cmp_lt_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x42,0x7c] 0x7e,0x04,0x42,0x7c +# W32: v_cmp_lt_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x42,0x7c] +# W64: v_cmp_lt_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x42,0x7c] -# W32: v_cmp_lt_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x42,0x7c] -# W64: v_cmp_lt_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x42,0x7c] 
0x7c,0x04,0x42,0x7c +# W32: v_cmp_lt_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x42,0x7c] +# W64: v_cmp_lt_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x42,0x7c] -# W32: v_cmp_lt_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x42,0x7c] -# W64: v_cmp_lt_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x42,0x7c] 0xc1,0x04,0x42,0x7c +# W32: v_cmp_lt_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x42,0x7c] +# W64: v_cmp_lt_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x42,0x7c] -# W32: v_cmp_lt_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x42,0x7c] -# W64: v_cmp_lt_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x42,0x7c] 0xf0,0x04,0x42,0x7c +# W32: v_cmp_lt_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x42,0x7c] +# W64: v_cmp_lt_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x42,0x7c] -# W32: v_cmp_lt_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x42,0x7c] -# W64: v_cmp_lt_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x42,0x7c] 0xfd,0x04,0x42,0x7c +# W32: v_cmp_lt_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x42,0x7c] +# W64: v_cmp_lt_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x42,0x7c] +0xff,0xfc,0x43,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_lt_f64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x43,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_lt_f64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x43,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0x43,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_lt_i16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x62,0x7c] -# W64: v_cmp_lt_i16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x62,0x7c] 0x01,0x05,0x62,0x7c +# W32: v_cmp_lt_i16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x62,0x7c] +# W64: v_cmp_lt_i16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x62,0x7c] -# W32: v_cmp_lt_i16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x62,0x7c] -# W64: v_cmp_lt_i16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x62,0x7c] 0x7f,0x05,0x62,0x7c +# W32: v_cmp_lt_i16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x62,0x7c] +# W64: v_cmp_lt_i16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x62,0x7c] -# W32: v_cmp_lt_i16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x62,0x7c] -# W64: v_cmp_lt_i16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x62,0x7c] 0x01,0x04,0x62,0x7c +# W32: v_cmp_lt_i16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x62,0x7c] +# W64: v_cmp_lt_i16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x62,0x7c] -# W32: v_cmp_lt_i16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x62,0x7c] -# W64: v_cmp_lt_i16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x62,0x7c] 0x69,0x04,0x62,0x7c +# W32: v_cmp_lt_i16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x62,0x7c] +# W64: v_cmp_lt_i16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x62,0x7c] -# W32: v_cmp_lt_i16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x62,0x7c] -# W64: v_cmp_lt_i16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x62,0x7c] 0x6a,0x04,0x62,0x7c +# W32: v_cmp_lt_i16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x62,0x7c] +# W64: v_cmp_lt_i16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x62,0x7c] -# W32: v_cmp_lt_i16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x62,0x7c] -# W64: v_cmp_lt_i16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x62,0x7c] 0x6b,0x04,0x62,0x7c +# W32: v_cmp_lt_i16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x62,0x7c] +# W64: v_cmp_lt_i16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x62,0x7c] -# W32: v_cmp_lt_i16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x62,0x7c] -# W64: v_cmp_lt_i16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x62,0x7c] 
0x7b,0x04,0x62,0x7c +# W32: v_cmp_lt_i16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x62,0x7c] +# W64: v_cmp_lt_i16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x62,0x7c] -# W32: v_cmp_lt_i16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x62,0x7c] -# W64: v_cmp_lt_i16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x62,0x7c] 0x7d,0x04,0x62,0x7c +# W32: v_cmp_lt_i16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x62,0x7c] +# W64: v_cmp_lt_i16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x62,0x7c] -# W32: v_cmp_lt_i16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x62,0x7c] -# W64: v_cmp_lt_i16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x62,0x7c] 0x7e,0x04,0x62,0x7c +# W32: v_cmp_lt_i16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x62,0x7c] +# W64: v_cmp_lt_i16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x62,0x7c] -# W32: v_cmp_lt_i16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x62,0x7c] -# W64: v_cmp_lt_i16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x62,0x7c] 0x7f,0x04,0x62,0x7c +# W32: v_cmp_lt_i16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x62,0x7c] +# W64: v_cmp_lt_i16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x62,0x7c] -# W32: v_cmp_lt_i16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x62,0x7c] -# W64: v_cmp_lt_i16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x62,0x7c] 0x7c,0x04,0x62,0x7c +# W32: v_cmp_lt_i16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x62,0x7c] +# W64: v_cmp_lt_i16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x62,0x7c] -# W32: v_cmp_lt_i16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x62,0x7c] -# W64: v_cmp_lt_i16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x62,0x7c] 0xc1,0x04,0x62,0x7c +# W32: v_cmp_lt_i16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x62,0x7c] +# W64: v_cmp_lt_i16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x62,0x7c] -# W32: v_cmp_lt_i16_e32 vcc_lo, 0x3800, v2 -# W64: v_cmp_lt_i16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x62,0x7c,0x00,0x38,0x00,0x00] 0xf0,0x04,0x62,0x7c +# W32: v_cmp_lt_i16_e32 vcc_lo, 0x3800, v2 ; encoding: [0xff,0x04,0x62,0x7c,0x00,0x38,0x00,0x00] +# W64: v_cmp_lt_i16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x62,0x7c,0x00,0x38,0x00,0x00] -# W32: v_cmp_lt_i16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x62,0x7c] -# W64: v_cmp_lt_i16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x62,0x7c] 0xfd,0x04,0x62,0x7c +# W32: v_cmp_lt_i16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x62,0x7c] +# W64: v_cmp_lt_i16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x62,0x7c] -# W32: v_cmp_lt_i16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x62,0x7c,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_lt_i16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x62,0x7c,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x62,0x7c,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_lt_i16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x62,0x7c,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_lt_i16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x62,0x7c,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_lt_i32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x82,0x7c] -# W64: v_cmp_lt_i32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x82,0x7c] 0x01,0x05,0x82,0x7c +# W32: v_cmp_lt_i32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x82,0x7c] +# W64: v_cmp_lt_i32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x82,0x7c] -# W32: v_cmp_lt_i32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x82,0x7c] -# W64: v_cmp_lt_i32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x82,0x7c] 0xff,0x05,0x82,0x7c +# W32: v_cmp_lt_i32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x82,0x7c] +# W64: v_cmp_lt_i32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x82,0x7c] -# W32: v_cmp_lt_i32_e32 vcc_lo, s1, v2 
; encoding: [0x01,0x04,0x82,0x7c] -# W64: v_cmp_lt_i32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x82,0x7c] 0x01,0x04,0x82,0x7c +# W32: v_cmp_lt_i32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x82,0x7c] +# W64: v_cmp_lt_i32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x82,0x7c] -# W32: v_cmp_lt_i32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x82,0x7c] -# W64: v_cmp_lt_i32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x82,0x7c] 0x69,0x04,0x82,0x7c +# W32: v_cmp_lt_i32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x82,0x7c] +# W64: v_cmp_lt_i32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x82,0x7c] -# W32: v_cmp_lt_i32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x82,0x7c] -# W64: v_cmp_lt_i32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x82,0x7c] 0x6a,0x04,0x82,0x7c +# W32: v_cmp_lt_i32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x82,0x7c] +# W64: v_cmp_lt_i32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x82,0x7c] -# W32: v_cmp_lt_i32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x82,0x7c] -# W64: v_cmp_lt_i32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x82,0x7c] 0x6b,0x04,0x82,0x7c +# W32: v_cmp_lt_i32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x82,0x7c] +# W64: v_cmp_lt_i32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x82,0x7c] -# W32: v_cmp_lt_i32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x82,0x7c] -# W64: v_cmp_lt_i32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x82,0x7c] 0x7b,0x04,0x82,0x7c +# W32: v_cmp_lt_i32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x82,0x7c] +# W64: v_cmp_lt_i32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x82,0x7c] -# W32: v_cmp_lt_i32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x82,0x7c] -# W64: v_cmp_lt_i32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x82,0x7c] 0x7d,0x04,0x82,0x7c +# W32: v_cmp_lt_i32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x82,0x7c] +# W64: v_cmp_lt_i32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x82,0x7c] -# W32: v_cmp_lt_i32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x82,0x7c] -# W64: v_cmp_lt_i32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x82,0x7c] 0x7e,0x04,0x82,0x7c +# W32: v_cmp_lt_i32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x82,0x7c] +# W64: v_cmp_lt_i32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x82,0x7c] -# W32: v_cmp_lt_i32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x82,0x7c] -# W64: v_cmp_lt_i32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x82,0x7c] 0x7f,0x04,0x82,0x7c +# W32: v_cmp_lt_i32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x82,0x7c] +# W64: v_cmp_lt_i32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x82,0x7c] -# W32: v_cmp_lt_i32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x82,0x7c] -# W64: v_cmp_lt_i32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x82,0x7c] 0x7c,0x04,0x82,0x7c +# W32: v_cmp_lt_i32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x82,0x7c] +# W64: v_cmp_lt_i32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x82,0x7c] -# W32: v_cmp_lt_i32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x82,0x7c] -# W64: v_cmp_lt_i32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x82,0x7c] 0xc1,0x04,0x82,0x7c +# W32: v_cmp_lt_i32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x82,0x7c] +# W64: v_cmp_lt_i32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x82,0x7c] -# W32: v_cmp_lt_i32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x82,0x7c] -# W64: v_cmp_lt_i32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x82,0x7c] 0xf0,0x04,0x82,0x7c +# W32: v_cmp_lt_i32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x82,0x7c] +# W64: v_cmp_lt_i32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x82,0x7c] -# W32: v_cmp_lt_i32_e32 vcc_lo, src_scc, v2 ; encoding: 
[0xfd,0x04,0x82,0x7c] -# W64: v_cmp_lt_i32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x82,0x7c] 0xfd,0x04,0x82,0x7c +# W32: v_cmp_lt_i32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x82,0x7c] +# W64: v_cmp_lt_i32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x82,0x7c] -# W32: v_cmp_lt_i32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x83,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_lt_i32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x83,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x83,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_lt_i32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x83,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_lt_i32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x83,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_lt_i64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa2,0x7c] -# W64: v_cmp_lt_i64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa2,0x7c] 0x01,0x05,0xa2,0x7c +# W32: v_cmp_lt_i64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa2,0x7c] +# W64: v_cmp_lt_i64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa2,0x7c] -# W32: v_cmp_lt_i64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa2,0x7c] -# W64: v_cmp_lt_i64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa2,0x7c] 0xfe,0x05,0xa2,0x7c +# W32: v_cmp_lt_i64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa2,0x7c] +# W64: v_cmp_lt_i64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa2,0x7c] -# W32: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa2,0x7c] -# W64: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa2,0x7c] 0x02,0x04,0xa2,0x7c +# W32: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa2,0x7c] +# W64: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa2,0x7c] -# W32: v_cmp_lt_i64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa2,0x7c] -# W64: v_cmp_lt_i64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa2,0x7c] 0x68,0x04,0xa2,0x7c +# W32: v_cmp_lt_i64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa2,0x7c] +# W64: v_cmp_lt_i64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa2,0x7c] -# W32: v_cmp_lt_i64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa2,0x7c] -# W64: v_cmp_lt_i64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa2,0x7c] 0x6a,0x04,0xa2,0x7c +# W32: v_cmp_lt_i64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa2,0x7c] +# W64: v_cmp_lt_i64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xa2,0x7c] +0x7a,0x04,0xa2,0x7c # W32: v_cmp_lt_i64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa2,0x7c] # W64: v_cmp_lt_i64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa2,0x7c] -0x7a,0x04,0xa2,0x7c -# W32: v_cmp_lt_i64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xa2,0x7c] -# W64: v_cmp_lt_i64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xa2,0x7c] 0x7e,0x04,0xa2,0x7c +# W32: v_cmp_lt_i64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xa2,0x7c] +# W64: v_cmp_lt_i64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xa2,0x7c] -# W32: v_cmp_lt_i64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xa2,0x7c] -# W64: v_cmp_lt_i64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xa2,0x7c] 0x7c,0x04,0xa2,0x7c +# W32: v_cmp_lt_i64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xa2,0x7c] +# W64: v_cmp_lt_i64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xa2,0x7c] -# W32: v_cmp_lt_i64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xa2,0x7c] -# W64: v_cmp_lt_i64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xa2,0x7c] 0xc1,0x04,0xa2,0x7c +# W32: v_cmp_lt_i64_e32 vcc_lo, -1, v[2:3] ; 
encoding: [0xc1,0x04,0xa2,0x7c] +# W64: v_cmp_lt_i64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xa2,0x7c] -# W32: v_cmp_lt_i64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa2,0x7c] -# W64: v_cmp_lt_i64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa2,0x7c] 0xf0,0x04,0xa2,0x7c +# W32: v_cmp_lt_i64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa2,0x7c] +# W64: v_cmp_lt_i64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa2,0x7c] -# W32: v_cmp_lt_i64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa2,0x7c] -# W64: v_cmp_lt_i64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa2,0x7c] 0xfd,0x04,0xa2,0x7c +# W32: v_cmp_lt_i64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa2,0x7c] +# W64: v_cmp_lt_i64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa2,0x7c] +0xff,0xfc,0xa3,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_lt_i64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa3,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_lt_i64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa3,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0xa3,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_lt_u16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x72,0x7c] -# W64: v_cmp_lt_u16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x72,0x7c] 0x01,0x05,0x72,0x7c +# W32: v_cmp_lt_u16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x72,0x7c] +# W64: v_cmp_lt_u16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x72,0x7c] -# W32: v_cmp_lt_u16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x72,0x7c] -# W64: v_cmp_lt_u16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x72,0x7c] 0x7f,0x05,0x72,0x7c +# W32: v_cmp_lt_u16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x72,0x7c] +# W64: v_cmp_lt_u16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x72,0x7c] -# W32: v_cmp_lt_u16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x72,0x7c] -# W64: v_cmp_lt_u16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x72,0x7c] 0x01,0x04,0x72,0x7c +# W32: v_cmp_lt_u16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x72,0x7c] +# W64: v_cmp_lt_u16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x72,0x7c] -# W32: v_cmp_lt_u16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x72,0x7c] -# W64: v_cmp_lt_u16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x72,0x7c] 0x69,0x04,0x72,0x7c +# W32: v_cmp_lt_u16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x72,0x7c] +# W64: v_cmp_lt_u16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x72,0x7c] -# W32: v_cmp_lt_u16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x72,0x7c] -# W64: v_cmp_lt_u16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x72,0x7c] 0x6a,0x04,0x72,0x7c +# W32: v_cmp_lt_u16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x72,0x7c] +# W64: v_cmp_lt_u16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x72,0x7c] -# W32: v_cmp_lt_u16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x72,0x7c] -# W64: v_cmp_lt_u16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x72,0x7c] 0x6b,0x04,0x72,0x7c +# W32: v_cmp_lt_u16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x72,0x7c] +# W64: v_cmp_lt_u16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x72,0x7c] -# W32: v_cmp_lt_u16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x72,0x7c] -# W64: v_cmp_lt_u16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x72,0x7c] 0x7b,0x04,0x72,0x7c +# W32: v_cmp_lt_u16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x72,0x7c] +# W64: v_cmp_lt_u16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x72,0x7c] -# W32: v_cmp_lt_u16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x72,0x7c] -# W64: v_cmp_lt_u16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x72,0x7c] 0x7d,0x04,0x72,0x7c +# W32: v_cmp_lt_u16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x72,0x7c] 
+# W64: v_cmp_lt_u16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x72,0x7c] -# W32: v_cmp_lt_u16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x72,0x7c] -# W64: v_cmp_lt_u16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x72,0x7c] 0x7e,0x04,0x72,0x7c +# W32: v_cmp_lt_u16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x72,0x7c] +# W64: v_cmp_lt_u16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x72,0x7c] -# W32: v_cmp_lt_u16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x72,0x7c] -# W64: v_cmp_lt_u16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x72,0x7c] 0x7f,0x04,0x72,0x7c +# W32: v_cmp_lt_u16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x72,0x7c] +# W64: v_cmp_lt_u16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x72,0x7c] -# W32: v_cmp_lt_u16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x72,0x7c] -# W64: v_cmp_lt_u16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x72,0x7c] 0x7c,0x04,0x72,0x7c +# W32: v_cmp_lt_u16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x72,0x7c] +# W64: v_cmp_lt_u16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x72,0x7c] -# W32: v_cmp_lt_u16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x72,0x7c] -# W64: v_cmp_lt_u16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x72,0x7c] 0xc1,0x04,0x72,0x7c +# W32: v_cmp_lt_u16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x72,0x7c] +# W64: v_cmp_lt_u16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x72,0x7c] -# W32: v_cmp_lt_u16_e32 vcc_lo, 0x3800, v2 -# W64: v_cmp_lt_u16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x72,0x7c,0x00,0x38,0x00,0x00] 0xf0,0x04,0x72,0x7c +# W32: v_cmp_lt_u16_e32 vcc_lo, 0x3800, v2 ; encoding: [0xff,0x04,0x72,0x7c,0x00,0x38,0x00,0x00] +# W64: v_cmp_lt_u16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x72,0x7c,0x00,0x38,0x00,0x00] -# W32: v_cmp_lt_u16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x72,0x7c] -# W64: v_cmp_lt_u16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x72,0x7c] 0xfd,0x04,0x72,0x7c +# W32: v_cmp_lt_u16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x72,0x7c] +# W64: v_cmp_lt_u16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x72,0x7c] -# W32: v_cmp_lt_u16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x72,0x7c,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_lt_u16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x72,0x7c,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x72,0x7c,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_lt_u16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x72,0x7c,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_lt_u16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x72,0x7c,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_lt_u32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x92,0x7c] -# W64: v_cmp_lt_u32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x92,0x7c] 0x01,0x05,0x92,0x7c +# W32: v_cmp_lt_u32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x92,0x7c] +# W64: v_cmp_lt_u32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x92,0x7c] -# W32: v_cmp_lt_u32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x92,0x7c] -# W64: v_cmp_lt_u32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x92,0x7c] 0xff,0x05,0x92,0x7c +# W32: v_cmp_lt_u32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x92,0x7c] +# W64: v_cmp_lt_u32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x92,0x7c] -# W32: v_cmp_lt_u32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x92,0x7c] -# W64: v_cmp_lt_u32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x92,0x7c] 0x01,0x04,0x92,0x7c +# W32: v_cmp_lt_u32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x92,0x7c] +# W64: v_cmp_lt_u32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x92,0x7c] -# W32: v_cmp_lt_u32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x92,0x7c] -# W64: v_cmp_lt_u32_e32 vcc, s105, v2 ; encoding: 
[0x69,0x04,0x92,0x7c] 0x69,0x04,0x92,0x7c +# W32: v_cmp_lt_u32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x92,0x7c] +# W64: v_cmp_lt_u32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x92,0x7c] -# W32: v_cmp_lt_u32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x92,0x7c] -# W64: v_cmp_lt_u32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x92,0x7c] 0x6a,0x04,0x92,0x7c +# W32: v_cmp_lt_u32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x92,0x7c] +# W64: v_cmp_lt_u32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x92,0x7c] -# W32: v_cmp_lt_u32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x92,0x7c] -# W64: v_cmp_lt_u32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x92,0x7c] 0x6b,0x04,0x92,0x7c +# W32: v_cmp_lt_u32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x92,0x7c] +# W64: v_cmp_lt_u32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x92,0x7c] -# W32: v_cmp_lt_u32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x92,0x7c] -# W64: v_cmp_lt_u32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x92,0x7c] 0x7b,0x04,0x92,0x7c +# W32: v_cmp_lt_u32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x92,0x7c] +# W64: v_cmp_lt_u32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x92,0x7c] -# W32: v_cmp_lt_u32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x92,0x7c] -# W64: v_cmp_lt_u32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x92,0x7c] 0x7d,0x04,0x92,0x7c +# W32: v_cmp_lt_u32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x92,0x7c] +# W64: v_cmp_lt_u32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x92,0x7c] -# W32: v_cmp_lt_u32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x92,0x7c] -# W64: v_cmp_lt_u32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x92,0x7c] 0x7e,0x04,0x92,0x7c +# W32: v_cmp_lt_u32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x92,0x7c] +# W64: v_cmp_lt_u32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x92,0x7c] -# W32: v_cmp_lt_u32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x92,0x7c] -# W64: v_cmp_lt_u32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x92,0x7c] 0x7f,0x04,0x92,0x7c +# W32: v_cmp_lt_u32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x92,0x7c] +# W64: v_cmp_lt_u32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x92,0x7c] -# W32: v_cmp_lt_u32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x92,0x7c] -# W64: v_cmp_lt_u32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x92,0x7c] 0x7c,0x04,0x92,0x7c +# W32: v_cmp_lt_u32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x92,0x7c] +# W64: v_cmp_lt_u32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x92,0x7c] -# W32: v_cmp_lt_u32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x92,0x7c] -# W64: v_cmp_lt_u32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x92,0x7c] 0xc1,0x04,0x92,0x7c +# W32: v_cmp_lt_u32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x92,0x7c] +# W64: v_cmp_lt_u32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x92,0x7c] -# W32: v_cmp_lt_u32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x92,0x7c] -# W64: v_cmp_lt_u32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x92,0x7c] 0xf0,0x04,0x92,0x7c +# W32: v_cmp_lt_u32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x92,0x7c] +# W64: v_cmp_lt_u32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x92,0x7c] -# W32: v_cmp_lt_u32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x92,0x7c] -# W64: v_cmp_lt_u32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x92,0x7c] 0xfd,0x04,0x92,0x7c +# W32: v_cmp_lt_u32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x92,0x7c] +# W64: v_cmp_lt_u32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x92,0x7c] -# W32: v_cmp_lt_u32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x93,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_lt_u32_e32 vcc, 
0xaf123456, v255 ; encoding: [0xff,0xfe,0x93,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x93,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_lt_u32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x93,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_lt_u32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x93,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_lt_u64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb2,0x7c] -# W64: v_cmp_lt_u64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb2,0x7c] 0x01,0x05,0xb2,0x7c +# W32: v_cmp_lt_u64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb2,0x7c] +# W64: v_cmp_lt_u64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb2,0x7c] -# W32: v_cmp_lt_u64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb2,0x7c] -# W64: v_cmp_lt_u64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb2,0x7c] 0xfe,0x05,0xb2,0x7c +# W32: v_cmp_lt_u64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb2,0x7c] +# W64: v_cmp_lt_u64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb2,0x7c] -# W32: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb2,0x7c] -# W64: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb2,0x7c] 0x02,0x04,0xb2,0x7c +# W32: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb2,0x7c] +# W64: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb2,0x7c] -# W32: v_cmp_lt_u64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb2,0x7c] -# W64: v_cmp_lt_u64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb2,0x7c] 0x68,0x04,0xb2,0x7c +# W32: v_cmp_lt_u64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb2,0x7c] +# W64: v_cmp_lt_u64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb2,0x7c] -# W32: v_cmp_lt_u64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb2,0x7c] -# W64: v_cmp_lt_u64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb2,0x7c] 0x6a,0x04,0xb2,0x7c +# W32: v_cmp_lt_u64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb2,0x7c] +# W64: v_cmp_lt_u64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xb2,0x7c] +0x7a,0x04,0xb2,0x7c # W32: v_cmp_lt_u64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb2,0x7c] # W64: v_cmp_lt_u64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb2,0x7c] -0x7a,0x04,0xb2,0x7c -# W32: v_cmp_lt_u64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xb2,0x7c] -# W64: v_cmp_lt_u64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xb2,0x7c] 0x7e,0x04,0xb2,0x7c +# W32: v_cmp_lt_u64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xb2,0x7c] +# W64: v_cmp_lt_u64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xb2,0x7c] -# W32: v_cmp_lt_u64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xb2,0x7c] -# W64: v_cmp_lt_u64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xb2,0x7c] 0x7c,0x04,0xb2,0x7c +# W32: v_cmp_lt_u64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xb2,0x7c] +# W64: v_cmp_lt_u64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xb2,0x7c] -# W32: v_cmp_lt_u64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xb2,0x7c] -# W64: v_cmp_lt_u64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xb2,0x7c] 0xc1,0x04,0xb2,0x7c +# W32: v_cmp_lt_u64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xb2,0x7c] +# W64: v_cmp_lt_u64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xb2,0x7c] -# W32: v_cmp_lt_u64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb2,0x7c] -# W64: v_cmp_lt_u64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb2,0x7c] 0xf0,0x04,0xb2,0x7c +# W32: v_cmp_lt_u64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb2,0x7c] +# W64: v_cmp_lt_u64_e32 vcc, 0.5, v[2:3] ; 
encoding: [0xf0,0x04,0xb2,0x7c] -# W32: v_cmp_lt_u64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb2,0x7c] -# W64: v_cmp_lt_u64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb2,0x7c] 0xfd,0x04,0xb2,0x7c +# W32: v_cmp_lt_u64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb2,0x7c] +# W64: v_cmp_lt_u64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb2,0x7c] +0xff,0xfc,0xb3,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_lt_u64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb3,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_lt_u64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb3,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0xb3,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_ne_i16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x6a,0x7c] -# W64: v_cmp_ne_i16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x6a,0x7c] 0x01,0x05,0x6a,0x7c +# W32: v_cmp_ne_i16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x6a,0x7c] +# W64: v_cmp_ne_i16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x6a,0x7c] -# W32: v_cmp_ne_i16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x6a,0x7c] -# W64: v_cmp_ne_i16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x6a,0x7c] 0x7f,0x05,0x6a,0x7c +# W32: v_cmp_ne_i16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x6a,0x7c] +# W64: v_cmp_ne_i16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x6a,0x7c] -# W32: v_cmp_ne_i16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x6a,0x7c] -# W64: v_cmp_ne_i16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x6a,0x7c] 0x01,0x04,0x6a,0x7c +# W32: v_cmp_ne_i16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x6a,0x7c] +# W64: v_cmp_ne_i16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x6a,0x7c] -# W32: v_cmp_ne_i16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x6a,0x7c] -# W64: v_cmp_ne_i16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x6a,0x7c] 0x69,0x04,0x6a,0x7c +# W32: v_cmp_ne_i16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x6a,0x7c] +# W64: v_cmp_ne_i16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x6a,0x7c] -# W32: v_cmp_ne_i16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x6a,0x7c] -# W64: v_cmp_ne_i16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x6a,0x7c] 0x6a,0x04,0x6a,0x7c +# W32: v_cmp_ne_i16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x6a,0x7c] +# W64: v_cmp_ne_i16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x6a,0x7c] -# W32: v_cmp_ne_i16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x6a,0x7c] -# W64: v_cmp_ne_i16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x6a,0x7c] 0x6b,0x04,0x6a,0x7c +# W32: v_cmp_ne_i16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x6a,0x7c] +# W64: v_cmp_ne_i16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x6a,0x7c] -# W32: v_cmp_ne_i16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x6a,0x7c] -# W64: v_cmp_ne_i16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x6a,0x7c] 0x7b,0x04,0x6a,0x7c +# W32: v_cmp_ne_i16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x6a,0x7c] +# W64: v_cmp_ne_i16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x6a,0x7c] -# W32: v_cmp_ne_i16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x6a,0x7c] -# W64: v_cmp_ne_i16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x6a,0x7c] 0x7d,0x04,0x6a,0x7c +# W32: v_cmp_ne_i16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x6a,0x7c] +# W64: v_cmp_ne_i16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x6a,0x7c] -# W32: v_cmp_ne_i16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x6a,0x7c] -# W64: v_cmp_ne_i16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x6a,0x7c] 0x7e,0x04,0x6a,0x7c +# W32: v_cmp_ne_i16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x6a,0x7c] +# W64: v_cmp_ne_i16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x6a,0x7c] -# 
W32: v_cmp_ne_i16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x6a,0x7c] -# W64: v_cmp_ne_i16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x6a,0x7c] 0x7f,0x04,0x6a,0x7c +# W32: v_cmp_ne_i16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x6a,0x7c] +# W64: v_cmp_ne_i16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x6a,0x7c] -# W32: v_cmp_ne_i16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x6a,0x7c] -# W64: v_cmp_ne_i16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x6a,0x7c] 0x7c,0x04,0x6a,0x7c +# W32: v_cmp_ne_i16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x6a,0x7c] +# W64: v_cmp_ne_i16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x6a,0x7c] -# W32: v_cmp_ne_i16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x6a,0x7c] -# W64: v_cmp_ne_i16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x6a,0x7c] 0xc1,0x04,0x6a,0x7c +# W32: v_cmp_ne_i16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x6a,0x7c] +# W64: v_cmp_ne_i16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x6a,0x7c] -# W32: v_cmp_ne_i16_e32 vcc_lo, 0x3800, v2 -# W64: v_cmp_ne_i16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x6a,0x7c,0x00,0x38,0x00,0x00] 0xf0,0x04,0x6a,0x7c +# W32: v_cmp_ne_i16_e32 vcc_lo, 0x3800, v2 ; encoding: [0xff,0x04,0x6a,0x7c,0x00,0x38,0x00,0x00] +# W64: v_cmp_ne_i16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x6a,0x7c,0x00,0x38,0x00,0x00] -# W32: v_cmp_ne_i16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x6a,0x7c] -# W64: v_cmp_ne_i16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x6a,0x7c] 0xfd,0x04,0x6a,0x7c +# W32: v_cmp_ne_i16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x6a,0x7c] +# W64: v_cmp_ne_i16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x6a,0x7c] -# W32: v_cmp_ne_i16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6a,0x7c,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_ne_i16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6a,0x7c,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x6a,0x7c,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_ne_i16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6a,0x7c,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_ne_i16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6a,0x7c,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_ne_i32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x8a,0x7c] -# W64: v_cmp_ne_i32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x8a,0x7c] 0x01,0x05,0x8a,0x7c +# W32: v_cmp_ne_i32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x8a,0x7c] +# W64: v_cmp_ne_i32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x8a,0x7c] -# W32: v_cmp_ne_i32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x8a,0x7c] -# W64: v_cmp_ne_i32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x8a,0x7c] 0xff,0x05,0x8a,0x7c +# W32: v_cmp_ne_i32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x8a,0x7c] +# W64: v_cmp_ne_i32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x8a,0x7c] -# W32: v_cmp_ne_i32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x8a,0x7c] -# W64: v_cmp_ne_i32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x8a,0x7c] 0x01,0x04,0x8a,0x7c +# W32: v_cmp_ne_i32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x8a,0x7c] +# W64: v_cmp_ne_i32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x8a,0x7c] -# W32: v_cmp_ne_i32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x8a,0x7c] -# W64: v_cmp_ne_i32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x8a,0x7c] 0x69,0x04,0x8a,0x7c +# W32: v_cmp_ne_i32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x8a,0x7c] +# W64: v_cmp_ne_i32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x8a,0x7c] -# W32: v_cmp_ne_i32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x8a,0x7c] -# W64: v_cmp_ne_i32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x8a,0x7c] 0x6a,0x04,0x8a,0x7c +# W32: v_cmp_ne_i32_e32 vcc_lo, vcc_lo, v2 ; 
encoding: [0x6a,0x04,0x8a,0x7c] +# W64: v_cmp_ne_i32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x8a,0x7c] -# W32: v_cmp_ne_i32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x8a,0x7c] -# W64: v_cmp_ne_i32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x8a,0x7c] 0x6b,0x04,0x8a,0x7c +# W32: v_cmp_ne_i32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x8a,0x7c] +# W64: v_cmp_ne_i32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x8a,0x7c] -# W32: v_cmp_ne_i32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x8a,0x7c] -# W64: v_cmp_ne_i32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x8a,0x7c] 0x7b,0x04,0x8a,0x7c +# W32: v_cmp_ne_i32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x8a,0x7c] +# W64: v_cmp_ne_i32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x8a,0x7c] -# W32: v_cmp_ne_i32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x8a,0x7c] -# W64: v_cmp_ne_i32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x8a,0x7c] 0x7d,0x04,0x8a,0x7c +# W32: v_cmp_ne_i32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x8a,0x7c] +# W64: v_cmp_ne_i32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x8a,0x7c] -# W32: v_cmp_ne_i32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x8a,0x7c] -# W64: v_cmp_ne_i32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x8a,0x7c] 0x7e,0x04,0x8a,0x7c +# W32: v_cmp_ne_i32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x8a,0x7c] +# W64: v_cmp_ne_i32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x8a,0x7c] -# W32: v_cmp_ne_i32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x8a,0x7c] -# W64: v_cmp_ne_i32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x8a,0x7c] 0x7f,0x04,0x8a,0x7c +# W32: v_cmp_ne_i32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x8a,0x7c] +# W64: v_cmp_ne_i32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x8a,0x7c] -# W32: v_cmp_ne_i32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x8a,0x7c] -# W64: v_cmp_ne_i32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x8a,0x7c] 0x7c,0x04,0x8a,0x7c +# W32: v_cmp_ne_i32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x8a,0x7c] +# W64: v_cmp_ne_i32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x8a,0x7c] -# W32: v_cmp_ne_i32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x8a,0x7c] -# W64: v_cmp_ne_i32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x8a,0x7c] 0xc1,0x04,0x8a,0x7c +# W32: v_cmp_ne_i32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x8a,0x7c] +# W64: v_cmp_ne_i32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x8a,0x7c] -# W32: v_cmp_ne_i32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x8a,0x7c] -# W64: v_cmp_ne_i32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x8a,0x7c] 0xf0,0x04,0x8a,0x7c +# W32: v_cmp_ne_i32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x8a,0x7c] +# W64: v_cmp_ne_i32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x8a,0x7c] -# W32: v_cmp_ne_i32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x8a,0x7c] -# W64: v_cmp_ne_i32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x8a,0x7c] 0xfd,0x04,0x8a,0x7c +# W32: v_cmp_ne_i32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x8a,0x7c] +# W64: v_cmp_ne_i32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x8a,0x7c] -# W32: v_cmp_ne_i32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x8b,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_ne_i32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x8b,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x8b,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_ne_i32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x8b,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_ne_i32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x8b,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_ne_i64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xaa,0x7c] -# W64: 
v_cmp_ne_i64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xaa,0x7c] 0x01,0x05,0xaa,0x7c +# W32: v_cmp_ne_i64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xaa,0x7c] +# W64: v_cmp_ne_i64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xaa,0x7c] -# W32: v_cmp_ne_i64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xaa,0x7c] -# W64: v_cmp_ne_i64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xaa,0x7c] 0xfe,0x05,0xaa,0x7c +# W32: v_cmp_ne_i64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xaa,0x7c] +# W64: v_cmp_ne_i64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xaa,0x7c] -# W32: v_cmp_ne_i64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xaa,0x7c] -# W64: v_cmp_ne_i64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xaa,0x7c] 0x02,0x04,0xaa,0x7c +# W32: v_cmp_ne_i64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xaa,0x7c] +# W64: v_cmp_ne_i64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xaa,0x7c] -# W32: v_cmp_ne_i64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xaa,0x7c] -# W64: v_cmp_ne_i64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xaa,0x7c] 0x68,0x04,0xaa,0x7c +# W32: v_cmp_ne_i64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xaa,0x7c] +# W64: v_cmp_ne_i64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xaa,0x7c] -# W32: v_cmp_ne_i64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xaa,0x7c] -# W64: v_cmp_ne_i64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xaa,0x7c] 0x6a,0x04,0xaa,0x7c +# W32: v_cmp_ne_i64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xaa,0x7c] +# W64: v_cmp_ne_i64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xaa,0x7c] +0x7a,0x04,0xaa,0x7c # W32: v_cmp_ne_i64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xaa,0x7c] # W64: v_cmp_ne_i64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xaa,0x7c] -0x7a,0x04,0xaa,0x7c -# W32: v_cmp_ne_i64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xaa,0x7c] -# W64: v_cmp_ne_i64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xaa,0x7c] 0x7e,0x04,0xaa,0x7c +# W32: v_cmp_ne_i64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xaa,0x7c] +# W64: v_cmp_ne_i64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xaa,0x7c] -# W32: v_cmp_ne_i64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xaa,0x7c] -# W64: v_cmp_ne_i64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xaa,0x7c] 0x7c,0x04,0xaa,0x7c +# W32: v_cmp_ne_i64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xaa,0x7c] +# W64: v_cmp_ne_i64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xaa,0x7c] -# W32: v_cmp_ne_i64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xaa,0x7c] -# W64: v_cmp_ne_i64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xaa,0x7c] 0xc1,0x04,0xaa,0x7c +# W32: v_cmp_ne_i64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xaa,0x7c] +# W64: v_cmp_ne_i64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xaa,0x7c] -# W32: v_cmp_ne_i64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xaa,0x7c] -# W64: v_cmp_ne_i64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xaa,0x7c] 0xf0,0x04,0xaa,0x7c +# W32: v_cmp_ne_i64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xaa,0x7c] +# W64: v_cmp_ne_i64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xaa,0x7c] -# W32: v_cmp_ne_i64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xaa,0x7c] -# W64: v_cmp_ne_i64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xaa,0x7c] 0xfd,0x04,0xaa,0x7c +# W32: v_cmp_ne_i64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xaa,0x7c] +# W64: v_cmp_ne_i64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xaa,0x7c] 
+0xff,0xfc,0xab,0x7c,0x56,0x34,0x12,0xaf
 # W32: v_cmp_ne_i64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xab,0x7c,0x56,0x34,0x12,0xaf]
 # W64: v_cmp_ne_i64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xab,0x7c,0x56,0x34,0x12,0xaf]
-0xff,0xfc,0xab,0x7c,0x56,0x34,0x12,0xaf
-# W32: v_cmp_ne_u16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x7a,0x7c]
-# W64: v_cmp_ne_u16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x7a,0x7c]
 0x01,0x05,0x7a,0x7c
+# W32: v_cmp_ne_u16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x7a,0x7c]
+# W64: v_cmp_ne_u16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x7a,0x7c]
-# W32: v_cmp_ne_u16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x7a,0x7c]
-# W64: v_cmp_ne_u16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x7a,0x7c]
 0x7f,0x05,0x7a,0x7c
+# W32: v_cmp_ne_u16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x7a,0x7c]
+# W64: v_cmp_ne_u16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x7a,0x7c]
-# W32: v_cmp_ne_u16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x7a,0x7c]
-# W64: v_cmp_ne_u16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x7a,0x7c]
 0x01,0x04,0x7a,0x7c
+# W32: v_cmp_ne_u16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x7a,0x7c]
+# W64: v_cmp_ne_u16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x7a,0x7c]
-# W32: v_cmp_ne_u16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x7a,0x7c]
-# W64: v_cmp_ne_u16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x7a,0x7c]
 0x69,0x04,0x7a,0x7c
+# W32: v_cmp_ne_u16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x7a,0x7c]
+# W64: v_cmp_ne_u16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x7a,0x7c]
-# W32: v_cmp_ne_u16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x7a,0x7c]
-# W64: v_cmp_ne_u16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x7a,0x7c]
 0x6a,0x04,0x7a,0x7c
+# W32: v_cmp_ne_u16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x7a,0x7c]
+# W64: v_cmp_ne_u16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x7a,0x7c]
-# W32: v_cmp_ne_u16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x7a,0x7c]
-# W64: v_cmp_ne_u16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x7a,0x7c]
 0x6b,0x04,0x7a,0x7c
+# W32: v_cmp_ne_u16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x7a,0x7c]
+# W64: v_cmp_ne_u16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x7a,0x7c]
-# W32: v_cmp_ne_u16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x7a,0x7c]
-# W64: v_cmp_ne_u16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x7a,0x7c]
 0x7b,0x04,0x7a,0x7c
+# W32: v_cmp_ne_u16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x7a,0x7c]
+# W64: v_cmp_ne_u16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x7a,0x7c]
-# W32: v_cmp_ne_u16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x7a,0x7c]
-# W64: v_cmp_ne_u16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x7a,0x7c]
 0x7d,0x04,0x7a,0x7c
+# W32: v_cmp_ne_u16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x7a,0x7c]
+# W64: v_cmp_ne_u16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x7a,0x7c]
-# W32: v_cmp_ne_u16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x7a,0x7c]
-# W64: v_cmp_ne_u16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x7a,0x7c]
 0x7e,0x04,0x7a,0x7c
+# W32: v_cmp_ne_u16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x7a,0x7c]
+# W64: v_cmp_ne_u16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x7a,0x7c]
-# W32: v_cmp_ne_u16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x7a,0x7c]
-# W64: v_cmp_ne_u16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x7a,0x7c]
 0x7f,0x04,0x7a,0x7c
+# W32: v_cmp_ne_u16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x7a,0x7c]
+# W64: v_cmp_ne_u16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x7a,0x7c]
-# W32: v_cmp_ne_u16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x7a,0x7c]
-# W64: v_cmp_ne_u16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x7a,0x7c]
 0x7c,0x04,0x7a,0x7c
+# W32: v_cmp_ne_u16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x7a,0x7c]
+# W64: v_cmp_ne_u16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x7a,0x7c]
-# W32: v_cmp_ne_u16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x7a,0x7c]
-# W64: v_cmp_ne_u16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x7a,0x7c]
 0xc1,0x04,0x7a,0x7c
+# W32: v_cmp_ne_u16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x7a,0x7c]
+# W64: v_cmp_ne_u16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x7a,0x7c]
-# W32: v_cmp_ne_u16_e32 vcc_lo, 0x3800, v2
-# W64: v_cmp_ne_u16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x7a,0x7c,0x00,0x38,0x00,0x00]
 0xf0,0x04,0x7a,0x7c
+# W32: v_cmp_ne_u16_e32 vcc_lo, 0x3800, v2 ; encoding: [0xff,0x04,0x7a,0x7c,0x00,0x38,0x00,0x00]
+# W64: v_cmp_ne_u16_e32 vcc, 0x3800, v2 ; encoding: [0xff,0x04,0x7a,0x7c,0x00,0x38,0x00,0x00]
-# W32: v_cmp_ne_u16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x7a,0x7c]
-# W64: v_cmp_ne_u16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x7a,0x7c]
 0xfd,0x04,0x7a,0x7c
+# W32: v_cmp_ne_u16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x7a,0x7c]
+# W64: v_cmp_ne_u16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x7a,0x7c]
-# W32: v_cmp_ne_u16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x7a,0x7c,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_ne_u16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x7a,0x7c,0x0b,0xfe,0x00,0x00]
 0xff,0xfe,0x7a,0x7c,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_ne_u16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x7a,0x7c,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_ne_u16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x7a,0x7c,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_ne_u32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x9a,0x7c]
-# W64: v_cmp_ne_u32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x9a,0x7c]
 0x01,0x05,0x9a,0x7c
+# W32: v_cmp_ne_u32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x9a,0x7c]
+# W64: v_cmp_ne_u32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x9a,0x7c]
-# W32: v_cmp_ne_u32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x9a,0x7c]
-# W64: v_cmp_ne_u32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x9a,0x7c]
 0xff,0x05,0x9a,0x7c
+# W32: v_cmp_ne_u32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x9a,0x7c]
+# W64: v_cmp_ne_u32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x9a,0x7c]
-# W32: v_cmp_ne_u32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x9a,0x7c]
-# W64: v_cmp_ne_u32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x9a,0x7c]
 0x01,0x04,0x9a,0x7c
+# W32: v_cmp_ne_u32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x9a,0x7c]
+# W64: v_cmp_ne_u32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x9a,0x7c]
-# W32: v_cmp_ne_u32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x9a,0x7c]
-# W64: v_cmp_ne_u32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x9a,0x7c]
 0x69,0x04,0x9a,0x7c
+# W32: v_cmp_ne_u32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x9a,0x7c]
+# W64: v_cmp_ne_u32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x9a,0x7c]
-# W32: v_cmp_ne_u32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x9a,0x7c]
-# W64: v_cmp_ne_u32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x9a,0x7c]
 0x6a,0x04,0x9a,0x7c
+# W32: v_cmp_ne_u32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x9a,0x7c]
+# W64: v_cmp_ne_u32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x9a,0x7c]
-# W32: v_cmp_ne_u32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x9a,0x7c]
-# W64: v_cmp_ne_u32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x9a,0x7c]
 0x6b,0x04,0x9a,0x7c
+# W32: v_cmp_ne_u32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x9a,0x7c]
+# W64: v_cmp_ne_u32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x9a,0x7c]
-# W32: v_cmp_ne_u32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x9a,0x7c]
-# W64: v_cmp_ne_u32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x9a,0x7c]
 0x7b,0x04,0x9a,0x7c
+# W32: v_cmp_ne_u32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x9a,0x7c]
+# W64: v_cmp_ne_u32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x9a,0x7c]
-# W32: v_cmp_ne_u32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x9a,0x7c]
-# W64: v_cmp_ne_u32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x9a,0x7c]
 0x7d,0x04,0x9a,0x7c
+# W32: v_cmp_ne_u32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x9a,0x7c]
+# W64: v_cmp_ne_u32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x9a,0x7c]
-# W32: v_cmp_ne_u32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x9a,0x7c]
-# W64: v_cmp_ne_u32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x9a,0x7c]
 0x7e,0x04,0x9a,0x7c
+# W32: v_cmp_ne_u32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x9a,0x7c]
+# W64: v_cmp_ne_u32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x9a,0x7c]
-# W32: v_cmp_ne_u32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x9a,0x7c]
-# W64: v_cmp_ne_u32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x9a,0x7c]
 0x7f,0x04,0x9a,0x7c
+# W32: v_cmp_ne_u32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x9a,0x7c]
+# W64: v_cmp_ne_u32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x9a,0x7c]
-# W32: v_cmp_ne_u32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x9a,0x7c]
-# W64: v_cmp_ne_u32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x9a,0x7c]
 0x7c,0x04,0x9a,0x7c
+# W32: v_cmp_ne_u32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x9a,0x7c]
+# W64: v_cmp_ne_u32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x9a,0x7c]
-# W32: v_cmp_ne_u32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x9a,0x7c]
-# W64: v_cmp_ne_u32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x9a,0x7c]
 0xc1,0x04,0x9a,0x7c
+# W32: v_cmp_ne_u32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x9a,0x7c]
+# W64: v_cmp_ne_u32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x9a,0x7c]
-# W32: v_cmp_ne_u32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x9a,0x7c]
-# W64: v_cmp_ne_u32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x9a,0x7c]
 0xf0,0x04,0x9a,0x7c
+# W32: v_cmp_ne_u32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x9a,0x7c]
+# W64: v_cmp_ne_u32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x9a,0x7c]
-# W32: v_cmp_ne_u32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x9a,0x7c]
-# W64: v_cmp_ne_u32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x9a,0x7c]
 0xfd,0x04,0x9a,0x7c
+# W32: v_cmp_ne_u32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x9a,0x7c]
+# W64: v_cmp_ne_u32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x9a,0x7c]
-# W32: v_cmp_ne_u32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x9b,0x7c,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_ne_u32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x9b,0x7c,0x56,0x34,0x12,0xaf]
 0xff,0xfe,0x9b,0x7c,0x56,0x34,0x12,0xaf
+# W32: v_cmp_ne_u32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x9b,0x7c,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_ne_u32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x9b,0x7c,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_ne_u64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xba,0x7c]
-# W64: v_cmp_ne_u64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xba,0x7c]
 0x01,0x05,0xba,0x7c
+# W32: v_cmp_ne_u64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xba,0x7c]
+# W64: v_cmp_ne_u64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0xba,0x7c]
-# W32: v_cmp_ne_u64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xba,0x7c]
-# W64: v_cmp_ne_u64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xba,0x7c]
 0xfe,0x05,0xba,0x7c
+# W32: v_cmp_ne_u64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xba,0x7c]
+# W64: v_cmp_ne_u64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xba,0x7c]
-# W32: v_cmp_ne_u64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xba,0x7c]
-# W64: v_cmp_ne_u64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xba,0x7c]
 0x02,0x04,0xba,0x7c
+# W32: v_cmp_ne_u64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xba,0x7c]
+# W64: v_cmp_ne_u64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0xba,0x7c]
-# W32: v_cmp_ne_u64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xba,0x7c]
-# W64: v_cmp_ne_u64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xba,0x7c]
 0x68,0x04,0xba,0x7c
+# W32: v_cmp_ne_u64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xba,0x7c]
+# W64: v_cmp_ne_u64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0xba,0x7c]
-# W32: v_cmp_ne_u64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xba,0x7c]
-# W64: v_cmp_ne_u64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xba,0x7c]
 0x6a,0x04,0xba,0x7c
+# W32: v_cmp_ne_u64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0xba,0x7c]
+# W64: v_cmp_ne_u64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0xba,0x7c]
+0x7a,0x04,0xba,0x7c
 # W32: v_cmp_ne_u64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xba,0x7c]
 # W64: v_cmp_ne_u64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xba,0x7c]
-0x7a,0x04,0xba,0x7c
-# W32: v_cmp_ne_u64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xba,0x7c]
-# W64: v_cmp_ne_u64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xba,0x7c]
 0x7e,0x04,0xba,0x7c
+# W32: v_cmp_ne_u64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0xba,0x7c]
+# W64: v_cmp_ne_u64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0xba,0x7c]
-# W32: v_cmp_ne_u64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xba,0x7c]
-# W64: v_cmp_ne_u64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xba,0x7c]
 0x7c,0x04,0xba,0x7c
+# W32: v_cmp_ne_u64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0xba,0x7c]
+# W64: v_cmp_ne_u64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0xba,0x7c]
-# W32: v_cmp_ne_u64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xba,0x7c]
-# W64: v_cmp_ne_u64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xba,0x7c]
 0xc1,0x04,0xba,0x7c
+# W32: v_cmp_ne_u64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0xba,0x7c]
+# W64: v_cmp_ne_u64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0xba,0x7c]
-# W32: v_cmp_ne_u64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xba,0x7c]
-# W64: v_cmp_ne_u64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xba,0x7c]
 0xf0,0x04,0xba,0x7c
+# W32: v_cmp_ne_u64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xba,0x7c]
+# W64: v_cmp_ne_u64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0xba,0x7c]
-# W32: v_cmp_ne_u64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xba,0x7c]
-# W64: v_cmp_ne_u64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xba,0x7c]
 0xfd,0x04,0xba,0x7c
+# W32: v_cmp_ne_u64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xba,0x7c]
+# W64: v_cmp_ne_u64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0xba,0x7c]
+0xff,0xfc,0xbb,0x7c,0x56,0x34,0x12,0xaf
 # W32: v_cmp_ne_u64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbb,0x7c,0x56,0x34,0x12,0xaf]
 # W64: v_cmp_ne_u64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbb,0x7c,0x56,0x34,0x12,0xaf]
-0xff,0xfc,0xbb,0x7c,0x56,0x34,0x12,0xaf
-# W32: v_cmp_neq_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x1a,0x7c]
-# W64: v_cmp_neq_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x1a,0x7c]
 0x01,0x05,0x1a,0x7c
+# W32: v_cmp_neq_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x1a,0x7c]
+# W64: v_cmp_neq_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x1a,0x7c]
-# W32: v_cmp_neq_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x1a,0x7c]
-# W64: v_cmp_neq_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x1a,0x7c]
 0x7f,0x05,0x1a,0x7c
+# W32: v_cmp_neq_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x1a,0x7c]
+# W64: v_cmp_neq_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x1a,0x7c]
-# W32: v_cmp_neq_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x1a,0x7c]
-# W64: v_cmp_neq_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x1a,0x7c]
 0x01,0x04,0x1a,0x7c
+# W32: v_cmp_neq_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x1a,0x7c]
+# W64: v_cmp_neq_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x1a,0x7c]
-# W32: v_cmp_neq_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x1a,0x7c]
-# W64: v_cmp_neq_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x1a,0x7c]
 0x69,0x04,0x1a,0x7c
+# W32: v_cmp_neq_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x1a,0x7c]
+# W64: v_cmp_neq_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x1a,0x7c]
-# W32: v_cmp_neq_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x1a,0x7c]
-# W64: v_cmp_neq_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x1a,0x7c]
 0x6a,0x04,0x1a,0x7c
+# W32: v_cmp_neq_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x1a,0x7c]
+# W64: v_cmp_neq_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x1a,0x7c]
-# W32: v_cmp_neq_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x1a,0x7c]
-# W64: v_cmp_neq_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x1a,0x7c]
 0x6b,0x04,0x1a,0x7c
+# W32: v_cmp_neq_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x1a,0x7c]
+# W64: v_cmp_neq_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x1a,0x7c]
-# W32: v_cmp_neq_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x1a,0x7c]
-# W64: v_cmp_neq_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x1a,0x7c]
 0x7b,0x04,0x1a,0x7c
+# W32: v_cmp_neq_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x1a,0x7c]
+# W64: v_cmp_neq_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x1a,0x7c]
-# W32: v_cmp_neq_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x1a,0x7c]
-# W64: v_cmp_neq_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x1a,0x7c]
 0x7d,0x04,0x1a,0x7c
+# W32: v_cmp_neq_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x1a,0x7c]
+# W64: v_cmp_neq_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x1a,0x7c]
-# W32: v_cmp_neq_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x1a,0x7c]
-# W64: v_cmp_neq_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x1a,0x7c]
 0x7e,0x04,0x1a,0x7c
+# W32: v_cmp_neq_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x1a,0x7c]
+# W64: v_cmp_neq_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x1a,0x7c]
-# W32: v_cmp_neq_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x1a,0x7c]
-# W64: v_cmp_neq_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x1a,0x7c]
 0x7f,0x04,0x1a,0x7c
+# W32: v_cmp_neq_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x1a,0x7c]
+# W64: v_cmp_neq_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x1a,0x7c]
-# W32: v_cmp_neq_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x1a,0x7c]
-# W64: v_cmp_neq_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x1a,0x7c]
 0x7c,0x04,0x1a,0x7c
+# W32: v_cmp_neq_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x1a,0x7c]
+# W64: v_cmp_neq_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x1a,0x7c]
-# W32: v_cmp_neq_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x1a,0x7c]
-# W64: v_cmp_neq_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x1a,0x7c]
 0xc1,0x04,0x1a,0x7c
+# W32: v_cmp_neq_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x1a,0x7c]
+# W64: v_cmp_neq_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x1a,0x7c]
-# W32: v_cmp_neq_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x1a,0x7c]
-# W64: v_cmp_neq_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x1a,0x7c]
 0xf0,0x04,0x1a,0x7c
+# W32: v_cmp_neq_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x1a,0x7c]
+# W64: v_cmp_neq_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x1a,0x7c]
-# W32: v_cmp_neq_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x1a,0x7c]
-# W64: v_cmp_neq_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x1a,0x7c]
 0xfd,0x04,0x1a,0x7c
+# W32: v_cmp_neq_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x1a,0x7c]
+# W64: v_cmp_neq_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x1a,0x7c]
-# W32: v_cmp_neq_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1a,0x7c,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_neq_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1a,0x7c,0x0b,0xfe,0x00,0x00]
 0xff,0xfe,0x1a,0x7c,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_neq_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1a,0x7c,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_neq_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1a,0x7c,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_neq_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x3a,0x7c]
-# W64: v_cmp_neq_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x3a,0x7c]
 0x01,0x05,0x3a,0x7c
+# W32: v_cmp_neq_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x3a,0x7c]
+# W64: v_cmp_neq_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x3a,0x7c]
-# W32: v_cmp_neq_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x3a,0x7c]
-# W64: v_cmp_neq_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x3a,0x7c]
 0xff,0x05,0x3a,0x7c
+# W32: v_cmp_neq_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x3a,0x7c]
+# W64: v_cmp_neq_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x3a,0x7c]
-# W32: v_cmp_neq_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x3a,0x7c]
-# W64: v_cmp_neq_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x3a,0x7c]
 0x01,0x04,0x3a,0x7c
+# W32: v_cmp_neq_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x3a,0x7c]
+# W64: v_cmp_neq_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x3a,0x7c]
-# W32: v_cmp_neq_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x3a,0x7c]
-# W64: v_cmp_neq_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x3a,0x7c]
 0x69,0x04,0x3a,0x7c
+# W32: v_cmp_neq_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x3a,0x7c]
+# W64: v_cmp_neq_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x3a,0x7c]
-# W32: v_cmp_neq_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x3a,0x7c]
-# W64: v_cmp_neq_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x3a,0x7c]
 0x6a,0x04,0x3a,0x7c
+# W32: v_cmp_neq_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x3a,0x7c]
+# W64: v_cmp_neq_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x3a,0x7c]
-# W32: v_cmp_neq_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x3a,0x7c]
-# W64: v_cmp_neq_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x3a,0x7c]
 0x6b,0x04,0x3a,0x7c
+# W32: v_cmp_neq_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x3a,0x7c]
+# W64: v_cmp_neq_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x3a,0x7c]
-# W32: v_cmp_neq_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x3a,0x7c]
-# W64: v_cmp_neq_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x3a,0x7c]
 0x7b,0x04,0x3a,0x7c
+# W32: v_cmp_neq_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x3a,0x7c]
+# W64: v_cmp_neq_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x3a,0x7c]
-# W32: v_cmp_neq_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x3a,0x7c]
-# W64: v_cmp_neq_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x3a,0x7c]
 0x7d,0x04,0x3a,0x7c
+# W32: v_cmp_neq_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x3a,0x7c]
+# W64: v_cmp_neq_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x3a,0x7c]
-# W32: v_cmp_neq_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x3a,0x7c]
-# W64: v_cmp_neq_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x3a,0x7c]
 0x7e,0x04,0x3a,0x7c
+# W32: v_cmp_neq_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x3a,0x7c]
+# W64: v_cmp_neq_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x3a,0x7c]
-# W32: v_cmp_neq_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x3a,0x7c]
-# W64: v_cmp_neq_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x3a,0x7c]
 0x7f,0x04,0x3a,0x7c
+# W32: v_cmp_neq_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x3a,0x7c]
+# W64: v_cmp_neq_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x3a,0x7c]
-# W32: v_cmp_neq_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x3a,0x7c]
-# W64: v_cmp_neq_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x3a,0x7c]
 0x7c,0x04,0x3a,0x7c
+# W32: v_cmp_neq_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x3a,0x7c]
+# W64: v_cmp_neq_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x3a,0x7c]
-# W32: v_cmp_neq_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x3a,0x7c]
-# W64: v_cmp_neq_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x3a,0x7c]
 0xc1,0x04,0x3a,0x7c
+# W32: v_cmp_neq_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x3a,0x7c]
+# W64: v_cmp_neq_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x3a,0x7c]
-# W32: v_cmp_neq_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x3a,0x7c]
-# W64: v_cmp_neq_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x3a,0x7c]
 0xf0,0x04,0x3a,0x7c
+# W32: v_cmp_neq_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x3a,0x7c]
+# W64: v_cmp_neq_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x3a,0x7c]
-# W32: v_cmp_neq_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x3a,0x7c]
-# W64: v_cmp_neq_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x3a,0x7c]
 0xfd,0x04,0x3a,0x7c
+# W32: v_cmp_neq_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x3a,0x7c]
+# W64: v_cmp_neq_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x3a,0x7c]
-# W32: v_cmp_neq_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x3b,0x7c,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_neq_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x3b,0x7c,0x56,0x34,0x12,0xaf]
 0xff,0xfe,0x3b,0x7c,0x56,0x34,0x12,0xaf
+# W32: v_cmp_neq_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x3b,0x7c,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_neq_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x3b,0x7c,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_neq_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x5a,0x7c]
-# W64: v_cmp_neq_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x5a,0x7c]
 0x01,0x05,0x5a,0x7c
+# W32: v_cmp_neq_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x5a,0x7c]
+# W64: v_cmp_neq_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x5a,0x7c]
+0xfe,0x05,0x5a,0x7c
 # W32: v_cmp_neq_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x5a,0x7c]
 # W64: v_cmp_neq_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x5a,0x7c]
-0xfe,0x05,0x5a,0x7c
-# W32: v_cmp_neq_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x5a,0x7c]
-# W64: v_cmp_neq_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x5a,0x7c]
 0x02,0x04,0x5a,0x7c
+# W32: v_cmp_neq_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x5a,0x7c]
+# W64: v_cmp_neq_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x5a,0x7c]
+0x68,0x04,0x5a,0x7c
 # W32: v_cmp_neq_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x5a,0x7c]
 # W64: v_cmp_neq_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x5a,0x7c]
-0x68,0x04,0x5a,0x7c
-# W32: v_cmp_neq_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x5a,0x7c]
-# W64: v_cmp_neq_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x5a,0x7c]
 0x6a,0x04,0x5a,0x7c
+# W32: v_cmp_neq_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x5a,0x7c]
+# W64: v_cmp_neq_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x5a,0x7c]
+0x7a,0x04,0x5a,0x7c
 # W32: v_cmp_neq_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x5a,0x7c]
 # W64: v_cmp_neq_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x5a,0x7c]
-0x7a,0x04,0x5a,0x7c
-# W32: v_cmp_neq_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x5a,0x7c]
-# W64: v_cmp_neq_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x5a,0x7c]
 0x7e,0x04,0x5a,0x7c
+# W32: v_cmp_neq_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x5a,0x7c]
+# W64: v_cmp_neq_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x5a,0x7c]
-# W32: v_cmp_neq_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x5a,0x7c]
-# W64: v_cmp_neq_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x5a,0x7c]
 0x7c,0x04,0x5a,0x7c
+# W32: v_cmp_neq_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x5a,0x7c]
+# W64: v_cmp_neq_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x5a,0x7c]
-# W32: v_cmp_neq_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x5a,0x7c]
-# W64: v_cmp_neq_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x5a,0x7c]
 0xc1,0x04,0x5a,0x7c
+# W32: v_cmp_neq_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x5a,0x7c]
+# W64: v_cmp_neq_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x5a,0x7c]
-# W32: v_cmp_neq_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x5a,0x7c]
-# W64: v_cmp_neq_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x5a,0x7c]
 0xf0,0x04,0x5a,0x7c
+# W32: v_cmp_neq_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x5a,0x7c]
+# W64: v_cmp_neq_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x5a,0x7c]
-# W32: v_cmp_neq_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x5a,0x7c]
-# W64: v_cmp_neq_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x5a,0x7c]
 0xfd,0x04,0x5a,0x7c
+# W32: v_cmp_neq_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x5a,0x7c]
+# W64: v_cmp_neq_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x5a,0x7c]
+0xff,0xfc,0x5b,0x7c,0x56,0x34,0x12,0xaf
 # W32: v_cmp_neq_f64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5b,0x7c,0x56,0x34,0x12,0xaf]
 # W64: v_cmp_neq_f64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5b,0x7c,0x56,0x34,0x12,0xaf]
-0xff,0xfc,0x5b,0x7c,0x56,0x34,0x12,0xaf
-# W32: v_cmp_nge_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x12,0x7c]
-# W64: v_cmp_nge_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x12,0x7c]
 0x01,0x05,0x12,0x7c
+# W32: v_cmp_nge_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x12,0x7c]
+# W64: v_cmp_nge_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x12,0x7c]
-# W32: v_cmp_nge_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x12,0x7c]
-# W64: v_cmp_nge_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x12,0x7c]
 0x7f,0x05,0x12,0x7c
+# W32: v_cmp_nge_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x12,0x7c]
+# W64: v_cmp_nge_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x12,0x7c]
-# W32: v_cmp_nge_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x12,0x7c]
-# W64: v_cmp_nge_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x12,0x7c]
 0x01,0x04,0x12,0x7c
+# W32: v_cmp_nge_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x12,0x7c]
+# W64: v_cmp_nge_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x12,0x7c]
-# W32: v_cmp_nge_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x12,0x7c]
-# W64: v_cmp_nge_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x12,0x7c]
 0x69,0x04,0x12,0x7c
+# W32: v_cmp_nge_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x12,0x7c]
+# W64: v_cmp_nge_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x12,0x7c]
-# W32: v_cmp_nge_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x12,0x7c]
-# W64: v_cmp_nge_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x12,0x7c]
 0x6a,0x04,0x12,0x7c
+# W32: v_cmp_nge_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x12,0x7c]
+# W64: v_cmp_nge_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x12,0x7c]
-# W32: v_cmp_nge_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x12,0x7c]
-# W64: v_cmp_nge_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x12,0x7c]
 0x6b,0x04,0x12,0x7c
+# W32: v_cmp_nge_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x12,0x7c]
+# W64: v_cmp_nge_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x12,0x7c]
-# W32: v_cmp_nge_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x12,0x7c]
-# W64: v_cmp_nge_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x12,0x7c]
 0x7b,0x04,0x12,0x7c
+# W32: v_cmp_nge_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x12,0x7c]
+# W64: v_cmp_nge_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x12,0x7c]
-# W32: v_cmp_nge_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x12,0x7c]
-# W64: v_cmp_nge_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x12,0x7c]
 0x7d,0x04,0x12,0x7c
+# W32: v_cmp_nge_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x12,0x7c]
+# W64: v_cmp_nge_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x12,0x7c]
-# W32: v_cmp_nge_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x12,0x7c]
-# W64: v_cmp_nge_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x12,0x7c]
 0x7e,0x04,0x12,0x7c
+# W32: v_cmp_nge_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x12,0x7c]
+# W64: v_cmp_nge_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x12,0x7c]
-# W32: v_cmp_nge_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x12,0x7c]
-# W64: v_cmp_nge_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x12,0x7c]
 0x7f,0x04,0x12,0x7c
+# W32: v_cmp_nge_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x12,0x7c]
+# W64: v_cmp_nge_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x12,0x7c]
-# W32: v_cmp_nge_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x12,0x7c]
-# W64: v_cmp_nge_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x12,0x7c]
 0x7c,0x04,0x12,0x7c
+# W32: v_cmp_nge_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x12,0x7c]
+# W64: v_cmp_nge_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x12,0x7c]
-# W32: v_cmp_nge_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x12,0x7c]
-# W64: v_cmp_nge_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x12,0x7c]
 0xc1,0x04,0x12,0x7c
+# W32: v_cmp_nge_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x12,0x7c]
+# W64: v_cmp_nge_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x12,0x7c]
-# W32: v_cmp_nge_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x12,0x7c]
-# W64: v_cmp_nge_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x12,0x7c]
 0xf0,0x04,0x12,0x7c
+# W32: v_cmp_nge_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x12,0x7c]
+# W64: v_cmp_nge_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x12,0x7c]
-# W32: v_cmp_nge_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x12,0x7c]
-# W64: v_cmp_nge_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x12,0x7c]
 0xfd,0x04,0x12,0x7c
+# W32: v_cmp_nge_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x12,0x7c]
+# W64: v_cmp_nge_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x12,0x7c]
-# W32: v_cmp_nge_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x12,0x7c,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_nge_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x12,0x7c,0x0b,0xfe,0x00,0x00]
 0xff,0xfe,0x12,0x7c,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_nge_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x12,0x7c,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_nge_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x12,0x7c,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_nge_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x32,0x7c]
-# W64: v_cmp_nge_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x32,0x7c]
 0x01,0x05,0x32,0x7c
+# W32: v_cmp_nge_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x32,0x7c]
+# W64: v_cmp_nge_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x32,0x7c]
-# W32: v_cmp_nge_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x32,0x7c]
-# W64: v_cmp_nge_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x32,0x7c]
 0xff,0x05,0x32,0x7c
+# W32: v_cmp_nge_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x32,0x7c]
+# W64: v_cmp_nge_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x32,0x7c]
-# W32: v_cmp_nge_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x32,0x7c]
-# W64: v_cmp_nge_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x32,0x7c]
 0x01,0x04,0x32,0x7c
+# W32: v_cmp_nge_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x32,0x7c]
+# W64: v_cmp_nge_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x32,0x7c]
-# W32: v_cmp_nge_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x32,0x7c]
-# W64: v_cmp_nge_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x32,0x7c]
 0x69,0x04,0x32,0x7c
+# W32: v_cmp_nge_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x32,0x7c]
+# W64: v_cmp_nge_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x32,0x7c]
-# W32: v_cmp_nge_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x32,0x7c]
-# W64: v_cmp_nge_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x32,0x7c]
 0x6a,0x04,0x32,0x7c
+# W32: v_cmp_nge_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x32,0x7c]
+# W64: v_cmp_nge_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x32,0x7c]
-# W32: v_cmp_nge_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x32,0x7c]
-# W64: v_cmp_nge_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x32,0x7c]
 0x6b,0x04,0x32,0x7c
+# W32: v_cmp_nge_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x32,0x7c]
+# W64: v_cmp_nge_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x32,0x7c]
-# W32: v_cmp_nge_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x32,0x7c]
-# W64: v_cmp_nge_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x32,0x7c]
 0x7b,0x04,0x32,0x7c
+# W32: v_cmp_nge_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x32,0x7c]
+# W64: v_cmp_nge_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x32,0x7c]
-# W32: v_cmp_nge_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x32,0x7c]
-# W64: v_cmp_nge_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x32,0x7c]
 0x7d,0x04,0x32,0x7c
+# W32: v_cmp_nge_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x32,0x7c]
+# W64: v_cmp_nge_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x32,0x7c]
-# W32: v_cmp_nge_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x32,0x7c]
-# W64: v_cmp_nge_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x32,0x7c]
 0x7e,0x04,0x32,0x7c
+# W32: v_cmp_nge_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x32,0x7c]
+# W64: v_cmp_nge_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x32,0x7c]
-# W32: v_cmp_nge_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x32,0x7c]
-# W64: v_cmp_nge_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x32,0x7c]
 0x7f,0x04,0x32,0x7c
+# W32: v_cmp_nge_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x32,0x7c]
+# W64: v_cmp_nge_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x32,0x7c]
-# W32: v_cmp_nge_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x32,0x7c]
-# W64: v_cmp_nge_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x32,0x7c]
 0x7c,0x04,0x32,0x7c
+# W32: v_cmp_nge_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x32,0x7c]
+# W64: v_cmp_nge_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x32,0x7c]
-# W32: v_cmp_nge_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x32,0x7c]
-# W64: v_cmp_nge_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x32,0x7c]
 0xc1,0x04,0x32,0x7c
+# W32: v_cmp_nge_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x32,0x7c]
+# W64: v_cmp_nge_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x32,0x7c]
-# W32: v_cmp_nge_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x32,0x7c]
-# W64: v_cmp_nge_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x32,0x7c]
 0xf0,0x04,0x32,0x7c
+# W32: v_cmp_nge_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x32,0x7c]
+# W64: v_cmp_nge_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x32,0x7c]
-# W32: v_cmp_nge_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x32,0x7c]
-# W64: v_cmp_nge_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x32,0x7c]
 0xfd,0x04,0x32,0x7c
+# W32: v_cmp_nge_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x32,0x7c]
+# W64: v_cmp_nge_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x32,0x7c]
-# W32: v_cmp_nge_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x33,0x7c,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_nge_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x33,0x7c,0x56,0x34,0x12,0xaf]
 0xff,0xfe,0x33,0x7c,0x56,0x34,0x12,0xaf
+# W32: v_cmp_nge_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x33,0x7c,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_nge_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x33,0x7c,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_nge_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x52,0x7c]
-# W64: v_cmp_nge_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x52,0x7c]
 0x01,0x05,0x52,0x7c
+# W32: v_cmp_nge_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x52,0x7c]
+# W64: v_cmp_nge_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x52,0x7c]
+0xfe,0x05,0x52,0x7c
 # W32: v_cmp_nge_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x52,0x7c]
 # W64: v_cmp_nge_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x52,0x7c]
-0xfe,0x05,0x52,0x7c
-# W32: v_cmp_nge_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x52,0x7c]
-# W64: v_cmp_nge_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x52,0x7c]
 0x02,0x04,0x52,0x7c
+# W32: v_cmp_nge_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x52,0x7c]
+# W64: v_cmp_nge_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x52,0x7c]
+0x68,0x04,0x52,0x7c
 # W32: v_cmp_nge_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x52,0x7c]
 # W64: v_cmp_nge_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x52,0x7c]
-0x68,0x04,0x52,0x7c
-# W32: v_cmp_nge_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x52,0x7c]
-# W64: v_cmp_nge_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x52,0x7c]
 0x6a,0x04,0x52,0x7c
+# W32: v_cmp_nge_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x52,0x7c]
+# W64: v_cmp_nge_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x52,0x7c]
+0x7a,0x04,0x52,0x7c
 # W32: v_cmp_nge_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x52,0x7c]
 # W64: v_cmp_nge_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x52,0x7c]
-0x7a,0x04,0x52,0x7c
-# W32: v_cmp_nge_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x52,0x7c]
-# W64: v_cmp_nge_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x52,0x7c]
 0x7e,0x04,0x52,0x7c
+# W32: v_cmp_nge_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x52,0x7c]
+# W64: v_cmp_nge_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x52,0x7c]
-# W32: v_cmp_nge_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x52,0x7c]
-# W64: v_cmp_nge_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x52,0x7c]
 0x7c,0x04,0x52,0x7c
+# W32: v_cmp_nge_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x52,0x7c]
+# W64: v_cmp_nge_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x52,0x7c]
-# W32: v_cmp_nge_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x52,0x7c]
-# W64: v_cmp_nge_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x52,0x7c]
 0xc1,0x04,0x52,0x7c
+# W32: v_cmp_nge_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x52,0x7c]
+# W64: v_cmp_nge_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x52,0x7c]
-# W32: v_cmp_nge_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x52,0x7c]
-# W64: v_cmp_nge_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x52,0x7c]
 0xf0,0x04,0x52,0x7c
+# W32: v_cmp_nge_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x52,0x7c]
+# W64: v_cmp_nge_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x52,0x7c]
-# W32: v_cmp_nge_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x52,0x7c]
-# W64: v_cmp_nge_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x52,0x7c]
 0xfd,0x04,0x52,0x7c
+# W32: v_cmp_nge_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x52,0x7c]
+# W64: v_cmp_nge_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x52,0x7c]
+0xff,0xfc,0x53,0x7c,0x56,0x34,0x12,0xaf
 # W32: v_cmp_nge_f64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x53,0x7c,0x56,0x34,0x12,0xaf]
 # W64: v_cmp_nge_f64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x53,0x7c,0x56,0x34,0x12,0xaf]
-0xff,0xfc,0x53,0x7c,0x56,0x34,0x12,0xaf
-# W32: v_cmp_ngt_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x16,0x7c]
-# W64: v_cmp_ngt_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x16,0x7c]
 0x01,0x05,0x16,0x7c
+# W32: v_cmp_ngt_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x16,0x7c]
+# W64: v_cmp_ngt_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x16,0x7c]
-# W32: v_cmp_ngt_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x16,0x7c]
-# W64: v_cmp_ngt_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x16,0x7c]
 0x7f,0x05,0x16,0x7c
+# W32: v_cmp_ngt_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x16,0x7c]
+# W64: v_cmp_ngt_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x16,0x7c]
-# W32: v_cmp_ngt_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x16,0x7c]
-# W64: v_cmp_ngt_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x16,0x7c]
 0x01,0x04,0x16,0x7c
+# W32: v_cmp_ngt_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x16,0x7c]
+# W64: v_cmp_ngt_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x16,0x7c]
-# W32: v_cmp_ngt_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x16,0x7c]
-# W64: v_cmp_ngt_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x16,0x7c]
 0x69,0x04,0x16,0x7c
+# W32: v_cmp_ngt_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x16,0x7c]
+# W64: v_cmp_ngt_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x16,0x7c]
-# W32: v_cmp_ngt_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x16,0x7c]
-# W64: v_cmp_ngt_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x16,0x7c]
 0x6a,0x04,0x16,0x7c
+# W32: v_cmp_ngt_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x16,0x7c]
+# W64: v_cmp_ngt_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x16,0x7c]
-# W32: v_cmp_ngt_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x16,0x7c]
-# W64: v_cmp_ngt_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x16,0x7c]
 0x6b,0x04,0x16,0x7c
+# W32: v_cmp_ngt_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x16,0x7c]
+# W64: v_cmp_ngt_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x16,0x7c]
-# W32: v_cmp_ngt_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x16,0x7c]
-# W64: v_cmp_ngt_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x16,0x7c]
 0x7b,0x04,0x16,0x7c
+# W32: v_cmp_ngt_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x16,0x7c]
+# W64: v_cmp_ngt_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x16,0x7c]
-# W32: v_cmp_ngt_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x16,0x7c]
-# W64: v_cmp_ngt_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x16,0x7c]
 0x7d,0x04,0x16,0x7c
+# W32: v_cmp_ngt_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x16,0x7c]
+# W64: v_cmp_ngt_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x16,0x7c]
-# W32: v_cmp_ngt_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x16,0x7c]
-# W64: v_cmp_ngt_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x16,0x7c]
 0x7e,0x04,0x16,0x7c
+# W32: v_cmp_ngt_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x16,0x7c]
+# W64: v_cmp_ngt_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x16,0x7c]
-# W32: v_cmp_ngt_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x16,0x7c]
-# W64: v_cmp_ngt_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x16,0x7c]
 0x7f,0x04,0x16,0x7c
+# W32: v_cmp_ngt_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x16,0x7c]
+# W64: v_cmp_ngt_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x16,0x7c]
-# W32: v_cmp_ngt_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x16,0x7c]
-# W64: v_cmp_ngt_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x16,0x7c]
 0x7c,0x04,0x16,0x7c
+# W32: v_cmp_ngt_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x16,0x7c]
+# W64: v_cmp_ngt_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x16,0x7c]
-# W32: v_cmp_ngt_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x16,0x7c]
-# W64: v_cmp_ngt_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x16,0x7c]
 0xc1,0x04,0x16,0x7c
+# W32: v_cmp_ngt_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x16,0x7c]
+# W64: v_cmp_ngt_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x16,0x7c]
-# W32: v_cmp_ngt_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x16,0x7c]
-# W64: v_cmp_ngt_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x16,0x7c]
 0xf0,0x04,0x16,0x7c
+# W32: v_cmp_ngt_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x16,0x7c]
+# W64: v_cmp_ngt_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x16,0x7c]
-# W32: v_cmp_ngt_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x16,0x7c]
-# W64: v_cmp_ngt_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x16,0x7c]
 0xfd,0x04,0x16,0x7c
+# W32: v_cmp_ngt_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x16,0x7c]
+# W64: v_cmp_ngt_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x16,0x7c]
-# W32: v_cmp_ngt_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x16,0x7c,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_ngt_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x16,0x7c,0x0b,0xfe,0x00,0x00]
 0xff,0xfe,0x16,0x7c,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_ngt_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x16,0x7c,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_ngt_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x16,0x7c,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_ngt_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x36,0x7c]
-# W64: v_cmp_ngt_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x36,0x7c]
 0x01,0x05,0x36,0x7c
+# W32: v_cmp_ngt_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x36,0x7c]
+# W64: v_cmp_ngt_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x36,0x7c]
-# W32: v_cmp_ngt_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x36,0x7c]
-# W64: v_cmp_ngt_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x36,0x7c]
 0xff,0x05,0x36,0x7c
+# W32: v_cmp_ngt_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x36,0x7c]
+# W64: v_cmp_ngt_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x36,0x7c]
-# W32: v_cmp_ngt_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x36,0x7c]
-# W64: v_cmp_ngt_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x36,0x7c]
 0x01,0x04,0x36,0x7c
+# W32: v_cmp_ngt_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x36,0x7c]
+# W64: v_cmp_ngt_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x36,0x7c]
-# W32: v_cmp_ngt_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x36,0x7c]
-# W64: v_cmp_ngt_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x36,0x7c]
 0x69,0x04,0x36,0x7c
+# W32: v_cmp_ngt_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x36,0x7c]
+# W64: v_cmp_ngt_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x36,0x7c]
-# W32: v_cmp_ngt_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x36,0x7c]
-# W64: v_cmp_ngt_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x36,0x7c]
 0x6a,0x04,0x36,0x7c
+# W32: v_cmp_ngt_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x36,0x7c]
+# W64: v_cmp_ngt_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x36,0x7c]
-# W32: v_cmp_ngt_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x36,0x7c]
-# W64: v_cmp_ngt_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x36,0x7c]
 0x6b,0x04,0x36,0x7c
+# W32: v_cmp_ngt_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x36,0x7c]
+# W64: v_cmp_ngt_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x36,0x7c]
-# W32: v_cmp_ngt_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x36,0x7c]
-# W64: v_cmp_ngt_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x36,0x7c]
 0x7b,0x04,0x36,0x7c
+# W32: v_cmp_ngt_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x36,0x7c]
+# W64: v_cmp_ngt_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x36,0x7c]
-# W32: v_cmp_ngt_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x36,0x7c]
-# W64: v_cmp_ngt_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x36,0x7c]
 0x7d,0x04,0x36,0x7c
+# W32: v_cmp_ngt_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x36,0x7c]
+# W64: v_cmp_ngt_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x36,0x7c]
-# W32: v_cmp_ngt_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x36,0x7c]
-# W64: v_cmp_ngt_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x36,0x7c]
 0x7e,0x04,0x36,0x7c
+# W32: v_cmp_ngt_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x36,0x7c]
+# W64: v_cmp_ngt_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x36,0x7c]
-# W32: v_cmp_ngt_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x36,0x7c]
-# W64: v_cmp_ngt_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x36,0x7c]
 0x7f,0x04,0x36,0x7c
+# W32: v_cmp_ngt_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x36,0x7c]
+# W64: v_cmp_ngt_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x36,0x7c]
-# W32: v_cmp_ngt_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x36,0x7c]
-# W64: v_cmp_ngt_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x36,0x7c]
 0x7c,0x04,0x36,0x7c
+# W32: v_cmp_ngt_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x36,0x7c]
+# W64: v_cmp_ngt_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x36,0x7c]
-# W32: v_cmp_ngt_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x36,0x7c]
-# W64: v_cmp_ngt_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x36,0x7c]
 0xc1,0x04,0x36,0x7c
+# W32: v_cmp_ngt_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x36,0x7c]
+# W64: v_cmp_ngt_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x36,0x7c]
-# W32: v_cmp_ngt_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x36,0x7c]
-# W64: v_cmp_ngt_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x36,0x7c]
 0xf0,0x04,0x36,0x7c
+# W32: v_cmp_ngt_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x36,0x7c]
+# W64: v_cmp_ngt_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x36,0x7c]
-# W32: v_cmp_ngt_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x36,0x7c]
-# W64: v_cmp_ngt_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x36,0x7c]
 0xfd,0x04,0x36,0x7c
+# W32: v_cmp_ngt_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x36,0x7c]
+# W64: v_cmp_ngt_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x36,0x7c]
-# W32: v_cmp_ngt_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x37,0x7c,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_ngt_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x37,0x7c,0x56,0x34,0x12,0xaf]
 0xff,0xfe,0x37,0x7c,0x56,0x34,0x12,0xaf
+# W32: v_cmp_ngt_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x37,0x7c,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_ngt_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x37,0x7c,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_ngt_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x56,0x7c]
-# W64: v_cmp_ngt_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x56,0x7c]
 0x01,0x05,0x56,0x7c
+# W32: v_cmp_ngt_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x56,0x7c]
+# W64: v_cmp_ngt_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x56,0x7c]
+0xfe,0x05,0x56,0x7c
 # W32: v_cmp_ngt_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x56,0x7c]
 # W64: v_cmp_ngt_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x56,0x7c]
-0xfe,0x05,0x56,0x7c
-# W32: v_cmp_ngt_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x56,0x7c]
-# W64: v_cmp_ngt_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x56,0x7c]
 0x02,0x04,0x56,0x7c
+# W32: v_cmp_ngt_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x56,0x7c]
+# W64: v_cmp_ngt_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x56,0x7c]
+0x68,0x04,0x56,0x7c
 # W32: v_cmp_ngt_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x56,0x7c]
 # W64: v_cmp_ngt_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x56,0x7c]
-0x68,0x04,0x56,0x7c
-# W32: v_cmp_ngt_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x56,0x7c]
-# W64: v_cmp_ngt_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x56,0x7c]
 0x6a,0x04,0x56,0x7c
+# W32: v_cmp_ngt_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x56,0x7c]
+# W64: v_cmp_ngt_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x56,0x7c]
+0x7a,0x04,0x56,0x7c
 # W32: v_cmp_ngt_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x56,0x7c]
 # W64: v_cmp_ngt_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x56,0x7c]
-0x7a,0x04,0x56,0x7c
-# W32: v_cmp_ngt_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x56,0x7c]
-# W64: v_cmp_ngt_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x56,0x7c]
 0x7e,0x04,0x56,0x7c
+# W32: v_cmp_ngt_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x56,0x7c]
+# W64: v_cmp_ngt_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x56,0x7c]
-# W32: v_cmp_ngt_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x56,0x7c]
-# W64: v_cmp_ngt_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x56,0x7c]
 0x7c,0x04,0x56,0x7c
+# W32: v_cmp_ngt_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x56,0x7c]
+# W64: v_cmp_ngt_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x56,0x7c]
-# W32: v_cmp_ngt_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x56,0x7c]
-# W64: v_cmp_ngt_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x56,0x7c]
 0xc1,0x04,0x56,0x7c
+# W32: v_cmp_ngt_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x56,0x7c]
+# W64: v_cmp_ngt_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x56,0x7c]
-# W32: v_cmp_ngt_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x56,0x7c]
-# W64: v_cmp_ngt_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x56,0x7c]
 0xf0,0x04,0x56,0x7c
+# W32: v_cmp_ngt_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x56,0x7c]
+# W64: v_cmp_ngt_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x56,0x7c]
-# W32: v_cmp_ngt_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x56,0x7c]
-# W64: v_cmp_ngt_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x56,0x7c]
 0xfd,0x04,0x56,0x7c
+# W32: v_cmp_ngt_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x56,0x7c]
+# W64: v_cmp_ngt_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x56,0x7c]
+0xff,0xfc,0x57,0x7c,0x56,0x34,0x12,0xaf
 # W32: v_cmp_ngt_f64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x57,0x7c,0x56,0x34,0x12,0xaf]
 # W64: v_cmp_ngt_f64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x57,0x7c,0x56,0x34,0x12,0xaf]
-0xff,0xfc,0x57,0x7c,0x56,0x34,0x12,0xaf
-# W32: v_cmp_nle_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x18,0x7c]
-# W64: v_cmp_nle_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x18,0x7c]
 0x01,0x05,0x18,0x7c
+# W32: v_cmp_nle_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x18,0x7c]
+# W64: v_cmp_nle_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x18,0x7c]
-# W32: v_cmp_nle_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x18,0x7c]
-# W64: v_cmp_nle_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x18,0x7c]
 0x7f,0x05,0x18,0x7c
+# W32: v_cmp_nle_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x18,0x7c]
+# W64: v_cmp_nle_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x18,0x7c]
-# W32: v_cmp_nle_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x18,0x7c]
-# W64: v_cmp_nle_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x18,0x7c]
 0x01,0x04,0x18,0x7c
+# W32: v_cmp_nle_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x18,0x7c]
+# W64: v_cmp_nle_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x18,0x7c]
-# W32: v_cmp_nle_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x18,0x7c]
-# W64: v_cmp_nle_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x18,0x7c]
 0x69,0x04,0x18,0x7c
+# W32: v_cmp_nle_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x18,0x7c]
+# W64: v_cmp_nle_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x18,0x7c]
-# W32: v_cmp_nle_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x18,0x7c]
-# W64: v_cmp_nle_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x18,0x7c]
 0x6a,0x04,0x18,0x7c
+# W32: v_cmp_nle_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x18,0x7c]
+# W64: v_cmp_nle_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x18,0x7c]
-# W32: v_cmp_nle_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x18,0x7c]
-# W64: v_cmp_nle_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x18,0x7c]
 0x6b,0x04,0x18,0x7c
+# W32: v_cmp_nle_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x18,0x7c]
+# W64: v_cmp_nle_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x18,0x7c]
-# W32: v_cmp_nle_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x18,0x7c]
-# W64: v_cmp_nle_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x18,0x7c]
 0x7b,0x04,0x18,0x7c
+# W32: v_cmp_nle_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x18,0x7c]
+# W64: v_cmp_nle_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x18,0x7c]
-# W32: v_cmp_nle_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x18,0x7c]
-# W64: v_cmp_nle_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x18,0x7c]
 0x7d,0x04,0x18,0x7c
+# W32: v_cmp_nle_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x18,0x7c]
+# W64: v_cmp_nle_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x18,0x7c]
-# W32: v_cmp_nle_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x18,0x7c]
-# W64: v_cmp_nle_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x18,0x7c]
 0x7e,0x04,0x18,0x7c
+# W32: v_cmp_nle_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x18,0x7c]
+# W64: v_cmp_nle_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x18,0x7c]
-# W32: v_cmp_nle_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x18,0x7c]
-# W64: v_cmp_nle_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x18,0x7c]
 0x7f,0x04,0x18,0x7c
+# W32: v_cmp_nle_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x18,0x7c]
+# W64: v_cmp_nle_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x18,0x7c]
-# W32: v_cmp_nle_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x18,0x7c]
-# W64: v_cmp_nle_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x18,0x7c]
 0x7c,0x04,0x18,0x7c
+# W32: v_cmp_nle_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x18,0x7c]
+# W64: v_cmp_nle_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x18,0x7c]
-# W32: v_cmp_nle_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x18,0x7c]
-# W64: v_cmp_nle_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x18,0x7c]
 0xc1,0x04,0x18,0x7c
+# W32: v_cmp_nle_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x18,0x7c]
+# W64: v_cmp_nle_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x18,0x7c]
-# W32: v_cmp_nle_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x18,0x7c]
-# W64: v_cmp_nle_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x18,0x7c]
 0xf0,0x04,0x18,0x7c
+# W32: v_cmp_nle_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x18,0x7c]
+# W64: v_cmp_nle_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x18,0x7c]
-# W32: v_cmp_nle_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x18,0x7c]
-# W64: v_cmp_nle_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x18,0x7c]
 0xfd,0x04,0x18,0x7c
+# W32: v_cmp_nle_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x18,0x7c]
+# W64: v_cmp_nle_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x18,0x7c]
-# W32: v_cmp_nle_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x18,0x7c,0x0b,0xfe,0x00,0x00]
-# W64: v_cmp_nle_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x18,0x7c,0x0b,0xfe,0x00,0x00]
 0xff,0xfe,0x18,0x7c,0x0b,0xfe,0x00,0x00
+# W32: v_cmp_nle_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x18,0x7c,0x0b,0xfe,0x00,0x00]
+# W64: v_cmp_nle_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x18,0x7c,0x0b,0xfe,0x00,0x00]
-# W32: v_cmp_nle_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x38,0x7c]
-# W64: v_cmp_nle_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x38,0x7c]
 0x01,0x05,0x38,0x7c
+# W32: v_cmp_nle_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x38,0x7c]
+# W64: v_cmp_nle_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x38,0x7c]
-# W32: v_cmp_nle_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x38,0x7c]
-# W64: v_cmp_nle_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x38,0x7c]
 0xff,0x05,0x38,0x7c
+# W32: v_cmp_nle_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x38,0x7c]
+# W64: v_cmp_nle_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x38,0x7c]
-# W32: v_cmp_nle_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x38,0x7c]
-# W64: v_cmp_nle_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x38,0x7c]
 0x01,0x04,0x38,0x7c
+# W32: v_cmp_nle_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x38,0x7c]
+# W64: v_cmp_nle_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x38,0x7c]
-# W32: v_cmp_nle_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x38,0x7c]
-# W64: v_cmp_nle_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x38,0x7c]
 0x69,0x04,0x38,0x7c
+# W32: v_cmp_nle_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x38,0x7c]
+# W64: v_cmp_nle_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x38,0x7c]
-# W32: v_cmp_nle_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x38,0x7c]
-# W64: v_cmp_nle_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x38,0x7c]
 0x6a,0x04,0x38,0x7c
+# W32: v_cmp_nle_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x38,0x7c]
+# W64: v_cmp_nle_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x38,0x7c]
-# W32: v_cmp_nle_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x38,0x7c]
-# W64: v_cmp_nle_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x38,0x7c]
 0x6b,0x04,0x38,0x7c
+# W32: v_cmp_nle_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x38,0x7c]
+# W64: v_cmp_nle_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x38,0x7c]
-# W32: v_cmp_nle_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x38,0x7c]
-# W64: v_cmp_nle_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x38,0x7c]
 0x7b,0x04,0x38,0x7c
+# W32: v_cmp_nle_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x38,0x7c]
+# W64: v_cmp_nle_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x38,0x7c]
-# W32: v_cmp_nle_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x38,0x7c]
-# W64: v_cmp_nle_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x38,0x7c]
 0x7d,0x04,0x38,0x7c
+# W32: v_cmp_nle_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x38,0x7c]
+# W64: v_cmp_nle_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x38,0x7c]
-# W32: v_cmp_nle_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x38,0x7c]
-# W64: v_cmp_nle_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x38,0x7c]
 0x7e,0x04,0x38,0x7c
+# W32: v_cmp_nle_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x38,0x7c]
+# W64: v_cmp_nle_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x38,0x7c]
-# W32: v_cmp_nle_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x38,0x7c]
-# W64: v_cmp_nle_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x38,0x7c]
 0x7f,0x04,0x38,0x7c
+# W32: v_cmp_nle_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x38,0x7c]
+# W64: v_cmp_nle_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x38,0x7c]
-# W32: v_cmp_nle_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x38,0x7c]
-# W64: v_cmp_nle_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x38,0x7c]
 0x7c,0x04,0x38,0x7c
+# W32: v_cmp_nle_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x38,0x7c]
+# W64: v_cmp_nle_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x38,0x7c]
-# W32: v_cmp_nle_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x38,0x7c]
-# W64: v_cmp_nle_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x38,0x7c]
 0xc1,0x04,0x38,0x7c
+# W32: v_cmp_nle_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x38,0x7c]
+# W64: v_cmp_nle_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x38,0x7c]
-# W32: v_cmp_nle_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x38,0x7c]
-# W64: v_cmp_nle_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x38,0x7c]
 0xf0,0x04,0x38,0x7c
+# W32: v_cmp_nle_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x38,0x7c]
+# W64: v_cmp_nle_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x38,0x7c]
-# W32: v_cmp_nle_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x38,0x7c]
-# W64: v_cmp_nle_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x38,0x7c]
 0xfd,0x04,0x38,0x7c
+# W32: v_cmp_nle_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x38,0x7c]
+# W64: v_cmp_nle_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x38,0x7c]
-# W32: v_cmp_nle_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x39,0x7c,0x56,0x34,0x12,0xaf]
-# W64: v_cmp_nle_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x39,0x7c,0x56,0x34,0x12,0xaf]
 0xff,0xfe,0x39,0x7c,0x56,0x34,0x12,0xaf
+# W32: v_cmp_nle_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x39,0x7c,0x56,0x34,0x12,0xaf]
+# W64: v_cmp_nle_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x39,0x7c,0x56,0x34,0x12,0xaf]
-# W32: v_cmp_nle_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x58,0x7c]
-# W64: v_cmp_nle_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x58,0x7c]
 0x01,0x05,0x58,0x7c
+# W32: v_cmp_nle_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x58,0x7c]
+# W64: v_cmp_nle_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x58,0x7c]
+0xfe,0x05,0x58,0x7c
 # W32: v_cmp_nle_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x58,0x7c]
 # W64: v_cmp_nle_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x58,0x7c]
-0xfe,0x05,0x58,0x7c
-# W32: v_cmp_nle_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x58,0x7c]
-# W64: v_cmp_nle_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x58,0x7c]
 0x02,0x04,0x58,0x7c
+# W32: v_cmp_nle_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x58,0x7c]
+# W64: v_cmp_nle_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x58,0x7c]
+0x68,0x04,0x58,0x7c
 # W32: v_cmp_nle_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x58,0x7c]
 # W64: v_cmp_nle_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x58,0x7c]
-0x68,0x04,0x58,0x7c
-# W32: v_cmp_nle_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x58,0x7c]
-# W64: v_cmp_nle_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x58,0x7c]
 0x6a,0x04,0x58,0x7c
+# W32: v_cmp_nle_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x58,0x7c]
+# W64: v_cmp_nle_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x58,0x7c]
+0x7a,0x04,0x58,0x7c
 # W32: v_cmp_nle_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x58,0x7c]
 # W64: v_cmp_nle_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x58,0x7c]
-0x7a,0x04,0x58,0x7c
-# W32: v_cmp_nle_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x58,0x7c]
-# W64: v_cmp_nle_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x58,0x7c]
 0x7e,0x04,0x58,0x7c
+# W32: v_cmp_nle_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x58,0x7c]
+# W64: v_cmp_nle_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x58,0x7c]
-# W32: v_cmp_nle_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x58,0x7c]
-# W64: v_cmp_nle_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x58,0x7c]
 0x7c,0x04,0x58,0x7c
+# W32: v_cmp_nle_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x58,0x7c]
+# W64: v_cmp_nle_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x58,0x7c]
-# W32: v_cmp_nle_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x58,0x7c]
-# W64: v_cmp_nle_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x58,0x7c]
 0xc1,0x04,0x58,0x7c
+# W32: v_cmp_nle_f64_e32 vcc_lo, -1, v[2:3] ; encoding:
[0xc1,0x04,0x58,0x7c] +# W64: v_cmp_nle_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x58,0x7c] -# W32: v_cmp_nle_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x58,0x7c] -# W64: v_cmp_nle_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x58,0x7c] 0xf0,0x04,0x58,0x7c +# W32: v_cmp_nle_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x58,0x7c] +# W64: v_cmp_nle_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x58,0x7c] -# W32: v_cmp_nle_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x58,0x7c] -# W64: v_cmp_nle_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x58,0x7c] 0xfd,0x04,0x58,0x7c +# W32: v_cmp_nle_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x58,0x7c] +# W64: v_cmp_nle_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x58,0x7c] +0xff,0xfc,0x59,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_nle_f64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x59,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_nle_f64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x59,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0x59,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_nlg_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x14,0x7c] -# W64: v_cmp_nlg_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x14,0x7c] 0x01,0x05,0x14,0x7c +# W32: v_cmp_nlg_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x14,0x7c] +# W64: v_cmp_nlg_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x14,0x7c] -# W32: v_cmp_nlg_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x14,0x7c] -# W64: v_cmp_nlg_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x14,0x7c] 0x7f,0x05,0x14,0x7c +# W32: v_cmp_nlg_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x14,0x7c] +# W64: v_cmp_nlg_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x14,0x7c] -# W32: v_cmp_nlg_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x14,0x7c] -# W64: v_cmp_nlg_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x14,0x7c] 0x01,0x04,0x14,0x7c +# W32: v_cmp_nlg_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x14,0x7c] +# W64: v_cmp_nlg_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x14,0x7c] -# W32: v_cmp_nlg_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x14,0x7c] -# W64: v_cmp_nlg_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x14,0x7c] 0x69,0x04,0x14,0x7c +# W32: v_cmp_nlg_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x14,0x7c] +# W64: v_cmp_nlg_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x14,0x7c] -# W32: v_cmp_nlg_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x14,0x7c] -# W64: v_cmp_nlg_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x14,0x7c] 0x6a,0x04,0x14,0x7c +# W32: v_cmp_nlg_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x14,0x7c] +# W64: v_cmp_nlg_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x14,0x7c] -# W32: v_cmp_nlg_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x14,0x7c] -# W64: v_cmp_nlg_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x14,0x7c] 0x6b,0x04,0x14,0x7c +# W32: v_cmp_nlg_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x14,0x7c] +# W64: v_cmp_nlg_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x14,0x7c] -# W32: v_cmp_nlg_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x14,0x7c] -# W64: v_cmp_nlg_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x14,0x7c] 0x7b,0x04,0x14,0x7c +# W32: v_cmp_nlg_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x14,0x7c] +# W64: v_cmp_nlg_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x14,0x7c] -# W32: v_cmp_nlg_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x14,0x7c] -# W64: v_cmp_nlg_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x14,0x7c] 0x7d,0x04,0x14,0x7c +# W32: v_cmp_nlg_f16_e32 vcc_lo, m0, v2 ; 
encoding: [0x7d,0x04,0x14,0x7c] +# W64: v_cmp_nlg_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x14,0x7c] -# W32: v_cmp_nlg_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x14,0x7c] -# W64: v_cmp_nlg_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x14,0x7c] 0x7e,0x04,0x14,0x7c +# W32: v_cmp_nlg_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x14,0x7c] +# W64: v_cmp_nlg_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x14,0x7c] -# W32: v_cmp_nlg_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x14,0x7c] -# W64: v_cmp_nlg_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x14,0x7c] 0x7f,0x04,0x14,0x7c +# W32: v_cmp_nlg_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x14,0x7c] +# W64: v_cmp_nlg_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x14,0x7c] -# W32: v_cmp_nlg_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x14,0x7c] -# W64: v_cmp_nlg_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x14,0x7c] 0x7c,0x04,0x14,0x7c +# W32: v_cmp_nlg_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x14,0x7c] +# W64: v_cmp_nlg_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x14,0x7c] -# W32: v_cmp_nlg_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x14,0x7c] -# W64: v_cmp_nlg_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x14,0x7c] 0xc1,0x04,0x14,0x7c +# W32: v_cmp_nlg_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x14,0x7c] +# W64: v_cmp_nlg_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x14,0x7c] -# W32: v_cmp_nlg_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x14,0x7c] -# W64: v_cmp_nlg_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x14,0x7c] 0xf0,0x04,0x14,0x7c +# W32: v_cmp_nlg_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x14,0x7c] +# W64: v_cmp_nlg_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x14,0x7c] -# W32: v_cmp_nlg_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x14,0x7c] -# W64: v_cmp_nlg_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x14,0x7c] 0xfd,0x04,0x14,0x7c +# W32: v_cmp_nlg_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x14,0x7c] +# W64: v_cmp_nlg_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x14,0x7c] -# W32: v_cmp_nlg_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x14,0x7c,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_nlg_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x14,0x7c,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x14,0x7c,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_nlg_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x14,0x7c,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_nlg_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x14,0x7c,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_nlg_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x34,0x7c] -# W64: v_cmp_nlg_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x34,0x7c] 0x01,0x05,0x34,0x7c +# W32: v_cmp_nlg_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x34,0x7c] +# W64: v_cmp_nlg_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x34,0x7c] -# W32: v_cmp_nlg_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x34,0x7c] -# W64: v_cmp_nlg_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x34,0x7c] 0xff,0x05,0x34,0x7c +# W32: v_cmp_nlg_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x34,0x7c] +# W64: v_cmp_nlg_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x34,0x7c] -# W32: v_cmp_nlg_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x34,0x7c] -# W64: v_cmp_nlg_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x34,0x7c] 0x01,0x04,0x34,0x7c +# W32: v_cmp_nlg_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x34,0x7c] +# W64: v_cmp_nlg_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x34,0x7c] -# W32: v_cmp_nlg_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x34,0x7c] -# W64: v_cmp_nlg_f32_e32 
vcc, s105, v2 ; encoding: [0x69,0x04,0x34,0x7c] 0x69,0x04,0x34,0x7c +# W32: v_cmp_nlg_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x34,0x7c] +# W64: v_cmp_nlg_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x34,0x7c] -# W32: v_cmp_nlg_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x34,0x7c] -# W64: v_cmp_nlg_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x34,0x7c] 0x6a,0x04,0x34,0x7c +# W32: v_cmp_nlg_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x34,0x7c] +# W64: v_cmp_nlg_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x34,0x7c] -# W32: v_cmp_nlg_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x34,0x7c] -# W64: v_cmp_nlg_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x34,0x7c] 0x6b,0x04,0x34,0x7c +# W32: v_cmp_nlg_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x34,0x7c] +# W64: v_cmp_nlg_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x34,0x7c] -# W32: v_cmp_nlg_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x34,0x7c] -# W64: v_cmp_nlg_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x34,0x7c] 0x7b,0x04,0x34,0x7c +# W32: v_cmp_nlg_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x34,0x7c] +# W64: v_cmp_nlg_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x34,0x7c] -# W32: v_cmp_nlg_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x34,0x7c] -# W64: v_cmp_nlg_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x34,0x7c] 0x7d,0x04,0x34,0x7c +# W32: v_cmp_nlg_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x34,0x7c] +# W64: v_cmp_nlg_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x34,0x7c] -# W32: v_cmp_nlg_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x34,0x7c] -# W64: v_cmp_nlg_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x34,0x7c] 0x7e,0x04,0x34,0x7c +# W32: v_cmp_nlg_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x34,0x7c] +# W64: v_cmp_nlg_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x34,0x7c] -# W32: v_cmp_nlg_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x34,0x7c] -# W64: v_cmp_nlg_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x34,0x7c] 0x7f,0x04,0x34,0x7c +# W32: v_cmp_nlg_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x34,0x7c] +# W64: v_cmp_nlg_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x34,0x7c] -# W32: v_cmp_nlg_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x34,0x7c] -# W64: v_cmp_nlg_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x34,0x7c] 0x7c,0x04,0x34,0x7c +# W32: v_cmp_nlg_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x34,0x7c] +# W64: v_cmp_nlg_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x34,0x7c] -# W32: v_cmp_nlg_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x34,0x7c] -# W64: v_cmp_nlg_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x34,0x7c] 0xc1,0x04,0x34,0x7c +# W32: v_cmp_nlg_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x34,0x7c] +# W64: v_cmp_nlg_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x34,0x7c] -# W32: v_cmp_nlg_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x34,0x7c] -# W64: v_cmp_nlg_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x34,0x7c] 0xf0,0x04,0x34,0x7c +# W32: v_cmp_nlg_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x34,0x7c] +# W64: v_cmp_nlg_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x34,0x7c] -# W32: v_cmp_nlg_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x34,0x7c] -# W64: v_cmp_nlg_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x34,0x7c] 0xfd,0x04,0x34,0x7c +# W32: v_cmp_nlg_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x34,0x7c] +# W64: v_cmp_nlg_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x34,0x7c] -# W32: v_cmp_nlg_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: 
[0xff,0xfe,0x35,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_nlg_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x35,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x35,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_nlg_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x35,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_nlg_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x35,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_nlg_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x54,0x7c] -# W64: v_cmp_nlg_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x54,0x7c] 0x01,0x05,0x54,0x7c +# W32: v_cmp_nlg_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x54,0x7c] +# W64: v_cmp_nlg_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x54,0x7c] +0xfe,0x05,0x54,0x7c # W32: v_cmp_nlg_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x54,0x7c] # W64: v_cmp_nlg_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x54,0x7c] -0xfe,0x05,0x54,0x7c -# W32: v_cmp_nlg_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x54,0x7c] -# W64: v_cmp_nlg_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x54,0x7c] 0x02,0x04,0x54,0x7c +# W32: v_cmp_nlg_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x54,0x7c] +# W64: v_cmp_nlg_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x54,0x7c] +0x68,0x04,0x54,0x7c # W32: v_cmp_nlg_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x54,0x7c] # W64: v_cmp_nlg_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x54,0x7c] -0x68,0x04,0x54,0x7c -# W32: v_cmp_nlg_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x54,0x7c] -# W64: v_cmp_nlg_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x54,0x7c] 0x6a,0x04,0x54,0x7c +# W32: v_cmp_nlg_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x54,0x7c] +# W64: v_cmp_nlg_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x54,0x7c] +0x7a,0x04,0x54,0x7c # W32: v_cmp_nlg_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x54,0x7c] # W64: v_cmp_nlg_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x54,0x7c] -0x7a,0x04,0x54,0x7c -# W32: v_cmp_nlg_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x54,0x7c] -# W64: v_cmp_nlg_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x54,0x7c] 0x7e,0x04,0x54,0x7c +# W32: v_cmp_nlg_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x54,0x7c] +# W64: v_cmp_nlg_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x54,0x7c] -# W32: v_cmp_nlg_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x54,0x7c] -# W64: v_cmp_nlg_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x54,0x7c] 0x7c,0x04,0x54,0x7c +# W32: v_cmp_nlg_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x54,0x7c] +# W64: v_cmp_nlg_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x54,0x7c] -# W32: v_cmp_nlg_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x54,0x7c] -# W64: v_cmp_nlg_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x54,0x7c] 0xc1,0x04,0x54,0x7c +# W32: v_cmp_nlg_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x54,0x7c] +# W64: v_cmp_nlg_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x54,0x7c] -# W32: v_cmp_nlg_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x54,0x7c] -# W64: v_cmp_nlg_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x54,0x7c] 0xf0,0x04,0x54,0x7c +# W32: v_cmp_nlg_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x54,0x7c] +# W64: v_cmp_nlg_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x54,0x7c] -# W32: v_cmp_nlg_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x54,0x7c] -# W64: v_cmp_nlg_f64_e32 vcc, src_scc, v[2:3] ; encoding: 
[0xfd,0x04,0x54,0x7c] 0xfd,0x04,0x54,0x7c +# W32: v_cmp_nlg_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x54,0x7c] +# W64: v_cmp_nlg_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x54,0x7c] +0xff,0xfc,0x55,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_nlg_f64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x55,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_nlg_f64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x55,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0x55,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_nlt_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x1c,0x7c] -# W64: v_cmp_nlt_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x1c,0x7c] 0x01,0x05,0x1c,0x7c +# W32: v_cmp_nlt_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x1c,0x7c] +# W64: v_cmp_nlt_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x1c,0x7c] -# W32: v_cmp_nlt_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x1c,0x7c] -# W64: v_cmp_nlt_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x1c,0x7c] 0x7f,0x05,0x1c,0x7c +# W32: v_cmp_nlt_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x1c,0x7c] +# W64: v_cmp_nlt_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x1c,0x7c] -# W32: v_cmp_nlt_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x1c,0x7c] -# W64: v_cmp_nlt_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x1c,0x7c] 0x01,0x04,0x1c,0x7c +# W32: v_cmp_nlt_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x1c,0x7c] +# W64: v_cmp_nlt_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x1c,0x7c] -# W32: v_cmp_nlt_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x1c,0x7c] -# W64: v_cmp_nlt_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x1c,0x7c] 0x69,0x04,0x1c,0x7c +# W32: v_cmp_nlt_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x1c,0x7c] +# W64: v_cmp_nlt_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x1c,0x7c] -# W32: v_cmp_nlt_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x1c,0x7c] -# W64: v_cmp_nlt_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x1c,0x7c] 0x6a,0x04,0x1c,0x7c +# W32: v_cmp_nlt_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x1c,0x7c] +# W64: v_cmp_nlt_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x1c,0x7c] -# W32: v_cmp_nlt_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x1c,0x7c] -# W64: v_cmp_nlt_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x1c,0x7c] 0x6b,0x04,0x1c,0x7c +# W32: v_cmp_nlt_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x1c,0x7c] +# W64: v_cmp_nlt_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x1c,0x7c] -# W32: v_cmp_nlt_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x1c,0x7c] -# W64: v_cmp_nlt_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x1c,0x7c] 0x7b,0x04,0x1c,0x7c +# W32: v_cmp_nlt_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x1c,0x7c] +# W64: v_cmp_nlt_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x1c,0x7c] -# W32: v_cmp_nlt_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x1c,0x7c] -# W64: v_cmp_nlt_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x1c,0x7c] 0x7d,0x04,0x1c,0x7c +# W32: v_cmp_nlt_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x1c,0x7c] +# W64: v_cmp_nlt_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x1c,0x7c] -# W32: v_cmp_nlt_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x1c,0x7c] -# W64: v_cmp_nlt_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x1c,0x7c] 0x7e,0x04,0x1c,0x7c +# W32: v_cmp_nlt_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x1c,0x7c] +# W64: v_cmp_nlt_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x1c,0x7c] -# W32: v_cmp_nlt_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x1c,0x7c] -# W64: v_cmp_nlt_f16_e32 vcc, exec_hi, v2 ; encoding: 
[0x7f,0x04,0x1c,0x7c] 0x7f,0x04,0x1c,0x7c +# W32: v_cmp_nlt_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x1c,0x7c] +# W64: v_cmp_nlt_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x1c,0x7c] -# W32: v_cmp_nlt_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x1c,0x7c] -# W64: v_cmp_nlt_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x1c,0x7c] 0x7c,0x04,0x1c,0x7c +# W32: v_cmp_nlt_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x1c,0x7c] +# W64: v_cmp_nlt_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x1c,0x7c] -# W32: v_cmp_nlt_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x1c,0x7c] -# W64: v_cmp_nlt_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x1c,0x7c] 0xc1,0x04,0x1c,0x7c +# W32: v_cmp_nlt_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x1c,0x7c] +# W64: v_cmp_nlt_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x1c,0x7c] -# W32: v_cmp_nlt_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x1c,0x7c] -# W64: v_cmp_nlt_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x1c,0x7c] 0xf0,0x04,0x1c,0x7c +# W32: v_cmp_nlt_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x1c,0x7c] +# W64: v_cmp_nlt_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x1c,0x7c] -# W32: v_cmp_nlt_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x1c,0x7c] -# W64: v_cmp_nlt_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x1c,0x7c] 0xfd,0x04,0x1c,0x7c +# W32: v_cmp_nlt_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x1c,0x7c] +# W64: v_cmp_nlt_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x1c,0x7c] -# W32: v_cmp_nlt_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1c,0x7c,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_nlt_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1c,0x7c,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x1c,0x7c,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_nlt_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1c,0x7c,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_nlt_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1c,0x7c,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_nlt_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x3c,0x7c] -# W64: v_cmp_nlt_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x3c,0x7c] 0x01,0x05,0x3c,0x7c +# W32: v_cmp_nlt_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x3c,0x7c] +# W64: v_cmp_nlt_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x3c,0x7c] -# W32: v_cmp_nlt_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x3c,0x7c] -# W64: v_cmp_nlt_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x3c,0x7c] 0xff,0x05,0x3c,0x7c +# W32: v_cmp_nlt_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x3c,0x7c] +# W64: v_cmp_nlt_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x3c,0x7c] -# W32: v_cmp_nlt_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x3c,0x7c] -# W64: v_cmp_nlt_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x3c,0x7c] 0x01,0x04,0x3c,0x7c +# W32: v_cmp_nlt_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x3c,0x7c] +# W64: v_cmp_nlt_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x3c,0x7c] -# W32: v_cmp_nlt_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x3c,0x7c] -# W64: v_cmp_nlt_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x3c,0x7c] 0x69,0x04,0x3c,0x7c +# W32: v_cmp_nlt_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x3c,0x7c] +# W64: v_cmp_nlt_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x3c,0x7c] -# W32: v_cmp_nlt_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x3c,0x7c] -# W64: v_cmp_nlt_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x3c,0x7c] 0x6a,0x04,0x3c,0x7c +# W32: v_cmp_nlt_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x3c,0x7c] +# W64: v_cmp_nlt_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x3c,0x7c] -# W32: v_cmp_nlt_f32_e32 
vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x3c,0x7c] -# W64: v_cmp_nlt_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x3c,0x7c] 0x6b,0x04,0x3c,0x7c +# W32: v_cmp_nlt_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x3c,0x7c] +# W64: v_cmp_nlt_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x3c,0x7c] -# W32: v_cmp_nlt_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x3c,0x7c] -# W64: v_cmp_nlt_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x3c,0x7c] 0x7b,0x04,0x3c,0x7c +# W32: v_cmp_nlt_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x3c,0x7c] +# W64: v_cmp_nlt_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x3c,0x7c] -# W32: v_cmp_nlt_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x3c,0x7c] -# W64: v_cmp_nlt_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x3c,0x7c] 0x7d,0x04,0x3c,0x7c +# W32: v_cmp_nlt_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x3c,0x7c] +# W64: v_cmp_nlt_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x3c,0x7c] -# W32: v_cmp_nlt_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x3c,0x7c] -# W64: v_cmp_nlt_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x3c,0x7c] 0x7e,0x04,0x3c,0x7c +# W32: v_cmp_nlt_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x3c,0x7c] +# W64: v_cmp_nlt_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x3c,0x7c] -# W32: v_cmp_nlt_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x3c,0x7c] -# W64: v_cmp_nlt_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x3c,0x7c] 0x7f,0x04,0x3c,0x7c +# W32: v_cmp_nlt_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x3c,0x7c] +# W64: v_cmp_nlt_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x3c,0x7c] -# W32: v_cmp_nlt_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x3c,0x7c] -# W64: v_cmp_nlt_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x3c,0x7c] 0x7c,0x04,0x3c,0x7c +# W32: v_cmp_nlt_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x3c,0x7c] +# W64: v_cmp_nlt_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x3c,0x7c] -# W32: v_cmp_nlt_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x3c,0x7c] -# W64: v_cmp_nlt_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x3c,0x7c] 0xc1,0x04,0x3c,0x7c +# W32: v_cmp_nlt_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x3c,0x7c] +# W64: v_cmp_nlt_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x3c,0x7c] -# W32: v_cmp_nlt_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x3c,0x7c] -# W64: v_cmp_nlt_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x3c,0x7c] 0xf0,0x04,0x3c,0x7c +# W32: v_cmp_nlt_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x3c,0x7c] +# W64: v_cmp_nlt_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x3c,0x7c] -# W32: v_cmp_nlt_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x3c,0x7c] -# W64: v_cmp_nlt_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x3c,0x7c] 0xfd,0x04,0x3c,0x7c +# W32: v_cmp_nlt_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x3c,0x7c] +# W64: v_cmp_nlt_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x3c,0x7c] -# W32: v_cmp_nlt_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x3d,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_nlt_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x3d,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x3d,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_nlt_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x3d,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_nlt_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x3d,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_nlt_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x5c,0x7c] -# W64: v_cmp_nlt_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x5c,0x7c] 0x01,0x05,0x5c,0x7c +# W32: 
v_cmp_nlt_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x5c,0x7c] +# W64: v_cmp_nlt_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x5c,0x7c] +0xfe,0x05,0x5c,0x7c # W32: v_cmp_nlt_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x5c,0x7c] # W64: v_cmp_nlt_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x5c,0x7c] -0xfe,0x05,0x5c,0x7c -# W32: v_cmp_nlt_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x5c,0x7c] -# W64: v_cmp_nlt_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x5c,0x7c] 0x02,0x04,0x5c,0x7c +# W32: v_cmp_nlt_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x5c,0x7c] +# W64: v_cmp_nlt_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x5c,0x7c] +0x68,0x04,0x5c,0x7c # W32: v_cmp_nlt_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x5c,0x7c] # W64: v_cmp_nlt_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x5c,0x7c] -0x68,0x04,0x5c,0x7c -# W32: v_cmp_nlt_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x5c,0x7c] -# W64: v_cmp_nlt_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x5c,0x7c] 0x6a,0x04,0x5c,0x7c +# W32: v_cmp_nlt_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x5c,0x7c] +# W64: v_cmp_nlt_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x5c,0x7c] +0x7a,0x04,0x5c,0x7c # W32: v_cmp_nlt_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x5c,0x7c] # W64: v_cmp_nlt_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x5c,0x7c] -0x7a,0x04,0x5c,0x7c -# W32: v_cmp_nlt_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x5c,0x7c] -# W64: v_cmp_nlt_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x5c,0x7c] 0x7e,0x04,0x5c,0x7c +# W32: v_cmp_nlt_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x5c,0x7c] +# W64: v_cmp_nlt_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x5c,0x7c] -# W32: v_cmp_nlt_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x5c,0x7c] -# W64: v_cmp_nlt_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x5c,0x7c] 0x7c,0x04,0x5c,0x7c +# W32: v_cmp_nlt_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x5c,0x7c] +# W64: v_cmp_nlt_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x5c,0x7c] -# W32: v_cmp_nlt_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x5c,0x7c] -# W64: v_cmp_nlt_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x5c,0x7c] 0xc1,0x04,0x5c,0x7c +# W32: v_cmp_nlt_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x5c,0x7c] +# W64: v_cmp_nlt_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x5c,0x7c] -# W32: v_cmp_nlt_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x5c,0x7c] -# W64: v_cmp_nlt_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x5c,0x7c] 0xf0,0x04,0x5c,0x7c +# W32: v_cmp_nlt_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x5c,0x7c] +# W64: v_cmp_nlt_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x5c,0x7c] -# W32: v_cmp_nlt_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x5c,0x7c] -# W64: v_cmp_nlt_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x5c,0x7c] 0xfd,0x04,0x5c,0x7c +# W32: v_cmp_nlt_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x5c,0x7c] +# W64: v_cmp_nlt_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x5c,0x7c] +0xff,0xfc,0x5d,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_nlt_f64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5d,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_nlt_f64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5d,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0x5d,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_o_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x0e,0x7c] -# W64: 
v_cmp_o_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x0e,0x7c] 0x01,0x05,0x0e,0x7c +# W32: v_cmp_o_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x0e,0x7c] +# W64: v_cmp_o_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x0e,0x7c] -# W32: v_cmp_o_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x0e,0x7c] -# W64: v_cmp_o_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x0e,0x7c] 0x7f,0x05,0x0e,0x7c +# W32: v_cmp_o_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x0e,0x7c] +# W64: v_cmp_o_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x0e,0x7c] -# W32: v_cmp_o_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x0e,0x7c] -# W64: v_cmp_o_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x0e,0x7c] 0x01,0x04,0x0e,0x7c +# W32: v_cmp_o_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x0e,0x7c] +# W64: v_cmp_o_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x0e,0x7c] -# W32: v_cmp_o_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x0e,0x7c] -# W64: v_cmp_o_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x0e,0x7c] 0x69,0x04,0x0e,0x7c +# W32: v_cmp_o_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x0e,0x7c] +# W64: v_cmp_o_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x0e,0x7c] -# W32: v_cmp_o_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0e,0x7c] -# W64: v_cmp_o_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0e,0x7c] 0x6a,0x04,0x0e,0x7c +# W32: v_cmp_o_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0e,0x7c] +# W64: v_cmp_o_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0e,0x7c] -# W32: v_cmp_o_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0e,0x7c] -# W64: v_cmp_o_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0e,0x7c] 0x6b,0x04,0x0e,0x7c +# W32: v_cmp_o_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0e,0x7c] +# W64: v_cmp_o_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0e,0x7c] -# W32: v_cmp_o_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x0e,0x7c] -# W64: v_cmp_o_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x0e,0x7c] 0x7b,0x04,0x0e,0x7c +# W32: v_cmp_o_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x0e,0x7c] +# W64: v_cmp_o_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x0e,0x7c] -# W32: v_cmp_o_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x0e,0x7c] -# W64: v_cmp_o_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x0e,0x7c] 0x7d,0x04,0x0e,0x7c +# W32: v_cmp_o_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x0e,0x7c] +# W64: v_cmp_o_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x0e,0x7c] -# W32: v_cmp_o_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x0e,0x7c] -# W64: v_cmp_o_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x0e,0x7c] 0x7e,0x04,0x0e,0x7c +# W32: v_cmp_o_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x0e,0x7c] +# W64: v_cmp_o_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x0e,0x7c] -# W32: v_cmp_o_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x0e,0x7c] -# W64: v_cmp_o_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x0e,0x7c] 0x7f,0x04,0x0e,0x7c +# W32: v_cmp_o_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x0e,0x7c] +# W64: v_cmp_o_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x0e,0x7c] -# W32: v_cmp_o_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x0e,0x7c] -# W64: v_cmp_o_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x0e,0x7c] 0x7c,0x04,0x0e,0x7c +# W32: v_cmp_o_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x0e,0x7c] +# W64: v_cmp_o_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x0e,0x7c] -# W32: v_cmp_o_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x0e,0x7c] -# W64: v_cmp_o_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x0e,0x7c] 
0xc1,0x04,0x0e,0x7c +# W32: v_cmp_o_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x0e,0x7c] +# W64: v_cmp_o_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x0e,0x7c] -# W32: v_cmp_o_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x0e,0x7c] -# W64: v_cmp_o_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x0e,0x7c] 0xf0,0x04,0x0e,0x7c +# W32: v_cmp_o_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x0e,0x7c] +# W64: v_cmp_o_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x0e,0x7c] -# W32: v_cmp_o_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x0e,0x7c] -# W64: v_cmp_o_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x0e,0x7c] 0xfd,0x04,0x0e,0x7c +# W32: v_cmp_o_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x0e,0x7c] +# W64: v_cmp_o_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x0e,0x7c] -# W32: v_cmp_o_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0e,0x7c,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_o_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0e,0x7c,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x0e,0x7c,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_o_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0e,0x7c,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_o_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0e,0x7c,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_o_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x2e,0x7c] -# W64: v_cmp_o_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x2e,0x7c] 0x01,0x05,0x2e,0x7c +# W32: v_cmp_o_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x2e,0x7c] +# W64: v_cmp_o_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x2e,0x7c] -# W32: v_cmp_o_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x2e,0x7c] -# W64: v_cmp_o_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x2e,0x7c] 0xff,0x05,0x2e,0x7c +# W32: v_cmp_o_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x2e,0x7c] +# W64: v_cmp_o_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x2e,0x7c] -# W32: v_cmp_o_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x2e,0x7c] -# W64: v_cmp_o_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x2e,0x7c] 0x01,0x04,0x2e,0x7c +# W32: v_cmp_o_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x2e,0x7c] +# W64: v_cmp_o_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x2e,0x7c] -# W32: v_cmp_o_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x2e,0x7c] -# W64: v_cmp_o_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x2e,0x7c] 0x69,0x04,0x2e,0x7c +# W32: v_cmp_o_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x2e,0x7c] +# W64: v_cmp_o_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x2e,0x7c] -# W32: v_cmp_o_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x2e,0x7c] -# W64: v_cmp_o_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x2e,0x7c] 0x6a,0x04,0x2e,0x7c +# W32: v_cmp_o_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x2e,0x7c] +# W64: v_cmp_o_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x2e,0x7c] -# W32: v_cmp_o_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x2e,0x7c] -# W64: v_cmp_o_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x2e,0x7c] 0x6b,0x04,0x2e,0x7c +# W32: v_cmp_o_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x2e,0x7c] +# W64: v_cmp_o_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x2e,0x7c] -# W32: v_cmp_o_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x2e,0x7c] -# W64: v_cmp_o_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x2e,0x7c] 0x7b,0x04,0x2e,0x7c +# W32: v_cmp_o_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x2e,0x7c] +# W64: v_cmp_o_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x2e,0x7c] -# W32: v_cmp_o_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x2e,0x7c] -# W64: v_cmp_o_f32_e32 vcc, m0, v2 ; 
encoding: [0x7d,0x04,0x2e,0x7c] 0x7d,0x04,0x2e,0x7c +# W32: v_cmp_o_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x2e,0x7c] +# W64: v_cmp_o_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x2e,0x7c] -# W32: v_cmp_o_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x2e,0x7c] -# W64: v_cmp_o_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x2e,0x7c] 0x7e,0x04,0x2e,0x7c +# W32: v_cmp_o_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x2e,0x7c] +# W64: v_cmp_o_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x2e,0x7c] -# W32: v_cmp_o_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x2e,0x7c] -# W64: v_cmp_o_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x2e,0x7c] 0x7f,0x04,0x2e,0x7c +# W32: v_cmp_o_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x2e,0x7c] +# W64: v_cmp_o_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x2e,0x7c] -# W32: v_cmp_o_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x2e,0x7c] -# W64: v_cmp_o_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x2e,0x7c] 0x7c,0x04,0x2e,0x7c +# W32: v_cmp_o_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x2e,0x7c] +# W64: v_cmp_o_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x2e,0x7c] -# W32: v_cmp_o_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x2e,0x7c] -# W64: v_cmp_o_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x2e,0x7c] 0xc1,0x04,0x2e,0x7c +# W32: v_cmp_o_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x2e,0x7c] +# W64: v_cmp_o_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x2e,0x7c] -# W32: v_cmp_o_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x2e,0x7c] -# W64: v_cmp_o_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x2e,0x7c] 0xf0,0x04,0x2e,0x7c +# W32: v_cmp_o_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x2e,0x7c] +# W64: v_cmp_o_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x2e,0x7c] -# W32: v_cmp_o_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x2e,0x7c] -# W64: v_cmp_o_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x2e,0x7c] 0xfd,0x04,0x2e,0x7c +# W32: v_cmp_o_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x2e,0x7c] +# W64: v_cmp_o_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x2e,0x7c] -# W32: v_cmp_o_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2f,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_o_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2f,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x2f,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_o_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2f,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_o_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2f,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_o_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4e,0x7c] -# W64: v_cmp_o_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4e,0x7c] 0x01,0x05,0x4e,0x7c +# W32: v_cmp_o_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4e,0x7c] +# W64: v_cmp_o_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4e,0x7c] -# W32: v_cmp_o_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4e,0x7c] -# W64: v_cmp_o_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4e,0x7c] 0xfe,0x05,0x4e,0x7c +# W32: v_cmp_o_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4e,0x7c] +# W64: v_cmp_o_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4e,0x7c] -# W32: v_cmp_o_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x4e,0x7c] -# W64: v_cmp_o_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x4e,0x7c] 0x02,0x04,0x4e,0x7c +# W32: v_cmp_o_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x4e,0x7c] +# W64: v_cmp_o_f64_e32 vcc, s[2:3], v[2:3] ; 
encoding: [0x02,0x04,0x4e,0x7c] -# W32: v_cmp_o_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4e,0x7c] -# W64: v_cmp_o_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4e,0x7c] 0x68,0x04,0x4e,0x7c +# W32: v_cmp_o_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4e,0x7c] +# W64: v_cmp_o_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4e,0x7c] -# W32: v_cmp_o_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x4e,0x7c] -# W64: v_cmp_o_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x4e,0x7c] 0x6a,0x04,0x4e,0x7c +# W32: v_cmp_o_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x4e,0x7c] +# W64: v_cmp_o_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x4e,0x7c] -# W32: v_cmp_o_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x4e,0x7c] -# W64: v_cmp_o_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x4e,0x7c] 0x7a,0x04,0x4e,0x7c +# W32: v_cmp_o_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x4e,0x7c] +# W64: v_cmp_o_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x4e,0x7c] -# W32: v_cmp_o_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x4e,0x7c] -# W64: v_cmp_o_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x4e,0x7c] 0x7e,0x04,0x4e,0x7c +# W32: v_cmp_o_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x4e,0x7c] +# W64: v_cmp_o_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x4e,0x7c] -# W32: v_cmp_o_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x4e,0x7c] -# W64: v_cmp_o_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x4e,0x7c] 0x7c,0x04,0x4e,0x7c +# W32: v_cmp_o_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x4e,0x7c] +# W64: v_cmp_o_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x4e,0x7c] -# W32: v_cmp_o_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x4e,0x7c] -# W64: v_cmp_o_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x4e,0x7c] 0xc1,0x04,0x4e,0x7c +# W32: v_cmp_o_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x4e,0x7c] +# W64: v_cmp_o_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x4e,0x7c] -# W32: v_cmp_o_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4e,0x7c] -# W64: v_cmp_o_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4e,0x7c] 0xf0,0x04,0x4e,0x7c +# W32: v_cmp_o_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4e,0x7c] +# W64: v_cmp_o_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4e,0x7c] -# W32: v_cmp_o_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4e,0x7c] -# W64: v_cmp_o_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4e,0x7c] 0xfd,0x04,0x4e,0x7c +# W32: v_cmp_o_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4e,0x7c] +# W64: v_cmp_o_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4e,0x7c] +0xff,0xfc,0x4f,0x7c,0x56,0x34,0x12,0xaf # W32: v_cmp_o_f64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4f,0x7c,0x56,0x34,0x12,0xaf] # W64: v_cmp_o_f64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4f,0x7c,0x56,0x34,0x12,0xaf] -0xff,0xfc,0x4f,0x7c,0x56,0x34,0x12,0xaf -# W32: v_cmp_u_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x10,0x7c] -# W64: v_cmp_u_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x10,0x7c] 0x01,0x05,0x10,0x7c +# W32: v_cmp_u_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x10,0x7c] +# W64: v_cmp_u_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x10,0x7c] -# W32: v_cmp_u_f16_e32 vcc_lo, v127, v2 ; encoding: [0x7f,0x05,0x10,0x7c] -# W64: v_cmp_u_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x10,0x7c] 0x7f,0x05,0x10,0x7c +# W32: v_cmp_u_f16_e32 vcc_lo, v127, v2 ; encoding: 
[0x7f,0x05,0x10,0x7c] +# W64: v_cmp_u_f16_e32 vcc, v127, v2 ; encoding: [0x7f,0x05,0x10,0x7c] -# W32: v_cmp_u_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x10,0x7c] -# W64: v_cmp_u_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x10,0x7c] 0x01,0x04,0x10,0x7c +# W32: v_cmp_u_f16_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x10,0x7c] +# W64: v_cmp_u_f16_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x10,0x7c] -# W32: v_cmp_u_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x10,0x7c] -# W64: v_cmp_u_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x10,0x7c] 0x69,0x04,0x10,0x7c +# W32: v_cmp_u_f16_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x10,0x7c] +# W64: v_cmp_u_f16_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x10,0x7c] -# W32: v_cmp_u_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x10,0x7c] -# W64: v_cmp_u_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x10,0x7c] 0x6a,0x04,0x10,0x7c +# W32: v_cmp_u_f16_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x10,0x7c] +# W64: v_cmp_u_f16_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x10,0x7c] -# W32: v_cmp_u_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x10,0x7c] -# W64: v_cmp_u_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x10,0x7c] 0x6b,0x04,0x10,0x7c +# W32: v_cmp_u_f16_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x10,0x7c] +# W64: v_cmp_u_f16_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x10,0x7c] -# W32: v_cmp_u_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x10,0x7c] -# W64: v_cmp_u_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x10,0x7c] 0x7b,0x04,0x10,0x7c +# W32: v_cmp_u_f16_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x10,0x7c] +# W64: v_cmp_u_f16_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x10,0x7c] -# W32: v_cmp_u_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x10,0x7c] -# W64: v_cmp_u_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x10,0x7c] 0x7d,0x04,0x10,0x7c +# W32: v_cmp_u_f16_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x10,0x7c] +# W64: v_cmp_u_f16_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x10,0x7c] -# W32: v_cmp_u_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x10,0x7c] -# W64: v_cmp_u_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x10,0x7c] 0x7e,0x04,0x10,0x7c +# W32: v_cmp_u_f16_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x10,0x7c] +# W64: v_cmp_u_f16_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x10,0x7c] -# W32: v_cmp_u_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x10,0x7c] -# W64: v_cmp_u_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x10,0x7c] 0x7f,0x04,0x10,0x7c +# W32: v_cmp_u_f16_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x10,0x7c] +# W64: v_cmp_u_f16_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x10,0x7c] -# W32: v_cmp_u_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x10,0x7c] -# W64: v_cmp_u_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x10,0x7c] 0x7c,0x04,0x10,0x7c +# W32: v_cmp_u_f16_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x10,0x7c] +# W64: v_cmp_u_f16_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x10,0x7c] -# W32: v_cmp_u_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x10,0x7c] -# W64: v_cmp_u_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x10,0x7c] 0xc1,0x04,0x10,0x7c +# W32: v_cmp_u_f16_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x10,0x7c] +# W64: v_cmp_u_f16_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x10,0x7c] -# W32: v_cmp_u_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x10,0x7c] -# W64: v_cmp_u_f16_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x10,0x7c] 0xf0,0x04,0x10,0x7c +# W32: v_cmp_u_f16_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x10,0x7c] +# W64: v_cmp_u_f16_e32 vcc, 0.5, v2 ; encoding: 
[0xf0,0x04,0x10,0x7c] -# W32: v_cmp_u_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x10,0x7c] -# W64: v_cmp_u_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x10,0x7c] 0xfd,0x04,0x10,0x7c +# W32: v_cmp_u_f16_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x10,0x7c] +# W64: v_cmp_u_f16_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x10,0x7c] -# W32: v_cmp_u_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x10,0x7c,0x0b,0xfe,0x00,0x00] -# W64: v_cmp_u_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x10,0x7c,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x10,0x7c,0x0b,0xfe,0x00,0x00 +# W32: v_cmp_u_f16_e32 vcc_lo, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x10,0x7c,0x0b,0xfe,0x00,0x00] +# W64: v_cmp_u_f16_e32 vcc, 0xfe0b, v127 ; encoding: [0xff,0xfe,0x10,0x7c,0x0b,0xfe,0x00,0x00] -# W32: v_cmp_u_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x30,0x7c] -# W64: v_cmp_u_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x30,0x7c] 0x01,0x05,0x30,0x7c +# W32: v_cmp_u_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x30,0x7c] +# W64: v_cmp_u_f32_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0x30,0x7c] -# W32: v_cmp_u_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x30,0x7c] -# W64: v_cmp_u_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x30,0x7c] 0xff,0x05,0x30,0x7c +# W32: v_cmp_u_f32_e32 vcc_lo, v255, v2 ; encoding: [0xff,0x05,0x30,0x7c] +# W64: v_cmp_u_f32_e32 vcc, v255, v2 ; encoding: [0xff,0x05,0x30,0x7c] -# W32: v_cmp_u_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x30,0x7c] -# W64: v_cmp_u_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x30,0x7c] 0x01,0x04,0x30,0x7c +# W32: v_cmp_u_f32_e32 vcc_lo, s1, v2 ; encoding: [0x01,0x04,0x30,0x7c] +# W64: v_cmp_u_f32_e32 vcc, s1, v2 ; encoding: [0x01,0x04,0x30,0x7c] -# W32: v_cmp_u_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x30,0x7c] -# W64: v_cmp_u_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x30,0x7c] 0x69,0x04,0x30,0x7c +# W32: v_cmp_u_f32_e32 vcc_lo, s105, v2 ; encoding: [0x69,0x04,0x30,0x7c] +# W64: v_cmp_u_f32_e32 vcc, s105, v2 ; encoding: [0x69,0x04,0x30,0x7c] -# W32: v_cmp_u_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x30,0x7c] -# W64: v_cmp_u_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x30,0x7c] 0x6a,0x04,0x30,0x7c +# W32: v_cmp_u_f32_e32 vcc_lo, vcc_lo, v2 ; encoding: [0x6a,0x04,0x30,0x7c] +# W64: v_cmp_u_f32_e32 vcc, vcc_lo, v2 ; encoding: [0x6a,0x04,0x30,0x7c] -# W32: v_cmp_u_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x30,0x7c] -# W64: v_cmp_u_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x30,0x7c] 0x6b,0x04,0x30,0x7c +# W32: v_cmp_u_f32_e32 vcc_lo, vcc_hi, v2 ; encoding: [0x6b,0x04,0x30,0x7c] +# W64: v_cmp_u_f32_e32 vcc, vcc_hi, v2 ; encoding: [0x6b,0x04,0x30,0x7c] -# W32: v_cmp_u_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x30,0x7c] -# W64: v_cmp_u_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x30,0x7c] 0x7b,0x04,0x30,0x7c +# W32: v_cmp_u_f32_e32 vcc_lo, ttmp15, v2 ; encoding: [0x7b,0x04,0x30,0x7c] +# W64: v_cmp_u_f32_e32 vcc, ttmp15, v2 ; encoding: [0x7b,0x04,0x30,0x7c] -# W32: v_cmp_u_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x30,0x7c] -# W64: v_cmp_u_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x30,0x7c] 0x7d,0x04,0x30,0x7c +# W32: v_cmp_u_f32_e32 vcc_lo, m0, v2 ; encoding: [0x7d,0x04,0x30,0x7c] +# W64: v_cmp_u_f32_e32 vcc, m0, v2 ; encoding: [0x7d,0x04,0x30,0x7c] -# W32: v_cmp_u_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x30,0x7c] -# W64: v_cmp_u_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x30,0x7c] 0x7e,0x04,0x30,0x7c +# W32: v_cmp_u_f32_e32 vcc_lo, exec_lo, v2 ; encoding: [0x7e,0x04,0x30,0x7c] +# W64: 
v_cmp_u_f32_e32 vcc, exec_lo, v2 ; encoding: [0x7e,0x04,0x30,0x7c] -# W32: v_cmp_u_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x30,0x7c] -# W64: v_cmp_u_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x30,0x7c] 0x7f,0x04,0x30,0x7c +# W32: v_cmp_u_f32_e32 vcc_lo, exec_hi, v2 ; encoding: [0x7f,0x04,0x30,0x7c] +# W64: v_cmp_u_f32_e32 vcc, exec_hi, v2 ; encoding: [0x7f,0x04,0x30,0x7c] -# W32: v_cmp_u_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x30,0x7c] -# W64: v_cmp_u_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x30,0x7c] 0x7c,0x04,0x30,0x7c +# W32: v_cmp_u_f32_e32 vcc_lo, null, v2 ; encoding: [0x7c,0x04,0x30,0x7c] +# W64: v_cmp_u_f32_e32 vcc, null, v2 ; encoding: [0x7c,0x04,0x30,0x7c] -# W32: v_cmp_u_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x30,0x7c] -# W64: v_cmp_u_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x30,0x7c] 0xc1,0x04,0x30,0x7c +# W32: v_cmp_u_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x30,0x7c] +# W64: v_cmp_u_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x30,0x7c] -# W32: v_cmp_u_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x30,0x7c] -# W64: v_cmp_u_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x30,0x7c] 0xf0,0x04,0x30,0x7c +# W32: v_cmp_u_f32_e32 vcc_lo, 0.5, v2 ; encoding: [0xf0,0x04,0x30,0x7c] +# W64: v_cmp_u_f32_e32 vcc, 0.5, v2 ; encoding: [0xf0,0x04,0x30,0x7c] -# W32: v_cmp_u_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x30,0x7c] -# W64: v_cmp_u_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x30,0x7c] 0xfd,0x04,0x30,0x7c +# W32: v_cmp_u_f32_e32 vcc_lo, src_scc, v2 ; encoding: [0xfd,0x04,0x30,0x7c] +# W64: v_cmp_u_f32_e32 vcc, src_scc, v2 ; encoding: [0xfd,0x04,0x30,0x7c] -# W32: v_cmp_u_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x31,0x7c,0x56,0x34,0x12,0xaf] -# W64: v_cmp_u_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x31,0x7c,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x31,0x7c,0x56,0x34,0x12,0xaf +# W32: v_cmp_u_f32_e32 vcc_lo, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x31,0x7c,0x56,0x34,0x12,0xaf] +# W64: v_cmp_u_f32_e32 vcc, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x31,0x7c,0x56,0x34,0x12,0xaf] -# W32: v_cmp_u_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x50,0x7c] -# W64: v_cmp_u_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x50,0x7c] 0x01,0x05,0x50,0x7c +# W32: v_cmp_u_f64_e32 vcc_lo, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x50,0x7c] +# W64: v_cmp_u_f64_e32 vcc, v[1:2], v[2:3] ; encoding: [0x01,0x05,0x50,0x7c] -# W32: v_cmp_u_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x50,0x7c] -# W64: v_cmp_u_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x50,0x7c] 0xfe,0x05,0x50,0x7c +# W32: v_cmp_u_f64_e32 vcc_lo, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x50,0x7c] +# W64: v_cmp_u_f64_e32 vcc, v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x50,0x7c] -# W32: v_cmp_u_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x50,0x7c] -# W64: v_cmp_u_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x50,0x7c] 0x02,0x04,0x50,0x7c +# W32: v_cmp_u_f64_e32 vcc_lo, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x50,0x7c] +# W64: v_cmp_u_f64_e32 vcc, s[2:3], v[2:3] ; encoding: [0x02,0x04,0x50,0x7c] -# W32: v_cmp_u_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x50,0x7c] -# W64: v_cmp_u_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x50,0x7c] 0x68,0x04,0x50,0x7c +# W32: v_cmp_u_f64_e32 vcc_lo, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x50,0x7c] +# W64: v_cmp_u_f64_e32 vcc, s[104:105], v[2:3] ; encoding: [0x68,0x04,0x50,0x7c] -# W32: v_cmp_u_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: 
-# W32: v_cmp_u_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x50,0x7c]
-# W64: v_cmp_u_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x50,0x7c]
 0x6a,0x04,0x50,0x7c
+# W32: v_cmp_u_f64_e32 vcc_lo, vcc, v[2:3] ; encoding: [0x6a,0x04,0x50,0x7c]
+# W64: v_cmp_u_f64_e32 vcc, vcc, v[2:3] ; encoding: [0x6a,0x04,0x50,0x7c]
-# W32: v_cmp_u_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x50,0x7c]
-# W64: v_cmp_u_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x50,0x7c]
 0x7a,0x04,0x50,0x7c
+# W32: v_cmp_u_f64_e32 vcc_lo, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x50,0x7c]
+# W64: v_cmp_u_f64_e32 vcc, ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x50,0x7c]
-# W32: v_cmp_u_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x50,0x7c]
-# W64: v_cmp_u_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x50,0x7c]
 0x7e,0x04,0x50,0x7c
+# W32: v_cmp_u_f64_e32 vcc_lo, exec, v[2:3] ; encoding: [0x7e,0x04,0x50,0x7c]
+# W64: v_cmp_u_f64_e32 vcc, exec, v[2:3] ; encoding: [0x7e,0x04,0x50,0x7c]
-# W32: v_cmp_u_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x50,0x7c]
-# W64: v_cmp_u_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x50,0x7c]
 0x7c,0x04,0x50,0x7c
+# W32: v_cmp_u_f64_e32 vcc_lo, null, v[2:3] ; encoding: [0x7c,0x04,0x50,0x7c]
+# W64: v_cmp_u_f64_e32 vcc, null, v[2:3] ; encoding: [0x7c,0x04,0x50,0x7c]
-# W32: v_cmp_u_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x50,0x7c]
-# W64: v_cmp_u_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x50,0x7c]
 0xc1,0x04,0x50,0x7c
+# W32: v_cmp_u_f64_e32 vcc_lo, -1, v[2:3] ; encoding: [0xc1,0x04,0x50,0x7c]
+# W64: v_cmp_u_f64_e32 vcc, -1, v[2:3] ; encoding: [0xc1,0x04,0x50,0x7c]
-# W32: v_cmp_u_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x50,0x7c]
-# W64: v_cmp_u_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x50,0x7c]
 0xf0,0x04,0x50,0x7c
+# W32: v_cmp_u_f64_e32 vcc_lo, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x50,0x7c]
+# W64: v_cmp_u_f64_e32 vcc, 0.5, v[2:3] ; encoding: [0xf0,0x04,0x50,0x7c]
-# W32: v_cmp_u_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x50,0x7c]
-# W64: v_cmp_u_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x50,0x7c]
 0xfd,0x04,0x50,0x7c
+# W32: v_cmp_u_f64_e32 vcc_lo, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x50,0x7c]
+# W64: v_cmp_u_f64_e32 vcc, src_scc, v[2:3] ; encoding: [0xfd,0x04,0x50,0x7c]
+0xff,0xfc,0x51,0x7c,0x56,0x34,0x12,0xaf
 # W32: v_cmp_u_f64_e32 vcc_lo, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x51,0x7c,0x56,0x34,0x12,0xaf]
 # W64: v_cmp_u_f64_e32 vcc, 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x51,0x7c,0x56,0x34,0x12,0xaf]
-0xff,0xfc,0x51,0x7c,0x56,0x34,0x12,0xaf
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopc_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopc_dpp16.txt
index a840f0a9c2bec..4541c669793ad 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopc_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopc_dpp16.txt
@@ -1,3028 +1,3029 @@
+# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W32
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W64
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W32
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W64
+0xfa,0x04,0xfa,0x7c,0x01,0x1b,0x00,0xff
 # W32: v_cmp_class_f16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x1b,0x00,0xff]
 # W64: v_cmp_class_f16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0xfa,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0xfa,0x7c,0x01,0xe4,0x00,0xff
 # W32: v_cmp_class_f16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0xe4,0x00,0xff]
 # W64: v_cmp_class_f16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0xfa,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0xfa,0x7c,0x01,0x40,0x01,0xff
 # W32: v_cmp_class_f16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x40,0x01,0xff]
 # W64: v_cmp_class_f16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0xfa,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0xfa,0x7c,0x01,0x41,0x01,0xff
 # W32: v_cmp_class_f16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x41,0x01,0xff]
 # W64: v_cmp_class_f16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0xfa,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0xfa,0x7c,0x01,0x01,0x01,0xff
 # W32: v_cmp_class_f16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x01,0x01,0xff]
 # W64: v_cmp_class_f16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0xfa,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0xfa,0x7c,0x01,0x0f,0x01,0xff
 # W32: v_cmp_class_f16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x0f,0x01,0xff]
 # W64: v_cmp_class_f16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0xfa,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0xfa,0x7c,0x01,0x11,0x01,0xff
 # W32: v_cmp_class_f16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x11,0x01,0xff]
 # W64: v_cmp_class_f16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0xfa,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0xfa,0x7c,0x01,0x1f,0x01,0xff
 # W32: v_cmp_class_f16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x1f,0x01,0xff]
 # W64: v_cmp_class_f16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0xfa,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0xfa,0x7c,0x01,0x21,0x01,0xff
 # W32: v_cmp_class_f16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x21,0x01,0xff]
 # W64: v_cmp_class_f16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0xfa,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0xfa,0x7c,0x01,0x2f,0x01,0xff
 # W32: v_cmp_class_f16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x2f,0x01,0xff]
 # W64: v_cmp_class_f16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0xfa,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0xfa,0x7c,0x01,0x50,0x01,0xff
 # W32: v_cmp_class_f16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x50,0x01,0xff]
 # W64: v_cmp_class_f16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0xfa,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0xfa,0x7c,0x01,0x5f,0x01,0x01
 # W32: v_cmp_class_f16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x5f,0x01,0x01]
 # W64: v_cmp_class_f16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0xfa,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0xfa,0x7c,0x01,0x60,0x01,0x13
 # W32: v_cmp_class_f16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x60,0x01,0x13]
 # W64: v_cmp_class_f16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0xfa,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0xfa,0x7c,0x7f,0x6f,0x3d,0x30
 # W32: v_cmp_class_f16 vcc_lo, -|v127|, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfa,0x7c,0x7f,0x6f,0x3d,0x30]
 # W64: v_cmp_class_f16 vcc, -|v127|, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfa,0x7c,0x7f,0x6f,0x3d,0x30]
-0xfa,0xfe,0xfa,0x7c,0x7f,0x6f,0x3d,0x30
+0xfa,0x04,0xfc,0x7c,0x01,0x1b,0x00,0xff
 # W32: v_cmp_class_f32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x1b,0x00,0xff]
 # W64: v_cmp_class_f32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0xfc,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0xfc,0x7c,0x01,0xe4,0x00,0xff
 # W32: v_cmp_class_f32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0xe4,0x00,0xff]
 # W64: v_cmp_class_f32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0xfc,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0xfc,0x7c,0x01,0x40,0x01,0xff
 # W32: v_cmp_class_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x40,0x01,0xff]
 # W64: v_cmp_class_f32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0xfc,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0xfc,0x7c,0x01,0x41,0x01,0xff
 # W32: v_cmp_class_f32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x41,0x01,0xff]
 # W64: v_cmp_class_f32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0xfc,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0xfc,0x7c,0x01,0x01,0x01,0xff
 # W32: v_cmp_class_f32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x01,0x01,0xff]
 # W64: v_cmp_class_f32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0xfc,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0xfc,0x7c,0x01,0x0f,0x01,0xff
 # W32: v_cmp_class_f32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x0f,0x01,0xff]
 # W64: v_cmp_class_f32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0xfc,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0xfc,0x7c,0x01,0x11,0x01,0xff
 # W32: v_cmp_class_f32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x11,0x01,0xff]
 # W64: v_cmp_class_f32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0xfc,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0xfc,0x7c,0x01,0x1f,0x01,0xff
 # W32: v_cmp_class_f32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x1f,0x01,0xff]
 # W64: v_cmp_class_f32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0xfc,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0xfc,0x7c,0x01,0x21,0x01,0xff
 # W32: v_cmp_class_f32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x21,0x01,0xff]
 # W64: v_cmp_class_f32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0xfc,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0xfc,0x7c,0x01,0x2f,0x01,0xff
 # W32: v_cmp_class_f32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x2f,0x01,0xff]
 # W64: v_cmp_class_f32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0xfc,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0xfc,0x7c,0x01,0x50,0x01,0xff
 # W32: v_cmp_class_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x50,0x01,0xff]
 # W64: v_cmp_class_f32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0xfc,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0xfc,0x7c,0x01,0x5f,0x01,0x01
 # W32: v_cmp_class_f32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x5f,0x01,0x01]
 # W64: v_cmp_class_f32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0xfc,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0xfc,0x7c,0x01,0x60,0x01,0x13
 # W32: v_cmp_class_f32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x60,0x01,0x13]
 # W64: v_cmp_class_f32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0xfc,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0xfd,0x7c,0xff,0x6f,0x3d,0x30
 # W32: v_cmp_class_f32 vcc_lo, -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfd,0x7c,0xff,0x6f,0x3d,0x30]
 # W64: v_cmp_class_f32 vcc, -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfd,0x7c,0xff,0x6f,0x3d,0x30]
-0xfa,0xfe,0xfd,0x7c,0xff,0x6f,0x3d,0x30
+0xfa,0x04,0x04,0x7c,0x01,0x1b,0x00,0xff
 # W32: v_cmp_eq_f16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x1b,0x00,0xff]
 # W64: v_cmp_eq_f16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x04,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x04,0x7c,0x01,0xe4,0x00,0xff
 # W32: v_cmp_eq_f16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0xe4,0x00,0xff]
 # W64: v_cmp_eq_f16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x04,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x04,0x7c,0x01,0x40,0x01,0xff
 # W32: v_cmp_eq_f16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x40,0x01,0xff]
 # W64: v_cmp_eq_f16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x04,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x04,0x7c,0x01,0x41,0x01,0xff
 # W32: v_cmp_eq_f16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x41,0x01,0xff]
 # W64: v_cmp_eq_f16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x04,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x04,0x7c,0x01,0x01,0x01,0xff
 # W32: v_cmp_eq_f16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x01,0x01,0xff]
 # W64: v_cmp_eq_f16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x04,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x04,0x7c,0x01,0x0f,0x01,0xff
 # W32: v_cmp_eq_f16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x0f,0x01,0xff]
 # W64: v_cmp_eq_f16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x04,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x04,0x7c,0x01,0x11,0x01,0xff
 # W32: v_cmp_eq_f16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x11,0x01,0xff]
 # W64: v_cmp_eq_f16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x04,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x04,0x7c,0x01,0x1f,0x01,0xff
 # W32: v_cmp_eq_f16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x1f,0x01,0xff]
 # W64: v_cmp_eq_f16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x04,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x04,0x7c,0x01,0x21,0x01,0xff
 # W32: v_cmp_eq_f16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x21,0x01,0xff]
 # W64: v_cmp_eq_f16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x04,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x04,0x7c,0x01,0x2f,0x01,0xff
 # W32: v_cmp_eq_f16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x2f,0x01,0xff]
 # W64: v_cmp_eq_f16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x04,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x04,0x7c,0x01,0x50,0x01,0xff
 # W32: v_cmp_eq_f16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x50,0x01,0xff]
 # W64: v_cmp_eq_f16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x04,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x04,0x7c,0x01,0x5f,0x01,0x01
 # W32: v_cmp_eq_f16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x5f,0x01,0x01]
 # W64: v_cmp_eq_f16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x04,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x04,0x7c,0x01,0x60,0x01,0x13
 # W32: v_cmp_eq_f16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x60,0x01,0x13]
 # W64: v_cmp_eq_f16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x04,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x04,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x04,0x7c,0x7f,0x6f,0xfd,0x30
 # W32: v_cmp_eq_f16 vcc_lo, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x04,0x7c,0x7f,0x6f,0xfd,0x30]
 # W64: v_cmp_eq_f16 vcc, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x04,0x7c,0x7f,0x6f,0xfd,0x30]
-0xfa,0xfe,0x04,0x7c,0x7f,0x6f,0xfd,0x30
+0xfa,0x04,0x24,0x7c,0x01,0x1b,0x00,0xff
 # W32: v_cmp_eq_f32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x1b,0x00,0xff]
 # W64: v_cmp_eq_f32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x24,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x24,0x7c,0x01,0xe4,0x00,0xff
 # W32: v_cmp_eq_f32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0xe4,0x00,0xff]
 # W64: v_cmp_eq_f32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x24,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x24,0x7c,0x01,0x40,0x01,0xff
 # W32: v_cmp_eq_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x40,0x01,0xff]
 # W64: v_cmp_eq_f32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x24,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x24,0x7c,0x01,0x41,0x01,0xff
 # W32: v_cmp_eq_f32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x41,0x01,0xff]
 # W64: v_cmp_eq_f32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x24,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x24,0x7c,0x01,0x01,0x01,0xff
 # W32: v_cmp_eq_f32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x01,0x01,0xff]
 # W64: v_cmp_eq_f32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x24,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x24,0x7c,0x01,0x0f,0x01,0xff
 # W32: v_cmp_eq_f32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x0f,0x01,0xff]
 # W64: v_cmp_eq_f32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x24,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x24,0x7c,0x01,0x11,0x01,0xff
 # W32: v_cmp_eq_f32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x11,0x01,0xff]
 # W64: v_cmp_eq_f32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x24,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x24,0x7c,0x01,0x1f,0x01,0xff
 # W32: v_cmp_eq_f32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x1f,0x01,0xff]
 # W64: v_cmp_eq_f32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x24,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x24,0x7c,0x01,0x21,0x01,0xff
 # W32: v_cmp_eq_f32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x21,0x01,0xff]
 # W64: v_cmp_eq_f32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x24,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x24,0x7c,0x01,0x2f,0x01,0xff
 # W32: v_cmp_eq_f32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x2f,0x01,0xff]
 # W64: v_cmp_eq_f32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x24,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x24,0x7c,0x01,0x50,0x01,0xff
 # W32: v_cmp_eq_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x50,0x01,0xff]
 # W64: v_cmp_eq_f32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x24,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x24,0x7c,0x01,0x5f,0x01,0x01
 # W32: v_cmp_eq_f32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x5f,0x01,0x01]
 # W64: v_cmp_eq_f32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x24,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x24,0x7c,0x01,0x60,0x01,0x13
 # W32: v_cmp_eq_f32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x60,0x01,0x13]
 # W64: v_cmp_eq_f32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x24,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x24,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x25,0x7c,0xff,0x6f,0xfd,0x30
 # W32: v_cmp_eq_f32 vcc_lo, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x25,0x7c,0xff,0x6f,0xfd,0x30]
 # W64: v_cmp_eq_f32 vcc, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x25,0x7c,0xff,0x6f,0xfd,0x30]
-0xfa,0xfe,0x25,0x7c,0xff,0x6f,0xfd,0x30
+0xfa,0x04,0x64,0x7c,0x01,0x1b,0x00,0xff
 # W32: v_cmp_eq_i16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x1b,0x00,0xff]
 # W64: v_cmp_eq_i16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x64,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x64,0x7c,0x01,0xe4,0x00,0xff
 # W32: v_cmp_eq_i16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0xe4,0x00,0xff]
 # W64: v_cmp_eq_i16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x64,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x64,0x7c,0x01,0x40,0x01,0xff
 # W32: v_cmp_eq_i16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x40,0x01,0xff]
 # W64: v_cmp_eq_i16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x64,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x64,0x7c,0x01,0x41,0x01,0xff
 # W32: v_cmp_eq_i16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x41,0x01,0xff]
 # W64: v_cmp_eq_i16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x64,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x64,0x7c,0x01,0x01,0x01,0xff
 # W32: v_cmp_eq_i16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x01,0x01,0xff]
 # W64: v_cmp_eq_i16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x64,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x64,0x7c,0x01,0x0f,0x01,0xff
 # W32: v_cmp_eq_i16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x0f,0x01,0xff]
 # W64: v_cmp_eq_i16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x64,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x64,0x7c,0x01,0x11,0x01,0xff
 # W32: v_cmp_eq_i16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x11,0x01,0xff]
 # W64: v_cmp_eq_i16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x64,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x64,0x7c,0x01,0x1f,0x01,0xff
 # W32: v_cmp_eq_i16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x1f,0x01,0xff]
 # W64: v_cmp_eq_i16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x64,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x64,0x7c,0x01,0x21,0x01,0xff
 # W32: v_cmp_eq_i16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x21,0x01,0xff]
 # W64: v_cmp_eq_i16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x64,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x64,0x7c,0x01,0x2f,0x01,0xff
 # W32: v_cmp_eq_i16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x2f,0x01,0xff]
 # W64: v_cmp_eq_i16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x64,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x64,0x7c,0x01,0x50,0x01,0xff
 # W32: v_cmp_eq_i16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x50,0x01,0xff]
 # W64: v_cmp_eq_i16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x64,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x64,0x7c,0x01,0x5f,0x01,0x01
 # W32: v_cmp_eq_i16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x5f,0x01,0x01]
 # W64: v_cmp_eq_i16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x64,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x64,0x7c,0x01,0x60,0x01,0x13
 # W32: v_cmp_eq_i16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x60,0x01,0x13]
 # W64: v_cmp_eq_i16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x64,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x64,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x64,0x7c,0x7f,0x6f,0x0d,0x30
 # W32: v_cmp_eq_i16 vcc_lo, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x64,0x7c,0x7f,0x6f,0x0d,0x30]
 # W64: v_cmp_eq_i16 vcc, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x64,0x7c,0x7f,0x6f,0x0d,0x30]
-0xfa,0xfe,0x64,0x7c,0x7f,0x6f,0x0d,0x30
+0xfa,0x04,0x84,0x7c,0x01,0x1b,0x00,0xff
 # W32: v_cmp_eq_i32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x1b,0x00,0xff]
 # W64: v_cmp_eq_i32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x84,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x84,0x7c,0x01,0xe4,0x00,0xff
 # W32: v_cmp_eq_i32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0xe4,0x00,0xff]
 # W64: v_cmp_eq_i32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x84,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x84,0x7c,0x01,0x40,0x01,0xff
 # W32: v_cmp_eq_i32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x40,0x01,0xff]
 # W64: v_cmp_eq_i32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x84,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x84,0x7c,0x01,0x41,0x01,0xff
 # W32: v_cmp_eq_i32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x41,0x01,0xff]
 # W64: v_cmp_eq_i32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x84,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x84,0x7c,0x01,0x01,0x01,0xff
 # W32: v_cmp_eq_i32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x01,0x01,0xff]
 # W64: v_cmp_eq_i32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x84,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x84,0x7c,0x01,0x0f,0x01,0xff
 # W32: v_cmp_eq_i32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x0f,0x01,0xff]
 # W64: v_cmp_eq_i32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x84,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x84,0x7c,0x01,0x11,0x01,0xff
 # W32: v_cmp_eq_i32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x11,0x01,0xff]
 # W64: v_cmp_eq_i32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x84,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x84,0x7c,0x01,0x1f,0x01,0xff
 # W32: v_cmp_eq_i32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x1f,0x01,0xff]
 # W64: v_cmp_eq_i32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x84,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x84,0x7c,0x01,0x21,0x01,0xff
 # W32: v_cmp_eq_i32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x21,0x01,0xff]
 # W64: v_cmp_eq_i32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x84,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x84,0x7c,0x01,0x2f,0x01,0xff
 # W32: v_cmp_eq_i32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x2f,0x01,0xff]
 # W64: v_cmp_eq_i32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x84,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x84,0x7c,0x01,0x50,0x01,0xff
 # W32: v_cmp_eq_i32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x50,0x01,0xff]
 # W64: v_cmp_eq_i32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x84,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x84,0x7c,0x01,0x5f,0x01,0x01
 # W32: v_cmp_eq_i32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x5f,0x01,0x01]
 # W64: v_cmp_eq_i32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x84,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x84,0x7c,0x01,0x60,0x01,0x13
 # W32: v_cmp_eq_i32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x60,0x01,0x13]
 # W64: v_cmp_eq_i32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x84,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x84,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x85,0x7c,0xff,0x6f,0x0d,0x30
 # W32: v_cmp_eq_i32 vcc_lo, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x85,0x7c,0xff,0x6f,0x0d,0x30]
 # W64: v_cmp_eq_i32 vcc, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x85,0x7c,0xff,0x6f,0x0d,0x30]
-0xfa,0xfe,0x85,0x7c,0xff,0x6f,0x0d,0x30
+0xfa,0x04,0x74,0x7c,0x01,0x1b,0x00,0xff
 # W32: v_cmp_eq_u16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x1b,0x00,0xff]
 # W64: v_cmp_eq_u16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x74,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x74,0x7c,0x01,0xe4,0x00,0xff
 # W32: v_cmp_eq_u16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0xe4,0x00,0xff]
 # W64: v_cmp_eq_u16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x74,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x74,0x7c,0x01,0x40,0x01,0xff
 # W32: v_cmp_eq_u16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x40,0x01,0xff]
 # W64: v_cmp_eq_u16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x74,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x74,0x7c,0x01,0x41,0x01,0xff
 # W32: v_cmp_eq_u16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x41,0x01,0xff]
 # W64: v_cmp_eq_u16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x74,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x74,0x7c,0x01,0x01,0x01,0xff
 # W32: v_cmp_eq_u16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x01,0x01,0xff]
 # W64: v_cmp_eq_u16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x74,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x74,0x7c,0x01,0x0f,0x01,0xff
 # W32: v_cmp_eq_u16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x0f,0x01,0xff]
 # W64: v_cmp_eq_u16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x74,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x74,0x7c,0x01,0x11,0x01,0xff
 # W32: v_cmp_eq_u16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x11,0x01,0xff]
 # W64: v_cmp_eq_u16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x74,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x74,0x7c,0x01,0x1f,0x01,0xff
 # W32: v_cmp_eq_u16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x1f,0x01,0xff]
 # W64: v_cmp_eq_u16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x74,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x74,0x7c,0x01,0x21,0x01,0xff
 # W32: v_cmp_eq_u16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x21,0x01,0xff]
 # W64: v_cmp_eq_u16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x74,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x74,0x7c,0x01,0x2f,0x01,0xff
 # W32: v_cmp_eq_u16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x2f,0x01,0xff]
 # W64: v_cmp_eq_u16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x74,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x74,0x7c,0x01,0x50,0x01,0xff
 # W32: v_cmp_eq_u16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x50,0x01,0xff]
 # W64: v_cmp_eq_u16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x74,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x74,0x7c,0x01,0x5f,0x01,0x01
 # W32: v_cmp_eq_u16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x5f,0x01,0x01]
 # W64: v_cmp_eq_u16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x74,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x74,0x7c,0x01,0x60,0x01,0x13
 # W32: v_cmp_eq_u16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x60,0x01,0x13]
 # W64: v_cmp_eq_u16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x74,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x74,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x74,0x7c,0x7f,0x6f,0x0d,0x30
 # W32: v_cmp_eq_u16 vcc_lo, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x74,0x7c,0x7f,0x6f,0x0d,0x30]
 # W64: v_cmp_eq_u16 vcc, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x74,0x7c,0x7f,0x6f,0x0d,0x30]
-0xfa,0xfe,0x74,0x7c,0x7f,0x6f,0x0d,0x30
+0xfa,0x04,0x94,0x7c,0x01,0x1b,0x00,0xff
 # W32: v_cmp_eq_u32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x1b,0x00,0xff]
 # W64: v_cmp_eq_u32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x94,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x94,0x7c,0x01,0xe4,0x00,0xff
 # W32: v_cmp_eq_u32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0xe4,0x00,0xff]
 # W64: v_cmp_eq_u32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x94,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x94,0x7c,0x01,0x40,0x01,0xff
 # W32: v_cmp_eq_u32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x40,0x01,0xff]
 # W64: v_cmp_eq_u32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x94,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x94,0x7c,0x01,0x41,0x01,0xff
 # W32: v_cmp_eq_u32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x41,0x01,0xff]
 # W64: v_cmp_eq_u32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x94,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x94,0x7c,0x01,0x01,0x01,0xff
 # W32: v_cmp_eq_u32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x01,0x01,0xff]
 # W64: v_cmp_eq_u32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x94,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x94,0x7c,0x01,0x0f,0x01,0xff
 # W32: v_cmp_eq_u32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x0f,0x01,0xff]
 # W64: v_cmp_eq_u32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x94,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x94,0x7c,0x01,0x11,0x01,0xff
 # W32: v_cmp_eq_u32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x11,0x01,0xff]
 # W64: v_cmp_eq_u32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x94,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x94,0x7c,0x01,0x1f,0x01,0xff
 # W32: v_cmp_eq_u32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x1f,0x01,0xff]
 # W64: v_cmp_eq_u32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x94,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x94,0x7c,0x01,0x21,0x01,0xff
 # W32: v_cmp_eq_u32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x21,0x01,0xff]
 # W64: v_cmp_eq_u32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x94,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x94,0x7c,0x01,0x2f,0x01,0xff
 # W32: v_cmp_eq_u32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x2f,0x01,0xff]
 # W64: v_cmp_eq_u32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x94,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x94,0x7c,0x01,0x50,0x01,0xff
 # W32: v_cmp_eq_u32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x50,0x01,0xff]
 # W64: v_cmp_eq_u32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x94,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x94,0x7c,0x01,0x5f,0x01,0x01
 # W32: v_cmp_eq_u32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x5f,0x01,0x01]
 # W64: v_cmp_eq_u32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x94,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x94,0x7c,0x01,0x60,0x01,0x13
 # W32: v_cmp_eq_u32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x60,0x01,0x13]
 # W64: v_cmp_eq_u32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x94,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x94,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x95,0x7c,0xff,0x6f,0x0d,0x30
 # W32: v_cmp_eq_u32 vcc_lo, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x95,0x7c,0xff,0x6f,0x0d,0x30]
 # W64: v_cmp_eq_u32 vcc, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x95,0x7c,0xff,0x6f,0x0d,0x30]
-0xfa,0xfe,0x95,0x7c,0xff,0x6f,0x0d,0x30
+0xfa,0x04,0x0c,0x7c,0x01,0x1b,0x00,0xff
 # W32: v_cmp_ge_f16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x1b,0x00,0xff]
 # W64: v_cmp_ge_f16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x0c,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x0c,0x7c,0x01,0xe4,0x00,0xff
 # W32: v_cmp_ge_f16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0xe4,0x00,0xff]
 # W64: v_cmp_ge_f16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x0c,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x0c,0x7c,0x01,0x40,0x01,0xff
 # W32: v_cmp_ge_f16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x40,0x01,0xff]
 # W64: v_cmp_ge_f16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x0c,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x0c,0x7c,0x01,0x41,0x01,0xff
 # W32: v_cmp_ge_f16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x41,0x01,0xff]
 # W64: v_cmp_ge_f16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x0c,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x0c,0x7c,0x01,0x01,0x01,0xff
 # W32: v_cmp_ge_f16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x01,0x01,0xff]
 # W64: v_cmp_ge_f16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x0c,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x0c,0x7c,0x01,0x0f,0x01,0xff
 # W32: v_cmp_ge_f16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x0f,0x01,0xff]
 # W64: v_cmp_ge_f16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x0c,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x0c,0x7c,0x01,0x11,0x01,0xff
 # W32: v_cmp_ge_f16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x11,0x01,0xff]
 # W64: v_cmp_ge_f16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x0c,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x0c,0x7c,0x01,0x1f,0x01,0xff
 # W32: v_cmp_ge_f16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x1f,0x01,0xff]
 # W64: v_cmp_ge_f16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x0c,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x0c,0x7c,0x01,0x21,0x01,0xff
 # W32: v_cmp_ge_f16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x21,0x01,0xff]
 # W64: v_cmp_ge_f16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x0c,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x0c,0x7c,0x01,0x2f,0x01,0xff
 # W32: v_cmp_ge_f16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x2f,0x01,0xff]
 # W64: v_cmp_ge_f16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x0c,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x0c,0x7c,0x01,0x50,0x01,0xff
 # W32: v_cmp_ge_f16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x50,0x01,0xff]
 # W64: v_cmp_ge_f16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x0c,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x0c,0x7c,0x01,0x5f,0x01,0x01
 # W32: v_cmp_ge_f16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x5f,0x01,0x01]
 # W64: v_cmp_ge_f16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x0c,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x0c,0x7c,0x01,0x60,0x01,0x13
 # W32: v_cmp_ge_f16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x60,0x01,0x13]
 # W64: v_cmp_ge_f16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0c,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x0c,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x0c,0x7c,0x7f,0x6f,0xfd,0x30
 # W32: v_cmp_ge_f16 vcc_lo, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0c,0x7c,0x7f,0x6f,0xfd,0x30]
 # W64: v_cmp_ge_f16 vcc, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0c,0x7c,0x7f,0x6f,0xfd,0x30]
-0xfa,0xfe,0x0c,0x7c,0x7f,0x6f,0xfd,0x30
+0xfa,0x04,0x2c,0x7c,0x01,0x1b,0x00,0xff
 # W32: v_cmp_ge_f32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x1b,0x00,0xff]
 # W64: v_cmp_ge_f32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x2c,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x2c,0x7c,0x01,0xe4,0x00,0xff
 # W32: v_cmp_ge_f32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0xe4,0x00,0xff]
 # W64: v_cmp_ge_f32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x2c,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x2c,0x7c,0x01,0x40,0x01,0xff
 # W32: v_cmp_ge_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x40,0x01,0xff]
 # W64: v_cmp_ge_f32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x2c,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x2c,0x7c,0x01,0x41,0x01,0xff
 # W32: v_cmp_ge_f32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x41,0x01,0xff]
 # W64: v_cmp_ge_f32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x2c,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x2c,0x7c,0x01,0x01,0x01,0xff
 # W32: v_cmp_ge_f32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x01,0x01,0xff]
 # W64: v_cmp_ge_f32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x2c,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x2c,0x7c,0x01,0x0f,0x01,0xff
 # W32: v_cmp_ge_f32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x0f,0x01,0xff]
 # W64: v_cmp_ge_f32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x2c,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x2c,0x7c,0x01,0x11,0x01,0xff
 # W32: v_cmp_ge_f32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x11,0x01,0xff]
 # W64: v_cmp_ge_f32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x2c,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x2c,0x7c,0x01,0x1f,0x01,0xff
 # W32: v_cmp_ge_f32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x1f,0x01,0xff]
 # W64: v_cmp_ge_f32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x2c,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x2c,0x7c,0x01,0x21,0x01,0xff
 # W32: v_cmp_ge_f32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x21,0x01,0xff]
 # W64: v_cmp_ge_f32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x2c,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x2c,0x7c,0x01,0x2f,0x01,0xff
 # W32: v_cmp_ge_f32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x2f,0x01,0xff]
 # W64: v_cmp_ge_f32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x2c,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x2c,0x7c,0x01,0x50,0x01,0xff
 # W32: v_cmp_ge_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x50,0x01,0xff]
 # W64: v_cmp_ge_f32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x2c,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x2c,0x7c,0x01,0x5f,0x01,0x01
 # W32: v_cmp_ge_f32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x5f,0x01,0x01]
 # W64: v_cmp_ge_f32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x2c,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x2c,0x7c,0x01,0x60,0x01,0x13
 # W32: v_cmp_ge_f32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x60,0x01,0x13]
 # W64: v_cmp_ge_f32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x2c,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x2c,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x2d,0x7c,0xff,0x6f,0xfd,0x30
 # W32: v_cmp_ge_f32 vcc_lo, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x2d,0x7c,0xff,0x6f,0xfd,0x30]
 # W64: v_cmp_ge_f32 vcc, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x2d,0x7c,0xff,0x6f,0xfd,0x30]
-0xfa,0xfe,0x2d,0x7c,0xff,0x6f,0xfd,0x30
+0xfa,0x04,0x6c,0x7c,0x01,0x1b,0x00,0xff
 # W32: v_cmp_ge_i16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x1b,0x00,0xff]
 # W64: v_cmp_ge_i16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x6c,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x6c,0x7c,0x01,0xe4,0x00,0xff
 # W32: v_cmp_ge_i16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0xe4,0x00,0xff]
 # W64: v_cmp_ge_i16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x6c,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x6c,0x7c,0x01,0x40,0x01,0xff
 # W32: v_cmp_ge_i16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x40,0x01,0xff]
 # W64: v_cmp_ge_i16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x6c,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x6c,0x7c,0x01,0x41,0x01,0xff
 # W32: v_cmp_ge_i16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x41,0x01,0xff]
 # W64: v_cmp_ge_i16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x6c,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x6c,0x7c,0x01,0x01,0x01,0xff
 # W32: v_cmp_ge_i16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x01,0x01,0xff]
 # W64: v_cmp_ge_i16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x6c,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x6c,0x7c,0x01,0x0f,0x01,0xff
 # W32: v_cmp_ge_i16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x0f,0x01,0xff]
 # W64: v_cmp_ge_i16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x6c,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x6c,0x7c,0x01,0x11,0x01,0xff
 # W32: v_cmp_ge_i16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x11,0x01,0xff]
 # W64: v_cmp_ge_i16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x6c,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x6c,0x7c,0x01,0x1f,0x01,0xff
 # W32: v_cmp_ge_i16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x1f,0x01,0xff]
 # W64: v_cmp_ge_i16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x6c,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x6c,0x7c,0x01,0x21,0x01,0xff
 # W32: v_cmp_ge_i16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x21,0x01,0xff]
 # W64: v_cmp_ge_i16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x6c,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x6c,0x7c,0x01,0x2f,0x01,0xff
 # W32: v_cmp_ge_i16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x2f,0x01,0xff]
 # W64: v_cmp_ge_i16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x6c,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x6c,0x7c,0x01,0x50,0x01,0xff
 # W32: v_cmp_ge_i16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x50,0x01,0xff]
 # W64: v_cmp_ge_i16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x6c,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x6c,0x7c,0x01,0x5f,0x01,0x01
 # W32: v_cmp_ge_i16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x5f,0x01,0x01]
 # W64: v_cmp_ge_i16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x6c,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x6c,0x7c,0x01,0x60,0x01,0x13
 # W32: v_cmp_ge_i16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x60,0x01,0x13]
 # W64: v_cmp_ge_i16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x6c,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x6c,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x6c,0x7c,0x7f,0x6f,0x0d,0x30
 # W32: v_cmp_ge_i16 vcc_lo, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6c,0x7c,0x7f,0x6f,0x0d,0x30]
 # W64: v_cmp_ge_i16 vcc, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6c,0x7c,0x7f,0x6f,0x0d,0x30]
-0xfa,0xfe,0x6c,0x7c,0x7f,0x6f,0x0d,0x30
+0xfa,0x04,0x8c,0x7c,0x01,0x1b,0x00,0xff
 # W32: v_cmp_ge_i32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x1b,0x00,0xff]
 # W64: v_cmp_ge_i32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x8c,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x8c,0x7c,0x01,0xe4,0x00,0xff
 # W32: v_cmp_ge_i32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0xe4,0x00,0xff]
 # W64: v_cmp_ge_i32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x8c,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x8c,0x7c,0x01,0x40,0x01,0xff
 # W32: v_cmp_ge_i32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x40,0x01,0xff]
 # W64: v_cmp_ge_i32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x8c,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x8c,0x7c,0x01,0x41,0x01,0xff
 # W32: v_cmp_ge_i32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x41,0x01,0xff]
 # W64: v_cmp_ge_i32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x8c,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x8c,0x7c,0x01,0x01,0x01,0xff
 # W32: v_cmp_ge_i32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x01,0x01,0xff]
 # W64: v_cmp_ge_i32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x8c,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x8c,0x7c,0x01,0x0f,0x01,0xff
 # W32: v_cmp_ge_i32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x0f,0x01,0xff]
 # W64: v_cmp_ge_i32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x8c,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x8c,0x7c,0x01,0x11,0x01,0xff
 # W32: v_cmp_ge_i32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x11,0x01,0xff]
 # W64: v_cmp_ge_i32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x8c,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x8c,0x7c,0x01,0x1f,0x01,0xff
 # W32: v_cmp_ge_i32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x1f,0x01,0xff]
 # W64: v_cmp_ge_i32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x8c,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x8c,0x7c,0x01,0x21,0x01,0xff
 # W32: v_cmp_ge_i32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x21,0x01,0xff]
 # W64: v_cmp_ge_i32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x8c,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x8c,0x7c,0x01,0x2f,0x01,0xff
 # W32: v_cmp_ge_i32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x2f,0x01,0xff]
 # W64: v_cmp_ge_i32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x8c,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x8c,0x7c,0x01,0x50,0x01,0xff
 # W32: v_cmp_ge_i32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x50,0x01,0xff]
 # W64: v_cmp_ge_i32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x8c,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x8c,0x7c,0x01,0x5f,0x01,0x01
 # W32: v_cmp_ge_i32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x5f,0x01,0x01]
 # W64: v_cmp_ge_i32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x8c,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x8c,0x7c,0x01,0x60,0x01,0x13
 # W32: v_cmp_ge_i32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x60,0x01,0x13]
 # W64: v_cmp_ge_i32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x8c,0x7c,0x01,0x60,0x01,0x13
bank_mask:0x3 ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_ge_i32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x8c,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x8c,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x8d,0x7c,0xff,0x6f,0x0d,0x30 # W32: v_cmp_ge_i32 vcc_lo, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x8d,0x7c,0xff,0x6f,0x0d,0x30] # W64: v_cmp_ge_i32 vcc, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x8d,0x7c,0xff,0x6f,0x0d,0x30] -0xfa,0xfe,0x8d,0x7c,0xff,0x6f,0x0d,0x30 +0xfa,0x04,0x7c,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_ge_u16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_ge_u16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x7c,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x7c,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_ge_u16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_ge_u16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x7c,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x7c,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_ge_u16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_ge_u16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x7c,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x7c,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_ge_u16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_ge_u16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x7c,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x7c,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_ge_u16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_ge_u16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x7c,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x7c,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_ge_u16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_ge_u16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x7c,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x7c,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_ge_u16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_ge_u16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x7c,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x7c,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_ge_u16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_ge_u16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x7c,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x7c,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_ge_u16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_ge_u16 vcc, v1, v2 row_ror:1 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x7c,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x7c,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_ge_u16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_ge_u16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x7c,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x7c,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_ge_u16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_ge_u16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x7c,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x7c,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_ge_u16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_ge_u16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x7c,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x7c,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_ge_u16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_ge_u16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7c,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x7c,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x7c,0x7c,0x7f,0x6f,0x0d,0x30 # W32: v_cmp_ge_u16 vcc_lo, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x7c,0x7c,0x7f,0x6f,0x0d,0x30] # W64: v_cmp_ge_u16 vcc, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x7c,0x7c,0x7f,0x6f,0x0d,0x30] -0xfa,0xfe,0x7c,0x7c,0x7f,0x6f,0x0d,0x30 +0xfa,0x04,0x9c,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_ge_u32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_ge_u32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x9c,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x9c,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_ge_u32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_ge_u32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x9c,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x9c,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_ge_u32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_ge_u32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x9c,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x9c,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_ge_u32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_ge_u32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x9c,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x9c,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_ge_u32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_ge_u32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x9c,0x7c,0x01,0x01,0x01,0xff 
+0xfa,0x04,0x9c,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_ge_u32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ge_u32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x9c,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x9c,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_ge_u32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_ge_u32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x9c,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x9c,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_ge_u32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_ge_u32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x9c,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x9c,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_ge_u32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_ge_u32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x9c,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x9c,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_ge_u32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_ge_u32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x9c,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x9c,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_ge_u32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_ge_u32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x9c,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x9c,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_ge_u32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_ge_u32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x9c,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x9c,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_ge_u32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_ge_u32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x9c,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x9c,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x9d,0x7c,0xff,0x6f,0x0d,0x30
# W32: v_cmp_ge_u32 vcc_lo, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x9d,0x7c,0xff,0x6f,0x0d,0x30]
# W64: v_cmp_ge_u32 vcc, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x9d,0x7c,0xff,0x6f,0x0d,0x30]
-0xfa,0xfe,0x9d,0x7c,0xff,0x6f,0x0d,0x30
+0xfa,0x04,0x08,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_gt_f16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_gt_f16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x08,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x08,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_gt_f16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_gt_f16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x08,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x08,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_gt_f16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_gt_f16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x08,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x08,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_gt_f16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_gt_f16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x08,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x08,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_gt_f16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_gt_f16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x08,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x08,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_gt_f16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_gt_f16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x08,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x08,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_gt_f16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_gt_f16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x08,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x08,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_gt_f16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_gt_f16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x08,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x08,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_gt_f16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_gt_f16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x08,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x08,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_gt_f16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_gt_f16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x08,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x08,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_gt_f16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_gt_f16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x08,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x08,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_gt_f16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_gt_f16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x08,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x08,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_gt_f16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_gt_f16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x08,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x08,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x08,0x7c,0x7f,0x6f,0xfd,0x30
# W32: v_cmp_gt_f16 vcc_lo, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x08,0x7c,0x7f,0x6f,0xfd,0x30]
# W64: v_cmp_gt_f16 vcc, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x08,0x7c,0x7f,0x6f,0xfd,0x30]
-0xfa,0xfe,0x08,0x7c,0x7f,0x6f,0xfd,0x30
+0xfa,0x04,0x28,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_gt_f32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_gt_f32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x28,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x28,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_gt_f32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_gt_f32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x28,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x28,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_gt_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_gt_f32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x28,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x28,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_gt_f32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_gt_f32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x28,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x28,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_gt_f32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_gt_f32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x28,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x28,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_gt_f32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_gt_f32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x28,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x28,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_gt_f32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_gt_f32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x28,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x28,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_gt_f32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_gt_f32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x28,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x28,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_gt_f32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_gt_f32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x28,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x28,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_gt_f32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_gt_f32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x28,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x28,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_gt_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_gt_f32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x28,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x28,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_gt_f32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_gt_f32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x28,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x28,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_gt_f32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_gt_f32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x28,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x28,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x29,0x7c,0xff,0x6f,0xfd,0x30
# W32: v_cmp_gt_f32 vcc_lo, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x29,0x7c,0xff,0x6f,0xfd,0x30]
# W64: v_cmp_gt_f32 vcc, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x29,0x7c,0xff,0x6f,0xfd,0x30]
-0xfa,0xfe,0x29,0x7c,0xff,0x6f,0xfd,0x30
+0xfa,0x04,0x68,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_gt_i16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_gt_i16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x68,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x68,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_gt_i16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_gt_i16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x68,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x68,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_gt_i16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_gt_i16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x68,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x68,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_gt_i16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_gt_i16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x68,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x68,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_gt_i16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_gt_i16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x68,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x68,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_gt_i16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_gt_i16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x68,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x68,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_gt_i16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_gt_i16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x68,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x68,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_gt_i16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_gt_i16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x68,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x68,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_gt_i16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_gt_i16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x68,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x68,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_gt_i16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_gt_i16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x68,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x68,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_gt_i16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_gt_i16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x68,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x68,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_gt_i16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_gt_i16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x68,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x68,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_gt_i16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_gt_i16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x68,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x68,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x68,0x7c,0x7f,0x6f,0x0d,0x30
# W32: v_cmp_gt_i16 vcc_lo, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x68,0x7c,0x7f,0x6f,0x0d,0x30]
# W64: v_cmp_gt_i16 vcc, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x68,0x7c,0x7f,0x6f,0x0d,0x30]
-0xfa,0xfe,0x68,0x7c,0x7f,0x6f,0x0d,0x30
+0xfa,0x04,0x88,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_gt_i32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_gt_i32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x88,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x88,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_gt_i32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_gt_i32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x88,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x88,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_gt_i32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_gt_i32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x88,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x88,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_gt_i32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_gt_i32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x88,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x88,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_gt_i32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_gt_i32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x88,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x88,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_gt_i32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_gt_i32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x88,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x88,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_gt_i32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_gt_i32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x88,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x88,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_gt_i32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_gt_i32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x88,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x88,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_gt_i32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_gt_i32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x88,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x88,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_gt_i32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_gt_i32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x88,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x88,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_gt_i32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_gt_i32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x88,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x88,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_gt_i32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_gt_i32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x88,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x88,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_gt_i32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_gt_i32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x88,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x88,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x89,0x7c,0xff,0x6f,0x0d,0x30
# W32: v_cmp_gt_i32 vcc_lo, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x89,0x7c,0xff,0x6f,0x0d,0x30]
# W64: v_cmp_gt_i32 vcc, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x89,0x7c,0xff,0x6f,0x0d,0x30]
-0xfa,0xfe,0x89,0x7c,0xff,0x6f,0x0d,0x30
+0xfa,0x04,0x78,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_gt_u16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_gt_u16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x78,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x78,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_gt_u16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_gt_u16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x78,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x78,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_gt_u16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_gt_u16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x78,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x78,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_gt_u16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_gt_u16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x78,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x78,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_gt_u16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_gt_u16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x78,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x78,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_gt_u16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_gt_u16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x78,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x78,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_gt_u16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_gt_u16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x78,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x78,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_gt_u16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_gt_u16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x78,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x78,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_gt_u16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_gt_u16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x78,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x78,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_gt_u16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_gt_u16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x78,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x78,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_gt_u16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_gt_u16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x78,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x78,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_gt_u16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_gt_u16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x78,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x78,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_gt_u16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_gt_u16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x78,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x78,0x7c,0x7f,0x6f,0x0d,0x30
# W32: v_cmp_gt_u16 vcc_lo, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x78,0x7c,0x7f,0x6f,0x0d,0x30]
# W64: v_cmp_gt_u16 vcc, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x78,0x7c,0x7f,0x6f,0x0d,0x30]
-0xfa,0xfe,0x78,0x7c,0x7f,0x6f,0x0d,0x30
+0xfa,0x04,0x98,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_gt_u32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_gt_u32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x98,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x98,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_gt_u32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_gt_u32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x98,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x98,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_gt_u32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_gt_u32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x98,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x98,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_gt_u32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_gt_u32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x98,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x98,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_gt_u32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_gt_u32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x98,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x98,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_gt_u32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_gt_u32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x98,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x98,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_gt_u32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_gt_u32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x98,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x98,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_gt_u32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_gt_u32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x98,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x98,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_gt_u32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_gt_u32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x98,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x98,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_gt_u32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_gt_u32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x98,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x98,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_gt_u32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_gt_u32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x98,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x98,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_gt_u32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_gt_u32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x98,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x98,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_gt_u32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_gt_u32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x98,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x98,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x99,0x7c,0xff,0x6f,0x0d,0x30
# W32: v_cmp_gt_u32 vcc_lo, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x99,0x7c,0xff,0x6f,0x0d,0x30]
# W64: v_cmp_gt_u32 vcc, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x99,0x7c,0xff,0x6f,0x0d,0x30]
-0xfa,0xfe,0x99,0x7c,0xff,0x6f,0x0d,0x30
+0xfa,0x04,0x06,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_le_f16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_le_f16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x06,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x06,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_le_f16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_le_f16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x06,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x06,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_le_f16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_le_f16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x06,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x06,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_le_f16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_le_f16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x06,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x06,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_le_f16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_le_f16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x06,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x06,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_le_f16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_le_f16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x06,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x06,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_le_f16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_le_f16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x06,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x06,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_le_f16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_le_f16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x06,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x06,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_le_f16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_le_f16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x06,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x06,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_le_f16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_le_f16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x06,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x06,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_le_f16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_le_f16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x06,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x06,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_le_f16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_le_f16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x06,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x06,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_le_f16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_le_f16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x06,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x06,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x06,0x7c,0x7f,0x6f,0xfd,0x30
# W32: v_cmp_le_f16 vcc_lo, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x06,0x7c,0x7f,0x6f,0xfd,0x30]
# W64: v_cmp_le_f16 vcc, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x06,0x7c,0x7f,0x6f,0xfd,0x30]
-0xfa,0xfe,0x06,0x7c,0x7f,0x6f,0xfd,0x30
+0xfa,0x04,0x26,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_le_f32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_le_f32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x26,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x26,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_le_f32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_le_f32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x26,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x26,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_le_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_le_f32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x26,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x26,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_le_f32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_le_f32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x26,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x26,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_le_f32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_le_f32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x26,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x26,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_le_f32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_le_f32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x26,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x26,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_le_f32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_le_f32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x26,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x26,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_le_f32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_le_f32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x26,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x26,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_le_f32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_le_f32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x26,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x26,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_le_f32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_le_f32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x26,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x26,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_le_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_le_f32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x26,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x26,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_le_f32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_le_f32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x26,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x26,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_le_f32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_le_f32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x26,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x27,0x7c,0xff,0x6f,0xfd,0x30
# W32: v_cmp_le_f32 vcc_lo, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x27,0x7c,0xff,0x6f,0xfd,0x30]
# W64: v_cmp_le_f32 vcc, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x27,0x7c,0xff,0x6f,0xfd,0x30]
-0xfa,0xfe,0x27,0x7c,0xff,0x6f,0xfd,0x30
+0xfa,0x04,0x66,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_le_i16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_le_i16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x66,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x66,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_le_i16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_le_i16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x66,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x66,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_le_i16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_le_i16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x66,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x66,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_le_i16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_le_i16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x66,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x66,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_le_i16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_le_i16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x66,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x66,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_le_i16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_le_i16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x66,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x66,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_le_i16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_le_i16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x66,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x66,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_le_i16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_le_i16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x66,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x66,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_le_i16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_le_i16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x66,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x66,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_le_i16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_le_i16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x66,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x66,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_le_i16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_le_i16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x66,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x66,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_le_i16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_le_i16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x66,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x66,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_le_i16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_le_i16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x66,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x66,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x66,0x7c,0x7f,0x6f,0x0d,0x30
# W32: v_cmp_le_i16 vcc_lo, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x66,0x7c,0x7f,0x6f,0x0d,0x30]
# W64: v_cmp_le_i16 vcc, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x66,0x7c,0x7f,0x6f,0x0d,0x30]
-0xfa,0xfe,0x66,0x7c,0x7f,0x6f,0x0d,0x30
+0xfa,0x04,0x86,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_le_i32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_le_i32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x86,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x86,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_le_i32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_le_i32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x86,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x86,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_le_i32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_le_i32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x86,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x86,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_le_i32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_le_i32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x86,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x86,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_le_i32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_le_i32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x86,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x86,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_le_i32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_le_i32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x86,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x86,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_le_i32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_le_i32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x86,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x86,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_le_i32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_le_i32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x86,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x86,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_le_i32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_le_i32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x86,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x86,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_le_i32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_le_i32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x86,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x86,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_le_i32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_le_i32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x86,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x86,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_le_i32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_le_i32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x86,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x86,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_le_i32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_le_i32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x86,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x86,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x87,0x7c,0xff,0x6f,0x0d,0x30
# W32: v_cmp_le_i32 vcc_lo, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x87,0x7c,0xff,0x6f,0x0d,0x30]
# W64: v_cmp_le_i32 vcc, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x87,0x7c,0xff,0x6f,0x0d,0x30]
-0xfa,0xfe,0x87,0x7c,0xff,0x6f,0x0d,0x30
+0xfa,0x04,0x76,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_le_u16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_le_u16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x76,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x76,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_le_u16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_le_u16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x76,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x76,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_le_u16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_le_u16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x76,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x76,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_le_u16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_le_u16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x76,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x76,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_le_u16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_le_u16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x76,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x76,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_le_u16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_le_u16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x76,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x76,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_le_u16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_le_u16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x76,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x76,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_le_u16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_le_u16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x76,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x76,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_le_u16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_le_u16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x76,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x76,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_le_u16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_le_u16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x76,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x76,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_le_u16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_le_u16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x76,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x76,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_le_u16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_le_u16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x76,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x76,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_le_u16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_le_u16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x76,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x76,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x76,0x7c,0x7f,0x6f,0x0d,0x30
# W32: v_cmp_le_u16 vcc_lo, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x76,0x7c,0x7f,0x6f,0x0d,0x30]
# W64: v_cmp_le_u16 vcc, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x76,0x7c,0x7f,0x6f,0x0d,0x30]
-0xfa,0xfe,0x76,0x7c,0x7f,0x6f,0x0d,0x30
+0xfa,0x04,0x96,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_le_u32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_le_u32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x96,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x96,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_le_u32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_le_u32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x96,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x96,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_le_u32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_le_u32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x96,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x96,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_le_u32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_le_u32 vcc, v1, v2 row_half_mirror
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x96,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x96,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_le_u32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_le_u32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x96,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x96,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_le_u32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_le_u32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x96,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x96,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_le_u32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_le_u32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x96,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x96,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_le_u32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_le_u32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x96,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x96,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_le_u32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_le_u32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x96,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x96,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_le_u32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_le_u32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x96,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x96,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_le_u32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_le_u32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x96,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x96,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_le_u32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_le_u32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x96,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x96,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_le_u32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_le_u32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x96,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x96,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x97,0x7c,0xff,0x6f,0x0d,0x30 # W32: v_cmp_le_u32 vcc_lo, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x97,0x7c,0xff,0x6f,0x0d,0x30] # W64: v_cmp_le_u32 vcc, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x97,0x7c,0xff,0x6f,0x0d,0x30] -0xfa,0xfe,0x97,0x7c,0xff,0x6f,0x0d,0x30 +0xfa,0x04,0x0a,0x7c,0x01,0x1b,0x00,0xff # W32: 
v_cmp_lg_f16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_lg_f16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x0a,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x0a,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_lg_f16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_lg_f16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x0a,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x0a,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_lg_f16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_lg_f16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x0a,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x0a,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_lg_f16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_lg_f16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x0a,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x0a,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_lg_f16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_lg_f16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x0a,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x0a,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_lg_f16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_lg_f16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x0a,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x0a,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_lg_f16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_lg_f16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x0a,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x0a,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_lg_f16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_lg_f16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x0a,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x0a,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_lg_f16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_lg_f16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x0a,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x0a,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_lg_f16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_lg_f16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x0a,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x0a,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_lg_f16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_lg_f16 vcc, v1, v2 row_share:0 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x0a,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x0a,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_lg_f16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_lg_f16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x0a,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x0a,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_lg_f16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_lg_f16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x0a,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x0a,0x7c,0x7f,0x6f,0xfd,0x30 # W32: v_cmp_lg_f16 vcc_lo, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0a,0x7c,0x7f,0x6f,0xfd,0x30] # W64: v_cmp_lg_f16 vcc, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0a,0x7c,0x7f,0x6f,0xfd,0x30] -0xfa,0xfe,0x0a,0x7c,0x7f,0x6f,0xfd,0x30 +0xfa,0x04,0x2a,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_lg_f32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_lg_f32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x2a,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x2a,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_lg_f32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_lg_f32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x2a,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x2a,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_lg_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_lg_f32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x2a,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x2a,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_lg_f32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_lg_f32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x2a,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x2a,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_lg_f32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_lg_f32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x2a,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x2a,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_lg_f32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_lg_f32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x2a,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x2a,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_lg_f32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_lg_f32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x2a,0x7c,0x01,0x11,0x01,0xff 
+0xfa,0x04,0x2a,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_lg_f32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_lg_f32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x2a,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x2a,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_lg_f32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_lg_f32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x2a,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x2a,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_lg_f32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_lg_f32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x2a,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x2a,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_lg_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_lg_f32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x2a,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x2a,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_lg_f32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_lg_f32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x2a,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x2a,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_lg_f32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_lg_f32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x2a,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x2a,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x2b,0x7c,0xff,0x6f,0xfd,0x30
# W32: v_cmp_lg_f32 vcc_lo, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x2b,0x7c,0xff,0x6f,0xfd,0x30]
# W64: v_cmp_lg_f32 vcc, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x2b,0x7c,0xff,0x6f,0xfd,0x30]
-0xfa,0xfe,0x2b,0x7c,0xff,0x6f,0xfd,0x30
+0xfa,0x04,0x02,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_lt_f16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_lt_f16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x02,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x02,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_lt_f16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_lt_f16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x02,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x02,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_lt_f16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_lt_f16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x02,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x02,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_lt_f16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_lt_f16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x02,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x02,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_lt_f16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_lt_f16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x02,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x02,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_lt_f16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_lt_f16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x02,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x02,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_lt_f16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_lt_f16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x02,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x02,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_lt_f16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_lt_f16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x02,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x02,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_lt_f16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_lt_f16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x02,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x02,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_lt_f16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_lt_f16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x02,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x02,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_lt_f16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_lt_f16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x02,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x02,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_lt_f16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_lt_f16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x02,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x02,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_lt_f16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_lt_f16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x02,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x02,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x02,0x7c,0x7f,0x6f,0xfd,0x30
# W32: v_cmp_lt_f16 vcc_lo, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x02,0x7c,0x7f,0x6f,0xfd,0x30]
# W64: v_cmp_lt_f16 vcc, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x02,0x7c,0x7f,0x6f,0xfd,0x30]
-0xfa,0xfe,0x02,0x7c,0x7f,0x6f,0xfd,0x30
+0xfa,0x04,0x22,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_lt_f32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_lt_f32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x22,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x22,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_lt_f32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_lt_f32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x22,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x22,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_lt_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_lt_f32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x22,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x22,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_lt_f32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_lt_f32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x22,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x22,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_lt_f32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_lt_f32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x22,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x22,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_lt_f32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_lt_f32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x22,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x22,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_lt_f32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_lt_f32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x22,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x22,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_lt_f32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_lt_f32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x22,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x22,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_lt_f32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_lt_f32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x22,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x22,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_lt_f32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_lt_f32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x22,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x22,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_lt_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_lt_f32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x22,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x22,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_lt_f32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_lt_f32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x22,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x22,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_lt_f32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_lt_f32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x22,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x22,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x23,0x7c,0xff,0x6f,0xfd,0x30
# W32: v_cmp_lt_f32 vcc_lo, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x23,0x7c,0xff,0x6f,0xfd,0x30]
# W64: v_cmp_lt_f32 vcc, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x23,0x7c,0xff,0x6f,0xfd,0x30]
-0xfa,0xfe,0x23,0x7c,0xff,0x6f,0xfd,0x30
+0xfa,0x04,0x62,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_lt_i16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_lt_i16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x62,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x62,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_lt_i16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_lt_i16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x62,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x62,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_lt_i16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_lt_i16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x62,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x62,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_lt_i16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_lt_i16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x62,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x62,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_lt_i16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_lt_i16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x62,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x62,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_lt_i16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_lt_i16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x62,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x62,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_lt_i16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_lt_i16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x62,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x62,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_lt_i16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_lt_i16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x62,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x62,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_lt_i16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_lt_i16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x62,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x62,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_lt_i16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_lt_i16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x62,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x62,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_lt_i16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_lt_i16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x62,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x62,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_lt_i16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_lt_i16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x62,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x62,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_lt_i16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_lt_i16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x62,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x62,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x62,0x7c,0x7f,0x6f,0x0d,0x30
# W32: v_cmp_lt_i16 vcc_lo, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x62,0x7c,0x7f,0x6f,0x0d,0x30]
# W64: v_cmp_lt_i16 vcc, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x62,0x7c,0x7f,0x6f,0x0d,0x30]
-0xfa,0xfe,0x62,0x7c,0x7f,0x6f,0x0d,0x30
+0xfa,0x04,0x82,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_lt_i32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_lt_i32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x82,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x82,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_lt_i32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_lt_i32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x82,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x82,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_lt_i32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_lt_i32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x82,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x82,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_lt_i32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_lt_i32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x82,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x82,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_lt_i32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_lt_i32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x82,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x82,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_lt_i32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_lt_i32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x82,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x82,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_lt_i32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_lt_i32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x82,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x82,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_lt_i32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_lt_i32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x82,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x82,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_lt_i32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_lt_i32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x82,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x82,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_lt_i32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_lt_i32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x82,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x82,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_lt_i32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_lt_i32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x82,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x82,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_lt_i32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_lt_i32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x82,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x82,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_lt_i32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_lt_i32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x82,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x82,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x83,0x7c,0xff,0x6f,0x0d,0x30
# W32: v_cmp_lt_i32 vcc_lo, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x83,0x7c,0xff,0x6f,0x0d,0x30]
# W64: v_cmp_lt_i32 vcc, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x83,0x7c,0xff,0x6f,0x0d,0x30]
-0xfa,0xfe,0x83,0x7c,0xff,0x6f,0x0d,0x30
+0xfa,0x04,0x72,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_lt_u16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_lt_u16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x72,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x72,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_lt_u16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_lt_u16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x72,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x72,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_lt_u16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_lt_u16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x72,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x72,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_lt_u16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_lt_u16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x72,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x72,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_lt_u16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_lt_u16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x72,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x72,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_lt_u16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_lt_u16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x72,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x72,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_lt_u16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_lt_u16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x72,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x72,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_lt_u16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_lt_u16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x72,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x72,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_lt_u16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_lt_u16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x72,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x72,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_lt_u16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_lt_u16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x72,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x72,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_lt_u16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_lt_u16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x72,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x72,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_lt_u16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_lt_u16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x72,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x72,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_lt_u16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_lt_u16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x72,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x72,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x72,0x7c,0x7f,0x6f,0x0d,0x30
# W32: v_cmp_lt_u16 vcc_lo, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x72,0x7c,0x7f,0x6f,0x0d,0x30]
# W64: v_cmp_lt_u16 vcc, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x72,0x7c,0x7f,0x6f,0x0d,0x30]
-0xfa,0xfe,0x72,0x7c,0x7f,0x6f,0x0d,0x30
+0xfa,0x04,0x92,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_lt_u32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_lt_u32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x92,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x92,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_lt_u32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_lt_u32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x92,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x92,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_lt_u32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_lt_u32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x92,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x92,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_lt_u32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_lt_u32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x92,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x92,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_lt_u32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_lt_u32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x92,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x92,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_lt_u32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_lt_u32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x92,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x92,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_lt_u32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_lt_u32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x92,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x92,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_lt_u32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_lt_u32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x92,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x92,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_lt_u32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_lt_u32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x92,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x92,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_lt_u32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_lt_u32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x92,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x92,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_lt_u32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_lt_u32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x92,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x92,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_lt_u32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_lt_u32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x92,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x92,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_lt_u32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_lt_u32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x92,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x92,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x93,0x7c,0xff,0x6f,0x0d,0x30
# W32: v_cmp_lt_u32 vcc_lo, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x93,0x7c,0xff,0x6f,0x0d,0x30]
# W64: v_cmp_lt_u32 vcc, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x93,0x7c,0xff,0x6f,0x0d,0x30]
-0xfa,0xfe,0x93,0x7c,0xff,0x6f,0x0d,0x30
+0xfa,0x04,0x6a,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_ne_i16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_ne_i16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x6a,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x6a,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_ne_i16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_ne_i16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x6a,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x6a,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_ne_i16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_ne_i16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x6a,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x6a,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_ne_i16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_ne_i16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x6a,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x6a,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_ne_i16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_ne_i16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x6a,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x6a,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_ne_i16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ne_i16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x6a,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x6a,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_ne_i16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_ne_i16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x6a,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x6a,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_ne_i16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_ne_i16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x6a,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x6a,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_ne_i16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_ne_i16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x6a,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x6a,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_ne_i16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_ne_i16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x6a,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x6a,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_ne_i16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_ne_i16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x6a,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x6a,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_ne_i16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_ne_i16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x6a,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x6a,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_ne_i16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_ne_i16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x6a,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x6a,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x6a,0x7c,0x7f,0x6f,0x0d,0x30
# W32: v_cmp_ne_i16 vcc_lo, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6a,0x7c,0x7f,0x6f,0x0d,0x30]
# W64: v_cmp_ne_i16 vcc, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6a,0x7c,0x7f,0x6f,0x0d,0x30]
-0xfa,0xfe,0x6a,0x7c,0x7f,0x6f,0x0d,0x30
+0xfa,0x04,0x8a,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_ne_i32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_ne_i32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x8a,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x8a,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_ne_i32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_ne_i32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x8a,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x8a,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_ne_i32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_ne_i32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x8a,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x8a,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_ne_i32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_ne_i32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x8a,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x8a,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_ne_i32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_ne_i32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x8a,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x8a,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_ne_i32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ne_i32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x8a,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x8a,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_ne_i32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_ne_i32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x8a,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x8a,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_ne_i32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_ne_i32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x8a,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x8a,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_ne_i32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_ne_i32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x8a,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x8a,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_ne_i32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_ne_i32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x8a,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x8a,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_ne_i32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_ne_i32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x8a,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x8a,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_ne_i32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_ne_i32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x8a,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x8a,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_ne_i32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_ne_i32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x8a,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x8a,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x8b,0x7c,0xff,0x6f,0x0d,0x30
# W32: v_cmp_ne_i32 vcc_lo, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x8b,0x7c,0xff,0x6f,0x0d,0x30]
# W64: v_cmp_ne_i32 vcc, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x8b,0x7c,0xff,0x6f,0x0d,0x30]
-0xfa,0xfe,0x8b,0x7c,0xff,0x6f,0x0d,0x30
+0xfa,0x04,0x7a,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_ne_u16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_ne_u16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x7a,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x7a,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_ne_u16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_ne_u16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x7a,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x7a,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_ne_u16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_ne_u16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x7a,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x7a,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_ne_u16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_ne_u16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x7a,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x7a,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_ne_u16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_ne_u16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x7a,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x7a,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_ne_u16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ne_u16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x7a,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x7a,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_ne_u16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_ne_u16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x7a,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x7a,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_ne_u16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_ne_u16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x7a,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x7a,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_ne_u16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_ne_u16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x7a,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x7a,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_ne_u16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_ne_u16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x7a,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x7a,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_ne_u16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_ne_u16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x7a,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x7a,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_ne_u16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_ne_u16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x7a,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x7a,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_ne_u16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_ne_u16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7a,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x7a,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x7a,0x7c,0x7f,0x6f,0x0d,0x30
# W32: v_cmp_ne_u16 vcc_lo, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x7a,0x7c,0x7f,0x6f,0x0d,0x30]
# W64: v_cmp_ne_u16 vcc, v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x7a,0x7c,0x7f,0x6f,0x0d,0x30]
-0xfa,0xfe,0x7a,0x7c,0x7f,0x6f,0x0d,0x30
+0xfa,0x04,0x9a,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_ne_u32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_ne_u32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x9a,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x9a,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_ne_u32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_ne_u32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x9a,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x9a,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_ne_u32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_ne_u32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x9a,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x9a,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_ne_u32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_ne_u32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x9a,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x9a,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_ne_u32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_ne_u32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x9a,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x9a,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_ne_u32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ne_u32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x9a,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x9a,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_ne_u32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_ne_u32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x9a,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x9a,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_ne_u32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_ne_u32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x9a,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x9a,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_ne_u32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_ne_u32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x9a,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x9a,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_ne_u32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x2f,0x01,0xff]
# W64: v_cmp_ne_u32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x2f,0x01,0xff]
-0xfa,0x04,0x9a,0x7c,0x01,0x2f,0x01,0xff
+0xfa,0x04,0x9a,0x7c,0x01,0x50,0x01,0xff
# W32: v_cmp_ne_u32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x50,0x01,0xff]
# W64: v_cmp_ne_u32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x50,0x01,0xff]
-0xfa,0x04,0x9a,0x7c,0x01,0x50,0x01,0xff
+0xfa,0x04,0x9a,0x7c,0x01,0x5f,0x01,0x01
# W32: v_cmp_ne_u32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x5f,0x01,0x01]
# W64: v_cmp_ne_u32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x5f,0x01,0x01]
-0xfa,0x04,0x9a,0x7c,0x01,0x5f,0x01,0x01
+0xfa,0x04,0x9a,0x7c,0x01,0x60,0x01,0x13
# W32: v_cmp_ne_u32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x60,0x01,0x13]
# W64: v_cmp_ne_u32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x9a,0x7c,0x01,0x60,0x01,0x13]
-0xfa,0x04,0x9a,0x7c,0x01,0x60,0x01,0x13
+0xfa,0xfe,0x9b,0x7c,0xff,0x6f,0x0d,0x30
# W32: v_cmp_ne_u32 vcc_lo, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x9b,0x7c,0xff,0x6f,0x0d,0x30]
# W64: v_cmp_ne_u32 vcc, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x9b,0x7c,0xff,0x6f,0x0d,0x30]
-0xfa,0xfe,0x9b,0x7c,0xff,0x6f,0x0d,0x30
+0xfa,0x04,0x1a,0x7c,0x01,0x1b,0x00,0xff
# W32: v_cmp_neq_f16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x1b,0x00,0xff]
# W64: v_cmp_neq_f16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x1b,0x00,0xff]
-0xfa,0x04,0x1a,0x7c,0x01,0x1b,0x00,0xff
+0xfa,0x04,0x1a,0x7c,0x01,0xe4,0x00,0xff
# W32: v_cmp_neq_f16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0xe4,0x00,0xff]
# W64: v_cmp_neq_f16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0xe4,0x00,0xff]
-0xfa,0x04,0x1a,0x7c,0x01,0xe4,0x00,0xff
+0xfa,0x04,0x1a,0x7c,0x01,0x40,0x01,0xff
# W32: v_cmp_neq_f16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x40,0x01,0xff]
# W64: v_cmp_neq_f16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x40,0x01,0xff]
-0xfa,0x04,0x1a,0x7c,0x01,0x40,0x01,0xff
+0xfa,0x04,0x1a,0x7c,0x01,0x41,0x01,0xff
# W32: v_cmp_neq_f16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x41,0x01,0xff]
# W64: v_cmp_neq_f16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x41,0x01,0xff]
-0xfa,0x04,0x1a,0x7c,0x01,0x41,0x01,0xff
+0xfa,0x04,0x1a,0x7c,0x01,0x01,0x01,0xff
# W32: v_cmp_neq_f16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x01,0x01,0xff]
# W64: v_cmp_neq_f16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x01,0x01,0xff]
-0xfa,0x04,0x1a,0x7c,0x01,0x01,0x01,0xff
+0xfa,0x04,0x1a,0x7c,0x01,0x0f,0x01,0xff
# W32: v_cmp_neq_f16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x0f,0x01,0xff]
# W64: v_cmp_neq_f16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x0f,0x01,0xff]
-0xfa,0x04,0x1a,0x7c,0x01,0x0f,0x01,0xff
+0xfa,0x04,0x1a,0x7c,0x01,0x11,0x01,0xff
# W32: v_cmp_neq_f16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x11,0x01,0xff]
# W64: v_cmp_neq_f16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x11,0x01,0xff]
-0xfa,0x04,0x1a,0x7c,0x01,0x11,0x01,0xff
+0xfa,0x04,0x1a,0x7c,0x01,0x1f,0x01,0xff
# W32: v_cmp_neq_f16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x1f,0x01,0xff]
# W64: v_cmp_neq_f16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x1f,0x01,0xff]
-0xfa,0x04,0x1a,0x7c,0x01,0x1f,0x01,0xff
+0xfa,0x04,0x1a,0x7c,0x01,0x21,0x01,0xff
# W32: v_cmp_neq_f16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x21,0x01,0xff]
# W64: v_cmp_neq_f16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x21,0x01,0xff]
-0xfa,0x04,0x1a,0x7c,0x01,0x21,0x01,0xff
+0xfa,0x04,0x1a,0x7c,0x01,0x2f,0x01,0xff
# W32: v_cmp_neq_f16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding:
[0xfa,0x04,0x1a,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_neq_f16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x1a,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x1a,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_neq_f16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_neq_f16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x1a,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x1a,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_neq_f16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_neq_f16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x1a,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x1a,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_neq_f16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_neq_f16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1a,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x1a,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x1a,0x7c,0x7f,0x6f,0xfd,0x30 # W32: v_cmp_neq_f16 vcc_lo, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1a,0x7c,0x7f,0x6f,0xfd,0x30] # W64: v_cmp_neq_f16 vcc, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1a,0x7c,0x7f,0x6f,0xfd,0x30] -0xfa,0xfe,0x1a,0x7c,0x7f,0x6f,0xfd,0x30 +0xfa,0x04,0x3a,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_neq_f32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_neq_f32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x3a,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x3a,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_neq_f32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_neq_f32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x3a,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x3a,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_neq_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_neq_f32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x3a,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x3a,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_neq_f32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_neq_f32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x3a,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x3a,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_neq_f32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_neq_f32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x3a,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x3a,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_neq_f32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_neq_f32 vcc, v1, v2 row_shl:15 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x3a,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x3a,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_neq_f32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_neq_f32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x3a,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x3a,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_neq_f32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_neq_f32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x3a,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x3a,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_neq_f32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_neq_f32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x3a,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x3a,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_neq_f32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_neq_f32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x3a,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x3a,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_neq_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_neq_f32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x3a,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x3a,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_neq_f32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_neq_f32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x3a,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x3a,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_neq_f32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_neq_f32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x3a,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x3a,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x3b,0x7c,0xff,0x6f,0xfd,0x30 # W32: v_cmp_neq_f32 vcc_lo, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x3b,0x7c,0xff,0x6f,0xfd,0x30] # W64: v_cmp_neq_f32 vcc, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x3b,0x7c,0xff,0x6f,0xfd,0x30] -0xfa,0xfe,0x3b,0x7c,0xff,0x6f,0xfd,0x30 +0xfa,0x04,0x12,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_nge_f16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_nge_f16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x12,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x12,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_nge_f16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_nge_f16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0xe4,0x00,0xff] 
-0xfa,0x04,0x12,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x12,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_nge_f16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_nge_f16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x12,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x12,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_nge_f16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_nge_f16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x12,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x12,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_nge_f16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_nge_f16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x12,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x12,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_nge_f16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_nge_f16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x12,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x12,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_nge_f16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_nge_f16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x12,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x12,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_nge_f16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_nge_f16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x12,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x12,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_nge_f16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_nge_f16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x12,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x12,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_nge_f16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_nge_f16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x12,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x12,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_nge_f16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_nge_f16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x12,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x12,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_nge_f16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_nge_f16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x12,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x12,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_nge_f16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0xfa,0x04,0x12,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_nge_f16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x12,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x12,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x12,0x7c,0x7f,0x6f,0xfd,0x30 # W32: v_cmp_nge_f16 vcc_lo, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x12,0x7c,0x7f,0x6f,0xfd,0x30] # W64: v_cmp_nge_f16 vcc, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x12,0x7c,0x7f,0x6f,0xfd,0x30] -0xfa,0xfe,0x12,0x7c,0x7f,0x6f,0xfd,0x30 +0xfa,0x04,0x32,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_nge_f32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_nge_f32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x32,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x32,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_nge_f32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_nge_f32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x32,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x32,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_nge_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_nge_f32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x32,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x32,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_nge_f32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_nge_f32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x32,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x32,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_nge_f32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_nge_f32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x32,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x32,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_nge_f32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_nge_f32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x32,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x32,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_nge_f32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_nge_f32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x32,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x32,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_nge_f32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_nge_f32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x32,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x32,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_nge_f32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_nge_f32 vcc, v1, v2 row_ror:1 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x32,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x32,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_nge_f32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_nge_f32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x32,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x32,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_nge_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_nge_f32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x32,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x32,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_nge_f32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_nge_f32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x32,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x32,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_nge_f32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_nge_f32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x32,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x32,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x33,0x7c,0xff,0x6f,0xfd,0x30 # W32: v_cmp_nge_f32 vcc_lo, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x33,0x7c,0xff,0x6f,0xfd,0x30] # W64: v_cmp_nge_f32 vcc, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x33,0x7c,0xff,0x6f,0xfd,0x30] -0xfa,0xfe,0x33,0x7c,0xff,0x6f,0xfd,0x30 +0xfa,0x04,0x16,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_ngt_f16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_ngt_f16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x16,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x16,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_ngt_f16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_ngt_f16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x16,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x16,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_ngt_f16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_ngt_f16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x16,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x16,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_ngt_f16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_ngt_f16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x16,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x16,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_ngt_f16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_ngt_f16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x01,0x01,0xff] 
-0xfa,0x04,0x16,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x16,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_ngt_f16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_ngt_f16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x16,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x16,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_ngt_f16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_ngt_f16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x16,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x16,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_ngt_f16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_ngt_f16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x16,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x16,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_ngt_f16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_ngt_f16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x16,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x16,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_ngt_f16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_ngt_f16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x16,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x16,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_ngt_f16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_ngt_f16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x16,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x16,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_ngt_f16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_ngt_f16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x16,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x16,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_ngt_f16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_ngt_f16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x16,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x16,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x16,0x7c,0x7f,0x6f,0xfd,0x30 # W32: v_cmp_ngt_f16 vcc_lo, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x16,0x7c,0x7f,0x6f,0xfd,0x30] # W64: v_cmp_ngt_f16 vcc, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x16,0x7c,0x7f,0x6f,0xfd,0x30] -0xfa,0xfe,0x16,0x7c,0x7f,0x6f,0xfd,0x30 +0xfa,0x04,0x36,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_ngt_f32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_ngt_f32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x36,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x36,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_ngt_f32 vcc_lo, v1, v2 
quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_ngt_f32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x36,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x36,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_ngt_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_ngt_f32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x36,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x36,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_ngt_f32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_ngt_f32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x36,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x36,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_ngt_f32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_ngt_f32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x36,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x36,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_ngt_f32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_ngt_f32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x36,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x36,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_ngt_f32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_ngt_f32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x36,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x36,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_ngt_f32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_ngt_f32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x36,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x36,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_ngt_f32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_ngt_f32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x36,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x36,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_ngt_f32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_ngt_f32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x36,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x36,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_ngt_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_ngt_f32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x36,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x36,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_ngt_f32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_ngt_f32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0xfa,0x04,0x36,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x36,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x36,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_ngt_f32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_ngt_f32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x36,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x36,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x37,0x7c,0xff,0x6f,0xfd,0x30 # W32: v_cmp_ngt_f32 vcc_lo, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x37,0x7c,0xff,0x6f,0xfd,0x30] # W64: v_cmp_ngt_f32 vcc, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x37,0x7c,0xff,0x6f,0xfd,0x30] -0xfa,0xfe,0x37,0x7c,0xff,0x6f,0xfd,0x30 +0xfa,0x04,0x18,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_nle_f16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_nle_f16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x18,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x18,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_nle_f16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_nle_f16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x18,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x18,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_nle_f16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_nle_f16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x18,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x18,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_nle_f16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_nle_f16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x18,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x18,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_nle_f16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_nle_f16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x18,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x18,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_nle_f16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_nle_f16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x18,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x18,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_nle_f16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_nle_f16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x18,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x18,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_nle_f16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_nle_f16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x18,0x7c,0x01,0x1f,0x01,0xff 
+0xfa,0x04,0x18,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_nle_f16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_nle_f16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x18,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x18,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_nle_f16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_nle_f16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x18,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x18,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_nle_f16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_nle_f16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x18,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x18,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_nle_f16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_nle_f16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x18,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x18,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_nle_f16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_nle_f16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x18,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x18,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x18,0x7c,0x7f,0x6f,0xfd,0x30 # W32: v_cmp_nle_f16 vcc_lo, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x18,0x7c,0x7f,0x6f,0xfd,0x30] # W64: v_cmp_nle_f16 vcc, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x18,0x7c,0x7f,0x6f,0xfd,0x30] -0xfa,0xfe,0x18,0x7c,0x7f,0x6f,0xfd,0x30 +0xfa,0x04,0x38,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_nle_f32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_nle_f32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x38,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x38,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_nle_f32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_nle_f32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x38,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x38,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_nle_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_nle_f32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x38,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x38,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_nle_f32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_nle_f32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x38,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x38,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_nle_f32 vcc_lo, v1, v2 row_shl:1 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_nle_f32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x38,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x38,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_nle_f32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_nle_f32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x38,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x38,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_nle_f32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_nle_f32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x38,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x38,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_nle_f32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_nle_f32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x38,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x38,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_nle_f32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_nle_f32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x38,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x38,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_nle_f32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_nle_f32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x38,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x38,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_nle_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_nle_f32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x38,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x38,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_nle_f32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_nle_f32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x38,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x38,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_nle_f32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_nle_f32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x38,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x38,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x39,0x7c,0xff,0x6f,0xfd,0x30 # W32: v_cmp_nle_f32 vcc_lo, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x39,0x7c,0xff,0x6f,0xfd,0x30] # W64: v_cmp_nle_f32 vcc, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x39,0x7c,0xff,0x6f,0xfd,0x30] -0xfa,0xfe,0x39,0x7c,0xff,0x6f,0xfd,0x30 +0xfa,0x04,0x14,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_nlg_f16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_nlg_f16 vcc, v1, v2 
quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x14,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x14,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_nlg_f16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_nlg_f16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x14,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x14,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_nlg_f16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_nlg_f16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x14,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x14,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_nlg_f16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_nlg_f16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x14,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x14,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_nlg_f16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_nlg_f16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x14,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x14,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_nlg_f16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_nlg_f16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x14,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x14,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_nlg_f16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_nlg_f16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x14,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x14,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_nlg_f16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_nlg_f16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x14,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x14,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_nlg_f16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_nlg_f16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x14,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x14,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_nlg_f16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_nlg_f16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x14,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x14,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_nlg_f16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_nlg_f16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x14,0x7c,0x01,0x50,0x01,0xff 
+0xfa,0x04,0x14,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_nlg_f16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_nlg_f16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x14,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x14,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_nlg_f16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_nlg_f16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x14,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x14,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x14,0x7c,0x7f,0x6f,0xfd,0x30 # W32: v_cmp_nlg_f16 vcc_lo, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x14,0x7c,0x7f,0x6f,0xfd,0x30] # W64: v_cmp_nlg_f16 vcc, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x14,0x7c,0x7f,0x6f,0xfd,0x30] -0xfa,0xfe,0x14,0x7c,0x7f,0x6f,0xfd,0x30 +0xfa,0x04,0x34,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_nlg_f32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_nlg_f32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x34,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x34,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_nlg_f32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_nlg_f32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x34,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x34,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_nlg_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_nlg_f32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x34,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x34,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_nlg_f32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_nlg_f32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x34,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x34,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_nlg_f32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_nlg_f32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x34,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x34,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_nlg_f32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_nlg_f32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x34,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x34,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_nlg_f32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_nlg_f32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x34,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x34,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_nlg_f32 vcc_lo, v1, v2 row_shr:15 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_nlg_f32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x34,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x34,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_nlg_f32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_nlg_f32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x34,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x34,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_nlg_f32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_nlg_f32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x34,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x34,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_nlg_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_nlg_f32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x34,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x34,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_nlg_f32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_nlg_f32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x34,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x34,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_nlg_f32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_nlg_f32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x34,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x34,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x35,0x7c,0xff,0x6f,0xfd,0x30 # W32: v_cmp_nlg_f32 vcc_lo, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x35,0x7c,0xff,0x6f,0xfd,0x30] # W64: v_cmp_nlg_f32 vcc, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x35,0x7c,0xff,0x6f,0xfd,0x30] -0xfa,0xfe,0x35,0x7c,0xff,0x6f,0xfd,0x30 +0xfa,0x04,0x1c,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_nlt_f16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_nlt_f16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x1c,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x1c,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_nlt_f16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_nlt_f16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x1c,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x1c,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_nlt_f16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_nlt_f16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x1c,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x1c,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_nlt_f16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x41,0x01,0xff] # W64: 
v_cmp_nlt_f16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x1c,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x1c,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_nlt_f16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_nlt_f16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x1c,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x1c,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_nlt_f16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_nlt_f16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x1c,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x1c,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_nlt_f16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_nlt_f16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x1c,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x1c,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_nlt_f16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_nlt_f16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x1c,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x1c,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_nlt_f16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_nlt_f16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x1c,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x1c,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_nlt_f16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_nlt_f16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x1c,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x1c,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_nlt_f16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_nlt_f16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x1c,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x1c,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_nlt_f16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_nlt_f16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x1c,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x1c,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_nlt_f16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_nlt_f16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1c,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x1c,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x1c,0x7c,0x7f,0x6f,0xfd,0x30 # W32: v_cmp_nlt_f16 vcc_lo, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1c,0x7c,0x7f,0x6f,0xfd,0x30] # W64: v_cmp_nlt_f16 vcc, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1c,0x7c,0x7f,0x6f,0xfd,0x30] 
-0xfa,0xfe,0x1c,0x7c,0x7f,0x6f,0xfd,0x30 +0xfa,0x04,0x3c,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_nlt_f32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_nlt_f32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x3c,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x3c,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_nlt_f32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_nlt_f32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x3c,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x3c,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_nlt_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_nlt_f32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x3c,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x3c,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_nlt_f32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_nlt_f32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x3c,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x3c,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_nlt_f32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_nlt_f32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x3c,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x3c,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_nlt_f32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_nlt_f32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x3c,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x3c,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_nlt_f32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_nlt_f32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x3c,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x3c,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_nlt_f32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_nlt_f32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x3c,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x3c,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_nlt_f32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_nlt_f32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x3c,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x3c,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_nlt_f32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_nlt_f32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x3c,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x3c,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_nlt_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_nlt_f32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x3c,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x3c,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_nlt_f32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_nlt_f32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x3c,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x3c,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_nlt_f32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_nlt_f32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x3c,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x3c,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x3d,0x7c,0xff,0x6f,0xfd,0x30 # W32: v_cmp_nlt_f32 vcc_lo, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x3d,0x7c,0xff,0x6f,0xfd,0x30] # W64: v_cmp_nlt_f32 vcc, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x3d,0x7c,0xff,0x6f,0xfd,0x30] -0xfa,0xfe,0x3d,0x7c,0xff,0x6f,0xfd,0x30 +0xfa,0x04,0x0e,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_o_f16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_o_f16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x0e,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x0e,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_o_f16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_o_f16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x0e,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x0e,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_o_f16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_o_f16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x0e,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x0e,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_o_f16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_o_f16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x0e,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x0e,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_o_f16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_o_f16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x0e,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x0e,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_o_f16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_o_f16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x0e,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x0e,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_o_f16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_o_f16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 
; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x0e,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x0e,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_o_f16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_o_f16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x0e,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x0e,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_o_f16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_o_f16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x0e,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x0e,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_o_f16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_o_f16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x0e,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x0e,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_o_f16 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_o_f16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x0e,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x0e,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_o_f16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_o_f16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x0e,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x0e,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_o_f16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_o_f16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0e,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x0e,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x0e,0x7c,0x7f,0x6f,0xfd,0x30 # W32: v_cmp_o_f16 vcc_lo, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0e,0x7c,0x7f,0x6f,0xfd,0x30] # W64: v_cmp_o_f16 vcc, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0e,0x7c,0x7f,0x6f,0xfd,0x30] -0xfa,0xfe,0x0e,0x7c,0x7f,0x6f,0xfd,0x30 +0xfa,0x04,0x2e,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_o_f32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_o_f32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x2e,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x2e,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_o_f32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_o_f32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x2e,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x2e,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_o_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_o_f32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x2e,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x2e,0x7c,0x01,0x41,0x01,0xff # W32: 
v_cmp_o_f32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_o_f32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x2e,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x2e,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_o_f32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_o_f32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x2e,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x2e,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_o_f32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_o_f32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x2e,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x2e,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_o_f32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_o_f32 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x2e,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x2e,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_o_f32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_o_f32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x2e,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x2e,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_o_f32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_o_f32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x2e,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x2e,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_o_f32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_o_f32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x2e,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x2e,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_o_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_o_f32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x2e,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x2e,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_o_f32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_o_f32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x2e,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x2e,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_o_f32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_o_f32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x2e,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x2e,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x2f,0x7c,0xff,0x6f,0xfd,0x30 # W32: v_cmp_o_f32 vcc_lo, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x2f,0x7c,0xff,0x6f,0xfd,0x30] # W64: v_cmp_o_f32 vcc, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 
bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x2f,0x7c,0xff,0x6f,0xfd,0x30] -0xfa,0xfe,0x2f,0x7c,0xff,0x6f,0xfd,0x30 +0xfa,0x04,0x10,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_u_f16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_u_f16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x10,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x10,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_u_f16 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_u_f16 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x10,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x10,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_u_f16 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_u_f16 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x10,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x10,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_u_f16 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_u_f16 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x10,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x10,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_u_f16 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_u_f16 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x10,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x10,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_u_f16 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_u_f16 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x10,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x10,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_u_f16 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_u_f16 vcc, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x10,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x10,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_u_f16 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_u_f16 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x10,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x10,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_u_f16 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_u_f16 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x10,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x10,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_u_f16 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_u_f16 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x10,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x10,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_u_f16 vcc_lo, v1, v2 row_share:0 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_u_f16 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x10,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x10,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_u_f16 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_u_f16 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x10,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x10,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_u_f16 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_u_f16 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x10,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x10,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x10,0x7c,0x7f,0x6f,0xfd,0x30 # W32: v_cmp_u_f16 vcc_lo, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x10,0x7c,0x7f,0x6f,0xfd,0x30] # W64: v_cmp_u_f16 vcc, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x10,0x7c,0x7f,0x6f,0xfd,0x30] -0xfa,0xfe,0x10,0x7c,0x7f,0x6f,0xfd,0x30 +0xfa,0x04,0x30,0x7c,0x01,0x1b,0x00,0xff # W32: v_cmp_u_f32 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_u_f32 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x30,0x7c,0x01,0x1b,0x00,0xff +0xfa,0x04,0x30,0x7c,0x01,0xe4,0x00,0xff # W32: v_cmp_u_f32 vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0xe4,0x00,0xff] # W64: v_cmp_u_f32 vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x30,0x7c,0x01,0xe4,0x00,0xff +0xfa,0x04,0x30,0x7c,0x01,0x40,0x01,0xff # W32: v_cmp_u_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x40,0x01,0xff] # W64: v_cmp_u_f32 vcc, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x40,0x01,0xff] -0xfa,0x04,0x30,0x7c,0x01,0x40,0x01,0xff +0xfa,0x04,0x30,0x7c,0x01,0x41,0x01,0xff # W32: v_cmp_u_f32 vcc_lo, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x41,0x01,0xff] # W64: v_cmp_u_f32 vcc, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x41,0x01,0xff] -0xfa,0x04,0x30,0x7c,0x01,0x41,0x01,0xff +0xfa,0x04,0x30,0x7c,0x01,0x01,0x01,0xff # W32: v_cmp_u_f32 vcc_lo, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x01,0x01,0xff] # W64: v_cmp_u_f32 vcc, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x01,0x01,0xff] -0xfa,0x04,0x30,0x7c,0x01,0x01,0x01,0xff +0xfa,0x04,0x30,0x7c,0x01,0x0f,0x01,0xff # W32: v_cmp_u_f32 vcc_lo, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x0f,0x01,0xff] # W64: v_cmp_u_f32 vcc, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x30,0x7c,0x01,0x0f,0x01,0xff +0xfa,0x04,0x30,0x7c,0x01,0x11,0x01,0xff # W32: v_cmp_u_f32 vcc_lo, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x11,0x01,0xff] # W64: v_cmp_u_f32 vcc, v1, v2 row_shr:1 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x11,0x01,0xff] -0xfa,0x04,0x30,0x7c,0x01,0x11,0x01,0xff +0xfa,0x04,0x30,0x7c,0x01,0x1f,0x01,0xff # W32: v_cmp_u_f32 vcc_lo, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x1f,0x01,0xff] # W64: v_cmp_u_f32 vcc, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x30,0x7c,0x01,0x1f,0x01,0xff +0xfa,0x04,0x30,0x7c,0x01,0x21,0x01,0xff # W32: v_cmp_u_f32 vcc_lo, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x21,0x01,0xff] # W64: v_cmp_u_f32 vcc, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x21,0x01,0xff] -0xfa,0x04,0x30,0x7c,0x01,0x21,0x01,0xff +0xfa,0x04,0x30,0x7c,0x01,0x2f,0x01,0xff # W32: v_cmp_u_f32 vcc_lo, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x2f,0x01,0xff] # W64: v_cmp_u_f32 vcc, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x30,0x7c,0x01,0x2f,0x01,0xff +0xfa,0x04,0x30,0x7c,0x01,0x50,0x01,0xff # W32: v_cmp_u_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x50,0x01,0xff] # W64: v_cmp_u_f32 vcc, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x50,0x01,0xff] -0xfa,0x04,0x30,0x7c,0x01,0x50,0x01,0xff +0xfa,0x04,0x30,0x7c,0x01,0x5f,0x01,0x01 # W32: v_cmp_u_f32 vcc_lo, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x5f,0x01,0x01] # W64: v_cmp_u_f32 vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x30,0x7c,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x30,0x7c,0x01,0x60,0x01,0x13 # W32: v_cmp_u_f32 vcc_lo, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x60,0x01,0x13] # W64: v_cmp_u_f32 vcc, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x30,0x7c,0x01,0x60,0x01,0x13] -0xfa,0x04,0x30,0x7c,0x01,0x60,0x01,0x13 +0xfa,0xfe,0x31,0x7c,0xff,0x6f,0xfd,0x30 # W32: v_cmp_u_f32 vcc_lo, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x31,0x7c,0xff,0x6f,0xfd,0x30] # W64: v_cmp_u_f32 vcc, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x31,0x7c,0xff,0x6f,0xfd,0x30] -0xfa,0xfe,0x31,0x7c,0xff,0x6f,0xfd,0x30 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopc_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopc_dpp8.txt index 0300ff215c352..a060e97e0161b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopc_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopc_dpp8.txt @@ -1,436 +1,437 @@ +# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W32 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W64 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W32 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W64 +0xe9,0x04,0xfa,0x7c,0x01,0x77,0x39,0x05 # W32: 
v_cmp_class_f16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfa,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_class_f16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfa,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0xfa,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0xfa,0x7c,0x7f,0x00,0x00,0x00 # W32: v_cmp_class_f16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfa,0x7c,0x7f,0x00,0x00,0x00] # W64: v_cmp_class_f16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfa,0x7c,0x7f,0x00,0x00,0x00] -0xea,0xfe,0xfa,0x7c,0x7f,0x00,0x00,0x00 +0xe9,0x04,0xfc,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_class_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfc,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_class_f32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfc,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0xfc,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0xfd,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_class_f32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfd,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_class_f32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfd,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0xfd,0x7c,0xff,0x00,0x00,0x00 +0xe9,0x04,0x04,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_eq_f16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x04,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_eq_f16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x04,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x04,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x04,0x7c,0x7f,0x00,0x00,0x00 # W32: v_cmp_eq_f16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x04,0x7c,0x7f,0x00,0x00,0x00] # W64: v_cmp_eq_f16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x04,0x7c,0x7f,0x00,0x00,0x00] -0xea,0xfe,0x04,0x7c,0x7f,0x00,0x00,0x00 +0xe9,0x04,0x24,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_eq_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x24,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_eq_f32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x24,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x24,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x25,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_eq_f32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x25,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_eq_f32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x25,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0x25,0x7c,0xff,0x00,0x00,0x00 +0xe9,0x04,0x64,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_eq_i16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x64,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_eq_i16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x64,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x64,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x64,0x7c,0x7f,0x00,0x00,0x00 # W32: v_cmp_eq_i16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x64,0x7c,0x7f,0x00,0x00,0x00] # W64: v_cmp_eq_i16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x64,0x7c,0x7f,0x00,0x00,0x00] -0xea,0xfe,0x64,0x7c,0x7f,0x00,0x00,0x00 +0xe9,0x04,0x84,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_eq_i32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x84,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_eq_i32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x84,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x84,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x85,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_eq_i32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xea,0xfe,0x85,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_eq_i32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x85,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0x85,0x7c,0xff,0x00,0x00,0x00 +0xe9,0x04,0x74,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_eq_u16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x74,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_eq_u16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x74,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x74,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x74,0x7c,0x7f,0x00,0x00,0x00 # W32: v_cmp_eq_u16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x74,0x7c,0x7f,0x00,0x00,0x00] # W64: v_cmp_eq_u16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x74,0x7c,0x7f,0x00,0x00,0x00] -0xea,0xfe,0x74,0x7c,0x7f,0x00,0x00,0x00 +0xe9,0x04,0x94,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_eq_u32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x94,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_eq_u32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x94,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x94,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x95,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_eq_u32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x95,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_eq_u32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x95,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0x95,0x7c,0xff,0x00,0x00,0x00 +0xe9,0x04,0x0c,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_f16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0c,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_ge_f16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0c,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x0c,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x0c,0x7c,0x7f,0x00,0x00,0x00 # W32: v_cmp_ge_f16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0c,0x7c,0x7f,0x00,0x00,0x00] # W64: v_cmp_ge_f16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0c,0x7c,0x7f,0x00,0x00,0x00] -0xea,0xfe,0x0c,0x7c,0x7f,0x00,0x00,0x00 +0xe9,0x04,0x2c,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2c,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_ge_f32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2c,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x2c,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x2d,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_ge_f32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x2d,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_ge_f32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x2d,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0x2d,0x7c,0xff,0x00,0x00,0x00 +0xe9,0x04,0x6c,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_i16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6c,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_ge_i16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6c,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x6c,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x6c,0x7c,0x7f,0x00,0x00,0x00 # W32: v_cmp_ge_i16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6c,0x7c,0x7f,0x00,0x00,0x00] # W64: v_cmp_ge_i16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6c,0x7c,0x7f,0x00,0x00,0x00] -0xea,0xfe,0x6c,0x7c,0x7f,0x00,0x00,0x00 +0xe9,0x04,0x8c,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_i32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x8c,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_ge_i32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xe9,0x04,0x8c,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x8c,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x8d,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_ge_i32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x8d,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_ge_i32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x8d,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0x8d,0x7c,0xff,0x00,0x00,0x00 +0xe9,0x04,0x7c,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_u16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7c,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_ge_u16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7c,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x7c,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x7c,0x7c,0x7f,0x00,0x00,0x00 # W32: v_cmp_ge_u16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7c,0x7c,0x7f,0x00,0x00,0x00] # W64: v_cmp_ge_u16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7c,0x7c,0x7f,0x00,0x00,0x00] -0xea,0xfe,0x7c,0x7c,0x7f,0x00,0x00,0x00 +0xe9,0x04,0x9c,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_ge_u32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x9c,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_ge_u32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x9c,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x9c,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x9d,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_ge_u32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x9d,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_ge_u32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x9d,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0x9d,0x7c,0xff,0x00,0x00,0x00 +0xe9,0x04,0x08,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_gt_f16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x08,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_gt_f16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x08,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x08,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x08,0x7c,0x7f,0x00,0x00,0x00 # W32: v_cmp_gt_f16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x08,0x7c,0x7f,0x00,0x00,0x00] # W64: v_cmp_gt_f16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x08,0x7c,0x7f,0x00,0x00,0x00] -0xea,0xfe,0x08,0x7c,0x7f,0x00,0x00,0x00 +0xe9,0x04,0x28,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_gt_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x28,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_gt_f32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x28,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x28,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x29,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_gt_f32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x29,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_gt_f32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x29,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0x29,0x7c,0xff,0x00,0x00,0x00 +0xe9,0x04,0x68,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_gt_i16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x68,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_gt_i16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x68,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x68,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x68,0x7c,0x7f,0x00,0x00,0x00 # W32: v_cmp_gt_i16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x68,0x7c,0x7f,0x00,0x00,0x00] # W64: v_cmp_gt_i16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x68,0x7c,0x7f,0x00,0x00,0x00] -0xea,0xfe,0x68,0x7c,0x7f,0x00,0x00,0x00 
+0xe9,0x04,0x88,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_gt_i32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x88,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_gt_i32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x88,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x88,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x89,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_gt_i32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x89,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_gt_i32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x89,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0x89,0x7c,0xff,0x00,0x00,0x00 +0xe9,0x04,0x78,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_gt_u16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x78,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_gt_u16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x78,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x78,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x78,0x7c,0x7f,0x00,0x00,0x00 # W32: v_cmp_gt_u16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x78,0x7c,0x7f,0x00,0x00,0x00] # W64: v_cmp_gt_u16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x78,0x7c,0x7f,0x00,0x00,0x00] -0xea,0xfe,0x78,0x7c,0x7f,0x00,0x00,0x00 +0xe9,0x04,0x98,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_gt_u32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x98,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_gt_u32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x98,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x98,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x99,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_gt_u32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x99,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_gt_u32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x99,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0x99,0x7c,0xff,0x00,0x00,0x00 +0xe9,0x04,0x06,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_le_f16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x06,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_le_f16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x06,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x06,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x06,0x7c,0x7f,0x00,0x00,0x00 # W32: v_cmp_le_f16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x06,0x7c,0x7f,0x00,0x00,0x00] # W64: v_cmp_le_f16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x06,0x7c,0x7f,0x00,0x00,0x00] -0xea,0xfe,0x06,0x7c,0x7f,0x00,0x00,0x00 +0xe9,0x04,0x26,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_le_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x26,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_le_f32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x26,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x26,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x27,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_le_f32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x27,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_le_f32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x27,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0x27,0x7c,0xff,0x00,0x00,0x00 +0xe9,0x04,0x66,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_le_i16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x66,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_le_i16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x66,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x66,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x66,0x7c,0x7f,0x00,0x00,0x00 # W32: v_cmp_le_i16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xea,0xfe,0x66,0x7c,0x7f,0x00,0x00,0x00] # W64: v_cmp_le_i16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x66,0x7c,0x7f,0x00,0x00,0x00] -0xea,0xfe,0x66,0x7c,0x7f,0x00,0x00,0x00 +0xe9,0x04,0x86,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_le_i32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x86,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_le_i32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x86,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x86,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x87,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_le_i32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x87,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_le_i32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x87,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0x87,0x7c,0xff,0x00,0x00,0x00 +0xe9,0x04,0x76,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_le_u16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x76,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_le_u16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x76,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x76,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x76,0x7c,0x7f,0x00,0x00,0x00 # W32: v_cmp_le_u16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x76,0x7c,0x7f,0x00,0x00,0x00] # W64: v_cmp_le_u16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x76,0x7c,0x7f,0x00,0x00,0x00] -0xea,0xfe,0x76,0x7c,0x7f,0x00,0x00,0x00 +0xe9,0x04,0x96,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_le_u32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x96,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_le_u32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x96,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x96,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x97,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_le_u32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x97,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_le_u32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x97,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0x97,0x7c,0xff,0x00,0x00,0x00 +0xe9,0x04,0x0a,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_lg_f16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_lg_f16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x0a,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x0a,0x7c,0x7f,0x00,0x00,0x00 # W32: v_cmp_lg_f16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0a,0x7c,0x7f,0x00,0x00,0x00] # W64: v_cmp_lg_f16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0a,0x7c,0x7f,0x00,0x00,0x00] -0xea,0xfe,0x0a,0x7c,0x7f,0x00,0x00,0x00 +0xe9,0x04,0x2a,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_lg_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2a,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_lg_f32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2a,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x2a,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x2b,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_lg_f32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x2b,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_lg_f32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x2b,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0x2b,0x7c,0xff,0x00,0x00,0x00 +0xe9,0x04,0x02,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_lt_f16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x02,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_lt_f16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xe9,0x04,0x02,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x02,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x02,0x7c,0x7f,0x00,0x00,0x00 # W32: v_cmp_lt_f16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x02,0x7c,0x7f,0x00,0x00,0x00] # W64: v_cmp_lt_f16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x02,0x7c,0x7f,0x00,0x00,0x00] -0xea,0xfe,0x02,0x7c,0x7f,0x00,0x00,0x00 +0xe9,0x04,0x22,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_lt_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x22,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_lt_f32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x22,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x22,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x23,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_lt_f32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x23,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_lt_f32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x23,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0x23,0x7c,0xff,0x00,0x00,0x00 +0xe9,0x04,0x62,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_lt_i16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x62,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_lt_i16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x62,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x62,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x62,0x7c,0x7f,0x00,0x00,0x00 # W32: v_cmp_lt_i16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x62,0x7c,0x7f,0x00,0x00,0x00] # W64: v_cmp_lt_i16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x62,0x7c,0x7f,0x00,0x00,0x00] -0xea,0xfe,0x62,0x7c,0x7f,0x00,0x00,0x00 +0xe9,0x04,0x82,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_lt_i32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x82,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_lt_i32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x82,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x82,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x83,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_lt_i32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x83,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_lt_i32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x83,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0x83,0x7c,0xff,0x00,0x00,0x00 +0xe9,0x04,0x72,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_lt_u16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x72,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_lt_u16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x72,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x72,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x72,0x7c,0x7f,0x00,0x00,0x00 # W32: v_cmp_lt_u16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x72,0x7c,0x7f,0x00,0x00,0x00] # W64: v_cmp_lt_u16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x72,0x7c,0x7f,0x00,0x00,0x00] -0xea,0xfe,0x72,0x7c,0x7f,0x00,0x00,0x00 +0xe9,0x04,0x92,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_lt_u32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x92,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_lt_u32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x92,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x92,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x93,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_lt_u32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x93,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_lt_u32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x93,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0x93,0x7c,0xff,0x00,0x00,0x00 
+0xe9,0x04,0x6a,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_ne_i16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6a,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_ne_i16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6a,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x6a,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x6a,0x7c,0x7f,0x00,0x00,0x00 # W32: v_cmp_ne_i16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6a,0x7c,0x7f,0x00,0x00,0x00] # W64: v_cmp_ne_i16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6a,0x7c,0x7f,0x00,0x00,0x00] -0xea,0xfe,0x6a,0x7c,0x7f,0x00,0x00,0x00 +0xe9,0x04,0x8a,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_ne_i32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x8a,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_ne_i32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x8a,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x8a,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x8b,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_ne_i32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x8b,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_ne_i32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x8b,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0x8b,0x7c,0xff,0x00,0x00,0x00 +0xe9,0x04,0x7a,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_ne_u16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7a,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_ne_u16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7a,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x7a,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x7a,0x7c,0x7f,0x00,0x00,0x00 # W32: v_cmp_ne_u16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7a,0x7c,0x7f,0x00,0x00,0x00] # W64: v_cmp_ne_u16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7a,0x7c,0x7f,0x00,0x00,0x00] -0xea,0xfe,0x7a,0x7c,0x7f,0x00,0x00,0x00 +0xe9,0x04,0x9a,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_ne_u32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x9a,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_ne_u32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x9a,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x9a,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x9b,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_ne_u32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x9b,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_ne_u32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x9b,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0x9b,0x7c,0xff,0x00,0x00,0x00 +0xe9,0x04,0x1a,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_neq_f16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1a,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_neq_f16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1a,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x1a,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x1a,0x7c,0x7f,0x00,0x00,0x00 # W32: v_cmp_neq_f16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1a,0x7c,0x7f,0x00,0x00,0x00] # W64: v_cmp_neq_f16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1a,0x7c,0x7f,0x00,0x00,0x00] -0xea,0xfe,0x1a,0x7c,0x7f,0x00,0x00,0x00 +0xe9,0x04,0x3a,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_neq_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3a,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_neq_f32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3a,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x3a,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x3b,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_neq_f32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xea,0xfe,0x3b,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_neq_f32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x3b,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0x3b,0x7c,0xff,0x00,0x00,0x00 +0xe9,0x04,0x12,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_nge_f16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x12,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_nge_f16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x12,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x12,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x12,0x7c,0x7f,0x00,0x00,0x00 # W32: v_cmp_nge_f16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x12,0x7c,0x7f,0x00,0x00,0x00] # W64: v_cmp_nge_f16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x12,0x7c,0x7f,0x00,0x00,0x00] -0xea,0xfe,0x12,0x7c,0x7f,0x00,0x00,0x00 +0xe9,0x04,0x32,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_nge_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x32,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_nge_f32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x32,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x32,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x33,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_nge_f32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x33,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_nge_f32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x33,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0x33,0x7c,0xff,0x00,0x00,0x00 +0xe9,0x04,0x16,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_ngt_f16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x16,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_ngt_f16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x16,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x16,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x16,0x7c,0x7f,0x00,0x00,0x00 # W32: v_cmp_ngt_f16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x16,0x7c,0x7f,0x00,0x00,0x00] # W64: v_cmp_ngt_f16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x16,0x7c,0x7f,0x00,0x00,0x00] -0xea,0xfe,0x16,0x7c,0x7f,0x00,0x00,0x00 +0xe9,0x04,0x36,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_ngt_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x36,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_ngt_f32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x36,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x36,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x37,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_ngt_f32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x37,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_ngt_f32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x37,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0x37,0x7c,0xff,0x00,0x00,0x00 +0xe9,0x04,0x18,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_nle_f16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x18,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_nle_f16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x18,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x18,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x18,0x7c,0x7f,0x00,0x00,0x00 # W32: v_cmp_nle_f16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x18,0x7c,0x7f,0x00,0x00,0x00] # W64: v_cmp_nle_f16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x18,0x7c,0x7f,0x00,0x00,0x00] -0xea,0xfe,0x18,0x7c,0x7f,0x00,0x00,0x00 +0xe9,0x04,0x38,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_nle_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x38,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_nle_f32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0xe9,0x04,0x38,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x38,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x39,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_nle_f32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x39,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_nle_f32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x39,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0x39,0x7c,0xff,0x00,0x00,0x00 +0xe9,0x04,0x14,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_nlg_f16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x14,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_nlg_f16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x14,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x14,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x14,0x7c,0x7f,0x00,0x00,0x00 # W32: v_cmp_nlg_f16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x14,0x7c,0x7f,0x00,0x00,0x00] # W64: v_cmp_nlg_f16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x14,0x7c,0x7f,0x00,0x00,0x00] -0xea,0xfe,0x14,0x7c,0x7f,0x00,0x00,0x00 +0xe9,0x04,0x34,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_nlg_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x34,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_nlg_f32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x34,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x34,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x35,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_nlg_f32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x35,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_nlg_f32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x35,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0x35,0x7c,0xff,0x00,0x00,0x00 +0xe9,0x04,0x1c,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_nlt_f16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1c,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_nlt_f16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1c,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x1c,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x1c,0x7c,0x7f,0x00,0x00,0x00 # W32: v_cmp_nlt_f16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1c,0x7c,0x7f,0x00,0x00,0x00] # W64: v_cmp_nlt_f16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1c,0x7c,0x7f,0x00,0x00,0x00] -0xea,0xfe,0x1c,0x7c,0x7f,0x00,0x00,0x00 +0xe9,0x04,0x3c,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_nlt_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3c,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_nlt_f32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3c,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x3c,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x3d,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_nlt_f32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x3d,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_nlt_f32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x3d,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0x3d,0x7c,0xff,0x00,0x00,0x00 +0xe9,0x04,0x0e,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_o_f16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0e,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_o_f16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0e,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x0e,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x0e,0x7c,0x7f,0x00,0x00,0x00 # W32: v_cmp_o_f16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0e,0x7c,0x7f,0x00,0x00,0x00] # W64: v_cmp_o_f16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0e,0x7c,0x7f,0x00,0x00,0x00] -0xea,0xfe,0x0e,0x7c,0x7f,0x00,0x00,0x00 
+0xe9,0x04,0x2e,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_o_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2e,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_o_f32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2e,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x2e,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x2f,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_o_f32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x2f,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_o_f32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x2f,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0x2f,0x7c,0xff,0x00,0x00,0x00 +0xe9,0x04,0x10,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_u_f16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x10,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_u_f16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x10,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x10,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x10,0x7c,0x7f,0x00,0x00,0x00 # W32: v_cmp_u_f16 vcc_lo, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x10,0x7c,0x7f,0x00,0x00,0x00] # W64: v_cmp_u_f16 vcc, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x10,0x7c,0x7f,0x00,0x00,0x00] -0xea,0xfe,0x10,0x7c,0x7f,0x00,0x00,0x00 +0xe9,0x04,0x30,0x7c,0x01,0x77,0x39,0x05 # W32: v_cmp_u_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x30,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_u_f32 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x30,0x7c,0x01,0x77,0x39,0x05] -0xe9,0x04,0x30,0x7c,0x01,0x77,0x39,0x05 +0xea,0xfe,0x31,0x7c,0xff,0x00,0x00,0x00 # W32: v_cmp_u_f32 vcc_lo, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x31,0x7c,0xff,0x00,0x00,0x00] # W64: v_cmp_u_f32 vcc, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x31,0x7c,0xff,0x00,0x00,0x00] -0xea,0xfe,0x31,0x7c,0xff,0x00,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt index 74213ba162ae7..180ec987280d1 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt @@ -1,3406 +1,3407 @@ +# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12 -# GFX12: v_cmpx_class_f16_e32 v1, v2 ; encoding: [0x01,0x05,0xfa,0x7d] 0x01,0x05,0xfa,0x7d +# GFX12: v_cmpx_class_f16_e32 v1, v2 ; encoding: [0x01,0x05,0xfa,0x7d] -# GFX12: v_cmpx_class_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0xfa,0x7d] 0x7f,0x05,0xfa,0x7d +# GFX12: v_cmpx_class_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0xfa,0x7d] -# GFX12: v_cmpx_class_f16_e32 s1, v2 ; encoding: [0x01,0x04,0xfa,0x7d] 0x01,0x04,0xfa,0x7d +# GFX12: v_cmpx_class_f16_e32 s1, v2 ; encoding: [0x01,0x04,0xfa,0x7d] -# GFX12: v_cmpx_class_f16_e32 s105, v2 ; encoding: [0x69,0x04,0xfa,0x7d] 0x69,0x04,0xfa,0x7d +# GFX12: v_cmpx_class_f16_e32 s105, v2 ; encoding: [0x69,0x04,0xfa,0x7d] -# GFX12: 
v_cmpx_class_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0xfa,0x7d] 0x6a,0x04,0xfa,0x7d +# GFX12: v_cmpx_class_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0xfa,0x7d] -# GFX12: v_cmpx_class_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0xfa,0x7d] 0x6b,0x04,0xfa,0x7d +# GFX12: v_cmpx_class_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0xfa,0x7d] -# GFX12: v_cmpx_class_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0xfa,0x7d] 0x7b,0x04,0xfa,0x7d +# GFX12: v_cmpx_class_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0xfa,0x7d] -# GFX12: v_cmpx_class_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0xfa,0x7d] 0x7d,0x04,0xfa,0x7d +# GFX12: v_cmpx_class_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0xfa,0x7d] -# GFX12: v_cmpx_class_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0xfa,0x7d] 0x7e,0x04,0xfa,0x7d +# GFX12: v_cmpx_class_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0xfa,0x7d] -# GFX12: v_cmpx_class_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0xfa,0x7d] 0x7f,0x04,0xfa,0x7d +# GFX12: v_cmpx_class_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0xfa,0x7d] -# GFX12: v_cmpx_class_f16_e32 null, v2 ; encoding: [0x7c,0x04,0xfa,0x7d] 0x7c,0x04,0xfa,0x7d +# GFX12: v_cmpx_class_f16_e32 null, v2 ; encoding: [0x7c,0x04,0xfa,0x7d] -# GFX12: v_cmpx_class_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0xfa,0x7d] 0xc1,0x04,0xfa,0x7d +# GFX12: v_cmpx_class_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0xfa,0x7d] -# GFX12: v_cmpx_class_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0xfa,0x7d] 0xf0,0x04,0xfa,0x7d +# GFX12: v_cmpx_class_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0xfa,0x7d] -# GFX12: v_cmpx_class_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0xfa,0x7d] 0xfd,0x04,0xfa,0x7d +# GFX12: v_cmpx_class_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0xfa,0x7d] -# GFX12: v_cmpx_class_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfa,0x7d,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0xfa,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_class_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfa,0x7d,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_class_f32_e32 v1, v2 ; encoding: [0x01,0x05,0xfc,0x7d] 0x01,0x05,0xfc,0x7d +# GFX12: v_cmpx_class_f32_e32 v1, v2 ; encoding: [0x01,0x05,0xfc,0x7d] -# GFX12: v_cmpx_class_f32_e32 v255, v2 ; encoding: [0xff,0x05,0xfc,0x7d] 0xff,0x05,0xfc,0x7d +# GFX12: v_cmpx_class_f32_e32 v255, v2 ; encoding: [0xff,0x05,0xfc,0x7d] -# GFX12: v_cmpx_class_f32_e32 s1, v2 ; encoding: [0x01,0x04,0xfc,0x7d] 0x01,0x04,0xfc,0x7d +# GFX12: v_cmpx_class_f32_e32 s1, v2 ; encoding: [0x01,0x04,0xfc,0x7d] -# GFX12: v_cmpx_class_f32_e32 s105, v2 ; encoding: [0x69,0x04,0xfc,0x7d] 0x69,0x04,0xfc,0x7d +# GFX12: v_cmpx_class_f32_e32 s105, v2 ; encoding: [0x69,0x04,0xfc,0x7d] -# GFX12: v_cmpx_class_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0xfc,0x7d] 0x6a,0x04,0xfc,0x7d +# GFX12: v_cmpx_class_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0xfc,0x7d] -# GFX12: v_cmpx_class_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0xfc,0x7d] 0x6b,0x04,0xfc,0x7d +# GFX12: v_cmpx_class_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0xfc,0x7d] -# GFX12: v_cmpx_class_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0xfc,0x7d] 0x7b,0x04,0xfc,0x7d +# GFX12: v_cmpx_class_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0xfc,0x7d] -# GFX12: v_cmpx_class_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0xfc,0x7d] 0x7d,0x04,0xfc,0x7d +# GFX12: v_cmpx_class_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0xfc,0x7d] -# GFX12: v_cmpx_class_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0xfc,0x7d] 0x7e,0x04,0xfc,0x7d +# GFX12: v_cmpx_class_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0xfc,0x7d] -# GFX12: v_cmpx_class_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0xfc,0x7d] 0x7f,0x04,0xfc,0x7d +# GFX12: 
v_cmpx_class_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0xfc,0x7d] -# GFX12: v_cmpx_class_f32_e32 null, v2 ; encoding: [0x7c,0x04,0xfc,0x7d] 0x7c,0x04,0xfc,0x7d +# GFX12: v_cmpx_class_f32_e32 null, v2 ; encoding: [0x7c,0x04,0xfc,0x7d] -# GFX12: v_cmpx_class_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0xfc,0x7d] 0xc1,0x04,0xfc,0x7d +# GFX12: v_cmpx_class_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0xfc,0x7d] -# GFX12: v_cmpx_class_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0xfc,0x7d] 0xf0,0x04,0xfc,0x7d +# GFX12: v_cmpx_class_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0xfc,0x7d] -# GFX12: v_cmpx_class_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0xfc,0x7d] 0xfd,0x04,0xfc,0x7d +# GFX12: v_cmpx_class_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0xfc,0x7d] -# GFX12: v_cmpx_class_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0xfd,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xfd,0x7d,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_class_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0xfd,0x7d,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_class_f64_e32 v[1:2], v2 ; encoding: [0x01,0x05,0xfe,0x7d] 0x01,0x05,0xfe,0x7d +# GFX12: v_cmpx_class_f64_e32 v[1:2], v2 ; encoding: [0x01,0x05,0xfe,0x7d] -# GFX12: v_cmpx_class_f64_e32 v[254:255], v2 ; encoding: [0xfe,0x05,0xfe,0x7d] 0xfe,0x05,0xfe,0x7d +# GFX12: v_cmpx_class_f64_e32 v[254:255], v2 ; encoding: [0xfe,0x05,0xfe,0x7d] -# GFX12: v_cmpx_class_f64_e32 s[2:3], v2 ; encoding: [0x02,0x04,0xfe,0x7d] 0x02,0x04,0xfe,0x7d +# GFX12: v_cmpx_class_f64_e32 s[2:3], v2 ; encoding: [0x02,0x04,0xfe,0x7d] -# GFX12: v_cmpx_class_f64_e32 s[104:105], v2 ; encoding: [0x68,0x04,0xfe,0x7d] 0x68,0x04,0xfe,0x7d +# GFX12: v_cmpx_class_f64_e32 s[104:105], v2 ; encoding: [0x68,0x04,0xfe,0x7d] -# GFX12: v_cmpx_class_f64_e32 vcc, v2 ; encoding: [0x6a,0x04,0xfe,0x7d] 0x6a,0x04,0xfe,0x7d +# GFX12: v_cmpx_class_f64_e32 vcc, v2 ; encoding: [0x6a,0x04,0xfe,0x7d] -# GFX12: v_cmpx_class_f64_e32 ttmp[14:15], v2 ; encoding: [0x7a,0x04,0xfe,0x7d] 0x7a,0x04,0xfe,0x7d +# GFX12: v_cmpx_class_f64_e32 ttmp[14:15], v2 ; encoding: [0x7a,0x04,0xfe,0x7d] -# GFX12: v_cmpx_class_f64_e32 exec, v2 ; encoding: [0x7e,0x04,0xfe,0x7d] 0x7e,0x04,0xfe,0x7d +# GFX12: v_cmpx_class_f64_e32 exec, v2 ; encoding: [0x7e,0x04,0xfe,0x7d] -# GFX12: v_cmpx_class_f64_e32 null, v2 ; encoding: [0x7c,0x04,0xfe,0x7d] 0x7c,0x04,0xfe,0x7d +# GFX12: v_cmpx_class_f64_e32 null, v2 ; encoding: [0x7c,0x04,0xfe,0x7d] -# GFX12: v_cmpx_class_f64_e32 -1, v2 ; encoding: [0xc1,0x04,0xfe,0x7d] 0xc1,0x04,0xfe,0x7d +# GFX12: v_cmpx_class_f64_e32 -1, v2 ; encoding: [0xc1,0x04,0xfe,0x7d] -# GFX12: v_cmpx_class_f64_e32 0.5, v2 ; encoding: [0xf0,0x04,0xfe,0x7d] 0xf0,0x04,0xfe,0x7d +# GFX12: v_cmpx_class_f64_e32 0.5, v2 ; encoding: [0xf0,0x04,0xfe,0x7d] -# GFX12: v_cmpx_class_f64_e32 src_scc, v2 ; encoding: [0xfd,0x04,0xfe,0x7d] 0xfd,0x04,0xfe,0x7d +# GFX12: v_cmpx_class_f64_e32 src_scc, v2 ; encoding: [0xfd,0x04,0xfe,0x7d] -# GFX12: v_cmpx_class_f64_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x7d,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_class_f64_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x7d,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_eq_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x04,0x7d] 0x01,0x05,0x04,0x7d +# GFX12: v_cmpx_eq_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x04,0x7d] -# GFX12: v_cmpx_eq_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x04,0x7d] 0x7f,0x05,0x04,0x7d +# GFX12: v_cmpx_eq_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x04,0x7d] -# GFX12: v_cmpx_eq_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x04,0x7d] 0x01,0x04,0x04,0x7d +# GFX12: 
v_cmpx_eq_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x04,0x7d] -# GFX12: v_cmpx_eq_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x04,0x7d] 0x69,0x04,0x04,0x7d +# GFX12: v_cmpx_eq_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x04,0x7d] -# GFX12: v_cmpx_eq_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x04,0x7d] 0x6a,0x04,0x04,0x7d +# GFX12: v_cmpx_eq_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x04,0x7d] -# GFX12: v_cmpx_eq_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x04,0x7d] 0x6b,0x04,0x04,0x7d +# GFX12: v_cmpx_eq_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x04,0x7d] -# GFX12: v_cmpx_eq_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x04,0x7d] 0x7b,0x04,0x04,0x7d +# GFX12: v_cmpx_eq_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x04,0x7d] -# GFX12: v_cmpx_eq_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x04,0x7d] 0x7d,0x04,0x04,0x7d +# GFX12: v_cmpx_eq_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x04,0x7d] -# GFX12: v_cmpx_eq_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x04,0x7d] 0x7e,0x04,0x04,0x7d +# GFX12: v_cmpx_eq_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x04,0x7d] -# GFX12: v_cmpx_eq_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x04,0x7d] 0x7f,0x04,0x04,0x7d +# GFX12: v_cmpx_eq_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x04,0x7d] -# GFX12: v_cmpx_eq_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x04,0x7d] 0x7c,0x04,0x04,0x7d +# GFX12: v_cmpx_eq_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x04,0x7d] -# GFX12: v_cmpx_eq_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x04,0x7d] 0xc1,0x04,0x04,0x7d +# GFX12: v_cmpx_eq_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x04,0x7d] -# GFX12: v_cmpx_eq_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x04,0x7d] 0xf0,0x04,0x04,0x7d +# GFX12: v_cmpx_eq_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x04,0x7d] -# GFX12: v_cmpx_eq_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x04,0x7d] 0xfd,0x04,0x04,0x7d +# GFX12: v_cmpx_eq_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x04,0x7d] -# GFX12: v_cmpx_eq_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x04,0x7d,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x04,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_eq_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x04,0x7d,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_eq_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x24,0x7d] 0x01,0x05,0x24,0x7d +# GFX12: v_cmpx_eq_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x24,0x7d] -# GFX12: v_cmpx_eq_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x24,0x7d] 0xff,0x05,0x24,0x7d +# GFX12: v_cmpx_eq_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x24,0x7d] -# GFX12: v_cmpx_eq_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x24,0x7d] 0x01,0x04,0x24,0x7d +# GFX12: v_cmpx_eq_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x24,0x7d] -# GFX12: v_cmpx_eq_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x24,0x7d] 0x69,0x04,0x24,0x7d +# GFX12: v_cmpx_eq_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x24,0x7d] -# GFX12: v_cmpx_eq_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x24,0x7d] 0x6a,0x04,0x24,0x7d +# GFX12: v_cmpx_eq_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x24,0x7d] -# GFX12: v_cmpx_eq_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x24,0x7d] 0x6b,0x04,0x24,0x7d +# GFX12: v_cmpx_eq_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x24,0x7d] -# GFX12: v_cmpx_eq_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x24,0x7d] 0x7b,0x04,0x24,0x7d +# GFX12: v_cmpx_eq_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x24,0x7d] -# GFX12: v_cmpx_eq_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x24,0x7d] 0x7d,0x04,0x24,0x7d +# GFX12: v_cmpx_eq_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x24,0x7d] -# GFX12: v_cmpx_eq_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x24,0x7d] 0x7e,0x04,0x24,0x7d +# GFX12: v_cmpx_eq_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x24,0x7d] 
-# GFX12: v_cmpx_eq_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x24,0x7d] 0x7f,0x04,0x24,0x7d +# GFX12: v_cmpx_eq_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x24,0x7d] -# GFX12: v_cmpx_eq_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x24,0x7d] 0x7c,0x04,0x24,0x7d +# GFX12: v_cmpx_eq_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x24,0x7d] -# GFX12: v_cmpx_eq_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x24,0x7d] 0xc1,0x04,0x24,0x7d +# GFX12: v_cmpx_eq_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x24,0x7d] -# GFX12: v_cmpx_eq_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x24,0x7d] 0xf0,0x04,0x24,0x7d +# GFX12: v_cmpx_eq_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x24,0x7d] -# GFX12: v_cmpx_eq_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x24,0x7d] 0xfd,0x04,0x24,0x7d +# GFX12: v_cmpx_eq_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x24,0x7d] -# GFX12: v_cmpx_eq_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x25,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x25,0x7d,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_eq_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x25,0x7d,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_eq_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x44,0x7d] 0x01,0x05,0x44,0x7d +# GFX12: v_cmpx_eq_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x44,0x7d] -# GFX12: v_cmpx_eq_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x44,0x7d] 0xfe,0x05,0x44,0x7d +# GFX12: v_cmpx_eq_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x44,0x7d] -# GFX12: v_cmpx_eq_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x44,0x7d] 0x02,0x04,0x44,0x7d +# GFX12: v_cmpx_eq_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x44,0x7d] -# GFX12: v_cmpx_eq_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x44,0x7d] 0x68,0x04,0x44,0x7d +# GFX12: v_cmpx_eq_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x44,0x7d] -# GFX12: v_cmpx_eq_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x44,0x7d] 0x6a,0x04,0x44,0x7d +# GFX12: v_cmpx_eq_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x44,0x7d] -# GFX12: v_cmpx_eq_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x44,0x7d] 0x7a,0x04,0x44,0x7d +# GFX12: v_cmpx_eq_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x44,0x7d] -# GFX12: v_cmpx_eq_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x44,0x7d] 0x7e,0x04,0x44,0x7d +# GFX12: v_cmpx_eq_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x44,0x7d] -# GFX12: v_cmpx_eq_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x44,0x7d] 0x7c,0x04,0x44,0x7d +# GFX12: v_cmpx_eq_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x44,0x7d] -# GFX12: v_cmpx_eq_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x44,0x7d] 0xc1,0x04,0x44,0x7d +# GFX12: v_cmpx_eq_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x44,0x7d] -# GFX12: v_cmpx_eq_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x44,0x7d] 0xf0,0x04,0x44,0x7d +# GFX12: v_cmpx_eq_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x44,0x7d] -# GFX12: v_cmpx_eq_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x44,0x7d] 0xfd,0x04,0x44,0x7d +# GFX12: v_cmpx_eq_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x44,0x7d] -# GFX12: v_cmpx_eq_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x45,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfc,0x45,0x7d,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_eq_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x45,0x7d,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_eq_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x64,0x7d] 0x01,0x05,0x64,0x7d +# GFX12: v_cmpx_eq_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x64,0x7d] -# GFX12: v_cmpx_eq_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x64,0x7d] 0x7f,0x05,0x64,0x7d +# GFX12: v_cmpx_eq_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x64,0x7d] -# GFX12: 
v_cmpx_eq_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x64,0x7d] 0x01,0x04,0x64,0x7d +# GFX12: v_cmpx_eq_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x64,0x7d] -# GFX12: v_cmpx_eq_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x64,0x7d] 0x69,0x04,0x64,0x7d +# GFX12: v_cmpx_eq_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x64,0x7d] -# GFX12: v_cmpx_eq_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x64,0x7d] 0x6a,0x04,0x64,0x7d +# GFX12: v_cmpx_eq_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x64,0x7d] -# GFX12: v_cmpx_eq_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x64,0x7d] 0x6b,0x04,0x64,0x7d +# GFX12: v_cmpx_eq_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x64,0x7d] -# GFX12: v_cmpx_eq_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x64,0x7d] 0x7b,0x04,0x64,0x7d +# GFX12: v_cmpx_eq_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x64,0x7d] -# GFX12: v_cmpx_eq_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x64,0x7d] 0x7d,0x04,0x64,0x7d +# GFX12: v_cmpx_eq_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x64,0x7d] -# GFX12: v_cmpx_eq_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x64,0x7d] 0x7e,0x04,0x64,0x7d +# GFX12: v_cmpx_eq_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x64,0x7d] -# GFX12: v_cmpx_eq_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x64,0x7d] 0x7f,0x04,0x64,0x7d +# GFX12: v_cmpx_eq_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x64,0x7d] -# GFX12: v_cmpx_eq_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x64,0x7d] 0x7c,0x04,0x64,0x7d +# GFX12: v_cmpx_eq_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x64,0x7d] -# GFX12: v_cmpx_eq_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x64,0x7d] 0xc1,0x04,0x64,0x7d +# GFX12: v_cmpx_eq_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x64,0x7d] -# GFX12: v_cmpx_eq_i16_e32 0x3800, v2 0xf0,0x04,0x64,0x7d +# GFX12: v_cmpx_eq_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x64,0x7d,0x00,0x38,0x00,0x00] -# GFX12: v_cmpx_eq_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x64,0x7d] 0xfd,0x04,0x64,0x7d +# GFX12: v_cmpx_eq_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x64,0x7d] -# GFX12: v_cmpx_eq_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x64,0x7d,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x64,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_eq_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x64,0x7d,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_eq_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x84,0x7d] 0x01,0x05,0x84,0x7d +# GFX12: v_cmpx_eq_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x84,0x7d] -# GFX12: v_cmpx_eq_i32_e32 v255, v2 ; encoding: [0xff,0x05,0x84,0x7d] 0xff,0x05,0x84,0x7d +# GFX12: v_cmpx_eq_i32_e32 v255, v2 ; encoding: [0xff,0x05,0x84,0x7d] -# GFX12: v_cmpx_eq_i32_e32 s1, v2 ; encoding: [0x01,0x04,0x84,0x7d] 0x01,0x04,0x84,0x7d +# GFX12: v_cmpx_eq_i32_e32 s1, v2 ; encoding: [0x01,0x04,0x84,0x7d] -# GFX12: v_cmpx_eq_i32_e32 s105, v2 ; encoding: [0x69,0x04,0x84,0x7d] 0x69,0x04,0x84,0x7d +# GFX12: v_cmpx_eq_i32_e32 s105, v2 ; encoding: [0x69,0x04,0x84,0x7d] -# GFX12: v_cmpx_eq_i32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x84,0x7d] 0x6a,0x04,0x84,0x7d +# GFX12: v_cmpx_eq_i32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x84,0x7d] -# GFX12: v_cmpx_eq_i32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x84,0x7d] 0x6b,0x04,0x84,0x7d +# GFX12: v_cmpx_eq_i32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x84,0x7d] -# GFX12: v_cmpx_eq_i32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x84,0x7d] 0x7b,0x04,0x84,0x7d +# GFX12: v_cmpx_eq_i32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x84,0x7d] -# GFX12: v_cmpx_eq_i32_e32 m0, v2 ; encoding: [0x7d,0x04,0x84,0x7d] 0x7d,0x04,0x84,0x7d +# GFX12: v_cmpx_eq_i32_e32 m0, v2 ; encoding: [0x7d,0x04,0x84,0x7d] -# GFX12: v_cmpx_eq_i32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x84,0x7d] 
0x7e,0x04,0x84,0x7d +# GFX12: v_cmpx_eq_i32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x84,0x7d] -# GFX12: v_cmpx_eq_i32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x84,0x7d] 0x7f,0x04,0x84,0x7d +# GFX12: v_cmpx_eq_i32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x84,0x7d] -# GFX12: v_cmpx_eq_i32_e32 null, v2 ; encoding: [0x7c,0x04,0x84,0x7d] 0x7c,0x04,0x84,0x7d +# GFX12: v_cmpx_eq_i32_e32 null, v2 ; encoding: [0x7c,0x04,0x84,0x7d] -# GFX12: v_cmpx_eq_i32_e32 -1, v2 ; encoding: [0xc1,0x04,0x84,0x7d] 0xc1,0x04,0x84,0x7d +# GFX12: v_cmpx_eq_i32_e32 -1, v2 ; encoding: [0xc1,0x04,0x84,0x7d] -# GFX12: v_cmpx_eq_i32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x84,0x7d] 0xf0,0x04,0x84,0x7d +# GFX12: v_cmpx_eq_i32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x84,0x7d] -# GFX12: v_cmpx_eq_i32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x84,0x7d] 0xfd,0x04,0x84,0x7d +# GFX12: v_cmpx_eq_i32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x84,0x7d] -# GFX12: v_cmpx_eq_i32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x85,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x85,0x7d,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_eq_i32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x85,0x7d,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_eq_i64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa4,0x7d] 0x01,0x05,0xa4,0x7d +# GFX12: v_cmpx_eq_i64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa4,0x7d] -# GFX12: v_cmpx_eq_i64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa4,0x7d] 0xfe,0x05,0xa4,0x7d +# GFX12: v_cmpx_eq_i64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa4,0x7d] -# GFX12: v_cmpx_eq_i64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa4,0x7d] 0x02,0x04,0xa4,0x7d +# GFX12: v_cmpx_eq_i64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa4,0x7d] -# GFX12: v_cmpx_eq_i64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa4,0x7d] 0x68,0x04,0xa4,0x7d +# GFX12: v_cmpx_eq_i64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa4,0x7d] -# GFX12: v_cmpx_eq_i64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xa4,0x7d] 0x6a,0x04,0xa4,0x7d +# GFX12: v_cmpx_eq_i64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xa4,0x7d] -# GFX12: v_cmpx_eq_i64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa4,0x7d] 0x7a,0x04,0xa4,0x7d +# GFX12: v_cmpx_eq_i64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa4,0x7d] -# GFX12: v_cmpx_eq_i64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xa4,0x7d] 0x7e,0x04,0xa4,0x7d +# GFX12: v_cmpx_eq_i64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xa4,0x7d] -# GFX12: v_cmpx_eq_i64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xa4,0x7d] 0x7c,0x04,0xa4,0x7d +# GFX12: v_cmpx_eq_i64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xa4,0x7d] -# GFX12: v_cmpx_eq_i64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xa4,0x7d] 0xc1,0x04,0xa4,0x7d +# GFX12: v_cmpx_eq_i64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xa4,0x7d] -# GFX12: v_cmpx_eq_i64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa4,0x7d] 0xf0,0x04,0xa4,0x7d +# GFX12: v_cmpx_eq_i64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa4,0x7d] -# GFX12: v_cmpx_eq_i64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa4,0x7d] 0xfd,0x04,0xa4,0x7d +# GFX12: v_cmpx_eq_i64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa4,0x7d] -# GFX12: v_cmpx_eq_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa5,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfc,0xa5,0x7d,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_eq_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa5,0x7d,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_eq_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x74,0x7d] 0x01,0x05,0x74,0x7d +# GFX12: v_cmpx_eq_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x74,0x7d] -# GFX12: v_cmpx_eq_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x74,0x7d] 
0x7f,0x05,0x74,0x7d +# GFX12: v_cmpx_eq_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x74,0x7d] -# GFX12: v_cmpx_eq_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x74,0x7d] 0x01,0x04,0x74,0x7d +# GFX12: v_cmpx_eq_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x74,0x7d] -# GFX12: v_cmpx_eq_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x74,0x7d] 0x69,0x04,0x74,0x7d +# GFX12: v_cmpx_eq_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x74,0x7d] -# GFX12: v_cmpx_eq_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x74,0x7d] 0x6a,0x04,0x74,0x7d +# GFX12: v_cmpx_eq_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x74,0x7d] -# GFX12: v_cmpx_eq_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x74,0x7d] 0x6b,0x04,0x74,0x7d +# GFX12: v_cmpx_eq_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x74,0x7d] -# GFX12: v_cmpx_eq_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x74,0x7d] 0x7b,0x04,0x74,0x7d +# GFX12: v_cmpx_eq_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x74,0x7d] -# GFX12: v_cmpx_eq_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x74,0x7d] 0x7d,0x04,0x74,0x7d +# GFX12: v_cmpx_eq_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x74,0x7d] -# GFX12: v_cmpx_eq_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x74,0x7d] 0x7e,0x04,0x74,0x7d +# GFX12: v_cmpx_eq_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x74,0x7d] -# GFX12: v_cmpx_eq_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x74,0x7d] 0x7f,0x04,0x74,0x7d +# GFX12: v_cmpx_eq_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x74,0x7d] -# GFX12: v_cmpx_eq_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x74,0x7d] 0x7c,0x04,0x74,0x7d +# GFX12: v_cmpx_eq_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x74,0x7d] -# GFX12: v_cmpx_eq_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x74,0x7d] 0xc1,0x04,0x74,0x7d +# GFX12: v_cmpx_eq_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x74,0x7d] -# GFX12: v_cmpx_eq_u16_e32 0x3800, v2 0xf0,0x04,0x74,0x7d +# GFX12: v_cmpx_eq_u16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x74,0x7d,0x00,0x38,0x00,0x00] -# GFX12: v_cmpx_eq_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x74,0x7d] 0xfd,0x04,0x74,0x7d +# GFX12: v_cmpx_eq_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x74,0x7d] -# GFX12: v_cmpx_eq_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x74,0x7d,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x74,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_eq_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x74,0x7d,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_eq_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x94,0x7d] 0x01,0x05,0x94,0x7d +# GFX12: v_cmpx_eq_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x94,0x7d] -# GFX12: v_cmpx_eq_u32_e32 v255, v2 ; encoding: [0xff,0x05,0x94,0x7d] 0xff,0x05,0x94,0x7d +# GFX12: v_cmpx_eq_u32_e32 v255, v2 ; encoding: [0xff,0x05,0x94,0x7d] -# GFX12: v_cmpx_eq_u32_e32 s1, v2 ; encoding: [0x01,0x04,0x94,0x7d] 0x01,0x04,0x94,0x7d +# GFX12: v_cmpx_eq_u32_e32 s1, v2 ; encoding: [0x01,0x04,0x94,0x7d] -# GFX12: v_cmpx_eq_u32_e32 s105, v2 ; encoding: [0x69,0x04,0x94,0x7d] 0x69,0x04,0x94,0x7d +# GFX12: v_cmpx_eq_u32_e32 s105, v2 ; encoding: [0x69,0x04,0x94,0x7d] -# GFX12: v_cmpx_eq_u32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x94,0x7d] 0x6a,0x04,0x94,0x7d +# GFX12: v_cmpx_eq_u32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x94,0x7d] -# GFX12: v_cmpx_eq_u32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x94,0x7d] 0x6b,0x04,0x94,0x7d +# GFX12: v_cmpx_eq_u32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x94,0x7d] -# GFX12: v_cmpx_eq_u32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x94,0x7d] 0x7b,0x04,0x94,0x7d +# GFX12: v_cmpx_eq_u32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x94,0x7d] -# GFX12: v_cmpx_eq_u32_e32 m0, v2 ; encoding: [0x7d,0x04,0x94,0x7d] 0x7d,0x04,0x94,0x7d +# GFX12: v_cmpx_eq_u32_e32 m0, v2 ; encoding: 
[0x7d,0x04,0x94,0x7d] -# GFX12: v_cmpx_eq_u32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x94,0x7d] 0x7e,0x04,0x94,0x7d +# GFX12: v_cmpx_eq_u32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x94,0x7d] -# GFX12: v_cmpx_eq_u32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x94,0x7d] 0x7f,0x04,0x94,0x7d +# GFX12: v_cmpx_eq_u32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x94,0x7d] -# GFX12: v_cmpx_eq_u32_e32 null, v2 ; encoding: [0x7c,0x04,0x94,0x7d] 0x7c,0x04,0x94,0x7d +# GFX12: v_cmpx_eq_u32_e32 null, v2 ; encoding: [0x7c,0x04,0x94,0x7d] -# GFX12: v_cmpx_eq_u32_e32 -1, v2 ; encoding: [0xc1,0x04,0x94,0x7d] 0xc1,0x04,0x94,0x7d +# GFX12: v_cmpx_eq_u32_e32 -1, v2 ; encoding: [0xc1,0x04,0x94,0x7d] -# GFX12: v_cmpx_eq_u32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x94,0x7d] 0xf0,0x04,0x94,0x7d +# GFX12: v_cmpx_eq_u32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x94,0x7d] -# GFX12: v_cmpx_eq_u32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x94,0x7d] 0xfd,0x04,0x94,0x7d +# GFX12: v_cmpx_eq_u32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x94,0x7d] -# GFX12: v_cmpx_eq_u32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x95,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x95,0x7d,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_eq_u32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x95,0x7d,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_eq_u64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb4,0x7d] 0x01,0x05,0xb4,0x7d +# GFX12: v_cmpx_eq_u64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb4,0x7d] -# GFX12: v_cmpx_eq_u64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb4,0x7d] 0xfe,0x05,0xb4,0x7d +# GFX12: v_cmpx_eq_u64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb4,0x7d] -# GFX12: v_cmpx_eq_u64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb4,0x7d] 0x02,0x04,0xb4,0x7d +# GFX12: v_cmpx_eq_u64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb4,0x7d] -# GFX12: v_cmpx_eq_u64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb4,0x7d] 0x68,0x04,0xb4,0x7d +# GFX12: v_cmpx_eq_u64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb4,0x7d] -# GFX12: v_cmpx_eq_u64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xb4,0x7d] 0x6a,0x04,0xb4,0x7d +# GFX12: v_cmpx_eq_u64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xb4,0x7d] -# GFX12: v_cmpx_eq_u64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb4,0x7d] 0x7a,0x04,0xb4,0x7d +# GFX12: v_cmpx_eq_u64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb4,0x7d] -# GFX12: v_cmpx_eq_u64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xb4,0x7d] 0x7e,0x04,0xb4,0x7d +# GFX12: v_cmpx_eq_u64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xb4,0x7d] -# GFX12: v_cmpx_eq_u64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xb4,0x7d] 0x7c,0x04,0xb4,0x7d +# GFX12: v_cmpx_eq_u64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xb4,0x7d] -# GFX12: v_cmpx_eq_u64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xb4,0x7d] 0xc1,0x04,0xb4,0x7d +# GFX12: v_cmpx_eq_u64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xb4,0x7d] -# GFX12: v_cmpx_eq_u64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb4,0x7d] 0xf0,0x04,0xb4,0x7d +# GFX12: v_cmpx_eq_u64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb4,0x7d] -# GFX12: v_cmpx_eq_u64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb4,0x7d] 0xfd,0x04,0xb4,0x7d +# GFX12: v_cmpx_eq_u64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb4,0x7d] -# GFX12: v_cmpx_eq_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb5,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfc,0xb5,0x7d,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_eq_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb5,0x7d,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_ge_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0c,0x7d] 0x01,0x05,0x0c,0x7d +# GFX12: v_cmpx_ge_f16_e32 v1, v2 ; encoding: 
[0x01,0x05,0x0c,0x7d] -# GFX12: v_cmpx_ge_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0c,0x7d] 0x7f,0x05,0x0c,0x7d +# GFX12: v_cmpx_ge_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0c,0x7d] -# GFX12: v_cmpx_ge_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0c,0x7d] 0x01,0x04,0x0c,0x7d +# GFX12: v_cmpx_ge_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0c,0x7d] -# GFX12: v_cmpx_ge_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0c,0x7d] 0x69,0x04,0x0c,0x7d +# GFX12: v_cmpx_ge_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0c,0x7d] -# GFX12: v_cmpx_ge_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0c,0x7d] 0x6a,0x04,0x0c,0x7d +# GFX12: v_cmpx_ge_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0c,0x7d] -# GFX12: v_cmpx_ge_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0c,0x7d] 0x6b,0x04,0x0c,0x7d +# GFX12: v_cmpx_ge_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0c,0x7d] -# GFX12: v_cmpx_ge_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0c,0x7d] 0x7b,0x04,0x0c,0x7d +# GFX12: v_cmpx_ge_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0c,0x7d] -# GFX12: v_cmpx_ge_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0c,0x7d] 0x7d,0x04,0x0c,0x7d +# GFX12: v_cmpx_ge_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0c,0x7d] -# GFX12: v_cmpx_ge_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0c,0x7d] 0x7e,0x04,0x0c,0x7d +# GFX12: v_cmpx_ge_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0c,0x7d] -# GFX12: v_cmpx_ge_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0c,0x7d] 0x7f,0x04,0x0c,0x7d +# GFX12: v_cmpx_ge_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0c,0x7d] -# GFX12: v_cmpx_ge_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0c,0x7d] 0x7c,0x04,0x0c,0x7d +# GFX12: v_cmpx_ge_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0c,0x7d] -# GFX12: v_cmpx_ge_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0c,0x7d] 0xc1,0x04,0x0c,0x7d +# GFX12: v_cmpx_ge_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0c,0x7d] -# GFX12: v_cmpx_ge_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0c,0x7d] 0xf0,0x04,0x0c,0x7d +# GFX12: v_cmpx_ge_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0c,0x7d] -# GFX12: v_cmpx_ge_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0c,0x7d] 0xfd,0x04,0x0c,0x7d +# GFX12: v_cmpx_ge_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0c,0x7d] -# GFX12: v_cmpx_ge_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0c,0x7d,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x0c,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_ge_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0c,0x7d,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_ge_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2c,0x7d] 0x01,0x05,0x2c,0x7d +# GFX12: v_cmpx_ge_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2c,0x7d] -# GFX12: v_cmpx_ge_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x2c,0x7d] 0xff,0x05,0x2c,0x7d +# GFX12: v_cmpx_ge_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x2c,0x7d] -# GFX12: v_cmpx_ge_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x2c,0x7d] 0x01,0x04,0x2c,0x7d +# GFX12: v_cmpx_ge_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x2c,0x7d] -# GFX12: v_cmpx_ge_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x2c,0x7d] 0x69,0x04,0x2c,0x7d +# GFX12: v_cmpx_ge_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x2c,0x7d] -# GFX12: v_cmpx_ge_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x2c,0x7d] 0x6a,0x04,0x2c,0x7d +# GFX12: v_cmpx_ge_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x2c,0x7d] -# GFX12: v_cmpx_ge_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x2c,0x7d] 0x6b,0x04,0x2c,0x7d +# GFX12: v_cmpx_ge_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x2c,0x7d] -# GFX12: v_cmpx_ge_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x2c,0x7d] 0x7b,0x04,0x2c,0x7d +# GFX12: v_cmpx_ge_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x2c,0x7d] -# GFX12: v_cmpx_ge_f32_e32 m0, v2 ; 
encoding: [0x7d,0x04,0x2c,0x7d] 0x7d,0x04,0x2c,0x7d +# GFX12: v_cmpx_ge_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x2c,0x7d] -# GFX12: v_cmpx_ge_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x2c,0x7d] 0x7e,0x04,0x2c,0x7d +# GFX12: v_cmpx_ge_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x2c,0x7d] -# GFX12: v_cmpx_ge_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x2c,0x7d] 0x7f,0x04,0x2c,0x7d +# GFX12: v_cmpx_ge_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x2c,0x7d] -# GFX12: v_cmpx_ge_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x2c,0x7d] 0x7c,0x04,0x2c,0x7d +# GFX12: v_cmpx_ge_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x2c,0x7d] -# GFX12: v_cmpx_ge_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x2c,0x7d] 0xc1,0x04,0x2c,0x7d +# GFX12: v_cmpx_ge_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x2c,0x7d] -# GFX12: v_cmpx_ge_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x2c,0x7d] 0xf0,0x04,0x2c,0x7d +# GFX12: v_cmpx_ge_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x2c,0x7d] -# GFX12: v_cmpx_ge_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x2c,0x7d] 0xfd,0x04,0x2c,0x7d +# GFX12: v_cmpx_ge_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x2c,0x7d] -# GFX12: v_cmpx_ge_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2d,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x2d,0x7d,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_ge_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2d,0x7d,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_ge_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4c,0x7d] 0x01,0x05,0x4c,0x7d +# GFX12: v_cmpx_ge_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4c,0x7d] -# GFX12: v_cmpx_ge_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4c,0x7d] 0xfe,0x05,0x4c,0x7d +# GFX12: v_cmpx_ge_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4c,0x7d] -# GFX12: v_cmpx_ge_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x4c,0x7d] 0x02,0x04,0x4c,0x7d +# GFX12: v_cmpx_ge_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x4c,0x7d] -# GFX12: v_cmpx_ge_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4c,0x7d] 0x68,0x04,0x4c,0x7d +# GFX12: v_cmpx_ge_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4c,0x7d] -# GFX12: v_cmpx_ge_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x4c,0x7d] 0x6a,0x04,0x4c,0x7d +# GFX12: v_cmpx_ge_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x4c,0x7d] -# GFX12: v_cmpx_ge_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x4c,0x7d] 0x7a,0x04,0x4c,0x7d +# GFX12: v_cmpx_ge_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x4c,0x7d] -# GFX12: v_cmpx_ge_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x4c,0x7d] 0x7e,0x04,0x4c,0x7d +# GFX12: v_cmpx_ge_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x4c,0x7d] -# GFX12: v_cmpx_ge_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x4c,0x7d] 0x7c,0x04,0x4c,0x7d +# GFX12: v_cmpx_ge_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x4c,0x7d] -# GFX12: v_cmpx_ge_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x4c,0x7d] 0xc1,0x04,0x4c,0x7d +# GFX12: v_cmpx_ge_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x4c,0x7d] -# GFX12: v_cmpx_ge_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4c,0x7d] 0xf0,0x04,0x4c,0x7d +# GFX12: v_cmpx_ge_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4c,0x7d] -# GFX12: v_cmpx_ge_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4c,0x7d] 0xfd,0x04,0x4c,0x7d +# GFX12: v_cmpx_ge_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4c,0x7d] -# GFX12: v_cmpx_ge_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4d,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfc,0x4d,0x7d,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_ge_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4d,0x7d,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_ge_i16_e32 v1, v2 ; encoding: 
[0x01,0x05,0x6c,0x7d] 0x01,0x05,0x6c,0x7d +# GFX12: v_cmpx_ge_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x6c,0x7d] -# GFX12: v_cmpx_ge_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x6c,0x7d] 0x7f,0x05,0x6c,0x7d +# GFX12: v_cmpx_ge_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x6c,0x7d] -# GFX12: v_cmpx_ge_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x6c,0x7d] 0x01,0x04,0x6c,0x7d +# GFX12: v_cmpx_ge_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x6c,0x7d] -# GFX12: v_cmpx_ge_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x6c,0x7d] 0x69,0x04,0x6c,0x7d +# GFX12: v_cmpx_ge_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x6c,0x7d] -# GFX12: v_cmpx_ge_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x6c,0x7d] 0x6a,0x04,0x6c,0x7d +# GFX12: v_cmpx_ge_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x6c,0x7d] -# GFX12: v_cmpx_ge_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x6c,0x7d] 0x6b,0x04,0x6c,0x7d +# GFX12: v_cmpx_ge_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x6c,0x7d] -# GFX12: v_cmpx_ge_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x6c,0x7d] 0x7b,0x04,0x6c,0x7d +# GFX12: v_cmpx_ge_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x6c,0x7d] -# GFX12: v_cmpx_ge_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x6c,0x7d] 0x7d,0x04,0x6c,0x7d +# GFX12: v_cmpx_ge_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x6c,0x7d] -# GFX12: v_cmpx_ge_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x6c,0x7d] 0x7e,0x04,0x6c,0x7d +# GFX12: v_cmpx_ge_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x6c,0x7d] -# GFX12: v_cmpx_ge_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x6c,0x7d] 0x7f,0x04,0x6c,0x7d +# GFX12: v_cmpx_ge_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x6c,0x7d] -# GFX12: v_cmpx_ge_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x6c,0x7d] 0x7c,0x04,0x6c,0x7d +# GFX12: v_cmpx_ge_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x6c,0x7d] -# GFX12: v_cmpx_ge_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x6c,0x7d] 0xc1,0x04,0x6c,0x7d +# GFX12: v_cmpx_ge_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x6c,0x7d] -# GFX12: v_cmpx_ge_i16_e32 0x3800, v2 0xf0,0x04,0x6c,0x7d +# GFX12: v_cmpx_ge_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x6c,0x7d,0x00,0x38,0x00,0x00] -# GFX12: v_cmpx_ge_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x6c,0x7d] 0xfd,0x04,0x6c,0x7d +# GFX12: v_cmpx_ge_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x6c,0x7d] -# GFX12: v_cmpx_ge_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6c,0x7d,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x6c,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_ge_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6c,0x7d,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_ge_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x8c,0x7d] 0x01,0x05,0x8c,0x7d +# GFX12: v_cmpx_ge_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x8c,0x7d] -# GFX12: v_cmpx_ge_i32_e32 v255, v2 ; encoding: [0xff,0x05,0x8c,0x7d] 0xff,0x05,0x8c,0x7d +# GFX12: v_cmpx_ge_i32_e32 v255, v2 ; encoding: [0xff,0x05,0x8c,0x7d] -# GFX12: v_cmpx_ge_i32_e32 s1, v2 ; encoding: [0x01,0x04,0x8c,0x7d] 0x01,0x04,0x8c,0x7d +# GFX12: v_cmpx_ge_i32_e32 s1, v2 ; encoding: [0x01,0x04,0x8c,0x7d] -# GFX12: v_cmpx_ge_i32_e32 s105, v2 ; encoding: [0x69,0x04,0x8c,0x7d] 0x69,0x04,0x8c,0x7d +# GFX12: v_cmpx_ge_i32_e32 s105, v2 ; encoding: [0x69,0x04,0x8c,0x7d] -# GFX12: v_cmpx_ge_i32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x8c,0x7d] 0x6a,0x04,0x8c,0x7d +# GFX12: v_cmpx_ge_i32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x8c,0x7d] -# GFX12: v_cmpx_ge_i32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x8c,0x7d] 0x6b,0x04,0x8c,0x7d +# GFX12: v_cmpx_ge_i32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x8c,0x7d] -# GFX12: v_cmpx_ge_i32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x8c,0x7d] 0x7b,0x04,0x8c,0x7d +# GFX12: v_cmpx_ge_i32_e32 
ttmp15, v2 ; encoding: [0x7b,0x04,0x8c,0x7d] -# GFX12: v_cmpx_ge_i32_e32 m0, v2 ; encoding: [0x7d,0x04,0x8c,0x7d] 0x7d,0x04,0x8c,0x7d +# GFX12: v_cmpx_ge_i32_e32 m0, v2 ; encoding: [0x7d,0x04,0x8c,0x7d] -# GFX12: v_cmpx_ge_i32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x8c,0x7d] 0x7e,0x04,0x8c,0x7d +# GFX12: v_cmpx_ge_i32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x8c,0x7d] -# GFX12: v_cmpx_ge_i32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x8c,0x7d] 0x7f,0x04,0x8c,0x7d +# GFX12: v_cmpx_ge_i32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x8c,0x7d] -# GFX12: v_cmpx_ge_i32_e32 null, v2 ; encoding: [0x7c,0x04,0x8c,0x7d] 0x7c,0x04,0x8c,0x7d +# GFX12: v_cmpx_ge_i32_e32 null, v2 ; encoding: [0x7c,0x04,0x8c,0x7d] -# GFX12: v_cmpx_ge_i32_e32 -1, v2 ; encoding: [0xc1,0x04,0x8c,0x7d] 0xc1,0x04,0x8c,0x7d +# GFX12: v_cmpx_ge_i32_e32 -1, v2 ; encoding: [0xc1,0x04,0x8c,0x7d] -# GFX12: v_cmpx_ge_i32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x8c,0x7d] 0xf0,0x04,0x8c,0x7d +# GFX12: v_cmpx_ge_i32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x8c,0x7d] -# GFX12: v_cmpx_ge_i32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x8c,0x7d] 0xfd,0x04,0x8c,0x7d +# GFX12: v_cmpx_ge_i32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x8c,0x7d] -# GFX12: v_cmpx_ge_i32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x8d,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x8d,0x7d,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_ge_i32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x8d,0x7d,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_ge_i64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xac,0x7d] 0x01,0x05,0xac,0x7d +# GFX12: v_cmpx_ge_i64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xac,0x7d] -# GFX12: v_cmpx_ge_i64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xac,0x7d] 0xfe,0x05,0xac,0x7d +# GFX12: v_cmpx_ge_i64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xac,0x7d] -# GFX12: v_cmpx_ge_i64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xac,0x7d] 0x02,0x04,0xac,0x7d +# GFX12: v_cmpx_ge_i64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xac,0x7d] -# GFX12: v_cmpx_ge_i64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xac,0x7d] 0x68,0x04,0xac,0x7d +# GFX12: v_cmpx_ge_i64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xac,0x7d] -# GFX12: v_cmpx_ge_i64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xac,0x7d] 0x6a,0x04,0xac,0x7d +# GFX12: v_cmpx_ge_i64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xac,0x7d] -# GFX12: v_cmpx_ge_i64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xac,0x7d] 0x7a,0x04,0xac,0x7d +# GFX12: v_cmpx_ge_i64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xac,0x7d] -# GFX12: v_cmpx_ge_i64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xac,0x7d] 0x7e,0x04,0xac,0x7d +# GFX12: v_cmpx_ge_i64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xac,0x7d] -# GFX12: v_cmpx_ge_i64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xac,0x7d] 0x7c,0x04,0xac,0x7d +# GFX12: v_cmpx_ge_i64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xac,0x7d] -# GFX12: v_cmpx_ge_i64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xac,0x7d] 0xc1,0x04,0xac,0x7d +# GFX12: v_cmpx_ge_i64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xac,0x7d] -# GFX12: v_cmpx_ge_i64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xac,0x7d] 0xf0,0x04,0xac,0x7d +# GFX12: v_cmpx_ge_i64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xac,0x7d] -# GFX12: v_cmpx_ge_i64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xac,0x7d] 0xfd,0x04,0xac,0x7d +# GFX12: v_cmpx_ge_i64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xac,0x7d] -# GFX12: v_cmpx_ge_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xad,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfc,0xad,0x7d,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_ge_i64_e32 0xaf123456, v[254:255] ; encoding: 
[0xff,0xfc,0xad,0x7d,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_ge_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x7c,0x7d] 0x01,0x05,0x7c,0x7d +# GFX12: v_cmpx_ge_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x7c,0x7d] -# GFX12: v_cmpx_ge_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x7c,0x7d] 0x7f,0x05,0x7c,0x7d +# GFX12: v_cmpx_ge_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x7c,0x7d] -# GFX12: v_cmpx_ge_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x7c,0x7d] 0x01,0x04,0x7c,0x7d +# GFX12: v_cmpx_ge_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x7c,0x7d] -# GFX12: v_cmpx_ge_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x7c,0x7d] 0x69,0x04,0x7c,0x7d +# GFX12: v_cmpx_ge_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x7c,0x7d] -# GFX12: v_cmpx_ge_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x7c,0x7d] 0x6a,0x04,0x7c,0x7d +# GFX12: v_cmpx_ge_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x7c,0x7d] -# GFX12: v_cmpx_ge_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x7c,0x7d] 0x6b,0x04,0x7c,0x7d +# GFX12: v_cmpx_ge_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x7c,0x7d] -# GFX12: v_cmpx_ge_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x7c,0x7d] 0x7b,0x04,0x7c,0x7d +# GFX12: v_cmpx_ge_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x7c,0x7d] -# GFX12: v_cmpx_ge_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x7c,0x7d] 0x7d,0x04,0x7c,0x7d +# GFX12: v_cmpx_ge_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x7c,0x7d] -# GFX12: v_cmpx_ge_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x7c,0x7d] 0x7e,0x04,0x7c,0x7d +# GFX12: v_cmpx_ge_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x7c,0x7d] -# GFX12: v_cmpx_ge_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x7c,0x7d] 0x7f,0x04,0x7c,0x7d +# GFX12: v_cmpx_ge_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x7c,0x7d] -# GFX12: v_cmpx_ge_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x7c,0x7d] 0x7c,0x04,0x7c,0x7d +# GFX12: v_cmpx_ge_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x7c,0x7d] -# GFX12: v_cmpx_ge_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x7c,0x7d] 0xc1,0x04,0x7c,0x7d +# GFX12: v_cmpx_ge_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x7c,0x7d] -# GFX12: v_cmpx_ge_u16_e32 0x3800, v2 0xf0,0x04,0x7c,0x7d +# GFX12: v_cmpx_ge_u16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x7c,0x7d,0x00,0x38,0x00,0x00] -# GFX12: v_cmpx_ge_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x7c,0x7d] 0xfd,0x04,0x7c,0x7d +# GFX12: v_cmpx_ge_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x7c,0x7d] -# GFX12: v_cmpx_ge_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x7c,0x7d,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x7c,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_ge_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x7c,0x7d,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_ge_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x9c,0x7d] 0x01,0x05,0x9c,0x7d +# GFX12: v_cmpx_ge_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x9c,0x7d] -# GFX12: v_cmpx_ge_u32_e32 v255, v2 ; encoding: [0xff,0x05,0x9c,0x7d] 0xff,0x05,0x9c,0x7d +# GFX12: v_cmpx_ge_u32_e32 v255, v2 ; encoding: [0xff,0x05,0x9c,0x7d] -# GFX12: v_cmpx_ge_u32_e32 s1, v2 ; encoding: [0x01,0x04,0x9c,0x7d] 0x01,0x04,0x9c,0x7d +# GFX12: v_cmpx_ge_u32_e32 s1, v2 ; encoding: [0x01,0x04,0x9c,0x7d] -# GFX12: v_cmpx_ge_u32_e32 s105, v2 ; encoding: [0x69,0x04,0x9c,0x7d] 0x69,0x04,0x9c,0x7d +# GFX12: v_cmpx_ge_u32_e32 s105, v2 ; encoding: [0x69,0x04,0x9c,0x7d] -# GFX12: v_cmpx_ge_u32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x9c,0x7d] 0x6a,0x04,0x9c,0x7d +# GFX12: v_cmpx_ge_u32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x9c,0x7d] -# GFX12: v_cmpx_ge_u32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x9c,0x7d] 0x6b,0x04,0x9c,0x7d +# GFX12: v_cmpx_ge_u32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x9c,0x7d] -# GFX12: v_cmpx_ge_u32_e32 ttmp15, v2 
; encoding: [0x7b,0x04,0x9c,0x7d] 0x7b,0x04,0x9c,0x7d +# GFX12: v_cmpx_ge_u32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x9c,0x7d] -# GFX12: v_cmpx_ge_u32_e32 m0, v2 ; encoding: [0x7d,0x04,0x9c,0x7d] 0x7d,0x04,0x9c,0x7d +# GFX12: v_cmpx_ge_u32_e32 m0, v2 ; encoding: [0x7d,0x04,0x9c,0x7d] -# GFX12: v_cmpx_ge_u32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x9c,0x7d] 0x7e,0x04,0x9c,0x7d +# GFX12: v_cmpx_ge_u32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x9c,0x7d] -# GFX12: v_cmpx_ge_u32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x9c,0x7d] 0x7f,0x04,0x9c,0x7d +# GFX12: v_cmpx_ge_u32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x9c,0x7d] -# GFX12: v_cmpx_ge_u32_e32 null, v2 ; encoding: [0x7c,0x04,0x9c,0x7d] 0x7c,0x04,0x9c,0x7d +# GFX12: v_cmpx_ge_u32_e32 null, v2 ; encoding: [0x7c,0x04,0x9c,0x7d] -# GFX12: v_cmpx_ge_u32_e32 -1, v2 ; encoding: [0xc1,0x04,0x9c,0x7d] 0xc1,0x04,0x9c,0x7d +# GFX12: v_cmpx_ge_u32_e32 -1, v2 ; encoding: [0xc1,0x04,0x9c,0x7d] -# GFX12: v_cmpx_ge_u32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x9c,0x7d] 0xf0,0x04,0x9c,0x7d +# GFX12: v_cmpx_ge_u32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x9c,0x7d] -# GFX12: v_cmpx_ge_u32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x9c,0x7d] 0xfd,0x04,0x9c,0x7d +# GFX12: v_cmpx_ge_u32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x9c,0x7d] -# GFX12: v_cmpx_ge_u32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x9d,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x9d,0x7d,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_ge_u32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x9d,0x7d,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_ge_u64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xbc,0x7d] 0x01,0x05,0xbc,0x7d +# GFX12: v_cmpx_ge_u64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xbc,0x7d] -# GFX12: v_cmpx_ge_u64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xbc,0x7d] 0xfe,0x05,0xbc,0x7d +# GFX12: v_cmpx_ge_u64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xbc,0x7d] -# GFX12: v_cmpx_ge_u64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xbc,0x7d] 0x02,0x04,0xbc,0x7d +# GFX12: v_cmpx_ge_u64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xbc,0x7d] -# GFX12: v_cmpx_ge_u64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xbc,0x7d] 0x68,0x04,0xbc,0x7d +# GFX12: v_cmpx_ge_u64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xbc,0x7d] -# GFX12: v_cmpx_ge_u64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xbc,0x7d] 0x6a,0x04,0xbc,0x7d +# GFX12: v_cmpx_ge_u64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xbc,0x7d] -# GFX12: v_cmpx_ge_u64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xbc,0x7d] 0x7a,0x04,0xbc,0x7d +# GFX12: v_cmpx_ge_u64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xbc,0x7d] -# GFX12: v_cmpx_ge_u64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xbc,0x7d] 0x7e,0x04,0xbc,0x7d +# GFX12: v_cmpx_ge_u64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xbc,0x7d] -# GFX12: v_cmpx_ge_u64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xbc,0x7d] 0x7c,0x04,0xbc,0x7d +# GFX12: v_cmpx_ge_u64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xbc,0x7d] -# GFX12: v_cmpx_ge_u64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xbc,0x7d] 0xc1,0x04,0xbc,0x7d +# GFX12: v_cmpx_ge_u64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xbc,0x7d] -# GFX12: v_cmpx_ge_u64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xbc,0x7d] 0xf0,0x04,0xbc,0x7d +# GFX12: v_cmpx_ge_u64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xbc,0x7d] -# GFX12: v_cmpx_ge_u64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xbc,0x7d] 0xfd,0x04,0xbc,0x7d +# GFX12: v_cmpx_ge_u64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xbc,0x7d] -# GFX12: v_cmpx_ge_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbd,0x7d,0x56,0x34,0x12,0xaf] 
0xff,0xfc,0xbd,0x7d,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_ge_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbd,0x7d,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_gt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x08,0x7d] 0x01,0x05,0x08,0x7d +# GFX12: v_cmpx_gt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x08,0x7d] -# GFX12: v_cmpx_gt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x08,0x7d] 0x7f,0x05,0x08,0x7d +# GFX12: v_cmpx_gt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x08,0x7d] -# GFX12: v_cmpx_gt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x08,0x7d] 0x01,0x04,0x08,0x7d +# GFX12: v_cmpx_gt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x08,0x7d] -# GFX12: v_cmpx_gt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x08,0x7d] 0x69,0x04,0x08,0x7d +# GFX12: v_cmpx_gt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x08,0x7d] -# GFX12: v_cmpx_gt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x08,0x7d] 0x6a,0x04,0x08,0x7d +# GFX12: v_cmpx_gt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x08,0x7d] -# GFX12: v_cmpx_gt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x08,0x7d] 0x6b,0x04,0x08,0x7d +# GFX12: v_cmpx_gt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x08,0x7d] -# GFX12: v_cmpx_gt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x08,0x7d] 0x7b,0x04,0x08,0x7d +# GFX12: v_cmpx_gt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x08,0x7d] -# GFX12: v_cmpx_gt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x08,0x7d] 0x7d,0x04,0x08,0x7d +# GFX12: v_cmpx_gt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x08,0x7d] -# GFX12: v_cmpx_gt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x08,0x7d] 0x7e,0x04,0x08,0x7d +# GFX12: v_cmpx_gt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x08,0x7d] -# GFX12: v_cmpx_gt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x08,0x7d] 0x7f,0x04,0x08,0x7d +# GFX12: v_cmpx_gt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x08,0x7d] -# GFX12: v_cmpx_gt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x08,0x7d] 0x7c,0x04,0x08,0x7d +# GFX12: v_cmpx_gt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x08,0x7d] -# GFX12: v_cmpx_gt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x08,0x7d] 0xc1,0x04,0x08,0x7d +# GFX12: v_cmpx_gt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x08,0x7d] -# GFX12: v_cmpx_gt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x08,0x7d] 0xf0,0x04,0x08,0x7d +# GFX12: v_cmpx_gt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x08,0x7d] -# GFX12: v_cmpx_gt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x08,0x7d] 0xfd,0x04,0x08,0x7d +# GFX12: v_cmpx_gt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x08,0x7d] -# GFX12: v_cmpx_gt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x08,0x7d,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x08,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_gt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x08,0x7d,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_gt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x28,0x7d] 0x01,0x05,0x28,0x7d +# GFX12: v_cmpx_gt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x28,0x7d] -# GFX12: v_cmpx_gt_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x28,0x7d] 0xff,0x05,0x28,0x7d +# GFX12: v_cmpx_gt_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x28,0x7d] -# GFX12: v_cmpx_gt_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x28,0x7d] 0x01,0x04,0x28,0x7d +# GFX12: v_cmpx_gt_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x28,0x7d] -# GFX12: v_cmpx_gt_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x28,0x7d] 0x69,0x04,0x28,0x7d +# GFX12: v_cmpx_gt_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x28,0x7d] -# GFX12: v_cmpx_gt_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x28,0x7d] 0x6a,0x04,0x28,0x7d +# GFX12: v_cmpx_gt_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x28,0x7d] -# GFX12: v_cmpx_gt_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x28,0x7d] 0x6b,0x04,0x28,0x7d 
+# GFX12: v_cmpx_gt_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x28,0x7d] -# GFX12: v_cmpx_gt_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x28,0x7d] 0x7b,0x04,0x28,0x7d +# GFX12: v_cmpx_gt_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x28,0x7d] -# GFX12: v_cmpx_gt_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x28,0x7d] 0x7d,0x04,0x28,0x7d +# GFX12: v_cmpx_gt_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x28,0x7d] -# GFX12: v_cmpx_gt_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x28,0x7d] 0x7e,0x04,0x28,0x7d +# GFX12: v_cmpx_gt_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x28,0x7d] -# GFX12: v_cmpx_gt_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x28,0x7d] 0x7f,0x04,0x28,0x7d +# GFX12: v_cmpx_gt_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x28,0x7d] -# GFX12: v_cmpx_gt_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x28,0x7d] 0x7c,0x04,0x28,0x7d +# GFX12: v_cmpx_gt_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x28,0x7d] -# GFX12: v_cmpx_gt_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x28,0x7d] 0xc1,0x04,0x28,0x7d +# GFX12: v_cmpx_gt_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x28,0x7d] -# GFX12: v_cmpx_gt_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x28,0x7d] 0xf0,0x04,0x28,0x7d +# GFX12: v_cmpx_gt_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x28,0x7d] -# GFX12: v_cmpx_gt_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x28,0x7d] 0xfd,0x04,0x28,0x7d +# GFX12: v_cmpx_gt_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x28,0x7d] -# GFX12: v_cmpx_gt_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x29,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x29,0x7d,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_gt_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x29,0x7d,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_gt_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x48,0x7d] 0x01,0x05,0x48,0x7d +# GFX12: v_cmpx_gt_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x48,0x7d] -# GFX12: v_cmpx_gt_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x48,0x7d] 0xfe,0x05,0x48,0x7d +# GFX12: v_cmpx_gt_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x48,0x7d] -# GFX12: v_cmpx_gt_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x48,0x7d] 0x02,0x04,0x48,0x7d +# GFX12: v_cmpx_gt_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x48,0x7d] -# GFX12: v_cmpx_gt_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x48,0x7d] 0x68,0x04,0x48,0x7d +# GFX12: v_cmpx_gt_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x48,0x7d] -# GFX12: v_cmpx_gt_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x48,0x7d] 0x6a,0x04,0x48,0x7d +# GFX12: v_cmpx_gt_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x48,0x7d] -# GFX12: v_cmpx_gt_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x48,0x7d] 0x7a,0x04,0x48,0x7d +# GFX12: v_cmpx_gt_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x48,0x7d] -# GFX12: v_cmpx_gt_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x48,0x7d] 0x7e,0x04,0x48,0x7d +# GFX12: v_cmpx_gt_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x48,0x7d] -# GFX12: v_cmpx_gt_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x48,0x7d] 0x7c,0x04,0x48,0x7d +# GFX12: v_cmpx_gt_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x48,0x7d] -# GFX12: v_cmpx_gt_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x48,0x7d] 0xc1,0x04,0x48,0x7d +# GFX12: v_cmpx_gt_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x48,0x7d] -# GFX12: v_cmpx_gt_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x48,0x7d] 0xf0,0x04,0x48,0x7d +# GFX12: v_cmpx_gt_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x48,0x7d] -# GFX12: v_cmpx_gt_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x48,0x7d] 0xfd,0x04,0x48,0x7d +# GFX12: v_cmpx_gt_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x48,0x7d] -# GFX12: 
v_cmpx_gt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x49,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfc,0x49,0x7d,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_gt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x49,0x7d,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_gt_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x68,0x7d] 0x01,0x05,0x68,0x7d +# GFX12: v_cmpx_gt_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x68,0x7d] -# GFX12: v_cmpx_gt_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x68,0x7d] 0x7f,0x05,0x68,0x7d +# GFX12: v_cmpx_gt_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x68,0x7d] -# GFX12: v_cmpx_gt_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x68,0x7d] 0x01,0x04,0x68,0x7d +# GFX12: v_cmpx_gt_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x68,0x7d] -# GFX12: v_cmpx_gt_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x68,0x7d] 0x69,0x04,0x68,0x7d +# GFX12: v_cmpx_gt_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x68,0x7d] -# GFX12: v_cmpx_gt_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x68,0x7d] 0x6a,0x04,0x68,0x7d +# GFX12: v_cmpx_gt_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x68,0x7d] -# GFX12: v_cmpx_gt_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x68,0x7d] 0x6b,0x04,0x68,0x7d +# GFX12: v_cmpx_gt_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x68,0x7d] -# GFX12: v_cmpx_gt_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x68,0x7d] 0x7b,0x04,0x68,0x7d +# GFX12: v_cmpx_gt_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x68,0x7d] -# GFX12: v_cmpx_gt_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x68,0x7d] 0x7d,0x04,0x68,0x7d +# GFX12: v_cmpx_gt_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x68,0x7d] -# GFX12: v_cmpx_gt_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x68,0x7d] 0x7e,0x04,0x68,0x7d +# GFX12: v_cmpx_gt_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x68,0x7d] -# GFX12: v_cmpx_gt_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x68,0x7d] 0x7f,0x04,0x68,0x7d +# GFX12: v_cmpx_gt_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x68,0x7d] -# GFX12: v_cmpx_gt_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x68,0x7d] 0x7c,0x04,0x68,0x7d +# GFX12: v_cmpx_gt_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x68,0x7d] -# GFX12: v_cmpx_gt_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x68,0x7d] 0xc1,0x04,0x68,0x7d +# GFX12: v_cmpx_gt_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x68,0x7d] -# GFX12: v_cmpx_gt_i16_e32 0x3800, v2 0xf0,0x04,0x68,0x7d +# GFX12: v_cmpx_gt_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x68,0x7d,0x00,0x38,0x00,0x00] -# GFX12: v_cmpx_gt_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x68,0x7d] 0xfd,0x04,0x68,0x7d +# GFX12: v_cmpx_gt_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x68,0x7d] -# GFX12: v_cmpx_gt_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x68,0x7d,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x68,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_gt_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x68,0x7d,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_gt_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x88,0x7d] 0x01,0x05,0x88,0x7d +# GFX12: v_cmpx_gt_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x88,0x7d] -# GFX12: v_cmpx_gt_i32_e32 v255, v2 ; encoding: [0xff,0x05,0x88,0x7d] 0xff,0x05,0x88,0x7d +# GFX12: v_cmpx_gt_i32_e32 v255, v2 ; encoding: [0xff,0x05,0x88,0x7d] -# GFX12: v_cmpx_gt_i32_e32 s1, v2 ; encoding: [0x01,0x04,0x88,0x7d] 0x01,0x04,0x88,0x7d +# GFX12: v_cmpx_gt_i32_e32 s1, v2 ; encoding: [0x01,0x04,0x88,0x7d] -# GFX12: v_cmpx_gt_i32_e32 s105, v2 ; encoding: [0x69,0x04,0x88,0x7d] 0x69,0x04,0x88,0x7d +# GFX12: v_cmpx_gt_i32_e32 s105, v2 ; encoding: [0x69,0x04,0x88,0x7d] -# GFX12: v_cmpx_gt_i32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x88,0x7d] 0x6a,0x04,0x88,0x7d +# GFX12: v_cmpx_gt_i32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x88,0x7d] -# 
GFX12: v_cmpx_gt_i32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x88,0x7d] 0x6b,0x04,0x88,0x7d +# GFX12: v_cmpx_gt_i32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x88,0x7d] -# GFX12: v_cmpx_gt_i32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x88,0x7d] 0x7b,0x04,0x88,0x7d +# GFX12: v_cmpx_gt_i32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x88,0x7d] -# GFX12: v_cmpx_gt_i32_e32 m0, v2 ; encoding: [0x7d,0x04,0x88,0x7d] 0x7d,0x04,0x88,0x7d +# GFX12: v_cmpx_gt_i32_e32 m0, v2 ; encoding: [0x7d,0x04,0x88,0x7d] -# GFX12: v_cmpx_gt_i32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x88,0x7d] 0x7e,0x04,0x88,0x7d +# GFX12: v_cmpx_gt_i32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x88,0x7d] -# GFX12: v_cmpx_gt_i32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x88,0x7d] 0x7f,0x04,0x88,0x7d +# GFX12: v_cmpx_gt_i32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x88,0x7d] -# GFX12: v_cmpx_gt_i32_e32 null, v2 ; encoding: [0x7c,0x04,0x88,0x7d] 0x7c,0x04,0x88,0x7d +# GFX12: v_cmpx_gt_i32_e32 null, v2 ; encoding: [0x7c,0x04,0x88,0x7d] -# GFX12: v_cmpx_gt_i32_e32 -1, v2 ; encoding: [0xc1,0x04,0x88,0x7d] 0xc1,0x04,0x88,0x7d +# GFX12: v_cmpx_gt_i32_e32 -1, v2 ; encoding: [0xc1,0x04,0x88,0x7d] -# GFX12: v_cmpx_gt_i32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x88,0x7d] 0xf0,0x04,0x88,0x7d +# GFX12: v_cmpx_gt_i32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x88,0x7d] -# GFX12: v_cmpx_gt_i32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x88,0x7d] 0xfd,0x04,0x88,0x7d +# GFX12: v_cmpx_gt_i32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x88,0x7d] -# GFX12: v_cmpx_gt_i32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x89,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x89,0x7d,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_gt_i32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x89,0x7d,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_gt_i64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa8,0x7d] 0x01,0x05,0xa8,0x7d +# GFX12: v_cmpx_gt_i64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa8,0x7d] -# GFX12: v_cmpx_gt_i64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa8,0x7d] 0xfe,0x05,0xa8,0x7d +# GFX12: v_cmpx_gt_i64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa8,0x7d] -# GFX12: v_cmpx_gt_i64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa8,0x7d] 0x02,0x04,0xa8,0x7d +# GFX12: v_cmpx_gt_i64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa8,0x7d] -# GFX12: v_cmpx_gt_i64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa8,0x7d] 0x68,0x04,0xa8,0x7d +# GFX12: v_cmpx_gt_i64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa8,0x7d] -# GFX12: v_cmpx_gt_i64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xa8,0x7d] 0x6a,0x04,0xa8,0x7d +# GFX12: v_cmpx_gt_i64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xa8,0x7d] -# GFX12: v_cmpx_gt_i64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa8,0x7d] 0x7a,0x04,0xa8,0x7d +# GFX12: v_cmpx_gt_i64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa8,0x7d] -# GFX12: v_cmpx_gt_i64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xa8,0x7d] 0x7e,0x04,0xa8,0x7d +# GFX12: v_cmpx_gt_i64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xa8,0x7d] -# GFX12: v_cmpx_gt_i64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xa8,0x7d] 0x7c,0x04,0xa8,0x7d +# GFX12: v_cmpx_gt_i64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xa8,0x7d] -# GFX12: v_cmpx_gt_i64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xa8,0x7d] 0xc1,0x04,0xa8,0x7d +# GFX12: v_cmpx_gt_i64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xa8,0x7d] -# GFX12: v_cmpx_gt_i64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa8,0x7d] 0xf0,0x04,0xa8,0x7d +# GFX12: v_cmpx_gt_i64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa8,0x7d] -# GFX12: v_cmpx_gt_i64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa8,0x7d] 0xfd,0x04,0xa8,0x7d +# GFX12: 
v_cmpx_gt_i64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa8,0x7d] -# GFX12: v_cmpx_gt_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa9,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfc,0xa9,0x7d,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_gt_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa9,0x7d,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_gt_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x78,0x7d] 0x01,0x05,0x78,0x7d +# GFX12: v_cmpx_gt_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x78,0x7d] -# GFX12: v_cmpx_gt_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x78,0x7d] 0x7f,0x05,0x78,0x7d +# GFX12: v_cmpx_gt_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x78,0x7d] -# GFX12: v_cmpx_gt_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x78,0x7d] 0x01,0x04,0x78,0x7d +# GFX12: v_cmpx_gt_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x78,0x7d] -# GFX12: v_cmpx_gt_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x78,0x7d] 0x69,0x04,0x78,0x7d +# GFX12: v_cmpx_gt_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x78,0x7d] -# GFX12: v_cmpx_gt_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x78,0x7d] 0x6a,0x04,0x78,0x7d +# GFX12: v_cmpx_gt_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x78,0x7d] -# GFX12: v_cmpx_gt_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x78,0x7d] 0x6b,0x04,0x78,0x7d +# GFX12: v_cmpx_gt_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x78,0x7d] -# GFX12: v_cmpx_gt_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x78,0x7d] 0x7b,0x04,0x78,0x7d +# GFX12: v_cmpx_gt_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x78,0x7d] -# GFX12: v_cmpx_gt_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x78,0x7d] 0x7d,0x04,0x78,0x7d +# GFX12: v_cmpx_gt_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x78,0x7d] -# GFX12: v_cmpx_gt_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x78,0x7d] 0x7e,0x04,0x78,0x7d +# GFX12: v_cmpx_gt_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x78,0x7d] -# GFX12: v_cmpx_gt_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x78,0x7d] 0x7f,0x04,0x78,0x7d +# GFX12: v_cmpx_gt_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x78,0x7d] -# GFX12: v_cmpx_gt_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x78,0x7d] 0x7c,0x04,0x78,0x7d +# GFX12: v_cmpx_gt_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x78,0x7d] -# GFX12: v_cmpx_gt_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x78,0x7d] 0xc1,0x04,0x78,0x7d +# GFX12: v_cmpx_gt_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x78,0x7d] -# GFX12: v_cmpx_gt_u16_e32 0x3800, v2 0xf0,0x04,0x78,0x7d +# GFX12: v_cmpx_gt_u16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x78,0x7d,0x00,0x38,0x00,0x00] -# GFX12: v_cmpx_gt_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x78,0x7d] 0xfd,0x04,0x78,0x7d +# GFX12: v_cmpx_gt_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x78,0x7d] -# GFX12: v_cmpx_gt_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x78,0x7d,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x78,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_gt_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x78,0x7d,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_gt_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x98,0x7d] 0x01,0x05,0x98,0x7d +# GFX12: v_cmpx_gt_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x98,0x7d] -# GFX12: v_cmpx_gt_u32_e32 v255, v2 ; encoding: [0xff,0x05,0x98,0x7d] 0xff,0x05,0x98,0x7d +# GFX12: v_cmpx_gt_u32_e32 v255, v2 ; encoding: [0xff,0x05,0x98,0x7d] -# GFX12: v_cmpx_gt_u32_e32 s1, v2 ; encoding: [0x01,0x04,0x98,0x7d] 0x01,0x04,0x98,0x7d +# GFX12: v_cmpx_gt_u32_e32 s1, v2 ; encoding: [0x01,0x04,0x98,0x7d] -# GFX12: v_cmpx_gt_u32_e32 s105, v2 ; encoding: [0x69,0x04,0x98,0x7d] 0x69,0x04,0x98,0x7d +# GFX12: v_cmpx_gt_u32_e32 s105, v2 ; encoding: [0x69,0x04,0x98,0x7d] -# GFX12: v_cmpx_gt_u32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x98,0x7d] 0x6a,0x04,0x98,0x7d 
+# GFX12: v_cmpx_gt_u32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x98,0x7d] -# GFX12: v_cmpx_gt_u32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x98,0x7d] 0x6b,0x04,0x98,0x7d +# GFX12: v_cmpx_gt_u32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x98,0x7d] -# GFX12: v_cmpx_gt_u32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x98,0x7d] 0x7b,0x04,0x98,0x7d +# GFX12: v_cmpx_gt_u32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x98,0x7d] -# GFX12: v_cmpx_gt_u32_e32 m0, v2 ; encoding: [0x7d,0x04,0x98,0x7d] 0x7d,0x04,0x98,0x7d +# GFX12: v_cmpx_gt_u32_e32 m0, v2 ; encoding: [0x7d,0x04,0x98,0x7d] -# GFX12: v_cmpx_gt_u32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x98,0x7d] 0x7e,0x04,0x98,0x7d +# GFX12: v_cmpx_gt_u32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x98,0x7d] -# GFX12: v_cmpx_gt_u32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x98,0x7d] 0x7f,0x04,0x98,0x7d +# GFX12: v_cmpx_gt_u32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x98,0x7d] -# GFX12: v_cmpx_gt_u32_e32 null, v2 ; encoding: [0x7c,0x04,0x98,0x7d] 0x7c,0x04,0x98,0x7d +# GFX12: v_cmpx_gt_u32_e32 null, v2 ; encoding: [0x7c,0x04,0x98,0x7d] -# GFX12: v_cmpx_gt_u32_e32 -1, v2 ; encoding: [0xc1,0x04,0x98,0x7d] 0xc1,0x04,0x98,0x7d +# GFX12: v_cmpx_gt_u32_e32 -1, v2 ; encoding: [0xc1,0x04,0x98,0x7d] -# GFX12: v_cmpx_gt_u32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x98,0x7d] 0xf0,0x04,0x98,0x7d +# GFX12: v_cmpx_gt_u32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x98,0x7d] -# GFX12: v_cmpx_gt_u32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x98,0x7d] 0xfd,0x04,0x98,0x7d +# GFX12: v_cmpx_gt_u32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x98,0x7d] -# GFX12: v_cmpx_gt_u32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x99,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x99,0x7d,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_gt_u32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x99,0x7d,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_gt_u64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb8,0x7d] 0x01,0x05,0xb8,0x7d +# GFX12: v_cmpx_gt_u64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb8,0x7d] -# GFX12: v_cmpx_gt_u64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb8,0x7d] 0xfe,0x05,0xb8,0x7d +# GFX12: v_cmpx_gt_u64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb8,0x7d] -# GFX12: v_cmpx_gt_u64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb8,0x7d] 0x02,0x04,0xb8,0x7d +# GFX12: v_cmpx_gt_u64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb8,0x7d] -# GFX12: v_cmpx_gt_u64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb8,0x7d] 0x68,0x04,0xb8,0x7d +# GFX12: v_cmpx_gt_u64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb8,0x7d] -# GFX12: v_cmpx_gt_u64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xb8,0x7d] 0x6a,0x04,0xb8,0x7d +# GFX12: v_cmpx_gt_u64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xb8,0x7d] -# GFX12: v_cmpx_gt_u64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb8,0x7d] 0x7a,0x04,0xb8,0x7d +# GFX12: v_cmpx_gt_u64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb8,0x7d] -# GFX12: v_cmpx_gt_u64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xb8,0x7d] 0x7e,0x04,0xb8,0x7d +# GFX12: v_cmpx_gt_u64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xb8,0x7d] -# GFX12: v_cmpx_gt_u64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xb8,0x7d] 0x7c,0x04,0xb8,0x7d +# GFX12: v_cmpx_gt_u64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xb8,0x7d] -# GFX12: v_cmpx_gt_u64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xb8,0x7d] 0xc1,0x04,0xb8,0x7d +# GFX12: v_cmpx_gt_u64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xb8,0x7d] -# GFX12: v_cmpx_gt_u64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb8,0x7d] 0xf0,0x04,0xb8,0x7d +# GFX12: v_cmpx_gt_u64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb8,0x7d] -# GFX12: v_cmpx_gt_u64_e32 
src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb8,0x7d] 0xfd,0x04,0xb8,0x7d +# GFX12: v_cmpx_gt_u64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb8,0x7d] -# GFX12: v_cmpx_gt_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb9,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfc,0xb9,0x7d,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_gt_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb9,0x7d,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_le_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x06,0x7d] 0x01,0x05,0x06,0x7d +# GFX12: v_cmpx_le_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x06,0x7d] -# GFX12: v_cmpx_le_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x06,0x7d] 0x7f,0x05,0x06,0x7d +# GFX12: v_cmpx_le_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x06,0x7d] -# GFX12: v_cmpx_le_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x06,0x7d] 0x01,0x04,0x06,0x7d +# GFX12: v_cmpx_le_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x06,0x7d] -# GFX12: v_cmpx_le_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x06,0x7d] 0x69,0x04,0x06,0x7d +# GFX12: v_cmpx_le_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x06,0x7d] -# GFX12: v_cmpx_le_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x06,0x7d] 0x6a,0x04,0x06,0x7d +# GFX12: v_cmpx_le_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x06,0x7d] -# GFX12: v_cmpx_le_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x06,0x7d] 0x6b,0x04,0x06,0x7d +# GFX12: v_cmpx_le_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x06,0x7d] -# GFX12: v_cmpx_le_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x06,0x7d] 0x7b,0x04,0x06,0x7d +# GFX12: v_cmpx_le_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x06,0x7d] -# GFX12: v_cmpx_le_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x06,0x7d] 0x7d,0x04,0x06,0x7d +# GFX12: v_cmpx_le_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x06,0x7d] -# GFX12: v_cmpx_le_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x06,0x7d] 0x7e,0x04,0x06,0x7d +# GFX12: v_cmpx_le_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x06,0x7d] -# GFX12: v_cmpx_le_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x06,0x7d] 0x7f,0x04,0x06,0x7d +# GFX12: v_cmpx_le_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x06,0x7d] -# GFX12: v_cmpx_le_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x06,0x7d] 0x7c,0x04,0x06,0x7d +# GFX12: v_cmpx_le_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x06,0x7d] -# GFX12: v_cmpx_le_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x06,0x7d] 0xc1,0x04,0x06,0x7d +# GFX12: v_cmpx_le_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x06,0x7d] -# GFX12: v_cmpx_le_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x06,0x7d] 0xf0,0x04,0x06,0x7d +# GFX12: v_cmpx_le_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x06,0x7d] -# GFX12: v_cmpx_le_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x06,0x7d] 0xfd,0x04,0x06,0x7d +# GFX12: v_cmpx_le_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x06,0x7d] -# GFX12: v_cmpx_le_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x06,0x7d,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x06,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_le_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x06,0x7d,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_le_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x26,0x7d] 0x01,0x05,0x26,0x7d +# GFX12: v_cmpx_le_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x26,0x7d] -# GFX12: v_cmpx_le_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x26,0x7d] 0xff,0x05,0x26,0x7d +# GFX12: v_cmpx_le_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x26,0x7d] -# GFX12: v_cmpx_le_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x26,0x7d] 0x01,0x04,0x26,0x7d +# GFX12: v_cmpx_le_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x26,0x7d] -# GFX12: v_cmpx_le_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x26,0x7d] 0x69,0x04,0x26,0x7d +# GFX12: v_cmpx_le_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x26,0x7d] -# 
GFX12: v_cmpx_le_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x26,0x7d] 0x6a,0x04,0x26,0x7d +# GFX12: v_cmpx_le_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x26,0x7d] -# GFX12: v_cmpx_le_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x26,0x7d] 0x6b,0x04,0x26,0x7d +# GFX12: v_cmpx_le_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x26,0x7d] -# GFX12: v_cmpx_le_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x26,0x7d] 0x7b,0x04,0x26,0x7d +# GFX12: v_cmpx_le_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x26,0x7d] -# GFX12: v_cmpx_le_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x26,0x7d] 0x7d,0x04,0x26,0x7d +# GFX12: v_cmpx_le_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x26,0x7d] -# GFX12: v_cmpx_le_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x26,0x7d] 0x7e,0x04,0x26,0x7d +# GFX12: v_cmpx_le_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x26,0x7d] -# GFX12: v_cmpx_le_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x26,0x7d] 0x7f,0x04,0x26,0x7d +# GFX12: v_cmpx_le_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x26,0x7d] -# GFX12: v_cmpx_le_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x26,0x7d] 0x7c,0x04,0x26,0x7d +# GFX12: v_cmpx_le_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x26,0x7d] -# GFX12: v_cmpx_le_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x26,0x7d] 0xc1,0x04,0x26,0x7d +# GFX12: v_cmpx_le_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x26,0x7d] -# GFX12: v_cmpx_le_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x26,0x7d] 0xf0,0x04,0x26,0x7d +# GFX12: v_cmpx_le_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x26,0x7d] -# GFX12: v_cmpx_le_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x26,0x7d] 0xfd,0x04,0x26,0x7d +# GFX12: v_cmpx_le_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x26,0x7d] -# GFX12: v_cmpx_le_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x27,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x27,0x7d,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_le_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x27,0x7d,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_le_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x46,0x7d] 0x01,0x05,0x46,0x7d +# GFX12: v_cmpx_le_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x46,0x7d] -# GFX12: v_cmpx_le_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x46,0x7d] 0xfe,0x05,0x46,0x7d +# GFX12: v_cmpx_le_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x46,0x7d] -# GFX12: v_cmpx_le_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x46,0x7d] 0x02,0x04,0x46,0x7d +# GFX12: v_cmpx_le_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x46,0x7d] -# GFX12: v_cmpx_le_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x46,0x7d] 0x68,0x04,0x46,0x7d +# GFX12: v_cmpx_le_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x46,0x7d] -# GFX12: v_cmpx_le_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x46,0x7d] 0x6a,0x04,0x46,0x7d +# GFX12: v_cmpx_le_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x46,0x7d] -# GFX12: v_cmpx_le_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x46,0x7d] 0x7a,0x04,0x46,0x7d +# GFX12: v_cmpx_le_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x46,0x7d] -# GFX12: v_cmpx_le_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x46,0x7d] 0x7e,0x04,0x46,0x7d +# GFX12: v_cmpx_le_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x46,0x7d] -# GFX12: v_cmpx_le_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x46,0x7d] 0x7c,0x04,0x46,0x7d +# GFX12: v_cmpx_le_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x46,0x7d] -# GFX12: v_cmpx_le_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x46,0x7d] 0xc1,0x04,0x46,0x7d +# GFX12: v_cmpx_le_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x46,0x7d] -# GFX12: v_cmpx_le_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x46,0x7d] 0xf0,0x04,0x46,0x7d +# GFX12: 
v_cmpx_le_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x46,0x7d] -# GFX12: v_cmpx_le_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x46,0x7d] 0xfd,0x04,0x46,0x7d +# GFX12: v_cmpx_le_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x46,0x7d] -# GFX12: v_cmpx_le_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x47,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfc,0x47,0x7d,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_le_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x47,0x7d,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_le_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x66,0x7d] 0x01,0x05,0x66,0x7d +# GFX12: v_cmpx_le_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x66,0x7d] -# GFX12: v_cmpx_le_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x66,0x7d] 0x7f,0x05,0x66,0x7d +# GFX12: v_cmpx_le_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x66,0x7d] -# GFX12: v_cmpx_le_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x66,0x7d] 0x01,0x04,0x66,0x7d +# GFX12: v_cmpx_le_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x66,0x7d] -# GFX12: v_cmpx_le_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x66,0x7d] 0x69,0x04,0x66,0x7d +# GFX12: v_cmpx_le_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x66,0x7d] -# GFX12: v_cmpx_le_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x66,0x7d] 0x6a,0x04,0x66,0x7d +# GFX12: v_cmpx_le_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x66,0x7d] -# GFX12: v_cmpx_le_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x66,0x7d] 0x6b,0x04,0x66,0x7d +# GFX12: v_cmpx_le_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x66,0x7d] -# GFX12: v_cmpx_le_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x66,0x7d] 0x7b,0x04,0x66,0x7d +# GFX12: v_cmpx_le_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x66,0x7d] -# GFX12: v_cmpx_le_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x66,0x7d] 0x7d,0x04,0x66,0x7d +# GFX12: v_cmpx_le_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x66,0x7d] -# GFX12: v_cmpx_le_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x66,0x7d] 0x7e,0x04,0x66,0x7d +# GFX12: v_cmpx_le_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x66,0x7d] -# GFX12: v_cmpx_le_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x66,0x7d] 0x7f,0x04,0x66,0x7d +# GFX12: v_cmpx_le_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x66,0x7d] -# GFX12: v_cmpx_le_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x66,0x7d] 0x7c,0x04,0x66,0x7d +# GFX12: v_cmpx_le_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x66,0x7d] -# GFX12: v_cmpx_le_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x66,0x7d] 0xc1,0x04,0x66,0x7d +# GFX12: v_cmpx_le_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x66,0x7d] -# GFX12: v_cmpx_le_i16_e32 0x3800, v2 0xf0,0x04,0x66,0x7d +# GFX12: v_cmpx_le_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x66,0x7d,0x00,0x38,0x00,0x00] -# GFX12: v_cmpx_le_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x66,0x7d] 0xfd,0x04,0x66,0x7d +# GFX12: v_cmpx_le_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x66,0x7d] -# GFX12: v_cmpx_le_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x66,0x7d,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x66,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_le_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x66,0x7d,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_le_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x86,0x7d] 0x01,0x05,0x86,0x7d +# GFX12: v_cmpx_le_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x86,0x7d] -# GFX12: v_cmpx_le_i32_e32 v255, v2 ; encoding: [0xff,0x05,0x86,0x7d] 0xff,0x05,0x86,0x7d +# GFX12: v_cmpx_le_i32_e32 v255, v2 ; encoding: [0xff,0x05,0x86,0x7d] -# GFX12: v_cmpx_le_i32_e32 s1, v2 ; encoding: [0x01,0x04,0x86,0x7d] 0x01,0x04,0x86,0x7d +# GFX12: v_cmpx_le_i32_e32 s1, v2 ; encoding: [0x01,0x04,0x86,0x7d] -# GFX12: v_cmpx_le_i32_e32 s105, v2 ; encoding: [0x69,0x04,0x86,0x7d] 
0x69,0x04,0x86,0x7d +# GFX12: v_cmpx_le_i32_e32 s105, v2 ; encoding: [0x69,0x04,0x86,0x7d] -# GFX12: v_cmpx_le_i32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x86,0x7d] 0x6a,0x04,0x86,0x7d +# GFX12: v_cmpx_le_i32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x86,0x7d] -# GFX12: v_cmpx_le_i32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x86,0x7d] 0x6b,0x04,0x86,0x7d +# GFX12: v_cmpx_le_i32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x86,0x7d] -# GFX12: v_cmpx_le_i32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x86,0x7d] 0x7b,0x04,0x86,0x7d +# GFX12: v_cmpx_le_i32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x86,0x7d] -# GFX12: v_cmpx_le_i32_e32 m0, v2 ; encoding: [0x7d,0x04,0x86,0x7d] 0x7d,0x04,0x86,0x7d +# GFX12: v_cmpx_le_i32_e32 m0, v2 ; encoding: [0x7d,0x04,0x86,0x7d] -# GFX12: v_cmpx_le_i32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x86,0x7d] 0x7e,0x04,0x86,0x7d +# GFX12: v_cmpx_le_i32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x86,0x7d] -# GFX12: v_cmpx_le_i32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x86,0x7d] 0x7f,0x04,0x86,0x7d +# GFX12: v_cmpx_le_i32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x86,0x7d] -# GFX12: v_cmpx_le_i32_e32 null, v2 ; encoding: [0x7c,0x04,0x86,0x7d] 0x7c,0x04,0x86,0x7d +# GFX12: v_cmpx_le_i32_e32 null, v2 ; encoding: [0x7c,0x04,0x86,0x7d] -# GFX12: v_cmpx_le_i32_e32 -1, v2 ; encoding: [0xc1,0x04,0x86,0x7d] 0xc1,0x04,0x86,0x7d +# GFX12: v_cmpx_le_i32_e32 -1, v2 ; encoding: [0xc1,0x04,0x86,0x7d] -# GFX12: v_cmpx_le_i32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x86,0x7d] 0xf0,0x04,0x86,0x7d +# GFX12: v_cmpx_le_i32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x86,0x7d] -# GFX12: v_cmpx_le_i32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x86,0x7d] 0xfd,0x04,0x86,0x7d +# GFX12: v_cmpx_le_i32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x86,0x7d] -# GFX12: v_cmpx_le_i32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x87,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x87,0x7d,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_le_i32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x87,0x7d,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_le_i64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa6,0x7d] 0x01,0x05,0xa6,0x7d +# GFX12: v_cmpx_le_i64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa6,0x7d] -# GFX12: v_cmpx_le_i64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa6,0x7d] 0xfe,0x05,0xa6,0x7d +# GFX12: v_cmpx_le_i64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa6,0x7d] -# GFX12: v_cmpx_le_i64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa6,0x7d] 0x02,0x04,0xa6,0x7d +# GFX12: v_cmpx_le_i64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa6,0x7d] -# GFX12: v_cmpx_le_i64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa6,0x7d] 0x68,0x04,0xa6,0x7d +# GFX12: v_cmpx_le_i64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa6,0x7d] -# GFX12: v_cmpx_le_i64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xa6,0x7d] 0x6a,0x04,0xa6,0x7d +# GFX12: v_cmpx_le_i64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xa6,0x7d] -# GFX12: v_cmpx_le_i64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa6,0x7d] 0x7a,0x04,0xa6,0x7d +# GFX12: v_cmpx_le_i64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa6,0x7d] -# GFX12: v_cmpx_le_i64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xa6,0x7d] 0x7e,0x04,0xa6,0x7d +# GFX12: v_cmpx_le_i64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xa6,0x7d] -# GFX12: v_cmpx_le_i64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xa6,0x7d] 0x7c,0x04,0xa6,0x7d +# GFX12: v_cmpx_le_i64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xa6,0x7d] -# GFX12: v_cmpx_le_i64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xa6,0x7d] 0xc1,0x04,0xa6,0x7d +# GFX12: v_cmpx_le_i64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xa6,0x7d] -# GFX12: 
v_cmpx_le_i64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa6,0x7d] 0xf0,0x04,0xa6,0x7d +# GFX12: v_cmpx_le_i64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa6,0x7d] -# GFX12: v_cmpx_le_i64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa6,0x7d] 0xfd,0x04,0xa6,0x7d +# GFX12: v_cmpx_le_i64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa6,0x7d] -# GFX12: v_cmpx_le_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa7,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfc,0xa7,0x7d,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_le_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa7,0x7d,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_le_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x76,0x7d] 0x01,0x05,0x76,0x7d +# GFX12: v_cmpx_le_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x76,0x7d] -# GFX12: v_cmpx_le_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x76,0x7d] 0x7f,0x05,0x76,0x7d +# GFX12: v_cmpx_le_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x76,0x7d] -# GFX12: v_cmpx_le_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x76,0x7d] 0x01,0x04,0x76,0x7d +# GFX12: v_cmpx_le_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x76,0x7d] -# GFX12: v_cmpx_le_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x76,0x7d] 0x69,0x04,0x76,0x7d +# GFX12: v_cmpx_le_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x76,0x7d] -# GFX12: v_cmpx_le_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x76,0x7d] 0x6a,0x04,0x76,0x7d +# GFX12: v_cmpx_le_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x76,0x7d] -# GFX12: v_cmpx_le_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x76,0x7d] 0x6b,0x04,0x76,0x7d +# GFX12: v_cmpx_le_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x76,0x7d] -# GFX12: v_cmpx_le_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x76,0x7d] 0x7b,0x04,0x76,0x7d +# GFX12: v_cmpx_le_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x76,0x7d] -# GFX12: v_cmpx_le_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x76,0x7d] 0x7d,0x04,0x76,0x7d +# GFX12: v_cmpx_le_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x76,0x7d] -# GFX12: v_cmpx_le_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x76,0x7d] 0x7e,0x04,0x76,0x7d +# GFX12: v_cmpx_le_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x76,0x7d] -# GFX12: v_cmpx_le_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x76,0x7d] 0x7f,0x04,0x76,0x7d +# GFX12: v_cmpx_le_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x76,0x7d] -# GFX12: v_cmpx_le_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x76,0x7d] 0x7c,0x04,0x76,0x7d +# GFX12: v_cmpx_le_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x76,0x7d] -# GFX12: v_cmpx_le_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x76,0x7d] 0xc1,0x04,0x76,0x7d +# GFX12: v_cmpx_le_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x76,0x7d] -# GFX12: v_cmpx_le_u16_e32 0x3800, v2 0xf0,0x04,0x76,0x7d +# GFX12: v_cmpx_le_u16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x76,0x7d,0x00,0x38,0x00,0x00] -# GFX12: v_cmpx_le_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x76,0x7d] 0xfd,0x04,0x76,0x7d +# GFX12: v_cmpx_le_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x76,0x7d] -# GFX12: v_cmpx_le_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x76,0x7d,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x76,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_le_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x76,0x7d,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_le_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x96,0x7d] 0x01,0x05,0x96,0x7d +# GFX12: v_cmpx_le_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x96,0x7d] -# GFX12: v_cmpx_le_u32_e32 v255, v2 ; encoding: [0xff,0x05,0x96,0x7d] 0xff,0x05,0x96,0x7d +# GFX12: v_cmpx_le_u32_e32 v255, v2 ; encoding: [0xff,0x05,0x96,0x7d] -# GFX12: v_cmpx_le_u32_e32 s1, v2 ; encoding: [0x01,0x04,0x96,0x7d] 0x01,0x04,0x96,0x7d +# GFX12: v_cmpx_le_u32_e32 s1, v2 ; encoding: 
[0x01,0x04,0x96,0x7d] -# GFX12: v_cmpx_le_u32_e32 s105, v2 ; encoding: [0x69,0x04,0x96,0x7d] 0x69,0x04,0x96,0x7d +# GFX12: v_cmpx_le_u32_e32 s105, v2 ; encoding: [0x69,0x04,0x96,0x7d] -# GFX12: v_cmpx_le_u32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x96,0x7d] 0x6a,0x04,0x96,0x7d +# GFX12: v_cmpx_le_u32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x96,0x7d] -# GFX12: v_cmpx_le_u32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x96,0x7d] 0x6b,0x04,0x96,0x7d +# GFX12: v_cmpx_le_u32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x96,0x7d] -# GFX12: v_cmpx_le_u32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x96,0x7d] 0x7b,0x04,0x96,0x7d +# GFX12: v_cmpx_le_u32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x96,0x7d] -# GFX12: v_cmpx_le_u32_e32 m0, v2 ; encoding: [0x7d,0x04,0x96,0x7d] 0x7d,0x04,0x96,0x7d +# GFX12: v_cmpx_le_u32_e32 m0, v2 ; encoding: [0x7d,0x04,0x96,0x7d] -# GFX12: v_cmpx_le_u32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x96,0x7d] 0x7e,0x04,0x96,0x7d +# GFX12: v_cmpx_le_u32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x96,0x7d] -# GFX12: v_cmpx_le_u32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x96,0x7d] 0x7f,0x04,0x96,0x7d +# GFX12: v_cmpx_le_u32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x96,0x7d] -# GFX12: v_cmpx_le_u32_e32 null, v2 ; encoding: [0x7c,0x04,0x96,0x7d] 0x7c,0x04,0x96,0x7d +# GFX12: v_cmpx_le_u32_e32 null, v2 ; encoding: [0x7c,0x04,0x96,0x7d] -# GFX12: v_cmpx_le_u32_e32 -1, v2 ; encoding: [0xc1,0x04,0x96,0x7d] 0xc1,0x04,0x96,0x7d +# GFX12: v_cmpx_le_u32_e32 -1, v2 ; encoding: [0xc1,0x04,0x96,0x7d] -# GFX12: v_cmpx_le_u32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x96,0x7d] 0xf0,0x04,0x96,0x7d +# GFX12: v_cmpx_le_u32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x96,0x7d] -# GFX12: v_cmpx_le_u32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x96,0x7d] 0xfd,0x04,0x96,0x7d +# GFX12: v_cmpx_le_u32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x96,0x7d] -# GFX12: v_cmpx_le_u32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x97,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x97,0x7d,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_le_u32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x97,0x7d,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_le_u64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb6,0x7d] 0x01,0x05,0xb6,0x7d +# GFX12: v_cmpx_le_u64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb6,0x7d] -# GFX12: v_cmpx_le_u64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb6,0x7d] 0xfe,0x05,0xb6,0x7d +# GFX12: v_cmpx_le_u64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb6,0x7d] -# GFX12: v_cmpx_le_u64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb6,0x7d] 0x02,0x04,0xb6,0x7d +# GFX12: v_cmpx_le_u64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb6,0x7d] -# GFX12: v_cmpx_le_u64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb6,0x7d] 0x68,0x04,0xb6,0x7d +# GFX12: v_cmpx_le_u64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb6,0x7d] -# GFX12: v_cmpx_le_u64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xb6,0x7d] 0x6a,0x04,0xb6,0x7d +# GFX12: v_cmpx_le_u64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xb6,0x7d] -# GFX12: v_cmpx_le_u64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb6,0x7d] 0x7a,0x04,0xb6,0x7d +# GFX12: v_cmpx_le_u64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb6,0x7d] -# GFX12: v_cmpx_le_u64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xb6,0x7d] 0x7e,0x04,0xb6,0x7d +# GFX12: v_cmpx_le_u64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xb6,0x7d] -# GFX12: v_cmpx_le_u64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xb6,0x7d] 0x7c,0x04,0xb6,0x7d +# GFX12: v_cmpx_le_u64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xb6,0x7d] -# GFX12: v_cmpx_le_u64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xb6,0x7d] 
0xc1,0x04,0xb6,0x7d +# GFX12: v_cmpx_le_u64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xb6,0x7d] -# GFX12: v_cmpx_le_u64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb6,0x7d] 0xf0,0x04,0xb6,0x7d +# GFX12: v_cmpx_le_u64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb6,0x7d] -# GFX12: v_cmpx_le_u64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb6,0x7d] 0xfd,0x04,0xb6,0x7d +# GFX12: v_cmpx_le_u64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb6,0x7d] -# GFX12: v_cmpx_le_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb7,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfc,0xb7,0x7d,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_le_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb7,0x7d,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_lg_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0a,0x7d] 0x01,0x05,0x0a,0x7d +# GFX12: v_cmpx_lg_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0a,0x7d] -# GFX12: v_cmpx_lg_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0a,0x7d] 0x7f,0x05,0x0a,0x7d +# GFX12: v_cmpx_lg_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0a,0x7d] -# GFX12: v_cmpx_lg_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0a,0x7d] 0x01,0x04,0x0a,0x7d +# GFX12: v_cmpx_lg_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0a,0x7d] -# GFX12: v_cmpx_lg_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0a,0x7d] 0x69,0x04,0x0a,0x7d +# GFX12: v_cmpx_lg_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0a,0x7d] -# GFX12: v_cmpx_lg_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x7d] 0x6a,0x04,0x0a,0x7d +# GFX12: v_cmpx_lg_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x7d] -# GFX12: v_cmpx_lg_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x7d] 0x6b,0x04,0x0a,0x7d +# GFX12: v_cmpx_lg_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x7d] -# GFX12: v_cmpx_lg_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x7d] 0x7b,0x04,0x0a,0x7d +# GFX12: v_cmpx_lg_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x7d] -# GFX12: v_cmpx_lg_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0a,0x7d] 0x7d,0x04,0x0a,0x7d +# GFX12: v_cmpx_lg_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0a,0x7d] -# GFX12: v_cmpx_lg_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x7d] 0x7e,0x04,0x0a,0x7d +# GFX12: v_cmpx_lg_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x7d] -# GFX12: v_cmpx_lg_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x7d] 0x7f,0x04,0x0a,0x7d +# GFX12: v_cmpx_lg_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x7d] -# GFX12: v_cmpx_lg_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0a,0x7d] 0x7c,0x04,0x0a,0x7d +# GFX12: v_cmpx_lg_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0a,0x7d] -# GFX12: v_cmpx_lg_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0a,0x7d] 0xc1,0x04,0x0a,0x7d +# GFX12: v_cmpx_lg_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0a,0x7d] -# GFX12: v_cmpx_lg_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x7d] 0xf0,0x04,0x0a,0x7d +# GFX12: v_cmpx_lg_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x7d] -# GFX12: v_cmpx_lg_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x7d] 0xfd,0x04,0x0a,0x7d +# GFX12: v_cmpx_lg_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x7d] -# GFX12: v_cmpx_lg_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0a,0x7d,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x0a,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_lg_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0a,0x7d,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_lg_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2a,0x7d] 0x01,0x05,0x2a,0x7d +# GFX12: v_cmpx_lg_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2a,0x7d] -# GFX12: v_cmpx_lg_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x2a,0x7d] 0xff,0x05,0x2a,0x7d +# GFX12: v_cmpx_lg_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x2a,0x7d] -# GFX12: v_cmpx_lg_f32_e32 s1, v2 ; 
encoding: [0x01,0x04,0x2a,0x7d] 0x01,0x04,0x2a,0x7d +# GFX12: v_cmpx_lg_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x2a,0x7d] -# GFX12: v_cmpx_lg_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x2a,0x7d] 0x69,0x04,0x2a,0x7d +# GFX12: v_cmpx_lg_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x2a,0x7d] -# GFX12: v_cmpx_lg_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x2a,0x7d] 0x6a,0x04,0x2a,0x7d +# GFX12: v_cmpx_lg_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x2a,0x7d] -# GFX12: v_cmpx_lg_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x2a,0x7d] 0x6b,0x04,0x2a,0x7d +# GFX12: v_cmpx_lg_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x2a,0x7d] -# GFX12: v_cmpx_lg_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x2a,0x7d] 0x7b,0x04,0x2a,0x7d +# GFX12: v_cmpx_lg_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x2a,0x7d] -# GFX12: v_cmpx_lg_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x2a,0x7d] 0x7d,0x04,0x2a,0x7d +# GFX12: v_cmpx_lg_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x2a,0x7d] -# GFX12: v_cmpx_lg_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x2a,0x7d] 0x7e,0x04,0x2a,0x7d +# GFX12: v_cmpx_lg_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x2a,0x7d] -# GFX12: v_cmpx_lg_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x2a,0x7d] 0x7f,0x04,0x2a,0x7d +# GFX12: v_cmpx_lg_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x2a,0x7d] -# GFX12: v_cmpx_lg_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x2a,0x7d] 0x7c,0x04,0x2a,0x7d +# GFX12: v_cmpx_lg_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x2a,0x7d] -# GFX12: v_cmpx_lg_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x2a,0x7d] 0xc1,0x04,0x2a,0x7d +# GFX12: v_cmpx_lg_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x2a,0x7d] -# GFX12: v_cmpx_lg_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x2a,0x7d] 0xf0,0x04,0x2a,0x7d +# GFX12: v_cmpx_lg_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x2a,0x7d] -# GFX12: v_cmpx_lg_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x2a,0x7d] 0xfd,0x04,0x2a,0x7d +# GFX12: v_cmpx_lg_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x2a,0x7d] -# GFX12: v_cmpx_lg_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2b,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x2b,0x7d,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_lg_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2b,0x7d,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_lg_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4a,0x7d] 0x01,0x05,0x4a,0x7d +# GFX12: v_cmpx_lg_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4a,0x7d] -# GFX12: v_cmpx_lg_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4a,0x7d] 0xfe,0x05,0x4a,0x7d +# GFX12: v_cmpx_lg_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4a,0x7d] -# GFX12: v_cmpx_lg_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x4a,0x7d] 0x02,0x04,0x4a,0x7d +# GFX12: v_cmpx_lg_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x4a,0x7d] -# GFX12: v_cmpx_lg_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4a,0x7d] 0x68,0x04,0x4a,0x7d +# GFX12: v_cmpx_lg_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4a,0x7d] -# GFX12: v_cmpx_lg_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x4a,0x7d] 0x6a,0x04,0x4a,0x7d +# GFX12: v_cmpx_lg_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x4a,0x7d] -# GFX12: v_cmpx_lg_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x4a,0x7d] 0x7a,0x04,0x4a,0x7d +# GFX12: v_cmpx_lg_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x4a,0x7d] -# GFX12: v_cmpx_lg_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x4a,0x7d] 0x7e,0x04,0x4a,0x7d +# GFX12: v_cmpx_lg_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x4a,0x7d] -# GFX12: v_cmpx_lg_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x4a,0x7d] 0x7c,0x04,0x4a,0x7d +# GFX12: v_cmpx_lg_f64_e32 null, v[2:3] ; encoding: 
[0x7c,0x04,0x4a,0x7d] -# GFX12: v_cmpx_lg_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x4a,0x7d] 0xc1,0x04,0x4a,0x7d +# GFX12: v_cmpx_lg_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x4a,0x7d] -# GFX12: v_cmpx_lg_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4a,0x7d] 0xf0,0x04,0x4a,0x7d +# GFX12: v_cmpx_lg_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4a,0x7d] -# GFX12: v_cmpx_lg_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4a,0x7d] 0xfd,0x04,0x4a,0x7d +# GFX12: v_cmpx_lg_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4a,0x7d] -# GFX12: v_cmpx_lg_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4b,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfc,0x4b,0x7d,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_lg_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4b,0x7d,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_lt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x02,0x7d] 0x01,0x05,0x02,0x7d +# GFX12: v_cmpx_lt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x02,0x7d] -# GFX12: v_cmpx_lt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x02,0x7d] 0x7f,0x05,0x02,0x7d +# GFX12: v_cmpx_lt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x02,0x7d] -# GFX12: v_cmpx_lt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x02,0x7d] 0x01,0x04,0x02,0x7d +# GFX12: v_cmpx_lt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x02,0x7d] -# GFX12: v_cmpx_lt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x02,0x7d] 0x69,0x04,0x02,0x7d +# GFX12: v_cmpx_lt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x02,0x7d] -# GFX12: v_cmpx_lt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x02,0x7d] 0x6a,0x04,0x02,0x7d +# GFX12: v_cmpx_lt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x02,0x7d] -# GFX12: v_cmpx_lt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x02,0x7d] 0x6b,0x04,0x02,0x7d +# GFX12: v_cmpx_lt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x02,0x7d] -# GFX12: v_cmpx_lt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x02,0x7d] 0x7b,0x04,0x02,0x7d +# GFX12: v_cmpx_lt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x02,0x7d] -# GFX12: v_cmpx_lt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x02,0x7d] 0x7d,0x04,0x02,0x7d +# GFX12: v_cmpx_lt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x02,0x7d] -# GFX12: v_cmpx_lt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x02,0x7d] 0x7e,0x04,0x02,0x7d +# GFX12: v_cmpx_lt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x02,0x7d] -# GFX12: v_cmpx_lt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x02,0x7d] 0x7f,0x04,0x02,0x7d +# GFX12: v_cmpx_lt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x02,0x7d] -# GFX12: v_cmpx_lt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x02,0x7d] 0x7c,0x04,0x02,0x7d +# GFX12: v_cmpx_lt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x02,0x7d] -# GFX12: v_cmpx_lt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x02,0x7d] 0xc1,0x04,0x02,0x7d +# GFX12: v_cmpx_lt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x02,0x7d] -# GFX12: v_cmpx_lt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x02,0x7d] 0xf0,0x04,0x02,0x7d +# GFX12: v_cmpx_lt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x02,0x7d] -# GFX12: v_cmpx_lt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x02,0x7d] 0xfd,0x04,0x02,0x7d +# GFX12: v_cmpx_lt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x02,0x7d] -# GFX12: v_cmpx_lt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x02,0x7d,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x02,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_lt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x02,0x7d,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_lt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x22,0x7d] 0x01,0x05,0x22,0x7d +# GFX12: v_cmpx_lt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x22,0x7d] -# GFX12: v_cmpx_lt_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x22,0x7d] 0xff,0x05,0x22,0x7d +# GFX12: 
v_cmpx_lt_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x22,0x7d] -# GFX12: v_cmpx_lt_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x22,0x7d] 0x01,0x04,0x22,0x7d +# GFX12: v_cmpx_lt_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x22,0x7d] -# GFX12: v_cmpx_lt_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x22,0x7d] 0x69,0x04,0x22,0x7d +# GFX12: v_cmpx_lt_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x22,0x7d] -# GFX12: v_cmpx_lt_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x22,0x7d] 0x6a,0x04,0x22,0x7d +# GFX12: v_cmpx_lt_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x22,0x7d] -# GFX12: v_cmpx_lt_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x22,0x7d] 0x6b,0x04,0x22,0x7d +# GFX12: v_cmpx_lt_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x22,0x7d] -# GFX12: v_cmpx_lt_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x22,0x7d] 0x7b,0x04,0x22,0x7d +# GFX12: v_cmpx_lt_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x22,0x7d] -# GFX12: v_cmpx_lt_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x22,0x7d] 0x7d,0x04,0x22,0x7d +# GFX12: v_cmpx_lt_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x22,0x7d] -# GFX12: v_cmpx_lt_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x22,0x7d] 0x7e,0x04,0x22,0x7d +# GFX12: v_cmpx_lt_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x22,0x7d] -# GFX12: v_cmpx_lt_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x22,0x7d] 0x7f,0x04,0x22,0x7d +# GFX12: v_cmpx_lt_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x22,0x7d] -# GFX12: v_cmpx_lt_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x22,0x7d] 0x7c,0x04,0x22,0x7d +# GFX12: v_cmpx_lt_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x22,0x7d] -# GFX12: v_cmpx_lt_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x22,0x7d] 0xc1,0x04,0x22,0x7d +# GFX12: v_cmpx_lt_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x22,0x7d] -# GFX12: v_cmpx_lt_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x22,0x7d] 0xf0,0x04,0x22,0x7d +# GFX12: v_cmpx_lt_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x22,0x7d] -# GFX12: v_cmpx_lt_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x22,0x7d] 0xfd,0x04,0x22,0x7d +# GFX12: v_cmpx_lt_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x22,0x7d] -# GFX12: v_cmpx_lt_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x23,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x23,0x7d,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_lt_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x23,0x7d,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_lt_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x42,0x7d] 0x01,0x05,0x42,0x7d +# GFX12: v_cmpx_lt_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x42,0x7d] -# GFX12: v_cmpx_lt_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x42,0x7d] 0xfe,0x05,0x42,0x7d +# GFX12: v_cmpx_lt_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x42,0x7d] -# GFX12: v_cmpx_lt_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x42,0x7d] 0x02,0x04,0x42,0x7d +# GFX12: v_cmpx_lt_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x42,0x7d] -# GFX12: v_cmpx_lt_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x42,0x7d] 0x68,0x04,0x42,0x7d +# GFX12: v_cmpx_lt_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x42,0x7d] -# GFX12: v_cmpx_lt_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x42,0x7d] 0x6a,0x04,0x42,0x7d +# GFX12: v_cmpx_lt_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x42,0x7d] -# GFX12: v_cmpx_lt_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x42,0x7d] 0x7a,0x04,0x42,0x7d +# GFX12: v_cmpx_lt_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x42,0x7d] -# GFX12: v_cmpx_lt_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x42,0x7d] 0x7e,0x04,0x42,0x7d +# GFX12: v_cmpx_lt_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x42,0x7d] -# GFX12: v_cmpx_lt_f64_e32 null, v[2:3] ; encoding: 
[0x7c,0x04,0x42,0x7d] 0x7c,0x04,0x42,0x7d +# GFX12: v_cmpx_lt_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x42,0x7d] -# GFX12: v_cmpx_lt_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x42,0x7d] 0xc1,0x04,0x42,0x7d +# GFX12: v_cmpx_lt_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x42,0x7d] -# GFX12: v_cmpx_lt_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x42,0x7d] 0xf0,0x04,0x42,0x7d +# GFX12: v_cmpx_lt_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x42,0x7d] -# GFX12: v_cmpx_lt_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x42,0x7d] 0xfd,0x04,0x42,0x7d +# GFX12: v_cmpx_lt_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x42,0x7d] -# GFX12: v_cmpx_lt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x43,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfc,0x43,0x7d,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_lt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x43,0x7d,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_lt_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x62,0x7d] 0x01,0x05,0x62,0x7d +# GFX12: v_cmpx_lt_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x62,0x7d] -# GFX12: v_cmpx_lt_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x62,0x7d] 0x7f,0x05,0x62,0x7d +# GFX12: v_cmpx_lt_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x62,0x7d] -# GFX12: v_cmpx_lt_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x62,0x7d] 0x01,0x04,0x62,0x7d +# GFX12: v_cmpx_lt_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x62,0x7d] -# GFX12: v_cmpx_lt_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x62,0x7d] 0x69,0x04,0x62,0x7d +# GFX12: v_cmpx_lt_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x62,0x7d] -# GFX12: v_cmpx_lt_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x62,0x7d] 0x6a,0x04,0x62,0x7d +# GFX12: v_cmpx_lt_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x62,0x7d] -# GFX12: v_cmpx_lt_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x62,0x7d] 0x6b,0x04,0x62,0x7d +# GFX12: v_cmpx_lt_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x62,0x7d] -# GFX12: v_cmpx_lt_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x62,0x7d] 0x7b,0x04,0x62,0x7d +# GFX12: v_cmpx_lt_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x62,0x7d] -# GFX12: v_cmpx_lt_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x62,0x7d] 0x7d,0x04,0x62,0x7d +# GFX12: v_cmpx_lt_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x62,0x7d] -# GFX12: v_cmpx_lt_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x62,0x7d] 0x7e,0x04,0x62,0x7d +# GFX12: v_cmpx_lt_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x62,0x7d] -# GFX12: v_cmpx_lt_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x62,0x7d] 0x7f,0x04,0x62,0x7d +# GFX12: v_cmpx_lt_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x62,0x7d] -# GFX12: v_cmpx_lt_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x62,0x7d] 0x7c,0x04,0x62,0x7d +# GFX12: v_cmpx_lt_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x62,0x7d] -# GFX12: v_cmpx_lt_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x62,0x7d] 0xc1,0x04,0x62,0x7d +# GFX12: v_cmpx_lt_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x62,0x7d] -# GFX12: v_cmpx_lt_i16_e32 0x3800, v2 0xf0,0x04,0x62,0x7d +# GFX12: v_cmpx_lt_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x62,0x7d,0x00,0x38,0x00,0x00] -# GFX12: v_cmpx_lt_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x62,0x7d] 0xfd,0x04,0x62,0x7d +# GFX12: v_cmpx_lt_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x62,0x7d] -# GFX12: v_cmpx_lt_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x62,0x7d,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x62,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_lt_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x62,0x7d,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_lt_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x82,0x7d] 0x01,0x05,0x82,0x7d +# GFX12: v_cmpx_lt_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x82,0x7d] -# GFX12: 
v_cmpx_lt_i32_e32 v255, v2 ; encoding: [0xff,0x05,0x82,0x7d] 0xff,0x05,0x82,0x7d +# GFX12: v_cmpx_lt_i32_e32 v255, v2 ; encoding: [0xff,0x05,0x82,0x7d] -# GFX12: v_cmpx_lt_i32_e32 s1, v2 ; encoding: [0x01,0x04,0x82,0x7d] 0x01,0x04,0x82,0x7d +# GFX12: v_cmpx_lt_i32_e32 s1, v2 ; encoding: [0x01,0x04,0x82,0x7d] -# GFX12: v_cmpx_lt_i32_e32 s105, v2 ; encoding: [0x69,0x04,0x82,0x7d] 0x69,0x04,0x82,0x7d +# GFX12: v_cmpx_lt_i32_e32 s105, v2 ; encoding: [0x69,0x04,0x82,0x7d] -# GFX12: v_cmpx_lt_i32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x82,0x7d] 0x6a,0x04,0x82,0x7d +# GFX12: v_cmpx_lt_i32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x82,0x7d] -# GFX12: v_cmpx_lt_i32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x82,0x7d] 0x6b,0x04,0x82,0x7d +# GFX12: v_cmpx_lt_i32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x82,0x7d] -# GFX12: v_cmpx_lt_i32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x82,0x7d] 0x7b,0x04,0x82,0x7d +# GFX12: v_cmpx_lt_i32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x82,0x7d] -# GFX12: v_cmpx_lt_i32_e32 m0, v2 ; encoding: [0x7d,0x04,0x82,0x7d] 0x7d,0x04,0x82,0x7d +# GFX12: v_cmpx_lt_i32_e32 m0, v2 ; encoding: [0x7d,0x04,0x82,0x7d] -# GFX12: v_cmpx_lt_i32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x82,0x7d] 0x7e,0x04,0x82,0x7d +# GFX12: v_cmpx_lt_i32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x82,0x7d] -# GFX12: v_cmpx_lt_i32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x82,0x7d] 0x7f,0x04,0x82,0x7d +# GFX12: v_cmpx_lt_i32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x82,0x7d] -# GFX12: v_cmpx_lt_i32_e32 null, v2 ; encoding: [0x7c,0x04,0x82,0x7d] 0x7c,0x04,0x82,0x7d +# GFX12: v_cmpx_lt_i32_e32 null, v2 ; encoding: [0x7c,0x04,0x82,0x7d] -# GFX12: v_cmpx_lt_i32_e32 -1, v2 ; encoding: [0xc1,0x04,0x82,0x7d] 0xc1,0x04,0x82,0x7d +# GFX12: v_cmpx_lt_i32_e32 -1, v2 ; encoding: [0xc1,0x04,0x82,0x7d] -# GFX12: v_cmpx_lt_i32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x82,0x7d] 0xf0,0x04,0x82,0x7d +# GFX12: v_cmpx_lt_i32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x82,0x7d] -# GFX12: v_cmpx_lt_i32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x82,0x7d] 0xfd,0x04,0x82,0x7d +# GFX12: v_cmpx_lt_i32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x82,0x7d] -# GFX12: v_cmpx_lt_i32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x83,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x83,0x7d,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_lt_i32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x83,0x7d,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_lt_i64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa2,0x7d] 0x01,0x05,0xa2,0x7d +# GFX12: v_cmpx_lt_i64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xa2,0x7d] -# GFX12: v_cmpx_lt_i64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa2,0x7d] 0xfe,0x05,0xa2,0x7d +# GFX12: v_cmpx_lt_i64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xa2,0x7d] -# GFX12: v_cmpx_lt_i64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa2,0x7d] 0x02,0x04,0xa2,0x7d +# GFX12: v_cmpx_lt_i64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xa2,0x7d] -# GFX12: v_cmpx_lt_i64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa2,0x7d] 0x68,0x04,0xa2,0x7d +# GFX12: v_cmpx_lt_i64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xa2,0x7d] -# GFX12: v_cmpx_lt_i64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xa2,0x7d] 0x6a,0x04,0xa2,0x7d +# GFX12: v_cmpx_lt_i64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xa2,0x7d] -# GFX12: v_cmpx_lt_i64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa2,0x7d] 0x7a,0x04,0xa2,0x7d +# GFX12: v_cmpx_lt_i64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xa2,0x7d] -# GFX12: v_cmpx_lt_i64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xa2,0x7d] 0x7e,0x04,0xa2,0x7d +# GFX12: v_cmpx_lt_i64_e32 exec, v[2:3] ; 
encoding: [0x7e,0x04,0xa2,0x7d] -# GFX12: v_cmpx_lt_i64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xa2,0x7d] 0x7c,0x04,0xa2,0x7d +# GFX12: v_cmpx_lt_i64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xa2,0x7d] -# GFX12: v_cmpx_lt_i64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xa2,0x7d] 0xc1,0x04,0xa2,0x7d +# GFX12: v_cmpx_lt_i64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xa2,0x7d] -# GFX12: v_cmpx_lt_i64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa2,0x7d] 0xf0,0x04,0xa2,0x7d +# GFX12: v_cmpx_lt_i64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xa2,0x7d] -# GFX12: v_cmpx_lt_i64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa2,0x7d] 0xfd,0x04,0xa2,0x7d +# GFX12: v_cmpx_lt_i64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xa2,0x7d] -# GFX12: v_cmpx_lt_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa3,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfc,0xa3,0x7d,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_lt_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa3,0x7d,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_lt_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x72,0x7d] 0x01,0x05,0x72,0x7d +# GFX12: v_cmpx_lt_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x72,0x7d] -# GFX12: v_cmpx_lt_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x72,0x7d] 0x7f,0x05,0x72,0x7d +# GFX12: v_cmpx_lt_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x72,0x7d] -# GFX12: v_cmpx_lt_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x72,0x7d] 0x01,0x04,0x72,0x7d +# GFX12: v_cmpx_lt_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x72,0x7d] -# GFX12: v_cmpx_lt_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x72,0x7d] 0x69,0x04,0x72,0x7d +# GFX12: v_cmpx_lt_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x72,0x7d] -# GFX12: v_cmpx_lt_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x72,0x7d] 0x6a,0x04,0x72,0x7d +# GFX12: v_cmpx_lt_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x72,0x7d] -# GFX12: v_cmpx_lt_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x72,0x7d] 0x6b,0x04,0x72,0x7d +# GFX12: v_cmpx_lt_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x72,0x7d] -# GFX12: v_cmpx_lt_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x72,0x7d] 0x7b,0x04,0x72,0x7d +# GFX12: v_cmpx_lt_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x72,0x7d] -# GFX12: v_cmpx_lt_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x72,0x7d] 0x7d,0x04,0x72,0x7d +# GFX12: v_cmpx_lt_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x72,0x7d] -# GFX12: v_cmpx_lt_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x72,0x7d] 0x7e,0x04,0x72,0x7d +# GFX12: v_cmpx_lt_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x72,0x7d] -# GFX12: v_cmpx_lt_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x72,0x7d] 0x7f,0x04,0x72,0x7d +# GFX12: v_cmpx_lt_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x72,0x7d] -# GFX12: v_cmpx_lt_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x72,0x7d] 0x7c,0x04,0x72,0x7d +# GFX12: v_cmpx_lt_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x72,0x7d] -# GFX12: v_cmpx_lt_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x72,0x7d] 0xc1,0x04,0x72,0x7d +# GFX12: v_cmpx_lt_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x72,0x7d] -# GFX12: v_cmpx_lt_u16_e32 0x3800, v2 0xf0,0x04,0x72,0x7d +# GFX12: v_cmpx_lt_u16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x72,0x7d,0x00,0x38,0x00,0x00] -# GFX12: v_cmpx_lt_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x72,0x7d] 0xfd,0x04,0x72,0x7d +# GFX12: v_cmpx_lt_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x72,0x7d] -# GFX12: v_cmpx_lt_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x72,0x7d,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x72,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_lt_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x72,0x7d,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_lt_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x92,0x7d] 0x01,0x05,0x92,0x7d 
+# GFX12: v_cmpx_lt_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x92,0x7d]
-# GFX12: v_cmpx_lt_u32_e32 v255, v2 ; encoding: [0xff,0x05,0x92,0x7d]
 0xff,0x05,0x92,0x7d
+# GFX12: v_cmpx_lt_u32_e32 v255, v2 ; encoding: [0xff,0x05,0x92,0x7d]
-# GFX12: v_cmpx_lt_u32_e32 s1, v2 ; encoding: [0x01,0x04,0x92,0x7d]
 0x01,0x04,0x92,0x7d
+# GFX12: v_cmpx_lt_u32_e32 s1, v2 ; encoding: [0x01,0x04,0x92,0x7d]
-# GFX12: v_cmpx_lt_u32_e32 s105, v2 ; encoding: [0x69,0x04,0x92,0x7d]
 0x69,0x04,0x92,0x7d
+# GFX12: v_cmpx_lt_u32_e32 s105, v2 ; encoding: [0x69,0x04,0x92,0x7d]
-# GFX12: v_cmpx_lt_u32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x92,0x7d]
 0x6a,0x04,0x92,0x7d
+# GFX12: v_cmpx_lt_u32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x92,0x7d]
-# GFX12: v_cmpx_lt_u32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x92,0x7d]
 0x6b,0x04,0x92,0x7d
+# GFX12: v_cmpx_lt_u32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x92,0x7d]
-# GFX12: v_cmpx_lt_u32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x92,0x7d]
 0x7b,0x04,0x92,0x7d
+# GFX12: v_cmpx_lt_u32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x92,0x7d]
-# GFX12: v_cmpx_lt_u32_e32 m0, v2 ; encoding: [0x7d,0x04,0x92,0x7d]
 0x7d,0x04,0x92,0x7d
+# GFX12: v_cmpx_lt_u32_e32 m0, v2 ; encoding: [0x7d,0x04,0x92,0x7d]
-# GFX12: v_cmpx_lt_u32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x92,0x7d]
 0x7e,0x04,0x92,0x7d
+# GFX12: v_cmpx_lt_u32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x92,0x7d]
-# GFX12: v_cmpx_lt_u32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x92,0x7d]
 0x7f,0x04,0x92,0x7d
+# GFX12: v_cmpx_lt_u32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x92,0x7d]
-# GFX12: v_cmpx_lt_u32_e32 null, v2 ; encoding: [0x7c,0x04,0x92,0x7d]
 0x7c,0x04,0x92,0x7d
+# GFX12: v_cmpx_lt_u32_e32 null, v2 ; encoding: [0x7c,0x04,0x92,0x7d]
-# GFX12: v_cmpx_lt_u32_e32 -1, v2 ; encoding: [0xc1,0x04,0x92,0x7d]
 0xc1,0x04,0x92,0x7d
+# GFX12: v_cmpx_lt_u32_e32 -1, v2 ; encoding: [0xc1,0x04,0x92,0x7d]
-# GFX12: v_cmpx_lt_u32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x92,0x7d]
 0xf0,0x04,0x92,0x7d
+# GFX12: v_cmpx_lt_u32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x92,0x7d]
-# GFX12: v_cmpx_lt_u32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x92,0x7d]
 0xfd,0x04,0x92,0x7d
+# GFX12: v_cmpx_lt_u32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x92,0x7d]
-# GFX12: v_cmpx_lt_u32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x93,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfe,0x93,0x7d,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_lt_u32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x93,0x7d,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_lt_u64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb2,0x7d]
 0x01,0x05,0xb2,0x7d
+# GFX12: v_cmpx_lt_u64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xb2,0x7d]
-# GFX12: v_cmpx_lt_u64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb2,0x7d]
 0xfe,0x05,0xb2,0x7d
+# GFX12: v_cmpx_lt_u64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xb2,0x7d]
-# GFX12: v_cmpx_lt_u64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb2,0x7d]
 0x02,0x04,0xb2,0x7d
+# GFX12: v_cmpx_lt_u64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xb2,0x7d]
-# GFX12: v_cmpx_lt_u64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb2,0x7d]
 0x68,0x04,0xb2,0x7d
+# GFX12: v_cmpx_lt_u64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xb2,0x7d]
-# GFX12: v_cmpx_lt_u64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xb2,0x7d]
 0x6a,0x04,0xb2,0x7d
+# GFX12: v_cmpx_lt_u64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xb2,0x7d]
-# GFX12: v_cmpx_lt_u64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb2,0x7d]
 0x7a,0x04,0xb2,0x7d
+# GFX12: v_cmpx_lt_u64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xb2,0x7d]
-# GFX12: v_cmpx_lt_u64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xb2,0x7d]
 0x7e,0x04,0xb2,0x7d
+# GFX12: v_cmpx_lt_u64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xb2,0x7d]
-# GFX12: v_cmpx_lt_u64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xb2,0x7d]
 0x7c,0x04,0xb2,0x7d
+# GFX12: v_cmpx_lt_u64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xb2,0x7d]
-# GFX12: v_cmpx_lt_u64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xb2,0x7d]
 0xc1,0x04,0xb2,0x7d
+# GFX12: v_cmpx_lt_u64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xb2,0x7d]
-# GFX12: v_cmpx_lt_u64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb2,0x7d]
 0xf0,0x04,0xb2,0x7d
+# GFX12: v_cmpx_lt_u64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xb2,0x7d]
-# GFX12: v_cmpx_lt_u64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb2,0x7d]
 0xfd,0x04,0xb2,0x7d
+# GFX12: v_cmpx_lt_u64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xb2,0x7d]
-# GFX12: v_cmpx_lt_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb3,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfc,0xb3,0x7d,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_lt_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb3,0x7d,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_ne_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x6a,0x7d]
 0x01,0x05,0x6a,0x7d
+# GFX12: v_cmpx_ne_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x6a,0x7d]
-# GFX12: v_cmpx_ne_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x6a,0x7d]
 0x7f,0x05,0x6a,0x7d
+# GFX12: v_cmpx_ne_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x6a,0x7d]
-# GFX12: v_cmpx_ne_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x6a,0x7d]
 0x01,0x04,0x6a,0x7d
+# GFX12: v_cmpx_ne_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x6a,0x7d]
-# GFX12: v_cmpx_ne_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x6a,0x7d]
 0x69,0x04,0x6a,0x7d
+# GFX12: v_cmpx_ne_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x6a,0x7d]
-# GFX12: v_cmpx_ne_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x6a,0x7d]
 0x6a,0x04,0x6a,0x7d
+# GFX12: v_cmpx_ne_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x6a,0x7d]
-# GFX12: v_cmpx_ne_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x6a,0x7d]
 0x6b,0x04,0x6a,0x7d
+# GFX12: v_cmpx_ne_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x6a,0x7d]
-# GFX12: v_cmpx_ne_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x6a,0x7d]
 0x7b,0x04,0x6a,0x7d
+# GFX12: v_cmpx_ne_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x6a,0x7d]
-# GFX12: v_cmpx_ne_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x6a,0x7d]
 0x7d,0x04,0x6a,0x7d
+# GFX12: v_cmpx_ne_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x6a,0x7d]
-# GFX12: v_cmpx_ne_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x6a,0x7d]
 0x7e,0x04,0x6a,0x7d
+# GFX12: v_cmpx_ne_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x6a,0x7d]
-# GFX12: v_cmpx_ne_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x6a,0x7d]
 0x7f,0x04,0x6a,0x7d
+# GFX12: v_cmpx_ne_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x6a,0x7d]
-# GFX12: v_cmpx_ne_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x6a,0x7d]
 0x7c,0x04,0x6a,0x7d
+# GFX12: v_cmpx_ne_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x6a,0x7d]
-# GFX12: v_cmpx_ne_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x6a,0x7d]
 0xc1,0x04,0x6a,0x7d
+# GFX12: v_cmpx_ne_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x6a,0x7d]
-# GFX12: v_cmpx_ne_i16_e32 0x3800, v2
 0xf0,0x04,0x6a,0x7d
+# GFX12: v_cmpx_ne_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x6a,0x7d,0x00,0x38,0x00,0x00]
-# GFX12: v_cmpx_ne_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x6a,0x7d]
 0xfd,0x04,0x6a,0x7d
+# GFX12: v_cmpx_ne_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x6a,0x7d]
-# GFX12: v_cmpx_ne_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6a,0x7d,0x0b,0xfe,0x00,0x00]
 0xff,0xfe,0x6a,0x7d,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_ne_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6a,0x7d,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_ne_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x8a,0x7d]
 0x01,0x05,0x8a,0x7d
+# GFX12: v_cmpx_ne_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x8a,0x7d]
-# GFX12: v_cmpx_ne_i32_e32 v255, v2 ; encoding: [0xff,0x05,0x8a,0x7d]
 0xff,0x05,0x8a,0x7d
+# GFX12: v_cmpx_ne_i32_e32 v255, v2 ; encoding: [0xff,0x05,0x8a,0x7d]
-# GFX12: v_cmpx_ne_i32_e32 s1, v2 ; encoding: [0x01,0x04,0x8a,0x7d]
 0x01,0x04,0x8a,0x7d
+# GFX12: v_cmpx_ne_i32_e32 s1, v2 ; encoding: [0x01,0x04,0x8a,0x7d]
-# GFX12: v_cmpx_ne_i32_e32 s105, v2 ; encoding: [0x69,0x04,0x8a,0x7d]
 0x69,0x04,0x8a,0x7d
+# GFX12: v_cmpx_ne_i32_e32 s105, v2 ; encoding: [0x69,0x04,0x8a,0x7d]
-# GFX12: v_cmpx_ne_i32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x8a,0x7d]
 0x6a,0x04,0x8a,0x7d
+# GFX12: v_cmpx_ne_i32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x8a,0x7d]
-# GFX12: v_cmpx_ne_i32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x8a,0x7d]
 0x6b,0x04,0x8a,0x7d
+# GFX12: v_cmpx_ne_i32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x8a,0x7d]
-# GFX12: v_cmpx_ne_i32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x8a,0x7d]
 0x7b,0x04,0x8a,0x7d
+# GFX12: v_cmpx_ne_i32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x8a,0x7d]
-# GFX12: v_cmpx_ne_i32_e32 m0, v2 ; encoding: [0x7d,0x04,0x8a,0x7d]
 0x7d,0x04,0x8a,0x7d
+# GFX12: v_cmpx_ne_i32_e32 m0, v2 ; encoding: [0x7d,0x04,0x8a,0x7d]
-# GFX12: v_cmpx_ne_i32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x8a,0x7d]
 0x7e,0x04,0x8a,0x7d
+# GFX12: v_cmpx_ne_i32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x8a,0x7d]
-# GFX12: v_cmpx_ne_i32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x8a,0x7d]
 0x7f,0x04,0x8a,0x7d
+# GFX12: v_cmpx_ne_i32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x8a,0x7d]
-# GFX12: v_cmpx_ne_i32_e32 null, v2 ; encoding: [0x7c,0x04,0x8a,0x7d]
 0x7c,0x04,0x8a,0x7d
+# GFX12: v_cmpx_ne_i32_e32 null, v2 ; encoding: [0x7c,0x04,0x8a,0x7d]
-# GFX12: v_cmpx_ne_i32_e32 -1, v2 ; encoding: [0xc1,0x04,0x8a,0x7d]
 0xc1,0x04,0x8a,0x7d
+# GFX12: v_cmpx_ne_i32_e32 -1, v2 ; encoding: [0xc1,0x04,0x8a,0x7d]
-# GFX12: v_cmpx_ne_i32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x8a,0x7d]
 0xf0,0x04,0x8a,0x7d
+# GFX12: v_cmpx_ne_i32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x8a,0x7d]
-# GFX12: v_cmpx_ne_i32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x8a,0x7d]
 0xfd,0x04,0x8a,0x7d
+# GFX12: v_cmpx_ne_i32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x8a,0x7d]
-# GFX12: v_cmpx_ne_i32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x8b,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfe,0x8b,0x7d,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_ne_i32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x8b,0x7d,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_ne_i64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xaa,0x7d]
 0x01,0x05,0xaa,0x7d
+# GFX12: v_cmpx_ne_i64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xaa,0x7d]
-# GFX12: v_cmpx_ne_i64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xaa,0x7d]
 0xfe,0x05,0xaa,0x7d
+# GFX12: v_cmpx_ne_i64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xaa,0x7d]
-# GFX12: v_cmpx_ne_i64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xaa,0x7d]
 0x02,0x04,0xaa,0x7d
+# GFX12: v_cmpx_ne_i64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xaa,0x7d]
-# GFX12: v_cmpx_ne_i64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xaa,0x7d]
 0x68,0x04,0xaa,0x7d
+# GFX12: v_cmpx_ne_i64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xaa,0x7d]
-# GFX12: v_cmpx_ne_i64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xaa,0x7d]
 0x6a,0x04,0xaa,0x7d
+# GFX12: v_cmpx_ne_i64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xaa,0x7d]
-# GFX12: v_cmpx_ne_i64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xaa,0x7d]
 0x7a,0x04,0xaa,0x7d
+# GFX12: v_cmpx_ne_i64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xaa,0x7d]
-# GFX12: v_cmpx_ne_i64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xaa,0x7d]
 0x7e,0x04,0xaa,0x7d
+# GFX12: v_cmpx_ne_i64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xaa,0x7d]
-# GFX12: v_cmpx_ne_i64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xaa,0x7d]
 0x7c,0x04,0xaa,0x7d
+# GFX12: v_cmpx_ne_i64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xaa,0x7d]
-# GFX12: v_cmpx_ne_i64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xaa,0x7d]
 0xc1,0x04,0xaa,0x7d
+# GFX12: v_cmpx_ne_i64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xaa,0x7d]
-# GFX12: v_cmpx_ne_i64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xaa,0x7d]
 0xf0,0x04,0xaa,0x7d
+# GFX12: v_cmpx_ne_i64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xaa,0x7d]
-# GFX12: v_cmpx_ne_i64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xaa,0x7d]
 0xfd,0x04,0xaa,0x7d
+# GFX12: v_cmpx_ne_i64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xaa,0x7d]
-# GFX12: v_cmpx_ne_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xab,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfc,0xab,0x7d,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_ne_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xab,0x7d,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_ne_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x7a,0x7d]
 0x01,0x05,0x7a,0x7d
+# GFX12: v_cmpx_ne_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x7a,0x7d]
-# GFX12: v_cmpx_ne_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x7a,0x7d]
 0x7f,0x05,0x7a,0x7d
+# GFX12: v_cmpx_ne_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x7a,0x7d]
-# GFX12: v_cmpx_ne_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x7a,0x7d]
 0x01,0x04,0x7a,0x7d
+# GFX12: v_cmpx_ne_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x7a,0x7d]
-# GFX12: v_cmpx_ne_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x7a,0x7d]
 0x69,0x04,0x7a,0x7d
+# GFX12: v_cmpx_ne_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x7a,0x7d]
-# GFX12: v_cmpx_ne_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x7a,0x7d]
 0x6a,0x04,0x7a,0x7d
+# GFX12: v_cmpx_ne_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x7a,0x7d]
-# GFX12: v_cmpx_ne_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x7a,0x7d]
 0x6b,0x04,0x7a,0x7d
+# GFX12: v_cmpx_ne_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x7a,0x7d]
-# GFX12: v_cmpx_ne_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x7a,0x7d]
 0x7b,0x04,0x7a,0x7d
+# GFX12: v_cmpx_ne_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x7a,0x7d]
-# GFX12: v_cmpx_ne_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x7a,0x7d]
 0x7d,0x04,0x7a,0x7d
+# GFX12: v_cmpx_ne_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x7a,0x7d]
-# GFX12: v_cmpx_ne_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x7a,0x7d]
 0x7e,0x04,0x7a,0x7d
+# GFX12: v_cmpx_ne_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x7a,0x7d]
-# GFX12: v_cmpx_ne_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x7a,0x7d]
 0x7f,0x04,0x7a,0x7d
+# GFX12: v_cmpx_ne_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x7a,0x7d]
-# GFX12: v_cmpx_ne_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x7a,0x7d]
 0x7c,0x04,0x7a,0x7d
+# GFX12: v_cmpx_ne_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x7a,0x7d]
-# GFX12: v_cmpx_ne_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x7a,0x7d]
 0xc1,0x04,0x7a,0x7d
+# GFX12: v_cmpx_ne_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x7a,0x7d]
-# GFX12: v_cmpx_ne_u16_e32 0x3800, v2
 0xf0,0x04,0x7a,0x7d
+# GFX12: v_cmpx_ne_u16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x7a,0x7d,0x00,0x38,0x00,0x00]
-# GFX12: v_cmpx_ne_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x7a,0x7d]
 0xfd,0x04,0x7a,0x7d
+# GFX12: v_cmpx_ne_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x7a,0x7d]
-# GFX12: v_cmpx_ne_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x7a,0x7d,0x0b,0xfe,0x00,0x00]
 0xff,0xfe,0x7a,0x7d,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_ne_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x7a,0x7d,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_ne_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x9a,0x7d]
 0x01,0x05,0x9a,0x7d
+# GFX12: v_cmpx_ne_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x9a,0x7d]
-# GFX12: v_cmpx_ne_u32_e32 v255, v2 ; encoding: [0xff,0x05,0x9a,0x7d]
 0xff,0x05,0x9a,0x7d
+# GFX12: v_cmpx_ne_u32_e32 v255, v2 ; encoding: [0xff,0x05,0x9a,0x7d]
-# GFX12: v_cmpx_ne_u32_e32 s1, v2 ; encoding: [0x01,0x04,0x9a,0x7d]
 0x01,0x04,0x9a,0x7d
+# GFX12: v_cmpx_ne_u32_e32 s1, v2 ; encoding: [0x01,0x04,0x9a,0x7d]
-# GFX12: v_cmpx_ne_u32_e32 s105, v2 ; encoding: [0x69,0x04,0x9a,0x7d]
 0x69,0x04,0x9a,0x7d
+# GFX12: v_cmpx_ne_u32_e32 s105, v2 ; encoding: [0x69,0x04,0x9a,0x7d]
-# GFX12: v_cmpx_ne_u32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x9a,0x7d]
 0x6a,0x04,0x9a,0x7d
+# GFX12: v_cmpx_ne_u32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x9a,0x7d]
-# GFX12: v_cmpx_ne_u32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x9a,0x7d]
 0x6b,0x04,0x9a,0x7d
+# GFX12: v_cmpx_ne_u32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x9a,0x7d]
-# GFX12: v_cmpx_ne_u32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x9a,0x7d]
 0x7b,0x04,0x9a,0x7d
+# GFX12: v_cmpx_ne_u32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x9a,0x7d]
-# GFX12: v_cmpx_ne_u32_e32 m0, v2 ; encoding: [0x7d,0x04,0x9a,0x7d]
 0x7d,0x04,0x9a,0x7d
+# GFX12: v_cmpx_ne_u32_e32 m0, v2 ; encoding: [0x7d,0x04,0x9a,0x7d]
-# GFX12: v_cmpx_ne_u32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x9a,0x7d]
 0x7e,0x04,0x9a,0x7d
+# GFX12: v_cmpx_ne_u32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x9a,0x7d]
-# GFX12: v_cmpx_ne_u32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x9a,0x7d]
 0x7f,0x04,0x9a,0x7d
+# GFX12: v_cmpx_ne_u32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x9a,0x7d]
-# GFX12: v_cmpx_ne_u32_e32 null, v2 ; encoding: [0x7c,0x04,0x9a,0x7d]
 0x7c,0x04,0x9a,0x7d
+# GFX12: v_cmpx_ne_u32_e32 null, v2 ; encoding: [0x7c,0x04,0x9a,0x7d]
-# GFX12: v_cmpx_ne_u32_e32 -1, v2 ; encoding: [0xc1,0x04,0x9a,0x7d]
 0xc1,0x04,0x9a,0x7d
+# GFX12: v_cmpx_ne_u32_e32 -1, v2 ; encoding: [0xc1,0x04,0x9a,0x7d]
-# GFX12: v_cmpx_ne_u32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x9a,0x7d]
 0xf0,0x04,0x9a,0x7d
+# GFX12: v_cmpx_ne_u32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x9a,0x7d]
-# GFX12: v_cmpx_ne_u32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x9a,0x7d]
 0xfd,0x04,0x9a,0x7d
+# GFX12: v_cmpx_ne_u32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x9a,0x7d]
-# GFX12: v_cmpx_ne_u32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x9b,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfe,0x9b,0x7d,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_ne_u32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x9b,0x7d,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_ne_u64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xba,0x7d]
 0x01,0x05,0xba,0x7d
+# GFX12: v_cmpx_ne_u64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0xba,0x7d]
-# GFX12: v_cmpx_ne_u64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xba,0x7d]
 0xfe,0x05,0xba,0x7d
+# GFX12: v_cmpx_ne_u64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0xba,0x7d]
-# GFX12: v_cmpx_ne_u64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xba,0x7d]
 0x02,0x04,0xba,0x7d
+# GFX12: v_cmpx_ne_u64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0xba,0x7d]
-# GFX12: v_cmpx_ne_u64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xba,0x7d]
 0x68,0x04,0xba,0x7d
+# GFX12: v_cmpx_ne_u64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0xba,0x7d]
-# GFX12: v_cmpx_ne_u64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xba,0x7d]
 0x6a,0x04,0xba,0x7d
+# GFX12: v_cmpx_ne_u64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0xba,0x7d]
-# GFX12: v_cmpx_ne_u64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xba,0x7d]
 0x7a,0x04,0xba,0x7d
+# GFX12: v_cmpx_ne_u64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0xba,0x7d]
-# GFX12: v_cmpx_ne_u64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xba,0x7d]
 0x7e,0x04,0xba,0x7d
+# GFX12: v_cmpx_ne_u64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0xba,0x7d]
-# GFX12: v_cmpx_ne_u64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xba,0x7d]
 0x7c,0x04,0xba,0x7d
+# GFX12: v_cmpx_ne_u64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0xba,0x7d]
-# GFX12: v_cmpx_ne_u64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xba,0x7d]
 0xc1,0x04,0xba,0x7d
+# GFX12: v_cmpx_ne_u64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0xba,0x7d]
-# GFX12: v_cmpx_ne_u64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xba,0x7d]
 0xf0,0x04,0xba,0x7d
+# GFX12: v_cmpx_ne_u64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0xba,0x7d]
-# GFX12: v_cmpx_ne_u64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xba,0x7d]
 0xfd,0x04,0xba,0x7d
+# GFX12: v_cmpx_ne_u64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0xba,0x7d]
-# GFX12: v_cmpx_ne_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbb,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfc,0xbb,0x7d,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_ne_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbb,0x7d,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_neq_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1a,0x7d]
 0x01,0x05,0x1a,0x7d
+# GFX12: v_cmpx_neq_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1a,0x7d]
-# GFX12: v_cmpx_neq_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1a,0x7d]
 0x7f,0x05,0x1a,0x7d
+# GFX12: v_cmpx_neq_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1a,0x7d]
-# GFX12: v_cmpx_neq_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1a,0x7d]
 0x01,0x04,0x1a,0x7d
+# GFX12: v_cmpx_neq_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1a,0x7d]
-# GFX12: v_cmpx_neq_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1a,0x7d]
 0x69,0x04,0x1a,0x7d
+# GFX12: v_cmpx_neq_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1a,0x7d]
-# GFX12: v_cmpx_neq_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1a,0x7d]
 0x6a,0x04,0x1a,0x7d
+# GFX12: v_cmpx_neq_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1a,0x7d]
-# GFX12: v_cmpx_neq_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1a,0x7d]
 0x6b,0x04,0x1a,0x7d
+# GFX12: v_cmpx_neq_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1a,0x7d]
-# GFX12: v_cmpx_neq_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1a,0x7d]
 0x7b,0x04,0x1a,0x7d
+# GFX12: v_cmpx_neq_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1a,0x7d]
-# GFX12: v_cmpx_neq_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1a,0x7d]
 0x7d,0x04,0x1a,0x7d
+# GFX12: v_cmpx_neq_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1a,0x7d]
-# GFX12: v_cmpx_neq_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1a,0x7d]
 0x7e,0x04,0x1a,0x7d
+# GFX12: v_cmpx_neq_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1a,0x7d]
-# GFX12: v_cmpx_neq_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1a,0x7d]
 0x7f,0x04,0x1a,0x7d
+# GFX12: v_cmpx_neq_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1a,0x7d]
-# GFX12: v_cmpx_neq_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1a,0x7d]
 0x7c,0x04,0x1a,0x7d
+# GFX12: v_cmpx_neq_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1a,0x7d]
-# GFX12: v_cmpx_neq_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1a,0x7d]
 0xc1,0x04,0x1a,0x7d
+# GFX12: v_cmpx_neq_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1a,0x7d]
-# GFX12: v_cmpx_neq_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1a,0x7d]
 0xf0,0x04,0x1a,0x7d
+# GFX12: v_cmpx_neq_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1a,0x7d]
-# GFX12: v_cmpx_neq_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1a,0x7d]
 0xfd,0x04,0x1a,0x7d
+# GFX12: v_cmpx_neq_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1a,0x7d]
-# GFX12: v_cmpx_neq_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1a,0x7d,0x0b,0xfe,0x00,0x00]
 0xff,0xfe,0x1a,0x7d,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_neq_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1a,0x7d,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_neq_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x3a,0x7d]
 0x01,0x05,0x3a,0x7d
+# GFX12: v_cmpx_neq_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x3a,0x7d]
-# GFX12: v_cmpx_neq_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x3a,0x7d]
 0xff,0x05,0x3a,0x7d
+# GFX12: v_cmpx_neq_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x3a,0x7d]
-# GFX12: v_cmpx_neq_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x3a,0x7d]
 0x01,0x04,0x3a,0x7d
+# GFX12: v_cmpx_neq_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x3a,0x7d]
-# GFX12: v_cmpx_neq_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x3a,0x7d]
 0x69,0x04,0x3a,0x7d
+# GFX12: v_cmpx_neq_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x3a,0x7d]
-# GFX12: v_cmpx_neq_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x3a,0x7d]
 0x6a,0x04,0x3a,0x7d
+# GFX12: v_cmpx_neq_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x3a,0x7d]
-# GFX12: v_cmpx_neq_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x3a,0x7d]
 0x6b,0x04,0x3a,0x7d
+# GFX12: v_cmpx_neq_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x3a,0x7d]
-# GFX12: v_cmpx_neq_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x3a,0x7d]
 0x7b,0x04,0x3a,0x7d
+# GFX12: v_cmpx_neq_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x3a,0x7d]
-# GFX12: v_cmpx_neq_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x3a,0x7d]
 0x7d,0x04,0x3a,0x7d
+# GFX12: v_cmpx_neq_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x3a,0x7d]
-# GFX12: v_cmpx_neq_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x3a,0x7d]
 0x7e,0x04,0x3a,0x7d
+# GFX12: v_cmpx_neq_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x3a,0x7d]
-# GFX12: v_cmpx_neq_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x3a,0x7d]
 0x7f,0x04,0x3a,0x7d
+# GFX12: v_cmpx_neq_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x3a,0x7d]
-# GFX12: v_cmpx_neq_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x3a,0x7d]
 0x7c,0x04,0x3a,0x7d
+# GFX12: v_cmpx_neq_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x3a,0x7d]
-# GFX12: v_cmpx_neq_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x3a,0x7d]
 0xc1,0x04,0x3a,0x7d
+# GFX12: v_cmpx_neq_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x3a,0x7d]
-# GFX12: v_cmpx_neq_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x3a,0x7d]
 0xf0,0x04,0x3a,0x7d
+# GFX12: v_cmpx_neq_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x3a,0x7d]
-# GFX12: v_cmpx_neq_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x3a,0x7d]
 0xfd,0x04,0x3a,0x7d
+# GFX12: v_cmpx_neq_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x3a,0x7d]
-# GFX12: v_cmpx_neq_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x3b,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfe,0x3b,0x7d,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_neq_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x3b,0x7d,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_neq_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x5a,0x7d]
 0x01,0x05,0x5a,0x7d
+# GFX12: v_cmpx_neq_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x5a,0x7d]
-# GFX12: v_cmpx_neq_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x5a,0x7d]
 0xfe,0x05,0x5a,0x7d
+# GFX12: v_cmpx_neq_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x5a,0x7d]
-# GFX12: v_cmpx_neq_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x5a,0x7d]
 0x02,0x04,0x5a,0x7d
+# GFX12: v_cmpx_neq_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x5a,0x7d]
-# GFX12: v_cmpx_neq_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x5a,0x7d]
 0x68,0x04,0x5a,0x7d
+# GFX12: v_cmpx_neq_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x5a,0x7d]
-# GFX12: v_cmpx_neq_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x5a,0x7d]
 0x6a,0x04,0x5a,0x7d
+# GFX12: v_cmpx_neq_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x5a,0x7d]
-# GFX12: v_cmpx_neq_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x5a,0x7d]
 0x7a,0x04,0x5a,0x7d
+# GFX12: v_cmpx_neq_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x5a,0x7d]
-# GFX12: v_cmpx_neq_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x5a,0x7d]
 0x7e,0x04,0x5a,0x7d
+# GFX12: v_cmpx_neq_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x5a,0x7d]
-# GFX12: v_cmpx_neq_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x5a,0x7d]
 0x7c,0x04,0x5a,0x7d
+# GFX12: v_cmpx_neq_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x5a,0x7d]
-# GFX12: v_cmpx_neq_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x5a,0x7d]
 0xc1,0x04,0x5a,0x7d
+# GFX12: v_cmpx_neq_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x5a,0x7d]
-# GFX12: v_cmpx_neq_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x5a,0x7d]
 0xf0,0x04,0x5a,0x7d
+# GFX12: v_cmpx_neq_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x5a,0x7d]
-# GFX12: v_cmpx_neq_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x5a,0x7d]
 0xfd,0x04,0x5a,0x7d
+# GFX12: v_cmpx_neq_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x5a,0x7d]
-# GFX12: v_cmpx_neq_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5b,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfc,0x5b,0x7d,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_neq_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5b,0x7d,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_nge_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x12,0x7d]
 0x01,0x05,0x12,0x7d
+# GFX12: v_cmpx_nge_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x12,0x7d]
-# GFX12: v_cmpx_nge_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x12,0x7d]
 0x7f,0x05,0x12,0x7d
+# GFX12: v_cmpx_nge_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x12,0x7d]
-# GFX12: v_cmpx_nge_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x12,0x7d]
 0x01,0x04,0x12,0x7d
+# GFX12: v_cmpx_nge_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x12,0x7d]
-# GFX12: v_cmpx_nge_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x12,0x7d]
 0x69,0x04,0x12,0x7d
+# GFX12: v_cmpx_nge_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x12,0x7d]
-# GFX12: v_cmpx_nge_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x12,0x7d]
 0x6a,0x04,0x12,0x7d
+# GFX12: v_cmpx_nge_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x12,0x7d]
-# GFX12: v_cmpx_nge_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x12,0x7d]
 0x6b,0x04,0x12,0x7d
+# GFX12: v_cmpx_nge_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x12,0x7d]
-# GFX12: v_cmpx_nge_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x12,0x7d]
 0x7b,0x04,0x12,0x7d
+# GFX12: v_cmpx_nge_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x12,0x7d]
-# GFX12: v_cmpx_nge_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x12,0x7d]
 0x7d,0x04,0x12,0x7d
+# GFX12: v_cmpx_nge_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x12,0x7d]
-# GFX12: v_cmpx_nge_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x12,0x7d]
 0x7e,0x04,0x12,0x7d
+# GFX12: v_cmpx_nge_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x12,0x7d]
-# GFX12: v_cmpx_nge_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x12,0x7d]
 0x7f,0x04,0x12,0x7d
+# GFX12: v_cmpx_nge_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x12,0x7d]
-# GFX12: v_cmpx_nge_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x12,0x7d]
 0x7c,0x04,0x12,0x7d
+# GFX12: v_cmpx_nge_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x12,0x7d]
-# GFX12: v_cmpx_nge_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x12,0x7d]
 0xc1,0x04,0x12,0x7d
+# GFX12: v_cmpx_nge_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x12,0x7d]
-# GFX12: v_cmpx_nge_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x12,0x7d]
 0xf0,0x04,0x12,0x7d
+# GFX12: v_cmpx_nge_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x12,0x7d]
-# GFX12: v_cmpx_nge_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x12,0x7d]
 0xfd,0x04,0x12,0x7d
+# GFX12: v_cmpx_nge_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x12,0x7d]
-# GFX12: v_cmpx_nge_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x12,0x7d,0x0b,0xfe,0x00,0x00]
 0xff,0xfe,0x12,0x7d,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_nge_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x12,0x7d,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_nge_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x32,0x7d]
 0x01,0x05,0x32,0x7d
+# GFX12: v_cmpx_nge_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x32,0x7d]
-# GFX12: v_cmpx_nge_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x32,0x7d]
 0xff,0x05,0x32,0x7d
+# GFX12: v_cmpx_nge_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x32,0x7d]
-# GFX12: v_cmpx_nge_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x32,0x7d]
 0x01,0x04,0x32,0x7d
+# GFX12: v_cmpx_nge_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x32,0x7d]
-# GFX12: v_cmpx_nge_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x32,0x7d]
 0x69,0x04,0x32,0x7d
+# GFX12: v_cmpx_nge_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x32,0x7d]
-# GFX12: v_cmpx_nge_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x32,0x7d]
 0x6a,0x04,0x32,0x7d
+# GFX12: v_cmpx_nge_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x32,0x7d]
-# GFX12: v_cmpx_nge_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x32,0x7d]
 0x6b,0x04,0x32,0x7d
+# GFX12: v_cmpx_nge_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x32,0x7d]
-# GFX12: v_cmpx_nge_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x32,0x7d]
 0x7b,0x04,0x32,0x7d
+# GFX12: v_cmpx_nge_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x32,0x7d]
-# GFX12: v_cmpx_nge_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x32,0x7d]
 0x7d,0x04,0x32,0x7d
+# GFX12: v_cmpx_nge_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x32,0x7d]
-# GFX12: v_cmpx_nge_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x32,0x7d]
 0x7e,0x04,0x32,0x7d
+# GFX12: v_cmpx_nge_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x32,0x7d]
-# GFX12: v_cmpx_nge_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x32,0x7d]
 0x7f,0x04,0x32,0x7d
+# GFX12: v_cmpx_nge_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x32,0x7d]
-# GFX12: v_cmpx_nge_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x32,0x7d]
 0x7c,0x04,0x32,0x7d
+# GFX12: v_cmpx_nge_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x32,0x7d]
-# GFX12: v_cmpx_nge_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x32,0x7d]
 0xc1,0x04,0x32,0x7d
+# GFX12: v_cmpx_nge_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x32,0x7d]
-# GFX12: v_cmpx_nge_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x32,0x7d]
 0xf0,0x04,0x32,0x7d
+# GFX12: v_cmpx_nge_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x32,0x7d]
-# GFX12: v_cmpx_nge_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x32,0x7d]
 0xfd,0x04,0x32,0x7d
+# GFX12: v_cmpx_nge_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x32,0x7d]
-# GFX12: v_cmpx_nge_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x33,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfe,0x33,0x7d,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_nge_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x33,0x7d,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_nge_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x52,0x7d]
 0x01,0x05,0x52,0x7d
+# GFX12: v_cmpx_nge_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x52,0x7d]
-# GFX12: v_cmpx_nge_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x52,0x7d]
 0xfe,0x05,0x52,0x7d
+# GFX12: v_cmpx_nge_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x52,0x7d]
-# GFX12: v_cmpx_nge_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x52,0x7d]
 0x02,0x04,0x52,0x7d
+# GFX12: v_cmpx_nge_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x52,0x7d]
-# GFX12: v_cmpx_nge_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x52,0x7d]
 0x68,0x04,0x52,0x7d
+# GFX12: v_cmpx_nge_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x52,0x7d]
-# GFX12: v_cmpx_nge_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x52,0x7d]
 0x6a,0x04,0x52,0x7d
+# GFX12: v_cmpx_nge_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x52,0x7d]
-# GFX12: v_cmpx_nge_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x52,0x7d]
 0x7a,0x04,0x52,0x7d
+# GFX12: v_cmpx_nge_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x52,0x7d]
-# GFX12: v_cmpx_nge_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x52,0x7d]
 0x7e,0x04,0x52,0x7d
+# GFX12: v_cmpx_nge_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x52,0x7d]
-# GFX12: v_cmpx_nge_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x52,0x7d]
 0x7c,0x04,0x52,0x7d
+# GFX12: v_cmpx_nge_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x52,0x7d]
-# GFX12: v_cmpx_nge_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x52,0x7d]
 0xc1,0x04,0x52,0x7d
+# GFX12: v_cmpx_nge_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x52,0x7d]
-# GFX12: v_cmpx_nge_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x52,0x7d]
 0xf0,0x04,0x52,0x7d
+# GFX12: v_cmpx_nge_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x52,0x7d]
-# GFX12: v_cmpx_nge_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x52,0x7d]
 0xfd,0x04,0x52,0x7d
+# GFX12: v_cmpx_nge_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x52,0x7d]
-# GFX12: v_cmpx_nge_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x53,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfc,0x53,0x7d,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_nge_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x53,0x7d,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_ngt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x16,0x7d]
 0x01,0x05,0x16,0x7d
+# GFX12: v_cmpx_ngt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x16,0x7d]
-# GFX12: v_cmpx_ngt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x16,0x7d]
 0x7f,0x05,0x16,0x7d
+# GFX12: v_cmpx_ngt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x16,0x7d]
-# GFX12: v_cmpx_ngt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x16,0x7d]
 0x01,0x04,0x16,0x7d
+# GFX12: v_cmpx_ngt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x16,0x7d]
-# GFX12: v_cmpx_ngt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x16,0x7d]
 0x69,0x04,0x16,0x7d
+# GFX12: v_cmpx_ngt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x16,0x7d]
-# GFX12: v_cmpx_ngt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x16,0x7d]
 0x6a,0x04,0x16,0x7d
+# GFX12: v_cmpx_ngt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x16,0x7d]
-# GFX12: v_cmpx_ngt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x16,0x7d]
 0x6b,0x04,0x16,0x7d
+# GFX12: v_cmpx_ngt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x16,0x7d]
-# GFX12: v_cmpx_ngt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x16,0x7d]
 0x7b,0x04,0x16,0x7d
+# GFX12: v_cmpx_ngt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x16,0x7d]
-# GFX12: v_cmpx_ngt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x16,0x7d]
 0x7d,0x04,0x16,0x7d
+# GFX12: v_cmpx_ngt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x16,0x7d]
-# GFX12: v_cmpx_ngt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x16,0x7d]
 0x7e,0x04,0x16,0x7d
+# GFX12: v_cmpx_ngt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x16,0x7d]
-# GFX12: v_cmpx_ngt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x16,0x7d]
 0x7f,0x04,0x16,0x7d
+# GFX12: v_cmpx_ngt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x16,0x7d]
-# GFX12: v_cmpx_ngt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x16,0x7d]
 0x7c,0x04,0x16,0x7d
+# GFX12: v_cmpx_ngt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x16,0x7d]
-# GFX12: v_cmpx_ngt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x16,0x7d]
 0xc1,0x04,0x16,0x7d
+# GFX12: v_cmpx_ngt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x16,0x7d]
-# GFX12: v_cmpx_ngt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x16,0x7d]
 0xf0,0x04,0x16,0x7d
+# GFX12: v_cmpx_ngt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x16,0x7d]
-# GFX12: v_cmpx_ngt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x16,0x7d]
 0xfd,0x04,0x16,0x7d
+# GFX12: v_cmpx_ngt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x16,0x7d]
-# GFX12: v_cmpx_ngt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x16,0x7d,0x0b,0xfe,0x00,0x00]
 0xff,0xfe,0x16,0x7d,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_ngt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x16,0x7d,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_ngt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x36,0x7d]
 0x01,0x05,0x36,0x7d
+# GFX12: v_cmpx_ngt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x36,0x7d]
-# GFX12: v_cmpx_ngt_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x36,0x7d]
 0xff,0x05,0x36,0x7d
+# GFX12: v_cmpx_ngt_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x36,0x7d]
-# GFX12: v_cmpx_ngt_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x36,0x7d]
 0x01,0x04,0x36,0x7d
+# GFX12: v_cmpx_ngt_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x36,0x7d]
-# GFX12: v_cmpx_ngt_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x36,0x7d]
 0x69,0x04,0x36,0x7d
+# GFX12: v_cmpx_ngt_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x36,0x7d]
-# GFX12: v_cmpx_ngt_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x36,0x7d]
 0x6a,0x04,0x36,0x7d
+# GFX12: v_cmpx_ngt_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x36,0x7d]
-# GFX12: v_cmpx_ngt_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x36,0x7d]
 0x6b,0x04,0x36,0x7d
+# GFX12: v_cmpx_ngt_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x36,0x7d]
-# GFX12: v_cmpx_ngt_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x36,0x7d]
 0x7b,0x04,0x36,0x7d
+# GFX12: v_cmpx_ngt_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x36,0x7d]
-# GFX12: v_cmpx_ngt_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x36,0x7d]
 0x7d,0x04,0x36,0x7d
+# GFX12: v_cmpx_ngt_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x36,0x7d]
-# GFX12: v_cmpx_ngt_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x36,0x7d]
 0x7e,0x04,0x36,0x7d
+# GFX12: v_cmpx_ngt_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x36,0x7d]
-# GFX12: v_cmpx_ngt_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x36,0x7d]
 0x7f,0x04,0x36,0x7d
+# GFX12: v_cmpx_ngt_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x36,0x7d]
-# GFX12: v_cmpx_ngt_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x36,0x7d]
 0x7c,0x04,0x36,0x7d
+# GFX12: v_cmpx_ngt_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x36,0x7d]
-# GFX12: v_cmpx_ngt_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x36,0x7d]
 0xc1,0x04,0x36,0x7d
+# GFX12: v_cmpx_ngt_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x36,0x7d]
-# GFX12: v_cmpx_ngt_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x36,0x7d]
 0xf0,0x04,0x36,0x7d
+# GFX12: v_cmpx_ngt_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x36,0x7d]
-# GFX12: v_cmpx_ngt_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x36,0x7d]
 0xfd,0x04,0x36,0x7d
+# GFX12: v_cmpx_ngt_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x36,0x7d]
-# GFX12: v_cmpx_ngt_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x37,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfe,0x37,0x7d,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_ngt_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x37,0x7d,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_ngt_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x56,0x7d]
 0x01,0x05,0x56,0x7d
+# GFX12: v_cmpx_ngt_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x56,0x7d]
-# GFX12: v_cmpx_ngt_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x56,0x7d]
 0xfe,0x05,0x56,0x7d
+# GFX12: v_cmpx_ngt_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x56,0x7d]
-# GFX12: v_cmpx_ngt_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x56,0x7d]
 0x02,0x04,0x56,0x7d
+# GFX12: v_cmpx_ngt_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x56,0x7d]
-# GFX12: v_cmpx_ngt_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x56,0x7d]
 0x68,0x04,0x56,0x7d
+# GFX12: v_cmpx_ngt_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x56,0x7d]
-# GFX12: v_cmpx_ngt_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x56,0x7d]
 0x6a,0x04,0x56,0x7d
+# GFX12: v_cmpx_ngt_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x56,0x7d]
-# GFX12: v_cmpx_ngt_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x56,0x7d]
 0x7a,0x04,0x56,0x7d
+# GFX12: v_cmpx_ngt_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x56,0x7d]
-# GFX12: v_cmpx_ngt_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x56,0x7d]
 0x7e,0x04,0x56,0x7d
+# GFX12: v_cmpx_ngt_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x56,0x7d]
-# GFX12: v_cmpx_ngt_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x56,0x7d]
 0x7c,0x04,0x56,0x7d
+# GFX12: v_cmpx_ngt_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x56,0x7d]
-# GFX12: v_cmpx_ngt_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x56,0x7d]
 0xc1,0x04,0x56,0x7d
+# GFX12: v_cmpx_ngt_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x56,0x7d]
-# GFX12: v_cmpx_ngt_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x56,0x7d]
 0xf0,0x04,0x56,0x7d
+# GFX12: v_cmpx_ngt_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x56,0x7d]
-# GFX12: v_cmpx_ngt_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x56,0x7d]
 0xfd,0x04,0x56,0x7d
+# GFX12: v_cmpx_ngt_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x56,0x7d]
-# GFX12: v_cmpx_ngt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x57,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfc,0x57,0x7d,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_ngt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x57,0x7d,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_nle_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x18,0x7d]
 0x01,0x05,0x18,0x7d
+# GFX12: v_cmpx_nle_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x18,0x7d]
-# GFX12: v_cmpx_nle_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x18,0x7d]
 0x7f,0x05,0x18,0x7d
+# GFX12: v_cmpx_nle_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x18,0x7d]
-# GFX12: v_cmpx_nle_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x18,0x7d]
 0x01,0x04,0x18,0x7d
+# GFX12: v_cmpx_nle_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x18,0x7d]
-# GFX12: v_cmpx_nle_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x18,0x7d]
 0x69,0x04,0x18,0x7d
+# GFX12: v_cmpx_nle_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x18,0x7d]
-# GFX12: v_cmpx_nle_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x18,0x7d]
 0x6a,0x04,0x18,0x7d
+# GFX12: v_cmpx_nle_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x18,0x7d]
-# GFX12: v_cmpx_nle_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x18,0x7d]
 0x6b,0x04,0x18,0x7d
+# GFX12: v_cmpx_nle_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x18,0x7d]
-# GFX12: v_cmpx_nle_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x18,0x7d]
 0x7b,0x04,0x18,0x7d
+# GFX12: v_cmpx_nle_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x18,0x7d]
-# GFX12: v_cmpx_nle_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x18,0x7d]
 0x7d,0x04,0x18,0x7d
+# GFX12: v_cmpx_nle_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x18,0x7d]
-# GFX12: v_cmpx_nle_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x18,0x7d]
 0x7e,0x04,0x18,0x7d
+# GFX12: v_cmpx_nle_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x18,0x7d]
-# GFX12: v_cmpx_nle_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x18,0x7d]
 0x7f,0x04,0x18,0x7d
+# GFX12: v_cmpx_nle_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x18,0x7d]
-# GFX12: v_cmpx_nle_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x18,0x7d]
 0x7c,0x04,0x18,0x7d
+# GFX12: v_cmpx_nle_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x18,0x7d]
-# GFX12: v_cmpx_nle_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x18,0x7d]
 0xc1,0x04,0x18,0x7d
+# GFX12: v_cmpx_nle_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x18,0x7d]
-# GFX12: v_cmpx_nle_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x18,0x7d]
 0xf0,0x04,0x18,0x7d
+# GFX12: v_cmpx_nle_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x18,0x7d]
-# GFX12: v_cmpx_nle_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x18,0x7d]
 0xfd,0x04,0x18,0x7d
+# GFX12: v_cmpx_nle_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x18,0x7d]
-# GFX12: v_cmpx_nle_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x18,0x7d,0x0b,0xfe,0x00,0x00]
 0xff,0xfe,0x18,0x7d,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_nle_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x18,0x7d,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_nle_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x38,0x7d]
 0x01,0x05,0x38,0x7d
+# GFX12: v_cmpx_nle_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x38,0x7d]
-# GFX12: v_cmpx_nle_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x38,0x7d]
 0xff,0x05,0x38,0x7d
+# GFX12: v_cmpx_nle_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x38,0x7d]
-# GFX12: v_cmpx_nle_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x38,0x7d]
 0x01,0x04,0x38,0x7d
+# GFX12: v_cmpx_nle_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x38,0x7d]
-# GFX12: v_cmpx_nle_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x38,0x7d]
 0x69,0x04,0x38,0x7d
+# GFX12: v_cmpx_nle_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x38,0x7d]
-# GFX12: v_cmpx_nle_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x38,0x7d]
 0x6a,0x04,0x38,0x7d
+# GFX12: v_cmpx_nle_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x38,0x7d]
-# GFX12: v_cmpx_nle_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x38,0x7d]
 0x6b,0x04,0x38,0x7d
+# GFX12: v_cmpx_nle_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x38,0x7d]
-# GFX12: v_cmpx_nle_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x38,0x7d]
 0x7b,0x04,0x38,0x7d
+# GFX12: v_cmpx_nle_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x38,0x7d]
-# GFX12: v_cmpx_nle_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x38,0x7d]
 0x7d,0x04,0x38,0x7d
+# GFX12: v_cmpx_nle_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x38,0x7d]
-# GFX12: v_cmpx_nle_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x38,0x7d]
 0x7e,0x04,0x38,0x7d
+# GFX12: v_cmpx_nle_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x38,0x7d]
-# GFX12: v_cmpx_nle_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x38,0x7d]
 0x7f,0x04,0x38,0x7d
+# GFX12: v_cmpx_nle_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x38,0x7d]
-# GFX12: v_cmpx_nle_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x38,0x7d]
 0x7c,0x04,0x38,0x7d
+# GFX12: v_cmpx_nle_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x38,0x7d]
-# GFX12: v_cmpx_nle_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x38,0x7d]
 0xc1,0x04,0x38,0x7d
+# GFX12: v_cmpx_nle_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x38,0x7d]
-# GFX12: v_cmpx_nle_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x38,0x7d]
 0xf0,0x04,0x38,0x7d
+# GFX12: v_cmpx_nle_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x38,0x7d]
-# GFX12: v_cmpx_nle_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x38,0x7d]
 0xfd,0x04,0x38,0x7d
+# GFX12: v_cmpx_nle_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x38,0x7d]
-# GFX12: v_cmpx_nle_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x39,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfe,0x39,0x7d,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_nle_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x39,0x7d,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_nle_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x58,0x7d]
 0x01,0x05,0x58,0x7d
+# GFX12: v_cmpx_nle_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x58,0x7d]
-# GFX12: v_cmpx_nle_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x58,0x7d]
 0xfe,0x05,0x58,0x7d
+# GFX12: v_cmpx_nle_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x58,0x7d]
-# GFX12: v_cmpx_nle_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x58,0x7d]
 0x02,0x04,0x58,0x7d
+# GFX12: v_cmpx_nle_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x58,0x7d]
-# GFX12: v_cmpx_nle_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x58,0x7d]
 0x68,0x04,0x58,0x7d
+# GFX12: v_cmpx_nle_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x58,0x7d]
-# GFX12: v_cmpx_nle_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x58,0x7d]
 0x6a,0x04,0x58,0x7d
+# GFX12: v_cmpx_nle_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x58,0x7d]
-# GFX12: v_cmpx_nle_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x58,0x7d]
 0x7a,0x04,0x58,0x7d
+# GFX12: v_cmpx_nle_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x58,0x7d]
-# GFX12: v_cmpx_nle_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x58,0x7d]
 0x7e,0x04,0x58,0x7d
+# GFX12: v_cmpx_nle_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x58,0x7d]
-# GFX12: v_cmpx_nle_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x58,0x7d]
 0x7c,0x04,0x58,0x7d
+# GFX12: v_cmpx_nle_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x58,0x7d]
-# GFX12: v_cmpx_nle_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x58,0x7d]
 0xc1,0x04,0x58,0x7d
+# GFX12: v_cmpx_nle_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x58,0x7d]
-# GFX12: v_cmpx_nle_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x58,0x7d]
 0xf0,0x04,0x58,0x7d
+# GFX12: v_cmpx_nle_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x58,0x7d]
-# GFX12: v_cmpx_nle_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x58,0x7d]
 0xfd,0x04,0x58,0x7d
+# GFX12: v_cmpx_nle_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x58,0x7d]
-# GFX12: v_cmpx_nle_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x59,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfc,0x59,0x7d,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_nle_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x59,0x7d,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_nlg_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x14,0x7d]
 0x01,0x05,0x14,0x7d
+# GFX12: v_cmpx_nlg_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x14,0x7d]
-# GFX12: v_cmpx_nlg_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x14,0x7d]
 0x7f,0x05,0x14,0x7d
+# GFX12: v_cmpx_nlg_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x14,0x7d]
-# GFX12: v_cmpx_nlg_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x14,0x7d]
 0x01,0x04,0x14,0x7d
+# GFX12: v_cmpx_nlg_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x14,0x7d]
-# GFX12: v_cmpx_nlg_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x14,0x7d]
 0x69,0x04,0x14,0x7d
+# GFX12: v_cmpx_nlg_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x14,0x7d]
-# GFX12: v_cmpx_nlg_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x14,0x7d]
 0x6a,0x04,0x14,0x7d
+# GFX12: v_cmpx_nlg_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x14,0x7d]
-# GFX12: v_cmpx_nlg_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x14,0x7d]
 0x6b,0x04,0x14,0x7d
+# GFX12: v_cmpx_nlg_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x14,0x7d]
-# GFX12: v_cmpx_nlg_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x14,0x7d]
 0x7b,0x04,0x14,0x7d
+# GFX12: v_cmpx_nlg_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x14,0x7d]
-# GFX12: v_cmpx_nlg_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x14,0x7d]
 0x7d,0x04,0x14,0x7d
+# GFX12: v_cmpx_nlg_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x14,0x7d]
-# GFX12: v_cmpx_nlg_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x14,0x7d]
 0x7e,0x04,0x14,0x7d
+# GFX12: v_cmpx_nlg_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x14,0x7d]
-# GFX12: v_cmpx_nlg_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x14,0x7d]
 0x7f,0x04,0x14,0x7d
+# GFX12: v_cmpx_nlg_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x14,0x7d]
-# GFX12: v_cmpx_nlg_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x14,0x7d]
 0x7c,0x04,0x14,0x7d
+# GFX12: v_cmpx_nlg_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x14,0x7d]
-# GFX12: v_cmpx_nlg_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x14,0x7d]
 0xc1,0x04,0x14,0x7d
+# GFX12: v_cmpx_nlg_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x14,0x7d]
-# GFX12: v_cmpx_nlg_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x14,0x7d]
 0xf0,0x04,0x14,0x7d
+# GFX12: v_cmpx_nlg_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x14,0x7d]
-# GFX12: v_cmpx_nlg_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x14,0x7d]
 0xfd,0x04,0x14,0x7d
+# GFX12: v_cmpx_nlg_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x14,0x7d]
-# GFX12: v_cmpx_nlg_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x14,0x7d,0x0b,0xfe,0x00,0x00]
 0xff,0xfe,0x14,0x7d,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_nlg_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x14,0x7d,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_nlg_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x34,0x7d]
 0x01,0x05,0x34,0x7d
+# GFX12: v_cmpx_nlg_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x34,0x7d]
-# GFX12: v_cmpx_nlg_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x34,0x7d]
 0xff,0x05,0x34,0x7d
+# GFX12: v_cmpx_nlg_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x34,0x7d]
-# GFX12: v_cmpx_nlg_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x34,0x7d]
 0x01,0x04,0x34,0x7d
+# GFX12: v_cmpx_nlg_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x34,0x7d]
-# GFX12: v_cmpx_nlg_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x34,0x7d]
 0x69,0x04,0x34,0x7d
+# GFX12: v_cmpx_nlg_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x34,0x7d]
-# GFX12: v_cmpx_nlg_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x34,0x7d]
 0x6a,0x04,0x34,0x7d
+# GFX12: v_cmpx_nlg_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x34,0x7d]
-# GFX12: v_cmpx_nlg_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x34,0x7d]
 0x6b,0x04,0x34,0x7d
+# GFX12: v_cmpx_nlg_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x34,0x7d]
-# GFX12: v_cmpx_nlg_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x34,0x7d]
 0x7b,0x04,0x34,0x7d
+# GFX12: v_cmpx_nlg_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x34,0x7d]
-# GFX12: v_cmpx_nlg_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x34,0x7d]
 0x7d,0x04,0x34,0x7d
+# GFX12: v_cmpx_nlg_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x34,0x7d]
-# GFX12: v_cmpx_nlg_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x34,0x7d]
 0x7e,0x04,0x34,0x7d
+# GFX12: v_cmpx_nlg_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x34,0x7d]
-# GFX12: v_cmpx_nlg_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x34,0x7d]
 0x7f,0x04,0x34,0x7d
+# GFX12: v_cmpx_nlg_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x34,0x7d]
-# GFX12: v_cmpx_nlg_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x34,0x7d]
 0x7c,0x04,0x34,0x7d
+# GFX12: v_cmpx_nlg_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x34,0x7d]
-# GFX12: v_cmpx_nlg_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x34,0x7d]
 0xc1,0x04,0x34,0x7d
+# GFX12: v_cmpx_nlg_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x34,0x7d]
-# GFX12: v_cmpx_nlg_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x34,0x7d]
 0xf0,0x04,0x34,0x7d
+# GFX12: v_cmpx_nlg_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x34,0x7d]
-# GFX12: v_cmpx_nlg_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x34,0x7d]
 0xfd,0x04,0x34,0x7d
+# GFX12: v_cmpx_nlg_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x34,0x7d]
-# GFX12: v_cmpx_nlg_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x35,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfe,0x35,0x7d,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_nlg_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x35,0x7d,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_nlg_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x54,0x7d]
 0x01,0x05,0x54,0x7d
+# GFX12: v_cmpx_nlg_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x54,0x7d]
-# GFX12: v_cmpx_nlg_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x54,0x7d]
 0xfe,0x05,0x54,0x7d
+# GFX12: v_cmpx_nlg_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x54,0x7d]
-# GFX12: v_cmpx_nlg_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x54,0x7d]
 0x02,0x04,0x54,0x7d
+# GFX12: v_cmpx_nlg_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x54,0x7d]
-# GFX12: v_cmpx_nlg_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x54,0x7d]
 0x68,0x04,0x54,0x7d
+# GFX12: v_cmpx_nlg_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x54,0x7d]
-# GFX12: v_cmpx_nlg_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x54,0x7d]
 0x6a,0x04,0x54,0x7d
+# GFX12: v_cmpx_nlg_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x54,0x7d]
-# GFX12: v_cmpx_nlg_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x54,0x7d]
 0x7a,0x04,0x54,0x7d
+# GFX12: v_cmpx_nlg_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x54,0x7d]
-# GFX12: v_cmpx_nlg_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x54,0x7d]
 0x7e,0x04,0x54,0x7d
+# GFX12: v_cmpx_nlg_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x54,0x7d]
-# GFX12: v_cmpx_nlg_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x54,0x7d]
 0x7c,0x04,0x54,0x7d
+# GFX12: v_cmpx_nlg_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x54,0x7d]
-# GFX12: v_cmpx_nlg_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x54,0x7d]
 0xc1,0x04,0x54,0x7d
+# GFX12: v_cmpx_nlg_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x54,0x7d]
-# GFX12: v_cmpx_nlg_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x54,0x7d]
 0xf0,0x04,0x54,0x7d
+# GFX12: v_cmpx_nlg_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x54,0x7d]
-# GFX12: v_cmpx_nlg_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x54,0x7d]
 0xfd,0x04,0x54,0x7d
+# GFX12: v_cmpx_nlg_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x54,0x7d]
-# GFX12: v_cmpx_nlg_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x55,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfc,0x55,0x7d,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_nlg_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x55,0x7d,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_nlt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1c,0x7d]
 0x01,0x05,0x1c,0x7d
+# GFX12: v_cmpx_nlt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1c,0x7d]
-# GFX12: v_cmpx_nlt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1c,0x7d]
 0x7f,0x05,0x1c,0x7d
+# GFX12: v_cmpx_nlt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1c,0x7d]
-# GFX12: v_cmpx_nlt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1c,0x7d]
 0x01,0x04,0x1c,0x7d
+# GFX12: v_cmpx_nlt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1c,0x7d]
-# GFX12: v_cmpx_nlt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1c,0x7d]
 0x69,0x04,0x1c,0x7d
+# GFX12: v_cmpx_nlt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1c,0x7d]
-# GFX12: v_cmpx_nlt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1c,0x7d]
 0x6a,0x04,0x1c,0x7d
+# GFX12: v_cmpx_nlt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1c,0x7d]
-# GFX12: v_cmpx_nlt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1c,0x7d]
 0x6b,0x04,0x1c,0x7d
+# GFX12: v_cmpx_nlt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1c,0x7d]
-# GFX12: v_cmpx_nlt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1c,0x7d]
 0x7b,0x04,0x1c,0x7d
+# GFX12: v_cmpx_nlt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1c,0x7d]
-# GFX12: v_cmpx_nlt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1c,0x7d]
 0x7d,0x04,0x1c,0x7d
+# GFX12: v_cmpx_nlt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1c,0x7d]
-# GFX12: v_cmpx_nlt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1c,0x7d]
 0x7e,0x04,0x1c,0x7d
+# GFX12: v_cmpx_nlt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1c,0x7d]
-# GFX12: v_cmpx_nlt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1c,0x7d]
 0x7f,0x04,0x1c,0x7d
+# GFX12: v_cmpx_nlt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1c,0x7d]
-# GFX12: v_cmpx_nlt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1c,0x7d]
 0x7c,0x04,0x1c,0x7d
+# GFX12: v_cmpx_nlt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1c,0x7d]
-# GFX12: v_cmpx_nlt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1c,0x7d]
 0xc1,0x04,0x1c,0x7d
+# GFX12: v_cmpx_nlt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1c,0x7d]
-# GFX12: v_cmpx_nlt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1c,0x7d]
 0xf0,0x04,0x1c,0x7d
+# GFX12: v_cmpx_nlt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1c,0x7d]
-# GFX12: v_cmpx_nlt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1c,0x7d]
 0xfd,0x04,0x1c,0x7d
+# GFX12: v_cmpx_nlt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1c,0x7d]
-# GFX12: v_cmpx_nlt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1c,0x7d,0x0b,0xfe,0x00,0x00]
 0xff,0xfe,0x1c,0x7d,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_nlt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1c,0x7d,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_nlt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x3c,0x7d]
 0x01,0x05,0x3c,0x7d
+# GFX12: v_cmpx_nlt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x3c,0x7d]
-# GFX12: v_cmpx_nlt_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x3c,0x7d]
 0xff,0x05,0x3c,0x7d
+# GFX12: v_cmpx_nlt_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x3c,0x7d]
-# GFX12: v_cmpx_nlt_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x3c,0x7d]
 0x01,0x04,0x3c,0x7d
+# GFX12: v_cmpx_nlt_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x3c,0x7d]
-# GFX12: v_cmpx_nlt_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x3c,0x7d]
 0x69,0x04,0x3c,0x7d
+# GFX12: v_cmpx_nlt_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x3c,0x7d]
-# GFX12: v_cmpx_nlt_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x3c,0x7d]
 0x6a,0x04,0x3c,0x7d
+# GFX12: v_cmpx_nlt_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x3c,0x7d]
-# GFX12: v_cmpx_nlt_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x3c,0x7d]
 0x6b,0x04,0x3c,0x7d
+# GFX12: v_cmpx_nlt_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x3c,0x7d]
-# GFX12: v_cmpx_nlt_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x3c,0x7d]
 0x7b,0x04,0x3c,0x7d
+# GFX12: v_cmpx_nlt_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x3c,0x7d]
-# GFX12: v_cmpx_nlt_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x3c,0x7d]
 0x7d,0x04,0x3c,0x7d
+# GFX12: v_cmpx_nlt_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x3c,0x7d]
-# GFX12: v_cmpx_nlt_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x3c,0x7d]
 0x7e,0x04,0x3c,0x7d
+# GFX12: v_cmpx_nlt_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x3c,0x7d]
-# GFX12: v_cmpx_nlt_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x3c,0x7d]
 0x7f,0x04,0x3c,0x7d
+# GFX12: v_cmpx_nlt_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x3c,0x7d]
-# GFX12: v_cmpx_nlt_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x3c,0x7d]
 0x7c,0x04,0x3c,0x7d
+# GFX12: v_cmpx_nlt_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x3c,0x7d]
-# GFX12: v_cmpx_nlt_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x3c,0x7d]
 0xc1,0x04,0x3c,0x7d
+# GFX12: v_cmpx_nlt_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x3c,0x7d]
-# GFX12: v_cmpx_nlt_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x3c,0x7d]
 0xf0,0x04,0x3c,0x7d
+# GFX12: v_cmpx_nlt_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x3c,0x7d]
-# GFX12: v_cmpx_nlt_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x3c,0x7d]
 0xfd,0x04,0x3c,0x7d
+# GFX12: v_cmpx_nlt_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x3c,0x7d]
-# GFX12: v_cmpx_nlt_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x3d,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfe,0x3d,0x7d,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_nlt_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x3d,0x7d,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_nlt_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x5c,0x7d]
 0x01,0x05,0x5c,0x7d
+# GFX12: v_cmpx_nlt_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x5c,0x7d]
-# GFX12: v_cmpx_nlt_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x5c,0x7d]
 0xfe,0x05,0x5c,0x7d
+# GFX12: v_cmpx_nlt_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x5c,0x7d]
-# GFX12: v_cmpx_nlt_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x5c,0x7d]
 0x02,0x04,0x5c,0x7d
+# GFX12: v_cmpx_nlt_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x5c,0x7d]
-# GFX12: v_cmpx_nlt_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x5c,0x7d]
 0x68,0x04,0x5c,0x7d
+# GFX12: v_cmpx_nlt_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x5c,0x7d]
-# GFX12: v_cmpx_nlt_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x5c,0x7d]
 0x6a,0x04,0x5c,0x7d
+# GFX12: v_cmpx_nlt_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x5c,0x7d]
-# GFX12: v_cmpx_nlt_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x5c,0x7d]
 0x7a,0x04,0x5c,0x7d
+# GFX12: v_cmpx_nlt_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x5c,0x7d]
-# GFX12: v_cmpx_nlt_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x5c,0x7d]
 0x7e,0x04,0x5c,0x7d
+# GFX12: v_cmpx_nlt_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x5c,0x7d]
-# GFX12: v_cmpx_nlt_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x5c,0x7d]
 0x7c,0x04,0x5c,0x7d
+# GFX12: v_cmpx_nlt_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x5c,0x7d]
-# GFX12: v_cmpx_nlt_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x5c,0x7d]
 0xc1,0x04,0x5c,0x7d
+# GFX12: v_cmpx_nlt_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x5c,0x7d]
-# GFX12: v_cmpx_nlt_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x5c,0x7d]
 0xf0,0x04,0x5c,0x7d
+# GFX12: v_cmpx_nlt_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x5c,0x7d]
-# GFX12: v_cmpx_nlt_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x5c,0x7d]
 0xfd,0x04,0x5c,0x7d
+# GFX12: v_cmpx_nlt_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x5c,0x7d]
-# GFX12: v_cmpx_nlt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5d,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfc,0x5d,0x7d,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_nlt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5d,0x7d,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_o_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0e,0x7d]
 0x01,0x05,0x0e,0x7d
+# GFX12: v_cmpx_o_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0e,0x7d]
-# GFX12: v_cmpx_o_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0e,0x7d]
 0x7f,0x05,0x0e,0x7d
+# GFX12: v_cmpx_o_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0e,0x7d]
-# GFX12: v_cmpx_o_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0e,0x7d]
 0x01,0x04,0x0e,0x7d
+# GFX12: v_cmpx_o_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0e,0x7d]
-# GFX12: v_cmpx_o_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0e,0x7d]
 0x69,0x04,0x0e,0x7d
+# GFX12: v_cmpx_o_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0e,0x7d]
-# GFX12: v_cmpx_o_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0e,0x7d]
 0x6a,0x04,0x0e,0x7d
+# GFX12: v_cmpx_o_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0e,0x7d]
-# GFX12: v_cmpx_o_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0e,0x7d]
 0x6b,0x04,0x0e,0x7d
+# GFX12: v_cmpx_o_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0e,0x7d]
-# GFX12: v_cmpx_o_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0e,0x7d]
 0x7b,0x04,0x0e,0x7d
+# GFX12: v_cmpx_o_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0e,0x7d]
-# GFX12: v_cmpx_o_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0e,0x7d]
 0x7d,0x04,0x0e,0x7d
+# GFX12: v_cmpx_o_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0e,0x7d]
-# GFX12: v_cmpx_o_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0e,0x7d]
 0x7e,0x04,0x0e,0x7d
+# GFX12: v_cmpx_o_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0e,0x7d]
-# GFX12: v_cmpx_o_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0e,0x7d]
 0x7f,0x04,0x0e,0x7d
+# GFX12: v_cmpx_o_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0e,0x7d]
-# GFX12: v_cmpx_o_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0e,0x7d]
 0x7c,0x04,0x0e,0x7d
+# GFX12: v_cmpx_o_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0e,0x7d]
-# GFX12: v_cmpx_o_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0e,0x7d]
 0xc1,0x04,0x0e,0x7d
+# GFX12: v_cmpx_o_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0e,0x7d]
-# GFX12: v_cmpx_o_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0e,0x7d]
 0xf0,0x04,0x0e,0x7d
+# GFX12: v_cmpx_o_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0e,0x7d]
-# GFX12: v_cmpx_o_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0e,0x7d]
 0xfd,0x04,0x0e,0x7d
+# GFX12: v_cmpx_o_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0e,0x7d]
-# GFX12: v_cmpx_o_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0e,0x7d,0x0b,0xfe,0x00,0x00]
 0xff,0xfe,0x0e,0x7d,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_o_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0e,0x7d,0x0b,0xfe,0x00,0x00]
-# GFX12: v_cmpx_o_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2e,0x7d]
 0x01,0x05,0x2e,0x7d
+# GFX12: v_cmpx_o_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2e,0x7d]
-# GFX12: v_cmpx_o_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x2e,0x7d]
 0xff,0x05,0x2e,0x7d
+# GFX12: v_cmpx_o_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x2e,0x7d]
-# GFX12: v_cmpx_o_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x2e,0x7d]
 0x01,0x04,0x2e,0x7d
+# GFX12: v_cmpx_o_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x2e,0x7d]
-# GFX12: v_cmpx_o_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x2e,0x7d]
 0x69,0x04,0x2e,0x7d
+# GFX12: v_cmpx_o_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x2e,0x7d]
-# GFX12: v_cmpx_o_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x2e,0x7d]
 0x6a,0x04,0x2e,0x7d
+# GFX12: v_cmpx_o_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x2e,0x7d]
-# GFX12: v_cmpx_o_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x2e,0x7d]
 0x6b,0x04,0x2e,0x7d
+# GFX12: v_cmpx_o_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x2e,0x7d]
-# GFX12: v_cmpx_o_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x2e,0x7d]
 0x7b,0x04,0x2e,0x7d
+# GFX12: v_cmpx_o_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x2e,0x7d]
-# GFX12: v_cmpx_o_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x2e,0x7d]
 0x7d,0x04,0x2e,0x7d
+# GFX12: v_cmpx_o_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x2e,0x7d]
-# GFX12: v_cmpx_o_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x2e,0x7d]
 0x7e,0x04,0x2e,0x7d
+# GFX12: v_cmpx_o_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x2e,0x7d]
-# GFX12: v_cmpx_o_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x2e,0x7d]
 0x7f,0x04,0x2e,0x7d
+# GFX12: v_cmpx_o_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x2e,0x7d]
-# GFX12: v_cmpx_o_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x2e,0x7d]
 0x7c,0x04,0x2e,0x7d
+# GFX12: v_cmpx_o_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x2e,0x7d]
-# GFX12: v_cmpx_o_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x2e,0x7d]
 0xc1,0x04,0x2e,0x7d
+# GFX12: v_cmpx_o_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x2e,0x7d]
-# GFX12: v_cmpx_o_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x2e,0x7d]
 0xf0,0x04,0x2e,0x7d
+# GFX12: v_cmpx_o_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x2e,0x7d]
-# GFX12: v_cmpx_o_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x2e,0x7d]
 0xfd,0x04,0x2e,0x7d
+# GFX12: v_cmpx_o_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x2e,0x7d]
-# GFX12: v_cmpx_o_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2f,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfe,0x2f,0x7d,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_o_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x2f,0x7d,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_o_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4e,0x7d]
 0x01,0x05,0x4e,0x7d
+# GFX12: v_cmpx_o_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x4e,0x7d]
-# GFX12: v_cmpx_o_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4e,0x7d]
 0xfe,0x05,0x4e,0x7d
+# GFX12: v_cmpx_o_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x4e,0x7d]
-# GFX12: v_cmpx_o_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x4e,0x7d]
 0x02,0x04,0x4e,0x7d
+# GFX12: v_cmpx_o_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x4e,0x7d]
-# GFX12: v_cmpx_o_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4e,0x7d]
 0x68,0x04,0x4e,0x7d
+# GFX12: v_cmpx_o_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x4e,0x7d]
-# GFX12: v_cmpx_o_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x4e,0x7d]
 0x6a,0x04,0x4e,0x7d
+# GFX12: v_cmpx_o_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x4e,0x7d]
-# GFX12: v_cmpx_o_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x4e,0x7d]
 0x7a,0x04,0x4e,0x7d
+# GFX12: v_cmpx_o_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x4e,0x7d]
-# GFX12: v_cmpx_o_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x4e,0x7d]
 0x7e,0x04,0x4e,0x7d
+# GFX12: v_cmpx_o_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x4e,0x7d]
-# GFX12: v_cmpx_o_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x4e,0x7d]
 0x7c,0x04,0x4e,0x7d
+# GFX12: v_cmpx_o_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x4e,0x7d]
-# GFX12: v_cmpx_o_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x4e,0x7d]
 0xc1,0x04,0x4e,0x7d
+# GFX12: v_cmpx_o_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x4e,0x7d]
-# GFX12: v_cmpx_o_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4e,0x7d]
 0xf0,0x04,0x4e,0x7d
+# GFX12: v_cmpx_o_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x4e,0x7d]
-# GFX12: v_cmpx_o_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4e,0x7d]
 0xfd,0x04,0x4e,0x7d
+# GFX12: v_cmpx_o_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x4e,0x7d]
-# GFX12: v_cmpx_o_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4f,0x7d,0x56,0x34,0x12,0xaf]
 0xff,0xfc,0x4f,0x7d,0x56,0x34,0x12,0xaf
+# GFX12: v_cmpx_o_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4f,0x7d,0x56,0x34,0x12,0xaf]
-# GFX12: v_cmpx_u_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x10,0x7d]
 0x01,0x05,0x10,0x7d
+# GFX12: v_cmpx_u_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x10,0x7d]
-# GFX12: v_cmpx_u_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x10,0x7d]
 0x7f,0x05,0x10,0x7d
+# GFX12: v_cmpx_u_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x10,0x7d]
-# GFX12: v_cmpx_u_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x10,0x7d]
 0x01,0x04,0x10,0x7d
+# GFX12: v_cmpx_u_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x10,0x7d]
-# GFX12: v_cmpx_u_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x10,0x7d]
 0x69,0x04,0x10,0x7d
+# GFX12: v_cmpx_u_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x10,0x7d]
-# GFX12: v_cmpx_u_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x10,0x7d]
 0x6a,0x04,0x10,0x7d
+# GFX12: v_cmpx_u_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x10,0x7d]
-# GFX12: v_cmpx_u_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x10,0x7d]
 0x6b,0x04,0x10,0x7d
+# GFX12: v_cmpx_u_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x10,0x7d]
-# GFX12: v_cmpx_u_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x10,0x7d]
 0x7b,0x04,0x10,0x7d
+# GFX12: v_cmpx_u_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x10,0x7d]
-# GFX12: v_cmpx_u_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x10,0x7d]
 0x7d,0x04,0x10,0x7d
+# GFX12:
v_cmpx_u_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x10,0x7d] -# GFX12: v_cmpx_u_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x10,0x7d] 0x7e,0x04,0x10,0x7d +# GFX12: v_cmpx_u_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x10,0x7d] -# GFX12: v_cmpx_u_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x10,0x7d] 0x7f,0x04,0x10,0x7d +# GFX12: v_cmpx_u_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x10,0x7d] -# GFX12: v_cmpx_u_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x10,0x7d] 0x7c,0x04,0x10,0x7d +# GFX12: v_cmpx_u_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x10,0x7d] -# GFX12: v_cmpx_u_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x10,0x7d] 0xc1,0x04,0x10,0x7d +# GFX12: v_cmpx_u_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x10,0x7d] -# GFX12: v_cmpx_u_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x10,0x7d] 0xf0,0x04,0x10,0x7d +# GFX12: v_cmpx_u_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x10,0x7d] -# GFX12: v_cmpx_u_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x10,0x7d] 0xfd,0x04,0x10,0x7d +# GFX12: v_cmpx_u_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x10,0x7d] -# GFX12: v_cmpx_u_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x10,0x7d,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0x10,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_u_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x10,0x7d,0x0b,0xfe,0x00,0x00] -# GFX12: v_cmpx_u_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x30,0x7d] 0x01,0x05,0x30,0x7d +# GFX12: v_cmpx_u_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x30,0x7d] -# GFX12: v_cmpx_u_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x30,0x7d] 0xff,0x05,0x30,0x7d +# GFX12: v_cmpx_u_f32_e32 v255, v2 ; encoding: [0xff,0x05,0x30,0x7d] -# GFX12: v_cmpx_u_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x30,0x7d] 0x01,0x04,0x30,0x7d +# GFX12: v_cmpx_u_f32_e32 s1, v2 ; encoding: [0x01,0x04,0x30,0x7d] -# GFX12: v_cmpx_u_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x30,0x7d] 0x69,0x04,0x30,0x7d +# GFX12: v_cmpx_u_f32_e32 s105, v2 ; encoding: [0x69,0x04,0x30,0x7d] -# GFX12: v_cmpx_u_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x30,0x7d] 0x6a,0x04,0x30,0x7d +# GFX12: v_cmpx_u_f32_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x30,0x7d] -# GFX12: v_cmpx_u_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x30,0x7d] 0x6b,0x04,0x30,0x7d +# GFX12: v_cmpx_u_f32_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x30,0x7d] -# GFX12: v_cmpx_u_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x30,0x7d] 0x7b,0x04,0x30,0x7d +# GFX12: v_cmpx_u_f32_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x30,0x7d] -# GFX12: v_cmpx_u_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x30,0x7d] 0x7d,0x04,0x30,0x7d +# GFX12: v_cmpx_u_f32_e32 m0, v2 ; encoding: [0x7d,0x04,0x30,0x7d] -# GFX12: v_cmpx_u_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x30,0x7d] 0x7e,0x04,0x30,0x7d +# GFX12: v_cmpx_u_f32_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x30,0x7d] -# GFX12: v_cmpx_u_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x30,0x7d] 0x7f,0x04,0x30,0x7d +# GFX12: v_cmpx_u_f32_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x30,0x7d] -# GFX12: v_cmpx_u_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x30,0x7d] 0x7c,0x04,0x30,0x7d +# GFX12: v_cmpx_u_f32_e32 null, v2 ; encoding: [0x7c,0x04,0x30,0x7d] -# GFX12: v_cmpx_u_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x30,0x7d] 0xc1,0x04,0x30,0x7d +# GFX12: v_cmpx_u_f32_e32 -1, v2 ; encoding: [0xc1,0x04,0x30,0x7d] -# GFX12: v_cmpx_u_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x30,0x7d] 0xf0,0x04,0x30,0x7d +# GFX12: v_cmpx_u_f32_e32 0.5, v2 ; encoding: [0xf0,0x04,0x30,0x7d] -# GFX12: v_cmpx_u_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x30,0x7d] 0xfd,0x04,0x30,0x7d +# GFX12: v_cmpx_u_f32_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x30,0x7d] -# GFX12: v_cmpx_u_f32_e32 0xaf123456, v255 
; encoding: [0xff,0xfe,0x31,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0x31,0x7d,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_u_f32_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0x31,0x7d,0x56,0x34,0x12,0xaf] -# GFX12: v_cmpx_u_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x50,0x7d] 0x01,0x05,0x50,0x7d +# GFX12: v_cmpx_u_f64_e32 v[1:2], v[2:3] ; encoding: [0x01,0x05,0x50,0x7d] -# GFX12: v_cmpx_u_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x50,0x7d] 0xfe,0x05,0x50,0x7d +# GFX12: v_cmpx_u_f64_e32 v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x50,0x7d] -# GFX12: v_cmpx_u_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x50,0x7d] 0x02,0x04,0x50,0x7d +# GFX12: v_cmpx_u_f64_e32 s[2:3], v[2:3] ; encoding: [0x02,0x04,0x50,0x7d] -# GFX12: v_cmpx_u_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x50,0x7d] 0x68,0x04,0x50,0x7d +# GFX12: v_cmpx_u_f64_e32 s[104:105], v[2:3] ; encoding: [0x68,0x04,0x50,0x7d] -# GFX12: v_cmpx_u_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x50,0x7d] 0x6a,0x04,0x50,0x7d +# GFX12: v_cmpx_u_f64_e32 vcc, v[2:3] ; encoding: [0x6a,0x04,0x50,0x7d] -# GFX12: v_cmpx_u_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x50,0x7d] 0x7a,0x04,0x50,0x7d +# GFX12: v_cmpx_u_f64_e32 ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x50,0x7d] -# GFX12: v_cmpx_u_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x50,0x7d] 0x7e,0x04,0x50,0x7d +# GFX12: v_cmpx_u_f64_e32 exec, v[2:3] ; encoding: [0x7e,0x04,0x50,0x7d] -# GFX12: v_cmpx_u_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x50,0x7d] 0x7c,0x04,0x50,0x7d +# GFX12: v_cmpx_u_f64_e32 null, v[2:3] ; encoding: [0x7c,0x04,0x50,0x7d] -# GFX12: v_cmpx_u_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x50,0x7d] 0xc1,0x04,0x50,0x7d +# GFX12: v_cmpx_u_f64_e32 -1, v[2:3] ; encoding: [0xc1,0x04,0x50,0x7d] -# GFX12: v_cmpx_u_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x50,0x7d] 0xf0,0x04,0x50,0x7d +# GFX12: v_cmpx_u_f64_e32 0.5, v[2:3] ; encoding: [0xf0,0x04,0x50,0x7d] -# GFX12: v_cmpx_u_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x50,0x7d] 0xfd,0x04,0x50,0x7d +# GFX12: v_cmpx_u_f64_e32 src_scc, v[2:3] ; encoding: [0xfd,0x04,0x50,0x7d] -# GFX12: v_cmpx_u_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x51,0x7d,0x56,0x34,0x12,0xaf] 0xff,0xfc,0x51,0x7d,0x56,0x34,0x12,0xaf +# GFX12: v_cmpx_u_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x51,0x7d,0x56,0x34,0x12,0xaf] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt index fe9ef4f9e90d0..e65d451116d29 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt @@ -1,2272 +1,2273 @@ +# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12 -# GFX12: v_cmpx_class_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff +# GFX12: 
v_cmpx_class_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_class_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_class_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_class_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_class_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_class_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_class_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_class_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_class_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_class_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_class_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_class_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_class_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_class_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_class_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_class_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_class_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_class_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_class_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_class_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_class_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_class_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0xfa,0x7d,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_class_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_class_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0xfa,0x7d,0x01,0x60,0x01,0x13 +# GFX12: 
v_cmpx_class_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_class_f16 -|v127|, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfa,0x7d,0x7f,0x6f,0x3d,0x30] 0xfa,0xfe,0xfa,0x7d,0x7f,0x6f,0x3d,0x30 +# GFX12: v_cmpx_class_f16 -|v127|, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfa,0x7d,0x7f,0x6f,0x3d,0x30] -# GFX12: v_cmpx_class_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0xfc,0x7d,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_class_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_class_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0xfc,0x7d,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_class_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_class_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0xfc,0x7d,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_class_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_class_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0xfc,0x7d,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_class_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_class_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0xfc,0x7d,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_class_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_class_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0xfc,0x7d,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_class_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_class_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0xfc,0x7d,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_class_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_class_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0xfc,0x7d,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_class_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_class_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0xfc,0x7d,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_class_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_class_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0xfc,0x7d,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_class_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_class_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x50,0x01,0xff] 
0xfa,0x04,0xfc,0x7d,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_class_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_class_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0xfc,0x7d,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_class_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_class_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0xfc,0x7d,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_class_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_class_f32 -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfd,0x7d,0xff,0x6f,0x3d,0x30] 0xfa,0xfe,0xfd,0x7d,0xff,0x6f,0x3d,0x30 +# GFX12: v_cmpx_class_f32 -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfd,0x7d,0xff,0x6f,0x3d,0x30] -# GFX12: v_cmpx_eq_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_eq_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_eq_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x04,0x7d,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_eq_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_eq_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_eq_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_eq_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_eq_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_eq_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_eq_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_eq_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_eq_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_eq_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_eq_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_eq_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_eq_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_eq_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x21,0x01,0xff] 
0xfa,0x04,0x04,0x7d,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_eq_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_eq_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_eq_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_eq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_eq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_eq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x04,0x7d,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_eq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_eq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x04,0x7d,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_eq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_eq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x04,0x7d,0x7f,0x6f,0xfd,0x30] 0xfa,0xfe,0x04,0x7d,0x7f,0x6f,0xfd,0x30 +# GFX12: v_cmpx_eq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x04,0x7d,0x7f,0x6f,0xfd,0x30] -# GFX12: v_cmpx_eq_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x24,0x7d,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_eq_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_eq_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x24,0x7d,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_eq_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_eq_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x24,0x7d,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_eq_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_eq_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x24,0x7d,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_eq_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_eq_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x24,0x7d,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_eq_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_eq_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x24,0x7d,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_eq_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_eq_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x11,0x01,0xff] 
0xfa,0x04,0x24,0x7d,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_eq_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_eq_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x24,0x7d,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_eq_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_eq_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x24,0x7d,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_eq_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_eq_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x24,0x7d,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_eq_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_eq_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x24,0x7d,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_eq_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_eq_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x24,0x7d,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_eq_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_eq_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x24,0x7d,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_eq_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_eq_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x25,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0xfe,0x25,0x7d,0xff,0x6f,0xfd,0x30 +# GFX12: v_cmpx_eq_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x25,0x7d,0xff,0x6f,0xfd,0x30] -# GFX12: v_cmpx_eq_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_eq_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_eq_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x64,0x7d,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_eq_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_eq_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_eq_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_eq_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_eq_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_eq_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x01,0x01,0xff] 
0xfa,0x04,0x64,0x7d,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_eq_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_eq_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_eq_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_eq_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_eq_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_eq_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_eq_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_eq_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_eq_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_eq_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_eq_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_eq_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_eq_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_eq_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x64,0x7d,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_eq_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_eq_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x64,0x7d,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_eq_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_eq_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x64,0x7d,0x7f,0x6f,0x0d,0x30] 0xfa,0xfe,0x64,0x7d,0x7f,0x6f,0x0d,0x30 +# GFX12: v_cmpx_eq_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x64,0x7d,0x7f,0x6f,0x0d,0x30] -# GFX12: v_cmpx_eq_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x84,0x7d,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_eq_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_eq_i32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x84,0x7d,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_eq_i32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_eq_i32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x40,0x01,0xff] 
0xfa,0x04,0x84,0x7d,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_eq_i32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_eq_i32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x84,0x7d,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_eq_i32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_eq_i32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x84,0x7d,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_eq_i32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_eq_i32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x84,0x7d,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_eq_i32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_eq_i32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x84,0x7d,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_eq_i32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_eq_i32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x84,0x7d,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_eq_i32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_eq_i32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x84,0x7d,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_eq_i32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_eq_i32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x84,0x7d,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_eq_i32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_eq_i32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x84,0x7d,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_eq_i32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_eq_i32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x84,0x7d,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_eq_i32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_eq_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x84,0x7d,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_eq_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_eq_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x85,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0x85,0x7d,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmpx_eq_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x85,0x7d,0xff,0x6f,0x0d,0x30] -# GFX12: v_cmpx_eq_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x1b,0x00,0xff +# GFX12: 
v_cmpx_eq_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_eq_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x74,0x7d,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_eq_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_eq_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_eq_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_eq_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_eq_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_eq_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_eq_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_eq_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_eq_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_eq_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_eq_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_eq_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_eq_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_eq_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_eq_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_eq_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_eq_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_eq_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_eq_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_eq_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x74,0x7d,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_eq_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_eq_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x74,0x7d,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_eq_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0xfa,0x04,0x74,0x7d,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_eq_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x74,0x7d,0x7f,0x6f,0x0d,0x30] 0xfa,0xfe,0x74,0x7d,0x7f,0x6f,0x0d,0x30 +# GFX12: v_cmpx_eq_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x74,0x7d,0x7f,0x6f,0x0d,0x30] -# GFX12: v_cmpx_eq_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x94,0x7d,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_eq_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_eq_u32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x94,0x7d,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_eq_u32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_eq_u32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x94,0x7d,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_eq_u32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_eq_u32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x94,0x7d,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_eq_u32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_eq_u32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x94,0x7d,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_eq_u32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_eq_u32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x94,0x7d,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_eq_u32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_eq_u32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x94,0x7d,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_eq_u32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_eq_u32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x94,0x7d,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_eq_u32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_eq_u32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x94,0x7d,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_eq_u32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_eq_u32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x94,0x7d,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_eq_u32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_eq_u32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x94,0x7d,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_eq_u32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x94,0x7d,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_eq_u32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x94,0x7d,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_eq_u32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_eq_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x94,0x7d,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_eq_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_eq_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x95,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0x95,0x7d,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmpx_eq_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x95,0x7d,0xff,0x6f,0x0d,0x30] -# GFX12: v_cmpx_ge_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_ge_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_ge_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_ge_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_ge_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_ge_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_ge_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_ge_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_ge_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_ge_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_ge_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_ge_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_ge_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_ge_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_ge_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_ge_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_ge_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_ge_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0c,0x7d,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_ge_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_ge_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_ge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_ge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_ge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0c,0x7d,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_ge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_ge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0c,0x7d,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_ge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_ge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0c,0x7d,0x7f,0x6f,0xfd,0x30] 0xfa,0xfe,0x0c,0x7d,0x7f,0x6f,0xfd,0x30 +# GFX12: v_cmpx_ge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0c,0x7d,0x7f,0x6f,0xfd,0x30] -# GFX12: v_cmpx_ge_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x2c,0x7d,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_ge_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_ge_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x2c,0x7d,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_ge_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_ge_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x2c,0x7d,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_ge_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_ge_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x2c,0x7d,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_ge_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_ge_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x2c,0x7d,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_ge_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_ge_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x2c,0x7d,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_ge_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_ge_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x2c,0x7d,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_ge_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x2c,0x7d,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_ge_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x2c,0x7d,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_ge_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_ge_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x2c,0x7d,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_ge_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_ge_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x2c,0x7d,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_ge_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_ge_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x2c,0x7d,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_ge_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_ge_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x2c,0x7d,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_ge_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_ge_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x2c,0x7d,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_ge_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_ge_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x2d,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0xfe,0x2d,0x7d,0xff,0x6f,0xfd,0x30 +# GFX12: v_cmpx_ge_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x2d,0x7d,0xff,0x6f,0xfd,0x30] -# GFX12: v_cmpx_ge_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x6c,0x7d,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_ge_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_ge_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x6c,0x7d,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_ge_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_ge_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x6c,0x7d,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_ge_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_ge_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x6c,0x7d,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_ge_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_ge_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x6c,0x7d,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_ge_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x6c,0x7d,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_ge_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x6c,0x7d,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_ge_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_ge_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x6c,0x7d,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_ge_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_ge_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x6c,0x7d,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_ge_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_ge_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x6c,0x7d,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_ge_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_ge_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x6c,0x7d,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_ge_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_ge_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x6c,0x7d,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_ge_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_ge_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x6c,0x7d,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_ge_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_ge_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x6c,0x7d,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_ge_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_ge_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6c,0x7d,0x7f,0x6f,0x0d,0x30] 0xfa,0xfe,0x6c,0x7d,0x7f,0x6f,0x0d,0x30 +# GFX12: v_cmpx_ge_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6c,0x7d,0x7f,0x6f,0x0d,0x30] -# GFX12: v_cmpx_ge_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x8c,0x7d,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_ge_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_ge_i32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x8c,0x7d,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_ge_i32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_ge_i32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x8c,0x7d,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_ge_i32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x8c,0x7d,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_ge_i32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x8c,0x7d,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_ge_i32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_ge_i32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x8c,0x7d,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_ge_i32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_ge_i32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x8c,0x7d,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_ge_i32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_ge_i32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x8c,0x7d,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_ge_i32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_ge_i32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x8c,0x7d,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_ge_i32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_ge_i32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x8c,0x7d,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_ge_i32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_ge_i32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x8c,0x7d,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ge_i32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_ge_i32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x8c,0x7d,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_ge_i32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_ge_i32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x8c,0x7d,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_ge_i32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_ge_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x8c,0x7d,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_ge_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_ge_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x8d,0x7d,0xff,0x6f,0x0d,0x30]
 0xfa,0xfe,0x8d,0x7d,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_ge_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x8d,0x7d,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_ge_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x7c,0x7d,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_ge_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_ge_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x7c,0x7d,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_ge_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_ge_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x7c,0x7d,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_ge_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_ge_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x7c,0x7d,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_ge_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_ge_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x7c,0x7d,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_ge_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_ge_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x7c,0x7d,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_ge_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_ge_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x7c,0x7d,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_ge_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_ge_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x7c,0x7d,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_ge_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_ge_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x7c,0x7d,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_ge_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_ge_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x7c,0x7d,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ge_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_ge_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x7c,0x7d,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_ge_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_ge_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x7c,0x7d,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_ge_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_ge_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x7c,0x7d,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_ge_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_ge_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x7c,0x7d,0x7f,0x6f,0x0d,0x30]
 0xfa,0xfe,0x7c,0x7d,0x7f,0x6f,0x0d,0x30
+# GFX12: v_cmpx_ge_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x7c,0x7d,0x7f,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_ge_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x9c,0x7d,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_ge_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_ge_u32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x9c,0x7d,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_ge_u32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_ge_u32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x9c,0x7d,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_ge_u32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_ge_u32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x9c,0x7d,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_ge_u32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_ge_u32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x9c,0x7d,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_ge_u32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_ge_u32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x9c,0x7d,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_ge_u32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_ge_u32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x9c,0x7d,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_ge_u32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_ge_u32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x9c,0x7d,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_ge_u32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_ge_u32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x9c,0x7d,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_ge_u32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_ge_u32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x9c,0x7d,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ge_u32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_ge_u32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x9c,0x7d,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_ge_u32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_ge_u32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x9c,0x7d,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_ge_u32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_ge_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x9c,0x7d,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_ge_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_ge_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x9d,0x7d,0xff,0x6f,0x0d,0x30]
 0xfa,0xfe,0x9d,0x7d,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_ge_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x9d,0x7d,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_gt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x08,0x7d,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_gt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_gt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x08,0x7d,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_gt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_gt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x08,0x7d,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_gt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_gt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x08,0x7d,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_gt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_gt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x08,0x7d,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_gt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_gt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x08,0x7d,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_gt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_gt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x08,0x7d,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_gt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_gt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x08,0x7d,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_gt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_gt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x08,0x7d,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_gt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_gt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x08,0x7d,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_gt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_gt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x08,0x7d,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_gt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_gt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x08,0x7d,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_gt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_gt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x08,0x7d,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_gt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_gt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x08,0x7d,0x7f,0x6f,0xfd,0x30]
 0xfa,0xfe,0x08,0x7d,0x7f,0x6f,0xfd,0x30
+# GFX12: v_cmpx_gt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x08,0x7d,0x7f,0x6f,0xfd,0x30]
-# GFX12: v_cmpx_gt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x28,0x7d,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_gt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_gt_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x28,0x7d,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_gt_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_gt_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x28,0x7d,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_gt_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_gt_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x28,0x7d,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_gt_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_gt_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x28,0x7d,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_gt_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_gt_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x28,0x7d,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_gt_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_gt_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x28,0x7d,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_gt_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_gt_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x28,0x7d,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_gt_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_gt_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x28,0x7d,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_gt_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_gt_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x28,0x7d,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_gt_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_gt_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x28,0x7d,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_gt_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_gt_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x28,0x7d,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_gt_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_gt_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x28,0x7d,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_gt_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_gt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x29,0x7d,0xff,0x6f,0xfd,0x30]
 0xfa,0xfe,0x29,0x7d,0xff,0x6f,0xfd,0x30
+# GFX12: v_cmpx_gt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x29,0x7d,0xff,0x6f,0xfd,0x30]
-# GFX12: v_cmpx_gt_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x68,0x7d,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_gt_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_gt_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x68,0x7d,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_gt_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_gt_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x68,0x7d,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_gt_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_gt_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x68,0x7d,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_gt_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_gt_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x68,0x7d,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_gt_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_gt_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x68,0x7d,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_gt_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_gt_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x68,0x7d,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_gt_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_gt_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x68,0x7d,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_gt_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_gt_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x68,0x7d,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_gt_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_gt_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x68,0x7d,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_gt_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_gt_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x68,0x7d,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_gt_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_gt_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x68,0x7d,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_gt_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_gt_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x68,0x7d,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_gt_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_gt_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x68,0x7d,0x7f,0x6f,0x0d,0x30]
 0xfa,0xfe,0x68,0x7d,0x7f,0x6f,0x0d,0x30
+# GFX12: v_cmpx_gt_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x68,0x7d,0x7f,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_gt_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x88,0x7d,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_gt_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_gt_i32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x88,0x7d,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_gt_i32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_gt_i32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x88,0x7d,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_gt_i32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_gt_i32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x88,0x7d,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_gt_i32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_gt_i32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x88,0x7d,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_gt_i32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_gt_i32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x88,0x7d,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_gt_i32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_gt_i32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x88,0x7d,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_gt_i32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_gt_i32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x88,0x7d,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_gt_i32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_gt_i32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x88,0x7d,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_gt_i32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_gt_i32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x88,0x7d,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_gt_i32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_gt_i32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x88,0x7d,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_gt_i32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_gt_i32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x88,0x7d,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_gt_i32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_gt_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x88,0x7d,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_gt_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_gt_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x89,0x7d,0xff,0x6f,0x0d,0x30]
 0xfa,0xfe,0x89,0x7d,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_gt_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x89,0x7d,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_gt_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x78,0x7d,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_gt_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_gt_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x78,0x7d,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_gt_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_gt_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x78,0x7d,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_gt_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_gt_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x78,0x7d,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_gt_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_gt_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x78,0x7d,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_gt_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_gt_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x78,0x7d,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_gt_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_gt_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x78,0x7d,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_gt_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_gt_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x78,0x7d,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_gt_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_gt_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x78,0x7d,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_gt_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_gt_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x78,0x7d,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_gt_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_gt_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x78,0x7d,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_gt_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_gt_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x78,0x7d,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_gt_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_gt_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x78,0x7d,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_gt_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_gt_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x78,0x7d,0x7f,0x6f,0x0d,0x30]
 0xfa,0xfe,0x78,0x7d,0x7f,0x6f,0x0d,0x30
+# GFX12: v_cmpx_gt_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x78,0x7d,0x7f,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_gt_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x98,0x7d,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_gt_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_gt_u32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x98,0x7d,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_gt_u32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_gt_u32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x98,0x7d,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_gt_u32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_gt_u32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x98,0x7d,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_gt_u32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_gt_u32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x98,0x7d,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_gt_u32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_gt_u32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x98,0x7d,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_gt_u32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_gt_u32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x98,0x7d,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_gt_u32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_gt_u32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x98,0x7d,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_gt_u32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_gt_u32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x98,0x7d,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_gt_u32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_gt_u32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x98,0x7d,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_gt_u32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_gt_u32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x98,0x7d,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_gt_u32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_gt_u32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x98,0x7d,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_gt_u32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_gt_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x98,0x7d,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_gt_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_gt_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x99,0x7d,0xff,0x6f,0x0d,0x30]
 0xfa,0xfe,0x99,0x7d,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_gt_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x99,0x7d,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_le_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x06,0x7d,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_le_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_le_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x06,0x7d,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_le_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_le_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x06,0x7d,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_le_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_le_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x06,0x7d,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_le_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_le_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x06,0x7d,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_le_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_le_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x06,0x7d,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_le_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_le_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x06,0x7d,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_le_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_le_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x06,0x7d,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_le_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_le_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x06,0x7d,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_le_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_le_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x06,0x7d,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_le_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_le_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x06,0x7d,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_le_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_le_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x06,0x7d,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_le_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_le_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x06,0x7d,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_le_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_le_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x06,0x7d,0x7f,0x6f,0xfd,0x30]
 0xfa,0xfe,0x06,0x7d,0x7f,0x6f,0xfd,0x30
+# GFX12: v_cmpx_le_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x06,0x7d,0x7f,0x6f,0xfd,0x30]
-# GFX12: v_cmpx_le_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x26,0x7d,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_le_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_le_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x26,0x7d,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_le_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_le_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x26,0x7d,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_le_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_le_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x26,0x7d,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_le_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_le_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x26,0x7d,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_le_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_le_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x26,0x7d,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_le_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_le_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x26,0x7d,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_le_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_le_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x26,0x7d,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_le_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_le_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x26,0x7d,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_le_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_le_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x26,0x7d,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_le_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_le_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x26,0x7d,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_le_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_le_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x26,0x7d,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_le_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_le_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x26,0x7d,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_le_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_le_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x27,0x7d,0xff,0x6f,0xfd,0x30]
 0xfa,0xfe,0x27,0x7d,0xff,0x6f,0xfd,0x30
+# GFX12: v_cmpx_le_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x27,0x7d,0xff,0x6f,0xfd,0x30]
-# GFX12: v_cmpx_le_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x66,0x7d,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_le_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_le_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x66,0x7d,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_le_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_le_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x66,0x7d,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_le_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_le_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x66,0x7d,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_le_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_le_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x66,0x7d,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_le_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_le_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x66,0x7d,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_le_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_le_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x66,0x7d,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_le_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_le_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x66,0x7d,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_le_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_le_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x66,0x7d,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_le_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_le_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x66,0x7d,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_le_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_le_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x66,0x7d,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_le_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_le_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x66,0x7d,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_le_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_le_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x66,0x7d,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_le_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_le_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x66,0x7d,0x7f,0x6f,0x0d,0x30]
 0xfa,0xfe,0x66,0x7d,0x7f,0x6f,0x0d,0x30
+# GFX12: v_cmpx_le_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x66,0x7d,0x7f,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_le_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x86,0x7d,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_le_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_le_i32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x86,0x7d,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_le_i32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_le_i32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x86,0x7d,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_le_i32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_le_i32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x86,0x7d,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_le_i32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_le_i32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x86,0x7d,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_le_i32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_le_i32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x86,0x7d,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_le_i32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_le_i32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x86,0x7d,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_le_i32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_le_i32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x86,0x7d,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_le_i32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_le_i32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x86,0x7d,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_le_i32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_le_i32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x86,0x7d,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_le_i32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_le_i32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x86,0x7d,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_le_i32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_le_i32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x86,0x7d,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_le_i32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_le_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x86,0x7d,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_le_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_le_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x87,0x7d,0xff,0x6f,0x0d,0x30]
 0xfa,0xfe,0x87,0x7d,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_le_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x87,0x7d,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_le_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x76,0x7d,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_le_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_le_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x76,0x7d,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_le_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_le_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x76,0x7d,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_le_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_le_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x76,0x7d,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_le_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_le_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x76,0x7d,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_le_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_le_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x76,0x7d,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_le_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_le_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x76,0x7d,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_le_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_le_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x76,0x7d,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_le_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_le_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x76,0x7d,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_le_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_le_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x76,0x7d,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_le_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_le_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x76,0x7d,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_le_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_le_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x76,0x7d,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_le_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_le_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x76,0x7d,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_le_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_le_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x76,0x7d,0x7f,0x6f,0x0d,0x30]
 0xfa,0xfe,0x76,0x7d,0x7f,0x6f,0x0d,0x30
+# GFX12: v_cmpx_le_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x76,0x7d,0x7f,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_le_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x96,0x7d,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_le_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_le_u32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x96,0x7d,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_le_u32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_le_u32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x96,0x7d,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_le_u32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_le_u32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x96,0x7d,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_le_u32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_le_u32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x96,0x7d,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_le_u32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_le_u32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x96,0x7d,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_le_u32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_le_u32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x96,0x7d,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_le_u32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_le_u32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x96,0x7d,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_le_u32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_le_u32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x96,0x7d,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_le_u32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_le_u32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x96,0x7d,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_le_u32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_le_u32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x96,0x7d,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_le_u32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_le_u32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x96,0x7d,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_le_u32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_le_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x96,0x7d,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_le_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_le_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x97,0x7d,0xff,0x6f,0x0d,0x30]
 0xfa,0xfe,0x97,0x7d,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmpx_le_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x97,0x7d,0xff,0x6f,0x0d,0x30]
-# GFX12: v_cmpx_lg_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x0a,0x7d,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_lg_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_lg_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x0a,0x7d,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_lg_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_lg_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x0a,0x7d,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_lg_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_lg_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x0a,0x7d,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_lg_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_lg_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x0a,0x7d,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_lg_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_lg_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x0a,0x7d,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_lg_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_lg_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x0a,0x7d,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_lg_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_lg_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x0a,0x7d,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_lg_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_lg_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x0a,0x7d,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_lg_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_lg_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x0a,0x7d,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_lg_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_lg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x0a,0x7d,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_lg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_lg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x0a,0x7d,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_lg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_lg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x0a,0x7d,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_lg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_lg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0a,0x7d,0x7f,0x6f,0xfd,0x30]
 0xfa,0xfe,0x0a,0x7d,0x7f,0x6f,0xfd,0x30
+# GFX12: v_cmpx_lg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0a,0x7d,0x7f,0x6f,0xfd,0x30]
-# GFX12: v_cmpx_lg_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x2a,0x7d,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_lg_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_lg_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x2a,0x7d,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_lg_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_lg_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x2a,0x7d,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_lg_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_lg_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x2a,0x7d,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_lg_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_lg_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x2a,0x7d,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_lg_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_lg_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x2a,0x7d,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_lg_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_lg_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x2a,0x7d,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_lg_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_lg_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x2a,0x7d,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_lg_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_lg_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x2a,0x7d,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_lg_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_lg_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x2a,0x7d,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_lg_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_lg_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x2a,0x7d,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_lg_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_lg_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x2a,0x7d,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_lg_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_lg_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x2a,0x7d,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_lg_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_lg_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x2b,0x7d,0xff,0x6f,0xfd,0x30]
 0xfa,0xfe,0x2b,0x7d,0xff,0x6f,0xfd,0x30
+# GFX12: v_cmpx_lg_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x2b,0x7d,0xff,0x6f,0xfd,0x30]
-# GFX12: v_cmpx_lt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x02,0x7d,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_lt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_lt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x02,0x7d,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_lt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_lt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x02,0x7d,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_lt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_lt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x02,0x7d,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_lt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_lt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x02,0x7d,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_lt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_lt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x02,0x7d,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_lt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_lt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x02,0x7d,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_lt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding:
[0xfa,0x04,0x02,0x7d,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_lt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x02,0x7d,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_lt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_lt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x02,0x7d,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_lt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_lt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x02,0x7d,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_lt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_lt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x02,0x7d,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_lt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_lt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x02,0x7d,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_lt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_lt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x02,0x7d,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_lt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_lt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x02,0x7d,0x7f,0x6f,0xfd,0x30] 0xfa,0xfe,0x02,0x7d,0x7f,0x6f,0xfd,0x30 +# GFX12: v_cmpx_lt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x02,0x7d,0x7f,0x6f,0xfd,0x30] -# GFX12: v_cmpx_lt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x22,0x7d,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_lt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_lt_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x22,0x7d,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_lt_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_lt_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x22,0x7d,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_lt_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_lt_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x22,0x7d,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_lt_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_lt_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x22,0x7d,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_lt_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x22,0x7d,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_lt_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x22,0x7d,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_lt_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_lt_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x22,0x7d,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_lt_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_lt_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x22,0x7d,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_lt_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_lt_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x22,0x7d,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_lt_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_lt_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x22,0x7d,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_lt_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_lt_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x22,0x7d,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_lt_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_lt_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x22,0x7d,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_lt_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_lt_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x22,0x7d,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_lt_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_lt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x23,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0xfe,0x23,0x7d,0xff,0x6f,0xfd,0x30 +# GFX12: v_cmpx_lt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x23,0x7d,0xff,0x6f,0xfd,0x30] -# GFX12: v_cmpx_lt_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_lt_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_lt_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x62,0x7d,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_lt_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_lt_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_lt_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x62,0x7d,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_lt_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_lt_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_lt_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_lt_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_lt_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_lt_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_lt_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_lt_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_lt_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_lt_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_lt_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_lt_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_lt_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_lt_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_lt_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_lt_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_lt_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x62,0x7d,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_lt_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_lt_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x62,0x7d,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_lt_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_lt_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x62,0x7d,0x7f,0x6f,0x0d,0x30] 0xfa,0xfe,0x62,0x7d,0x7f,0x6f,0x0d,0x30 +# GFX12: v_cmpx_lt_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x62,0x7d,0x7f,0x6f,0x0d,0x30] -# GFX12: v_cmpx_lt_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x82,0x7d,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_lt_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x1b,0x00,0xff] -# 
GFX12: v_cmpx_lt_i32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x82,0x7d,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_lt_i32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_lt_i32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x82,0x7d,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_lt_i32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_lt_i32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x82,0x7d,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_lt_i32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_lt_i32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x82,0x7d,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_lt_i32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_lt_i32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x82,0x7d,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_lt_i32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_lt_i32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x82,0x7d,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_lt_i32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_lt_i32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x82,0x7d,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_lt_i32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_lt_i32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x82,0x7d,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_lt_i32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_lt_i32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x82,0x7d,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_lt_i32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_lt_i32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x82,0x7d,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_lt_i32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_lt_i32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x82,0x7d,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_lt_i32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_lt_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x82,0x7d,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_lt_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_lt_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; 
encoding: [0xfa,0xfe,0x83,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0x83,0x7d,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmpx_lt_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x83,0x7d,0xff,0x6f,0x0d,0x30] -# GFX12: v_cmpx_lt_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_lt_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_lt_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x72,0x7d,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_lt_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_lt_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_lt_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_lt_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_lt_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_lt_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_lt_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_lt_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_lt_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_lt_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_lt_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_lt_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_lt_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_lt_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_lt_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_lt_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_lt_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_lt_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_lt_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_lt_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x5f,0x01,0x01] 
0xfa,0x04,0x72,0x7d,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_lt_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_lt_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x72,0x7d,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_lt_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_lt_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x72,0x7d,0x7f,0x6f,0x0d,0x30] 0xfa,0xfe,0x72,0x7d,0x7f,0x6f,0x0d,0x30 +# GFX12: v_cmpx_lt_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x72,0x7d,0x7f,0x6f,0x0d,0x30] -# GFX12: v_cmpx_lt_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x92,0x7d,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_lt_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_lt_u32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x92,0x7d,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_lt_u32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_lt_u32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x92,0x7d,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_lt_u32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_lt_u32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x92,0x7d,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_lt_u32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_lt_u32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x92,0x7d,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_lt_u32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_lt_u32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x92,0x7d,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_lt_u32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_lt_u32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x92,0x7d,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_lt_u32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_lt_u32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x92,0x7d,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_lt_u32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_lt_u32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x92,0x7d,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_lt_u32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_lt_u32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x2f,0x01,0xff] 
0xfa,0x04,0x92,0x7d,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_lt_u32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_lt_u32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x92,0x7d,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_lt_u32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_lt_u32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x92,0x7d,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_lt_u32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_lt_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x92,0x7d,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_lt_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_lt_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x93,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0x93,0x7d,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmpx_lt_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x93,0x7d,0xff,0x6f,0x0d,0x30] -# GFX12: v_cmpx_ne_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_ne_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_ne_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_ne_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_ne_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_ne_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_ne_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_ne_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_ne_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_ne_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_ne_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_ne_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_ne_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_ne_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_ne_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x1f,0x01,0xff] 
0xfa,0x04,0x6a,0x7d,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_ne_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_ne_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_ne_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_ne_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_ne_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_ne_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_ne_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_ne_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x6a,0x7d,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_ne_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_ne_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x6a,0x7d,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_ne_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_ne_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6a,0x7d,0x7f,0x6f,0x0d,0x30] 0xfa,0xfe,0x6a,0x7d,0x7f,0x6f,0x0d,0x30 +# GFX12: v_cmpx_ne_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6a,0x7d,0x7f,0x6f,0x0d,0x30] -# GFX12: v_cmpx_ne_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x8a,0x7d,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_ne_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_ne_i32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x8a,0x7d,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_ne_i32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_ne_i32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x8a,0x7d,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_ne_i32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_ne_i32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x8a,0x7d,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_ne_i32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_ne_i32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x8a,0x7d,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_ne_i32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_ne_i32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x0f,0x01,0xff] 
0xfa,0x04,0x8a,0x7d,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_ne_i32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_ne_i32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x8a,0x7d,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_ne_i32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_ne_i32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x8a,0x7d,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_ne_i32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_ne_i32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x8a,0x7d,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_ne_i32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_ne_i32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x8a,0x7d,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_ne_i32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_ne_i32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x8a,0x7d,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_ne_i32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_ne_i32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x8a,0x7d,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_ne_i32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_ne_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x8a,0x7d,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_ne_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_ne_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x8b,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0x8b,0x7d,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmpx_ne_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x8b,0x7d,0xff,0x6f,0x0d,0x30] -# GFX12: v_cmpx_ne_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x7a,0x7d,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_ne_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_ne_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x7a,0x7d,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_ne_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_ne_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x7a,0x7d,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_ne_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_ne_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x41,0x01,0xff] 
0xfa,0x04,0x7a,0x7d,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_ne_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_ne_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x7a,0x7d,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_ne_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_ne_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x7a,0x7d,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_ne_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_ne_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x7a,0x7d,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_ne_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_ne_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x7a,0x7d,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_ne_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_ne_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x7a,0x7d,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_ne_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_ne_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x7a,0x7d,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_ne_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_ne_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x7a,0x7d,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_ne_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_ne_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x7a,0x7d,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_ne_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_ne_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x7a,0x7d,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_ne_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_ne_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x7a,0x7d,0x7f,0x6f,0x0d,0x30] 0xfa,0xfe,0x7a,0x7d,0x7f,0x6f,0x0d,0x30 +# GFX12: v_cmpx_ne_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x7a,0x7d,0x7f,0x6f,0x0d,0x30] -# GFX12: v_cmpx_ne_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x9a,0x7d,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_ne_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_ne_u32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x9a,0x7d,0x01,0xe4,0x00,0xff 
+# GFX12: v_cmpx_ne_u32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_ne_u32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x9a,0x7d,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_ne_u32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_ne_u32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x9a,0x7d,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_ne_u32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_ne_u32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x9a,0x7d,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_ne_u32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_ne_u32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x9a,0x7d,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_ne_u32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_ne_u32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x9a,0x7d,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_ne_u32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_ne_u32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x9a,0x7d,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_ne_u32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_ne_u32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x9a,0x7d,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_ne_u32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_ne_u32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x9a,0x7d,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_ne_u32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_ne_u32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x9a,0x7d,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_ne_u32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_ne_u32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x9a,0x7d,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_ne_u32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_ne_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x9a,0x7d,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_ne_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_ne_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x9b,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0x9b,0x7d,0xff,0x6f,0x0d,0x30 +# GFX12: v_cmpx_ne_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 
bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x9b,0x7d,0xff,0x6f,0x0d,0x30] -# GFX12: v_cmpx_neq_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_neq_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_neq_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_neq_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_neq_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_neq_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_neq_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_neq_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_neq_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_neq_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_neq_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_neq_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_neq_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_neq_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_neq_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_neq_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_neq_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_neq_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_neq_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_neq_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x2f,0x01,0xff] -# GFX12: v_cmpx_neq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x50,0x01,0xff +# GFX12: v_cmpx_neq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x50,0x01,0xff] -# GFX12: v_cmpx_neq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x1a,0x7d,0x01,0x5f,0x01,0x01 +# GFX12: v_cmpx_neq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0xfa,0x04,0x1a,0x7d,0x01,0x5f,0x01,0x01] -# GFX12: v_cmpx_neq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x60,0x01,0x13] 0xfa,0x04,0x1a,0x7d,0x01,0x60,0x01,0x13 +# GFX12: v_cmpx_neq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x60,0x01,0x13] -# GFX12: v_cmpx_neq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1a,0x7d,0x7f,0x6f,0xfd,0x30] 0xfa,0xfe,0x1a,0x7d,0x7f,0x6f,0xfd,0x30 +# GFX12: v_cmpx_neq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1a,0x7d,0x7f,0x6f,0xfd,0x30] -# GFX12: v_cmpx_neq_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x3a,0x7d,0x01,0x1b,0x00,0xff +# GFX12: v_cmpx_neq_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x1b,0x00,0xff] -# GFX12: v_cmpx_neq_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x3a,0x7d,0x01,0xe4,0x00,0xff +# GFX12: v_cmpx_neq_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0xe4,0x00,0xff] -# GFX12: v_cmpx_neq_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x3a,0x7d,0x01,0x40,0x01,0xff +# GFX12: v_cmpx_neq_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x40,0x01,0xff] -# GFX12: v_cmpx_neq_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x3a,0x7d,0x01,0x41,0x01,0xff +# GFX12: v_cmpx_neq_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x41,0x01,0xff] -# GFX12: v_cmpx_neq_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x3a,0x7d,0x01,0x01,0x01,0xff +# GFX12: v_cmpx_neq_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x01,0x01,0xff] -# GFX12: v_cmpx_neq_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x3a,0x7d,0x01,0x0f,0x01,0xff +# GFX12: v_cmpx_neq_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x0f,0x01,0xff] -# GFX12: v_cmpx_neq_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x3a,0x7d,0x01,0x11,0x01,0xff +# GFX12: v_cmpx_neq_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x11,0x01,0xff] -# GFX12: v_cmpx_neq_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x3a,0x7d,0x01,0x1f,0x01,0xff +# GFX12: v_cmpx_neq_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x1f,0x01,0xff] -# GFX12: v_cmpx_neq_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x3a,0x7d,0x01,0x21,0x01,0xff +# GFX12: v_cmpx_neq_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x21,0x01,0xff] -# GFX12: v_cmpx_neq_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x3a,0x7d,0x01,0x2f,0x01,0xff +# GFX12: v_cmpx_neq_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_neq_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x3a,0x7d,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_neq_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_neq_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x3a,0x7d,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_neq_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_neq_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x3a,0x7d,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_neq_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_neq_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x3b,0x7d,0xff,0x6f,0xfd,0x30]
 0xfa,0xfe,0x3b,0x7d,0xff,0x6f,0xfd,0x30
+# GFX12: v_cmpx_neq_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x3b,0x7d,0xff,0x6f,0xfd,0x30]
-# GFX12: v_cmpx_nge_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x12,0x7d,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_nge_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_nge_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x12,0x7d,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_nge_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_nge_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x12,0x7d,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_nge_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_nge_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x12,0x7d,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_nge_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_nge_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x12,0x7d,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_nge_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_nge_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x12,0x7d,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_nge_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_nge_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x12,0x7d,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_nge_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_nge_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x12,0x7d,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_nge_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_nge_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x12,0x7d,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_nge_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_nge_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x12,0x7d,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_nge_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_nge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x12,0x7d,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_nge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_nge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x12,0x7d,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_nge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_nge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x12,0x7d,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_nge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_nge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x12,0x7d,0x7f,0x6f,0xfd,0x30]
 0xfa,0xfe,0x12,0x7d,0x7f,0x6f,0xfd,0x30
+# GFX12: v_cmpx_nge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x12,0x7d,0x7f,0x6f,0xfd,0x30]
-# GFX12: v_cmpx_nge_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x32,0x7d,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_nge_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_nge_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x32,0x7d,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_nge_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_nge_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x32,0x7d,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_nge_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_nge_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x32,0x7d,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_nge_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_nge_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x32,0x7d,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_nge_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_nge_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x32,0x7d,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_nge_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_nge_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x32,0x7d,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_nge_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_nge_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x32,0x7d,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_nge_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_nge_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x32,0x7d,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_nge_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_nge_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x32,0x7d,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_nge_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_nge_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x32,0x7d,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_nge_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_nge_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x32,0x7d,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_nge_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_nge_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x32,0x7d,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_nge_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_nge_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x33,0x7d,0xff,0x6f,0xfd,0x30]
 0xfa,0xfe,0x33,0x7d,0xff,0x6f,0xfd,0x30
+# GFX12: v_cmpx_nge_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x33,0x7d,0xff,0x6f,0xfd,0x30]
-# GFX12: v_cmpx_ngt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x16,0x7d,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_ngt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_ngt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x16,0x7d,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_ngt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_ngt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x16,0x7d,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_ngt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_ngt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x16,0x7d,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_ngt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_ngt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x16,0x7d,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_ngt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_ngt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x16,0x7d,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_ngt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_ngt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x16,0x7d,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_ngt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_ngt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x16,0x7d,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_ngt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_ngt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x16,0x7d,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_ngt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_ngt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x16,0x7d,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ngt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_ngt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x16,0x7d,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_ngt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_ngt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x16,0x7d,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_ngt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_ngt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x16,0x7d,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_ngt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_ngt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x16,0x7d,0x7f,0x6f,0xfd,0x30]
 0xfa,0xfe,0x16,0x7d,0x7f,0x6f,0xfd,0x30
+# GFX12: v_cmpx_ngt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x16,0x7d,0x7f,0x6f,0xfd,0x30]
-# GFX12: v_cmpx_ngt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x36,0x7d,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_ngt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_ngt_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x36,0x7d,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_ngt_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_ngt_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x36,0x7d,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_ngt_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_ngt_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x36,0x7d,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_ngt_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_ngt_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x36,0x7d,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_ngt_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_ngt_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x36,0x7d,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_ngt_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_ngt_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x36,0x7d,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_ngt_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_ngt_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x36,0x7d,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_ngt_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_ngt_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x36,0x7d,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_ngt_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_ngt_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x36,0x7d,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ngt_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_ngt_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x36,0x7d,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_ngt_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_ngt_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x36,0x7d,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_ngt_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_ngt_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x36,0x7d,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_ngt_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_ngt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x37,0x7d,0xff,0x6f,0xfd,0x30]
 0xfa,0xfe,0x37,0x7d,0xff,0x6f,0xfd,0x30
+# GFX12: v_cmpx_ngt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x37,0x7d,0xff,0x6f,0xfd,0x30]
-# GFX12: v_cmpx_nle_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x18,0x7d,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_nle_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_nle_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x18,0x7d,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_nle_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_nle_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x18,0x7d,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_nle_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_nle_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x18,0x7d,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_nle_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_nle_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x18,0x7d,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_nle_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_nle_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x18,0x7d,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_nle_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_nle_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x18,0x7d,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_nle_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_nle_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x18,0x7d,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_nle_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_nle_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x18,0x7d,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_nle_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_nle_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x18,0x7d,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_nle_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_nle_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x18,0x7d,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_nle_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_nle_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x18,0x7d,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_nle_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_nle_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x18,0x7d,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_nle_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_nle_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x18,0x7d,0x7f,0x6f,0xfd,0x30]
 0xfa,0xfe,0x18,0x7d,0x7f,0x6f,0xfd,0x30
+# GFX12: v_cmpx_nle_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x18,0x7d,0x7f,0x6f,0xfd,0x30]
-# GFX12: v_cmpx_nle_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x38,0x7d,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_nle_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_nle_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x38,0x7d,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_nle_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_nle_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x38,0x7d,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_nle_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_nle_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x38,0x7d,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_nle_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_nle_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x38,0x7d,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_nle_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_nle_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x38,0x7d,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_nle_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_nle_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x38,0x7d,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_nle_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_nle_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x38,0x7d,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_nle_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_nle_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x38,0x7d,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_nle_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_nle_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x38,0x7d,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_nle_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_nle_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x38,0x7d,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_nle_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_nle_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x38,0x7d,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_nle_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_nle_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x38,0x7d,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_nle_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_nle_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x39,0x7d,0xff,0x6f,0xfd,0x30]
 0xfa,0xfe,0x39,0x7d,0xff,0x6f,0xfd,0x30
+# GFX12: v_cmpx_nle_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x39,0x7d,0xff,0x6f,0xfd,0x30]
-# GFX12: v_cmpx_nlg_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x14,0x7d,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_nlg_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_nlg_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x14,0x7d,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_nlg_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_nlg_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x14,0x7d,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_nlg_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_nlg_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x14,0x7d,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_nlg_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_nlg_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x14,0x7d,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_nlg_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_nlg_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x14,0x7d,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_nlg_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_nlg_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x14,0x7d,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_nlg_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_nlg_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x14,0x7d,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_nlg_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_nlg_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x14,0x7d,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_nlg_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_nlg_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x14,0x7d,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_nlg_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_nlg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x14,0x7d,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_nlg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_nlg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x14,0x7d,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_nlg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_nlg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x14,0x7d,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_nlg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_nlg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x14,0x7d,0x7f,0x6f,0xfd,0x30]
 0xfa,0xfe,0x14,0x7d,0x7f,0x6f,0xfd,0x30
+# GFX12: v_cmpx_nlg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x14,0x7d,0x7f,0x6f,0xfd,0x30]
-# GFX12: v_cmpx_nlg_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x34,0x7d,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_nlg_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_nlg_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x34,0x7d,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_nlg_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_nlg_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x34,0x7d,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_nlg_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_nlg_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x34,0x7d,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_nlg_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_nlg_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x34,0x7d,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_nlg_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_nlg_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x34,0x7d,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_nlg_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_nlg_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x34,0x7d,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_nlg_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_nlg_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x34,0x7d,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_nlg_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_nlg_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x34,0x7d,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_nlg_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_nlg_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x34,0x7d,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_nlg_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_nlg_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x34,0x7d,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_nlg_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_nlg_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x34,0x7d,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_nlg_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_nlg_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x34,0x7d,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_nlg_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_nlg_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x35,0x7d,0xff,0x6f,0xfd,0x30]
 0xfa,0xfe,0x35,0x7d,0xff,0x6f,0xfd,0x30
+# GFX12: v_cmpx_nlg_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x35,0x7d,0xff,0x6f,0xfd,0x30]
-# GFX12: v_cmpx_nlt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x1c,0x7d,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_nlt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_nlt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x1c,0x7d,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_nlt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_nlt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x1c,0x7d,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_nlt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_nlt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x1c,0x7d,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_nlt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_nlt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x1c,0x7d,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_nlt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_nlt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x1c,0x7d,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_nlt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_nlt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x1c,0x7d,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_nlt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_nlt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x1c,0x7d,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_nlt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_nlt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x1c,0x7d,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_nlt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_nlt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x1c,0x7d,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_nlt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_nlt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x1c,0x7d,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_nlt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_nlt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x1c,0x7d,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_nlt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_nlt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x1c,0x7d,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_nlt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_nlt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1c,0x7d,0x7f,0x6f,0xfd,0x30]
 0xfa,0xfe,0x1c,0x7d,0x7f,0x6f,0xfd,0x30
+# GFX12: v_cmpx_nlt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1c,0x7d,0x7f,0x6f,0xfd,0x30]
-# GFX12: v_cmpx_nlt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x3c,0x7d,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_nlt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_nlt_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x3c,0x7d,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_nlt_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_nlt_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x3c,0x7d,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_nlt_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_nlt_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x3c,0x7d,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_nlt_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_nlt_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x3c,0x7d,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_nlt_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_nlt_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x3c,0x7d,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_nlt_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_nlt_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x3c,0x7d,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_nlt_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_nlt_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x3c,0x7d,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_nlt_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_nlt_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x3c,0x7d,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_nlt_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_nlt_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x3c,0x7d,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_nlt_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_nlt_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x3c,0x7d,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_nlt_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_nlt_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x3c,0x7d,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_nlt_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_nlt_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x3c,0x7d,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_nlt_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_nlt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x3d,0x7d,0xff,0x6f,0xfd,0x30]
 0xfa,0xfe,0x3d,0x7d,0xff,0x6f,0xfd,0x30
+# GFX12: v_cmpx_nlt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x3d,0x7d,0xff,0x6f,0xfd,0x30]
-# GFX12: v_cmpx_o_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x0e,0x7d,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_o_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_o_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x0e,0x7d,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_o_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_o_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x0e,0x7d,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_o_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_o_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x0e,0x7d,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_o_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_o_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x0e,0x7d,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_o_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_o_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x0e,0x7d,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_o_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_o_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x0e,0x7d,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_o_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_o_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x0e,0x7d,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_o_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_o_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x0e,0x7d,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_o_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_o_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x0e,0x7d,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_o_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_o_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x0e,0x7d,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_o_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_o_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x0e,0x7d,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_o_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_o_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x0e,0x7d,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_o_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_o_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0e,0x7d,0x7f,0x6f,0xfd,0x30]
 0xfa,0xfe,0x0e,0x7d,0x7f,0x6f,0xfd,0x30
+# GFX12: v_cmpx_o_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0e,0x7d,0x7f,0x6f,0xfd,0x30]
-# GFX12: v_cmpx_o_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x2e,0x7d,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_o_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_o_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x2e,0x7d,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_o_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_o_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x2e,0x7d,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_o_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_o_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x2e,0x7d,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_o_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_o_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x2e,0x7d,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_o_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_o_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x2e,0x7d,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_o_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_o_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x2e,0x7d,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_o_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_o_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x2e,0x7d,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_o_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_o_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x2e,0x7d,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_o_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_o_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x2e,0x7d,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_o_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_o_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x2e,0x7d,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_o_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_o_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x2e,0x7d,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_o_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_o_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x2e,0x7d,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_o_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_o_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x2f,0x7d,0xff,0x6f,0xfd,0x30]
 0xfa,0xfe,0x2f,0x7d,0xff,0x6f,0xfd,0x30
+# GFX12: v_cmpx_o_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x2f,0x7d,0xff,0x6f,0xfd,0x30]
-# GFX12: v_cmpx_u_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x10,0x7d,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_u_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_u_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x10,0x7d,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_u_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_u_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x10,0x7d,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_u_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_u_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x10,0x7d,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_u_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_u_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x10,0x7d,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_u_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_u_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x10,0x7d,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_u_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_u_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x10,0x7d,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_u_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_u_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x10,0x7d,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_u_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_u_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x10,0x7d,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_u_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_u_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x10,0x7d,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_u_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_u_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x10,0x7d,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_u_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_u_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x10,0x7d,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_u_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_u_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x10,0x7d,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_u_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_u_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x10,0x7d,0x7f,0x6f,0xfd,0x30]
 0xfa,0xfe,0x10,0x7d,0x7f,0x6f,0xfd,0x30
+# GFX12: v_cmpx_u_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x10,0x7d,0x7f,0x6f,0xfd,0x30]
-# GFX12: v_cmpx_u_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x1b,0x00,0xff]
 0xfa,0x04,0x30,0x7d,0x01,0x1b,0x00,0xff
+# GFX12: v_cmpx_u_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x1b,0x00,0xff]
-# GFX12: v_cmpx_u_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0xe4,0x00,0xff]
 0xfa,0x04,0x30,0x7d,0x01,0xe4,0x00,0xff
+# GFX12: v_cmpx_u_f32 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0xe4,0x00,0xff]
-# GFX12: v_cmpx_u_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x40,0x01,0xff]
 0xfa,0x04,0x30,0x7d,0x01,0x40,0x01,0xff
+# GFX12: v_cmpx_u_f32 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x40,0x01,0xff]
-# GFX12: v_cmpx_u_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x41,0x01,0xff]
 0xfa,0x04,0x30,0x7d,0x01,0x41,0x01,0xff
+# GFX12: v_cmpx_u_f32 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x41,0x01,0xff]
-# GFX12: v_cmpx_u_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x01,0x01,0xff]
 0xfa,0x04,0x30,0x7d,0x01,0x01,0x01,0xff
+# GFX12: v_cmpx_u_f32 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x01,0x01,0xff]
-# GFX12: v_cmpx_u_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x0f,0x01,0xff]
 0xfa,0x04,0x30,0x7d,0x01,0x0f,0x01,0xff
+# GFX12: v_cmpx_u_f32 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x0f,0x01,0xff]
-# GFX12: v_cmpx_u_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x11,0x01,0xff]
 0xfa,0x04,0x30,0x7d,0x01,0x11,0x01,0xff
+# GFX12: v_cmpx_u_f32 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x11,0x01,0xff]
-# GFX12: v_cmpx_u_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x1f,0x01,0xff]
 0xfa,0x04,0x30,0x7d,0x01,0x1f,0x01,0xff
+# GFX12: v_cmpx_u_f32 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x1f,0x01,0xff]
-# GFX12: v_cmpx_u_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x21,0x01,0xff]
 0xfa,0x04,0x30,0x7d,0x01,0x21,0x01,0xff
+# GFX12: v_cmpx_u_f32 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x21,0x01,0xff]
-# GFX12: v_cmpx_u_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x2f,0x01,0xff]
 0xfa,0x04,0x30,0x7d,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_u_f32 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x2f,0x01,0xff]
-# GFX12: v_cmpx_u_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x50,0x01,0xff]
 0xfa,0x04,0x30,0x7d,0x01,0x50,0x01,0xff
+# GFX12: v_cmpx_u_f32 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x50,0x01,0xff]
-# GFX12: v_cmpx_u_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x5f,0x01,0x01]
 0xfa,0x04,0x30,0x7d,0x01,0x5f,0x01,0x01
+# GFX12: v_cmpx_u_f32 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x5f,0x01,0x01]
-# GFX12: v_cmpx_u_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x60,0x01,0x13]
 0xfa,0x04,0x30,0x7d,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_u_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x60,0x01,0x13]
-# GFX12: v_cmpx_u_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x31,0x7d,0xff,0x6f,0xfd,0x30]
 0xfa,0xfe,0x31,0x7d,0xff,0x6f,0xfd,0x30
+# GFX12: v_cmpx_u_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x31,0x7d,0xff,0x6f,0xfd,0x30]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt
index 53f15e8ae4314..4449cbcfb3608 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt
@@ -1,328 +1,329 @@
+# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12
-# GFX12: v_cmpx_class_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05]
 0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_class_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_class_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfa,0x7d,0x7f,0x00,0x00,0x00]
 0xea,0xfe,0xfa,0x7d,0x7f,0x00,0x00,0x00
+# GFX12: v_cmpx_class_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfa,0x7d,0x7f,0x00,0x00,0x00]
-# GFX12: v_cmpx_class_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfc,0x7d,0x01,0x77,0x39,0x05]
 0xe9,0x04,0xfc,0x7d,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_class_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfc,0x7d,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_class_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfd,0x7d,0xff,0x00,0x00,0x00]
 0xea,0xfe,0xfd,0x7d,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_class_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfd,0x7d,0xff,0x00,0x00,0x00]
-# GFX12: v_cmpx_eq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x04,0x7d,0x01,0x77,0x39,0x05]
 0xe9,0x04,0x04,0x7d,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_eq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x04,0x7d,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_eq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x04,0x7d,0x7f,0x00,0x00,0x00]
 0xea,0xfe,0x04,0x7d,0x7f,0x00,0x00,0x00
+# GFX12: v_cmpx_eq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x04,0x7d,0x7f,0x00,0x00,0x00]
-# GFX12: v_cmpx_eq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x24,0x7d,0x01,0x77,0x39,0x05]
 0xe9,0x04,0x24,0x7d,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_eq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x24,0x7d,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_eq_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x25,0x7d,0xff,0x00,0x00,0x00]
 0xea,0xfe,0x25,0x7d,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_eq_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x25,0x7d,0xff,0x00,0x00,0x00]
-# GFX12: v_cmpx_eq_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x64,0x7d,0x01,0x77,0x39,0x05]
 0xe9,0x04,0x64,0x7d,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_eq_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x64,0x7d,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_eq_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x64,0x7d,0x7f,0x00,0x00,0x00]
 0xea,0xfe,0x64,0x7d,0x7f,0x00,0x00,0x00
+# GFX12: v_cmpx_eq_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x64,0x7d,0x7f,0x00,0x00,0x00]
-# GFX12: v_cmpx_eq_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x84,0x7d,0x01,0x77,0x39,0x05]
 0xe9,0x04,0x84,0x7d,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_eq_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x84,0x7d,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_eq_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x85,0x7d,0xff,0x00,0x00,0x00]
 0xea,0xfe,0x85,0x7d,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_eq_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x85,0x7d,0xff,0x00,0x00,0x00]
-# GFX12: v_cmpx_eq_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x74,0x7d,0x01,0x77,0x39,0x05]
 0xe9,0x04,0x74,0x7d,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_eq_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x74,0x7d,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_eq_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x74,0x7d,0x7f,0x00,0x00,0x00]
 0xea,0xfe,0x74,0x7d,0x7f,0x00,0x00,0x00
+# GFX12: v_cmpx_eq_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x74,0x7d,0x7f,0x00,0x00,0x00]
-# GFX12: v_cmpx_eq_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x94,0x7d,0x01,0x77,0x39,0x05]
 0xe9,0x04,0x94,0x7d,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_eq_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x94,0x7d,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_eq_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x95,0x7d,0xff,0x00,0x00,0x00]
 0xea,0xfe,0x95,0x7d,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_eq_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x95,0x7d,0xff,0x00,0x00,0x00]
-# GFX12: v_cmpx_ge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05]
 0xe9,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_ge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0c,0x7d,0x7f,0x00,0x00,0x00]
 0xea,0xfe,0x0c,0x7d,0x7f,0x00,0x00,0x00
+# GFX12: v_cmpx_ge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0c,0x7d,0x7f,0x00,0x00,0x00]
-# GFX12: v_cmpx_ge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2c,0x7d,0x01,0x77,0x39,0x05]
 0xe9,0x04,0x2c,0x7d,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2c,0x7d,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_ge_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x2d,0x7d,0xff,0x00,0x00,0x00]
 0xea,0xfe,0x2d,0x7d,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_ge_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x2d,0x7d,0xff,0x00,0x00,0x00]
-# GFX12: v_cmpx_ge_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6c,0x7d,0x01,0x77,0x39,0x05]
 0xe9,0x04,0x6c,0x7d,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ge_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6c,0x7d,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_ge_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6c,0x7d,0x7f,0x00,0x00,0x00]
 0xea,0xfe,0x6c,0x7d,0x7f,0x00,0x00,0x00
+# GFX12: v_cmpx_ge_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6c,0x7d,0x7f,0x00,0x00,0x00]
-# GFX12: v_cmpx_ge_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x8c,0x7d,0x01,0x77,0x39,0x05]
 0xe9,0x04,0x8c,0x7d,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ge_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x8c,0x7d,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_ge_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x8d,0x7d,0xff,0x00,0x00,0x00]
 0xea,0xfe,0x8d,0x7d,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_ge_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x8d,0x7d,0xff,0x00,0x00,0x00]
-# GFX12: v_cmpx_ge_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7c,0x7d,0x01,0x77,0x39,0x05]
 0xe9,0x04,0x7c,0x7d,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ge_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7c,0x7d,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_ge_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7c,0x7d,0x7f,0x00,0x00,0x00]
 0xea,0xfe,0x7c,0x7d,0x7f,0x00,0x00,0x00
+# GFX12: v_cmpx_ge_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7c,0x7d,0x7f,0x00,0x00,0x00]
-# GFX12: v_cmpx_ge_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x9c,0x7d,0x01,0x77,0x39,0x05]
 0xe9,0x04,0x9c,0x7d,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ge_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x9c,0x7d,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_ge_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x9d,0x7d,0xff,0x00,0x00,0x00]
 0xea,0xfe,0x9d,0x7d,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_ge_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x9d,0x7d,0xff,0x00,0x00,0x00]
-# GFX12: v_cmpx_gt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x08,0x7d,0x01,0x77,0x39,0x05]
 0xe9,0x04,0x08,0x7d,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_gt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x08,0x7d,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_gt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x08,0x7d,0x7f,0x00,0x00,0x00]
 0xea,0xfe,0x08,0x7d,0x7f,0x00,0x00,0x00
+# GFX12: v_cmpx_gt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x08,0x7d,0x7f,0x00,0x00,0x00]
-# GFX12: v_cmpx_gt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x28,0x7d,0x01,0x77,0x39,0x05]
 0xe9,0x04,0x28,0x7d,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_gt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x28,0x7d,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_gt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x29,0x7d,0xff,0x00,0x00,0x00]
 0xea,0xfe,0x29,0x7d,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_gt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x29,0x7d,0xff,0x00,0x00,0x00]
-# GFX12: v_cmpx_gt_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x68,0x7d,0x01,0x77,0x39,0x05]
 0xe9,0x04,0x68,0x7d,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_gt_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x68,0x7d,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_gt_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x68,0x7d,0x7f,0x00,0x00,0x00]
 0xea,0xfe,0x68,0x7d,0x7f,0x00,0x00,0x00
+# GFX12: v_cmpx_gt_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x68,0x7d,0x7f,0x00,0x00,0x00]
-# GFX12: v_cmpx_gt_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x88,0x7d,0x01,0x77,0x39,0x05]
 0xe9,0x04,0x88,0x7d,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_gt_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x88,0x7d,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_gt_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x89,0x7d,0xff,0x00,0x00,0x00]
 0xea,0xfe,0x89,0x7d,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_gt_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x89,0x7d,0xff,0x00,0x00,0x00]
-# GFX12: v_cmpx_gt_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x78,0x7d,0x01,0x77,0x39,0x05]
 0xe9,0x04,0x78,0x7d,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_gt_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x78,0x7d,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_gt_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x78,0x7d,0x7f,0x00,0x00,0x00]
 0xea,0xfe,0x78,0x7d,0x7f,0x00,0x00,0x00
+# GFX12: v_cmpx_gt_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x78,0x7d,0x7f,0x00,0x00,0x00]
-# GFX12: v_cmpx_gt_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x98,0x7d,0x01,0x77,0x39,0x05]
 0xe9,0x04,0x98,0x7d,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_gt_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x98,0x7d,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_gt_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x99,0x7d,0xff,0x00,0x00,0x00]
 0xea,0xfe,0x99,0x7d,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_gt_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x99,0x7d,0xff,0x00,0x00,0x00]
-# GFX12: v_cmpx_le_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x06,0x7d,0x01,0x77,0x39,0x05]
 0xe9,0x04,0x06,0x7d,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_le_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x06,0x7d,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_le_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x06,0x7d,0x7f,0x00,0x00,0x00]
 0xea,0xfe,0x06,0x7d,0x7f,0x00,0x00,0x00
+# GFX12: v_cmpx_le_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x06,0x7d,0x7f,0x00,0x00,0x00]
-# GFX12: v_cmpx_le_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x26,0x7d,0x01,0x77,0x39,0x05]
 0xe9,0x04,0x26,0x7d,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_le_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x26,0x7d,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_le_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x27,0x7d,0xff,0x00,0x00,0x00]
 0xea,0xfe,0x27,0x7d,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_le_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x27,0x7d,0xff,0x00,0x00,0x00]
-# GFX12: v_cmpx_le_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x66,0x7d,0x01,0x77,0x39,0x05]
 0xe9,0x04,0x66,0x7d,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_le_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x66,0x7d,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_le_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x66,0x7d,0x7f,0x00,0x00,0x00]
 0xea,0xfe,0x66,0x7d,0x7f,0x00,0x00,0x00
+# GFX12: v_cmpx_le_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x66,0x7d,0x7f,0x00,0x00,0x00]
-# GFX12: v_cmpx_le_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x86,0x7d,0x01,0x77,0x39,0x05]
 0xe9,0x04,0x86,0x7d,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_le_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x86,0x7d,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_le_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x87,0x7d,0xff,0x00,0x00,0x00]
 0xea,0xfe,0x87,0x7d,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_le_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x87,0x7d,0xff,0x00,0x00,0x00]
-# GFX12: v_cmpx_le_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x76,0x7d,0x01,0x77,0x39,0x05]
 0xe9,0x04,0x76,0x7d,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_le_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x76,0x7d,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_le_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x76,0x7d,0x7f,0x00,0x00,0x00]
 0xea,0xfe,0x76,0x7d,0x7f,0x00,0x00,0x00
+# GFX12: v_cmpx_le_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x76,0x7d,0x7f,0x00,0x00,0x00]
-# GFX12: v_cmpx_le_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x96,0x7d,0x01,0x77,0x39,0x05]
 0xe9,0x04,0x96,0x7d,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_le_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x96,0x7d,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_le_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x97,0x7d,0xff,0x00,0x00,0x00]
 0xea,0xfe,0x97,0x7d,0xff,0x00,0x00,0x00
+# GFX12: v_cmpx_le_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x97,0x7d,0xff,0x00,0x00,0x00]
-# GFX12: v_cmpx_lg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05]
 0xe9,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_lg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05]
-# GFX12: v_cmpx_lg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0a,0x7d,0x7f,0x00,0x00,0x00]
 0xea,0xfe,0x0a,0x7d,0x7f,0x00,0x00,0x00
+# GFX12: v_cmpx_lg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0a,0x7d,0x7f,0x00,0x00,0x00]
-# GFX12: v_cmpx_lg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2a,0x7d,0x01,0x77,0x39,0x05]
0xe9,0x04,0x2a,0x7d,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_lg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2a,0x7d,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_lg_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x2b,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x2b,0x7d,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_lg_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x2b,0x7d,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_lt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x02,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x02,0x7d,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_lt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x02,0x7d,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_lt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x02,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x02,0x7d,0x7f,0x00,0x00,0x00 +# GFX12: v_cmpx_lt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x02,0x7d,0x7f,0x00,0x00,0x00] -# GFX12: v_cmpx_lt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x22,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x22,0x7d,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_lt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x22,0x7d,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_lt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x23,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x23,0x7d,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_lt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x23,0x7d,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_lt_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x62,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x62,0x7d,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_lt_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x62,0x7d,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_lt_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x62,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x62,0x7d,0x7f,0x00,0x00,0x00 +# GFX12: v_cmpx_lt_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x62,0x7d,0x7f,0x00,0x00,0x00] -# GFX12: v_cmpx_lt_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x82,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x82,0x7d,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_lt_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x82,0x7d,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_lt_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x83,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x83,0x7d,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_lt_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x83,0x7d,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_lt_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x72,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x72,0x7d,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_lt_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x72,0x7d,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_lt_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x72,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x72,0x7d,0x7f,0x00,0x00,0x00 +# GFX12: v_cmpx_lt_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x72,0x7d,0x7f,0x00,0x00,0x00] -# GFX12: v_cmpx_lt_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x92,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x92,0x7d,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_lt_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x92,0x7d,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_lt_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x93,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x93,0x7d,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_lt_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; 
encoding: [0xea,0xfe,0x93,0x7d,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_ne_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6a,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x6a,0x7d,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_ne_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6a,0x7d,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_ne_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6a,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x6a,0x7d,0x7f,0x00,0x00,0x00 +# GFX12: v_cmpx_ne_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6a,0x7d,0x7f,0x00,0x00,0x00] -# GFX12: v_cmpx_ne_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x8a,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x8a,0x7d,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_ne_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x8a,0x7d,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_ne_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x8b,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x8b,0x7d,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_ne_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x8b,0x7d,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_ne_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7a,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x7a,0x7d,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_ne_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7a,0x7d,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_ne_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7a,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x7a,0x7d,0x7f,0x00,0x00,0x00 +# GFX12: v_cmpx_ne_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7a,0x7d,0x7f,0x00,0x00,0x00] -# GFX12: v_cmpx_ne_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x9a,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x9a,0x7d,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_ne_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x9a,0x7d,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_ne_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x9b,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x9b,0x7d,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_ne_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x9b,0x7d,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_neq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_neq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_neq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1a,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x1a,0x7d,0x7f,0x00,0x00,0x00 +# GFX12: v_cmpx_neq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1a,0x7d,0x7f,0x00,0x00,0x00] -# GFX12: v_cmpx_neq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3a,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x3a,0x7d,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_neq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3a,0x7d,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_neq_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x3b,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x3b,0x7d,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_neq_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x3b,0x7d,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_nge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x12,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x12,0x7d,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x12,0x7d,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nge_f16 v127, v127 
dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x12,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x12,0x7d,0x7f,0x00,0x00,0x00 +# GFX12: v_cmpx_nge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x12,0x7d,0x7f,0x00,0x00,0x00] -# GFX12: v_cmpx_nge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x32,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x32,0x7d,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x32,0x7d,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nge_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x33,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x33,0x7d,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_nge_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x33,0x7d,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_ngt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x16,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x16,0x7d,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_ngt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x16,0x7d,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_ngt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x16,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x16,0x7d,0x7f,0x00,0x00,0x00 +# GFX12: v_cmpx_ngt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x16,0x7d,0x7f,0x00,0x00,0x00] -# GFX12: v_cmpx_ngt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x36,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x36,0x7d,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_ngt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x36,0x7d,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_ngt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x37,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x37,0x7d,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_ngt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x37,0x7d,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_nle_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x18,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x18,0x7d,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nle_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x18,0x7d,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nle_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x18,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x18,0x7d,0x7f,0x00,0x00,0x00 +# GFX12: v_cmpx_nle_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x18,0x7d,0x7f,0x00,0x00,0x00] -# GFX12: v_cmpx_nle_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x38,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x38,0x7d,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nle_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x38,0x7d,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nle_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x39,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x39,0x7d,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_nle_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x39,0x7d,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_nlg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x14,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x14,0x7d,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nlg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x14,0x7d,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nlg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x14,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x14,0x7d,0x7f,0x00,0x00,0x00 +# GFX12: v_cmpx_nlg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x14,0x7d,0x7f,0x00,0x00,0x00] -# GFX12: v_cmpx_nlg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x34,0x7d,0x01,0x77,0x39,0x05] 
0xe9,0x04,0x34,0x7d,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nlg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x34,0x7d,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nlg_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x35,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x35,0x7d,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_nlg_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x35,0x7d,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_nlt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nlt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nlt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1c,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x1c,0x7d,0x7f,0x00,0x00,0x00 +# GFX12: v_cmpx_nlt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1c,0x7d,0x7f,0x00,0x00,0x00] -# GFX12: v_cmpx_nlt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3c,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x3c,0x7d,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_nlt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3c,0x7d,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_nlt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x3d,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x3d,0x7d,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_nlt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x3d,0x7d,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_o_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_o_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_o_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0e,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x0e,0x7d,0x7f,0x00,0x00,0x00 +# GFX12: v_cmpx_o_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0e,0x7d,0x7f,0x00,0x00,0x00] -# GFX12: v_cmpx_o_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2e,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x2e,0x7d,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_o_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2e,0x7d,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_o_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x2f,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x2f,0x7d,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_o_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x2f,0x7d,0xff,0x00,0x00,0x00] -# GFX12: v_cmpx_u_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x10,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x10,0x7d,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_u_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x10,0x7d,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_u_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x10,0x7d,0x7f,0x00,0x00,0x00] 0xea,0xfe,0x10,0x7d,0x7f,0x00,0x00,0x00 +# GFX12: v_cmpx_u_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x10,0x7d,0x7f,0x00,0x00,0x00] -# GFX12: v_cmpx_u_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x30,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0x30,0x7d,0x01,0x77,0x39,0x05 +# GFX12: v_cmpx_u_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x30,0x7d,0x01,0x77,0x39,0x05] -# GFX12: v_cmpx_u_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x31,0x7d,0xff,0x00,0x00,0x00] 0xea,0xfe,0x31,0x7d,0xff,0x00,0x00,0x00 +# GFX12: v_cmpx_u_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xea,0xfe,0x31,0x7d,0xff,0x00,0x00,0x00]

From 24bb180e8aeae95cb830e5c3da73e750edaa139f Mon Sep 17 00:00:00 2001
From: Philip Reames
Date: Fri, 10 Jan 2025 07:12:24 -0800
Subject: [PATCH 063/408] [RISCV] Attempt to widen SEW before generic shuffle
 lowering (#122311)

This takes inspiration from AArch64, which does the same thing to assist
with zip/trn/etc. Doing this recursion unconditionally when the mask
allows is slightly questionable, but seems to work out okay in practice.
(For example, an e32 shuffle mask such as [0, 1, 6, 7] can be rewritten
as the e64 mask [0, 3].)

As a bit of context, it's helpful to realize that we have existing logic
in both DAGCombine and InstCombine which mutates the element width in an
analogous manner. However, that code has two restrictions which prevent
it from handling the motivating cases here. First, it only triggers if
there is a bitcast involving a different element type. Second, the
matcher used considers a partially undef wide element to be a non-match.
I considered trying to relax those assumptions, but the information loss
for undef in mid-level opt seemed more likely to open a can of worms
than I wanted.
---
 llvm/include/llvm/Analysis/VectorUtils.h      |  11 +-
 llvm/lib/Analysis/VectorUtils.cpp             |  35 ++
 .../Target/AArch64/AArch64ISelLowering.cpp    |  40 +-
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |  43 ++
 .../RISCV/rvv/fixed-vectors-int-shuffles.ll   |  46 +-
 .../rvv/fixed-vectors-interleaved-access.ll   | 516 ++++++++----------
 .../rvv/fixed-vectors-shuffle-reverse.ll      | 151 ++---
 .../RISCV/rvv/fixed-vectors-shuffle-rotate.ll |  24 +-
 8 files changed, 388 insertions(+), 478 deletions(-)

diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
index b72efac0a4887..a903eaa6cbe54 100644
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -235,10 +235,17 @@ void narrowShuffleMaskElts(int Scale, ArrayRef<int> Mask,
 bool widenShuffleMaskElts(int Scale, ArrayRef<int> Mask,
                           SmallVectorImpl<int> &ScaledMask);
 
+/// A variant of the previous method which is specialized for Scale=2, and
+/// treats -1 as undef and allows widening when a wider element is partially
+/// undef in the narrow form of the mask. This transformation discards
+/// information about which bytes in the original shuffle were undef.
+bool widenShuffleMaskElts(ArrayRef<int> M, SmallVectorImpl<int> &NewMask);
+
 /// Attempt to narrow/widen the \p Mask shuffle mask to the \p NumDstElts target
 /// width. Internally this will call narrowShuffleMaskElts/widenShuffleMaskElts.
-/// This will assert unless NumDstElts is a multiple of Mask.size (or vice-versa).
-/// Returns false on failure, and ScaledMask will be in an undefined state.
+/// This will assert unless NumDstElts is a multiple of Mask.size (or
+/// vice-versa). Returns false on failure, and ScaledMask will be in an
+/// undefined state.
 bool scaleShuffleMaskElts(unsigned NumDstElts, ArrayRef<int> Mask,
                           SmallVectorImpl<int> &ScaledMask);
 
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 6c2502ce21cca..b4b311cb727a1 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -479,6 +479,41 @@ bool llvm::widenShuffleMaskElts(int Scale, ArrayRef<int> Mask,
   return true;
 }
 
+bool llvm::widenShuffleMaskElts(ArrayRef<int> M,
+                                SmallVectorImpl<int> &NewMask) {
+  unsigned NumElts = M.size();
+  if (NumElts % 2 != 0)
+    return false;
+
+  NewMask.clear();
+  for (unsigned i = 0; i < NumElts; i += 2) {
+    int M0 = M[i];
+    int M1 = M[i + 1];
+
+    // If both elements are undef, new mask is undef too.
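+    // (Editorial note, not part of the original patch: together with the
+    // checks below, exactly three pair shapes are accepted -- (-1, -1)
+    // widens to -1, (-1, odd M1) widens to M1 / 2, and (even M0, M0 + 1
+    // or -1) widens to M0 / 2. Any other adjacent pair aborts the
+    // widening and the function returns false.)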
+    if (M0 == -1 && M1 == -1) {
+      NewMask.push_back(-1);
+      continue;
+    }
+
+    if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
+      NewMask.push_back(M1 / 2);
+      continue;
+    }
+
+    if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
+      NewMask.push_back(M0 / 2);
+      continue;
+    }
+
+    NewMask.clear();
+    return false;
+  }
+
+  assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
+  return true;
+}
+
 bool llvm::scaleShuffleMaskElts(unsigned NumDstElts, ArrayRef<int> Mask,
                                 SmallVectorImpl<int> &ScaledMask) {
   unsigned NumSrcElts = Mask.size();
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 5686ef5c25154..a9ab5b56f5755 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13723,44 +13723,6 @@ static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
   return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
 }
 
-// Return true if we can get a new shuffle mask by checking the parameter mask
-// array to test whether every two adjacent mask values are continuous and
-// starting from an even number.
-static bool isWideTypeMask(ArrayRef<int> M, EVT VT,
-                           SmallVectorImpl<int> &NewMask) {
-  unsigned NumElts = VT.getVectorNumElements();
-  if (NumElts % 2 != 0)
-    return false;
-
-  NewMask.clear();
-  for (unsigned i = 0; i < NumElts; i += 2) {
-    int M0 = M[i];
-    int M1 = M[i + 1];
-
-    // If both elements are undef, new mask is undef too.
-    if (M0 == -1 && M1 == -1) {
-      NewMask.push_back(-1);
-      continue;
-    }
-
-    if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
-      NewMask.push_back(M1 / 2);
-      continue;
-    }
-
-    if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
-      NewMask.push_back(M0 / 2);
-      continue;
-    }
-
-    NewMask.clear();
-    return false;
-  }
-
-  assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
-  return true;
-}
-
 // Try to widen element type to get a new mask value for a better permutation
 // sequence, so that we can use NEON shuffle instructions, such as zip1/2,
 // UZP1/2, TRN1/2, REV, INS, etc.
@@ -13787,7 +13749,7 @@ static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
     return SDValue();
 
   SmallVector<int> NewMask;
-  if (isWideTypeMask(Mask, VT, NewMask)) {
+  if (widenShuffleMaskElts(Mask, NewMask)) {
     MVT NewEltVT = VT.isFloatingPoint()
                        ? MVT::getFloatingPointVT(ElementSize * 2)
                        : MVT::getIntegerVT(ElementSize * 2);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 6c58989b1afb4..95f1deed8b6c0 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5261,6 +5261,39 @@ static SDValue lowerDisjointIndicesShuffle(ShuffleVectorSDNode *SVN,
   return DAG.getVectorShuffle(VT, DL, Select, DAG.getUNDEF(VT), NewMask);
 }
 
+/// Try to widen element type to get a new mask value for a better permutation
+/// sequence. This doesn't try to inspect the widened mask for profitability;
+/// we speculate the widened form is equal or better. This has the effect of
+/// reducing mask constant sizes - allowing cheaper materialization sequences
+/// - and index sequence sizes - reducing register pressure and materialization
+/// cost, at the cost of (possibly) an extra VTYPE toggle.
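+/// (Editorial example, not part of the original patch: with SEW=32, the
+/// mask [0, 1, 6, 7] widens to the SEW=64 mask [0, 3], and [2, -1, -1, 5]
+/// widens to [1, 2]; a mask like [1, 2, 3, 4] cannot be widened because
+/// (1, 2) is not an aligned (even, even + 1) pair.)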
+static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  MVT VT = Op.getSimpleValueType();
+  MVT ScalarVT = VT.getVectorElementType();
+  unsigned ElementSize = ScalarVT.getFixedSizeInBits();
+  SDValue V0 = Op.getOperand(0);
+  SDValue V1 = Op.getOperand(1);
+  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
+
+  // Avoid wasted work leading to isTypeLegal check failing below
+  if (ElementSize > 32)
+    return SDValue();
+
+  SmallVector<int> NewMask;
+  if (!widenShuffleMaskElts(Mask, NewMask))
+    return SDValue();
+
+  MVT NewEltVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(ElementSize * 2)
+                                      : MVT::getIntegerVT(ElementSize * 2);
+  MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(NewVT))
+    return SDValue();
+  V0 = DAG.getBitcast(NewVT, V0);
+  V1 = DAG.getBitcast(NewVT, V1);
+  return DAG.getBitcast(VT, DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
+}
+
 static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
                                    const RISCVSubtarget &Subtarget) {
   SDValue V1 = Op.getOperand(0);
@@ -5506,6 +5539,11 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
     if (SDValue V = lowerVECTOR_SHUFFLEAsRotate(SVN, DAG, Subtarget))
       return V;
 
+    // Before hitting generic lowering fallbacks, try to widen the mask
+    // to a wider SEW.
+    if (SDValue V = tryWidenMaskForShuffle(Op, DAG))
+      return V;
+
     // Can we generate a vcompress instead of a vrgather?  These scale better
     // at high LMUL, at the cost of not being able to fold a following select
     // into them.  The mask constants are also smaller than the index vector
@@ -5615,6 +5653,11 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
   if (SDValue V = lowerDisjointIndicesShuffle(SVN, DAG, Subtarget))
     return V;
 
+  // Before hitting generic lowering fallbacks, try to widen the mask
+  // to a wider SEW.
+  if (SDValue V = tryWidenMaskForShuffle(Op, DAG))
+    return V;
+
   // Try to pick a profitable operand order.
bool SwapOps = DAG.isSplatValue(V2) && !DAG.isSplatValue(V1); SwapOps = SwapOps ^ ShuffleVectorInst::isIdentityMask(ShuffleMaskRHS, NumElts); diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll index 8915603471ec7..2ffb079e83b0b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll @@ -603,10 +603,8 @@ define <8 x i8> @concat_4xi8_start_undef(<8 x i8> %v, <8 x i8> %w) { define <8 x i8> @concat_4xi8_start_undef_at_start(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: concat_4xi8_start_undef_at_start: ; CHECK: # %bb.0: -; CHECK-NEXT: li a0, -32 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vslideup.vi v8, v9, 4, v0.t +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 2 ; CHECK-NEXT: ret %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res @@ -704,8 +702,8 @@ define <8 x i32> @shuffle_v8i32_2(<8 x i32> %x, <8 x i32> %y) { ; CHECK-LABEL: shuffle_v8i32_2: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vmv.v.i v0, -13 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vmv.v.i v0, 13 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %s = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> @@ -756,9 +754,9 @@ define <8 x i16> @shuffle_compress_singlesrc_e16(<8 x i16> %v) { define <8 x i32> @shuffle_compress_singlesrc_e32(<8 x i32> %v) { ; CHECK-LABEL: shuffle_compress_singlesrc_e32: ; CHECK: # %bb.0: -; CHECK-NEXT: li a0, 115 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vmv.s.x v12, a0 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v12, 13 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vcompress.vm v10, v8, v12 ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret @@ -832,26 +830,16 @@ define <8 x i32> @shuffle_spread2_singlesrc_e32_index2(<8 x i32> %v) { } define <8 x i32> @shuffle_spread3_singlesrc_e32(<8 x i32> %v) { -; RV32-LABEL: shuffle_spread3_singlesrc_e32: -; RV32: # %bb.0: -; RV32-NEXT: lui a0, %hi(.LCPI57_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI57_0) -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vle16.v v12, (a0) -; RV32-NEXT: vrgatherei16.vv v10, v8, v12 -; RV32-NEXT: vmv.v.v v8, v10 -; RV32-NEXT: ret -; -; RV64-LABEL: shuffle_spread3_singlesrc_e32: -; RV64: # %bb.0: -; RV64-NEXT: lui a0, 32769 -; RV64-NEXT: slli a0, a0, 21 -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vmv.v.x v12, a0 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vrgatherei16.vv v10, v8, v12 -; RV64-NEXT: vmv.v.v v8, v10 -; RV64-NEXT: ret +; CHECK-LABEL: shuffle_spread3_singlesrc_e32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: li a0, 1 +; CHECK-NEXT: vslide1down.vx v12, v10, a0 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vrgatherei16.vv v10, v8, v12 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret %out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> ret <8 x i32> %out } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index 67d649902b022..0c7d7925edf39 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -183,498 +183,456 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 96 +; RV32-NEXT: li a3, 92 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: sub sp, sp, a2 -; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xe0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 96 * vlenb -; RV32-NEXT: addi a3, a1, 128 -; RV32-NEXT: addi a4, a1, 256 +; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xdc, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 92 * vlenb +; RV32-NEXT: addi a3, a1, 256 +; RV32-NEXT: addi a4, a1, 128 ; RV32-NEXT: li a2, 32 -; RV32-NEXT: li a5, 768 -; RV32-NEXT: lui a6, 12291 -; RV32-NEXT: lui a7, %hi(.LCPI8_1) -; RV32-NEXT: addi a7, a7, %lo(.LCPI8_1) +; RV32-NEXT: lui a5, 12291 +; RV32-NEXT: lui a6, %hi(.LCPI8_0) +; RV32-NEXT: addi a6, a6, %lo(.LCPI8_0) +; RV32-NEXT: li a7, 768 +; RV32-NEXT: lui t0, 49164 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vle32.v v16, (a1) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li t1, 76 +; RV32-NEXT: mul a1, a1, t1 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v8, (a4) -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li t0, 88 -; RV32-NEXT: mul a4, a4, t0 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vmv.s.x v0, a5 -; RV32-NEXT: vle32.v v24, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a4, 72 +; RV32-NEXT: li a4, 68 ; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: addi a5, a5, 3 +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vle16.v v6, (a6) +; RV32-NEXT: vmv.s.x v0, a5 +; RV32-NEXT: lui a1, %hi(.LCPI8_1) +; RV32-NEXT: addi a1, a1, %lo(.LCPI8_1) +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vmerge.vvm v16, v8, v16, v0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vrgatherei16.vv v24, v16, v6 +; RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: li a5, 52 +; RV32-NEXT: mul a4, a4, a5 +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 16 +; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vle32.v v16, (a3) +; RV32-NEXT: addi t0, t0, 12 +; RV32-NEXT: vmv.s.x v0, a7 +; RV32-NEXT: vmv.s.x v7, t0 +; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; RV32-NEXT: vle16.v v4, (a1) +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v24, v16, 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a3, 60 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a6, a6, 3 -; RV32-NEXT: vle16.v v4, (a7) -; RV32-NEXT: vmv.s.x v3, a6 -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v16, v8, 16 +; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 80 +; RV32-NEXT: li a3, 84 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill 
; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v8, v16, v8, v0 +; RV32-NEXT: vmerge.vvm v20, v24, v16, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 52 +; RV32-NEXT: li a3, 40 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v3 +; RV32-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v7 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a3, 76 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmerge.vvm v16, v8, v24, v0 -; RV32-NEXT: vrgatherei16.vv v8, v16, v4 +; RV32-NEXT: vmerge.vvm v24, v8, v16, v0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vrgatherei16.vv v8, v24, v4 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 56 +; RV32-NEXT: li a3, 44 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: li a1, 3 -; RV32-NEXT: lui a3, 49164 -; RV32-NEXT: lui a4, %hi(.LCPI8_3) -; RV32-NEXT: addi a4, a4, %lo(.LCPI8_3) +; RV32-NEXT: lui a3, 196656 +; RV32-NEXT: lui a4, %hi(.LCPI8_2) +; RV32-NEXT: addi a4, a4, %lo(.LCPI8_2) ; RV32-NEXT: slli a1, a1, 10 -; RV32-NEXT: addi a3, a3, 12 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vmv.s.x v0, a1 -; RV32-NEXT: vle16.v v16, (a4) -; RV32-NEXT: vmv.s.x v20, a3 +; RV32-NEXT: vle16.v v14, (a4) +; RV32-NEXT: vmv.s.x v12, a3 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 88 +; RV32-NEXT: li a3, 84 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vmv4r.v v8, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 80 +; RV32-NEXT: li a3, 60 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v8, v8, v24, v0 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; RV32-NEXT: vmerge.vvm v8, v24, v8, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 48 +; RV32-NEXT: li a3, 24 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v20 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vmv1r.v v0, v12 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 72 +; RV32-NEXT: li a3, 68 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmerge.vvm v24, v8, v24, v0 -; RV32-NEXT: vrgatherei16.vv v8, v24, v16 +; RV32-NEXT: vmerge.vvm v24, v24, v16, v0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vrgatherei16.vv v16, v24, v14 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 40 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v8, 
(a1) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, 3 -; RV32-NEXT: lui a3, 196656 +; RV32-NEXT: lui a3, 786624 ; RV32-NEXT: lui a4, 12 -; RV32-NEXT: lui a5, 786624 +; RV32-NEXT: lui a5, 768 ; RV32-NEXT: li a6, 48 -; RV32-NEXT: lui a7, 768 +; RV32-NEXT: lui a7, 3073 ; RV32-NEXT: li t0, 192 ; RV32-NEXT: addi a1, a1, 3 -; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: addi a3, a3, 192 ; RV32-NEXT: addi a4, a4, 12 -; RV32-NEXT: addi a5, a5, 192 -; RV32-NEXT: addi a7, a7, 768 +; RV32-NEXT: addi a5, a5, 768 +; RV32-NEXT: addi a7, a7, -1024 ; RV32-NEXT: vmv.s.x v1, a6 -; RV32-NEXT: vmv.s.x v8, t0 -; RV32-NEXT: addi a6, sp, 16 -; RV32-NEXT: vs1r.v v8, (a6) # Unknown-size Folded Spill +; RV32-NEXT: vmv.s.x v12, t0 ; RV32-NEXT: vmv.s.x v0, a1 -; RV32-NEXT: vmv.s.x v14, a3 -; RV32-NEXT: vmv.s.x v7, a4 -; RV32-NEXT: vmv.s.x v3, a5 -; RV32-NEXT: vmv.s.x v2, a7 +; RV32-NEXT: vmv.s.x v3, a3 +; RV32-NEXT: vmv.s.x v2, a4 +; RV32-NEXT: vmv.s.x v13, a5 +; RV32-NEXT: vmv.s.x v14, a7 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 80 +; RV32-NEXT: li a3, 60 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vmv4r.v v8, v24 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vmv4r.v v8, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 88 +; RV32-NEXT: li a3, 84 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v8, v8, v24, v0 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v14 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; RV32-NEXT: vmerge.vvm v20, v8, v16, v0 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v3 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 72 +; RV32-NEXT: li a3, 68 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a3, 76 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmerge.vvm v16, v8, v16, v0 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v7 +; RV32-NEXT: vmerge.vvm v24, v16, v24, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 88 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v2 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 80 +; RV32-NEXT: li a3, 84 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: 
vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v4, v24, v16, v0 -; RV32-NEXT: vmv1r.v v0, v3 +; RV32-NEXT: vmerge.vvm v24, v8, v24, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 72 +; RV32-NEXT: li a3, 12 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmerge.vvm v16, v8, v16, v0 +; RV32-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v13 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 28 +; RV32-NEXT: li a3, 76 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vmerge.vvm v24, v16, v24, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 88 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 80 +; RV32-NEXT: li a3, 84 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v16, v24, v16, v0 +; RV32-NEXT: vmerge.vvm v4, v8, v24, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 36 +; RV32-NEXT: li a3, 28 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v2 +; RV32-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v14 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 72 +; RV32-NEXT: li a3, 76 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmerge.vvm v16, v8, v16, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 20 +; RV32-NEXT: li a3, 68 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vmerge.vvm v16, v24, v16, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 88 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: li a2, 76 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v12 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 80 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: li a2, 84 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v8, v16, v8, v0 +; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 88 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: li a2, 68 +; 
RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: lui a1, %hi(.LCPI8_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_0) -; RV32-NEXT: lui a3, %hi(.LCPI8_5) -; RV32-NEXT: addi a3, a3, %lo(.LCPI8_5) -; RV32-NEXT: lui a4, 3073 -; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; RV32-NEXT: vle16.v v24, (a3) -; RV32-NEXT: addi a3, a4, -1024 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vle16.v v2, (a1) -; RV32-NEXT: vmv.s.x v0, a3 +; RV32-NEXT: lui a1, 32 +; RV32-NEXT: addi a1, a1, 4 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: li a2, 40 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vrgatherei16.vv v16, v8, v24 -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vrgatherei16.vv v20, v8, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a2, 52 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma +; RV32-NEXT: vmv.v.v v20, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 72 -; RV32-NEXT: mul a1, a1, a3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vmerge.vvm v16, v8, v16, v0 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: li a2, 84 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill +; RV32-NEXT: lui a1, 48 +; RV32-NEXT: lui a2, %hi(.LCPI8_3) +; RV32-NEXT: addi a2, a2, %lo(.LCPI8_3) +; RV32-NEXT: addi a1, a1, 5 +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vle16.v v28, (a2) +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v20, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 52 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vrgatherei16.vv v16, v12, v2 -; RV32-NEXT: lui a1, %hi(.LCPI8_2) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_2) -; RV32-NEXT: vle16.v v12, (a1) -; RV32-NEXT: lui a1, %hi(.LCPI8_4) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_4) -; RV32-NEXT: vle16.v v14, (a1) +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vrgatherei16.vv v8, v12, v20 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 56 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: li a2, 44 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v16, v24 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 80 -; RV32-NEXT: mul a1, a1, a3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v 
v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vmv.v.v v8, v16
 ; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 48
-; RV32-NEXT: mul a1, a1, a3
+; RV32-NEXT: li a2, 52
+; RV32-NEXT: mul a1, a1, a2
 ; RV32-NEXT: add a1, sp, a1
 ; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vrgatherei16.vv v24, v16, v12
+; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vrgatherei16.vv v24, v12, v28
 ; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 40
-; RV32-NEXT: mul a1, a1, a3
+; RV32-NEXT: slli a1, a1, 5
 ; RV32-NEXT: add a1, sp, a1
 ; RV32-NEXT: addi a1, a1, 16
 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma
+; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma
 ; RV32-NEXT: vmv.v.v v24, v16
+; RV32-NEXT: lui a1, %hi(.LCPI8_4)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI8_4)
+; RV32-NEXT: lui a2, %hi(.LCPI8_5)
+; RV32-NEXT: addi a2, a2, %lo(.LCPI8_5)
+; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; RV32-NEXT: vle16.v v12, (a1)
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vle16.v v28, (a2)
+; RV32-NEXT: lui a1, %hi(.LCPI8_6)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI8_6)
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vle16.v v30, (a1)
 ; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 72
-; RV32-NEXT: mul a1, a1, a3
+; RV32-NEXT: slli a1, a1, 4
 ; RV32-NEXT: add a1, sp, a1
 ; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vrgatherei16.vv v16, v0, v12
 ; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: li a2, 12
+; RV32-NEXT: mul a1, a1, a2
 ; RV32-NEXT: add a1, sp, a1
 ; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vrgatherei16.vv v20, v16, v14
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vrgatherei16.vv v12, v20, v28
 ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma
-; RV32-NEXT: vmv.v.v v20, v8
+; RV32-NEXT: vmv.v.v v12, v16
 ; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 6
+; RV32-NEXT: slli a1, a1, 2
 ; RV32-NEXT: add a1, sp, a1
 ; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: lui a1, %hi(.LCPI8_6)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI8_6)
-; RV32-NEXT: lui a3, %hi(.LCPI8_7)
-; RV32-NEXT: addi a3, a3, %lo(.LCPI8_7)
-; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; RV32-NEXT: vle16.v v20, (a3)
-; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; RV32-NEXT: vle16.v v16, (a1)
+; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vrgatherei16.vv v16, v0, v30
+; RV32-NEXT: lui a1, %hi(.LCPI8_7)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI8_7)
+; RV32-NEXT: lui a2, %hi(.LCPI8_8)
+; RV32-NEXT: addi a2, a2, %lo(.LCPI8_8)
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vle16.v v20, (a1)
 ; RV32-NEXT: lui a1, %hi(.LCPI8_9)
 ; RV32-NEXT: addi a1, a1, %lo(.LCPI8_9)
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vle16.v v0, (a1)
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 28
-; RV32-NEXT: mul a1, a1, a3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vrgatherei16.vv v8, v24, v20
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vrgatherei16.vv v20, v4, v16
-; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma
-; RV32-NEXT: vmv.v.v v20, v8
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 20
-; RV32-NEXT: mul a1, a1, a3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vrgatherei16.vv v8, v24, v0
-; RV32-NEXT: lui a1, %hi(.LCPI8_8)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI8_8)
-; RV32-NEXT: lui a3, %hi(.LCPI8_10)
-; RV32-NEXT: addi a3, a3, %lo(.LCPI8_10)
 ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; RV32-NEXT: vle16.v v12, (a1)
-; RV32-NEXT: lui a1, %hi(.LCPI8_11)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI8_11)
-; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; RV32-NEXT: vle16.v v16, (a1)
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vle16.v v14, (a3)
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 52
-; RV32-NEXT: mul a1, a1, a3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs2r.v v14, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vle16.v v8, (a2)
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vle16.v v10, (a1)
 ; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 36
-; RV32-NEXT: mul a1, a1, a3
+; RV32-NEXT: li a2, 28
+; RV32-NEXT: mul a1, a1, a2
 ; RV32-NEXT: add a1, sp, a1
 ; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vrgatherei16.vv v0, v4, v12
+; RV32-NEXT: vl4r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vrgatherei16.vv v28, v0, v20
 ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma
-; RV32-NEXT: vmv.v.v v0, v8
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vrgatherei16.vv v8, v24, v16
+; RV32-NEXT: vmv.v.v v28, v16
 ; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 56
+; RV32-NEXT: li a2, 76
 ; RV32-NEXT: mul a1, a1, a2
 ; RV32-NEXT: add a1, sp, a1
 ; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vrgatherei16.vv v16, v0, v8
 ; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 88
+; RV32-NEXT: li a2, 60
 ; RV32-NEXT: mul a1, a1, a2
 ; RV32-NEXT: add a1, sp, a1
 ; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 52
+; RV32-NEXT: li a2, 68
 ; RV32-NEXT: mul a1, a1, a2
 ; RV32-NEXT: add a1, sp, a1
 ; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl2r.v v12, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vrgatherei16.vv v8, v16, v12
+; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vrgatherei16.vv v16, v4, v10
 ; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 56
+; RV32-NEXT: li a2, 60
 ; RV32-NEXT: mul a1, a1, a2
 ; RV32-NEXT: add a1, sp, a1
 ; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma
-; RV32-NEXT: vmv.v.v v8, v24
+; RV32-NEXT: vmv.v.v v16, v0
 ; RV32-NEXT: addi a1, a0, 320
 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vse32.v v8, (a1)
+; RV32-NEXT: vse32.v v16, (a1)
 ; RV32-NEXT: addi a1, a0, 256
-; RV32-NEXT: vse32.v v0, (a1)
+; RV32-NEXT: vse32.v v28, (a1)
 ; RV32-NEXT: addi a1, a0, 192
-; RV32-NEXT: vse32.v v20, (a1)
+; RV32-NEXT: vse32.v v12, (a1)
 ; RV32-NEXT: addi a1, a0, 128
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 6
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vse32.v v8, (a1)
+; RV32-NEXT: vse32.v v24, (a1)
 ; RV32-NEXT: addi a1, a0, 64
 ; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a3, 72
+; RV32-NEXT: li a3, 52
 ; RV32-NEXT: mul a2, a2, a3
 ; RV32-NEXT: add a2, sp, a2
 ; RV32-NEXT: addi a2, a2, 16
 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT: vse32.v v8, (a1)
 ; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 80
+; RV32-NEXT: li a2, 84
 ; RV32-NEXT: mul a1, a1, a2
 ; RV32-NEXT: add a1, sp, a1
 ; RV32-NEXT: addi a1, a1, 16
 ; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT: vse32.v v8, (a0)
 ; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 96
+; RV32-NEXT: li a1, 92
 ; RV32-NEXT: mul a0, a0, a1
 ; RV32-NEXT: add sp, sp, a0
 ; RV32-NEXT: .cfi_def_cfa sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll
index f7647ff38c8a0..5fd7e47507f71 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll
@@ -771,135 +771,43 @@ define <8 x double> @reverse_v8f64(<8 x double> %a) {
 
 define <3 x i64> @reverse_v3i64(<3 x i64> %a) {
-; RV32-LABEL: reverse_v3i64:
-; RV32: # %bb.0:
-; RV32-NEXT: lui a0, %hi(.LCPI44_0)
-; RV32-NEXT: addi a0, a0, %lo(.LCPI44_0)
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vle16.v v12, (a0)
-; RV32-NEXT: vrgatherei16.vv v10, v8, v12
-; RV32-NEXT: vmv.v.v v8, v10
-; RV32-NEXT: ret
-;
-; RV64-LABEL: reverse_v3i64:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; RV64-NEXT: vid.v v10
-; RV64-NEXT: vrsub.vi v12, v10, 2
-; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; RV64-NEXT: vrgatherei16.vv v10, v8, v12
-; RV64-NEXT: vmv.v.v v8, v10
-; RV64-NEXT: ret
-;
-; RV32-ZVBB-LABEL: reverse_v3i64:
-; RV32-ZVBB: # %bb.0:
-; RV32-ZVBB-NEXT: lui a0, %hi(.LCPI44_0)
-; RV32-ZVBB-NEXT: addi a0, a0, %lo(.LCPI44_0)
-; RV32-ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-ZVBB-NEXT: vle16.v v12, (a0)
-; RV32-ZVBB-NEXT: vrgatherei16.vv v10, v8, v12
-; RV32-ZVBB-NEXT: vmv.v.v v8, v10
-; RV32-ZVBB-NEXT: ret
-;
-; RV64-ZVBB-LABEL: reverse_v3i64:
-; RV64-ZVBB: # %bb.0:
-; RV64-ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; RV64-ZVBB-NEXT: vid.v v10
-; RV64-ZVBB-NEXT: vrsub.vi v12, v10, 2
-; RV64-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; RV64-ZVBB-NEXT: vrgatherei16.vv v10, v8, v12
-; RV64-ZVBB-NEXT: vmv.v.v v8, v10
-; RV64-ZVBB-NEXT: ret
+; CHECK-LABEL: reverse_v3i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT: vid.v v10
+; CHECK-NEXT: vrsub.vi v12, v10, 2
+; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-NEXT: vrgatherei16.vv v10, v8, v12
+; CHECK-NEXT: vmv.v.v v8, v10
+; CHECK-NEXT: ret
   %res = shufflevector <3 x i64> %a, <3 x i64> poison, <3 x i32> <i32 2, i32 1, i32 0>
   ret <3 x i64> %res
 }
 
 define <6 x i64> @reverse_v6i64(<6 x i64> %a) {
-; RV32-LABEL: reverse_v6i64:
-; RV32: # %bb.0:
-; RV32-NEXT: lui a0, %hi(.LCPI45_0)
-; RV32-NEXT: addi a0, a0, %lo(.LCPI45_0)
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vle16.v v16, (a0)
-; RV32-NEXT: vrgatherei16.vv v12, v8, v16
-; RV32-NEXT: vmv.v.v v8, v12
-; RV32-NEXT: ret
-;
-; RV64-LABEL: reverse_v6i64:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV64-NEXT: vid.v v12
-; RV64-NEXT: vrsub.vi v16, v12, 5
-; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; RV64-NEXT: vrgatherei16.vv v12, v8, v16
-; RV64-NEXT: vmv.v.v v8, v12
-; RV64-NEXT: ret
-;
-; RV32-ZVBB-LABEL: reverse_v6i64:
-; RV32-ZVBB: # %bb.0:
-; RV32-ZVBB-NEXT: lui a0, %hi(.LCPI45_0)
-; RV32-ZVBB-NEXT: addi a0, a0, %lo(.LCPI45_0)
-; RV32-ZVBB-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-ZVBB-NEXT: vle16.v v16, (a0)
-; RV32-ZVBB-NEXT: vrgatherei16.vv v12, v8, v16
-; RV32-ZVBB-NEXT: vmv.v.v v8, v12
-; RV32-ZVBB-NEXT: ret
-;
-; RV64-ZVBB-LABEL: reverse_v6i64:
-; RV64-ZVBB: # %bb.0:
-; RV64-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV64-ZVBB-NEXT: vid.v v12
-; RV64-ZVBB-NEXT: vrsub.vi v16, v12, 5
-; RV64-ZVBB-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; RV64-ZVBB-NEXT: vrgatherei16.vv v12, v8, v16
-; RV64-ZVBB-NEXT: vmv.v.v v8, v12
-; RV64-ZVBB-NEXT: ret
+; CHECK-LABEL: reverse_v6i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT: vid.v v12
+; CHECK-NEXT: vrsub.vi v16, v12, 5
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vrgatherei16.vv v12, v8, v16
+; CHECK-NEXT: vmv.v.v v8, v12
+; CHECK-NEXT: ret
   %res = shufflevector <6 x i64> %a, <6 x i64> poison, <6 x i32> <i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <6 x i64> %res
 }
 
 define <12 x i64> @reverse_v12i64(<12 x i64> %a) {
-; RV32-LABEL: reverse_v12i64:
-; RV32: # %bb.0:
-; RV32-NEXT: li a0, 32
-; RV32-NEXT: lui a1, %hi(.LCPI46_0)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI46_0)
-; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; RV32-NEXT: vle16.v v24, (a1)
-; RV32-NEXT: vrgatherei16.vv v16, v8, v24
-; RV32-NEXT: vmv.v.v v8, v16
-; RV32-NEXT: ret
-;
-; RV64-LABEL: reverse_v12i64:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT: vid.v v16
-; RV64-NEXT: vrsub.vi v24, v16, 11
-; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; RV64-NEXT: vrgatherei16.vv v16, v8, v24
-; RV64-NEXT: vmv.v.v v8, v16
-; RV64-NEXT: ret
-;
-; RV32-ZVBB-LABEL: reverse_v12i64:
-; RV32-ZVBB: # %bb.0:
-; RV32-ZVBB-NEXT: li a0, 32
-; RV32-ZVBB-NEXT: lui a1, %hi(.LCPI46_0)
-; RV32-ZVBB-NEXT: addi a1, a1, %lo(.LCPI46_0)
-; RV32-ZVBB-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; RV32-ZVBB-NEXT: vle16.v v24, (a1)
-; RV32-ZVBB-NEXT: vrgatherei16.vv v16, v8, v24
-; RV32-ZVBB-NEXT: vmv.v.v v8, v16
-; RV32-ZVBB-NEXT: ret
-;
-; RV64-ZVBB-LABEL: reverse_v12i64:
-; RV64-ZVBB: # %bb.0:
-; RV64-ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; RV64-ZVBB-NEXT: vid.v v16
-; RV64-ZVBB-NEXT: vrsub.vi v24, v16, 11
-; RV64-ZVBB-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; RV64-ZVBB-NEXT: vrgatherei16.vv v16, v8, v24
-; RV64-ZVBB-NEXT: vmv.v.v v8, v16
-; RV64-ZVBB-NEXT: ret
+; CHECK-LABEL: reverse_v12i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: vid.v v16
+; CHECK-NEXT: vrsub.vi v24, v16, 11
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-NEXT: vrgatherei16.vv v16, v8, v24
+; CHECK-NEXT: vmv.v.v v8, v16
+; CHECK-NEXT: ret
   %res = shufflevector <12 x i64> %a, <12 x i64> poison, <12 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <12 x i64> %res
 }
 
@@ -1512,3 +1420,8 @@ define <16 x i32> @reverse_v16i32_exact_vlen_256(<16 x i32> %a) vscale_range(4,
   %res = shufflevector <16 x i32> %a, <16 x i32> poison, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <16 x i32> %res
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; RV32: {{.*}}
+; RV32-ZVBB: {{.*}}
+; RV64: {{.*}}
+; RV64-ZVBB: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll
index 02355d331e13f..464b4eca35aba 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll
@@ -556,12 +556,14 @@ define <8 x i16> @shuffle_v8i16_as_i64_32(<8 x i16> %v) {
 ;
 ; ZVKB-ZVE32X-LABEL: shuffle_v8i16_as_i64_32:
 ; ZVKB-ZVE32X: # %bb.0:
-; ZVKB-ZVE32X-NEXT: lui a0, %hi(.LCPI20_0)
-; ZVKB-ZVE32X-NEXT: addi a0, a0, %lo(.LCPI20_0)
-; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m2, ta, ma
-; ZVKB-ZVE32X-NEXT: vle8.v v10, (a0)
+; ZVKB-ZVE32X-NEXT: lui a0, 8240
+; ZVKB-ZVE32X-NEXT: addi a0, a0, 1
+; ZVKB-ZVE32X-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; ZVKB-ZVE32X-NEXT: vmv.s.x v10, a0
+; ZVKB-ZVE32X-NEXT: vsetivli zero, 4, e16, m1, ta, ma
 ; ZVKB-ZVE32X-NEXT: vsext.vf2 v12, v10
-; ZVKB-ZVE32X-NEXT: vrgather.vv v10, v8, v12
+; ZVKB-ZVE32X-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVKB-ZVE32X-NEXT: vrgatherei16.vv v10, v8, v12
 ; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10
 ; ZVKB-ZVE32X-NEXT: ret
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
@@ -765,12 +767,14 @@ define <8 x half> @shuffle_v8f16_as_i64_32(<8 x half> %v) {
 ;
 ; ZVKB-ZVE32X-LABEL: shuffle_v8f16_as_i64_32:
 ; ZVKB-ZVE32X: # %bb.0:
-; ZVKB-ZVE32X-NEXT: lui a0, %hi(.LCPI25_0)
-; ZVKB-ZVE32X-NEXT: addi a0, a0, %lo(.LCPI25_0)
-; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m2, ta, ma
-; ZVKB-ZVE32X-NEXT: vle8.v v10, (a0)
+; ZVKB-ZVE32X-NEXT: lui a0, 8240
+; ZVKB-ZVE32X-NEXT: addi a0, a0, 1
+; ZVKB-ZVE32X-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; ZVKB-ZVE32X-NEXT: vmv.s.x v10, a0
+; ZVKB-ZVE32X-NEXT: vsetivli zero, 4, e16, m1, ta, ma
 ; ZVKB-ZVE32X-NEXT: vsext.vf2 v12, v10
-; ZVKB-ZVE32X-NEXT: vrgather.vv v10, v8, v12
+; ZVKB-ZVE32X-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVKB-ZVE32X-NEXT: vrgatherei16.vv v10, v8, v12
 ; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10
 ; ZVKB-ZVE32X-NEXT: ret
   %shuffle = shufflevector <8 x half> %v, <8 x half> poison, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
From 35a392553d790064566d4430f249b1e740052dfa Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Fri, 10 Jan 2025 15:13:53 +0000
Subject: [PATCH 064/408] [X86] widenSubVector - widen from smaller build
 vector if the upper elements are already the same padding elements (#122445)

Further simplifies some shuffle masks to help additional combines
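For illustration only, the new fast path boils down to a check like the
following standalone sketch (simplified names; `Ops` stands in for the
build vector's operand list, and the real code works on SDValues in the
SelectionDAG):

    // Sketch: a 256-bit BUILD_VECTOR whose upper half is all undef (or all
    // zero when zero-widening) can be widened from its low 128-bit half.
    static bool upperHalfIsPadding(ArrayRef<SDValue> Ops, bool ZeroNewElements) {
      ArrayRef<SDValue> Hi = Ops.drop_front(Ops.size() / 2);
      return llvm::all_of(Hi, [&](SDValue V) {
        return V.isUndef() || (ZeroNewElements && X86::isZeroNode(V));
      });
    }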
---
 llvm/lib/Target/X86/X86ISelLowering.cpp         | 15 +++++++++++++--
 llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll   |  4 ++--
 .../CodeGen/X86/vector-shuffle-combining-avx.ll |  3 +--
 3 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6b0eb38e7e095..fbfcfc700ed62 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4144,9 +4144,20 @@ static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
 static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
                               const X86Subtarget &Subtarget, SelectionDAG &DAG,
                               const SDLoc &dl) {
-  assert(Vec.getValueSizeInBits().getFixedValue() <= VT.getFixedSizeInBits() &&
-         Vec.getValueType().getScalarType() == VT.getScalarType() &&
+  EVT VecVT = Vec.getValueType();
+  assert(VecVT.getFixedSizeInBits() <= VT.getFixedSizeInBits() &&
+         VecVT.getScalarType() == VT.getScalarType() &&
          "Unsupported vector widening type");
+  // If the upper 128-bits of a build vector are already undef/zero, then try to
+  // widen from the lower 128-bits.
+  if (Vec.getOpcode() == ISD::BUILD_VECTOR && VecVT.is256BitVector()) {
+    unsigned NumSrcElts = VecVT.getVectorNumElements();
+    ArrayRef<SDUse> Hi = Vec->ops().drop_front(NumSrcElts / 2);
+    if (all_of(Hi, [&](SDValue V) {
+          return V.isUndef() || (ZeroNewElements && X86::isZeroNode(V));
+        }))
+      Vec = extract128BitVector(Vec, 0, DAG, dl);
+  }
   SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
                                 : DAG.getUNDEF(VT);
   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index 445468d06fb04..e7557134b1486 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -442,7 +442,7 @@ define <4 x double> @PR34175(ptr %p) {
 ;
 ; AVX512BW-LABEL: PR34175:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,8,32,40,0,8,32,40,0,8,32,40,0,8,32,40]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,8,32,40,0,0,0,0]
 ; AVX512BW-NEXT: vmovdqu (%rdi), %ymm1
 ; AVX512BW-NEXT: vmovdqu 32(%rdi), %ymm2
 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
@@ -461,7 +461,7 @@ define <4 x double> @PR34175(ptr %p) {
 ;
 ; AVX512VBMI-LABEL: PR34175:
 ; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,8,32,40,0,8,32,40,0,8,32,40,0,8,32,40]
+; AVX512VBMI-NEXT: vmovq {{.*#+}} xmm0 = [0,8,32,40,0,0,0,0]
 ; AVX512VBMI-NEXT: vmovdqu (%rdi), %ymm1
 ; AVX512VBMI-NEXT: vmovdqu 32(%rdi), %ymm2
 ; AVX512VBMI-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
index 81ce14132c879..05071064fc60e 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -598,8 +598,7 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n
 ; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm3 = [0,3,10,1]
 ; X64-AVX512-NEXT: vpermi2pd %zmm0, %zmm4, %zmm3
 ; X64-AVX512-NEXT: vmovapd %ymm3, (%rsi)
-; X64-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [3,11,3,11]
-; X64-AVX512-NEXT: # ymm3 = mem[0,1,0,1]
+; X64-AVX512-NEXT: vmovapd {{.*#+}} xmm3 = [3,11]
 ; X64-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm3
 ; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm0 = [2,8,9,3]
 ; X64-AVX512-NEXT: vpermi2pd %zmm3, %zmm2, %zmm0
From d6b6598e8075a5ba0271ee06a20c5a5609c0ec37 Mon Sep 17 00:00:00 2001
From: thebrandre
Date: Fri, 10 Jan 2025 16:31:58 +0100
Subject: [PATCH 065/408] [clang] Fix implicit integer conversion for opaque
 enums declared in class templates (#121039)

This commit fixes issues where enumeration types instantiated from an
opaque-enum-declaration (see [dcl.enum]) in class templates broke basic
assumptions during parsing of arithmetic expressions, due to absent (NULL
TYPE) promotion types on instances of EnumDecl. To this end, we repeat the
simple steps in `Sema::ActOnTag` to evaluate the promotion type of a fixed
enumeration based on its underlying type (see C++11 [conv.prom] p4).

Note that if, instead, a full *enum-specifier* (subsequent curly braces) is
provided, `Sema::ActOnEnumBody` is re-invoked on template instantiation
anyway, overriding the promotion type and hiding the issue. This is
analogous to how enumeration declarations outside of template declarations
are handled. Note that, in contrast to `Sema::ActOnEnumBody`,
`Sema::ActOnTag` is *not* called again for the instantiated enumeration
type.

Fixes #117960.

---------

Co-authored-by: cor3ntin
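The crashing pattern reduces to a single arithmetic use of an instantiated
opaque enum; this sketch mirrors the Issue117960 reproducer included in the
test added below:

    // Compile with -std=c++11 or later; crashed clang 19 and earlier.
    template <typename T>
    struct A {
      enum E : T; // opaque-enum-declaration, dependent fixed underlying type
    };

    // Integral promotion of A<int>::E needs EnumDecl::PromotionType to be
    // set on the instantiated enum; previously it was left null.
    int b = A<int>::E{} + 0;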
---
 clang/docs/ReleaseNotes.rst                   |   3 +
 .../lib/Sema/SemaTemplateInstantiateDecl.cpp  |  13 ++
 ...que-enum-declaration-in-class-template.cpp | 214 ++++++++++++++++++
 3 files changed, 230 insertions(+)
 create mode 100644 clang/test/SemaCXX/opaque-enum-declaration-in-class-template.cpp

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 511a28c5554bb..440b045399d99 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -958,6 +958,9 @@ Miscellaneous Clang Crashes Fixed
 - Fixed internal assertion firing when a declaration in the implicit global
   module is found through ADL. (GH#109879)
 
+- Fixed a crash when an unscoped enumeration declared by an opaque-enum-declaration within a class template
+  with a dependent underlying type is subject to integral promotion. (#GH117960)
+
 OpenACC Specific Changes
 ------------------------
 
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index e058afe81da58..6a2331e59477a 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -1571,6 +1571,19 @@ Decl *TemplateDeclInstantiator::VisitEnumDecl(EnumDecl *D) {
       Enum->setIntegerType(SemaRef.Context.IntTy);
     else
       Enum->setIntegerTypeSourceInfo(NewTI);
+
+    // C++23 [conv.prom]p4
+    // if integral promotion can be applied to its underlying type, a prvalue
+    // of an unscoped enumeration type whose underlying type is fixed can also
+    // be converted to a prvalue of the promoted underlying type.
+    //
+    // FIXME: that logic is already implemented in ActOnEnumBody, factor out
+    // into (Re)BuildEnumBody.
+    QualType UnderlyingType = Enum->getIntegerType();
+    Enum->setPromotionType(
+        SemaRef.Context.isPromotableIntegerType(UnderlyingType)
+            ? SemaRef.Context.getPromotedIntegerType(UnderlyingType)
+            : UnderlyingType);
   } else {
     assert(!D->getIntegerType()->isDependentType() &&
            "Dependent type without type source info");
diff --git a/clang/test/SemaCXX/opaque-enum-declaration-in-class-template.cpp b/clang/test/SemaCXX/opaque-enum-declaration-in-class-template.cpp
new file mode 100644
index 0000000000000..7101a153c6ebb
--- /dev/null
+++ b/clang/test/SemaCXX/opaque-enum-declaration-in-class-template.cpp
@@ -0,0 +1,214 @@
+// RUN: %clang_cc1 -std=c++11 -Wredeclared-class-member -Wconstant-conversion -Wdeprecated-declarations -Wc++11-narrowing -fsyntax-only %s -verify
+// RUN: %clang_cc1 -std=c++14 -Wredeclared-class-member -Wconstant-conversion -Wdeprecated-declarations -Wc++11-narrowing -fsyntax-only %s -verify
+// RUN: %clang_cc1 -std=c++20 -Wredeclared-class-member -Wconstant-conversion -Wdeprecated-declarations -Wc++11-narrowing -fsyntax-only %s -verify
+
+// Test that opaque-enum-declarations are handled correctly w.r.t integral promotions.
+// The key sections in the C++11 standard are:
+// C++11 [dcl.enum]p3: An enumeration declared by an opaque-enum-declaration
+// has a fixed underlying type and is a complete type.
+// C++11 [conv.prom]: A prvalue of an unscoped enumeration type whose underlying type
+// is fixed ([dcl.enum]) can be converted to a prvalue of its underlying type.
+
+// This program causes clang 19 and earlier to crash because
+// EnumDecl::PromotionType has not been set on the instantiated enum.
+// See GitHub Issue #117960.
+namespace Issue117960 {
+template <typename T>
+struct A {
+  enum E : T;
+};
+
+int b = A<int>::E{} + 0;
+}
+
+
+namespace test {
+template <typename T1, typename T2>
+struct IsSame {
+  static constexpr bool check() { return false; }
+};
+
+template <typename T>
+struct IsSame<T, T> {
+  static constexpr bool check() { return true; }
+};
+} // namespace test
+
+
+template <typename T>
+struct S1 {
+  enum E : T;
+};
+// checks if EnumDecl::PromotionType is set
+int X1 = S1<int>::E{} + 0;
+int Y1 = S1<unsigned>::E{} + 0;
+static_assert(test::IsSame<decltype(S1<int>::E{}+0), int>::check(), "");
+static_assert(test::IsSame<decltype(S1<unsigned>::E{}+0), unsigned>::check(), "");
+char Z1 = S1<unsigned>::E(-1) + 0; // expected-warning{{implicit conversion from 'unsigned int' to 'char'}}
+
+template <typename Traits>
+struct S2 {
+  enum E : typename Traits::IntegerType;
+};
+
+template <typename T>
+struct Traits {
+  typedef T IntegerType;
+};
+
+int X2 = S2<Traits<int>>::E{} + 0;
+int Y2 = S2<Traits<unsigned>>::E{} + 0;
+static_assert(test::IsSame<decltype(S2<Traits<int>>::E{}+0), int>::check(), "");
+static_assert(test::IsSame<decltype(S2<Traits<unsigned>>::E{}+0), unsigned>::check(), "");
+// C++11 [conv.prom]p4:
+// A prvalue of an unscoped enumeration type whose underlying type is fixed can be converted to a
+// prvalue of its underlying type. Moreover, if integral promotion can be applied to its underlying type, a
+// prvalue of an unscoped enumeration type whose underlying type is fixed can also be converted to a prvalue
+// of the promoted underlying type.
+static_assert(test::IsSame<decltype(S2<Traits<char>>::E{}+char(0)), int>::check(), "");
+
+
+template <typename T>
+struct S3 {
+  enum E : unsigned;
+};
+
+int X3 = S3<int>::E{} + 0;
+
+// fails in clang 19 and earlier (see the discussion on GitHub Issue #117960):
+static_assert(test::IsSame<decltype(S3<int>::E{}+0), unsigned>::check(), "");
+
+template <typename T>
+struct S4 {
+  enum E1 : char;
+  enum E2 : T;
+};
+
+int X4 = S4<int>::E1{} + '\0';
+int Y4 = S4<int>::E2{} + '\0';
+
+template <typename T>
+struct S5 {
+  enum class E1 : char;
+  enum class E2 : T;
+};
+
+int X5 = S5<int>::E1{} + '\0'; // expected-error{{invalid operands to binary expression}}
+int Y5 = S5<int>::E2{} + '\0'; // expected-error{{invalid operands to binary expression}}
+
+
+template <typename T>
+struct S6 {
+  enum E1 : T;
+  enum E2 : E1; // expected-error{{invalid underlying type}}
+};
+
+template struct S6<int>; // expected-note{{in instantiation of template class 'S6<int>' requested here}}
+
+
+template <typename T>
+struct S7 {
+  enum E : T;
+  enum E : T { X, Y, Z }; // expected-note{{previous declaration is here}}
+  enum E : T; // expected-warning{{class member cannot be redeclared}}
+};
+
+template struct S7<int>;
+
+template <typename T>
+struct S8 {
+  enum E : char;
+  enum E : char { X, Y, Z }; // expected-note{{previous declaration is here}}
+  enum E : char; // expected-warning{{class member cannot be redeclared}}
+};
+
+template struct S8<int>;
+
+template <typename T>
+struct S9 {
+  enum class E1 : T;
+  enum class E1 : T { X, Y, Z }; // expected-note{{previous declaration is here}}
+  enum class E1 : T; // expected-warning{{class member cannot be redeclared}}
+  enum class E2 : char;
+  enum class E2 : char { X, Y, Z }; // expected-note{{previous declaration is here}}
+  enum class E2 : char; // expected-warning{{class member cannot be redeclared}}
+};
+
+template struct S9<int>;
+
+#if defined(__cplusplus) && __cplusplus >= 201402L
+template <typename T>
+struct S10 {
+  enum [[deprecated("for reasons")]] E : T; // expected-note{{explicitly marked deprecated here}}
+};
+
+int X10 = S10<int>::E{} + 0; // expected-warning{{deprecated: for reasons}}
+#endif
+
+template <typename T>
+struct S11 {};
+
+template <>
+struct S11<int> {
+  enum E : unsigned;
+};
+
+unsigned X11 = S11<int>::E{} + 0u;
+
+#if defined(__cplusplus) && __cplusplus >= 201402L
+template <typename T>
+struct S12 {
+  enum [[deprecated("for reasons")]] E1 : T; // expected-note{{explicitly marked deprecated here}}
+  enum [[deprecated("for reasons")]] E2 : T;
+};
+
+template <>
+struct S12<int> {
+  enum E1 : unsigned;
+  enum E2 : unsigned;
+};
+
+unsigned X12 = S12<int>::E1{} + 0u;
+unsigned Y12 = S12<int>::E2{} + 0u;
+int Z12 = S12<char>::E1{} + 0; // expected-warning{{deprecated: for reasons}}
+#endif
+
+template <typename T>
+struct S13 {
+  enum __attribute__((packed)) E { X, Y };
+};
+
+static_assert(sizeof(S13<int>::E) == 1, "");
+
+template <typename T>
+struct S14 {
+  enum E : float; // expected-error {{invalid underlying type}}
+};
+
+template <typename T>
+struct S15 {
+  enum E : T; // expected-error {{invalid underlying type}}
+};
+
+template struct S15<float>; // expected-note {{in instantiation of template class 'S15<float>' requested here}}
+
+
+
+
+template <typename T>
+int f1() {
+  enum E : T;
+  return E{} + 0;
+}
+
+int F1 = f1<int>();
+
+template <typename T>
+int f2() {
+  struct LocalClass {
+    enum E : T;
+  };
+  return typename LocalClass::E{} + 0;
+}
+
+int F2 = f2<int>();
From aee51b4d75089b4e7d9eb20877e2adbf6adea999 Mon Sep 17 00:00:00 2001
From: Congcong Cai
Date: Fri, 10 Jan 2025 23:42:29 +0800
Subject: [PATCH 066/408] [clang-tidy][NFC] optimize cache for config option
 (#121406)

The current implementation caches an `OptionsSource` for each path, which
creates many copies of `OptionsSource` when a project has a deeply nested
folder structure. The new implementation uses a vector to store each
`OptionsSource` and only caches its index. This reduces memory usage and
avoids meaningless copies.
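A minimal standalone sketch of the caching scheme (simplified types for
illustration; the real code uses llvm::StringMap and memoizes one index per
ancestor directory of the source file):

    #include <cstddef>
    #include <string>
    #include <unordered_map>
    #include <utility>
    #include <vector>

    // Store each parsed config once; memoize path -> index instead of
    // copying the config into every intermediate directory entry.
    template <typename Options>
    struct OptionsIndexCache {
      std::unordered_map<std::string, std::size_t> Memorized;
      std::vector<Options> Storage;

      const Options *lookup(const std::string &Path) const {
        auto It = Memorized.find(Path);
        return It == Memorized.end() ? nullptr : &Storage[It->second];
      }

      // All intermediate paths share the same stored copy.
      void insert(const std::vector<std::string> &Paths, Options O) {
        std::size_t Index = Storage.size();
        Storage.push_back(std::move(O));
        for (const std::string &P : Paths)
          Memorized[P] = Index;
      }
    };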
---
 .../clang-tidy/ClangTidyOptions.cpp           | 47 ++++++++++---------
 .../clang-tidy/ClangTidyOptions.h             |  5 +-
 2 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp b/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp
index 445c7f85c900c..e1d5df75f3e5a 100644
--- a/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp
+++ b/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp
@@ -337,33 +337,34 @@ FileOptionsBaseProvider::FileOptionsBaseProvider(
 void FileOptionsBaseProvider::addRawFileOptions(
     llvm::StringRef AbsolutePath, std::vector<OptionsSource> &CurOptions) {
   auto CurSize = CurOptions.size();
-
   // Look for a suitable configuration file in all parent directories of the
   // file. Start with the immediate parent directory and move up.
-  StringRef Path = llvm::sys::path::parent_path(AbsolutePath);
-  for (StringRef CurrentPath = Path; !CurrentPath.empty();
-       CurrentPath = llvm::sys::path::parent_path(CurrentPath)) {
-    std::optional<OptionsSource> Result;
-
-    auto Iter = CachedOptions.find(CurrentPath);
-    if (Iter != CachedOptions.end())
-      Result = Iter->second;
-
-    if (!Result)
-      Result = tryReadConfigFile(CurrentPath);
-
-    if (Result) {
-      // Store cached value for all intermediate directories.
-      while (Path != CurrentPath) {
+  StringRef RootPath = llvm::sys::path::parent_path(AbsolutePath);
+  auto MemorizedConfigFile =
+      [this, &RootPath](StringRef CurrentPath) -> std::optional<OptionsSource> {
+    const auto Iter = CachedOptions.Memorized.find(CurrentPath);
+    if (Iter != CachedOptions.Memorized.end())
+      return CachedOptions.Storage[Iter->second];
+    std::optional<OptionsSource> OptionsSource = tryReadConfigFile(CurrentPath);
+    if (OptionsSource) {
+      const size_t Index = CachedOptions.Storage.size();
+      CachedOptions.Storage.emplace_back(OptionsSource.value());
+      while (RootPath != CurrentPath) {
         LLVM_DEBUG(llvm::dbgs()
-                   << "Caching configuration for path " << Path << ".\n");
-        if (!CachedOptions.count(Path))
-          CachedOptions[Path] = *Result;
-        Path = llvm::sys::path::parent_path(Path);
+                   << "Caching configuration for path " << RootPath << ".\n");
+        CachedOptions.Memorized[RootPath] = Index;
+        RootPath = llvm::sys::path::parent_path(RootPath);
       }
-      CachedOptions[Path] = *Result;
-
-      CurOptions.push_back(*Result);
+      CachedOptions.Memorized[CurrentPath] = Index;
+      RootPath = llvm::sys::path::parent_path(CurrentPath);
+    }
+    return OptionsSource;
+  };
+  for (StringRef CurrentPath = RootPath; !CurrentPath.empty();
+       CurrentPath = llvm::sys::path::parent_path(CurrentPath)) {
+    if (std::optional<OptionsSource> Result =
+            MemorizedConfigFile(CurrentPath)) {
+      CurOptions.emplace_back(Result.value());
       if (!Result->first.InheritParentConfig.value_or(false))
        break;
     }
diff --git a/clang-tools-extra/clang-tidy/ClangTidyOptions.h b/clang-tools-extra/clang-tidy/ClangTidyOptions.h
index 85d5a02ebbc1b..568f60cf98b21 100644
--- a/clang-tools-extra/clang-tidy/ClangTidyOptions.h
+++ b/clang-tools-extra/clang-tidy/ClangTidyOptions.h
@@ -241,7 +241,10 @@ class FileOptionsBaseProvider : public DefaultOptionsProvider {
   /// \c ConfigHandlers.
   std::optional<OptionsSource> tryReadConfigFile(llvm::StringRef Directory);
 
-  llvm::StringMap<OptionsSource> CachedOptions;
+  struct OptionsCache {
+    llvm::StringMap<size_t> Memorized;
+    llvm::SmallVector<OptionsSource, 4> Storage;
+  } CachedOptions;
   ClangTidyOptions OverrideOptions;
   ConfigFileHandlers ConfigHandlers;
   llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> FS;
From bbb53d1a8cd37cbb31ec5ec7938a0f24f628c821 Mon Sep 17 00:00:00 2001
From: Paul Bowen-Huggett
Date: Fri, 10 Jan 2025 16:43:14 +0100
Subject: [PATCH 067/408] [NFC] Make AMDGPUCombinerHelper methods const
 (#121903)

(This replaces #121740. Sorry for wasting your time.)

This is a follow-up to a previous commit (ee7ca0d) which eliminated several
"TODO: make CombinerHelper methods const" remarks. As promised in that
earlier commit, this change completes the set by also making the methods of
AMDGPUCombinerHelper const, so that the Helper member of
AMDGPUPreLegalizerCombinerImpl can be const rather than explicitly mutable.

---
 llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp       | 10 +++++-----
 llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h         | 10 +++++-----
 llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp |  3 +--
 3 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
index f6f9f4bc0fb1b..46194ab46ff6a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
@@ -197,7 +197,7 @@ static unsigned inverseMinMax(unsigned Opc) {
 }
 
 bool AMDGPUCombinerHelper::matchFoldableFneg(MachineInstr &MI,
-                                             MachineInstr *&MatchInfo) {
+                                             MachineInstr *&MatchInfo) const {
   Register Src = MI.getOperand(1).getReg();
   MatchInfo = MRI.getVRegDef(Src);
@@ -266,7 +266,7 @@ bool AMDGPUCombinerHelper::matchFoldableFneg(MachineInstr &MI,
 }
 
 void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI,
-                                             MachineInstr *&MatchInfo) {
+                                             MachineInstr *&MatchInfo) const {
   // Transform:
   // %A = inst %Op1, ...
   // %B = fneg %A
@@ -425,7 +425,7 @@ static bool isFPExtFromF16OrConst(const MachineRegisterInfo &MRI,
 bool AMDGPUCombinerHelper::matchExpandPromotedF16FMed3(MachineInstr &MI,
                                                        Register Src0,
                                                        Register Src1,
-                                                       Register Src2) {
+                                                       Register Src2) const {
   assert(MI.getOpcode() == TargetOpcode::G_FPTRUNC);
   Register SrcReg = MI.getOperand(1).getReg();
   if (!MRI.hasOneNonDBGUse(SrcReg) || MRI.getType(SrcReg) != LLT::scalar(32))
@@ -438,7 +438,7 @@ bool AMDGPUCombinerHelper::matchExpandPromotedF16FMed3(MachineInstr &MI,
 void AMDGPUCombinerHelper::applyExpandPromotedF16FMed3(MachineInstr &MI,
                                                        Register Src0,
                                                        Register Src1,
-                                                       Register Src2) {
+                                                       Register Src2) const {
   // We expect fptrunc (fpext x) to fold out, and to constant fold any constant
   // sources.
   Src0 = Builder.buildFPTrunc(LLT::scalar(16), Src0).getReg(0);
@@ -455,7 +455,7 @@ void AMDGPUCombinerHelper::applyExpandPromotedF16FMed3(MachineInstr &MI,
 
 bool AMDGPUCombinerHelper::matchCombineFmulWithSelectToFldexp(
     MachineInstr &MI, MachineInstr &Sel,
-    std::function<void(MachineIRBuilder &)> &MatchInfo) {
+    std::function<void(MachineIRBuilder &)> &MatchInfo) const {
   assert(MI.getOpcode() == TargetOpcode::G_FMUL);
   assert(Sel.getOpcode() == TargetOpcode::G_SELECT);
   assert(MI.getOperand(2).getReg() == Sel.getOperand(0).getReg());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h
index 893b3f5415f8c..bc3d9daef87c5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h
@@ -32,17 +32,17 @@ class AMDGPUCombinerHelper : public CombinerHelper {
                        MachineDominatorTree *MDT, const LegalizerInfo *LI,
                        const GCNSubtarget &STI);
 
-  bool matchFoldableFneg(MachineInstr &MI, MachineInstr *&MatchInfo);
-  void applyFoldableFneg(MachineInstr &MI, MachineInstr *&MatchInfo);
+  bool matchFoldableFneg(MachineInstr &MI, MachineInstr *&MatchInfo) const;
+  void applyFoldableFneg(MachineInstr &MI, MachineInstr *&MatchInfo) const;
 
   bool matchExpandPromotedF16FMed3(MachineInstr &MI, Register Src0,
-                                   Register Src1, Register Src2);
+                                   Register Src1, Register Src2) const;
   void applyExpandPromotedF16FMed3(MachineInstr &MI, Register Src0,
-                                   Register Src1, Register Src2);
+                                   Register Src1, Register Src2) const;
 
   bool matchCombineFmulWithSelectToFldexp(
       MachineInstr &MI, MachineInstr &Sel,
-      std::function<void(MachineIRBuilder &)> &MatchInfo);
+      std::function<void(MachineIRBuilder &)> &MatchInfo) const;
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
index e1564d5de415d..52c6e5274ae5b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -45,8 +45,7 @@ class AMDGPUPreLegalizerCombinerImpl : public Combiner {
 protected:
   const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig;
   const GCNSubtarget &STI;
-  // TODO: Make CombinerHelper methods const.
-  mutable AMDGPUCombinerHelper Helper;
+  const AMDGPUCombinerHelper Helper;
 
 public:
   AMDGPUPreLegalizerCombinerImpl(
From 9d7df23f4d6537752854d54b0c4c583512b930d0 Mon Sep 17 00:00:00 2001
From: Santanu Das
Date: Fri, 10 Jan 2025 21:24:02 +0530
Subject: [PATCH 068/408] [Hexagon] Add missing pattern for v8i1 type (#120703)

HexagonISD::PFALSE and PTRUE patterns do not form independently in general,
as they are treated like operands of all 0s or all 1s, e.g.
i32 = transfer HEXAGONISD::PFALSE. In this case, v8i1 = HEXAGONISD::PFALSE
is formed independently, without an accompanying opcode. This patch adds a
pattern to transfer all 0s or all 1s to a scalar register and then use that
register and this PFALSE/PTRUE opcode to transfer to a predicate register
like v8i1.

---
 llvm/lib/Target/Hexagon/HexagonPatterns.td   |  3 +++
 llvm/test/CodeGen/Hexagon/isel/isel-tfrrp.ll | 15 +++++++++++++++
 2 files changed, 18 insertions(+)
 create mode 100644 llvm/test/CodeGen/Hexagon/isel/isel-tfrrp.ll

diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td
index 9bd45c72b7d4d..cba5ff1ab0d9b 100644
--- a/llvm/lib/Target/Hexagon/HexagonPatterns.td
+++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td
@@ -108,6 +108,9 @@ def ptrue: PatFrag<(ops), (HexagonPTRUE)>;
 def pfalse: PatFrag<(ops), (HexagonPFALSE)>;
 def pnot: PatFrag<(ops node:$Pu), (xor node:$Pu, ptrue)>;
 
+def: Pat<(v8i1 (HexagonPFALSE)), (C2_tfrrp (A2_tfrsi (i32 0)))>;
+def: Pat<(v8i1 (HexagonPTRUE)), (C2_tfrrp (A2_tfrsi (i32 -1)))>;
+
 def valign: PatFrag<(ops node:$Vt, node:$Vs, node:$Ru),
                     (HexagonVALIGN node:$Vt, node:$Vs, node:$Ru)>;
 def valignaddr: PatFrag<(ops node:$Addr), (HexagonVALIGNADDR node:$Addr)>;
diff --git a/llvm/test/CodeGen/Hexagon/isel/isel-tfrrp.ll b/llvm/test/CodeGen/Hexagon/isel/isel-tfrrp.ll
new file mode 100644
index 0000000000000..b2a9f732bdddc
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/isel/isel-tfrrp.ll
@@ -0,0 +1,15 @@
+; Check if a C2_tfrrp instruction with constant i32 0 input is generated
+; The constant 0 is generated by a transfer immediate instruction.
+
+; RUN: llc -march=hexagon -debug-only=isel 2>&1 < %s - | FileCheck %s
+
+; CHECK: [[R0:%[0-9]+]]:intregs = A2_tfrsi 0
+; CHECK-NEXT: predregs = C2_tfrrp killed [[R0]]:intregs
+
+define void @test_false(i1 %0) {
+  %2 = insertelement <1024 x i1> zeroinitializer, i1 %0, i64 0
+  tail call void @llvm.masked.store.v1024f32.p0(<1024 x float> zeroinitializer, ptr null, i32 1, <1024 x i1> %2)
+  ret void
+}
+
+declare void @llvm.masked.store.v1024f32.p0(<1024 x float>, ptr nocapture, i32 immarg, <1024 x i1>)
From c664a7f9750356319c329408be94f669cf5f799e Mon Sep 17 00:00:00 2001
From: Louis Dionne
Date: Fri, 10 Jan 2025 11:03:50 -0500
Subject: [PATCH 069/408] [libc++] Remove obsolete bsd_locale_defaults.h
 (#122276)

Supported platforms that used to need this header now go through the new
locale base API instead, so the header is not required anymore.
---
 libcxx/include/CMakeLists.txt                 |   1 -
 libcxx/include/__config                       |   4 -
 libcxx/include/__locale_dir/locale_base_api.h |   6 +-
 .../locale_base_api/bsd_locale_defaults.h     | 116 ------------------
 libcxx/include/locale                         |   2 +-
 libcxx/include/module.modulemap               |   1 -
 6 files changed, 2 insertions(+), 128 deletions(-)
 delete mode 100644 libcxx/include/__locale_dir/locale_base_api/bsd_locale_defaults.h

diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 96071527b5283..f7721b1047b81 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -498,7 +498,6 @@ set(files
   __locale
   __locale_dir/locale_base_api.h
   __locale_dir/locale_base_api/android.h
-  __locale_dir/locale_base_api/bsd_locale_defaults.h
   __locale_dir/locale_base_api/bsd_locale_fallbacks.h
   __locale_dir/locale_base_api/fuchsia.h
   __locale_dir/locale_base_api/ibm.h
diff --git a/libcxx/include/__config b/libcxx/include/__config
index 2de4c009b5afd..658a7e16fae91 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -629,10 +629,6 @@ typedef __char32_t char32_t;
 #    define _LIBCPP_DECLARE_STRONG_ENUM_EPILOG(x)
 #  endif // _LIBCPP_CXX03_LANG
 
-#  if defined(__APPLE__) || defined(__FreeBSD__) || defined(_LIBCPP_MSVCRT_LIKE) || defined(__NetBSD__)
-#    define _LIBCPP_LOCALE__L_EXTENSIONS 1
-#  endif
-
 #  ifdef __FreeBSD__
 #    define _DECLARE_C99_LDBL_MATH 1
 #  endif
diff --git a/libcxx/include/__locale_dir/locale_base_api.h b/libcxx/include/__locale_dir/locale_base_api.h
index cda6033b03db7..bb0da889f4c84 100644
--- a/libcxx/include/__locale_dir/locale_base_api.h
+++ b/libcxx/include/__locale_dir/locale_base_api.h
@@ -117,11 +117,7 @@
 #    include <__locale_dir/locale_base_api/musl.h>
 #  endif
 
-#  ifdef _LIBCPP_LOCALE__L_EXTENSIONS
-#    include <__locale_dir/locale_base_api/bsd_locale_defaults.h>
-#  else
-#    include <__locale_dir/locale_base_api/bsd_locale_fallbacks.h>
-#  endif
+#  include <__locale_dir/locale_base_api/bsd_locale_fallbacks.h>
 
 #  include <__cstddef/size_t.h>
 #  include <__utility/forward.h>
diff --git a/libcxx/include/__locale_dir/locale_base_api/bsd_locale_defaults.h b/libcxx/include/__locale_dir/locale_base_api/bsd_locale_defaults.h
deleted file mode 100644
index 73ab635d28785..0000000000000
--- a/libcxx/include/__locale_dir/locale_base_api/bsd_locale_defaults.h
+++ /dev/null
@@ -1,116 +0,0 @@
-// -*- C++ -*-
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// The BSDs have lots of *_l functions. We don't want to define those symbols
-// on other platforms though, for fear of conflicts with user code. So here,
-// we will define the mapping from an internal macro to the real BSD symbol.
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_BSD_LOCALE_DEFAULTS_H
-#define _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_BSD_LOCALE_DEFAULTS_H
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#if _LIBCPP_HAS_WIDE_CHARACTERS
-#  include <wchar.h>
-#endif
-
-// <xlocale.h> must come after the includes above since the functions it includes depend on
-// what headers have been included up to that point.
-#if defined(__APPLE__) || defined(__FreeBSD__)
-#  include <xlocale.h>
-#endif
-
-#include <__config>
-#include <__cstddef/size_t.h>
-#include <__std_mbstate_t.h>
-#include <__utility/forward.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-inline _LIBCPP_HIDE_FROM_ABI decltype(MB_CUR_MAX) __libcpp_mb_cur_max_l(locale_t __loc) { return MB_CUR_MAX_L(__loc); }
-
-#if _LIBCPP_HAS_WIDE_CHARACTERS
-inline _LIBCPP_HIDE_FROM_ABI wint_t __libcpp_btowc_l(int __c, locale_t __loc) { return ::btowc_l(__c, __loc); }
-
-inline _LIBCPP_HIDE_FROM_ABI int __libcpp_wctob_l(wint_t __c, locale_t __loc) { return ::wctob_l(__c, __loc); }
-
-inline _LIBCPP_HIDE_FROM_ABI size_t __libcpp_wcsnrtombs_l(
-    char* __dest, const wchar_t** __src, size_t __nwc, size_t __len, mbstate_t* __ps, locale_t __loc) {
-  return ::wcsnrtombs_l(__dest, __src, __nwc, __len, __ps, __loc);
-}
-
-inline _LIBCPP_HIDE_FROM_ABI size_t __libcpp_wcrtomb_l(char* __s, wchar_t __wc, mbstate_t* __ps, locale_t __loc) {
-  return ::wcrtomb_l(__s, __wc, __ps, __loc);
-}
-
-inline _LIBCPP_HIDE_FROM_ABI size_t __libcpp_mbsnrtowcs_l(
-    wchar_t* __dest, const char** __src, size_t __nms, size_t __len, mbstate_t* __ps, locale_t __loc) {
-  return ::mbsnrtowcs_l(__dest, __src, __nms, __len, __ps, __loc);
-}
-
-inline _LIBCPP_HIDE_FROM_ABI size_t
-__libcpp_mbrtowc_l(wchar_t* __pwc, const char* __s, size_t __n, mbstate_t* __ps, locale_t __loc) {
-  return ::mbrtowc_l(__pwc, __s, __n, __ps, __loc);
-}
-
-inline _LIBCPP_HIDE_FROM_ABI int __libcpp_mbtowc_l(wchar_t* __pwc, const char* __pmb, size_t __max, locale_t __loc) {
-  return ::mbtowc_l(__pwc, __pmb, __max, __loc);
-}
-
-inline _LIBCPP_HIDE_FROM_ABI size_t __libcpp_mbrlen_l(const char* __s, size_t __n, mbstate_t* __ps, locale_t __loc) {
-  return ::mbrlen_l(__s, __n, __ps, __loc);
-}
-#endif // _LIBCPP_HAS_WIDE_CHARACTERS
-
-inline _LIBCPP_HIDE_FROM_ABI lconv* __libcpp_localeconv_l(locale_t& __loc) { return ::localeconv_l(__loc); }
-
-#if _LIBCPP_HAS_WIDE_CHARACTERS
-inline _LIBCPP_HIDE_FROM_ABI size_t
-__libcpp_mbsrtowcs_l(wchar_t* __dest, const char** __src, size_t __len, mbstate_t* __ps, locale_t __loc) {
-  return ::mbsrtowcs_l(__dest, __src, __len, __ps, __loc);
-}
-#endif
-
-_LIBCPP_DIAGNOSTIC_PUSH
-_LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wgcc-compat")
-_LIBCPP_GCC_DIAGNOSTIC_IGNORED("-Wformat-nonliteral") // GCC doesn't support [[gnu::format]] on variadic templates
-#ifdef _LIBCPP_COMPILER_CLANG_BASED
-#  define _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(...) _LIBCPP_ATTRIBUTE_FORMAT(__VA_ARGS__)
-#else
-#  define _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT
-#endif
-
-template <class... _Args>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(__printf__, 4, 5) int __libcpp_snprintf_l(
-    char* __s, size_t __n, locale_t __loc, const char* __format, _Args&&... __args) {
-  return ::snprintf_l(__s, __n, __loc, __format, std::forward<_Args>(__args)...);
-}
-
-template <class... _Args>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(__printf__, 3, 4) int __libcpp_asprintf_l(
-    char** __s, locale_t __loc, const char* __format, _Args&&... __args) {
-  return ::asprintf_l(__s, __loc, __format, std::forward<_Args>(__args)...);
-}
-
-template <class... _Args>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(__scanf__, 3, 4) int __libcpp_sscanf_l(
-    const char* __s, locale_t __loc, const char* __format, _Args&&... __args) {
-  return ::sscanf_l(__s, __loc, __format, std::forward<_Args>(__args)...);
-}
-_LIBCPP_DIAGNOSTIC_POP
-#undef _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_BSD_LOCALE_DEFAULTS_H
diff --git a/libcxx/include/locale b/libcxx/include/locale
index 981f25ed1e98c..5af674d19414b 100644
--- a/libcxx/include/locale
+++ b/libcxx/include/locale
@@ -219,7 +219,7 @@ template <class charT> class messages_byname;
 #  include <locale.h>
 #  include <streambuf>
 
-// TODO: Properly qualify calls now that __bsd_locale_defaults.h defines functions instead of macros
+// TODO: Properly qualify calls now that the locale base API defines functions instead of macros
 // NOLINTBEGIN(libcpp-robust-against-adl)
 
 #  if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__))
diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap
index 70aee7da79cba..07ab5649ae45c 100644
--- a/libcxx/include/module.modulemap
+++ b/libcxx/include/module.modulemap
@@ -1483,7 +1483,6 @@ module std [system] {
 
     module locale_base_api {
       textual header "__locale_dir/locale_base_api/android.h"
-      textual header "__locale_dir/locale_base_api/bsd_locale_defaults.h"
      textual header "__locale_dir/locale_base_api/bsd_locale_fallbacks.h"
      textual header "__locale_dir/locale_base_api/fuchsia.h"
      textual header "__locale_dir/locale_base_api/ibm.h"
From 513fa28901fc1906f10a7f9d2855266be8b18b90 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot
Date: Fri, 10 Jan 2025 16:05:01 +0000
Subject: [PATCH 070/408] [gn build] Port c664a7f97503

---
 llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
index 1144402befa4d..83c0811b6814a 100644
--- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
@@ -1461,7 +1461,6 @@ if (current_toolchain == default_toolchain) {
     "__locale",
     "__locale_dir/locale_base_api.h",
     "__locale_dir/locale_base_api/android.h",
-    "__locale_dir/locale_base_api/bsd_locale_defaults.h",
     "__locale_dir/locale_base_api/bsd_locale_fallbacks.h",
     "__locale_dir/locale_base_api/fuchsia.h",
    "__locale_dir/locale_base_api/ibm.h",
From 920c58916a6a1c0b13b9330b5e8640bd7f4b0115 Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Fri, 10 Jan 2025 07:42:35 -0800
Subject: [PATCH 071/408] [SLP][NFC]Add a test with the mask translate after
 buildvector shuffle cost estimation

---
 .../SLPVectorizer/X86/bv-shuffle-mask.ll      | 98 +++++++++++++++++++
 1 file changed, 98 insertions(+)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll b/llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll
new file mode 100644
index 0000000000000..469421b660f31
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll
@@ -0,0 +1,98 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define i16 @test(i16 %v1, i16 %v2) {
+; CHECK-LABEL: define i16 @test(
+; CHECK-SAME: i16 [[V1:%.*]], i16 [[V2:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> , i16 [[V2]], i32 3
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> , i16 [[V1]], i32 3
+; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i16> [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = and <4 x i16> [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <2 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i16> [[TMP5]], i16 [[V2]], i32 1
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i16> [[TMP6]], <2 x i16> poison, <4 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i16> [[TMP7]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i16> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = and <4 x i16> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <4 x i16> [[TMP10]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = or <4 x i1> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i16> [[TMP8]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i16> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <4 x i16> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <4 x i16> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i1> zeroinitializer, [[TMP16]]
+; CHECK-NEXT: [[TMP18:%.*]] = or <4 x i1> [[TMP12]], [[TMP17]]
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP18]], i32 2
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP18]], i32 3
+; CHECK-NEXT: [[TMP21:%.*]] = or i1 [[TMP20]], [[TMP19]]
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i1> [[TMP18]], i32 1
+; CHECK-NEXT: [[TMP23:%.*]] = or i1 false, [[TMP22]]
+; CHECK-NEXT: [[TMP24:%.*]] = freeze <4 x i1> [[TMP18]]
+; CHECK-NEXT: [[TMP25:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP24]])
+; CHECK-NEXT: [[SPEC_SELECT31:%.*]] = select i1 [[TMP25]], i32 0, i32 0
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[TMP18]], i32 0
+; CHECK-NEXT: [[TMP27:%.*]] = or i1 false, [[TMP26]]
+; CHECK-NEXT: store i32 [[SPEC_SELECT31]], ptr null, align 4
+; CHECK-NEXT: ret i16 0
+;
+entry:
+  %0 = and i16 %v2, %v1
+  %1 = and i16 %0, 0
+  %2 = and i16 %1, 0
+  %3 = icmp ne i16 %2, 0
+  %.not5.not = or i1 %3, false
+  %inc.1.1.i82.i.i = or i16 %v2, 0
+  %inc.143.1.i98.i.i = or i16 0, 0
+  %4 = or i16 %inc.1.1.i82.i.i, 0
+  %5 = or i16 %4, 0
+  %6 = or i16 %5, 0
+  %7 = icmp ne i16 %6, 0
+  %.not7.not = or i1 false, %7
+  %8 = or i1 %.not5.not, %.not7.not
+  %9 = and i16 0, %inc.143.1.i98.i.i
+  %10 = and i16 %9, 0
+  %11 = icmp ne i16 %10, 0
+  %.not5.not.1 = or i1 %11, false
+  %inc.143.i76.i.i.1 = or i16 %v1, 0
+  %inc.143.1.i98.i.i.1 = or i16 0, 0
+  %12 = or i16 0, %inc.143.i76.i.i.1
+  %13 = or i16 %12, 0
+  %14 = or i16 %13, 0
+  %15 = icmp ne i16 %14, 0
+  %.not7.not.1 = or i1 false, %15
+  %16 = or i1 %.not5.not.1, %.not7.not.1
+  %17 = or i1 %8, %16
+  %18 = and i16 0, %inc.143.1.i98.i.i.1
+  %19 = and i16 %18, 0
+  %20 = icmp ne i16 %19, 0
+  %.not5.not.2 = or i1 %20, false
+  %inc.143.i76.i.i.2 = or i16 %v1, 0
+  %inc.143.1.i98.i.i.2 = or i16 0, 0
+  %21 = or i16 0, %inc.143.i76.i.i.2
+  %22 = or i16 %21, 0
+  %23 = or i16 %22, 0
+  %24 = icmp ne i16 %23, 0
+  %.not7.not.2 = or i1 false, %24
+  %25 = or i1 %.not5.not.2, %.not7.not.2
+  %26 = or i1 false, %25
+  %27 = and i16 0, %inc.143.1.i98.i.i.2
+  %28 = and i16 %27, 0
+  %29 = icmp ne i16 %28, 0
+  %.not5.not.3 = or i1 %29, false
+  %inc.143.i76.i.i.3 = or i16 %v1, 0
+  %30 = or i16 0, %inc.143.i76.i.i.3
+  %31 = or i16 %30, 0
+  %32 = or i16 %31, 0
+  %33 = icmp ne i16 %32, 0
+  %.not7.not.3 = or i1 false, %33
+  %34 = or i1 %.not5.not.3, %.not7.not.3
+  %35 = select i1 %34, i1 true, i1 %25
+  %36 = select i1 %35, i1 true, i1 %16
+  %37 = select i1 %36, i1 true, i1 %8
+  %spec.select31 = select i1 %37, i32 0, i32 0
+  %38 = or i1 false, %34
+  store i32 %spec.select31, ptr null, align 4
+  ret i16 0
+}
From 7ebf0df409c8e2045b7725da5a912854c58e0f6a Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Fri, 10 Jan 2025 23:10:26 +0700
Subject: [PATCH 072/408] AMDGPU: Test gfx940 mfma intrinsics on gfx950

This requires splitting the xf32 cases into a separate file

---
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx940.ll | 43 +++---------------
 .../AMDGPU/llvm.amdgcn.mfma.xf32.gfx940.ll    | 45 +++++++++++++++++++
 2 files changed, 50 insertions(+), 38 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx940.ll

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx940.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx940.ll
index 702e513aff4c7..da191e4aa419d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx940.ll
@@ -3,10 +3,13 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX940,AGPRCD %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 -global-isel -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL,AGPRCD %s
 
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX940,VGPRCD %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL,VGPRCD %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -stress-regalloc=10 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX940,AGPRCD %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -stress-regalloc=10 -global-isel -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL,AGPRCD %s
+
 declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64, i64, <4 x i32>, i32, i32, i32)
 declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64, i64, <16 x i32>, i32, i32, i32)
-declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float>, <2 x float>, <4 x float>, i32, i32, i32)
-declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float>, <2 x float>, <16 x float>, i32, i32, i32)
 declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.bf8(i64, i64, <4 x float>, i32, i32, i32)
 declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.fp8(i64, i64, <4 x float>, i32, i32, i32)
 declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.bf8(i64, i64, <4 x float>, i32, i32, i32)
@@ -66,42 +69,6 @@ bb:
   ret void
 }
 
-; GCN-LABEL: {{^}}test_mfma_f32_16x16x8xf32:
-; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1.0
-; GFX940-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2.0
-; GFX940-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 0x40400000
-; GFX940-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4.0
-; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX940: v_mfma_f32_16x16x8_xf32 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:[[TWO]]], v[[[THREE]]:[[FOUR]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GISEL: v_mfma_f32_16x16x8_xf32 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
-define amdgpu_kernel void @test_mfma_f32_16x16x8xf32(ptr addrspace(1) %arg) #0 {
-bb:
-  %in.1 = load <4 x float>, ptr addrspace(1) %arg
-  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> <float 1.0, float 2.0>, <2 x float> <float 3.0, float 4.0>, <4 x float> %in.1, i32 1, i32 2, i32 3)
-  store <4 x float> %mai.1, ptr addrspace(1) %arg
-  ret void
-}
-
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x4xf32:
-; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1.0
-; GFX940-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2.0
-; GFX940-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 0x40400000
-; GFX940-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4.0
-; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX940: v_mfma_f32_32x32x4_xf32 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:[[TWO]]], v[[[THREE]]:[[FOUR]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GISEL: v_mfma_f32_32x32x4_xf32 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
-define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) #0 {
-bb:
-  %in.1 = load <16 x float>, ptr addrspace(1) %arg
-  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> <float 1.0, float 2.0>, <2 x float> <float 3.0, float 4.0>, <16 x float> %in.1, i32 1, i32 2, i32 3)
-  store <16 x float> %mai.1, ptr addrspace(1) %arg
-  ret void
-}
-
 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_bf8_bf8:
 ; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
 ; GFX940-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx940.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx940.ll
new file mode 100644
index 0000000000000..0ee1ecfaffb15
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx940.ll
@@ -0,0 +1,45 @@
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX940 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -global-isel -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX940 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 -global-isel -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s
+
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float>, <2 x float>, <4 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float>, <2 x float>, <16 x float>, i32, i32, i32)
+
+; GCN-LABEL: {{^}}test_mfma_f32_16x16x8xf32:
+; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1.0
+; GFX940-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2.0
+; GFX940-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 0x40400000
+; GFX940-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4.0
+; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
+; GFX940: v_mfma_f32_16x16x8_xf32 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:[[TWO]]], v[[[THREE]]:[[FOUR]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GISEL: v_mfma_f32_16x16x8_xf32 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GCN-NOT: v_accvgpr_read_b32
+; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
+define amdgpu_kernel void @test_mfma_f32_16x16x8xf32(ptr addrspace(1) %arg) #0 {
+bb:
+  %in.1 = load <4 x float>, ptr addrspace(1) %arg
+  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> <float 1.0, float 2.0>, <2 x float> <float 3.0, float 4.0>, <4 x float> %in.1, i32 1, i32 2, i32 3)
+  store <4 x float> %mai.1, ptr addrspace(1) %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_f32_32x32x4xf32:
+; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1.0
+; GFX940-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2.0
+; GFX940-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 0x40400000
+; GFX940-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4.0
+; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
+; GFX940: v_mfma_f32_32x32x4_xf32 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:[[TWO]]], v[[[THREE]]:[[FOUR]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GISEL: v_mfma_f32_32x32x4_xf32 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GCN-NOT: v_accvgpr_read_b32
+; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
+define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) #0 {
+bb:
+  %in.1 = load <16 x float>, ptr addrspace(1) %arg
+  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> <float 1.0, float 2.0>, <2 x float> <float 3.0, float 4.0>, <16 x float> %in.1, i32 1, i32 2, i32 3)
+  store <16 x float> %mai.1, ptr addrspace(1) %arg
+  ret void
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
From 547ba9730bf05df3383150f730a689f2c8336206 Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Fri, 10 Jan 2025 08:13:46 -0800
Subject: [PATCH 073/408] [SLP]Fix mask generation after cost estimation

When estimating the cost of entries' shuffles for buildvectors, we need to
rebuild the original mask, not the generated submask that was used for the
subregister analysis.

Fixes #122430
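The index remapping can be checked by hand (a worked example with
arbitrarily chosen values, for illustration only). Suppose VF = 8, the used
elements of the entry fall in the second half so MinElement = 5, and
NewVF = 4. A submask index referring to element 6 of the first operand is
rebuilt as

    (6 % 8) - ((5 / 4) * 4) + 0 = 6 - 4 = 2,

i.e. element 2 of the extracted 4-wide subregister, while an index >= VF
(one that refers to the second operand) additionally gets NewVF added.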
NewVF : 0); }); - VF = NewVF; + } else { + NewVF = VF; } constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - auto *VecTy = getWidenedType(VL.front()->getType(), VF); + auto *VecTy = getWidenedType(VL.front()->getType(), NewVF); auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size()); auto GetShuffleCost = [&, &TTI = *TTI](ArrayRef Mask, @@ -13475,7 +13476,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( APInt DemandedElts = APInt::getAllOnes(SubMask.size()); bool IsIdentity = true; for (auto [I, Idx] : enumerate(FirstMask)) { - if (Idx >= static_cast(VF)) { + if (Idx >= static_cast(NewVF)) { Idx = PoisonMaskElem; } else { DemandedElts.clearBit(I); @@ -13498,12 +13499,12 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( APInt DemandedElts = APInt::getAllOnes(SubMask.size()); bool IsIdentity = true; for (auto [I, Idx] : enumerate(SecondMask)) { - if (Idx < static_cast(VF) && Idx >= 0) { + if (Idx < static_cast(NewVF) && Idx >= 0) { Idx = PoisonMaskElem; } else { DemandedElts.clearBit(I); if (Idx != PoisonMaskElem) { - Idx -= VF; + Idx -= NewVF; IsIdentity &= static_cast(I) == Idx; } } @@ -13523,12 +13524,24 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( /*Extract=*/false, CostKind); const TreeEntry *BestEntry = nullptr; if (FirstShuffleCost < ShuffleCost) { - copy(FirstMask, std::next(Mask.begin(), Part * VL.size())); + std::for_each(std::next(Mask.begin(), Part * VL.size()), + std::next(Mask.begin(), (Part + 1) * VL.size()), + [&](int &Idx) { + if (Idx >= static_cast(VF)) + Idx = PoisonMaskElem; + }); BestEntry = Entries.front(); ShuffleCost = FirstShuffleCost; } if (SecondShuffleCost < ShuffleCost) { - copy(SecondMask, std::next(Mask.begin(), Part * VL.size())); + std::for_each(std::next(Mask.begin(), Part * VL.size()), + std::next(Mask.begin(), (Part + 1) * VL.size()), + [&](int &Idx) { + if (Idx < static_cast(VF)) + Idx = PoisonMaskElem; + else + Idx -= VF; + }); BestEntry = Entries[1]; ShuffleCost = SecondShuffleCost; } diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll b/llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll index 469421b660f31..766916fe71f35 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll @@ -10,8 +10,8 @@ define i16 @test(i16 %v1, i16 %v2) { ; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i16> [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = and <4 x i16> [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i16> [[TMP5]], i16 [[V2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i16> [[TMP5]], i16 [[V1]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i16> [[TMP6]], <2 x i16> poison, <4 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i16> [[TMP4]], zeroinitializer From 5d26a6d7590f13d21d78f7f0a443b92b04c80f98 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Fri, 10 Jan 2025 17:18:53 +0100 Subject: [PATCH 074/408] [mlir][Interfaces] `ViewLikeOpInterface`: Remove parser/printer overloads (#122436) #115808 adds additional `custom<>` parser/printer variants. The overall list of overloads/variants is getting larger. 
This commit removes the overloads that are not needed, keeping the
parser/printer simple; call sites now pass the defaulted arguments
explicitly, as in the sketch above.

---
 .../mlir/Dialect/Affine/IR/AffineOps.td |  4 +--
 .../mlir/Interfaces/ViewLikeInterface.h | 35 -------------------
 2 files changed, 2 insertions(+), 37 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
index e2eab1fb2178e..6cd3408e2b2e9 100644
--- a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
+++ b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
@@ -1118,7 +1118,7 @@ def AffineDelinearizeIndexOp : Affine_Op<"delinearize_index", [Pure]> {
   let assemblyFormat = [{
     $linear_index `into`
-    custom<DynamicIndexList>($dynamic_basis, $static_basis, "::mlir::AsmParser::Delimiter::Paren")
+    custom<DynamicIndexList>($dynamic_basis, $static_basis, "{}", "::mlir::AsmParser::Delimiter::Paren")
     attr-dict `:` type($multi_index)
   }];

@@ -1219,7 +1219,7 @@ def AffineLinearizeIndexOp : Affine_Op<"linearize_index",
   let assemblyFormat = [{
     (`disjoint` $disjoint^)? ` `
     `[` $multi_index `]` `by`
-    custom<DynamicIndexList>($dynamic_basis, $static_basis, "::mlir::AsmParser::Delimiter::Paren")
+    custom<DynamicIndexList>($dynamic_basis, $static_basis, "{}", "::mlir::AsmParser::Delimiter::Paren")
     attr-dict `:` type($linear_index)
   }];

diff --git a/mlir/include/mlir/Interfaces/ViewLikeInterface.h b/mlir/include/mlir/Interfaces/ViewLikeInterface.h
index 3dcbd2f1af193..eb046bc742298 100644
--- a/mlir/include/mlir/Interfaces/ViewLikeInterface.h
+++ b/mlir/include/mlir/Interfaces/ViewLikeInterface.h
@@ -109,13 +109,6 @@ void printDynamicIndexList(
     ArrayRef<int64_t> integers, ArrayRef<bool> scalables,
     TypeRange valueTypes = TypeRange(),
     AsmParser::Delimiter delimiter = AsmParser::Delimiter::Square);
-inline void printDynamicIndexList(OpAsmPrinter &printer, Operation *op,
-                                  OperandRange values,
-                                  ArrayRef<int64_t> integers,
-                                  AsmParser::Delimiter delimiter) {
-  return printDynamicIndexList(printer, op, values, integers, {}, TypeRange(),
-                               delimiter);
-}
 inline void printDynamicIndexList(
     OpAsmPrinter &printer, Operation *op, OperandRange values,
     ArrayRef<int64_t> integers, TypeRange valueTypes = TypeRange(),
@@ -151,15 +144,6 @@ ParseResult parseDynamicIndexList(
     DenseI64ArrayAttr &integers, DenseBoolArrayAttr &scalableVals,
     SmallVectorImpl<Type> *valueTypes = nullptr,
     AsmParser::Delimiter delimiter = AsmParser::Delimiter::Square);
-inline ParseResult
-parseDynamicIndexList(OpAsmParser &parser,
-                      SmallVectorImpl<OpAsmParser::UnresolvedOperand> &values,
-                      DenseI64ArrayAttr &integers,
-                      AsmParser::Delimiter delimiter) {
-  DenseBoolArrayAttr scalableVals = {};
-  return parseDynamicIndexList(parser, values, integers, scalableVals, nullptr,
-                               delimiter);
-}
 inline ParseResult parseDynamicIndexList(
     OpAsmParser &parser,
     SmallVectorImpl<OpAsmParser::UnresolvedOperand> &values,
     DenseI64ArrayAttr &integers,
@@ -169,25 +153,6 @@ inline ParseResult parseDynamicIndexList(
   return parseDynamicIndexList(parser, values, integers, scalableVals,
                                valueTypes, delimiter);
 }
-inline ParseResult parseDynamicIndexList(
-    OpAsmParser &parser,
-    SmallVectorImpl<OpAsmParser::UnresolvedOperand> &values,
-    DenseI64ArrayAttr &integers, SmallVectorImpl<Type> &valueTypes,
-    AsmParser::Delimiter delimiter = AsmParser::Delimiter::Square) {
-  DenseBoolArrayAttr scalableVals = {};
-  return parseDynamicIndexList(parser, values, integers, scalableVals,
-                               &valueTypes, delimiter);
-}
-inline ParseResult parseDynamicIndexList(
-    OpAsmParser &parser,
-    SmallVectorImpl<OpAsmParser::UnresolvedOperand> &values,
-    DenseI64ArrayAttr &integers, SmallVectorImpl<Type> &valueTypes,
-    DenseBoolArrayAttr &scalableVals,
-    AsmParser::Delimiter delimiter = AsmParser::Delimiter::Square) {
-
-  return parseDynamicIndexList(parser, values, integers, scalableVals,
-                               &valueTypes, delimiter);
-}
/// Verify that a the `values` has as many elements as the number of entries in /// `attr` for which `isDynamic` evaluates to true. From dab6463e748aed1223487da536075cbff192940b Mon Sep 17 00:00:00 2001 From: LoS Date: Fri, 10 Jan 2025 17:42:42 +0100 Subject: [PATCH 075/408] [libc++] Remove duplicated _LIBCPP_HIDE_FROM_ABI from a few declarations (#122323) --- libcxx/include/__cxx03/__functional/function.h | 2 +- libcxx/include/__cxx03/future | 2 +- libcxx/include/__cxx03/regex | 4 +--- libcxx/include/__functional/function.h | 2 +- libcxx/include/future | 2 +- libcxx/include/regex | 4 +--- 6 files changed, 6 insertions(+), 10 deletions(-) diff --git a/libcxx/include/__cxx03/__functional/function.h b/libcxx/include/__cxx03/__functional/function.h index 1d60391494da8..891652f1da25f 100644 --- a/libcxx/include/__cxx03/__functional/function.h +++ b/libcxx/include/__cxx03/__functional/function.h @@ -853,7 +853,7 @@ class _LIBCPP_TEMPLATE_VIS function<_Rp(_ArgTypes...)> // construct/copy/destroy: _LIBCPP_HIDE_FROM_ABI function() _NOEXCEPT {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_HIDE_FROM_ABI function(nullptr_t) _NOEXCEPT {} + _LIBCPP_HIDE_FROM_ABI function(nullptr_t) _NOEXCEPT {} _LIBCPP_HIDE_FROM_ABI function(const function&); _LIBCPP_HIDE_FROM_ABI function(function&&) _NOEXCEPT; template > diff --git a/libcxx/include/__cxx03/future b/libcxx/include/__cxx03/future index f92bc1266939e..9f43b87e04f32 100644 --- a/libcxx/include/__cxx03/future +++ b/libcxx/include/__cxx03/future @@ -1472,7 +1472,7 @@ public: _LIBCPP_HIDE_FROM_ABI void swap(__packaged_task_function&) _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_HIDE_FROM_ABI _Rp operator()(_ArgTypes...) const; + _LIBCPP_HIDE_FROM_ABI _Rp operator()(_ArgTypes...) const; }; template diff --git a/libcxx/include/__cxx03/regex b/libcxx/include/__cxx03/regex index c05c8768a89a1..ea9512cd56953 100644 --- a/libcxx/include/__cxx03/regex +++ b/libcxx/include/__cxx03/regex @@ -5544,9 +5544,7 @@ public: _LIBCPP_HIDE_FROM_ABI bool operator==(const regex_token_iterator& __x) const; #if _LIBCPP_STD_VER >= 20 - _LIBCPP_HIDE_FROM_ABI _LIBCPP_HIDE_FROM_ABI bool operator==(default_sentinel_t) const { - return *this == regex_token_iterator(); - } + _LIBCPP_HIDE_FROM_ABI bool operator==(default_sentinel_t) const { return *this == regex_token_iterator(); } #endif #if _LIBCPP_STD_VER < 20 _LIBCPP_HIDE_FROM_ABI bool operator!=(const regex_token_iterator& __x) const { return !(*this == __x); } diff --git a/libcxx/include/__functional/function.h b/libcxx/include/__functional/function.h index b483e8ea8f856..2924f6cad6578 100644 --- a/libcxx/include/__functional/function.h +++ b/libcxx/include/__functional/function.h @@ -854,7 +854,7 @@ class _LIBCPP_TEMPLATE_VIS function<_Rp(_ArgTypes...)> // construct/copy/destroy: _LIBCPP_HIDE_FROM_ABI function() _NOEXCEPT {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_HIDE_FROM_ABI function(nullptr_t) _NOEXCEPT {} + _LIBCPP_HIDE_FROM_ABI function(nullptr_t) _NOEXCEPT {} _LIBCPP_HIDE_FROM_ABI function(const function&); _LIBCPP_HIDE_FROM_ABI function(function&&) _NOEXCEPT; template > diff --git a/libcxx/include/future b/libcxx/include/future index 95a51fa425e41..d777ed8d6016f 100644 --- a/libcxx/include/future +++ b/libcxx/include/future @@ -1492,7 +1492,7 @@ public: _LIBCPP_HIDE_FROM_ABI void swap(__packaged_task_function&) _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_HIDE_FROM_ABI _Rp operator()(_ArgTypes...) const; + _LIBCPP_HIDE_FROM_ABI _Rp operator()(_ArgTypes...) 
const; }; template diff --git a/libcxx/include/regex b/libcxx/include/regex index 5cad0bc4b812d..dcee77cfacc3d 100644 --- a/libcxx/include/regex +++ b/libcxx/include/regex @@ -5548,9 +5548,7 @@ public: _LIBCPP_HIDE_FROM_ABI bool operator==(const regex_token_iterator& __x) const; # if _LIBCPP_STD_VER >= 20 - _LIBCPP_HIDE_FROM_ABI _LIBCPP_HIDE_FROM_ABI bool operator==(default_sentinel_t) const { - return *this == regex_token_iterator(); - } + _LIBCPP_HIDE_FROM_ABI bool operator==(default_sentinel_t) const { return *this == regex_token_iterator(); } # endif # if _LIBCPP_STD_VER < 20 _LIBCPP_HIDE_FROM_ABI bool operator!=(const regex_token_iterator& __x) const { return !(*this == __x); } From c189df842c67a2476a59363fa36a0c1b1137f533 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Fri, 10 Jan 2025 08:44:19 -0800 Subject: [PATCH 076/408] [flang][cuda] Fix resolution of overloaded operator (#122402) --- flang/lib/Semantics/resolve-names.cpp | 39 +++++++++++++-------------- flang/test/Semantics/cuf10.cuf | 19 +++++++++++++ 2 files changed, 38 insertions(+), 20 deletions(-) diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 724f1b2807835..51e7c5960dc2e 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -8970,18 +8970,6 @@ void ResolveNamesVisitor::FinishSpecificationPart( misparsedStmtFuncFound_ = false; funcResultStack().CompleteFunctionResultType(); CheckImports(); - bool inDeviceSubprogram = false; - if (auto *subp{currScope().symbol() - ? currScope().symbol()->detailsIf() - : nullptr}) { - if (auto attrs{subp->cudaSubprogramAttrs()}) { - if (*attrs == common::CUDASubprogramAttrs::Device || - *attrs == common::CUDASubprogramAttrs::Global || - *attrs == common::CUDASubprogramAttrs::Grid_Global) { - inDeviceSubprogram = true; - } - } - } for (auto &pair : currScope()) { auto &symbol{*pair.second}; if (inInterfaceBlock()) { @@ -8990,14 +8978,6 @@ void ResolveNamesVisitor::FinishSpecificationPart( if (NeedsExplicitType(symbol)) { ApplyImplicitRules(symbol); } - if (inDeviceSubprogram && symbol.has()) { - auto *object{symbol.detailsIf()}; - if (!object->cudaDataAttr() && !IsValue(symbol) && - (IsDummy(symbol) || object->IsArray())) { - // Implicitly set device attribute if none is set in device context. - object->set_cudaDataAttr(common::CUDADataAttr::Device); - } - } if (IsDummy(symbol) && isImplicitNoneType() && symbol.test(Symbol::Flag::Implicit) && !context().HasError(symbol)) { Say(symbol.name(), @@ -9522,6 +9502,7 @@ void ResolveNamesVisitor::ResolveSpecificationParts(ProgramTree &node) { }, node.stmt()); Walk(node.spec()); + bool inDeviceSubprogram = false; // If this is a function, convert result to an object. This is to prevent the // result from being converted later to a function symbol if it is called // inside the function. @@ -9535,6 +9516,15 @@ void ResolveNamesVisitor::ResolveSpecificationParts(ProgramTree &node) { if (details->isFunction()) { ConvertToObjectEntity(const_cast(details->result())); } + // Check the current procedure is a device procedure to apply implicit + // attribute at the end. 
+      if (auto attrs{details->cudaSubprogramAttrs()}) {
+        if (*attrs == common::CUDASubprogramAttrs::Device ||
+            *attrs == common::CUDASubprogramAttrs::Global ||
+            *attrs == common::CUDASubprogramAttrs::Grid_Global) {
+          inDeviceSubprogram = true;
+        }
+      }
     }
   }
   if (node.IsModule()) {
@@ -9561,6 +9551,15 @@ void ResolveNamesVisitor::ResolveSpecificationParts(ProgramTree &node) {
           symbol.GetType() ? Symbol::Flag::Function : Symbol::Flag::Subroutine);
     }
     ApplyImplicitRules(symbol);
+    // Apply CUDA implicit attributes if needed.
+    if (inDeviceSubprogram && symbol.has<ObjectEntityDetails>()) {
+      auto *object{symbol.detailsIf<ObjectEntityDetails>()};
+      if (!object->cudaDataAttr() && !IsValue(symbol) &&
+          (IsDummy(symbol) || object->IsArray())) {
+        // Implicitly set device attribute if none is set in device context.
+        object->set_cudaDataAttr(common::CUDADataAttr::Device);
+      }
+    }
   }
 }

diff --git a/flang/test/Semantics/cuf10.cuf b/flang/test/Semantics/cuf10.cuf
index 24b596b1fa55d..f85471855ec57 100644
--- a/flang/test/Semantics/cuf10.cuf
+++ b/flang/test/Semantics/cuf10.cuf
@@ -3,6 +3,13 @@ module m
   real, device :: a(4,8)
   real, managed, allocatable :: b(:,:)
   integer, constant :: x = 1
+  type :: int
+    real :: i, s
+  end type int
+  interface operator (+)
+    module procedure addHost
+    module procedure addDevice
+  end interface operator (+)
 contains
   attributes(global) subroutine kernel(a,b,c,n,m)
     integer, value :: n
@@ -30,4 +37,16 @@ module m
   subroutine sub2()
     call sub1<<<1,1>>>(x) ! actual constant to device dummy
   end
+  function addHost(a, b) result(c)
+    type(int), intent(in) :: a, b
+    type(int) :: c
+  end function addHost
+  attributes(device) function addDevice(a, b) result(c)
+    type(int), device :: c
+    type(int), intent(in) :: a, b
+  end function addDevice
+  attributes(global) subroutine overload(c, a, b)
+    type (int) :: c, a, b
+    c = a+b ! ok resolve to addDevice
+  end subroutine overload
 end

From 3c9c94a24fd147578c8dcf2837e94923213ac7af Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Fri, 10 Jan 2025 08:45:46 -0800
Subject: [PATCH 077/408] Revert "[SLP]Fix mask generation after cost
 estimation"

This reverts commit 547ba9730bf05df3383150f730a689f2c8336206 to fix
buildbots reported in
https://lab.llvm.org/buildbot/#/builders/123/builds/11370,
https://lab.llvm.org/buildbot/#/builders/133/builds/9492

---
 .../Transforms/Vectorize/SLPVectorizer.cpp | 29 +++++--------------
 .../SLPVectorizer/X86/bv-shuffle-mask.ll   |  4 +--
 2 files changed, 10 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e0d1f0e1d43a5..cdfec332af37a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -13443,15 +13443,14 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
     for_each(SubMask, [&](int &Idx) {
       if (Idx == PoisonMaskElem)
         return;
-      Idx = (Idx % VF) - ((MinElement / NewVF) * NewVF) +
+      Idx = (Idx % VF) - (MinElement % VF) +
           (Idx >= static_cast<int>(VF) ?
            NewVF : 0);
     });
-  } else {
-    NewVF = VF;
+    VF = NewVF;
   }
   constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-  auto *VecTy = getWidenedType(VL.front()->getType(), NewVF);
+  auto *VecTy = getWidenedType(VL.front()->getType(), VF);
   auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
   auto GetShuffleCost = [&, &TTI = *TTI](ArrayRef<int> Mask,
@@ -13476,7 +13475,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
     APInt DemandedElts = APInt::getAllOnes(SubMask.size());
     bool IsIdentity = true;
     for (auto [I, Idx] : enumerate(FirstMask)) {
-      if (Idx >= static_cast<int>(NewVF)) {
+      if (Idx >= static_cast<int>(VF)) {
         Idx = PoisonMaskElem;
       } else {
         DemandedElts.clearBit(I);
@@ -13499,12 +13498,12 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
     APInt DemandedElts = APInt::getAllOnes(SubMask.size());
     bool IsIdentity = true;
     for (auto [I, Idx] : enumerate(SecondMask)) {
-      if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
+      if (Idx < static_cast<int>(VF) && Idx >= 0) {
         Idx = PoisonMaskElem;
       } else {
         DemandedElts.clearBit(I);
         if (Idx != PoisonMaskElem) {
-          Idx -= NewVF;
+          Idx -= VF;
           IsIdentity &= static_cast<int>(I) == Idx;
         }
       }
@@ -13524,24 +13523,12 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
                            /*Extract=*/false, CostKind);
   const TreeEntry *BestEntry = nullptr;
   if (FirstShuffleCost < ShuffleCost) {
-    std::for_each(std::next(Mask.begin(), Part * VL.size()),
-                  std::next(Mask.begin(), (Part + 1) * VL.size()),
-                  [&](int &Idx) {
-                    if (Idx >= static_cast<int>(VF))
-                      Idx = PoisonMaskElem;
-                  });
+    copy(FirstMask, std::next(Mask.begin(), Part * VL.size()));
     BestEntry = Entries.front();
     ShuffleCost = FirstShuffleCost;
   }
   if (SecondShuffleCost < ShuffleCost) {
-    std::for_each(std::next(Mask.begin(), Part * VL.size()),
-                  std::next(Mask.begin(), (Part + 1) * VL.size()),
-                  [&](int &Idx) {
-                    if (Idx < static_cast<int>(VF))
-                      Idx = PoisonMaskElem;
-                    else
-                      Idx -= VF;
-                  });
+    copy(SecondMask, std::next(Mask.begin(), Part * VL.size()));
     BestEntry = Entries[1];
     ShuffleCost = SecondShuffleCost;
   }

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll b/llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll
index 766916fe71f35..469421b660f31 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll
@@ -10,8 +10,8 @@ define i16 @test(i16 %v1, i16 %v2) {
; CHECK-NEXT:    [[TMP2:%.*]] = or <4 x i16> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP3:%.*]] = and <4 x i16> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i16> [[TMP5]], i16 [[V1]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i16> [[TMP5]], i16 [[V2]], i32 1
; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i16> [[TMP6]], <2 x i16> poison, <4 x i32>
; CHECK-NEXT:    [[TMP8:%.*]] = or <4 x i16> [[TMP7]], zeroinitializer
; CHECK-NEXT:    [[TMP9:%.*]] = and <4 x i16> [[TMP4]], zeroinitializer

From b43c97c2ddfe9e922bb044de01312adb81591a48 Mon Sep 17 00:00:00 2001
From: Evgenii Kudriashov
Date: Fri, 10 Jan 2025 17:52:09 +0100
Subject: [PATCH 078/408] [Headers][X86] amxintrin.h - fix attributes according
 to Intel SDM (#122204)

`tileloadd`, `tileloaddt1` and `tilestored` are part of the `amx-tile`
feature. A hypothetical reproducer sketch (the function name and buffer
below are invented; it assumes a compile with only `-mamx-tile`
enabled) is:
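    #include <immintrin.h>

    static char buf[1024];

    // Before this fix, __tile_loadd (attribute amx-tile) inlined
    // _tile_loadd_internal, which was wrongly marked amx-int8, so a
    // build with only -mamx-tile failed at the inlining boundary.
    void load_tile(short row, short col) {
      __tile1024i t = {row, col};
      __tile_loadd(&t, buf, 64);
    }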
The problem is observed if `__tile_loadd` intrinsic is invoked, `_tile_loadd_internal` requiring `amx-int8` is inlined into `__tile_loadd` that has only `amx-tile`. --- clang/lib/Headers/amxintrin.h | 6 ++--- clang/test/CodeGen/X86/amx_api.c | 30 ------------------------- clang/test/CodeGen/X86/amx_tile.c | 37 +++++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 33 deletions(-) create mode 100644 clang/test/CodeGen/X86/amx_tile.c diff --git a/clang/lib/Headers/amxintrin.h b/clang/lib/Headers/amxintrin.h index b0140615677f2..a7da10d9951e7 100644 --- a/clang/lib/Headers/amxintrin.h +++ b/clang/lib/Headers/amxintrin.h @@ -234,7 +234,7 @@ typedef int _tile1024i_1024a __attribute__((__vector_size__(1024), __aligned__(1024))); /// This is internal intrinsic. C/C++ user should avoid calling it directly. -static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TILE _tile_loadd_internal(unsigned short m, unsigned short n, const void *base, __SIZE_TYPE__ stride) { return __builtin_ia32_tileloadd64_internal(m, n, base, @@ -242,7 +242,7 @@ _tile_loadd_internal(unsigned short m, unsigned short n, const void *base, } /// This is internal intrinsic. C/C++ user should avoid calling it directly. -static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TILE _tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base, __SIZE_TYPE__ stride) { return __builtin_ia32_tileloaddt164_internal(m, n, base, @@ -278,7 +278,7 @@ _tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k, } /// This is internal intrinsic. C/C++ user should avoid calling it directly. -static __inline__ void __DEFAULT_FN_ATTRS_INT8 +static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_stored_internal(unsigned short m, unsigned short n, void *base, __SIZE_TYPE__ stride, _tile1024i tile) { return __builtin_ia32_tilestored64_internal(m, n, base, diff --git a/clang/test/CodeGen/X86/amx_api.c b/clang/test/CodeGen/X86/amx_api.c index 5b6d50da27c6d..d770c03eb06d2 100644 --- a/clang/test/CodeGen/X86/amx_api.c +++ b/clang/test/CodeGen/X86/amx_api.c @@ -33,22 +33,6 @@ void test_api(int cond, short row, short col) { __tile_stored(buf, STRIDE, c); } -void test_tile_loadd(short row, short col) { - //CHECK-LABEL: @test_tile_loadd - //CHECK-DAG: call x86_amx @llvm.x86.tileloadd64.internal - //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) - __tile1024i a = {row, col}; - __tile_loadd(&a, buf, STRIDE); -} - -void test_tile_stream_loadd(short row, short col) { - //CHECK-LABEL: @test_tile_stream_loadd - //CHECK-DAG: call x86_amx @llvm.x86.tileloaddt164.internal - //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) - __tile1024i a = {row, col}; - __tile_stream_loadd(&a, buf, STRIDE); -} - void test_tile_dpbssd(__tile1024i a, __tile1024i b, __tile1024i c) { //CHECK-LABEL: @test_tile_dpbssd //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) @@ -81,20 +65,6 @@ void test_tile_dpbuud(__tile1024i a, __tile1024i b, __tile1024i c) { __tile_dpbuud(&c, a, b); } -void test_tile_stored(__tile1024i c) { - //CHECK-LABEL: @test_tile_stored - //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) - //CHECK-DAG: call void @llvm.x86.tilestored64.internal - __tile_stored(buf, STRIDE, c); -} - -void test_tile_zero(__tile1024i c) { - //CHECK-LABEL: @test_tile_zero - //CHECK-DAG: call x86_amx 
@llvm.x86.tilezero.internal - //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) - __tile_zero(&c); -} - void test_tile_dpbf16ps(__tile1024i a, __tile1024i b, __tile1024i c) { //CHECK-LABEL: @test_tile_dpbf16ps //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) diff --git a/clang/test/CodeGen/X86/amx_tile.c b/clang/test/CodeGen/X86/amx_tile.c new file mode 100644 index 0000000000000..1c87ae5ba1eaa --- /dev/null +++ b/clang/test/CodeGen/X86/amx_tile.c @@ -0,0 +1,37 @@ +// RUN: %clang_cc1 %s -flax-vector-conversions=none -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-tile \ +// RUN: -emit-llvm -o - -Werror -pedantic | FileCheck %s --check-prefixes=CHECK + +#include + +char buf[1024]; +#define STRIDE 32 + +void test_tile_loadd(short row, short col) { + //CHECK-LABEL: @test_tile_loadd + //CHECK-DAG: call x86_amx @llvm.x86.tileloadd64.internal + //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + __tile1024i a = {row, col}; + __tile_loadd(&a, buf, STRIDE); +} + +void test_tile_stream_loadd(short row, short col) { + //CHECK-LABEL: @test_tile_stream_loadd + //CHECK-DAG: call x86_amx @llvm.x86.tileloaddt164.internal + //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + __tile1024i a = {row, col}; + __tile_stream_loadd(&a, buf, STRIDE); +} + +void test_tile_stored(__tile1024i c) { + //CHECK-LABEL: @test_tile_stored + //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) + //CHECK-DAG: call void @llvm.x86.tilestored64.internal + __tile_stored(buf, STRIDE, c); +} + +void test_tile_zero(__tile1024i c) { + //CHECK-LABEL: @test_tile_zero + //CHECK-DAG: call x86_amx @llvm.x86.tilezero.internal + //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + __tile_zero(&c); +} From 953beb9fe969bf8ab1857924ea0d3dd6ea506ab1 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 10 Jan 2025 10:58:26 -0600 Subject: [PATCH 079/408] [CUDA] Move CUDA to new driver by default (#122312) Summary: This patch updates the --offload-new-driver flag to be default for CUDA. This mostly just required updating a lot of tests to use the old format. I tried to update them where possible, but some were directly checking the old format. https://discourse.llvm.org/t/rfc-use-the-new-offloding-driver-for-cuda-and-hip-compilation-by-default/77468/18 --- clang/docs/ReleaseNotes.rst | 6 +++ clang/lib/Driver/Driver.cpp | 6 ++- clang/lib/Driver/ToolChains/Clang.cpp | 9 +++-- clang/lib/Driver/ToolChains/Cuda.cpp | 2 +- clang/test/Driver/cuda-arch-translation.cu | 26 ++++++------- clang/test/Driver/cuda-bindings.cu | 43 ++++++---------------- clang/test/Driver/cuda-options.cu | 30 +++++++-------- clang/test/Driver/cuda-output-asm.cu | 4 -- 8 files changed, 57 insertions(+), 69 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 440b045399d99..5a48d6fbc01fa 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -1083,6 +1083,12 @@ CUDA Support - Clang now supports CUDA SDK up to 12.6 - Added support for sm_100 - Added support for `__grid_constant__` attribute. +- CUDA now uses the new offloading driver by default. The new driver supports + device-side LTO, interoperability with OpenMP and other languages, and native ``-fgpu-rdc`` + support with static libraries. The old behavior can be returned using the + ``--no-offload-new-driver`` flag. 
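+  For example, a hypothetical invocation such as
+  ``clang++ --offload-arch=sm_80 -fgpu-rdc -foffload-lto foo.cu`` now
+  performs device-side LTO through the new driver.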
The binary format is no longer compatible + with the NVIDIA compiler's RDC-mode support. More information can be found at: + https://clang.llvm.org/docs/OffloadingDesign.html AIX Support ^^^^^^^^^^^ diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 04f2664ffeadd..4d9492ea08f64 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -4339,7 +4339,8 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args, Args.hasFlag(options::OPT_foffload_via_llvm, options::OPT_fno_offload_via_llvm, false) || Args.hasFlag(options::OPT_offload_new_driver, - options::OPT_no_offload_new_driver, false); + options::OPT_no_offload_new_driver, + C.isOffloadingHostKind(Action::OFK_Cuda)); // Builder to be used to build offloading actions. std::unique_ptr OffloadBuilder = @@ -5089,7 +5090,8 @@ Action *Driver::ConstructPhaseAction( offloadDeviceOnly() || (TargetDeviceOffloadKind == Action::OFK_HIP && !Args.hasFlag(options::OPT_offload_new_driver, - options::OPT_no_offload_new_driver, false))) + options::OPT_no_offload_new_driver, + C.isOffloadingHostKind(Action::OFK_Cuda)))) ? types::TY_LLVM_IR : types::TY_LLVM_BC; return C.MakeAction(Input, Output); diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index c4b5374d3fff9..f81691f8aeaf9 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5064,7 +5064,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, JA.isHostOffloading(Action::OFK_SYCL) || (JA.isHostOffloading(C.getActiveOffloadKinds()) && Args.hasFlag(options::OPT_offload_new_driver, - options::OPT_no_offload_new_driver, false)); + options::OPT_no_offload_new_driver, + C.isOffloadingHostKind(Action::OFK_Cuda))); bool IsRDCMode = Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false); @@ -5419,7 +5420,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, if (IsUsingLTO) { if (IsDeviceOffloadAction && !JA.isDeviceOffloading(Action::OFK_OpenMP) && !Args.hasFlag(options::OPT_offload_new_driver, - options::OPT_no_offload_new_driver, false) && + options::OPT_no_offload_new_driver, + C.isOffloadingHostKind(Action::OFK_Cuda)) && !Triple.isAMDGPU()) { D.Diag(diag::err_drv_unsupported_opt_for_target) << Args.getLastArg(options::OPT_foffload_lto, @@ -6896,7 +6898,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, options::OPT_fno_offload_via_llvm, false)) { CmdArgs.append({"--offload-new-driver", "-foffload-via-llvm"}); } else if (Args.hasFlag(options::OPT_offload_new_driver, - options::OPT_no_offload_new_driver, false)) { + options::OPT_no_offload_new_driver, + C.isOffloadingHostKind(Action::OFK_Cuda))) { CmdArgs.push_back("--offload-new-driver"); } diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index 214f1e5d83478..8967115bcc73d 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -506,7 +506,7 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA, static bool shouldIncludePTX(const ArgList &Args, StringRef InputArch) { // The new driver does not include PTX by default to avoid overhead. 
bool includePTX = !Args.hasFlag(options::OPT_offload_new_driver, - options::OPT_no_offload_new_driver, false); + options::OPT_no_offload_new_driver, true); for (Arg *A : Args.filtered(options::OPT_cuda_include_ptx_EQ, options::OPT_no_cuda_include_ptx_EQ)) { A->claim(); diff --git a/clang/test/Driver/cuda-arch-translation.cu b/clang/test/Driver/cuda-arch-translation.cu index e96191cc9d418..a0ae16452692b 100644 --- a/clang/test/Driver/cuda-arch-translation.cu +++ b/clang/test/Driver/cuda-arch-translation.cu @@ -68,19 +68,19 @@ // HIP: clang-offload-bundler -// SM20:--image=profile=sm_20{{.*}}--image=profile=compute_20 -// SM21:--image=profile=sm_21{{.*}}--image=profile=compute_20 -// SM30:--image=profile=sm_30{{.*}}--image=profile=compute_30 -// SM32:--image=profile=sm_32{{.*}}--image=profile=compute_32 -// SM35:--image=profile=sm_35{{.*}}--image=profile=compute_35 -// SM37:--image=profile=sm_37{{.*}}--image=profile=compute_37 -// SM50:--image=profile=sm_50{{.*}}--image=profile=compute_50 -// SM52:--image=profile=sm_52{{.*}}--image=profile=compute_52 -// SM53:--image=profile=sm_53{{.*}}--image=profile=compute_53 -// SM60:--image=profile=sm_60{{.*}}--image=profile=compute_60 -// SM61:--image=profile=sm_61{{.*}}--image=profile=compute_61 -// SM62:--image=profile=sm_62{{.*}}--image=profile=compute_62 -// SM70:--image=profile=sm_70{{.*}}--image=profile=compute_70 +// SM20:--image=profile=sm_20{{.*}} +// SM21:--image=profile=sm_21{{.*}} +// SM30:--image=profile=sm_30{{.*}} +// SM32:--image=profile=sm_32{{.*}} +// SM35:--image=profile=sm_35{{.*}} +// SM37:--image=profile=sm_37{{.*}} +// SM50:--image=profile=sm_50{{.*}} +// SM52:--image=profile=sm_52{{.*}} +// SM53:--image=profile=sm_53{{.*}} +// SM60:--image=profile=sm_60{{.*}} +// SM61:--image=profile=sm_61{{.*}} +// SM62:--image=profile=sm_62{{.*}} +// SM70:--image=profile=sm_70{{.*}} // GFX600:-targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx600 // GFX601:-targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx601 // GFX602:-targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx602 diff --git a/clang/test/Driver/cuda-bindings.cu b/clang/test/Driver/cuda-bindings.cu index 8ee1884936c06..5b6f944621439 100644 --- a/clang/test/Driver/cuda-bindings.cu +++ b/clang/test/Driver/cuda-bindings.cu @@ -23,14 +23,14 @@ // BIN-NOT: cuda-bindings-device-cuda-nvptx64 // BIN: # "powerpc64le-ibm-linux-gnu" - "clang",{{.*}} output: // BIN-NOT: cuda-bindings-device-cuda-nvptx64 -// BIN: # "powerpc64le-ibm-linux-gnu" - "GNU::Linker", inputs:{{.*}}, output: "a.out" +// BIN: # "powerpc64le-ibm-linux-gnu" - "Offload::Linker", inputs:{{.*}}, output: "a.out" // // Test single gpu architecture up to the assemble phase. 
// // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings --cuda-gpu-arch=sm_30 %s -S 2>&1 \ // RUN: | FileCheck -check-prefix=ASM %s -// ASM-DAG: # "nvptx64-nvidia-cuda" - "clang",{{.*}} output: "cuda-bindings-cuda-nvptx64-nvidia-cuda-sm_30.s" +// ASM-DAG: # "nvptx64-nvidia-cuda" - "clang",{{.*}} output: "[[PTX:.+]].s" // ASM-DAG: # "powerpc64le-ibm-linux-gnu" - "clang",{{.*}} output: "cuda-bindings.s" // @@ -61,40 +61,21 @@ // BIN2-NOT: cuda-bindings-device-cuda-nvptx64 // BIN2: # "powerpc64le-ibm-linux-gnu" - "clang",{{.*}} output: // BIN2-NOT: cuda-bindings-device-cuda-nvptx64 -// AOUT: # "powerpc64le-ibm-linux-gnu" - "GNU::Linker", inputs:{{.*}}, output: "a.out" -// TOUT: # "powerpc64le-ibm-linux-gnu" - "GNU::Linker", inputs:{{.*}}, output: "{{.*}}/out" +// AOUT: # "powerpc64le-ibm-linux-gnu" - "Offload::Linker", inputs:{{.*}}, output: "a.out" +// TOUT: # "powerpc64le-ibm-linux-gnu" - "Offload::Linker", inputs:{{.*}}, output: "{{.*}}/out" // .. same, but with -fsyntax-only // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings -fsyntax-only \ // RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \ -// RUN: | FileCheck -check-prefix=SYN %s -// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings -fsyntax-only \ -// RUN: --offload-arch=sm_30,sm_35 %s -o %t/out 2>&1 \ -// RUN: | FileCheck -check-prefix=SYN %s -// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings -fsyntax-only \ -// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \ -// RUN: | FileCheck -check-prefix=SYN %s -// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings -fsyntax-only \ -// RUN: --offload-arch=sm_30,sm_35 %s -o %t/out 2>&1 \ -// RUN: | FileCheck -check-prefix=SYN %s -// SYN-NOT: inputs: -// SYN: # "powerpc64le-ibm-linux-gnu" - "clang", inputs: [{{.*}}], output: (nothing) -// SYN-NEXT: # "nvptx64-nvidia-cuda" - "clang", inputs: [{{.*}}], output: (nothing) -// SYN-NEXT: # "nvptx64-nvidia-cuda" - "clang", inputs: [{{.*}}], output: (nothing) -// SYN-NOT: inputs - -// .. 
and with --offload-new-driver -// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings -fsyntax-only \ -// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 --offload-new-driver %s 2>&1 \ // RUN: | FileCheck -check-prefix=NDSYN %s // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings -fsyntax-only \ -// RUN: --offload-arch=sm_30,sm_35 %s --offload-new-driver -o %t/out 2>&1 \ +// RUN: --offload-arch=sm_30,sm_35 %s -o %t/out 2>&1 \ // RUN: | FileCheck -check-prefix=NDSYN %s // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings -fsyntax-only \ -// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --offload-new-driver 2>&1 \ +// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \ // RUN: | FileCheck -check-prefix=NDSYN %s // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings -fsyntax-only \ -// RUN: --offload-arch=sm_30,sm_35 %s --offload-new-driver -o %t/out 2>&1 \ +// RUN: --offload-arch=sm_30,sm_35 %s -o %t/out 2>&1 \ // RUN: | FileCheck -check-prefix=NDSYN %s // NDSYN-NOT: inputs: // NDSYN: # "nvptx64-nvidia-cuda" - "clang", inputs: [{{.*}}], output: (nothing) @@ -109,8 +90,8 @@ // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings \ // RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s -S 2>&1 \ // RUN: | FileCheck -check-prefix=ASM2 %s -// ASM2-DAG: # "nvptx64-nvidia-cuda" - "clang",{{.*}} output: "cuda-bindings-cuda-nvptx64-nvidia-cuda-sm_30.s" -// ASM2-DAG: # "nvptx64-nvidia-cuda" - "clang",{{.*}} output: "cuda-bindings-cuda-nvptx64-nvidia-cuda-sm_35.s" +// ASM2-DAG: # "nvptx64-nvidia-cuda" - "clang",{{.*}} output: "[[SM30:.+]].s" +// ASM2-DAG: # "nvptx64-nvidia-cuda" - "clang",{{.*}} output: "[[SM35:.+]].s" // ASM2-DAG: # "powerpc64le-ibm-linux-gnu" - "clang",{{.*}} output: "cuda-bindings.s" // @@ -125,7 +106,7 @@ // RUN: | FileCheck -check-prefix=HBIN %s // HBIN: # "powerpc64le-ibm-linux-gnu" - "clang",{{.*}} output: // HBIN-NOT: cuda-bindings-device-cuda-nvptx64 -// HBIN: # "powerpc64le-ibm-linux-gnu" - "GNU::Linker", inputs:{{.*}}, output: "a.out" +// HBIN: # "powerpc64le-ibm-linux-gnu" - "Offload::Linker", inputs:{{.*}}, output: "a.out" // // Test one or more gpu architecture up to the assemble phase in host-only @@ -163,7 +144,7 @@ // Test two gpu architectures with complete compilation in device-only // compilation mode. // -// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings \ +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings --no-offload-new-driver \ // RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only 2>&1 \ // RUN: | FileCheck -check-prefix=DBIN2 %s // DBIN2: # "nvptx64-nvidia-cuda" - "clang",{{.*}} output: @@ -177,7 +158,7 @@ // Test two gpu architectures up to the assemble phase in device-only // compilation mode. // -// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings \ +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings --no-offload-new-driver \ // RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only -S 2>&1 \ // RUN: | FileCheck -check-prefix=DASM2 %s // DASM2: # "nvptx64-nvidia-cuda" - "clang",{{.*}} output: "cuda-bindings-cuda-nvptx64-nvidia-cuda-sm_30.s" diff --git a/clang/test/Driver/cuda-options.cu b/clang/test/Driver/cuda-options.cu index 67facf77f6c68..db6536ca9e03b 100644 --- a/clang/test/Driver/cuda-options.cu +++ b/clang/test/Driver/cuda-options.cu @@ -2,13 +2,13 @@ // Simple compilation case. 
Compile device-side to PTX assembly and make sure // we use it on the host side. -// RUN: %clang -### -target x86_64-linux-gnu -c -nogpulib -nogpuinc %s 2>&1 \ +// RUN: %clang -### --cuda-include-ptx=all -target x86_64-linux-gnu -c -nogpulib -nogpuinc %s 2>&1 \ // RUN: | FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \ // RUN: -check-prefix HOST -check-prefix INCLUDES-DEVICE \ // RUN: -check-prefix NOLINK %s // Typical compilation + link case. -// RUN: %clang -### -target x86_64-linux-gnu -nogpulib -nogpuinc %s 2>&1 \ +// RUN: %clang -### --cuda-include-ptx=all -target x86_64-linux-gnu -nogpulib -nogpuinc %s 2>&1 \ // RUN: | FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \ // RUN: -check-prefix HOST -check-prefix INCLUDES-DEVICE \ // RUN: -check-prefix LINK %s @@ -33,7 +33,7 @@ // RUN: -check-prefix NOINCLUDES-DEVICE -check-prefix LINK %s // RUN: %clang -### --target=x86_64-linux-gnu --cuda-compile-host-device \ -// RUN: --cuda-host-only -nogpulib -nogpuinc %s 2>&1 \ +// RUN: --cuda-host-only --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck -check-prefix NODEVICE -check-prefix HOST \ // RUN: -check-prefix NOINCLUDES-DEVICE -check-prefix LINK %s @@ -47,13 +47,13 @@ // RUN: | FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \ // RUN: -check-prefix NOHOST -check-prefix NOLINK %s -// RUN: %clang -### --target=x86_64-linux-gnu --cuda-host-only \ +// RUN: %clang -### --cuda-include-ptx=all --target=x86_64-linux-gnu --cuda-host-only \ // RUN: -nogpulib -nogpuinc --cuda-compile-host-device %s 2>&1 \ // RUN: | FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \ // RUN: -check-prefix HOST -check-prefix INCLUDES-DEVICE \ // RUN: -check-prefix LINK %s -// RUN: %clang -### --target=x86_64-linux-gnu --cuda-device-only \ +// RUN: %clang -### --cuda-include-ptx=all --target=x86_64-linux-gnu --cuda-device-only \ // RUN: -nogpulib -nogpuinc --cuda-compile-host-device %s 2>&1 \ // RUN: | FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \ // RUN: -check-prefix HOST -check-prefix INCLUDES-DEVICE \ @@ -61,14 +61,14 @@ // Verify that --cuda-gpu-arch option passes the correct GPU architecture to // device compilation. -// RUN: %clang -### -nogpulib -nogpuinc --target=x86_64-linux-gnu --cuda-gpu-arch=sm_52 -c %s 2>&1 \ +// RUN: %clang -### -nogpulib -nogpuinc --cuda-include-ptx=all --target=x86_64-linux-gnu --cuda-gpu-arch=sm_52 -c %s 2>&1 \ // RUN: | FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \ // RUN: -check-prefix DEVICE-SM52 -check-prefix HOST \ // RUN: -check-prefix INCLUDES-DEVICE -check-prefix NOLINK %s // Verify that there is one device-side compilation per --cuda-gpu-arch args // and that all results are included on the host side. 
-// RUN: %clang -### --target=x86_64-linux-gnu \ +// RUN: %clang -### --cuda-include-ptx=all --target=x86_64-linux-gnu \ // RUN: -nogpulib -nogpuinc --cuda-gpu-arch=sm_60 --cuda-gpu-arch=sm_52 -c %s 2>&1 \ // RUN: | FileCheck -check-prefixes DEVICE,DEVICE-NOSAVE,DEVICE2 \ // RUN: -check-prefixes DEVICE-SM52,DEVICE2-SM60 \ @@ -128,9 +128,9 @@ // f) --no-cuda-gpu-arch=all negates all preceding --cuda-gpu-arch=X // RUN: %clang -### -target x86_64-linux-gnu --cuda-device-only \ // RUN: -nogpulib -nogpuinc --cuda-gpu-arch=sm_60 --cuda-gpu-arch=sm_52 \ -// RUN: --no-cuda-gpu-arch=all \ +// RUN: --no-cuda-version-check --no-cuda-gpu-arch=all \ // RUN: --cuda-gpu-arch=sm_70 \ -// RUN: -c -nogpulib -nogpuinc %s 2>&1 \ +// RUN: -c --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck -check-prefixes NOARCH-SM52,NOARCH-SM60,ARCH-SM70 %s // g) There's no --cuda-gpu-arch=all @@ -141,9 +141,9 @@ // Verify that --[no-]cuda-include-ptx arguments are handled correctly. -// a) by default we're including PTX for all GPUs. +// a) by default we're not including PTX for all GPUs. // RUN: %clang -### --target=x86_64-linux-gnu -nogpulib -nogpuinc \ -// RUN: --cuda-gpu-arch=sm_60 --cuda-gpu-arch=sm_52 \ +// RUN: --cuda-include-ptx=all --cuda-gpu-arch=sm_60 --cuda-gpu-arch=sm_52 \ // RUN: -c %s 2>&1 \ // RUN: | FileCheck -check-prefixes FATBIN-COMMON,PTX-SM60,PTX-SM52 %s @@ -157,12 +157,12 @@ // c) --no-cuda-include-ptx=sm_XX disables PTX inclusion for that GPU only. // RUN: %clang -### --target=x86_64-linux-gnu -nogpulib -nogpuinc \ // RUN: --cuda-gpu-arch=sm_60 --cuda-gpu-arch=sm_52 \ -// RUN: --no-cuda-include-ptx=sm_60 \ +// RUN: --no-cuda-include-ptx=sm_60 --cuda-include-ptx=sm_52 \ // RUN: -c %s 2>&1 \ // RUN: | FileCheck -check-prefixes FATBIN-COMMON,NOPTX-SM60,PTX-SM52 %s // RUN: %clang -### --target=x86_64-linux-gnu -nogpulib -nogpuinc \ // RUN: --cuda-gpu-arch=sm_60 --cuda-gpu-arch=sm_52 \ -// RUN: --no-cuda-include-ptx=sm_52 \ +// RUN: --no-cuda-include-ptx=sm_52 --cuda-include-ptx=sm_60 \ // RUN: -c %s 2>&1 \ // RUN: | FileCheck -check-prefixes FATBIN-COMMON,PTX-SM60,NOPTX-SM52 %s @@ -183,8 +183,8 @@ // Verify -flto=thin -fwhole-program-vtables handling. This should result in // both options being passed to the host compilation, with neither passed to // the device compilation. 
-// RUN: %clang -### --target=x86_64-linux-gnu -nogpulib -nogpuinc -c -flto=thin -fwhole-program-vtables %s 2>&1 \ -// RUN: | FileCheck -check-prefixes DEVICE,DEVICE-NOSAVE,HOST,INCLUDES-DEVICE,NOLINK,THINLTOWPD %s +// RUN: %clang -### --cuda-include-ptx=sm_60 --target=x86_64-linux-gnu -nogpulib -nogpuinc -c -flto=thin -fwhole-program-vtables %s 2>&1 \ +// RUN: | FileCheck -check-prefixes DEVICE,DEVICE-NOSAVE,HOST,NOLINK,THINLTOWPD %s // THINLTOWPD-NOT: error: invalid argument '-fwhole-program-vtables' only allowed with '-flto' // ARCH-SM52: "-cc1"{{.*}}"-target-cpu" "sm_52" diff --git a/clang/test/Driver/cuda-output-asm.cu b/clang/test/Driver/cuda-output-asm.cu index 6b944d1891724..9d5b86bcbc1b4 100644 --- a/clang/test/Driver/cuda-output-asm.cu +++ b/clang/test/Driver/cuda-output-asm.cu @@ -17,13 +17,9 @@ // SM30-DAG: "-cc1" "-triple" "nvptx64-nvidia-cuda" // SM30-same: "-target-cpu" "sm_30" -// RUN: not %clang -### -S --target=x86_64-linux-gnu -o foo.s %s 2>&1 \ -// RUN: | FileCheck -check-prefix MULTIPLE-OUTPUT-FILES %s // RUN: not %clang -### -S --target=x86_64-linux-gnu --cuda-device-only \ // RUN: --cuda-gpu-arch=sm_20 --cuda-gpu-arch=sm_30 -o foo.s %s 2>&1 \ // RUN: | FileCheck -check-prefix MULTIPLE-OUTPUT-FILES %s -// RUN: not %clang -### -emit-llvm -c --target=x86_64-linux-gnu -o foo.s %s 2>&1 \ -// RUN: | FileCheck -check-prefix MULTIPLE-OUTPUT-FILES %s // MULTIPLE-OUTPUT-FILES: error: cannot specify -o when generating multiple output files // Make sure we do not get duplicate diagnostics. // MULTIPLE-OUTPUT-FILES-NOT: error: cannot specify -o when generating multiple output files From 372044ee09d39942925824f8f335aef40bfe92f0 Mon Sep 17 00:00:00 2001 From: Durgadoss R Date: Fri, 10 Jan 2025 22:31:53 +0530 Subject: [PATCH 080/408] [NVPTX] Add TMA Bulk Copy intrinsics (#122344) PR #96083 added intrinsics for async copy of 'tensor' data using TMA. Following a similar design, this PR adds intrinsics for async copy of bulk data (non-tensor variants) through TMA. * These intrinsics optionally support multicast and cache_hints, as indicated by the boolean arguments at the end of the intrinsics. * The backend looks through these flag arguments and lowers to the appropriate PTX instructions. * Lit tests are added for all combinations of these intrinsics in cp-async-bulk.ll. * The generated PTX is verified with a 12.3 ptxas executable. * Added docs for these intrinsics in NVPTXUsage.rst file. PTX Spec reference: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk Signed-off-by: Durgadoss R --- llvm/docs/NVPTXUsage.rst | 88 +++++++++++++++ llvm/include/llvm/IR/IntrinsicsNVVM.td | 43 +++++++ llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 75 +++++++++++++ llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 2 + llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 68 ++++++++++- llvm/test/CodeGen/NVPTX/cp-async-bulk.ll | 118 ++++++++++++++++++++ 6 files changed, 391 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/cp-async-bulk.ll diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index 313e84f3722a9..25a230f65fd3d 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -465,6 +465,94 @@ least-significant bit position. 0xffffffff is returned if no 1 bit is found. TMA family of Intrinsics ------------------------ +'``llvm.nvvm.cp.async.bulk.global.to.shared.cluster``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. 
code-block:: llvm
+
+  declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %mbar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.cp.async.bulk.global.to.shared.cluster``' intrinsic
+corresponds to the ``cp.async.bulk.shared::cluster.global.*`` family
+of PTX instructions. These instructions initiate an asynchronous
+copy of bulk data from global memory to shared::cluster memory.
+The 32-bit operand ``%size`` specifies the amount of memory to be
+copied and it must be a multiple of 16.
+
+* The last two arguments to these intrinsics are boolean flags
+  indicating support for cache_hint and/or multicast modifiers.
+  These flag arguments must be compile-time constants. The backend
+  looks through these flags and lowers the intrinsics appropriately.
+
+* The Nth argument (denoted by ``i1 %flag_ch``) when set, indicates
+  a valid cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint``
+  variant of the PTX instruction.
+
+* The [N-1]th argument (denoted by ``i1 %flag_mc``) when set, indicates
+  the presence of a multicast mask (``i16 %mc``) and generates the PTX
+  instruction with the ``.multicast::cluster`` modifier.
+
+For more information, refer PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk>`_.
+
+'``llvm.nvvm.cp.async.bulk.shared.cta.to.global``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 %ch, i1 %flag_ch)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.cp.async.bulk.shared.cta.to.global``' intrinsic
+corresponds to the ``cp.async.bulk.global.shared::cta.*`` set of PTX
+instructions. These instructions initiate an asynchronous copy from
+shared::cta to global memory. The 32-bit operand ``%size`` specifies
+the amount of memory to be copied and it must be a multiple of 16.
+
+* The last argument to these intrinsics is a boolean flag
+  indicating support for cache_hint. This flag argument must
+  be a compile-time constant. When set, it indicates a valid
+  cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint``
+  variant of the PTX instruction.
+
+For more information, refer PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk>`_.
+
+'``llvm.nvvm.cp.async.bulk.shared.cta.to.cluster``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %mbar, ptr addrspace(3) %src, i32 %size)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.cp.async.bulk.shared.cta.to.cluster``' intrinsic
+corresponds to the ``cp.async.bulk.shared::cluster.shared::cta.*``
+PTX instruction. This instruction initiates an asynchronous copy from
+shared::cta to shared::cluster memory. The destination has to be in
+the shared memory of a different CTA within the cluster. The 32-bit
+operand ``%size`` specifies the amount of memory to be copied and
+it must be a multiple of 16.
+
+For more information, refer PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk>`_.
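+As a combined usage sketch for the bulk-copy intrinsics above (purely
+illustrative; the SSA values are assumed to be valid pointers in the
+appropriate address spaces), a 256-byte global-to-cluster copy with
+both optional modifiers disabled can be written as:
+
+.. code-block:: llvm
+
+  call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(
+      ptr addrspace(3) %dst, ptr addrspace(3) %mbar,
+      ptr addrspace(1) %src, i32 256, i16 0, i64 0, i1 0, i1 0)
+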
+ '``llvm.nvvm.cp.async.bulk.tensor.g2s.tile.[1-5]d``' ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index fd07d131ce15b..ae04a130bc825 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -4980,4 +4980,47 @@ foreach dim = [1, 2, 3, 4, 5] in { } } +// Intrinsics for Bulk Copy using TMA (non-tensor) +// From Global to Shared Cluster +def int_nvvm_cp_async_bulk_global_to_shared_cluster + : DefaultAttrsIntrinsic<[], + [llvm_shared_ptr_ty, // dst_smem_ptr + llvm_shared_ptr_ty, // mbarrier_ptr + llvm_global_ptr_ty, // src_gmem_ptr + llvm_i32_ty, // copy_size + llvm_i16_ty, // cta_mask + llvm_i64_ty, // cache_hint + llvm_i1_ty, // Flag for cta_mask + llvm_i1_ty], // Flag for cache_hint + [IntrConvergent, IntrArgMemOnly, + WriteOnly>, ReadOnly>, + NoCapture>, NoCapture>, + NoCapture>, ImmArg>, + ImmArg>]>; + +// From Shared CTA to Shared Cluster +def int_nvvm_cp_async_bulk_shared_cta_to_cluster + : DefaultAttrsIntrinsic<[], + [llvm_shared_ptr_ty, // dst_smem_ptr + llvm_shared_ptr_ty, // mbarrier_ptr + llvm_shared_ptr_ty, // src_smem_ptr + llvm_i32_ty], // copy_size + [IntrConvergent, IntrArgMemOnly, + WriteOnly>, ReadOnly>, + NoCapture>, NoCapture>, + NoCapture>]>; + +// From Shared CTA to Global memory +def int_nvvm_cp_async_bulk_shared_cta_to_global + : DefaultAttrsIntrinsic<[], + [llvm_global_ptr_ty, // dst_gmem_ptr + llvm_shared_ptr_ty, // src_smem_ptr + llvm_i32_ty, // copy_size + llvm_i64_ty, // cache_hint + llvm_i1_ty], // Flag for cache_hint + [IntrConvergent, IntrArgMemOnly, + WriteOnly>, ReadOnly>, + NoCapture>, NoCapture>, + ImmArg>]>; + } // let TargetPrefix = "nvvm" diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index ef97844142d40..1341f8a8fca1f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -3024,6 +3024,75 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N, ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops)); } +void NVPTXDAGToDAGISel::SelectCpAsyncBulkS2G(SDNode *N) { + // We have {Chain, Intrinsic-ID} followed by the actual intrisic args: + // dst, src, size, cache_hint, cache_hint_flag + // NumOperands = {Chain, IID} + {Actual intrinsic args} + // = {2} + {5} + size_t NumOps = N->getNumOperands(); + bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1; + size_t NumArgs = IsCacheHint ? 4 : 3; // src, dst, size, cache_hint + + SDLoc DL(N); + SmallVector Ops(N->ops().slice(2, NumArgs)); + Ops.push_back(N->getOperand(0)); // Chain operand + + bool IsShared32 = + CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32; + unsigned Opcode; + if (IsCacheHint) + Opcode = IsShared32 ? NVPTX::CP_ASYNC_BULK_S2G_SHARED32_CH + : NVPTX::CP_ASYNC_BULK_S2G_CH; + else + Opcode = IsShared32 ? 
NVPTX::CP_ASYNC_BULK_S2G_SHARED32 + : NVPTX::CP_ASYNC_BULK_S2G; + ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops)); +} + +void NVPTXDAGToDAGISel::SelectCpAsyncBulkG2S(SDNode *N) { + // We have {Chain, Intrinsic-ID} followed by the actual intrisic args: + // {dst, mbar, src, size, multicast, cache_hint, + // multicast_flag, cache_hint_flag} + // NumOperands = {Chain, IID} + {Actual intrinsic args} + // = {2} + {8} + size_t NumOps = N->getNumOperands(); + bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1; + bool IsMultiCast = N->getConstantOperandVal(NumOps - 2) == 1; + size_t NumBaseArgs = 4; // dst, mbar, src, size + size_t MultiCastIdx = NumBaseArgs + 2; // for Chain and IID + + SDLoc DL(N); + SmallVector Ops(N->ops().slice(2, NumBaseArgs)); + + // Push MultiCast operand, if available + if (IsMultiCast) + Ops.push_back(N->getOperand(MultiCastIdx)); + + // Push CacheHint operand, if available + if (IsCacheHint) + Ops.push_back(N->getOperand(MultiCastIdx + 1)); + + // Finally, the chain operand + Ops.push_back(N->getOperand(0)); + + bool IsShared32 = + CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32; + unsigned Opcode = [&]() { + if (IsMultiCast && IsCacheHint) + return IsShared32 ? NVPTX::CP_ASYNC_BULK_G2S_SHARED32_MC_CH + : NVPTX::CP_ASYNC_BULK_G2S_MC_CH; + if (IsMultiCast) + return IsShared32 ? NVPTX::CP_ASYNC_BULK_G2S_SHARED32_MC + : NVPTX::CP_ASYNC_BULK_G2S_MC; + if (IsCacheHint) + return IsShared32 ? NVPTX::CP_ASYNC_BULK_G2S_SHARED32_CH + : NVPTX::CP_ASYNC_BULK_G2S_CH; + return IsShared32 ? NVPTX::CP_ASYNC_BULK_G2S_SHARED32 + : NVPTX::CP_ASYNC_BULK_G2S; + }(); + ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops)); +} + bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) { unsigned IID = N->getConstantOperandVal(1); using TMARedTy = llvm::nvvm::TMAReductionOp; @@ -3031,6 +3100,12 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) { switch (IID) { default: return false; + case Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster: + SelectCpAsyncBulkG2S(N); + return true; + case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_global: + SelectCpAsyncBulkS2G(N); + return true; case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_1d: case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_2d: case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_3d: diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index c307f28fcc6c0..4b67a370c3fe0 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -90,6 +90,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N); void SelectV2I64toI128(SDNode *N); void SelectI128toV2I64(SDNode *N); + void SelectCpAsyncBulkG2S(SDNode *N); + void SelectCpAsyncBulkS2G(SDNode *N); void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false); void SelectCpAsyncBulkTensorS2GCommon(SDNode *N, bool IsIm2Col = false); void SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N, bool IsIm2Col = false); diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 8ede1ec4f20dc..22339ebc5484f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -498,9 +498,71 @@ def CP_ASYNC_BULK_WAIT_GROUP_READ : [(int_nvvm_cp_async_bulk_wait_group_read timm:$n)]>, Requires<[hasPTX<80>, hasSM<90>]>; -//----------------------------------- -// TMA Async Tensor Copy 
Functions -//----------------------------------- +//------------------------------ +// TMA Async Bulk Copy Functions +//------------------------------ + +class CpAsyncBulkStr { + // Shared to Global memory + string S2G = "cp.async.bulk.global.shared::cta.bulk_group" + # !if(ch, ".L2::cache_hint", ""); + + // Global to Shared cluster memory + string G2S = "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes" + # !if(mc, ".multicast::cluster", "") + # !if(ch, ".L2::cache_hint", ""); + + // Shared CTA to Cluster memory + string C2C = "cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes"; +} + +multiclass CP_ASYNC_BULK_S2G { + def NAME: NVPTXInst<(outs), + (ins Int64Regs:$dst, rc:$src, Int32Regs:$size), + !strconcat(CpAsyncBulkStr<0, 0>.S2G, " [$dst], [$src], $size;"), []>, + Requires<[hasPTX<80>, hasSM<90>]>; + def NAME # _CH: NVPTXInst<(outs), + (ins Int64Regs:$dst, rc:$src, Int32Regs:$size, Int64Regs:$ch), + !strconcat(CpAsyncBulkStr<0, 1>.S2G, " [$dst], [$src], $size, $ch;"), []>, + Requires<[hasPTX<80>, hasSM<90>]>; +} +defm CP_ASYNC_BULK_S2G : CP_ASYNC_BULK_S2G; +defm CP_ASYNC_BULK_S2G_SHARED32 : CP_ASYNC_BULK_S2G; + +multiclass CP_ASYNC_BULK_G2S { + def NAME: NVPTXInst<(outs), + (ins rc:$dst, rc:$mbar, Int64Regs:$src, Int32Regs:$size), + !strconcat(CpAsyncBulkStr<0, 0>.G2S, " [$dst], [$src], $size, [$mbar];"), []>, + Requires<[hasPTX<80>, hasSM<90>]>; + def NAME # _MC: NVPTXInst<(outs), + (ins rc:$dst, rc:$mbar, Int64Regs:$src, Int32Regs:$size, Int16Regs:$mc), + !strconcat(CpAsyncBulkStr<1, 0>.G2S, " [$dst], [$src], $size, [$mbar], $mc;"), []>, + Requires<[hasPTX<80>, hasSM<90>]>; + def NAME # _CH: NVPTXInst<(outs), + (ins rc:$dst, rc:$mbar, Int64Regs:$src, Int32Regs:$size, Int64Regs:$ch), + !strconcat(CpAsyncBulkStr<0, 1>.G2S, " [$dst], [$src], $size, [$mbar], $ch;"), []>, + Requires<[hasPTX<80>, hasSM<90>]>; + def NAME # _MC_CH: NVPTXInst<(outs), + (ins rc:$dst, rc:$mbar, Int64Regs:$src, Int32Regs:$size, Int16Regs:$mc, Int64Regs:$ch), + !strconcat(CpAsyncBulkStr<1, 1>.G2S, " [$dst], [$src], $size, [$mbar], $mc, $ch;"), []>, + Requires<[hasPTX<80>, hasSM<90>]>; +} +defm CP_ASYNC_BULK_G2S : CP_ASYNC_BULK_G2S; +defm CP_ASYNC_BULK_G2S_SHARED32 : CP_ASYNC_BULK_G2S; + +multiclass CP_ASYNC_BULK_CTA_TO_CLUSTER { + def NAME: NVPTXInst<(outs), + (ins rc:$dst, rc:$mbar, rc:$src, Int32Regs:$size), + !strconcat(CpAsyncBulkStr<0, 0>.C2C, " [$dst], [$src], $size, [$mbar];"), + [(int_nvvm_cp_async_bulk_shared_cta_to_cluster rc:$dst, rc:$mbar, rc:$src, Int32Regs:$size)]>, + Requires<[hasPTX<80>, hasSM<90>]>; +} +defm CP_ASYNC_BULK_CTA_TO_CLUSTER : CP_ASYNC_BULK_CTA_TO_CLUSTER; +defm CP_ASYNC_BULK_CTA_TO_CLUSTER_SHARED32 : CP_ASYNC_BULK_CTA_TO_CLUSTER; + +//------------------------------------- +// TMA Async Bulk Tensor Copy Functions +//------------------------------------- // From Global to Shared memory (G2S) class G2S_STRINGS { diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll new file mode 100644 index 0000000000000..aefd18a0632a0 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll @@ -0,0 +1,118 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| 
%ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %} + +target triple = "nvptx64-nvidia-cuda" + +declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(3), ptr addrspace(3), ptr addrspace(1), i32, i16, i64, i1, i1) +declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1), ptr addrspace(3), i32, i64, i1) +declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3), ptr addrspace(3), ptr addrspace(3), i32) + +define void @cp_async_bulk_g2s(ptr addrspace(1) %src, ptr addrspace(3) %bar, ptr addrspace(3) %dst, i32 %size, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_g2s( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<2>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_g2s_param_0]; +; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_g2s_param_1]; +; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_g2s_param_2]; +; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_g2s_param_3]; +; CHECK-PTX64-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%rd3], [%rd1], %r1, [%rd2]; +; CHECK-PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_g2s_param_5]; +; CHECK-PTX64-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint [%rd3], [%rd1], %r1, [%rd2], %rd4; +; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_g2s_param_4]; +; CHECK-PTX64-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%rd3], [%rd1], %r1, [%rd2], %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd3], [%rd1], %r1, [%rd2], %rs1, %rd4; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_g2s( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_g2s_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_g2s_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_g2s_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_g2s_param_3]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%r2], [%rd1], %r3, [%r1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_g2s_param_5]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint [%r2], [%rd1], %r3, [%r1], %rd2; +; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_g2s_param_4]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%r2], [%rd1], %r3, [%r1], %rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r2], [%rd1], %r3, [%r1], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 0, i64 0, i1 0, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %bar, 
ptr addrspace(1) %src, i32 %size, i16 0, i64 %ch, i1 0, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 0, i1 1, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 %ch, i1 1, i1 1) + ret void +} + +define void @cp_async_bulk_s2g(ptr addrspace(3) %src, ptr addrspace(1) %dst, i32 %size, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_s2g( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b32 %r<2>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<4>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_s2g_param_0]; +; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_s2g_param_1]; +; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_s2g_param_2]; +; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group [%rd2], [%rd1], %r1; +; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_s2g_param_3]; +; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint [%rd2], [%rd1], %r1, %rd3; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_s2g( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<3>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_s2g_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_s2g_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_s2g_param_2]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group [%rd1], [%r1], %r2; +; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_s2g_param_3]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint [%rd1], [%r1], %r2, %rd2; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 0, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 %ch, i1 1) + ret void +} + +define void @cp_async_bulk_cta_to_cluster(ptr addrspace(3) %src, ptr addrspace(3) %bar, ptr addrspace(3) %dst, i32 %size) { +; CHECK-PTX64-LABEL: cp_async_bulk_cta_to_cluster( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b32 %r<2>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<4>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_cta_to_cluster_param_0]; +; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_cta_to_cluster_param_1]; +; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_cta_to_cluster_param_2]; +; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_cta_to_cluster_param_3]; +; CHECK-PTX64-NEXT: cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [%rd3], [%rd1], %r1, [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_cta_to_cluster( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<5>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_cta_to_cluster_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_cta_to_cluster_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_cta_to_cluster_param_2]; +; CHECK-PTX-SHARED32-NEXT: 
ld.param.u32 %r4, [cp_async_bulk_cta_to_cluster_param_3]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [%r3], [%r1], %r4, [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(3) %src, i32 %size) + ret void +} From 70e96dc3fb895e95dc659f87c2ed188507831801 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Fri, 10 Jan 2025 11:05:23 -0600 Subject: [PATCH 081/408] [flang][OpenMP] Parsing context selectors for METADIRECTIVE (#121815) This is just adding parsers for context selectors. There are no tests because there is no way to execute these parsers yet. --- flang/docs/ParserCombinators.md | 2 +- flang/include/flang/Parser/characters.h | 2 + flang/include/flang/Parser/dump-parse-tree.h | 13 ++ flang/include/flang/Parser/parse-tree.h | 150 +++++++++++++++++++ flang/lib/Parser/basic-parsers.h | 54 +++---- flang/lib/Parser/openmp-parsers.cpp | 77 ++++++++++ flang/lib/Parser/token-parsers.h | 4 + flang/lib/Parser/unparse.cpp | 35 +++++ 8 files changed, 305 insertions(+), 32 deletions(-) diff --git a/flang/docs/ParserCombinators.md b/flang/docs/ParserCombinators.md index 7cb77deba2197..076e76f703c49 100644 --- a/flang/docs/ParserCombinators.md +++ b/flang/docs/ParserCombinators.md @@ -141,7 +141,7 @@ collect the values that they return. * `applyLambda([](&&x){}, p1, p2, ...)` is the same thing, but for lambdas and other function objects. * `applyMem(mf, p1, p2, ...)` is the same thing, but invokes a member - function of the result of the first parser for updates in place. + function of the result of the first parser. ### Token Parsers Last, we have these basic parsers on which the actual grammar of the Fortran diff --git a/flang/include/flang/Parser/characters.h b/flang/include/flang/Parser/characters.h index df188d674b9ee..dbdc058c44995 100644 --- a/flang/include/flang/Parser/characters.h +++ b/flang/include/flang/Parser/characters.h @@ -180,6 +180,8 @@ inline constexpr bool IsValidFortranTokenCharacter(char ch) { case '>': case '[': case ']': + case '{': // Used in OpenMP context selector specification + case '}': // return true; default: return IsLegalIdentifierStart(ch) || IsDecimalDigit(ch); diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h index 3331520922bc6..11725991e9c9a 100644 --- a/flang/include/flang/Parser/dump-parse-tree.h +++ b/flang/include/flang/Parser/dump-parse-tree.h @@ -476,6 +476,19 @@ class ParseTreeDumper { NODE(parser, NullInit) NODE(parser, ObjectDecl) NODE(parser, OldParameterStmt) + NODE(parser, OmpTraitPropertyName) + NODE(parser, OmpTraitScore) + NODE(parser, OmpTraitPropertyExtension) + NODE(OmpTraitPropertyExtension, ExtensionValue) + NODE(parser, OmpTraitProperty) + NODE(parser, OmpTraitSelectorName) + NODE_ENUM(OmpTraitSelectorName, Value) + NODE(parser, OmpTraitSelector) + NODE(OmpTraitSelector, Properties) + NODE(parser, OmpTraitSetSelectorName) + NODE_ENUM(OmpTraitSetSelectorName, Value) + NODE(parser, OmpTraitSetSelector) + NODE(parser, OmpContextSelectorSpecification) NODE(parser, OmpMapper) NODE(parser, OmpMapType) NODE_ENUM(OmpMapType, Value) diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index 941d70d387629..00d85aa05fb3a 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -3453,6 +3453,9 @@ WRAPPER_CLASS(PauseStmt, std::optional); // --- 
Common definitions
+struct OmpClause;
+struct OmpClauseList;
+
 // 2.1 Directives or clauses may accept a list or extended-list.
 //     A list item is a variable, array section or common block name (enclosed
 //     in slashes). An extended list item is a list item or a procedure Name.
@@ -3474,6 +3477,150 @@ WRAPPER_CLASS(OmpObjectList, std::list<OmpObject>);
 
 #define MODIFIERS() std::optional<std::list<Modifier>>
 
+inline namespace traits {
+// trait-property-name ->
+//    identifier | string-literal
+//
+// This is a bit of a problematic case. The spec says that a word in quotes,
+// and the same word without quotes are equivalent. We currently parse both
+// as a string, but it's likely just a temporary solution.
+//
+// The problem is that trait-property can be (among other things) a
+// trait-property-name or a trait-property-expression. A simple identifier
+// can be either, there is no reasonably simple way of telling them apart
+// in the parser. There is a similar issue with extensions. Some of that
+// disambiguation may need to be done in the "canonicalization" pass and
+// then some of those AST nodes would be rewritten into different ones.
+//
+struct OmpTraitPropertyName {
+  CharBlock source;
+  WRAPPER_CLASS_BOILERPLATE(OmpTraitPropertyName, std::string);
+};
+
+// trait-score ->
+//    SCORE(non-negative-const-integer-expression)
+struct OmpTraitScore {
+  CharBlock source;
+  WRAPPER_CLASS_BOILERPLATE(OmpTraitScore, ScalarIntExpr);
+};
+
+// trait-property-extension ->
+//    trait-property-name (trait-property-value, ...)
+// trait-property-value ->
+//    trait-property-name |
+//    scalar-integer-expression |
+//    trait-property-extension
+//
+// The grammar in OpenMP 5.2+ spec is ambiguous, the above is a different
+// version (but equivalent) that doesn't have ambiguities.
+// The ambiguity is in
+//   trait-property:
+//     trait-property-name        <- (a)
+//     trait-property-clause
+//     trait-property-expression  <- (b)
+//     trait-property-extension   <- this conflicts with (a) and (b)
+//   trait-property-extension:
+//     trait-property-name        <- conflict with (a)
+//     identifier(trait-property-extension[, trait-property-extension[, ...]])
+//     constant integer expression  <- conflict with (b)
+//
+struct OmpTraitPropertyExtension {
+  CharBlock source;
+  TUPLE_CLASS_BOILERPLATE(OmpTraitPropertyExtension);
+  struct ExtensionValue {
+    CharBlock source;
+    UNION_CLASS_BOILERPLATE(ExtensionValue);
+    std::variant<OmpTraitPropertyName, ScalarExpr,
+        common::Indirection<OmpTraitPropertyExtension>>
+        u;
+  };
+  using ExtensionList = std::list<ExtensionValue>;
+  std::tuple<OmpTraitPropertyName, ExtensionList> t;
+};
+
+// trait-property ->
+//    trait-property-name | OmpClause |
+//    trait-property-expression | trait-property-extension
+// trait-property-expression ->
+//    scalar-logical-expression | scalar-integer-expression
+//
+// The parser for a logical expression will accept an integer expression,
+// and if it's not logical, it will flag an error later. The same thing
+// will happen if the scalar integer expression sees a logical expression.
+// To avoid this, parse all expressions as scalar expressions.
+struct OmpTraitProperty {
+  CharBlock source;
+  UNION_CLASS_BOILERPLATE(OmpTraitProperty);
+  std::variant<OmpTraitPropertyName, common::Indirection<OmpClause>,
+      ScalarExpr, // trait-property-expression
+      OmpTraitPropertyExtension>
+      u;
+};
+
+// trait-selector-name ->
+//    KIND |              DT       // name-list (host, nohost, +/add-def-doc)
+//    ISA |               DT       // name-list (isa_name, ... /impl-defined)
+//    ARCH |              DT       // name-list (arch_name, ...
/impl-defined) +// directive-name | C // no properties +// SIMD | C // clause-list (from declare_simd) +// // (at least simdlen, inbranch/notinbranch) +// DEVICE_NUM | T // device-number +// UID | T // unique-string-id /impl-defined +// VENDOR | I // name-list (vendor-id /add-def-doc) +// EXTENSION | I // name-list (ext_name /impl-defined) +// ATOMIC_DEFAULT_MEM_ORDER I | // value of admo +// REQUIRES | I // clause-list (from requires) +// CONDITION U // logical-expr +// +// Trait-set-selectors: +// [D]evice, [T]arget_device, [C]onstruct, [I]mplementation, [U]ser. +struct OmpTraitSelectorName { + CharBlock source; + UNION_CLASS_BOILERPLATE(OmpTraitSelectorName); + ENUM_CLASS(Value, Arch, Atomic_Default_Mem_Order, Condition, Device_Num, + Extension, Isa, Kind, Requires, Simd, Uid, Vendor) + std::variant u; +}; + +// trait-selector -> +// trait-selector-name | +// trait-selector-name ([trait-score:] trait-property, ...) +struct OmpTraitSelector { + CharBlock source; + TUPLE_CLASS_BOILERPLATE(OmpTraitSelector); + struct Properties { + TUPLE_CLASS_BOILERPLATE(Properties); + std::tuple, std::list> t; + }; + std::tuple> t; +}; + +// trait-set-selector-name -> +// CONSTRUCT | DEVICE | IMPLEMENTATION | USER | // since 5.0 +// TARGET_DEVICE // since 5.1 +struct OmpTraitSetSelectorName { + CharBlock source; + ENUM_CLASS(Value, Construct, Device, Implementation, Target_Device, User) + WRAPPER_CLASS_BOILERPLATE(OmpTraitSetSelectorName, Value); +}; + +// trait-set-selector -> +// trait-set-selector-name = {trait-selector, ...} +struct OmpTraitSetSelector { + CharBlock source; + TUPLE_CLASS_BOILERPLATE(OmpTraitSetSelector); + std::tuple> t; +}; + +// context-selector-specification -> +// trait-set-selector, ... +struct OmpContextSelectorSpecification { // Modifier + CharBlock source; + WRAPPER_CLASS_BOILERPLATE( + OmpContextSelectorSpecification, std::list); +}; +} // namespace traits + inline namespace modifier { // For uniformity, in all keyword modifiers the name of the type defined // by ENUM_CLASS is "Value", e.g. @@ -3744,6 +3891,9 @@ struct OmpVariableCategory { ENUM_CLASS(Value, Aggregate, All, Allocatable, Pointer, Scalar) WRAPPER_CLASS_BOILERPLATE(OmpVariableCategory, Value); }; + +// context-selector +using OmpContextSelector = traits::OmpContextSelectorSpecification; } // namespace modifier // --- Clauses diff --git a/flang/lib/Parser/basic-parsers.h b/flang/lib/Parser/basic-parsers.h index 515b5993d6737..1a8c14e7048f6 100644 --- a/flang/lib/Parser/basic-parsers.h +++ b/flang/lib/Parser/basic-parsers.h @@ -580,11 +580,11 @@ template inline constexpr auto defaulted(PA p) { // applyLambda(f, ...) is the same concept extended to std::function<> functors. // It is not constexpr. // -// Member function application is supported by applyMem(f, a). If the -// parser a succeeds and returns some value ax, the result is that returned -// by ax.f(). Additional parser arguments can be specified to supply their -// results to the member function call, so applyMem(f, a, b) succeeds if -// both a and b do so and returns the result of calling ax.f(std::move(bx)). +// Member function application is supported by applyMem(&C::f, a). If the +// parser a succeeds and returns some value ax of type C, the result is that +// returned by ax.f(). Additional parser arguments can be specified to supply +// their results to the member function call, so applyMem(&C::f, a, b) succeeds +// if both a and b do so and returns the result of calling ax.f(std::move(bx)). 
// Runs a sequence of parsers until one fails or all have succeeded. // Collects their results in a std::tuple...>. @@ -654,39 +654,31 @@ inline /* not constexpr */ auto applyLambda( } // Member function application -template class AMFPHelper { - using resultType = typename OBJPARSER::resultType; - -public: - using type = void (resultType::*)(typename PARSER::resultType &&...); -}; -template -using ApplicableMemberFunctionPointer = - typename AMFPHelper::type; - -template -inline auto ApplyHelperMember( - ApplicableMemberFunctionPointer mfp, - ApplyArgs &&args, std::index_sequence) -> - typename OBJPARSER::resultType { - ((*std::get<0>(args)).*mfp)(std::move(*std::get(args))...); - return std::get<0>(std::move(args)); +template +inline auto ApplyHelperMember(MEMFUNC mfp, + ApplyArgs &&args, std::index_sequence) { + return ((*std::get<0>(args)).*mfp)(std::move(*std::get(args))...); } -template class ApplyMemberFunction { - using funcType = ApplicableMemberFunctionPointer; +template +class ApplyMemberFunction { + static_assert(std::is_member_function_pointer_v); + using funcType = MEMFUNC; public: - using resultType = typename OBJPARSER::resultType; + using resultType = + std::invoke_result_t; + constexpr ApplyMemberFunction(const ApplyMemberFunction &) = default; - constexpr ApplyMemberFunction(funcType f, OBJPARSER o, PARSER... p) + constexpr ApplyMemberFunction(MEMFUNC f, OBJPARSER o, PARSER... p) : function_{f}, parsers_{o, p...} {} std::optional Parse(ParseState &state) const { ApplyArgs results; using Sequence1 = std::index_sequence_for; using Sequence2 = std::index_sequence_for; if (ApplyHelperArgs(parsers_, results, state, Sequence1{})) { - return ApplyHelperMember( + return ApplyHelperMember( function_, std::move(results), Sequence2{}); } else { return std::nullopt; @@ -698,11 +690,11 @@ template class ApplyMemberFunction { const std::tuple parsers_; }; -template +template inline constexpr auto applyMem( - ApplicableMemberFunctionPointer mfp, - const OBJPARSER &objParser, PARSER... parser) { - return ApplyMemberFunction{mfp, objParser, parser...}; + MEMFUNC memfn, const OBJPARSER &objParser, PARSER... parser) { + return ApplyMemberFunction{ + memfn, objParser, parser...}; } // As is done with function application via applyFunction() above, class diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp index 894c458a335b2..5ff91da082c85 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -153,6 +153,83 @@ static TypeDeclarationStmt makeIterSpecDecl(std::list &&names) { makeEntityList(std::move(names))); } +// --- Parsers for context traits ------------------------------------- + +static std::string nameToString(Name &&name) { return name.ToString(); } + +TYPE_PARSER(sourced(construct( // + (space >> charLiteralConstantWithoutKind) || + applyFunction(nameToString, Parser{})))) + +TYPE_PARSER(sourced(construct( // + "SCORE" >> parenthesized(scalarIntExpr)))) + +TYPE_PARSER(sourced(construct( + // Parse nested extension first. + construct( + indirect(Parser{})) || + construct( + Parser{}) || + construct(scalarExpr)))) + +TYPE_PARSER(sourced(construct( // + Parser{}, + parenthesized(nonemptySeparated( + Parser{}, ","_tok))))) + +TYPE_PARSER(sourced(construct( + // Try clause first, then extension before OmpTraitPropertyName. 
+ construct(indirect(Parser{})) || + construct(Parser{}) || + construct(Parser{}) || + construct(scalarExpr)))) + +TYPE_PARSER(construct( + "ARCH" >> pure(OmpTraitSelectorName::Value::Arch) || + "ATOMIC_DEFAULT_MEM_ORDER" >> + pure(OmpTraitSelectorName::Value::Atomic_Default_Mem_Order) || + "CONDITION" >> pure(OmpTraitSelectorName::Value::Condition) || + "DEVICE_NUM" >> pure(OmpTraitSelectorName::Value::Device_Num) || + "EXTENSION" >> pure(OmpTraitSelectorName::Value::Extension) || + "ISA" >> pure(OmpTraitSelectorName::Value::Isa) || + "KIND" >> pure(OmpTraitSelectorName::Value::Kind) || + "REQUIRES" >> pure(OmpTraitSelectorName::Value::Requires) || + "SIMD" >> pure(OmpTraitSelectorName::Value::Simd) || + "UID" >> pure(OmpTraitSelectorName::Value::Uid) || + "VENDOR" >> pure(OmpTraitSelectorName::Value::Vendor))) + +TYPE_PARSER(sourced(construct( + // Parse predefined names first (because of SIMD). + construct(Parser{}) || + construct(OmpDirectiveNameParser{})))) + +TYPE_PARSER(construct( + maybe(Parser{} / ":"_tok), + nonemptySeparated(Parser{}, ","_tok))) + +TYPE_PARSER(sourced(construct( // + Parser{}, // + maybe(parenthesized(Parser{}))))) + +TYPE_PARSER(construct( + "CONSTRUCT" >> pure(OmpTraitSetSelectorName::Value::Construct) || + "DEVICE" >> pure(OmpTraitSetSelectorName::Value::Device) || + "IMPLEMENTATION" >> pure(OmpTraitSetSelectorName::Value::Implementation) || + "TARGET_DEVICE" >> pure(OmpTraitSetSelectorName::Value::Target_Device) || + "USER" >> pure(OmpTraitSetSelectorName::Value::User))) + +TYPE_PARSER(sourced(construct( + Parser{}))) + +TYPE_PARSER(sourced(construct( // + Parser{}, + "=" >> braced(nonemptySeparated(Parser{}, ","_tok))))) + +TYPE_PARSER(sourced(construct( + nonemptySeparated(Parser{}, ","_tok)))) + +// Parser == Parser + // --- Parsers for clause modifiers ----------------------------------- TYPE_PARSER(construct(scalarIntExpr)) diff --git a/flang/lib/Parser/token-parsers.h b/flang/lib/Parser/token-parsers.h index fe6bc1f69f576..3e0c59b89d964 100644 --- a/flang/lib/Parser/token-parsers.h +++ b/flang/lib/Parser/token-parsers.h @@ -215,6 +215,10 @@ template inline constexpr auto bracketed(const PA &p) { return "[" >> p / "]"; } +template inline constexpr auto braced(const PA &p) { + return "{" >> p / "}"; +} + // Quoted character literal constants. 
struct CharLiteralChar { using resultType = std::pair; diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp index 58820476c51bc..7bf404bba2c3e 100644 --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -2067,6 +2067,38 @@ class UnparseVisitor { } // OpenMP Clauses & Directives + void Unparse(const llvm::omp::Directive &x) { + Word(llvm::omp::getOpenMPDirectiveName(x).str()); + } + void Unparse(const OmpTraitScore &x) { + Word("SCORE("); + Walk(x.v); + Put(")"); + } + void Unparse(const OmpTraitPropertyExtension &x) { + Walk(std::get(x.t)); + Put("("); + Walk(std::get(x.t), ","); + Put(")"); + } + void Unparse(const OmpTraitSelector &x) { + Walk(std::get(x.t)); + Walk(std::get>(x.t)); + } + void Unparse(const OmpTraitSelector::Properties &x) { + Put("("); + Walk(std::get>(x.t), ": "); + Walk(std::get>(x.t)); + Put(")"); + } + void Unparse(const OmpTraitSetSelector &x) { + Walk(std::get(x.t)); + Put("={"); + Walk(std::get>(x.t)); + Put("}"); + } + void Unparse(const OmpContextSelectorSpecification &x) { Walk(x.v, ", "); } + void Unparse(const OmpObject &x) { common::visit(common::visitors{ [&](const Designator &y) { Walk(y); }, @@ -2916,6 +2948,9 @@ class UnparseVisitor { WALK_NESTED_ENUM(OmpPrescriptiveness, Value) // OMP prescriptiveness WALK_NESTED_ENUM(OmpMapType, Value) // OMP map-type WALK_NESTED_ENUM(OmpMapTypeModifier, Value) // OMP map-type-modifier + WALK_NESTED_ENUM(OmpTraitSelectorName, Value) + WALK_NESTED_ENUM(OmpTraitSetSelectorName, Value) + #undef WALK_NESTED_ENUM void Unparse(const ReductionOperator::Operator x) { switch (x) { From 59ced72bc211f150518cf31606b58b11cb6ff310 Mon Sep 17 00:00:00 2001 From: Alex MacLean Date: Fri, 10 Jan 2025 09:17:44 -0800 Subject: [PATCH 082/408] [ValueTracking] Add rotate idiom to haveNoCommonBitsSet special cases (#122165) An occasional idiom for rotation is "(A << B) + (A >> (BitWidth - B))". Currently this is not well handled on targets with native funnel-shift/rotate support. Add a special case to haveNoCommonBitsSet to ensure that the addition is converted to a disjoint or in InstCombine so during instruction selection the idiom can be converted to an efficient rotation implementation. 
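For illustration, a minimal sketch of the idiom in C (the function name and
the fixed 32-bit width are illustrative only, not taken from the patch; the
idiom assumes 0 < b < 32 so neither shift is out of range):

  #include <stdint.h>

  // Rotate-left spelled with '+' instead of '|'. For 0 < b < 32 the two
  // shifted values share no set bits, so the addition can be rewritten as
  // a disjoint or and then matched to a native rotate instruction.
  uint32_t rotl32(uint32_t a, uint32_t b) {
    return (a << b) + (a >> (32 - b));
  }
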
Proof: https://alive2.llvm.org/ce/z/WdCZsN --- llvm/lib/Analysis/ValueTracking.cpp | 13 +++ llvm/test/Transforms/InstCombine/rotate.ll | 117 +++++++++++++++++++-- 2 files changed, 124 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 9a61b36efa51d..4b246c013e96f 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -229,6 +229,19 @@ static bool haveNoCommonBitsSetSpecialCases(const Value *LHS, const Value *RHS, return true; } + // Look for: (X << V) op (Y >> (BitWidth - V)) + // or (X >> V) op (Y << (BitWidth - V)) + { + const Value *V; + const APInt *R; + if (((match(RHS, m_Shl(m_Value(), m_Sub(m_APInt(R), m_Value(V)))) && + match(LHS, m_LShr(m_Value(), m_Specific(V)))) || + (match(RHS, m_LShr(m_Value(), m_Sub(m_APInt(R), m_Value(V)))) && + match(LHS, m_Shl(m_Value(), m_Specific(V))))) && + R->uge(LHS->getType()->getScalarSizeInBits())) + return true; + } + return false; } diff --git a/llvm/test/Transforms/InstCombine/rotate.ll b/llvm/test/Transforms/InstCombine/rotate.ll index bae50736de0c3..a4d4ec375954f 100644 --- a/llvm/test/Transforms/InstCombine/rotate.ll +++ b/llvm/test/Transforms/InstCombine/rotate.ll @@ -191,7 +191,7 @@ define i32 @rotl_i32(i32 %x, i32 %y) { ; CHECK-NEXT: [[SUB:%.*]] = sub i32 32, [[Y:%.*]] ; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[X:%.*]], [[Y]] ; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], [[SUB]] -; CHECK-NEXT: [[R:%.*]] = or i32 [[SHR]], [[SHL]] +; CHECK-NEXT: [[R:%.*]] = or disjoint i32 [[SHR]], [[SHL]] ; CHECK-NEXT: ret i32 [[R]] ; %sub = sub i32 32, %y @@ -208,7 +208,7 @@ define i37 @rotr_i37(i37 %x, i37 %y) { ; CHECK-NEXT: [[SUB:%.*]] = sub i37 37, [[Y:%.*]] ; CHECK-NEXT: [[SHL:%.*]] = shl i37 [[X:%.*]], [[SUB]] ; CHECK-NEXT: [[SHR:%.*]] = lshr i37 [[X]], [[Y]] -; CHECK-NEXT: [[R:%.*]] = or i37 [[SHR]], [[SHL]] +; CHECK-NEXT: [[R:%.*]] = or disjoint i37 [[SHR]], [[SHL]] ; CHECK-NEXT: ret i37 [[R]] ; %sub = sub i37 37, %y @@ -225,7 +225,7 @@ define i8 @rotr_i8_commute(i8 %x, i8 %y) { ; CHECK-NEXT: [[SUB:%.*]] = sub i8 8, [[Y:%.*]] ; CHECK-NEXT: [[SHL:%.*]] = shl i8 [[X:%.*]], [[SUB]] ; CHECK-NEXT: [[SHR:%.*]] = lshr i8 [[X]], [[Y]] -; CHECK-NEXT: [[R:%.*]] = or i8 [[SHL]], [[SHR]] +; CHECK-NEXT: [[R:%.*]] = or disjoint i8 [[SHL]], [[SHR]] ; CHECK-NEXT: ret i8 [[R]] ; %sub = sub i8 8, %y @@ -242,7 +242,7 @@ define <4 x i32> @rotl_v4i32(<4 x i32> %x, <4 x i32> %y) { ; CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> splat (i32 32), [[Y:%.*]] ; CHECK-NEXT: [[SHL:%.*]] = shl <4 x i32> [[X:%.*]], [[Y]] ; CHECK-NEXT: [[SHR:%.*]] = lshr <4 x i32> [[X]], [[SUB]] -; CHECK-NEXT: [[R:%.*]] = or <4 x i32> [[SHL]], [[SHR]] +; CHECK-NEXT: [[R:%.*]] = or disjoint <4 x i32> [[SHL]], [[SHR]] ; CHECK-NEXT: ret <4 x i32> [[R]] ; %sub = sub <4 x i32> , %y @@ -259,7 +259,7 @@ define <3 x i42> @rotr_v3i42(<3 x i42> %x, <3 x i42> %y) { ; CHECK-NEXT: [[SUB:%.*]] = sub <3 x i42> splat (i42 42), [[Y:%.*]] ; CHECK-NEXT: [[SHL:%.*]] = shl <3 x i42> [[X:%.*]], [[SUB]] ; CHECK-NEXT: [[SHR:%.*]] = lshr <3 x i42> [[X]], [[Y]] -; CHECK-NEXT: [[R:%.*]] = or <3 x i42> [[SHR]], [[SHL]] +; CHECK-NEXT: [[R:%.*]] = or disjoint <3 x i42> [[SHR]], [[SHL]] ; CHECK-NEXT: ret <3 x i42> [[R]] ; %sub = sub <3 x i42> , %y @@ -838,7 +838,7 @@ define i24 @rotl_select_weird_type(i24 %x, i24 %shamt) { ; CHECK-NEXT: [[SUB:%.*]] = sub i24 24, [[SHAMT]] ; CHECK-NEXT: [[SHR:%.*]] = lshr i24 [[X:%.*]], [[SUB]] ; CHECK-NEXT: [[SHL:%.*]] = shl i24 [[X]], [[SHAMT]] -; CHECK-NEXT: [[OR:%.*]] = or i24 [[SHL]], [[SHR]] 
+; CHECK-NEXT: [[OR:%.*]] = or disjoint i24 [[SHL]], [[SHR]] ; CHECK-NEXT: [[R:%.*]] = select i1 [[CMP]], i24 [[X]], i24 [[OR]] ; CHECK-NEXT: ret i24 [[R]] ; @@ -981,3 +981,108 @@ define i16 @check_rotate_masked_16bit(i8 %shamt, i32 %cond) { %trunc = trunc i32 %or to i16 ret i16 %trunc } + +define i32 @rotl_i32_add(i32 %x, i32 %y) { +; CHECK-LABEL: @rotl_i32_add( +; CHECK-NEXT: [[SUB:%.*]] = sub i32 32, [[Y:%.*]] +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[X:%.*]], [[Y]] +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], [[SUB]] +; CHECK-NEXT: [[R:%.*]] = or disjoint i32 [[SHR]], [[SHL]] +; CHECK-NEXT: ret i32 [[R]] +; + %sub = sub i32 32, %y + %shl = shl i32 %x, %y + %shr = lshr i32 %x, %sub + %r = add i32 %shr, %shl + ret i32 %r +} + +define i32 @rotr_i32_add(i32 %x, i32 %y) { +; CHECK-LABEL: @rotr_i32_add( +; CHECK-NEXT: [[SUB:%.*]] = sub i32 32, [[Y:%.*]] +; CHECK-NEXT: [[SHL:%.*]] = lshr i32 [[X:%.*]], [[Y]] +; CHECK-NEXT: [[SHR:%.*]] = shl i32 [[X]], [[SUB]] +; CHECK-NEXT: [[R:%.*]] = or disjoint i32 [[SHR]], [[SHL]] +; CHECK-NEXT: ret i32 [[R]] +; + %sub = sub i32 32, %y + %shl = lshr i32 %x, %y + %shr = shl i32 %x, %sub + %r = add i32 %shr, %shl + ret i32 %r +} + +define i32 @fshr_i32_add(i32 %x, i32 %y, i32 %z) { +; CHECK-LABEL: @fshr_i32_add( +; CHECK-NEXT: [[SUB:%.*]] = sub i32 32, [[Z:%.*]] +; CHECK-NEXT: [[SHL:%.*]] = lshr i32 [[X:%.*]], [[Z]] +; CHECK-NEXT: [[SHR:%.*]] = shl i32 [[Y:%.*]], [[SUB]] +; CHECK-NEXT: [[R:%.*]] = or disjoint i32 [[SHR]], [[SHL]] +; CHECK-NEXT: ret i32 [[R]] +; + %sub = sub i32 32, %z + %shl = lshr i32 %x, %z + %shr = shl i32 %y, %sub + %r = add i32 %shr, %shl + ret i32 %r +} + +define i32 @fshl_i32_add(i32 %x, i32 %y, i32 %z) { +; CHECK-LABEL: @fshl_i32_add( +; CHECK-NEXT: [[SUB:%.*]] = sub i32 32, [[Z:%.*]] +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[Y:%.*]], [[Z]] +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X:%.*]], [[SUB]] +; CHECK-NEXT: [[R:%.*]] = or disjoint i32 [[SHR]], [[SHL]] +; CHECK-NEXT: ret i32 [[R]] +; + %sub = sub i32 32, %z + %shl = shl i32 %y, %z + %shr = lshr i32 %x, %sub + %r = add i32 %shr, %shl + ret i32 %r +} + +define i32 @rotl_i32_add_greater(i32 %x, i32 %y) { +; CHECK-LABEL: @rotl_i32_add_greater( +; CHECK-NEXT: [[SUB:%.*]] = sub i32 33, [[Y:%.*]] +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[X:%.*]], [[Y]] +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], [[SUB]] +; CHECK-NEXT: [[R:%.*]] = or disjoint i32 [[SHR]], [[SHL]] +; CHECK-NEXT: ret i32 [[R]] +; + %sub = sub i32 33, %y + %shl = shl i32 %x, %y + %shr = lshr i32 %x, %sub + %r = add i32 %shr, %shl + ret i32 %r +} + +define i32 @rotr_i32_add_greater(i32 %x, i32 %y) { +; CHECK-LABEL: @rotr_i32_add_greater( +; CHECK-NEXT: [[SUB:%.*]] = sub i32 34, [[Y:%.*]] +; CHECK-NEXT: [[SHL:%.*]] = lshr i32 [[X:%.*]], [[Y]] +; CHECK-NEXT: [[SHR:%.*]] = shl i32 [[X]], [[SUB]] +; CHECK-NEXT: [[R:%.*]] = or disjoint i32 [[SHR]], [[SHL]] +; CHECK-NEXT: ret i32 [[R]] +; + %sub = sub i32 34, %y + %shl = lshr i32 %x, %y + %shr = shl i32 %x, %sub + %r = add i32 %shr, %shl + ret i32 %r +} + +define i32 @not_rotl_i32_add_less(i32 %x, i32 %y) { +; CHECK-LABEL: @not_rotl_i32_add_less( +; CHECK-NEXT: [[SUB:%.*]] = sub i32 31, [[Y:%.*]] +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[X:%.*]], [[Y]] +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], [[SUB]] +; CHECK-NEXT: [[R:%.*]] = add i32 [[SHR]], [[SHL]] +; CHECK-NEXT: ret i32 [[R]] +; + %sub = sub i32 31, %y + %shl = shl i32 %x, %y + %shr = lshr i32 %x, %sub + %r = add i32 %shr, %shl + ret i32 %r +} From 986f2ac48f369bc025a3f1830e2d5bba235be0fd Mon Sep 17 00:00:00 2001 From: Alex MacLean 
Date: Fri, 10 Jan 2025 09:18:28 -0800
Subject: [PATCH 083/408] [SLPVectorizer] minor tweaks around lambdas for compatibility with older compilers (#122348)

Older versions of MSVC do not have great lambda support and are not able
to handle uses of class data or lambdas with implicit return types in some
cases. These minor changes improve the sources' compatibility with older
MSVC and don't hurt readability either.
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index cdfec332af37a..29b81114ef705 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6908,7 +6908,7 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
     return L1.second > L2.second;
   };
 
-  auto IsMaskedGatherSupported = [&](ArrayRef<LoadInst *> Loads) {
+  auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) {
     ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
                              Loads.size());
     Align Alignment = computeCommonAlignment<LoadInst>(Values);
@@ -7075,9 +7075,10 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
     }
     SmallVector<std::pair<LoadInst *, int>> LocalLoadsDists(LoadsDists);
     SmallVector<LoadInst *> OriginalLoads(LocalLoadsDists.size());
-    transform(
-        LoadsDists, OriginalLoads.begin(),
-        [](const std::pair<LoadInst *, int> &L) { return L.first; });
+    transform(LoadsDists, OriginalLoads.begin(),
+              [](const std::pair<LoadInst *, int> &L) -> LoadInst * {
+                return L.first;
+              });
     stable_sort(LocalLoadsDists, LoadSorter);
     SmallVector<LoadInst *> Loads;
     unsigned MaxConsecutiveDistance = 0;
@@ -7304,7 +7305,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
     if (!Ref.empty() && !NonVectorized.empty() &&
         std::accumulate(
             Ref.begin(), Ref.end(), 0u,
-            [](unsigned S, ArrayRef<std::pair<LoadInst *, int>> LoadsDists) {
+            [](unsigned S,
+               ArrayRef<std::pair<LoadInst *, int>> LoadsDists) -> unsigned {
               return S + LoadsDists.size();
             }) != NonVectorized.size() &&
         IsMaskedGatherSupported(NonVectorized)) {
@@ -16979,8 +16981,9 @@ void BoUpSLP::optimizeGatherSequence() {
   // and its mask indices are the same as in the first one or undefs. E.g.
   // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
   // poison, <0, 0, 0, 0>.
- auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2, - SmallVectorImpl &NewMask) { + auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1, + Instruction *I2, + SmallVectorImpl &NewMask) { if (I1->getType() != I2->getType()) return false; auto *SI1 = dyn_cast(I1); @@ -17774,7 +17777,7 @@ bool BoUpSLP::collectValuesToDemote( BitWidth = std::max(BitWidth, BitWidth1); return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2); }; - auto FinalAnalysis = [&]() { + auto FinalAnalysis = [&, TTI = TTI]() { if (!IsProfitableToDemote) return false; bool Res = all_of( From ac2d529be31d7a670326298036a4b9b3eaed59d3 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Fri, 10 Jan 2025 12:21:52 -0500 Subject: [PATCH 084/408] [NFC][Clang] Auto generate check lines for `preserve_vec3.cl` --- clang/test/CodeGenOpenCL/preserve_vec3.cl | 124 ++++++++++++++++------ 1 file changed, 91 insertions(+), 33 deletions(-) diff --git a/clang/test/CodeGenOpenCL/preserve_vec3.cl b/clang/test/CodeGenOpenCL/preserve_vec3.cl index 19f0cdff60a9d..c84effe0c4b6e 100644 --- a/clang/test/CodeGenOpenCL/preserve_vec3.cl +++ b/clang/test/CodeGenOpenCL/preserve_vec3.cl @@ -1,3 +1,4 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // RUN: %clang_cc1 %s -emit-llvm -o - -triple spir-unknown-unknown -fpreserve-vec3-type | FileCheck %s typedef char char3 __attribute__((ext_vector_type(3))); @@ -7,71 +8,128 @@ typedef double double2 __attribute__((ext_vector_type(2))); typedef float float3 __attribute__((ext_vector_type(3))); typedef float float4 __attribute__((ext_vector_type(4))); +// CHECK-LABEL: define dso_local spir_kernel void @foo( +// CHECK-SAME: ptr addrspace(1) nocapture noundef readonly align 16 [[A:%.*]], ptr addrspace(1) nocapture noundef writeonly align 16 initializes((0, 12)) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META6:![0-9]+]] !kernel_arg_type_qual [[META7:![0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8:![0-9]+]] +// CHECK-NEXT: store <3 x float> [[TMP0]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]] +// CHECK-NEXT: ret void +// void kernel foo(global float3 *a, global float3 *b) { - // CHECK-LABEL: spir_kernel void @foo - // CHECK: %[[LOAD_A:.*]] = load <3 x float>, ptr addrspace(1) %a - // CHECK: store <3 x float> %[[LOAD_A]], ptr addrspace(1) %b *b = *a; } +// CHECK-LABEL: define dso_local spir_kernel void @float4_to_float3( +// CHECK-SAME: ptr addrspace(1) nocapture noundef writeonly align 16 initializes((0, 12)) [[A:%.*]], ptr addrspace(1) nocapture noundef readonly align 16 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META12:![0-9]+]] !kernel_arg_type_qual [[META7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]] +// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <3 x i32> +// CHECK-NEXT: store <3 x float> [[ASTYPE]], ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8]] +// CHECK-NEXT: ret void +// void kernel float4_to_float3(global float3 *a, global float4 *b) { - // CHECK-LABEL: spir_kernel void @float4_to_float3 - // CHECK: %[[LOAD_A:.*]] = load <4 x 
float>, ptr addrspace(1) %b, align 16 - // CHECK: %[[ASTYPE:.*]] = shufflevector <4 x float> %[[LOAD_A]], <4 x float> poison, <3 x i32> - // CHECK: store <3 x float> %[[ASTYPE]], ptr addrspace(1) %a, align 16 *a = __builtin_astype(*b, float3); } +// CHECK-LABEL: define dso_local spir_kernel void @float3_to_float4( +// CHECK-SAME: ptr addrspace(1) nocapture noundef readonly align 16 [[A:%.*]], ptr addrspace(1) nocapture noundef writeonly align 16 initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8]] +// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> +// CHECK-NEXT: store <4 x float> [[ASTYPE]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]] +// CHECK-NEXT: ret void +// void kernel float3_to_float4(global float3 *a, global float4 *b) { - // CHECK-LABEL: spir_kernel void @float3_to_float4 - // CHECK: %[[LOAD_A:.*]] = load <3 x float>, ptr addrspace(1) %a, align 16 - // CHECK: %[[ASTYPE:.*]] = shufflevector <3 x float> %[[LOAD_A]], <3 x float> poison, <4 x i32> - // CHECK: store <4 x float> %[[ASTYPE]], ptr addrspace(1) %b, align 16 *b = __builtin_astype(*a, float4); } +// CHECK-LABEL: define dso_local spir_kernel void @float3_to_double2( +// CHECK-SAME: ptr addrspace(1) nocapture noundef readonly align 16 [[A:%.*]], ptr addrspace(1) nocapture noundef writeonly align 16 initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META14:![0-9]+]] !kernel_arg_type_qual [[META7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8]] +// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> +// CHECK-NEXT: store <4 x float> [[TMP1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]] +// CHECK-NEXT: ret void +// void kernel float3_to_double2(global float3 *a, global double2 *b) { - // CHECK-LABEL: spir_kernel void @float3_to_double2 - // CHECK: %[[LOAD_A:.*]] = load <3 x float>, ptr addrspace(1) %a, align 16 - // CHECK: %[[ASTYPE:.*]] = shufflevector <3 x float> %[[LOAD_A]], <3 x float> poison, <4 x i32> - // CHECK: store <4 x float> %[[ASTYPE]], ptr addrspace(1) %b, align 16 *b = __builtin_astype(*a, double2); } +// CHECK-LABEL: define dso_local spir_kernel void @char8_to_short3( +// CHECK-SAME: ptr addrspace(1) nocapture noundef writeonly align 8 initializes((0, 6)) [[A:%.*]], ptr addrspace(1) nocapture noundef readonly align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META16:![0-9]+]] !kernel_arg_type_qual [[META7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]] +// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <3 x i32> +// CHECK-NEXT: store <3 x i16> [[ASTYPE]], ptr addrspace(1) [[A]], align 8, !tbaa [[TBAA8]] +// CHECK-NEXT: ret void +// void kernel char8_to_short3(global short3 *a, global char8 *b) { - // CHECK-LABEL: spir_kernel void @char8_to_short3 - // CHECK: 
%[[LOAD_B:.*]] = load <4 x i16>, ptr addrspace(1) %b - // CHECK: %[[ASTYPE:.*]] = shufflevector <4 x i16> %[[LOAD_B]], <4 x i16> poison, <3 x i32> - // CHECK: store <3 x i16> %[[ASTYPE]], ptr addrspace(1) %a, align 8 *a = __builtin_astype(*b, short3); } +// CHECK-LABEL: define dso_local spir_func void @from_char3( +// CHECK-SAME: <3 x i8> noundef [[A:%.*]], ptr addrspace(1) nocapture noundef writeonly initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i8> [[A]], <3 x i8> poison, <4 x i32> +// CHECK-NEXT: store <4 x i8> [[TMP0]], ptr addrspace(1) [[OUT]], align 4, !tbaa [[TBAA17:![0-9]+]] +// CHECK-NEXT: ret void +// void from_char3(char3 a, global int *out) { - // CHECK-LABEL: void @from_char3 - // CHECK: %[[ASTYPE:.*]] = shufflevector <3 x i8> %a, <3 x i8> poison, <4 x i32> - // CHECK: store <4 x i8> %[[ASTYPE]], ptr addrspace(1) %out *out = __builtin_astype(a, int); } +// CHECK-LABEL: define dso_local spir_func void @from_short3( +// CHECK-SAME: <3 x i16> noundef [[A:%.*]], ptr addrspace(1) nocapture noundef writeonly initializes((0, 8)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[A]], <3 x i16> poison, <4 x i32> +// CHECK-NEXT: store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[TBAA19:![0-9]+]] +// CHECK-NEXT: ret void +// void from_short3(short3 a, global long *out) { - // CHECK-LABEL: void @from_short3 - // CHECK: %[[ASTYPE:.*]] = shufflevector <3 x i16> %a, <3 x i16> poison, <4 x i32> - // CHECK: store <4 x i16> %[[ASTYPE]], ptr addrspace(1) %out *out = __builtin_astype(a, long); } +// CHECK-LABEL: define dso_local spir_func void @scalar_to_char3( +// CHECK-SAME: i32 noundef [[A:%.*]], ptr addrspace(1) nocapture noundef writeonly initializes((0, 3)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i32 [[A]] to <4 x i8> +// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <3 x i32> +// CHECK-NEXT: store <3 x i8> [[ASTYPE]], ptr addrspace(1) [[OUT]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: ret void +// void scalar_to_char3(int a, global char3 *out) { - // CHECK-LABEL: void @scalar_to_char3 - // CHECK: %[[IN_BC:.*]] = bitcast i32 %a to <4 x i8> - // CHECK: %[[ASTYPE:.*]] = shufflevector <4 x i8> %[[IN_BC]], <4 x i8> poison, <3 x i32> - // CHECK: store <3 x i8> %[[ASTYPE]], ptr addrspace(1) %out *out = __builtin_astype(a, char3); } +// CHECK-LABEL: define dso_local spir_func void @scalar_to_short3( +// CHECK-SAME: i64 noundef [[A:%.*]], ptr addrspace(1) nocapture noundef writeonly initializes((0, 6)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <4 x i16> +// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <3 x i32> +// CHECK-NEXT: store <3 x i16> [[ASTYPE]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[TBAA8]] +// CHECK-NEXT: ret void +// void scalar_to_short3(long a, global short3 *out) { - // CHECK-LABEL: void @scalar_to_short3 - // CHECK: %[[IN_BC:.*]] = bitcast i64 %a to <4 x i16> - // CHECK: %[[ASTYPE:.*]] = shufflevector <4 x i16> %[[IN_BC]], <4 x i16> poison, <3 x i32> - // CHECK: store <3 x i16> %[[ASTYPE]], ptr addrspace(1) %out *out = __builtin_astype(a, short3); } + +//. 
+// CHECK: [[META3]] = !{i32 1, i32 1} +// CHECK: [[META4]] = !{!"none", !"none"} +// CHECK: [[META5]] = !{!"float3*", !"float3*"} +// CHECK: [[META6]] = !{!"float __attribute__((ext_vector_type(3)))*", !"float __attribute__((ext_vector_type(3)))*"} +// CHECK: [[META7]] = !{!"", !""} +// CHECK: [[TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0} +// CHECK: [[META9]] = !{!"omnipotent char", [[META10:![0-9]+]], i64 0} +// CHECK: [[META10]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[META11]] = !{!"float3*", !"float4*"} +// CHECK: [[META12]] = !{!"float __attribute__((ext_vector_type(3)))*", !"float __attribute__((ext_vector_type(4)))*"} +// CHECK: [[META13]] = !{!"float3*", !"double2*"} +// CHECK: [[META14]] = !{!"float __attribute__((ext_vector_type(3)))*", !"double __attribute__((ext_vector_type(2)))*"} +// CHECK: [[META15]] = !{!"short3*", !"char8*"} +// CHECK: [[META16]] = !{!"short __attribute__((ext_vector_type(3)))*", !"char __attribute__((ext_vector_type(8)))*"} +// CHECK: [[TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0} +// CHECK: [[META18]] = !{!"int", [[META9]], i64 0} +// CHECK: [[TBAA19]] = !{[[META20:![0-9]+]], [[META20]], i64 0} +// CHECK: [[META20]] = !{!"long", [[META9]], i64 0} +//. From 20f0290af0604a5f2656533d7ecaff6ff438e261 Mon Sep 17 00:00:00 2001 From: Prashanth Date: Fri, 10 Jan 2025 22:53:42 +0530 Subject: [PATCH 085/408] [docs][libc] Add AIO documentation refering POSIX and include in build (#122219) With reference to #122006 , add a new header reference (aio.yaml) to doc --- libc/docs/CMakeLists.txt | 1 + libc/docs/headers/index.rst | 1 + libc/utils/docgen/aio.yaml | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 37 insertions(+) create mode 100644 libc/utils/docgen/aio.yaml diff --git a/libc/docs/CMakeLists.txt b/libc/docs/CMakeLists.txt index 5a3f8275bb932..f48005f303436 100644 --- a/libc/docs/CMakeLists.txt +++ b/libc/docs/CMakeLists.txt @@ -32,6 +32,7 @@ if (SPHINX_FOUND) # math.h), those should be omitted and exist statically in # libc/docs/headers/. list(APPEND docgen_list + aio arpa/inet assert ctype diff --git a/libc/docs/headers/index.rst b/libc/docs/headers/index.rst index 07ab6dd9b2674..70fb04089d9a2 100644 --- a/libc/docs/headers/index.rst +++ b/libc/docs/headers/index.rst @@ -4,6 +4,7 @@ Implementation Status .. toctree:: :maxdepth: 1 + aio arpa/inet assert complex diff --git a/libc/utils/docgen/aio.yaml b/libc/utils/docgen/aio.yaml new file mode 100644 index 0000000000000..2c381558676a1 --- /dev/null +++ b/libc/utils/docgen/aio.yaml @@ -0,0 +1,35 @@ +macros: + AIO_ALLDONE: + in-latest-posix: '' + AIO_CANCELED: + in-latest-posix: '' + AIO_NOTCANCELED: + in-latest-posix: '' + LIO_NOP: + in-latest-posix: '' + LIO_NOWAIT: + in-latest-posix: '' + LIO_READ: + in-latest-posix: '' + LIO_WAIT: + in-latest-posix: '' + LIO_WRITE: + in-latest-posix: '' + +functions: + aio_cancel: + in-latest-posix: '' + aio_error: + in-latest-posix: '' + aio_fsync: + in-latest-posix: '' + aio_read: + in-latest-posix: '' + aio_return: + in-latest-posix: '' + aio_suspend: + in-latest-posix: '' + aio_write: + in-latest-posix: '' + lio_listio: + in-latest-posix: '' From dff7ef2353fec9f1006895c0e99bde704296eaa9 Mon Sep 17 00:00:00 2001 From: Prashanth Date: Fri, 10 Jan 2025 22:54:02 +0530 Subject: [PATCH 086/408] [libc][docs] Add netinet/in header documentation by referring to POSIX standards (#122411) This pull request introduces the following changes to the project with reference to ( #122006 ): 1. 
**Documentation Update**: - Added a new YAML file `in.yaml` to document network protocol and address macros. - The `in.yaml` file includes the following macros: - `IPPROTO_IP` - `IPPROTO_IPV6` - `IPPROTO_ICMP` - `IPPROTO_RAW` - `IPPROTO_TCP` - `IPPROTO_UDP` - `INADDR_ANY` - `INADDR_BROADCAST` - `INET_ADDRSTRLEN` - `IPV6_JOIN_GROUP` - `IPV6_LEAVE_GROUP` - `IPV6_MULTICAST_HOPS` - `IPV6_MULTICAST_IF` - `IPV6_MULTICAST_LOOP` - `IPV6_UNICAST_HOPS` - `IPV6_V6ONLY` - `IN6_IS_ADDR_UNSPECIFIED` - `IN6_IS_ADDR_LOOPBACK` - `IN6_IS_ADDR_MULTICAST` - `IN6_IS_ADDR_LINKLOCAL` - `IN6_IS_ADDR_SITELOCAL` - `IN6_IS_ADDR_V4MAPPED` - `IN6_IS_ADDR_V4COMPAT` - `IN6_IS_ADDR_MC_NODELOCAL` - `IN6_IS_ADDR_MC_LINKLOCAL` - `IN6_IS_ADDR_MC_SITELOCAL` - `IN6_IS_ADDR_MC_ORGLOCAL` - `IN6_IS_ADDR_MC_GLOBAL` _I believe, all these macros are necessary and should be documented._ 2. **CMake Configuration Update**: - Updated the `CMakeLists.txt` file to create the necessary directory for the `netinet` headers. - Included the `netinet/in` documentation in the Sphinx build configuration. 3. **Index Update**: - Updated the `index.rst` file to include a reference to the newly added `netinet/in` documentation. **Purpose**: - This pull request adds documentation for network protocol and address macros in the `netinet/in` header. - Updates the CMake configuration to support the new documentation. **Testing**: - Verified that the new YAML file is correctly referenced in the `index.rst`. - Ensured that the documentation builds without errors and includes the new network interface documentation. This pull request ensures that the `netinet/in` header macros are documented and included in the project's documentation, and updates the CMake configuration to support these changes. --- libc/docs/CMakeLists.txt | 2 ++ libc/docs/headers/index.rst | 1 + libc/utils/docgen/netinet/in.yaml | 59 +++++++++++++++++++++++++++++++ 3 files changed, 62 insertions(+) create mode 100644 libc/utils/docgen/netinet/in.yaml diff --git a/libc/docs/CMakeLists.txt b/libc/docs/CMakeLists.txt index f48005f303436..9ab1d90a682ec 100644 --- a/libc/docs/CMakeLists.txt +++ b/libc/docs/CMakeLists.txt @@ -18,6 +18,7 @@ if (SPHINX_FOUND) # shell. 
file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/headers/arpa/" + "${CMAKE_CURRENT_BINARY_DIR}/headers/netinet/" "${CMAKE_CURRENT_BINARY_DIR}/headers/sys/" ) @@ -41,6 +42,7 @@ if (SPHINX_FOUND) float inttypes locale + netinet/in setjmp signal stdbit diff --git a/libc/docs/headers/index.rst b/libc/docs/headers/index.rst index 70fb04089d9a2..f6f62826600c8 100644 --- a/libc/docs/headers/index.rst +++ b/libc/docs/headers/index.rst @@ -15,6 +15,7 @@ Implementation Status inttypes locale math/index.rst + netinet/in search setjmp signal diff --git a/libc/utils/docgen/netinet/in.yaml b/libc/utils/docgen/netinet/in.yaml new file mode 100644 index 0000000000000..69cab90181841 --- /dev/null +++ b/libc/utils/docgen/netinet/in.yaml @@ -0,0 +1,59 @@ +macros: + IPPROTO_IP: + in-latest-posix: '' + IPPROTO_IPV6: + in-latest-posix: '' + IPPROTO_ICMP: + in-latest-posix: '' + IPPROTO_RAW: + in-latest-posix: '' + IPPROTO_TCP: + in-latest-posix: '' + IPPROTO_UDP: + in-latest-posix: '' + INADDR_ANY: + in-latest-posix: '' + INET6_ADDRSTRLEN: + in-latest-posix: '' + INADDR_BROADCAST: + in-latest-posix: '' + INET_ADDRSTRLEN: + in-latest-posix: '' + IPV6_JOIN_GROUP: + in-latest-posix: '' + IPV6_LEAVE_GROUP: + in-latest-posix: '' + IPV6_MULTICAST_HOPS: + in-latest-posix: '' + IPV6_MULTICAST_IF: + in-latest-posix: '' + IPV6_MULTICAST_LOOP: + in-latest-posix: '' + IPV6_UNICAST_HOPS: + in-latest-posix: '' + IPV6_V6ONLY: + in-latest-posix: '' + IN6_IS_ADDR_UNSPECIFIED: + in-latest-posix: '' + IN6_IS_ADDR_LOOPBACK: + in-latest-posix: '' + IN6_IS_ADDR_MULTICAST: + in-latest-posix: '' + IN6_IS_ADDR_LINKLOCAL: + in-latest-posix: '' + IN6_IS_ADDR_SITELOCAL: + in-latest-posix: '' + IN6_IS_ADDR_V4MAPPED: + in-latest-posix: '' + IN6_IS_ADDR_V4COMPAT: + in-latest-posix: '' + IN6_IS_ADDR_MC_NODELOCAL: + in-latest-posix: '' + IN6_IS_ADDR_MC_LINKLOCAL: + in-latest-posix: '' + IN6_IS_ADDR_MC_SITELOCAL: + in-latest-posix: '' + IN6_IS_ADDR_MC_ORGLOCAL: + in-latest-posix: '' + IN6_IS_ADDR_MC_GLOBAL: + in-latest-posix: '' \ No newline at end of file From 0afee850de1ebe9af71bdf727d906fefa78ad68c Mon Sep 17 00:00:00 2001 From: Prashanth Date: Fri, 10 Jan 2025 22:57:40 +0530 Subject: [PATCH 087/408] [libc][docs] Add net/if.h documentation by referring to POSIX standards (#122406) This pull request introduces the following changes to the project with reference to issue ( #122006 ): 1. **Documentation Update**: - Added a new YAML file `if.yaml` under `net` to document network interface functions and macros. - The `if.yaml` file includes the following functions and macros: - Functions: - `if_freenameindex` - `if_indextoname` - `if_nameindex` - `if_nametoindex` - Macros: - `IF_NAMESIZE` 2. **CMake Configuration Update**: - Updated the `CMakeLists.txt` file to create the necessary directory for the `net` headers. - Included the `net/if` documentation in the Sphinx build configuration. 3. **Index Update**: - Updated the `index.rst` file to include a reference to the newly added `net/if` documentation. **Purpose**: - This pull request adds documentation for network interface functions and macros, ensuring they are included in the project's documentation. - Updates the CMake configuration to support the new documentation. **Testing**: - Verified that the new YAML file is correctly referenced in the `index.rst`. - Ensured that the documentation builds without errors and includes the new network interface documentation. 
Co-authored-by: Nick Desaulniers --- libc/docs/CMakeLists.txt | 2 ++ libc/docs/headers/index.rst | 1 + libc/utils/docgen/net/if.yaml | 13 +++++++++++++ 3 files changed, 16 insertions(+) create mode 100644 libc/utils/docgen/net/if.yaml diff --git a/libc/docs/CMakeLists.txt b/libc/docs/CMakeLists.txt index 9ab1d90a682ec..4fa621fb2a051 100644 --- a/libc/docs/CMakeLists.txt +++ b/libc/docs/CMakeLists.txt @@ -18,6 +18,7 @@ if (SPHINX_FOUND) # shell. file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/headers/arpa/" + "${CMAKE_CURRENT_BINARY_DIR}/headers/net/" "${CMAKE_CURRENT_BINARY_DIR}/headers/netinet/" "${CMAKE_CURRENT_BINARY_DIR}/headers/sys/" ) @@ -42,6 +43,7 @@ if (SPHINX_FOUND) float inttypes locale + net/if netinet/in setjmp signal diff --git a/libc/docs/headers/index.rst b/libc/docs/headers/index.rst index f6f62826600c8..2295a8cc8853c 100644 --- a/libc/docs/headers/index.rst +++ b/libc/docs/headers/index.rst @@ -15,6 +15,7 @@ Implementation Status inttypes locale math/index.rst + net/if netinet/in search setjmp diff --git a/libc/utils/docgen/net/if.yaml b/libc/utils/docgen/net/if.yaml new file mode 100644 index 0000000000000..085d27b2bb94f --- /dev/null +++ b/libc/utils/docgen/net/if.yaml @@ -0,0 +1,13 @@ +functions: + if_freenameindex: + in-latest-posix: '' + if_indextoname: + in-latest-posix: '' + if_nameindex: + in-latest-posix: '' + if_nametoindex: + in-latest-posix: '' + +macros: + IF_NAMESIZE: + in-latest-posix: '' \ No newline at end of file From beba4b08f72152abbb7d26df024f0d9338a7038b Mon Sep 17 00:00:00 2001 From: Slava Zakharin Date: Fri, 10 Jan 2025 09:28:03 -0800 Subject: [PATCH 088/408] [flang][NFC] Removed unneeded engineering option. (#122305) --- .../HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp | 8 -------- flang/test/HLFIR/simplify-hlfir-intrinsics-sum.fir | 2 +- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp index bf3d261e7e883..314ced8679521 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp @@ -28,11 +28,6 @@ namespace hlfir { #include "flang/Optimizer/HLFIR/Passes.h.inc" } // namespace hlfir -static llvm::cl::opt - simplifySum("flang-simplify-hlfir-sum", - llvm::cl::desc("Expand hlfir.sum into an inline sequence"), - llvm::cl::init(true)); - namespace { class TransposeAsElementalConversion @@ -109,9 +104,6 @@ class SumAsElementalConversion : public mlir::OpRewritePattern { llvm::LogicalResult matchAndRewrite(hlfir::SumOp sum, mlir::PatternRewriter &rewriter) const override { - if (!simplifySum) - return rewriter.notifyMatchFailure(sum, "SUM simplification is disabled"); - hlfir::Entity array = hlfir::Entity{sum.getArray()}; bool isTotalReduction = hlfir::Entity{sum}.getRank() == 0; mlir::Value dim = sum.getDim(); diff --git a/flang/test/HLFIR/simplify-hlfir-intrinsics-sum.fir b/flang/test/HLFIR/simplify-hlfir-intrinsics-sum.fir index 58a2144947b14..d1915952d77ea 100644 --- a/flang/test/HLFIR/simplify-hlfir-intrinsics-sum.fir +++ b/flang/test/HLFIR/simplify-hlfir-intrinsics-sum.fir @@ -1,4 +1,4 @@ -// RUN: fir-opt --simplify-hlfir-intrinsics -flang-simplify-hlfir-sum %s | FileCheck %s +// RUN: fir-opt --simplify-hlfir-intrinsics %s | FileCheck %s // box with known extents func.func @sum_box_known_extents(%arg0: !fir.box>) -> !hlfir.expr<2xi32> { From cc88a5e61578e58afdd8ef4e9f1b7cd10d77fba3 Mon Sep 17 00:00:00 2001 From: Fangrui 
Song Date: Fri, 10 Jan 2025 09:32:03 -0800 Subject: [PATCH 089/408] [lld-macho,NFC] Switch to increasing priorities --order_file, call graph profile, and BalancedPartitioning currently build the section order vector by decreasing priority (from SIZE_MAX to 0). However, it's conventional to use an increasing key (see OutputSection::inputOrder). Switch to increasing priorities, remove the global variable highestAvailablePriority, and remove the highestAvailablePriority parameter from BPSectionOrderer. Change size_t to int. This improves consistency with the ELF and COFF ports. The ELF port utilizes negative priorities for --symbol-ordering-file and call graph profile, and non-negative priorities for --shuffle-sections (no Mach-O counterpart yet). Pull Request: https://github.com/llvm/llvm-project/pull/121727 --- lld/Common/BPSectionOrdererBase.cpp | 12 +++--- lld/MachO/BPSectionOrderer.cpp | 11 +++--- lld/MachO/BPSectionOrderer.h | 5 +-- lld/MachO/SectionPriorities.cpp | 39 ++++++++----------- lld/MachO/SectionPriorities.h | 16 +++----- lld/MachO/Writer.cpp | 4 +- lld/include/lld/Common/BPSectionOrdererBase.h | 8 ++-- 7 files changed, 41 insertions(+), 54 deletions(-) diff --git a/lld/Common/BPSectionOrdererBase.cpp b/lld/Common/BPSectionOrdererBase.cpp index 75be4f6aa9bc0..7d26a5fb84483 100644 --- a/lld/Common/BPSectionOrdererBase.cpp +++ b/lld/Common/BPSectionOrdererBase.cpp @@ -96,11 +96,10 @@ static SmallVector> getUnsForCompression( return sectionUns; } -llvm::DenseMap +llvm::DenseMap BPSectionBase::reorderSectionsByBalancedPartitioning( - size_t &highestAvailablePriority, llvm::StringRef profilePath, - bool forFunctionCompression, bool forDataCompression, - bool compressionSortStartupFunctions, bool verbose, + llvm::StringRef profilePath, bool forFunctionCompression, + bool forDataCompression, bool compressionSortStartupFunctions, bool verbose, SmallVector> &inputSections) { TimeTraceScope timeScope("Setup Balanced Partitioning"); SmallVector sections; @@ -364,8 +363,9 @@ BPSectionBase::reorderSectionsByBalancedPartitioning( } } - DenseMap sectionPriorities; + DenseMap sectionPriorities; + int prio = -orderedSections.size(); for (const auto *isec : orderedSections) - sectionPriorities[isec] = --highestAvailablePriority; + sectionPriorities[isec] = prio++; return sectionPriorities; } diff --git a/lld/MachO/BPSectionOrderer.cpp b/lld/MachO/BPSectionOrderer.cpp index 0ffbf16007fda..18c8aad58344f 100644 --- a/lld/MachO/BPSectionOrderer.cpp +++ b/lld/MachO/BPSectionOrderer.cpp @@ -15,9 +15,8 @@ using namespace llvm; using namespace lld::macho; -DenseMap lld::macho::runBalancedPartitioning( - size_t &highestAvailablePriority, StringRef profilePath, - bool forFunctionCompression, bool forDataCompression, +DenseMap lld::macho::runBalancedPartitioning( + StringRef profilePath, bool forFunctionCompression, bool forDataCompression, bool compressionSortStartupFunctions, bool verbose) { SmallVector> sections; @@ -34,10 +33,10 @@ DenseMap lld::macho::runBalancedPartitioning( } auto reorderedSections = BPSectionBase::reorderSectionsByBalancedPartitioning( - highestAvailablePriority, profilePath, forFunctionCompression, - forDataCompression, compressionSortStartupFunctions, verbose, sections); + profilePath, forFunctionCompression, forDataCompression, + compressionSortStartupFunctions, verbose, sections); - DenseMap result; + DenseMap result; for (const auto &[sec, priority] : reorderedSections) { if (auto *machoSection = dyn_cast(sec)) { result.try_emplace( diff --git a/lld/MachO/BPSectionOrderer.h
b/lld/MachO/BPSectionOrderer.h index 8ba911fcc546b..4facb652d4c87 100644 --- a/lld/MachO/BPSectionOrderer.h +++ b/lld/MachO/BPSectionOrderer.h @@ -145,9 +145,8 @@ class BPSectionMacho : public BPSectionBase { /// /// It is important that .subsections_via_symbols is used to ensure functions /// and data are in their own sections and thus can be reordered. -llvm::DenseMap -runBalancedPartitioning(size_t &highestAvailablePriority, - llvm::StringRef profilePath, +llvm::DenseMap +runBalancedPartitioning(llvm::StringRef profilePath, bool forFunctionCompression, bool forDataCompression, bool compressionSortStartupFunctions, bool verbose); diff --git a/lld/MachO/SectionPriorities.cpp b/lld/MachO/SectionPriorities.cpp index 0a15112c1250d..7a4a5d8465f64 100644 --- a/lld/MachO/SectionPriorities.cpp +++ b/lld/MachO/SectionPriorities.cpp @@ -38,9 +38,6 @@ using namespace lld::macho; PriorityBuilder macho::priorityBuilder; namespace { - -size_t highestAvailablePriority = std::numeric_limits::max(); - struct Edge { int from; uint64_t weight; @@ -67,7 +64,7 @@ class CallGraphSort { public: CallGraphSort(const MapVector &profile); - DenseMap run(); + DenseMap run(); private: std::vector clusters; @@ -157,7 +154,7 @@ static void mergeClusters(std::vector &cs, Cluster &into, int intoIdx, // Group InputSections into clusters using the Call-Chain Clustering heuristic // then sort the clusters by density. -DenseMap CallGraphSort::run() { +DenseMap CallGraphSort::run() { const uint64_t maxClusterSize = target->getPageSize(); // Cluster indices sorted by density. @@ -205,16 +202,14 @@ DenseMap CallGraphSort::run() { return clusters[a].getDensity() > clusters[b].getDensity(); }); - DenseMap orderMap; + DenseMap orderMap; // Sections will be sorted by decreasing order. Absent sections will have // priority 0 and be placed at the end of sections. - // NB: This is opposite from COFF/ELF to be compatible with the existing - // order-file code. 
- int curOrder = highestAvailablePriority; + int curOrder = -clusters.size(); for (int leader : sorted) { for (int i = leader;;) { - orderMap[sections[i]] = curOrder--; + orderMap[sections[i]] = curOrder++; i = clusters[i].next; if (i == leader) break; @@ -250,7 +245,7 @@ DenseMap CallGraphSort::run() { return orderMap; } -std::optional +std::optional macho::PriorityBuilder::getSymbolPriority(const Defined *sym) { if (sym->isAbsolute()) return std::nullopt; @@ -270,7 +265,7 @@ macho::PriorityBuilder::getSymbolPriority(const Defined *sym) { else filename = saver().save(path::filename(f->archiveName) + "(" + path::filename(f->getName()) + ")"); - return std::max(entry.objectFiles.lookup(filename), entry.anyObjectFile); + return std::min(entry.objectFiles.lookup(filename), entry.anyObjectFile); } void macho::PriorityBuilder::extractCallGraphProfile() { @@ -302,6 +297,7 @@ void macho::PriorityBuilder::parseOrderFile(StringRef path) { return; } + int prio = std::numeric_limits::min(); MemoryBufferRef mbref = *buffer; for (StringRef line : args::getLines(mbref)) { StringRef objectFile, symbol; @@ -339,25 +335,22 @@ void macho::PriorityBuilder::parseOrderFile(StringRef path) { if (!symbol.empty()) { SymbolPriorityEntry &entry = priorities[symbol]; if (!objectFile.empty()) - entry.objectFiles.insert( - std::make_pair(objectFile, highestAvailablePriority)); + entry.objectFiles.insert(std::make_pair(objectFile, prio)); else - entry.anyObjectFile = - std::max(entry.anyObjectFile, highestAvailablePriority); + entry.anyObjectFile = std::min(entry.anyObjectFile, prio); } - --highestAvailablePriority; + ++prio; } } -DenseMap +DenseMap macho::PriorityBuilder::buildInputSectionPriorities() { - DenseMap sectionPriorities; + DenseMap sectionPriorities; if (config->bpStartupFunctionSort || config->bpFunctionOrderForCompression || config->bpDataOrderForCompression) { TimeTraceScope timeScope("Balanced Partitioning Section Orderer"); sectionPriorities = runBalancedPartitioning( - highestAvailablePriority, config->bpStartupFunctionSort ? config->irpgoProfilePath : "", config->bpFunctionOrderForCompression, config->bpDataOrderForCompression, @@ -378,11 +371,11 @@ macho::PriorityBuilder::buildInputSectionPriorities() { return sectionPriorities; auto addSym = [&](const Defined *sym) { - std::optional symbolPriority = getSymbolPriority(sym); + std::optional symbolPriority = getSymbolPriority(sym); if (!symbolPriority) return; - size_t &priority = sectionPriorities[sym->isec()]; - priority = std::max(priority, *symbolPriority); + int &priority = sectionPriorities[sym->isec()]; + priority = std::min(priority, *symbolPriority); }; // TODO: Make sure this handles weak symbols correctly. diff --git a/lld/MachO/SectionPriorities.h b/lld/MachO/SectionPriorities.h index 83cfe354e7263..44fb101990c51 100644 --- a/lld/MachO/SectionPriorities.h +++ b/lld/MachO/SectionPriorities.h @@ -53,24 +53,20 @@ class PriorityBuilder { // // Each section gets assigned the priority of the highest-priority symbol it // contains. - llvm::DenseMap buildInputSectionPriorities(); + llvm::DenseMap buildInputSectionPriorities(); private: - // The symbol with the highest priority should be ordered first in the output - // section (modulo input section contiguity constraints). Using priority - // (highest first) instead of order (lowest first) has the convenient property - // that the default-constructed zero priority -- for symbols/sections without - // a user-defined order -- naturally ends up putting them at the end of the - // output. 
+ // The symbol with the smallest priority should be ordered first in the output + // section (modulo input section contiguity constraints). struct SymbolPriorityEntry { // The priority given to a matching symbol, regardless of which object file // it originated from. - size_t anyObjectFile = 0; + int anyObjectFile = 0; // The priority given to a matching symbol from a particular object file. - llvm::DenseMap objectFiles; + llvm::DenseMap objectFiles; }; - std::optional getSymbolPriority(const Defined *sym); + std::optional getSymbolPriority(const Defined *sym); llvm::DenseMap priorities; llvm::MapVector callGraphProfile; }; diff --git a/lld/MachO/Writer.cpp b/lld/MachO/Writer.cpp index 6a1dd0ae7ecaf..bec980e18e18b 100644 --- a/lld/MachO/Writer.cpp +++ b/lld/MachO/Writer.cpp @@ -975,7 +975,7 @@ static void sortSegmentsAndSections() { TimeTraceScope timeScope("Sort segments and sections"); sortOutputSegments(); - DenseMap isecPriorities = + DenseMap isecPriorities = priorityBuilder.buildInputSectionPriorities(); uint32_t sectionIndex = 0; @@ -1008,7 +1008,7 @@ static void sortSegmentsAndSections() { if (auto *merged = dyn_cast(osec)) { llvm::stable_sort( merged->inputs, [&](InputSection *a, InputSection *b) { - return isecPriorities.lookup(a) > isecPriorities.lookup(b); + return isecPriorities.lookup(a) < isecPriorities.lookup(b); }); } } diff --git a/lld/include/lld/Common/BPSectionOrdererBase.h b/lld/include/lld/Common/BPSectionOrdererBase.h index e2cb41f69cc68..bd5bd638ccd2a 100644 --- a/lld/include/lld/Common/BPSectionOrdererBase.h +++ b/lld/include/lld/Common/BPSectionOrdererBase.h @@ -66,11 +66,11 @@ class BPSectionBase { /// Reorders sections using balanced partitioning algorithm based on profile /// data. - static llvm::DenseMap + static llvm::DenseMap reorderSectionsByBalancedPartitioning( - size_t &highestAvailablePriority, llvm::StringRef profilePath, - bool forFunctionCompression, bool forDataCompression, - bool compressionSortStartupFunctions, bool verbose, + llvm::StringRef profilePath, bool forFunctionCompression, + bool forDataCompression, bool compressionSortStartupFunctions, + bool verbose, llvm::SmallVector> &inputSections); }; From 681c83a2f99431d4bb9d4975a08771320e30a80b Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 10 Jan 2025 08:13:46 -0800 Subject: [PATCH 090/408] [SLP] Fix mask generation after cost estimation When estimating the cost of entry shuffles for buildvectors, we need to rebuild the original mask, not the generated submask used for the subregister analysis. Fixes #122430 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 29 ++++++++++++++----- .../SLPVectorizer/X86/bv-shuffle-mask.ll | 4 +-- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 29b81114ef705..a3f47a273d2ec 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -13445,14 +13445,15 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( for_each(SubMask, [&](int &Idx) { if (Idx == PoisonMaskElem) return; - Idx = (Idx % VF) - (MinElement % VF) + + Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF + (Idx >= static_cast(VF) ?
NewVF : 0); }); - VF = NewVF; + } else { + NewVF = VF; } constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - auto *VecTy = getWidenedType(VL.front()->getType(), VF); + auto *VecTy = getWidenedType(VL.front()->getType(), NewVF); auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size()); auto GetShuffleCost = [&, &TTI = *TTI](ArrayRef Mask, @@ -13477,7 +13478,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( APInt DemandedElts = APInt::getAllOnes(SubMask.size()); bool IsIdentity = true; for (auto [I, Idx] : enumerate(FirstMask)) { - if (Idx >= static_cast(VF)) { + if (Idx >= static_cast(NewVF)) { Idx = PoisonMaskElem; } else { DemandedElts.clearBit(I); @@ -13500,12 +13501,12 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( APInt DemandedElts = APInt::getAllOnes(SubMask.size()); bool IsIdentity = true; for (auto [I, Idx] : enumerate(SecondMask)) { - if (Idx < static_cast(VF) && Idx >= 0) { + if (Idx < static_cast(NewVF) && Idx >= 0) { Idx = PoisonMaskElem; } else { DemandedElts.clearBit(I); if (Idx != PoisonMaskElem) { - Idx -= VF; + Idx -= NewVF; IsIdentity &= static_cast(I) == Idx; } } @@ -13525,12 +13526,24 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( /*Extract=*/false, CostKind); const TreeEntry *BestEntry = nullptr; if (FirstShuffleCost < ShuffleCost) { - copy(FirstMask, std::next(Mask.begin(), Part * VL.size())); + std::for_each(std::next(Mask.begin(), Part * VL.size()), + std::next(Mask.begin(), (Part + 1) * VL.size()), + [&](int &Idx) { + if (Idx >= static_cast(VF)) + Idx = PoisonMaskElem; + }); BestEntry = Entries.front(); ShuffleCost = FirstShuffleCost; } if (SecondShuffleCost < ShuffleCost) { - copy(SecondMask, std::next(Mask.begin(), Part * VL.size())); + std::for_each(std::next(Mask.begin(), Part * VL.size()), + std::next(Mask.begin(), (Part + 1) * VL.size()), + [&](int &Idx) { + if (Idx < static_cast(VF)) + Idx = PoisonMaskElem; + else + Idx -= VF; + }); BestEntry = Entries[1]; ShuffleCost = SecondShuffleCost; } diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll b/llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll index 469421b660f31..766916fe71f35 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll @@ -10,8 +10,8 @@ define i16 @test(i16 %v1, i16 %v2) { ; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i16> [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = and <4 x i16> [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i16> [[TMP5]], i16 [[V2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i16> [[TMP5]], i16 [[V1]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i16> [[TMP6]], <2 x i16> poison, <4 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i16> [[TMP4]], zeroinitializer From 6f53886a9a5e65136619ada7713f31942a1cc1fa Mon Sep 17 00:00:00 2001 From: Raphael Moreira Zinsly Date: Fri, 10 Jan 2025 14:48:21 -0300 Subject: [PATCH 091/408] [RISCV] Add stack clash vector support (#119458) Use the probe loop structure to allocate stack space for vector objects as well. We add the pseudo instruction RISCV::PROBED_STACKALLOC_RVV to differentiate from the normal loop.
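For clarity, the loop that RISCV::PROBED_STACKALLOC_RVV expands to behaves like the following C++ sketch (our illustration, not code from this patch; probe_rvv and its parameter names are invented). TargetReg (t1) holds NumOfVReg * VLENB, which is only known at run time:

    #include <cstdint>

    // Illustrative sketch only, not part of the patch: one guard page is
    // touched per probe-size step; the first iteration may overshoot when
    // the scalable size is smaller than the probe size.
    void probe_rvv(uintptr_t &sp, int64_t target) {
      const int64_t probe_size = 4096;                  // t2
      int64_t remaining = target;                       // t1
      do {
        sp -= probe_size;                               // sub sp, sp, t2
        *reinterpret_cast<volatile uint64_t *>(sp) = 0; // sd zero, 0(sp)
        remaining -= probe_size;                        // sub t1, t1, t2
      } while (remaining >= probe_size);                // bge t1, t2, loop
      sp -= remaining;                                  // sub sp, sp, t1
    }

The final subtraction uses the possibly negative remainder to give back any overshoot, which is why the RVV loop exits on a signed BGE comparison rather than the BNE used by the fixed-size probe loop.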
--- llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 144 ++++++- llvm/lib/Target/RISCV/RISCVFrameLowering.h | 5 + llvm/lib/Target/RISCV/RISCVInstrInfo.td | 4 + .../RISCV/rvv/access-fixed-objects-by-rvv.ll | 46 ++ .../CodeGen/RISCV/rvv/stack-probing-rvv.ll | 400 ++++++++++++++++++ .../CodeGen/RISCV/stack-clash-prologue.ll | 68 +++ 6 files changed, 644 insertions(+), 23 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/rvv/stack-probing-rvv.ll diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index f036f14b189ab..ed3ec31028067 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -499,6 +499,54 @@ getPushOrLibCallsSavedInfo(const MachineFunction &MF, return PushOrLibCallsCSI; } +void RISCVFrameLowering::allocateAndProbeStackForRVV( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, int64_t Amount, + MachineInstr::MIFlag Flag, bool EmitCFI) const { + assert(Amount != 0 && "Did not need to adjust stack pointer for RVV."); + + // Emit a variable-length allocation probing loop. + + // Get VLEN in TargetReg + const RISCVInstrInfo *TII = STI.getInstrInfo(); + Register TargetReg = RISCV::X6; + uint32_t NumOfVReg = Amount / (RISCV::RVVBitsPerBlock / 8); + BuildMI(MBB, MBBI, DL, TII->get(RISCV::PseudoReadVLENB), TargetReg) + .setMIFlag(Flag); + TII->mulImm(MF, MBB, MBBI, DL, TargetReg, NumOfVReg, Flag); + + if (EmitCFI) { + // Set the CFA register to TargetReg. + unsigned Reg = STI.getRegisterInfo()->getDwarfRegNum(TargetReg, true); + unsigned CFIIndex = + MF.addFrameInst(MCCFIInstruction::cfiDefCfa(nullptr, Reg, -Amount)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); + } + + // It will be expanded to a probe loop in `inlineStackProbe`. + BuildMI(MBB, MBBI, DL, TII->get(RISCV::PROBED_STACKALLOC_RVV)) + .addReg(SPReg) + .addReg(TargetReg); + + if (EmitCFI) { + // Set the CFA register back to SP. + unsigned Reg = STI.getRegisterInfo()->getDwarfRegNum(SPReg, true); + unsigned CFIIndex = + MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, Reg)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); + } + + // SUB SP, SP, T1 + BuildMI(MBB, MBBI, DL, TII->get(RISCV::SUB), SPReg) + .addReg(SPReg) + .addReg(TargetReg) + .setMIFlag(Flag); +} + static void appendScalableVectorExpression(const TargetRegisterInfo &TRI, SmallVectorImpl &Expr, int FixedOffset, int ScalableOffset, @@ -857,10 +905,10 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, .setMIFlag(MachineInstr::FrameSetup); } + uint64_t SecondSPAdjustAmount = 0; // Emit the second SP adjustment after saving callee saved registers. if (FirstSPAdjustAmount) { - uint64_t SecondSPAdjustAmount = - getStackSizeWithRVVPadding(MF) - FirstSPAdjustAmount; + SecondSPAdjustAmount = getStackSizeWithRVVPadding(MF) - FirstSPAdjustAmount; assert(SecondSPAdjustAmount > 0 && "SecondSPAdjustAmount should be greater than zero"); @@ -870,11 +918,16 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, } if (RVVStackSize) { - // We must keep the stack pointer aligned through any intermediate - // updates. 
- RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg, - StackOffset::getScalable(-RVVStackSize), - MachineInstr::FrameSetup, getStackAlign()); + if (NeedProbe) { + allocateAndProbeStackForRVV(MF, MBB, MBBI, DL, RVVStackSize, + MachineInstr::FrameSetup, !hasFP(MF)); + } else { + // We must keep the stack pointer aligned through any intermediate + // updates. + RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg, + StackOffset::getScalable(-RVVStackSize), + MachineInstr::FrameSetup, getStackAlign()); + } if (!hasFP(MF)) { // Emit .cfi_def_cfa_expression "sp + StackSize + RVVStackSize * vlenb". @@ -914,6 +967,19 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, .addImm(ShiftAmount) .setMIFlag(MachineInstr::FrameSetup); } + if (NeedProbe && RVVStackSize == 0) { + // Do a probe if the align + size allocated just passed the probe size + // and was not yet probed. + if (SecondSPAdjustAmount < ProbeSize && + SecondSPAdjustAmount + MaxAlignment.value() >= ProbeSize) { + bool IsRV64 = STI.is64Bit(); + BuildMI(MBB, MBBI, DL, TII->get(IsRV64 ? RISCV::SD : RISCV::SW)) + .addReg(RISCV::X0) + .addReg(SPReg) + .addImm(0) + .setMIFlags(MachineInstr::FrameSetup); + } + } // FP will be used to restore the frame in the epilogue, so we need // another base register BP to record SP after re-alignment. SP will // track the current stack after allocating variable sized objects. @@ -2019,8 +2085,9 @@ TargetStackID::Value RISCVFrameLowering::getStackIDForScalableVectors() const { // Synthesize the probe loop. static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - DebugLoc DL) { + MachineBasicBlock::iterator MBBI, DebugLoc DL, + Register TargetReg, bool IsRVV) { + assert(TargetReg != RISCV::X2 && "New top of stack cannot already be in SP"); auto &Subtarget = MF.getSubtarget(); const RISCVInstrInfo *TII = Subtarget.getInstrInfo(); @@ -2036,7 +2103,6 @@ static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock()); MF.insert(MBBInsertPoint, ExitMBB); MachineInstr::MIFlag Flags = MachineInstr::FrameSetup; - Register TargetReg = RISCV::X6; Register ScratchReg = RISCV::X7; // ScratchReg = ProbeSize @@ -2057,12 +2123,29 @@ static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB, .addImm(0) .setMIFlags(Flags); - // BNE SP, TargetReg, LoopTest - BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::BNE)) - .addReg(SPReg) - .addReg(TargetReg) - .addMBB(LoopTestMBB) - .setMIFlags(Flags); + if (IsRVV) { + // SUB TargetReg, TargetReg, ProbeSize + BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::SUB), + TargetReg) + .addReg(TargetReg) + .addReg(ScratchReg) + .setMIFlags(Flags); + + // BGE TargetReg, ProbeSize, LoopTest + BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::BGE)) + .addReg(TargetReg) + .addReg(ScratchReg) + .addMBB(LoopTestMBB) + .setMIFlags(Flags); + + } else { + // BNE SP, TargetReg, LoopTest + BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::BNE)) + .addReg(SPReg) + .addReg(TargetReg) + .addMBB(LoopTestMBB) + .setMIFlags(Flags); + } ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end()); @@ -2075,12 +2158,27 @@ static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB, void RISCVFrameLowering::inlineStackProbe(MachineFunction &MF, MachineBasicBlock &MBB) const { - auto Where = llvm::find_if(MBB, [](MachineInstr &MI) { - return MI.getOpcode() == RISCV::PROBED_STACKALLOC; - 
}); - if (Where != MBB.end()) { - DebugLoc DL = MBB.findDebugLoc(Where); - emitStackProbeInline(MF, MBB, Where, DL); - Where->eraseFromParent(); + // Get the instructions that need to be replaced. We emit at most two of + // these. Remember them in order to avoid complications coming from the need + // to traverse the block while potentially creating more blocks. + SmallVector ToReplace; + for (MachineInstr &MI : MBB) { + unsigned Opc = MI.getOpcode(); + if (Opc == RISCV::PROBED_STACKALLOC || + Opc == RISCV::PROBED_STACKALLOC_RVV) { + ToReplace.push_back(&MI); + } + } + + for (MachineInstr *MI : ToReplace) { + if (MI->getOpcode() == RISCV::PROBED_STACKALLOC || + MI->getOpcode() == RISCV::PROBED_STACKALLOC_RVV) { + MachineBasicBlock::iterator MBBI = MI->getIterator(); + DebugLoc DL = MBB.findDebugLoc(MBBI); + Register TargetReg = MI->getOperand(1).getReg(); + emitStackProbeInline(MF, MBB, MBBI, DL, TargetReg, + (MI->getOpcode() == RISCV::PROBED_STACKALLOC_RVV)); + MBBI->eraseFromParent(); + } } } diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/llvm/lib/Target/RISCV/RISCVFrameLowering.h index 190c063d9d3b5..26d2a26d681c3 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.h +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.h @@ -107,6 +107,11 @@ class RISCVFrameLowering : public TargetFrameLowering { // Replace a StackProbe stub (if any) with the actual probe code inline void inlineStackProbe(MachineFunction &MF, MachineBasicBlock &PrologueMBB) const override; + void allocateAndProbeStackForRVV(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, int64_t Amount, + MachineInstr::MIFlag Flag, + bool EmitCFI) const; }; } // namespace llvm #endif diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 1260f99ad9dcd..ee86f53a5c8a8 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -1424,6 +1424,10 @@ def PROBED_STACKALLOC : Pseudo<(outs GPR:$sp), (ins GPR:$scratch), []>, Sched<[]>; +def PROBED_STACKALLOC_RVV : Pseudo<(outs GPR:$sp), + (ins GPR:$scratch), + []>, + Sched<[]>; } /// HI and ADD_LO address nodes. 
diff --git a/llvm/test/CodeGen/RISCV/rvv/access-fixed-objects-by-rvv.ll b/llvm/test/CodeGen/RISCV/rvv/access-fixed-objects-by-rvv.ll index c6a3649c9ba8f..0052f4b9c041e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/access-fixed-objects-by-rvv.ll +++ b/llvm/test/CodeGen/RISCV/rvv/access-fixed-objects-by-rvv.ll @@ -64,3 +64,49 @@ define @access_fixed_and_vector_objects(ptr %val) { ret %a } + +define @probe_fixed_and_vector_objects(ptr %val, %dummy) "probe-stack"="inline-asm" { +; RV64IV-LABEL: probe_fixed_and_vector_objects: +; RV64IV: # %bb.0: +; RV64IV-NEXT: addi sp, sp, -528 +; RV64IV-NEXT: .cfi_def_cfa_offset 528 +; RV64IV-NEXT: csrr t1, vlenb +; RV64IV-NEXT: .cfi_def_cfa t1, -8 +; RV64IV-NEXT: lui t2, 1 +; RV64IV-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 +; RV64IV-NEXT: sub sp, sp, t2 +; RV64IV-NEXT: sd zero, 0(sp) +; RV64IV-NEXT: sub t1, t1, t2 +; RV64IV-NEXT: bge t1, t2, .LBB2_1 +; RV64IV-NEXT: # %bb.2: +; RV64IV-NEXT: .cfi_def_cfa_register sp +; RV64IV-NEXT: sub sp, sp, t1 +; RV64IV-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x90, 0x04, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 528 + 1 * vlenb +; RV64IV-NEXT: addi a0, sp, 8 +; RV64IV-NEXT: vl1re64.v v9, (a0) +; RV64IV-NEXT: addi a0, sp, 528 +; RV64IV-NEXT: vl1re64.v v10, (a0) +; RV64IV-NEXT: ld a0, 520(sp) +; RV64IV-NEXT: vsetvli zero, a0, e64, m1, tu, ma +; RV64IV-NEXT: vadd.vv v8, v9, v10 +; RV64IV-NEXT: csrr a0, vlenb +; RV64IV-NEXT: add sp, sp, a0 +; RV64IV-NEXT: .cfi_def_cfa sp, 528 +; RV64IV-NEXT: addi sp, sp, 528 +; RV64IV-NEXT: .cfi_def_cfa_offset 0 +; RV64IV-NEXT: ret + %local = alloca i64 + %vector = alloca + %array = alloca [64 x i64] + %v1 = load , ptr %array + %v2 = load , ptr %vector + %len = load i64, ptr %local + + %a = call @llvm.riscv.vadd.nxv1i64.nxv1i64( + %dummy, + %v1, + %v2, + i64 %len) + + ret %a +} diff --git a/llvm/test/CodeGen/RISCV/rvv/stack-probing-rvv.ll b/llvm/test/CodeGen/RISCV/rvv/stack-probing-rvv.ll new file mode 100644 index 0000000000000..d7f9ae73eaea5 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/stack-probing-rvv.ll @@ -0,0 +1,400 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+m,+v -O2 < %s \ +; RUN: | FileCheck %s -check-prefix=RV64IV +; RUN: llc -mtriple=riscv32 -mattr=+m,+v -O2 < %s \ +; RUN: | FileCheck %s -check-prefix=RV32IV + +; Tests adapted from AArch64. + +; Test prolog sequences for stack probing when vector is involved. + +; The space for vector objects needs probing in the general case, because +; the stack adjustment may happen to be too big (i.e. greater than the +; probe size). 
+ +define void @f_vector(ptr %out) #0 { +; RV64IV-LABEL: f_vector: +; RV64IV: # %bb.0: # %entry +; RV64IV-NEXT: csrr t1, vlenb +; RV64IV-NEXT: slli t1, t1, 1 +; RV64IV-NEXT: .cfi_def_cfa t1, -16 +; RV64IV-NEXT: lui t2, 1 +; RV64IV-NEXT: .LBB0_1: # %entry +; RV64IV-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64IV-NEXT: sub sp, sp, t2 +; RV64IV-NEXT: sd zero, 0(sp) +; RV64IV-NEXT: sub t1, t1, t2 +; RV64IV-NEXT: bge t1, t2, .LBB0_1 +; RV64IV-NEXT: # %bb.2: # %entry +; RV64IV-NEXT: .cfi_def_cfa_register sp +; RV64IV-NEXT: sub sp, sp, t1 +; RV64IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 2 * vlenb +; RV64IV-NEXT: csrr a0, vlenb +; RV64IV-NEXT: slli a0, a0, 1 +; RV64IV-NEXT: add sp, sp, a0 +; RV64IV-NEXT: .cfi_def_cfa sp, 0 +; RV64IV-NEXT: ret +; +; RV32IV-LABEL: f_vector: +; RV32IV: # %bb.0: # %entry +; RV32IV-NEXT: csrr t1, vlenb +; RV32IV-NEXT: slli t1, t1, 1 +; RV32IV-NEXT: .cfi_def_cfa t1, -16 +; RV32IV-NEXT: lui t2, 1 +; RV32IV-NEXT: .LBB0_1: # %entry +; RV32IV-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32IV-NEXT: sub sp, sp, t2 +; RV32IV-NEXT: sw zero, 0(sp) +; RV32IV-NEXT: sub t1, t1, t2 +; RV32IV-NEXT: bge t1, t2, .LBB0_1 +; RV32IV-NEXT: # %bb.2: # %entry +; RV32IV-NEXT: .cfi_def_cfa_register sp +; RV32IV-NEXT: sub sp, sp, t1 +; RV32IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 2 * vlenb +; RV32IV-NEXT: csrr a0, vlenb +; RV32IV-NEXT: slli a0, a0, 1 +; RV32IV-NEXT: add sp, sp, a0 +; RV32IV-NEXT: .cfi_def_cfa sp, 0 +; RV32IV-NEXT: ret +entry: + %vec = alloca , align 16 + ret void +} + +; As above, but with 4 vectors of stack space. +define void @f4_vector(ptr %out) #0 { +; RV64IV-LABEL: f4_vector: +; RV64IV: # %bb.0: # %entry +; RV64IV-NEXT: csrr t1, vlenb +; RV64IV-NEXT: slli t1, t1, 3 +; RV64IV-NEXT: .cfi_def_cfa t1, -64 +; RV64IV-NEXT: lui t2, 1 +; RV64IV-NEXT: .LBB1_1: # %entry +; RV64IV-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64IV-NEXT: sub sp, sp, t2 +; RV64IV-NEXT: sd zero, 0(sp) +; RV64IV-NEXT: sub t1, t1, t2 +; RV64IV-NEXT: bge t1, t2, .LBB1_1 +; RV64IV-NEXT: # %bb.2: # %entry +; RV64IV-NEXT: .cfi_def_cfa_register sp +; RV64IV-NEXT: sub sp, sp, t1 +; RV64IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 8 * vlenb +; RV64IV-NEXT: csrr a0, vlenb +; RV64IV-NEXT: slli a0, a0, 3 +; RV64IV-NEXT: add sp, sp, a0 +; RV64IV-NEXT: .cfi_def_cfa sp, 0 +; RV64IV-NEXT: ret +; +; RV32IV-LABEL: f4_vector: +; RV32IV: # %bb.0: # %entry +; RV32IV-NEXT: csrr t1, vlenb +; RV32IV-NEXT: slli t1, t1, 3 +; RV32IV-NEXT: .cfi_def_cfa t1, -64 +; RV32IV-NEXT: lui t2, 1 +; RV32IV-NEXT: .LBB1_1: # %entry +; RV32IV-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32IV-NEXT: sub sp, sp, t2 +; RV32IV-NEXT: sw zero, 0(sp) +; RV32IV-NEXT: sub t1, t1, t2 +; RV32IV-NEXT: bge t1, t2, .LBB1_1 +; RV32IV-NEXT: # %bb.2: # %entry +; RV32IV-NEXT: .cfi_def_cfa_register sp +; RV32IV-NEXT: sub sp, sp, t1 +; RV32IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 8 * vlenb +; RV32IV-NEXT: csrr a0, vlenb +; RV32IV-NEXT: slli a0, a0, 3 +; RV32IV-NEXT: add sp, sp, a0 +; RV32IV-NEXT: .cfi_def_cfa sp, 0 +; RV32IV-NEXT: ret +entry: + %vec1 = alloca , align 16 + %vec2 = alloca , align 16 + %vec3 = alloca , align 16 + %vec4 = alloca , align 16 + ret void +} + +; As above, but with 16 vectors of stack space. +; The stack adjustment is less than or equal to 16 x 256 = 4096, so +; we can allocate the locals at once. 
+define void @f16_vector(ptr %out) #0 { +; RV64IV-LABEL: f16_vector: +; RV64IV: # %bb.0: # %entry +; RV64IV-NEXT: csrr t1, vlenb +; RV64IV-NEXT: slli t1, t1, 5 +; RV64IV-NEXT: .cfi_def_cfa t1, -256 +; RV64IV-NEXT: lui t2, 1 +; RV64IV-NEXT: .LBB2_1: # %entry +; RV64IV-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64IV-NEXT: sub sp, sp, t2 +; RV64IV-NEXT: sd zero, 0(sp) +; RV64IV-NEXT: sub t1, t1, t2 +; RV64IV-NEXT: bge t1, t2, .LBB2_1 +; RV64IV-NEXT: # %bb.2: # %entry +; RV64IV-NEXT: .cfi_def_cfa_register sp +; RV64IV-NEXT: sub sp, sp, t1 +; RV64IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 * vlenb +; RV64IV-NEXT: csrr a0, vlenb +; RV64IV-NEXT: slli a0, a0, 5 +; RV64IV-NEXT: add sp, sp, a0 +; RV64IV-NEXT: .cfi_def_cfa sp, 0 +; RV64IV-NEXT: ret +; +; RV32IV-LABEL: f16_vector: +; RV32IV: # %bb.0: # %entry +; RV32IV-NEXT: csrr t1, vlenb +; RV32IV-NEXT: slli t1, t1, 5 +; RV32IV-NEXT: .cfi_def_cfa t1, -256 +; RV32IV-NEXT: lui t2, 1 +; RV32IV-NEXT: .LBB2_1: # %entry +; RV32IV-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32IV-NEXT: sub sp, sp, t2 +; RV32IV-NEXT: sw zero, 0(sp) +; RV32IV-NEXT: sub t1, t1, t2 +; RV32IV-NEXT: bge t1, t2, .LBB2_1 +; RV32IV-NEXT: # %bb.2: # %entry +; RV32IV-NEXT: .cfi_def_cfa_register sp +; RV32IV-NEXT: sub sp, sp, t1 +; RV32IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 * vlenb +; RV32IV-NEXT: csrr a0, vlenb +; RV32IV-NEXT: slli a0, a0, 5 +; RV32IV-NEXT: add sp, sp, a0 +; RV32IV-NEXT: .cfi_def_cfa sp, 0 +; RV32IV-NEXT: ret +entry: + %vec1 = alloca , align 16 + %vec2 = alloca , align 16 + %vec3 = alloca , align 16 + %vec4 = alloca , align 16 + %vec5 = alloca , align 16 + %vec6 = alloca , align 16 + %vec7 = alloca , align 16 + %vec8 = alloca , align 16 + %vec9 = alloca , align 16 + %vec10 = alloca , align 16 + %vec11 = alloca , align 16 + %vec12 = alloca , align 16 + %vec13 = alloca , align 16 + %vec14 = alloca , align 16 + %vec15 = alloca , align 16 + %vec16 = alloca , align 16 + ret void +} + +; As above, but with 17 vectors of stack space. 
+define void @f17_vector(ptr %out) #0 { +; RV64IV-LABEL: f17_vector: +; RV64IV: # %bb.0: # %entry +; RV64IV-NEXT: csrr t1, vlenb +; RV64IV-NEXT: li a0, 34 +; RV64IV-NEXT: mul t1, t1, a0 +; RV64IV-NEXT: .cfi_def_cfa t1, -272 +; RV64IV-NEXT: lui t2, 1 +; RV64IV-NEXT: .LBB3_1: # %entry +; RV64IV-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64IV-NEXT: sub sp, sp, t2 +; RV64IV-NEXT: sd zero, 0(sp) +; RV64IV-NEXT: sub t1, t1, t2 +; RV64IV-NEXT: bge t1, t2, .LBB3_1 +; RV64IV-NEXT: # %bb.2: # %entry +; RV64IV-NEXT: .cfi_def_cfa_register sp +; RV64IV-NEXT: sub sp, sp, t1 +; RV64IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x22, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 34 * vlenb +; RV64IV-NEXT: csrr a0, vlenb +; RV64IV-NEXT: li a1, 34 +; RV64IV-NEXT: mul a0, a0, a1 +; RV64IV-NEXT: add sp, sp, a0 +; RV64IV-NEXT: .cfi_def_cfa sp, 0 +; RV64IV-NEXT: ret +; +; RV32IV-LABEL: f17_vector: +; RV32IV: # %bb.0: # %entry +; RV32IV-NEXT: csrr t1, vlenb +; RV32IV-NEXT: li a0, 34 +; RV32IV-NEXT: mul t1, t1, a0 +; RV32IV-NEXT: .cfi_def_cfa t1, -272 +; RV32IV-NEXT: lui t2, 1 +; RV32IV-NEXT: .LBB3_1: # %entry +; RV32IV-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32IV-NEXT: sub sp, sp, t2 +; RV32IV-NEXT: sw zero, 0(sp) +; RV32IV-NEXT: sub t1, t1, t2 +; RV32IV-NEXT: bge t1, t2, .LBB3_1 +; RV32IV-NEXT: # %bb.2: # %entry +; RV32IV-NEXT: .cfi_def_cfa_register sp +; RV32IV-NEXT: sub sp, sp, t1 +; RV32IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x22, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 34 * vlenb +; RV32IV-NEXT: csrr a0, vlenb +; RV32IV-NEXT: li a1, 34 +; RV32IV-NEXT: mul a0, a0, a1 +; RV32IV-NEXT: add sp, sp, a0 +; RV32IV-NEXT: .cfi_def_cfa sp, 0 +; RV32IV-NEXT: ret +entry: + %vec1 = alloca , align 16 + %vec2 = alloca , align 16 + %vec3 = alloca , align 16 + %vec4 = alloca , align 16 + %vec5 = alloca , align 16 + %vec6 = alloca , align 16 + %vec7 = alloca , align 16 + %vec8 = alloca , align 16 + %vec9 = alloca , align 16 + %vec10 = alloca , align 16 + %vec11 = alloca , align 16 + %vec12 = alloca , align 16 + %vec13 = alloca , align 16 + %vec14 = alloca , align 16 + %vec15 = alloca , align 16 + %vec16 = alloca , align 16 + %vec17 = alloca , align 16 + ret void +} + +; A vector and a 16-byte fixed size object. 
+define void @f1_vector_16_arr(ptr %out) #0 { +; RV64IV-LABEL: f1_vector_16_arr: +; RV64IV: # %bb.0: # %entry +; RV64IV-NEXT: addi sp, sp, -16 +; RV64IV-NEXT: .cfi_def_cfa_offset 16 +; RV64IV-NEXT: csrr t1, vlenb +; RV64IV-NEXT: slli t1, t1, 1 +; RV64IV-NEXT: .cfi_def_cfa t1, -16 +; RV64IV-NEXT: lui t2, 1 +; RV64IV-NEXT: .LBB4_1: # %entry +; RV64IV-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64IV-NEXT: sub sp, sp, t2 +; RV64IV-NEXT: sd zero, 0(sp) +; RV64IV-NEXT: sub t1, t1, t2 +; RV64IV-NEXT: bge t1, t2, .LBB4_1 +; RV64IV-NEXT: # %bb.2: # %entry +; RV64IV-NEXT: .cfi_def_cfa_register sp +; RV64IV-NEXT: sub sp, sp, t1 +; RV64IV-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; RV64IV-NEXT: csrr a0, vlenb +; RV64IV-NEXT: slli a0, a0, 1 +; RV64IV-NEXT: add sp, sp, a0 +; RV64IV-NEXT: .cfi_def_cfa sp, 16 +; RV64IV-NEXT: addi sp, sp, 16 +; RV64IV-NEXT: .cfi_def_cfa_offset 0 +; RV64IV-NEXT: ret +; +; RV32IV-LABEL: f1_vector_16_arr: +; RV32IV: # %bb.0: # %entry +; RV32IV-NEXT: addi sp, sp, -16 +; RV32IV-NEXT: .cfi_def_cfa_offset 16 +; RV32IV-NEXT: csrr t1, vlenb +; RV32IV-NEXT: slli t1, t1, 1 +; RV32IV-NEXT: .cfi_def_cfa t1, -16 +; RV32IV-NEXT: lui t2, 1 +; RV32IV-NEXT: .LBB4_1: # %entry +; RV32IV-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32IV-NEXT: sub sp, sp, t2 +; RV32IV-NEXT: sw zero, 0(sp) +; RV32IV-NEXT: sub t1, t1, t2 +; RV32IV-NEXT: bge t1, t2, .LBB4_1 +; RV32IV-NEXT: # %bb.2: # %entry +; RV32IV-NEXT: .cfi_def_cfa_register sp +; RV32IV-NEXT: sub sp, sp, t1 +; RV32IV-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; RV32IV-NEXT: csrr a0, vlenb +; RV32IV-NEXT: slli a0, a0, 1 +; RV32IV-NEXT: add sp, sp, a0 +; RV32IV-NEXT: .cfi_def_cfa sp, 16 +; RV32IV-NEXT: addi sp, sp, 16 +; RV32IV-NEXT: .cfi_def_cfa_offset 0 +; RV32IV-NEXT: ret +entry: + %vec = alloca , align 16 + %arr = alloca i8, i64 16, align 1 + ret void +} + +; A large vector object and a large slot, both of which need probing. 
+define void @f1_vector_4096_arr(ptr %out) #0 { +; RV64IV-LABEL: f1_vector_4096_arr: +; RV64IV: # %bb.0: # %entry +; RV64IV-NEXT: lui a0, 1 +; RV64IV-NEXT: sub sp, sp, a0 +; RV64IV-NEXT: sd zero, 0(sp) +; RV64IV-NEXT: .cfi_def_cfa_offset 4096 +; RV64IV-NEXT: lui a0, 1 +; RV64IV-NEXT: sub sp, sp, a0 +; RV64IV-NEXT: sd zero, 0(sp) +; RV64IV-NEXT: .cfi_def_cfa_offset 8192 +; RV64IV-NEXT: lui a0, 1 +; RV64IV-NEXT: sub sp, sp, a0 +; RV64IV-NEXT: sd zero, 0(sp) +; RV64IV-NEXT: .cfi_def_cfa_offset 12288 +; RV64IV-NEXT: addi sp, sp, -16 +; RV64IV-NEXT: .cfi_def_cfa_offset 12304 +; RV64IV-NEXT: csrr t1, vlenb +; RV64IV-NEXT: slli t1, t1, 7 +; RV64IV-NEXT: .cfi_def_cfa t1, -1024 +; RV64IV-NEXT: lui t2, 1 +; RV64IV-NEXT: .LBB5_1: # %entry +; RV64IV-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64IV-NEXT: sub sp, sp, t2 +; RV64IV-NEXT: sd zero, 0(sp) +; RV64IV-NEXT: sub t1, t1, t2 +; RV64IV-NEXT: bge t1, t2, .LBB5_1 +; RV64IV-NEXT: # %bb.2: # %entry +; RV64IV-NEXT: .cfi_def_cfa_register sp +; RV64IV-NEXT: sub sp, sp, t1 +; RV64IV-NEXT: .cfi_escape 0x0f, 0x10, 0x72, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 12304 + 128 * vlenb +; RV64IV-NEXT: csrr a0, vlenb +; RV64IV-NEXT: slli a0, a0, 7 +; RV64IV-NEXT: add sp, sp, a0 +; RV64IV-NEXT: .cfi_def_cfa sp, 12304 +; RV64IV-NEXT: lui a0, 3 +; RV64IV-NEXT: addiw a0, a0, 16 +; RV64IV-NEXT: add sp, sp, a0 +; RV64IV-NEXT: .cfi_def_cfa_offset 0 +; RV64IV-NEXT: ret +; +; RV32IV-LABEL: f1_vector_4096_arr: +; RV32IV: # %bb.0: # %entry +; RV32IV-NEXT: lui a0, 1 +; RV32IV-NEXT: sub sp, sp, a0 +; RV32IV-NEXT: sw zero, 0(sp) +; RV32IV-NEXT: .cfi_def_cfa_offset 4096 +; RV32IV-NEXT: lui a0, 1 +; RV32IV-NEXT: sub sp, sp, a0 +; RV32IV-NEXT: sw zero, 0(sp) +; RV32IV-NEXT: .cfi_def_cfa_offset 8192 +; RV32IV-NEXT: lui a0, 1 +; RV32IV-NEXT: sub sp, sp, a0 +; RV32IV-NEXT: sw zero, 0(sp) +; RV32IV-NEXT: .cfi_def_cfa_offset 12288 +; RV32IV-NEXT: addi sp, sp, -16 +; RV32IV-NEXT: .cfi_def_cfa_offset 12304 +; RV32IV-NEXT: csrr t1, vlenb +; RV32IV-NEXT: slli t1, t1, 7 +; RV32IV-NEXT: .cfi_def_cfa t1, -1024 +; RV32IV-NEXT: lui t2, 1 +; RV32IV-NEXT: .LBB5_1: # %entry +; RV32IV-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32IV-NEXT: sub sp, sp, t2 +; RV32IV-NEXT: sw zero, 0(sp) +; RV32IV-NEXT: sub t1, t1, t2 +; RV32IV-NEXT: bge t1, t2, .LBB5_1 +; RV32IV-NEXT: # %bb.2: # %entry +; RV32IV-NEXT: .cfi_def_cfa_register sp +; RV32IV-NEXT: sub sp, sp, t1 +; RV32IV-NEXT: .cfi_escape 0x0f, 0x10, 0x72, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 12304 + 128 * vlenb +; RV32IV-NEXT: csrr a0, vlenb +; RV32IV-NEXT: slli a0, a0, 7 +; RV32IV-NEXT: add sp, sp, a0 +; RV32IV-NEXT: .cfi_def_cfa sp, 12304 +; RV32IV-NEXT: lui a0, 3 +; RV32IV-NEXT: addi a0, a0, 16 +; RV32IV-NEXT: add sp, sp, a0 +; RV32IV-NEXT: .cfi_def_cfa_offset 0 +; RV32IV-NEXT: ret +entry: + %vec = alloca , align 16 + %arr = alloca i8, i64 12288, align 1 + ret void +} + +attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" } diff --git a/llvm/test/CodeGen/RISCV/stack-clash-prologue.ll b/llvm/test/CodeGen/RISCV/stack-clash-prologue.ll index 18af080e86747..843e57a42d926 100644 --- a/llvm/test/CodeGen/RISCV/stack-clash-prologue.ll +++ b/llvm/test/CodeGen/RISCV/stack-clash-prologue.ll @@ -538,4 +538,72 @@ define i32 @f9(i64 %i) local_unnamed_addr #0 { ret i32 %c } +; alloca < probe_size, align < probe_size, alloca + align > probe_size +define i32 @f10(i64 %i) local_unnamed_addr #0 { +; RV64I-LABEL: f10: +; 
RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -2032 +; RV64I-NEXT: .cfi_def_cfa_offset 2032 +; RV64I-NEXT: sd ra, 2024(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s0, 2016(sp) # 8-byte Folded Spill +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: addi s0, sp, 2032 +; RV64I-NEXT: .cfi_def_cfa s0, 0 +; RV64I-NEXT: addi sp, sp, -2048 +; RV64I-NEXT: addi sp, sp, -1040 +; RV64I-NEXT: andi sp, sp, -1024 +; RV64I-NEXT: sd zero, 0(sp) +; RV64I-NEXT: slli a0, a0, 2 +; RV64I-NEXT: addi a1, sp, 1024 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: li a1, 1 +; RV64I-NEXT: sw a1, 0(a0) +; RV64I-NEXT: lw a0, 1024(sp) +; RV64I-NEXT: addi sp, s0, -2032 +; RV64I-NEXT: .cfi_def_cfa sp, 2032 +; RV64I-NEXT: ld ra, 2024(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s0, 2016(sp) # 8-byte Folded Reload +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: addi sp, sp, 2032 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV32I-LABEL: f10: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -2032 +; RV32I-NEXT: .cfi_def_cfa_offset 2032 +; RV32I-NEXT: sw ra, 2028(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 2024(sp) # 4-byte Folded Spill +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: addi s0, sp, 2032 +; RV32I-NEXT: .cfi_def_cfa s0, 0 +; RV32I-NEXT: addi sp, sp, -2048 +; RV32I-NEXT: addi sp, sp, -1040 +; RV32I-NEXT: andi sp, sp, -1024 +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: slli a0, a0, 2 +; RV32I-NEXT: addi a1, sp, 1024 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: li a1, 1 +; RV32I-NEXT: sw a1, 0(a0) +; RV32I-NEXT: lw a0, 1024(sp) +; RV32I-NEXT: addi sp, s0, -2032 +; RV32I-NEXT: .cfi_def_cfa sp, 2032 +; RV32I-NEXT: lw ra, 2028(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 2024(sp) # 4-byte Folded Reload +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: addi sp, sp, 2032 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret + %a = alloca i32, i32 1000, align 1024 + %b = getelementptr inbounds i32, ptr %a, i64 %i + store volatile i32 1, ptr %b + %c = load volatile i32, ptr %a + ret i32 %c +} + attributes #0 = { "probe-stack"="inline-asm" } From b93ffa8e4a11b89a8da02f409139f2ea862aabf0 Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Fri, 10 Jan 2025 17:50:35 +0000 Subject: [PATCH 092/408] [FMV][AArch64] Changes in fmv-features metadata. (#122192) * We want the default version to have this attribute too otherwise it becomes indistinguishable from non-versioned functions. * We don't need the '+' unlike target-features which can negate. This will allow using the parsing API of target_version/clones for the metadata too. 
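For illustration, the attribute string is now built roughly as in the following standalone C++ sketch (our code, not part of this patch; buildFMVFeatures is an invented name):

    #include <set>
    #include <string>
    #include <vector>

    // Illustrative sketch only: sort and deduplicate the parsed features,
    // then join them with ',' and no '+' prefix (FMV features cannot be
    // negated, so the '+' carries no information).
    std::string buildFMVFeatures(const std::vector<std::string> &Feats) {
      std::set<std::string> Ordered(Feats.begin(), Feats.end());
      std::string Out;
      for (const std::string &F : Ordered)
        Out.append("," + F);
      return Out.empty() ? Out : Out.substr(1);
    }

For example, target_version("crc+bti+bti+bti+aes+aes+bf16") yields "fmv-features"="aes,bf16,bti,crc", while the default version now carries a bare "fmv-features" attribute so it remains distinguishable from unversioned functions.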
--- clang/lib/CodeGen/CodeGenModule.cpp | 23 ++-- clang/test/CodeGen/AArch64/fmv-features.c | 100 +++++++++--------- clang/test/CodeGen/AArch64/fmv-priority.c | 2 +- clang/test/CodeGen/AArch64/fmv-streaming.c | 25 +++-- .../test/CodeGen/attr-target-clones-aarch64.c | 56 +++++----- clang/test/CodeGen/attr-target-version.c | 86 +++++++-------- 6 files changed, 154 insertions(+), 138 deletions(-) diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 7db1ed72fa5cd..dfb51b11e1d85 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -73,6 +73,7 @@ #include "llvm/TargetParser/X86TargetParser.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include +#include using namespace clang; using namespace CodeGen; @@ -2748,17 +2749,26 @@ bool CodeGenModule::GetCPUAndFeaturesAttributes(GlobalDecl GD, Attrs.addAttribute("target-features", llvm::join(Features, ",")); AddedAttr = true; } + // Add metadata for AArch64 Function Multi Versioning. if (getTarget().getTriple().isAArch64()) { llvm::SmallVector Feats; - if (TV) + bool IsDefault = false; + if (TV) { + IsDefault = TV->isDefaultVersion(); TV->getFeatures(Feats); - else if (TC) + } else if (TC) { + IsDefault = TC->isDefaultVersion(GD.getMultiVersionIndex()); TC->getFeatures(Feats, GD.getMultiVersionIndex()); - if (!Feats.empty()) { - llvm::sort(Feats); + } + if (IsDefault) { + Attrs.addAttribute("fmv-features"); + AddedAttr = true; + } else if (!Feats.empty()) { + // Sort features and remove duplicates. + std::set OrderedFeats(Feats.begin(), Feats.end()); std::string FMVFeatures; - for (StringRef F : Feats) - FMVFeatures.append(",+" + F.str()); + for (StringRef F : OrderedFeats) + FMVFeatures.append("," + F.str()); Attrs.addAttribute("fmv-features", FMVFeatures.substr(1)); AddedAttr = true; } @@ -2800,6 +2810,7 @@ void CodeGenModule::setNonAliasAttributes(GlobalDecl GD, llvm::AttributeMask RemoveAttrs; RemoveAttrs.addAttribute("target-cpu"); RemoveAttrs.addAttribute("target-features"); + RemoveAttrs.addAttribute("fmv-features"); RemoveAttrs.addAttribute("tune-cpu"); F->removeFnAttrs(RemoveAttrs); F->addFnAttrs(Attrs); diff --git a/clang/test/CodeGen/AArch64/fmv-features.c b/clang/test/CodeGen/AArch64/fmv-features.c index f78bf4b5d59c2..d191f8187eb6b 100644 --- a/clang/test/CodeGen/AArch64/fmv-features.c +++ b/clang/test/CodeGen/AArch64/fmv-features.c @@ -139,62 +139,64 @@ __attribute__((target_version("sve2-sm4"))) int fmv(void) { return 0; } // CHECK: define dso_local i32 @fmv._Mwfxt() #[[wfxt:[0-9]+]] { __attribute__((target_version("wfxt"))) int fmv(void) { return 0; } -// CHECK: define dso_local i32 @fmv._MaesMbf16MbtiMcrc() #[[multiple_features:[0-9]+]] { -__attribute__((target_version("aes+bf16+bti+crc"))) int fmv(void) { return 0; } +// CHECK: define dso_local i32 @fmv._MaesMbf16MbtiMcrc() #[[unordered_features_with_duplicates:[0-9]+]] { +__attribute__((target_version("crc+bti+bti+bti+aes+aes+bf16"))) int fmv(void) { return 0; } // CHECK-NOT: define dso_local i32 @fmv._M{{.*}} __attribute__((target_version("non_existent_extension"))) int fmv(void); +// CHECK: define dso_local i32 @fmv.default() #[[default:[0-9]+]] { __attribute__((target_version("default"))) int fmv(void); int caller() { return fmv(); } -// CHECK: attributes #[[aes]] = { {{.*}} "fmv-features"="+aes" -// CHECK: attributes #[[bf16]] = { {{.*}} "fmv-features"="+bf16" -// CHECK: attributes #[[bti]] = { {{.*}} "fmv-features"="+bti" -// CHECK: attributes #[[crc]] = { {{.*}} "fmv-features"="+crc" -// 
CHECK: attributes #[[dit]] = { {{.*}} "fmv-features"="+dit" -// CHECK: attributes #[[dotprod]] = { {{.*}} "fmv-features"="+dotprod" -// CHECK: attributes #[[dpb]] = { {{.*}} "fmv-features"="+dpb" -// CHECK: attributes #[[dpb2]] = { {{.*}} "fmv-features"="+dpb2" -// CHECK: attributes #[[f32mm]] = { {{.*}} "fmv-features"="+f32mm" -// CHECK: attributes #[[f64mm]] = { {{.*}} "fmv-features"="+f64mm" -// CHECK: attributes #[[fcma]] = { {{.*}} "fmv-features"="+fcma" -// CHECK: attributes #[[flagm]] = { {{.*}} "fmv-features"="+flagm" -// CHECK: attributes #[[flagm2]] = { {{.*}} "fmv-features"="+flagm2" -// CHECK: attributes #[[fp]] = { {{.*}} "fmv-features"="+fp" -// CHECK: attributes #[[fp16]] = { {{.*}} "fmv-features"="+fp16" -// CHECK: attributes #[[fp16fml]] = { {{.*}} "fmv-features"="+fp16fml" -// CHECK: attributes #[[frintts]] = { {{.*}} "fmv-features"="+frintts" -// CHECK: attributes #[[i8mm]] = { {{.*}} "fmv-features"="+i8mm" -// CHECK: attributes #[[jscvt]] = { {{.*}} "fmv-features"="+jscvt" -// CHECK: attributes #[[ls64]] = { {{.*}} "fmv-features"="+ls64" -// CHECK: attributes #[[lse]] = { {{.*}} "fmv-features"="+lse" -// CHECK: attributes #[[memtag]] = { {{.*}} "fmv-features"="+memtag" -// CHECK: attributes #[[mops]] = { {{.*}} "fmv-features"="+mops" -// CHECK: attributes #[[predres]] = { {{.*}} "fmv-features"="+predres" -// CHECK: attributes #[[rcpc]] = { {{.*}} "fmv-features"="+rcpc" -// CHECK: attributes #[[rcpc2]] = { {{.*}} "fmv-features"="+rcpc2" -// CHECK: attributes #[[rcpc3]] = { {{.*}} "fmv-features"="+rcpc3" -// CHECK: attributes #[[rdm]] = { {{.*}} "fmv-features"="+rdm" -// CHECK: attributes #[[rng]] = { {{.*}} "fmv-features"="+rng" -// CHECK: attributes #[[sb]] = { {{.*}} "fmv-features"="+sb" -// CHECK: attributes #[[sha2]] = { {{.*}} "fmv-features"="+sha2" -// CHECK: attributes #[[sha3]] = { {{.*}} "fmv-features"="+sha3" -// CHECK: attributes #[[simd]] = { {{.*}} "fmv-features"="+simd" -// CHECK: attributes #[[sm4]] = { {{.*}} "fmv-features"="+sm4" -// CHECK: attributes #[[sme]] = { {{.*}} "fmv-features"="+sme" -// CHECK: attributes #[[sme_f64f64]] = { {{.*}} "fmv-features"="+sme-f64f64" -// CHECK: attributes #[[sme_i16i64]] = { {{.*}} "fmv-features"="+sme-i16i64" -// CHECK: attributes #[[sme2]] = { {{.*}} "fmv-features"="+sme2" -// CHECK: attributes #[[ssbs]] = { {{.*}} "fmv-features"="+ssbs" -// CHECK: attributes #[[sve]] = { {{.*}} "fmv-features"="+sve" -// CHECK: attributes #[[sve2]] = { {{.*}} "fmv-features"="+sve2" -// CHECK: attributes #[[sve2_aes]] = { {{.*}} "fmv-features"="+sve2-aes" -// CHECK: attributes #[[sve2_bitperm]] = { {{.*}} "fmv-features"="+sve2-bitperm" -// CHECK: attributes #[[sve2_sha3]] = { {{.*}} "fmv-features"="+sve2-sha3" -// CHECK: attributes #[[sve2_sm4]] = { {{.*}} "fmv-features"="+sve2-sm4" -// CHECK: attributes #[[wfxt]] = { {{.*}} "fmv-features"="+wfxt" -// CHECK: attributes #[[multiple_features]] = { {{.*}} "fmv-features"="+aes,+bf16,+bti,+crc" +// CHECK: attributes #[[aes]] = {{.*}} "fmv-features"="aes" +// CHECK: attributes #[[bf16]] = {{.*}} "fmv-features"="bf16" +// CHECK: attributes #[[bti]] = {{.*}} "fmv-features"="bti" +// CHECK: attributes #[[crc]] = {{.*}} "fmv-features"="crc" +// CHECK: attributes #[[dit]] = {{.*}} "fmv-features"="dit" +// CHECK: attributes #[[dotprod]] = {{.*}} "fmv-features"="dotprod" +// CHECK: attributes #[[dpb]] = {{.*}} "fmv-features"="dpb" +// CHECK: attributes #[[dpb2]] = {{.*}} "fmv-features"="dpb2" +// CHECK: attributes #[[f32mm]] = {{.*}} "fmv-features"="f32mm" +// CHECK: attributes #[[f64mm]] = {{.*}} 
"fmv-features"="f64mm" +// CHECK: attributes #[[fcma]] = {{.*}} "fmv-features"="fcma" +// CHECK: attributes #[[flagm]] = {{.*}} "fmv-features"="flagm" +// CHECK: attributes #[[flagm2]] = {{.*}} "fmv-features"="flagm2" +// CHECK: attributes #[[fp]] = {{.*}} "fmv-features"="fp" +// CHECK: attributes #[[fp16]] = {{.*}} "fmv-features"="fp16" +// CHECK: attributes #[[fp16fml]] = {{.*}} "fmv-features"="fp16fml" +// CHECK: attributes #[[frintts]] = {{.*}} "fmv-features"="frintts" +// CHECK: attributes #[[i8mm]] = {{.*}} "fmv-features"="i8mm" +// CHECK: attributes #[[jscvt]] = {{.*}} "fmv-features"="jscvt" +// CHECK: attributes #[[ls64]] = {{.*}} "fmv-features"="ls64" +// CHECK: attributes #[[lse]] = {{.*}} "fmv-features"="lse" +// CHECK: attributes #[[memtag]] = {{.*}} "fmv-features"="memtag" +// CHECK: attributes #[[mops]] = {{.*}} "fmv-features"="mops" +// CHECK: attributes #[[predres]] = {{.*}} "fmv-features"="predres" +// CHECK: attributes #[[rcpc]] = {{.*}} "fmv-features"="rcpc" +// CHECK: attributes #[[rcpc2]] = {{.*}} "fmv-features"="rcpc2" +// CHECK: attributes #[[rcpc3]] = {{.*}} "fmv-features"="rcpc3" +// CHECK: attributes #[[rdm]] = {{.*}} "fmv-features"="rdm" +// CHECK: attributes #[[rng]] = {{.*}} "fmv-features"="rng" +// CHECK: attributes #[[sb]] = {{.*}} "fmv-features"="sb" +// CHECK: attributes #[[sha2]] = {{.*}} "fmv-features"="sha2" +// CHECK: attributes #[[sha3]] = {{.*}} "fmv-features"="sha3" +// CHECK: attributes #[[simd]] = {{.*}} "fmv-features"="simd" +// CHECK: attributes #[[sm4]] = {{.*}} "fmv-features"="sm4" +// CHECK: attributes #[[sme]] = {{.*}} "fmv-features"="sme" +// CHECK: attributes #[[sme_f64f64]] = {{.*}} "fmv-features"="sme-f64f64" +// CHECK: attributes #[[sme_i16i64]] = {{.*}} "fmv-features"="sme-i16i64" +// CHECK: attributes #[[sme2]] = {{.*}} "fmv-features"="sme2" +// CHECK: attributes #[[ssbs]] = {{.*}} "fmv-features"="ssbs" +// CHECK: attributes #[[sve]] = {{.*}} "fmv-features"="sve" +// CHECK: attributes #[[sve2]] = {{.*}} "fmv-features"="sve2" +// CHECK: attributes #[[sve2_aes]] = {{.*}} "fmv-features"="sve2-aes" +// CHECK: attributes #[[sve2_bitperm]] = {{.*}} "fmv-features"="sve2-bitperm" +// CHECK: attributes #[[sve2_sha3]] = {{.*}} "fmv-features"="sve2-sha3" +// CHECK: attributes #[[sve2_sm4]] = {{.*}} "fmv-features"="sve2-sm4" +// CHECK: attributes #[[wfxt]] = {{.*}} "fmv-features"="wfxt" +// CHECK: attributes #[[unordered_features_with_duplicates]] = {{.*}} "fmv-features"="aes,bf16,bti,crc" +// CHECK: attributes #[[default]] = {{.*}} "fmv-features" diff --git a/clang/test/CodeGen/AArch64/fmv-priority.c b/clang/test/CodeGen/AArch64/fmv-priority.c index 080bb54736a75..ff82aef89a33d 100644 --- a/clang/test/CodeGen/AArch64/fmv-priority.c +++ b/clang/test/CodeGen/AArch64/fmv-priority.c @@ -26,7 +26,7 @@ int call() { return fn(); } // // // CHECK-LABEL: define dso_local i32 @call( -// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-SAME: ) #[[ATTR1:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[CALL:%.*]] = call i32 @fn() // CHECK-NEXT: ret i32 [[CALL]] diff --git a/clang/test/CodeGen/AArch64/fmv-streaming.c b/clang/test/CodeGen/AArch64/fmv-streaming.c index 68ba3e5cfaa78..dc0c35a9a3077 100644 --- a/clang/test/CodeGen/AArch64/fmv-streaming.c +++ b/clang/test/CodeGen/AArch64/fmv-streaming.c @@ -53,10 +53,10 @@ __attribute__((target_version("default"))) void sc_callee(void) __arm_streaming_ // CHECK-LABEL: define {{[^@]+}}@n_caller -// CHECK-SAME: () #[[default]] { +// CHECK-SAME: () #[[caller:[0-9]+]] { // CHECK: call void @n_callee() -// CHECK: call void 
@s_callee() #[[streaming:[0-9]+]] -// CHECK: call void @sc_callee() #[[streaming_compatible:[0-9]+]] +// CHECK: call void @s_callee() #[[callsite_streaming:[0-9]+]] +// CHECK: call void @sc_callee() #[[callsite_streaming_compatible:[0-9]+]] // void n_caller(void) { n_callee(); @@ -66,10 +66,10 @@ void n_caller(void) { // CHECK-LABEL: define {{[^@]+}}@s_caller -// CHECK-SAME: () #[[default_streaming]] { +// CHECK-SAME: () #[[caller_streaming:[0-9]+]] { // CHECK: call void @n_callee() -// CHECK: call void @s_callee() #[[streaming]] -// CHECK: call void @sc_callee() #[[streaming_compatible]] +// CHECK: call void @s_callee() #[[callsite_streaming]] +// CHECK: call void @sc_callee() #[[callsite_streaming_compatible]] // void s_caller(void) __arm_streaming { n_callee(); @@ -79,10 +79,10 @@ void s_caller(void) __arm_streaming { // CHECK-LABEL: define {{[^@]+}}@sc_caller -// CHECK-SAME: () #[[default_streaming_compatible]] { +// CHECK-SAME: () #[[caller_streaming_compatible:[0-9]+]] { // CHECK: call void @n_callee() -// CHECK: call void @s_callee() #[[streaming]] -// CHECK: call void @sc_callee() #[[streaming_compatible]] +// CHECK: call void @s_callee() #[[callsite_streaming]] +// CHECK: call void @sc_callee() #[[callsite_streaming_compatible]] // void sc_caller(void) __arm_streaming_compatible { n_callee(); @@ -103,5 +103,8 @@ void sc_caller(void) __arm_streaming_compatible { // CHECK: attributes #[[simd_streaming_compatible]] = {{.*}} "aarch64_pstate_sm_compatible" // CHECK: attributes #[[locally_streaming_sme2_streaming_compatible]] = {{.*}} "aarch64_pstate_sm_body" "aarch64_pstate_sm_compatible" // CHECK: attributes #[[default_streaming_compatible]] = {{.*}} "aarch64_pstate_sm_compatible" -// CHECK: attributes #[[streaming]] = {{.*}} "aarch64_pstate_sm_enabled" -// CHECK: attributes #[[streaming_compatible]] = {{.*}} "aarch64_pstate_sm_compatible" +// CHECK: attributes #[[caller]] = {{.*}} +// CHECK: attributes #[[caller_streaming]] = {{.*}} "aarch64_pstate_sm_enabled" +// CHECK: attributes #[[caller_streaming_compatible]] = {{.*}} "aarch64_pstate_sm_compatible" +// CHECK: attributes #[[callsite_streaming]] = {{.*}} "aarch64_pstate_sm_enabled" +// CHECK: attributes #[[callsite_streaming_compatible]] = {{.*}} "aarch64_pstate_sm_compatible" diff --git a/clang/test/CodeGen/attr-target-clones-aarch64.c b/clang/test/CodeGen/attr-target-clones-aarch64.c index b7e3a328db877..9e1588cd48336 100644 --- a/clang/test/CodeGen/attr-target-clones-aarch64.c +++ b/clang/test/CodeGen/attr-target-clones-aarch64.c @@ -252,56 +252,56 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@ftc.default -// CHECK-SAME: () #[[ATTR8]] { +// CHECK-SAME: () #[[ATTR9:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 0 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@ftc_def.default -// CHECK-SAME: () #[[ATTR8]] { +// CHECK-SAME: () #[[ATTR9]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@ftc_dup1.default -// CHECK-SAME: () #[[ATTR8]] { +// CHECK-SAME: () #[[ATTR9]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 2 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@ftc_dup2.default -// CHECK-SAME: () #[[ATTR8]] { +// CHECK-SAME: () #[[ATTR9]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 3 // // // CHECK: Function Attrs: noinline 
nounwind optnone // CHECK-LABEL: define {{[^@]+}}@ftc_dup3.default -// CHECK-SAME: () #[[ATTR8]] { +// CHECK-SAME: () #[[ATTR9]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 4 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@ftc_inline2._Mfp16 -// CHECK-SAME: () #[[ATTR9:[0-9]+]] { +// CHECK-SAME: () #[[ATTR10:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 2 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@ftc_inline2._MfcmaMsve2-bitperm -// CHECK-SAME: () #[[ATTR10:[0-9]+]] { +// CHECK-SAME: () #[[ATTR11:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 2 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@ftc_inline2.default -// CHECK-SAME: () #[[ATTR8]] { +// CHECK-SAME: () #[[ATTR9]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 2 // @@ -330,28 +330,28 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@ftc_inline1._MrngMsimd -// CHECK-SAME: () #[[ATTR11:[0-9]+]] { +// CHECK-SAME: () #[[ATTR12:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@ftc_inline1._MpredresMrcpc -// CHECK-SAME: () #[[ATTR12:[0-9]+]] { +// CHECK-SAME: () #[[ATTR13:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@ftc_inline1._Msve2-aesMwfxt -// CHECK-SAME: () #[[ATTR13:[0-9]+]] { +// CHECK-SAME: () #[[ATTR14:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@ftc_inline1.default -// CHECK-SAME: () #[[ATTR8]] { +// CHECK-SAME: () #[[ATTR9]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 // @@ -395,14 +395,14 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@ftc_inline3._MsbMsve -// CHECK-SAME: () #[[ATTR14:[0-9]+]] { +// CHECK-SAME: () #[[ATTR15:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 3 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@ftc_inline3.default -// CHECK-SAME: () #[[ATTR8]] { +// CHECK-SAME: () #[[ATTR9]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 3 // @@ -709,56 +709,56 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc.default -// CHECK-MTE-BTI-SAME: () #[[ATTR8]] { +// CHECK-MTE-BTI-SAME: () #[[ATTR9:[0-9]+]] { // CHECK-MTE-BTI-NEXT: entry: // CHECK-MTE-BTI-NEXT: ret i32 0 // // // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_def.default -// CHECK-MTE-BTI-SAME: () #[[ATTR8]] { +// CHECK-MTE-BTI-SAME: () #[[ATTR9]] { // CHECK-MTE-BTI-NEXT: entry: // CHECK-MTE-BTI-NEXT: ret i32 1 // // // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_dup1.default -// CHECK-MTE-BTI-SAME: () #[[ATTR8]] { +// CHECK-MTE-BTI-SAME: () #[[ATTR9]] { // CHECK-MTE-BTI-NEXT: entry: // CHECK-MTE-BTI-NEXT: ret i32 2 // // // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_dup2.default -// 
CHECK-MTE-BTI-SAME: () #[[ATTR8]] { +// CHECK-MTE-BTI-SAME: () #[[ATTR9]] { // CHECK-MTE-BTI-NEXT: entry: // CHECK-MTE-BTI-NEXT: ret i32 3 // // // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_dup3.default -// CHECK-MTE-BTI-SAME: () #[[ATTR8]] { +// CHECK-MTE-BTI-SAME: () #[[ATTR9]] { // CHECK-MTE-BTI-NEXT: entry: // CHECK-MTE-BTI-NEXT: ret i32 4 // // // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline2._Mfp16 -// CHECK-MTE-BTI-SAME: () #[[ATTR9:[0-9]+]] { +// CHECK-MTE-BTI-SAME: () #[[ATTR10:[0-9]+]] { // CHECK-MTE-BTI-NEXT: entry: // CHECK-MTE-BTI-NEXT: ret i32 2 // // // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline2._MfcmaMsve2-bitperm -// CHECK-MTE-BTI-SAME: () #[[ATTR10:[0-9]+]] { +// CHECK-MTE-BTI-SAME: () #[[ATTR11:[0-9]+]] { // CHECK-MTE-BTI-NEXT: entry: // CHECK-MTE-BTI-NEXT: ret i32 2 // // // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline2.default -// CHECK-MTE-BTI-SAME: () #[[ATTR8]] { +// CHECK-MTE-BTI-SAME: () #[[ATTR9]] { // CHECK-MTE-BTI-NEXT: entry: // CHECK-MTE-BTI-NEXT: ret i32 2 // @@ -787,28 +787,28 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline1._MrngMsimd -// CHECK-MTE-BTI-SAME: () #[[ATTR11:[0-9]+]] { +// CHECK-MTE-BTI-SAME: () #[[ATTR12:[0-9]+]] { // CHECK-MTE-BTI-NEXT: entry: // CHECK-MTE-BTI-NEXT: ret i32 1 // // // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline1._MpredresMrcpc -// CHECK-MTE-BTI-SAME: () #[[ATTR12:[0-9]+]] { +// CHECK-MTE-BTI-SAME: () #[[ATTR13:[0-9]+]] { // CHECK-MTE-BTI-NEXT: entry: // CHECK-MTE-BTI-NEXT: ret i32 1 // // // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline1._Msve2-aesMwfxt -// CHECK-MTE-BTI-SAME: () #[[ATTR13:[0-9]+]] { +// CHECK-MTE-BTI-SAME: () #[[ATTR14:[0-9]+]] { // CHECK-MTE-BTI-NEXT: entry: // CHECK-MTE-BTI-NEXT: ret i32 1 // // // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline1.default -// CHECK-MTE-BTI-SAME: () #[[ATTR8]] { +// CHECK-MTE-BTI-SAME: () #[[ATTR9]] { // CHECK-MTE-BTI-NEXT: entry: // CHECK-MTE-BTI-NEXT: ret i32 1 // @@ -852,14 +852,14 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline3._MsbMsve -// CHECK-MTE-BTI-SAME: () #[[ATTR14:[0-9]+]] { +// CHECK-MTE-BTI-SAME: () #[[ATTR15:[0-9]+]] { // CHECK-MTE-BTI-NEXT: entry: // CHECK-MTE-BTI-NEXT: ret i32 3 // // // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline3.default -// CHECK-MTE-BTI-SAME: () #[[ATTR8]] { +// CHECK-MTE-BTI-SAME: () #[[ATTR9]] { // CHECK-MTE-BTI-NEXT: entry: // CHECK-MTE-BTI-NEXT: ret i32 3 // diff --git a/clang/test/CodeGen/attr-target-version.c b/clang/test/CodeGen/attr-target-version.c index 336d8b0a4dffa..a75514d63bce3 100644 --- a/clang/test/CodeGen/attr-target-version.c +++ b/clang/test/CodeGen/attr-target-version.c @@ -272,7 +272,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // CHECK: Function Attrs: noinline nounwind 
optnone // CHECK-LABEL: define {{[^@]+}}@foo -// CHECK-SAME: () #[[ATTR9]] { +// CHECK-SAME: () #[[ATTR15:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[CALL:%.*]] = call i32 @fmv() // CHECK-NEXT: [[CALL1:%.*]] = call i32 @fmv_one() @@ -298,7 +298,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_c._Mssbs -// CHECK-SAME: () #[[ATTR15:[0-9]+]] { +// CHECK-SAME: () #[[ATTR16:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret void // @@ -312,7 +312,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@goo -// CHECK-SAME: () #[[ATTR9]] { +// CHECK-SAME: () #[[ATTR15]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[CALL:%.*]] = call i32 @fmv_inline() // CHECK-NEXT: [[CALL1:%.*]] = call i32 @fmv_e() @@ -324,7 +324,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@recur -// CHECK-SAME: () #[[ATTR9]] { +// CHECK-SAME: () #[[ATTR15]] { // CHECK-NEXT: entry: // CHECK-NEXT: call void @reca() // CHECK-NEXT: ret void @@ -332,7 +332,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@hoo -// CHECK-SAME: () #[[ATTR9]] { +// CHECK-SAME: () #[[ATTR15]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[FP1:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[FP2:%.*]] = alloca ptr, align 8 @@ -349,28 +349,28 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@unused_with_forward_default_decl._Mmops -// CHECK-SAME: () #[[ATTR17:[0-9]+]] { +// CHECK-SAME: () #[[ATTR19:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 0 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_extern_forward_default_decl._Mdotprod -// CHECK-SAME: () #[[ATTR18:[0-9]+]] { +// CHECK-SAME: () #[[ATTR20:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 0 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@unused_with_default_decl._Maes -// CHECK-SAME: () #[[ATTR19:[0-9]+]] { +// CHECK-SAME: () #[[ATTR21:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 0 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@unused_with_default_def._Msve -// CHECK-SAME: () #[[ATTR20:[0-9]+]] { +// CHECK-SAME: () #[[ATTR22:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 0 // @@ -384,7 +384,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_default_def._Mfp16 -// CHECK-SAME: () #[[ATTR21:[0-9]+]] { +// CHECK-SAME: () #[[ATTR23:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 0 // @@ -398,49 +398,49 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_forward_default_def.default -// CHECK-SAME: () #[[ATTR9]] { +// CHECK-SAME: () #[[ATTR15]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 0 // // // CHECK: Function Attrs: noinline 
nounwind optnone // CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_forward_default_def._Mlse -// CHECK-SAME: () #[[ATTR22:[0-9]+]] { +// CHECK-SAME: () #[[ATTR24:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@unused_without_default._Mrdm -// CHECK-SAME: () #[[ATTR23:[0-9]+]] { +// CHECK-SAME: () #[[ATTR25:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 0 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@default_def_with_version_decls.default -// CHECK-SAME: () #[[ATTR9]] { +// CHECK-SAME: () #[[ATTR15]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 0 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@used_def_without_default_decl._Mjscvt -// CHECK-SAME: () #[[ATTR25:[0-9]+]] { +// CHECK-SAME: () #[[ATTR27:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@used_def_without_default_decl._Mrdm -// CHECK-SAME: () #[[ATTR26:[0-9]+]] { +// CHECK-SAME: () #[[ATTR28:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 2 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@caller -// CHECK-SAME: () #[[ATTR9]] { +// CHECK-SAME: () #[[ATTR15]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[CALL:%.*]] = call i32 @used_def_without_default_decl() // CHECK-NEXT: [[CALL1:%.*]] = call i32 @used_decl_without_default_decl() @@ -605,7 +605,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_d._Msb -// CHECK-SAME: () #[[ATTR28:[0-9]+]] { +// CHECK-SAME: () #[[ATTR30:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 0 // @@ -652,112 +652,112 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._MaesMf64mmMsha2 -// CHECK-SAME: () #[[ATTR29:[0-9]+]] { +// CHECK-SAME: () #[[ATTR31:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._MfcmaMfp16MrdmMsme -// CHECK-SAME: () #[[ATTR30:[0-9]+]] { +// CHECK-SAME: () #[[ATTR32:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 2 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mf32mmMi8mmMsha3 -// CHECK-SAME: () #[[ATTR31:[0-9]+]] { +// CHECK-SAME: () #[[ATTR33:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 12 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mbf16Mdit -// CHECK-SAME: () #[[ATTR32:[0-9]+]] { +// CHECK-SAME: () #[[ATTR34:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 8 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._MdpbMrcpc2 -// CHECK-SAME: () #[[ATTR33:[0-9]+]] { +// CHECK-SAME: () #[[ATTR35:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 6 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mdpb2Mjscvt -// CHECK-SAME: () #[[ATTR34:[0-9]+]] { +// CHECK-SAME: () #[[ATTR36:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 7 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define 
{{[^@]+}}@fmv_inline._MfrinttsMrcpc -// CHECK-SAME: () #[[ATTR35:[0-9]+]] { +// CHECK-SAME: () #[[ATTR37:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 3 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mbf16Msve -// CHECK-SAME: () #[[ATTR36:[0-9]+]] { +// CHECK-SAME: () #[[ATTR38:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 4 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._Msve2-aesMsve2-sha3 -// CHECK-SAME: () #[[ATTR37:[0-9]+]] { +// CHECK-SAME: () #[[ATTR39:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 5 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._Msve2Msve2-aesMsve2-bitperm -// CHECK-SAME: () #[[ATTR38:[0-9]+]] { +// CHECK-SAME: () #[[ATTR40:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 9 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._MmemtagMsve2-sm4 -// CHECK-SAME: () #[[ATTR39:[0-9]+]] { +// CHECK-SAME: () #[[ATTR41:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 10 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._MmemtagMmopsMrcpc3 -// CHECK-SAME: () #[[ATTR40:[0-9]+]] { +// CHECK-SAME: () #[[ATTR42:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 11 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._MaesMdotprod -// CHECK-SAME: () #[[ATTR41:[0-9]+]] { +// CHECK-SAME: () #[[ATTR43:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 13 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mfp16fmlMsimd -// CHECK-SAME: () #[[ATTR42:[0-9]+]] { +// CHECK-SAME: () #[[ATTR44:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 14 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._MfpMsm4 -// CHECK-SAME: () #[[ATTR43:[0-9]+]] { +// CHECK-SAME: () #[[ATTR45:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 15 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._MlseMrdm -// CHECK-SAME: () #[[ATTR44:[0-9]+]] { +// CHECK-SAME: () #[[ATTR46:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 16 // @@ -990,7 +990,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@func -// CHECK-SAME: () #[[ATTR9]] { +// CHECK-SAME: () #[[ATTR15]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret void // @@ -1014,21 +1014,21 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // CHECK-NOFMV: Function Attrs: noinline nounwind optnone // CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv -// CHECK-NOFMV-SAME: () #[[ATTR0]] { +// CHECK-NOFMV-SAME: () #[[ATTR1:[0-9]+]] { // CHECK-NOFMV-NEXT: entry: // CHECK-NOFMV-NEXT: ret i32 0 // // // CHECK-NOFMV: Function Attrs: noinline nounwind optnone // CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv_one -// CHECK-NOFMV-SAME: () #[[ATTR0]] { +// CHECK-NOFMV-SAME: () #[[ATTR1]] { // CHECK-NOFMV-NEXT: entry: // CHECK-NOFMV-NEXT: ret i32 0 // // // CHECK-NOFMV: Function Attrs: noinline nounwind optnone // CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv_two -// CHECK-NOFMV-SAME: () #[[ATTR0]] { +// CHECK-NOFMV-SAME: () #[[ATTR1]] { // CHECK-NOFMV-NEXT: entry: // CHECK-NOFMV-NEXT: ret i32 0 // @@ -1054,21 +1054,21 
@@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // CHECK-NOFMV: Function Attrs: noinline nounwind optnone // CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv_d -// CHECK-NOFMV-SAME: () #[[ATTR0]] { +// CHECK-NOFMV-SAME: () #[[ATTR1]] { // CHECK-NOFMV-NEXT: entry: // CHECK-NOFMV-NEXT: ret i32 1 // // // CHECK-NOFMV: Function Attrs: noinline nounwind optnone // CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv_c -// CHECK-NOFMV-SAME: () #[[ATTR0]] { +// CHECK-NOFMV-SAME: () #[[ATTR1]] { // CHECK-NOFMV-NEXT: entry: // CHECK-NOFMV-NEXT: ret void // // // CHECK-NOFMV: Function Attrs: noinline nounwind optnone // CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv_default -// CHECK-NOFMV-SAME: () #[[ATTR0]] { +// CHECK-NOFMV-SAME: () #[[ATTR1]] { // CHECK-NOFMV-NEXT: entry: // CHECK-NOFMV-NEXT: ret i32 111 // @@ -1121,7 +1121,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // CHECK-NOFMV: Function Attrs: noinline nounwind optnone // CHECK-NOFMV-LABEL: define {{[^@]+}}@main -// CHECK-NOFMV-SAME: () #[[ATTR0]] { +// CHECK-NOFMV-SAME: () #[[ATTR1]] { // CHECK-NOFMV-NEXT: entry: // CHECK-NOFMV-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK-NOFMV-NEXT: store i32 0, ptr [[RETVAL]], align 4 @@ -1132,7 +1132,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // CHECK-NOFMV: Function Attrs: noinline nounwind optnone // CHECK-NOFMV-LABEL: define {{[^@]+}}@unused_with_default_def -// CHECK-NOFMV-SAME: () #[[ATTR0]] { +// CHECK-NOFMV-SAME: () #[[ATTR1]] { // CHECK-NOFMV-NEXT: entry: // CHECK-NOFMV-NEXT: ret i32 1 // From 0a079c711de6805fc4b64e5f7723964c7f9ea05d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20G=C3=B3rny?= Date: Fri, 10 Jan 2025 19:04:57 +0100 Subject: [PATCH 093/408] Revert "[flang] Fix finding system install of LLVM/Clang/MLIR in standalone builds (#120914)" This reverts commit 8e12037d38e2a9a1cfc6402be2b33283e3220bcc. It broke the flang-aarch64-out-of-tree buildbot. --- flang/CMakeLists.txt | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt index 34b3fedc88494..68947eaa9c9bd 100644 --- a/flang/CMakeLists.txt +++ b/flang/CMakeLists.txt @@ -89,16 +89,13 @@ if (FLANG_STANDALONE_BUILD) mark_as_advanced(LLVM_ENABLE_ASSERTIONS) endif() - # We need a pre-built/installed version of LLVM. - find_package(LLVM REQUIRED HINTS "${LLVM_DIR}") # If the user specifies a relative path to LLVM_DIR, the calls to include # LLVM modules fail. Append the absolute path to LLVM_DIR instead. get_filename_component(LLVM_DIR_ABSOLUTE ${LLVM_DIR} REALPATH BASE_DIR ${CMAKE_CURRENT_BINARY_DIR}) list(APPEND CMAKE_MODULE_PATH ${LLVM_DIR_ABSOLUTE}) - - # TODO: Remove when libclangDriver is lifted out of Clang - find_package(Clang REQUIRED PATHS "${CLANG_DIR}") + # We need a pre-built/installed version of LLVM. + find_package(LLVM REQUIRED HINTS "${LLVM_DIR_ABSOLUTE}") # Users might specify a path to CLANG_DIR that's: # * a full path, or @@ -107,11 +104,17 @@ if (FLANG_STANDALONE_BUILD) # cases. get_filename_component( CLANG_DIR_ABSOLUTE - ${Clang_DIR} + ${CLANG_DIR} REALPATH BASE_DIR ${CMAKE_CURRENT_BINARY_DIR}) list(APPEND CMAKE_MODULE_PATH ${CLANG_DIR_ABSOLUTE}) + # TODO: Remove when libclangDriver is lifted out of Clang + find_package(Clang REQUIRED PATHS "${CLANG_DIR_ABSOLUTE}" NO_DEFAULT_PATH) + if (NOT Clang_FOUND) + message(FATAL_ERROR "Failed to find Clang") + endif() + # If LLVM links to zlib we need the imported targets so we can too. 
   if(LLVM_ENABLE_ZLIB)
     find_package(ZLIB REQUIRED)
@@ -129,12 +132,12 @@ if (FLANG_STANDALONE_BUILD)
   include(AddClang)
   include(TableGen)
-  find_package(MLIR REQUIRED CONFIG HINTS ${MLIR_DIR})
   # If the user specifies a relative path to MLIR_DIR, the calls to include
   # MLIR modules fail. Append the absolute path to MLIR_DIR instead.
   get_filename_component(MLIR_DIR_ABSOLUTE ${MLIR_DIR} REALPATH
     BASE_DIR ${CMAKE_CURRENT_BINARY_DIR})
   list(APPEND CMAKE_MODULE_PATH ${MLIR_DIR_ABSOLUTE})
+  find_package(MLIR REQUIRED CONFIG HINTS ${MLIR_DIR_ABSOLUTE})
   # Use SYSTEM for the same reasons as for LLVM includes
   include_directories(SYSTEM ${MLIR_INCLUDE_DIRS})
   include(AddMLIR)

From 35e76b6a4fc74e64bd6c91e5b9b9eb6a03aa802e Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen
Date: Fri, 10 Jan 2025 10:09:54 -0800
Subject: [PATCH 094/408] Revert "[SLP] NFC. Replace MainOp and AltOp in
 TreeEntry with InstructionsState. (#120198)"

This reverts commit f3d6cdc5aebafac3961d4fccbd2ca0e302c6082c.
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 210 +++++++++---------
 1 file changed, 103 insertions(+), 107 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a3f47a273d2ec..8a6fbd808de35 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2414,16 +2414,15 @@ class BoUpSLP {
   }

   /// Go through the instructions in VL and append their operands.
-  void appendOperandsOfVL(ArrayRef<Value *> VL, const InstructionsState &S) {
+  void appendOperandsOfVL(ArrayRef<Value *> VL, Instruction *VL0) {
     assert(!VL.empty() && "Bad VL");
     assert((empty() || VL.size() == getNumLanes()) &&
            "Expected same number of lanes");
     // IntrinsicInst::isCommutative returns true if swapping the first "two"
     // arguments to the intrinsic produces the same result.
     constexpr unsigned IntrinsicNumOperands = 2;
-    unsigned NumOperands = S.getMainOp()->getNumOperands();
-    ArgSize = isa<IntrinsicInst>(S.getMainOp()) ? IntrinsicNumOperands
-                                                : NumOperands;
+    unsigned NumOperands = VL0->getNumOperands();
+    ArgSize = isa<IntrinsicInst>(VL0) ? IntrinsicNumOperands : NumOperands;
     OpsVec.resize(NumOperands);
     unsigned NumLanes = VL.size();
     for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
@@ -2443,8 +2442,8 @@ class BoUpSLP {
       // tell the inverse operations by checking commutativity.
       if (isa<PoisonValue>(VL[Lane])) {
         OpsVec[OpIdx][Lane] = {
-            PoisonValue::get(S.getMainOp()->getOperand(OpIdx)->getType()),
-            true, false};
+            PoisonValue::get(VL0->getOperand(OpIdx)->getType()), true,
+            false};
         continue;
       }
       bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
@@ -2556,12 +2555,11 @@ class BoUpSLP {

   public:
     /// Initialize with all the operands of the instruction vector \p RootVL.
-    VLOperands(ArrayRef<Value *> RootVL, const InstructionsState &S,
-               const BoUpSLP &R)
+    VLOperands(ArrayRef<Value *> RootVL, Instruction *VL0, const BoUpSLP &R)
         : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
-          L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
-      // Append all the operands of RootVL.
-      appendOperandsOfVL(RootVL, S);
+          L(R.LI->getLoopFor((VL0->getParent()))) {
+      // Append all the operands of RootVL.
+      appendOperandsOfVL(RootVL, VL0);
     }

     /// \Returns a value vector with the operands across all lanes for the
@@ -3034,7 +3032,7 @@ class BoUpSLP {
   /// non-identity permutation that allows to reuse extract instructions.
   /// \param ResizeAllowed indicates whether it is allowed to handle subvector
   /// extract order.
-  bool canReuseExtract(ArrayRef<Value *> VL,
+  bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
                        SmallVectorImpl<unsigned> &CurrentOrder,
                        bool ResizeAllowed = false) const;

@@ -3261,7 +3259,7 @@ class BoUpSLP {
    };

    /// Checks if the current node is a gather node.
-    bool isGather() const { return State == NeedToGather; }
+    bool isGather() const {return State == NeedToGather; }

    /// A vector of scalars.
    ValueList Scalars;

@@ -3325,9 +3323,9 @@ class BoUpSLP {
    /// reordering of operands during buildTree_rec() and vectorizeTree().
    SmallVector<ValueList, 2> Operands;

-    /// MainOp and AltOp are recorded inside. S should be obtained from
-    /// newTreeEntry.
-    InstructionsState S = InstructionsState::invalid();
+    /// The main/alternate instruction.
+    Instruction *MainOp = nullptr;
+    Instruction *AltOp = nullptr;

    /// Interleaving factor for interleaved loads Vectorize nodes.
    unsigned InterleaveFactor = 0;
@@ -3351,10 +3349,10 @@ class BoUpSLP {

    /// Set this bundle's operand from Scalars.
    void setOperand(const BoUpSLP &R, bool RequireReorder = false) {
-      VLOperands Ops(Scalars, S, R);
+      VLOperands Ops(Scalars, MainOp, R);
      if (RequireReorder)
        Ops.reorder();
-      for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands()))
+      for (unsigned I : seq<unsigned>(MainOp->getNumOperands()))
        setOperand(I, Ops.getVL(I));
    }

@@ -3387,9 +3385,13 @@ class BoUpSLP {
    }

    /// Some of the instructions in the list have alternate opcodes.
-    bool isAltShuffle() const { return S.isAltShuffle(); }
+    bool isAltShuffle() const { return MainOp != AltOp; }

-    bool isOpcodeOrAlt(Instruction *I) const { return S.isOpcodeOrAlt(I); }
+    bool isOpcodeOrAlt(Instruction *I) const {
+      unsigned CheckedOpcode = I->getOpcode();
+      return (getOpcode() == CheckedOpcode ||
+              getAltOpcode() == CheckedOpcode);
+    }

    /// Chooses the correct key for scheduling data. If \p Op has the same (or
    /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
@@ -3398,24 +3400,31 @@ class BoUpSLP {
      auto *I = dyn_cast<Instruction>(Op);
      if (I && isOpcodeOrAlt(I))
        return Op;
-      return S.getMainOp();
+      return MainOp;
    }

    void setOperations(const InstructionsState &S) {
      assert(S && "InstructionsState is invalid.");
-      this->S = S;
+      MainOp = S.getMainOp();
+      AltOp = S.getAltOp();
    }

-    Instruction *getMainOp() const { return S.getMainOp(); }
+    Instruction *getMainOp() const {
+      return MainOp;
+    }

-    Instruction *getAltOp() const { return S.getAltOp(); }
+    Instruction *getAltOp() const {
+      return AltOp;
+    }

    /// The main/alternate opcodes for the list of instructions.
-    unsigned getOpcode() const { return S.getOpcode(); }
-
-    unsigned getAltOpcode() const { return S.getAltOpcode(); }
+    unsigned getOpcode() const {
+      return MainOp ? MainOp->getOpcode() : 0;
+    }

-    bool hasState() const { return S.valid(); }
+    unsigned getAltOpcode() const {
+      return AltOp ? AltOp->getOpcode() : 0;
+    }

    /// When ReuseReorderShuffleIndices is empty it just returns position of \p
    /// V within vector of Scalars. Otherwise, try to remap on its reuse index.
@@ -3511,13 +3520,16 @@ class BoUpSLP {
        dbgs() << "CombinedVectorize\n";
        break;
      }
-      if (S) {
-        dbgs() << "MainOp: " << *S.getMainOp() << "\n";
-        dbgs() << "AltOp: " << *S.getAltOp() << "\n";
-      } else {
-        dbgs() << "MainOp: NULL\n";
-        dbgs() << "AltOp: NULL\n";
-      }
+      dbgs() << "MainOp: ";
+      if (MainOp)
+        dbgs() << *MainOp << "\n";
+      else
+        dbgs() << "NULL\n";
+      dbgs() << "AltOp: ";
+      if (AltOp)
+        dbgs() << *AltOp << "\n";
+      else
+        dbgs() << "NULL\n";
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
@@ -3692,13 +3704,9 @@ class BoUpSLP {
  }
#endif

-  TreeEntry *getTreeEntry(Value *V) {
-    assert(V && "V cannot be nullptr.");
-    return ScalarToTreeEntry.lookup(V);
-  }
+  TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }

  const TreeEntry *getTreeEntry(Value *V) const {
-    assert(V && "V cannot be nullptr.");
    return ScalarToTreeEntry.lookup(V);
  }

@@ -5579,7 +5587,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
    // Try build correct order for extractelement instructions.
    SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
                                TE.ReuseShuffleIndices.end());
-    if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
+    if (TE.getOpcode() == Instruction::ExtractElement &&
        all_of(TE.Scalars, [Sz](Value *V) {
          if (isa<PoisonValue>(V))
            return true;
@@ -5741,11 +5749,10 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
      return std::nullopt; // No need to reorder.
    return std::move(Phis);
  }
-  if (TE.isGather() && (!TE.hasState() || !TE.isAltShuffle()) &&
-      allSameType(TE.Scalars)) {
+  if (TE.isGather() && !TE.isAltShuffle() && allSameType(TE.Scalars)) {
    // TODO: add analysis of other gather nodes with extractelement
    // instructions and other values/instructions, not only undefs.
-    if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
+    if ((TE.getOpcode() == Instruction::ExtractElement ||
         (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
          any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
        all_of(TE.Scalars, [](Value *V) {
@@ -5755,8 +5762,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
      // Check that gather of extractelements can be represented as
      // just a shuffle of a single vector.
      OrdersType CurrentOrder;
-      bool Reuse =
-          canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
+      bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
+                                   /*ResizeAllowed=*/true);
      if (Reuse || !CurrentOrder.empty())
        return std::move(CurrentOrder);
    }
@@ -5805,7 +5812,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
    return Order;
  // Check if can include the order of vectorized loads. For masked gathers do
  // extra analysis later, so include such nodes into a special list.
-  if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
+  if (TE.isGather() && TE.getOpcode() == Instruction::Load) {
    SmallVector<Value *> PointerOps;
    OrdersType CurrentOrder;
    LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
@@ -5920,7 +5927,7 @@ void BoUpSLP::reorderTopToBottom() {
      // Patterns like [fadd,fsub] can be combined into a single instruction in
      // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
      // to take into account their order when looking for the most used order.
-      if (TE->hasState() && TE->isAltShuffle()) {
+      if (TE->isAltShuffle()) {
        VectorType *VecTy =
            getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size());
        unsigned Opcode0 = TE->getOpcode();
@@ -5999,7 +6006,7 @@ void BoUpSLP::reorderTopToBottom() {
        if (It != GathersToOrders.end())
          return It->second;
      }
-      if (OpTE->hasState() && OpTE->isAltShuffle()) {
+      if (OpTE->isAltShuffle()) {
        auto It = AltShufflesToOrders.find(OpTE);
        if (It != AltShufflesToOrders.end())
          return It->second;
@@ -7602,7 +7609,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
-    bool Reuse = canReuseExtract(VL, CurrentOrder);
+    bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
    // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
    if (!has_single_bit(VL.size()))
      return TreeEntry::NeedToGather;
@@ -8615,7 +8622,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                 TE->dump());

      ValueList Left, Right;
-      VLOperands Ops(VL, S, *this);
+      VLOperands Ops(VL, VL0, *this);
      if (cast<CmpInst>(VL0)->isCommutative()) {
        // Commutative predicate - collect + sort operands of the instructions
        // so that each side is more likely to have the same opcode.
@@ -8883,7 +8890,7 @@ unsigned BoUpSLP::canMapToVector(Type *T) const {
  return N;
}

-bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
+bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
                              SmallVectorImpl<unsigned> &CurrentOrder,
                              bool ResizeAllowed) const {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
@@ -9537,7 +9544,7 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) {

  // Do not reorder nodes if it small (just 2 elements), all-constant or all
  // instructions have same opcode already.
-  if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
+  if (TE.Scalars.size() == 2 || (TE.getOpcode() && !TE.isAltShuffle()) ||
      all_of(TE.Scalars, isConstant))
    return;

@@ -9756,7 +9763,7 @@ void BoUpSLP::transformNodes() {
    // Do not try partial vectorization for small nodes (<= 2), nodes with the
    // same opcode and same parent block or all constants.
    if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
-        !(!E.hasState() || E.getOpcode() == Instruction::Load ||
+        !(!E.getOpcode() || E.getOpcode() == Instruction::Load ||
          E.isAltShuffle() || !allSameBlock(VL)) ||
        allConstant(VL) || isSplat(VL))
      continue;
@@ -9899,8 +9906,6 @@ void BoUpSLP::transformNodes() {
          E.ReorderIndices.clear();
        }
      }
-      if (!E.hasState())
-        continue;
      switch (E.getOpcode()) {
      case Instruction::Load: {
        // No need to reorder masked gather loads, just reorder the scalar
@@ -10020,7 +10025,7 @@ void BoUpSLP::transformNodes() {
      getCanonicalGraphSize() <= SmallTree &&
      count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
               [](const std::unique_ptr<TreeEntry> &TE) {
-                 return TE->isGather() && TE->hasState() &&
+                 return TE->isGather() &&
                        TE->getOpcode() == Instruction::Load &&
                        !allSameBlock(TE->Scalars);
               }) == 1)
@@ -10036,13 +10041,13 @@ void BoUpSLP::transformNodes() {
  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    TreeEntry &E = *TE;
    if (E.isGather() &&
-        ((E.hasState() && E.getOpcode() == Instruction::Load) ||
-         (!E.hasState() && any_of(E.Scalars,
-                                  [&](Value *V) {
-                                    return isa<LoadInst>(V) &&
-                                           !isVectorized(V) &&
-                                           !isDeleted(cast<Instruction>(V));
-                                  }))) &&
+        (E.getOpcode() == Instruction::Load ||
+         (!E.getOpcode() && any_of(E.Scalars,
+                                   [&](Value *V) {
+                                     return isa<LoadInst>(V) &&
+                                            !isVectorized(V) &&
+                                            !isDeleted(cast<Instruction>(V));
+                                   }))) &&
        !isSplat(E.Scalars)) {
      for (Value *V : E.Scalars) {
        auto *LI = dyn_cast<LoadInst>(V);
@@ -10636,7 +10641,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
    bool PrevNodeFound = any_of(
        ArrayRef(R.VectorizableTree).take_front(E->Idx),
        [&](const std::unique_ptr<TreeEntry> &TE) {
-          return ((TE->hasState() && !TE->isAltShuffle() &&
+          return ((!TE->isAltShuffle() &&
                   TE->getOpcode() == Instruction::ExtractElement) ||
                  TE->isGather()) &&
                 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
@@ -11762,7 +11767,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
    for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
      if (TE.get() == E)
        break;
-      if (TE->hasState() && TE->isAltShuffle() &&
+      if (TE->isAltShuffle() &&
          ((TE->getOpcode() == E->getOpcode() &&
            TE->getAltOpcode() == E->getAltOpcode()) ||
           (TE->getOpcode() == E->getAltOpcode() &&
@@ -11924,12 +11929,10 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
                [this](Value *V) { return EphValues.contains(V); }) &&
           (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
            TE->Scalars.size() < Limit ||
-            (((TE->hasState() &&
-               TE->getOpcode() == Instruction::ExtractElement) ||
+            ((TE->getOpcode() == Instruction::ExtractElement ||
              all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
             isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
-            ((TE->hasState() && TE->getOpcode() == Instruction::Load) &&
-             (!TE->hasState() || !TE->isAltShuffle())) ||
+            (TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()) ||
            any_of(TE->Scalars, IsaPred<InsertElementInst>));
  };
@@ -12058,10 +12061,9 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
      !VectorizableTree.empty() &&
      all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return (TE->isGather() &&
-                (!TE->hasState() ||
-                 TE->getOpcode() != Instruction::ExtractElement) &&
+                TE->getOpcode() != Instruction::ExtractElement &&
                count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
-               (TE->hasState() && TE->getOpcode() == Instruction::PHI);
+               TE->getOpcode() == Instruction::PHI;
      }))
    return true;

@@ -12079,7 +12081,7 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
  // somewhere.
  bool IsAllowedSingleBVNode =
      VectorizableTree.size() > 1 ||
-      (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
+      (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
       !VectorizableTree.front()->isAltShuffle() &&
       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
       VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
@@ -12095,7 +12097,6 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
    return false;

  if (VectorizableTree.back()->isGather() &&
-      VectorizableTree.back()->hasState() &&
      VectorizableTree.back()->isAltShuffle() &&
      VectorizableTree.back()->getVectorFactor() > 2 &&
      allSameBlock(VectorizableTree.back()->Scalars) &&
@@ -12120,7 +12121,7 @@ bool BoUpSLP::isTreeNotExtendable() const {
      getCanonicalGraphSize() <= SmallTree &&
      count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
               [](const std::unique_ptr<TreeEntry> &TE) {
-                 return TE->isGather() && TE->hasState() &&
+                 return TE->isGather() &&
                        TE->getOpcode() == Instruction::Load &&
                        !allSameBlock(TE->Scalars);
               }) == 1)
@@ -12132,7 +12133,7 @@ bool BoUpSLP::isTreeNotExtendable() const {
    TreeEntry &E = *VectorizableTree[Idx];
    if (!E.isGather())
      continue;
-    if (E.hasState() && E.getOpcode() != Instruction::Load)
+    if (E.getOpcode() && E.getOpcode() != Instruction::Load)
      return false;
    if (isSplat(E.Scalars) || allConstant(E.Scalars))
      continue;
@@ -12442,7 +12443,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
                 TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
      continue;
    }
-    if (TE.isGather() && TE.hasState()) {
+    if (TE.isGather()) {
      if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
          E && E->getVectorFactor() == TE.getVectorFactor() &&
          E->isSame(TE.Scalars)) {
@@ -14886,15 +14887,14 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
    }
  }
  // Gather extracts after we check for full matched gathers only.
-  if (!ExtractShuffles.empty() || !E->hasState() ||
-      E->getOpcode() != Instruction::Load ||
-      (((E->hasState() && E->getOpcode() == Instruction::Load) ||
+  if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
+      ((E->getOpcode() == Instruction::Load ||
        any_of(E->Scalars, IsaPred<LoadInst>)) &&
       any_of(E->Scalars,
              [this](Value *V) {
                return isa<LoadInst>(V) && getTreeEntry(V);
              })) ||
-      (E->hasState() && E->isAltShuffle()) ||
+      E->isAltShuffle() ||
      all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
      isSplat(E->Scalars) ||
      (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
@@ -15274,7 +15274,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
  auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
  if (E->isGather()) {
    // Set insert point for non-reduction initial nodes.
-    if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
+    if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
      setInsertPointAfterBundle(E);
    Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs);
    E->VectorizedValue = Vec;
@@ -18163,9 +18163,10 @@ void BoUpSLP::computeMinimumValueSizes() {
    return;

  SmallVector<unsigned> ToDemote;
-  auto ComputeMaxBitWidth =
-      [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
-          unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
+  auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot,
+                                bool IsProfitableToDemoteRoot, unsigned Opcode,
+                                unsigned Limit, bool IsTruncRoot,
+                                bool IsSignedCmp) -> unsigned {
    ToDemote.clear();
    // Check if the root is trunc and the next node is gather/buildvector, then
    // keep trunc in scalars, which is free in most cases.
@@ -18206,14 +18207,11 @@ void BoUpSLP::computeMinimumValueSizes() {
      return MaxBitWidth;
    }

-    if (!E.hasState())
-      return 0u;
-
    unsigned VF = E.getVectorFactor();
    Type *ScalarTy = E.Scalars.front()->getType();
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
-    if (!TreeRootIT)
+    if (!TreeRootIT || !Opcode)
      return 0u;

    if (any_of(E.Scalars,
@@ -18285,7 +18283,6 @@ void BoUpSLP::computeMinimumValueSizes() {
            IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
      return 0u;

-    unsigned Opcode = E.getOpcode();
    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                Opcode == Instruction::SExt ||
                                Opcode == Instruction::ZExt || NumParts > 1;
@@ -18366,14 +18363,15 @@ void BoUpSLP::computeMinimumValueSizes() {
  while (NodeIdx < VectorizableTree.size()) {
    ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
    unsigned Limit = 2;
+    unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
    if (IsTopRoot &&
        ReductionBitWidth ==
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      Limit = 3;
    unsigned MaxBitWidth = ComputeMaxBitWidth(
-        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
-        IsTruncRoot, IsSignedCmp);
+        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Opcode,
+        Limit, IsTruncRoot, IsSignedCmp);
    if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
      if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
        ReductionBitWidth = bit_ceil(MaxBitWidth);
@@ -18416,21 +18414,19 @@ void BoUpSLP::computeMinimumValueSizes() {
        });
    IsSignedCmp =
        NodeIdx < VectorizableTree.size() &&
-        any_of(
-            VectorizableTree[NodeIdx]->UserTreeIndices,
-            [&](const EdgeInfo &EI) {
-              return (EI.UserTE->hasState() &&
-                      EI.UserTE->getOpcode() == Instruction::ICmp) &&
-                     any_of(EI.UserTE->Scalars, [&](Value *V) {
-                       auto *IC = dyn_cast<ICmpInst>(V);
-                       return IC &&
-                              (IC->isSigned() ||
-                               !isKnownNonNegative(IC->getOperand(0),
-                                                   SimplifyQuery(*DL)) ||
-                               !isKnownNonNegative(IC->getOperand(1),
-                                                   SimplifyQuery(*DL)));
-                     });
-            });
+        any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
+               [&](const EdgeInfo &EI) {
+                 return EI.UserTE->getOpcode() == Instruction::ICmp &&
+                        any_of(EI.UserTE->Scalars, [&](Value *V) {
+                          auto *IC = dyn_cast<ICmpInst>(V);
+                          return IC &&
+                                 (IC->isSigned() ||
+                                  !isKnownNonNegative(IC->getOperand(0),
+                                                      SimplifyQuery(*DL)) ||
+                                  !isKnownNonNegative(IC->getOperand(1),
+                                                      SimplifyQuery(*DL)));
+                        });
+               });
  }

  // If the maximum bit width we compute is less than the width of the roots'

From 44058e5b5f19e2a9c311047f3d55fa0b5fcf5b6c Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Fri, 10 Jan 2025 18:49:43 +0000
Subject: [PATCH 095/408] [LV] Precommit tests for #106441.
Tests for https://github.com/llvm/llvm-project/pull/106441 from https://github.com/llvm/llvm-project/issues/82936. --- ...sform-narrow-interleave-to-widen-memory.ll | 754 ++++++++ ...sform-narrow-interleave-to-widen-memory.ll | 1676 +++++++++++++++++ 2 files changed, 2430 insertions(+) create mode 100644 llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll create mode 100644 llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory.ll diff --git a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll new file mode 100644 index 0000000000000..e32f1a0859a39 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll @@ -0,0 +1,754 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p loop-vectorize -mcpu=skylake -S %s | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux" + +; Test cases for https://github.com/llvm/llvm-project/issues/82936. +define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) { +; CHECK-LABEL: define void @test_4xi64( +; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0 +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 0 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP3]], align 8 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = mul <4 x i64> [[STRIDED_VEC2]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[STRIDED_VEC2]], [[STRIDED_VEC1]] +; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[STRIDED_VEC2]], [[STRIDED_VEC4]] +; CHECK-NEXT: [[TMP11:%.*]] = mul <4 x i64> [[STRIDED_VEC2]], [[STRIDED_VEC3]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> [[TMP5]], <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP11]], <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i64> [[TMP8]], <8 x i64> [[TMP9]], <16 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP10]], <16 x i64> 
poison, <16 x i32> +; CHECK-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[DATA_2:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]] +; CHECK-NEXT: [[L_2:%.*]] = load i64, ptr [[DATA_2]], align 8 +; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 0 +; CHECK-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8 +; CHECK-NEXT: [[MUL_0:%.*]] = mul i64 [[L_2]], [[L_0]] +; CHECK-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8 +; CHECK-NEXT: [[DATA_1:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 1 +; CHECK-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8 +; CHECK-NEXT: [[MUL_1:%.*]] = mul i64 [[L_2]], [[L_1]] +; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8 +; CHECK-NEXT: [[DATA_4:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 2 +; CHECK-NEXT: [[L_4:%.*]] = load i64, ptr [[DATA_4]], align 8 +; CHECK-NEXT: [[MUL_2:%.*]] = mul i64 [[L_2]], [[L_4]] +; CHECK-NEXT: store i64 [[MUL_2]], ptr [[DATA_4]], align 8 +; CHECK-NEXT: [[DATA_3:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 3 +; CHECK-NEXT: [[L_3:%.*]] = load i64, ptr [[DATA_3]], align 8 +; CHECK-NEXT: [[MUL_3:%.*]] = mul i64 [[L_2]], [[L_3]] +; CHECK-NEXT: store i64 [[MUL_3]], ptr [[DATA_3]], align 8 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv + %l.factor = load i64, ptr %arrayidx, align 8 + %data.0 = getelementptr inbounds { i64 , i64, i64, i64 }, ptr %data, i64 %iv, i32 0 + %l.0 = load i64, ptr %data.0, align 8 + %mul.0 = mul i64 %l.factor, %l.0 + store i64 %mul.0, ptr %data.0, align 8 + %data.1 = getelementptr inbounds { i64 , i64, i64, i64 }, ptr %data, i64 %iv, i32 1 + %l.1 = load i64, ptr %data.1, align 8 + %mul.1 = mul i64 %l.factor, %l.1 + store i64 %mul.1, ptr %data.1, align 8 + %data.2 = getelementptr inbounds { i64 , i64, i64, i64 }, ptr %data, i64 %iv, i32 2 + %l.2 = load i64, ptr %data.2, align 8 + %mul.2 = mul i64 %l.factor, %l.2 + store i64 %mul.2, ptr %data.2, align 8 + %data.3 = getelementptr inbounds { i64 , i64, i64, i64 }, ptr %data, i64 %iv, i32 3 + %l.3 = load i64, ptr %data.3, align 8 + %mul.3 = mul i64 %l.factor, %l.3 + store i64 %mul.3, ptr %data.3, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 
noundef %n) { +; CHECK-LABEL: define void @test_2xi64( +; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = shl nsw i64 [[IV]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP3]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]] +; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP9]], <8 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> poison, <8 x i32> +; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]] +; CHECK-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[IV1]], 1 +; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP0]] +; CHECK-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8 +; CHECK-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]] +; CHECK-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = or disjoint i64 [[TMP0]], 1 +; CHECK-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]] +; CHECK-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8 +; CHECK-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]] +; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + 
br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv + %l.factor = load i64, ptr %arrayidx, align 8 + %1 = shl nsw i64 %iv, 1 + %data.0 = getelementptr inbounds i64, ptr %data, i64 %1 + %l.0 = load i64, ptr %data.0, align 8 + %mul.0 = mul i64 %l.factor, %l.0 + store i64 %mul.0, ptr %data.0, align 8 + %3 = or disjoint i64 %1, 1 + %data.1 = getelementptr inbounds i64, ptr %data, i64 %3 + %l.1 = load i64, ptr %data.1, align 8 + %mul.1 = mul i64 %l.factor, %l.1 + store i64 %mul.1, ptr %data.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr noalias %factor, i64 noundef %n) { +; CHECK-LABEL: define void @test_2xi64_interleave_loads_order_flipped( +; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = shl nsw i64 [[IV]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP3]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]] +; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> [[TMP9]], <8 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> poison, <8 x i32> +; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]] +; CHECK-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[IV1]], 1 +; CHECK-NEXT: 
+; CHECK-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = or disjoint i64 [[TMP0]], 1
+; CHECK-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
+; CHECK-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; CHECK-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+ %l.factor = load i64, ptr %arrayidx, align 8
+ %1 = shl nsw i64 %iv, 1
+ %data.0 = getelementptr inbounds i64, ptr %data, i64 %1
+ %l.0 = load i64, ptr %data.0, align 8
+ %3 = or disjoint i64 %1, 1
+ %data.1 = getelementptr inbounds i64, ptr %data, i64 %3
+ %l.1 = load i64, ptr %data.1, align 8
+ %mul.0 = mul i64 %l.factor, %l.1
+ store i64 %mul.0, ptr %data.0, align 8
+ %mul.1 = mul i64 %l.factor, %l.0
+ store i64 %mul.1, ptr %data.1, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; CHECK-LABEL: define void @test_2xi64_store_order_flipped_1(
+; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = shl nsw i64 [[IV]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP3]]
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
+; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
+; CHECK-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP0]]
+; CHECK-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; CHECK-NEXT: [[TMP1:%.*]] = or disjoint i64 [[TMP0]], 1
+; CHECK-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
+; CHECK-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_0]], align 8
+; CHECK-NEXT: store i64 [[MUL_0]], ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+ %l.factor = load i64, ptr %arrayidx, align 8
+ %1 = shl nsw i64 %iv, 1
+ %data.0 = getelementptr inbounds i64, ptr %data, i64 %1
+ %l.0 = load i64, ptr %data.0, align 8
+ %mul.0 = mul i64 %l.factor, %l.0
+ %3 = or disjoint i64 %1, 1
+ %data.1 = getelementptr inbounds i64, ptr %data, i64 %3
+ %l.1 = load i64, ptr %data.1, align 8
+ %mul.1 = mul i64 %l.factor, %l.1
+ store i64 %mul.1, ptr %data.0, align 8
+ store i64 %mul.0, ptr %data.1, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; CHECK-LABEL: define void @test_2xi64_store_order_flipped_2(
+; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = shl nsw i64 [[IV]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP3]]
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
+; CHECK-NEXT: [[TMP7:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP8]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
+; CHECK-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP0]]
+; CHECK-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; CHECK-NEXT: [[TMP1:%.*]] = or disjoint i64 [[TMP0]], 1
+; CHECK-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
+; CHECK-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; CHECK-NEXT: store i64 [[MUL_0]], ptr [[DATA_1]], align 8
+; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+ %l.factor = load i64, ptr %arrayidx, align 8
+ %1 = shl nsw i64 %iv, 1
+ %data.0 = getelementptr inbounds i64, ptr %data, i64 %1
+ %l.0 = load i64, ptr %data.0, align 8
+ %mul.0 = mul i64 %l.factor, %l.0
+ %3 = or disjoint i64 %1, 1
+ %data.1 = getelementptr inbounds i64, ptr %data, i64 %3
+ %l.1 = load i64, ptr %data.1, align 8
+ %mul.1 = mul i64 %l.factor, %l.1
+ store i64 %mul.0, ptr %data.1, align 8
+ store i64 %mul.1, ptr %data.0, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noalias %src.0, ptr noalias %src.1, i64 noundef %n) {
+; CHECK-LABEL: define void @test_2xi64_different_loads_feeding_fmul(
+; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[SRC_0:%.*]], ptr noalias [[SRC_1:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 4, i64 [[N_MOD_VF]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[TMP3]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i64, ptr [[SRC_0]], i64 [[IV]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[GEP_SRC_0]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = shl nsw i64 [[IV]], 1
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP5]]
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP7:%.*]] = or disjoint i64 [[TMP5]], 1
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP7]]
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP8]], align 8
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[SRC_1]], i64 [[IV]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8
+; CHECK-NEXT: [[TMP12:%.*]] = mul <4 x i64> [[WIDE_LOAD2]], [[STRIDED_VEC]]
+; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP14]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP16]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds i64, ptr [[SRC_0]], i64 [[IV1]]
+; CHECK-NEXT: [[L_SRC_0:%.*]] = load i64, ptr [[GEP_SRC_2]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP0]]
+; CHECK-NEXT: [[L_0:%.*]] = load i64, ptr [[GEP_SRC_2]], align 8
+; CHECK-NEXT: [[MUL_0:%.*]] = mul i64 [[L_SRC_0]], [[L_0]]
+; CHECK-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = or disjoint i64 [[TMP0]], 1
+; CHECK-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
+; CHECK-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds i64, ptr [[SRC_1]], i64 [[IV1]]
+; CHECK-NEXT: [[L_SRC_1:%.*]] = load i64, ptr [[GEP_SRC_1]], align 8
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i64 [[L_SRC_1]], [[L_1]]
+; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %gep.src.0 = getelementptr inbounds i64, ptr %src.0, i64 %iv
+ %l.src.0 = load i64, ptr %gep.src.0, align 8
+ %1 = shl nsw i64 %iv, 1
+ %data.0 = getelementptr inbounds i64, ptr %data, i64 %1
+ %l.0 = load i64, ptr %gep.src.0, align 8
+ %mul.0 = mul i64 %l.src.0, %l.0
+ store i64 %mul.0, ptr %data.0, align 8
+ %3 = or disjoint i64 %1, 1
+ %data.1 = getelementptr inbounds i64, ptr %data, i64 %3
+ %l.1 = load i64, ptr %data.1, align 8
+ %gep.src.1 = getelementptr inbounds i64, ptr %src.1, i64 %iv
+ %l.src.1 = load i64, ptr %gep.src.1, align 8
+ %mul.1 = mul i64 %l.src.1, %l.1
+ store i64 %mul.1, ptr %data.1, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @test_3xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; CHECK-LABEL: define void @test_3xi64(
+; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 0
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP3]], align 8
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
+; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
+; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC2]]
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i64> [[TMP10]], <8 x i64> [[TMP11]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP12]], <12 x i64> poison, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
+; CHECK-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 0
+; CHECK-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; CHECK-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[DATA_1:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 1
+; CHECK-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[DATA_2:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 2
+; CHECK-NEXT: [[L_2:%.*]] = load i64, ptr [[DATA_2]], align 8
+; CHECK-NEXT: [[MUL_2:%.*]] = mul i64 [[L_FACTOR]], [[L_2]]
+; CHECK-NEXT: store i64 [[MUL_2]], ptr [[DATA_2]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+ %l.factor = load i64, ptr %arrayidx, align 8
+ %data.0 = getelementptr inbounds { i64, i64, i64 }, ptr %data, i64 %iv, i32 0
+ %l.0 = load i64, ptr %data.0, align 8
+ %mul.0 = mul i64 %l.factor, %l.0
+ store i64 %mul.0, ptr %data.0, align 8
+ %data.1 = getelementptr inbounds { i64, i64, i64 }, ptr %data, i64 %iv, i32 1
+ %l.1 = load i64, ptr %data.1, align 8
+ %mul.1 = mul i64 %l.factor, %l.1
+ store i64 %mul.1, ptr %data.1, align 8
+ %data.2 = getelementptr inbounds { i64, i64, i64 }, ptr %data, i64 %iv, i32 2
+ %l.2 = load i64, ptr %data.2, align 8
+ %mul.2 = mul i64 %l.factor, %l.2
+ store i64 %mul.2, ptr %data.2, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @test_2xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; CHECK-LABEL: define void @test_2xi32(
+; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; CHECK-NEXT: [[L_FACTOR:%.*]] = load i32, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 0
+; CHECK-NEXT: [[L_0:%.*]] = load i32, ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[L_FACTOR]], [[L_0]]
+; CHECK-NEXT: store i32 [[MUL_0]], ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[DATA_1:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 1
+; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[L_FACTOR]], [[L_1]]
+; CHECK-NEXT: store i32 [[MUL_1]], ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+ %l.factor = load i32, ptr %arrayidx, align 8
+ %data.0 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 0
+ %l.0 = load i32, ptr %data.0, align 8
+ %mul.0 = mul i32 %l.factor, %l.0
+ store i32 %mul.0, ptr %data.0, align 8
+ %data.1 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 1
+ %l.1 = load i32, ptr %data.1, align 8
+ %mul.1 = mul i32 %l.factor, %l.1
+ store i32 %mul.1, ptr %data.1, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @test_3xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; CHECK-LABEL: define void @test_3xi32(
+; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[N]], 8
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 8, i64 [[N_MOD_VF]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[TMP1]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 0
+; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <24 x i32>, ptr [[TMP5]], align 8
+; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC1]], <24 x i32> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <24 x i32> [[WIDE_VEC1]], <24 x i32> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <24 x i32> [[WIDE_VEC1]], <24 x i32> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+; CHECK-NEXT: [[TMP7:%.*]] = mul <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC2]]
+; CHECK-NEXT: [[TMP8:%.*]] = mul <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC3]]
+; CHECK-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC4]]
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP8]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i32> [[TMP12]], <16 x i32> [[TMP13]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP14]], <24 x i32> poison, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
+; CHECK-NEXT: [[L_FACTOR:%.*]] = load i32, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV1]], i32 0
+; CHECK-NEXT: [[L_0:%.*]] = load i32, ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[L_FACTOR]], [[L_0]]
+; CHECK-NEXT: store i32 [[MUL_0]], ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[DATA_1:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV1]], i32 1
+; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[L_FACTOR]], [[L_1]]
+; CHECK-NEXT: store i32 [[MUL_1]], ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[DATA_2:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV1]], i32 2
+; CHECK-NEXT: [[L_2:%.*]] = load i32, ptr [[DATA_2]], align 8
+; CHECK-NEXT: [[MUL_2:%.*]] = mul i32 [[L_FACTOR]], [[L_2]]
+; CHECK-NEXT: store i32 [[MUL_2]], ptr [[DATA_2]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+ %l.factor = load i32, ptr %arrayidx, align 8
+ %data.0 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 0
+ %l.0 = load i32, ptr %data.0, align 8
+ %mul.0 = mul i32 %l.factor, %l.0
+ store i32 %mul.0, ptr %data.0, align 8
+ %data.1 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 1
+ %l.1 = load i32, ptr %data.1, align 8
+ %mul.1 = mul i32 %l.factor, %l.1
+ store i32 %mul.1, ptr %data.1, align 8
+ %data.2 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 2
+ %l.2 = load i32, ptr %data.2, align 8
+ %mul.2 = mul i32 %l.factor, %l.2
+ store i32 %mul.2, ptr %data.2, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
+; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
+; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory.ll
new file mode 100644
index 0000000000000..c234cd8775ca9
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory.ll
@@ -0,0 +1,1676 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -force-vector-width=2 -S %s | FileCheck --check-prefixes=VF2 %s
+; RUN: opt -p loop-vectorize -force-vector-width=4 -S %s | FileCheck --check-prefixes=VF4 %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+
+define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; VF2-LABEL: define void @test_2xi64(
+; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF2-NEXT: [[ENTRY:.*]]:
+; VF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; VF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF2: [[VECTOR_PH]]:
+; VF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; VF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2: [[VECTOR_BODY]]:
+; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT: [[TMP3:%.*]] = shl nsw <2 x i64> [[VEC_IND]], splat (i64 1)
+; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
+; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
+; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8
+; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF2-NEXT: 
[[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0 +; VF2-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1 +; VF2-NEXT: [[TMP12:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP11]] +; VF2-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 +; VF2-NEXT: store i64 [[TMP13]], ptr [[TMP5]], align 8 +; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 +; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP7]], align 8 +; VF2-NEXT: [[TMP15:%.*]] = or disjoint <2 x i64> [[TMP3]], splat (i64 1) +; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP15]], i32 0 +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP16]] +; VF2-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP15]], i32 1 +; VF2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP18]] +; VF2-NEXT: [[TMP20:%.*]] = load i64, ptr [[TMP17]], align 8 +; VF2-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP19]], align 8 +; VF2-NEXT: [[TMP22:%.*]] = insertelement <2 x i64> poison, i64 [[TMP20]], i32 0 +; VF2-NEXT: [[TMP23:%.*]] = insertelement <2 x i64> [[TMP22]], i64 [[TMP21]], i32 1 +; VF2-NEXT: [[TMP24:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP23]] +; VF2-NEXT: [[TMP25:%.*]] = extractelement <2 x i64> [[TMP24]], i32 0 +; VF2-NEXT: store i64 [[TMP25]], ptr [[TMP17]], align 8 +; VF2-NEXT: [[TMP26:%.*]] = extractelement <2 x i64> [[TMP24]], i32 1 +; VF2-NEXT: store i64 [[TMP26]], ptr [[TMP19]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; VF2-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF2-NEXT: br i1 [[TMP27]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; VF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; VF2: [[SCALAR_PH]]: +; VF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; VF2-NEXT: br label %[[LOOP:.*]] +; VF2: [[LOOP]]: +; VF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VF2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]] +; VF2-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; VF2-NEXT: [[TMP28:%.*]] = shl nsw i64 [[IV]], 1 +; VF2-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP28]] +; VF2-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8 +; VF2-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]] +; VF2-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8 +; VF2-NEXT: [[TMP29:%.*]] = or disjoint i64 [[TMP28]], 1 +; VF2-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP29]] +; VF2-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8 +; VF2-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]] +; VF2-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8 +; VF2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; VF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; VF2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +; VF4-LABEL: define void @test_2xi64( +; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) { +; VF4-NEXT: [[ENTRY:.*]]: +; VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label 
%[[VECTOR_PH:.*]] +; VF4: [[VECTOR_PH]]: +; VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; VF4: [[VECTOR_BODY]]: +; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]] +; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; VF4-NEXT: [[TMP3:%.*]] = shl nsw <4 x i64> [[VEC_IND]], splat (i64 1) +; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]] +; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]] +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP8]] +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP10]] +; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP5]], align 8 +; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP9]], align 8 +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i32 0 +; VF4-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 1 +; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 2 +; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 3 +; VF4-NEXT: [[TMP20:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP19]] +; VF4-NEXT: [[TMP21:%.*]] = extractelement <4 x i64> [[TMP20]], i32 0 +; VF4-NEXT: store i64 [[TMP21]], ptr [[TMP5]], align 8 +; VF4-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP20]], i32 1 +; VF4-NEXT: store i64 [[TMP22]], ptr [[TMP7]], align 8 +; VF4-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP20]], i32 2 +; VF4-NEXT: store i64 [[TMP23]], ptr [[TMP9]], align 8 +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP20]], i32 3 +; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP25:%.*]] = or disjoint <4 x i64> [[TMP3]], splat (i64 1) +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP25]], i32 0 +; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP26]] +; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP25]], i32 1 +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP28]] +; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP25]], i32 2 +; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP30]] +; VF4-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP25]], i32 3 +; VF4-NEXT: [[TMP33:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP32]] +; VF4-NEXT: [[TMP34:%.*]] = load i64, ptr [[TMP27]], align 8 +; VF4-NEXT: [[TMP35:%.*]] = load i64, ptr [[TMP29]], align 8 +; VF4-NEXT: [[TMP36:%.*]] = load i64, ptr [[TMP31]], align 8 +; VF4-NEXT: [[TMP37:%.*]] = load i64, ptr [[TMP33]], align 8 +; VF4-NEXT: [[TMP38:%.*]] = insertelement <4 x i64> 
poison, i64 [[TMP34]], i32 0 +; VF4-NEXT: [[TMP39:%.*]] = insertelement <4 x i64> [[TMP38]], i64 [[TMP35]], i32 1 +; VF4-NEXT: [[TMP40:%.*]] = insertelement <4 x i64> [[TMP39]], i64 [[TMP36]], i32 2 +; VF4-NEXT: [[TMP41:%.*]] = insertelement <4 x i64> [[TMP40]], i64 [[TMP37]], i32 3 +; VF4-NEXT: [[TMP42:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP41]] +; VF4-NEXT: [[TMP43:%.*]] = extractelement <4 x i64> [[TMP42]], i32 0 +; VF4-NEXT: store i64 [[TMP43]], ptr [[TMP27]], align 8 +; VF4-NEXT: [[TMP44:%.*]] = extractelement <4 x i64> [[TMP42]], i32 1 +; VF4-NEXT: store i64 [[TMP44]], ptr [[TMP29]], align 8 +; VF4-NEXT: [[TMP45:%.*]] = extractelement <4 x i64> [[TMP42]], i32 2 +; VF4-NEXT: store i64 [[TMP45]], ptr [[TMP31]], align 8 +; VF4-NEXT: [[TMP46:%.*]] = extractelement <4 x i64> [[TMP42]], i32 3 +; VF4-NEXT: store i64 [[TMP46]], ptr [[TMP33]], align 8 +; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; VF4-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF4-NEXT: br i1 [[TMP47]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VF4: [[MIDDLE_BLOCK]]: +; VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; VF4: [[SCALAR_PH]]: +; VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; VF4-NEXT: br label %[[LOOP:.*]] +; VF4: [[LOOP]]: +; VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VF4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]] +; VF4-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; VF4-NEXT: [[TMP48:%.*]] = shl nsw i64 [[IV]], 1 +; VF4-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP48]] +; VF4-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8 +; VF4-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]] +; VF4-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8 +; VF4-NEXT: [[TMP49:%.*]] = or disjoint i64 [[TMP48]], 1 +; VF4-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP49]] +; VF4-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8 +; VF4-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]] +; VF4-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8 +; VF4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; VF4-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; VF4-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; VF4: [[EXIT]]: +; VF4-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv + %l.factor = load i64, ptr %arrayidx, align 8 + %1 = shl nsw i64 %iv, 1 + %data.0 = getelementptr inbounds i64, ptr %data, i64 %1 + %l.0 = load i64, ptr %data.0, align 8 + %mul.0 = mul i64 %l.factor, %l.0 + store i64 %mul.0, ptr %data.0, align 8 + %3 = or disjoint i64 %1, 1 + %data.1 = getelementptr inbounds i64, ptr %data, i64 %3 + %l.1 = load i64, ptr %data.1, align 8 + %mul.1 = mul i64 %l.factor, %l.1 + store i64 %mul.1, ptr %data.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr noalias %factor, i64 noundef %n) { +; VF2-LABEL: define void @test_2xi64_interleave_loads_order_flipped( +; 
VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) { +; VF2-NEXT: [[ENTRY:.*]]: +; VF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2 +; VF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2 +; VF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]] +; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8 +; VF2-NEXT: [[TMP3:%.*]] = shl nsw <2 x i64> [[VEC_IND]], splat (i64 1) +; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]] +; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]] +; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8 +; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0 +; VF2-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1 +; VF2-NEXT: [[TMP12:%.*]] = or disjoint <2 x i64> [[TMP3]], splat (i64 1) +; VF2-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 +; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP13]] +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 +; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP15]] +; VF2-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP14]], align 8 +; VF2-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 8 +; VF2-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> poison, i64 [[TMP17]], i32 0 +; VF2-NEXT: [[TMP20:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[TMP18]], i32 1 +; VF2-NEXT: [[TMP21:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP20]] +; VF2-NEXT: [[TMP22:%.*]] = extractelement <2 x i64> [[TMP21]], i32 0 +; VF2-NEXT: store i64 [[TMP22]], ptr [[TMP5]], align 8 +; VF2-NEXT: [[TMP23:%.*]] = extractelement <2 x i64> [[TMP21]], i32 1 +; VF2-NEXT: store i64 [[TMP23]], ptr [[TMP7]], align 8 +; VF2-NEXT: [[TMP24:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP11]] +; VF2-NEXT: [[TMP25:%.*]] = extractelement <2 x i64> [[TMP24]], i32 0 +; VF2-NEXT: store i64 [[TMP25]], ptr [[TMP14]], align 8 +; VF2-NEXT: [[TMP26:%.*]] = extractelement <2 x i64> [[TMP24]], i32 1 +; VF2-NEXT: store i64 [[TMP26]], ptr [[TMP16]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; VF2-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF2-NEXT: br i1 [[TMP27]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; VF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; VF2: [[SCALAR_PH]]: +; VF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; VF2-NEXT: br label 
%[[LOOP:.*]] +; VF2: [[LOOP]]: +; VF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VF2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]] +; VF2-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; VF2-NEXT: [[TMP28:%.*]] = shl nsw i64 [[IV]], 1 +; VF2-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP28]] +; VF2-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8 +; VF2-NEXT: [[TMP29:%.*]] = or disjoint i64 [[TMP28]], 1 +; VF2-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP29]] +; VF2-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8 +; VF2-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_1]] +; VF2-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8 +; VF2-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_0]] +; VF2-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8 +; VF2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; VF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; VF2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +; VF4-LABEL: define void @test_2xi64_interleave_loads_order_flipped( +; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) { +; VF4-NEXT: [[ENTRY:.*]]: +; VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; VF4: [[VECTOR_PH]]: +; VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; VF4: [[VECTOR_BODY]]: +; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]] +; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; VF4-NEXT: [[TMP3:%.*]] = shl nsw <4 x i64> [[VEC_IND]], splat (i64 1) +; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]] +; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]] +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP8]] +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP10]] +; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP5]], align 8 +; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP9]], align 8 +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i32 0 +; VF4-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 1 +; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 2 +; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 3 +; VF4-NEXT: [[TMP20:%.*]] = or disjoint <4 x i64> [[TMP3]], splat (i64 1) +; VF4-NEXT: 
[[TMP21:%.*]] = extractelement <4 x i64> [[TMP20]], i32 0 +; VF4-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP21]] +; VF4-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP20]], i32 1 +; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP23]] +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP20]], i32 2 +; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP25]] +; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP20]], i32 3 +; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP27]] +; VF4-NEXT: [[TMP29:%.*]] = load i64, ptr [[TMP22]], align 8 +; VF4-NEXT: [[TMP30:%.*]] = load i64, ptr [[TMP24]], align 8 +; VF4-NEXT: [[TMP31:%.*]] = load i64, ptr [[TMP26]], align 8 +; VF4-NEXT: [[TMP32:%.*]] = load i64, ptr [[TMP28]], align 8 +; VF4-NEXT: [[TMP33:%.*]] = insertelement <4 x i64> poison, i64 [[TMP29]], i32 0 +; VF4-NEXT: [[TMP34:%.*]] = insertelement <4 x i64> [[TMP33]], i64 [[TMP30]], i32 1 +; VF4-NEXT: [[TMP35:%.*]] = insertelement <4 x i64> [[TMP34]], i64 [[TMP31]], i32 2 +; VF4-NEXT: [[TMP36:%.*]] = insertelement <4 x i64> [[TMP35]], i64 [[TMP32]], i32 3 +; VF4-NEXT: [[TMP37:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP36]] +; VF4-NEXT: [[TMP38:%.*]] = extractelement <4 x i64> [[TMP37]], i32 0 +; VF4-NEXT: store i64 [[TMP38]], ptr [[TMP5]], align 8 +; VF4-NEXT: [[TMP39:%.*]] = extractelement <4 x i64> [[TMP37]], i32 1 +; VF4-NEXT: store i64 [[TMP39]], ptr [[TMP7]], align 8 +; VF4-NEXT: [[TMP40:%.*]] = extractelement <4 x i64> [[TMP37]], i32 2 +; VF4-NEXT: store i64 [[TMP40]], ptr [[TMP9]], align 8 +; VF4-NEXT: [[TMP41:%.*]] = extractelement <4 x i64> [[TMP37]], i32 3 +; VF4-NEXT: store i64 [[TMP41]], ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP42:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP19]] +; VF4-NEXT: [[TMP43:%.*]] = extractelement <4 x i64> [[TMP42]], i32 0 +; VF4-NEXT: store i64 [[TMP43]], ptr [[TMP22]], align 8 +; VF4-NEXT: [[TMP44:%.*]] = extractelement <4 x i64> [[TMP42]], i32 1 +; VF4-NEXT: store i64 [[TMP44]], ptr [[TMP24]], align 8 +; VF4-NEXT: [[TMP45:%.*]] = extractelement <4 x i64> [[TMP42]], i32 2 +; VF4-NEXT: store i64 [[TMP45]], ptr [[TMP26]], align 8 +; VF4-NEXT: [[TMP46:%.*]] = extractelement <4 x i64> [[TMP42]], i32 3 +; VF4-NEXT: store i64 [[TMP46]], ptr [[TMP28]], align 8 +; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; VF4-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF4-NEXT: br i1 [[TMP47]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF4: [[MIDDLE_BLOCK]]: +; VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; VF4: [[SCALAR_PH]]: +; VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; VF4-NEXT: br label %[[LOOP:.*]] +; VF4: [[LOOP]]: +; VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VF4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]] +; VF4-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; VF4-NEXT: [[TMP48:%.*]] = shl nsw i64 [[IV]], 1 +; VF4-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP48]] +; VF4-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8 +; VF4-NEXT: [[TMP49:%.*]] = or disjoint i64 [[TMP48]], 1 +; VF4-NEXT: [[DATA_1:%.*]] = getelementptr inbounds 
i64, ptr [[DATA]], i64 [[TMP49]] +; VF4-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8 +; VF4-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_1]] +; VF4-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8 +; VF4-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_0]] +; VF4-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8 +; VF4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; VF4-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; VF4-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; VF4: [[EXIT]]: +; VF4-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv + %l.factor = load i64, ptr %arrayidx, align 8 + %1 = shl nsw i64 %iv, 1 + %data.0 = getelementptr inbounds i64, ptr %data, i64 %1 + %l.0 = load i64, ptr %data.0, align 8 + %3 = or disjoint i64 %1, 1 + %data.1 = getelementptr inbounds i64, ptr %data, i64 %3 + %l.1 = load i64, ptr %data.1, align 8 + %mul.0 = mul i64 %l.factor, %l.1 + store i64 %mul.0, ptr %data.0, align 8 + %mul.1 = mul i64 %l.factor, %l.0 + store i64 %mul.1, ptr %data.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %factor, i64 noundef %n) { +; VF2-LABEL: define void @test_2xi64_store_order_flipped_1( +; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) { +; VF2-NEXT: [[ENTRY:.*]]: +; VF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2 +; VF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2 +; VF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]] +; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8 +; VF2-NEXT: [[TMP3:%.*]] = shl nsw <2 x i64> [[VEC_IND]], splat (i64 1) +; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]] +; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]] +; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8 +; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0 +; VF2-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1 +; VF2-NEXT: [[TMP12:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP11]] +; VF2-NEXT: [[TMP13:%.*]] = or disjoint <2 x i64> [[TMP3]], splat (i64 1) +; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 +; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP14]] +; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP16]] +; VF2-NEXT: 
[[TMP18:%.*]] = load i64, ptr [[TMP15]], align 8 +; VF2-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP17]], align 8 +; VF2-NEXT: [[TMP20:%.*]] = insertelement <2 x i64> poison, i64 [[TMP18]], i32 0 +; VF2-NEXT: [[TMP21:%.*]] = insertelement <2 x i64> [[TMP20]], i64 [[TMP19]], i32 1 +; VF2-NEXT: [[TMP22:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP21]] +; VF2-NEXT: [[TMP23:%.*]] = extractelement <2 x i64> [[TMP22]], i32 0 +; VF2-NEXT: store i64 [[TMP23]], ptr [[TMP5]], align 8 +; VF2-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP22]], i32 1 +; VF2-NEXT: store i64 [[TMP24]], ptr [[TMP7]], align 8 +; VF2-NEXT: [[TMP25:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 +; VF2-NEXT: store i64 [[TMP25]], ptr [[TMP15]], align 8 +; VF2-NEXT: [[TMP26:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 +; VF2-NEXT: store i64 [[TMP26]], ptr [[TMP17]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; VF2-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF2-NEXT: br i1 [[TMP27]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; VF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; VF2: [[SCALAR_PH]]: +; VF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; VF2-NEXT: br label %[[LOOP:.*]] +; VF2: [[LOOP]]: +; VF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VF2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]] +; VF2-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; VF2-NEXT: [[TMP28:%.*]] = shl nsw i64 [[IV]], 1 +; VF2-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP28]] +; VF2-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8 +; VF2-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]] +; VF2-NEXT: [[TMP29:%.*]] = or disjoint i64 [[TMP28]], 1 +; VF2-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP29]] +; VF2-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8 +; VF2-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]] +; VF2-NEXT: store i64 [[MUL_1]], ptr [[DATA_0]], align 8 +; VF2-NEXT: store i64 [[MUL_0]], ptr [[DATA_1]], align 8 +; VF2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; VF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; VF2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +; VF4-LABEL: define void @test_2xi64_store_order_flipped_1( +; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) { +; VF4-NEXT: [[ENTRY:.*]]: +; VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; VF4: [[VECTOR_PH]]: +; VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; VF4: [[VECTOR_BODY]]: +; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]] +; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds 
i64, ptr [[TMP1]], i32 0 +; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; VF4-NEXT: [[TMP3:%.*]] = shl nsw <4 x i64> [[VEC_IND]], splat (i64 1) +; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]] +; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]] +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP8]] +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP10]] +; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP5]], align 8 +; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP9]], align 8 +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i32 0 +; VF4-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 1 +; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 2 +; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 3 +; VF4-NEXT: [[TMP20:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP19]] +; VF4-NEXT: [[TMP21:%.*]] = or disjoint <4 x i64> [[TMP3]], splat (i64 1) +; VF4-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP21]], i32 0 +; VF4-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP22]] +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP21]], i32 1 +; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP24]] +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP21]], i32 2 +; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP26]] +; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP21]], i32 3 +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP28]] +; VF4-NEXT: [[TMP30:%.*]] = load i64, ptr [[TMP23]], align 8 +; VF4-NEXT: [[TMP31:%.*]] = load i64, ptr [[TMP25]], align 8 +; VF4-NEXT: [[TMP32:%.*]] = load i64, ptr [[TMP27]], align 8 +; VF4-NEXT: [[TMP33:%.*]] = load i64, ptr [[TMP29]], align 8 +; VF4-NEXT: [[TMP34:%.*]] = insertelement <4 x i64> poison, i64 [[TMP30]], i32 0 +; VF4-NEXT: [[TMP35:%.*]] = insertelement <4 x i64> [[TMP34]], i64 [[TMP31]], i32 1 +; VF4-NEXT: [[TMP36:%.*]] = insertelement <4 x i64> [[TMP35]], i64 [[TMP32]], i32 2 +; VF4-NEXT: [[TMP37:%.*]] = insertelement <4 x i64> [[TMP36]], i64 [[TMP33]], i32 3 +; VF4-NEXT: [[TMP38:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP37]] +; VF4-NEXT: [[TMP39:%.*]] = extractelement <4 x i64> [[TMP38]], i32 0 +; VF4-NEXT: store i64 [[TMP39]], ptr [[TMP5]], align 8 +; VF4-NEXT: [[TMP40:%.*]] = extractelement <4 x i64> [[TMP38]], i32 1 +; VF4-NEXT: store i64 [[TMP40]], ptr [[TMP7]], align 8 +; VF4-NEXT: [[TMP41:%.*]] = extractelement <4 x i64> [[TMP38]], i32 2 +; VF4-NEXT: store i64 [[TMP41]], ptr [[TMP9]], align 8 +; VF4-NEXT: [[TMP42:%.*]] = extractelement <4 x i64> [[TMP38]], i32 3 +; VF4-NEXT: store i64 [[TMP42]], ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP43:%.*]] = extractelement <4 x i64> [[TMP20]], i32 0 +; VF4-NEXT: store i64 [[TMP43]], ptr [[TMP23]], align 8 +; VF4-NEXT: [[TMP44:%.*]] = extractelement <4 x i64> [[TMP20]], i32 1 +; VF4-NEXT: store i64 [[TMP44]], ptr [[TMP25]], align 8 +; 
VF4-NEXT: [[TMP45:%.*]] = extractelement <4 x i64> [[TMP20]], i32 2 +; VF4-NEXT: store i64 [[TMP45]], ptr [[TMP27]], align 8 +; VF4-NEXT: [[TMP46:%.*]] = extractelement <4 x i64> [[TMP20]], i32 3 +; VF4-NEXT: store i64 [[TMP46]], ptr [[TMP29]], align 8 +; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; VF4-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF4-NEXT: br i1 [[TMP47]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VF4: [[MIDDLE_BLOCK]]: +; VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; VF4: [[SCALAR_PH]]: +; VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; VF4-NEXT: br label %[[LOOP:.*]] +; VF4: [[LOOP]]: +; VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VF4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]] +; VF4-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; VF4-NEXT: [[TMP48:%.*]] = shl nsw i64 [[IV]], 1 +; VF4-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP48]] +; VF4-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8 +; VF4-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]] +; VF4-NEXT: [[TMP49:%.*]] = or disjoint i64 [[TMP48]], 1 +; VF4-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP49]] +; VF4-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8 +; VF4-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]] +; VF4-NEXT: store i64 [[MUL_1]], ptr [[DATA_0]], align 8 +; VF4-NEXT: store i64 [[MUL_0]], ptr [[DATA_1]], align 8 +; VF4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; VF4-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; VF4-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; VF4: [[EXIT]]: +; VF4-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv + %l.factor = load i64, ptr %arrayidx, align 8 + %1 = shl nsw i64 %iv, 1 + %data.0 = getelementptr inbounds i64, ptr %data, i64 %1 + %l.0 = load i64, ptr %data.0, align 8 + %mul.0 = mul i64 %l.factor, %l.0 + %3 = or disjoint i64 %1, 1 + %data.1 = getelementptr inbounds i64, ptr %data, i64 %3 + %l.1 = load i64, ptr %data.1, align 8 + %mul.1 = mul i64 %l.factor, %l.1 + store i64 %mul.1, ptr %data.0, align 8 + store i64 %mul.0, ptr %data.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %factor, i64 noundef %n) { +; VF2-LABEL: define void @test_2xi64_store_order_flipped_2( +; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) { +; VF2-NEXT: [[ENTRY:.*]]: +; VF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2 +; VF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2 +; VF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] 
], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]] +; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8 +; VF2-NEXT: [[TMP3:%.*]] = shl nsw <2 x i64> [[VEC_IND]], splat (i64 1) +; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]] +; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]] +; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8 +; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0 +; VF2-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1 +; VF2-NEXT: [[TMP12:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP11]] +; VF2-NEXT: [[TMP13:%.*]] = or disjoint <2 x i64> [[TMP3]], splat (i64 1) +; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 +; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP14]] +; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP16]] +; VF2-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP15]], align 8 +; VF2-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP17]], align 8 +; VF2-NEXT: [[TMP20:%.*]] = insertelement <2 x i64> poison, i64 [[TMP18]], i32 0 +; VF2-NEXT: [[TMP21:%.*]] = insertelement <2 x i64> [[TMP20]], i64 [[TMP19]], i32 1 +; VF2-NEXT: [[TMP22:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP21]] +; VF2-NEXT: [[TMP23:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 +; VF2-NEXT: store i64 [[TMP23]], ptr [[TMP15]], align 8 +; VF2-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 +; VF2-NEXT: store i64 [[TMP24]], ptr [[TMP17]], align 8 +; VF2-NEXT: [[TMP25:%.*]] = extractelement <2 x i64> [[TMP22]], i32 0 +; VF2-NEXT: store i64 [[TMP25]], ptr [[TMP5]], align 8 +; VF2-NEXT: [[TMP26:%.*]] = extractelement <2 x i64> [[TMP22]], i32 1 +; VF2-NEXT: store i64 [[TMP26]], ptr [[TMP7]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; VF2-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF2-NEXT: br i1 [[TMP27]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; VF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; VF2: [[SCALAR_PH]]: +; VF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; VF2-NEXT: br label %[[LOOP:.*]] +; VF2: [[LOOP]]: +; VF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VF2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]] +; VF2-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; VF2-NEXT: [[TMP28:%.*]] = shl nsw i64 [[IV]], 1 +; VF2-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP28]] +; VF2-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8 +; VF2-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]] +; VF2-NEXT: [[TMP29:%.*]] = or disjoint i64 [[TMP28]], 1 +; VF2-NEXT: [[DATA_1:%.*]] = 
getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP29]] +; VF2-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8 +; VF2-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]] +; VF2-NEXT: store i64 [[MUL_0]], ptr [[DATA_1]], align 8 +; VF2-NEXT: store i64 [[MUL_1]], ptr [[DATA_0]], align 8 +; VF2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; VF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; VF2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +; VF4-LABEL: define void @test_2xi64_store_order_flipped_2( +; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) { +; VF4-NEXT: [[ENTRY:.*]]: +; VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; VF4: [[VECTOR_PH]]: +; VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; VF4: [[VECTOR_BODY]]: +; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]] +; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; VF4-NEXT: [[TMP3:%.*]] = shl nsw <4 x i64> [[VEC_IND]], splat (i64 1) +; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]] +; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]] +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP8]] +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP10]] +; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP5]], align 8 +; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP9]], align 8 +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i32 0 +; VF4-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 1 +; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 2 +; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 3 +; VF4-NEXT: [[TMP20:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP19]] +; VF4-NEXT: [[TMP21:%.*]] = or disjoint <4 x i64> [[TMP3]], splat (i64 1) +; VF4-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP21]], i32 0 +; VF4-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP22]] +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP21]], i32 1 +; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP24]] +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP21]], i32 2 +; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP26]] +; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP21]], i32 3 +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds 
i64, ptr [[DATA]], i64 [[TMP28]] +; VF4-NEXT: [[TMP30:%.*]] = load i64, ptr [[TMP23]], align 8 +; VF4-NEXT: [[TMP31:%.*]] = load i64, ptr [[TMP25]], align 8 +; VF4-NEXT: [[TMP32:%.*]] = load i64, ptr [[TMP27]], align 8 +; VF4-NEXT: [[TMP33:%.*]] = load i64, ptr [[TMP29]], align 8 +; VF4-NEXT: [[TMP34:%.*]] = insertelement <4 x i64> poison, i64 [[TMP30]], i32 0 +; VF4-NEXT: [[TMP35:%.*]] = insertelement <4 x i64> [[TMP34]], i64 [[TMP31]], i32 1 +; VF4-NEXT: [[TMP36:%.*]] = insertelement <4 x i64> [[TMP35]], i64 [[TMP32]], i32 2 +; VF4-NEXT: [[TMP37:%.*]] = insertelement <4 x i64> [[TMP36]], i64 [[TMP33]], i32 3 +; VF4-NEXT: [[TMP38:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP37]] +; VF4-NEXT: [[TMP39:%.*]] = extractelement <4 x i64> [[TMP20]], i32 0 +; VF4-NEXT: store i64 [[TMP39]], ptr [[TMP23]], align 8 +; VF4-NEXT: [[TMP40:%.*]] = extractelement <4 x i64> [[TMP20]], i32 1 +; VF4-NEXT: store i64 [[TMP40]], ptr [[TMP25]], align 8 +; VF4-NEXT: [[TMP41:%.*]] = extractelement <4 x i64> [[TMP20]], i32 2 +; VF4-NEXT: store i64 [[TMP41]], ptr [[TMP27]], align 8 +; VF4-NEXT: [[TMP42:%.*]] = extractelement <4 x i64> [[TMP20]], i32 3 +; VF4-NEXT: store i64 [[TMP42]], ptr [[TMP29]], align 8 +; VF4-NEXT: [[TMP43:%.*]] = extractelement <4 x i64> [[TMP38]], i32 0 +; VF4-NEXT: store i64 [[TMP43]], ptr [[TMP5]], align 8 +; VF4-NEXT: [[TMP44:%.*]] = extractelement <4 x i64> [[TMP38]], i32 1 +; VF4-NEXT: store i64 [[TMP44]], ptr [[TMP7]], align 8 +; VF4-NEXT: [[TMP45:%.*]] = extractelement <4 x i64> [[TMP38]], i32 2 +; VF4-NEXT: store i64 [[TMP45]], ptr [[TMP9]], align 8 +; VF4-NEXT: [[TMP46:%.*]] = extractelement <4 x i64> [[TMP38]], i32 3 +; VF4-NEXT: store i64 [[TMP46]], ptr [[TMP11]], align 8 +; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; VF4-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF4-NEXT: br i1 [[TMP47]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; VF4: [[MIDDLE_BLOCK]]: +; VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; VF4: [[SCALAR_PH]]: +; VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; VF4-NEXT: br label %[[LOOP:.*]] +; VF4: [[LOOP]]: +; VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VF4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]] +; VF4-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; VF4-NEXT: [[TMP48:%.*]] = shl nsw i64 [[IV]], 1 +; VF4-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP48]] +; VF4-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8 +; VF4-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]] +; VF4-NEXT: [[TMP49:%.*]] = or disjoint i64 [[TMP48]], 1 +; VF4-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP49]] +; VF4-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8 +; VF4-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]] +; VF4-NEXT: store i64 [[MUL_0]], ptr [[DATA_1]], align 8 +; VF4-NEXT: store i64 [[MUL_1]], ptr [[DATA_0]], align 8 +; VF4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; VF4-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; VF4-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; VF4: [[EXIT]]: +; VF4-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ 
%iv.next, %loop ] + %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv + %l.factor = load i64, ptr %arrayidx, align 8 + %1 = shl nsw i64 %iv, 1 + %data.0 = getelementptr inbounds i64, ptr %data, i64 %1 + %l.0 = load i64, ptr %data.0, align 8 + %mul.0 = mul i64 %l.factor, %l.0 + %3 = or disjoint i64 %1, 1 + %data.1 = getelementptr inbounds i64, ptr %data, i64 %3 + %l.1 = load i64, ptr %data.1, align 8 + %mul.1 = mul i64 %l.factor, %l.1 + store i64 %mul.0, ptr %data.1, align 8 + store i64 %mul.1, ptr %data.0, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noalias %src.0, ptr noalias %src.1, i64 noundef %n) { +; VF2-LABEL: define void @test_2xi64_different_loads_feeding_fmul( +; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[SRC_0:%.*]], ptr noalias [[SRC_1:%.*]], i64 noundef [[N:%.*]]) { +; VF2-NEXT: [[ENTRY:.*]]: +; VF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2 +; VF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2 +; VF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[SRC_0]], i64 [[TMP0]] +; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8 +; VF2-NEXT: [[TMP3:%.*]] = shl nsw <2 x i64> [[VEC_IND]], splat (i64 1) +; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]] +; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]] +; VF2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8 +; VF2-NEXT: [[TMP8:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; VF2-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP8]], i32 0 +; VF2-NEXT: store i64 [[TMP9]], ptr [[TMP5]], align 8 +; VF2-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP8]], i32 1 +; VF2-NEXT: store i64 [[TMP10]], ptr [[TMP7]], align 8 +; VF2-NEXT: [[TMP11:%.*]] = or disjoint <2 x i64> [[TMP3]], splat (i64 1) +; VF2-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 +; VF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP12]] +; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP14]] +; VF2-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP13]], align 8 +; VF2-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 8 +; VF2-NEXT: [[TMP18:%.*]] = insertelement <2 x i64> poison, i64 [[TMP16]], i32 0 +; VF2-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> [[TMP18]], i64 [[TMP17]], i32 1 +; VF2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[SRC_1]], i64 [[TMP0]] +; VF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[TMP20]], i32 0 +; VF2-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i64>, ptr [[TMP21]], align 8 +; VF2-NEXT: 
[[TMP22:%.*]] = mul <2 x i64> [[WIDE_LOAD2]], [[TMP19]] +; VF2-NEXT: [[TMP23:%.*]] = extractelement <2 x i64> [[TMP22]], i32 0 +; VF2-NEXT: store i64 [[TMP23]], ptr [[TMP13]], align 8 +; VF2-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP22]], i32 1 +; VF2-NEXT: store i64 [[TMP24]], ptr [[TMP15]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; VF2-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF2-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; VF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; VF2: [[SCALAR_PH]]: +; VF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; VF2-NEXT: br label %[[LOOP:.*]] +; VF2: [[LOOP]]: +; VF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VF2-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i64, ptr [[SRC_0]], i64 [[IV]] +; VF2-NEXT: [[L_SRC_0:%.*]] = load i64, ptr [[GEP_SRC_0]], align 8 +; VF2-NEXT: [[TMP26:%.*]] = shl nsw i64 [[IV]], 1 +; VF2-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP26]] +; VF2-NEXT: [[L_0:%.*]] = load i64, ptr [[GEP_SRC_0]], align 8 +; VF2-NEXT: [[MUL_0:%.*]] = mul i64 [[L_SRC_0]], [[L_0]] +; VF2-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8 +; VF2-NEXT: [[TMP27:%.*]] = or disjoint i64 [[TMP26]], 1 +; VF2-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP27]] +; VF2-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8 +; VF2-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds i64, ptr [[SRC_1]], i64 [[IV]] +; VF2-NEXT: [[L_SRC_1:%.*]] = load i64, ptr [[GEP_SRC_1]], align 8 +; VF2-NEXT: [[MUL_1:%.*]] = mul i64 [[L_SRC_1]], [[L_1]] +; VF2-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8 +; VF2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; VF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; VF2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +; VF4-LABEL: define void @test_2xi64_different_loads_feeding_fmul( +; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[SRC_0:%.*]], ptr noalias [[SRC_1:%.*]], i64 noundef [[N:%.*]]) { +; VF4-NEXT: [[ENTRY:.*]]: +; VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; VF4: [[VECTOR_PH]]: +; VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; VF4: [[VECTOR_BODY]]: +; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[SRC_0]], i64 [[TMP0]] +; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; VF4-NEXT: [[TMP3:%.*]] = shl nsw <4 x i64> [[VEC_IND]], splat (i64 1) +; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]] +; VF4-NEXT: [[TMP6:%.*]] = 
extractelement <4 x i64> [[TMP3]], i32 1 +; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]] +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP8]] +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP10]] +; VF4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; VF4-NEXT: [[TMP12:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; VF4-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP12]], i32 0 +; VF4-NEXT: store i64 [[TMP13]], ptr [[TMP5]], align 8 +; VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP12]], i32 1 +; VF4-NEXT: store i64 [[TMP14]], ptr [[TMP7]], align 8 +; VF4-NEXT: [[TMP15:%.*]] = extractelement <4 x i64> [[TMP12]], i32 2 +; VF4-NEXT: store i64 [[TMP15]], ptr [[TMP9]], align 8 +; VF4-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[TMP12]], i32 3 +; VF4-NEXT: store i64 [[TMP16]], ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP17:%.*]] = or disjoint <4 x i64> [[TMP3]], splat (i64 1) +; VF4-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP17]], i32 0 +; VF4-NEXT: [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP18]] +; VF4-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP17]], i32 1 +; VF4-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP20]] +; VF4-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP17]], i32 2 +; VF4-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP22]] +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP17]], i32 3 +; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP24]] +; VF4-NEXT: [[TMP26:%.*]] = load i64, ptr [[TMP19]], align 8 +; VF4-NEXT: [[TMP27:%.*]] = load i64, ptr [[TMP21]], align 8 +; VF4-NEXT: [[TMP28:%.*]] = load i64, ptr [[TMP23]], align 8 +; VF4-NEXT: [[TMP29:%.*]] = load i64, ptr [[TMP25]], align 8 +; VF4-NEXT: [[TMP30:%.*]] = insertelement <4 x i64> poison, i64 [[TMP26]], i32 0 +; VF4-NEXT: [[TMP31:%.*]] = insertelement <4 x i64> [[TMP30]], i64 [[TMP27]], i32 1 +; VF4-NEXT: [[TMP32:%.*]] = insertelement <4 x i64> [[TMP31]], i64 [[TMP28]], i32 2 +; VF4-NEXT: [[TMP33:%.*]] = insertelement <4 x i64> [[TMP32]], i64 [[TMP29]], i32 3 +; VF4-NEXT: [[TMP34:%.*]] = getelementptr inbounds i64, ptr [[SRC_1]], i64 [[TMP0]] +; VF4-NEXT: [[TMP35:%.*]] = getelementptr inbounds i64, ptr [[TMP34]], i32 0 +; VF4-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP35]], align 8 +; VF4-NEXT: [[TMP36:%.*]] = mul <4 x i64> [[WIDE_LOAD2]], [[TMP33]] +; VF4-NEXT: [[TMP37:%.*]] = extractelement <4 x i64> [[TMP36]], i32 0 +; VF4-NEXT: store i64 [[TMP37]], ptr [[TMP19]], align 8 +; VF4-NEXT: [[TMP38:%.*]] = extractelement <4 x i64> [[TMP36]], i32 1 +; VF4-NEXT: store i64 [[TMP38]], ptr [[TMP21]], align 8 +; VF4-NEXT: [[TMP39:%.*]] = extractelement <4 x i64> [[TMP36]], i32 2 +; VF4-NEXT: store i64 [[TMP39]], ptr [[TMP23]], align 8 +; VF4-NEXT: [[TMP40:%.*]] = extractelement <4 x i64> [[TMP36]], i32 3 +; VF4-NEXT: store i64 [[TMP40]], ptr [[TMP25]], align 8 +; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; VF4-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF4-NEXT: br i1 [[TMP41]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; VF4: [[MIDDLE_BLOCK]]: +; VF4-NEXT: [[CMP_N:%.*]] = icmp 
eq i64 [[N]], [[N_VEC]] +; VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; VF4: [[SCALAR_PH]]: +; VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; VF4-NEXT: br label %[[LOOP:.*]] +; VF4: [[LOOP]]: +; VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VF4-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i64, ptr [[SRC_0]], i64 [[IV]] +; VF4-NEXT: [[L_SRC_0:%.*]] = load i64, ptr [[GEP_SRC_0]], align 8 +; VF4-NEXT: [[TMP42:%.*]] = shl nsw i64 [[IV]], 1 +; VF4-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP42]] +; VF4-NEXT: [[L_0:%.*]] = load i64, ptr [[GEP_SRC_0]], align 8 +; VF4-NEXT: [[MUL_0:%.*]] = mul i64 [[L_SRC_0]], [[L_0]] +; VF4-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8 +; VF4-NEXT: [[TMP43:%.*]] = or disjoint i64 [[TMP42]], 1 +; VF4-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP43]] +; VF4-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8 +; VF4-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds i64, ptr [[SRC_1]], i64 [[IV]] +; VF4-NEXT: [[L_SRC_1:%.*]] = load i64, ptr [[GEP_SRC_1]], align 8 +; VF4-NEXT: [[MUL_1:%.*]] = mul i64 [[L_SRC_1]], [[L_1]] +; VF4-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8 +; VF4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; VF4-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; VF4-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; VF4: [[EXIT]]: +; VF4-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.src.0 = getelementptr inbounds i64, ptr %src.0, i64 %iv + %l.src.0 = load i64, ptr %gep.src.0, align 8 + %1 = shl nsw i64 %iv, 1 + %data.0 = getelementptr inbounds i64, ptr %data, i64 %1 + %l.0 = load i64, ptr %gep.src.0, align 8 + %mul.0 = mul i64 %l.src.0, %l.0 + store i64 %mul.0, ptr %data.0, align 8 + %3 = or disjoint i64 %1, 1 + %data.1 = getelementptr inbounds i64, ptr %data, i64 %3 + %l.1 = load i64, ptr %data.1, align 8 + %gep.src.1 = getelementptr inbounds i64, ptr %src.1, i64 %iv + %l.src.1 = load i64, ptr %gep.src.1, align 8 + %mul.1 = mul i64 %l.src.1, %l.1 + store i64 %mul.1, ptr %data.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_3xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) { +; VF2-LABEL: define void @test_3xi64( +; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) { +; VF2-NEXT: [[ENTRY:.*]]: +; VF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2 +; VF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2 +; VF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; VF2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]] +; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8 +; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP0]], i32 0 +; VF2-NEXT: [[TMP5:%.*]] 
= getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP1]], i32 0 +; VF2-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8 +; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 +; VF2-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> poison, i64 [[TMP6]], i32 0 +; VF2-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> [[TMP8]], i64 [[TMP7]], i32 1 +; VF2-NEXT: [[TMP10:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP9]] +; VF2-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP10]], i32 0 +; VF2-NEXT: store i64 [[TMP11]], ptr [[TMP4]], align 8 +; VF2-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP10]], i32 1 +; VF2-NEXT: store i64 [[TMP12]], ptr [[TMP5]], align 8 +; VF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP0]], i32 1 +; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP1]], i32 1 +; VF2-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP13]], align 8 +; VF2-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP14]], align 8 +; VF2-NEXT: [[TMP17:%.*]] = insertelement <2 x i64> poison, i64 [[TMP15]], i32 0 +; VF2-NEXT: [[TMP18:%.*]] = insertelement <2 x i64> [[TMP17]], i64 [[TMP16]], i32 1 +; VF2-NEXT: [[TMP19:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP18]] +; VF2-NEXT: [[TMP20:%.*]] = extractelement <2 x i64> [[TMP19]], i32 0 +; VF2-NEXT: store i64 [[TMP20]], ptr [[TMP13]], align 8 +; VF2-NEXT: [[TMP21:%.*]] = extractelement <2 x i64> [[TMP19]], i32 1 +; VF2-NEXT: store i64 [[TMP21]], ptr [[TMP14]], align 8 +; VF2-NEXT: [[TMP22:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP0]], i32 2 +; VF2-NEXT: [[TMP23:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP1]], i32 2 +; VF2-NEXT: [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 8 +; VF2-NEXT: [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 8 +; VF2-NEXT: [[TMP26:%.*]] = insertelement <2 x i64> poison, i64 [[TMP24]], i32 0 +; VF2-NEXT: [[TMP27:%.*]] = insertelement <2 x i64> [[TMP26]], i64 [[TMP25]], i32 1 +; VF2-NEXT: [[TMP28:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP27]] +; VF2-NEXT: [[TMP29:%.*]] = extractelement <2 x i64> [[TMP28]], i32 0 +; VF2-NEXT: store i64 [[TMP29]], ptr [[TMP22]], align 8 +; VF2-NEXT: [[TMP30:%.*]] = extractelement <2 x i64> [[TMP28]], i32 1 +; VF2-NEXT: store i64 [[TMP30]], ptr [[TMP23]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF2-NEXT: br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; VF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; VF2: [[SCALAR_PH]]: +; VF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; VF2-NEXT: br label %[[LOOP:.*]] +; VF2: [[LOOP]]: +; VF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VF2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]] +; VF2-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; VF2-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 0 +; VF2-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8 +; VF2-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]] +; VF2-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8 +; VF2-NEXT: [[DATA_1:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 1 +; VF2-NEXT: 
[[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8 +; VF2-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]] +; VF2-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8 +; VF2-NEXT: [[DATA_2:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 2 +; VF2-NEXT: [[L_2:%.*]] = load i64, ptr [[DATA_2]], align 8 +; VF2-NEXT: [[MUL_2:%.*]] = mul i64 [[L_FACTOR]], [[L_2]] +; VF2-NEXT: store i64 [[MUL_2]], ptr [[DATA_2]], align 8 +; VF2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; VF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; VF2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +; VF4-LABEL: define void @test_3xi64( +; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) { +; VF4-NEXT: [[ENTRY:.*]]: +; VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; VF4: [[VECTOR_PH]]: +; VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; VF4: [[VECTOR_BODY]]: +; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; VF4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; VF4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; VF4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]] +; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 +; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 +; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP0]], i32 0 +; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP1]], i32 0 +; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP2]], i32 0 +; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP3]], i32 0 +; VF4-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 8 +; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 +; VF4-NEXT: [[TMP14:%.*]] = insertelement <4 x i64> poison, i64 [[TMP10]], i32 0 +; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[TMP11]], i32 1 +; VF4-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 2 +; VF4-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 3 +; VF4-NEXT: [[TMP18:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP17]] +; VF4-NEXT: [[TMP19:%.*]] = extractelement <4 x i64> [[TMP18]], i32 0 +; VF4-NEXT: store i64 [[TMP19]], ptr [[TMP6]], align 8 +; VF4-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP18]], i32 1 +; VF4-NEXT: store i64 [[TMP20]], ptr [[TMP7]], align 8 +; VF4-NEXT: [[TMP21:%.*]] = extractelement <4 x i64> [[TMP18]], i32 2 +; VF4-NEXT: store i64 [[TMP21]], ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP18]], i32 3 +; VF4-NEXT: store i64 [[TMP22]], ptr [[TMP9]], align 8 +; VF4-NEXT: [[TMP23:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP0]], i32 1 +; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP1]], i32 1 +; VF4-NEXT: [[TMP25:%.*]] = getelementptr 
inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP2]], i32 1 +; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP3]], i32 1 +; VF4-NEXT: [[TMP27:%.*]] = load i64, ptr [[TMP23]], align 8 +; VF4-NEXT: [[TMP28:%.*]] = load i64, ptr [[TMP24]], align 8 +; VF4-NEXT: [[TMP29:%.*]] = load i64, ptr [[TMP25]], align 8 +; VF4-NEXT: [[TMP30:%.*]] = load i64, ptr [[TMP26]], align 8 +; VF4-NEXT: [[TMP31:%.*]] = insertelement <4 x i64> poison, i64 [[TMP27]], i32 0 +; VF4-NEXT: [[TMP32:%.*]] = insertelement <4 x i64> [[TMP31]], i64 [[TMP28]], i32 1 +; VF4-NEXT: [[TMP33:%.*]] = insertelement <4 x i64> [[TMP32]], i64 [[TMP29]], i32 2 +; VF4-NEXT: [[TMP34:%.*]] = insertelement <4 x i64> [[TMP33]], i64 [[TMP30]], i32 3 +; VF4-NEXT: [[TMP35:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP34]] +; VF4-NEXT: [[TMP36:%.*]] = extractelement <4 x i64> [[TMP35]], i32 0 +; VF4-NEXT: store i64 [[TMP36]], ptr [[TMP23]], align 8 +; VF4-NEXT: [[TMP37:%.*]] = extractelement <4 x i64> [[TMP35]], i32 1 +; VF4-NEXT: store i64 [[TMP37]], ptr [[TMP24]], align 8 +; VF4-NEXT: [[TMP38:%.*]] = extractelement <4 x i64> [[TMP35]], i32 2 +; VF4-NEXT: store i64 [[TMP38]], ptr [[TMP25]], align 8 +; VF4-NEXT: [[TMP39:%.*]] = extractelement <4 x i64> [[TMP35]], i32 3 +; VF4-NEXT: store i64 [[TMP39]], ptr [[TMP26]], align 8 +; VF4-NEXT: [[TMP40:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP0]], i32 2 +; VF4-NEXT: [[TMP41:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP1]], i32 2 +; VF4-NEXT: [[TMP42:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP2]], i32 2 +; VF4-NEXT: [[TMP43:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP3]], i32 2 +; VF4-NEXT: [[TMP44:%.*]] = load i64, ptr [[TMP40]], align 8 +; VF4-NEXT: [[TMP45:%.*]] = load i64, ptr [[TMP41]], align 8 +; VF4-NEXT: [[TMP46:%.*]] = load i64, ptr [[TMP42]], align 8 +; VF4-NEXT: [[TMP47:%.*]] = load i64, ptr [[TMP43]], align 8 +; VF4-NEXT: [[TMP48:%.*]] = insertelement <4 x i64> poison, i64 [[TMP44]], i32 0 +; VF4-NEXT: [[TMP49:%.*]] = insertelement <4 x i64> [[TMP48]], i64 [[TMP45]], i32 1 +; VF4-NEXT: [[TMP50:%.*]] = insertelement <4 x i64> [[TMP49]], i64 [[TMP46]], i32 2 +; VF4-NEXT: [[TMP51:%.*]] = insertelement <4 x i64> [[TMP50]], i64 [[TMP47]], i32 3 +; VF4-NEXT: [[TMP52:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP51]] +; VF4-NEXT: [[TMP53:%.*]] = extractelement <4 x i64> [[TMP52]], i32 0 +; VF4-NEXT: store i64 [[TMP53]], ptr [[TMP40]], align 8 +; VF4-NEXT: [[TMP54:%.*]] = extractelement <4 x i64> [[TMP52]], i32 1 +; VF4-NEXT: store i64 [[TMP54]], ptr [[TMP41]], align 8 +; VF4-NEXT: [[TMP55:%.*]] = extractelement <4 x i64> [[TMP52]], i32 2 +; VF4-NEXT: store i64 [[TMP55]], ptr [[TMP42]], align 8 +; VF4-NEXT: [[TMP56:%.*]] = extractelement <4 x i64> [[TMP52]], i32 3 +; VF4-NEXT: store i64 [[TMP56]], ptr [[TMP43]], align 8 +; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4-NEXT: [[TMP57:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF4-NEXT: br i1 [[TMP57]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; VF4: [[MIDDLE_BLOCK]]: +; VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; VF4: [[SCALAR_PH]]: +; VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; VF4-NEXT: br label %[[LOOP:.*]] +; VF4: [[LOOP]]: +; VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ 
[[IV_NEXT:%.*]], %[[LOOP]] ] +; VF4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]] +; VF4-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; VF4-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 0 +; VF4-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8 +; VF4-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]] +; VF4-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8 +; VF4-NEXT: [[DATA_1:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 1 +; VF4-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8 +; VF4-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]] +; VF4-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8 +; VF4-NEXT: [[DATA_2:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 2 +; VF4-NEXT: [[L_2:%.*]] = load i64, ptr [[DATA_2]], align 8 +; VF4-NEXT: [[MUL_2:%.*]] = mul i64 [[L_FACTOR]], [[L_2]] +; VF4-NEXT: store i64 [[MUL_2]], ptr [[DATA_2]], align 8 +; VF4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; VF4-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; VF4-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; VF4: [[EXIT]]: +; VF4-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv + %l.factor = load i64, ptr %arrayidx, align 8 + %data.0 = getelementptr inbounds { i64 , i64, i64 }, ptr %data, i64 %iv, i32 0 + %l.0 = load i64, ptr %data.0, align 8 + %mul.0 = mul i64 %l.factor, %l.0 + store i64 %mul.0, ptr %data.0, align 8 + %data.1 = getelementptr inbounds { i64 , i64, i64 }, ptr %data, i64 %iv, i32 1 + %l.1 = load i64, ptr %data.1, align 8 + %mul.1 = mul i64 %l.factor, %l.1 + store i64 %mul.1, ptr %data.1, align 8 + %data.2 = getelementptr inbounds { i64 , i64, i64 }, ptr %data, i64 %iv, i32 2 + %l.2 = load i64, ptr %data.2, align 8 + %mul.2 = mul i64 %l.factor, %l.2 + store i64 %mul.2, ptr %data.2, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + + +define void @test_2xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n) { +; VF2-LABEL: define void @test_2xi32( +; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) { +; VF2-NEXT: [[ENTRY:.*]]: +; VF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2 +; VF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2 +; VF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; VF2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]] +; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP1]] +; VF2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 8 +; VF2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 8 +; VF2-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0 +; VF2-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1 +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 0 +; VF2-NEXT: 
[[TMP9:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 0 +; VF2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 8 +; VF2-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 8 +; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0 +; VF2-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP11]], i32 1 +; VF2-NEXT: [[TMP14:%.*]] = mul <2 x i32> [[TMP7]], [[TMP13]] +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP14]], i32 0 +; VF2-NEXT: store i32 [[TMP15]], ptr [[TMP8]], align 8 +; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[TMP14]], i32 1 +; VF2-NEXT: store i32 [[TMP16]], ptr [[TMP9]], align 8 +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 1 +; VF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 1 +; VF2-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP17]], align 8 +; VF2-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP18]], align 8 +; VF2-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i32 0 +; VF2-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i32 1 +; VF2-NEXT: [[TMP23:%.*]] = mul <2 x i32> [[TMP7]], [[TMP22]] +; VF2-NEXT: [[TMP24:%.*]] = extractelement <2 x i32> [[TMP23]], i32 0 +; VF2-NEXT: store i32 [[TMP24]], ptr [[TMP17]], align 8 +; VF2-NEXT: [[TMP25:%.*]] = extractelement <2 x i32> [[TMP23]], i32 1 +; VF2-NEXT: store i32 [[TMP25]], ptr [[TMP18]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF2-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; VF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; VF2: [[SCALAR_PH]]: +; VF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; VF2-NEXT: br label %[[LOOP:.*]] +; VF2: [[LOOP]]: +; VF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VF2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]] +; VF2-NEXT: [[L_FACTOR:%.*]] = load i32, ptr [[ARRAYIDX]], align 8 +; VF2-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 0 +; VF2-NEXT: [[L_0:%.*]] = load i32, ptr [[DATA_0]], align 8 +; VF2-NEXT: [[MUL_0:%.*]] = mul i32 [[L_FACTOR]], [[L_0]] +; VF2-NEXT: store i32 [[MUL_0]], ptr [[DATA_0]], align 8 +; VF2-NEXT: [[DATA_1:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 1 +; VF2-NEXT: [[L_1:%.*]] = load i32, ptr [[DATA_1]], align 8 +; VF2-NEXT: [[MUL_1:%.*]] = mul i32 [[L_FACTOR]], [[L_1]] +; VF2-NEXT: store i32 [[MUL_1]], ptr [[DATA_1]], align 8 +; VF2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; VF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; VF2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +; VF4-LABEL: define void @test_2xi32( +; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) { +; VF4-NEXT: [[ENTRY:.*]]: +; VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; VF4: [[VECTOR_PH]]: +; VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; VF4-NEXT: [[N_VEC:%.*]] 
= sub i64 [[N]], [[N_MOD_VF]] +; VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; VF4: [[VECTOR_BODY]]: +; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; VF4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; VF4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; VF4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]] +; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP1]] +; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP2]] +; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP3]] +; VF4-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP4]], align 8 +; VF4-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 8 +; VF4-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 8 +; VF4-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 8 +; VF4-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> poison, i32 [[TMP8]], i32 0 +; VF4-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 1 +; VF4-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 2 +; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP11]], i32 3 +; VF4-NEXT: [[TMP16:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 0 +; VF4-NEXT: [[TMP17:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 0 +; VF4-NEXT: [[TMP18:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP2]], i32 0 +; VF4-NEXT: [[TMP19:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP3]], i32 0 +; VF4-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP16]], align 8 +; VF4-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP17]], align 8 +; VF4-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP18]], align 8 +; VF4-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP19]], align 8 +; VF4-NEXT: [[TMP24:%.*]] = insertelement <4 x i32> poison, i32 [[TMP20]], i32 0 +; VF4-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> [[TMP24]], i32 [[TMP21]], i32 1 +; VF4-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP22]], i32 2 +; VF4-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP23]], i32 3 +; VF4-NEXT: [[TMP28:%.*]] = mul <4 x i32> [[TMP15]], [[TMP27]] +; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i32> [[TMP28]], i32 0 +; VF4-NEXT: store i32 [[TMP29]], ptr [[TMP16]], align 8 +; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[TMP28]], i32 1 +; VF4-NEXT: store i32 [[TMP30]], ptr [[TMP17]], align 8 +; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[TMP28]], i32 2 +; VF4-NEXT: store i32 [[TMP31]], ptr [[TMP18]], align 8 +; VF4-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP28]], i32 3 +; VF4-NEXT: store i32 [[TMP32]], ptr [[TMP19]], align 8 +; VF4-NEXT: [[TMP33:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 1 +; VF4-NEXT: [[TMP34:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 1 +; VF4-NEXT: [[TMP35:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP2]], i32 1 +; VF4-NEXT: [[TMP36:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP3]], i32 1 +; VF4-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP33]], align 8 +; VF4-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP34]], align 8 +; VF4-NEXT: [[TMP39:%.*]] = load i32, ptr [[TMP35]], align 8 +; VF4-NEXT: [[TMP40:%.*]] = load i32, ptr [[TMP36]], 
align 8 +; VF4-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> poison, i32 [[TMP37]], i32 0 +; VF4-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP38]], i32 1 +; VF4-NEXT: [[TMP43:%.*]] = insertelement <4 x i32> [[TMP42]], i32 [[TMP39]], i32 2 +; VF4-NEXT: [[TMP44:%.*]] = insertelement <4 x i32> [[TMP43]], i32 [[TMP40]], i32 3 +; VF4-NEXT: [[TMP45:%.*]] = mul <4 x i32> [[TMP15]], [[TMP44]] +; VF4-NEXT: [[TMP46:%.*]] = extractelement <4 x i32> [[TMP45]], i32 0 +; VF4-NEXT: store i32 [[TMP46]], ptr [[TMP33]], align 8 +; VF4-NEXT: [[TMP47:%.*]] = extractelement <4 x i32> [[TMP45]], i32 1 +; VF4-NEXT: store i32 [[TMP47]], ptr [[TMP34]], align 8 +; VF4-NEXT: [[TMP48:%.*]] = extractelement <4 x i32> [[TMP45]], i32 2 +; VF4-NEXT: store i32 [[TMP48]], ptr [[TMP35]], align 8 +; VF4-NEXT: [[TMP49:%.*]] = extractelement <4 x i32> [[TMP45]], i32 3 +; VF4-NEXT: store i32 [[TMP49]], ptr [[TMP36]], align 8 +; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4-NEXT: [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF4-NEXT: br i1 [[TMP50]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; VF4: [[MIDDLE_BLOCK]]: +; VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; VF4: [[SCALAR_PH]]: +; VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; VF4-NEXT: br label %[[LOOP:.*]] +; VF4: [[LOOP]]: +; VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VF4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]] +; VF4-NEXT: [[L_FACTOR:%.*]] = load i32, ptr [[ARRAYIDX]], align 8 +; VF4-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 0 +; VF4-NEXT: [[L_0:%.*]] = load i32, ptr [[DATA_0]], align 8 +; VF4-NEXT: [[MUL_0:%.*]] = mul i32 [[L_FACTOR]], [[L_0]] +; VF4-NEXT: store i32 [[MUL_0]], ptr [[DATA_0]], align 8 +; VF4-NEXT: [[DATA_1:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 1 +; VF4-NEXT: [[L_1:%.*]] = load i32, ptr [[DATA_1]], align 8 +; VF4-NEXT: [[MUL_1:%.*]] = mul i32 [[L_FACTOR]], [[L_1]] +; VF4-NEXT: store i32 [[MUL_1]], ptr [[DATA_1]], align 8 +; VF4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; VF4-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; VF4-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; VF4: [[EXIT]]: +; VF4-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv + %l.factor = load i32 , ptr %arrayidx, align 8 + %data.0 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 0 + %l.0 = load i32, ptr %data.0, align 8 + %mul.0 = mul i32 %l.factor, %l.0 + store i32 %mul.0, ptr %data.0, align 8 + %data.1 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 1 + %l.1 = load i32, ptr %data.1, align 8 + %mul.1 = mul i32 %l.factor, %l.1 + store i32%mul.1, ptr %data.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} +define void @test_3xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n) { +; VF2-LABEL: define void @test_3xi32( +; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) { +; VF2-NEXT: [[ENTRY:.*]]: +; VF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp 
ult i64 [[N]], 2 +; VF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2 +; VF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; VF2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]] +; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP1]] +; VF2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 8 +; VF2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 8 +; VF2-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0 +; VF2-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1 +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 0 +; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 0 +; VF2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 8 +; VF2-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 8 +; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0 +; VF2-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP11]], i32 1 +; VF2-NEXT: [[TMP14:%.*]] = mul <2 x i32> [[TMP7]], [[TMP13]] +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP14]], i32 0 +; VF2-NEXT: store i32 [[TMP15]], ptr [[TMP8]], align 8 +; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[TMP14]], i32 1 +; VF2-NEXT: store i32 [[TMP16]], ptr [[TMP9]], align 8 +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 1 +; VF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 1 +; VF2-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP17]], align 8 +; VF2-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP18]], align 8 +; VF2-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i32 0 +; VF2-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i32 1 +; VF2-NEXT: [[TMP23:%.*]] = mul <2 x i32> [[TMP7]], [[TMP22]] +; VF2-NEXT: [[TMP24:%.*]] = extractelement <2 x i32> [[TMP23]], i32 0 +; VF2-NEXT: store i32 [[TMP24]], ptr [[TMP17]], align 8 +; VF2-NEXT: [[TMP25:%.*]] = extractelement <2 x i32> [[TMP23]], i32 1 +; VF2-NEXT: store i32 [[TMP25]], ptr [[TMP18]], align 8 +; VF2-NEXT: [[TMP26:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 2 +; VF2-NEXT: [[TMP27:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 2 +; VF2-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP26]], align 8 +; VF2-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP27]], align 8 +; VF2-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> poison, i32 [[TMP28]], i32 0 +; VF2-NEXT: [[TMP31:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[TMP29]], i32 1 +; VF2-NEXT: [[TMP32:%.*]] = mul <2 x i32> [[TMP7]], [[TMP31]] +; VF2-NEXT: [[TMP33:%.*]] = extractelement <2 x i32> [[TMP32]], i32 0 +; VF2-NEXT: store i32 [[TMP33]], ptr [[TMP26]], align 8 +; VF2-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[TMP32]], i32 1 +; VF2-NEXT: store i32 [[TMP34]], ptr [[TMP27]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF2-NEXT: br i1 
[[TMP35]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; VF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; VF2: [[SCALAR_PH]]: +; VF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; VF2-NEXT: br label %[[LOOP:.*]] +; VF2: [[LOOP]]: +; VF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VF2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]] +; VF2-NEXT: [[L_FACTOR:%.*]] = load i32, ptr [[ARRAYIDX]], align 8 +; VF2-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 0 +; VF2-NEXT: [[L_0:%.*]] = load i32, ptr [[DATA_0]], align 8 +; VF2-NEXT: [[MUL_0:%.*]] = mul i32 [[L_FACTOR]], [[L_0]] +; VF2-NEXT: store i32 [[MUL_0]], ptr [[DATA_0]], align 8 +; VF2-NEXT: [[DATA_1:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 1 +; VF2-NEXT: [[L_1:%.*]] = load i32, ptr [[DATA_1]], align 8 +; VF2-NEXT: [[MUL_1:%.*]] = mul i32 [[L_FACTOR]], [[L_1]] +; VF2-NEXT: store i32 [[MUL_1]], ptr [[DATA_1]], align 8 +; VF2-NEXT: [[DATA_2:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 2 +; VF2-NEXT: [[L_2:%.*]] = load i32, ptr [[DATA_2]], align 8 +; VF2-NEXT: [[MUL_2:%.*]] = mul i32 [[L_FACTOR]], [[L_2]] +; VF2-NEXT: store i32 [[MUL_2]], ptr [[DATA_2]], align 8 +; VF2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; VF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; VF2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +; VF4-LABEL: define void @test_3xi32( +; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) { +; VF4-NEXT: [[ENTRY:.*]]: +; VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; VF4: [[VECTOR_PH]]: +; VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; VF4: [[VECTOR_BODY]]: +; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; VF4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; VF4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; VF4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]] +; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP1]] +; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP2]] +; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP3]] +; VF4-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP4]], align 8 +; VF4-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 8 +; VF4-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 8 +; VF4-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 8 +; VF4-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> poison, i32 [[TMP8]], i32 0 +; VF4-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 1 +; VF4-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 2 +; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP11]], i32 3 +; VF4-NEXT: [[TMP16:%.*]] = getelementptr inbounds { i32, i32, 
i32 }, ptr [[DATA]], i64 [[TMP0]], i32 0 +; VF4-NEXT: [[TMP17:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 0 +; VF4-NEXT: [[TMP18:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP2]], i32 0 +; VF4-NEXT: [[TMP19:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP3]], i32 0 +; VF4-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP16]], align 8 +; VF4-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP17]], align 8 +; VF4-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP18]], align 8 +; VF4-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP19]], align 8 +; VF4-NEXT: [[TMP24:%.*]] = insertelement <4 x i32> poison, i32 [[TMP20]], i32 0 +; VF4-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> [[TMP24]], i32 [[TMP21]], i32 1 +; VF4-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP22]], i32 2 +; VF4-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP23]], i32 3 +; VF4-NEXT: [[TMP28:%.*]] = mul <4 x i32> [[TMP15]], [[TMP27]] +; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i32> [[TMP28]], i32 0 +; VF4-NEXT: store i32 [[TMP29]], ptr [[TMP16]], align 8 +; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[TMP28]], i32 1 +; VF4-NEXT: store i32 [[TMP30]], ptr [[TMP17]], align 8 +; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[TMP28]], i32 2 +; VF4-NEXT: store i32 [[TMP31]], ptr [[TMP18]], align 8 +; VF4-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP28]], i32 3 +; VF4-NEXT: store i32 [[TMP32]], ptr [[TMP19]], align 8 +; VF4-NEXT: [[TMP33:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 1 +; VF4-NEXT: [[TMP34:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 1 +; VF4-NEXT: [[TMP35:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP2]], i32 1 +; VF4-NEXT: [[TMP36:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP3]], i32 1 +; VF4-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP33]], align 8 +; VF4-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP34]], align 8 +; VF4-NEXT: [[TMP39:%.*]] = load i32, ptr [[TMP35]], align 8 +; VF4-NEXT: [[TMP40:%.*]] = load i32, ptr [[TMP36]], align 8 +; VF4-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> poison, i32 [[TMP37]], i32 0 +; VF4-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP38]], i32 1 +; VF4-NEXT: [[TMP43:%.*]] = insertelement <4 x i32> [[TMP42]], i32 [[TMP39]], i32 2 +; VF4-NEXT: [[TMP44:%.*]] = insertelement <4 x i32> [[TMP43]], i32 [[TMP40]], i32 3 +; VF4-NEXT: [[TMP45:%.*]] = mul <4 x i32> [[TMP15]], [[TMP44]] +; VF4-NEXT: [[TMP46:%.*]] = extractelement <4 x i32> [[TMP45]], i32 0 +; VF4-NEXT: store i32 [[TMP46]], ptr [[TMP33]], align 8 +; VF4-NEXT: [[TMP47:%.*]] = extractelement <4 x i32> [[TMP45]], i32 1 +; VF4-NEXT: store i32 [[TMP47]], ptr [[TMP34]], align 8 +; VF4-NEXT: [[TMP48:%.*]] = extractelement <4 x i32> [[TMP45]], i32 2 +; VF4-NEXT: store i32 [[TMP48]], ptr [[TMP35]], align 8 +; VF4-NEXT: [[TMP49:%.*]] = extractelement <4 x i32> [[TMP45]], i32 3 +; VF4-NEXT: store i32 [[TMP49]], ptr [[TMP36]], align 8 +; VF4-NEXT: [[TMP50:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 2 +; VF4-NEXT: [[TMP51:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 2 +; VF4-NEXT: [[TMP52:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP2]], i32 2 +; VF4-NEXT: [[TMP53:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP3]], i32 2 +; VF4-NEXT: [[TMP54:%.*]] = load 
i32, ptr [[TMP50]], align 8 +; VF4-NEXT: [[TMP55:%.*]] = load i32, ptr [[TMP51]], align 8 +; VF4-NEXT: [[TMP56:%.*]] = load i32, ptr [[TMP52]], align 8 +; VF4-NEXT: [[TMP57:%.*]] = load i32, ptr [[TMP53]], align 8 +; VF4-NEXT: [[TMP58:%.*]] = insertelement <4 x i32> poison, i32 [[TMP54]], i32 0 +; VF4-NEXT: [[TMP59:%.*]] = insertelement <4 x i32> [[TMP58]], i32 [[TMP55]], i32 1 +; VF4-NEXT: [[TMP60:%.*]] = insertelement <4 x i32> [[TMP59]], i32 [[TMP56]], i32 2 +; VF4-NEXT: [[TMP61:%.*]] = insertelement <4 x i32> [[TMP60]], i32 [[TMP57]], i32 3 +; VF4-NEXT: [[TMP62:%.*]] = mul <4 x i32> [[TMP15]], [[TMP61]] +; VF4-NEXT: [[TMP63:%.*]] = extractelement <4 x i32> [[TMP62]], i32 0 +; VF4-NEXT: store i32 [[TMP63]], ptr [[TMP50]], align 8 +; VF4-NEXT: [[TMP64:%.*]] = extractelement <4 x i32> [[TMP62]], i32 1 +; VF4-NEXT: store i32 [[TMP64]], ptr [[TMP51]], align 8 +; VF4-NEXT: [[TMP65:%.*]] = extractelement <4 x i32> [[TMP62]], i32 2 +; VF4-NEXT: store i32 [[TMP65]], ptr [[TMP52]], align 8 +; VF4-NEXT: [[TMP66:%.*]] = extractelement <4 x i32> [[TMP62]], i32 3 +; VF4-NEXT: store i32 [[TMP66]], ptr [[TMP53]], align 8 +; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4-NEXT: [[TMP67:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF4-NEXT: br i1 [[TMP67]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; VF4: [[MIDDLE_BLOCK]]: +; VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; VF4: [[SCALAR_PH]]: +; VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; VF4-NEXT: br label %[[LOOP:.*]] +; VF4: [[LOOP]]: +; VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VF4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]] +; VF4-NEXT: [[L_FACTOR:%.*]] = load i32, ptr [[ARRAYIDX]], align 8 +; VF4-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 0 +; VF4-NEXT: [[L_0:%.*]] = load i32, ptr [[DATA_0]], align 8 +; VF4-NEXT: [[MUL_0:%.*]] = mul i32 [[L_FACTOR]], [[L_0]] +; VF4-NEXT: store i32 [[MUL_0]], ptr [[DATA_0]], align 8 +; VF4-NEXT: [[DATA_1:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 1 +; VF4-NEXT: [[L_1:%.*]] = load i32, ptr [[DATA_1]], align 8 +; VF4-NEXT: [[MUL_1:%.*]] = mul i32 [[L_FACTOR]], [[L_1]] +; VF4-NEXT: store i32 [[MUL_1]], ptr [[DATA_1]], align 8 +; VF4-NEXT: [[DATA_2:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 2 +; VF4-NEXT: [[L_2:%.*]] = load i32, ptr [[DATA_2]], align 8 +; VF4-NEXT: [[MUL_2:%.*]] = mul i32 [[L_FACTOR]], [[L_2]] +; VF4-NEXT: store i32 [[MUL_2]], ptr [[DATA_2]], align 8 +; VF4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; VF4-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; VF4-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]] +; VF4: [[EXIT]]: +; VF4-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv + %l.factor = load i32 , ptr %arrayidx, align 8 + %data.0 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 0 + %l.0 = load i32, ptr %data.0, align 8 + %mul.0 = mul i32 %l.factor, %l.0 + store i32 %mul.0, ptr %data.0, align 8 + %data.1 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 1 + %l.1 = load i32, ptr %data.1, align 8 + 
%mul.1 = mul i32 %l.factor, %l.1
+  store i32 %mul.1, ptr %data.1, align 8
+  %data.2 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 2
+  %l.2 = load i32, ptr %data.2, align 8
+  %mul.2 = mul i32 %l.factor, %l.2
+  store i32 %mul.2, ptr %data.2, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+;.
+; VF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; VF2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; VF2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; VF2: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; VF2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; VF2: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; VF2: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; VF2: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; VF2: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; VF2: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; VF2: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; VF2: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; VF2: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; VF2: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+; VF2: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; VF2: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
+; VF2: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
+; VF2: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]}
+;.
+; VF4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; VF4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; VF4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; VF4: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; VF4: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; VF4: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; VF4: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; VF4: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; VF4: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; VF4: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; VF4: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; VF4: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; VF4: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; VF4: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+; VF4: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; VF4: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
+; VF4: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
+; VF4: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]}
+;.

From c3910823c741eb3ad6f977bda82d7b55101499ef Mon Sep 17 00:00:00 2001
From: Nick Sarnie
Date: Sat, 11 Jan 2025 04:02:53 +0900
Subject: [PATCH 096/408] [clang][Driver][SPIR-V] Make tool names consistent
 (#122343)

Some tool names use `SPIRV` and some use `SPIR-V`; use `SPIR-V`
everywhere, which is what we use normally.

I noticed this when fixing the test in
https://github.com/llvm/llvm-project/pull/122310.
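For context on where these strings surface: the first constructor argument of a driver `Tool` is the name that `-ccc-print-bindings` prints, which is why the test expectations below change, and the second is a short name for the underlying program. The snippet below is a simplified standalone model of that pattern, for illustration only; it is not the real `clang/Driver` API, whose `Tool` constructor also takes a `ToolChain`.

```cpp
#include <iostream>
#include <string>
#include <utility>

// Simplified stand-in for clang's driver Tool naming; illustration only.
class Tool {
  std::string Name;      // e.g. "SPIR-V::Linker"; shown by -ccc-print-bindings
  std::string ShortName; // e.g. "spirv-link"; the program the driver invokes
public:
  Tool(std::string N, std::string S)
      : Name(std::move(N)), ShortName(std::move(S)) {}
  const std::string &getName() const { return Name; }
  const std::string &getShortName() const { return ShortName; }
};

int main() {
  Tool Linker("SPIR-V::Linker", "spirv-link");
  // A binding line renders roughly as: "spirv64" - "SPIR-V::Linker", ...
  std::cout << Linker.getName() << " runs " << Linker.getShortName() << '\n';
}
```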
Signed-off-by: Sarnie, Nick
---
 clang/lib/Driver/ToolChains/SPIRV.h        |  4 ++--
 clang/test/Driver/spirv-openmp-toolchain.c |  2 +-
 clang/test/Driver/spirv-toolchain.cl       | 10 ++++++++++
 3 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/SPIRV.h b/clang/lib/Driver/ToolChains/SPIRV.h
index 44187084e34ec..6223d55eca643 100644
--- a/clang/lib/Driver/ToolChains/SPIRV.h
+++ b/clang/lib/Driver/ToolChains/SPIRV.h
@@ -43,7 +43,7 @@ class LLVM_LIBRARY_VISIBILITY Translator : public Tool {
 
 class LLVM_LIBRARY_VISIBILITY Linker final : public Tool {
 public:
-  Linker(const ToolChain &TC) : Tool("SPIRV::Linker", "spirv-link", TC) {}
+  Linker(const ToolChain &TC) : Tool("SPIR-V::Linker", "spirv-link", TC) {}
   bool hasIntegratedCPP() const override { return false; }
   bool isLinkJob() const override { return true; }
   void ConstructJob(Compilation &C, const JobAction &JA,
@@ -54,7 +54,7 @@ class LLVM_LIBRARY_VISIBILITY Linker final : public Tool {
 
 class LLVM_LIBRARY_VISIBILITY Assembler final : public Tool {
 public:
-  Assembler(const ToolChain &TC) : Tool("SPIRV::Assembler", "spirv-as", TC) {}
+  Assembler(const ToolChain &TC) : Tool("SPIR-V::Assembler", "spirv-as", TC) {}
   bool hasIntegratedAssembler() const override { return false; }
   bool hasIntegratedCPP() const override { return false; }
   void ConstructJob(Compilation &C, const JobAction &JA,
diff --git a/clang/test/Driver/spirv-openmp-toolchain.c b/clang/test/Driver/spirv-openmp-toolchain.c
index 3d585d78e86c2..5a76bf70e7ed3 100644
--- a/clang/test/Driver/spirv-openmp-toolchain.c
+++ b/clang/test/Driver/spirv-openmp-toolchain.c
@@ -45,7 +45,7 @@
 // CHECK-BINDINGS-TEMPS: "spirv64-intel" - "clang", inputs: ["[[INPUT]]"], output: "[[DEVICE_PP:.+]]"
 // CHECK-BINDINGS-TEMPS: "spirv64-intel" - "clang", inputs: ["[[DEVICE_PP]]", "[[HOST_BC]]"], output: "[[DEVICE_TEMP_BC:.+]]"
 // CHECK-BINDINGS-TEMPS: "spirv64-intel" - "SPIR-V::Translator", inputs: ["[[DEVICE_TEMP_BC]]"], output: "[[DEVICE_ASM:.+]]"
-// CHECK-BINDINGS-TEMPS: "spirv64-intel" - "SPIRV::Assembler", inputs: ["[[DEVICE_ASM]]"], output: "[[DEVICE_SPV:.+]]"
+// CHECK-BINDINGS-TEMPS: "spirv64-intel" - "SPIR-V::Assembler", inputs: ["[[DEVICE_ASM]]"], output: "[[DEVICE_SPV:.+]]"
 // CHECK-BINDINGS-TEMPS: "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[DEVICE_SPV]]"], output: "[[DEVICE_IMAGE:.+]]"
 // CHECK-BINDINGS-TEMPS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_BC]]", "[[DEVICE_IMAGE]]"], output: "[[HOST_ASM:.+]]"
 // CHECK-BINDINGS-TEMPS: "x86_64-unknown-linux-gnu" - "clang::as", inputs: ["[[HOST_ASM]]"], output: "[[HOST_OBJ:.+]]"
diff --git a/clang/test/Driver/spirv-toolchain.cl b/clang/test/Driver/spirv-toolchain.cl
index 33c7bc0a63adf..fd29dbe71e910 100644
--- a/clang/test/Driver/spirv-toolchain.cl
+++ b/clang/test/Driver/spirv-toolchain.cl
@@ -70,6 +70,16 @@
 // SPLINK: {{llvm-spirv.*"}} [[BC]] "-o" [[SPV2:".*o"]]
 // SPLINK: {{spirv-link.*"}} [[SPV1]] [[SPV2]] "-o" "a.out"
 
+//-----------------------------------------------------------------------------
+// Check bindings when linking with multiple input files.
+// RUN: %clang -### -target spirv64 -ccc-print-bindings %s %s 2>&1 | FileCheck --check-prefix=SPLINK-BINDINGS %s + +// SPLINK-BINDINGS: "clang", inputs: [[[CL:".*cl"]]], output: [[BC1:".*bc"]] +// SPLINK-BINDINGS: "SPIR-V::Translator", inputs: [[[BC1]]], output: [[OBJ1:".*o"]] +// SPLINK-BINDINGS: "clang", inputs: [[[CL]]], output: [[BC2:".*bc"]] +// SPLINK-BINDINGS: "SPIR-V::Translator", inputs: [[[BC2]]], output: [[OBJ2:".*o"]] +// SPLINK-BINDINGS: "SPIR-V::Linker", inputs: [[[OBJ1]], [[OBJ2]]], output: "a.out" + //----------------------------------------------------------------------------- // Check external vs internal object emission. // RUN: %clang -### --target=spirv64 -fno-integrated-objemitter %s 2>&1 | FileCheck --check-prefix=XTOR %s From c91d805e6627987bec8ec34ea67c1e8240940039 Mon Sep 17 00:00:00 2001 From: Jakub Mazurkiewicz Date: Fri, 10 Jan 2025 20:14:14 +0100 Subject: [PATCH 097/408] [libc++] Implement std::not_fn (#86133) Implement `std::not_fn` from "P2714R1 Bind front and back to NTTP callables". --- libcxx/docs/FeatureTestMacroTable.rst | 2 + libcxx/docs/Status/Cxx2cPapers.csv | 2 +- libcxx/include/__functional/not_fn.h | 23 ++ libcxx/include/functional | 4 +- libcxx/include/version | 5 +- .../func.not.fn/not_fn.nttp.compile.pass.cpp | 43 +++ .../not_fn.nttp.nodiscard.verify.cpp | 24 ++ .../functional.version.compile.pass.cpp | 5 +- .../version.version.compile.pass.cpp | 5 +- .../func.not_fn/not_fn.nttp.pass.cpp | 310 ++++++++++++++++++ .../func.not_fn/not_fn.nttp.verify.cpp | 29 ++ .../generate_feature_test_macro_components.py | 2 +- 12 files changed, 446 insertions(+), 8 deletions(-) create mode 100644 libcxx/test/libcxx/utilities/function.objects/func.not.fn/not_fn.nttp.compile.pass.cpp create mode 100644 libcxx/test/libcxx/utilities/function.objects/func.not.fn/not_fn.nttp.nodiscard.verify.cpp create mode 100644 libcxx/test/std/utilities/function.objects/func.not_fn/not_fn.nttp.pass.cpp create mode 100644 libcxx/test/std/utilities/function.objects/func.not_fn/not_fn.nttp.verify.cpp diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst index 3c4a13332661e..cfb0e5cfb129c 100644 --- a/libcxx/docs/FeatureTestMacroTable.rst +++ b/libcxx/docs/FeatureTestMacroTable.rst @@ -458,6 +458,8 @@ Status ---------------------------------------------------------- ----------------- ``__cpp_lib_mdspan`` ``202406L`` ---------------------------------------------------------- ----------------- + ``__cpp_lib_not_fn`` ``202306L`` + ---------------------------------------------------------- ----------------- ``__cpp_lib_optional_range_support`` *unimplemented* ---------------------------------------------------------- ----------------- ``__cpp_lib_out_ptr`` ``202311L`` diff --git a/libcxx/docs/Status/Cxx2cPapers.csv b/libcxx/docs/Status/Cxx2cPapers.csv index 3a8a794ca4ea1..aa896e85fcb1f 100644 --- a/libcxx/docs/Status/Cxx2cPapers.csv +++ b/libcxx/docs/Status/Cxx2cPapers.csv @@ -24,7 +24,7 @@ "`P1383R2 `__","More ``constexpr`` for ```` and ````","2023-06 (Varna)","","","" "`P2734R0 `__","Adding the new SI prefixes","2023-06 (Varna)","|Complete|","17","" "`P2548R6 `__","``copyable_function``","2023-06 (Varna)","","","" -"`P2714R1 `__","Bind front and back to NTTP callables","2023-06 (Varna)","","","" +"`P2714R1 `__","Bind front and back to NTTP callables","2023-06 (Varna)","|Partial|","20","``not_fn`` only" "`P2630R4 `__","``submdspan``","2023-06 (Varna)","","","" "","","","","","" "`P0543R3 `__","Saturation arithmetic","2023-11 
(Kona)","|Complete|","18","" diff --git a/libcxx/include/__functional/not_fn.h b/libcxx/include/__functional/not_fn.h index 4b3ce5524a743..e6f14be799db3 100644 --- a/libcxx/include/__functional/not_fn.h +++ b/libcxx/include/__functional/not_fn.h @@ -16,6 +16,8 @@ #include <__type_traits/decay.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_constructible.h> +#include <__type_traits/is_member_pointer.h> +#include <__type_traits/is_pointer.h> #include <__utility/forward.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -48,6 +50,27 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 auto not_fn(_Fn&& __f) { #endif // _LIBCPP_STD_VER >= 17 +#if _LIBCPP_STD_VER >= 26 + +template +struct __nttp_not_fn_t { + template + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI constexpr auto operator()(_Args&&... __args) const + noexcept(noexcept(!std::invoke(_Fn, std::forward<_Args>(__args)...))) + -> decltype(!std::invoke(_Fn, std::forward<_Args>(__args)...)) { + return !std::invoke(_Fn, std::forward<_Args>(__args)...); + } +}; + +template +[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI constexpr auto not_fn() noexcept { + if constexpr (using _Ty = decltype(_Fn); is_pointer_v<_Ty> || is_member_pointer_v<_Ty>) + static_assert(_Fn != nullptr, "f cannot be equal to nullptr"); + return __nttp_not_fn_t<_Fn>(); +} + +#endif // _LIBCPP_STD_VER >= 26 + _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP___FUNCTIONAL_NOT_FN_H diff --git a/libcxx/include/functional b/libcxx/include/functional index 4bb163fd1f33c..b121a19d6bcb3 100644 --- a/libcxx/include/functional +++ b/libcxx/include/functional @@ -214,7 +214,9 @@ template // deprecated in C++17, removed in C++20 binary_negate not2(const Predicate& pred); template -constexpr unspecified not_fn(F&& f); // C++17, constexpr in C++20 + constexpr unspecified not_fn(F&& f); // C++17, constexpr in C++20 +template + constexpr unspecified not_fn() noexcept; // C++26 // [func.bind.partial], function templates bind_front and bind_back template diff --git a/libcxx/include/version b/libcxx/include/version index 23cffd960e5d3..f5b5e7a906f50 100644 --- a/libcxx/include/version +++ b/libcxx/include/version @@ -171,7 +171,8 @@ __cpp_lib_nonmember_container_access 201411L -__cpp_lib_not_fn 201603L +__cpp_lib_not_fn 202306L + 201603L // C++17 __cpp_lib_null_iterators 201304L __cpp_lib_optional 202110L 202106L // C++20 @@ -557,6 +558,8 @@ __cpp_lib_void_t 201411L // # define __cpp_lib_linalg 202311L # undef __cpp_lib_mdspan # define __cpp_lib_mdspan 202406L +# undef __cpp_lib_not_fn +# define __cpp_lib_not_fn 202306L // # define __cpp_lib_optional_range_support 202406L # undef __cpp_lib_out_ptr # define __cpp_lib_out_ptr 202311L diff --git a/libcxx/test/libcxx/utilities/function.objects/func.not.fn/not_fn.nttp.compile.pass.cpp b/libcxx/test/libcxx/utilities/function.objects/func.not.fn/not_fn.nttp.compile.pass.cpp new file mode 100644 index 0000000000000..b0128ae6ff053 --- /dev/null +++ b/libcxx/test/libcxx/utilities/function.objects/func.not.fn/not_fn.nttp.compile.pass.cpp @@ -0,0 +1,43 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 + +// + +// Type of `std::not_fn()` is always empty. 
+ +#include +#include + +struct NonEmptyFunctionObject { + bool val = true; + bool operator()() const; +}; + +bool func(); + +struct SomeClass { + bool member_object; + bool member_function(); +}; + +using ResultWithEmptyFuncObject = decltype(std::not_fn()); +static_assert(std::is_empty_v); + +using ResultWithNotEmptyFuncObject = decltype(std::not_fn()); +static_assert(std::is_empty_v); + +using ResultWithFunctionPointer = decltype(std::not_fn<&func>()); +static_assert(std::is_empty_v); + +using ResultWithMemberObjectPointer = decltype(std::not_fn<&SomeClass::member_object>()); +static_assert(std::is_empty_v); + +using ResultWithMemberFunctionPointer = decltype(std::not_fn<&SomeClass::member_function>()); +static_assert(std::is_empty_v); diff --git a/libcxx/test/libcxx/utilities/function.objects/func.not.fn/not_fn.nttp.nodiscard.verify.cpp b/libcxx/test/libcxx/utilities/function.objects/func.not.fn/not_fn.nttp.nodiscard.verify.cpp new file mode 100644 index 0000000000000..acef2d3d24a29 --- /dev/null +++ b/libcxx/test/libcxx/utilities/function.objects/func.not.fn/not_fn.nttp.nodiscard.verify.cpp @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 + +// + +// Test the libc++ extension that std::not_fn is marked as [[nodiscard]]. + +#include +#include + +void test() { + using F = std::true_type; + std::not_fn(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + auto negated = std::not_fn(); + negated(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} +} diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/functional.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/functional.version.compile.pass.cpp index 8ea1934590363..e19ed321515ec 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/functional.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/functional.version.compile.pass.cpp @@ -27,6 +27,7 @@ __cpp_lib_invoke_r 202106L [C++23] __cpp_lib_move_only_function 202110L [C++23] __cpp_lib_not_fn 201603L [C++17] + 202306L [C++26] __cpp_lib_ranges 202110L [C++20] 202406L [C++23] __cpp_lib_reference_wrapper 202403L [C++26] @@ -525,8 +526,8 @@ # ifndef __cpp_lib_not_fn # error "__cpp_lib_not_fn should be defined in c++26" # endif -# if __cpp_lib_not_fn != 201603L -# error "__cpp_lib_not_fn should have the value 201603L in c++26" +# if __cpp_lib_not_fn != 202306L +# error "__cpp_lib_not_fn should have the value 202306L in c++26" # endif # ifndef __cpp_lib_ranges diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp index b0f8b2f80067d..7c03955df681d 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp @@ -156,6 +156,7 
@@ __cpp_lib_node_extract 201606L [C++17] __cpp_lib_nonmember_container_access 201411L [C++17] __cpp_lib_not_fn 201603L [C++17] + 202306L [C++26] __cpp_lib_null_iterators 201304L [C++14] __cpp_lib_optional 201606L [C++17] 202106L [C++20] @@ -7405,8 +7406,8 @@ # ifndef __cpp_lib_not_fn # error "__cpp_lib_not_fn should be defined in c++26" # endif -# if __cpp_lib_not_fn != 201603L -# error "__cpp_lib_not_fn should have the value 201603L in c++26" +# if __cpp_lib_not_fn != 202306L +# error "__cpp_lib_not_fn should have the value 202306L in c++26" # endif # ifndef __cpp_lib_null_iterators diff --git a/libcxx/test/std/utilities/function.objects/func.not_fn/not_fn.nttp.pass.cpp b/libcxx/test/std/utilities/function.objects/func.not_fn/not_fn.nttp.pass.cpp new file mode 100644 index 0000000000000..688049c772f08 --- /dev/null +++ b/libcxx/test/std/utilities/function.objects/func.not_fn/not_fn.nttp.pass.cpp @@ -0,0 +1,310 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 + +// + +// template constexpr unspecified not_fn() noexcept; + +#include + +#include +#include +#include +#include +#include + +#include "test_macros.h" + +class BooleanTestable { + bool val_; + +public: + constexpr explicit BooleanTestable(bool val) : val_(val) {} + constexpr operator bool() const { return val_; } + constexpr BooleanTestable operator!() const { return BooleanTestable{!val_}; } +}; + +LIBCPP_STATIC_ASSERT(std::__boolean_testable); + +class FakeBool { + int val_; + +public: + constexpr FakeBool(int val) : val_(val) {} + constexpr FakeBool operator!() const { return FakeBool{-val_}; } + constexpr bool operator==(int other) const { return val_ == other; } +}; + +template +struct MaybeNoexceptFn { + bool operator()() const noexcept(IsNoexcept); // not defined +}; + +template +struct MaybeNoexceptNegation { + bool operator!() noexcept(IsNoexcept); // not defined +}; + +template +MaybeNoexceptNegation maybe_noexcept_negation() noexcept { + return {}; +} + +constexpr void basic_tests() { + { // Test constant functions + auto false_fn = std::not_fn(); + assert(false_fn()); + + auto true_fn = std::not_fn(); + assert(!true_fn()); + + static_assert(noexcept(std::not_fn())); + static_assert(noexcept(std::not_fn())); + } + + { // Test function with one argument + auto is_odd = std::not_fn<[](auto x) { return x % 2 == 0; }>(); + assert(is_odd(1)); + assert(!is_odd(2)); + assert(is_odd(3)); + assert(!is_odd(4)); + assert(is_odd(5)); + } + + { // Test function with multiple arguments + auto at_least_10 = [](auto... vals) { return (vals + ... + 0) >= 10; }; + auto at_most_9 = std::not_fn(); + assert(at_most_9()); + assert(at_most_9(1)); + assert(at_most_9(1, 2, 3, 4, -1)); + assert(at_most_9(3, 3, 2, 1, -2)); + assert(!at_most_9(10, -1, 2)); + assert(!at_most_9(5, 5)); + static_assert(noexcept(std::not_fn())); + } + + { // Test function that returns boolean-testable type other than bool + auto is_product_even = [](auto... vals) { return BooleanTestable{(vals * ... 
* 1) % 2 == 0}; }; + auto is_product_odd = std::not_fn(); + assert(is_product_odd()); + assert(is_product_odd(1, 3, 5, 9)); + assert(is_product_odd(3, 3, 3, 3)); + assert(!is_product_odd(3, 5, 9, 11, 0)); + assert(!is_product_odd(11, 7, 5, 3, 2)); + static_assert(noexcept(std::not_fn())); + } + + { // Test function that returns non-boolean-testable type + auto sum = [](auto... vals) -> FakeBool { return (vals + ... + 0); }; + auto negated_sum = std::not_fn(); + assert(negated_sum() == 0); + assert(negated_sum(3) == -3); + assert(negated_sum(4, 5, 1, 3) == -13); + assert(negated_sum(4, 2, 5, 6, 1) == -18); + assert(negated_sum(-1, 3, 2, -8) == 4); + static_assert(noexcept(std::not_fn())); + } + + { // Test member pointers + struct MemberPointerTester { + bool value = true; + constexpr bool not_value() const { return !value; } + constexpr bool value_and(bool other) noexcept { return value && other; } + }; + + MemberPointerTester tester; + + auto not_mem_object = std::not_fn<&MemberPointerTester::value>(); + assert(!not_mem_object(tester)); + assert(!not_mem_object(std::as_const(tester))); + static_assert(noexcept(not_mem_object(tester))); + static_assert(noexcept(not_mem_object(std::as_const(tester)))); + + auto not_nullary_mem_fn = std::not_fn<&MemberPointerTester::not_value>(); + assert(not_nullary_mem_fn(tester)); + static_assert(!noexcept(not_nullary_mem_fn(tester))); + + auto not_unary_mem_fn = std::not_fn<&MemberPointerTester::value_and>(); + assert(not_unary_mem_fn(tester, false)); + static_assert(noexcept(not_unary_mem_fn(tester, false))); + static_assert(!std::is_invocable_v); + } +} + +constexpr void test_perfect_forwarding_call_wrapper() { + { // Make sure we call the correctly cv-ref qualified operator() + // based on the value category of the not_fn unspecified-type. + struct X { + constexpr FakeBool operator()() & { return 1; } + constexpr FakeBool operator()() const& { return 2; } + constexpr FakeBool operator()() && { return 3; } + constexpr FakeBool operator()() const&& { return 4; } + }; + + auto f = std::not_fn(); + using F = decltype(f); + assert(static_cast(f)() == -2); + assert(static_cast(f)() == -2); + assert(static_cast(f)() == -2); + assert(static_cast(f)() == -2); + } + + // Call to `not_fn` unspecified-type's operator() should always result in call to the const& overload of the underlying function object. + { + { // Make sure unspecified-type is still callable when we delete the & overload. + struct X { + FakeBool operator()() & = delete; + FakeBool operator()() const&; + FakeBool operator()() &&; + FakeBool operator()() const&&; + }; + + using F = decltype(std::not_fn()); + static_assert(std::invocable); + static_assert(std::invocable); + static_assert(std::invocable); + static_assert(std::invocable); + } + + { // Make sure unspecified-type is not callable when we delete the const& overload. + struct X { + FakeBool operator()() &; + FakeBool operator()() const& = delete; + FakeBool operator()() &&; + FakeBool operator()() const&&; + }; + + using F = decltype(std::not_fn()); + static_assert(!std::invocable); + static_assert(!std::invocable); + static_assert(!std::invocable); + static_assert(!std::invocable); + } + + { // Make sure unspecified-type is still callable when we delete the && overload. 
+ struct X { + FakeBool operator()() &; + FakeBool operator()() const&; + FakeBool operator()() && = delete; + FakeBool operator()() const&&; + }; + + using F = decltype(std::not_fn()); + static_assert(std::invocable); + static_assert(std::invocable); + static_assert(std::invocable); + static_assert(std::invocable); + } + + { // Make sure unspecified-type is still callable when we delete the const&& overload. + struct X { + FakeBool operator()() &; + FakeBool operator()() const&; + FakeBool operator()() &&; + FakeBool operator()() const&& = delete; + }; + + using F = decltype(std::not_fn()); + static_assert(std::invocable); + static_assert(std::invocable); + static_assert(std::invocable); + static_assert(std::invocable); + } + } + + { // Test perfect forwarding + auto f = [](int& val) { + val = 5; + return false; + }; + + auto not_f = std::not_fn(); + int val = 0; + assert(not_f(val)); + assert(val == 5); + + using NotF = decltype(not_f); + static_assert(std::invocable); + static_assert(!std::invocable); + } +} + +constexpr void test_return_type() { + { // Test constructors and assignment operators + struct IsPowerOfTwo { + constexpr bool operator()(unsigned int x) const { return std::has_single_bit(x); } + }; + + auto is_not_power_of_2 = std::not_fn(); + assert(is_not_power_of_2(5)); + assert(!is_not_power_of_2(4)); + + auto moved = std::move(is_not_power_of_2); + assert(moved(5)); + assert(!moved(4)); + + auto copied = is_not_power_of_2; + assert(copied(7)); + assert(!copied(8)); + + moved = std::move(copied); + assert(copied(9)); + assert(!copied(16)); + + copied = moved; + assert(copied(11)); + assert(!copied(32)); + } + + { // Make sure `not_fn` unspecified-type's operator() is SFINAE-friendly. + using F = decltype(std::not_fn<[](int x) { return !x; }>()); + static_assert(!std::is_invocable::value); + static_assert(std::is_invocable::value); + static_assert(!std::is_invocable::value); + static_assert(!std::is_invocable::value); + } + + { // Test noexceptness + auto always_noexcept = std::not_fn{}>(); + static_assert(noexcept(always_noexcept())); + + auto never_noexcept = std::not_fn{}>(); + static_assert(!noexcept(never_noexcept())); + + auto always_noexcept_negation = std::not_fn>(); + static_assert(noexcept(always_noexcept_negation())); + + auto never_noexcept_negation = std::not_fn>(); + static_assert(!noexcept(never_noexcept_negation())); + } + + { // Test calling volatile wrapper + using NotFn = decltype(std::not_fn()); + static_assert(!std::invocable); + static_assert(!std::invocable); + static_assert(!std::invocable); + static_assert(!std::invocable); + } +} + +constexpr bool test() { + basic_tests(); + test_perfect_forwarding_call_wrapper(); + test_return_type(); + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/utilities/function.objects/func.not_fn/not_fn.nttp.verify.cpp b/libcxx/test/std/utilities/function.objects/func.not_fn/not_fn.nttp.verify.cpp new file mode 100644 index 0000000000000..f4ebea7e8927e --- /dev/null +++ b/libcxx/test/std/utilities/function.objects/func.not_fn/not_fn.nttp.verify.cpp @@ -0,0 +1,29 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23
+
+// <functional>
+
+// template<auto f> constexpr unspecified not_fn() noexcept;
+// Mandates: If is_pointer_v<decltype(f)> || is_member_pointer_v<decltype(f)> is true, then f != nullptr is true.
+
+#include <functional>
+
+struct X {};
+
+void test() {
+  auto not_fn1 = std::not_fn<static_cast<bool (*)()>(nullptr)>();
+  // expected-error@*:* {{static assertion failed due to requirement 'nullptr != nullptr': f cannot be equal to nullptr}}
+
+  auto not_fn2 = std::not_fn<static_cast<bool X::*>(nullptr)>();
+  // expected-error@*:* {{static assertion failed due to requirement 'nullptr != nullptr': f cannot be equal to nullptr}}
+
+  auto not_fn3 = std::not_fn<static_cast<bool (X::*)()>(nullptr)>();
+  // expected-error@*:* {{static assertion failed due to requirement 'nullptr != nullptr': f cannot be equal to nullptr}}
+}
diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py
index a2ce69892eeb7..dae827f5de50c 100755
--- a/libcxx/utils/generate_feature_test_macro_components.py
+++ b/libcxx/utils/generate_feature_test_macro_components.py
@@ -931,7 +931,7 @@ def add_version_header(tc):
         "name": "__cpp_lib_not_fn",
         "values": {
             "c++17": 201603,
-            # "c++26": 202306,  # P2714R1 Bind front and back to NTTP callables
+            "c++26": 202306,  # P2714R1 Bind front and back to NTTP callables
         },
         "headers": ["functional"],
     },

From b900379e26d9f49977c4d772f1b2b681fc5147d4 Mon Sep 17 00:00:00 2001
From: Farzon Lotfi
Date: Fri, 10 Jan 2025 14:16:27 -0500
Subject: [PATCH 098/408] [HLSL] Reapply Move length support out of the
 DirectX Backend (#121611) (#122337)

## Changes
- Delete the DirectX length intrinsic
- Delete the HLSL length lang builtin
- Implement the length algorithm entirely in the header.

## History
- In the past, if an HLSL intrinsic lowered to either a SPIR-V opcode or
a DXIL opcode, we represented it with an LLVM intrinsic.

## Why are we moving away?
- To make HLSL APIs more portable, the team decided that it makes sense
for some intrinsics to be defined only in the header.
- Since there tend to be more SPIR-V opcodes than DXIL opcodes, the plan
is to support SPIR-V opcodes either with target-specific builtins or via
pattern matching.
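To make the new lowering concrete: the whole algorithm now lives in `hlsl_detail.h` as `length_impl` (scalar) and `length_vec_impl` (vector), built from `__builtin_elementwise_abs`, `__builtin_hlsl_dot`, and `__builtin_elementwise_sqrt`, as the diff below shows. The snippet here is only a minimal host-side C++ model of that math for reference; `length_scalar` and `length_vector` are illustrative names, not part of the shipped header.

```cpp
#include <cassert>
#include <cmath>
#include <cstddef>

// Scalar case: length(x) degenerates to |x|.
template <typename T> T length_scalar(T x) { return std::fabs(x); }

// Vector case: length(x) == sqrt(dot(x, x)) == sqrt(x[0]^2 + x[1]^2 + ...).
template <typename T, std::size_t N> T length_vector(const T (&x)[N]) {
  T dot = T(0);
  for (std::size_t i = 0; i < N; ++i)
    dot += x[i] * x[i];
  return std::sqrt(dot);
}

int main() {
  float v[3] = {3.0f, 0.0f, 4.0f};
  assert(length_vector(v) == 5.0f); // 3-4-5 triangle
  assert(length_scalar(-2.5f) == 2.5f);
  return 0;
}
```

Because the header expands to plain dot and sqrt operations, both the DXIL and SPIR-V backends see the same generic IR, which is visible in the updated `length.hlsl` checks below.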
---
 clang/include/clang/Basic/Builtins.td         |   6 -
 clang/lib/CodeGen/CGBuiltin.cpp               |  14 --
 clang/lib/CodeGen/CGHLSLRuntime.h             |   1 -
 clang/lib/Headers/hlsl/hlsl_detail.h          |  20 ++
 clang/lib/Headers/hlsl/hlsl_intrinsics.h      |  32 ++-
 clang/lib/Sema/SemaHLSL.cpp                   |  18 --
 clang/test/CodeGenHLSL/builtins/length.hlsl   | 188 +++++++++++-------
 .../test/SemaHLSL/BuiltIns/length-errors.hlsl |  51 +++--
 llvm/include/llvm/IR/IntrinsicsDirectX.td     |   1 -
 .../Target/DirectX/DXILIntrinsicExpansion.cpp |  30 ---
 llvm/test/CodeGen/DirectX/length.ll           | 116 -----------
 llvm/test/CodeGen/DirectX/length_error.ll     |  10 -
 .../DirectX/length_invalid_intrinsic_error.ll |  10 -
 .../length_invalid_intrinsic_error_scalar.ll  |  10 -
 14 files changed, 183 insertions(+), 324 deletions(-)
 delete mode 100644 llvm/test/CodeGen/DirectX/length.ll
 delete mode 100644 llvm/test/CodeGen/DirectX/length_error.ll
 delete mode 100644 llvm/test/CodeGen/DirectX/length_invalid_intrinsic_error.ll
 delete mode 100644 llvm/test/CodeGen/DirectX/length_invalid_intrinsic_error_scalar.ll

diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 468c16050e2bf..f4216bd01a074 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -4865,12 +4865,6 @@ def HLSLIsinf : LangBuiltin<"HLSL_LANG"> {
   let Prototype = "void(...)";
 }
 
-def HLSLLength : LangBuiltin<"HLSL_LANG"> {
-  let Spellings = ["__builtin_hlsl_length"];
-  let Attributes = [NoThrow, Const];
-  let Prototype = "void(...)";
-}
-
 def HLSLLerp : LangBuiltin<"HLSL_LANG"> {
   let Spellings = ["__builtin_hlsl_lerp"];
   let Attributes = [NoThrow, Const];
   let Prototype = "void(...)";
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index ca03fb665d423..2b09197669996 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -19334,20 +19334,6 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
         /*ReturnType=*/X->getType(), CGM.getHLSLRuntime().getLerpIntrinsic(),
         ArrayRef<Value *>{X, Y, S}, nullptr, "hlsl.lerp");
   }
-  case Builtin::BI__builtin_hlsl_length: {
-    Value *X = EmitScalarExpr(E->getArg(0));
-
-    assert(E->getArg(0)->getType()->hasFloatingRepresentation() &&
-           "length operand must have a float representation");
-    // if the operand is a scalar, we can use the fabs llvm intrinsic directly
-    if (!E->getArg(0)->getType()->isVectorType())
-      return EmitFAbs(*this, X);
-
-    return Builder.CreateIntrinsic(
-        /*ReturnType=*/X->getType()->getScalarType(),
-        CGM.getHLSLRuntime().getLengthIntrinsic(), ArrayRef<Value *>{X},
-        nullptr, "hlsl.length");
-  }
   case Builtin::BI__builtin_hlsl_normalize: {
     Value *X = EmitScalarExpr(E->getArg(0));
 
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h
index 46e472f0aae21..00e110e8e6fa2 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.h
+++ b/clang/lib/CodeGen/CGHLSLRuntime.h
@@ -77,7 +77,6 @@ class CGHLSLRuntime {
   GENERATE_HLSL_INTRINSIC_FUNCTION(Cross, cross)
   GENERATE_HLSL_INTRINSIC_FUNCTION(Degrees, degrees)
   GENERATE_HLSL_INTRINSIC_FUNCTION(Frac, frac)
-  GENERATE_HLSL_INTRINSIC_FUNCTION(Length, length)
   GENERATE_HLSL_INTRINSIC_FUNCTION(Lerp, lerp)
   GENERATE_HLSL_INTRINSIC_FUNCTION(Normalize, normalize)
   GENERATE_HLSL_INTRINSIC_FUNCTION(Rsqrt, rsqrt)
diff --git a/clang/lib/Headers/hlsl/hlsl_detail.h b/clang/lib/Headers/hlsl/hlsl_detail.h
index 8d5fd94133153..33d394f7883a6 100644
--- a/clang/lib/Headers/hlsl/hlsl_detail.h
+++ b/clang/lib/Headers/hlsl/hlsl_detail.h
@@ -13,6 +13,14 @@ namespace hlsl {
 
 namespace __detail {
 
+template <typename T, typename U> struct is_same {
+  static const bool value = false;
+};
+
+template <typename T> struct is_same<T, T> {
+  static const bool value = true;
+};
+
 template <bool B, typename T> struct enable_if {};
 
 template <typename T> struct enable_if<true, T> {
@@ -33,6 +41,18 @@ constexpr enable_if_t<sizeof(U) == sizeof(T), U> bit_cast(T F) {
   return __builtin_bit_cast(U, F);
 }
 
+template <typename T>
+constexpr enable_if_t<is_same<float, T>::value || is_same<half, T>::value, T>
+length_impl(T X) {
+  return __builtin_elementwise_abs(X);
+}
+
+template <typename T, int N>
+constexpr enable_if_t<is_same<float, T>::value || is_same<half, T>::value, T>
+length_vec_impl(vector<T, N> X) {
+  return __builtin_elementwise_sqrt(__builtin_hlsl_dot(X, X));
+}
+
 } // namespace __detail
 } // namespace hlsl
 #endif //_HLSL_HLSL_DETAILS_H_
diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
index b745997f1d5a2..e5ff7531d25d9 100644
--- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h
+++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
@@ -1298,26 +1298,18 @@ float4 lerp(float4, float4, float4);
 /// Length is based on the following formula: sqrt(x[0]^2 + x[1]^2 + ...).
 
 _HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_length)
-half length(half);
-_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_length)
-half length(half2);
-_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_length)
-half length(half3);
-_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_length)
-half length(half4);
-
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_length)
-float length(float);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_length)
-float length(float2);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_length)
-float length(float3);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_length)
-float length(float4);
+const inline half length(half X) { return __detail::length_impl(X); }
+const inline float length(float X) { return __detail::length_impl(X); }
+
+template <int N>
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+const inline half length(vector<half, N> X) {
+  return __detail::length_vec_impl(X);
+}
+
+template <int N> const inline float length(vector<float, N> X) {
+  return __detail::length_vec_impl(X);
+}
 
 //===----------------------------------------------------------------------===//
 // log builtins
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index 83c38fad2df68..65ddee05a2151 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -2112,24 +2112,6 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
       return true;
     break;
   }
-  case Builtin::BI__builtin_hlsl_length: {
-    if (CheckFloatOrHalfRepresentations(&SemaRef, TheCall))
-      return true;
-    if (SemaRef.checkArgCount(TheCall, 1))
-      return true;
-
-    ExprResult A = TheCall->getArg(0);
-    QualType ArgTyA = A.get()->getType();
-    QualType RetTy;
-
-    if (auto *VTy = ArgTyA->getAs<VectorType>())
-      RetTy = VTy->getElementType();
-    else
-      RetTy = TheCall->getArg(0)->getType();
-
-    TheCall->setType(RetTy);
-    break;
-  }
   case Builtin::BI__builtin_hlsl_mad: {
     if (SemaRef.checkArgCount(TheCall, 3))
       return true;
diff --git a/clang/test/CodeGenHLSL/builtins/length.hlsl b/clang/test/CodeGenHLSL/builtins/length.hlsl
index a24f01d275440..fcf3ee76ba5bb 100644
--- a/clang/test/CodeGenHLSL/builtins/length.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/length.hlsl
@@ -1,73 +1,115 @@
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
-// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
-// RUN:   --check-prefixes=CHECK,NATIVE_HALF
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:
dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF - -// NATIVE_HALF: define noundef nofpclass(nan inf) half @ -// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.fabs.f16(half -// NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.fabs.f32(float -// NATIVE_HALF: ret half -// NO_HALF: ret float -half test_length_half(half p0) -{ - return length(p0); -} -// NATIVE_HALF: define noundef nofpclass(nan inf) half @ -// NATIVE_HALF: %hlsl.length = call reassoc nnan ninf nsz arcp afn half @llvm.dx.length.v2f16 -// NO_HALF: %hlsl.length = call reassoc nnan ninf nsz arcp afn float @llvm.dx.length.v2f32( -// NATIVE_HALF: ret half %hlsl.length -// NO_HALF: ret float %hlsl.length -half test_length_half2(half2 p0) -{ - return length(p0); -} -// NATIVE_HALF: define noundef nofpclass(nan inf) half @ -// NATIVE_HALF: %hlsl.length = call reassoc nnan ninf nsz arcp afn half @llvm.dx.length.v3f16 -// NO_HALF: %hlsl.length = call reassoc nnan ninf nsz arcp afn float @llvm.dx.length.v3f32( -// NATIVE_HALF: ret half %hlsl.length -// NO_HALF: ret float %hlsl.length -half test_length_half3(half3 p0) -{ - return length(p0); -} -// NATIVE_HALF: define noundef nofpclass(nan inf) half @ -// NATIVE_HALF: %hlsl.length = call reassoc nnan ninf nsz arcp afn half @llvm.dx.length.v4f16 -// NO_HALF: %hlsl.length = call reassoc nnan ninf nsz arcp afn float @llvm.dx.length.v4f32( -// NATIVE_HALF: ret half %hlsl.length -// NO_HALF: ret float %hlsl.length -half test_length_half4(half4 p0) -{ - return length(p0); -} - -// CHECK: define noundef nofpclass(nan inf) float @ -// CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.fabs.f32(float -// CHECK: ret float -float test_length_float(float p0) -{ - return length(p0); -} -// CHECK: define noundef nofpclass(nan inf) float @ -// CHECK: %hlsl.length = call reassoc nnan ninf nsz arcp afn float @llvm.dx.length.v2f32( -// CHECK: ret float %hlsl.length -float test_length_float2(float2 p0) -{ - return length(p0); -} -// CHECK: define noundef nofpclass(nan inf) float @ -// CHECK: %hlsl.length = call reassoc nnan ninf nsz arcp afn float @llvm.dx.length.v3f32( -// CHECK: ret float %hlsl.length -float test_length_float3(float3 p0) -{ - return length(p0); -} -// CHECK: define noundef nofpclass(nan inf) float @ -// CHECK: %hlsl.length = call reassoc nnan ninf nsz arcp afn float @llvm.dx.length.v4f32( -// CHECK: ret float %hlsl.length -float test_length_float4(float4 p0) -{ - return length(p0); -} +// RUN: %clang_cc1 -finclude-default-header -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: -emit-llvm -O1 -o - | FileCheck %s --check-prefixes=CHECK,DXCHECK \ +// RUN: -DTARGET=dx + +// RUN: %clang_cc1 -finclude-default-header -triple \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: -emit-llvm -O1 -o - | FileCheck %s --check-prefixes=CHECK,SPVCHECK \ +// RUN: -DTARGET=spv + + +// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z16test_length_halfDh( +// DXCHECK-LABEL: define noundef nofpclass(nan inf) half @_Z16test_length_halfDh( +// CHECK-SAME: half noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.fabs.f16(half [[P0]]) +// CHECK-NEXT: ret half [[ELT_ABS_I]] +// + +half test_length_half(half p0) +{ + return length(p0); +} + +// SPVCHECK-LABEL: define spir_func noundef 
nofpclass(nan inf) half @_Z17test_length_half2Dv2_Dh( +// DXCHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_length_half2Dv2_Dh( +// CHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.[[TARGET]].fdot.v2f16(<2 x half> [[P0]], <2 x half> [[P0]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]]) +// CHECK-NEXT: ret half [[TMP0]] +// + + +half test_length_half2(half2 p0) +{ + return length(p0); +} + +// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z17test_length_half3Dv3_Dh( +// DXCHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_length_half3Dv3_Dh( +// CHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.[[TARGET]].fdot.v3f16(<3 x half> [[P0]], <3 x half> [[P0]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]]) +// CHECK-NEXT: ret half [[TMP0]] +// +half test_length_half3(half3 p0) +{ + return length(p0); +} + +// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z17test_length_half4Dv4_Dh( +// DXCHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_length_half4Dv4_Dh( +// CHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.[[TARGET]].fdot.v4f16(<4 x half> [[P0]], <4 x half> [[P0]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]]) +// CHECK-NEXT: ret half [[TMP0]] +// +half test_length_half4(half4 p0) +{ + return length(p0); +} + +// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z17test_length_floatf( +// DXCHECK-LABEL: define noundef nofpclass(nan inf) float @_Z17test_length_floatf( +// CHECK-SAME: float noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.fabs.f32(float [[P0]]) +// CHECK-NEXT: ret float [[ELT_ABS_I]] +// +float test_length_float(float p0) +{ + return length(p0); +} + +// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z18test_length_float2Dv2_f( +// DXCHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_length_float2Dv2_f( +// CHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.[[TARGET]].fdot.v2f32(<2 x float> [[P0]], <2 x float> [[P0]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]]) +// CHECK-NEXT: ret float [[TMP0]] +// +float test_length_float2(float2 p0) +{ + return length(p0); +} + +// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z18test_length_float3Dv3_f( +// DXCHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_length_float3Dv3_f( +// CHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.[[TARGET]].fdot.v3f32(<3 x float> [[P0]], <3 x float> [[P0]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]]) +// CHECK-NEXT: ret float [[TMP0]] +// +float test_length_float3(float3 p0) +{ + return length(p0); +} + +// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z18test_length_float4Dv4_f( +// DXCHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_length_float4Dv4_f( +// CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.[[TARGET]].fdot.v4f32(<4 x float> [[P0]], <4 x float> [[P0]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]]) +// CHECK-NEXT: ret float [[TMP0]] +// +float test_length_float4(float4 p0) +{ + return length(p0); +} diff --git a/clang/test/SemaHLSL/BuiltIns/length-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/length-errors.hlsl index 281faada6f5e9..a191f0419fbba 100644 --- a/clang/test/SemaHLSL/BuiltIns/length-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/length-errors.hlsl @@ -1,32 +1,53 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify -verify-ignore-unexpected - +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify void test_too_few_arg() { - return __builtin_hlsl_length(); - // expected-error@-1 {{too few arguments to function call, expected 1, have 0}} + return length(); + // expected-error@-1 {{no matching function for call to 'length'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function not viable: requires single argument 'X', but no arguments were provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function not viable: requires single argument 'X', but no arguments were provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires single argument 'X', but no arguments were provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires single argument 'X', but no arguments were provided}} } void test_too_many_arg(float2 p0) { - return __builtin_hlsl_length(p0, p0); - // expected-error@-1 {{too many arguments to function call, expected 1, have 2}} + return length(p0, p0); + // expected-error@-1 {{no matching function for call to 'length'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function not viable: requires single argument 'X', but 2 arguments were provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires single argument 'X', but 2 arguments were provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function not viable: requires single argument 'X', but 2 arguments were provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires single argument 'X', but 2 arguments were provided}} +} + +float double_to_float_type(double p0) { + return length(p0); + // expected-error@-1 {{call to 'length' is ambiguous}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}} + // expected-note@hlsl/hlsl_intrinsics.h:* 
{{candidate function}}
 }
 
-bool builtin_bool_to_float_type_promotion(bool p1)
+
+float bool_to_float_type_promotion(bool p1)
 {
-  return __builtin_hlsl_length(p1);
-  // expected-error@-1 {passing 'bool' to parameter of incompatible type 'float'}}
+  return length(p1);
+  // expected-error@-1 {{call to 'length' is ambiguous}}
+  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}}
+  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}}
 }
 
-bool builtin_length_int_to_float_promotion(int p1)
+float length_int_to_float_promotion(int p1)
 {
-  return __builtin_hlsl_length(p1);
-  // expected-error@-1 {{passing 'int' to parameter of incompatible type 'float'}}
+  return length(p1);
+  // expected-error@-1 {{call to 'length' is ambiguous}}
+  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}}
+  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}}
 }
 
-bool2 builtin_length_int2_to_float2_promotion(int2 p1)
+float2 length_int2_to_float2_promotion(int2 p1)
 {
-  return __builtin_hlsl_length(p1);
-  // expected-error@-1 {{passing 'int2' (aka 'vector<int, 2>') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}}
+  return length(p1);
+  // expected-error@-1 {{call to 'length' is ambiguous}}
+  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}}
+  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}}
 }
diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td
index 3b1d1a88e01a8..ef48af5b42dbf 100644
--- a/llvm/include/llvm/IR/IntrinsicsDirectX.td
+++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td
@@ -93,7 +93,6 @@ def int_dx_isinf : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1
 def int_dx_lerp : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>,LLVMMatchType<0>], [IntrNoMem]>;
-def int_dx_length : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], [llvm_anyfloat_ty], [IntrNoMem]>;
 def int_dx_imad : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
 def int_dx_umad : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
 def int_dx_normalize : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty], [IntrNoMem]>;
diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
index 51dc3025f0c37..cf142806bb1df 100644
--- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
+++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
@@ -57,7 +57,6 @@ static bool isIntrinsicExpansion(Function &F) {
   case Intrinsic::dx_nclamp:
   case Intrinsic::dx_degrees:
   case Intrinsic::dx_lerp:
-  case Intrinsic::dx_length:
   case Intrinsic::dx_normalize:
   case Intrinsic::dx_fdot:
   case Intrinsic::dx_sdot:
@@ -292,32 +291,6 @@ static Value *expandAnyOrAllIntrinsic(CallInst *Orig,
   return Result;
 }
 
-static Value *expandLengthIntrinsic(CallInst *Orig) {
-  Value *X = Orig->getOperand(0);
-  IRBuilder<> Builder(Orig);
-  Type *Ty = X->getType();
-  Type *EltTy = Ty->getScalarType();
-
-  // Though dx.length does work on scalar type, we can optimize it to just emit
-  // fabs, in CGBuiltin.cpp. We shouldn't see a scalar type here because
-  // CGBuiltin.cpp should have emitted a fabs call.
-  Value *Elt = Builder.CreateExtractElement(X, (uint64_t)0);
-  auto *XVec = dyn_cast<FixedVectorType>(Ty);
-  unsigned XVecSize = XVec->getNumElements();
-  if (!(Ty->isVectorTy() && XVecSize > 1))
-    report_fatal_error(Twine("Invalid input type for length intrinsic"),
-                       /* gen_crash_diag=*/false);
-
-  Value *Sum = Builder.CreateFMul(Elt, Elt);
-  for (unsigned I = 1; I < XVecSize; I++) {
-    Elt = Builder.CreateExtractElement(X, I);
-    Value *Mul = Builder.CreateFMul(Elt, Elt);
-    Sum = Builder.CreateFAdd(Sum, Mul);
-  }
-  return Builder.CreateIntrinsic(EltTy, Intrinsic::sqrt, ArrayRef<Value *>{Sum},
-                                 nullptr, "elt.sqrt");
-}
-
 static Value *expandLerpIntrinsic(CallInst *Orig) {
   Value *X = Orig->getOperand(0);
   Value *Y = Orig->getOperand(1);
@@ -589,9 +562,6 @@ static bool expandIntrinsic(Function &F, CallInst *Orig) {
   case Intrinsic::dx_lerp:
     Result = expandLerpIntrinsic(Orig);
     break;
-  case Intrinsic::dx_length:
-    Result = expandLengthIntrinsic(Orig);
-    break;
   case Intrinsic::dx_normalize:
     Result = expandNormalizeIntrinsic(Orig);
     break;
diff --git a/llvm/test/CodeGen/DirectX/length.ll b/llvm/test/CodeGen/DirectX/length.ll
deleted file mode 100644
index fc5868a7f6e82..0000000000000
--- a/llvm/test/CodeGen/DirectX/length.ll
+++ /dev/null
@@ -1,116 +0,0 @@
-; RUN: opt -S -dxil-intrinsic-expansion < %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK
-; RUN: opt -S -dxil-intrinsic-expansion -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library < %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK
-
-; Make sure dxil operation function calls for length are generated for half/float.
-
-declare half @llvm.fabs.f16(half)
-declare half @llvm.dx.length.v2f16(<2 x half>)
-declare half @llvm.dx.length.v3f16(<3 x half>)
-declare half @llvm.dx.length.v4f16(<4 x half>)
-
-declare float @llvm.fabs.f32(float)
-declare float @llvm.dx.length.v2f32(<2 x float>)
-declare float @llvm.dx.length.v3f32(<3 x float>)
-declare float @llvm.dx.length.v4f32(<4 x float>)
-
-define noundef half @test_length_half2(<2 x half> noundef %p0) {
-entry:
-  ; CHECK: extractelement <2 x half> %{{.*}}, i64 0
-  ; CHECK: fmul half %{{.*}}, %{{.*}}
-  ; CHECK: extractelement <2 x half> %{{.*}}, i64 1
-  ; CHECK: fmul half %{{.*}}, %{{.*}}
-  ; CHECK: fadd half %{{.*}}, %{{.*}}
-  ; EXPCHECK: call half @llvm.sqrt.f16(half %{{.*}})
-  ; DOPCHECK: call half @dx.op.unary.f16(i32 24, half %{{.*}})
-
-  %hlsl.length = call half @llvm.dx.length.v2f16(<2 x half> %p0)
-  ret half %hlsl.length
-}
-
-define noundef half @test_length_half3(<3 x half> noundef %p0) {
-entry:
-  ; CHECK: extractelement <3 x half> %{{.*}}, i64 0
-  ; CHECK: fmul half %{{.*}}, %{{.*}}
-  ; CHECK: extractelement <3 x half> %{{.*}}, i64 1
-  ; CHECK: fmul half %{{.*}}, %{{.*}}
-  ; CHECK: fadd half %{{.*}}, %{{.*}}
-  ; CHECK: extractelement <3 x half> %{{.*}}, i64 2
-  ; CHECK: fmul half %{{.*}}, %{{.*}}
-  ; CHECK: fadd half %{{.*}}, %{{.*}}
-  ; EXPCHECK: call half @llvm.sqrt.f16(half %{{.*}})
-  ; DOPCHECK: call half @dx.op.unary.f16(i32 24, half %{{.*}})
-
-  %hlsl.length = call half @llvm.dx.length.v3f16(<3 x half> %p0)
-  ret half %hlsl.length
-}
-
-define noundef half @test_length_half4(<4 x half> noundef %p0) {
-entry:
-  ; CHECK: extractelement <4 x half> %{{.*}}, i64 0
-  ; CHECK: fmul half %{{.*}}, %{{.*}}
-  ; CHECK: extractelement <4 x half> %{{.*}}, i64 1
-  ; CHECK: fmul half %{{.*}}, %{{.*}}
-  ; CHECK: fadd half %{{.*}}, %{{.*}}
-  ; CHECK: extractelement <4 x half> %{{.*}}, i64 2
-  ; CHECK: fmul half %{{.*}}, %{{.*}}
-  ; CHECK: fadd half %{{.*}}, %{{.*}}
-  ; CHECK: extractelement <4 x half> %{{.*}}, i64 3
-  ;
CHECK: fmul half %{{.*}}, %{{.*}} - ; CHECK: fadd half %{{.*}}, %{{.*}} - ; EXPCHECK: call half @llvm.sqrt.f16(half %{{.*}}) - ; DOPCHECK: call half @dx.op.unary.f16(i32 24, half %{{.*}}) - - %hlsl.length = call half @llvm.dx.length.v4f16(<4 x half> %p0) - ret half %hlsl.length -} - -define noundef float @test_length_float2(<2 x float> noundef %p0) { -entry: - ; CHECK: extractelement <2 x float> %{{.*}}, i64 0 - ; CHECK: fmul float %{{.*}}, %{{.*}} - ; CHECK: extractelement <2 x float> %{{.*}}, i64 1 - ; CHECK: fmul float %{{.*}}, %{{.*}} - ; CHECK: fadd float %{{.*}}, %{{.*}} - ; EXPCHECK: call float @llvm.sqrt.f32(float %{{.*}}) - ; DOPCHECK: call float @dx.op.unary.f32(i32 24, float %{{.*}}) - - %hlsl.length = call float @llvm.dx.length.v2f32(<2 x float> %p0) - ret float %hlsl.length -} - -define noundef float @test_length_float3(<3 x float> noundef %p0) { -entry: - ; CHECK: extractelement <3 x float> %{{.*}}, i64 0 - ; CHECK: fmul float %{{.*}}, %{{.*}} - ; CHECK: extractelement <3 x float> %{{.*}}, i64 1 - ; CHECK: fmul float %{{.*}}, %{{.*}} - ; CHECK: fadd float %{{.*}}, %{{.*}} - ; CHECK: extractelement <3 x float> %{{.*}}, i64 2 - ; CHECK: fmul float %{{.*}}, %{{.*}} - ; CHECK: fadd float %{{.*}}, %{{.*}} - ; EXPCHECK: call float @llvm.sqrt.f32(float %{{.*}}) - ; DOPCHECK: call float @dx.op.unary.f32(i32 24, float %{{.*}}) - - %hlsl.length = call float @llvm.dx.length.v3f32(<3 x float> %p0) - ret float %hlsl.length -} - -define noundef float @test_length_float4(<4 x float> noundef %p0) { -entry: - ; CHECK: extractelement <4 x float> %{{.*}}, i64 0 - ; CHECK: fmul float %{{.*}}, %{{.*}} - ; CHECK: extractelement <4 x float> %{{.*}}, i64 1 - ; CHECK: fmul float %{{.*}}, %{{.*}} - ; CHECK: fadd float %{{.*}}, %{{.*}} - ; CHECK: extractelement <4 x float> %{{.*}}, i64 2 - ; CHECK: fmul float %{{.*}}, %{{.*}} - ; CHECK: fadd float %{{.*}}, %{{.*}} - ; CHECK: extractelement <4 x float> %{{.*}}, i64 3 - ; CHECK: fmul float %{{.*}}, %{{.*}} - ; CHECK: fadd float %{{.*}}, %{{.*}} - ; EXPCHECK: call float @llvm.sqrt.f32(float %{{.*}}) - ; DOPCHECK: call float @dx.op.unary.f32(i32 24, float %{{.*}}) - - %hlsl.length = call float @llvm.dx.length.v4f32(<4 x float> %p0) - ret float %hlsl.length -} diff --git a/llvm/test/CodeGen/DirectX/length_error.ll b/llvm/test/CodeGen/DirectX/length_error.ll deleted file mode 100644 index 143b41fc506e1..0000000000000 --- a/llvm/test/CodeGen/DirectX/length_error.ll +++ /dev/null @@ -1,10 +0,0 @@ -; RUN: not opt -S -dxil-intrinsic-expansion -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s - -; DXIL operation length does not support double overload type -; CHECK: Cannot create Sqrt operation: Invalid overload type - -define noundef double @test_length_double2(<2 x double> noundef %p0) { -entry: - %hlsl.length = call double @llvm.dx.length.v2f32(<2 x double> %p0) - ret double %hlsl.length -} diff --git a/llvm/test/CodeGen/DirectX/length_invalid_intrinsic_error.ll b/llvm/test/CodeGen/DirectX/length_invalid_intrinsic_error.ll deleted file mode 100644 index f722de2f9029e..0000000000000 --- a/llvm/test/CodeGen/DirectX/length_invalid_intrinsic_error.ll +++ /dev/null @@ -1,10 +0,0 @@ -; RUN: not opt -S -dxil-intrinsic-expansion -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s - -; DXIL operation length does not support 1-element vector types. 
-; CHECK: LLVM ERROR: Invalid input type for length intrinsic - -define noundef float @test_length_float(<1 x float> noundef %p0) { -entry: - %hlsl.length = call float @llvm.dx.length.v1f32(<1 x float> %p0) - ret float %hlsl.length -} diff --git a/llvm/test/CodeGen/DirectX/length_invalid_intrinsic_error_scalar.ll b/llvm/test/CodeGen/DirectX/length_invalid_intrinsic_error_scalar.ll deleted file mode 100644 index ac3a0513eb6b2..0000000000000 --- a/llvm/test/CodeGen/DirectX/length_invalid_intrinsic_error_scalar.ll +++ /dev/null @@ -1,10 +0,0 @@ -; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s - -; DXIL operation length does not support scalar types -; CHECK: error: invalid intrinsic signature - -define noundef float @test_length_float(float noundef %p0) { -entry: - %hlsl.length = call float @llvm.dx.length.f32(float %p0) - ret float %hlsl.length -} From 2e5c2982819625d84e0b61aea0ec00de859f0e95 Mon Sep 17 00:00:00 2001 From: Austin Kerbow Date: Fri, 10 Jan 2025 11:39:02 -0800 Subject: [PATCH 099/408] [AMDGPU] Add backward compatibility layer for kernarg preloading (#119167) Add a prologue to the kernel entry to handle cases where code designed for kernarg preloading is executed on hardware equipped with incompatible firmware. If hardware has compatible firmware the 256 bytes at the start of the kernel entry will be skipped. This skipping is done automatically by hardware that supports the feature. A pass is added which is intended to be run at the very end of the pipeline to avoid any optimizations that would assume the prologue is a real predecessor block to the actual code start. In reality we have two possible entry points for the function. 1. The optimized path that supports kernarg preloading which begins at an offset of 256 bytes. 2. The backwards compatible entry point which starts at offset 0. --- llvm/docs/AMDGPUUsage.rst | 5 +- llvm/lib/Target/AMDGPU/AMDGPU.h | 4 + .../Target/AMDGPU/AMDGPUArgumentUsageInfo.h | 3 + llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 6 - .../AMDGPU/AMDGPUPreloadKernArgProlog.cpp | 211 +++++ .../AMDGPU/AMDGPUPreloadKernArgProlog.h | 25 + .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 + llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 + .../MCTargetDesc/AMDGPUTargetStreamer.cpp | 23 - .../MCTargetDesc/AMDGPUTargetStreamer.h | 14 - .../Target/AMDGPU/SIMachineFunctionInfo.cpp | 2 + llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 5 + .../AMDGPU/preload-implicit-kernargs.ll | 477 ++++++++--- .../CodeGen/AMDGPU/preload-kernarg-header.ll | 57 +- llvm/test/CodeGen/AMDGPU/preload-kernargs.ll | 770 +++++++++++++----- 15 files changed, 1203 insertions(+), 402 deletions(-) create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.h diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 5c6034753eb4a..40b393224f15d 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -5914,10 +5914,7 @@ additional 256 bytes to the kernel_code_entry_byte_offset. This addition facilitates the incorporation of a prologue to the kernel entry to handle cases where code designed for kernarg preloading is executed on hardware equipped with incompatible firmware. If hardware has compatible firmware the 256 bytes at the -start of the kernel entry will be skipped. 
Additionally, the compiler backend
-may insert a trap instruction at the start of the kernel prologue to manage
-situations where kernarg preloading is attempted on hardware with incompatible
-firmware.
+start of the kernel entry will be skipped.
 
 With code object V5 and later, hidden kernel arguments that are normally
 accessed through the Implicit Argument Ptr, may be preloaded into User SGPRs.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index b9769a1baf4d1..ad5ee75f0c5d1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -64,6 +64,7 @@ createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM = nullptr);
 ModulePass *createAMDGPULowerBufferFatPointersPass();
 FunctionPass *createSIModeRegisterPass();
 FunctionPass *createGCNPreRAOptimizationsPass();
+FunctionPass *createAMDGPUPreloadKernArgPrologLegacyPass();
 
 struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> {
   AMDGPUSimplifyLibCallsPass() {}
@@ -230,6 +231,9 @@ extern char &AMDGPUPerfHintAnalysisLegacyID;
 void initializeGCNRegPressurePrinterPass(PassRegistry &);
 extern char &GCNRegPressurePrinterID;
 
+void initializeAMDGPUPreloadKernArgPrologLegacyPass(PassRegistry &);
+extern char &AMDGPUPreloadKernArgPrologLegacyID;
+
 // Passes common to R600 and SI
 FunctionPass *createAMDGPUPromoteAlloca();
 void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index 06b2f181c276c..e07d47381ecca 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
 
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/CodeGen/Register.h"
 #include "llvm/Pass.h"
@@ -161,6 +162,8 @@ struct AMDGPUFunctionArgInfo {
   // Map the index of preloaded kernel arguments to its descriptor.
   SmallDenseMap<int, KernArgPreloadDescriptor> PreloadKernArgs{};
+  // The first user SGPR allocated for kernarg preloading.
+  Register FirstKernArgPreloadReg;
 
   std::tuple<const ArgDescriptor *, const TargetRegisterClass *, LLT>
   getPreloadedValue(PreloadedValue Value) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 90c341ac0819c..737b2f740d6f7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -207,12 +207,6 @@ void AMDGPUAsmPrinter::emitFunctionBodyStart() {
 
   if (STM.isAmdHsaOS())
     HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
-
-  if (MFI.getNumKernargPreloadedSGPRs() > 0) {
-    assert(AMDGPU::hasKernargPreload(STM));
-    getTargetStreamer()->EmitKernargPreloadHeader(*getGlobalSTI(),
-                                                  STM.isAmdHsaOS());
-  }
 }
 
 void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp
new file mode 100644
index 0000000000000..b3a2139dfd24e
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp
@@ -0,0 +1,211 @@
+//===- AMDGPUPreloadKernArgProlog.cpp - Preload KernArg Prolog ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass creates a backward compatibility layer for kernel argument
+/// preloading in situations where code is compiled with kernel argument
+/// preloading enabled but executed on hardware without firmware support for it.
+///
+/// To avoid recompilation, the pass inserts a block at the beginning of the
+/// program that loads the kernel arguments into SGPRs using s_load
+/// instructions. This sets up the registers exactly as they would be on systems
+/// with compatible firmware.
+///
+/// This effectively creates two entry points for the kernel. Firmware that
+/// supports the feature will automatically jump past the first 256 bytes of the
+/// program, skipping the compatibility layer and directly starting execution on
+/// the optimized code path.
+///
+/// This pass should be run as late as possible to prevent any optimizations
+/// that might assume the padding is dead code or that the added prologue is a
+/// true predecessor of the kernel entry block.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUPreloadKernArgProlog.h"
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/TargetParser/TargetParser.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-preload-kern-arg-prolog"
+
+namespace {
+
+// Used to build s_loads mapping user SGPRs to kernel arguments
+struct LoadConfig {
+  unsigned Size;
+  const TargetRegisterClass *RegClass;
+  unsigned Opcode;
+  Register LoadReg = Register();
+};
+
+class AMDGPUPreloadKernArgProlog {
+public:
+  AMDGPUPreloadKernArgProlog(MachineFunction &MF);
+
+  bool run();
+
+private:
+  MachineFunction &MF;
+  const GCNSubtarget &ST;
+  const SIMachineFunctionInfo &MFI;
+  const SIInstrInfo &TII;
+  const TargetRegisterInfo &TRI;
+
+  // Create a new block before the entry point to the kernel. Firmware that
+  // supports preloading kernel arguments will automatically jump past this
+  // block to the alternative kernel entry point.
+  void createBackCompatBlock(unsigned NumKernArgPreloadSGPRs);
+
+  // Add instructions to load kernel arguments into SGPRs.
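+  // Loads are issued at the widest width the preload SGPR alignment allows
+  // (dwordx8, then dwordx4, dwordx2, and finally single dwords), so a run of
+  // contiguous, aligned user SGPRs is typically filled by one wide s_load;
+  // see getLoadParameters below.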
+  void addBackCompatLoads(MachineBasicBlock *BackCompatMBB,
+                          Register KernArgSegmentPtr,
+                          unsigned NumKernArgPreloadSGPRs);
+};
+
+class AMDGPUPreloadKernArgPrologLegacy : public MachineFunctionPass {
+public:
+  static char ID;
+
+  AMDGPUPreloadKernArgPrologLegacy() : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override {
+    return "AMDGPU Preload Kernel Arguments Prolog";
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+} // end anonymous namespace
+
+char AMDGPUPreloadKernArgPrologLegacy::ID = 0;
+
+INITIALIZE_PASS(AMDGPUPreloadKernArgPrologLegacy, DEBUG_TYPE,
+                "AMDGPU Preload Kernel Arguments Prolog", false, false)
+
+char &llvm::AMDGPUPreloadKernArgPrologLegacyID =
+    AMDGPUPreloadKernArgPrologLegacy::ID;
+
+FunctionPass *llvm::createAMDGPUPreloadKernArgPrologLegacyPass() {
+  return new AMDGPUPreloadKernArgPrologLegacy();
+}
+
+bool AMDGPUPreloadKernArgPrologLegacy::runOnMachineFunction(
+    MachineFunction &MF) {
+  return AMDGPUPreloadKernArgProlog(MF).run();
+}
+
+AMDGPUPreloadKernArgProlog::AMDGPUPreloadKernArgProlog(MachineFunction &MF)
+    : MF(MF), ST(MF.getSubtarget<GCNSubtarget>()),
+      MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(*ST.getInstrInfo()),
+      TRI(*ST.getRegisterInfo()) {}
+
+bool AMDGPUPreloadKernArgProlog::run() {
+  if (!ST.hasKernargPreload())
+    return false;
+
+  unsigned NumKernArgPreloadSGPRs = MFI.getNumKernargPreloadedSGPRs();
+  if (!NumKernArgPreloadSGPRs)
+    return false;
+
+  createBackCompatBlock(NumKernArgPreloadSGPRs);
+  return true;
+}
+
+void AMDGPUPreloadKernArgProlog::createBackCompatBlock(
+    unsigned NumKernArgPreloadSGPRs) {
+  auto KernelEntryMBB = MF.begin();
+  MachineBasicBlock *BackCompatMBB = MF.CreateMachineBasicBlock();
+  MF.insert(KernelEntryMBB, BackCompatMBB);
+
+  assert(MFI.getUserSGPRInfo().hasKernargSegmentPtr() &&
+         "Kernel argument segment pointer register not set.");
+  Register KernArgSegmentPtr = MFI.getArgInfo().KernargSegmentPtr.getRegister();
+  BackCompatMBB->addLiveIn(KernArgSegmentPtr);
+
+  // Load kernel arguments to SGPRs
+  addBackCompatLoads(BackCompatMBB, KernArgSegmentPtr, NumKernArgPreloadSGPRs);
+
+  // Wait for loads to complete
+  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
+  unsigned Waitcnt =
+      AMDGPU::encodeWaitcnt(IV, getVmcntBitMask(IV), getExpcntBitMask(IV), 0);
+  BuildMI(BackCompatMBB, DebugLoc(), TII.get(AMDGPU::S_WAITCNT))
+      .addImm(Waitcnt);
+
+  // Branch to kernel start
+  BuildMI(BackCompatMBB, DebugLoc(), TII.get(AMDGPU::S_BRANCH))
+      .addMBB(&*KernelEntryMBB);
+  BackCompatMBB->addSuccessor(&*KernelEntryMBB);
+
+  // Create a new basic block for padding to 256 bytes
+  MachineBasicBlock *PadMBB = MF.CreateMachineBasicBlock();
+  MF.insert(++BackCompatMBB->getIterator(), PadMBB);
+  PadMBB->setAlignment(Align(256));
+  PadMBB->addSuccessor(&*KernelEntryMBB);
+}
+
+/// Find the largest possible load size that fits with SGPR alignment
+static LoadConfig getLoadParameters(const TargetRegisterInfo &TRI,
+                                    Register KernArgPreloadSGPR,
+                                    unsigned NumKernArgPreloadSGPRs) {
+  static constexpr LoadConfig Configs[] = {
+      {8, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM},
+      {4, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM},
+      {2, &AMDGPU::SReg_64RegClass, AMDGPU::S_LOAD_DWORDX2_IMM}};
+
+  for (const auto &Config : Configs) {
+    if (NumKernArgPreloadSGPRs >= Config.Size) {
+      Register LoadReg = TRI.getMatchingSuperReg(KernArgPreloadSGPR,
+                                                 AMDGPU::sub0, Config.RegClass);
+      if (LoadReg) {
+        LoadConfig C(Config);
+        C.LoadReg = LoadReg;
+        return C;
+      }
+    }
+  }
+
+  // Fallback to a single register
+
return LoadConfig{1, &AMDGPU::SReg_32RegClass, AMDGPU::S_LOAD_DWORD_IMM,
+                    KernArgPreloadSGPR};
+}
+
+void AMDGPUPreloadKernArgProlog::addBackCompatLoads(
+    MachineBasicBlock *BackCompatMBB, Register KernArgSegmentPtr,
+    unsigned NumKernArgPreloadSGPRs) {
+  Register KernArgPreloadSGPR = MFI.getArgInfo().FirstKernArgPreloadReg;
+  unsigned Offset = 0;
+  // Fill all user SGPRs used for kernarg preloading with sequential data from
+  // the kernarg segment
+  while (NumKernArgPreloadSGPRs > 0) {
+    LoadConfig Config =
+        getLoadParameters(TRI, KernArgPreloadSGPR, NumKernArgPreloadSGPRs);
+
+    BuildMI(BackCompatMBB, DebugLoc(), TII.get(Config.Opcode), Config.LoadReg)
+        .addReg(KernArgSegmentPtr)
+        .addImm(Offset)
+        .addImm(0);
+
+    Offset += 4 * Config.Size;
+    KernArgPreloadSGPR = KernArgPreloadSGPR.asMCReg() + Config.Size;
+    NumKernArgPreloadSGPRs -= Config.Size;
+  }
+}
+
+PreservedAnalyses
+AMDGPUPreloadKernArgPrologPass::run(MachineFunction &MF,
+                                    MachineFunctionAnalysisManager &) {
+  if (!AMDGPUPreloadKernArgProlog(MF).run())
+    return PreservedAnalyses::all();
+
+  return PreservedAnalyses::none();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.h b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.h
new file mode 100644
index 0000000000000..1216132923ade
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.h
@@ -0,0 +1,25 @@
+//===- AMDGPUPreloadKernargProlog.h ----------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_PRELOAD_KERNARG_PROLOG_H
+#define LLVM_LIB_TARGET_AMDGPU_PRELOAD_KERNARG_PROLOG_H
+
+#include "llvm/CodeGen/MachinePassManager.h"
+
+namespace llvm {
+
+class AMDGPUPreloadKernArgPrologPass
+    : public PassInfoMixin<AMDGPUPreloadKernArgPrologPass> {
+public:
+  PreservedAnalyses run(MachineFunction &MF,
+                        MachineFunctionAnalysisManager &AM);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_PRELOAD_KERNARG_PROLOG_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 7256eec89008a..0c9d7d00a8a4a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -540,6 +540,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeGCNPreRALongBranchRegPass(*PR);
   initializeGCNRewritePartialRegUsesPass(*PR);
   initializeGCNRegPressurePrinterPass(*PR);
+  initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -1669,6 +1670,7 @@ void GCNPassConfig::addPreEmitPass() {
     addPass(&AMDGPUInsertDelayAluID);
 
   addPass(&BranchRelaxationPassID);
+  addPass(createAMDGPUPreloadKernArgPrologLegacyPass());
 }
 
 TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 03038caab521d..97a0d59cfeeda 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -88,6 +88,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUPerfHintAnalysis.cpp
   AMDGPUPostLegalizerCombiner.cpp
   AMDGPUPreLegalizerCombiner.cpp
+  AMDGPUPreloadKernArgProlog.cpp
   AMDGPUPrintfRuntimeBinding.cpp
   AMDGPUPromoteAlloca.cpp
   AMDGPUPromoteKernelArguments.cpp
diff --git
a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index ffde4d33f1341..eccd77d6c00f0 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -338,15 +338,6 @@ bool AMDGPUTargetAsmStreamer::EmitHSAMetadata( return true; } -bool AMDGPUTargetAsmStreamer::EmitKernargPreloadHeader( - const MCSubtargetInfo &STI, bool TrapEnabled) { - OS << (TrapEnabled ? "\ts_trap 2" : "\ts_endpgm") - << " ; Kernarg preload header. Trap with incompatible firmware that " - "doesn't support preloading kernel arguments.\n"; - OS << "\t.fill 63, 4, 0xbf800000 ; s_nop 0\n"; - return true; -} - bool AMDGPUTargetAsmStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) { const uint32_t Encoded_s_code_end = 0xbf9f0000; const uint32_t Encoded_s_nop = 0xbf800000; @@ -935,20 +926,6 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata(msgpack::Document &HSAMetadataDoc, return true; } -bool AMDGPUTargetELFStreamer::EmitKernargPreloadHeader( - const MCSubtargetInfo &STI, bool TrapEnabled) { - const uint32_t Encoded_s_nop = 0xbf800000; - const uint32_t Encoded_s_trap = 0xbf920002; - const uint32_t Encoded_s_endpgm = 0xbf810000; - const uint32_t TrapInstr = TrapEnabled ? Encoded_s_trap : Encoded_s_endpgm; - MCStreamer &OS = getStreamer(); - OS.emitInt32(TrapInstr); - for (int i = 0; i < 63; ++i) { - OS.emitInt32(Encoded_s_nop); - } - return true; -} - bool AMDGPUTargetELFStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) { const uint32_t Encoded_s_code_end = 0xbf9f0000; const uint32_t Encoded_s_nop = 0xbf800000; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index 6a91ad06de5d1..9c49020850584 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -96,12 +96,6 @@ class AMDGPUTargetStreamer : public MCTargetStreamer { /// \returns True on success, false on failure. virtual bool EmitCodeEnd(const MCSubtargetInfo &STI) { return true; } - /// \returns True on success, false on failure. - virtual bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI, - bool TrapEnabled) { - return true; - } - virtual void EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName, const AMDGPU::MCKernelDescriptor &KernelDescriptor, @@ -168,10 +162,6 @@ class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer { /// \returns True on success, false on failure. bool EmitCodeEnd(const MCSubtargetInfo &STI) override; - /// \returns True on success, false on failure. - bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI, - bool TrapEnabled) override; - void EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName, const AMDGPU::MCKernelDescriptor &KernelDescriptor, @@ -225,10 +215,6 @@ class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer { /// \returns True on success, false on failure. bool EmitCodeEnd(const MCSubtargetInfo &STI) override; - /// \returns True on success, false on failure. 
-  bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI,
-                                bool TrapEnabled) override;
-
 void EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName,
                                 const AMDGPU::MCKernelDescriptor &KernelDescriptor,
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 8d7df73f3cee8..169f1369fb543 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -262,6 +262,8 @@ SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
   // If the available register tuples are aligned with the kernarg to be
   // preloaded use that register, otherwise we need to use a set of SGPRs and
   // merge them.
+  if (!ArgInfo.FirstKernArgPreloadReg)
+    ArgInfo.FirstKernArgPreloadReg = getNextUserSGPR();
   Register PreloadReg = TRI.getMatchingSuperReg(getNextUserSGPR(), AMDGPU::sub0, RC);
   if (PreloadReg &&
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index e77f4f69e265b..b2708cf13cbf3 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -144,6 +144,7 @@
 ; GCN-O0-NEXT: SI Final Branch Preparation
 ; GCN-O0-NEXT: Post RA hazard recognizer
 ; GCN-O0-NEXT: Branch relaxation pass
+; GCN-O0-NEXT: AMDGPU Preload Kernel Arguments Prolog
 ; GCN-O0-NEXT: Register Usage Information Collector Pass
 ; GCN-O0-NEXT: Remove Loads Into Fake Uses
 ; GCN-O0-NEXT: Live DEBUG_VALUE analysis
@@ -424,6 +425,7 @@
 ; GCN-O1-NEXT: Post RA hazard recognizer
 ; GCN-O1-NEXT: AMDGPU Insert Delay ALU
 ; GCN-O1-NEXT: Branch relaxation pass
+; GCN-O1-NEXT: AMDGPU Preload Kernel Arguments Prolog
 ; GCN-O1-NEXT: Register Usage Information Collector Pass
 ; GCN-O1-NEXT: Remove Loads Into Fake Uses
 ; GCN-O1-NEXT: Live DEBUG_VALUE analysis
@@ -732,6 +734,7 @@
 ; GCN-O1-OPTS-NEXT: Post RA hazard recognizer
 ; GCN-O1-OPTS-NEXT: AMDGPU Insert Delay ALU
 ; GCN-O1-OPTS-NEXT: Branch relaxation pass
+; GCN-O1-OPTS-NEXT: AMDGPU Preload Kernel Arguments Prolog
 ; GCN-O1-OPTS-NEXT: Register Usage Information Collector Pass
 ; GCN-O1-OPTS-NEXT: Remove Loads Into Fake Uses
 ; GCN-O1-OPTS-NEXT: Live DEBUG_VALUE analysis
@@ -1046,6 +1049,7 @@
 ; GCN-O2-NEXT: Post RA hazard recognizer
 ; GCN-O2-NEXT: AMDGPU Insert Delay ALU
 ; GCN-O2-NEXT: Branch relaxation pass
+; GCN-O2-NEXT: AMDGPU Preload Kernel Arguments Prolog
 ; GCN-O2-NEXT: Register Usage Information Collector Pass
 ; GCN-O2-NEXT: Remove Loads Into Fake Uses
 ; GCN-O2-NEXT: Live DEBUG_VALUE analysis
@@ -1373,6 +1377,7 @@
 ; GCN-O3-NEXT: Post RA hazard recognizer
 ; GCN-O3-NEXT: AMDGPU Insert Delay ALU
 ; GCN-O3-NEXT: Branch relaxation pass
+; GCN-O3-NEXT: AMDGPU Preload Kernel Arguments Prolog
 ; GCN-O3-NEXT: Register Usage Information Collector Pass
 ; GCN-O3-NEXT: Remove Loads Into Fake Uses
 ; GCN-O3-NEXT: Live DEBUG_VALUE analysis
diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
index 0c6d8dce193da..31beb7a3cce24 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
@@ -4,18 +4,28 @@
 
 define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out) #0 {
 ; GFX940-LABEL: preload_block_count_x:
-; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB0_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB0_0: ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: preload_block_count_x: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB0_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB0_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s8 ; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] @@ -28,18 +38,30 @@ define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out) #0 define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) inreg %out, i32 inreg) #0 { ; GFX940-LABEL: preload_unused_arg_block_count_x: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NEXT: s_load_dword s6, s[0:1], 0x10 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB1_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB1_0: ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_mov_b32_e32 v1, s6 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: preload_unused_arg_block_count_x: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB1_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB1_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s10 ; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] @@ -52,9 +74,13 @@ define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) inr define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) inreg %out, i256 inreg) { ; GFX940-LABEL: no_free_sgprs_block_count_x: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB2_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB2_0: ; GFX940-NEXT: s_load_dword s0, s[4:5], 0x28 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) @@ -63,9 +89,13 @@ define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) inreg %o ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: no_free_sgprs_block_count_x: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0x0 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB2_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB2_0: ; GFX90a-NEXT: s_load_dword s0, s[8:9], 0x28 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) @@ -135,9 +165,13 @@ define amdgpu_kernel void @mixed_inreg_block_count_x(ptr addrspace(1) %out, i32 define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inreg %out) #0 { ; GFX940-LABEL: incorrect_type_i64_block_count_x: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB5_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB5_0: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) @@ -146,9 +180,13 @@ define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inr ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: incorrect_type_i64_block_count_x: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB5_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB5_0: ; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX90a-NEXT: v_mov_b32_e32 v2, 0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) @@ -163,9 +201,13 @@ define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inr define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inreg %out) #0 { ; GFX940-LABEL: incorrect_type_i16_block_count_x: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB6_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB6_0: ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x8 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) @@ -174,9 +216,13 @@ define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inr ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: incorrect_type_i16_block_count_x: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB6_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB6_0: ; GFX90a-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) @@ -191,18 +237,28 @@ define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inr define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) inreg %out) #0 { ; GFX940-LABEL: preload_block_count_y: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB7_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB7_0: ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_mov_b32_e32 v1, s5 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: preload_block_count_y: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB7_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB7_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s9 ; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] @@ -216,9 +272,13 @@ define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) inreg %out) #0 define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) #0 { ; GFX940-LABEL: random_incorrect_offset: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB8_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB8_0: ; GFX940-NEXT: s_mov_b32 s4, 8 ; GFX940-NEXT: s_load_dword s0, s[0:1], s4 offset:0x2 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 @@ -228,9 +288,13 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: random_incorrect_offset: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB8_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB8_0: ; GFX90a-NEXT: s_mov_b32 s0, 8 ; GFX90a-NEXT: s_load_dword s0, s[4:5], s0 offset:0x2 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 @@ -247,18 +311,30 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) inreg %out) #0 { ; GFX940-LABEL: preload_block_count_z: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NEXT: s_load_dword s6, s[0:1], 0x10 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB9_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB9_0: ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_mov_b32_e32 v1, s6 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: preload_block_count_z: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB9_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB9_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s10 ; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] @@ -272,9 +348,15 @@ define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) inreg %out) #0 define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspace(1) inreg %out, i8 inreg %val) #0 { ; GFX940-LABEL: preload_block_count_x_imparg_align_ptr_i8: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NEXT: s_load_dword s6, s[0:1], 0x10 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB10_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB10_0: ; GFX940-NEXT: s_and_b32 s0, s4, 0xff ; GFX940-NEXT: s_add_i32 s0, s6, s0 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 @@ -283,9 +365,15 @@ define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspa ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: preload_block_count_x_imparg_align_ptr_i8: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB10_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB10_0: ; GFX90a-NEXT: s_and_b32 s0, s8, 0xff ; GFX90a-NEXT: s_add_i32 s0, s10, s0 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 @@ -302,9 +390,15 @@ define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspa define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) inreg %out) #0 { ; GFX940-LABEL: preload_block_count_xyz: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NEXT: s_load_dword s6, s[0:1], 0x10 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB11_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB11_0: ; GFX940-NEXT: v_mov_b32_e32 v3, 0 ; GFX940-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-NEXT: v_mov_b32_e32 v1, s5 @@ -313,9 +407,15 @@ define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) inreg %out) ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: preload_block_count_xyz: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB11_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB11_0: ; GFX90a-NEXT: v_mov_b32_e32 v3, 0 ; GFX90a-NEXT: v_mov_b32_e32 v0, s8 ; GFX90a-NEXT: v_mov_b32_e32 v1, s9 @@ -338,9 +438,14 @@ define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) inreg %out) define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) inreg %out) #0 { ; GFX940-LABEL: preload_workgroup_size_x: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB12_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB12_0: ; GFX940-NEXT: s_and_b32 s0, s7, 0xffff ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_mov_b32_e32 v1, s0 @@ -348,9 +453,14 @@ define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) inreg %out) ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: preload_workgroup_size_x: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB12_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB12_0: ; GFX90a-NEXT: s_and_b32 s0, s11, 0xffff ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 @@ -366,9 +476,14 @@ define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) inreg %out) define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) inreg %out) #0 { ; GFX940-LABEL: preload_workgroup_size_y: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB13_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB13_0: ; GFX940-NEXT: s_lshr_b32 s0, s7, 16 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_mov_b32_e32 v1, s0 @@ -376,9 +491,14 @@ define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) inreg %out) ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: preload_workgroup_size_y: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB13_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB13_0: ; GFX90a-NEXT: s_lshr_b32 s0, s11, 16 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 @@ -394,9 +514,15 @@ define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) inreg %out) define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) inreg %out) #0 { ; GFX940-LABEL: preload_workgroup_size_z: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x18 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB14_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB14_0: ; GFX940-NEXT: s_and_b32 s0, s8, 0xffff ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_mov_b32_e32 v1, s0 @@ -404,9 +530,15 @@ define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) inreg %out) ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: preload_workgroup_size_z: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x18 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB14_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB14_0: ; GFX90a-NEXT: s_and_b32 s0, s12, 0xffff ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 @@ -422,9 +554,15 @@ define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) inreg %out) define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) inreg %out) #0 { ; GFX940-LABEL: preload_workgroup_size_xyz: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x18 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB15_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB15_0: ; GFX940-NEXT: s_lshr_b32 s0, s7, 16 ; GFX940-NEXT: s_and_b32 s1, s7, 0xffff ; GFX940-NEXT: s_and_b32 s4, s8, 0xffff @@ -436,9 +574,15 @@ define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) inreg %ou ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: preload_workgroup_size_xyz: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x18 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB15_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB15_0: ; GFX90a-NEXT: s_lshr_b32 s0, s11, 16 ; GFX90a-NEXT: s_and_b32 s1, s11, 0xffff ; GFX90a-NEXT: s_and_b32 s2, s12, 0xffff @@ -467,9 +611,15 @@ define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) inreg %ou define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) inreg %out) #0 { ; GFX940-LABEL: preload_remainder_x: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x18 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB16_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB16_0: ; GFX940-NEXT: s_lshr_b32 s0, s8, 16 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_mov_b32_e32 v1, s0 @@ -477,9 +627,15 @@ define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) inreg %out) #0 { ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: preload_remainder_x: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x18 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB16_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB16_0: ; GFX90a-NEXT: s_lshr_b32 s0, s12, 16 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 @@ -495,9 +651,15 @@ define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) inreg %out) #0 { define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) inreg %out) #0 { ; GFX940-LABEL: preloadremainder_y: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB17_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB17_0: ; GFX940-NEXT: s_and_b32 s0, s9, 0xffff ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_mov_b32_e32 v1, s0 @@ -505,9 +667,15 @@ define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) inreg %out) #0 { ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: preloadremainder_y: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB17_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB17_0: ; GFX90a-NEXT: s_and_b32 s0, s13, 0xffff ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 @@ -523,9 +691,15 @@ define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) inreg %out) #0 { define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) inreg %out) #0 { ; GFX940-LABEL: preloadremainder_z: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB18_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB18_0: ; GFX940-NEXT: s_lshr_b32 s0, s9, 16 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_mov_b32_e32 v1, s0 @@ -533,9 +707,15 @@ define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) inreg %out) #0 { ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: preloadremainder_z: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB18_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB18_0: ; GFX90a-NEXT: s_lshr_b32 s0, s13, 16 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 @@ -551,9 +731,15 @@ define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) inreg %out) #0 { define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) inreg %out) #0 { ; GFX940-LABEL: preloadremainder_xyz: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB19_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB19_0: ; GFX940-NEXT: s_lshr_b32 s0, s9, 16 ; GFX940-NEXT: s_lshr_b32 s1, s8, 16 ; GFX940-NEXT: s_and_b32 s4, s9, 0xffff @@ -565,9 +751,15 @@ define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) inreg %out) #0 ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: preloadremainder_xyz: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB19_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB19_0: ; GFX90a-NEXT: s_lshr_b32 s0, s13, 16 ; GFX90a-NEXT: s_lshr_b32 s1, s12, 16 ; GFX90a-NEXT: s_and_b32 s2, s13, 0xffff @@ -596,9 +788,13 @@ define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) inreg %out) #0 define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inreg %out) { ; GFX940-LABEL: no_free_sgprs_preloadremainder_z: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB20_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB20_0: ; GFX940-NEXT: s_lshr_b32 s0, s15, 16 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_mov_b32_e32 v1, s0 @@ -606,9 +802,13 @@ define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inr ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: no_free_sgprs_preloadremainder_z: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB20_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB20_0: ; GFX90a-NEXT: s_load_dword s0, s[8:9], 0x1c ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) @@ -628,18 +828,31 @@ define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inr define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg %out, i192 inreg %t0, i32 inreg %t1) #0 { ; GFX940-LABEL: preload_block_max_user_sgprs: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x8 +; GFX940-NEXT: s_load_dword s12, s[0:1], 0x28 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB21_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB21_0: ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_mov_b32_e32 v1, s12 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: preload_block_max_user_sgprs: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 +; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x20 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB21_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB21_0: ; GFX90a-NEXT: s_load_dword s0, s[4:5], 0x28 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) @@ -654,9 +867,15 @@ define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg % define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(ptr addrspace(1) inreg %out) #0 { ; GFX940-LABEL: preload_block_count_z_workgroup_size_z_remainder_z: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-NEXT: ; %bb.0:
+; GFX940: ; %bb.1:
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: s_branch .LBB22_0
+; GFX940-NEXT: .p2align 8
+; GFX940-NEXT: ; %bb.2:
+; GFX940-NEXT: .LBB22_0:
 ; GFX940-NEXT: s_lshr_b32 s0, s9, 16
 ; GFX940-NEXT: s_and_b32 s1, s8, 0xffff
 ; GFX940-NEXT: v_mov_b32_e32 v3, 0
@@ -667,9 +886,15 @@ define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(pt
 ; GFX940-NEXT: s_endpgm
 ;
 ; GFX90a-LABEL: preload_block_count_z_workgroup_size_z_remainder_z:
-; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-NEXT: ; %bb.0:
+; GFX90a: ; %bb.1:
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NEXT: s_branch .LBB22_0
+; GFX90a-NEXT: .p2align 8
+; GFX90a-NEXT: ; %bb.2:
+; GFX90a-NEXT: .LBB22_0:
 ; GFX90a-NEXT: s_lshr_b32 s0, s13, 16
 ; GFX90a-NEXT: s_and_b32 s1, s12, 0xffff
 ; GFX90a-NEXT: v_mov_b32_e32 v3, 0
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll
index 5a03381447d0e..58f0b9657476c 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll
@@ -1,23 +1,52 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,HSA,ASM %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx940 --disassemble - | FileCheck -check-prefixes=GCN,HSA,OBJ %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx940 --disassemble - | FileCheck -check-prefixes=GCN,NON-HSA,OBJ %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -asm-verbose=0 < %s | llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=obj | llvm-objdump --arch=amdgcn --mcpu=gfx940 --disassemble - | FileCheck -check-prefixes=GCN,HSA,OBJ %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -asm-verbose=0 < %s | FileCheck -check-prefixes=ASM %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx942 --disassemble - | FileCheck -check-prefixes=OBJ %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-kernarg-preload-count=1 -asm-verbose=0 < %s | llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj | llvm-objdump --arch=amdgcn --mcpu=gfx942 --disassemble - | FileCheck -check-prefixes=OBJ %s
 
-; GCN: preload_kernarg_header
-; HSA: s_trap 2
-; NON-HSA: s_endpgm
-; ASM: .fill 63, 4, 0xbf800000 ; s_nop 0
-; OBJ-COUNT-63: s_nop 0
-define amdgpu_kernel void @preload_kernarg_header(ptr inreg %arg) {
+; OBJ: preload_ptr_kernarg_header
+; OBJ-COUNT-60: s_nop 0
+define amdgpu_kernel void @preload_ptr_kernarg_header(ptr inreg %arg) {
+; ASM-LABEL: preload_ptr_kernarg_header:
+; ASM: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; ASM-NEXT: s_waitcnt lgkmcnt(0)
+; ASM-NEXT: s_branch .LBB0_0
+; ASM-NEXT: .p2align 8
+; ASM-NEXT: .LBB0_0:
+; ASM-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; ASM-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
+; ASM-NEXT: s_endpgm
   store ptr %arg, ptr %arg
   ret void
 }
 
-; GCN: non_kernel_function
-; GCN-NOT: s_trap 2
-; GCN-NOT: s_nop 0
-; GCN: flat_store
+; OBJ: preload_i32_kernarg_header
+; OBJ-COUNT-58: s_nop 0
+define amdgpu_kernel void @preload_i32_kernarg_header(ptr inreg %arg, i32 inreg %arg1) {
+; ASM-LABEL: preload_i32_kernarg_header:
+; ASM: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; ASM-NEXT: s_load_dword s10, s[4:5], 0x8
+; ASM-NEXT: s_waitcnt lgkmcnt(0)
+; ASM-NEXT: s_branch .LBB1_0
+; ASM-NEXT: .p2align 8
+; ASM-NEXT: .LBB1_0:
+; ASM-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; ASM-NEXT: v_mov_b32_e32 v2, s10
+; ASM-NEXT: flat_store_dword v[0:1], v2
+; ASM-NEXT: s_endpgm
+  store i32 %arg1, ptr %arg
+  ret void
+}
+
+; OBJ: non_kernel_function
+; ASM: non_kernel_function
+; OBJ-NOT: s_branch
+; ASM-NOT: s_branch
 define void @non_kernel_function(ptr %arg) {
+; ASM-LABEL: non_kernel_function:
+; ASM: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ASM-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
+; ASM-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; ASM-NEXT: s_setpc_b64 s[30:31]
   store ptr %arg, ptr %arg
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
index 2aaffd7121ae9..0f60888bcb2f5 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
@@ -5,9 +5,14 @@
 define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) inreg %out, i8 inreg %arg0) #0 {
 ; GFX940-LABEL: ptr1_i8:
-; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-NEXT: ; %bb.0:
+; GFX940: ; %bb.1:
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: s_branch .LBB0_0
+; GFX940-NEXT: .p2align 8
+; GFX940-NEXT: ; %bb.2:
+; GFX940-NEXT: .LBB0_0:
 ; GFX940-NEXT: s_and_b32 s0, s4, 0xff
 ; GFX940-NEXT: v_mov_b32_e32 v0, 0
 ; GFX940-NEXT: v_mov_b32_e32 v1, s0
@@ -15,9 +20,14 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) inreg %out, i8 inreg %arg0)
 ; GFX940-NEXT: s_endpgm
 ;
 ; GFX90a-LABEL: ptr1_i8:
-; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-NEXT: ; %bb.0:
+; GFX90a: ; %bb.1:
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NEXT: s_branch .LBB0_0
+; GFX90a-NEXT: .p2align 8
+; GFX90a-NEXT: ; %bb.2:
+; GFX90a-NEXT: .LBB0_0:
 ; GFX90a-NEXT: s_and_b32 s0, s8, 0xff
 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0
 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0
@@ -30,9 +40,14 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) inreg %out, i8 inreg %arg0)
 define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) inreg %out, i8 zeroext inreg %arg0) #0 {
 ; GFX940-LABEL: ptr1_i8_zext_arg:
-; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB1_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB1_0: ; GFX940-NEXT: s_and_b32 s0, s4, 0xff ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_mov_b32_e32 v1, s0 @@ -40,9 +55,14 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) inreg %out, i8 zero ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: ptr1_i8_zext_arg: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB1_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB1_0: ; GFX90a-NEXT: s_and_b32 s0, s8, 0xff ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 @@ -55,9 +75,14 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) inreg %out, i8 zero define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %arg0) #0 { ; GFX940-LABEL: ptr1_i16_preload_arg: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB2_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB2_0: ; GFX940-NEXT: s_and_b32 s0, s4, 0xffff ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_mov_b32_e32 v1, s0 @@ -65,9 +90,14 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) inreg %out, i16 ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: ptr1_i16_preload_arg: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB2_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB2_0: ; GFX90a-NEXT: s_and_b32 s0, s8, 0xffff ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 @@ -80,18 +110,28 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) inreg %out, i16 define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) inreg %out, i32 inreg %arg0) #0 { ; GFX940-LABEL: ptr1_i32_preload_arg: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB3_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB3_0: ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: ptr1_i32_preload_arg: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB3_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB3_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s8 ; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] @@ -103,9 +143,15 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) inreg %out, i32 define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 inreg %arg0, ptr addrspace(1) inreg %out, i32 inreg %arg1) #0 { ; GFX940-LABEL: i32_ptr1_i32_preload_arg: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NEXT: s_load_dword s6, s[0:1], 0x10 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB4_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB4_0: ; GFX940-NEXT: s_add_i32 s0, s2, s6 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_mov_b32_e32 v1, s0 @@ -113,9 +159,15 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 inreg %arg0, ptr addrspa ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: i32_ptr1_i32_preload_arg: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB4_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB4_0: ; GFX90a-NEXT: s_add_i32 s0, s6, s10 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 @@ -128,9 +180,14 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 inreg %arg0, ptr addrspa define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %arg0, i16 inreg %arg1) #0 { ; GFX940-LABEL: ptr1_i16_i16_preload_arg: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB5_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB5_0: ; GFX940-NEXT: s_lshr_b32 s0, s4, 16 ; GFX940-NEXT: s_and_b32 s1, s4, 0xffff ; GFX940-NEXT: s_add_i32 s0, s1, s0 @@ -140,9 +197,14 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) inreg %out, ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: ptr1_i16_i16_preload_arg: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB5_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB5_0: ; GFX90a-NEXT: s_lshr_b32 s0, s8, 16 ; GFX90a-NEXT: s_and_b32 s1, s8, 0xffff ; GFX90a-NEXT: s_add_i32 s0, s1, s0 @@ -159,18 +221,28 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) inreg %out, define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) inreg %out, <2 x i8> inreg %in) #0 { ; GFX940-LABEL: ptr1_v2i8_preload_arg: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB6_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB6_0: ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: ptr1_v2i8_preload_arg: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB6_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB6_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s8 ; GFX90a-NEXT: global_store_short v0, v1, s[6:7] @@ -182,9 +254,13 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) inreg %out, <2 define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) inreg %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) #0 { ; GFX940-LABEL: byref_preload_arg: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB7_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB7_0: ; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x100 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) @@ -197,9 +273,13 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) inreg %out, ptr ad ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: byref_preload_arg: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB7_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB7_0: ; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) @@ -220,9 +300,13 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) inreg %out, ptr ad define amdgpu_kernel void @byref_staggered_preload_arg(ptr addrspace(1) inreg %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 inreg %after.offset) #0 { ; GFX940-LABEL: byref_staggered_preload_arg: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB8_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB8_0: ; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x100 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) @@ -235,9 +319,13 @@ define amdgpu_kernel void @byref_staggered_preload_arg(ptr addrspace(1) inreg %o ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: byref_staggered_preload_arg: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB8_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB8_0: ; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) @@ -257,9 +345,13 @@ define amdgpu_kernel void @byref_staggered_preload_arg(ptr addrspace(1) inreg %o define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture inreg %out, <8 x i32> inreg %in) #0 { ; GFX940-LABEL: v8i32_arg: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB9_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB9_0: ; GFX940-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 ; GFX940-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) @@ -277,9 +369,13 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture inreg %out, <8 x ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: v8i32_arg: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB9_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB9_0: ; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 ; GFX90a-NEXT: v_mov_b32_e32 v4, 0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) @@ -301,9 +397,14 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture inreg %out, <8 x define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture inreg %out, <3 x i16> inreg %in) #0 { ; GFX940-LABEL: v3i16_preload_arg: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB10_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB10_0: ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_mov_b32_e32 v1, s5 ; GFX940-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1 @@ -312,9 +413,14 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture inreg %o ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: v3i16_preload_arg: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB10_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB10_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s9 ; GFX90a-NEXT: global_store_short v0, v1, s[6:7] offset:4 @@ -327,9 +433,15 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture inreg %o define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture inreg %out, <3 x i32> inreg %in) #0 { ; GFX940-LABEL: v3i32_preload_arg: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB11_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB11_0: ; GFX940-NEXT: v_mov_b32_e32 v0, s6 ; GFX940-NEXT: v_mov_b32_e32 v1, s7 ; GFX940-NEXT: v_mov_b32_e32 v2, s8 @@ -338,9 +450,15 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture inreg %o ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: v3i32_preload_arg: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB11_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB11_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, s10 ; GFX90a-NEXT: v_mov_b32_e32 v1, s11 ; GFX90a-NEXT: v_mov_b32_e32 v2, s12 @@ -353,9 +471,15 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture inreg %o define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture inreg %out, <3 x float> inreg %in) #0 { ; GFX940-LABEL: v3f32_preload_arg: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB12_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB12_0: ; GFX940-NEXT: v_mov_b32_e32 v3, 0 ; GFX940-NEXT: v_mov_b32_e32 v0, s6 ; GFX940-NEXT: v_mov_b32_e32 v1, s7 @@ -364,9 +488,15 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture inreg %o ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: v3f32_preload_arg: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB12_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB12_0: ; GFX90a-NEXT: v_mov_b32_e32 v3, 0 ; GFX90a-NEXT: v_mov_b32_e32 v0, s10 ; GFX90a-NEXT: v_mov_b32_e32 v1, s11 @@ -379,9 +509,14 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture inreg %o define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture inreg %out, <5 x i8> inreg %in) #0 { ; GFX940-LABEL: v5i8_preload_arg: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB13_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB13_0: ; GFX940-NEXT: s_lshr_b32 s1, s4, 24 ; GFX940-NEXT: s_and_b32 s0, s4, 0xffff ; GFX940-NEXT: s_lshl_b32 s1, s1, 8 @@ -397,9 +532,14 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture inreg %ou ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: v5i8_preload_arg: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB13_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB13_0: ; GFX90a-NEXT: s_lshr_b32 s1, s8, 24 ; GFX90a-NEXT: s_lshl_b32 s1, s1, 8 ; GFX90a-NEXT: s_bfe_u32 s2, s8, 0x80010 @@ -419,9 +559,13 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture inreg %ou define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture inreg %out, <5 x double> inreg %in) #0 { ; GFX940-LABEL: v5f64_arg: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB14_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB14_0: ; GFX940-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60 ; GFX940-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 ; GFX940-NEXT: v_mov_b32_e32 v4, 0 @@ -442,9 +586,13 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture inreg %out, <5 x ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: v5f64_arg: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB14_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB14_0: ; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 ; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 ; GFX90a-NEXT: v_mov_b32_e32 v4, 0 @@ -469,9 +617,14 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture inreg %out, <5 x define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) inreg %out, <8 x i8> inreg %in) #0 { ; GFX940-LABEL: v8i8_preload_arg: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB15_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB15_0: ; GFX940-NEXT: s_lshr_b32 s1, s5, 24 ; GFX940-NEXT: s_and_b32 s0, s5, 0xffff ; GFX940-NEXT: s_lshl_b32 s1, s1, 8 @@ -493,9 +646,14 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) inreg %out, <8 x i8 ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: v8i8_preload_arg: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB15_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB15_0: ; GFX90a-NEXT: s_lshr_b32 s1, s9, 24 ; GFX90a-NEXT: s_lshl_b32 s1, s1, 8 ; GFX90a-NEXT: s_bfe_u32 s2, s9, 0x80010 @@ -521,18 +679,28 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) inreg %out, <8 x i8 define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) inreg %out, i64 inreg %a) #0 { ; GFX940-LABEL: i64_kernel_preload_arg: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB16_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB16_0: ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[4:5] ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: i64_kernel_preload_arg: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB16_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB16_0: ; GFX90a-NEXT: v_mov_b32_e32 v2, 0 ; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] ; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] @@ -543,18 +711,28 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) inreg %out, i define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) inreg %out, double inreg %in) #0 { ; GFX940-LABEL: f64_kernel_preload_arg: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB17_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB17_0: ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[4:5] ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: f64_kernel_preload_arg: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB17_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB17_0: ; GFX90a-NEXT: v_mov_b32_e32 v2, 0 ; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] ; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] @@ -565,18 +743,28 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) inreg %out, d define amdgpu_kernel void @half_kernel_preload_arg(ptr addrspace(1) inreg %out, half inreg %in) #0 { ; GFX940-LABEL: half_kernel_preload_arg: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB18_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB18_0: ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: half_kernel_preload_arg: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB18_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB18_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s8 ; GFX90a-NEXT: global_store_short v0, v1, s[6:7] @@ -587,18 +775,28 @@ define amdgpu_kernel void @half_kernel_preload_arg(ptr addrspace(1) inreg %out, define amdgpu_kernel void @bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out, bfloat inreg %in) #0 { ; GFX940-LABEL: bfloat_kernel_preload_arg: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB19_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB19_0: ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: bfloat_kernel_preload_arg: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB19_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB19_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s8 ; GFX90a-NEXT: global_store_short v0, v1, s[6:7] @@ -609,18 +807,28 @@ define amdgpu_kernel void @bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out define amdgpu_kernel void @v2bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out, <2 x bfloat> inreg %in) #0 { ; GFX940-LABEL: v2bfloat_kernel_preload_arg: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB20_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB20_0: ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: v2bfloat_kernel_preload_arg: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB20_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB20_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s8 ; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] @@ -631,9 +839,14 @@ define amdgpu_kernel void @v2bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o define amdgpu_kernel void @v3bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out, <3 x bfloat> inreg %in) #0 { ; GFX940-LABEL: v3bfloat_kernel_preload_arg: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB21_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB21_0: ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_mov_b32_e32 v1, s5 ; GFX940-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1 @@ -642,9 +855,14 @@ define amdgpu_kernel void @v3bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: v3bfloat_kernel_preload_arg: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB21_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB21_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s9 ; GFX90a-NEXT: global_store_short v0, v1, s[6:7] offset:4 @@ -657,9 +875,15 @@ define amdgpu_kernel void @v3bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o define amdgpu_kernel void @v6bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out, <6 x bfloat> inreg %in) #0 { ; GFX940-LABEL: v6bfloat_kernel_preload_arg: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB22_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB22_0: ; GFX940-NEXT: v_mov_b32_e32 v0, s6 ; GFX940-NEXT: v_mov_b32_e32 v1, s7 ; GFX940-NEXT: v_mov_b32_e32 v2, s8 @@ -668,9 +892,15 @@ define amdgpu_kernel void @v6bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: v6bfloat_kernel_preload_arg: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB22_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB22_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, s10 ; GFX90a-NEXT: v_mov_b32_e32 v1, s11 ; GFX90a-NEXT: v_mov_b32_e32 v2, s12 @@ -683,9 +913,14 @@ define amdgpu_kernel void @v6bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out, half inreg %in, <7 x bfloat> inreg %in2, ptr addrspace(1) inreg %out2) #0 { ; GFX940-LABEL: half_v7bfloat_kernel_preload_arg: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x8 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB23_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB23_0: ; GFX940-NEXT: v_mov_b32_e32 v3, 0 ; GFX940-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-NEXT: global_store_short v3, v0, s[2:3] sc0 sc1 @@ -698,9 +933,14 @@ define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) inr ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: half_v7bfloat_kernel_preload_arg: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB23_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB23_0: ; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20 ; GFX90a-NEXT: v_mov_b32_e32 v3, 0 ; GFX90a-NEXT: v_mov_b32_e32 v0, s8 @@ -720,9 +960,14 @@ define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) inr define amdgpu_kernel void @i1_kernel_preload_arg(ptr addrspace(1) inreg %out, i1 inreg %in) #0 { ; GFX940-LABEL: i1_kernel_preload_arg: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB24_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB24_0: ; GFX940-NEXT: s_and_b32 s0, s4, 1 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_mov_b32_e32 v1, s0 @@ -730,9 +975,14 @@ define amdgpu_kernel void @i1_kernel_preload_arg(ptr addrspace(1) inreg %out, i1 ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: i1_kernel_preload_arg: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB24_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB24_0: ; GFX90a-NEXT: s_and_b32 s0, s8, 1 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 @@ -744,9 +994,15 @@ define amdgpu_kernel void @i1_kernel_preload_arg(ptr addrspace(1) inreg %out, i1 define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) inreg %out, fp128 inreg %in) #0 { ; GFX940-LABEL: fp128_kernel_preload_arg: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB25_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB25_0: ; GFX940-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-NEXT: v_mov_b32_e32 v0, s6 ; GFX940-NEXT: v_mov_b32_e32 v1, s7 @@ -756,9 +1012,15 @@ define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) inreg %out, ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: fp128_kernel_preload_arg: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB25_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB25_0: ; GFX90a-NEXT: v_mov_b32_e32 v4, 0 ; GFX90a-NEXT: v_mov_b32_e32 v0, s10 ; GFX90a-NEXT: v_mov_b32_e32 v1, s11 @@ -772,9 +1034,14 @@ define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) inreg %out, define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) inreg %out, <7 x i8> inreg %in) #0 { ; GFX940-LABEL: v7i8_kernel_preload_arg: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB26_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB26_0: ; GFX940-NEXT: s_lshr_b32 s1, s4, 24 ; GFX940-NEXT: s_and_b32 s0, s4, 0xffff ; GFX940-NEXT: s_lshl_b32 s1, s1, 8 @@ -791,9 +1058,14 @@ define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) inreg %out, ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: v7i8_kernel_preload_arg: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB26_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB26_0: ; GFX90a-NEXT: s_lshr_b32 s1, s8, 24 ; GFX90a-NEXT: s_lshl_b32 s1, s1, 8 ; GFX90a-NEXT: s_bfe_u32 s2, s8, 0x80010 @@ -814,9 +1086,15 @@ define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) inreg %out, define amdgpu_kernel void @v7half_kernel_preload_arg(ptr addrspace(1) inreg %out, <7 x half> inreg %in) #0 { ; GFX940-LABEL: v7half_kernel_preload_arg: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-NEXT: ; %bb.0:
+; GFX940: ; %bb.1:
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: s_branch .LBB27_0
+; GFX940-NEXT: .p2align 8
+; GFX940-NEXT: ; %bb.2:
+; GFX940-NEXT: .LBB27_0:
 ; GFX940-NEXT: v_mov_b32_e32 v3, 0
 ; GFX940-NEXT: v_mov_b32_e32 v0, s9
 ; GFX940-NEXT: global_store_short v3, v0, s[2:3] offset:12 sc0 sc1
@@ -827,9 +1105,15 @@ define amdgpu_kernel void @v7half_kernel_preload_arg(ptr addrspace(1) inreg %out
 ; GFX940-NEXT: s_endpgm
 ;
 ; GFX90a-LABEL: v7half_kernel_preload_arg:
-; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-NEXT: ; %bb.0:
+; GFX90a: ; %bb.1:
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NEXT: s_branch .LBB27_0
+; GFX90a-NEXT: .p2align 8
+; GFX90a-NEXT: ; %bb.2:
+; GFX90a-NEXT: .LBB27_0:
 ; GFX90a-NEXT: v_mov_b32_e32 v3, 0
 ; GFX90a-NEXT: v_mov_b32_e32 v0, s13
 ; GFX90a-NEXT: global_store_short v3, v0, s[6:7] offset:12
@@ -842,29 +1126,37 @@ define amdgpu_kernel void @v7half_kernel_preload_arg(ptr addrspace(1) inreg %out
   ret void
 }
 
-define amdgpu_kernel void @i16_i32_kernel_preload_arg(ptr addrspace(1) %out, i16 inreg %in, i32 inreg %in2, ptr addrspace(1) inreg %out2) #0 {
+define amdgpu_kernel void @i16_i32_kernel_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %in, i32 inreg %in2, ptr addrspace(1) inreg %out2) #0 {
 ; GFX940-LABEL: i16_i32_kernel_preload_arg:
-; GFX940: ; %bb.0:
-; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
-; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x10
-; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940: ; %bb.1:
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v1, s6
-; GFX940-NEXT: v_mov_b32_e32 v2, s7
-; GFX940-NEXT: global_store_short v0, v1, s[4:5] sc0 sc1
-; GFX940-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1
+; GFX940-NEXT: s_branch .LBB28_0
+; GFX940-NEXT: .p2align 8
+; GFX940-NEXT: ; %bb.2:
+; GFX940-NEXT: .LBB28_0:
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-NEXT: global_store_dword v0, v1, s[6:7] sc0 sc1
 ; GFX940-NEXT: s_endpgm
 ;
 ; GFX90a-LABEL: i16_i32_kernel_preload_arg:
-; GFX90a: ; %bb.0:
-; GFX90a-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a: ; %bb.1:
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-NEXT: v_mov_b32_e32 v2, s3
-; GFX90a-NEXT: global_store_short v0, v1, s[0:1]
-; GFX90a-NEXT: global_store_dword v0, v2, s[6:7]
+; GFX90a-NEXT: s_branch .LBB28_0
+; GFX90a-NEXT: .p2align 8
+; GFX90a-NEXT: ; %bb.2:
+; GFX90a-NEXT: .LBB28_0:
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-NEXT: global_store_dword v0, v1, s[10:11]
 ; GFX90a-NEXT: s_endpgm
   store i16 %in, ptr addrspace(1) %out
   store i32 %in2, ptr addrspace(1) %out2
@@ -873,9 +1165,14 @@ define amdgpu_kernel void @i16_i32_kernel_preload_arg(ptr addrspace(1) %out, i16
 define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %in, <3 x i32> inreg %in2, ptr addrspace(1) inreg %out2) #0 {
 ; GFX940-LABEL: i16_v3i32_kernel_preload_arg:
-; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-NEXT: ; %bb.0:
+; GFX940: ; %bb.1:
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x8
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: s_branch .LBB29_0
+; GFX940-NEXT: .p2align 8
+; GFX940-NEXT: ; %bb.2:
+; GFX940-NEXT: .LBB29_0:
 ; GFX940-NEXT: v_mov_b32_e32 v3, 0
 ; GFX940-NEXT: v_mov_b32_e32 v4, s4
 ; GFX940-NEXT: v_mov_b32_e32 v0, s6
@@ -886,9 +1183,14 @@ define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) inreg %
 ; GFX940-NEXT: s_endpgm
 ;
 ; GFX90a-LABEL: i16_v3i32_kernel_preload_arg:
-; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-NEXT: ; %bb.0:
+; GFX90a: ; %bb.1:
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
+; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NEXT: s_branch .LBB29_0
+; GFX90a-NEXT: .p2align 8
+; GFX90a-NEXT: ; %bb.2:
+; GFX90a-NEXT: .LBB29_0:
 ; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20
 ; GFX90a-NEXT: v_mov_b32_e32 v3, 0
 ; GFX90a-NEXT: v_mov_b32_e32 v4, s8
@@ -906,9 +1208,14 @@ define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) inreg %
 define amdgpu_kernel void @i16_i16_kernel_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %in, i16 inreg %in2, ptr addrspace(1) inreg %out2) #0 {
 ; GFX940-LABEL: i16_i16_kernel_preload_arg:
-; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-NEXT: ; %bb.0:
+; GFX940: ; %bb.1:
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: s_branch .LBB30_0
+; GFX940-NEXT: .p2align 8
+; GFX940-NEXT: ; %bb.2:
+; GFX940-NEXT: .LBB30_0:
 ; GFX940-NEXT: v_mov_b32_e32 v0, 0
 ; GFX940-NEXT: v_mov_b32_e32 v1, s4
 ; GFX940-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
@@ -916,9 +1223,14 @@ define amdgpu_kernel void @i16_i16_kernel_preload_arg(ptr addrspace(1) inreg %ou
 ; GFX940-NEXT: s_endpgm
 ;
 ; GFX90a-LABEL: i16_i16_kernel_preload_arg:
-; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB30_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB30_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s8 ; GFX90a-NEXT: global_store_short v0, v1, s[6:7] @@ -931,9 +1243,14 @@ define amdgpu_kernel void @i16_i16_kernel_preload_arg(ptr addrspace(1) inreg %ou define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %in, <2 x i8> inreg %in2, ptr addrspace(1) inreg %out2) #0 { ; GFX940-LABEL: i16_v2i8_kernel_preload_arg: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB31_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB31_0: ; GFX940-NEXT: s_lshr_b32 s0, s4, 24 ; GFX940-NEXT: s_lshl_b32 s0, s0, 8 ; GFX940-NEXT: s_bfe_u32 s1, s4, 0x80010 @@ -946,9 +1263,14 @@ define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) inreg %o ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: i16_v2i8_kernel_preload_arg: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-NEXT: ; %bb.0: +; GFX90a: ; %bb.1: +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_branch .LBB31_0 +; GFX90a-NEXT: .p2align 8 +; GFX90a-NEXT: ; %bb.2: +; GFX90a-NEXT: .LBB31_0: ; GFX90a-NEXT: s_lshr_b32 s0, s8, 24 ; GFX90a-NEXT: s_lshl_b32 s0, s0, 8 ; GFX90a-NEXT: s_bfe_u32 s1, s8, 0x80010 @@ -968,9 +1290,13 @@ define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) inreg %o define amdgpu_kernel void @i32_ptr1_i32_staggered_preload_arg(i32 inreg %arg0, ptr addrspace(1) %out, i32 inreg %arg1) #0 { ; GFX940-LABEL: i32_ptr1_i32_staggered_preload_arg: -; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-NEXT: ; %bb.0: +; GFX940: ; %bb.1: +; GFX940-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_branch .LBB32_0 +; GFX940-NEXT: .p2align 8 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: .LBB32_0: ; GFX940-NEXT: s_load_dword s3, s[0:1], 0x10 ; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 @@ -981,9 +1307,13 @@ define amdgpu_kernel void @i32_ptr1_i32_staggered_preload_arg(i32 inreg %arg0, p ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: i32_ptr1_i32_staggered_preload_arg: -; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-NEXT:  ; %bb.0:
+; GFX90a:       ; %bb.1:
+; GFX90a-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NEXT:    s_branch .LBB32_0
+; GFX90a-NEXT:    .p2align 8
+; GFX90a-NEXT:  ; %bb.2:
+; GFX90a-NEXT:  .LBB32_0:
 ; GFX90a-NEXT:    s_load_dword s2, s[4:5], 0x10
 ; GFX90a-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
 ; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
@@ -999,9 +1329,14 @@ define amdgpu_kernel void @i32_ptr1_i32_staggered_preload_arg(i32 inreg %arg0, p

 define amdgpu_kernel void @ptr1_i8_trailing_unused(ptr addrspace(1) inreg %out, i8 inreg %arg0, i32 inreg %unused) #0 {
 ; GFX940-LABEL: ptr1_i8_trailing_unused:
-; GFX940:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-NEXT:  ; %bb.0:
+; GFX940:       ; %bb.1:
+; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    s_branch .LBB33_0
+; GFX940-NEXT:    .p2align 8
+; GFX940-NEXT:  ; %bb.2:
+; GFX940-NEXT:  .LBB33_0:
 ; GFX940-NEXT:    s_and_b32 s0, s4, 0xff
 ; GFX940-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NEXT:    v_mov_b32_e32 v1, s0
@@ -1009,9 +1344,14 @@ define amdgpu_kernel void @ptr1_i8_trailing_unused(ptr addrspace(1) inreg %out,
 ; GFX90a-LABEL: ptr1_i8_trailing_unused:
-; GFX90a:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-NEXT:  ; %bb.0:
+; GFX90a:       ; %bb.1:
+; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NEXT:    s_branch .LBB33_0
+; GFX90a-NEXT:    .p2align 8
+; GFX90a-NEXT:  ; %bb.2:
+; GFX90a-NEXT:  .LBB33_0:
 ; GFX90a-NEXT:    s_and_b32 s0, s8, 0xff
 ; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90a-NEXT:    v_mov_b32_e32 v1, s0

From 833a17489dd96f35df3a17ad231ada82acf38ef9 Mon Sep 17 00:00:00 2001
From: Med Ismail Bennani
Date: Fri, 10 Jan 2025 11:44:50 -0800
Subject: [PATCH 100/408] [lldb/crashlog] Fix typo in error message when
 creating a target (#122514)

This fixes a typo in the error message emitted when the crashlog script
creates a target but cannot find a valid architecture in the crash
report.

rdar://137344016

Signed-off-by: Med Ismail Bennani
---
 lldb/examples/python/crashlog.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lldb/examples/python/crashlog.py b/lldb/examples/python/crashlog.py
index 368437ed63e46..ab8c2fcaf034b 100755
--- a/lldb/examples/python/crashlog.py
+++ b/lldb/examples/python/crashlog.py
@@ -1512,7 +1512,7 @@ def load_crashlog_in_scripted_process(debugger, crashlog_path, options, result):
         arch = crashlog.process_arch
         if not arch:
             raise InteractiveCrashLogException(
-                "couldn't create find the architecture to create the target"
+                "couldn't find the architecture to create the target"
            )
        target = debugger.CreateTargetWithFileAndArch(None, arch)
    # 4.
Fail From 5912de9ede81407f93162e930ae9bc97e561d017 Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Fri, 10 Jan 2025 14:46:43 -0500 Subject: [PATCH 101/408] [Driver][NetBSD] Remove support for NetBSD 8.x (#122513) --- clang/lib/Driver/ToolChains/NetBSD.cpp | 14 -------------- clang/test/Driver/netbsd.c | 11 ----------- 2 files changed, 25 deletions(-) diff --git a/clang/lib/Driver/ToolChains/NetBSD.cpp b/clang/lib/Driver/ToolChains/NetBSD.cpp index abd5e1aa003b3..c5469f32ac80b 100644 --- a/clang/lib/Driver/ToolChains/NetBSD.cpp +++ b/clang/lib/Driver/ToolChains/NetBSD.cpp @@ -564,18 +564,4 @@ void NetBSD::addClangTargetOptions(const ArgList &DriverArgs, const SanitizerArgs &SanArgs = getSanitizerArgs(DriverArgs); if (SanArgs.hasAnySanitizer()) CC1Args.push_back("-D_REENTRANT"); - - VersionTuple OsVersion = getTriple().getOSVersion(); - bool UseInitArrayDefault = - OsVersion >= VersionTuple(9) || OsVersion.getMajor() == 0 || - getTriple().getArch() == llvm::Triple::aarch64 || - getTriple().getArch() == llvm::Triple::aarch64_be || - getTriple().getArch() == llvm::Triple::arm || - getTriple().getArch() == llvm::Triple::armeb || - getTriple().getArch() == llvm::Triple::riscv32 || - getTriple().getArch() == llvm::Triple::riscv64; - - if (!DriverArgs.hasFlag(options::OPT_fuse_init_array, - options::OPT_fno_use_init_array, UseInitArrayDefault)) - CC1Args.push_back("-fno-use-init-array"); } diff --git a/clang/test/Driver/netbsd.c b/clang/test/Driver/netbsd.c index 1b7c674e18af6..8404bb6bbd528 100644 --- a/clang/test/Driver/netbsd.c +++ b/clang/test/Driver/netbsd.c @@ -131,28 +131,24 @@ // PIE: "{{.*}}/usr/lib{{/|\\\\}}crtn.o" // X86_64: "-cc1" "-triple" "x86_64-unknown-netbsd" -// X86_64-NOT: "-fno-use-init-array" // X86_64: ld{{.*}}" "--eh-frame-hdr" "-dynamic-linker" "/libexec/ld.elf_so" // X86_64: "-o" "a.out" "{{.*}}/usr/lib{{/|\\\\}}crt0.o" "{{.*}}/usr/lib{{/|\\\\}}crti.o" // X86_64: "{{.*}}/usr/lib{{/|\\\\}}crtbegin.o" "{{.*}}.o" "-lc" // X86_64: "{{.*}}/usr/lib{{/|\\\\}}crtend.o" "{{.*}}/usr/lib{{/|\\\\}}crtn.o" // AARCH64: "-cc1" "-triple" "aarch64-unknown-netbsd" -// AARCH64-NOT: "-fno-use-init-array" // AARCH64: ld{{.*}}" "--eh-frame-hdr" "-dynamic-linker" "/libexec/ld.elf_so" // AARCH64: "-o" "a.out" "{{.*}}/usr/lib{{/|\\\\}}crt0.o" "{{.*}}/usr/lib{{/|\\\\}}crti.o" // AARCH64: "{{.*}}/usr/lib{{/|\\\\}}crtbegin.o" "{{.*}}.o" "-lc" // AARCH64: "{{.*}}/usr/lib{{/|\\\\}}crtend.o" "{{.*}}/usr/lib{{/|\\\\}}crtn.o" // AARCH64_BE: "-cc1" "-triple" "aarch64_be-unknown-netbsd" -// AARCH64_BE-NOT: "-fno-use-init-array" // AARCH64_BE: ld{{.*}}" "--eh-frame-hdr" "-dynamic-linker" "/libexec/ld.elf_so" // AARCH64_BE: "-o" "a.out" "{{.*}}/usr/lib{{/|\\\\}}crt0.o" "{{.*}}/usr/lib{{/|\\\\}}crti.o" // AARCH64_BE: "{{.*}}/usr/lib{{/|\\\\}}crtbegin.o" "{{.*}}.o" "-lc" // AARCH64_BE: "{{.*}}/usr/lib{{/|\\\\}}crtend.o" "{{.*}}/usr/lib{{/|\\\\}}crtn.o" // ARM: "-cc1" "-triple" "armv5e-unknown-netbsd-eabi" -// ARM-NOT: "-fno-use-init-array" // ARM: as{{.*}}" "-mcpu=arm926ej-s" "-o" // ARM: ld{{.*}}" "--eh-frame-hdr" "-dynamic-linker" "/libexec/ld.elf_so" // ARM: "-m" "armelf_nbsd_eabi" @@ -162,7 +158,6 @@ // ARM: "{{.*}}/usr/lib{{/|\\\\}}crtend.o" "{{.*}}/usr/lib{{/|\\\\}}crtn.o" // ARMEB: "-cc1" "-triple" "armebv5e-unknown-netbsd-eabi" -// ARMEB-NOT: "-fno-use-init-array" // ARMEB: as{{.*}}" "-mcpu=arm926ej-s" "-o" // ARMEB: ld{{.*}}" "--eh-frame-hdr" "-dynamic-linker" "/libexec/ld.elf_so" // ARMEB-NOT: "--be8" @@ -215,7 +210,6 @@ // THUMBEB: "{{.*}}/usr/lib{{/|\\\\}}crtend.o" "{{.*}}/usr/lib{{/|\\\\}}crtn.o" 
// RISCV32: "-cc1" "-triple" "riscv32-unknown-netbsd" -// RISCV32-NOT: "-fno-use-init-array" // RISCV32: ld{{.*}}" "--eh-frame-hdr" "-dynamic-linker" "/libexec/ld.elf_so" // RISCV32-SAME: "-m" "elf32lriscv" "-X" // RISCV32-SAME: "-o" "a.out" "{{.*}}/usr/lib{{/|\\\\}}crt0.o" "{{.*}}/usr/lib{{/|\\\\}}crti.o" @@ -223,7 +217,6 @@ // RISCV32-SAME: "{{.*}}/usr/lib{{/|\\\\}}crtend.o" "{{.*}}/usr/lib{{/|\\\\}}crtn.o" // RISCV64: "-cc1" "-triple" "riscv64-unknown-netbsd" -// RISCV64-NOT: "-fno-use-init-array" // RISCV64: ld{{.*}}" "--eh-frame-hdr" "-dynamic-linker" "/libexec/ld.elf_so" // RISCV64-SAME: "-m" "elf64lriscv" "-X" // RISCV64-SAME: "-o" "a.out" "{{.*}}/usr/lib{{/|\\\\}}crt0.o" "{{.*}}/usr/lib{{/|\\\\}}crti.o" @@ -231,7 +224,6 @@ // RISCV64-SAME: "{{.*}}/usr/lib{{/|\\\\}}crtend.o" "{{.*}}/usr/lib{{/|\\\\}}crtn.o" // SPARC: "-cc1" "-triple" "sparc-unknown-netbsd" -// SPARC-NOT: "-fno-use-init-array" // SPARC: as{{.*}}" "-32" "-Av8" "-o" // SPARC: ld{{.*}}" "--eh-frame-hdr" "-dynamic-linker" "/libexec/ld.elf_so" // SPARC: "-m" "elf32_sparc" @@ -241,7 +233,6 @@ // SPARC: "{{.*}}/usr/lib{{/|\\\\}}crtend.o" "{{.*}}/usr/lib{{/|\\\\}}crtn.o" // SPARC64: "-cc1" "-triple" "sparc64-unknown-netbsd" -// SPARC64-NOT: "-fno-use-init-array" // SPARC64: as{{.*}}" "-64" "-Av9" "-o" // SPARC64: ld{{.*}}" "--eh-frame-hdr" "-dynamic-linker" "/libexec/ld.elf_so" // SPARC64: "-m" "elf64_sparc" @@ -250,7 +241,6 @@ // SPARC64: "{{.*}}/usr/lib{{/|\\\\}}crtend.o" "{{.*}}/usr/lib{{/|\\\\}}crtn.o" // POWERPC: "-cc1" "-triple" "powerpc-unknown-netbsd" -// POWERPC-NOT: "-fno-use-init-array" // POWERPC: ld{{.*}}" "--eh-frame-hdr" "-dynamic-linker" "/libexec/ld.elf_so" // POWERPC: "-m" "elf32ppc_nbsd" // POWERPC: "-o" "a.out" "{{.*}}/usr/lib{{/|\\\\}}crt0.o" @@ -259,7 +249,6 @@ // POWERPC: "{{.*}}/usr/lib{{/|\\\\}}crtend.o" "{{.*}}/usr/lib{{/|\\\\}}crtn.o" // POWERPC64: "-cc1" "-triple" "powerpc64-unknown-netbsd" -// POWERPC64-NOT: "-fno-use-init-array" // POWERPC64: ld{{.*}}" "--eh-frame-hdr" "-dynamic-linker" "/libexec/ld.elf_so" // POWERPC64: "-m" "elf64ppc" // POWERPC64: "-o" "a.out" "{{.*}}/usr/lib{{/|\\\\}}crt0.o" From 0f242897ce806a0cc88c328fd0f7a3f34d25504c Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Fri, 10 Jan 2025 19:52:53 +0000 Subject: [PATCH 102/408] [gn build] Port 2e5c29828196 --- llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn index aaf2a869bf5d2..9226658d4c767 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn @@ -175,6 +175,7 @@ static_library("LLVMAMDGPUCodeGen") { "AMDGPUPerfHintAnalysis.cpp", "AMDGPUPostLegalizerCombiner.cpp", "AMDGPUPreLegalizerCombiner.cpp", + "AMDGPUPreloadKernArgProlog.cpp", "AMDGPUPrintfRuntimeBinding.cpp", "AMDGPUPromoteAlloca.cpp", "AMDGPUPromoteKernelArguments.cpp", From d2498afccb04c0f09b05827b6b9c1c6c181a4f2b Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Fri, 10 Jan 2025 15:01:32 -0500 Subject: [PATCH 103/408] [Driver][NFC] Formatting fixes (#122519) --- clang/lib/Basic/Targets.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/clang/lib/Basic/Targets.cpp b/clang/lib/Basic/Targets.cpp index be5dedbe8044e..f40b81c02eb7e 100644 --- a/clang/lib/Basic/Targets.cpp +++ b/clang/lib/Basic/Targets.cpp @@ -726,31 +726,31 @@ std::unique_ptr AllocateTarget(const llvm::Triple &Triple, case 
llvm::Triple::csky: switch (os) { case llvm::Triple::Linux: - return std::make_unique>(Triple, Opts); + return std::make_unique>(Triple, Opts); default: - return std::make_unique(Triple, Opts); + return std::make_unique(Triple, Opts); } case llvm::Triple::loongarch32: switch (os) { case llvm::Triple::Linux: - return std::make_unique>(Triple, - Opts); + return std::make_unique>(Triple, + Opts); case llvm::Triple::FreeBSD: return std::make_unique>(Triple, Opts); default: - return std::make_unique(Triple, Opts); + return std::make_unique(Triple, Opts); } case llvm::Triple::loongarch64: switch (os) { case llvm::Triple::Linux: - return std::make_unique>(Triple, - Opts); + return std::make_unique>(Triple, + Opts); case llvm::Triple::FreeBSD: return std::make_unique>(Triple, Opts); default: - return std::make_unique(Triple, Opts); + return std::make_unique(Triple, Opts); } case llvm::Triple::xtensa: From 749bdc87f5d0646be93bb90dd843ffa07924205e Mon Sep 17 00:00:00 2001 From: Jan Voung Date: Fri, 10 Jan 2025 15:14:27 -0500 Subject: [PATCH 104/408] [clang-tidy] sort / reorder a part of release notes (#122475) and remove a trailing space --- clang-tools-extra/docs/ReleaseNotes.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 1a7d48a0b4dc7..684ba77d8f0f5 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -112,11 +112,11 @@ Improvements to clang-tidy the configuration options of the `Clang Static Analyzer Checks `_. +- Improved :program:`clang-tidy` by accepting parameters file in command line. + - Improved :program:`run-clang-tidy.py` script. Fixed minor shutdown noise happening on certain platforms when interrupting the script. -- Improved :program:`clang-tidy` by accepting parameters file in command line. - - Removed :program:`clang-tidy`'s global options for most of checks. All options are changed to local options except `IncludeStyle`, `StrictMode` and `IgnoreMacros`. Global scoped `StrictMode` and `IgnoreMacros` are deprecated @@ -292,7 +292,7 @@ Changes in existing checks overloaded ``operator new`` and ``operator delete``. - Improved :doc:`modernize-avoid-c-arrays - ` check to suggest using + ` check to suggest using ``std::span`` as a replacement for parameters of incomplete C array type in C++20 and ``std::array`` or ``std::vector`` before C++20. From 4c6ca3efdae13a4dd75f9fe2cdfede5208e5d2c4 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Fri, 10 Jan 2025 15:20:55 -0500 Subject: [PATCH 105/408] [libc++] Implement a libc++ private version of isascii (#122361) The isascii() function is not standard, so we should avoid relying on the platform providing it, especially since it's easy to implement in libc++ portably. 
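
For context, the portable replacement is just a 7-bit range check, so no
libc support is needed. A minimal sketch of the idea (the free-standing
name here is illustrative; the helper this patch actually adds is
__libcpp_isascii, shown in the diff below):

    // A code point is ASCII iff it fits in 7 bits (0..127): mask off the
    // low 7 bits and check that nothing remains. No locale or platform
    // dependency is involved.
    inline bool is_ascii_sketch(int c) { return (c & ~0x7F) == 0; }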
--- libcxx/include/__locale | 10 +++--- libcxx/src/locale.cpp | 71 ++++++++++++++++++++++------------------- 2 files changed, 44 insertions(+), 37 deletions(-) diff --git a/libcxx/include/__locale b/libcxx/include/__locale index 94dc8a08437bf..01c3a2e3456ba 100644 --- a/libcxx/include/__locale +++ b/libcxx/include/__locale @@ -516,6 +516,8 @@ protected: }; #endif // _LIBCPP_HAS_WIDE_CHARACTERS +inline _LIBCPP_HIDE_FROM_ABI bool __libcpp_isascii(int __c) { return (__c & ~0x7F) == 0; } + template <> class _LIBCPP_EXPORTED_FROM_ABI ctype : public locale::facet, public ctype_base { const mask* __tab_; @@ -527,25 +529,25 @@ public: explicit ctype(const mask* __tab = nullptr, bool __del = false, size_t __refs = 0); _LIBCPP_HIDE_FROM_ABI bool is(mask __m, char_type __c) const { - return isascii(__c) ? (__tab_[static_cast(__c)] & __m) != 0 : false; + return std::__libcpp_isascii(__c) ? (__tab_[static_cast(__c)] & __m) != 0 : false; } _LIBCPP_HIDE_FROM_ABI const char_type* is(const char_type* __low, const char_type* __high, mask* __vec) const { for (; __low != __high; ++__low, ++__vec) - *__vec = isascii(*__low) ? __tab_[static_cast(*__low)] : 0; + *__vec = std::__libcpp_isascii(*__low) ? __tab_[static_cast(*__low)] : 0; return __low; } _LIBCPP_HIDE_FROM_ABI const char_type* scan_is(mask __m, const char_type* __low, const char_type* __high) const { for (; __low != __high; ++__low) - if (isascii(*__low) && (__tab_[static_cast(*__low)] & __m)) + if (std::__libcpp_isascii(*__low) && (__tab_[static_cast(*__low)] & __m)) break; return __low; } _LIBCPP_HIDE_FROM_ABI const char_type* scan_not(mask __m, const char_type* __low, const char_type* __high) const { for (; __low != __high; ++__low) - if (!isascii(*__low) || !(__tab_[static_cast(*__low)] & __m)) + if (!std::__libcpp_isascii(*__low) || !(__tab_[static_cast(*__low)] & __m)) break; return __low; } diff --git a/libcxx/src/locale.cpp b/libcxx/src/locale.cpp index 599c61ca7a36d..fb67a729cd0f2 100644 --- a/libcxx/src/locale.cpp +++ b/libcxx/src/locale.cpp @@ -707,69 +707,70 @@ constinit locale::id ctype::id; ctype::~ctype() {} bool ctype::do_is(mask m, char_type c) const { - return isascii(c) ? (ctype::classic_table()[c] & m) != 0 : false; + return std::__libcpp_isascii(c) ? (ctype::classic_table()[c] & m) != 0 : false; } const wchar_t* ctype::do_is(const char_type* low, const char_type* high, mask* vec) const { for (; low != high; ++low, ++vec) - *vec = static_cast(isascii(*low) ? ctype::classic_table()[*low] : 0); + *vec = static_cast(std::__libcpp_isascii(*low) ? ctype::classic_table()[*low] : 0); return low; } const wchar_t* ctype::do_scan_is(mask m, const char_type* low, const char_type* high) const { for (; low != high; ++low) - if (isascii(*low) && (ctype::classic_table()[*low] & m)) + if (std::__libcpp_isascii(*low) && (ctype::classic_table()[*low] & m)) break; return low; } const wchar_t* ctype::do_scan_not(mask m, const char_type* low, const char_type* high) const { for (; low != high; ++low) - if (!(isascii(*low) && (ctype::classic_table()[*low] & m))) + if (!(std::__libcpp_isascii(*low) && (ctype::classic_table()[*low] & m))) break; return low; } wchar_t ctype::do_toupper(char_type c) const { # ifdef _LIBCPP_HAS_DEFAULTRUNELOCALE - return isascii(c) ? _DefaultRuneLocale.__mapupper[c] : c; + return std::__libcpp_isascii(c) ? _DefaultRuneLocale.__mapupper[c] : c; # elif defined(__GLIBC__) || defined(__EMSCRIPTEN__) || defined(__NetBSD__) || defined(__MVS__) - return isascii(c) ? 
ctype::__classic_upper_table()[c] : c; + return std::__libcpp_isascii(c) ? ctype::__classic_upper_table()[c] : c; # else - return (isascii(c) && __locale::__iswlower(c, _LIBCPP_GET_C_LOCALE)) ? c - L'a' + L'A' : c; + return (std::__libcpp_isascii(c) && __locale::__iswlower(c, _LIBCPP_GET_C_LOCALE)) ? c - L'a' + L'A' : c; # endif } const wchar_t* ctype::do_toupper(char_type* low, const char_type* high) const { for (; low != high; ++low) # ifdef _LIBCPP_HAS_DEFAULTRUNELOCALE - *low = isascii(*low) ? _DefaultRuneLocale.__mapupper[*low] : *low; + *low = std::__libcpp_isascii(*low) ? _DefaultRuneLocale.__mapupper[*low] : *low; # elif defined(__GLIBC__) || defined(__EMSCRIPTEN__) || defined(__NetBSD__) || defined(__MVS__) - *low = isascii(*low) ? ctype::__classic_upper_table()[*low] : *low; + *low = std::__libcpp_isascii(*low) ? ctype::__classic_upper_table()[*low] : *low; # else - *low = (isascii(*low) && __locale::__islower(*low, _LIBCPP_GET_C_LOCALE)) ? (*low - L'a' + L'A') : *low; + *low = + (std::__libcpp_isascii(*low) && __locale::__islower(*low, _LIBCPP_GET_C_LOCALE)) ? (*low - L'a' + L'A') : *low; # endif return low; } wchar_t ctype::do_tolower(char_type c) const { # ifdef _LIBCPP_HAS_DEFAULTRUNELOCALE - return isascii(c) ? _DefaultRuneLocale.__maplower[c] : c; + return std::__libcpp_isascii(c) ? _DefaultRuneLocale.__maplower[c] : c; # elif defined(__GLIBC__) || defined(__EMSCRIPTEN__) || defined(__NetBSD__) || defined(__MVS__) - return isascii(c) ? ctype::__classic_lower_table()[c] : c; + return std::__libcpp_isascii(c) ? ctype::__classic_lower_table()[c] : c; # else - return (isascii(c) && __locale::__isupper(c, _LIBCPP_GET_C_LOCALE)) ? c - L'A' + 'a' : c; + return (std::__libcpp_isascii(c) && __locale::__isupper(c, _LIBCPP_GET_C_LOCALE)) ? c - L'A' + 'a' : c; # endif } const wchar_t* ctype::do_tolower(char_type* low, const char_type* high) const { for (; low != high; ++low) # ifdef _LIBCPP_HAS_DEFAULTRUNELOCALE - *low = isascii(*low) ? _DefaultRuneLocale.__maplower[*low] : *low; + *low = std::__libcpp_isascii(*low) ? _DefaultRuneLocale.__maplower[*low] : *low; # elif defined(__GLIBC__) || defined(__EMSCRIPTEN__) || defined(__NetBSD__) || defined(__MVS__) - *low = isascii(*low) ? ctype::__classic_lower_table()[*low] : *low; + *low = std::__libcpp_isascii(*low) ? ctype::__classic_lower_table()[*low] : *low; # else - *low = (isascii(*low) && __locale::__isupper(*low, _LIBCPP_GET_C_LOCALE)) ? *low - L'A' + L'a' : *low; + *low = (std::__libcpp_isascii(*low) && __locale::__isupper(*low, _LIBCPP_GET_C_LOCALE)) ? *low - L'A' + L'a' : *low; # endif return low; } @@ -783,14 +784,14 @@ const char* ctype::do_widen(const char* low, const char* high, char_typ } char ctype::do_narrow(char_type c, char dfault) const { - if (isascii(c)) + if (std::__libcpp_isascii(c)) return static_cast(c); return dfault; } const wchar_t* ctype::do_narrow(const char_type* low, const char_type* high, char dfault, char* dest) const { for (; low != high; ++low, ++dest) - if (isascii(*low)) + if (std::__libcpp_isascii(*low)) *dest = static_cast(*low); else *dest = dfault; @@ -816,52 +817,56 @@ ctype::~ctype() { char ctype::do_toupper(char_type c) const { #ifdef _LIBCPP_HAS_DEFAULTRUNELOCALE - return isascii(c) ? static_cast(_DefaultRuneLocale.__mapupper[static_cast(c)]) : c; + return std::__libcpp_isascii(c) ? 
static_cast(_DefaultRuneLocale.__mapupper[static_cast(c)]) : c; #elif defined(__NetBSD__) return static_cast(__classic_upper_table()[static_cast(c)]); #elif defined(__GLIBC__) || defined(__EMSCRIPTEN__) || defined(__MVS__) - return isascii(c) ? static_cast(__classic_upper_table()[static_cast(c)]) : c; + return std::__libcpp_isascii(c) ? static_cast(__classic_upper_table()[static_cast(c)]) : c; #else - return (isascii(c) && __locale::__islower(c, _LIBCPP_GET_C_LOCALE)) ? c - 'a' + 'A' : c; + return (std::__libcpp_isascii(c) && __locale::__islower(c, _LIBCPP_GET_C_LOCALE)) ? c - 'a' + 'A' : c; #endif } const char* ctype::do_toupper(char_type* low, const char_type* high) const { for (; low != high; ++low) #ifdef _LIBCPP_HAS_DEFAULTRUNELOCALE - *low = isascii(*low) ? static_cast(_DefaultRuneLocale.__mapupper[static_cast(*low)]) : *low; + *low = std::__libcpp_isascii(*low) + ? static_cast(_DefaultRuneLocale.__mapupper[static_cast(*low)]) + : *low; #elif defined(__NetBSD__) *low = static_cast(__classic_upper_table()[static_cast(*low)]); #elif defined(__GLIBC__) || defined(__EMSCRIPTEN__) || defined(__MVS__) - *low = isascii(*low) ? static_cast(__classic_upper_table()[static_cast(*low)]) : *low; + *low = std::__libcpp_isascii(*low) ? static_cast(__classic_upper_table()[static_cast(*low)]) : *low; #else - *low = (isascii(*low) && __locale::__islower(*low, _LIBCPP_GET_C_LOCALE)) ? *low - 'a' + 'A' : *low; + *low = (std::__libcpp_isascii(*low) && __locale::__islower(*low, _LIBCPP_GET_C_LOCALE)) ? *low - 'a' + 'A' : *low; #endif return low; } char ctype::do_tolower(char_type c) const { #ifdef _LIBCPP_HAS_DEFAULTRUNELOCALE - return isascii(c) ? static_cast(_DefaultRuneLocale.__maplower[static_cast(c)]) : c; + return std::__libcpp_isascii(c) ? static_cast(_DefaultRuneLocale.__maplower[static_cast(c)]) : c; #elif defined(__NetBSD__) return static_cast(__classic_lower_table()[static_cast(c)]); #elif defined(__GLIBC__) || defined(__EMSCRIPTEN__) || defined(__MVS__) - return isascii(c) ? static_cast(__classic_lower_table()[static_cast(c)]) : c; + return std::__libcpp_isascii(c) ? static_cast(__classic_lower_table()[static_cast(c)]) : c; #else - return (isascii(c) && __locale::__isupper(c, _LIBCPP_GET_C_LOCALE)) ? c - 'A' + 'a' : c; + return (std::__libcpp_isascii(c) && __locale::__isupper(c, _LIBCPP_GET_C_LOCALE)) ? c - 'A' + 'a' : c; #endif } const char* ctype::do_tolower(char_type* low, const char_type* high) const { for (; low != high; ++low) #ifdef _LIBCPP_HAS_DEFAULTRUNELOCALE - *low = isascii(*low) ? static_cast(_DefaultRuneLocale.__maplower[static_cast(*low)]) : *low; + *low = std::__libcpp_isascii(*low) + ? static_cast(_DefaultRuneLocale.__maplower[static_cast(*low)]) + : *low; #elif defined(__NetBSD__) *low = static_cast(__classic_lower_table()[static_cast(*low)]); #elif defined(__GLIBC__) || defined(__EMSCRIPTEN__) || defined(__MVS__) - *low = isascii(*low) ? static_cast(__classic_lower_table()[static_cast(*low)]) : *low; + *low = std::__libcpp_isascii(*low) ? static_cast(__classic_lower_table()[static_cast(*low)]) : *low; #else - *low = (isascii(*low) && __locale::__isupper(*low, _LIBCPP_GET_C_LOCALE)) ? *low - 'A' + 'a' : *low; + *low = (std::__libcpp_isascii(*low) && __locale::__isupper(*low, _LIBCPP_GET_C_LOCALE)) ? 
*low - 'A' + 'a' : *low; #endif return low; } @@ -875,14 +880,14 @@ const char* ctype::do_widen(const char* low, const char* high, char_type* } char ctype::do_narrow(char_type c, char dfault) const { - if (isascii(c)) + if (std::__libcpp_isascii(c)) return static_cast(c); return dfault; } const char* ctype::do_narrow(const char_type* low, const char_type* high, char dfault, char* dest) const { for (; low != high; ++low, ++dest) - if (isascii(*low)) + if (std::__libcpp_isascii(*low)) *dest = *low; else *dest = dfault; @@ -1140,7 +1145,7 @@ bool ctype_byname::do_is(mask m, char_type c) const { const wchar_t* ctype_byname::do_is(const char_type* low, const char_type* high, mask* vec) const { for (; low != high; ++low, ++vec) { - if (isascii(*low)) + if (std::__libcpp_isascii(*low)) *vec = static_cast(ctype::classic_table()[*low]); else { *vec = 0; From 0b5cf9e17bd2f2fb9ee3a7dc2b4ef99fba3ae201 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Fri, 10 Jan 2025 15:21:58 -0500 Subject: [PATCH 106/408] [libc++] Add missing iswctype_l in posix_l_fallbacks (#122484) --- libcxx/include/__support/xlocale/__posix_l_fallback.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libcxx/include/__support/xlocale/__posix_l_fallback.h b/libcxx/include/__support/xlocale/__posix_l_fallback.h index cd1d52bcb2af6..a56de0a5d78cd 100644 --- a/libcxx/include/__support/xlocale/__posix_l_fallback.h +++ b/libcxx/include/__support/xlocale/__posix_l_fallback.h @@ -38,6 +38,8 @@ inline _LIBCPP_HIDE_FROM_ABI int toupper_l(int __c, locale_t) { return ::toupper inline _LIBCPP_HIDE_FROM_ABI int tolower_l(int __c, locale_t) { return ::tolower(__c); } #if _LIBCPP_HAS_WIDE_CHARACTERS +inline _LIBCPP_HIDE_FROM_ABI int iswctype_l(wint_t __c, wctype_t __type, locale_t) { return ::iswctype(__c, __type); } + inline _LIBCPP_HIDE_FROM_ABI int iswalpha_l(wint_t __c, locale_t) { return ::iswalpha(__c); } inline _LIBCPP_HIDE_FROM_ABI int iswblank_l(wint_t __c, locale_t) { return ::iswblank(__c); } From 008a39c0e3f934c7eb0dd04aa5759a0feac65967 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 10 Jan 2025 12:27:32 -0800 Subject: [PATCH 107/408] [lldb] Migrate away from PointerUnion::{is,get} (NFC) (#122420) Note that PointerUnion::{is,get} have been soft deprecated in PointerUnion.h: // FIXME: Replace the uses of is(), get() and dyn_cast() with // isa, cast and the llvm::dyn_cast --- .../Plugins/ObjectFile/ELF/ObjectFileELF.cpp | 58 +++++++++---------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp index b16cb114bec88..6452baa4f84af 100644 --- a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp +++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp @@ -123,7 +123,7 @@ class ELFRelocation { static elf_sxword RelocAddend64(const ELFRelocation &rel); - bool IsRela() { return (reloc.is()); } + bool IsRela() { return (llvm::isa(reloc)); } private: typedef llvm::PointerUnion RelocUnion; @@ -144,74 +144,74 @@ ELFRelocation::ELFRelocation(unsigned type) { } ELFRelocation::~ELFRelocation() { - if (reloc.is()) - delete reloc.get(); + if (auto *elfrel = llvm::dyn_cast(reloc)) + delete elfrel; else - delete reloc.get(); + delete llvm::cast(reloc); } bool ELFRelocation::Parse(const lldb_private::DataExtractor &data, lldb::offset_t *offset) { - if (reloc.is()) - return reloc.get()->Parse(data, offset); + if (auto *elfrel = llvm::dyn_cast(reloc)) + return elfrel->Parse(data, offset); else - return 
reloc.get()->Parse(data, offset); + return llvm::cast(reloc)->Parse(data, offset); } unsigned ELFRelocation::RelocType32(const ELFRelocation &rel) { - if (rel.reloc.is()) - return ELFRel::RelocType32(*rel.reloc.get()); + if (auto *elfrel = llvm::dyn_cast(rel.reloc)) + return ELFRel::RelocType32(*elfrel); else - return ELFRela::RelocType32(*rel.reloc.get()); + return ELFRela::RelocType32(*llvm::cast(rel.reloc)); } unsigned ELFRelocation::RelocType64(const ELFRelocation &rel) { - if (rel.reloc.is()) - return ELFRel::RelocType64(*rel.reloc.get()); + if (auto *elfrel = llvm::dyn_cast(rel.reloc)) + return ELFRel::RelocType64(*elfrel); else - return ELFRela::RelocType64(*rel.reloc.get()); + return ELFRela::RelocType64(*llvm::cast(rel.reloc)); } unsigned ELFRelocation::RelocSymbol32(const ELFRelocation &rel) { - if (rel.reloc.is()) - return ELFRel::RelocSymbol32(*rel.reloc.get()); + if (auto *elfrel = llvm::dyn_cast(rel.reloc)) + return ELFRel::RelocSymbol32(*elfrel); else - return ELFRela::RelocSymbol32(*rel.reloc.get()); + return ELFRela::RelocSymbol32(*llvm::cast(rel.reloc)); } unsigned ELFRelocation::RelocSymbol64(const ELFRelocation &rel) { - if (rel.reloc.is()) - return ELFRel::RelocSymbol64(*rel.reloc.get()); + if (auto *elfrel = llvm::dyn_cast(rel.reloc)) + return ELFRel::RelocSymbol64(*elfrel); else - return ELFRela::RelocSymbol64(*rel.reloc.get()); + return ELFRela::RelocSymbol64(*llvm::cast(rel.reloc)); } elf_addr ELFRelocation::RelocOffset32(const ELFRelocation &rel) { - if (rel.reloc.is()) - return rel.reloc.get()->r_offset; + if (auto *elfrel = llvm::dyn_cast(rel.reloc)) + return elfrel->r_offset; else - return rel.reloc.get()->r_offset; + return llvm::cast(rel.reloc)->r_offset; } elf_addr ELFRelocation::RelocOffset64(const ELFRelocation &rel) { - if (rel.reloc.is()) - return rel.reloc.get()->r_offset; + if (auto *elfrel = llvm::dyn_cast(rel.reloc)) + return elfrel->r_offset; else - return rel.reloc.get()->r_offset; + return llvm::cast(rel.reloc)->r_offset; } elf_sxword ELFRelocation::RelocAddend32(const ELFRelocation &rel) { - if (rel.reloc.is()) + if (llvm::isa(rel.reloc)) return 0; else - return rel.reloc.get()->r_addend; + return llvm::cast(rel.reloc)->r_addend; } elf_sxword ELFRelocation::RelocAddend64(const ELFRelocation &rel) { - if (rel.reloc.is()) + if (llvm::isa(rel.reloc)) return 0; else - return rel.reloc.get()->r_addend; + return llvm::cast(rel.reloc)->r_addend; } static user_id_t SegmentID(size_t PHdrIndex) { From 8e6261fff122590a75604340cb3fcaa121e85b46 Mon Sep 17 00:00:00 2001 From: Farzon Lotfi Date: Fri, 10 Jan 2025 15:37:37 -0500 Subject: [PATCH 108/408] [HLSL] Implement the HLSL distance intrinsic (#122357) - Hook of SPIRV builtin - Implement Distance as length(X - Y) --- clang/lib/Headers/hlsl/hlsl_detail.h | 15 ++ clang/lib/Headers/hlsl/hlsl_intrinsics.h | 29 ++++ clang/test/CodeGenHLSL/builtins/distance.hlsl | 135 ++++++++++++++++++ .../SemaHLSL/BuiltIns/distance-errors.hlsl | 33 +++++ 4 files changed, 212 insertions(+) create mode 100644 clang/test/CodeGenHLSL/builtins/distance.hlsl create mode 100644 clang/test/SemaHLSL/BuiltIns/distance-errors.hlsl diff --git a/clang/lib/Headers/hlsl/hlsl_detail.h b/clang/lib/Headers/hlsl/hlsl_detail.h index 33d394f7883a6..19d83ea5471c7 100644 --- a/clang/lib/Headers/hlsl/hlsl_detail.h +++ b/clang/lib/Headers/hlsl/hlsl_detail.h @@ -53,6 +53,21 @@ length_vec_impl(vector X) { return __builtin_elementwise_sqrt(__builtin_hlsl_dot(X, X)); } +template +constexpr enable_if_t::value || is_same::value, T> +distance_impl(T X, T Y) { 
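+  // Note: per the commit message, distance(X, Y) is defined as
+  // length(X - Y); for scalar X and Y this reduces to the absolute
+  // difference |X - Y| (hence the llvm.fabs calls in the scalar
+  // CodeGen tests below).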
+ return length_impl(X - Y); +} + +template +constexpr enable_if_t::value || is_same::value, T> +distance_vec_impl(vector X, vector Y) { +#if (__has_builtin(__builtin_spirv_distance)) + return __builtin_spirv_distance(X, Y); +#else + return length_vec_impl(X - Y); +#endif +} } // namespace __detail } // namespace hlsl #endif //_HLSL_HLSL_DETAILS_H_ diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h index e5ff7531d25d9..d2d3abd92ea6a 100644 --- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h +++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h @@ -871,6 +871,35 @@ float3 degrees(float3); _HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_degrees) float4 degrees(float4); +//===----------------------------------------------------------------------===// +// distance builtins +//===----------------------------------------------------------------------===// + +/// \fn K distance(T X, T Y) +/// \brief Returns a distance scalar between \a X and \a Y. +/// \param X The X input value. +/// \param Y The Y input value. + +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +const inline half distance(half X, half Y) { + return __detail::distance_impl(X, Y); +} + +const inline float distance(float X, float Y) { + return __detail::distance_impl(X, Y); +} + +template +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +const inline half distance(vector X, vector Y) { + return __detail::distance_vec_impl(X, Y); +} + +template +const inline float distance(vector X, vector Y) { + return __detail::distance_vec_impl(X, Y); +} + //===----------------------------------------------------------------------===// // dot product builtins //===----------------------------------------------------------------------===// diff --git a/clang/test/CodeGenHLSL/builtins/distance.hlsl b/clang/test/CodeGenHLSL/builtins/distance.hlsl new file mode 100644 index 0000000000000..6952700a87f1d --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/distance.hlsl @@ -0,0 +1,135 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: -emit-llvm -O1 -o - | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -triple \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: -emit-llvm -O1 -o - | FileCheck %s --check-prefix=SPVCHECK + +// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z18test_distance_halfDhDh( +// CHECK-SAME: half noundef nofpclass(nan inf) [[X:%.*]], half noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn half [[X]], [[Y]] +// CHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.fabs.f16(half [[SUB_I]]) +// CHECK-NEXT: ret half [[ELT_ABS_I]] +// +// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z18test_distance_halfDhDh( +// SPVCHECK-SAME: half noundef nofpclass(nan inf) [[X:%.*]], half noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// SPVCHECK-NEXT: [[ENTRY:.*:]] +// SPVCHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn half [[X]], [[Y]] +// SPVCHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.fabs.f16(half [[SUB_I]]) +// SPVCHECK-NEXT: ret half [[ELT_ABS_I]] +// +half test_distance_half(half X, half Y) { return distance(X, Y); } + +// 
CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z19test_distance_half2Dv2_DhS_( +// CHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[X:%.*]], <2 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x half> [[X]], [[Y]] +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v2f16(<2 x half> [[SUB_I]], <2 x half> [[SUB_I]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]]) +// CHECK-NEXT: ret half [[TMP0]] +// +// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z19test_distance_half2Dv2_DhS_( +// SPVCHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[X:%.*]], <2 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SPVCHECK-NEXT: [[ENTRY:.*:]] +// SPVCHECK-NEXT: [[SPV_DISTANCE_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.distance.v2f16(<2 x half> [[X]], <2 x half> [[Y]]) +// SPVCHECK-NEXT: ret half [[SPV_DISTANCE_I]] +// +half test_distance_half2(half2 X, half2 Y) { return distance(X, Y); } + +// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z19test_distance_half3Dv3_DhS_( +// CHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[X:%.*]], <3 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x half> [[X]], [[Y]] +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v3f16(<3 x half> [[SUB_I]], <3 x half> [[SUB_I]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]]) +// CHECK-NEXT: ret half [[TMP0]] +// +// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z19test_distance_half3Dv3_DhS_( +// SPVCHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[X:%.*]], <3 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SPVCHECK-NEXT: [[ENTRY:.*:]] +// SPVCHECK-NEXT: [[SPV_DISTANCE_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.distance.v3f16(<3 x half> [[X]], <3 x half> [[Y]]) +// SPVCHECK-NEXT: ret half [[SPV_DISTANCE_I]] +// +half test_distance_half3(half3 X, half3 Y) { return distance(X, Y); } + +// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z19test_distance_half4Dv4_DhS_( +// CHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[X:%.*]], <4 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x half> [[X]], [[Y]] +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v4f16(<4 x half> [[SUB_I]], <4 x half> [[SUB_I]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]]) +// CHECK-NEXT: ret half [[TMP0]] +// +// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z19test_distance_half4Dv4_DhS_( +// SPVCHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[X:%.*]], <4 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SPVCHECK-NEXT: [[ENTRY:.*:]] +// SPVCHECK-NEXT: [[SPV_DISTANCE_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.distance.v4f16(<4 x half> [[X]], <4 x 
half> [[Y]]) +// SPVCHECK-NEXT: ret half [[SPV_DISTANCE_I]] +// +half test_distance_half4(half4 X, half4 Y) { return distance(X, Y); } + +// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z19test_distance_floatff( +// CHECK-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn float [[X]], [[Y]] +// CHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.fabs.f32(float [[SUB_I]]) +// CHECK-NEXT: ret float [[ELT_ABS_I]] +// +// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z19test_distance_floatff( +// SPVCHECK-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SPVCHECK-NEXT: [[ENTRY:.*:]] +// SPVCHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn float [[X]], [[Y]] +// SPVCHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.fabs.f32(float [[SUB_I]]) +// SPVCHECK-NEXT: ret float [[ELT_ABS_I]] +// +float test_distance_float(float X, float Y) { return distance(X, Y); } + +// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z20test_distance_float2Dv2_fS_( +// CHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[X:%.*]], <2 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x float> [[X]], [[Y]] +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v2f32(<2 x float> [[SUB_I]], <2 x float> [[SUB_I]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]]) +// CHECK-NEXT: ret float [[TMP0]] +// +// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z20test_distance_float2Dv2_fS_( +// SPVCHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[X:%.*]], <2 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SPVCHECK-NEXT: [[ENTRY:.*:]] +// SPVCHECK-NEXT: [[SPV_DISTANCE_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.distance.v2f32(<2 x float> [[X]], <2 x float> [[Y]]) +// SPVCHECK-NEXT: ret float [[SPV_DISTANCE_I]] +// +float test_distance_float2(float2 X, float2 Y) { return distance(X, Y); } + +// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z20test_distance_float3Dv3_fS_( +// CHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[X:%.*]], <3 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x float> [[X]], [[Y]] +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v3f32(<3 x float> [[SUB_I]], <3 x float> [[SUB_I]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]]) +// CHECK-NEXT: ret float [[TMP0]] +// +// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z20test_distance_float3Dv3_fS_( +// SPVCHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[X:%.*]], <3 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SPVCHECK-NEXT: [[ENTRY:.*:]] +// SPVCHECK-NEXT: [[SPV_DISTANCE_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn 
noundef float @llvm.spv.distance.v3f32(<3 x float> [[X]], <3 x float> [[Y]]) +// SPVCHECK-NEXT: ret float [[SPV_DISTANCE_I]] +// +float test_distance_float3(float3 X, float3 Y) { return distance(X, Y); } + +// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z20test_distance_float4Dv4_fS_( +// CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[X:%.*]], <4 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x float> [[X]], [[Y]] +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v4f32(<4 x float> [[SUB_I]], <4 x float> [[SUB_I]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]]) +// CHECK-NEXT: ret float [[TMP0]] +// +// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z20test_distance_float4Dv4_fS_( +// SPVCHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[X:%.*]], <4 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SPVCHECK-NEXT: [[ENTRY:.*:]] +// SPVCHECK-NEXT: [[SPV_DISTANCE_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.distance.v4f32(<4 x float> [[X]], <4 x float> [[Y]]) +// SPVCHECK-NEXT: ret float [[SPV_DISTANCE_I]] +// +float test_distance_float4(float4 X, float4 Y) { return distance(X, Y); } diff --git a/clang/test/SemaHLSL/BuiltIns/distance-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/distance-errors.hlsl new file mode 100644 index 0000000000000..e996bf5d2cb7c --- /dev/null +++ b/clang/test/SemaHLSL/BuiltIns/distance-errors.hlsl @@ -0,0 +1,33 @@ +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify + +float test_no_second_arg(float2 p0) { + return distance(p0); + // expected-error@-1 {{no matching function for call to 'distance'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function not viable: requires 2 arguments, but 1 was provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function not viable: requires 2 arguments, but 1 was provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 2 arguments, but 1 was provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 2 arguments, but 1 was provided}} +} + +float test_too_many_arg(float2 p0) { + return distance(p0, p0, p0); + // expected-error@-1 {{no matching function for call to 'distance'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function not viable: requires 2 arguments, but 3 were provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function not viable: requires 2 arguments, but 3 were provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 2 arguments, but 3 were provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 2 arguments, but 3 were provided}} +} + +float test_double_inputs(double p0, double p1) { + return distance(p0, p1); + // expected-error@-1 {{call to 'distance' is ambiguous}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}} +} + +float test_int_inputs(int p0, int p1) { + return distance(p0, p1); + // expected-error@-1 {{call to 'distance' is ambiguous}} + 
// expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}}
+  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}}
+}

From 55b587506e5dccb436e5405b7236671112b36244 Mon Sep 17 00:00:00 2001
From: Thurston Dang
Date: Fri, 10 Jan 2025 12:40:57 -0800
Subject: [PATCH 109/408] [ubsan][NFCI] Use SanitizerOrdinal instead of
 SanitizerMask for EmitCheck (exactly one sanitizer is required) (#122511)

The `Checked` parameter of `CodeGenFunction::EmitCheck` is of type
`ArrayRef<std::pair<llvm::Value *, SanitizerMask>>`, which is overly
generalized: SanitizerMask can denote that zero or more sanitizers are
enabled, but `EmitCheck` requires that exactly one sanitizer is
specified in the SanitizerMask (e.g., `SanitizeTrap.has(Checked[i].second)`
enforces that).

This patch replaces SanitizerMask with SanitizerOrdinal in the `Checked`
parameter of `EmitCheck` and code that transitively relies on it. This
should not affect the behavior of UBSan, but it has the advantages that:
- the code is clearer: it avoids ambiguity in EmitCheck about what to do
  if multiple bits are set
- specifying the wrong number of sanitizers in `Checked[i].second` will
  be detected as a compile-time error, rather than a runtime assertion
  failure

Suggested by Vitaly in https://github.com/llvm/llvm-project/pull/122392
as an alternative to adding an explicit runtime assertion that the
SanitizerMask contains exactly one sanitizer.
---
 clang/include/clang/Basic/Sanitizers.h |  4 ++
 clang/lib/CodeGen/CGBuiltin.cpp        |  6 +-
 clang/lib/CodeGen/CGCall.cpp           | 12 ++--
 clang/lib/CodeGen/CGClass.cpp          | 15 ++---
 clang/lib/CodeGen/CGDecl.cpp           |  2 +-
 clang/lib/CodeGen/CGExpr.cpp           | 73 +++++++++++-----------
 clang/lib/CodeGen/CGExprScalar.cpp     | 84 ++++++++++++++------------
 clang/lib/CodeGen/CGObjC.cpp           |  2 +-
 clang/lib/CodeGen/CodeGenFunction.cpp  |  9 +--
 clang/lib/CodeGen/CodeGenFunction.h    | 13 ++--
 clang/lib/CodeGen/ItaniumCXXABI.cpp    |  4 +-
 11 files changed, 124 insertions(+), 100 deletions(-)

diff --git a/clang/include/clang/Basic/Sanitizers.h b/clang/include/clang/Basic/Sanitizers.h
index 2ff1acb772094..fc0576d452b17 100644
--- a/clang/include/clang/Basic/Sanitizers.h
+++ b/clang/include/clang/Basic/Sanitizers.h
@@ -171,6 +171,10 @@ struct SanitizerSet {
     return static_cast<bool>(Mask & K);
   }
 
+  bool has(SanitizerKind::SanitizerOrdinal O) const {
+    return has(SanitizerMask::bitPosToMask(O));
+  }
+
   /// Check if one or more sanitizers are enabled.
bool hasOneOf(SanitizerMask K) const { return static_cast(Mask & K); } diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 2b09197669996..1b25d365932c3 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -2240,7 +2240,7 @@ Value *CodeGenFunction::EmitCheckedArgForBuiltin(const Expr *E, SanitizerScope SanScope(this); Value *Cond = Builder.CreateICmpNE( ArgValue, llvm::Constant::getNullValue(ArgValue->getType())); - EmitCheck(std::make_pair(Cond, SanitizerKind::Builtin), + EmitCheck(std::make_pair(Cond, SanitizerKind::SO_Builtin), SanitizerHandler::InvalidBuiltin, {EmitCheckSourceLocation(E->getExprLoc()), llvm::ConstantInt::get(Builder.getInt8Ty(), Kind)}, @@ -2255,7 +2255,7 @@ Value *CodeGenFunction::EmitCheckedArgForAssume(const Expr *E) { SanitizerScope SanScope(this); EmitCheck( - std::make_pair(ArgValue, SanitizerKind::Builtin), + std::make_pair(ArgValue, SanitizerKind::SO_Builtin), SanitizerHandler::InvalidBuiltin, {EmitCheckSourceLocation(E->getExprLoc()), llvm::ConstantInt::get(Builder.getInt8Ty(), BCK_AssumePassedFalse)}, @@ -2290,7 +2290,7 @@ static Value *EmitOverflowCheckedAbs(CodeGenFunction &CGF, const CallExpr *E, // TODO: support -ftrapv-handler. if (SanitizeOverflow) { - CGF.EmitCheck({{NotOverflow, SanitizerKind::SignedIntegerOverflow}}, + CGF.EmitCheck({{NotOverflow, SanitizerKind::SO_SignedIntegerOverflow}}, SanitizerHandler::NegateOverflow, {CGF.EmitCheckSourceLocation(E->getArg(0)->getExprLoc()), CGF.EmitCheckTypeDescriptor(E->getType())}, diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 7b0ef4be98619..379ce00b739ae 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -4024,20 +4024,20 @@ void CodeGenFunction::EmitReturnValueCheck(llvm::Value *RV) { // Prefer the returns_nonnull attribute if it's present. 
SourceLocation AttrLoc; - SanitizerMask CheckKind; + SanitizerKind::SanitizerOrdinal CheckKind; SanitizerHandler Handler; if (RetNNAttr) { assert(!requiresReturnValueNullabilityCheck() && "Cannot check nullability and the nonnull attribute"); AttrLoc = RetNNAttr->getLocation(); - CheckKind = SanitizerKind::ReturnsNonnullAttribute; + CheckKind = SanitizerKind::SO_ReturnsNonnullAttribute; Handler = SanitizerHandler::NonnullReturn; } else { if (auto *DD = dyn_cast(CurCodeDecl)) if (auto *TSI = DD->getTypeSourceInfo()) if (auto FTL = TSI->getTypeLoc().getAsAdjusted()) AttrLoc = FTL.getReturnLoc().findNullabilityLoc(); - CheckKind = SanitizerKind::NullabilityReturn; + CheckKind = SanitizerKind::SO_NullabilityReturn; Handler = SanitizerHandler::NullabilityReturn; } @@ -4419,15 +4419,15 @@ void CodeGenFunction::EmitNonNullArgCheck(RValue RV, QualType ArgType, return; SourceLocation AttrLoc; - SanitizerMask CheckKind; + SanitizerKind::SanitizerOrdinal CheckKind; SanitizerHandler Handler; if (NNAttr) { AttrLoc = NNAttr->getLocation(); - CheckKind = SanitizerKind::NonnullAttribute; + CheckKind = SanitizerKind::SO_NonnullAttribute; Handler = SanitizerHandler::NonnullArg; } else { AttrLoc = PVD->getTypeSourceInfo()->getTypeLoc().findNullabilityLoc(); - CheckKind = SanitizerKind::NullabilityArg; + CheckKind = SanitizerKind::SO_NullabilityArg; Handler = SanitizerHandler::NullabilityArg; } diff --git a/clang/lib/CodeGen/CGClass.cpp b/clang/lib/CodeGen/CGClass.cpp index c45688bd1ed3c..16a1c1cebdaa4 100644 --- a/clang/lib/CodeGen/CGClass.cpp +++ b/clang/lib/CodeGen/CGClass.cpp @@ -2843,23 +2843,23 @@ void CodeGenFunction::EmitVTablePtrCheck(const CXXRecordDecl *RD, !CGM.HasHiddenLTOVisibility(RD)) return; - SanitizerMask M; + SanitizerKind::SanitizerOrdinal M; llvm::SanitizerStatKind SSK; switch (TCK) { case CFITCK_VCall: - M = SanitizerKind::CFIVCall; + M = SanitizerKind::SO_CFIVCall; SSK = llvm::SanStat_CFI_VCall; break; case CFITCK_NVCall: - M = SanitizerKind::CFINVCall; + M = SanitizerKind::SO_CFINVCall; SSK = llvm::SanStat_CFI_NVCall; break; case CFITCK_DerivedCast: - M = SanitizerKind::CFIDerivedCast; + M = SanitizerKind::SO_CFIDerivedCast; SSK = llvm::SanStat_CFI_DerivedCast; break; case CFITCK_UnrelatedCast: - M = SanitizerKind::CFIUnrelatedCast; + M = SanitizerKind::SO_CFIUnrelatedCast; SSK = llvm::SanStat_CFI_UnrelatedCast; break; case CFITCK_ICall: @@ -2869,7 +2869,8 @@ void CodeGenFunction::EmitVTablePtrCheck(const CXXRecordDecl *RD, } std::string TypeName = RD->getQualifiedNameAsString(); - if (getContext().getNoSanitizeList().containsType(M, TypeName)) + if (getContext().getNoSanitizeList().containsType( + SanitizerMask::bitPosToMask(M), TypeName)) return; SanitizerScope SanScope(this); @@ -2945,7 +2946,7 @@ llvm::Value *CodeGenFunction::EmitVTableTypeCheckedLoad( if (SanOpts.has(SanitizerKind::CFIVCall) && !getContext().getNoSanitizeList().containsType(SanitizerKind::CFIVCall, TypeName)) { - EmitCheck(std::make_pair(CheckResult, SanitizerKind::CFIVCall), + EmitCheck(std::make_pair(CheckResult, SanitizerKind::SO_CFIVCall), SanitizerHandler::CFICheckFail, {}, {}); } diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp index 6f3ff050cb697..f85e0b2c404f9 100644 --- a/clang/lib/CodeGen/CGDecl.cpp +++ b/clang/lib/CodeGen/CGDecl.cpp @@ -762,7 +762,7 @@ void CodeGenFunction::EmitNullabilityCheck(LValue LHS, llvm::Value *RHS, EmitCheckSourceLocation(Loc), EmitCheckTypeDescriptor(LHS.getType()), llvm::ConstantInt::get(Int8Ty, 0), // The LogAlignment info is unused. 
      llvm::ConstantInt::get(Int8Ty, TCK_NonnullAssign)};
-  EmitCheck({{IsNotNull, SanitizerKind::NullabilityAssign}},
+  EmitCheck({{IsNotNull, SanitizerKind::SO_NullabilityAssign}},
             SanitizerHandler::TypeMismatch, StaticData, RHS);
 }
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 1bad7a722da07..060d02b7f1487 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -735,7 +735,8 @@ void CodeGenFunction::EmitTypeCheck(TypeCheckKind TCK, SourceLocation Loc,

   SanitizerScope SanScope(this);

-  SmallVector<std::pair<llvm::Value *, SanitizerMask>, 3> Checks;
+  SmallVector<std::pair<llvm::Value *, SanitizerKind::SanitizerOrdinal>, 3>
+      Checks;
   llvm::BasicBlock *Done = nullptr;

   // Quickly determine whether we have a pointer to an alloca. It's possible
@@ -767,7 +768,7 @@ void CodeGenFunction::EmitTypeCheck(TypeCheckKind TCK, SourceLocation Loc,
       Builder.CreateCondBr(IsNonNull, Rest, Done);
       EmitBlock(Rest);
     } else {
-      Checks.push_back(std::make_pair(IsNonNull, SanitizerKind::Null));
+      Checks.push_back(std::make_pair(IsNonNull, SanitizerKind::SO_Null));
     }
   }
 }
@@ -794,7 +795,8 @@ void CodeGenFunction::EmitTypeCheck(TypeCheckKind TCK, SourceLocation Loc,
     llvm::Value *Dynamic = Builder.getFalse();
     llvm::Value *LargeEnough = Builder.CreateICmpUGE(
         Builder.CreateCall(F, {Ptr, Min, NullIsUnknown, Dynamic}), Size);
-    Checks.push_back(std::make_pair(LargeEnough, SanitizerKind::ObjectSize));
+    Checks.push_back(
+        std::make_pair(LargeEnough, SanitizerKind::SO_ObjectSize));
   }
 }
@@ -818,7 +820,7 @@ void CodeGenFunction::EmitTypeCheck(TypeCheckKind TCK, SourceLocation Loc,
       llvm::Value *Aligned =
           Builder.CreateICmpEQ(Align, llvm::ConstantInt::get(IntPtrTy, 0));
       if (Aligned != True)
-        Checks.push_back(std::make_pair(Aligned, SanitizerKind::Alignment));
+        Checks.push_back(std::make_pair(Aligned, SanitizerKind::SO_Alignment));
     }
   }
@@ -902,7 +904,7 @@ void CodeGenFunction::EmitTypeCheck(TypeCheckKind TCK, SourceLocation Loc,
         llvm::ConstantInt::get(Int8Ty, TCK) };
     llvm::Value *DynamicData[] = { Ptr, Hash };
-    EmitCheck(std::make_pair(EqualHash, SanitizerKind::Vptr),
+    EmitCheck(std::make_pair(EqualHash, SanitizerKind::SO_Vptr),
              SanitizerHandler::DynamicTypeCacheMiss, StaticData,
              DynamicData);
   }
@@ -1220,7 +1222,7 @@ void CodeGenFunction::EmitBoundsCheckImpl(const Expr *E, llvm::Value *Bound,
   };
   llvm::Value *Check = Accessed ? Builder.CreateICmpULT(IndexVal, BoundVal)
                                 : Builder.CreateICmpULE(IndexVal, BoundVal);
-  EmitCheck(std::make_pair(Check, SanitizerKind::ArrayBounds),
+  EmitCheck(std::make_pair(Check, SanitizerKind::SO_ArrayBounds),
            SanitizerHandler::OutOfBounds, StaticData, Index);
 }

@@ -1960,8 +1962,8 @@ bool CodeGenFunction::EmitScalarRangeCheck(llvm::Value *Value, QualType Ty,
   }
   llvm::Constant *StaticArgs[] = {EmitCheckSourceLocation(Loc),
                                   EmitCheckTypeDescriptor(Ty)};
-  SanitizerMask Kind =
-      NeedsEnumCheck ? SanitizerKind::Enum : SanitizerKind::Bool;
+  SanitizerKind::SanitizerOrdinal Kind =
+      NeedsEnumCheck ? SanitizerKind::SO_Enum : SanitizerKind::SO_Bool;
   EmitCheck(std::make_pair(Check, Kind), SanitizerHandler::LoadInvalidValue,
             StaticArgs, EmitCheckValue(Value));
   return true;
@@ -3513,11 +3515,12 @@ enum class CheckRecoverableKind {
 };
 }

-static CheckRecoverableKind getRecoverableKind(SanitizerMask Kind) {
-  assert(Kind.countPopulation() == 1);
-  if (Kind == SanitizerKind::Vptr)
+static CheckRecoverableKind
+getRecoverableKind(SanitizerKind::SanitizerOrdinal Ordinal) {
+  if (Ordinal == SanitizerKind::SO_Vptr)
     return CheckRecoverableKind::AlwaysRecoverable;
-  else if (Kind == SanitizerKind::Return || Kind == SanitizerKind::Unreachable)
+  else if (Ordinal == SanitizerKind::SO_Return ||
+           Ordinal == SanitizerKind::SO_Unreachable)
     return CheckRecoverableKind::Unrecoverable;
   else
     return CheckRecoverableKind::Recoverable;
@@ -3589,7 +3592,7 @@ static void emitCheckHandlerCall(CodeGenFunction &CGF,
 }

 void CodeGenFunction::EmitCheck(
-    ArrayRef<std::pair<llvm::Value *, SanitizerMask>> Checked,
+    ArrayRef<std::pair<llvm::Value *, SanitizerKind::SanitizerOrdinal>> Checked,
     SanitizerHandler CheckHandler, ArrayRef<llvm::Constant *> StaticArgs,
     ArrayRef<llvm::Value *> DynamicArgs) {
   assert(IsSanitizerScope);
@@ -3715,8 +3718,9 @@ void CodeGenFunction::EmitCheck(
 }

 void CodeGenFunction::EmitCfiSlowPathCheck(
-    SanitizerMask Kind, llvm::Value *Cond, llvm::ConstantInt *TypeId,
-    llvm::Value *Ptr, ArrayRef<llvm::Constant *> StaticArgs) {
+    SanitizerKind::SanitizerOrdinal Ordinal, llvm::Value *Cond,
+    llvm::ConstantInt *TypeId, llvm::Value *Ptr,
+    ArrayRef<llvm::Constant *> StaticArgs) {
   llvm::BasicBlock *Cont = createBasicBlock("cfi.cont");

   llvm::BasicBlock *CheckBB = createBasicBlock("cfi.slowpath");
@@ -3728,7 +3732,7 @@ void CodeGenFunction::EmitCfiSlowPathCheck(

   EmitBlock(CheckBB);

-  bool WithDiag = !CGM.getCodeGenOpts().SanitizeTrap.has(Kind);
+  bool WithDiag = !CGM.getCodeGenOpts().SanitizeTrap.has(Ordinal);

   llvm::CallInst *CheckCall;
   llvm::FunctionCallee SlowPathFn;
@@ -3860,22 +3864,23 @@ void CodeGenFunction::EmitCfiCheckFail() {
                     {Addr, AllVtables}),
       IntPtrTy);

-  const std::pair<int, SanitizerMask> CheckKinds[] = {
-      {CFITCK_VCall, SanitizerKind::CFIVCall},
-      {CFITCK_NVCall, SanitizerKind::CFINVCall},
-      {CFITCK_DerivedCast, SanitizerKind::CFIDerivedCast},
-      {CFITCK_UnrelatedCast, SanitizerKind::CFIUnrelatedCast},
-      {CFITCK_ICall, SanitizerKind::CFIICall}};
-
-  SmallVector<std::pair<llvm::Value *, SanitizerMask>, 5> Checks;
-  for (auto CheckKindMaskPair : CheckKinds) {
-    int Kind = CheckKindMaskPair.first;
-    SanitizerMask Mask = CheckKindMaskPair.second;
+  const std::pair<int, SanitizerKind::SanitizerOrdinal> CheckKinds[] = {
+      {CFITCK_VCall, SanitizerKind::SO_CFIVCall},
+      {CFITCK_NVCall, SanitizerKind::SO_CFINVCall},
+      {CFITCK_DerivedCast, SanitizerKind::SO_CFIDerivedCast},
+      {CFITCK_UnrelatedCast, SanitizerKind::SO_CFIUnrelatedCast},
+      {CFITCK_ICall, SanitizerKind::SO_CFIICall}};
+
+  SmallVector<std::pair<llvm::Value *, SanitizerKind::SanitizerOrdinal>, 5>
+      Checks;
+  for (auto CheckKindOrdinalPair : CheckKinds) {
+    int Kind = CheckKindOrdinalPair.first;
+    SanitizerKind::SanitizerOrdinal Ordinal = CheckKindOrdinalPair.second;
     llvm::Value *Cond =
         Builder.CreateICmpNE(CheckKind, llvm::ConstantInt::get(Int8Ty, Kind));
-    if (CGM.getLangOpts().Sanitize.has(Mask))
-      EmitCheck(std::make_pair(Cond, Mask), SanitizerHandler::CFICheckFail, {},
-                {Data, Addr, ValidVtable});
+    if (CGM.getLangOpts().Sanitize.has(Ordinal))
+      EmitCheck(std::make_pair(Cond, Ordinal), SanitizerHandler::CFICheckFail,
+                {}, {Data, Addr, ValidVtable});
     else
       EmitTrapCheck(Cond, SanitizerHandler::CFICheckFail);
   }
@@ -3890,7 +3895,7 @@ void CodeGenFunction::EmitUnreachable(SourceLocation Loc) {
   if (SanOpts.has(SanitizerKind::Unreachable)) {
     SanitizerScope SanScope(this);
     EmitCheck(std::make_pair(static_cast<llvm::Value *>(Builder.getFalse()),
-                             SanitizerKind::Unreachable),
+                             SanitizerKind::SO_Unreachable),
              SanitizerHandler::BuiltinUnreachable,
              EmitCheckSourceLocation(Loc), {});
   }
@@ -6054,7 +6059,7 @@ RValue CodeGenFunction::EmitCall(QualType CalleeType,
         Builder.CreateICmpEQ(CalleeTypeHash, TypeHash);
     llvm::Constant *StaticData[] = {EmitCheckSourceLocation(E->getBeginLoc()),
                                     EmitCheckTypeDescriptor(CalleeType)};
-    EmitCheck(std::make_pair(CalleeTypeHashMatch, SanitizerKind::Function),
+    EmitCheck(std::make_pair(CalleeTypeHashMatch, SanitizerKind::SO_Function),
              SanitizerHandler::FunctionTypeMismatch, StaticData,
              {CalleePtr});
@@ -6091,10 +6096,10 @@ RValue CodeGenFunction::EmitCall(QualType CalleeType,
         EmitCheckTypeDescriptor(QualType(FnType, 0)),
     };
     if (CGM.getCodeGenOpts().SanitizeCfiCrossDso && CrossDsoTypeId) {
-      EmitCfiSlowPathCheck(SanitizerKind::CFIICall, TypeTest, CrossDsoTypeId,
+      EmitCfiSlowPathCheck(SanitizerKind::SO_CFIICall, TypeTest, CrossDsoTypeId,
                           CalleePtr, StaticData);
     } else {
-      EmitCheck(std::make_pair(TypeTest, SanitizerKind::CFIICall),
+      EmitCheck(std::make_pair(TypeTest, SanitizerKind::SO_CFIICall),
                SanitizerHandler::CFICheckFail, StaticData,
                {CalleePtr, llvm::UndefValue::get(IntPtrTy)});
     }
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index 090c4fb3ea39e..ac499e490ee87 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -280,8 +280,9 @@ class ScalarExprEmitter
     return CGF.EmitCheckedLValue(E, TCK);
   }

-  void EmitBinOpCheck(ArrayRef<std::pair<Value *, SanitizerMask>> Checks,
-                      const BinOpInfo &Info);
+  void EmitBinOpCheck(
+      ArrayRef<std::pair<Value *, SanitizerKind::SanitizerOrdinal>> Checks,
+      const BinOpInfo &Info);

   Value *EmitLoadOfLValue(LValue LV, SourceLocation Loc) {
     return CGF.EmitLoadOfLValue(LV, Loc).getScalarVal();
@@ -1047,14 +1048,14 @@ void ScalarExprEmitter::EmitFloatConversionCheck(
   llvm::Constant *StaticArgs[] = {CGF.EmitCheckSourceLocation(Loc),
                                   CGF.EmitCheckTypeDescriptor(OrigSrcType),
                                   CGF.EmitCheckTypeDescriptor(DstType)};
-  CGF.EmitCheck(std::make_pair(Check, SanitizerKind::FloatCastOverflow),
+  CGF.EmitCheck(std::make_pair(Check, SanitizerKind::SO_FloatCastOverflow),
                SanitizerHandler::FloatCastOverflow, StaticArgs, OrigSrc);
 }

 // Should be called within CodeGenFunction::SanitizerScope RAII scope.
 // Returns 'i1 false' when the truncation Src -> Dst was lossy.
 static std::pair<ScalarExprEmitter::ImplicitConversionCheckKind,
-                 std::pair<Value *, SanitizerMask>>
+                 std::pair<Value *, SanitizerKind::SanitizerOrdinal>>
 EmitIntegerTruncationCheckHelper(Value *Src, QualType SrcType, Value *Dst,
                                  QualType DstType, CGBuilderTy &Builder) {
   llvm::Type *SrcTy = Src->getType();
@@ -1073,13 +1074,13 @@ EmitIntegerTruncationCheckHelper(Value *Src, QualType SrcType, Value *Dst,
   // If both (src and dst) types are unsigned, then it's an unsigned truncation.
   // Else, it is a signed truncation.
   ScalarExprEmitter::ImplicitConversionCheckKind Kind;
-  SanitizerMask Mask;
+  SanitizerKind::SanitizerOrdinal Ordinal;
   if (!SrcSigned && !DstSigned) {
     Kind = ScalarExprEmitter::ICCK_UnsignedIntegerTruncation;
-    Mask = SanitizerKind::ImplicitUnsignedIntegerTruncation;
+    Ordinal = SanitizerKind::SO_ImplicitUnsignedIntegerTruncation;
   } else {
     Kind = ScalarExprEmitter::ICCK_SignedIntegerTruncation;
-    Mask = SanitizerKind::ImplicitSignedIntegerTruncation;
+    Ordinal = SanitizerKind::SO_ImplicitSignedIntegerTruncation;
   }

   llvm::Value *Check = nullptr;
@@ -1088,7 +1089,7 @@ EmitIntegerTruncationCheckHelper(Value *Src, QualType SrcType, Value *Dst,
   // 2. Equality-compare with the original source value
   Check = Builder.CreateICmpEQ(Check, Src, "truncheck");
   // If the comparison result is 'i1 false', then the truncation was lossy.
-  return std::make_pair(Kind, std::make_pair(Check, Mask));
+  return std::make_pair(Kind, std::make_pair(Check, Ordinal));
 }

 static bool PromotionIsPotentiallyEligibleForImplicitIntegerConversionCheck(
@@ -1128,7 +1129,7 @@ void ScalarExprEmitter::EmitIntegerTruncationCheck(Value *Src, QualType SrcType,
   CodeGenFunction::SanitizerScope SanScope(&CGF);

   std::pair<ScalarExprEmitter::ImplicitConversionCheckKind,
-            std::pair<Value *, SanitizerMask>>
+            std::pair<Value *, SanitizerKind::SanitizerOrdinal>>
       Check =
           EmitIntegerTruncationCheckHelper(Src, SrcType, Dst, DstType, Builder);
   // If the comparison result is 'i1 false', then the truncation was lossy.
@@ -1138,7 +1139,8 @@ void ScalarExprEmitter::EmitIntegerTruncationCheck(Value *Src, QualType SrcType,
     return;

   // Does some SSCL ignore this type?
-  if (CGF.getContext().isTypeIgnoredBySanitizer(Check.second.second, DstType))
+  if (CGF.getContext().isTypeIgnoredBySanitizer(
+          SanitizerMask::bitPosToMask(Check.second.second), DstType))
     return;

   llvm::Constant *StaticArgs[] = {
@@ -1169,7 +1171,7 @@ static llvm::Value *EmitIsNegativeTestHelper(Value *V, QualType VType,
 // Should be called within CodeGenFunction::SanitizerScope RAII scope.
 // Returns 'i1 false' when the conversion Src -> Dst changed the sign.
 static std::pair<ScalarExprEmitter::ImplicitConversionCheckKind,
-                 std::pair<Value *, SanitizerMask>>
+                 std::pair<Value *, SanitizerKind::SanitizerOrdinal>>
 EmitIntegerSignChangeCheckHelper(Value *Src, QualType SrcType, Value *Dst,
                                  QualType DstType, CGBuilderTy &Builder) {
   llvm::Type *SrcTy = Src->getType();
@@ -1205,13 +1207,13 @@ EmitIntegerSignChangeCheckHelper(Value *Src, QualType SrcType, Value *Dst,
   // If the comparison result is 'false', then the conversion changed the sign.
   return std::make_pair(
       ScalarExprEmitter::ICCK_IntegerSignChange,
-      std::make_pair(Check, SanitizerKind::ImplicitIntegerSignChange));
+      std::make_pair(Check, SanitizerKind::SO_ImplicitIntegerSignChange));
 }

 void ScalarExprEmitter::EmitIntegerSignChangeCheck(Value *Src, QualType SrcType,
                                                    Value *Dst, QualType DstType,
                                                    SourceLocation Loc) {
-  if (!CGF.SanOpts.has(SanitizerKind::ImplicitIntegerSignChange))
+  if (!CGF.SanOpts.has(SanitizerKind::SO_ImplicitIntegerSignChange))
     return;

   llvm::Type *SrcTy = Src->getType();
@@ -1265,12 +1267,14 @@ void ScalarExprEmitter::EmitIntegerSignChangeCheck(Value *Src, QualType SrcType,
   CodeGenFunction::SanitizerScope SanScope(&CGF);

   std::pair<ScalarExprEmitter::ImplicitConversionCheckKind,
-            std::pair<Value *, SanitizerMask>>
+            std::pair<Value *, SanitizerKind::SanitizerOrdinal>>
       Check;

   // Each of these checks needs to return 'false' when an issue was detected.
   ImplicitConversionCheckKind CheckKind;
-  llvm::SmallVector<std::pair<Value *, SanitizerMask>, 2> Checks;
+  llvm::SmallVector<std::pair<Value *, SanitizerKind::SanitizerOrdinal>,
+                    2>
+      Checks;
   // So we can 'and' all the checks together, and still get 'false',
   // if at least one of the checks detected an issue.
@@ -1303,7 +1307,7 @@ void ScalarExprEmitter::EmitIntegerSignChangeCheck(Value *Src, QualType SrcType,
 // Should be called within CodeGenFunction::SanitizerScope RAII scope.
 // Returns 'i1 false' when the truncation Src -> Dst was lossy.
 static std::pair<ScalarExprEmitter::ImplicitConversionCheckKind,
-                 std::pair<Value *, SanitizerMask>>
+                 std::pair<Value *, SanitizerKind::SanitizerOrdinal>>
 EmitBitfieldTruncationCheckHelper(Value *Src, QualType SrcType, Value *Dst,
                                   QualType DstType, CGBuilderTy &Builder) {
   bool SrcSigned = SrcType->isSignedIntegerOrEnumerationType();
@@ -1323,13 +1327,14 @@ EmitBitfieldTruncationCheckHelper(Value *Src, QualType SrcType, Value *Dst,

   // If the comparison result is 'i1 false', then the truncation was lossy.
   return std::make_pair(
-      Kind, std::make_pair(Check, SanitizerKind::ImplicitBitfieldConversion));
+      Kind,
+      std::make_pair(Check, SanitizerKind::SO_ImplicitBitfieldConversion));
 }

 // Should be called within CodeGenFunction::SanitizerScope RAII scope.
 // Returns 'i1 false' when the conversion Src -> Dst changed the sign.
 static std::pair<ScalarExprEmitter::ImplicitConversionCheckKind,
-                 std::pair<Value *, SanitizerMask>>
+                 std::pair<Value *, SanitizerKind::SanitizerOrdinal>>
 EmitBitfieldSignChangeCheckHelper(Value *Src, QualType SrcType, Value *Dst,
                                   QualType DstType, CGBuilderTy &Builder) {
   // 1. Was the old Value negative?
@@ -1348,7 +1353,7 @@ EmitBitfieldSignChangeCheckHelper(Value *Src, QualType SrcType, Value *Dst,
   // If the comparison result is 'false', then the conversion changed the sign.
   return std::make_pair(
       ScalarExprEmitter::ICCK_IntegerSignChange,
-      std::make_pair(Check, SanitizerKind::ImplicitBitfieldConversion));
+      std::make_pair(Check, SanitizerKind::SO_ImplicitBitfieldConversion));
 }

 void CodeGenFunction::EmitBitfieldConversionCheck(Value *Src, QualType SrcType,
@@ -1383,7 +1388,7 @@ void CodeGenFunction::EmitBitfieldConversionCheck(Value *Src, QualType SrcType,
   CodeGenFunction::SanitizerScope SanScope(this);

   std::pair<ScalarExprEmitter::ImplicitConversionCheckKind,
-            std::pair<Value *, SanitizerMask>>
+            std::pair<Value *, SanitizerKind::SanitizerOrdinal>>
       Check;

   // Truncation
@@ -1774,7 +1779,8 @@ Value *ScalarExprEmitter::EmitNullValue(QualType Ty) {
 /// operation). The check passes if all values in \p Checks (which are \c i1),
 /// are \c true.
 void ScalarExprEmitter::EmitBinOpCheck(
-    ArrayRef<std::pair<Value *, SanitizerMask>> Checks, const BinOpInfo &Info) {
+    ArrayRef<std::pair<Value *, SanitizerKind::SanitizerOrdinal>> Checks,
+    const BinOpInfo &Info) {
   assert(CGF.IsSanitizerScope);
   SanitizerHandler Check;
   SmallVector<llvm::Constant *, 4> StaticData;
@@ -3773,11 +3779,12 @@ Value *ScalarExprEmitter::EmitCompoundAssign(const CompoundAssignOperator *E,

 void ScalarExprEmitter::EmitUndefinedBehaviorIntegerDivAndRemCheck(
     const BinOpInfo &Ops, llvm::Value *Zero, bool isDiv) {
-  SmallVector<std::pair<Value *, SanitizerMask>, 2> Checks;
+  SmallVector<std::pair<Value *, SanitizerKind::SanitizerOrdinal>, 2>
+      Checks;

   if (CGF.SanOpts.has(SanitizerKind::IntegerDivideByZero)) {
     Checks.push_back(std::make_pair(Builder.CreateICmpNE(Ops.RHS, Zero),
-                                    SanitizerKind::IntegerDivideByZero));
+                                    SanitizerKind::SO_IntegerDivideByZero));
   }

   const auto *BO = cast<BinaryOperator>(Ops.E);
@@ -3795,7 +3802,7 @@ void ScalarExprEmitter::EmitUndefinedBehaviorIntegerDivAndRemCheck(
     llvm::Value *RHSCmp = Builder.CreateICmpNE(Ops.RHS, NegOne);
     llvm::Value *NotOverflow = Builder.CreateOr(LHSCmp, RHSCmp, "or");
     Checks.push_back(
-        std::make_pair(NotOverflow, SanitizerKind::SignedIntegerOverflow));
+        std::make_pair(NotOverflow, SanitizerKind::SO_SignedIntegerOverflow));
   }

   if (Checks.size() > 0)
@@ -3816,8 +3823,8 @@ Value *ScalarExprEmitter::EmitDiv(const BinOpInfo &Ops) {
         Ops.mayHaveFloatDivisionByZero()) {
       llvm::Value *Zero = llvm::Constant::getNullValue(ConvertType(Ops.Ty));
       llvm::Value *NonZero = Builder.CreateFCmpUNE(Ops.RHS, Zero);
-      EmitBinOpCheck(std::make_pair(NonZero, SanitizerKind::FloatDivideByZero),
-                     Ops);
+      EmitBinOpCheck(
+          std::make_pair(NonZero, SanitizerKind::SO_FloatDivideByZero), Ops);
     }
   }
@@ -3921,9 +3928,10 @@ Value *ScalarExprEmitter::EmitOverflowCheckedBinOp(const BinOpInfo &Ops) {
     // runtime. Otherwise, this is a -ftrapv check, so just emit a trap.
     if (!isSigned || CGF.SanOpts.has(SanitizerKind::SignedIntegerOverflow)) {
       llvm::Value *NotOverflow = Builder.CreateNot(overflow);
-      SanitizerMask Kind = isSigned ? SanitizerKind::SignedIntegerOverflow
-                                    : SanitizerKind::UnsignedIntegerOverflow;
-      EmitBinOpCheck(std::make_pair(NotOverflow, Kind), Ops);
+      SanitizerKind::SanitizerOrdinal Ordinal =
+          isSigned ? SanitizerKind::SO_SignedIntegerOverflow
+                   : SanitizerKind::SO_UnsignedIntegerOverflow;
+      EmitBinOpCheck(std::make_pair(NotOverflow, Ordinal), Ops);
    } else
       CGF.EmitTrapCheck(Builder.CreateNot(overflow), OverflowKind);
     return result;
@@ -4543,7 +4551,7 @@ Value *ScalarExprEmitter::EmitShl(const BinOpInfo &Ops) {
   else if ((SanitizeBase || SanitizeExponent) &&
            isa<llvm::IntegerType>(Ops.LHS->getType())) {
     CodeGenFunction::SanitizerScope SanScope(&CGF);
-    SmallVector<std::pair<Value *, SanitizerMask>, 2> Checks;
+    SmallVector<std::pair<Value *, SanitizerKind::SanitizerOrdinal>, 2> Checks;
     bool RHSIsSigned = Ops.rhsHasSignedIntegerRepresentation();
     llvm::Value *WidthMinusOne =
         GetMaximumShiftAmount(Ops.LHS, Ops.RHS, RHSIsSigned);
@@ -4551,7 +4559,7 @@ Value *ScalarExprEmitter::EmitShl(const BinOpInfo &Ops) {

     if (SanitizeExponent) {
       Checks.push_back(
-          std::make_pair(ValidExponent, SanitizerKind::ShiftExponent));
+          std::make_pair(ValidExponent, SanitizerKind::SO_ShiftExponent));
     }

     if (SanitizeBase) {
@@ -4586,8 +4594,8 @@ Value *ScalarExprEmitter::EmitShl(const BinOpInfo &Ops) {
       BaseCheck->addIncoming(Builder.getTrue(), Orig);
       BaseCheck->addIncoming(ValidBase, CheckShiftBase);
       Checks.push_back(std::make_pair(
-          BaseCheck, SanitizeSignedBase ? SanitizerKind::ShiftBase
-                                        : SanitizerKind::UnsignedShiftBase));
+          BaseCheck, SanitizeSignedBase ? SanitizerKind::SO_ShiftBase
+                                        : SanitizerKind::SO_UnsignedShiftBase));
     }

     assert(!Checks.empty());
@@ -4617,7 +4625,7 @@ Value *ScalarExprEmitter::EmitShr(const BinOpInfo &Ops) {
     bool RHSIsSigned = Ops.rhsHasSignedIntegerRepresentation();
     llvm::Value *Valid = Builder.CreateICmpULE(
         Ops.RHS, GetMaximumShiftAmount(Ops.LHS, Ops.RHS, RHSIsSigned));
-    EmitBinOpCheck(std::make_pair(Valid, SanitizerKind::ShiftExponent), Ops);
+    EmitBinOpCheck(std::make_pair(Valid, SanitizerKind::SO_ShiftExponent), Ops);
   }

   if (Ops.Ty->hasUnsignedIntegerRepresentation())
@@ -5850,7 +5858,9 @@ CodeGenFunction::EmitCheckedInBoundsGEP(llvm::Type *ElemTy, Value *Ptr,
   auto *IntPtr = Builder.CreatePtrToInt(Ptr, IntPtrTy);
   auto *ComputedGEP = Builder.CreateAdd(IntPtr, EvaluatedGEP.TotalOffset);

-  llvm::SmallVector<std::pair<llvm::Value *, SanitizerMask>, 2> Checks;
+  llvm::SmallVector<std::pair<llvm::Value *, SanitizerKind::SanitizerOrdinal>,
+                    2>
+      Checks;

   if (PerformNullCheck) {
     // If the base pointer evaluates to a null pointer value,
@@ -5863,7 +5873,7 @@ CodeGenFunction::EmitCheckedInBoundsGEP(llvm::Type *ElemTy, Value *Ptr,
     auto *BaseIsNotNullptr = Builder.CreateIsNotNull(Ptr);
     auto *ResultIsNotNullptr = Builder.CreateIsNotNull(ComputedGEP);
     auto *Valid = Builder.CreateICmpEQ(BaseIsNotNullptr, ResultIsNotNullptr);
-    Checks.emplace_back(Valid, SanitizerKind::PointerOverflow);
+    Checks.emplace_back(Valid, SanitizerKind::SO_PointerOverflow);
   }

   if (PerformOverflowCheck) {
@@ -5899,7 +5909,7 @@ CodeGenFunction::EmitCheckedInBoundsGEP(llvm::Type *ElemTy, Value *Ptr,
       ValidGEP = Builder.CreateICmpULE(ComputedGEP, IntPtr);
     }
     ValidGEP = Builder.CreateAnd(ValidGEP, NoOffsetOverflow);
-    Checks.emplace_back(ValidGEP, SanitizerKind::PointerOverflow);
+    Checks.emplace_back(ValidGEP, SanitizerKind::SO_PointerOverflow);
   }

   assert(!Checks.empty() && "Should have produced some checks.");
diff --git a/clang/lib/CodeGen/CGObjC.cpp b/clang/lib/CodeGen/CGObjC.cpp
index 98d98e2d97e88..27c7c2fa9cba1 100644
--- a/clang/lib/CodeGen/CGObjC.cpp
+++ b/clang/lib/CodeGen/CGObjC.cpp
@@ -1984,7 +1984,7 @@ void CodeGenFunction::EmitObjCForCollectionStmt(const ObjCForCollectionStmt &S){
     llvm::Constant *StaticData[] = {
         EmitCheckSourceLocation(S.getBeginLoc()),
         EmitCheckTypeDescriptor(QualType(InterfaceTy, 0))};
-    EmitCheck({{IsClass, SanitizerKind::ObjCCast}},
+    EmitCheck({{IsClass, SanitizerKind::SO_ObjCCast}},
              SanitizerHandler::InvalidObjCCast,
              ArrayRef(StaticData), CurrentItem);
   }
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index 27ec68bd2a872..d6f3716afabdf 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -1600,7 +1600,7 @@ void CodeGenFunction::GenerateCode(GlobalDecl GD, llvm::Function *Fn,
     if (SanOpts.has(SanitizerKind::Return)) {
       SanitizerScope SanScope(this);
       llvm::Value *IsFalse = Builder.getFalse();
-      EmitCheck(std::make_pair(IsFalse, SanitizerKind::Return),
+      EmitCheck(std::make_pair(IsFalse, SanitizerKind::SO_Return),
                SanitizerHandler::MissingReturn,
                EmitCheckSourceLocation(FD->getLocation()), {});
     } else if (ShouldEmitUnreachable) {
@@ -2484,8 +2484,9 @@ void CodeGenFunction::EmitVariablyModifiedType(QualType type) {
           llvm::Constant *StaticArgs[] = {
               EmitCheckSourceLocation(sizeExpr->getBeginLoc()),
               EmitCheckTypeDescriptor(SEType)};
-          EmitCheck(std::make_pair(CheckCondition, SanitizerKind::VLABound),
-                    SanitizerHandler::VLABoundNotPositive, StaticArgs, size);
+          EmitCheck(
+              std::make_pair(CheckCondition, SanitizerKind::SO_VLABound),
+              SanitizerHandler::VLABoundNotPositive, StaticArgs, size);
         }

         // Always zexting here would be wrong if it weren't
@@ -3139,7 +3140,7 @@ void CodeGenFunction::emitAlignmentAssumptionCheck(
     llvm::Value *DynamicData[] = {EmitCheckValue(Ptr),
                                   EmitCheckValue(Alignment),
                                   EmitCheckValue(OffsetValue)};
-    EmitCheck({std::make_pair(TheCheck, SanitizerKind::Alignment)},
+    EmitCheck({std::make_pair(TheCheck, SanitizerKind::SO_Alignment)},
              SanitizerHandler::AlignmentAssumption, StaticData, DynamicData);
  }
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 457e1477bb2ee..23ddc869277a8 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -5185,14 +5185,17 @@ class CodeGenFunction : public CodeGenTypeCache {
   /// Create a basic block that will either trap or call a handler function in
   /// the UBSan runtime with the provided arguments, and create a conditional
   /// branch to it.
-  void EmitCheck(ArrayRef<std::pair<llvm::Value *, SanitizerMask>> Checked,
-                 SanitizerHandler Check, ArrayRef<llvm::Constant *> StaticArgs,
-                 ArrayRef<llvm::Value *> DynamicArgs);
+  void
+  EmitCheck(ArrayRef<std::pair<llvm::Value *, SanitizerKind::SanitizerOrdinal>>
+                Checked,
+            SanitizerHandler Check, ArrayRef<llvm::Constant *> StaticArgs,
+            ArrayRef<llvm::Value *> DynamicArgs);

   /// Emit a slow path cross-DSO CFI check which calls __cfi_slowpath
   /// if Cond if false.
-  void EmitCfiSlowPathCheck(SanitizerMask Kind, llvm::Value *Cond,
-                            llvm::ConstantInt *TypeId, llvm::Value *Ptr,
+  void EmitCfiSlowPathCheck(SanitizerKind::SanitizerOrdinal Ordinal,
+                            llvm::Value *Cond, llvm::ConstantInt *TypeId,
+                            llvm::Value *Ptr,
                             ArrayRef<llvm::Constant *> StaticArgs);

   /// Emit a reached-unreachable diagnostic if \p Loc is valid and runtime
diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp
index cf9e338236e55..7c463f51f63dc 100644
--- a/clang/lib/CodeGen/ItaniumCXXABI.cpp
+++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp
@@ -779,7 +779,7 @@ CGCallee ItaniumCXXABI::EmitLoadOfMemberFunctionPointer(
         llvm::MDString::get(CGM.getLLVMContext(), "all-vtables"));
     llvm::Value *ValidVtable = Builder.CreateCall(
         CGM.getIntrinsic(llvm::Intrinsic::type_test), {VTable, AllVtables});
-    CGF.EmitCheck(std::make_pair(CheckResult, SanitizerKind::CFIMFCall),
+    CGF.EmitCheck(std::make_pair(CheckResult, SanitizerKind::SO_CFIMFCall),
                  SanitizerHandler::CFICheckFail, StaticData,
                  {VTable, ValidVtable});
  }
@@ -823,7 +823,7 @@ CGCallee ItaniumCXXABI::EmitLoadOfMemberFunctionPointer(
       Bit = Builder.CreateOr(Bit, TypeTest);
   }

-  CGF.EmitCheck(std::make_pair(Bit, SanitizerKind::CFIMFCall),
+  CGF.EmitCheck(std::make_pair(Bit, SanitizerKind::SO_CFIMFCall),
                SanitizerHandler::CFICheckFail, StaticData,
                {NonVirtualFn, llvm::UndefValue::get(CGF.IntPtrTy)});

From 3b0dafff87adf10480376a81f5c554857ea73ec7 Mon Sep 17 00:00:00 2001
From: Jacek Caban
Date: Fri, 10 Jan 2025 21:50:07 +0100
Subject: [PATCH 110/408] [LLD][COFF] Use EC load config for ARM64X relocations
 of load config directory (#121337)

This change ensures the load config in the hybrid image view is handled
correctly. It introduces a new Arm64XRelocVal class to abstract relocation
values, allowing them to be relative to a symbol. This class will also be
useful for managing ARM64X relocation offsets in the future.
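[Editorial note] A short usage sketch, assumed rather than taken from the
patch, of how the new class defers symbol-relative values. Here
`loadConfigSym` stands in for a `Defined *` such as the
`ctx.hybridSymtab->loadConfigSym` used in the Writer.cpp change below:

```
// Both forms are stored the same way; get() folds in the symbol RVA only
// when the relocation entry is actually written.
Arm64XRelocVal plain(0x8664);            // fixed constant value
Arm64XRelocVal relative(loadConfigSym);  // resolves to loadConfigSym RVA + 0
Arm64XRelocVal biased(loadConfigSym, 8); // resolves to symbol RVA + 8
```

Because writeTo() calls value.get(), addresses assigned late in layout are
still encoded correctly.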
---
 lld/COFF/Chunks.cpp               | 10 ++++++---
 lld/COFF/Chunks.h                 | 18 ++++++++++++---
 lld/COFF/Writer.cpp               |  7 +++---
 lld/test/COFF/arm64x-loadconfig.s | 37 +++++++++++++++++++++++++++----
 4 files changed, 58 insertions(+), 14 deletions(-)

diff --git a/lld/COFF/Chunks.cpp b/lld/COFF/Chunks.cpp
index f2d0111a19558..115e3457db697 100644
--- a/lld/COFF/Chunks.cpp
+++ b/lld/COFF/Chunks.cpp
@@ -1166,6 +1166,10 @@ uint32_t ImportThunkChunkARM64EC::extendRanges() {
   return sizeof(arm64Thunk) - sizeof(uint32_t);
 }

+uint64_t Arm64XRelocVal::get() const {
+  return (sym ? sym->getRVA() : 0) + value;
+}
+
 size_t Arm64XDynamicRelocEntry::getSize() const {
   switch (type) {
   case IMAGE_DVRT_ARM64X_FIXUP_TYPE_VALUE:
@@ -1186,13 +1190,13 @@ void Arm64XDynamicRelocEntry::writeTo(uint8_t *buf) const {
   *out |= ((bit_width(size) - 1) << 14); // Encode the size.

   switch (size) {
   case 2:
-    out[1] = value;
+    out[1] = value.get();
     break;
   case 4:
-    *reinterpret_cast<uint32_t *>(out + 1) = value;
+    *reinterpret_cast<uint32_t *>(out + 1) = value.get();
     break;
   case 8:
-    *reinterpret_cast<uint64_t *>(out + 1) = value;
+    *reinterpret_cast<uint64_t *>(out + 1) = value.get();
     break;
   default:
     llvm_unreachable("invalid size");
diff --git a/lld/COFF/Chunks.h b/lld/COFF/Chunks.h
index 0d2b2ac0f15ea..46fd8e21dce65 100644
--- a/lld/COFF/Chunks.h
+++ b/lld/COFF/Chunks.h
@@ -835,18 +835,30 @@ class ECExportThunkChunk : public NonSectionCodeChunk {
   Defined *target;
 };

+// ARM64X relocation value, potentially relative to a symbol.
+class Arm64XRelocVal {
+public:
+  Arm64XRelocVal(uint64_t value = 0) : value(value) {}
+  Arm64XRelocVal(Defined *sym, int32_t offset = 0) : sym(sym), value(offset) {}
+  uint64_t get() const;
+
+private:
+  Defined *sym = nullptr;
+  uint64_t value;
+};
+
 // ARM64X entry for dynamic relocations.
 class Arm64XDynamicRelocEntry {
 public:
   Arm64XDynamicRelocEntry(llvm::COFF::Arm64XFixupType type, uint8_t size,
-                          uint32_t offset, uint64_t value)
+                          uint32_t offset, Arm64XRelocVal value)
       : offset(offset), value(value), type(type), size(size) {}

   size_t getSize() const;
   void writeTo(uint8_t *buf) const;

   uint32_t offset;
-  uint64_t value;
+  Arm64XRelocVal value;

 private:
   llvm::COFF::Arm64XFixupType type;
@@ -862,7 +874,7 @@ class DynamicRelocsChunk : public NonSectionChunk {
   void finalize();

   void add(llvm::COFF::Arm64XFixupType type, uint8_t size, uint32_t offset,
-           uint64_t value) {
+           Arm64XRelocVal value) {
     arm64xRelocs.emplace_back(type, size, offset, value);
   }

diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp
index 138112f109fd4..eb82a9cc01593 100644
--- a/lld/COFF/Writer.cpp
+++ b/lld/COFF/Writer.cpp
@@ -2584,18 +2584,17 @@ void Writer::createDynamicRelocs() {
                          coffHeaderOffset + offsetof(coff_file_header, Machine),
                          AMD64);

-  // Clear the load config directory.
-  // FIXME: Use the hybrid load config value instead.
+  // Set the hybrid load config to the EC load config.
   ctx.dynamicRelocs->add(IMAGE_DVRT_ARM64X_FIXUP_TYPE_VALUE, sizeof(uint32_t),
                          dataDirOffset64 +
                              LOAD_CONFIG_TABLE * sizeof(data_directory) +
                              offsetof(data_directory, RelativeVirtualAddress),
-                         0);
+                         ctx.hybridSymtab->loadConfigSym);
   ctx.dynamicRelocs->add(IMAGE_DVRT_ARM64X_FIXUP_TYPE_VALUE, sizeof(uint32_t),
                          dataDirOffset64 +
                              LOAD_CONFIG_TABLE * sizeof(data_directory) +
                              offsetof(data_directory, Size),
-                         0);
+                         ctx.hybridSymtab->loadConfigSize);
 }

 PartialSection *Writer::createPartialSection(StringRef name,
diff --git a/lld/test/COFF/arm64x-loadconfig.s b/lld/test/COFF/arm64x-loadconfig.s
index 3b53d32a9b549..f413adff2868c 100644
--- a/lld/test/COFF/arm64x-loadconfig.s
+++ b/lld/test/COFF/arm64x-loadconfig.s
@@ -99,20 +99,49 @@
 // LOADCFG-NEXT:     RVA: 0x150
 // LOADCFG-NEXT:     Type: VALUE
 // LOADCFG-NEXT:     Size: 0x4
-// LOADCFG-NEXT:     Value: 0x0
+// LOADCFG-NEXT:     Value: 0x1140
 // LOADCFG-NEXT:   ]
 // LOADCFG-NEXT:   Entry [
 // LOADCFG-NEXT:     RVA: 0x154
 // LOADCFG-NEXT:     Type: VALUE
 // LOADCFG-NEXT:     Size: 0x4
-// LOADCFG-NEXT:     Value: 0x0
+// LOADCFG-NEXT:     Value: 0x140
 // LOADCFG-NEXT:   ]
 // LOADCFG-NEXT: ]
 // LOADCFG-NEXT: ]
 // LOADCFG-NEXT: HybridObject {
-// LOADCFG-NEXT:   Format: COFF-x86-64
-// LOADCFG-NEXT:   Arch: x86_64
+// LOADCFG-NEXT:   Format: COFF-ARM64EC
+// LOADCFG-NEXT:   Arch: aarch64
 // LOADCFG-NEXT:   AddressSize: 64bit
+// LOADCFG-NEXT:   LoadConfig [
+// LOADCFG-NEXT:     Size: 0x140
+// LOADCFG:          CHPEMetadata [
+// LOADCFG-NEXT:       Version: 0x2
+// LOADCFG:            ]
+// LOADCFG-NEXT:   DynamicRelocations [
+// LOADCFG-NEXT:     Version: 0x1
+// LOADCFG-NEXT:     Arm64X [
+// LOADCFG-NEXT:       Entry [
+// LOADCFG-NEXT:         RVA: 0x7C
+// LOADCFG-NEXT:         Type: VALUE
+// LOADCFG-NEXT:         Size: 0x2
+// LOADCFG-NEXT:         Value: 0x8664
+// LOADCFG-NEXT:       ]
+// LOADCFG-NEXT:       Entry [
+// LOADCFG-NEXT:         RVA: 0x150
+// LOADCFG-NEXT:         Type: VALUE
+// LOADCFG-NEXT:         Size: 0x4
+// LOADCFG-NEXT:         Value: 0x1140
+// LOADCFG-NEXT:       ]
+// LOADCFG-NEXT:       Entry [
+// LOADCFG-NEXT:         RVA: 0x154
+// LOADCFG-NEXT:         Type: VALUE
+// LOADCFG-NEXT:         Size: 0x4
+// LOADCFG-NEXT:         Value: 0x140
+// LOADCFG-NEXT:       ]
+// LOADCFG-NEXT:     ]
+// LOADCFG-NEXT:   ]
+// LOADCFG-NEXT: }

// RUN: llvm-readobj --coff-basereloc out-hyb.dll
| FileCheck --check-prefix=BASERELOC %s // BASERELOC: BaseReloc [ From 29e5c1c92782ff7d455878747fb1dc1967ff607f Mon Sep 17 00:00:00 2001 From: Alina Sbirlea Date: Fri, 10 Jan 2025 13:03:09 -0800 Subject: [PATCH 111/408] [Hexagon] Fix test after 9d7df23f4d6537752854d54b0c4c583512b930d0 --- llvm/test/CodeGen/Hexagon/isel/isel-tfrrp.ll | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/test/CodeGen/Hexagon/isel/isel-tfrrp.ll b/llvm/test/CodeGen/Hexagon/isel/isel-tfrrp.ll index b2a9f732bdddc..6351406264e57 100644 --- a/llvm/test/CodeGen/Hexagon/isel/isel-tfrrp.ll +++ b/llvm/test/CodeGen/Hexagon/isel/isel-tfrrp.ll @@ -2,6 +2,7 @@ ; The constant 0 is generated by a transfer immediate instruction. ; RUN: llc -march=hexagon -debug-only=isel 2>&1 < %s - | FileCheck %s +; REQUIRES: asserts ; CHECK: [[R0:%[0-9]+]]:intregs = A2_tfrsi 0 ; CHECK-NEXT: predregs = C2_tfrrp killed [[R0]]:intregs From 3fbc344b49800bb0f70fd5af46c0a47f6d55bbd1 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Fri, 10 Jan 2025 13:05:57 -0800 Subject: [PATCH 112/408] [WebAssembly] Refactor Wasm EH/SjLj error checking (#122466) There were many overlaps between error checking and feature enabling routines for Wasm EH and Wasm SjLj. This tries to factor out those common routines in separate lambda functions. This is not NFC because this ends up disallowing a few new combinations (e.g. `-fwasm-exceptions` and `-emscripten-cxx-exceptions-allowed`), and also deletes `-mllvm` from the error messages to share the same lambda function between options with `-mllvm` and those without it. This adds a few more tests but does not try to cover every single possible disallowed combination. --- clang/lib/Driver/ToolChains/WebAssembly.cpp | 111 +++++++------------- clang/test/Driver/wasm-toolchain.c | 53 ++++++---- 2 files changed, 69 insertions(+), 95 deletions(-) diff --git a/clang/lib/Driver/ToolChains/WebAssembly.cpp b/clang/lib/Driver/ToolChains/WebAssembly.cpp index 44a6894d30fb2..e338b0d2398e0 100644 --- a/clang/lib/Driver/ToolChains/WebAssembly.cpp +++ b/clang/lib/Driver/ToolChains/WebAssembly.cpp @@ -344,44 +344,53 @@ void WebAssembly::addClangTargetOptions(const ArgList &DriverArgs, } } - if (DriverArgs.getLastArg(options::OPT_fwasm_exceptions)) { - // '-fwasm-exceptions' is not compatible with '-mno-exception-handling' + // Bans incompatible options for Wasm EH / SjLj. We don't allow using + // different modes for EH and SjLj. + auto BanIncompatibleOptionsForWasmEHSjLj = [&](StringRef CurOption) { if (DriverArgs.hasFlag(options::OPT_mno_exception_handing, options::OPT_mexception_handing, false)) getDriver().Diag(diag::err_drv_argument_not_allowed_with) - << "-fwasm-exceptions" - << "-mno-exception-handling"; - // '-fwasm-exceptions' is not compatible with - // '-mllvm -enable-emscripten-cxx-exceptions' - for (const Arg *A : DriverArgs.filtered(options::OPT_mllvm)) { - if (StringRef(A->getValue(0)) == "-enable-emscripten-cxx-exceptions") - getDriver().Diag(diag::err_drv_argument_not_allowed_with) - << "-fwasm-exceptions" - << "-mllvm -enable-emscripten-cxx-exceptions"; - } - // '-fwasm-exceptions' implies exception-handling feature - CC1Args.push_back("-target-feature"); - CC1Args.push_back("+exception-handling"); - // Backend needs -wasm-enable-eh to enable Wasm EH - CC1Args.push_back("-mllvm"); - CC1Args.push_back("-wasm-enable-eh"); - - // New Wasm EH spec (adopted in Oct 2023) requires multivalue and - // reference-types. 
+ << CurOption << "-mno-exception-handling"; + // The standardized Wasm EH spec requires multivalue and reference-types. if (DriverArgs.hasFlag(options::OPT_mno_multivalue, - options::OPT_mmultivalue, false)) { + options::OPT_mmultivalue, false)) getDriver().Diag(diag::err_drv_argument_not_allowed_with) - << "-fwasm-exceptions" << "-mno-multivalue"; - } + << CurOption << "-mno-multivalue"; if (DriverArgs.hasFlag(options::OPT_mno_reference_types, - options::OPT_mreference_types, false)) { + options::OPT_mreference_types, false)) getDriver().Diag(diag::err_drv_argument_not_allowed_with) - << "-fwasm-exceptions" << "-mno-reference-types"; + << CurOption << "-mno-reference-types"; + + for (const Arg *A : DriverArgs.filtered(options::OPT_mllvm)) { + for (const auto *Option : + {"-enable-emscripten-cxx-exceptions", "-enable-emscripten-sjlj", + "-emscripten-cxx-exceptions-allowed"}) { + if (StringRef(A->getValue(0)) == Option) + getDriver().Diag(diag::err_drv_argument_not_allowed_with) + << CurOption << Option; + } } + }; + + // Enable necessary features for Wasm EH / SjLj in the backend. + auto EnableFeaturesForWasmEHSjLj = [&]() { + CC1Args.push_back("-target-feature"); + CC1Args.push_back("+exception-handling"); + // The standardized Wasm EH spec requires multivalue and reference-types. CC1Args.push_back("-target-feature"); CC1Args.push_back("+multivalue"); CC1Args.push_back("-target-feature"); CC1Args.push_back("+reference-types"); + // Backend needs '-exception-model=wasm' to use Wasm EH instructions + CC1Args.push_back("-exception-model=wasm"); + }; + + if (DriverArgs.getLastArg(options::OPT_fwasm_exceptions)) { + BanIncompatibleOptionsForWasmEHSjLj("-fwasm-exceptions"); + EnableFeaturesForWasmEHSjLj(); + // Backend needs -wasm-enable-eh to enable Wasm EH + CC1Args.push_back("-mllvm"); + CC1Args.push_back("-wasm-enable-eh"); } for (const Arg *A : DriverArgs.filtered(options::OPT_mllvm)) { @@ -413,53 +422,11 @@ void WebAssembly::addClangTargetOptions(const ArgList &DriverArgs, } } - if (Opt.starts_with("-wasm-enable-sjlj")) { - // '-mllvm -wasm-enable-sjlj' is not compatible with - // '-mno-exception-handling' - if (DriverArgs.hasFlag(options::OPT_mno_exception_handing, - options::OPT_mexception_handing, false)) - getDriver().Diag(diag::err_drv_argument_not_allowed_with) - << "-mllvm -wasm-enable-sjlj" - << "-mno-exception-handling"; - // '-mllvm -wasm-enable-sjlj' is not compatible with - // '-mllvm -enable-emscripten-cxx-exceptions' - // because we don't allow Emscripten EH + Wasm SjLj - for (const Arg *A : DriverArgs.filtered(options::OPT_mllvm)) { - if (StringRef(A->getValue(0)) == "-enable-emscripten-cxx-exceptions") - getDriver().Diag(diag::err_drv_argument_not_allowed_with) - << "-mllvm -wasm-enable-sjlj" - << "-mllvm -enable-emscripten-cxx-exceptions"; + for (const auto *Option : {"-wasm-enable-eh", "-wasm-enable-sjlj"}) { + if (Opt.starts_with(Option)) { + BanIncompatibleOptionsForWasmEHSjLj(Option); + EnableFeaturesForWasmEHSjLj(); } - // '-mllvm -wasm-enable-sjlj' is not compatible with - // '-mllvm -enable-emscripten-sjlj' - for (const Arg *A : DriverArgs.filtered(options::OPT_mllvm)) { - if (StringRef(A->getValue(0)) == "-enable-emscripten-sjlj") - getDriver().Diag(diag::err_drv_argument_not_allowed_with) - << "-mllvm -wasm-enable-sjlj" - << "-mllvm -enable-emscripten-sjlj"; - } - // '-mllvm -wasm-enable-sjlj' implies exception-handling feature - CC1Args.push_back("-target-feature"); - CC1Args.push_back("+exception-handling"); - // Backend needs '-exception-model=wasm' to use 
Wasm EH instructions - CC1Args.push_back("-exception-model=wasm"); - - // New Wasm EH spec (adopted in Oct 2023) requires multivalue and - // reference-types. - if (DriverArgs.hasFlag(options::OPT_mno_multivalue, - options::OPT_mmultivalue, false)) { - getDriver().Diag(diag::err_drv_argument_not_allowed_with) - << "-mllvm -wasm-enable-sjlj" << "-mno-multivalue"; - } - if (DriverArgs.hasFlag(options::OPT_mno_reference_types, - options::OPT_mreference_types, false)) { - getDriver().Diag(diag::err_drv_argument_not_allowed_with) - << "-mllvm -wasm-enable-sjlj" << "-mno-reference-types"; - } - CC1Args.push_back("-target-feature"); - CC1Args.push_back("+multivalue"); - CC1Args.push_back("-target-feature"); - CC1Args.push_back("+reference-types"); } } } diff --git a/clang/test/Driver/wasm-toolchain.c b/clang/test/Driver/wasm-toolchain.c index 7c26c2c13c0ba..84c1b4f6efe66 100644 --- a/clang/test/Driver/wasm-toolchain.c +++ b/clang/test/Driver/wasm-toolchain.c @@ -120,18 +120,12 @@ // RUN: | FileCheck -check-prefix=EMSCRIPTEN_EH_ALLOWED_WO_ENABLE %s // EMSCRIPTEN_EH_ALLOWED_WO_ENABLE: invalid argument '-mllvm -emscripten-cxx-exceptions-allowed' only allowed with '-mllvm -enable-emscripten-cxx-exceptions' -// '-fwasm-exceptions' sets +exception-handling, -multivalue, -reference-types -// and '-mllvm -wasm-enable-eh' +// '-fwasm-exceptions' sets +exception-handling, -multivalue, -reference-types, +// "-exception-model=wasm", and '-mllvm -wasm-enable-eh' // RUN: %clang -### --target=wasm32-unknown-unknown \ // RUN: --sysroot=/foo %s -fwasm-exceptions 2>&1 \ // RUN: | FileCheck -check-prefix=WASM_EXCEPTIONS %s -// WASM_EXCEPTIONS: "-cc1" {{.*}} "-target-feature" "+exception-handling" "-mllvm" "-wasm-enable-eh" "-target-feature" "+multivalue" "-target-feature" "+reference-types" - -// '-fwasm-exceptions' not allowed with '-mno-exception-handling' -// RUN: not %clang -### --target=wasm32-unknown-unknown \ -// RUN: --sysroot=/foo %s -fwasm-exceptions -mno-exception-handling 2>&1 \ -// RUN: | FileCheck -check-prefix=WASM_EXCEPTIONS_NO_EH %s -// WASM_EXCEPTIONS_NO_EH: invalid argument '-fwasm-exceptions' not allowed with '-mno-exception-handling' +// WASM_EXCEPTIONS: "-cc1" {{.*}} "-target-feature" "+exception-handling" "-target-feature" "+multivalue" "-target-feature" "+reference-types" "-exception-model=wasm" "-mllvm" "-wasm-enable-eh" // '-fwasm-exceptions' not allowed with // '-mllvm -enable-emscripten-cxx-exceptions' @@ -139,7 +133,20 @@ // RUN: --sysroot=/foo %s -fwasm-exceptions \ // RUN: -mllvm -enable-emscripten-cxx-exceptions 2>&1 \ // RUN: | FileCheck -check-prefix=WASM_EXCEPTIONS_EMSCRIPTEN_EH %s -// WASM_EXCEPTIONS_EMSCRIPTEN_EH: invalid argument '-fwasm-exceptions' not allowed with '-mllvm -enable-emscripten-cxx-exceptions' +// WASM_EXCEPTIONS_EMSCRIPTEN_EH: invalid argument '-fwasm-exceptions' not allowed with '-enable-emscripten-cxx-exceptions' + +// '-fwasm-exceptions' not allowed with '-mllvm -enable-emscripten-sjlj' +// RUN: not %clang -### --target=wasm32-unknown-unknown \ +// RUN: --sysroot=/foo %s -fwasm-exceptions \ +// RUN: -mllvm -enable-emscripten-sjlj 2>&1 \ +// RUN: | FileCheck -check-prefix=WASM_EXCEPTIONS_EMSCRIPTEN_SJLJ %s +// WASM_EXCEPTIONS_EMSCRIPTEN_SJLJ: invalid argument '-fwasm-exceptions' not allowed with '-enable-emscripten-sjlj' + +// '-fwasm-exceptions' not allowed with '-mno-exception-handling' +// RUN: not %clang -### --target=wasm32-unknown-unknown \ +// RUN: --sysroot=/foo %s -fwasm-exceptions -mno-exception-handling 2>&1 \ +// RUN: | FileCheck 
-check-prefix=WASM_EXCEPTIONS_NO_EH %s +// WASM_EXCEPTIONS_NO_EH: invalid argument '-fwasm-exceptions' not allowed with '-mno-exception-handling' // '-fwasm-exceptions' not allowed with '-mno-multivalue' // RUN: not %clang -### --target=wasm32-unknown-unknown \ @@ -154,18 +161,11 @@ // WASM_EXCEPTIONS_NO_REFERENCE_TYPES: invalid argument '-fwasm-exceptions' not allowed with '-mno-reference-types' // '-mllvm -wasm-enable-sjlj' sets +exception-handling, +multivalue, -// +reference-types and '-exception-model=wasm' +// +reference-types and '-exception-model=wasm' // RUN: %clang -### --target=wasm32-unknown-unknown \ // RUN: --sysroot=/foo %s -mllvm -wasm-enable-sjlj 2>&1 \ // RUN: | FileCheck -check-prefix=WASM_SJLJ %s -// WASM_SJLJ: "-cc1" {{.*}} "-target-feature" "+exception-handling" "-exception-model=wasm" "-target-feature" "+multivalue" "-target-feature" "+reference-types" - -// '-mllvm -wasm-enable-sjlj' not allowed with '-mno-exception-handling' -// RUN: not %clang -### --target=wasm32-unknown-unknown \ -// RUN: --sysroot=/foo %s -mllvm -wasm-enable-sjlj -mno-exception-handling \ -// RUN: 2>&1 \ -// RUN: | FileCheck -check-prefix=WASM_SJLJ_NO_EH %s -// WASM_SJLJ_NO_EH: invalid argument '-mllvm -wasm-enable-sjlj' not allowed with '-mno-exception-handling' +// WASM_SJLJ: "-cc1" {{.*}} "-target-feature" "+exception-handling" "-target-feature" "+multivalue" "-target-feature" "+reference-types" "-exception-model=wasm" // '-mllvm -wasm-enable-sjlj' not allowed with // '-mllvm -enable-emscripten-cxx-exceptions' @@ -173,27 +173,34 @@ // RUN: --sysroot=/foo %s -mllvm -wasm-enable-sjlj \ // RUN: -mllvm -enable-emscripten-cxx-exceptions 2>&1 \ // RUN: | FileCheck -check-prefix=WASM_SJLJ_EMSCRIPTEN_EH %s -// WASM_SJLJ_EMSCRIPTEN_EH: invalid argument '-mllvm -wasm-enable-sjlj' not allowed with '-mllvm -enable-emscripten-cxx-exceptions' +// WASM_SJLJ_EMSCRIPTEN_EH: invalid argument '-wasm-enable-sjlj' not allowed with '-enable-emscripten-cxx-exceptions' // '-mllvm -wasm-enable-sjlj' not allowed with '-mllvm -enable-emscripten-sjlj' // RUN: not %clang -### --target=wasm32-unknown-unknown \ // RUN: --sysroot=/foo %s -mllvm -wasm-enable-sjlj \ // RUN: -mllvm -enable-emscripten-sjlj 2>&1 \ // RUN: | FileCheck -check-prefix=WASM_SJLJ_EMSCRIPTEN_SJLJ %s -// WASM_SJLJ_EMSCRIPTEN_SJLJ: invalid argument '-mllvm -wasm-enable-sjlj' not allowed with '-mllvm -enable-emscripten-sjlj' +// WASM_SJLJ_EMSCRIPTEN_SJLJ: invalid argument '-wasm-enable-sjlj' not allowed with '-enable-emscripten-sjlj' + +// '-mllvm -wasm-enable-sjlj' not allowed with '-mno-exception-handling' +// RUN: not %clang -### --target=wasm32-unknown-unknown \ +// RUN: --sysroot=/foo %s -mllvm -wasm-enable-sjlj -mno-exception-handling \ +// RUN: 2>&1 \ +// RUN: | FileCheck -check-prefix=WASM_SJLJ_NO_EH %s +// WASM_SJLJ_NO_EH: invalid argument '-wasm-enable-sjlj' not allowed with '-mno-exception-handling' // '-mllvm -wasm-enable-sjlj' not allowed with '-mno-multivalue' // RUN: not %clang -### --target=wasm32-unknown-unknown \ // RUN: --sysroot=/foo %s -mllvm -wasm-enable-sjlj -mno-multivalue 2>&1 \ // RUN: | FileCheck -check-prefix=WASM_SJLJ_NO_MULTIVALUE %s -// WASM_SJLJ_NO_MULTIVALUE: invalid argument '-mllvm -wasm-enable-sjlj' not allowed with '-mno-multivalue' +// WASM_SJLJ_NO_MULTIVALUE: invalid argument '-wasm-enable-sjlj' not allowed with '-mno-multivalue' // '-mllvm -wasm-enable-sjlj' not allowed with '-mno-reference-types' // RUN: not %clang -### --target=wasm32-unknown-unknown \ // RUN: --sysroot=/foo %s -mllvm -wasm-enable-sjlj \ // RUN: 
-mno-reference-types 2>&1 \ // RUN: | FileCheck -check-prefix=WASM_SJLJ_NO_REFERENCE_TYPES %s -// WASM_SJLJ_NO_REFERENCE_TYPES: invalid argument '-mllvm -wasm-enable-sjlj' not allowed with '-mno-reference-types' +// WASM_SJLJ_NO_REFERENCE_TYPES: invalid argument '-wasm-enable-sjlj' not allowed with '-mno-reference-types' // RUN: %clang -### %s -fsanitize=address --target=wasm32-unknown-emscripten 2>&1 | FileCheck -check-prefix=CHECK-ASAN-EMSCRIPTEN %s // CHECK-ASAN-EMSCRIPTEN: "-fsanitize=address" From 91892e8fa3830ed6590eda0bc62e2a2ea8df8872 Mon Sep 17 00:00:00 2001 From: Ellis Hoag Date: Fri, 10 Jan 2025 13:19:33 -0800 Subject: [PATCH 113/408] [InstrProf] Add frontend temporal profiling flag (#122385) As discussed in https://github.com/llvm/llvm-project/pull/121514 add the frontend flag `-ftemporal-profile` to enable temporal profiling (https://discourse.llvm.org/t/rfc-temporal-profiling-extension-for-irpgo/68068) as a replacement for `-forder-file-instrumentation` (https://discourse.llvm.org/t/deprecate-forder-file-instrumentation-in-favor-of-temporal-profiling/83903) --- clang/docs/UsersManual.rst | 32 +++++++++++++++++++ clang/include/clang/Driver/Options.td | 5 ++- clang/lib/Driver/ToolChains/Clang.cpp | 10 +++++- clang/test/Driver/clang_f_opts.c | 2 +- .../test/Driver/fprofile-generate-temporal.c | 7 ++++ 5 files changed, 53 insertions(+), 3 deletions(-) create mode 100644 clang/test/Driver/fprofile-generate-temporal.c diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst index 8af9f5be644a0..f58fca465c549 100644 --- a/clang/docs/UsersManual.rst +++ b/clang/docs/UsersManual.rst @@ -3035,6 +3035,38 @@ indexed format, regardeless whether it is produced by frontend or the IR pass. overhead. ``prefer-atomic`` will be transformed to ``atomic`` when supported by the target, or ``single`` otherwise. +.. option:: -ftemporal-profile + + Enables the temporal profiling extension for IRPGO to improve startup time by + reducing ``.text`` section page faults. To do this, we instrument function + timestamps to measure when each function is called for the first time and use + this data to generate a function order to improve startup. + + The profile is generated as normal. + + .. code-block:: console + + $ clang++ -O2 -fprofile-generate -ftemporal-profile code.cc -o code + $ ./code + $ llvm-profdata merge -o code.profdata yyy/zzz + + Using the resulting profile, we can generate a function order to pass to the + linker via `--symbol-ordering-file` for ELF or `-order_file` for Mach-O. + + .. code-block:: console + + $ llvm-profdata order code.profdata -o code.orderfile + $ clang++ -O2 -Wl,--symbol-ordering-file=code.orderfile code.cc -o code + + Or the profile can be passed to LLD directly. + + .. 
code-block:: console
+
+    $ clang++ -O2 -fuse-ld=lld -Wl,--irpgo-profile=code.profdata,--bp-startup-sort=function code.cc -o code
+
+  For more information, please read the RFC:
+  https://discourse.llvm.org/t/rfc-temporal-profiling-extension-for-irpgo/68068
+
 Fine Tuning Profile Collection
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 41a7e8c372806..80360216c9503 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1796,6 +1796,9 @@ def fprofile_generate_cold_function_coverage : Flag<["-"], "fprofile-generate-co
 def fprofile_generate_cold_function_coverage_EQ : Joined<["-"], "fprofile-generate-cold-function-coverage=">,
     Group<f_Group>, Visibility<[ClangOption, CLOption]>, MetaVarName<"<directory>">,
    HelpText<"Generate instrumented code to collect coverage info for cold functions into <directory>/default.profraw (overridden by LLVM_PROFILE_FILE env var)">;
+def ftemporal_profile : Flag<["-"], "ftemporal-profile">,
+    Group<f_Group>, Visibility<[ClangOption, CLOption]>,
+    HelpText<"Generate instrumented code to collect temporal information">;
 def fprofile_instr_generate : Flag<["-"], "fprofile-instr-generate">,
     Group<f_Group>, Visibility<[ClangOption, CLOption]>,
     HelpText<"Generate instrumented code to collect execution counts into default.profraw file (overridden by '=' form of option or LLVM_PROFILE_FILE env var)">;
@@ -1891,7 +1894,7 @@ defm pseudo_probe_for_profiling : BoolFOption<"pseudo-probe-for-profiling",
          " pseudo probes for sample profiling">>;
 def forder_file_instrumentation : Flag<["-"], "forder-file-instrumentation">,
     Group<f_Group>, Visibility<[ClangOption, CC1Option, CLOption]>,
-    HelpText<"Generate instrumented code to collect order file into default.profraw file (overridden by '=' form of option or LLVM_PROFILE_FILE env var). Deprecated, please use temporal profiling.">;
+    HelpText<"Generate instrumented code to collect order file into default.profraw file (overridden by '=' form of option or LLVM_PROFILE_FILE env var). Deprecated, please use -ftemporal-profile">;
 def fprofile_list_EQ : Joined<["-"], "fprofile-list=">,
     Group<f_Group>, Visibility<[ClangOption, CC1Option, CLOption]>,
     HelpText<"Filename defining the list of functions/files to instrument. "
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index f81691f8aeaf9..33f08cf28feca 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -662,6 +662,14 @@ static void addPGOAndCoverageFlags(const ToolChain &TC, Compilation &C,
       CmdArgs.push_back("--pgo-function-entry-coverage");
   }

+  if (auto *A = Args.getLastArg(options::OPT_ftemporal_profile)) {
+    if (!PGOGenerateArg && !CSPGOGenerateArg)
+      D.Diag(clang::diag::err_drv_argument_only_allowed_with)
+          << A->getSpelling() << "-fprofile-generate or -fcs-profile-generate";
+    CmdArgs.push_back("-mllvm");
+    CmdArgs.push_back("--pgo-temporal-instrumentation");
+  }
+
   Arg *PGOGenArg = nullptr;
   if (PGOGenerateArg) {
     assert(!CSPGOGenerateArg);
@@ -8054,7 +8062,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
           Args.getLastArg(options::OPT_forder_file_instrumentation)) {
     D.Diag(diag::warn_drv_deprecated_arg)
         << A->getAsString(Args) << /*hasReplacement=*/true
-        << "-mllvm -pgo-temporal-instrumentation";
+        << "-ftemporal-profile";
     CmdArgs.push_back("-forder-file-instrumentation");
     // Enable order file instrumentation when ThinLTO is not on. When ThinLTO is
When ThinLTO is // on, we need to pass these flags as linker flags and that will be handled diff --git a/clang/test/Driver/clang_f_opts.c b/clang/test/Driver/clang_f_opts.c index 2b72068eae1ee..38f25898c9556 100644 --- a/clang/test/Driver/clang_f_opts.c +++ b/clang/test/Driver/clang_f_opts.c @@ -424,7 +424,7 @@ // CHECK-WARNING-DAG: optimization flag '-fno-devirtualize-speculatively' is not supported // CHECK-WARNING-DAG: the flag '-fslp-vectorize-aggressive' has been deprecated and will be ignored // CHECK-WARNING-DAG: the flag '-fno-slp-vectorize-aggressive' has been deprecated and will be ignored -// CHECK-WARNING-DAG: argument '-forder-file-instrumentation' is deprecated, use '-mllvm -pgo-temporal-instrumentation' instead +// CHECK-WARNING-DAG: argument '-forder-file-instrumentation' is deprecated, use '-ftemporal-profile' instead // Test that we mute the warning on these // RUN: %clang -### -finline-limit=1000 -Wno-invalid-command-line-argument \ diff --git a/clang/test/Driver/fprofile-generate-temporal.c b/clang/test/Driver/fprofile-generate-temporal.c new file mode 100644 index 0000000000000..8661dc6f8f262 --- /dev/null +++ b/clang/test/Driver/fprofile-generate-temporal.c @@ -0,0 +1,7 @@ +// RUN: %clang -### -c -fprofile-generate -ftemporal-profile %s 2>&1 | FileCheck %s +// RUN: %clang -### -c -fcs-profile-generate -ftemporal-profile %s 2>&1 | FileCheck %s +// RUN: not %clang -### -c -ftemporal-profile %s 2>&1 | FileCheck %s --check-prefix=ERR + +// CHECK: "-mllvm" "--pgo-temporal-instrumentation" + +// ERR: error: invalid argument '-ftemporal-profile' only allowed with '-fprofile-generate or -fcs-profile-generate' From 9248428db78ebaa0af33c7b45285caf4ecb93174 Mon Sep 17 00:00:00 2001 From: vporpo Date: Fri, 10 Jan 2025 13:32:33 -0800 Subject: [PATCH 114/408] [SandboxVec][DAG][NFC] Refactor setNextNode() and setPrevNode() (#122363) This patch updates DAG's `setNextNode()` and `setPrevNode()` to update both nodes of the link. --- .../SandboxVectorizer/DependencyGraph.h | 15 ++++++++++++--- .../SandboxVectorizer/DependencyGraph.cpp | 17 +++-------------- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h index f423e1ee456cd..00b53b42e2e57 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h @@ -216,9 +216,18 @@ class MemDGNode final : public DGNode { /// Memory predecessors. DenseSet MemPreds; friend class PredIterator; // For MemPreds. - - void setNextNode(MemDGNode *N) { NextMemN = N; } - void setPrevNode(MemDGNode *N) { PrevMemN = N; } + /// Creates both edges: this<->N. + void setNextNode(MemDGNode *N) { + NextMemN = N; + if (NextMemN != nullptr) + NextMemN->PrevMemN = this; + } + /// Creates both edges: N<->this. + void setPrevNode(MemDGNode *N) { + PrevMemN = N; + if (PrevMemN != nullptr) + PrevMemN->NextMemN = this; + } friend class DependencyGraph; // For setNextNode(), setPrevNode(). 
   void detachFromChain() {
     if (PrevMemN != nullptr)
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp
index ba62c45a4e704..c1a046a157d3b 100644
--- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp
+++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp
@@ -283,8 +283,6 @@ void DependencyGraph::createNewNodes(const Interval<Instruction> &NewInterval) {
     // Build the Mem node chain.
     if (auto *MemN = dyn_cast<MemDGNode>(N)) {
       MemN->setPrevNode(LastMemN);
-      if (LastMemN != nullptr)
-        LastMemN->setNextNode(MemN);
       LastMemN = MemN;
     }
   }
@@ -302,7 +300,6 @@ void DependencyGraph::createNewNodes(const Interval<Instruction> &NewInterval) {
          "Wrong order!");
   if (LinkTopN != nullptr && LinkBotN != nullptr) {
     LinkTopN->setNextNode(LinkBotN);
-    LinkBotN->setPrevNode(LinkTopN);
   }
 #ifndef NDEBUG
   // TODO: Remove this once we've done enough testing.
@@ -394,22 +391,14 @@ void DependencyGraph::notifyMoveInstr(Instruction *I, const BBIterator &To) {
   if (To != BB->end()) {
     DGNode *ToN = getNodeOrNull(&*To);
     if (ToN != nullptr) {
-      MemDGNode *PrevMemN = getMemDGNodeBefore(ToN, /*IncludingN=*/false);
-      MemDGNode *NextMemN = getMemDGNodeAfter(ToN, /*IncludingN=*/true);
-      MemN->PrevMemN = PrevMemN;
-      if (PrevMemN != nullptr)
-        PrevMemN->NextMemN = MemN;
-      MemN->NextMemN = NextMemN;
-      if (NextMemN != nullptr)
-        NextMemN->PrevMemN = MemN;
+      MemN->setPrevNode(getMemDGNodeBefore(ToN, /*IncludingN=*/false));
+      MemN->setNextNode(getMemDGNodeAfter(ToN, /*IncludingN=*/true));
     }
   } else {
     // MemN becomes the last instruction in the BB.
     auto *TermN = getNodeOrNull(BB->getTerminator());
     if (TermN != nullptr) {
-      MemDGNode *PrevMemN = getMemDGNodeBefore(TermN, /*IncludingN=*/false);
-      PrevMemN->NextMemN = MemN;
-      MemN->PrevMemN = PrevMemN;
+      MemN->setPrevNode(getMemDGNodeBefore(TermN, /*IncludingN=*/false));
    } else {
      // The terminator is outside the DAG interval so do nothing.
    }

From 19557a4c8fab0dbfe9d9c53b99b7960ef211684e Mon Sep 17 00:00:00 2001
From: Damien L-G
Date: Fri, 10 Jan 2025 16:36:19 -0500
Subject: [PATCH 115/408] [libc++] Fix bug in tests for std::atomic_ref
 increment and decrement operators (#122271)

The implementation is fine and has the proper increment/decrement operators
defined, but the tests were wrong:

- a typo (`T` instead of `std::atomic_ref<T>`) when ensuring that
  increment/decrement operators are not defined in the primary template and
  specialization for floating point types, and
- the specialization for pointer types was miscategorized.
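[Editorial note] A minimal, self-contained sketch of the point above. It is
not taken from the patch and uses an assumed `has_increment` concept rather
than the test's actual detection helpers, but it shows why the assertion must
probe `std::atomic_ref<T>` and not the raw `T`:

```
#include <atomic>

// Assumed helper: does U support pre- and post-increment?
template <class U>
concept has_increment = requires(U u) { ++u; u++; };

static_assert(has_increment<std::atomic_ref<int>>);    // integral: ++ exists
static_assert(has_increment<std::atomic_ref<int *>>);  // pointer: ++ exists
static_assert(!has_increment<std::atomic_ref<float>>); // FP specialization: none
static_assert(!has_increment<std::atomic_ref<bool>>);  // primary template: none

// Probing the raw type instead (the typo being fixed) says nothing about
// std::atomic_ref, so the old assertions passed vacuously.
```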
---
 .../atomics.ref/increment_decrement.pass.cpp | 101 ++++++++++++------
 1 file changed, 68 insertions(+), 33 deletions(-)

diff --git a/libcxx/test/std/atomics/atomics.ref/increment_decrement.pass.cpp b/libcxx/test/std/atomics/atomics.ref/increment_decrement.pass.cpp
index f26a0bdf3663a..15bf0ecda5363 100644
--- a/libcxx/test/std/atomics/atomics.ref/increment_decrement.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.ref/increment_decrement.pass.cpp
@@ -42,43 +42,78 @@ constexpr bool does_not_have_increment_nor_decrement_operators() {

 template <class T>
 struct TestDoesNotHaveIncrementDecrement {
-  void operator()() const { static_assert(does_not_have_increment_nor_decrement_operators<T>()); }
+  void operator()() const { static_assert(does_not_have_increment_nor_decrement_operators<std::atomic_ref<T>>()); }
 };

 template <class T>
 struct TestIncrementDecrement {
   void operator()() const {
-    static_assert(std::is_integral_v<T>);
-
-    T x(T(1));
-    std::atomic_ref<T> const a(x);
-
-    {
-      std::same_as<T> decltype(auto) y = ++a;
-      assert(y == T(2));
-      assert(x == T(2));
-      ASSERT_NOEXCEPT(++a);
-    }
-
-    {
-      std::same_as<T> decltype(auto) y = --a;
-      assert(y == T(1));
-      assert(x == T(1));
-      ASSERT_NOEXCEPT(--a);
-    }
-
-    {
-      std::same_as<T> decltype(auto) y = a++;
-      assert(y == T(1));
-      assert(x == T(2));
-      ASSERT_NOEXCEPT(a++);
-    }
-
-    {
-      std::same_as<T> decltype(auto) y = a--;
-      assert(y == T(2));
-      assert(x == T(1));
-      ASSERT_NOEXCEPT(a--);
+    if constexpr (std::is_integral_v<T>) {
+      T x(T(1));
+      std::atomic_ref<T> const a(x);
+
+      {
+        std::same_as<T> decltype(auto) y = ++a;
+        assert(y == T(2));
+        assert(x == T(2));
+        ASSERT_NOEXCEPT(++a);
+      }
+
+      {
+        std::same_as<T> decltype(auto) y = --a;
+        assert(y == T(1));
+        assert(x == T(1));
+        ASSERT_NOEXCEPT(--a);
+      }
+
+      {
+        std::same_as<T> decltype(auto) y = a++;
+        assert(y == T(1));
+        assert(x == T(2));
+        ASSERT_NOEXCEPT(a++);
+      }
+
+      {
+        std::same_as<T> decltype(auto) y = a--;
+        assert(y == T(2));
+        assert(x == T(1));
+        ASSERT_NOEXCEPT(a--);
+      }
+    } else if constexpr (std::is_pointer_v<T>) {
+      using U = std::remove_pointer_t<T>;
+      U t[9] = {};
+      T p{&t[1]};
+      std::atomic_ref<T> const a(p);
+
+      {
+        std::same_as<T> decltype(auto) y = ++a;
+        assert(y == &t[2]);
+        assert(p == &t[2]);
+        ASSERT_NOEXCEPT(++a);
+      }
+
+      {
+        std::same_as<T> decltype(auto) y = --a;
+        assert(y == &t[1]);
+        assert(p == &t[1]);
+        ASSERT_NOEXCEPT(--a);
+      }
+
+      {
+        std::same_as<T> decltype(auto) y = a++;
+        assert(y == &t[1]);
+        assert(p == &t[2]);
+        ASSERT_NOEXCEPT(a++);
+      }
+
+      {
+        std::same_as<T> decltype(auto) y = a--;
+        assert(y == &t[2]);
+        assert(p == &t[1]);
+        ASSERT_NOEXCEPT(a--);
+      }
+    } else {
+      static_assert(std::is_void_v<T>);
     }
   }
 };
--- clang/lib/Basic/Targets/OSTargets.h | 1 - 1 file changed, 1 deletion(-) diff --git a/clang/lib/Basic/Targets/OSTargets.h b/clang/lib/Basic/Targets/OSTargets.h index 53dd23c312963..b8d9a5725d012 100644 --- a/clang/lib/Basic/Targets/OSTargets.h +++ b/clang/lib/Basic/Targets/OSTargets.h @@ -447,7 +447,6 @@ class LLVM_LIBRARY_VISIBILITY OpenBSDTargetInfo : public OSTargetInfo { case llvm::Triple::sparcv9: this->MCountName = "_mcount"; break; - case llvm::Triple::riscv32: case llvm::Triple::riscv64: break; } From 0d9c027ad7fa36a607386e24d4928c9046f6ff56 Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Fri, 10 Jan 2025 11:32:09 -0600 Subject: [PATCH 117/408] [InstCombine] Make `takeLog2` visible in all of InstCombine; NFC Also add `tryGetLog2` helper that encapsulates the common pattern: ``` if (takeLog2(..., /*DoFold=*/false)) { Value * Log2 = takeLog2(..., /*DoFold=*/true); ... } ``` Closes #122498 --- .../InstCombine/InstCombineInternal.h | 12 +++++ .../InstCombine/InstCombineMulDivRem.cpp | 53 ++++++------------- 2 files changed, 29 insertions(+), 36 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index f6992119280c1..83e1da98deeda 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -785,6 +785,18 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final void handlePotentiallyDeadBlocks(SmallVectorImpl &Worklist); void handlePotentiallyDeadSuccessors(BasicBlock *BB, BasicBlock *LiveSucc); void freelyInvertAllUsersOf(Value *V, Value *IgnoredUser = nullptr); + + /// Take the exact integer log2 of the value. If DoFold is true, create the + /// actual instructions, otherwise return a non-null dummy value. Return + /// nullptr on failure. Note, if DoFold is true the caller must ensure that + /// takeLog2 will succeed, otherwise it may create stray instructions. + Value *takeLog2(Value *Op, unsigned Depth, bool AssumeNonZero, bool DoFold); + + Value *tryGetLog2(Value *Op, bool AssumeNonZero) { + if (takeLog2(Op, /*Depth=*/0, AssumeNonZero, /*DoFold=*/false)) + return takeLog2(Op, /*Depth=*/0, AssumeNonZero, /*DoFold=*/true); + return nullptr; + } }; class Negator final { diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index 0c34cf01bdf1a..1c5070a1b867c 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -185,9 +185,6 @@ static Value *foldMulShl1(BinaryOperator &Mul, bool CommuteOperands, return nullptr; } -static Value *takeLog2(IRBuilderBase &Builder, Value *Op, unsigned Depth, - bool AssumeNonZero, bool DoFold); - Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); if (Value *V = @@ -531,19 +528,13 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) { // (shl Op1, Log2(Op0)) // if Log2(Op1) folds away -> // (shl Op0, Log2(Op1)) - if (takeLog2(Builder, Op0, /*Depth*/ 0, /*AssumeNonZero*/ false, - /*DoFold*/ false)) { - Value *Res = takeLog2(Builder, Op0, /*Depth*/ 0, /*AssumeNonZero*/ false, - /*DoFold*/ true); + if (Value *Res = tryGetLog2(Op0, /*AssumeNonZero=*/false)) { BinaryOperator *Shl = BinaryOperator::CreateShl(Op1, Res); // We can only propegate nuw flag. 
Shl->setHasNoUnsignedWrap(HasNUW); return Shl; } - if (takeLog2(Builder, Op1, /*Depth*/ 0, /*AssumeNonZero*/ false, - /*DoFold*/ false)) { - Value *Res = takeLog2(Builder, Op1, /*Depth*/ 0, /*AssumeNonZero*/ false, - /*DoFold*/ true); + if (Value *Res = tryGetLog2(Op1, /*AssumeNonZero=*/false)) { BinaryOperator *Shl = BinaryOperator::CreateShl(Op0, Res); // We can only propegate nuw flag. Shl->setHasNoUnsignedWrap(HasNUW); @@ -1407,13 +1398,8 @@ Instruction *InstCombinerImpl::commonIDivTransforms(BinaryOperator &I) { return nullptr; } -static const unsigned MaxDepth = 6; - -// Take the exact integer log2 of the value. If DoFold is true, create the -// actual instructions, otherwise return a non-null dummy value. Return nullptr -// on failure. -static Value *takeLog2(IRBuilderBase &Builder, Value *Op, unsigned Depth, - bool AssumeNonZero, bool DoFold) { +Value *InstCombinerImpl::takeLog2(Value *Op, unsigned Depth, bool AssumeNonZero, + bool DoFold) { auto IfFold = [DoFold](function_ref Fn) { if (!DoFold) return reinterpret_cast(-1); @@ -1432,14 +1418,14 @@ static Value *takeLog2(IRBuilderBase &Builder, Value *Op, unsigned Depth, }); // The remaining tests are all recursive, so bail out if we hit the limit. - if (Depth++ == MaxDepth) + if (Depth++ == MaxAnalysisRecursionDepth) return nullptr; // log2(zext X) -> zext log2(X) // FIXME: Require one use? Value *X, *Y; if (match(Op, m_ZExt(m_Value(X)))) - if (Value *LogX = takeLog2(Builder, X, Depth, AssumeNonZero, DoFold)) + if (Value *LogX = takeLog2(X, Depth, AssumeNonZero, DoFold)) return IfFold([&]() { return Builder.CreateZExt(LogX, Op->getType()); }); // log2(trunc x) -> trunc log2(X) @@ -1447,7 +1433,7 @@ static Value *takeLog2(IRBuilderBase &Builder, Value *Op, unsigned Depth, if (match(Op, m_Trunc(m_Value(X)))) { auto *TI = cast(Op); if (AssumeNonZero || TI->hasNoUnsignedWrap()) - if (Value *LogX = takeLog2(Builder, X, Depth, AssumeNonZero, DoFold)) + if (Value *LogX = takeLog2(X, Depth, AssumeNonZero, DoFold)) return IfFold([&]() { return Builder.CreateTrunc(LogX, Op->getType(), "", /*IsNUW=*/TI->hasNoUnsignedWrap()); @@ -1460,7 +1446,7 @@ static Value *takeLog2(IRBuilderBase &Builder, Value *Op, unsigned Depth, auto *BO = cast(Op); // nuw will be set if the `shl` is trivially non-zero. if (AssumeNonZero || BO->hasNoUnsignedWrap() || BO->hasNoSignedWrap()) - if (Value *LogX = takeLog2(Builder, X, Depth, AssumeNonZero, DoFold)) + if (Value *LogX = takeLog2(X, Depth, AssumeNonZero, DoFold)) return IfFold([&]() { return Builder.CreateAdd(LogX, Y); }); } @@ -1469,26 +1455,25 @@ static Value *takeLog2(IRBuilderBase &Builder, Value *Op, unsigned Depth, if (match(Op, m_LShr(m_Value(X), m_Value(Y)))) { auto *PEO = cast(Op); if (AssumeNonZero || PEO->isExact()) - if (Value *LogX = takeLog2(Builder, X, Depth, AssumeNonZero, DoFold)) + if (Value *LogX = takeLog2(X, Depth, AssumeNonZero, DoFold)) return IfFold([&]() { return Builder.CreateSub(LogX, Y); }); } // log2(X & Y) -> either log2(X) or log2(Y) // This requires `AssumeNonZero` as `X & Y` may be zero when X != Y. if (AssumeNonZero && match(Op, m_And(m_Value(X), m_Value(Y)))) { - if (Value *LogX = takeLog2(Builder, X, Depth, AssumeNonZero, DoFold)) + if (Value *LogX = takeLog2(X, Depth, AssumeNonZero, DoFold)) return IfFold([&]() { return LogX; }); - if (Value *LogY = takeLog2(Builder, Y, Depth, AssumeNonZero, DoFold)) + if (Value *LogY = takeLog2(Y, Depth, AssumeNonZero, DoFold)) return IfFold([&]() { return LogY; }); } // log2(Cond ? X : Y) -> Cond ? 
log2(X) : log2(Y) // FIXME: Require one use? if (SelectInst *SI = dyn_cast(Op)) - if (Value *LogX = takeLog2(Builder, SI->getOperand(1), Depth, - AssumeNonZero, DoFold)) - if (Value *LogY = takeLog2(Builder, SI->getOperand(2), Depth, - AssumeNonZero, DoFold)) + if (Value *LogX = takeLog2(SI->getOperand(1), Depth, AssumeNonZero, DoFold)) + if (Value *LogY = + takeLog2(SI->getOperand(2), Depth, AssumeNonZero, DoFold)) return IfFold([&]() { return Builder.CreateSelect(SI->getOperand(0), LogX, LogY); }); @@ -1499,9 +1484,9 @@ static Value *takeLog2(IRBuilderBase &Builder, Value *Op, unsigned Depth, if (MinMax && MinMax->hasOneUse() && !MinMax->isSigned()) { // Use AssumeNonZero as false here. Otherwise we can hit case where // log2(umax(X, Y)) != umax(log2(X), log2(Y)) (because overflow). - if (Value *LogX = takeLog2(Builder, MinMax->getLHS(), Depth, + if (Value *LogX = takeLog2(MinMax->getLHS(), Depth, /*AssumeNonZero*/ false, DoFold)) - if (Value *LogY = takeLog2(Builder, MinMax->getRHS(), Depth, + if (Value *LogY = takeLog2(MinMax->getRHS(), Depth, /*AssumeNonZero*/ false, DoFold)) return IfFold([&]() { return Builder.CreateBinaryIntrinsic(MinMax->getIntrinsicID(), LogX, @@ -1614,13 +1599,9 @@ Instruction *InstCombinerImpl::visitUDiv(BinaryOperator &I) { } // Op1 udiv Op2 -> Op1 lshr log2(Op2), if log2() folds away. - if (takeLog2(Builder, Op1, /*Depth*/ 0, /*AssumeNonZero*/ true, - /*DoFold*/ false)) { - Value *Res = takeLog2(Builder, Op1, /*Depth*/ 0, - /*AssumeNonZero*/ true, /*DoFold*/ true); + if (Value *Res = tryGetLog2(Op1, /*AssumeNonZero=*/true)) return replaceInstUsesWith( I, Builder.CreateLShr(Op0, Res, I.getName(), I.isExact())); - } return nullptr; } From 7979e1ba298e3602d569f05a46c10b8efca9fd6f Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 10 Jan 2025 14:17:49 -0800 Subject: [PATCH 118/408] [RISCV] Add a default assignment of Inst{12-7} to RVInst16CSS. NFC Some bits need to be overwritten by child classes, but at least a few of the upper bits are common to all child classes. --- llvm/lib/Target/RISCV/RISCVInstrFormatsC.td | 3 ++- llvm/lib/Target/RISCV/RISCVInstrInfoC.td | 5 ----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormatsC.td b/llvm/lib/Target/RISCV/RISCVInstrFormatsC.td index 198d1466f022e..5e16061dc470f 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrFormatsC.td +++ b/llvm/lib/Target/RISCV/RISCVInstrFormatsC.td @@ -53,7 +53,7 @@ class RVInst16CI funct3, bits<2> opcode, dag outs, dag ins, // The immediate value encoding differs for each instruction, so each subclass // is responsible for setting the appropriate bits in the Inst field. -// The bits Inst{12-7} must be set for each instruction. +// The bits Inst{12-7} may need to be set differently for some instructions. 
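// (For example, in RISCVInstrInfoC.td the stack stores keep the default for
// the upper immediate bits and override only the low ones: C.SWSP keeps
// Inst{12-9} = imm{5-2} and overrides Inst{8-7} to imm{7-6}, while C.SDSP
// keeps Inst{12-10} = imm{5-3} and overrides Inst{9-7} to imm{8-6}.)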
class RVInst16CSS funct3, bits<2> opcode, dag outs, dag ins, string opcodestr, string argstr> : RVInst16 { @@ -62,6 +62,7 @@ class RVInst16CSS funct3, bits<2> opcode, dag outs, dag ins, bits<5> rs1; let Inst{15-13} = funct3; + let Inst{12-7} = imm{5-0}; let Inst{6-2} = rs2; let Inst{1-0} = opcode; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td index 1fab1fe1f3a15..0f320d2375ec2 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td @@ -569,20 +569,17 @@ def C_ADD : RVInst16CR<0b1001, 0b10, (outs GPRNoX0:$rd), let Predicates = [HasStdExtCOrZcd, HasStdExtD] in def C_FSDSP : CStackStore<0b101, "c.fsdsp", FPR64, uimm9_lsb000>, Sched<[WriteFST64, ReadFStoreData, ReadFMemBase]> { - let Inst{12-10} = imm{5-3}; let Inst{9-7} = imm{8-6}; } def C_SWSP : CStackStore<0b110, "c.swsp", GPR, uimm8_lsb00>, Sched<[WriteSTW, ReadStoreData, ReadMemBase]> { - let Inst{12-9} = imm{5-2}; let Inst{8-7} = imm{7-6}; } let isCodeGenOnly = 1 in def C_SWSP_INX : CStackStore<0b110, "c.swsp", GPRF32, uimm8_lsb00>, Sched<[WriteSTW, ReadStoreData, ReadMemBase]> { - let Inst{12-9} = imm{5-2}; let Inst{8-7} = imm{7-6}; } @@ -590,14 +587,12 @@ let DecoderNamespace = "RISCV32Only_", Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in def C_FSWSP : CStackStore<0b111, "c.fswsp", FPR32, uimm8_lsb00>, Sched<[WriteFST32, ReadFStoreData, ReadFMemBase]> { - let Inst{12-9} = imm{5-2}; let Inst{8-7} = imm{7-6}; } let Predicates = [HasStdExtCOrZca, IsRV64] in def C_SDSP : CStackStore<0b111, "c.sdsp", GPR, uimm9_lsb000>, Sched<[WriteSTD, ReadStoreData, ReadMemBase]> { - let Inst{12-10} = imm{5-3}; let Inst{9-7} = imm{8-6}; } From 85711bdda31a34a16c6458b0e824a3dd5f753929 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Fri, 10 Jan 2025 14:40:57 -0800 Subject: [PATCH 119/408] [libc][docs] update docs on how to build linux kernel headers from src (#122381) It's simpler than the directions we have; which are very very Debian specific. --- libc/docs/full_host_build.rst | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/libc/docs/full_host_build.rst b/libc/docs/full_host_build.rst index e25079141f47b..12aacf181695a 100644 --- a/libc/docs/full_host_build.rst +++ b/libc/docs/full_host_build.rst @@ -175,17 +175,15 @@ Linux Headers ============= If you are using the full libc on Linux, then you will also need to install -Linux headers in your sysroot. The way to do this varies per system. - -These instructions should work on a Debian-based x86_64 system: +Linux headers in your sysroot. Let's build them from source. .. code-block:: sh - $> apt download linux-libc-dev - $> dpkg -x linux-libc-dev*deb . - $> cp -r usr/* /path/to/sysroot/ - $> rm -r usr linux-libc-dev*deb - $> ln -s /path/to/sysroot/include/x86_64-linux-gnu/asm /path/to/sysroot/include/asm + $> git clone --depth=1 git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git /tmp/linux + $> make LLVM=1 INSTALL_HDR_PATH=/path/to/sysroot -C /tmp/linux headers_install + +The headers can be built to target non-host architectures by adding the +``ARCH={arm|arm64|i386}`` to the above invocation of ``make``. 
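(Editorial sketch, not in the patch: combining the two steps, assuming the
``/tmp/linux`` checkout from above and an arm64 sysroot.)

.. code-block:: sh

   $> make LLVM=1 ARCH=arm64 INSTALL_HDR_PATH=/path/to/sysroot -C /tmp/linux headers_install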
Using your newly built libc =========================== From 25b90c4ef67a01de6eba4f9e160d33772eb53454 Mon Sep 17 00:00:00 2001 From: Vasileios Porpodas Date: Fri, 10 Jan 2025 14:54:01 -0800 Subject: [PATCH 120/408] [SandboxVec][SeedCollector][NFC] Remove redundant 'else' and move the assertion within the 'if' --- .../Vectorize/SandboxVectorizer/SeedCollector.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SeedCollector.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SeedCollector.cpp index a3ce663407c4a..bc86c785e84ae 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SeedCollector.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SeedCollector.cpp @@ -67,13 +67,13 @@ ArrayRef SeedBundle::getSlice(unsigned StartIdx, BitCount = BitCountPowerOfTwo; } - assert((!ForcePowerOf2 || isPowerOf2_32(BitCount)) && - "Must be a power of two"); // Return any non-empty slice - if (NumElements > 1) + if (NumElements > 1) { + assert((!ForcePowerOf2 || isPowerOf2_32(BitCount)) && + "Must be a power of two"); return ArrayRef(&Seeds[StartIdx], NumElements); - else - return {}; + } + return {}; } template From 129ec845749fe117970f71c330945b5709e1d220 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 10 Jan 2025 15:10:17 -0800 Subject: [PATCH 121/408] [Conversion] Migrate away from PointerUnion::{is,get} (NFC) (#122421) Note that PointerUnion::{is,get} have been soft deprecated in PointerUnion.h: // FIXME: Replace the uses of is(), get() and dyn_cast() with // isa, cast and the llvm::dyn_cast I'm not touching PointerUnion::dyn_cast for now because it's a bit complicated; we could blindly migrate it to dyn_cast_if_present, but we should probably use dyn_cast when the operand is known to be non-null. --- .../MemRefToSPIRV/MemRefToSPIRV.cpp | 2 +- mlir/lib/Conversion/MeshToMPI/MeshToMPI.cpp | 23 +++++++++---------- .../VectorToLLVM/ConvertVectorToLLVM.cpp | 2 +- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRV.cpp b/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRV.cpp index 49a391938eaf6..04bc62262c3d8 100644 --- a/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRV.cpp +++ b/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRV.cpp @@ -911,7 +911,7 @@ LogicalResult ReinterpretCastPattern::matchAndRewrite( if (auto val = dyn_cast(offset)) return val; - int64_t attrVal = cast(offset.get()).getInt(); + int64_t attrVal = cast(cast(offset)).getInt(); Attribute attr = rewriter.getIntegerAttr(intType, attrVal); return rewriter.createOrFold(loc, intType, attr); }(); diff --git a/mlir/lib/Conversion/MeshToMPI/MeshToMPI.cpp b/mlir/lib/Conversion/MeshToMPI/MeshToMPI.cpp index e1de125ccaede..eb265c621564a 100644 --- a/mlir/lib/Conversion/MeshToMPI/MeshToMPI.cpp +++ b/mlir/lib/Conversion/MeshToMPI/MeshToMPI.cpp @@ -240,13 +240,12 @@ struct ConvertUpdateHaloOp auto loc = op.getLoc(); // convert a OpFoldResult into a Value - auto toValue = [&rewriter, &loc](OpFoldResult &v) { - return v.is() - ? 
v.get() - : rewriter.create<::mlir::arith::ConstantOp>( - loc, - rewriter.getIndexAttr( - cast(v.get()).getInt())); + auto toValue = [&rewriter, &loc](OpFoldResult &v) -> Value { + if (auto value = dyn_cast(v)) + return value; + return rewriter.create<::mlir::arith::ConstantOp>( + loc, rewriter.getIndexAttr( + cast(cast(v)).getInt())); }; auto dest = op.getDestination(); @@ -267,11 +266,11 @@ struct ConvertUpdateHaloOp getMixedValues(op.getStaticHaloSizes(), op.getHaloSizes(), rewriter); // subviews need Index values for (auto &sz : haloSizes) { - if (sz.is()) { - sz = rewriter - .create(loc, rewriter.getIndexType(), - sz.get()) - .getResult(); + if (auto value = dyn_cast(sz)) { + sz = + rewriter + .create(loc, rewriter.getIndexType(), value) + .getResult(); } } diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp index 9657f583c375b..d688d8e2ab658 100644 --- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp +++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp @@ -124,7 +124,7 @@ static Value getAsLLVMValue(OpBuilder &builder, Location loc, return builder.create(loc, intAttr).getResult(); } - return foldResult.get(); + return cast(foldResult); } namespace { From b302633bc5b93118b8a0bcaabfe0957294b9e894 Mon Sep 17 00:00:00 2001 From: Eli Friedman Date: Fri, 10 Jan 2025 15:11:19 -0800 Subject: [PATCH 122/408] [libclang] Allow using PrintingPolicy with types (#122386) This allows controlling pretty-printing of types the same way it works with cursors. --- clang/bindings/python/clang/cindex.py | 5 +++++ .../bindings/python/tests/cindex/test_type.py | 19 ++++++++++++++++++- clang/docs/ReleaseNotes.rst | 13 +++++++++---- clang/include/clang-c/Index.h | 8 ++++++++ clang/tools/libclang/CXType.cpp | 14 ++++++++++++++ clang/tools/libclang/libclang.map | 1 + 6 files changed, 55 insertions(+), 5 deletions(-) diff --git a/clang/bindings/python/clang/cindex.py b/clang/bindings/python/clang/cindex.py index 7caf0bbfd722a..710259de855f9 100644 --- a/clang/bindings/python/clang/cindex.py +++ b/clang/bindings/python/clang/cindex.py @@ -2701,6 +2701,10 @@ def spelling(self): """Retrieve the spelling of this Type.""" return _CXString.from_result(conf.lib.clang_getTypeSpelling(self)) + def pretty_printed(self, policy): + """Pretty-prints this Type with the given PrintingPolicy""" + return _CXString.from_result(conf.lib.clang_getTypePrettyPrinted(self, policy)) + def __eq__(self, other): if type(other) != type(self): return False @@ -3955,6 +3959,7 @@ def set_property(self, property, value): ("clang_getTypedefDeclUnderlyingType", [Cursor], Type), ("clang_getTypedefName", [Type], _CXString), ("clang_getTypeKindSpelling", [c_uint], _CXString), + ("clang_getTypePrettyPrinted", [Type, PrintingPolicy], _CXString), ("clang_getTypeSpelling", [Type], _CXString), ("clang_hashCursor", [Cursor], c_uint), ("clang_isAttribute", [CursorKind], bool), diff --git a/clang/bindings/python/tests/cindex/test_type.py b/clang/bindings/python/tests/cindex/test_type.py index e1d8c2aad1c3a..f39da8b5faf29 100644 --- a/clang/bindings/python/tests/cindex/test_type.py +++ b/clang/bindings/python/tests/cindex/test_type.py @@ -1,6 +1,14 @@ import os -from clang.cindex import Config, CursorKind, RefQualifierKind, TranslationUnit, TypeKind +from clang.cindex import ( + Config, + CursorKind, + PrintingPolicy, + PrintingPolicyProperty, + RefQualifierKind, + TranslationUnit, + TypeKind, +) if "CLANG_LIBRARY_PATH" in os.environ: 
Config.set_library_path(os.environ["CLANG_LIBRARY_PATH"]) @@ -517,3 +525,12 @@ class Template { # Variable without a template argument. cursor = get_cursor(tu, "bar") self.assertEqual(cursor.get_num_template_arguments(), -1) + + def test_pretty(self): + tu = get_tu("struct X {}; X x;", lang="cpp") + f = get_cursor(tu, "x") + + pp = PrintingPolicy.create(f) + self.assertEqual(f.type.get_canonical().pretty_printed(pp), "X") + pp.set_property(PrintingPolicyProperty.SuppressTagKeyword, False) + self.assertEqual(f.type.get_canonical().pretty_printed(pp), "struct X") diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 5a48d6fbc01fa..190843f2aa6c9 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -1182,6 +1182,8 @@ libclang -------- - Add ``clang_isBeforeInTranslationUnit``. Given two source locations, it determines whether the first one comes strictly before the second in the source code. +- Add ``clang_getTypePrettyPrinted``. It allows controlling the PrintingPolicy used + to pretty-print a type. Static Analyzer --------------- @@ -1322,10 +1324,13 @@ Sanitizers Python Binding Changes ---------------------- - Fixed an issue that led to crashes when calling ``Type.get_exception_specification_kind``. -- Added bindings for ``clang_getCursorPrettyPrinted`` and related functions, - which allow changing the formatting of pretty-printed code. -- Added binding for ``clang_Cursor_isAnonymousRecordDecl``, which allows checking if - a declaration is an anonymous union or anonymous struct. +- Added ``Cursor.pretty_printed``, a binding for ``clang_getCursorPrettyPrinted``, + and related functions, which allow changing the formatting of pretty-printed code. +- Added ``Cursor.is_anonymous_record_decl``, a binding for + ``clang_Cursor_isAnonymousRecordDecl``, which allows checking if a + declaration is an anonymous union or anonymous struct. +- Added ``Type.pretty_printed`, a binding for ``clang_getTypePrettyPrinted``, + which allows changing the formatting of pretty-printed types. OpenMP Support -------------- diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h index 63d266dc60ec7..ad64497ceb802 100644 --- a/clang/include/clang-c/Index.h +++ b/clang/include/clang-c/Index.h @@ -4182,6 +4182,14 @@ CINDEX_LINKAGE void clang_PrintingPolicy_dispose(CXPrintingPolicy Policy); CINDEX_LINKAGE CXString clang_getCursorPrettyPrinted(CXCursor Cursor, CXPrintingPolicy Policy); +/** + * Pretty-print the underlying type using a custom printing policy. + * + * If the type is invalid, an empty string is returned. + */ +CINDEX_LINKAGE CXString clang_getTypePrettyPrinted(CXType CT, + CXPrintingPolicy cxPolicy); + /** * Retrieve the display name for the entity referenced by this cursor. 
* diff --git a/clang/tools/libclang/CXType.cpp b/clang/tools/libclang/CXType.cpp index b4df12405cf35..f97023c429bfa 100644 --- a/clang/tools/libclang/CXType.cpp +++ b/clang/tools/libclang/CXType.cpp @@ -313,6 +313,20 @@ CXString clang_getTypeSpelling(CXType CT) { return cxstring::createDup(OS.str()); } +CXString clang_getTypePrettyPrinted(CXType CT, CXPrintingPolicy cxPolicy) { + QualType T = GetQualType(CT); + if (T.isNull()) + return cxstring::createEmpty(); + + SmallString<64> Str; + llvm::raw_svector_ostream OS(Str); + PrintingPolicy *UserPolicy = static_cast(cxPolicy); + + T.print(OS, *UserPolicy); + + return cxstring::createDup(OS.str()); +} + CXType clang_getTypedefDeclUnderlyingType(CXCursor C) { using namespace cxcursor; CXTranslationUnit TU = cxcursor::getCursorTU(C); diff --git a/clang/tools/libclang/libclang.map b/clang/tools/libclang/libclang.map index 25d8ba57d3251..00ba56ab3c79d 100644 --- a/clang/tools/libclang/libclang.map +++ b/clang/tools/libclang/libclang.map @@ -436,6 +436,7 @@ LLVM_19 { LLVM_20 { global: + clang_getTypePrettyPrinted; clang_isBeforeInTranslationUnit; }; From 37f42cfb9a138409f19d31deeaa867ce2165d08f Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Fri, 10 Jan 2025 15:43:10 -0800 Subject: [PATCH 123/408] [nfc] Update header in llvm-ctxprof-utils (#122544) --- llvm/tools/llvm-ctxprof-util/llvm-ctxprof-util.cpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/llvm/tools/llvm-ctxprof-util/llvm-ctxprof-util.cpp b/llvm/tools/llvm-ctxprof-util/llvm-ctxprof-util.cpp index 485f6c7d33d90..2cf6d7613bdc9 100644 --- a/llvm/tools/llvm-ctxprof-util/llvm-ctxprof-util.cpp +++ b/llvm/tools/llvm-ctxprof-util/llvm-ctxprof-util.cpp @@ -1,4 +1,4 @@ -//===--- PGOCtxProfJSONReader.h - JSON format ------------------*- C++ -*-===// +//===--- llvm-ctxprof-util - utilities for ctxprof --------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -8,19 +8,15 @@ /// /// \file /// -/// JSON format for the contextual profile for testing. +/// Utilities for manipulating contextual profiles /// //===----------------------------------------------------------------------===// -#include "llvm/ADT/STLExtras.h" #include "llvm/IR/GlobalValue.h" -#include "llvm/ProfileData/CtxInstrContextNode.h" #include "llvm/ProfileData/PGOCtxProfWriter.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/InitLLVM.h" -#include "llvm/Support/JSON.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/raw_ostream.h" From 8a1174f06cb69c92290a2231ede0e2a8e8460e0c Mon Sep 17 00:00:00 2001 From: Ian Anderson Date: Fri, 10 Jan 2025 15:50:54 -0800 Subject: [PATCH 124/408] [Darwin][Driver][clang] arm64-apple-none-macho is missing the Apple macros from arm-apple-none-macho (#122427) arm-apple-none-macho uses DarwinTargetInfo which provides several Apple specific macros. arm64-apple-none-macho however just uses the generic AArch64leTargetInfo and doesn't get any of those macros. It's not clear if everything from DarwinTargetInfo is desirable for arm64-apple-none-macho, so make an AppleMachOTargetInfo to hold the generic Apple macros and a few other basic things. 
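(Illustration, not part of the patch: a hypothetical probe for the macros this
change wires up. After the patch all three are defined for
`arm64-apple-none-macho`; before it, `__APPLE__` and `__APPLE_CC__` were
missing on that triple.)

```c++
// e.g. clang -target arm64-apple-none-macho -dM -E -x c++ /dev/null
#if !defined(__APPLE__) || !defined(__APPLE_CC__) || !defined(__MACH__)
#error "expected the generic Apple macros on an Apple Mach-O target"
#endif
```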
--- clang/lib/Basic/Targets.cpp | 8 ++ clang/lib/Basic/Targets/AArch64.cpp | 23 ++++- clang/lib/Basic/Targets/AArch64.h | 14 ++++ clang/lib/Basic/Targets/ARM.cpp | 10 +++ clang/lib/Basic/Targets/ARM.h | 11 +++ clang/lib/Basic/Targets/OSTargets.cpp | 26 ++++-- clang/lib/Basic/Targets/OSTargets.h | 38 +++++++-- clang/lib/Basic/Targets/X86.h | 8 ++ clang/lib/Frontend/InitPreprocessor.cpp | 5 -- clang/test/Preprocessor/darwin-predefines.c | 83 +++++++++++++++++++ .../Preprocessor/macho-embedded-predefines.c | 23 +++++ 11 files changed, 227 insertions(+), 22 deletions(-) create mode 100644 clang/test/Preprocessor/darwin-predefines.c diff --git a/clang/lib/Basic/Targets.cpp b/clang/lib/Basic/Targets.cpp index f40b81c02eb7e..fad3de217d81b 100644 --- a/clang/lib/Basic/Targets.cpp +++ b/clang/lib/Basic/Targets.cpp @@ -135,11 +135,15 @@ std::unique_ptr AllocateTarget(const llvm::Triple &Triple, case llvm::Triple::aarch64_32: if (Triple.isOSDarwin()) return std::make_unique(Triple, Opts); + else if (Triple.isAppleMachO()) + return std::make_unique(Triple, Opts); return nullptr; case llvm::Triple::aarch64: if (Triple.isOSDarwin()) return std::make_unique(Triple, Opts); + else if (Triple.isAppleMachO()) + return std::make_unique(Triple, Opts); switch (os) { case llvm::Triple::FreeBSD: @@ -243,6 +247,8 @@ std::unique_ptr AllocateTarget(const llvm::Triple &Triple, case llvm::Triple::thumbeb: if (Triple.isOSDarwin()) return std::make_unique(Triple, Opts); + else if (Triple.isAppleMachO()) + return std::make_unique(Triple, Opts); switch (os) { case llvm::Triple::Linux: @@ -531,6 +537,8 @@ std::unique_ptr AllocateTarget(const llvm::Triple &Triple, case llvm::Triple::x86: if (Triple.isOSDarwin()) return std::make_unique(Triple, Opts); + else if (Triple.isAppleMachO()) + return std::make_unique(Triple, Opts); switch (os) { case llvm::Triple::Linux: { diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index 2b4b954d0c27a..1bf58661d0efc 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -1671,6 +1671,10 @@ MinGWARM64TargetInfo::MinGWARM64TargetInfo(const llvm::Triple &Triple, TheCXXABI.set(TargetCXXABI::GenericAArch64); } +AppleMachOAArch64TargetInfo::AppleMachOAArch64TargetInfo( + const llvm::Triple &Triple, const TargetOptions &Opts) + : AppleMachOTargetInfo(Triple, Opts) {} + DarwinAArch64TargetInfo::DarwinAArch64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts) : DarwinTargetInfo(Triple, Opts) { @@ -1695,9 +1699,9 @@ DarwinAArch64TargetInfo::DarwinAArch64TargetInfo(const llvm::Triple &Triple, TheCXXABI.set(TargetCXXABI::AppleARM64); } -void DarwinAArch64TargetInfo::getOSDefines(const LangOptions &Opts, - const llvm::Triple &Triple, - MacroBuilder &Builder) const { +void clang::targets::getAppleMachOAArch64Defines(MacroBuilder &Builder, + const LangOptions &Opts, + const llvm::Triple &Triple) { Builder.defineMacro("__AARCH64_SIMD__"); if (Triple.isArch32Bit()) Builder.defineMacro("__ARM64_ARCH_8_32__"); @@ -1710,7 +1714,20 @@ void DarwinAArch64TargetInfo::getOSDefines(const LangOptions &Opts, if (Triple.isArm64e()) Builder.defineMacro("__arm64e__", "1"); +} +void AppleMachOAArch64TargetInfo::getOSDefines(const LangOptions &Opts, + const llvm::Triple &Triple, + MacroBuilder &Builder) const { + getAppleMachOAArch64Defines(Builder, Opts, Triple); + AppleMachOTargetInfo::getOSDefines(Opts, Triple, + Builder); +} + +void DarwinAArch64TargetInfo::getOSDefines(const LangOptions &Opts, + const llvm::Triple &Triple, + 
MacroBuilder &Builder) const { + getAppleMachOAArch64Defines(Builder, Opts, Triple); DarwinTargetInfo::getOSDefines(Opts, Triple, Builder); } diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h index 4e927c0953b1f..cedf3286806ac 100644 --- a/clang/lib/Basic/Targets/AArch64.h +++ b/clang/lib/Basic/Targets/AArch64.h @@ -306,6 +306,20 @@ class LLVM_LIBRARY_VISIBILITY AArch64beTargetInfo : public AArch64TargetInfo { void setDataLayout() override; }; +void getAppleMachOAArch64Defines(MacroBuilder &Builder, const LangOptions &Opts, + const llvm::Triple &Triple); + +class LLVM_LIBRARY_VISIBILITY AppleMachOAArch64TargetInfo + : public AppleMachOTargetInfo { +public: + AppleMachOAArch64TargetInfo(const llvm::Triple &Triple, + const TargetOptions &Opts); + +protected: + void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple, + MacroBuilder &Builder) const override; +}; + class LLVM_LIBRARY_VISIBILITY DarwinAArch64TargetInfo : public DarwinTargetInfo { public: diff --git a/clang/lib/Basic/Targets/ARM.cpp b/clang/lib/Basic/Targets/ARM.cpp index 370444057b429..61ee26d886383 100644 --- a/clang/lib/Basic/Targets/ARM.cpp +++ b/clang/lib/Basic/Targets/ARM.cpp @@ -1479,6 +1479,16 @@ void CygwinARMTargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("_GNU_SOURCE"); } +AppleMachOARMTargetInfo::AppleMachOARMTargetInfo(const llvm::Triple &Triple, + const TargetOptions &Opts) + : AppleMachOTargetInfo(Triple, Opts) {} + +void AppleMachOARMTargetInfo::getOSDefines(const LangOptions &Opts, + const llvm::Triple &Triple, + MacroBuilder &Builder) const { + getAppleMachODefines(Builder, Opts, Triple); +} + DarwinARMTargetInfo::DarwinARMTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts) : DarwinTargetInfo(Triple, Opts) { diff --git a/clang/lib/Basic/Targets/ARM.h b/clang/lib/Basic/Targets/ARM.h index 55ecb99d82d8f..fdb40c3d41918 100644 --- a/clang/lib/Basic/Targets/ARM.h +++ b/clang/lib/Basic/Targets/ARM.h @@ -300,6 +300,17 @@ class LLVM_LIBRARY_VISIBILITY CygwinARMTargetInfo : public ARMleTargetInfo { MacroBuilder &Builder) const override; }; +class LLVM_LIBRARY_VISIBILITY AppleMachOARMTargetInfo + : public AppleMachOTargetInfo { +protected: + void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple, + MacroBuilder &Builder) const override; + +public: + AppleMachOARMTargetInfo(const llvm::Triple &Triple, + const TargetOptions &Opts); +}; + class LLVM_LIBRARY_VISIBILITY DarwinARMTargetInfo : public DarwinTargetInfo { protected: diff --git a/clang/lib/Basic/Targets/OSTargets.cpp b/clang/lib/Basic/Targets/OSTargets.cpp index 6f98353fb8c2e..bf10f9a725567 100644 --- a/clang/lib/Basic/Targets/OSTargets.cpp +++ b/clang/lib/Basic/Targets/OSTargets.cpp @@ -19,19 +19,17 @@ using namespace clang::targets; namespace clang { namespace targets { -void getDarwinDefines(MacroBuilder &Builder, const LangOptions &Opts, - const llvm::Triple &Triple, StringRef &PlatformName, - VersionTuple &PlatformMinVersion) { +void getAppleMachODefines(MacroBuilder &Builder, const LangOptions &Opts, + const llvm::Triple &Triple) { Builder.defineMacro("__APPLE_CC__", "6000"); Builder.defineMacro("__APPLE__"); - Builder.defineMacro("__STDC_NO_THREADS__"); // AddressSanitizer doesn't play well with source fortification, which is on - // by default on Darwin. + // by default on Apple platforms. if (Opts.Sanitize.has(SanitizerKind::Address)) Builder.defineMacro("_FORTIFY_SOURCE", "0"); - // Darwin defines __weak, __strong, and __unsafe_unretained even in C mode. 
+ // Apple defines __weak, __strong, and __unsafe_unretained even in C mode. if (!Opts.ObjC) { // __weak is always defined, for use in blocks and with objc pointers. Builder.defineMacro("__weak", "__attribute__((objc_gc(weak)))"); @@ -47,6 +45,22 @@ void getDarwinDefines(MacroBuilder &Builder, const LangOptions &Opts, if (Opts.POSIXThreads) Builder.defineMacro("_REENTRANT"); + // __MACH__ originally meant "will run in a Mach kernel based OS", but it has + // come to also mean "uses Apple Mach-O linking/symbol visibility semantics". + // Notably libc++'s __configuration/platform.h and Swift's shims/Visibility.h + // take __MACH__ for the more general meaning. + if (Triple.isAppleMachO() || Triple.isOSDarwin()) + Builder.defineMacro("__MACH__"); +} + +void getDarwinDefines(MacroBuilder &Builder, const LangOptions &Opts, + const llvm::Triple &Triple, StringRef &PlatformName, + VersionTuple &PlatformMinVersion) { + getAppleMachODefines(Builder, Opts, Triple); + + // Darwin's libc doesn't have threads.h + Builder.defineMacro("__STDC_NO_THREADS__"); + // Get the platform type and version number from the triple. VersionTuple OsVersion; if (Triple.isMacOSX()) { diff --git a/clang/lib/Basic/Targets/OSTargets.h b/clang/lib/Basic/Targets/OSTargets.h index b8d9a5725d012..ba9acc8b2a05c 100644 --- a/clang/lib/Basic/Targets/OSTargets.h +++ b/clang/lib/Basic/Targets/OSTargets.h @@ -34,12 +34,39 @@ class LLVM_LIBRARY_VISIBILITY OSTargetInfo : public TgtInfo { } }; +void getAppleMachODefines(MacroBuilder &Builder, const LangOptions &Opts, + const llvm::Triple &Triple); + void getDarwinDefines(MacroBuilder &Builder, const LangOptions &Opts, const llvm::Triple &Triple, StringRef &PlatformName, VersionTuple &PlatformMinVersion); template -class LLVM_LIBRARY_VISIBILITY DarwinTargetInfo : public OSTargetInfo { +class LLVM_LIBRARY_VISIBILITY AppleMachOTargetInfo + : public OSTargetInfo { +protected: + void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple, + MacroBuilder &Builder) const override { + getAppleMachODefines(Builder, Opts, Triple); + } + +public: + AppleMachOTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts) + : OSTargetInfo(Triple, Opts) {} + + const char *getStaticInitSectionSpecifier() const override { + return "__TEXT,__StaticInit,regular,pure_instructions"; + } + + /// Apple Mach-O does not support protected visibility. Its "default" is very + /// similar to ELF's "protected"; Apple Mach-O requires a "weak" attribute on + /// declarations that can be dynamically replaced. + bool hasProtectedVisibility() const override { return false; } +}; + +template +class LLVM_LIBRARY_VISIBILITY DarwinTargetInfo + : public AppleMachOTargetInfo { protected: void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple, MacroBuilder &Builder) const override { @@ -49,7 +76,7 @@ class LLVM_LIBRARY_VISIBILITY DarwinTargetInfo : public OSTargetInfo { public: DarwinTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts) - : OSTargetInfo(Triple, Opts) { + : AppleMachOTargetInfo(Triple, Opts) { // By default, no TLS, and we list permitted architecture/OS // combinations. this->TLSSupported = false; @@ -82,14 +109,9 @@ class LLVM_LIBRARY_VISIBILITY DarwinTargetInfo : public OSTargetInfo { const char *getStaticInitSectionSpecifier() const override { // FIXME: We should return 0 when building kexts. - return "__TEXT,__StaticInit,regular,pure_instructions"; + return AppleMachOTargetInfo::getStaticInitSectionSpecifier(); } - /// Darwin does not support protected visibility. 
Darwin's "default" - /// is very similar to ELF's "protected"; Darwin requires a "weak" - /// attribute on declarations that can be dynamically replaced. - bool hasProtectedVisibility() const override { return false; } - unsigned getExnObjectAlignment() const override { // Older versions of libc++abi guarantee an alignment of only 8-bytes for // exception objects because of a bug in __cxa_exception that was diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h index 35aceb1c58e14..2c200e64a3d84 100644 --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -534,6 +534,14 @@ class LLVM_LIBRARY_VISIBILITY OpenBSDI386TargetInfo } }; +class LLVM_LIBRARY_VISIBILITY AppleMachOI386TargetInfo + : public AppleMachOTargetInfo { +public: + AppleMachOI386TargetInfo(const llvm::Triple &Triple, + const TargetOptions &Opts) + : AppleMachOTargetInfo(Triple, Opts) {} +}; + class LLVM_LIBRARY_VISIBILITY DarwinI386TargetInfo : public DarwinTargetInfo { public: diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index 8eba766f21a64..29723b573e771 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -1507,11 +1507,6 @@ static void InitializePredefinedMacros(const TargetInfo &TI, // ELF targets define __ELF__ if (TI.getTriple().isOSBinFormatELF()) Builder.defineMacro("__ELF__"); - else if (TI.getTriple().isAppleMachO()) - // Apple MachO targets define __MACH__ even when not using DarwinTargetInfo. - // Hurd will also define this in some circumstances, but that's done in - // HurdTargetInfo. Windows targets don't define this. - Builder.defineMacro("__MACH__"); // Target OS macro definitions. if (PPOpts.DefineTargetOSMacros) { diff --git a/clang/test/Preprocessor/darwin-predefines.c b/clang/test/Preprocessor/darwin-predefines.c new file mode 100644 index 0000000000000..c76eba6db1b6b --- /dev/null +++ b/clang/test/Preprocessor/darwin-predefines.c @@ -0,0 +1,83 @@ +// RUN: %clang_cc1 -E -dM -triple arm64-apple-macosx15.0.0 -target-cpu apple-m1 %s | FileCheck %s -check-prefix CHECK-MACOSX + +// CHECK-MACOSX: #define __APPLE_CC__ +// CHECK-MACOSX: #define __APPLE__ +// CHECK-MACOSX: #define __ARM_64BIT_STATE 1 +// CHECK-MACOSX-NOT: #define __ENVIRONMENT_DRIVERKIT_VERSION_MIN_REQUIRED__ +// CHECK-MACOSX-NOT: #define __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ +// CHECK-MACOSX: #define __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ 150000 +// CHECK-MACOSX: #define __ENVIRONMENT_OS_VERSION_MIN_REQUIRED__ 150000 +// CHECK-MACOSX-NOT: #define __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ +// CHECK-MACOSX-NOT: #define __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ +// CHECK-MACOSX: #define __MACH__ +// CHECK-MACOSX: #define __STDC_NO_THREADS__ + +// RUN: %clang_cc1 -E -dM -triple arm64-apple-ios18.0.0 -target-cpu apple-a7 %s | FileCheck %s -check-prefix CHECK-IOS + +// CHECK-IOS: #define __APPLE_CC__ +// CHECK-IOS: #define __APPLE__ +// CHECK-IOS: #define __ARM_64BIT_STATE 1 +// CHECK-IOS-NOT: #define __ENVIRONMENT_DRIVERKIT_VERSION_MIN_REQUIRED__ +// CHECK-IOS: #define __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ 180000 +// CHECK-IOS-NOT: #define __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ +// CHECK-IOS: #define __ENVIRONMENT_OS_VERSION_MIN_REQUIRED__ 180000 +// CHECK-IOS-NOT: #define __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ +// CHECK-IOS-NOT: #define __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ +// CHECK-IOS: #define __MACH__ +// CHECK-IOS: #define __STDC_NO_THREADS__ + +// RUN: 
%clang_cc1 -E -dM -triple arm64-apple-watchos11.0.0 -target-cpu apple-s4 %s | FileCheck %s -check-prefix CHECK-WATCHOS + +// CHECK-WATCHOS: #define __APPLE_CC__ +// CHECK-WATCHOS: #define __APPLE__ +// CHECK-WATCHOS: #define __ARM_64BIT_STATE 1 +// CHECK-WATCHOS-NOT: #define __ENVIRONMENT_DRIVERKIT_VERSION_MIN_REQUIRED__ +// CHECK-WATCHOS-NOT: #define __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ +// CHECK-WATCHOS-NOT: #define __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ +// CHECK-WATCHOS: #define __ENVIRONMENT_OS_VERSION_MIN_REQUIRED__ 110000 +// CHECK-WATCHOS-NOT: #define __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ +// CHECK-WATCHOS: #define __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ 110000 +// CHECK-WATCHOS: #define __MACH__ +// CHECK-WATCHOS: #define __STDC_NO_THREADS__ + +// RUN: %clang_cc1 -E -dM -triple arm64-apple-tvos18.0.0 -target-cpu apple-a7 %s | FileCheck %s -check-prefix CHECK-TVOS + +// CHECK-TVOS: #define __APPLE_CC__ +// CHECK-TVOS: #define __APPLE__ +// CHECK-TVOS: #define __ARM_64BIT_STATE 1 +// CHECK-TVOS-NOT: #define __ENVIRONMENT_DRIVERKIT_VERSION_MIN_REQUIRED__ +// CHECK-TVOS-NOT: #define __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ +// CHECK-TVOS-NOT: #define __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ +// CHECK-TVOS: #define __ENVIRONMENT_OS_VERSION_MIN_REQUIRED__ 180000 +// CHECK-TVOS: #define __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ 180000 +// CHECK-TVOS-NOT: #define __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ +// CHECK-TVOS: #define __MACH__ +// CHECK-TVOS: #define __STDC_NO_THREADS__ + +// RUN: %clang_cc1 -E -dM -triple arm64-apple-driverkit24.0.0 -target-cpu apple-a7 %s | FileCheck %s -check-prefix CHECK-DRIVERKIT + +// CHECK-DRIVERKIT: #define __APPLE_CC__ +// CHECK-DRIVERKIT: #define __APPLE__ +// CHECK-DRIVERKIT: #define __ARM_64BIT_STATE 1 +// CHECK-DRIVERKIT: #define __ENVIRONMENT_DRIVERKIT_VERSION_MIN_REQUIRED__ 240000 +// CHECK-DRIVERKIT-NOT: #define __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ +// CHECK-DRIVERKIT-NOT: #define __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ +// CHECK-DRIVERKIT: #define __ENVIRONMENT_OS_VERSION_MIN_REQUIRED__ 240000 +// CHECK-DRIVERKIT-NOT: #define __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ +// CHECK-DRIVERKIT-NOT: #define __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ +// CHECK-DRIVERKIT: #define __MACH__ +// CHECK-DRIVERKIT: #define __STDC_NO_THREADS__ + +// RUN: %clang_cc1 -E -dM -triple arm64-apple-xros2.0.0 -target-cpu apple-a12 %s | FileCheck %s -check-prefix CHECK-XROS + +// CHECK-XROS: #define __APPLE_CC__ +// CHECK-XROS: #define __APPLE__ +// CHECK-XROS: #define __ARM_64BIT_STATE 1 +// CHECK-XROS-NOT: #define __ENVIRONMENT_DRIVERKIT_VERSION_MIN_REQUIRED__ +// CHECK-XROS-NOT: #define __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ +// CHECK-XROS-NOT: #define __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ +// CHECK-XROS: #define __ENVIRONMENT_OS_VERSION_MIN_REQUIRED__ 20000 +// CHECK-XROS-NOT: #define __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ +// CHECK-XROS-NOT: #define __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ +// CHECK-XROS: #define __MACH__ +// CHECK-XROS: #define __STDC_NO_THREADS__ diff --git a/clang/test/Preprocessor/macho-embedded-predefines.c b/clang/test/Preprocessor/macho-embedded-predefines.c index a7e5777a89a98..80a85eaa973e9 100644 --- a/clang/test/Preprocessor/macho-embedded-predefines.c +++ b/clang/test/Preprocessor/macho-embedded-predefines.c @@ -1,20 +1,43 @@ +// RUN: %clang_cc1 -E -dM -triple arm64-apple-none-macho -target-cpu generic %s | FileCheck %s -check-prefix CHECK-64 + +// 
CHECK-64: #define __APPLE_CC__ +// CHECK-64: #define __APPLE__ +// CHECK-64: #define __ARM_64BIT_STATE 1 +// CHECK-64-NOT: #define __ENVIRONMENT_OS_VERSION_MIN_REQUIRED__ +// CHECK-64: #define __MACH__ +// CHECK-64-NOT: #define __STDC_NO_THREADS__ + // RUN: %clang_cc1 -E -dM -triple thumbv7m-apple-unknown-macho -target-cpu cortex-m3 %s | FileCheck %s -check-prefix CHECK-7M // CHECK-7M: #define __APPLE_CC__ // CHECK-7M: #define __APPLE__ // CHECK-7M: #define __ARM_ARCH_7M__ +// CHECK-7M-NOT: #define __ENVIRONMENT_OS_VERSION_MIN_REQUIRED__ // CHECK-7M: #define __MACH__ +// CHECK-7M: #define __STDC_NO_THREADS__ // RUN: %clang_cc1 -E -dM -triple thumbv7em-apple-unknown-macho -target-cpu cortex-m4 %s | FileCheck %s -check-prefix CHECK-7EM // CHECK-7EM: #define __APPLE_CC__ // CHECK-7EM: #define __APPLE__ // CHECK-7EM: #define __ARM_ARCH_7EM__ +// CHECK-7EM-NOT: #define __ENVIRONMENT_OS_VERSION_MIN_REQUIRED__ // CHECK-7EM: #define __MACH__ +// CHECK-7EM: #define __STDC_NO_THREADS__ // RUN: %clang_cc1 -E -dM -triple thumbv6m-apple-unknown-macho -target-cpu cortex-m0 %s | FileCheck %s -check-prefix CHECK-6M // CHECK-6M: #define __APPLE_CC__ // CHECK-6M: #define __APPLE__ // CHECK-6M: #define __ARM_ARCH_6M__ +// CHECK-6M-NOT: #define __ENVIRONMENT_OS_VERSION_MIN_REQUIRED__ // CHECK-6M: #define __MACH__ +// CHECK-6M: #define __STDC_NO_THREADS__ + +// RUN: %clang_cc1 -E -dM -triple x86_64-pc-windows-macho -target-cpu x86-64 %s | FileCheck %s -check-prefix CHECK-WINDOWS + +// CHECK-WINDOWS: #define __APPLE_CC__ +// CHECK-WINDOWS: #define __APPLE__ +// CHECK-WINDOWS-NOT: #define __ENVIRONMENT_OS_VERSION_MIN_REQUIRED__ +// CHECK-WINDOWS-NOT: #define __MACH__ +// CHECK-WINDOWS: #define __STDC_NO_THREADS__ From a475ae05fba7368055071c29a1fc9408346ebd6a Mon Sep 17 00:00:00 2001 From: Sergei Barannikov Date: Sat, 11 Jan 2025 03:36:34 +0300 Subject: [PATCH 125/408] Revert "[ADT] Fix specialization of ValueIsPresent for PointerUnion" (#122557) Reverts llvm/llvm-project#121847 Causes compile time regressions and allegedly miscompilation. --- llvm/include/llvm/Support/Casting.h | 8 ++++---- llvm/lib/CodeGen/RegisterBankInfo.cpp | 4 ++-- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 4 ++-- llvm/unittests/ADT/PointerUnionTest.cpp | 5 ----- 4 files changed, 8 insertions(+), 13 deletions(-) diff --git a/llvm/include/llvm/Support/Casting.h b/llvm/include/llvm/Support/Casting.h index 2ce70e732e2ec..66fdcb44ea2c0 100644 --- a/llvm/include/llvm/Support/Casting.h +++ b/llvm/include/llvm/Support/Casting.h @@ -614,12 +614,12 @@ template struct ValueIsPresent> { static inline decltype(auto) unwrapValue(std::optional &t) { return *t; } }; -// If something is "nullable" then we just cast it to bool to see if it exists. +// If something is "nullable" then we just compare it to nullptr to see if it +// exists. template -struct ValueIsPresent< - T, std::enable_if_t && std::is_constructible_v>> { +struct ValueIsPresent>> { using UnwrappedType = T; - static inline bool isPresent(const T &t) { return static_cast(t); } + static inline bool isPresent(const T &t) { return t != T(nullptr); } static inline decltype(auto) unwrapValue(T &t) { return t; } }; diff --git a/llvm/lib/CodeGen/RegisterBankInfo.cpp b/llvm/lib/CodeGen/RegisterBankInfo.cpp index 5a8cf13ad11fd..e1720b038e236 100644 --- a/llvm/lib/CodeGen/RegisterBankInfo.cpp +++ b/llvm/lib/CodeGen/RegisterBankInfo.cpp @@ -134,10 +134,10 @@ const TargetRegisterClass *RegisterBankInfo::constrainGenericRegister( // If the register already has a class, fallback to MRI::constrainRegClass. 
auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); - if (isa_and_present(RegClassOrBank)) + if (isa(RegClassOrBank)) return MRI.constrainRegClass(Reg, &RC); - const auto *RB = dyn_cast_if_present(RegClassOrBank); + const RegisterBank *RB = cast(RegClassOrBank); // Otherwise, all we can do is ensure the bank covers the class, and set it. if (RB && !RB->covers(RC)) return nullptr; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 8fa656c77e90e..704435dad65d7 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -3708,10 +3708,10 @@ const TargetRegisterClass * SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO, const MachineRegisterInfo &MRI) const { const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg()); - if (const auto *RB = dyn_cast_if_present(RCOrRB)) + if (const RegisterBank *RB = dyn_cast(RCOrRB)) return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB); - if (const auto *RC = dyn_cast_if_present(RCOrRB)) + if (const auto *RC = dyn_cast(RCOrRB)) return getAllocatableClass(RC); return nullptr; diff --git a/llvm/unittests/ADT/PointerUnionTest.cpp b/llvm/unittests/ADT/PointerUnionTest.cpp index a28d532865cbc..acddb78960149 100644 --- a/llvm/unittests/ADT/PointerUnionTest.cpp +++ b/llvm/unittests/ADT/PointerUnionTest.cpp @@ -208,11 +208,6 @@ TEST_F(PointerUnionTest, NewCastInfra) { EXPECT_FALSE(isa(d4null)); EXPECT_FALSE(isa(d4null)); - EXPECT_FALSE(isa_and_present(i4null)); - EXPECT_FALSE(isa_and_present(f4null)); - EXPECT_FALSE(isa_and_present(l4null)); - EXPECT_FALSE(isa_and_present(d4null)); - // test cast<> EXPECT_EQ(cast(a), &f); EXPECT_EQ(cast(b), &i); From 834d65eb2ecea04382630579007a88c30129c734 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Fri, 10 Jan 2025 16:41:43 -0800 Subject: [PATCH 126/408] [nfc][ubsan] Use O1 in test to remove more unrelated stuff (#122408) --- clang/test/CodeGen/allow-ubsan-check.c | 337 +++++++++++-------------- 1 file changed, 145 insertions(+), 192 deletions(-) diff --git a/clang/test/CodeGen/allow-ubsan-check.c b/clang/test/CodeGen/allow-ubsan-check.c index e3860784e716f..3981efe10a5bc 100644 --- a/clang/test/CodeGen/allow-ubsan-check.c +++ b/clang/test/CodeGen/allow-ubsan-check.c @@ -1,86 +1,65 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 -// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -emit-llvm -o - %s -fsanitize=signed-integer-overflow,integer-divide-by-zero,null -mllvm -ubsan-guard-checks | FileCheck %s -// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -emit-llvm -o - %s -fsanitize=signed-integer-overflow,integer-divide-by-zero,null -mllvm -ubsan-guard-checks -fsanitize-trap=signed-integer-overflow,integer-divide-by-zero,null | FileCheck %s --check-prefixes=TR -// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -emit-llvm -o - %s -fsanitize=signed-integer-overflow,integer-divide-by-zero,null -mllvm -ubsan-guard-checks -fsanitize-recover=signed-integer-overflow,integer-divide-by-zero,null | FileCheck %s --check-prefixes=REC +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -emit-llvm -O1 -o - %s -fsanitize=signed-integer-overflow,integer-divide-by-zero,null -mllvm -ubsan-guard-checks | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -emit-llvm -O1 -o - %s -fsanitize=signed-integer-overflow,integer-divide-by-zero,null -mllvm 
-ubsan-guard-checks -fsanitize-trap=signed-integer-overflow,integer-divide-by-zero,null | FileCheck %s --check-prefixes=TR +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -emit-llvm -O1 -o - %s -fsanitize=signed-integer-overflow,integer-divide-by-zero,null -mllvm -ubsan-guard-checks -fsanitize-recover=signed-integer-overflow,integer-divide-by-zero,null | FileCheck %s --check-prefixes=REC -// CHECK-LABEL: define dso_local i32 @div( -// CHECK-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[Y_ADDR:%.*]] = alloca i32, align 4 -// CHECK-NEXT: store i32 [[X]], ptr [[X_ADDR]], align 4 -// CHECK-NEXT: store i32 [[Y]], ptr [[Y_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[X_ADDR]], align 4 -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[Y_ADDR]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0, !nosanitize [[META2:![0-9]+]] -// CHECK-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP0]], -2147483648, !nosanitize [[META2]] -// CHECK-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP1]], -1, !nosanitize [[META2]] -// CHECK-NEXT: [[OR:%.*]] = or i1 [[TMP3]], [[TMP4]], !nosanitize [[META2]] -// CHECK-NEXT: [[TMP5:%.*]] = and i1 [[TMP2]], [[OR]], !nosanitize [[META2]] -// CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.allow.ubsan.check(i8 3), !nosanitize [[META2]] -// CHECK-NEXT: [[TMP7:%.*]] = xor i1 [[TMP6]], true, !nosanitize [[META2]] -// CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP5]], [[TMP7]], !nosanitize [[META2]] -// CHECK-NEXT: br i1 [[TMP8]], label [[CONT:%.*]], label [[HANDLER_DIVREM_OVERFLOW:%.*]], !prof [[PROF3:![0-9]+]], !nosanitize [[META2]] -// CHECK: handler.divrem_overflow: -// CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP0]] to i64, !nosanitize [[META2]] -// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP1]] to i64, !nosanitize [[META2]] -// CHECK-NEXT: call void @__ubsan_handle_divrem_overflow_abort(ptr @[[GLOB1:[0-9]+]], i64 [[TMP9]], i64 [[TMP10]]) #[[ATTR4:[0-9]+]], !nosanitize [[META2]] +// CHECK-LABEL: define dso_local noundef i32 @div( +// CHECK-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[Y]], 0, !nosanitize [[META2:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[X]], -2147483648, !nosanitize [[META2]] +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[Y]], -1, !nosanitize [[META2]] +// CHECK-NEXT: [[OR_NOT5:%.*]] = and i1 [[TMP1]], [[TMP2]] +// CHECK-NEXT: [[DOTNOT3:%.*]] = or i1 [[TMP0]], [[OR_NOT5]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 3), !nosanitize [[META2]] +// CHECK-NEXT: [[DOTNOT1:%.*]] = and i1 [[DOTNOT3]], [[TMP3]] +// CHECK-NEXT: br i1 [[DOTNOT1]], label %[[HANDLER_DIVREM_OVERFLOW:.*]], label %[[CONT:.*]], !prof [[PROF3:![0-9]+]], !nosanitize [[META2]] +// CHECK: [[HANDLER_DIVREM_OVERFLOW]]: +// CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[X]] to i64, !nosanitize [[META2]] +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[Y]] to i64, !nosanitize [[META2]] +// CHECK-NEXT: tail call void @__ubsan_handle_divrem_overflow_abort(ptr nonnull @[[GLOB1:[0-9]+]], i64 [[TMP4]], i64 [[TMP5]]) #[[ATTR4:[0-9]+]], !nosanitize [[META2]] // CHECK-NEXT: unreachable, !nosanitize [[META2]] -// CHECK: cont: -// CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP0]], [[TMP1]] +// CHECK: [[CONT]]: +// CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[X]], [[Y]] // CHECK-NEXT: ret i32 [[DIV]] // -// TR-LABEL: define dso_local i32 @div( -// TR-SAME: i32 noundef [[X:%.*]], 
i32 noundef [[Y:%.*]]) #[[ATTR0:[0-9]+]] { -// TR-NEXT: entry: -// TR-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4 -// TR-NEXT: [[Y_ADDR:%.*]] = alloca i32, align 4 -// TR-NEXT: store i32 [[X]], ptr [[X_ADDR]], align 4 -// TR-NEXT: store i32 [[Y]], ptr [[Y_ADDR]], align 4 -// TR-NEXT: [[TMP0:%.*]] = load i32, ptr [[X_ADDR]], align 4 -// TR-NEXT: [[TMP1:%.*]] = load i32, ptr [[Y_ADDR]], align 4 -// TR-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0, !nosanitize [[META2:![0-9]+]] -// TR-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP0]], -2147483648, !nosanitize [[META2]] -// TR-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP1]], -1, !nosanitize [[META2]] -// TR-NEXT: [[OR:%.*]] = or i1 [[TMP3]], [[TMP4]], !nosanitize [[META2]] -// TR-NEXT: [[TMP5:%.*]] = and i1 [[TMP2]], [[OR]], !nosanitize [[META2]] -// TR-NEXT: [[TMP6:%.*]] = call i1 @llvm.allow.ubsan.check(i8 3), !nosanitize [[META2]] -// TR-NEXT: [[TMP7:%.*]] = xor i1 [[TMP6]], true, !nosanitize [[META2]] -// TR-NEXT: [[TMP8:%.*]] = or i1 [[TMP5]], [[TMP7]], !nosanitize [[META2]] -// TR-NEXT: br i1 [[TMP8]], label [[CONT:%.*]], label [[TRAP:%.*]], !nosanitize [[META2]] -// TR: trap: -// TR-NEXT: call void @llvm.ubsantrap(i8 3) #[[ATTR4:[0-9]+]], !nosanitize [[META2]] +// TR-LABEL: define dso_local noundef i32 @div( +// TR-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// TR-NEXT: [[ENTRY:.*:]] +// TR-NEXT: [[TMP0:%.*]] = icmp eq i32 [[Y]], 0, !nosanitize [[META2:![0-9]+]] +// TR-NEXT: [[TMP1:%.*]] = icmp eq i32 [[X]], -2147483648, !nosanitize [[META2]] +// TR-NEXT: [[TMP2:%.*]] = icmp eq i32 [[Y]], -1, !nosanitize [[META2]] +// TR-NEXT: [[OR_NOT5:%.*]] = and i1 [[TMP1]], [[TMP2]] +// TR-NEXT: [[DOTNOT3:%.*]] = or i1 [[TMP0]], [[OR_NOT5]] +// TR-NEXT: [[TMP3:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 3), !nosanitize [[META2]] +// TR-NEXT: [[DOTNOT1:%.*]] = and i1 [[DOTNOT3]], [[TMP3]] +// TR-NEXT: br i1 [[DOTNOT1]], label %[[TRAP:.*]], label %[[CONT:.*]], !nosanitize [[META2]] +// TR: [[TRAP]]: +// TR-NEXT: tail call void @llvm.ubsantrap(i8 3) #[[ATTR4:[0-9]+]], !nosanitize [[META2]] // TR-NEXT: unreachable, !nosanitize [[META2]] -// TR: cont: -// TR-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP0]], [[TMP1]] +// TR: [[CONT]]: +// TR-NEXT: [[DIV:%.*]] = sdiv i32 [[X]], [[Y]] // TR-NEXT: ret i32 [[DIV]] // -// REC-LABEL: define dso_local i32 @div( -// REC-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR0:[0-9]+]] { -// REC-NEXT: entry: -// REC-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4 -// REC-NEXT: [[Y_ADDR:%.*]] = alloca i32, align 4 -// REC-NEXT: store i32 [[X]], ptr [[X_ADDR]], align 4 -// REC-NEXT: store i32 [[Y]], ptr [[Y_ADDR]], align 4 -// REC-NEXT: [[TMP0:%.*]] = load i32, ptr [[X_ADDR]], align 4 -// REC-NEXT: [[TMP1:%.*]] = load i32, ptr [[Y_ADDR]], align 4 -// REC-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0, !nosanitize [[META2:![0-9]+]] -// REC-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP0]], -2147483648, !nosanitize [[META2]] -// REC-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP1]], -1, !nosanitize [[META2]] -// REC-NEXT: [[OR:%.*]] = or i1 [[TMP3]], [[TMP4]], !nosanitize [[META2]] -// REC-NEXT: [[TMP5:%.*]] = and i1 [[TMP2]], [[OR]], !nosanitize [[META2]] -// REC-NEXT: [[TMP6:%.*]] = call i1 @llvm.allow.ubsan.check(i8 3), !nosanitize [[META2]] -// REC-NEXT: [[TMP7:%.*]] = xor i1 [[TMP6]], true, !nosanitize [[META2]] -// REC-NEXT: [[TMP8:%.*]] = or i1 [[TMP5]], [[TMP7]], !nosanitize [[META2]] -// REC-NEXT: br i1 [[TMP8]], label [[CONT:%.*]], label [[HANDLER_DIVREM_OVERFLOW:%.*]], !prof [[PROF3:![0-9]+]], 
!nosanitize [[META2]] -// REC: handler.divrem_overflow: -// REC-NEXT: [[TMP9:%.*]] = zext i32 [[TMP0]] to i64, !nosanitize [[META2]] -// REC-NEXT: [[TMP10:%.*]] = zext i32 [[TMP1]] to i64, !nosanitize [[META2]] -// REC-NEXT: call void @__ubsan_handle_divrem_overflow(ptr @[[GLOB1:[0-9]+]], i64 [[TMP9]], i64 [[TMP10]]) #[[ATTR4:[0-9]+]], !nosanitize [[META2]] -// REC-NEXT: br label [[CONT]], !nosanitize [[META2]] -// REC: cont: -// REC-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP0]], [[TMP1]] +// REC-LABEL: define dso_local noundef i32 @div( +// REC-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// REC-NEXT: [[ENTRY:.*:]] +// REC-NEXT: [[TMP0:%.*]] = icmp eq i32 [[Y]], 0, !nosanitize [[META2:![0-9]+]] +// REC-NEXT: [[TMP1:%.*]] = icmp eq i32 [[X]], -2147483648, !nosanitize [[META2]] +// REC-NEXT: [[TMP2:%.*]] = icmp eq i32 [[Y]], -1, !nosanitize [[META2]] +// REC-NEXT: [[OR_NOT5:%.*]] = and i1 [[TMP1]], [[TMP2]] +// REC-NEXT: [[DOTNOT3:%.*]] = or i1 [[TMP0]], [[OR_NOT5]] +// REC-NEXT: [[TMP3:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 3), !nosanitize [[META2]] +// REC-NEXT: [[DOTNOT1:%.*]] = and i1 [[DOTNOT3]], [[TMP3]] +// REC-NEXT: br i1 [[DOTNOT1]], label %[[HANDLER_DIVREM_OVERFLOW:.*]], label %[[CONT:.*]], !prof [[PROF3:![0-9]+]], !nosanitize [[META2]] +// REC: [[HANDLER_DIVREM_OVERFLOW]]: +// REC-NEXT: [[TMP4:%.*]] = zext i32 [[X]] to i64, !nosanitize [[META2]] +// REC-NEXT: [[TMP5:%.*]] = zext i32 [[Y]] to i64, !nosanitize [[META2]] +// REC-NEXT: tail call void @__ubsan_handle_divrem_overflow(ptr nonnull @[[GLOB1:[0-9]+]], i64 [[TMP4]], i64 [[TMP5]]) #[[ATTR4:[0-9]+]], !nosanitize [[META2]] +// REC-NEXT: br label %[[CONT]], !nosanitize [[META2]] +// REC: [[CONT]]: +// REC-NEXT: [[DIV:%.*]] = sdiv i32 [[X]], [[Y]] // REC-NEXT: ret i32 [[DIV]] // int div(int x, int y) { @@ -88,147 +67,121 @@ int div(int x, int y) { } // CHECK-LABEL: define dso_local i32 @null( -// CHECK-SAME: ptr noundef [[X:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: store ptr [[X]], ptr [[X_ADDR]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = icmp ne ptr [[TMP0]], null, !nosanitize [[META2]] -// CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.allow.ubsan.check(i8 22), !nosanitize [[META2]] -// CHECK-NEXT: [[TMP3:%.*]] = xor i1 [[TMP2]], true, !nosanitize [[META2]] -// CHECK-NEXT: [[TMP4:%.*]] = or i1 [[TMP1]], [[TMP3]], !nosanitize [[META2]] -// CHECK-NEXT: br i1 [[TMP4]], label [[CONT:%.*]], label [[HANDLER_TYPE_MISMATCH:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// CHECK: handler.type_mismatch: -// CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP0]] to i64, !nosanitize [[META2]] -// CHECK-NEXT: call void @__ubsan_handle_type_mismatch_v1_abort(ptr @[[GLOB2:[0-9]+]], i64 [[TMP5]]) #[[ATTR4]], !nosanitize [[META2]] +// CHECK-SAME: ptr noundef readonly [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp eq ptr [[X]], null, !nosanitize [[META2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 22), !nosanitize [[META2]] +// CHECK-NEXT: [[DOTNOT1:%.*]] = and i1 [[TMP0]], [[TMP1]] +// CHECK-NEXT: br i1 [[DOTNOT1]], label %[[HANDLER_TYPE_MISMATCH:.*]], label %[[CONT:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// CHECK: [[HANDLER_TYPE_MISMATCH]]: +// CHECK-NEXT: tail call void @__ubsan_handle_type_mismatch_v1_abort(ptr nonnull @[[GLOB2:[0-9]+]], i64 0) #[[ATTR4]], !nosanitize [[META2]] // 
CHECK-NEXT: unreachable, !nosanitize [[META2]] -// CHECK: cont: -// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP0]], align 4 -// CHECK-NEXT: ret i32 [[TMP6]] +// CHECK: [[CONT]]: +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA4:![0-9]+]] +// CHECK-NEXT: ret i32 [[TMP2]] // // TR-LABEL: define dso_local i32 @null( -// TR-SAME: ptr noundef [[X:%.*]]) #[[ATTR0]] { -// TR-NEXT: entry: -// TR-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8 -// TR-NEXT: store ptr [[X]], ptr [[X_ADDR]], align 8 -// TR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR]], align 8 -// TR-NEXT: [[TMP1:%.*]] = icmp ne ptr [[TMP0]], null, !nosanitize [[META2]] -// TR-NEXT: [[TMP2:%.*]] = call i1 @llvm.allow.ubsan.check(i8 22), !nosanitize [[META2]] -// TR-NEXT: [[TMP3:%.*]] = xor i1 [[TMP2]], true, !nosanitize [[META2]] -// TR-NEXT: [[TMP4:%.*]] = or i1 [[TMP1]], [[TMP3]], !nosanitize [[META2]] -// TR-NEXT: br i1 [[TMP4]], label [[CONT:%.*]], label [[TRAP:%.*]], !nosanitize [[META2]] -// TR: trap: -// TR-NEXT: call void @llvm.ubsantrap(i8 22) #[[ATTR4]], !nosanitize [[META2]] +// TR-SAME: ptr noundef readonly [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { +// TR-NEXT: [[ENTRY:.*:]] +// TR-NEXT: [[TMP0:%.*]] = icmp eq ptr [[X]], null, !nosanitize [[META2]] +// TR-NEXT: [[TMP1:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 22), !nosanitize [[META2]] +// TR-NEXT: [[DOTNOT1:%.*]] = and i1 [[TMP0]], [[TMP1]] +// TR-NEXT: br i1 [[DOTNOT1]], label %[[TRAP:.*]], label %[[CONT:.*]], !nosanitize [[META2]] +// TR: [[TRAP]]: +// TR-NEXT: tail call void @llvm.ubsantrap(i8 22) #[[ATTR4]], !nosanitize [[META2]] // TR-NEXT: unreachable, !nosanitize [[META2]] -// TR: cont: -// TR-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4 -// TR-NEXT: ret i32 [[TMP5]] +// TR: [[CONT]]: +// TR-NEXT: [[TMP2:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA3:![0-9]+]] +// TR-NEXT: ret i32 [[TMP2]] // // REC-LABEL: define dso_local i32 @null( -// REC-SAME: ptr noundef [[X:%.*]]) #[[ATTR0]] { -// REC-NEXT: entry: -// REC-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8 -// REC-NEXT: store ptr [[X]], ptr [[X_ADDR]], align 8 -// REC-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR]], align 8 -// REC-NEXT: [[TMP1:%.*]] = icmp ne ptr [[TMP0]], null, !nosanitize [[META2]] -// REC-NEXT: [[TMP2:%.*]] = call i1 @llvm.allow.ubsan.check(i8 22), !nosanitize [[META2]] -// REC-NEXT: [[TMP3:%.*]] = xor i1 [[TMP2]], true, !nosanitize [[META2]] -// REC-NEXT: [[TMP4:%.*]] = or i1 [[TMP1]], [[TMP3]], !nosanitize [[META2]] -// REC-NEXT: br i1 [[TMP4]], label [[CONT:%.*]], label [[HANDLER_TYPE_MISMATCH:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// REC: handler.type_mismatch: -// REC-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP0]] to i64, !nosanitize [[META2]] -// REC-NEXT: call void @__ubsan_handle_type_mismatch_v1(ptr @[[GLOB2:[0-9]+]], i64 [[TMP5]]) #[[ATTR4]], !nosanitize [[META2]] -// REC-NEXT: br label [[CONT]], !nosanitize [[META2]] -// REC: cont: -// REC-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP0]], align 4 -// REC-NEXT: ret i32 [[TMP6]] +// REC-SAME: ptr noundef readonly [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { +// REC-NEXT: [[ENTRY:.*:]] +// REC-NEXT: [[TMP0:%.*]] = icmp eq ptr [[X]], null, !nosanitize [[META2]] +// REC-NEXT: [[TMP1:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 22), !nosanitize [[META2]] +// REC-NEXT: [[DOTNOT1:%.*]] = and i1 [[TMP0]], [[TMP1]] +// REC-NEXT: br i1 [[DOTNOT1]], label %[[HANDLER_TYPE_MISMATCH:.*]], label %[[CONT:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// REC: [[HANDLER_TYPE_MISMATCH]]: +// REC-NEXT: tail call void 
@__ubsan_handle_type_mismatch_v1(ptr nonnull @[[GLOB2:[0-9]+]], i64 0) #[[ATTR4]], !nosanitize [[META2]] +// REC-NEXT: br label %[[CONT]], !nosanitize [[META2]] +// REC: [[CONT]]: +// REC-NEXT: [[TMP2:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA4:![0-9]+]] +// REC-NEXT: ret i32 [[TMP2]] // int null(int* x) { return *x; } -// CHECK-LABEL: define dso_local i32 @overflow( -// CHECK-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[Y_ADDR:%.*]] = alloca i32, align 4 -// CHECK-NEXT: store i32 [[X]], ptr [[X_ADDR]], align 4 -// CHECK-NEXT: store i32 [[Y]], ptr [[Y_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[X_ADDR]], align 4 -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[Y_ADDR]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[TMP0]], i32 [[TMP1]]), !nosanitize [[META2]] -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i1 } [[TMP2]], 0, !nosanitize [[META2]] -// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1, !nosanitize [[META2]] -// CHECK-NEXT: [[TMP5:%.*]] = xor i1 [[TMP4]], true, !nosanitize [[META2]] -// CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.allow.ubsan.check(i8 0), !nosanitize [[META2]] -// CHECK-NEXT: [[TMP7:%.*]] = xor i1 [[TMP6]], true, !nosanitize [[META2]] -// CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP5]], [[TMP7]], !nosanitize [[META2]] -// CHECK-NEXT: br i1 [[TMP8]], label [[CONT:%.*]], label [[HANDLER_ADD_OVERFLOW:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// CHECK: handler.add_overflow: -// CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP0]] to i64, !nosanitize [[META2]] -// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP1]] to i64, !nosanitize [[META2]] -// CHECK-NEXT: call void @__ubsan_handle_add_overflow_abort(ptr @[[GLOB3:[0-9]+]], i64 [[TMP9]], i64 [[TMP10]]) #[[ATTR4]], !nosanitize [[META2]] +// CHECK-LABEL: define dso_local noundef i32 @overflow( +// CHECK-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[X]], i32 [[Y]]), !nosanitize [[META2]] +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i1 } [[TMP0]], 1, !nosanitize [[META2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 0), !nosanitize [[META2]] +// CHECK-NEXT: [[DOTDEMORGAN:%.*]] = and i1 [[TMP1]], [[TMP2]] +// CHECK-NEXT: br i1 [[DOTDEMORGAN]], label %[[HANDLER_ADD_OVERFLOW:.*]], label %[[CONT:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// CHECK: [[HANDLER_ADD_OVERFLOW]]: +// CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[X]] to i64, !nosanitize [[META2]] +// CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[Y]] to i64, !nosanitize [[META2]] +// CHECK-NEXT: tail call void @__ubsan_handle_add_overflow_abort(ptr nonnull @[[GLOB3:[0-9]+]], i64 [[TMP3]], i64 [[TMP4]]) #[[ATTR4]], !nosanitize [[META2]] // CHECK-NEXT: unreachable, !nosanitize [[META2]] -// CHECK: cont: -// CHECK-NEXT: ret i32 [[TMP3]] +// CHECK: [[CONT]]: +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP0]], 0, !nosanitize [[META2]] +// CHECK-NEXT: ret i32 [[TMP5]] // -// TR-LABEL: define dso_local i32 @overflow( -// TR-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR0]] { -// TR-NEXT: entry: -// TR-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4 -// TR-NEXT: [[Y_ADDR:%.*]] = alloca i32, align 4 -// TR-NEXT: store i32 [[X]], ptr [[X_ADDR]], align 4 -// TR-NEXT: store i32 [[Y]], ptr [[Y_ADDR]], 
align 4 -// TR-NEXT: [[TMP0:%.*]] = load i32, ptr [[X_ADDR]], align 4 -// TR-NEXT: [[TMP1:%.*]] = load i32, ptr [[Y_ADDR]], align 4 -// TR-NEXT: [[TMP2:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[TMP0]], i32 [[TMP1]]), !nosanitize [[META2]] -// TR-NEXT: [[TMP3:%.*]] = extractvalue { i32, i1 } [[TMP2]], 0, !nosanitize [[META2]] -// TR-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1, !nosanitize [[META2]] -// TR-NEXT: [[TMP5:%.*]] = xor i1 [[TMP4]], true, !nosanitize [[META2]] -// TR-NEXT: [[TMP6:%.*]] = call i1 @llvm.allow.ubsan.check(i8 0), !nosanitize [[META2]] -// TR-NEXT: [[TMP7:%.*]] = xor i1 [[TMP6]], true, !nosanitize [[META2]] -// TR-NEXT: [[TMP8:%.*]] = or i1 [[TMP5]], [[TMP7]], !nosanitize [[META2]] -// TR-NEXT: br i1 [[TMP8]], label [[CONT:%.*]], label [[TRAP:%.*]], !nosanitize [[META2]] -// TR: trap: -// TR-NEXT: call void @llvm.ubsantrap(i8 0) #[[ATTR4]], !nosanitize [[META2]] +// TR-LABEL: define dso_local noundef i32 @overflow( +// TR-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// TR-NEXT: [[ENTRY:.*:]] +// TR-NEXT: [[TMP0:%.*]] = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[X]], i32 [[Y]]), !nosanitize [[META2]] +// TR-NEXT: [[TMP1:%.*]] = extractvalue { i32, i1 } [[TMP0]], 1, !nosanitize [[META2]] +// TR-NEXT: [[TMP2:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 0), !nosanitize [[META2]] +// TR-NEXT: [[DOTDEMORGAN:%.*]] = and i1 [[TMP1]], [[TMP2]] +// TR-NEXT: br i1 [[DOTDEMORGAN]], label %[[TRAP:.*]], label %[[CONT:.*]], !nosanitize [[META2]] +// TR: [[TRAP]]: +// TR-NEXT: tail call void @llvm.ubsantrap(i8 0) #[[ATTR4]], !nosanitize [[META2]] // TR-NEXT: unreachable, !nosanitize [[META2]] -// TR: cont: +// TR: [[CONT]]: +// TR-NEXT: [[TMP3:%.*]] = extractvalue { i32, i1 } [[TMP0]], 0, !nosanitize [[META2]] // TR-NEXT: ret i32 [[TMP3]] // -// REC-LABEL: define dso_local i32 @overflow( -// REC-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR0]] { -// REC-NEXT: entry: -// REC-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4 -// REC-NEXT: [[Y_ADDR:%.*]] = alloca i32, align 4 -// REC-NEXT: store i32 [[X]], ptr [[X_ADDR]], align 4 -// REC-NEXT: store i32 [[Y]], ptr [[Y_ADDR]], align 4 -// REC-NEXT: [[TMP0:%.*]] = load i32, ptr [[X_ADDR]], align 4 -// REC-NEXT: [[TMP1:%.*]] = load i32, ptr [[Y_ADDR]], align 4 -// REC-NEXT: [[TMP2:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[TMP0]], i32 [[TMP1]]), !nosanitize [[META2]] -// REC-NEXT: [[TMP3:%.*]] = extractvalue { i32, i1 } [[TMP2]], 0, !nosanitize [[META2]] -// REC-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1, !nosanitize [[META2]] -// REC-NEXT: [[TMP5:%.*]] = xor i1 [[TMP4]], true, !nosanitize [[META2]] -// REC-NEXT: [[TMP6:%.*]] = call i1 @llvm.allow.ubsan.check(i8 0), !nosanitize [[META2]] -// REC-NEXT: [[TMP7:%.*]] = xor i1 [[TMP6]], true, !nosanitize [[META2]] -// REC-NEXT: [[TMP8:%.*]] = or i1 [[TMP5]], [[TMP7]], !nosanitize [[META2]] -// REC-NEXT: br i1 [[TMP8]], label [[CONT:%.*]], label [[HANDLER_ADD_OVERFLOW:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// REC: handler.add_overflow: -// REC-NEXT: [[TMP9:%.*]] = zext i32 [[TMP0]] to i64, !nosanitize [[META2]] -// REC-NEXT: [[TMP10:%.*]] = zext i32 [[TMP1]] to i64, !nosanitize [[META2]] -// REC-NEXT: call void @__ubsan_handle_add_overflow(ptr @[[GLOB3:[0-9]+]], i64 [[TMP9]], i64 [[TMP10]]) #[[ATTR4]], !nosanitize [[META2]] -// REC-NEXT: br label [[CONT]], !nosanitize [[META2]] -// REC: cont: -// REC-NEXT: ret i32 [[TMP3]] +// REC-LABEL: define dso_local noundef 
i32 @overflow( +// REC-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// REC-NEXT: [[ENTRY:.*:]] +// REC-NEXT: [[TMP0:%.*]] = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[X]], i32 [[Y]]), !nosanitize [[META2]] +// REC-NEXT: [[TMP1:%.*]] = extractvalue { i32, i1 } [[TMP0]], 1, !nosanitize [[META2]] +// REC-NEXT: [[TMP2:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 0), !nosanitize [[META2]] +// REC-NEXT: [[DOTDEMORGAN:%.*]] = and i1 [[TMP1]], [[TMP2]] +// REC-NEXT: br i1 [[DOTDEMORGAN]], label %[[HANDLER_ADD_OVERFLOW:.*]], label %[[CONT:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// REC: [[HANDLER_ADD_OVERFLOW]]: +// REC-NEXT: [[TMP3:%.*]] = zext i32 [[X]] to i64, !nosanitize [[META2]] +// REC-NEXT: [[TMP4:%.*]] = zext i32 [[Y]] to i64, !nosanitize [[META2]] +// REC-NEXT: tail call void @__ubsan_handle_add_overflow(ptr nonnull @[[GLOB3:[0-9]+]], i64 [[TMP3]], i64 [[TMP4]]) #[[ATTR4]], !nosanitize [[META2]] +// REC-NEXT: br label %[[CONT]], !nosanitize [[META2]] +// REC: [[CONT]]: +// REC-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP0]], 0, !nosanitize [[META2]] +// REC-NEXT: ret i32 [[TMP5]] // int overflow(int x, int y) { return x + y; } //. // CHECK: [[META2]] = !{} -// CHECK: [[PROF3]] = !{!"branch_weights", i32 1048575, i32 1} +// CHECK: [[PROF3]] = !{!"branch_weights", i32 1, i32 1048575} +// CHECK: [[TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +// CHECK: [[META5]] = !{!"int", [[META6:![0-9]+]], i64 0} +// CHECK: [[META6]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0} +// CHECK: [[META7]] = !{!"Simple C/C++ TBAA"} //. // TR: [[META2]] = !{} +// TR: [[TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} +// TR: [[META4]] = !{!"int", [[META5:![0-9]+]], i64 0} +// TR: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} +// TR: [[META6]] = !{!"Simple C/C++ TBAA"} //. // REC: [[META2]] = !{} -// REC: [[PROF3]] = !{!"branch_weights", i32 1048575, i32 1} +// REC: [[PROF3]] = !{!"branch_weights", i32 1, i32 1048575} +// REC: [[TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +// REC: [[META5]] = !{!"int", [[META6:![0-9]+]], i64 0} +// REC: [[META6]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0} +// REC: [[META7]] = !{!"Simple C/C++ TBAA"} //. 
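All of the regenerated checks above pin down one and the same guarded-check shape. A minimal sketch distilled from those CHECK lines (the value names are illustrative, not part of any test):

  %bad      = icmp eq i32 %y, 0                           ; the UB condition
  %allow    = tail call i1 @llvm.allow.ubsan.check(i8 3)  ; runtime opt-in guard
  %do.check = and i1 %bad, %allow
  br i1 %do.check, label %handler, label %cont, !prof !3

with !3 = !{!"branch_weights", i32 1, i32 1048575}, i.e. the handler edge is treated as cold. This is also why the weights above flipped relative to the old form, which negated the condition and branched to %cont first.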
From 515946b2907e6d909ef133902c8c9395056ebd2d Mon Sep 17 00:00:00 2001 From: Kevin McAfee Date: Fri, 10 Jan 2025 20:06:04 -0500 Subject: [PATCH 127/408] [NFC][NVPTX] Small style cleanup for NVPTXISelDAGToDAG.* (#122538) --- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 20 +++++++++++--------- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 4 ++-- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 1341f8a8fca1f..39f5571692058 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -2479,14 +2479,15 @@ bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) { } // symbol+offset -bool NVPTXDAGToDAGISel::SelectADDRsi_imp( - SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) { +bool NVPTXDAGToDAGISel::SelectADDRsi_imp(SDNode *OpNode, SDValue Addr, + SDValue &Base, SDValue &Offset, + MVT VT) { if (isAddLike(Addr)) { if (ConstantSDNode *CN = dyn_cast(Addr.getOperand(1))) { SDValue base = Addr.getOperand(0); if (SelectDirectAddr(base, Base)) { - Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode), - mvt); + Offset = + CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode), VT); return true; } } @@ -2507,11 +2508,12 @@ bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr, } // register+offset -bool NVPTXDAGToDAGISel::SelectADDRri_imp( - SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) { +bool NVPTXDAGToDAGISel::SelectADDRri_imp(SDNode *OpNode, SDValue Addr, + SDValue &Base, SDValue &Offset, + MVT VT) { if (FrameIndexSDNode *FIN = dyn_cast(Addr)) { - Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt); - Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), mvt); + Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), VT); + Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), VT); return true; } if (Addr.getOpcode() == ISD::TargetExternalSymbol || @@ -2526,7 +2528,7 @@ bool NVPTXDAGToDAGISel::SelectADDRri_imp( if (FrameIndexSDNode *FIN = dyn_cast(Addr.getOperand(0))) // Constant offset from frame ref. - Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt); + Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), VT); else Base = Addr.getOperand(0); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index 4b67a370c3fe0..8cadde8a82264 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -106,13 +106,13 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { bool SelectDirectAddr(SDValue N, SDValue &Address); bool SelectADDRri_imp(SDNode *OpNode, SDValue Addr, SDValue &Base, - SDValue &Offset, MVT mvt); + SDValue &Offset, MVT VT); bool SelectADDRri(SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset); bool SelectADDRri64(SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset); bool SelectADDRsi_imp(SDNode *OpNode, SDValue Addr, SDValue &Base, - SDValue &Offset, MVT mvt); + SDValue &Offset, MVT VT); bool SelectADDRsi(SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset); bool SelectADDRsi64(SDNode *OpNode, SDValue Addr, SDValue &Base, From 0cb1884989bca72895b2a1cd555955bfc33ac520 Mon Sep 17 00:00:00 2001 From: Ellis Hoag Date: Fri, 10 Jan 2025 17:08:22 -0800 Subject: [PATCH 128/408] [InstrProf] Fix format issue in user manual (#122559) Fix a small formatting issue in the user manual after #122385. 
https://clang.llvm.org/docs/UsersManual.html#cmdoption-ftemporal-profile
---
 clang/docs/UsersManual.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst
index f58fca465c549..4de288250f3ad 100644
--- a/clang/docs/UsersManual.rst
+++ b/clang/docs/UsersManual.rst
@@ -3051,7 +3051,7 @@ indexed format, regardeless whether it is produced by frontend or the IR pass.
   $ llvm-profdata merge -o code.profdata yyy/zzz

 Using the resulting profile, we can generate a function order to pass to the
-   linker via `--symbol-ordering-file` for ELF or `-order_file` for Mach-O.
+   linker via ``--symbol-ordering-file`` for ELF or ``-order_file`` for Mach-O.

 .. code-block:: console

From 58508ee1974d2aff52504bb1bd2271ac7b1763c0 Mon Sep 17 00:00:00 2001
From: Heejin Ahn
Date: Fri, 10 Jan 2025 17:33:27 -0800
Subject: [PATCH 129/408] [WebAssembly] Add error checking for -wasm-use-legacy-eh (#122526)

This adds checks for `-wasm-use-legacy-eh`. While this option is true by
default in the backend, it is not supposed to be passed to Clang when Wasm EH
is not used.
---
 clang/lib/Driver/ToolChains/WebAssembly.cpp |  3 ++-
 clang/test/Driver/wasm-toolchain.c          | 22 +++++++++++++++++++++
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Driver/ToolChains/WebAssembly.cpp b/clang/lib/Driver/ToolChains/WebAssembly.cpp
index e338b0d2398e0..10f9a4f338f8f 100644
--- a/clang/lib/Driver/ToolChains/WebAssembly.cpp
+++ b/clang/lib/Driver/ToolChains/WebAssembly.cpp
@@ -422,7 +422,8 @@ void WebAssembly::addClangTargetOptions(const ArgList &DriverArgs,
     }
   }

-  for (const auto *Option : {"-wasm-enable-eh", "-wasm-enable-sjlj"}) {
+  for (const auto *Option :
+       {"-wasm-enable-eh", "-wasm-enable-sjlj", "-wasm-use-legacy-eh"}) {
     if (Opt.starts_with(Option)) {
       BanIncompatibleOptionsForWasmEHSjLj(Option);
       EnableFeaturesForWasmEHSjLj();
diff --git a/clang/test/Driver/wasm-toolchain.c b/clang/test/Driver/wasm-toolchain.c
index 84c1b4f6efe66..2d14052082776 100644
--- a/clang/test/Driver/wasm-toolchain.c
+++ b/clang/test/Driver/wasm-toolchain.c
@@ -202,6 +202,28 @@
 // RUN:   | FileCheck -check-prefix=WASM_SJLJ_NO_REFERENCE_TYPES %s
 // WASM_SJLJ_NO_REFERENCE_TYPES: invalid argument '-wasm-enable-sjlj' not allowed with '-mno-reference-types'

+// '-mllvm -wasm-use-legacy-eh' not allowed with
+// '-mllvm -enable-emscripten-cxx-exceptions'
+// RUN: not %clang -### --target=wasm32-unknown-unknown \
+// RUN:     --sysroot=/foo %s -mllvm -wasm-use-legacy-eh \
+// RUN:     -mllvm -enable-emscripten-cxx-exceptions 2>&1 \
+// RUN:   | FileCheck -check-prefix=WASM_LEGACY_EH_EMSCRIPTEN_EH %s
+// WASM_LEGACY_EH_EMSCRIPTEN_EH: invalid argument '-wasm-use-legacy-eh' not allowed with '-enable-emscripten-cxx-exceptions'
+
+// '-mllvm -wasm-use-legacy-eh' not allowed with '-mllvm -enable-emscripten-sjlj'
+// RUN: not %clang -### --target=wasm32-unknown-unknown \
+// RUN:     --sysroot=/foo %s -mllvm -wasm-use-legacy-eh \
+// RUN:     -mllvm -enable-emscripten-sjlj 2>&1 \
+// RUN:   | FileCheck -check-prefix=WASM_LEGACY_EH_EMSCRIPTEN_SJLJ %s
+// WASM_LEGACY_EH_EMSCRIPTEN_SJLJ: invalid argument '-wasm-use-legacy-eh' not allowed with '-enable-emscripten-sjlj'
+
+// '-mllvm -wasm-use-legacy-eh' not allowed with '-mno-exception-handling'
+// RUN: not %clang -### --target=wasm32-unknown-unknown \
+// RUN:     --sysroot=/foo %s -mllvm -wasm-use-legacy-eh \
+// RUN:     -mno-exception-handling 2>&1 \
+// RUN:   | FileCheck -check-prefix=WASM_LEGACY_EH_NO_EH %s
+// WASM_LEGACY_EH_NO_EH: invalid argument
'-wasm-use-legacy-eh' not allowed with '-mno-exception-handling'
+
 // RUN: %clang -### %s -fsanitize=address --target=wasm32-unknown-emscripten 2>&1 | FileCheck -check-prefix=CHECK-ASAN-EMSCRIPTEN %s
 // CHECK-ASAN-EMSCRIPTEN: "-fsanitize=address"
 // CHECK-ASAN-EMSCRIPTEN: "-fsanitize-address-globals-dead-stripping"

From 74d5373f49fab0943a45a9214dc7a134f9f112f1 Mon Sep 17 00:00:00 2001
From: Joseph Huber
Date: Fri, 10 Jan 2025 19:35:10 -0600
Subject: [PATCH 130/408] [OpenMP] Fix missing type getter for SFINAE helper

Summary:
This didn't get the type, which made using this always return false.
---
 offload/DeviceRTL/include/DeviceUtils.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/offload/DeviceRTL/include/DeviceUtils.h b/offload/DeviceRTL/include/DeviceUtils.h
index fa66b973a4f5e..2243673aef61c 100644
--- a/offload/DeviceRTL/include/DeviceUtils.h
+++ b/offload/DeviceRTL/include/DeviceUtils.h
@@ -44,7 +44,7 @@ inline constexpr bool is_same_v = is_same<T, U>::value;

 template <typename T> struct is_floating_point {
   inline static constexpr bool value =
-      is_same_v<remove_cv<T>, float> || is_same_v<remove_cv<T>, double>;
+      is_same_v<remove_cv_t<T>, float> || is_same_v<remove_cv_t<T>, double>;
 };
 template <typename T>
 inline constexpr bool is_floating_point_v = is_floating_point<T>::value;

From 99d0780f050c830c046c6f8790821880ab7c71f5 Mon Sep 17 00:00:00 2001
From: Vitaly Buka
Date: Fri, 10 Jan 2025 17:47:35 -0800
Subject: [PATCH 131/408] [nfc][ubsan] Add local-bounds test (#122415)

Show that @llvm.allow.ubsan.check is not used yet.
---
 clang/test/CodeGen/allow-ubsan-check.c | 90 ++++++++++++++++++++++----
 1 file changed, 78 insertions(+), 12 deletions(-)

diff --git a/clang/test/CodeGen/allow-ubsan-check.c b/clang/test/CodeGen/allow-ubsan-check.c
index 3981efe10a5bc..a994e700115ff 100644
--- a/clang/test/CodeGen/allow-ubsan-check.c
+++ b/clang/test/CodeGen/allow-ubsan-check.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
-// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -emit-llvm -O1 -o - %s -fsanitize=signed-integer-overflow,integer-divide-by-zero,null -mllvm -ubsan-guard-checks | FileCheck %s
-// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -emit-llvm -O1 -o - %s -fsanitize=signed-integer-overflow,integer-divide-by-zero,null -mllvm -ubsan-guard-checks -fsanitize-trap=signed-integer-overflow,integer-divide-by-zero,null | FileCheck %s --check-prefixes=TR
-// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -emit-llvm -O1 -o - %s -fsanitize=signed-integer-overflow,integer-divide-by-zero,null -mllvm -ubsan-guard-checks -fsanitize-recover=signed-integer-overflow,integer-divide-by-zero,null | FileCheck %s --check-prefixes=REC
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -emit-llvm -O1 -o - %s -fsanitize=signed-integer-overflow,integer-divide-by-zero,null,local-bounds -mllvm -ubsan-guard-checks | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -emit-llvm -O1 -o - %s -fsanitize=signed-integer-overflow,integer-divide-by-zero,null,local-bounds -mllvm -ubsan-guard-checks -fsanitize-trap=signed-integer-overflow,integer-divide-by-zero,null,local-bounds | FileCheck %s --check-prefixes=TR
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -emit-llvm -O1 -o - %s -fsanitize=signed-integer-overflow,integer-divide-by-zero,null,local-bounds -mllvm -ubsan-guard-checks -fsanitize-recover=signed-integer-overflow,integer-divide-by-zero,null,local-bounds | FileCheck %s --check-prefixes=REC


 // CHECK-LABEL: define dso_local noundef i32 @div(
@@ -18,7 +18,7 @@
 // CHECK: [[HANDLER_DIVREM_OVERFLOW]]:
 // CHECK-NEXT:
[[TMP4:%.*]] = zext i32 [[X]] to i64, !nosanitize [[META2]] // CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[Y]] to i64, !nosanitize [[META2]] -// CHECK-NEXT: tail call void @__ubsan_handle_divrem_overflow_abort(ptr nonnull @[[GLOB1:[0-9]+]], i64 [[TMP4]], i64 [[TMP5]]) #[[ATTR4:[0-9]+]], !nosanitize [[META2]] +// CHECK-NEXT: tail call void @__ubsan_handle_divrem_overflow_abort(ptr nonnull @[[GLOB1:[0-9]+]], i64 [[TMP4]], i64 [[TMP5]]) #[[ATTR6:[0-9]+]], !nosanitize [[META2]] // CHECK-NEXT: unreachable, !nosanitize [[META2]] // CHECK: [[CONT]]: // CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[X]], [[Y]] @@ -36,7 +36,7 @@ // TR-NEXT: [[DOTNOT1:%.*]] = and i1 [[DOTNOT3]], [[TMP3]] // TR-NEXT: br i1 [[DOTNOT1]], label %[[TRAP:.*]], label %[[CONT:.*]], !nosanitize [[META2]] // TR: [[TRAP]]: -// TR-NEXT: tail call void @llvm.ubsantrap(i8 3) #[[ATTR4:[0-9]+]], !nosanitize [[META2]] +// TR-NEXT: tail call void @llvm.ubsantrap(i8 3) #[[ATTR5:[0-9]+]], !nosanitize [[META2]] // TR-NEXT: unreachable, !nosanitize [[META2]] // TR: [[CONT]]: // TR-NEXT: [[DIV:%.*]] = sdiv i32 [[X]], [[Y]] @@ -56,7 +56,7 @@ // REC: [[HANDLER_DIVREM_OVERFLOW]]: // REC-NEXT: [[TMP4:%.*]] = zext i32 [[X]] to i64, !nosanitize [[META2]] // REC-NEXT: [[TMP5:%.*]] = zext i32 [[Y]] to i64, !nosanitize [[META2]] -// REC-NEXT: tail call void @__ubsan_handle_divrem_overflow(ptr nonnull @[[GLOB1:[0-9]+]], i64 [[TMP4]], i64 [[TMP5]]) #[[ATTR4:[0-9]+]], !nosanitize [[META2]] +// REC-NEXT: tail call void @__ubsan_handle_divrem_overflow(ptr nonnull @[[GLOB1:[0-9]+]], i64 [[TMP4]], i64 [[TMP5]]) #[[ATTR6:[0-9]+]], !nosanitize [[META2]] // REC-NEXT: br label %[[CONT]], !nosanitize [[META2]] // REC: [[CONT]]: // REC-NEXT: [[DIV:%.*]] = sdiv i32 [[X]], [[Y]] @@ -74,7 +74,7 @@ int div(int x, int y) { // CHECK-NEXT: [[DOTNOT1:%.*]] = and i1 [[TMP0]], [[TMP1]] // CHECK-NEXT: br i1 [[DOTNOT1]], label %[[HANDLER_TYPE_MISMATCH:.*]], label %[[CONT:.*]], !prof [[PROF3]], !nosanitize [[META2]] // CHECK: [[HANDLER_TYPE_MISMATCH]]: -// CHECK-NEXT: tail call void @__ubsan_handle_type_mismatch_v1_abort(ptr nonnull @[[GLOB2:[0-9]+]], i64 0) #[[ATTR4]], !nosanitize [[META2]] +// CHECK-NEXT: tail call void @__ubsan_handle_type_mismatch_v1_abort(ptr nonnull @[[GLOB2:[0-9]+]], i64 0) #[[ATTR6]], !nosanitize [[META2]] // CHECK-NEXT: unreachable, !nosanitize [[META2]] // CHECK: [[CONT]]: // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA4:![0-9]+]] @@ -88,7 +88,7 @@ int div(int x, int y) { // TR-NEXT: [[DOTNOT1:%.*]] = and i1 [[TMP0]], [[TMP1]] // TR-NEXT: br i1 [[DOTNOT1]], label %[[TRAP:.*]], label %[[CONT:.*]], !nosanitize [[META2]] // TR: [[TRAP]]: -// TR-NEXT: tail call void @llvm.ubsantrap(i8 22) #[[ATTR4]], !nosanitize [[META2]] +// TR-NEXT: tail call void @llvm.ubsantrap(i8 22) #[[ATTR5]], !nosanitize [[META2]] // TR-NEXT: unreachable, !nosanitize [[META2]] // TR: [[CONT]]: // TR-NEXT: [[TMP2:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA3:![0-9]+]] @@ -102,7 +102,7 @@ int div(int x, int y) { // REC-NEXT: [[DOTNOT1:%.*]] = and i1 [[TMP0]], [[TMP1]] // REC-NEXT: br i1 [[DOTNOT1]], label %[[HANDLER_TYPE_MISMATCH:.*]], label %[[CONT:.*]], !prof [[PROF3]], !nosanitize [[META2]] // REC: [[HANDLER_TYPE_MISMATCH]]: -// REC-NEXT: tail call void @__ubsan_handle_type_mismatch_v1(ptr nonnull @[[GLOB2:[0-9]+]], i64 0) #[[ATTR4]], !nosanitize [[META2]] +// REC-NEXT: tail call void @__ubsan_handle_type_mismatch_v1(ptr nonnull @[[GLOB2:[0-9]+]], i64 0) #[[ATTR6]], !nosanitize [[META2]] // REC-NEXT: br label %[[CONT]], !nosanitize [[META2]] // REC: [[CONT]]: 
// REC-NEXT: [[TMP2:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA4:![0-9]+]] @@ -123,7 +123,7 @@ int null(int* x) { // CHECK: [[HANDLER_ADD_OVERFLOW]]: // CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[X]] to i64, !nosanitize [[META2]] // CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[Y]] to i64, !nosanitize [[META2]] -// CHECK-NEXT: tail call void @__ubsan_handle_add_overflow_abort(ptr nonnull @[[GLOB3:[0-9]+]], i64 [[TMP3]], i64 [[TMP4]]) #[[ATTR4]], !nosanitize [[META2]] +// CHECK-NEXT: tail call void @__ubsan_handle_add_overflow_abort(ptr nonnull @[[GLOB3:[0-9]+]], i64 [[TMP3]], i64 [[TMP4]]) #[[ATTR6]], !nosanitize [[META2]] // CHECK-NEXT: unreachable, !nosanitize [[META2]] // CHECK: [[CONT]]: // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP0]], 0, !nosanitize [[META2]] @@ -138,7 +138,7 @@ int null(int* x) { // TR-NEXT: [[DOTDEMORGAN:%.*]] = and i1 [[TMP1]], [[TMP2]] // TR-NEXT: br i1 [[DOTDEMORGAN]], label %[[TRAP:.*]], label %[[CONT:.*]], !nosanitize [[META2]] // TR: [[TRAP]]: -// TR-NEXT: tail call void @llvm.ubsantrap(i8 0) #[[ATTR4]], !nosanitize [[META2]] +// TR-NEXT: tail call void @llvm.ubsantrap(i8 0) #[[ATTR5]], !nosanitize [[META2]] // TR-NEXT: unreachable, !nosanitize [[META2]] // TR: [[CONT]]: // TR-NEXT: [[TMP3:%.*]] = extractvalue { i32, i1 } [[TMP0]], 0, !nosanitize [[META2]] @@ -155,7 +155,7 @@ int null(int* x) { // REC: [[HANDLER_ADD_OVERFLOW]]: // REC-NEXT: [[TMP3:%.*]] = zext i32 [[X]] to i64, !nosanitize [[META2]] // REC-NEXT: [[TMP4:%.*]] = zext i32 [[Y]] to i64, !nosanitize [[META2]] -// REC-NEXT: tail call void @__ubsan_handle_add_overflow(ptr nonnull @[[GLOB3:[0-9]+]], i64 [[TMP3]], i64 [[TMP4]]) #[[ATTR4]], !nosanitize [[META2]] +// REC-NEXT: tail call void @__ubsan_handle_add_overflow(ptr nonnull @[[GLOB3:[0-9]+]], i64 [[TMP3]], i64 [[TMP4]]) #[[ATTR6]], !nosanitize [[META2]] // REC-NEXT: br label %[[CONT]], !nosanitize [[META2]] // REC: [[CONT]]: // REC-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP0]], 0, !nosanitize [[META2]] @@ -164,6 +164,66 @@ int null(int* x) { int overflow(int x, int y) { return x + y; } + +void use(double*); + +// CHECK-LABEL: define dso_local double @lbounds( +// CHECK-SAME: i32 noundef [[B:%.*]], i32 noundef [[I:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[B]] to i64 +// CHECK-NEXT: [[VLA:%.*]] = alloca double, i64 [[TMP0]], align 16 +// CHECK-NEXT: call void @use(ptr noundef nonnull [[VLA]]) #[[ATTR7:[0-9]+]] +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64 +// CHECK-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[TMP0]], [[IDXPROM]] +// CHECK-NEXT: br i1 [[DOTNOT]], label %[[BB1:.*]], label %[[TRAP:.*]] +// CHECK: [[BB1]]: +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[VLA]], i64 [[IDXPROM]] +// CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA8:![0-9]+]] +// CHECK-NEXT: ret double [[TMP2]] +// CHECK: [[TRAP]]: +// CHECK-NEXT: call void @__ubsan_handle_local_out_of_bounds_abort() #[[ATTR6]] +// CHECK-NEXT: unreachable +// +// TR-LABEL: define dso_local double @lbounds( +// TR-SAME: i32 noundef [[B:%.*]], i32 noundef [[I:%.*]]) local_unnamed_addr #[[ATTR0]] { +// TR-NEXT: [[ENTRY:.*:]] +// TR-NEXT: [[TMP0:%.*]] = zext i32 [[B]] to i64 +// TR-NEXT: [[VLA:%.*]] = alloca double, i64 [[TMP0]], align 16 +// TR-NEXT: call void @use(ptr noundef nonnull [[VLA]]) #[[ATTR6:[0-9]+]] +// TR-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64 +// TR-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[TMP0]], [[IDXPROM]] +// TR-NEXT: br i1 
[[DOTNOT]], label %[[BB1:.*]], label %[[TRAP:.*]]
+// TR:       [[BB1]]:
+// TR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[VLA]], i64 [[IDXPROM]]
+// TR-NEXT:    [[TMP2:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA7:![0-9]+]]
+// TR-NEXT:    ret double [[TMP2]]
+// TR:       [[TRAP]]:
+// TR-NEXT:    call void @llvm.ubsantrap(i8 3) #[[ATTR5]]
+// TR-NEXT:    unreachable
+//
+// REC-LABEL: define dso_local double @lbounds(
+// REC-SAME: i32 noundef [[B:%.*]], i32 noundef [[I:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// REC-NEXT:  [[ENTRY:.*:]]
+// REC-NEXT:    [[TMP0:%.*]] = zext i32 [[B]] to i64
+// REC-NEXT:    [[VLA:%.*]] = alloca double, i64 [[TMP0]], align 16
+// REC-NEXT:    call void @use(ptr noundef nonnull [[VLA]]) #[[ATTR5:[0-9]+]]
+// REC-NEXT:    [[IDXPROM:%.*]] = sext i32 [[I]] to i64
+// REC-NEXT:    [[DOTNOT:%.*]] = icmp ugt i64 [[TMP0]], [[IDXPROM]]
+// REC-NEXT:    br i1 [[DOTNOT]], label %[[BB1:.*]], label %[[TRAP:.*]]
+// REC:       [[BB1]]:
+// REC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[VLA]], i64 [[IDXPROM]]
+// REC-NEXT:    [[TMP2:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA8:![0-9]+]]
+// REC-NEXT:    ret double [[TMP2]]
+// REC:       [[TRAP]]:
+// REC-NEXT:    call void @__ubsan_handle_local_out_of_bounds() #[[ATTR6]]
+// REC-NEXT:    br label %[[BB1]]
+//
+double lbounds(int b, int i) {
+  double a[b];
+  use(a);
+  return a[i];
+}
+
 //.
 // CHECK: [[META2]] = !{}
@@ -171,12 +231,16 @@ int overflow(int x, int y) {
 // CHECK: [[META5]] = !{!"int", [[META6:![0-9]+]], i64 0}
 // CHECK: [[META6]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0}
 // CHECK: [[META7]] = !{!"Simple C/C++ TBAA"}
+// CHECK: [[TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0}
+// CHECK: [[META9]] = !{!"double", [[META6]], i64 0}
 //.
 // TR: [[META2]] = !{}
 // TR: [[TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0}
 // TR: [[META4]] = !{!"int", [[META5:![0-9]+]], i64 0}
 // TR: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0}
 // TR: [[META6]] = !{!"Simple C/C++ TBAA"}
+// TR: [[TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0}
+// TR: [[META8]] = !{!"double", [[META5]], i64 0}
 //.
 // REC: [[META2]] = !{}
 // REC: [[PROF3]] = !{!"branch_weights", i32 1, i32 1048575}
 // REC: [[META5]] = !{!"int", [[META6:![0-9]+]], i64 0}
 // REC: [[META6]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0}
 // REC: [[META7]] = !{!"Simple C/C++ TBAA"}
+// REC: [[TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0}
+// REC: [[META9]] = !{!"double", [[META6]], i64 0}
 //.

From af4d76d909b0df79494ca19b7c289c2a5b18c816 Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Fri, 10 Jan 2025 17:59:28 -0800
Subject: [PATCH 132/408] [Support] Reduce global variable overhead after #121663

* Construct frequently-accessed TimerLock/DefaultTimerGroup early to reduce
  overhead.
* Rename `aquireDefaultGroup` to `acquireTimerGlobals` and restore
  ManagedStatic::claim. https://reviews.llvm.org/D76099
* Drop mtg::. We use internal linkage, so mtg:: is unneeded and might mislead
  users; in addition, llvm/ code almost never introduces a named namespace
  outside llvm::.
* Replace some unique_ptr with optional to reduce overhead.
* Switch to `functionName()`.
* Simplify `llvm::initTimerOptions` and `TimerGroup::constructForStatistics()` Pull Request: https://github.com/llvm/llvm-project/pull/122429 --- clang/tools/driver/driver.cpp | 2 +- llvm/include/llvm/Support/Timer.h | 6 +- llvm/lib/Support/Timer.cpp | 183 ++++++++++++------------------ 3 files changed, 75 insertions(+), 116 deletions(-) diff --git a/clang/tools/driver/driver.cpp b/clang/tools/driver/driver.cpp index ffd157e60997c..74923247b7ee1 100644 --- a/clang/tools/driver/driver.cpp +++ b/clang/tools/driver/driver.cpp @@ -439,7 +439,7 @@ int clang_main(int Argc, char **Argv, const llvm::ToolContext &ToolContext) { if (!UseNewCC1Process && IsCrash) { // When crashing in -fintegrated-cc1 mode, bury the timer pointers, because // the internal linked list might point to already released stack frames. - llvm::BuryPointer(llvm::TimerGroup::aquireDefaultGroup()); + llvm::BuryPointer(llvm::TimerGroup::acquireTimerGlobals()); } else { // If any timers were active but haven't been destroyed yet, print their // results now. This happens in -disable-free mode. diff --git a/llvm/include/llvm/Support/Timer.h b/llvm/include/llvm/Support/Timer.h index c05389332b804..d21859905d4a7 100644 --- a/llvm/include/llvm/Support/Timer.h +++ b/llvm/include/llvm/Support/Timer.h @@ -240,9 +240,9 @@ class TimerGroup { /// global constructors and destructors. static void constructForStatistics(); - /// This makes the default group unmanaged, and lets the user manage the - /// group's lifetime. - static std::unique_ptr aquireDefaultGroup(); + /// This makes the timer globals unmanaged, and lets the user manage the + /// lifetime. + static void *acquireTimerGlobals(); private: friend class Timer; diff --git a/llvm/lib/Support/Timer.cpp b/llvm/lib/Support/Timer.cpp index 3f0926ae0f3cb..1fa2cdf297aae 100644 --- a/llvm/lib/Support/Timer.cpp +++ b/llvm/lib/Support/Timer.cpp @@ -27,6 +27,7 @@ #include "llvm/Support/YAMLTraits.h" #include "llvm/Support/raw_ostream.h" #include +#include #if HAVE_UNISTD_H #include @@ -39,7 +40,7 @@ using namespace llvm; //===----------------------------------------------------------------------===// -// Forward declarations for Managed Timer Globals (mtg) getters. +// Forward declarations for Managed Timer Globals getters. // // Globals have been placed at the end of the file to restrict direct // access. 
Use of getters also has the benefit of making it a bit more explicit @@ -49,29 +50,20 @@ namespace { class Name2PairMap; } -namespace mtg { -static std::string &LibSupportInfoOutputFilename(); -static const std::string &InfoOutputFilename(); -static bool TrackSpace(); -static bool SortTimers(); -static SignpostEmitter &Signposts(); -static sys::SmartMutex &TimerLock(); -static TimerGroup &DefaultTimerGroup(); -static TimerGroup *claimDefaultTimerGroup(); -static Name2PairMap &NamedGroupedTimers(); -} // namespace mtg +static std::string &libSupportInfoOutputFilename(); +static bool trackSpace(); +static bool sortTimers(); +static SignpostEmitter &signposts(); +static sys::SmartMutex &timerLock(); +static TimerGroup &defaultTimerGroup(); +static Name2PairMap &namedGroupedTimers(); //===----------------------------------------------------------------------===// // //===----------------------------------------------------------------------===// -void llvm::initTimerOptions() { - mtg::TrackSpace(); - mtg::InfoOutputFilename(); - mtg::SortTimers(); -} std::unique_ptr llvm::CreateInfoOutputFile() { - const std::string &OutputFilename = mtg::LibSupportInfoOutputFilename(); + const std::string &OutputFilename = libSupportInfoOutputFilename(); if (OutputFilename.empty()) return std::make_unique(2, false); // stderr. if (OutputFilename == "-") @@ -97,7 +89,7 @@ std::unique_ptr llvm::CreateInfoOutputFile() { //===----------------------------------------------------------------------===// void Timer::init(StringRef TimerName, StringRef TimerDescription) { - init(TimerName, TimerDescription, mtg::DefaultTimerGroup()); + init(TimerName, TimerDescription, defaultTimerGroup()); } void Timer::init(StringRef TimerName, StringRef TimerDescription, @@ -116,7 +108,7 @@ Timer::~Timer() { } static inline size_t getMemUsage() { - if (!mtg::TrackSpace()) + if (!trackSpace()) return 0; return sys::Process::GetMallocUsage(); } @@ -157,7 +149,7 @@ TimeRecord TimeRecord::getCurrentTime(bool Start) { void Timer::startTimer() { assert(!Running && "Cannot start a running timer"); Running = Triggered = true; - mtg::Signposts().startInterval(this, getName()); + signposts().startInterval(this, getName()); StartTime = TimeRecord::getCurrentTime(true); } @@ -166,7 +158,7 @@ void Timer::stopTimer() { Running = false; Time += TimeRecord::getCurrentTime(false); Time -= StartTime; - mtg::Signposts().endInterval(this, getName()); + signposts().endInterval(this, getName()); } void Timer::clear() { @@ -218,7 +210,7 @@ class Name2PairMap { Timer &get(StringRef Name, StringRef Description, StringRef GroupName, StringRef GroupDescription) { - sys::SmartScopedLock L(mtg::TimerLock()); + sys::SmartScopedLock L(timerLock()); std::pair &GroupEntry = Map[GroupName]; @@ -237,17 +229,17 @@ class Name2PairMap { NamedRegionTimer::NamedRegionTimer(StringRef Name, StringRef Description, StringRef GroupName, StringRef GroupDescription, bool Enabled) - : TimeRegion(!Enabled ? nullptr - : &mtg::NamedGroupedTimers().get(Name, Description, - GroupName, - GroupDescription)) {} + : TimeRegion(!Enabled + ? nullptr + : &namedGroupedTimers().get(Name, Description, GroupName, + GroupDescription)) {} //===----------------------------------------------------------------------===// // TimerGroup Implementation //===----------------------------------------------------------------------===// /// This is the global list of TimerGroups, maintained by the TimerGroup -/// ctor/dtor and is protected by the TimerLock lock. 
+/// ctor/dtor and is protected by the timerLock lock. static TimerGroup *TimerGroupList = nullptr; TimerGroup::TimerGroup(StringRef Name, StringRef Description, @@ -264,7 +256,7 @@ TimerGroup::TimerGroup(StringRef Name, StringRef Description, } TimerGroup::TimerGroup(StringRef Name, StringRef Description) - : TimerGroup(Name, Description, mtg::TimerLock()) {} + : TimerGroup(Name, Description, timerLock()) {} TimerGroup::TimerGroup(StringRef Name, StringRef Description, const StringMap &Records) @@ -283,7 +275,7 @@ TimerGroup::~TimerGroup() { removeTimer(*FirstTimer); // Remove the group from the TimerGroupList. - sys::SmartScopedLock L(mtg::TimerLock()); + sys::SmartScopedLock L(timerLock()); *Prev = Next; if (Next) Next->Prev = Prev; @@ -291,7 +283,7 @@ TimerGroup::~TimerGroup() { void TimerGroup::removeTimer(Timer &T) { - sys::SmartScopedLock L(mtg::TimerLock()); + sys::SmartScopedLock L(timerLock()); // If the timer was started, move its data to TimersToPrint. if (T.hasTriggered()) @@ -314,7 +306,7 @@ void TimerGroup::removeTimer(Timer &T) { } void TimerGroup::addTimer(Timer &T) { - sys::SmartScopedLock L(mtg::TimerLock()); + sys::SmartScopedLock L(timerLock()); // Add the timer to our list. if (FirstTimer) @@ -326,7 +318,7 @@ void TimerGroup::addTimer(Timer &T) { void TimerGroup::PrintQueuedTimers(raw_ostream &OS) { // Perhaps sort the timers in descending order by amount of time taken. - if (mtg::SortTimers()) + if (sortTimers()) llvm::sort(TimersToPrint); TimeRecord Total; @@ -344,7 +336,7 @@ void TimerGroup::PrintQueuedTimers(raw_ostream &OS) { // If this is not an collection of ungrouped times, print the total time. // Ungrouped timers don't really make sense to add up. We still print the // TOTAL line to make the percentages make sense. - if (this != &mtg::DefaultTimerGroup()) + if (this != &defaultTimerGroup()) OS << format(" Total Execution Time: %5.4f seconds (%5.4f wall clock)\n", Total.getProcessTime(), Total.getWallTime()); OS << '\n'; @@ -396,7 +388,7 @@ void TimerGroup::prepareToPrintList(bool ResetTime) { void TimerGroup::print(raw_ostream &OS, bool ResetAfterPrint) { { // After preparing the timers we can free the lock - sys::SmartScopedLock L(mtg::TimerLock()); + sys::SmartScopedLock L(timerLock()); prepareToPrintList(ResetAfterPrint); } @@ -406,20 +398,20 @@ void TimerGroup::print(raw_ostream &OS, bool ResetAfterPrint) { } void TimerGroup::clear() { - sys::SmartScopedLock L(mtg::TimerLock()); + sys::SmartScopedLock L(timerLock()); for (Timer *T = FirstTimer; T; T = T->Next) T->clear(); } void TimerGroup::printAll(raw_ostream &OS) { - sys::SmartScopedLock L(mtg::TimerLock()); + sys::SmartScopedLock L(timerLock()); for (TimerGroup *TG = TimerGroupList; TG; TG = TG->Next) TG->print(OS); } void TimerGroup::clearAll() { - sys::SmartScopedLock L(mtg::TimerLock()); + sys::SmartScopedLock L(timerLock()); for (TimerGroup *TG = TimerGroupList; TG; TG = TG->Next) TG->clear(); } @@ -436,7 +428,7 @@ void TimerGroup::printJSONValue(raw_ostream &OS, const PrintRecord &R, } const char *TimerGroup::printJSONValues(raw_ostream &OS, const char *delim) { - sys::SmartScopedLock L(mtg::TimerLock()); + sys::SmartScopedLock L(timerLock()); prepareToPrintList(false); for (const PrintRecord &R : TimersToPrint) { @@ -463,21 +455,12 @@ const char *TimerGroup::printJSONValues(raw_ostream &OS, const char *delim) { } const char *TimerGroup::printAllJSONValues(raw_ostream &OS, const char *delim) { - sys::SmartScopedLock L(mtg::TimerLock()); + sys::SmartScopedLock L(timerLock()); for (TimerGroup *TG 
= TimerGroupList; TG; TG = TG->Next) delim = TG->printJSONValues(OS, delim); return delim; } -void TimerGroup::constructForStatistics() { - mtg::LibSupportInfoOutputFilename(); - mtg::NamedGroupedTimers(); -} - -std::unique_ptr TimerGroup::aquireDefaultGroup() { - return std::unique_ptr(mtg::claimDefaultTimerGroup()); -} - //===----------------------------------------------------------------------===// // Timer Globals // @@ -499,83 +482,59 @@ std::unique_ptr TimerGroup::aquireDefaultGroup() { class llvm::TimerGlobals { public: std::string LibSupportInfoOutputFilename; - cl::opt InfoOutputFilename; - cl::opt TrackSpace; - cl::opt SortTimers; + cl::opt InfoOutputFilename{ + "info-output-file", cl::value_desc("filename"), + cl::desc("File to append -stats and -timer output to"), cl::Hidden, + cl::location(LibSupportInfoOutputFilename)}; + cl::opt TrackSpace{ + "track-memory", + cl::desc("Enable -time-passes memory tracking (this may be slow)"), + cl::Hidden}; + cl::opt SortTimers{ + "sort-timers", + cl::desc("In the report, sort the timers in each group in wall clock" + " time order"), + cl::init(true), cl::Hidden}; + + sys::SmartMutex TimerLock; + TimerGroup DefaultTimerGroup{"misc", "Miscellaneous Ungrouped Timers", + TimerLock}; + SignpostEmitter Signposts; -private: // Order of these members and initialization below is important. For example - // the DefaultTimerGroup uses the TimerLock. Most of these also depend on the + // the defaultTimerGroup uses the timerLock. Most of these also depend on the // options above. std::once_flag InitDeferredFlag; - std::unique_ptr SignpostsPtr; - std::unique_ptr> TimerLockPtr; - std::unique_ptr DefaultTimerGroupPtr; - std::unique_ptr NamedGroupedTimersPtr; + std::optional NamedGroupedTimersPtr; + TimerGlobals &initDeferred() { - std::call_once(InitDeferredFlag, [this]() { - SignpostsPtr = std::make_unique(); - TimerLockPtr = std::make_unique>(); - DefaultTimerGroupPtr.reset(new TimerGroup( - "misc", "Miscellaneous Ungrouped Timers", *TimerLockPtr)); - NamedGroupedTimersPtr = std::make_unique(); - }); + std::call_once(InitDeferredFlag, + [this]() { NamedGroupedTimersPtr.emplace(); }); return *this; } - -public: - SignpostEmitter &Signposts() { return *initDeferred().SignpostsPtr; } - sys::SmartMutex &TimerLock() { return *initDeferred().TimerLockPtr; } - TimerGroup &DefaultTimerGroup() { - return *initDeferred().DefaultTimerGroupPtr; - } - TimerGroup *claimDefaultTimerGroup() { - return initDeferred().DefaultTimerGroupPtr.release(); - } - Name2PairMap &NamedGroupedTimers() { - return *initDeferred().NamedGroupedTimersPtr; - } - -public: - TimerGlobals() - : InfoOutputFilename( - "info-output-file", cl::value_desc("filename"), - cl::desc("File to append -stats and -timer output to"), cl::Hidden, - cl::location(LibSupportInfoOutputFilename)), - TrackSpace( - "track-memory", - cl::desc("Enable -time-passes memory tracking (this may be slow)"), - cl::Hidden), - SortTimers( - "sort-timers", - cl::desc( - "In the report, sort the timers in each group in wall clock" - " time order"), - cl::init(true), cl::Hidden) {} }; static ManagedStatic ManagedTimerGlobals; -static std::string &mtg::LibSupportInfoOutputFilename() { +static std::string &libSupportInfoOutputFilename() { return ManagedTimerGlobals->LibSupportInfoOutputFilename; } -static const std::string &mtg::InfoOutputFilename() { - return ManagedTimerGlobals->InfoOutputFilename.getValue(); -} -static bool mtg::TrackSpace() { return ManagedTimerGlobals->TrackSpace; } -static bool mtg::SortTimers() { return 
ManagedTimerGlobals->SortTimers; } -static SignpostEmitter &mtg::Signposts() { - return ManagedTimerGlobals->Signposts(); +static bool trackSpace() { return ManagedTimerGlobals->TrackSpace; } +static bool sortTimers() { return ManagedTimerGlobals->SortTimers; } +static SignpostEmitter &signposts() { return ManagedTimerGlobals->Signposts; } +static sys::SmartMutex &timerLock() { + return ManagedTimerGlobals->TimerLock; } -static sys::SmartMutex &mtg::TimerLock() { - return ManagedTimerGlobals->TimerLock(); +static TimerGroup &defaultTimerGroup() { + return ManagedTimerGlobals->DefaultTimerGroup; } -static TimerGroup &mtg::DefaultTimerGroup() { - return ManagedTimerGlobals->DefaultTimerGroup(); +static Name2PairMap &namedGroupedTimers() { + return *ManagedTimerGlobals->initDeferred().NamedGroupedTimersPtr; } -static TimerGroup *mtg::claimDefaultTimerGroup() { - return ManagedTimerGlobals->claimDefaultTimerGroup(); -} -static Name2PairMap &mtg::NamedGroupedTimers() { - return ManagedTimerGlobals->NamedGroupedTimers(); + +void llvm::initTimerOptions() { *ManagedTimerGlobals; } +void TimerGroup::constructForStatistics() { + ManagedTimerGlobals->initDeferred(); } + +void *TimerGroup::acquireTimerGlobals() { return ManagedTimerGlobals.claim(); } From 6329355860e9b66bc7ed68b46c166763e408d4cc Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Fri, 10 Jan 2025 18:04:25 -0800 Subject: [PATCH 133/408] [ctxprof] Move test serialization to yaml (#122545) We have a textual representation of contextual profiles for test scenarios, mainly. This patch moves that to YAML instead of JSON. YAML is more succinct and readable (some of the .ll tests should be illustrative). In addition, JSON is parse-able by the YAML reader. A subsequent patch will address deserialization. (thanks, @kazutakahirata, for showing me how to use the llvm YAML reader/writer APIs, which I incorrectly thought to be more low-level than the JSON ones!) 
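For example, the single-context profile used by the flatten-check-path test
reads as follows in the two notations (a sketch mirroring the test content
further below):

  JSON:  [{"Guid": 1234, "Counters": [2, 2, 1, 2]}]
  YAML:  - Guid: 1234
           Counters: [2, 2, 1, 2]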
--- .../llvm/ProfileData/PGOCtxProfWriter.h | 10 ++- llvm/lib/ProfileData/PGOCtxProfWriter.cpp | 57 +++++++--------- .../CtxProfAnalysis/flatten-and-annotate.ll | 32 +++------ .../CtxProfAnalysis/flatten-check-path.ll | 21 +++--- .../Analysis/CtxProfAnalysis/flatten-icp.ll | 25 ++++--- .../CtxProfAnalysis/flatten-zero-path.ll | 7 +- .../Analysis/CtxProfAnalysis/full-cycle.ll | 66 +++++-------------- .../Analysis/CtxProfAnalysis/handle-select.ll | 13 +++- llvm/test/Analysis/CtxProfAnalysis/inline.ll | 33 +++++----- .../CtxProfAnalysis/load-unapplicable.ll | 44 ++++--------- llvm/test/Analysis/CtxProfAnalysis/load.ll | 44 ++++--------- llvm/test/ThinLTO/X86/ctxprof.ll | 4 +- .../transform-to-local.ll | 4 +- .../tools/llvm-ctxprof-util/Inputs/bad.json | 1 - .../tools/llvm-ctxprof-util/Inputs/bad.yaml | 1 + .../tools/llvm-ctxprof-util/Inputs/empty.json | 1 - .../tools/llvm-ctxprof-util/Inputs/empty.yaml | 0 .../Inputs/invalid-bad-subctx.json | 8 --- .../Inputs/invalid-bad-subctx.yaml | 4 ++ .../Inputs/invalid-no-counters.json | 5 -- .../Inputs/invalid-no-counters.yaml | 1 + .../Inputs/invalid-no-ctx.json | 1 - .../Inputs/invalid-no-ctx.yaml | 1 + .../Inputs/invalid-no-vector.json | 1 - .../Inputs/invalid-no-vector.yaml | 1 + .../tools/llvm-ctxprof-util/Inputs/valid.json | 47 ------------- .../tools/llvm-ctxprof-util/Inputs/valid.yaml | 13 ++++ .../llvm-ctxprof-util-negative.test | 36 +++++----- .../llvm-ctxprof-util/llvm-ctxprof-util.test | 8 +-- .../llvm-ctxprof-util/llvm-ctxprof-util.cpp | 14 ++-- .../Utils/CallPromotionUtilsTest.cpp | 2 +- 31 files changed, 195 insertions(+), 310 deletions(-) delete mode 100644 llvm/test/tools/llvm-ctxprof-util/Inputs/bad.json create mode 100644 llvm/test/tools/llvm-ctxprof-util/Inputs/bad.yaml delete mode 100644 llvm/test/tools/llvm-ctxprof-util/Inputs/empty.json create mode 100644 llvm/test/tools/llvm-ctxprof-util/Inputs/empty.yaml delete mode 100644 llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-bad-subctx.json create mode 100644 llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-bad-subctx.yaml delete mode 100644 llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-counters.json create mode 100644 llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-counters.yaml delete mode 100644 llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-ctx.json create mode 100644 llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-ctx.yaml delete mode 100644 llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-vector.json create mode 100644 llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-vector.yaml delete mode 100644 llvm/test/tools/llvm-ctxprof-util/Inputs/valid.json create mode 100644 llvm/test/tools/llvm-ctxprof-util/Inputs/valid.yaml diff --git a/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h b/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h index b370fdd9ba5a1..f6158609c1285 100644 --- a/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h +++ b/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h @@ -81,6 +81,14 @@ class PGOCtxProfileWriter final { static constexpr StringRef ContainerMagic = "CTXP"; }; -Error createCtxProfFromJSON(StringRef Profile, raw_ostream &Out); +/// Representation of the context node suitable for yaml / json serialization / +/// deserialization. 
+struct SerializableCtxRepresentation { + ctx_profile::GUID Guid = 0; + std::vector Counters; + std::vector> Callsites; +}; + +Error createCtxProfFromYAML(StringRef Profile, raw_ostream &Out); } // namespace llvm #endif diff --git a/llvm/lib/ProfileData/PGOCtxProfWriter.cpp b/llvm/lib/ProfileData/PGOCtxProfWriter.cpp index 4c0f3d459988b..d22aadd6bd7eb 100644 --- a/llvm/lib/ProfileData/PGOCtxProfWriter.cpp +++ b/llvm/lib/ProfileData/PGOCtxProfWriter.cpp @@ -13,7 +13,11 @@ #include "llvm/ProfileData/PGOCtxProfWriter.h" #include "llvm/Bitstream/BitCodeEnums.h" #include "llvm/ProfileData/CtxInstrContextNode.h" +#include "llvm/Support/Error.h" #include "llvm/Support/JSON.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/YAMLTraits.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; using namespace llvm::ctx_profile; @@ -85,22 +89,15 @@ void PGOCtxProfileWriter::write(const ContextNode &RootNode) { } namespace { -// A structural representation of the JSON input. -struct DeserializableCtx { - ctx_profile::GUID Guid = 0; - std::vector Counters; - std::vector> Callsites; -}; - ctx_profile::ContextNode * createNode(std::vector> &Nodes, - const std::vector &DCList); + const std::vector &DCList); // Convert a DeserializableCtx into a ContextNode, potentially linking it to // its sibling (e.g. callee at same callsite) "Next". ctx_profile::ContextNode * createNode(std::vector> &Nodes, - const DeserializableCtx &DC, + const SerializableCtxRepresentation &DC, ctx_profile::ContextNode *Next = nullptr) { auto AllocSize = ctx_profile::ContextNode::getAllocSize(DC.Counters.size(), DC.Callsites.size()); @@ -115,10 +112,11 @@ createNode(std::vector> &Nodes, return Ret; } -// Convert a list of DeserializableCtx into a linked list of ContextNodes. +// Convert a list of SerializableCtxRepresentation into a linked list of +// ContextNodes. ctx_profile::ContextNode * createNode(std::vector> &Nodes, - const std::vector &DCList) { + const std::vector &DCList) { ctx_profile::ContextNode *List = nullptr; for (const auto &DC : DCList) List = createNode(Nodes, DC, List); @@ -126,27 +124,22 @@ createNode(std::vector> &Nodes, } } // namespace -namespace llvm { -namespace json { -bool fromJSON(const Value &E, DeserializableCtx &R, Path P) { - json::ObjectMapper Mapper(E, P); - return Mapper && Mapper.map("Guid", R.Guid) && - Mapper.map("Counters", R.Counters) && - Mapper.mapOptional("Callsites", R.Callsites); -} -} // namespace json -} // namespace llvm - -Error llvm::createCtxProfFromJSON(StringRef Profile, raw_ostream &Out) { - auto P = json::parse(Profile); - if (!P) - return P.takeError(); +LLVM_YAML_IS_SEQUENCE_VECTOR(SerializableCtxRepresentation) +LLVM_YAML_IS_SEQUENCE_VECTOR(std::vector) +template <> struct yaml::MappingTraits { + static void mapping(yaml::IO &IO, SerializableCtxRepresentation &SCR) { + IO.mapRequired("Guid", SCR.Guid); + IO.mapRequired("Counters", SCR.Counters); + IO.mapOptional("Callsites", SCR.Callsites); + } +}; - json::Path::Root R(""); - std::vector DCList; - if (!fromJSON(*P, DCList, R)) - return R.getError(); - // Nodes provides memory backing for the ContextualNodes. 
+Error llvm::createCtxProfFromYAML(StringRef Profile, raw_ostream &Out) {
+  yaml::Input In(Profile);
+  std::vector<SerializableCtxRepresentation> DCList;
+  In >> DCList;
+  if (In.error())
+    return createStringError(In.error(), "incorrect yaml content");
   std::vector<std::unique_ptr<char[]>> Nodes;
   std::error_code EC;
   if (EC)
@@ -162,4 +155,4 @@ Error llvm::createCtxProfFromJSON(StringRef Profile, raw_ostream &Out) {
   if (EC)
     return createStringError(EC, "failed to write output");
   return Error::success();
-}
\ No newline at end of file
+}
diff --git a/llvm/test/Analysis/CtxProfAnalysis/flatten-and-annotate.ll b/llvm/test/Analysis/CtxProfAnalysis/flatten-and-annotate.ll
index b7950b26a3ef2..9eedade925b01 100644
--- a/llvm/test/Analysis/CtxProfAnalysis/flatten-and-annotate.ll
+++ b/llvm/test/Analysis/CtxProfAnalysis/flatten-and-annotate.ll
@@ -2,7 +2,7 @@
 ;
 ; RUN: rm -rf %t
 ; RUN: split-file %s %t
-; RUN: llvm-ctxprof-util fromJSON --input=%t/profile.json --output=%t/profile.ctxprofdata
+; RUN: llvm-ctxprof-util fromYAML --input=%t/profile.yaml --output=%t/profile.ctxprofdata
 ; RUN: opt -module-summary -passes='thinlto-pre-link<O2>' -use-ctx-profile=%t/profile.ctxprofdata \
 ; RUN:   %t/example.ll -S -o %t/prelink.ll
 ; RUN: FileCheck --input-file %t/prelink.ll %s --check-prefix=PRELINK
@@ -58,27 +58,15 @@
 ; CHECK: ![[AN_ENTRYPOINT_EP]] = !{!"function_entry_count", i64 100}
 ; CHECK: ![[AN_ENTRYPOINT_BW]] = !{!"branch_weights", i32 40, i32 60}
 
-;--- profile.json
-[
-  {
-    "Guid": 4909520559318251808,
-    "Counters": [100, 40],
-    "Callsites": [
-      [
-        {
-          "Guid": 11872291593386833696,
-          "Counters": [ 100, 5 ]
-        }
-      ],
-      [
-        {
-          "Guid": 11872291593386833696,
-          "Counters": [ 40, 10 ]
-        }
-      ]
-    ]
-  }
-]
+;--- profile.yaml
+- Guid: 4909520559318251808
+  Counters: [100, 40]
+  Callsites: -
+               - Guid: 11872291593386833696
+                 Counters: [ 100, 5 ]
+             -
+               - Guid: 11872291593386833696
+                 Counters: [ 40, 10 ]
 
 ;--- example.ll
 declare void @bar()
diff --git a/llvm/test/Analysis/CtxProfAnalysis/flatten-check-path.ll b/llvm/test/Analysis/CtxProfAnalysis/flatten-check-path.ll
index 42eaa67a98308..c84a72f60a3d0 100644
--- a/llvm/test/Analysis/CtxProfAnalysis/flatten-check-path.ll
+++ b/llvm/test/Analysis/CtxProfAnalysis/flatten-check-path.ll
@@ -3,9 +3,9 @@
 ; already visited blocks count as taken (i.e. the flow continues through them).
 ;
 ; RUN: split-file %s %t
-; RUN: llvm-ctxprof-util fromJSON --input=%t/profile_ok.json --output=%t/profile_ok.ctxprofdata
-; RUN: llvm-ctxprof-util fromJSON --input=%t/profile_pump.json --output=%t/profile_pump.ctxprofdata
-; RUN: llvm-ctxprof-util fromJSON --input=%t/profile_unreachable.json --output=%t/profile_unreachable.ctxprofdata
+; RUN: llvm-ctxprof-util fromYAML --input=%t/profile_ok.yaml --output=%t/profile_ok.ctxprofdata
+; RUN: llvm-ctxprof-util fromYAML --input=%t/profile_pump.yaml --output=%t/profile_pump.ctxprofdata
+; RUN: llvm-ctxprof-util fromYAML --input=%t/profile_unreachable.yaml --output=%t/profile_unreachable.ctxprofdata
 ;
 ; RUN: opt -passes=ctx-prof-flatten %t/example_ok.ll -use-ctx-profile=%t/profile_ok.ctxprofdata -S -o - | FileCheck %s
 ; RUN: not --crash opt -passes=ctx-prof-flatten %t/message_pump.ll -use-ctx-profile=%t/profile_pump.ctxprofdata -S 2>&1 | FileCheck %s --check-prefix=ASSERTION
@@ -38,8 +38,9 @@ exit:
 }
 !0 = !{i64 1234}
 
-;--- profile_ok.json
-[{"Guid":1234, "Counters":[2, 2, 1, 2]}]
+;--- profile_ok.yaml
+- Guid: 1234
+  Counters: [2, 2, 1, 2]
 
 ;--- message_pump.ll
 ; This is a message pump: the loop never exits.
This should result in an @@ -59,8 +60,9 @@ exit: } !0 = !{i64 1234} -;--- profile_pump.json -[{"Guid":1234, "Counters":[2, 10, 0]}] +;--- profile_pump.yaml +- Guid: 1234 + Counters: [2, 10, 0] ;--- unreachable.ll ; An unreachable block is reached, that's an error @@ -81,5 +83,6 @@ exit: } !0 = !{i64 1234} -;--- profile_unreachable.json -[{"Guid":1234, "Counters":[2, 1, 1, 2]}] \ No newline at end of file +;--- profile_unreachable.yaml +- Guid: 1234 + Counters: [2, 1, 1, 2] \ No newline at end of file diff --git a/llvm/test/Analysis/CtxProfAnalysis/flatten-icp.ll b/llvm/test/Analysis/CtxProfAnalysis/flatten-icp.ll index fbffe780f0afa..46c17377710d0 100644 --- a/llvm/test/Analysis/CtxProfAnalysis/flatten-icp.ll +++ b/llvm/test/Analysis/CtxProfAnalysis/flatten-icp.ll @@ -1,5 +1,5 @@ ; RUN: split-file %s %t -; RUN: llvm-ctxprof-util fromJSON --input %t/profile.json --output %t/profile.ctxprofdata +; RUN: llvm-ctxprof-util fromYAML --input %t/profile.yaml --output %t/profile.ctxprofdata ; ; In the given profile, in one of the contexts the indirect call is taken, the ; target we're trying to ICP - GUID:2000 - doesn't appear at all. That should @@ -45,11 +45,18 @@ attributes #1 = { noinline } !1 = !{i64 3000} !2 = !{i64 4000} -;--- profile.json -[ { - "Guid": 4000, "Counters":[10], "Callsites": [ - [{"Guid":3000, "Counters":[10], "Callsites":[[{"Guid":1000, "Counters":[10]}]]}], - [{"Guid":3000, "Counters":[10], "Callsites":[[{"Guid":9000, "Counters":[10]}]]}] - ] -} -] +;--- profile.yaml +- Guid: 4000 + Counters: [10] + Callsites: - + - Guid: 3000 + Counters: [10] + Callsites: - + - Guid: 1000 + Counters: [10] + - + - Guid: 3000 + Counters: [10] + Callsites: - + - Guid: 9000 + Counters: [10] diff --git a/llvm/test/Analysis/CtxProfAnalysis/flatten-zero-path.ll b/llvm/test/Analysis/CtxProfAnalysis/flatten-zero-path.ll index 7eea1c36afc37..251ece655196a 100644 --- a/llvm/test/Analysis/CtxProfAnalysis/flatten-zero-path.ll +++ b/llvm/test/Analysis/CtxProfAnalysis/flatten-zero-path.ll @@ -1,6 +1,6 @@ ; Check that flattened profile lowering handles cold subgraphs that end in "unreachable" ; RUN: split-file %s %t -; RUN: llvm-ctxprof-util fromJSON --input=%t/profile.json --output=%t/profile.ctxprofdata +; RUN: llvm-ctxprof-util fromYAML --input=%t/profile.yaml --output=%t/profile.ctxprofdata ; RUN: opt -passes=ctx-prof-flatten %t/example.ll -use-ctx-profile=%t/profile.ctxprofdata -S -o - | FileCheck %s ; CHECK-LABEL: entry: @@ -51,5 +51,6 @@ exit: !0 = !{i64 1234} -;--- profile.json -[{"Guid":1234, "Counters":[6,0,0,0]}] +;--- profile.yaml +- Guid: 1234 + Counters: [6,0,0,0] diff --git a/llvm/test/Analysis/CtxProfAnalysis/full-cycle.ll b/llvm/test/Analysis/CtxProfAnalysis/full-cycle.ll index 905e7eea9f49e..5a8a2f4cad84b 100644 --- a/llvm/test/Analysis/CtxProfAnalysis/full-cycle.ll +++ b/llvm/test/Analysis/CtxProfAnalysis/full-cycle.ll @@ -8,7 +8,7 @@ ; different counter values, and we expect resulting flat profile to be the sum ; (of values at the same index). 
; -; RUN: llvm-ctxprof-util fromJSON --input=%t/profile.json --output=%t/profile.ctxprofdata +; RUN: llvm-ctxprof-util fromYAML --input=%t/profile.yaml --output=%t/profile.ctxprofdata ; ; RUN: opt -module-summary -passes='thinlto-pre-link' -use-ctx-profile=%t/profile.ctxprofdata -o %t/m1.bc %t/m1.ll ; RUN: opt -module-summary -passes='thinlto-pre-link' -use-ctx-profile=%t/profile.ctxprofdata -o %t/m2.bc %t/m2.ll @@ -65,55 +65,21 @@ define void @entrypoint() { call void @f3() ret void } -;--- profile.json -[ - { - "Callsites": [ - [ - { - "Callsites": [ - [ - { - "Counters": [ - 10, - 7 - ], - "Guid": 3087265239403591524 - } - ] - ], - "Counters": [ - 7 - ], - "Guid": 2072045998141807037 - } - ], - [ - { - "Callsites": [ - [ - { - "Counters": [ - 1, - 2 - ], - "Guid": 3087265239403591524 - } - ] - ], - "Counters": [ - 2 - ], - "Guid": 4197650231481825559 - } - ] - ], - "Counters": [ - 1 - ], - "Guid": 10507721908651011566 - } -] +;--- profile.yaml +- Guid: 10507721908651011566 + Counters: [1] + Callsites: - + - Guid: 2072045998141807037 + Counters: [7] + Callsites: - + - Guid: 3087265239403591524 + Counters: [10, 7] + - + - Guid: 4197650231481825559 + Counters: [2] + Callsites: - + - Guid: 3087265239403591524 + Counters: [1, 2] ;--- expected.txt Function Info: 2072045998141807037 : f1. MaxCounterID: 1. MaxCallsiteID: 1 diff --git a/llvm/test/Analysis/CtxProfAnalysis/handle-select.ll b/llvm/test/Analysis/CtxProfAnalysis/handle-select.ll index e740466a03f3e..ce90d27fc9906 100644 --- a/llvm/test/Analysis/CtxProfAnalysis/handle-select.ll +++ b/llvm/test/Analysis/CtxProfAnalysis/handle-select.ll @@ -4,7 +4,7 @@ ; the `select` is elided. ; ; RUN: split-file %s %t -; RUN: llvm-ctxprof-util fromJSON --input=%t/profile.json --output=%t/profile.ctxprofdata +; RUN: llvm-ctxprof-util fromYAML --input=%t/profile.yaml --output=%t/profile.ctxprofdata ; ; RUN: opt -passes=ctx-instr-gen %t/example.ll -use-ctx-profile=%t/profile.ctxprofdata -S -o - | FileCheck %s --check-prefix=INSTR ; RUN: opt -passes=ctx-instr-gen,module-inline %t/example.ll -use-ctx-profile=%t/profile.ctxprofdata -S -o - | FileCheck %s --check-prefix=POST-INL @@ -72,5 +72,12 @@ define i32 @bar(i32 %t) !guid !1 { !0 = !{i64 1234} !1 = !{i64 5678} -;--- profile.json -[{"Guid":1234, "Counters":[10, 4], "Callsites":[[{"Guid": 5678, "Counters":[4,3]}],[{"Guid": 5678, "Counters":[6,6]}]]}] +;--- profile.yaml +- Guid: 1234 + Counters: [10, 4] + Callsites: - + - Guid: 5678 + Counters: [4,3] + - + - Guid: 5678 + Counters: [6,6] diff --git a/llvm/test/Analysis/CtxProfAnalysis/inline.ll b/llvm/test/Analysis/CtxProfAnalysis/inline.ll index 9381418c4e3f1..6c1e199c2ba1c 100644 --- a/llvm/test/Analysis/CtxProfAnalysis/inline.ll +++ b/llvm/test/Analysis/CtxProfAnalysis/inline.ll @@ -1,6 +1,6 @@ ; RUN: rm -rf %t ; RUN: split-file %s %t -; RUN: llvm-ctxprof-util fromJSON --input=%t/profile.json --output=%t/profile.ctxprofdata +; RUN: llvm-ctxprof-util fromYAML --input=%t/profile.yaml --output=%t/profile.ctxprofdata ; RUN: opt -passes='module-inline,print' -ctx-profile-printer-level=everything %t/module.ll -S \ ; RUN: -use-ctx-profile=%t/profile.ctxprofdata -ctx-profile-printer-level=json \ @@ -94,22 +94,21 @@ define i32 @b() !guid !2 { !0 = !{i64 1000} !1 = !{i64 1001} !2 = !{i64 1002} -;--- profile.json -[ - { "Guid": 1000, - "Counters": [10, 2, 8], - "Callsites": [ - [ { "Guid": 1001, - "Counters": [2, 100], - "Callsites": [[{"Guid": 1002, "Counters": [100]}]]} - ], - [ { "Guid": 1001, - "Counters": [8, 500], - "Callsites": [[{"Guid": 1002, "Counters": 
[500]}]]} - ] - ] - } -] +;--- profile.yaml +- Guid: 1000 + Counters: [10, 2, 8] + Callsites: - + - Guid: 1001 + Counters: [2, 100] + Callsites: - + - Guid: 1002 + Counters: [100] + - + - Guid: 1001 + Counters: [8, 500] + Callsites: - + - Guid: 1002 + Counters: [500] ;--- expected.json [ { "Guid": 1000, diff --git a/llvm/test/Analysis/CtxProfAnalysis/load-unapplicable.ll b/llvm/test/Analysis/CtxProfAnalysis/load-unapplicable.ll index 09d2e150fbcef..38dd0ea825d82 100644 --- a/llvm/test/Analysis/CtxProfAnalysis/load-unapplicable.ll +++ b/llvm/test/Analysis/CtxProfAnalysis/load-unapplicable.ll @@ -5,7 +5,7 @@ ; ; RUN: rm -rf %t ; RUN: split-file %s %t -; RUN: llvm-ctxprof-util fromJSON --input=%t/profile.json --output=%t/profile.ctxprofdata +; RUN: llvm-ctxprof-util fromYAML --input=%t/profile.yaml --output=%t/profile.ctxprofdata ; RUN: opt -passes='require,print' -ctx-profile-printer-level=everything \ ; RUN: %t/example.ll -S 2>&1 | FileCheck %s @@ -15,38 +15,16 @@ ; output it from opt. Note that the root GUIDs - 12341 and 34234 - are different from ; the GUID present in the module, which is otherwise present in the profile, but not ; as a root. -;--- profile.json -[ - { - "Counters": [ - 9 - ], - "Guid": 12341 - }, - { - "Counters": [ - 5 - ], - "Guid": 1000 - }, - { - "Callsites": [ - [ - { - "Counters": [ - 6, - 7 - ], - "Guid": 1000 - } - ] - ], - "Counters": [ - 1 - ], - "Guid": 34234 - } -] +;--- profile.yaml +- Guid: 12341 + Counters: [9] +- Guid: 1000 + Counters: [5] +- Guid: 34234 + Counters: [1] + Callsites: - + - Guid: 1000 + Counters: [6, 7] ;--- example.ll declare void @bar() diff --git a/llvm/test/Analysis/CtxProfAnalysis/load.ll b/llvm/test/Analysis/CtxProfAnalysis/load.ll index 0cf6c5973dc6b..62c6344ed3fec 100644 --- a/llvm/test/Analysis/CtxProfAnalysis/load.ll +++ b/llvm/test/Analysis/CtxProfAnalysis/load.ll @@ -2,7 +2,7 @@ ; ; RUN: rm -rf %t ; RUN: split-file %s %t -; RUN: llvm-ctxprof-util fromJSON --input=%t/profile.json --output=%t/profile.ctxprofdata +; RUN: llvm-ctxprof-util fromYAML --input=%t/profile.yaml --output=%t/profile.ctxprofdata ; RUN: opt -passes='require,print' -ctx-profile-printer-level=everything \ ; RUN: %t/example.ll -S 2>&1 | FileCheck %s --check-prefix=NO-CTX @@ -23,38 +23,16 @@ ; ; This is the reference profile, laid out in the format the json formatter will ; output it from opt. -;--- profile.json -[ - { - "Counters": [ - 9 - ], - "Guid": 12341 - }, - { - "Counters": [ - 5 - ], - "Guid": 12074870348631550642 - }, - { - "Callsites": [ - [ - { - "Counters": [ - 6, - 7 - ], - "Guid": 728453322856651412 - } - ] - ], - "Counters": [ - 1 - ], - "Guid": 11872291593386833696 - } -] +;--- profile.yaml +- Guid: 12341 + Counters: [9] +- Guid: 12074870348631550642 + Counters: [5] +- Guid: 11872291593386833696 + Counters: [1] + Callsites: - + - Guid: 728453322856651412 + Counters: [6, 7] ;--- expected-profile-output.txt Function Info: 4909520559318251808 : an_entrypoint. MaxCounterID: 2. 
MaxCallsiteID: 1 diff --git a/llvm/test/ThinLTO/X86/ctxprof.ll b/llvm/test/ThinLTO/X86/ctxprof.ll index 4baea3b25890e..fd325dad5ada1 100644 --- a/llvm/test/ThinLTO/X86/ctxprof.ll +++ b/llvm/test/ThinLTO/X86/ctxprof.ll @@ -55,8 +55,8 @@ ; RUN: echo '[ \ ; RUN: {"Guid": 6019442868614718803, "Counters": [1], "Callsites": [[{"Guid": 15593096274670919754, "Counters": [1]}]]}, \ ; RUN: {"Guid": 15593096274670919754, "Counters": [1], "Callsites": [[{"Guid": 6019442868614718803, "Counters": [1]}]]} \ -; RUN: ]' > %t_exp/ctxprof.json -; RUN: llvm-ctxprof-util fromJSON --input %t_exp/ctxprof.json --output %t_exp/ctxprof.bitstream +; RUN: ]' > %t_exp/ctxprof.yaml +; RUN: llvm-ctxprof-util fromYAML --input %t_exp/ctxprof.yaml --output %t_exp/ctxprof.bitstream ; RUN: llvm-lto2 run %t/m1-instr.bc %t/m2-instr.bc \ ; RUN: -o %t_exp/result.o -save-temps \ ; RUN: -use-ctx-profile=%t_exp/ctxprof.bitstream \ diff --git a/llvm/test/Transforms/EliminateAvailableExternally/transform-to-local.ll b/llvm/test/Transforms/EliminateAvailableExternally/transform-to-local.ll index 4908fba62e3bf..ad10c15503097 100644 --- a/llvm/test/Transforms/EliminateAvailableExternally/transform-to-local.ll +++ b/llvm/test/Transforms/EliminateAvailableExternally/transform-to-local.ll @@ -1,7 +1,7 @@ ; REQUIRES: asserts ; RUN: opt -passes=elim-avail-extern -avail-extern-to-local -stats -S 2>&1 < %s | FileCheck %s ; -; RUN: echo '[{"Guid":1234, "Counters": [1]}]' | llvm-ctxprof-util fromJSON --input=- --output=%t_profile.ctxprofdata +; RUN: echo '[{"Guid":1234, "Counters": [1]}]' | llvm-ctxprof-util fromYAML --input=- --output=%t_profile.ctxprofdata ; ; Because we pass a contextual profile with a root defined in this module, we expect the outcome to be the same as-if ; we passed -avail-extern-to-local, i.e. available_externally don't get elided and instead get converted to local linkage @@ -9,7 +9,7 @@ ; If the profile doesn't apply to this module, available_externally won't get converted to internal linkage, and will be ; removed instead. 
-; RUN: echo '[{"Guid":5678, "Counters": [1]}]' | llvm-ctxprof-util fromJSON --input=- --output=%t_profile_bad.ctxprofdata
+; RUN: echo '[{"Guid":5678, "Counters": [1]}]' | llvm-ctxprof-util fromYAML --input=- --output=%t_profile_bad.ctxprofdata
 ; RUN: opt -passes='assign-guid,require<ctx-prof-analysis>,elim-avail-extern' -use-ctx-profile=%t_profile_bad.ctxprofdata -stats -S 2>&1 < %s | FileCheck %s --check-prefix=NOOP
 
 declare void @call_out(ptr %fct)
diff --git a/llvm/test/tools/llvm-ctxprof-util/Inputs/bad.json b/llvm/test/tools/llvm-ctxprof-util/Inputs/bad.json
deleted file mode 100644
index 35c169002386e..0000000000000
--- a/llvm/test/tools/llvm-ctxprof-util/Inputs/bad.json
+++ /dev/null
@@ -1 +0,0 @@
-[{]
diff --git a/llvm/test/tools/llvm-ctxprof-util/Inputs/bad.yaml b/llvm/test/tools/llvm-ctxprof-util/Inputs/bad.yaml
new file mode 100644
index 0000000000000..54fc332d7c5e6
--- /dev/null
+++ b/llvm/test/tools/llvm-ctxprof-util/Inputs/bad.yaml
@@ -0,0 +1 @@
+- f
diff --git a/llvm/test/tools/llvm-ctxprof-util/Inputs/empty.json b/llvm/test/tools/llvm-ctxprof-util/Inputs/empty.json
deleted file mode 100644
index fe51488c7066f..0000000000000
--- a/llvm/test/tools/llvm-ctxprof-util/Inputs/empty.json
+++ /dev/null
@@ -1 +0,0 @@
-[]
diff --git a/llvm/test/tools/llvm-ctxprof-util/Inputs/empty.yaml b/llvm/test/tools/llvm-ctxprof-util/Inputs/empty.yaml
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-bad-subctx.json b/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-bad-subctx.json
deleted file mode 100644
index b47e0ee1a04ba..0000000000000
--- a/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-bad-subctx.json
+++ /dev/null
@@ -1,8 +0,0 @@
-[{
-  "Guid": 123,
-  "Counters": [1, 2],
-  "Callsites":
-  [
-    {"Guid": 1}
-  ]
-}]
diff --git a/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-bad-subctx.yaml b/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-bad-subctx.yaml
new file mode 100644
index 0000000000000..2c2527d75ad2a
--- /dev/null
+++ b/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-bad-subctx.yaml
@@ -0,0 +1,4 @@
+- Guid: 123
+  Counters: [1, 2]
+  Callsites: - Guid: 1
+
diff --git a/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-counters.json b/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-counters.json
deleted file mode 100644
index 95cdd45a5a0f7..0000000000000
--- a/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-counters.json
+++ /dev/null
@@ -1,5 +0,0 @@
-[
-  {
-    "Guid": 1231
-  }
-]
diff --git a/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-counters.yaml b/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-counters.yaml
new file mode 100644
index 0000000000000..7944d92e62ab7
--- /dev/null
+++ b/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-counters.yaml
@@ -0,0 +1 @@
+- Guid: 1231
diff --git a/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-ctx.json b/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-ctx.json
deleted file mode 100644
index 93d51406d63fb..0000000000000
--- a/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-ctx.json
+++ /dev/null
@@ -1 +0,0 @@
-[{}]
diff --git a/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-ctx.yaml b/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-ctx.yaml
new file mode 100644
index 0000000000000..3cf20d57b0b82
--- /dev/null
+++ b/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-ctx.yaml
@@ -0,0 +1 @@
+-
\ No newline at end of file
diff --git a/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-vector.json
b/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-vector.json deleted file mode 100644 index 0967ef424bce6..0000000000000 --- a/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-vector.json +++ /dev/null @@ -1 +0,0 @@ -{} diff --git a/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-vector.yaml b/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-vector.yaml new file mode 100644 index 0000000000000..362277183dec9 --- /dev/null +++ b/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-vector.yaml @@ -0,0 +1 @@ +Guid: 1 diff --git a/llvm/test/tools/llvm-ctxprof-util/Inputs/valid.json b/llvm/test/tools/llvm-ctxprof-util/Inputs/valid.json deleted file mode 100644 index d4a6da4142110..0000000000000 --- a/llvm/test/tools/llvm-ctxprof-util/Inputs/valid.json +++ /dev/null @@ -1,47 +0,0 @@ -[ - { - "Guid": 1000, - "Counters": [ - 1, - 2, - 3 - ], - "Callsites": [ - [], - [ - { - "Guid": 2000, - "Counters": [ - 4, - 5 - ] - }, - { - "Guid": 18446744073709551613, - "Counters": [ - 6, - 7, - 8 - ] - } - ], - [ - { - "Guid": 3000, - "Counters": [ - 40, - 50 - ] - } - ] - ] - }, - { - "Guid": 18446744073709551612, - "Counters": [ - 5, - 9, - 10 - ] - } -] diff --git a/llvm/test/tools/llvm-ctxprof-util/Inputs/valid.yaml b/llvm/test/tools/llvm-ctxprof-util/Inputs/valid.yaml new file mode 100644 index 0000000000000..6080c2414d64a --- /dev/null +++ b/llvm/test/tools/llvm-ctxprof-util/Inputs/valid.yaml @@ -0,0 +1,13 @@ +- Guid: 1000 + Counters: [1, 2, 3] + Callsites: - [] + - + - Guid: 2000 + Counters: [4, 5] + - Guid: 18446744073709551613 + Counters: [6, 7, 8] + - + - Guid: 3000 + Counters: [40, 50] +- Guid: 18446744073709551612 + Counters: [5, 9, 10] diff --git a/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util-negative.test b/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util-negative.test index e8668a7f198a9..d1f20ffdbc1c4 100644 --- a/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util-negative.test +++ b/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util-negative.test @@ -1,24 +1,24 @@ ; REQUIRES: x86_64-linux -; RUN: not llvm-ctxprof-util nofile.json 2>&1 | FileCheck %s --check-prefix=NO_CMD -; RUN: not llvm-ctxprof-util invalidCmd --input nofile.json 2>&1 | FileCheck %s --check-prefix=INVALID_CMD -; RUN: not llvm-ctxprof-util fromJSON nofile.json 2>&1 | FileCheck %s --check-prefix=NO_FLAG -; RUN: not llvm-ctxprof-util fromJSON --input nofile.json 2>&1 | FileCheck -DMSG=%errc_ENOENT %s --check-prefix=NO_FILE -; RUN: not llvm-ctxprof-util fromJSON --input %S/Inputs/bad.json 2>&1 | FileCheck %s --check-prefix=BAD_JSON -; RUN: not llvm-ctxprof-util fromJSON --input %S/Inputs/invalid-no-vector.json 2>&1 | FileCheck %s --check-prefix=NO_VECTOR -; RUN: not llvm-ctxprof-util fromJSON --input %S/Inputs/invalid-no-ctx.json 2>&1 | FileCheck %s --check-prefix=NO_CTX -; RUN: not llvm-ctxprof-util fromJSON --input %S/Inputs/invalid-no-counters.json 2>&1 | FileCheck %s --check-prefix=NO_COUNTERS -; RUN: not llvm-ctxprof-util fromJSON --input %S/Inputs/invalid-bad-subctx.json 2>&1 | FileCheck %s --check-prefix=BAD_SUBCTX +; RUN: not llvm-ctxprof-util nofile.yaml 2>&1 | FileCheck %s --check-prefix=NO_CMD +; RUN: not llvm-ctxprof-util invalidCmd --input nofile.yaml 2>&1 | FileCheck %s --check-prefix=INVALID_CMD +; RUN: not llvm-ctxprof-util fromYAML nofile.yaml 2>&1 | FileCheck %s --check-prefix=NO_FLAG +; RUN: not llvm-ctxprof-util fromYAML --input nofile.yaml 2>&1 | FileCheck -DMSG=%errc_ENOENT %s --check-prefix=NO_FILE +; RUN: not llvm-ctxprof-util fromYAML --input %S/Inputs/bad.yaml 2>&1 | FileCheck 
%s --check-prefix=BAD_FORMAT +; RUN: not llvm-ctxprof-util fromYAML --input %S/Inputs/invalid-no-vector.yaml 2>&1 | FileCheck %s --check-prefix=NO_VECTOR +; RUN: not llvm-ctxprof-util fromYAML --input %S/Inputs/invalid-no-ctx.yaml 2>&1 | FileCheck %s --check-prefix=NO_CTX +; RUN: not llvm-ctxprof-util fromYAML --input %S/Inputs/invalid-no-counters.yaml 2>&1 | FileCheck %s --check-prefix=NO_COUNTERS +; RUN: not llvm-ctxprof-util fromYAML --input %S/Inputs/invalid-bad-subctx.yaml 2>&1 | FileCheck %s --check-prefix=BAD_SUBCTX ; RUN: rm -rf %t -; RUN: not llvm-ctxprof-util fromJSON --input %S/Inputs/valid.json --output %t/output.bitstream 2>&1 | FileCheck %s --check-prefix=NO_DIR +; RUN: not llvm-ctxprof-util fromYAML --input %S/Inputs/valid.yaml --output %t/output.bitstream 2>&1 | FileCheck %s --check-prefix=NO_DIR -; NO_CMD: Unknown subcommand 'nofile.json' +; NO_CMD: Unknown subcommand 'nofile.yaml' ; INVALID_CMD: Unknown subcommand 'invalidCmd' -; NO_FLAG: Unknown command line argument 'nofile.json'. -; NO_FILE: 'nofile.json': [[MSG]] -; BAD_JSON: Expected object key -; NO_VECTOR: expected array -; NO_CTX: missing value at (root)[0].Guid -; NO_COUNTERS: missing value at (root)[0].Counters -; BAD_SUBCTX: expected array at (root)[0].Callsites[0] +; NO_FLAG: Unknown command line argument 'nofile.yaml'. +; NO_FILE: 'nofile.yaml': [[MSG]] +; BAD_FORMAT: YAML:1:3: error: not a mapping +; NO_VECTOR: YAML:1:1: error: not a sequence +; NO_CTX: YAML:1:2: error: not a mapping +; NO_COUNTERS: YAML:1:3: error: missing required key 'Counters' +; BAD_SUBCTX: YAML:3:16: error: not a sequence ; NO_DIR: failed to open output diff --git a/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util.test b/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util.test index 5a21bffa59022..91ebd1de59bb5 100644 --- a/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util.test +++ b/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util.test @@ -1,15 +1,15 @@ ; REQUIRES: x86_64-linux ; RUN: mkdir -p %t -; RUN: llvm-ctxprof-util fromJSON --input %S/Inputs/empty.json -output %t/empty.bitstream +; RUN: llvm-ctxprof-util fromYAML --input %S/Inputs/empty.yaml -output %t/empty.bitstream ; RUN: llvm-bcanalyzer --dump %t/empty.bitstream | FileCheck %s --check-prefix=EMPTY -; RUN: llvm-ctxprof-util fromJSON --input %S/Inputs/valid.json -output %t/valid.bitstream +; RUN: llvm-ctxprof-util fromYAML --input %S/Inputs/valid.yaml -output %t/valid.bitstream ; For the valid case, check against a reference output. ; Note that uint64_t are printed as signed values by llvm-bcanalyzer: -; * 18446744073709551613 in json is -3 in the output -; * 18446744073709551612 in json is -4 in the output +; * 18446744073709551613 in yaml is -3 in the output +; * 18446744073709551612 in yaml is -4 in the output ; Also we have no callee/context at index 0, 2 callsites for index 1, and one for ; index 2. 
; RUN: llvm-bcanalyzer --dump %t/valid.bitstream | FileCheck %s --check-prefix=VALID
diff --git a/llvm/tools/llvm-ctxprof-util/llvm-ctxprof-util.cpp b/llvm/tools/llvm-ctxprof-util/llvm-ctxprof-util.cpp
index 2cf6d7613bdc9..cfa14b22c1469 100644
--- a/llvm/tools/llvm-ctxprof-util/llvm-ctxprof-util.cpp
+++ b/llvm/tools/llvm-ctxprof-util/llvm-ctxprof-util.cpp
@@ -22,7 +22,7 @@
 
 using namespace llvm;
 
-static cl::SubCommand FromJSON("fromJSON", "Convert from json");
+static cl::SubCommand FromYAML("fromYAML", "Convert from yaml");
 
 static cl::opt<std::string> InputFilename(
     "input", cl::value_desc("input"), cl::init("-"),
@@ -35,15 +35,15 @@ static cl::opt<std::string> InputFilename(
         "'Contexts', optional. An array containing arrays of contexts. The "
         "context array at a position 'i' is the set of callees at that "
         "callsite index. Use an empty array to indicate no callees."),
-    cl::sub(FromJSON));
+    cl::sub(FromYAML));
 
 static cl::opt<std::string> OutputFilename("output", cl::value_desc("output"),
                                            cl::init("-"),
                                            cl::desc("Output file"),
-                                           cl::sub(FromJSON));
+                                           cl::sub(FromYAML));
 
 // Save the bitstream profile from the JSON representation.
-Error convertFromJSON() {
+Error convertFromYAML() {
   auto BufOrError =
       MemoryBuffer::getFileOrSTDIN(InputFilename, /*IsText=*/true);
   if (!BufOrError)
@@ -58,14 +58,14 @@ Error convertFromJSON() {
   if (EC)
     return createStringError(EC, "failed to open output");
 
-  return llvm::createCtxProfFromJSON(BufOrError.get()->getBuffer(), Out);
+  return llvm::createCtxProfFromYAML(BufOrError.get()->getBuffer(), Out);
 }
 
 int main(int argc, const char **argv) {
   cl::ParseCommandLineOptions(argc, argv, "LLVM Contextual Profile Utils\n");
   ExitOnError ExitOnErr("llvm-ctxprof-util: ");
-  if (FromJSON) {
-    if (auto E = convertFromJSON()) {
+  if (FromYAML) {
+    if (auto E = convertFromYAML()) {
       handleAllErrors(std::move(E), [&](const ErrorInfoBase &E) {
         E.log(errs());
         errs() << "\n";
diff --git a/llvm/unittests/Transforms/Utils/CallPromotionUtilsTest.cpp b/llvm/unittests/Transforms/Utils/CallPromotionUtilsTest.cpp
index dcb1c10433ccf..4420a6d065499 100644
--- a/llvm/unittests/Transforms/Utils/CallPromotionUtilsTest.cpp
+++ b/llvm/unittests/Transforms/Utils/CallPromotionUtilsTest.cpp
@@ -547,7 +547,7 @@ define i32 @f4() !guid !3 {
   raw_fd_stream Out(ProfileFile.path(), EC);
   ASSERT_FALSE(EC); // "False" means no error.
-  ASSERT_FALSE(llvm::createCtxProfFromJSON(Profile, Out));
+  ASSERT_FALSE(llvm::createCtxProfFromYAML(Profile, Out));
 }
 
 ModuleAnalysisManager MAM;

From 8af4d206e0f979f68925a08f9dffd60a98ce97e2 Mon Sep 17 00:00:00 2001
From: Vitaly Buka
Date: Fri, 10 Jan 2025 18:11:19 -0800
Subject: [PATCH 134/408] [NFCI][BoundsChecking] Apply nosanitize on
 local-bounds instrumentation (#122416)

Should be NFCI, as we run sanitizers, like msan, before local-bounds.
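
For illustration, here is a minimal sketch (not part of the patch) of the mechanism the new builder relies on: once SetNoSanitizeMetadata() is called on an IRBuilder, every instruction it subsequently creates carries !nosanitize by default, so sanitizer passes that run later skip the instrumentation. The function and condition value below are hypothetical.

```
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Hypothetical helper: emit instrumentation that sanitizers will ignore.
void emitNoSanitizeAssume(Function &F, Value *Cond) {
  BasicBlock &Entry = F.getEntryBlock();
  IRBuilder<> IRB(&Entry, Entry.begin());
  // From here on, every instruction this builder creates is tagged with
  // !nosanitize metadata automatically.
  IRB.SetNoSanitizeMetadata();
  IRB.CreateAssumption(Cond); // carries !nosanitize without extra code
}
```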
---
 clang/test/CodeGen/allow-ubsan-check.c        | 12 +--
 .../Instrumentation/BoundsChecking.cpp        |  8 +-
 .../BoundsChecking/runtimes.ll                | 98 +++++++++++--------
 3 files changed, 69 insertions(+), 49 deletions(-)

diff --git a/clang/test/CodeGen/allow-ubsan-check.c b/clang/test/CodeGen/allow-ubsan-check.c
index a994e700115ff..fb264ce32ab99 100644
--- a/clang/test/CodeGen/allow-ubsan-check.c
+++ b/clang/test/CodeGen/allow-ubsan-check.c
@@ -181,8 +181,8 @@ void use(double*);
 // CHECK-NEXT:    [[TMP2:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA8:![0-9]+]]
 // CHECK-NEXT:    ret double [[TMP2]]
 // CHECK:       [[TRAP]]:
-// CHECK-NEXT:    call void @__ubsan_handle_local_out_of_bounds_abort() #[[ATTR6]]
-// CHECK-NEXT:    unreachable
+// CHECK-NEXT:    call void @__ubsan_handle_local_out_of_bounds_abort() #[[ATTR6]], !nosanitize [[META2]]
+// CHECK-NEXT:    unreachable, !nosanitize [[META2]]
 //
 // TR-LABEL: define dso_local double @lbounds(
 // TR-SAME: i32 noundef [[B:%.*]], i32 noundef [[I:%.*]]) local_unnamed_addr #[[ATTR0]] {
@@ -198,8 +198,8 @@ void use(double*);
 // TR-NEXT:    [[TMP2:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA7:![0-9]+]]
 // TR-NEXT:    ret double [[TMP2]]
 // TR:       [[TRAP]]:
-// TR-NEXT:    call void @llvm.ubsantrap(i8 3) #[[ATTR5]]
-// TR-NEXT:    unreachable
+// TR-NEXT:    call void @llvm.ubsantrap(i8 3) #[[ATTR5]], !nosanitize [[META2]]
+// TR-NEXT:    unreachable, !nosanitize [[META2]]
 //
 // REC-LABEL: define dso_local double @lbounds(
 // REC-SAME: i32 noundef [[B:%.*]], i32 noundef [[I:%.*]]) local_unnamed_addr #[[ATTR0]] {
@@ -215,8 +215,8 @@ void use(double*);
 // REC-NEXT:    [[TMP2:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA8:![0-9]+]]
 // REC-NEXT:    ret double [[TMP2]]
 // REC:       [[TRAP]]:
-// REC-NEXT:    call void @__ubsan_handle_local_out_of_bounds() #[[ATTR6]]
-// REC-NEXT:    br label %[[BB1]]
+// REC-NEXT:    call void @__ubsan_handle_local_out_of_bounds() #[[ATTR6]], !nosanitize [[META2]]
+// REC-NEXT:    br label %[[BB1]], !nosanitize [[META2]]
 //
 double lbounds(int b, int i) {
   double a[b];
diff --git a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
index 10596f87fbcab..8004552250b47 100644
--- a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
+++ b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
@@ -41,7 +41,13 @@ STATISTIC(ChecksAdded, "Bounds checks added");
 STATISTIC(ChecksSkipped, "Bounds checks skipped");
 STATISTIC(ChecksUnable, "Bounds checks unable to add");
 
-using BuilderTy = IRBuilder<TargetFolder>;
+class BuilderTy : public IRBuilder<TargetFolder> {
+public:
+  BuilderTy(BasicBlock *TheBB, BasicBlock::iterator IP, TargetFolder Folder)
+      : IRBuilder<TargetFolder>(TheBB, IP, Folder) {
+    SetNoSanitizeMetadata();
+  }
+};
 
 /// Gets the conditions under which memory accessing instructions will overflow.
/// diff --git a/llvm/test/Instrumentation/BoundsChecking/runtimes.ll b/llvm/test/Instrumentation/BoundsChecking/runtimes.ll index 6695e0fa549fa..ccc7e93615fed 100644 --- a/llvm/test/Instrumentation/BoundsChecking/runtimes.ll +++ b/llvm/test/Instrumentation/BoundsChecking/runtimes.ll @@ -16,113 +16,113 @@ define void @f1(i64 %x) nounwind { ; TR-SAME: i64 [[X:%.*]]) #[[ATTR0:[0-9]+]] { ; TR-NEXT: [[TMP1:%.*]] = mul i64 16, [[X]] ; TR-NEXT: [[TMP2:%.*]] = alloca i128, i64 [[X]], align 8 -; TR-NEXT: [[TMP3:%.*]] = sub i64 [[TMP1]], 0 -; TR-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 16 -; TR-NEXT: [[TMP5:%.*]] = or i1 false, [[TMP4]] -; TR-NEXT: [[TMP6:%.*]] = or i1 false, [[TMP5]] +; TR-NEXT: [[TMP3:%.*]] = sub i64 [[TMP1]], 0, !nosanitize [[META0:![0-9]+]] +; TR-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 16, !nosanitize [[META0]] +; TR-NEXT: [[TMP5:%.*]] = or i1 false, [[TMP4]], !nosanitize [[META0]] +; TR-NEXT: [[TMP6:%.*]] = or i1 false, [[TMP5]], !nosanitize [[META0]] ; TR-NEXT: br i1 [[TMP6]], label %[[TRAP:.*]], label %[[BB7:.*]] ; TR: [[BB7]]: ; TR-NEXT: [[TMP8:%.*]] = load i128, ptr [[TMP2]], align 4 ; TR-NEXT: ret void ; TR: [[TRAP]]: -; TR-NEXT: call void @llvm.trap() #[[ATTR2:[0-9]+]] -; TR-NEXT: unreachable +; TR-NEXT: call void @llvm.trap() #[[ATTR2:[0-9]+]], !nosanitize [[META0]] +; TR-NEXT: unreachable, !nosanitize [[META0]] ; ; RT-LABEL: define void @f1( ; RT-SAME: i64 [[X:%.*]]) #[[ATTR0:[0-9]+]] { ; RT-NEXT: [[TMP1:%.*]] = mul i64 16, [[X]] ; RT-NEXT: [[TMP2:%.*]] = alloca i128, i64 [[X]], align 8 -; RT-NEXT: [[TMP3:%.*]] = sub i64 [[TMP1]], 0 -; RT-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 16 -; RT-NEXT: [[TMP5:%.*]] = or i1 false, [[TMP4]] -; RT-NEXT: [[TMP6:%.*]] = or i1 false, [[TMP5]] +; RT-NEXT: [[TMP3:%.*]] = sub i64 [[TMP1]], 0, !nosanitize [[META0:![0-9]+]] +; RT-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 16, !nosanitize [[META0]] +; RT-NEXT: [[TMP5:%.*]] = or i1 false, [[TMP4]], !nosanitize [[META0]] +; RT-NEXT: [[TMP6:%.*]] = or i1 false, [[TMP5]], !nosanitize [[META0]] ; RT-NEXT: br i1 [[TMP6]], label %[[TRAP:.*]], label %[[BB7:.*]] ; RT: [[BB7]]: ; RT-NEXT: [[TMP8:%.*]] = load i128, ptr [[TMP2]], align 4 ; RT-NEXT: ret void ; RT: [[TRAP]]: -; RT-NEXT: call void @__ubsan_handle_local_out_of_bounds() #[[ATTR0]] -; RT-NEXT: br label %[[BB7]] +; RT-NEXT: call void @__ubsan_handle_local_out_of_bounds() #[[ATTR0]], !nosanitize [[META0]] +; RT-NEXT: br label %[[BB7]], !nosanitize [[META0]] ; ; TR-NOMERGE-LABEL: define void @f1( ; TR-NOMERGE-SAME: i64 [[X:%.*]]) #[[ATTR0:[0-9]+]] { ; TR-NOMERGE-NEXT: [[TMP1:%.*]] = mul i64 16, [[X]] ; TR-NOMERGE-NEXT: [[TMP2:%.*]] = alloca i128, i64 [[X]], align 8 -; TR-NOMERGE-NEXT: [[TMP3:%.*]] = sub i64 [[TMP1]], 0 -; TR-NOMERGE-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 16 -; TR-NOMERGE-NEXT: [[TMP5:%.*]] = or i1 false, [[TMP4]] -; TR-NOMERGE-NEXT: [[TMP6:%.*]] = or i1 false, [[TMP5]] +; TR-NOMERGE-NEXT: [[TMP3:%.*]] = sub i64 [[TMP1]], 0, !nosanitize [[META0:![0-9]+]] +; TR-NOMERGE-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 16, !nosanitize [[META0]] +; TR-NOMERGE-NEXT: [[TMP5:%.*]] = or i1 false, [[TMP4]], !nosanitize [[META0]] +; TR-NOMERGE-NEXT: [[TMP6:%.*]] = or i1 false, [[TMP5]], !nosanitize [[META0]] ; TR-NOMERGE-NEXT: br i1 [[TMP6]], label %[[TRAP:.*]], label %[[BB7:.*]] ; TR-NOMERGE: [[BB7]]: ; TR-NOMERGE-NEXT: [[TMP8:%.*]] = load i128, ptr [[TMP2]], align 4 ; TR-NOMERGE-NEXT: ret void ; TR-NOMERGE: [[TRAP]]: -; TR-NOMERGE-NEXT: call void @llvm.ubsantrap(i8 3) #[[ATTR2:[0-9]+]] -; TR-NOMERGE-NEXT: unreachable +; 
TR-NOMERGE-NEXT: call void @llvm.ubsantrap(i8 3) #[[ATTR2:[0-9]+]], !nosanitize [[META0]] +; TR-NOMERGE-NEXT: unreachable, !nosanitize [[META0]] ; ; RT-NOMERGE-LABEL: define void @f1( ; RT-NOMERGE-SAME: i64 [[X:%.*]]) #[[ATTR0:[0-9]+]] { ; RT-NOMERGE-NEXT: [[TMP1:%.*]] = mul i64 16, [[X]] ; RT-NOMERGE-NEXT: [[TMP2:%.*]] = alloca i128, i64 [[X]], align 8 -; RT-NOMERGE-NEXT: [[TMP3:%.*]] = sub i64 [[TMP1]], 0 -; RT-NOMERGE-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 16 -; RT-NOMERGE-NEXT: [[TMP5:%.*]] = or i1 false, [[TMP4]] -; RT-NOMERGE-NEXT: [[TMP6:%.*]] = or i1 false, [[TMP5]] +; RT-NOMERGE-NEXT: [[TMP3:%.*]] = sub i64 [[TMP1]], 0, !nosanitize [[META0:![0-9]+]] +; RT-NOMERGE-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 16, !nosanitize [[META0]] +; RT-NOMERGE-NEXT: [[TMP5:%.*]] = or i1 false, [[TMP4]], !nosanitize [[META0]] +; RT-NOMERGE-NEXT: [[TMP6:%.*]] = or i1 false, [[TMP5]], !nosanitize [[META0]] ; RT-NOMERGE-NEXT: br i1 [[TMP6]], label %[[TRAP:.*]], label %[[BB7:.*]] ; RT-NOMERGE: [[BB7]]: ; RT-NOMERGE-NEXT: [[TMP8:%.*]] = load i128, ptr [[TMP2]], align 4 ; RT-NOMERGE-NEXT: ret void ; RT-NOMERGE: [[TRAP]]: -; RT-NOMERGE-NEXT: call void @__ubsan_handle_local_out_of_bounds() #[[ATTR1:[0-9]+]] -; RT-NOMERGE-NEXT: br label %[[BB7]] +; RT-NOMERGE-NEXT: call void @__ubsan_handle_local_out_of_bounds() #[[ATTR1:[0-9]+]], !nosanitize [[META0]] +; RT-NOMERGE-NEXT: br label %[[BB7]], !nosanitize [[META0]] ; ; RTABORT-NOMERGE-LABEL: define void @f1( ; RTABORT-NOMERGE-SAME: i64 [[X:%.*]]) #[[ATTR0:[0-9]+]] { ; RTABORT-NOMERGE-NEXT: [[TMP1:%.*]] = mul i64 16, [[X]] ; RTABORT-NOMERGE-NEXT: [[TMP2:%.*]] = alloca i128, i64 [[X]], align 8 -; RTABORT-NOMERGE-NEXT: [[TMP3:%.*]] = sub i64 [[TMP1]], 0 -; RTABORT-NOMERGE-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 16 -; RTABORT-NOMERGE-NEXT: [[TMP5:%.*]] = or i1 false, [[TMP4]] -; RTABORT-NOMERGE-NEXT: [[TMP6:%.*]] = or i1 false, [[TMP5]] +; RTABORT-NOMERGE-NEXT: [[TMP3:%.*]] = sub i64 [[TMP1]], 0, !nosanitize [[META0:![0-9]+]] +; RTABORT-NOMERGE-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 16, !nosanitize [[META0]] +; RTABORT-NOMERGE-NEXT: [[TMP5:%.*]] = or i1 false, [[TMP4]], !nosanitize [[META0]] +; RTABORT-NOMERGE-NEXT: [[TMP6:%.*]] = or i1 false, [[TMP5]], !nosanitize [[META0]] ; RTABORT-NOMERGE-NEXT: br i1 [[TMP6]], label %[[TRAP:.*]], label %[[BB7:.*]] ; RTABORT-NOMERGE: [[BB7]]: ; RTABORT-NOMERGE-NEXT: [[TMP8:%.*]] = load i128, ptr [[TMP2]], align 4 ; RTABORT-NOMERGE-NEXT: ret void ; RTABORT-NOMERGE: [[TRAP]]: -; RTABORT-NOMERGE-NEXT: call void @__ubsan_handle_local_out_of_bounds_abort() #[[ATTR2:[0-9]+]] -; RTABORT-NOMERGE-NEXT: unreachable +; RTABORT-NOMERGE-NEXT: call void @__ubsan_handle_local_out_of_bounds_abort() #[[ATTR2:[0-9]+]], !nosanitize [[META0]] +; RTABORT-NOMERGE-NEXT: unreachable, !nosanitize [[META0]] ; ; MINRT-NOMERGE-LABEL: define void @f1( ; MINRT-NOMERGE-SAME: i64 [[X:%.*]]) #[[ATTR0:[0-9]+]] { ; MINRT-NOMERGE-NEXT: [[TMP1:%.*]] = mul i64 16, [[X]] ; MINRT-NOMERGE-NEXT: [[TMP2:%.*]] = alloca i128, i64 [[X]], align 8 -; MINRT-NOMERGE-NEXT: [[TMP3:%.*]] = sub i64 [[TMP1]], 0 -; MINRT-NOMERGE-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 16 -; MINRT-NOMERGE-NEXT: [[TMP5:%.*]] = or i1 false, [[TMP4]] -; MINRT-NOMERGE-NEXT: [[TMP6:%.*]] = or i1 false, [[TMP5]] +; MINRT-NOMERGE-NEXT: [[TMP3:%.*]] = sub i64 [[TMP1]], 0, !nosanitize [[META0:![0-9]+]] +; MINRT-NOMERGE-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 16, !nosanitize [[META0]] +; MINRT-NOMERGE-NEXT: [[TMP5:%.*]] = or i1 false, [[TMP4]], !nosanitize [[META0]] +; 
MINRT-NOMERGE-NEXT: [[TMP6:%.*]] = or i1 false, [[TMP5]], !nosanitize [[META0]]
; MINRT-NOMERGE-NEXT: br i1 [[TMP6]], label %[[TRAP:.*]], label %[[BB7:.*]]
; MINRT-NOMERGE: [[BB7]]:
; MINRT-NOMERGE-NEXT: [[TMP8:%.*]] = load i128, ptr [[TMP2]], align 4
; MINRT-NOMERGE-NEXT: ret void
; MINRT-NOMERGE: [[TRAP]]:
-; MINRT-NOMERGE-NEXT: call void @__ubsan_handle_local_out_of_bounds_minimal() #[[ATTR1:[0-9]+]]
-; MINRT-NOMERGE-NEXT: br label %[[BB7]]
+; MINRT-NOMERGE-NEXT: call void @__ubsan_handle_local_out_of_bounds_minimal() #[[ATTR1:[0-9]+]], !nosanitize [[META0]]
+; MINRT-NOMERGE-NEXT: br label %[[BB7]], !nosanitize [[META0]]
;
; MINRTABORT-NOMERGE-LABEL: define void @f1(
; MINRTABORT-NOMERGE-SAME: i64 [[X:%.*]]) #[[ATTR0:[0-9]+]] {
; MINRTABORT-NOMERGE-NEXT: [[TMP1:%.*]] = mul i64 16, [[X]]
; MINRTABORT-NOMERGE-NEXT: [[TMP2:%.*]] = alloca i128, i64 [[X]], align 8
-; MINRTABORT-NOMERGE-NEXT: [[TMP3:%.*]] = sub i64 [[TMP1]], 0
-; MINRTABORT-NOMERGE-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 16
-; MINRTABORT-NOMERGE-NEXT: [[TMP5:%.*]] = or i1 false, [[TMP4]]
-; MINRTABORT-NOMERGE-NEXT: [[TMP6:%.*]] = or i1 false, [[TMP5]]
+; MINRTABORT-NOMERGE-NEXT: [[TMP3:%.*]] = sub i64 [[TMP1]], 0, !nosanitize [[META0:![0-9]+]]
+; MINRTABORT-NOMERGE-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 16, !nosanitize [[META0]]
+; MINRTABORT-NOMERGE-NEXT: [[TMP5:%.*]] = or i1 false, [[TMP4]], !nosanitize [[META0]]
+; MINRTABORT-NOMERGE-NEXT: [[TMP6:%.*]] = or i1 false, [[TMP5]], !nosanitize [[META0]]
; MINRTABORT-NOMERGE-NEXT: br i1 [[TMP6]], label %[[TRAP:.*]], label %[[BB7:.*]]
; MINRTABORT-NOMERGE: [[BB7]]:
; MINRTABORT-NOMERGE-NEXT: [[TMP8:%.*]] = load i128, ptr [[TMP2]], align 4
; MINRTABORT-NOMERGE-NEXT: ret void
; MINRTABORT-NOMERGE: [[TRAP]]:
-; MINRTABORT-NOMERGE-NEXT: call void @__ubsan_handle_local_out_of_bounds_minimal_abort() #[[ATTR2:[0-9]+]]
-; MINRTABORT-NOMERGE-NEXT: unreachable
+; MINRTABORT-NOMERGE-NEXT: call void @__ubsan_handle_local_out_of_bounds_minimal_abort() #[[ATTR2:[0-9]+]], !nosanitize [[META0]]
+; MINRTABORT-NOMERGE-NEXT: unreachable, !nosanitize [[META0]]
;
  %1 = alloca i128, i64 %x
  %3 = load i128, ptr %1, align 4
@@ -154,3 +154,17 @@ define void @f1(i64 %x) nounwind {
; MINRTABORT-NOMERGE: attributes #[[ATTR1:[0-9]+]] = { noreturn nounwind }
; MINRTABORT-NOMERGE: attributes #[[ATTR2]] = { nomerge noreturn nounwind }
;.
+; TR: [[META0]] = !{}
+;.
+; RT: [[META0]] = !{}
+;.
+; TR-NOMERGE: [[META0]] = !{}
+;.
+; RT-NOMERGE: [[META0]] = !{}
+;.
+; RTABORT-NOMERGE: [[META0]] = !{}
+;.
+; MINRT-NOMERGE: [[META0]] = !{}
+;.
+; MINRTABORT-NOMERGE: [[META0]] = !{}
+;.

From 9b528ed38038e39c441927b1fd0220654c253a3c Mon Sep 17 00:00:00 2001
From: GeorgeHuyubo <113479859+GeorgeHuyubo@users.noreply.github.com>
Date: Fri, 10 Jan 2025 18:13:46 -0800
Subject: [PATCH 135/408] Debuginfod cache use index cache settings and
 include real file name (#120814)

This PR includes two changes:

1. Change the debuginfod cache file name to include the original file
name; the new file name would be something like:
llvmcache-13267c5f5d2e3df472c133c8efa45fb3331ef1ea-liblzma.so.5.2.2.debuginfo.dwp
so it provides more information in the image list than a plain
llvmcache-123.
2. Switch the debuginfod cache to use the lldb index cache settings.
Currently we don't have proper settings for the cache path or the cache
expiration time of the debuginfod cache. We want to use the lldb index
cache settings, as they make sense to live in the same place and share
the same TTL.
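
As an illustration of change 1, a rough sketch of the naming scheme described above (the helper is hypothetical; the patch builds the name inline in GetFileForModule): the cache key becomes the lowercase hex build ID, with the original file name appended when one is known.

```
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include <cstdint>
#include <string>

// Hypothetical helper mirroring the cache-file naming described above.
std::string makeCacheFileName(llvm::ArrayRef<uint8_t> BuildID,
                              llvm::StringRef FileName) {
  std::string Name = llvm::toHex(BuildID, /*LowerCase=*/true);
  if (!FileName.empty())
    Name += "-" + FileName.str(); // e.g. "13267c5f...-liblzma.so.5.2.2.debuginfo.dwp"
  return Name;
}
```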
---------

Co-authored-by: George Hu
---
 .../Debuginfod/SymbolLocatorDebuginfod.cpp    | 40 +++++++++++++++----
 llvm/include/llvm/Debuginfod/Debuginfod.h     |  4 +-
 llvm/lib/Debuginfod/Debuginfod.cpp            | 29 +++++++++-----
 llvm/unittests/Debuginfod/DebuginfodTests.cpp |  3 +-
 4 files changed, 58 insertions(+), 18 deletions(-)

diff --git a/lldb/source/Plugins/SymbolLocator/Debuginfod/SymbolLocatorDebuginfod.cpp b/lldb/source/Plugins/SymbolLocator/Debuginfod/SymbolLocatorDebuginfod.cpp
index 2cd7bbbb24490..905f4d783ac95 100644
--- a/lldb/source/Plugins/SymbolLocator/Debuginfod/SymbolLocatorDebuginfod.cpp
+++ b/lldb/source/Plugins/SymbolLocator/Debuginfod/SymbolLocatorDebuginfod.cpp
@@ -8,6 +8,7 @@
 
 #include "SymbolLocatorDebuginfod.h"
 
+#include "lldb/Core/DataFileCache.h"
 #include "lldb/Core/PluginManager.h"
 #include "lldb/Interpreter/OptionValueString.h"
 #include "lldb/Utility/Args.h"
@@ -141,6 +142,24 @@ SymbolLocator *SymbolLocatorDebuginfod::CreateInstance() {
   return new SymbolLocatorDebuginfod();
 }
 
+static llvm::StringRef getFileName(const ModuleSpec &module_spec,
+                                   std::string url_path) {
+  // Check if the URL path requests an executable file or a symbol file
+  bool is_executable = url_path.find("debuginfo") == std::string::npos;
+  if (is_executable)
+    return module_spec.GetFileSpec().GetFilename().GetStringRef();
+  llvm::StringRef symbol_file =
+      module_spec.GetSymbolFileSpec().GetFilename().GetStringRef();
+  // Remove llvmcache- prefix and hash, keep origin file name
+  if (symbol_file.starts_with("llvmcache-")) {
+    size_t pos = symbol_file.rfind('-');
+    if (pos != llvm::StringRef::npos) {
+      symbol_file = symbol_file.substr(pos + 1);
+    }
+  }
+  return symbol_file;
+}
+
 static std::optional<FileSpec>
 GetFileForModule(const ModuleSpec &module_spec,
                  std::function<std::string(llvm::object::BuildID)> UrlBuilder) {
@@ -154,11 +173,14 @@ GetFileForModule(const ModuleSpec &module_spec,
   // Grab LLDB's Debuginfod overrides from the
   // plugin.symbol-locator.debuginfod.* settings.
   PluginProperties &plugin_props = GetGlobalPluginProperties();
-  llvm::Expected<std::string> cache_path_or_err = plugin_props.GetCachePath();
-  // A cache location is *required*.
-  if (!cache_path_or_err)
-    return {};
-  std::string cache_path = *cache_path_or_err;
+  // Grab the lldb index cache settings from the global module list properties.
+  ModuleListProperties &properties =
+      ModuleList::GetGlobalModuleListProperties();
+  std::string cache_path = properties.GetLLDBIndexCachePath().GetPath();
+
+  llvm::CachePruningPolicy pruning_policy =
+      DataFileCache::GetLLDBIndexCachePolicy();
+
   llvm::SmallVector<llvm::StringRef> debuginfod_urls =
       llvm::getDefaultDebuginfodUrls();
   std::chrono::milliseconds timeout = plugin_props.GetTimeout();
 
   // We're ready to ask the Debuginfod library to find our file.
  llvm::object::BuildID build_id(module_uuid.GetBytes());
   std::string url_path = UrlBuilder(build_id);
-  std::string cache_key = llvm::getDebuginfodCacheKey(url_path);
+  llvm::StringRef file_name = getFileName(module_spec, url_path);
+  std::string cache_file_name = llvm::toHex(build_id, true);
+  if (!file_name.empty())
+    cache_file_name += "-" + file_name.str();
   llvm::Expected<std::string> result = llvm::getCachedOrDownloadArtifact(
-      cache_key, url_path, cache_path, debuginfod_urls, timeout);
+      cache_file_name, url_path, cache_path, debuginfod_urls, timeout,
+      pruning_policy);
   if (result)
     return FileSpec(*result);
 
diff --git a/llvm/include/llvm/Debuginfod/Debuginfod.h b/llvm/include/llvm/Debuginfod/Debuginfod.h
index 99fe15ad85979..aebcf31cd4822 100644
--- a/llvm/include/llvm/Debuginfod/Debuginfod.h
+++ b/llvm/include/llvm/Debuginfod/Debuginfod.h
@@ -25,6 +25,7 @@
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Object/BuildID.h"
+#include "llvm/Support/CachePruning.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Mutex.h"
@@ -95,7 +96,8 @@ Expected<std::string> getCachedOrDownloadArtifact(StringRef UniqueKey,
 /// found, uses the UniqueKey for the local cache file.
 Expected<std::string> getCachedOrDownloadArtifact(
     StringRef UniqueKey, StringRef UrlPath, StringRef CacheDirectoryPath,
-    ArrayRef<StringRef> DebuginfodUrls, std::chrono::milliseconds Timeout);
+    ArrayRef<StringRef> DebuginfodUrls, std::chrono::milliseconds Timeout,
+    llvm::CachePruningPolicy policy);
 
 class ThreadPoolInterface;
 
diff --git a/llvm/lib/Debuginfod/Debuginfod.cpp b/llvm/lib/Debuginfod/Debuginfod.cpp
index 4c785117ae8ef..17efaea892c66 100644
--- a/llvm/lib/Debuginfod/Debuginfod.cpp
+++ b/llvm/lib/Debuginfod/Debuginfod.cpp
@@ -106,6 +106,14 @@ Expected<std::string> getDefaultDebuginfodCacheDirectory() {
   return std::string(CacheDirectory);
 }
 
+Expected<CachePruningPolicy> getDefaultDebuginfodCachePruningPolicy() {
+  Expected<CachePruningPolicy> PruningPolicyOrErr =
+      parseCachePruningPolicy(std::getenv("DEBUGINFOD_CACHE_POLICY"));
+  if (!PruningPolicyOrErr)
+    return PruningPolicyOrErr.takeError();
+  return *PruningPolicyOrErr;
+}
+
 std::chrono::milliseconds getDefaultDebuginfodTimeout() {
   long Timeout;
   const char *DebuginfodTimeoutEnv = std::getenv("DEBUGINFOD_TIMEOUT");
@@ -169,9 +177,15 @@ Expected<std::string> getCachedOrDownloadArtifact(StringRef UniqueKey,
     return CacheDirOrErr.takeError();
   CacheDir = *CacheDirOrErr;
 
-  return getCachedOrDownloadArtifact(UniqueKey, UrlPath, CacheDir,
-                                     getDefaultDebuginfodUrls(),
-                                     getDefaultDebuginfodTimeout());
+  Expected<llvm::CachePruningPolicy> PruningPolicyOrErr =
+      getDefaultDebuginfodCachePruningPolicy();
+  if (!PruningPolicyOrErr)
+    return PruningPolicyOrErr.takeError();
+  llvm::CachePruningPolicy PruningPolicy = *PruningPolicyOrErr;
+
+  return getCachedOrDownloadArtifact(
+      UniqueKey, UrlPath, CacheDir, getDefaultDebuginfodUrls(),
+      getDefaultDebuginfodTimeout(), PruningPolicy);
 }
 
 namespace {
@@ -250,7 +264,8 @@ static SmallVector<std::string, 0> getHeaders() {
 
 Expected<std::string> getCachedOrDownloadArtifact(
     StringRef UniqueKey, StringRef UrlPath, StringRef CacheDirectoryPath,
-    ArrayRef<StringRef> DebuginfodUrls, std::chrono::milliseconds Timeout) {
+    ArrayRef<StringRef> DebuginfodUrls, std::chrono::milliseconds Timeout,
+    llvm::CachePruningPolicy policy) {
   SmallString<64> AbsCachedArtifactPath;
   sys::path::append(AbsCachedArtifactPath, CacheDirectoryPath,
                     "llvmcache-" + UniqueKey);
@@ -304,11 +319,7 @@ Expected<std::string> getCachedOrDownloadArtifact(
       continue;
     }
 
-    Expected<CachePruningPolicy> PruningPolicyOrErr =
-        parseCachePruningPolicy(std::getenv("DEBUGINFOD_CACHE_POLICY"));
-    if (!PruningPolicyOrErr)
-      return
PruningPolicyOrErr.takeError();
-    pruneCache(CacheDirectoryPath, *PruningPolicyOrErr);
+    pruneCache(CacheDirectoryPath, policy);
 
     // Return the path to the artifact on disk.
     return std::string(AbsCachedArtifactPath);
diff --git a/llvm/unittests/Debuginfod/DebuginfodTests.cpp b/llvm/unittests/Debuginfod/DebuginfodTests.cpp
index 5312912599e93..8dacf2ae5b3f8 100644
--- a/llvm/unittests/Debuginfod/DebuginfodTests.cpp
+++ b/llvm/unittests/Debuginfod/DebuginfodTests.cpp
@@ -37,6 +37,7 @@ TEST(DebuginfodClient, CacheHit) {
       sys::fs::createTemporaryFile("llvmcache-key", "temp", FD, CachedFilePath);
   StringRef CacheDir = sys::path::parent_path(CachedFilePath);
   StringRef UniqueKey = sys::path::filename(CachedFilePath);
+  llvm::CachePruningPolicy policy;
   EXPECT_TRUE(UniqueKey.consume_front("llvmcache-"));
   raw_fd_ostream OF(FD, true, /*unbuffered=*/true);
   OF << "contents\n";
@@ -44,7 +45,7 @@ TEST(DebuginfodClient, CacheHit) {
   OF.close();
   Expected<std::string> PathOrErr = getCachedOrDownloadArtifact(
       UniqueKey, /*UrlPath=*/"/null", CacheDir,
-      /*DebuginfodUrls=*/{}, /*Timeout=*/std::chrono::milliseconds(1));
+      /*DebuginfodUrls=*/{}, /*Timeout=*/std::chrono::milliseconds(1), policy);
   EXPECT_THAT_EXPECTED(PathOrErr, HasValue(CachedFilePath));
 }

From 82aac16ddda8444387422e550e10f1fba5bcfaa3 Mon Sep 17 00:00:00 2001
From: Jorge Gorbe Moya
Date: Fri, 10 Jan 2025 18:19:26 -0800
Subject: [PATCH 136/408] [bazel] Remove obsolete exclusion from glob.

lib/Tooling/NodeIntrospection.cpp was deleted in commit
371eccd5dfed88c8e76449233d8388c12be3464b so excluding it from the glob
does nothing.
---
 utils/bazel/llvm-project-overlay/clang/BUILD.bazel | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
index 8624028460975..128c348696a8f 100644
--- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
@@ -739,8 +739,8 @@ cc_library(
         ":basic_builtins_gen",
         ":basic_builtins_riscv_gen",
         ":basic_builtins_spirv_gen",
-        ":basic_builtins_x86_gen",
         ":basic_builtins_x86_64_gen",
+        ":basic_builtins_x86_gen",
         ":basic_internal_headers",
         ":basic_riscv_sifive_vector_builtins_gen",
        ":basic_riscv_vector_builtin_cg_gen",
@@ -1354,13 +1354,9 @@ cc_library(
 
 cc_library(
     name = "tooling",
-    srcs = glob(
-        [
-            "lib/Tooling/*.cpp",
-        ],
-        # Temporarily disable until we support the generated file.
-        exclude = ["lib/Tooling/NodeIntrospection.cpp"],
-    ),
+    srcs = glob([
+        "lib/Tooling/*.cpp",
+    ]),
     hdrs = glob([
         "include/clang/Tooling/*.h",
    ]),

From df808df8f73e30d288e4dfdef5e527bc392f3bce Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?=
Date: Fri, 10 Jan 2025 19:15:02 -0800
Subject: [PATCH 137/408] [flang][cuda] Add bind(c) interfaces for __fadd_rd
 and __fadd_ru (#122535)

Functions like `__fadd_rd` and `__fadd_ru` need to be converted to their
CUDA equivalents.
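
For context, the bind(c) names above resolve to CUDA libdevice intrinsics; a rough C-level sketch of the contract (signatures shown for illustration only, based on libdevice's float add-with-rounding-mode functions, not taken from this patch):

```
// Round toward negative infinity / positive infinity, respectively.
extern "C" float __nv_fadd_rd(float x, float y);
extern "C" float __nv_fadd_ru(float x, float y);
```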
--- flang/module/cudadevice.f90 | 16 ++++++++++++++++ flang/test/Lower/CUDA/cuda-intrinsic.cuf | 17 +++++++++++++++++ 2 files changed, 33 insertions(+) create mode 100644 flang/test/Lower/CUDA/cuda-intrinsic.cuf diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90 index 1402bd4e15041..e06c706538fe6 100644 --- a/flang/module/cudadevice.f90 +++ b/flang/module/cudadevice.f90 @@ -71,4 +71,20 @@ attributes(device) subroutine threadfence_system() end interface public :: threadfence_system + interface + attributes(device) function __fadd_rd(x, y) bind(c, name='__nv_fadd_rd') + real, intent(in) :: x, y + real :: __fadd_rd + end function + end interface + public :: __fadd_rd + + interface + attributes(device) function __fadd_ru(x, y) bind(c, name='__nv_fadd_ru') + real, intent(in) :: x, y + real :: __fadd_ru + end function + end interface + public :: __fadd_ru + end module diff --git a/flang/test/Lower/CUDA/cuda-intrinsic.cuf b/flang/test/Lower/CUDA/cuda-intrinsic.cuf new file mode 100644 index 0000000000000..9723afc532387 --- /dev/null +++ b/flang/test/Lower/CUDA/cuda-intrinsic.cuf @@ -0,0 +1,17 @@ +! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s + +module mod1 + type int + real :: inf, sup + end type int +contains + attributes(global) subroutine fadd(c, a, b) + type (int) :: c, a, b + c%inf = __fadd_rd(a%inf, b%inf) + c%sup = __fadd_ru(a%sup, b%sup) + end subroutine +end + +! CHECK-LABEL: func.func @_QMmod1Pfadd +! CHECK: fir.call @__nv_fadd_rd +! CHECK: fir.call @__nv_fadd_ru From 0de18e72c607c1b52be2c60d45cf2f9fc3af4542 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 10 Jan 2025 19:25:18 -0800 Subject: [PATCH 138/408] -ftime-report: reorganize timers The code generation time is unclear in the -ftime-report output: * The two clang timers "Code Generation Time" and "LLVM IR Generation Time" are in the default group "Miscellaneous Ungrouped Timers". * There is also a "Clang front-end time" group, which actually includes code generation time. ``` ===-------------------------------------------------------------------------=== Miscellaneous Ungrouped Timers ===-------------------------------------------------------------------------=== ---User Time--- --System Time-- --User+System-- ---Wall Time--- --- Name --- 0.0611 ( 1.7%) 0.0099 ( 4.4%) 0.0710 ( 1.9%) 0.0713 ( 1.9%) LLVM IR Generation Time 3.5140 ( 98.3%) 0.2165 ( 95.6%) 3.7306 ( 98.1%) 3.7342 ( 98.1%) Code Generation Time 3.5751 (100.0%) 0.2265 (100.0%) 3.8016 (100.0%) 3.8055 (100.0%) Total ... ===-------------------------------------------------------------------------=== Clang front-end time report ===-------------------------------------------------------------------------=== Total Execution Time: 3.9108 seconds (3.9146 wall clock) ---User Time--- --System Time-- --User+System-- ---Wall Time--- --- Name --- 3.6802 (100.0%) 0.2306 (100.0%) 3.9108 (100.0%) 3.9146 (100.0%) Clang front-end timer 3.6802 (100.0%) 0.2306 (100.0%) 3.9108 (100.0%) 3.9146 (100.0%) Total ``` This patch * renames "Clang front-end time report" (FrontendAction time) to "Clang time report", * renames "Clang front-end" to "Front end", * moves "LLVM IR Generation" into the group, * replaces "Code Generation time" with "Optimizer" (middle end) and "Machine code generation" (back end). ``` % clang -c sqlite3.i -w -ftime-report -mllvm -sort-timers=0 ... 
===-------------------------------------------------------------------------===
                               Clang time report
===-------------------------------------------------------------------------===
  Total Execution Time: 1.5922 seconds (1.5972 wall clock)

   ---User Time---   --System Time--   --User+System--   ---Wall Time---  --- Name ---
   0.5107 ( 35.9%)   0.0105 (  6.2%)   0.5211 ( 32.7%)   0.5222 ( 32.7%)  Front end
   0.2464 ( 17.3%)   0.0340 ( 20.0%)   0.2804 ( 17.6%)   0.2814 ( 17.6%)  LLVM IR generation
   0.6240 ( 43.9%)   0.1235 ( 72.7%)   0.7475 ( 47.0%)   0.7503 ( 47.0%)  Machine code generation
   0.0413 (  2.9%)   0.0018 (  1.0%)   0.0431 (  2.7%)   0.0433 (  2.7%)  Optimizer
   1.4224 (100.0%)   0.1698 (100.0%)   1.5922 (100.0%)   1.5972 (100.0%)  Total
```

Pull Request: https://github.com/llvm/llvm-project/pull/122225
---
 clang/docs/ReleaseNotes.rst                   |  4 ++
 .../include/clang/Frontend/CompilerInstance.h |  4 +-
 clang/lib/CodeGen/BackendUtil.cpp             | 19 ++++++++--
 clang/lib/CodeGen/CodeGenAction.cpp           | 37 +++++++-------
 clang/lib/Frontend/CompilerInstance.cpp       | 24 +++++-------
 .../Frontend/ftime-report-template-decl.cpp   | 18 ++++-----
 llvm/include/llvm/Support/Timer.h             |  3 ++
 llvm/lib/Support/Timer.cpp                    |  5 +++
 8 files changed, 62 insertions(+), 52 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 190843f2aa6c9..197b3692b8a18 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -445,6 +445,10 @@ Non-comprehensive list of changes in this release
 - Matrix types (a Clang extension) can now be used in pseudo-destructor
   expressions, which allows them to be stored in STL containers.
 
+- In the ``-ftime-report`` output, the new "Clang time report" group replaces
+  the old "Clang front-end time report" and includes "Front end", "LLVM IR
+  generation", "Optimizer", and "Machine code generation".
+
 New Compiler Flags
 ------------------
 
diff --git a/clang/include/clang/Frontend/CompilerInstance.h b/clang/include/clang/Frontend/CompilerInstance.h
index 1220a4e29471d..4a79b8d107171 100644
--- a/clang/include/clang/Frontend/CompilerInstance.h
+++ b/clang/include/clang/Frontend/CompilerInstance.h
@@ -118,7 +118,7 @@ class CompilerInstance : public ModuleLoader {
   std::unique_ptr<Sema> TheSema;
 
   /// The frontend timer group.
-  std::unique_ptr<llvm::TimerGroup> FrontendTimerGroup;
+  std::unique_ptr<llvm::TimerGroup> timerGroup;
 
   /// The frontend timer.
  std::unique_ptr<llvm::Timer> FrontendTimer;
 
@@ -630,6 +630,8 @@ class CompilerInstance : public ModuleLoader {
   /// @name Frontend timer
   /// @{
 
+  llvm::TimerGroup &getTimerGroup() const { return *timerGroup; }
+
   bool hasFrontendTimer() const { return (bool)FrontendTimer; }
 
   llvm::Timer &getFrontendTimer() const {
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 2863887fd4d2f..bcf6db1467ffc 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -16,6 +16,7 @@
 #include "clang/Frontend/FrontendDiagnostic.h"
 #include "clang/Frontend/Utils.h"
 #include "clang/Lex/HeaderSearchOptions.h"
+#include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Analysis/GlobalsModRef.h"
@@ -137,8 +138,6 @@ class EmitAssemblyHelper {
   llvm::Module *TheModule;
   IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS;
 
-  Timer CodeGenerationTime;
-
   std::unique_ptr<raw_pwrite_stream> OS;
 
   Triple TargetTriple;
@@ -208,7 +207,6 @@ class EmitAssemblyHelper {
       : CI(CI), Diags(CI.getDiagnostics()), CodeGenOpts(CI.getCodeGenOpts()),
         TargetOpts(CI.getTargetOpts()), LangOpts(CI.getLangOpts()),
         TheModule(M), VFS(std::move(VFS)),
-        CodeGenerationTime("codegen", "Code Generation Time"),
         TargetTriple(TheModule->getTargetTriple()) {}
 
   ~EmitAssemblyHelper() {
@@ -1157,7 +1155,14 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
   {
     PrettyStackTraceString CrashInfo("Optimizer");
     llvm::TimeTraceScope TimeScope("Optimizer");
+    Timer timer;
+    if (CI.getCodeGenOpts().TimePasses) {
+      timer.init("optimizer", "Optimizer", CI.getTimerGroup());
+      CI.getFrontendTimer().yieldTo(timer);
+    }
     MPM.run(*TheModule, MAM);
+    if (CI.getCodeGenOpts().TimePasses)
+      timer.yieldTo(CI.getFrontendTimer());
   }
 }
 
@@ -1200,14 +1205,20 @@ void EmitAssemblyHelper::RunCodegenPipeline(
   {
     PrettyStackTraceString CrashInfo("Code generation");
     llvm::TimeTraceScope TimeScope("CodeGenPasses");
+    Timer timer;
+    if (CI.getCodeGenOpts().TimePasses) {
+      timer.init("codegen", "Machine code generation", CI.getTimerGroup());
+      CI.getFrontendTimer().yieldTo(timer);
+    }
     CodeGenPasses.run(*TheModule);
+    if (CI.getCodeGenOpts().TimePasses)
+      timer.yieldTo(CI.getFrontendTimer());
   }
 }
 
 void EmitAssemblyHelper::emitAssembly(BackendAction Action,
                                       std::unique_ptr<raw_pwrite_stream> OS,
                                       BackendConsumer *BC) {
-  TimeRegion Region(CodeGenOpts.TimePasses ?
&CodeGenerationTime : nullptr); setCommandLineOpts(CodeGenOpts); bool RequiresCodeGen = actionRequiresCodeGen(Action); diff --git a/clang/lib/CodeGen/CodeGenAction.cpp b/clang/lib/CodeGen/CodeGenAction.cpp index 7446bddc11345..15311fb207810 100644 --- a/clang/lib/CodeGen/CodeGenAction.cpp +++ b/clang/lib/CodeGen/CodeGenAction.cpp @@ -115,8 +115,7 @@ BackendConsumer::BackendConsumer(CompilerInstance &CI, BackendAction Action, llvm::Module *CurLinkModule) : CI(CI), Diags(CI.getDiagnostics()), CodeGenOpts(CI.getCodeGenOpts()), TargetOpts(CI.getTargetOpts()), LangOpts(CI.getLangOpts()), - AsmOutStream(std::move(OS)), FS(VFS), - LLVMIRGeneration("irgen", "LLVM IR Generation Time"), Action(Action), + AsmOutStream(std::move(OS)), FS(VFS), Action(Action), Gen(CreateLLVMCodeGen(Diags, InFile, std::move(VFS), CI.getHeaderSearchOpts(), CI.getPreprocessorOpts(), CI.getCodeGenOpts(), C, CoverageInfo)), @@ -124,6 +123,8 @@ BackendConsumer::BackendConsumer(CompilerInstance &CI, BackendAction Action, TimerIsEnabled = CodeGenOpts.TimePasses; llvm::TimePassesIsEnabled = CodeGenOpts.TimePasses; llvm::TimePassesPerRun = CodeGenOpts.TimePassesPerRun; + if (CodeGenOpts.TimePasses) + LLVMIRGeneration.init("irgen", "LLVM IR generation", CI.getTimerGroup()); } llvm::Module* BackendConsumer::getModule() const { @@ -162,19 +163,13 @@ bool BackendConsumer::HandleTopLevelDecl(DeclGroupRef D) { "LLVM IR generation of declaration"); // Recurse. - if (TimerIsEnabled) { - LLVMIRGenerationRefCount += 1; - if (LLVMIRGenerationRefCount == 1) - LLVMIRGeneration.startTimer(); - } + if (TimerIsEnabled && !LLVMIRGenerationRefCount++) + CI.getFrontendTimer().yieldTo(LLVMIRGeneration); Gen->HandleTopLevelDecl(D); - if (TimerIsEnabled) { - LLVMIRGenerationRefCount -= 1; - if (LLVMIRGenerationRefCount == 0) - LLVMIRGeneration.stopTimer(); - } + if (TimerIsEnabled && !--LLVMIRGenerationRefCount) + LLVMIRGeneration.yieldTo(CI.getFrontendTimer()); return true; } @@ -184,12 +179,12 @@ void BackendConsumer::HandleInlineFunctionDefinition(FunctionDecl *D) { Context->getSourceManager(), "LLVM IR generation of inline function"); if (TimerIsEnabled) - LLVMIRGeneration.startTimer(); + CI.getFrontendTimer().yieldTo(LLVMIRGeneration); Gen->HandleInlineFunctionDefinition(D); if (TimerIsEnabled) - LLVMIRGeneration.stopTimer(); + LLVMIRGeneration.yieldTo(CI.getFrontendTimer()); } void BackendConsumer::HandleInterestingDecl(DeclGroupRef D) { @@ -239,19 +234,13 @@ void BackendConsumer::HandleTranslationUnit(ASTContext &C) { { llvm::TimeTraceScope TimeScope("Frontend"); PrettyStackTraceString CrashInfo("Per-file LLVM IR generation"); - if (TimerIsEnabled) { - LLVMIRGenerationRefCount += 1; - if (LLVMIRGenerationRefCount == 1) - LLVMIRGeneration.startTimer(); - } + if (TimerIsEnabled && !LLVMIRGenerationRefCount++) + CI.getFrontendTimer().yieldTo(LLVMIRGeneration); Gen->HandleTranslationUnit(C); - if (TimerIsEnabled) { - LLVMIRGenerationRefCount -= 1; - if (LLVMIRGenerationRefCount == 0) - LLVMIRGeneration.stopTimer(); - } + if (TimerIsEnabled && !--LLVMIRGenerationRefCount) + LLVMIRGeneration.yieldTo(CI.getFrontendTimer()); IRGenFinished = true; } diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp index fbfc305ca06a0..b00a4ac075776 100644 --- a/clang/lib/Frontend/CompilerInstance.cpp +++ b/clang/lib/Frontend/CompilerInstance.cpp @@ -722,11 +722,8 @@ void CompilerInstance::createCodeCompletionConsumer() { } void CompilerInstance::createFrontendTimer() { - FrontendTimerGroup.reset( - new 
llvm::TimerGroup("frontend", "Clang front-end time report")); - FrontendTimer.reset( - new llvm::Timer("frontend", "Clang front-end timer", - *FrontendTimerGroup)); + timerGroup.reset(new llvm::TimerGroup("clang", "Clang time report")); + FrontendTimer.reset(new llvm::Timer("frontend", "Front end", *timerGroup)); } CodeCompleteConsumer * @@ -1726,10 +1723,9 @@ void CompilerInstance::createASTReader() { const FrontendOptions &FEOpts = getFrontendOpts(); std::unique_ptr ReadTimer; - if (FrontendTimerGroup) + if (timerGroup) ReadTimer = std::make_unique("reading_modules", - "Reading modules", - *FrontendTimerGroup); + "Reading modules", *timerGroup); TheASTReader = new ASTReader( getPreprocessor(), getModuleCache(), &getASTContext(), getPCHContainerReader(), getFrontendOpts().ModuleFileExtensions, @@ -1758,10 +1754,10 @@ void CompilerInstance::createASTReader() { bool CompilerInstance::loadModuleFile( StringRef FileName, serialization::ModuleFile *&LoadedModuleFile) { llvm::Timer Timer; - if (FrontendTimerGroup) + if (timerGroup) Timer.init("preloading." + FileName.str(), "Preloading " + FileName.str(), - *FrontendTimerGroup); - llvm::TimeRegion TimeLoading(FrontendTimerGroup ? &Timer : nullptr); + *timerGroup); + llvm::TimeRegion TimeLoading(timerGroup ? &Timer : nullptr); // If we don't already have an ASTReader, create one now. if (!TheASTReader) @@ -1892,10 +1888,10 @@ ModuleLoadResult CompilerInstance::findOrCompileModuleAndReadAST( // Time how long it takes to load the module. llvm::Timer Timer; - if (FrontendTimerGroup) + if (timerGroup) Timer.init("loading." + ModuleFilename, "Loading " + ModuleFilename, - *FrontendTimerGroup); - llvm::TimeRegion TimeLoading(FrontendTimerGroup ? &Timer : nullptr); + *timerGroup); + llvm::TimeRegion TimeLoading(timerGroup ? &Timer : nullptr); llvm::TimeTraceScope TimeScope("Module Load", ModuleName); // Try to load the module file. If we are not trying to load from the diff --git a/clang/test/Frontend/ftime-report-template-decl.cpp b/clang/test/Frontend/ftime-report-template-decl.cpp index 9ba9107b98040..45ce0dc15c56f 100644 --- a/clang/test/Frontend/ftime-report-template-decl.cpp +++ b/clang/test/Frontend/ftime-report-template-decl.cpp @@ -1,5 +1,6 @@ -// RUN: %clang_cc1 %s -emit-llvm -o - -ftime-report 2>&1 | FileCheck %s -// RUN: %clang_cc1 %s -emit-llvm -o - -fdelayed-template-parsing -DDELAYED_TEMPLATE_PARSING -ftime-report 2>&1 | FileCheck %s +// REQUIRES: x86-registered-target +// RUN: %clang_cc1 %s -S -triple=x86_64 -mllvm -sort-timers=0 -o - -ftime-report 2>&1 | FileCheck %s +// RUN: %clang_cc1 %s -S -triple=x86_64 -mllvm -sort-timers=0 -o - -fdelayed-template-parsing -DDELAYED_TEMPLATE_PARSING -ftime-report 2>&1 | FileCheck %s // Template function declarations template @@ -150,10 +151,9 @@ struct _Wrap_alloc { }; _Wrap_alloc::rebind w; -// CHECK: Miscellaneous Ungrouped Timers -// CHECK-DAG: LLVM IR Generation Time -// CHECK-DAG: Code Generation Time -// CHECK: Total -// CHECK: Clang front-end time report -// CHECK: Clang front-end timer -// CHECK: Total +// CHECK: Clang time report +// CHECK: Front end +// CHECK-NEXT: LLVM IR generation +// CHECK-NEXT: Machine code generation +// CHECK-NEXT: Optimizer +// CHECK-NEXT: Total diff --git a/llvm/include/llvm/Support/Timer.h b/llvm/include/llvm/Support/Timer.h index d21859905d4a7..abe30451dd2f2 100644 --- a/llvm/include/llvm/Support/Timer.h +++ b/llvm/include/llvm/Support/Timer.h @@ -131,6 +131,9 @@ class Timer { /// Clear the timer state. 
   void clear();
 
+  /// Stop the timer and start another timer.
+  void yieldTo(Timer &);
+
   /// Return the duration for which this timer has been running.
   TimeRecord getTotalTime() const { return Time; }
 
diff --git a/llvm/lib/Support/Timer.cpp b/llvm/lib/Support/Timer.cpp
index 1fa2cdf297aae..fbf3becf39fa1 100644
--- a/llvm/lib/Support/Timer.cpp
+++ b/llvm/lib/Support/Timer.cpp
@@ -166,6 +166,11 @@ void Timer::clear() {
   Time = StartTime = TimeRecord();
 }
 
+void Timer::yieldTo(Timer &O) {
+  stopTimer();
+  O.startTimer();
+}
+
 static void printVal(double Val, double Total, raw_ostream &OS) {
   if (Total < 1e-7)   // Avoid dividing by zero.
     OS << "        -----     ";

From 18078605046c50f01f31e826ea3591f99019de38 Mon Sep 17 00:00:00 2001
From: amosher-nvidia
Date: Fri, 10 Jan 2025 19:30:38 -0800
Subject: [PATCH 139/408] [lld][ELF] Handle archive special casing in Input
 Sections (#119293)

According to the binutils spec
(https://sourceware.org/binutils/docs/ld/Input-Section-Basics.html),
you should be able to specify all files in an archive using the syntax
`archivename:`; however, lld currently accepts only `archivename:*` to
match all files within an archive.

This patch will, only when necessary, create a copy of the file
specification and add an implicit wildcard `*` to the end. It also
updates the filename-spec linkerscript test to check for this behavior.

---------

Co-authored-by: Peter Smith
---
 lld/ELF/LinkerScript.cpp                  | 12 ++++++++++--
 lld/ELF/LinkerScript.h                    | 19 +++++++++++++++++--
 lld/test/ELF/linkerscript/filename-spec.s | 12 +++++++++---
 3 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/lld/ELF/LinkerScript.cpp b/lld/ELF/LinkerScript.cpp
index a8e3d6486353d..120f5271cf229 100644
--- a/lld/ELF/LinkerScript.cpp
+++ b/lld/ELF/LinkerScript.cpp
@@ -406,8 +406,16 @@ bool InputSectionDescription::matchesFile(const InputFile &file) const {
   if (filePat.isTrivialMatchAll())
     return true;
 
-  if (!matchesFileCache || matchesFileCache->first != &file)
-    matchesFileCache.emplace(&file, filePat.match(file.getNameForScript()));
+  if (!matchesFileCache || matchesFileCache->first != &file) {
+    if (matchType == MatchType::WholeArchive) {
+      matchesFileCache.emplace(&file, filePat.match(file.archiveName));
+    } else {
+      if (matchType == MatchType::ArchivesExcluded && !file.archiveName.empty())
+        matchesFileCache.emplace(&file, false);
+      else
+        matchesFileCache.emplace(&file, filePat.match(file.getNameForScript()));
+    }
+  }
 
   return matchesFileCache->second;
 }
diff --git a/lld/ELF/LinkerScript.h b/lld/ELF/LinkerScript.h
index 721425166296a..0a2dda13f4ef8 100644
--- a/lld/ELF/LinkerScript.h
+++ b/lld/ELF/LinkerScript.h
@@ -194,6 +194,7 @@ class SectionPattern {
 };
 
 class InputSectionDescription : public SectionCommand {
+  enum class MatchType { Trivial, WholeArchive, ArchivesExcluded } matchType;
   SingleStringMatcher filePat;
 
   // Cache of the most recent input argument and result of matchesFile().
@@ -202,10 +203,24 @@ class InputSectionDescription : public SectionCommand { public: InputSectionDescription(StringRef filePattern, uint64_t withFlags = 0, uint64_t withoutFlags = 0, StringRef classRef = {}) - : SectionCommand(InputSectionKind), filePat(filePattern), - classRef(classRef), withFlags(withFlags), withoutFlags(withoutFlags) { + : SectionCommand(InputSectionKind), matchType(MatchType::Trivial), + filePat(filePattern), classRef(classRef), withFlags(withFlags), + withoutFlags(withoutFlags) { assert((filePattern.empty() || classRef.empty()) && "file pattern and class reference are mutually exclusive"); + + // The matching syntax for whole archives and files outside of an archive + // can't be handled by SingleStringMatcher, and instead are handled + // manually within matchesFile() + if (!filePattern.empty()) { + if (filePattern.back() == ':') { + matchType = MatchType::WholeArchive; + filePat = filePattern.drop_back(); + } else if (filePattern.front() == ':') { + matchType = MatchType::ArchivesExcluded; + filePat = filePattern.drop_front(); + } + } } static bool classof(const SectionCommand *c) { diff --git a/lld/test/ELF/linkerscript/filename-spec.s b/lld/test/ELF/linkerscript/filename-spec.s index 4fc4f2f421752..6d5761f67f9f3 100644 --- a/lld/test/ELF/linkerscript/filename-spec.s +++ b/lld/test/ELF/linkerscript/filename-spec.s @@ -4,7 +4,7 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64 tx.s -o tx.o # RUN: llvm-mc -filetype=obj -triple=x86_64 ty.s -o ty.o -# RUN: echo 'SECTIONS{.foo :{ KEEP(*x.o(.foo)) KEEP(*y.o(.foo)) }}' > 1.t +# RUN: echo 'SECTIONS{.foo :{ KEEP(:*x.o(.foo)) KEEP(*y.o(.foo)) }}' > 1.t # RUN: ld.lld -o 1 -T 1.t tx.o ty.o # RUN: llvm-objdump -s 1 | FileCheck --check-prefix=FIRSTY %s # FIRSTY: Contents of section .foo: @@ -18,7 +18,7 @@ ## Now the same tests but without KEEP. Checking that file name inside ## KEEP is parsed fine. -# RUN: echo 'SECTIONS{.foo :{ *x.o(.foo) *y.o(.foo) }}' > 3.t +# RUN: echo 'SECTIONS{.foo :{ :*x.o(.foo) *y.o(.foo) }}' > 3.t # RUN: ld.lld -o 3 -T 3.t tx.o ty.o # RUN: llvm-objdump -s 3 | FileCheck --check-prefix=FIRSTY %s @@ -41,6 +41,7 @@ # RUN: cp ty.o dir2/filename-spec2.o # RUN: llvm-ar rc dir1/lib1.a dir1/filename-spec1.o # RUN: llvm-ar rc dir2/lib2.a dir2/filename-spec2.o +# RUN: llvm-ar rc combined.a tx.o ty.o ## Verify matching of archive library names. # RUN: echo 'SECTIONS{.foo :{ *lib2*(.foo) *lib1*(.foo) }}' > 7.t @@ -55,7 +56,7 @@ # RUN: llvm-objdump -s 8 | FileCheck --check-prefix=SECONDFIRST %s ## Verify matching of archive library names in KEEP. -# RUN: echo 'SECTIONS{.foo :{ KEEP(*lib2*(.foo)) KEEP(*lib1*(.foo)) }}' > 9.t +# RUN: echo 'SECTIONS{.foo :{ KEEP(*lib2.a:(.foo)) KEEP(*lib1*(.foo)) }}' > 9.t # RUN: ld.lld -o 9 -T 9.t --whole-archive \ # RUN: dir1/lib1.a dir2/lib2.a # RUN: llvm-objdump -s 9 | FileCheck --check-prefix=SECONDFIRST %s @@ -72,6 +73,11 @@ # RUN: ld.lld -o 11 -T 11.t --whole-archive 'lib1().a' dir2/lib2.a # RUN: llvm-objdump -s 11 | FileCheck --check-prefix=SECONDFIRST %s +## Verify that matching files excluded from an archive will not match files within one. 
+# RUN: echo 'SECTIONS{.foo :{ KEEP(:*x.o(.foo)) KEEP(*y.o(.foo)) KEEP(*x.o(.foo)) }}' > 12.t
+# RUN: ld.lld -o 12 -T 12.t --whole-archive combined.a
+# RUN: llvm-objdump -s 12 | FileCheck --check-prefix=SECONDFIRST %s
+
 #--- tx.s
 .global _start
 _start:

From 90eca3f3f43048f055e70e3edd4e2d8ea2a26783 Mon Sep 17 00:00:00 2001
From: Adam Yang
Date: Fri, 10 Jan 2025 19:32:54 -0800
Subject: [PATCH 140/408] [Github] Explicitly requesting Ubuntu 22.04 for
 SPIRV test (#122395)

For the same reason as
[#122221](https://github.com/llvm/llvm-project/pull/122221), this fixes
a build failure from missing python3.

---
 .github/workflows/spirv-tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/spirv-tests.yml b/.github/workflows/spirv-tests.yml
index 75918e73e8973..34c77a398c150 100644
--- a/.github/workflows/spirv-tests.yml
+++ b/.github/workflows/spirv-tests.yml
@@ -26,4 +26,4 @@ jobs:
       build_target: check-llvm-codegen-spirv
       projects:
       extra_cmake_args: '-DLLVM_TARGETS_TO_BUILD="" -DLLVM_EXPERIMENTAL_TARGETS_TO_BUILD="SPIRV" -DLLVM_INCLUDE_SPIRV_TOOLS_TESTS=ON'
-      os_list: '["ubuntu-latest"]'
+      os_list: '["ubuntu-22.04"]'

From 799955eb176042999b4d12a901b1c33b42035014 Mon Sep 17 00:00:00 2001
From: Teresa Johnson
Date: Fri, 10 Jan 2025 19:33:20 -0800
Subject: [PATCH 141/408] [ThinLTO] Skip opt pipeline and summary wrapper pass
 on empty modules (#120143)

Follow-up to PR118508, to avoid unnecessary compile time for an empty
combined regular LTO module if all modules end up being ThinLTO-only.
This required minor changes to a few tests to ensure they weren't
empty.

---
 lld/test/ELF/lto/new-pass-manager.ll      |  4 +++
 llvm/lib/LTO/LTOBackend.cpp               | 31 +++++++++++++++++++----
 llvm/test/Feature/load_plugin_error.ll    |  2 +-
 llvm/test/Other/X86/lto-hot-cold-split.ll |  4 +++
 4 files changed, 35 insertions(+), 6 deletions(-)

diff --git a/lld/test/ELF/lto/new-pass-manager.ll b/lld/test/ELF/lto/new-pass-manager.ll
index cc6ff34cd91ae..77a1d4ed3d27d 100644
--- a/lld/test/ELF/lto/new-pass-manager.ll
+++ b/lld/test/ELF/lto/new-pass-manager.ll
@@ -9,3 +9,7 @@
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
+
+define void @foo() {
+  ret void
+}

diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index bdf4ff8960bc8..8a2dddce4892c 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -351,6 +351,14 @@ static void runNewPMPasses(const Config &Conf, Module &Mod, TargetMachine *TM,
   MPM.run(Mod, MAM);
 }
 
+static bool isEmptyModule(const Module &Mod) {
+  // Module is empty if it has no functions, no globals, no inline asm and no
+  // named metadata (aliases and ifuncs require functions or globals so we
+  // don't need to check those explicitly).
+  return Mod.empty() && Mod.global_empty() && Mod.named_metadata_empty() &&
+         Mod.getModuleInlineAsm().empty();
+}
+
 bool lto::opt(const Config &Conf, TargetMachine *TM, unsigned Task, Module &Mod,
               bool IsThinLTO, ModuleSummaryIndex *ExportSummary,
              const ModuleSummaryIndex *ImportSummary,
@@ -372,9 +380,16 @@ bool lto::opt(const Config &Conf, TargetMachine *TM, unsigned Task, Module &Mod,
         /*EmbedBitcode*/ true, /*EmbedCmdline*/ true,
         /*Cmdline*/ CmdArgs);
   }
-  // FIXME: Plumb the combined index into the new pass manager.
-  runNewPMPasses(Conf, Mod, TM, Conf.OptLevel, IsThinLTO, ExportSummary,
-                 ImportSummary);
+  // No need to run any opt passes if the module is empty.
+  // In theory these passes should take almost no time for an empty
+  // module, however, this guards against doing any unnecessary summary-based
+  // analysis in the case of a ThinLTO build where this might be an empty
+  // regular LTO combined module, with a large combined index from ThinLTO.
+  if (!isEmptyModule(Mod)) {
+    // FIXME: Plumb the combined index into the new pass manager.
+    runNewPMPasses(Conf, Mod, TM, Conf.OptLevel, IsThinLTO, ExportSummary,
+                   ImportSummary);
+  }
 
   return !Conf.PostOptModuleHook || Conf.PostOptModuleHook(Task, Mod);
 }
@@ -422,8 +437,14 @@ static void codegen(const Config &Conf, TargetMachine *TM,
   legacy::PassManager CodeGenPasses;
   TargetLibraryInfoImpl TLII(Triple(Mod.getTargetTriple()));
   CodeGenPasses.add(new TargetLibraryInfoWrapperPass(TLII));
-  CodeGenPasses.add(
-      createImmutableModuleSummaryIndexWrapperPass(&CombinedIndex));
+  // No need to make index available if the module is empty.
+  // In theory these passes should not use the index for an empty
+  // module, however, this guards against doing any unnecessary summary-based
+  // analysis in the case of a ThinLTO build where this might be an empty
+  // regular LTO combined module, with a large combined index from ThinLTO.
+  if (!isEmptyModule(Mod))
+    CodeGenPasses.add(
+        createImmutableModuleSummaryIndexWrapperPass(&CombinedIndex));
   if (Conf.PreCodeGenPassesHook)
     Conf.PreCodeGenPassesHook(CodeGenPasses);
   if (TM->addPassesToEmitFile(CodeGenPasses, *Stream->OS,
diff --git a/llvm/test/Feature/load_plugin_error.ll b/llvm/test/Feature/load_plugin_error.ll
index a112bfbb1cd19..b40dddff1205f 100644
--- a/llvm/test/Feature/load_plugin_error.ll
+++ b/llvm/test/Feature/load_plugin_error.ll
@@ -5,7 +5,7 @@
 
 ; RUN: opt %s -o %t.o
 ; RUN: not llvm-lto2 run -load-pass-plugin=%t/nonexistent.so %t.o -o %t \
-; RUN:   -r %t.o,test 2>&1 | \
+; RUN:   -r %t.o,test,plx 2>&1 | \
 ; RUN:   FileCheck %s
 
 ; CHECK: Could not load library {{.*}}nonexistent.so
diff --git a/llvm/test/Other/X86/lto-hot-cold-split.ll b/llvm/test/Other/X86/lto-hot-cold-split.ll
index 22c79c7e06bba..24903f34ed074 100644
--- a/llvm/test/Other/X86/lto-hot-cold-split.ll
+++ b/llvm/test/Other/X86/lto-hot-cold-split.ll
@@ -10,3 +10,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
 ; OLDPM-ANYLTO-POSTLINK-Os: HotColdSplittingPass
+
+define void @foo() {
+  ret void
+}

From 24bd9bc0b59d51c82e9a4d84c21d86d58d0ef6ce Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Fri, 10 Jan 2025 20:46:57 -0800
Subject: [PATCH 142/408] [Timer] Remove signposts overhead on unsupported
 systems

startTimer/stopTimer are called frequently, so it is important to keep
their overhead low.
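For illustration, a rough sketch of where that overhead comes from. The
`signposts()` accessor is declared in the diff below, but its body is not
part of this patch, so the function-local-static shape here is an
assumption; the point is that even a no-op SignpostEmitter costs an
accessor call per timer start/stop when left unguarded:

```
#include "llvm/Support/Signposts.h"
using llvm::SignpostEmitter;

// Assumed shape of the signposts() accessor (a sketch, not the actual
// Timer.cpp implementation): returning a reference to a function-local
// static means every call pays a thread-safe initialization check, even
// on hosts where SignpostEmitter's methods are no-ops.
static SignpostEmitter &signposts() {
  static SignpostEmitter S; // guarded-init check on each call
  return S;
}
```

Guarding the call sites with `#if LLVM_SUPPORT_XCODE_SIGNPOSTS` compiles
them out entirely on such hosts, so the hot path pays nothing.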
--- llvm/lib/Support/Timer.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llvm/lib/Support/Timer.cpp b/llvm/lib/Support/Timer.cpp index fbf3becf39fa1..089dae2886f22 100644 --- a/llvm/lib/Support/Timer.cpp +++ b/llvm/lib/Support/Timer.cpp @@ -53,6 +53,7 @@ class Name2PairMap; static std::string &libSupportInfoOutputFilename(); static bool trackSpace(); static bool sortTimers(); +[[maybe_unused]] static SignpostEmitter &signposts(); static sys::SmartMutex &timerLock(); static TimerGroup &defaultTimerGroup(); @@ -149,7 +150,9 @@ TimeRecord TimeRecord::getCurrentTime(bool Start) { void Timer::startTimer() { assert(!Running && "Cannot start a running timer"); Running = Triggered = true; +#if LLVM_SUPPORT_XCODE_SIGNPOSTS signposts().startInterval(this, getName()); +#endif StartTime = TimeRecord::getCurrentTime(true); } @@ -158,7 +161,9 @@ void Timer::stopTimer() { Running = false; Time += TimeRecord::getCurrentTime(false); Time -= StartTime; +#if LLVM_SUPPORT_XCODE_SIGNPOSTS signposts().endInterval(this, getName()); +#endif } void Timer::clear() { From 2d5f07c82836bde6f5ae16518931a78783a22ec8 Mon Sep 17 00:00:00 2001 From: Veera <32646674+veera-sivarajan@users.noreply.github.com> Date: Sat, 11 Jan 2025 11:26:13 +0530 Subject: [PATCH 143/408] [InstCombine] Fold `X udiv Y` to `X lshr cttz(Y)` if Y is a power of 2 (#121386) Fixes #115767 This PR folds `X udiv Y` to `X lshr cttz(Y)` if Y is a power of two since bitwise operations are faster than division. Proof: https://alive2.llvm.org/ce/z/qHmLta --- .../InstCombine/InstCombineMulDivRem.cpp | 19 ++- .../IndVarSimplify/rewrite-loop-exit-value.ll | 6 +- llvm/test/Transforms/InstCombine/div-shift.ll | 113 +++++++++++++++++- 3 files changed, 130 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index 1c5070a1b867c..d0b2ded127ff7 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -1598,8 +1598,23 @@ Instruction *InstCombinerImpl::visitUDiv(BinaryOperator &I) { return Lshr; } - // Op1 udiv Op2 -> Op1 lshr log2(Op2), if log2() folds away. - if (Value *Res = tryGetLog2(Op1, /*AssumeNonZero=*/true)) + auto GetShiftableDenom = [&](Value *Denom) -> Value * { + // Op0 udiv Op1 -> Op0 lshr log2(Op1), if log2() folds away. + if (Value *Log2 = tryGetLog2(Op1, /*AssumeNonZero=*/true)) + return Log2; + + // Op0 udiv Op1 -> Op0 lshr cttz(Op1), if Op1 is a power of 2. + if (isKnownToBeAPowerOfTwo(Denom, /*OrZero=*/true, /*Depth=*/0, &I)) + // This will increase instruction count but it's okay + // since bitwise operations are substantially faster than + // division. 
+ return Builder.CreateBinaryIntrinsic(Intrinsic::cttz, Denom, + Builder.getTrue()); + + return nullptr; + }; + + if (auto *Res = GetShiftableDenom(Op1)) return replaceInstUsesWith( I, Builder.CreateLShr(Op0, Res, I.getName(), I.isExact())); diff --git a/llvm/test/Transforms/IndVarSimplify/rewrite-loop-exit-value.ll b/llvm/test/Transforms/IndVarSimplify/rewrite-loop-exit-value.ll index 1956f454a52bb..fa47d06d859e9 100644 --- a/llvm/test/Transforms/IndVarSimplify/rewrite-loop-exit-value.ll +++ b/llvm/test/Transforms/IndVarSimplify/rewrite-loop-exit-value.ll @@ -218,7 +218,8 @@ define i32 @vscale_slt_with_vp_umin(ptr nocapture %A, i32 %n) mustprogress vscal ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK: for.end: ; CHECK-NEXT: [[TMP0:%.*]] = add nsw i32 [[N]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = udiv i32 [[TMP0]], [[VF]] +; CHECK-NEXT: [[TMP5:%.*]] = call range(i32 2, 33) i32 @llvm.cttz.i32(i32 [[VF]], i1 true) +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], [[TMP5]] ; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[TMP1]], [[VSCALE]] ; CHECK-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 2 ; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[N]], [[TMP3]] @@ -270,7 +271,8 @@ define i32 @vscale_slt_with_vp_umin2(ptr nocapture %A, i32 %n) mustprogress vsca ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK: for.end: ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = udiv i32 [[TMP0]], [[VF]] +; CHECK-NEXT: [[TMP5:%.*]] = call range(i32 2, 33) i32 @llvm.cttz.i32(i32 [[VF]], i1 true) +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], [[TMP5]] ; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[TMP1]], [[VSCALE]] ; CHECK-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 2 ; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[N]], [[TMP3]] diff --git a/llvm/test/Transforms/InstCombine/div-shift.ll b/llvm/test/Transforms/InstCombine/div-shift.ll index 8dd6d4a2e8371..af83f37011ba0 100644 --- a/llvm/test/Transforms/InstCombine/div-shift.ll +++ b/llvm/test/Transforms/InstCombine/div-shift.ll @@ -148,7 +148,8 @@ define i8 @udiv_umin_extra_use(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[Z2:%.*]] = shl nuw i8 1, [[Z:%.*]] ; CHECK-NEXT: [[M:%.*]] = call i8 @llvm.umin.i8(i8 [[Y2]], i8 [[Z2]]) ; CHECK-NEXT: call void @use(i8 [[M]]) -; CHECK-NEXT: [[D:%.*]] = udiv i8 [[X:%.*]], [[M]] +; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[M]], i1 true) +; CHECK-NEXT: [[D:%.*]] = lshr i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i8 [[D]] ; %y2 = shl i8 1, %y @@ -165,7 +166,8 @@ define i8 @udiv_smin(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[Y2:%.*]] = shl nuw i8 1, [[Y:%.*]] ; CHECK-NEXT: [[Z2:%.*]] = shl nuw i8 1, [[Z:%.*]] ; CHECK-NEXT: [[M:%.*]] = call i8 @llvm.smin.i8(i8 [[Y2]], i8 [[Z2]]) -; CHECK-NEXT: [[D:%.*]] = udiv i8 [[X:%.*]], [[M]] +; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[M]], i1 true) +; CHECK-NEXT: [[D:%.*]] = lshr i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i8 [[D]] ; %y2 = shl i8 1, %y @@ -181,7 +183,8 @@ define i8 @udiv_smax(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[Y2:%.*]] = shl nuw i8 1, [[Y:%.*]] ; CHECK-NEXT: [[Z2:%.*]] = shl nuw i8 1, [[Z:%.*]] ; CHECK-NEXT: [[M:%.*]] = call i8 @llvm.smax.i8(i8 [[Y2]], i8 [[Z2]]) -; CHECK-NEXT: [[D:%.*]] = udiv i8 [[X:%.*]], [[M]] +; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[M]], i1 true) +; CHECK-NEXT: [[D:%.*]] = lshr i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i8 [[D]] ; %y2 = shl i8 1, %y @@ -1006,7 +1009,8 @@ define i8 @udiv_fail_shl_overflow(i8 %x, i8 %y) { ; CHECK-LABEL: 
@udiv_fail_shl_overflow( ; CHECK-NEXT: [[SHL:%.*]] = shl i8 2, [[Y:%.*]] ; CHECK-NEXT: [[MIN:%.*]] = call i8 @llvm.umax.i8(i8 [[SHL]], i8 1) -; CHECK-NEXT: [[MUL:%.*]] = udiv i8 [[X:%.*]], [[MIN]] +; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[MIN]], i1 true) +; CHECK-NEXT: [[MUL:%.*]] = lshr i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i8 [[MUL]] ; %shl = shl i8 2, %y @@ -1294,3 +1298,104 @@ entry: %div = sdiv i32 %add, %add2 ret i32 %div } + +define i8 @udiv_if_power_of_two(i8 %x, i8 %y) { +; CHECK-LABEL: @udiv_if_power_of_two( +; CHECK-NEXT: start: +; CHECK-NEXT: [[TMP0:%.*]] = tail call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[Y:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i8 [[TMP0]], 1 +; CHECK-NEXT: br i1 [[TMP1]], label [[BB1:%.*]], label [[BB3:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[TMP2:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[Y]], i1 true) +; CHECK-NEXT: [[TMP3:%.*]] = lshr i8 [[X:%.*]], [[TMP2]] +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: [[_0_SROA_0_0:%.*]] = phi i8 [ [[TMP3]], [[BB1]] ], [ 0, [[START:%.*]] ] +; CHECK-NEXT: ret i8 [[_0_SROA_0_0]] +; +start: + %ctpop = tail call i8 @llvm.ctpop.i8(i8 %y) + %cmp = icmp eq i8 %ctpop, 1 + br i1 %cmp, label %bb1, label %bb3 + +bb1: + %div = udiv i8 %x, %y + br label %bb3 + +bb3: + %result = phi i8 [ %div, %bb1 ], [ 0, %start ] + ret i8 %result +} + +define i8 @udiv_exact_assume_power_of_two(i8 %x, i8 %y) { +; CHECK-LABEL: @udiv_exact_assume_power_of_two( +; CHECK-NEXT: start: +; CHECK-NEXT: [[TMP0:%.*]] = tail call range(i8 1, 9) i8 @llvm.ctpop.i8(i8 [[Y:%.*]]) +; CHECK-NEXT: [[COND:%.*]] = icmp eq i8 [[TMP0]], 1 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[COND]]) +; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[Y]], i1 true) +; CHECK-NEXT: [[_0:%.*]] = lshr exact i8 [[X:%.*]], [[TMP1]] +; CHECK-NEXT: ret i8 [[_0]] +; +start: + %ctpop = tail call i8 @llvm.ctpop.i8(i8 %y) + %cond = icmp eq i8 %ctpop, 1 + tail call void @llvm.assume(i1 %cond) + %div = udiv exact i8 %x, %y + ret i8 %div +} + +define i7 @udiv_assume_power_of_two_illegal_type(i7 %x, i7 %y) { +; CHECK-LABEL: @udiv_assume_power_of_two_illegal_type( +; CHECK-NEXT: start: +; CHECK-NEXT: [[TMP0:%.*]] = tail call range(i7 1, 8) i7 @llvm.ctpop.i7(i7 [[Y:%.*]]) +; CHECK-NEXT: [[COND:%.*]] = icmp eq i7 [[TMP0]], 1 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[COND]]) +; CHECK-NEXT: [[TMP1:%.*]] = call range(i7 0, 8) i7 @llvm.cttz.i7(i7 [[Y]], i1 true) +; CHECK-NEXT: [[_0:%.*]] = lshr i7 [[X:%.*]], [[TMP1]] +; CHECK-NEXT: ret i7 [[_0]] +; +start: + %ctpop = tail call i7 @llvm.ctpop.i7(i7 %y) + %cond = icmp eq i7 %ctpop, 1 + tail call void @llvm.assume(i1 %cond) + %div = udiv i7 %x, %y + ret i7 %div +} + +define i8 @udiv_assume_power_of_two_multiuse(i8 %x, i8 %y) { +; CHECK-LABEL: @udiv_assume_power_of_two_multiuse( +; CHECK-NEXT: start: +; CHECK-NEXT: [[TMP0:%.*]] = tail call range(i8 1, 9) i8 @llvm.ctpop.i8(i8 [[Y:%.*]]) +; CHECK-NEXT: [[COND:%.*]] = icmp eq i8 [[TMP0]], 1 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[COND]]) +; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[Y]], i1 true) +; CHECK-NEXT: [[_0:%.*]] = lshr i8 [[X:%.*]], [[TMP1]] +; CHECK-NEXT: call void @use(i8 [[_0]]) +; CHECK-NEXT: ret i8 [[_0]] +; +start: + %ctpop = tail call i8 @llvm.ctpop.i8(i8 %y) + %cond = icmp eq i8 %ctpop, 1 + tail call void @llvm.assume(i1 %cond) + %div = udiv i8 %x, %y + call void @use(i8 %div) + ret i8 %div +} + +define i8 @udiv_power_of_two_negative(i8 %x, i8 %y, i8 %z) { +; CHECK-LABEL: 
@udiv_power_of_two_negative( +; CHECK-NEXT: start: +; CHECK-NEXT: [[CTPOP:%.*]] = tail call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[Z:%.*]]) +; CHECK-NEXT: [[COND:%.*]] = icmp eq i8 [[CTPOP]], 1 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[COND]]) +; CHECK-NEXT: [[_0:%.*]] = udiv i8 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i8 [[_0]] +; +start: + %ctpop = tail call i8 @llvm.ctpop.i8(i8 %z) + %cond = icmp eq i8 %ctpop, 1 + tail call void @llvm.assume(i1 %cond) + %div = udiv i8 %x, %y + ret i8 %div +} From a418eb1c0ddeb119d9cfbf6d6e80c0f118503d13 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 10 Jan 2025 18:34:08 -0800 Subject: [PATCH 144/408] [ARM] Use GenericTable PrimaryKey to remove one of the SearchIndexes for BankedRegsList. NFC --- llvm/lib/Target/ARM/ARMSystemRegister.td | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMSystemRegister.td b/llvm/lib/Target/ARM/ARMSystemRegister.td index 3afc410e04568..d68cab1d0ebb6 100644 --- a/llvm/lib/Target/ARM/ARMSystemRegister.td +++ b/llvm/lib/Target/ARM/ARMSystemRegister.td @@ -154,6 +154,9 @@ class BankedReg enc> { def BankedRegsList : GenericTable { let FilterClass = "BankedReg"; let Fields = ["Name", "Encoding"]; + + let PrimaryKey = ["Encoding"]; + let PrimaryKeyName = "lookupBankedRegByEncoding"; } def lookupBankedRegByName : SearchIndex { @@ -161,11 +164,6 @@ def lookupBankedRegByName : SearchIndex { let Key = ["Name"]; } -def lookupBankedRegByEncoding : SearchIndex { - let Table = BankedRegsList; - let Key = ["Encoding"]; -} - // The values here come from B9.2.3 of the ARM ARM, where bits 4-0 are SysM // and bit 5 is R. From cfe26358e3051755961fb1f3b46328dc2c326895 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Sat, 11 Jan 2025 07:12:37 +0100 Subject: [PATCH 145/408] Reapply "[clang] Avoid re-evaluating field bitwidth" (#122289) --- .../bugprone/NarrowingConversionsCheck.cpp | 2 +- .../bugprone/TooSmallLoopVariableCheck.cpp | 2 +- .../hicpp/MultiwayPathsCoveredCheck.cpp | 2 +- clang-tools-extra/clangd/Hover.cpp | 2 +- clang/include/clang/AST/Decl.h | 6 ++-- clang/include/clang/ASTMatchers/ASTMatchers.h | 3 +- clang/lib/AST/ASTContext.cpp | 10 +++--- clang/lib/AST/ByteCode/Interp.h | 10 +++--- .../lib/AST/ByteCode/InterpBuiltinBitCast.cpp | 8 ++--- clang/lib/AST/Decl.cpp | 16 ++++++--- clang/lib/AST/DeclCXX.cpp | 2 +- clang/lib/AST/Expr.cpp | 3 +- clang/lib/AST/ExprConstant.cpp | 2 +- clang/lib/AST/Randstruct.cpp | 2 +- clang/lib/AST/RecordLayoutBuilder.cpp | 6 ++-- clang/lib/CodeGen/ABIInfo.cpp | 2 +- clang/lib/CodeGen/ABIInfoImpl.cpp | 2 +- clang/lib/CodeGen/CGCall.cpp | 6 ++-- clang/lib/CodeGen/CGClass.cpp | 2 +- clang/lib/CodeGen/CGDebugInfo.cpp | 8 ++--- clang/lib/CodeGen/CGNonTrivialStruct.cpp | 6 ++-- clang/lib/CodeGen/CGObjCMac.cpp | 3 +- clang/lib/CodeGen/CGObjCRuntime.cpp | 2 +- clang/lib/CodeGen/CGRecordLayoutBuilder.cpp | 20 +++++------ clang/lib/CodeGen/SwiftCallingConv.cpp | 2 +- clang/lib/CodeGen/Targets/LoongArch.cpp | 2 +- clang/lib/CodeGen/Targets/RISCV.cpp | 2 +- clang/lib/CodeGen/Targets/X86.cpp | 2 +- clang/lib/CodeGen/Targets/XCore.cpp | 2 +- .../Frontend/Rewrite/RewriteModernObjC.cpp | 3 +- clang/lib/Sema/SemaChecking.cpp | 10 +++--- clang/lib/Sema/SemaDecl.cpp | 21 +++++------ clang/lib/Sema/SemaDeclCXX.cpp | 6 ++-- clang/lib/Sema/SemaDeclObjC.cpp | 3 +- clang/lib/Sema/SemaOverload.cpp | 2 +- clang/lib/StaticAnalyzer/Core/RegionStore.cpp | 2 +- clang/tools/libclang/CXType.cpp | 2 +- clang/unittests/AST/ASTImporterTest.cpp | 4 +-- 
.../TypeSystem/Clang/TypeSystemClang.cpp | 2 ++ .../x86/no_unique_address-with-bitfields.cpp | 36 ++++++++++++++----- 40 files changed, 125 insertions(+), 103 deletions(-) diff --git a/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.cpp index a950704208c73..408390ebc70b6 100644 --- a/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.cpp @@ -38,7 +38,7 @@ AST_MATCHER(FieldDecl, hasIntBitwidth) { assert(Node.isBitField()); const ASTContext &Ctx = Node.getASTContext(); unsigned IntBitWidth = Ctx.getIntWidth(Ctx.IntTy); - unsigned CurrentBitWidth = Node.getBitWidthValue(Ctx); + unsigned CurrentBitWidth = Node.getBitWidthValue(); return IntBitWidth == CurrentBitWidth; } diff --git a/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.cpp index a73d46f01d9b2..4ceeefb78ee82 100644 --- a/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.cpp @@ -124,7 +124,7 @@ static MagnitudeBits calcMagnitudeBits(const ASTContext &Context, unsigned SignedBits = IntExprType->isUnsignedIntegerType() ? 0U : 1U; if (const auto *BitField = IntExpr->getSourceBitField()) { - unsigned BitFieldWidth = BitField->getBitWidthValue(Context); + unsigned BitFieldWidth = BitField->getBitWidthValue(); return {BitFieldWidth - SignedBits, BitFieldWidth}; } diff --git a/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.cpp b/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.cpp index 47dafca2d03ff..7028c3958f103 100644 --- a/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.cpp +++ b/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.cpp @@ -160,7 +160,7 @@ void MultiwayPathsCoveredCheck::handleSwitchWithoutDefault( } if (const auto *BitfieldDecl = Result.Nodes.getNodeAs("bitfield")) { - return twoPow(BitfieldDecl->getBitWidthValue(*Result.Context)); + return twoPow(BitfieldDecl->getBitWidthValue()); } return static_cast(0); diff --git a/clang-tools-extra/clangd/Hover.cpp b/clang-tools-extra/clangd/Hover.cpp index 298fa79e3fd0b..5e136d0e76ece 100644 --- a/clang-tools-extra/clangd/Hover.cpp +++ b/clang-tools-extra/clangd/Hover.cpp @@ -1018,7 +1018,7 @@ void addLayoutInfo(const NamedDecl &ND, HoverInfo &HI) { const ASTRecordLayout &Layout = Ctx.getASTRecordLayout(Record); HI.Offset = Layout.getFieldOffset(FD->getFieldIndex()); if (FD->isBitField()) - HI.Size = FD->getBitWidthValue(Ctx); + HI.Size = FD->getBitWidthValue(); else if (auto Size = Ctx.getTypeSizeInCharsIfKnown(FD->getType())) HI.Size = FD->isZeroSize(Ctx) ? 0 : Size->getQuantity() * 8; if (HI.Size) { diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h index 16fc98aa1a57f..9c470f0940637 100644 --- a/clang/include/clang/AST/Decl.h +++ b/clang/include/clang/AST/Decl.h @@ -3154,7 +3154,9 @@ class FieldDecl : public DeclaratorDecl, public Mergeable { /// Computes the bit width of this field, if this is a bit field. /// May not be called on non-bitfields. - unsigned getBitWidthValue(const ASTContext &Ctx) const; + /// Note that in order to successfully use this function, the bitwidth + /// expression must be a ConstantExpr with a valid integer result set. + unsigned getBitWidthValue() const; /// Set the bit-field width for this member. 
// Note: used by some clients (i.e., do not remove it). @@ -3185,7 +3187,7 @@ class FieldDecl : public DeclaratorDecl, public Mergeable { /// Is this a zero-length bit-field? Such bit-fields aren't really bit-fields /// at all and instead act as a separator between contiguous runs of other /// bit-fields. - bool isZeroLengthBitField(const ASTContext &Ctx) const; + bool isZeroLengthBitField() const; /// Determine if this field is a subobject of zero size, that is, either a /// zero-length bit-field or a field of empty class type with the diff --git a/clang/include/clang/ASTMatchers/ASTMatchers.h b/clang/include/clang/ASTMatchers/ASTMatchers.h index f32170c93bee2..239fcba4e5e05 100644 --- a/clang/include/clang/ASTMatchers/ASTMatchers.h +++ b/clang/include/clang/ASTMatchers/ASTMatchers.h @@ -708,8 +708,7 @@ AST_MATCHER(FieldDecl, isBitField) { /// fieldDecl(hasBitWidth(2)) /// matches 'int a;' and 'int c;' but not 'int b;'. AST_MATCHER_P(FieldDecl, hasBitWidth, unsigned, Width) { - return Node.isBitField() && - Node.getBitWidthValue(Finder->getASTContext()) == Width; + return Node.isBitField() && Node.getBitWidthValue() == Width; } /// Matches non-static data members that have an in-class initializer. diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 8f04b58841964..be1dd29d46278 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -2795,7 +2795,7 @@ getSubobjectSizeInBits(const FieldDecl *Field, const ASTContext &Context, if (Field->isUnnamedBitField()) return 0; - int64_t BitfieldSize = Field->getBitWidthValue(Context); + int64_t BitfieldSize = Field->getBitWidthValue(); if (IsBitIntType) { if ((unsigned)BitfieldSize > cast(Field->getType())->getNumBits()) @@ -7769,7 +7769,7 @@ QualType ASTContext::isPromotableBitField(Expr *E) const { QualType FT = Field->getType(); - uint64_t BitWidth = Field->getBitWidthValue(*this); + uint64_t BitWidth = Field->getBitWidthValue(); uint64_t IntSize = getTypeSize(IntTy); // C++ [conv.prom]p5: // A prvalue for an integral bit-field can be converted to a prvalue of type @@ -8797,7 +8797,7 @@ static void EncodeBitField(const ASTContext *Ctx, std::string& S, S += getObjCEncodingForPrimitiveType(Ctx, BT); } } - S += llvm::utostr(FD->getBitWidthValue(*Ctx)); + S += llvm::utostr(FD->getBitWidthValue()); } // Helper function for determining whether the encoded type string would include @@ -9223,7 +9223,7 @@ void ASTContext::getObjCEncodingForStructureImpl(RecordDecl *RDecl, } for (FieldDecl *Field : RDecl->fields()) { - if (!Field->isZeroLengthBitField(*this) && Field->isZeroSize(*this)) + if (!Field->isZeroLengthBitField() && Field->isZeroSize(*this)) continue; uint64_t offs = layout.getFieldOffset(Field->getFieldIndex()); FieldOrBaseOffsets.insert(FieldOrBaseOffsets.upper_bound(offs), @@ -9320,7 +9320,7 @@ void ASTContext::getObjCEncodingForStructureImpl(RecordDecl *RDecl, if (field->isBitField()) { EncodeBitField(this, S, field->getType(), field); #ifndef NDEBUG - CurOffs += field->getBitWidthValue(*this); + CurOffs += field->getBitWidthValue(); #endif } else { QualType qt = field->getType(); diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index d2aec69072e04..93a91976a31bf 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -1471,8 +1471,7 @@ bool InitThisBitField(InterpState &S, CodePtr OpPC, const Record::Field *F, return false; const Pointer &Field = This.atField(FieldOffset); const auto &Value = S.Stk.pop(); - Field.deref() = - 
Value.truncate(F->Decl->getBitWidthValue(S.getASTContext())); + Field.deref() = Value.truncate(F->Decl->getBitWidthValue()); Field.initialize(); return true; } @@ -1495,8 +1494,7 @@ bool InitBitField(InterpState &S, CodePtr OpPC, const Record::Field *F) { assert(F->isBitField()); const T &Value = S.Stk.pop(); const Pointer &Field = S.Stk.peek().atField(F->Offset); - Field.deref() = - Value.truncate(F->Decl->getBitWidthValue(S.getASTContext())); + Field.deref() = Value.truncate(F->Decl->getBitWidthValue()); Field.activate(); Field.initialize(); return true; @@ -1750,7 +1748,7 @@ bool StoreBitField(InterpState &S, CodePtr OpPC) { if (Ptr.canBeInitialized()) Ptr.initialize(); if (const auto *FD = Ptr.getField()) - Ptr.deref() = Value.truncate(FD->getBitWidthValue(S.getASTContext())); + Ptr.deref() = Value.truncate(FD->getBitWidthValue()); else Ptr.deref() = Value; return true; @@ -1765,7 +1763,7 @@ bool StoreBitFieldPop(InterpState &S, CodePtr OpPC) { if (Ptr.canBeInitialized()) Ptr.initialize(); if (const auto *FD = Ptr.getField()) - Ptr.deref() = Value.truncate(FD->getBitWidthValue(S.getASTContext())); + Ptr.deref() = Value.truncate(FD->getBitWidthValue()); else Ptr.deref() = Value; return true; diff --git a/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp b/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp index 57c1fab5d6ab4..f4c54551a9a60 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp @@ -269,7 +269,7 @@ bool clang::interp::readPointerToBuffer(const Context &Ctx, Bits BitWidth = FullBitWidth; if (const FieldDecl *FD = P.getField(); FD && FD->isBitField()) - BitWidth = Bits(std::min(FD->getBitWidthValue(ASTCtx), + BitWidth = Bits(std::min(FD->getBitWidthValue(), (unsigned)FullBitWidth.getQuantity())); else if (T == PT_Bool && PackedBools) BitWidth = Bits(1); @@ -301,8 +301,8 @@ bool clang::interp::readPointerToBuffer(const Context &Ctx, assert(NumBits.isFullByte()); assert(NumBits.getQuantity() <= FullBitWidth.getQuantity()); F.bitcastToMemory(Buff.get()); - // Now, only (maybe) swap the actual size of the float, excluding the - // padding bits. + // Now, only (maybe) swap the actual size of the float, excluding + // the padding bits. 
if (llvm::sys::IsBigEndianHost) swapBytes(Buff.get(), NumBits.roundToBytes()); @@ -406,7 +406,7 @@ bool clang::interp::DoBitCastPtr(InterpState &S, CodePtr OpPC, Bits BitWidth; if (const FieldDecl *FD = P.getField(); FD && FD->isBitField()) - BitWidth = Bits(std::min(FD->getBitWidthValue(ASTCtx), + BitWidth = Bits(std::min(FD->getBitWidthValue(), (unsigned)FullBitWidth.getQuantity())); else if (T == PT_Bool && PackedBools) BitWidth = Bits(1); diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index 97e23dd1aaa92..31749e46458d6 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -4599,18 +4599,24 @@ void FieldDecl::setLazyInClassInitializer(LazyDeclStmtPtr NewInit) { Init = NewInit; } -unsigned FieldDecl::getBitWidthValue(const ASTContext &Ctx) const { +unsigned FieldDecl::getBitWidthValue() const { assert(isBitField() && "not a bitfield"); - return getBitWidth()->EvaluateKnownConstInt(Ctx).getZExtValue(); + assert(isa(getBitWidth())); + assert(cast(getBitWidth())->hasAPValueResult()); + assert(cast(getBitWidth())->getAPValueResult().isInt()); + return cast(getBitWidth()) + ->getAPValueResult() + .getInt() + .getZExtValue(); } -bool FieldDecl::isZeroLengthBitField(const ASTContext &Ctx) const { +bool FieldDecl::isZeroLengthBitField() const { return isUnnamedBitField() && !getBitWidth()->isValueDependent() && - getBitWidthValue(Ctx) == 0; + getBitWidthValue() == 0; } bool FieldDecl::isZeroSize(const ASTContext &Ctx) const { - if (isZeroLengthBitField(Ctx)) + if (isZeroLengthBitField()) return true; // C++2a [intro.object]p7: diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp index 8989e46e4847c..4163342118c2d 100644 --- a/clang/lib/AST/DeclCXX.cpp +++ b/clang/lib/AST/DeclCXX.cpp @@ -993,7 +993,7 @@ void CXXRecordDecl::addedMember(Decl *D) { // C++ [meta.unary.prop]p4: [LWG2358] // T is a class type [...] with [...] 
no unnamed bit-fields of non-zero // length - if (data().Empty && !Field->isZeroLengthBitField(Context) && + if (data().Empty && !Field->isZeroLengthBitField() && Context.getLangOpts().getClangABICompat() > LangOptions::ClangABI::Ver6) data().Empty = false; diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index ba66d36278567..5331357b5d1fe 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -196,8 +196,7 @@ bool Expr::isKnownToHaveBooleanValue(bool Semantic) const { if (const FieldDecl *FD = E->getSourceBitField()) if (!Semantic && FD->getType()->isUnsignedIntegerType() && - !FD->getBitWidth()->isValueDependent() && - FD->getBitWidthValue(FD->getASTContext()) == 1) + !FD->getBitWidth()->isValueDependent() && FD->getBitWidthValue() == 1) return true; return false; diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 5768bb12ee38e..2e680d1569f60 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -2875,7 +2875,7 @@ static bool truncateBitfieldValue(EvalInfo &Info, const Expr *E, APSInt &Int = Value.getInt(); unsigned OldBitWidth = Int.getBitWidth(); - unsigned NewBitWidth = FD->getBitWidthValue(Info.Ctx); + unsigned NewBitWidth = FD->getBitWidthValue(); if (NewBitWidth < OldBitWidth) Int = Int.trunc(NewBitWidth).extend(OldBitWidth); return true; diff --git a/clang/lib/AST/Randstruct.cpp b/clang/lib/AST/Randstruct.cpp index b484afa4997bb..4537ba5309e0b 100644 --- a/clang/lib/AST/Randstruct.cpp +++ b/clang/lib/AST/Randstruct.cpp @@ -91,7 +91,7 @@ void randomizeStructureLayoutImpl(const ASTContext &Context, auto FieldIter = FieldsOut.begin(); FieldDecl *FD = *FieldIter; - if (FD->isBitField() && !FD->isZeroLengthBitField(Context)) { + if (FD->isBitField() && !FD->isZeroLengthBitField()) { // Start a bitfield run if this is the first bitfield we have found. 
if (!CurrentBitfieldRun) CurrentBitfieldRun = std::make_unique(); diff --git a/clang/lib/AST/RecordLayoutBuilder.cpp b/clang/lib/AST/RecordLayoutBuilder.cpp index 4493dd00d26d8..ae6d299024c6d 100644 --- a/clang/lib/AST/RecordLayoutBuilder.cpp +++ b/clang/lib/AST/RecordLayoutBuilder.cpp @@ -1529,7 +1529,7 @@ static bool isAIXLayout(const ASTContext &Context) { void ItaniumRecordLayoutBuilder::LayoutBitField(const FieldDecl *D) { bool FieldPacked = Packed || D->hasAttr(); - uint64_t FieldSize = D->getBitWidthValue(Context); + uint64_t FieldSize = D->getBitWidthValue(); TypeInfo FieldInfo = Context.getTypeInfo(D->getType()); uint64_t StorageUnitSize = FieldInfo.Width; unsigned FieldAlign = FieldInfo.Align; @@ -3009,7 +3009,7 @@ void MicrosoftRecordLayoutBuilder::layoutField(const FieldDecl *FD) { } void MicrosoftRecordLayoutBuilder::layoutBitField(const FieldDecl *FD) { - unsigned Width = FD->getBitWidthValue(Context); + unsigned Width = FD->getBitWidthValue(); if (Width == 0) { layoutZeroWidthBitField(FD); return; @@ -3677,7 +3677,7 @@ static void DumpRecordLayout(raw_ostream &OS, const RecordDecl *RD, if (Field->isBitField()) { uint64_t LocalFieldByteOffsetInBits = C.toBits(FieldOffset - Offset); unsigned Begin = LocalFieldOffsetInBits - LocalFieldByteOffsetInBits; - unsigned Width = Field->getBitWidthValue(C); + unsigned Width = Field->getBitWidthValue(); PrintBitFieldOffset(OS, FieldOffset, Begin, Width, IndentLevel); } else { PrintOffset(OS, FieldOffset, IndentLevel); diff --git a/clang/lib/CodeGen/ABIInfo.cpp b/clang/lib/CodeGen/ABIInfo.cpp index 8e76cf15b642c..642bca9e8b76d 100644 --- a/clang/lib/CodeGen/ABIInfo.cpp +++ b/clang/lib/CodeGen/ABIInfo.cpp @@ -106,7 +106,7 @@ bool ABIInfo::isHomogeneousAggregate(QualType Ty, const Type *&Base, continue; if (isZeroLengthBitfieldPermittedInHomogeneousAggregate() && - FD->isZeroLengthBitField(getContext())) + FD->isZeroLengthBitField()) continue; uint64_t FldMembers; diff --git a/clang/lib/CodeGen/ABIInfoImpl.cpp b/clang/lib/CodeGen/ABIInfoImpl.cpp index 79300df15d0e2..795874059bda7 100644 --- a/clang/lib/CodeGen/ABIInfoImpl.cpp +++ b/clang/lib/CodeGen/ABIInfoImpl.cpp @@ -303,7 +303,7 @@ bool CodeGen::isEmptyRecord(ASTContext &Context, QualType T, bool AllowArrays, bool CodeGen::isEmptyFieldForLayout(const ASTContext &Context, const FieldDecl *FD) { - if (FD->isZeroLengthBitField(Context)) + if (FD->isZeroLengthBitField()) return true; if (FD->isUnnamedBitField()) diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 379ce00b739ae..473675d78c66f 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -954,7 +954,7 @@ getTypeExpansion(QualType Ty, const ASTContext &Context) { CharUnits UnionSize = CharUnits::Zero(); for (const auto *FD : RD->fields()) { - if (FD->isZeroLengthBitField(Context)) + if (FD->isZeroLengthBitField()) continue; assert(!FD->isBitField() && "Cannot expand structure with bit-field members."); @@ -974,7 +974,7 @@ getTypeExpansion(QualType Ty, const ASTContext &Context) { } for (const auto *FD : RD->fields()) { - if (FD->isZeroLengthBitField(Context)) + if (FD->isZeroLengthBitField()) continue; assert(!FD->isBitField() && "Cannot expand structure with bit-field members."); @@ -3682,7 +3682,7 @@ static void setUsedBits(CodeGenModule &CGM, const RecordType *RTy, int Offset, for (auto I = RD->field_begin(), E = RD->field_end(); I != E; ++I, ++Idx) { const FieldDecl *F = *I; - if (F->isUnnamedBitField() || F->isZeroLengthBitField(Context) || + if (F->isUnnamedBitField() || 
F->isZeroLengthBitField() || F->getType()->isIncompleteArrayType()) continue; diff --git a/clang/lib/CodeGen/CGClass.cpp b/clang/lib/CodeGen/CGClass.cpp index 16a1c1cebdaa4..7a1096fcbca82 100644 --- a/clang/lib/CodeGen/CGClass.cpp +++ b/clang/lib/CodeGen/CGClass.cpp @@ -945,7 +945,7 @@ namespace { ASTContext &Ctx = CGF.getContext(); unsigned LastFieldSize = LastField->isBitField() - ? LastField->getBitWidthValue(Ctx) + ? LastField->getBitWidthValue() : Ctx.toBits( Ctx.getTypeInfoDataSizeInChars(LastField->getType()).Width); uint64_t MemcpySizeBits = LastFieldOffset + LastFieldSize - diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index 560d4ce293365..d7e5e95b7873a 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -1721,8 +1721,7 @@ llvm::DIDerivedType *CGDebugInfo::createBitFieldSeparatorIfNeeded( assert(PreviousBitfield->isBitField()); - ASTContext &Context = CGM.getContext(); - if (!PreviousBitfield->isZeroLengthBitField(Context)) + if (!PreviousBitfield->isZeroLengthBitField()) return nullptr; QualType Ty = PreviousBitfield->getType(); @@ -3214,9 +3213,8 @@ llvm::DIType *CGDebugInfo::CreateTypeDefinition(const ObjCInterfaceType *Ty, if (!FType->isIncompleteArrayType()) { // Bit size, align and offset of the type. - FieldSize = Field->isBitField() - ? Field->getBitWidthValue(CGM.getContext()) - : CGM.getContext().getTypeSize(FType); + FieldSize = Field->isBitField() ? Field->getBitWidthValue() + : CGM.getContext().getTypeSize(FType); FieldAlign = getTypeAlignIfRequired(FType, CGM.getContext()); } diff --git a/clang/lib/CodeGen/CGNonTrivialStruct.cpp b/clang/lib/CodeGen/CGNonTrivialStruct.cpp index 6a02e4dbf84d1..d90c44d770d14 100644 --- a/clang/lib/CodeGen/CGNonTrivialStruct.cpp +++ b/clang/lib/CodeGen/CGNonTrivialStruct.cpp @@ -25,7 +25,7 @@ using namespace CodeGen; static uint64_t getFieldSize(const FieldDecl *FD, QualType FT, ASTContext &Ctx) { if (FD && FD->isBitField()) - return FD->getBitWidthValue(Ctx); + return FD->getBitWidthValue(); return Ctx.getTypeSize(FT); } @@ -255,7 +255,7 @@ struct GenBinaryFuncName : CopyStructVisitor, IsMove>, void visitVolatileTrivial(QualType FT, const FieldDecl *FD, CharUnits CurStructOffset) { // Zero-length bit-fields don't need to be copied/assigned. - if (FD && FD->isZeroLengthBitField(this->Ctx)) + if (FD && FD->isZeroLengthBitField()) return; // Because volatile fields can be bit-fields and are individually copied, @@ -544,7 +544,7 @@ struct GenBinaryFunc : CopyStructVisitor, LValue DstLV, SrcLV; if (FD) { // No need to copy zero-length bit-fields. - if (FD->isZeroLengthBitField(this->CGF->getContext())) + if (FD->isZeroLengthBitField()) return; QualType RT = QualType(FD->getParent()->getTypeForDecl(), 0); diff --git a/clang/lib/CodeGen/CGObjCMac.cpp b/clang/lib/CodeGen/CGObjCMac.cpp index 7b85dcc2c7984..dd900f9b32fb7 100644 --- a/clang/lib/CodeGen/CGObjCMac.cpp +++ b/clang/lib/CodeGen/CGObjCMac.cpp @@ -2543,8 +2543,7 @@ void CGObjCCommonMac::BuildRCRecordLayout(const llvm::StructLayout *RecLayout, if (LastFieldBitfieldOrUnnamed) { if (LastFieldBitfieldOrUnnamed->isBitField()) { // Last field was a bitfield. Must update the info. 
- uint64_t BitFieldSize - = LastFieldBitfieldOrUnnamed->getBitWidthValue(CGM.getContext()); + uint64_t BitFieldSize = LastFieldBitfieldOrUnnamed->getBitWidthValue(); unsigned UnsSize = (BitFieldSize / ByteSizeInBits) + ((BitFieldSize % ByteSizeInBits) != 0); CharUnits Size = CharUnits::fromQuantity(UnsSize); diff --git a/clang/lib/CodeGen/CGObjCRuntime.cpp b/clang/lib/CodeGen/CGObjCRuntime.cpp index 01d0f35da1964..b438a92a4fd62 100644 --- a/clang/lib/CodeGen/CGObjCRuntime.cpp +++ b/clang/lib/CodeGen/CGObjCRuntime.cpp @@ -89,7 +89,7 @@ LValue CGObjCRuntime::EmitValueForIvarAtOffset(CodeGen::CodeGenFunction &CGF, CGF.CGM.getContext().lookupFieldBitOffset(OID, nullptr, Ivar); uint64_t BitOffset = FieldBitOffset % CGF.CGM.getContext().getCharWidth(); uint64_t AlignmentBits = CGF.CGM.getTarget().getCharAlign(); - uint64_t BitFieldSize = Ivar->getBitWidthValue(CGF.getContext()); + uint64_t BitFieldSize = Ivar->getBitWidthValue(); CharUnits StorageSize = CGF.CGM.getContext().toCharUnitsFromBits( llvm::alignTo(BitOffset + BitFieldSize, AlignmentBits)); CharUnits Alignment = CGF.CGM.getContext().toCharUnitsFromBits(AlignmentBits); diff --git a/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp b/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp index 209ef236f0d5d..232e2d8b43ca1 100644 --- a/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp +++ b/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp @@ -148,8 +148,8 @@ struct CGRecordLowering { llvm::Type *Type = Types.ConvertTypeForMem(FD->getType()); if (!FD->isBitField()) return Type; if (isDiscreteBitFieldABI()) return Type; - return getIntNType(std::min(FD->getBitWidthValue(Context), - (unsigned)Context.toBits(getSize(Type)))); + return getIntNType(std::min(FD->getBitWidthValue(), + (unsigned)Context.toBits(getSize(Type)))); } /// Gets the llvm Basesubobject type from a CXXRecordDecl. llvm::Type *getStorageType(const CXXRecordDecl *RD) const { @@ -242,7 +242,7 @@ void CGRecordLowering::setBitFieldInfo( CGBitFieldInfo &Info = BitFields[FD->getCanonicalDecl()]; Info.IsSigned = FD->getType()->isSignedIntegerOrEnumerationType(); Info.Offset = (unsigned)(getFieldBitOffset(FD) - Context.toBits(StartOffset)); - Info.Size = FD->getBitWidthValue(Context); + Info.Size = FD->getBitWidthValue(); Info.StorageSize = (unsigned)DataLayout.getTypeAllocSizeInBits(StorageType); Info.StorageOffset = StartOffset; if (Info.Size > Info.StorageSize) @@ -322,7 +322,7 @@ void CGRecordLowering::lowerUnion(bool isNonVirtualBaseType) { // been doing and cause lit tests to change. for (const auto *Field : D->fields()) { if (Field->isBitField()) { - if (Field->isZeroLengthBitField(Context)) + if (Field->isZeroLengthBitField()) continue; llvm::Type *FieldType = getStorageType(Field); if (LayoutSize < getSize(FieldType)) @@ -423,7 +423,7 @@ CGRecordLowering::accumulateBitFields(bool isNonVirtualBaseType, uint64_t StartBitOffset, Tail = 0; for (; Field != FieldEnd && Field->isBitField(); ++Field) { // Zero-width bitfields end runs. - if (Field->isZeroLengthBitField(Context)) { + if (Field->isZeroLengthBitField()) { Run = FieldEnd; continue; } @@ -559,7 +559,7 @@ CGRecordLowering::accumulateBitFields(bool isNonVirtualBaseType, // Bitfield potentially begins a new span. This includes zero-length // bitfields on non-aligning targets that lie at character boundaries // (those are barriers to merging). 
- if (Field->isZeroLengthBitField(Context)) + if (Field->isZeroLengthBitField()) Barrier = true; AtAlignedBoundary = true; } @@ -697,7 +697,7 @@ CGRecordLowering::accumulateBitFields(bool isNonVirtualBaseType, } Members.push_back(StorageInfo(BeginOffset, Type)); for (; Begin != BestEnd; ++Begin) - if (!Begin->isZeroLengthBitField(Context)) + if (!Begin->isZeroLengthBitField()) Members.push_back( MemberInfo(BeginOffset, MemberInfo::Field, nullptr, *Begin)); } @@ -709,7 +709,7 @@ CGRecordLowering::accumulateBitFields(bool isNonVirtualBaseType, "Accumulating past end of bitfields"); assert(!Barrier && "Accumulating across barrier"); // Accumulate this bitfield into the current (potential) span. - BitSizeSinceBegin += Field->getBitWidthValue(Context); + BitSizeSinceBegin += Field->getBitWidthValue(); ++Field; } } @@ -813,7 +813,7 @@ void CGRecordLowering::computeVolatileBitfields() { bool Conflict = false; for (const auto *F : D->fields()) { // Allow sized bit-fields overlaps. - if (F->isBitField() && !F->isZeroLengthBitField(Context)) + if (F->isBitField() && !F->isZeroLengthBitField()) continue; const CharUnits FOffset = Context.toCharUnitsFromBits( @@ -823,7 +823,7 @@ void CGRecordLowering::computeVolatileBitfields() { // fields after and before it should be race condition free. // The AAPCS acknowledges it and imposes no restritions when the // natural container overlaps a zero-length bit-field. - if (F->isZeroLengthBitField(Context)) { + if (F->isZeroLengthBitField()) { if (End > FOffset && StorageOffset < FOffset) { Conflict = true; break; diff --git a/clang/lib/CodeGen/SwiftCallingConv.cpp b/clang/lib/CodeGen/SwiftCallingConv.cpp index 0873896df213e..1ff4ece2811ec 100644 --- a/clang/lib/CodeGen/SwiftCallingConv.cpp +++ b/clang/lib/CodeGen/SwiftCallingConv.cpp @@ -186,7 +186,7 @@ void SwiftAggLowering::addBitFieldData(const FieldDecl *bitfield, uint64_t bitfieldBitBegin) { assert(bitfield->isBitField()); auto &ctx = CGM.getContext(); - auto width = bitfield->getBitWidthValue(ctx); + auto width = bitfield->getBitWidthValue(); // We can ignore zero-width bit-fields. if (width == 0) return; diff --git a/clang/lib/CodeGen/Targets/LoongArch.cpp b/clang/lib/CodeGen/Targets/LoongArch.cpp index 6af9375461f09..6c90e48a5ea41 100644 --- a/clang/lib/CodeGen/Targets/LoongArch.cpp +++ b/clang/lib/CodeGen/Targets/LoongArch.cpp @@ -192,7 +192,7 @@ bool LoongArchABIInfo::detectFARsEligibleStructHelper( for (const FieldDecl *FD : RD->fields()) { QualType QTy = FD->getType(); if (FD->isBitField()) { - unsigned BitWidth = FD->getBitWidthValue(getContext()); + unsigned BitWidth = FD->getBitWidthValue(); // Zero-width bitfields are ignored. if (BitWidth == 0) continue; diff --git a/clang/lib/CodeGen/Targets/RISCV.cpp b/clang/lib/CodeGen/Targets/RISCV.cpp index 873e696e1328f..2b70f2bd3f38b 100644 --- a/clang/lib/CodeGen/Targets/RISCV.cpp +++ b/clang/lib/CodeGen/Targets/RISCV.cpp @@ -246,7 +246,7 @@ bool RISCVABIInfo::detectFPCCEligibleStructHelper(QualType Ty, CharUnits CurOff, uint64_t FieldOffInBits = Layout.getFieldOffset(FD->getFieldIndex()); QualType QTy = FD->getType(); if (FD->isBitField()) { - unsigned BitWidth = FD->getBitWidthValue(getContext()); + unsigned BitWidth = FD->getBitWidthValue(); // Allow a bitfield with a type greater than XLen as long as the // bitwidth is XLen or less. 
if (getContext().getTypeSize(QTy) > XLen && BitWidth <= XLen) diff --git a/clang/lib/CodeGen/Targets/X86.cpp b/clang/lib/CodeGen/Targets/X86.cpp index 7f73bf2a65266..5ee5179dd0f3e 100644 --- a/clang/lib/CodeGen/Targets/X86.cpp +++ b/clang/lib/CodeGen/Targets/X86.cpp @@ -2130,7 +2130,7 @@ void X86_64ABIInfo::classify(QualType Ty, uint64_t OffsetBase, Class &Lo, if (BitField) { assert(!i->isUnnamedBitField()); uint64_t Offset = OffsetBase + Layout.getFieldOffset(idx); - uint64_t Size = i->getBitWidthValue(getContext()); + uint64_t Size = i->getBitWidthValue(); uint64_t EB_Lo = Offset / 64; uint64_t EB_Hi = (Offset + Size - 1) / 64; diff --git a/clang/lib/CodeGen/Targets/XCore.cpp b/clang/lib/CodeGen/Targets/XCore.cpp index f3e241171b872..ced4981fd124f 100644 --- a/clang/lib/CodeGen/Targets/XCore.cpp +++ b/clang/lib/CodeGen/Targets/XCore.cpp @@ -343,7 +343,7 @@ static bool extractFieldType(SmallVectorImpl &FE, if (Field->isBitField()) { Enc += "b("; llvm::raw_svector_ostream OS(Enc); - OS << Field->getBitWidthValue(CGM.getContext()); + OS << Field->getBitWidthValue(); Enc += ':'; } if (!appendType(Enc, Field->getType(), CGM, TSC)) diff --git a/clang/lib/Frontend/Rewrite/RewriteModernObjC.cpp b/clang/lib/Frontend/Rewrite/RewriteModernObjC.cpp index 8cdb463e2c99f..fc65559e9d4a5 100644 --- a/clang/lib/Frontend/Rewrite/RewriteModernObjC.cpp +++ b/clang/lib/Frontend/Rewrite/RewriteModernObjC.cpp @@ -3699,7 +3699,8 @@ void RewriteModernObjC::RewriteObjCFieldDecl(FieldDecl *fieldDecl, Type.getAsStringInternal(Name, Context->getPrintingPolicy()); Result += Name; if (fieldDecl->isBitField()) { - Result += " : "; Result += utostr(fieldDecl->getBitWidthValue(*Context)); + Result += " : "; + Result += utostr(fieldDecl->getBitWidthValue()); } else if (EleboratedType && Type->isArrayType()) { const ArrayType *AT = Context->getAsArrayType(Type); diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 28dcfaac2e84f..881907ac311a3 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -658,7 +658,7 @@ struct BuiltinDumpStructGenerator { Format += ": %zu "; QualType SizeT = S.Context.getSizeType(); llvm::APInt BitWidth(S.Context.getIntWidth(SizeT), - FD->getBitWidthValue(S.Context)); + FD->getBitWidthValue()); Args.push_back(IntegerLiteral::Create(S.Context, BitWidth, SizeT, Loc)); } @@ -10027,7 +10027,7 @@ static std::optional TryGetExprRange(ASTContext &C, const Expr *E, Approximate); if (const auto *BitField = E->getSourceBitField()) - return IntRange(BitField->getBitWidthValue(C), + return IntRange(BitField->getBitWidthValue(), BitField->getType()->isUnsignedIntegerOrEnumerationType()); if (GetExprType(E)->isVoidType()) @@ -10580,7 +10580,7 @@ static bool AnalyzeBitFieldAssignment(Sema &S, FieldDecl *Bitfield, Expr *Init, return false; Expr *OriginalInit = Init->IgnoreParenImpCasts(); - unsigned FieldWidth = Bitfield->getBitWidthValue(S.Context); + unsigned FieldWidth = Bitfield->getBitWidthValue(); Expr::EvalResult Result; if (!OriginalInit->EvaluateAsInt(Result, S.Context, @@ -14044,8 +14044,8 @@ static bool isLayoutCompatible(const ASTContext &C, const FieldDecl *Field1, if (Field1->isBitField()) { // Make sure that the bit-fields are the same length. 
- unsigned Bits1 = Field1->getBitWidthValue(C); - unsigned Bits2 = Field2->getBitWidthValue(C); + unsigned Bits1 = Field1->getBitWidthValue(); + unsigned Bits2 = Field2->getBitWidthValue(); if (Bits1 != Bits2) return false; diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 0d476fea880b9..8724c20fd7b65 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -18357,7 +18357,9 @@ ExprResult Sema::VerifyBitField(SourceLocation FieldLoc, } } - return BitWidth; + if (isa(BitWidth)) + return BitWidth; + return ConstantExpr::Create(getASTContext(), BitWidth, APValue{Value}); } Decl *Sema::ActOnField(Scope *S, Decl *TagD, SourceLocation DeclStart, @@ -18732,7 +18734,7 @@ void Sema::ActOnLastBitfield(SourceLocation DeclLoc, Decl *ivarDecl = AllIvarDecls[AllIvarDecls.size()-1]; ObjCIvarDecl *Ivar = cast(ivarDecl); - if (!Ivar->isBitField() || Ivar->isZeroLengthBitField(Context)) + if (!Ivar->isBitField() || Ivar->isZeroLengthBitField()) return; ObjCInterfaceDecl *ID = dyn_cast(CurContext); if (!ID) { @@ -18747,14 +18749,13 @@ void Sema::ActOnLastBitfield(SourceLocation DeclLoc, // All conditions are met. Add a new bitfield to the tail end of ivars. llvm::APInt Zero(Context.getTypeSize(Context.IntTy), 0); Expr * BW = IntegerLiteral::Create(Context, Zero, Context.IntTy, DeclLoc); + Expr *BitWidth = + ConstantExpr::Create(Context, BW, APValue(llvm::APSInt(Zero))); - Ivar = ObjCIvarDecl::Create(Context, cast(CurContext), - DeclLoc, DeclLoc, nullptr, - Context.CharTy, - Context.getTrivialTypeSourceInfo(Context.CharTy, - DeclLoc), - ObjCIvarDecl::Private, BW, - true); + Ivar = ObjCIvarDecl::Create( + Context, cast(CurContext), DeclLoc, DeclLoc, nullptr, + Context.CharTy, Context.getTrivialTypeSourceInfo(Context.CharTy, DeclLoc), + ObjCIvarDecl::Private, BitWidth, true); AllIvarDecls.push_back(Ivar); } @@ -19384,7 +19385,7 @@ void Sema::ActOnFields(Scope *S, SourceLocation RecLoc, Decl *EnclosingDecl, (NonBitFields == 0 || ZeroSize) && I != E; ++I) { IsEmpty = false; if (I->isUnnamedBitField()) { - if (!I->isZeroLengthBitField(Context)) + if (!I->isZeroLengthBitField()) ZeroSize = false; } else { ++NonBitFields; diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index c5a72cf812ebc..c4bee44f5ec04 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -4877,7 +4877,7 @@ BuildImplicitMemberInitializer(Sema &SemaRef, CXXConstructorDecl *Constructor, QualType ParamType = Param->getType().getNonReferenceType(); // Suppress copying zero-width bitfields. - if (Field->isZeroLengthBitField(SemaRef.Context)) + if (Field->isZeroLengthBitField()) return false; Expr *MemberExprBase = @@ -15041,7 +15041,7 @@ void Sema::DefineImplicitCopyAssignment(SourceLocation CurrentLocation, } // Suppress assigning zero-width bitfields. - if (Field->isZeroLengthBitField(Context)) + if (Field->isZeroLengthBitField()) continue; QualType FieldType = Field->getType().getNonReferenceType(); @@ -15428,7 +15428,7 @@ void Sema::DefineImplicitMoveAssignment(SourceLocation CurrentLocation, } // Suppress assigning zero-width bitfields. 
- if (Field->isZeroLengthBitField(Context)) + if (Field->isZeroLengthBitField()) continue; QualType FieldType = Field->getType().getNonReferenceType(); diff --git a/clang/lib/Sema/SemaDeclObjC.cpp b/clang/lib/Sema/SemaDeclObjC.cpp index 6e6174ba17c55..f97f17e8c9658 100644 --- a/clang/lib/Sema/SemaDeclObjC.cpp +++ b/clang/lib/Sema/SemaDeclObjC.cpp @@ -2210,8 +2210,7 @@ void SemaObjC::CheckImplementationIvars(ObjCImplementationDecl *ImpDecl, << ImplIvar->getType() << ClsIvar->getType(); Diag(ClsIvar->getLocation(), diag::note_previous_definition); } else if (ImplIvar->isBitField() && ClsIvar->isBitField() && - ImplIvar->getBitWidthValue(Context) != - ClsIvar->getBitWidthValue(Context)) { + ImplIvar->getBitWidthValue() != ClsIvar->getBitWidthValue()) { Diag(ImplIvar->getBitWidth()->getBeginLoc(), diag::err_conflicting_ivar_bitwidth) << ImplIvar->getIdentifier(); diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 3be9ade80f1d9..34c287926b1d7 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -516,7 +516,7 @@ NarrowingKind StandardConversionSequence::getNarrowingKind( if (const FieldDecl *BitField = Initializer->getSourceBitField()) { if (BitField->getBitWidth()->isValueDependent()) DependentBitField = true; - else if (unsigned BitFieldWidth = BitField->getBitWidthValue(Ctx); + else if (unsigned BitFieldWidth = BitField->getBitWidthValue(); BitFieldWidth < FromWidth) { if (CanRepresentAll(FromSigned, BitFieldWidth, ToSigned, ToWidth)) return NK_Not_Narrowing; diff --git a/clang/lib/StaticAnalyzer/Core/RegionStore.cpp b/clang/lib/StaticAnalyzer/Core/RegionStore.cpp index ad45ab5757a5a..6266878565c52 100644 --- a/clang/lib/StaticAnalyzer/Core/RegionStore.cpp +++ b/clang/lib/StaticAnalyzer/Core/RegionStore.cpp @@ -931,7 +931,7 @@ collectSubRegionBindings(SmallVectorImpl &Bindings, Length = ExtentInt.getLimitedValue() * SVB.getContext().getCharWidth(); } else if (const FieldRegion *FR = dyn_cast(Top)) { if (FR->getDecl()->isBitField()) - Length = FR->getDecl()->getBitWidthValue(SVB.getContext()); + Length = FR->getDecl()->getBitWidthValue(); } for (const auto &StoreEntry : Cluster) { diff --git a/clang/tools/libclang/CXType.cpp b/clang/tools/libclang/CXType.cpp index f97023c429bfa..f1b661435c499 100644 --- a/clang/tools/libclang/CXType.cpp +++ b/clang/tools/libclang/CXType.cpp @@ -395,7 +395,7 @@ int clang_getFieldDeclBitWidth(CXCursor C) { if (const FieldDecl *FD = dyn_cast_or_null(D)) { if (FD->isBitField() && !FD->getBitWidth()->isValueDependent()) - return FD->getBitWidthValue(getCursorContext(C)); + return FD->getBitWidthValue(); } } diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp index d197d30df3adf..ef76ee80d7c7b 100644 --- a/clang/unittests/AST/ASTImporterTest.cpp +++ b/clang/unittests/AST/ASTImporterTest.cpp @@ -3392,12 +3392,12 @@ TEST_P(ASTImporterOptionSpecificTestBase, ImportBitfields) { FirstDeclMatcher().match(FromTU, fieldDecl(hasName("x"))); ASSERT_TRUE(FromF->isBitField()); - ASSERT_EQ(3u, FromF->getBitWidthValue(FromTU->getASTContext())); + ASSERT_EQ(3u, FromF->getBitWidthValue()); auto *ToField = Import(FromF, Lang_CXX03); auto *ToTU = ToField->getTranslationUnitDecl(); EXPECT_TRUE(ToField->isBitField()); - EXPECT_EQ(3u, ToField->getBitWidthValue(ToTU->getASTContext())); + EXPECT_EQ(3u, ToField->getBitWidthValue()); const auto *FromBT = FromF->getBitWidth()->getType()->getAs(); const auto *ToBT = ToField->getBitWidth()->getType()->getAs(); diff --git 
a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp index 06c04c992efc0..47051f2e68090 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp @@ -7433,6 +7433,8 @@ clang::FieldDecl *TypeSystemClang::AddFieldToRecordType( bit_width = new (clang_ast) clang::IntegerLiteral(clang_ast, bitfield_bit_size_apint, clang_ast.IntTy, clang::SourceLocation()); + bit_width = clang::ConstantExpr::Create( + clang_ast, bit_width, APValue(llvm::APSInt(bitfield_bit_size_apint))); } clang::RecordDecl *record_decl = ast->GetAsRecordDecl(type); diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-with-bitfields.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-with-bitfields.cpp index 980180e7be9ae..297fb82caee5f 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-with-bitfields.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-with-bitfields.cpp @@ -13,7 +13,9 @@ // CHECK: |-FieldDecl {{.*}} data 'char[5]' // CHECK-NEXT: |-FieldDecl {{.*}} padding 'Empty' // CHECK-NEXT: `-FieldDecl {{.*}} flag 'unsigned long' -// CHECK-NEXT: `-IntegerLiteral {{.*}} 'int' 1 +// CHECK-NEXT: `-ConstantExpr {{.*}} +// CHECK-NEXT: |-value: Int 1 +// CHECK-NEXT: `-IntegerLiteral {{.*}} 'int' 1 struct Empty {}; struct Empty2 {}; @@ -33,7 +35,9 @@ Foo global; // CHECK-NEXT: |-FieldDecl {{.*}} p2 'Empty2' // CHECK-NEXT: |-FieldDecl {{.*}} p3 'Empty3' // CHECK-NEXT: `-FieldDecl {{.*}} flag 'unsigned long' -// CHECK-NEXT: `-IntegerLiteral {{.*}} 'int' 1 +// CHECK-NEXT: `-ConstantExpr {{.*}} +// CHECK-NEXT: |-value: Int 1 +// CHECK-NEXT: `-IntegerLiteral {{.*}} 'int' 1 struct ConsecutiveOverlap { char data[5]; @@ -51,10 +55,14 @@ ConsecutiveOverlap global2; // CHECK: |-FieldDecl {{.*}} data 'char[5]' // CHECK-NEXT: |-FieldDecl {{.*}} p1 'Empty' // CHECK-NEXT: |-FieldDecl {{.*}} f1 'unsigned long' -// CHECK-NEXT: | `-IntegerLiteral {{.*}} 'int' 1 +// CHECK-NEXT: | `-ConstantExpr {{.*}} +// CHECK-NEXT: | |-value: Int 1 +// CHECK-NEXT: | `-IntegerLiteral {{.*}} 'int' 1 // CHECK-NEXT: |-FieldDecl {{.*}} p2 'Empty2' // CHECK-NEXT: `-FieldDecl {{.*}} f2 'unsigned long' -// CHECK-NEXT: `-IntegerLiteral {{.*}} 'int' 1 +// CHECK-NEXT: | `-ConstantExpr {{.*}} +// CHECK-NEXT: | |-value: Int 1 +// CHECK-NEXT: | `-IntegerLiteral {{.*}} 'int' 1 struct MultipleAtOffsetZero { char data[5]; @@ -74,10 +82,14 @@ MultipleAtOffsetZero global3; // CHECK: |-FieldDecl {{.*}} data 'char[5]' // CHECK-NEXT: |-FieldDecl {{.*}} p1 'Empty' // CHECK-NEXT: |-FieldDecl {{.*}} f1 'unsigned long' -// CHECK-NEXT: | `-IntegerLiteral {{.*}} 'int' 1 +// CHECK-NEXT: | `-ConstantExpr {{.*}} +// CHECK-NEXT: | |-value: Int 1 +// CHECK-NEXT: | `-IntegerLiteral {{.*}} 'int' 1 // CHECK-NEXT: |-FieldDecl {{.*}} p2 'Empty' // CHECK-NEXT: `-FieldDecl {{.*}} f2 'unsigned long' -// CHECK-NEXT: `-IntegerLiteral {{.*}} 'int' 1 +// CHECK-NEXT: | `-ConstantExpr {{.*}} +// CHECK-NEXT: | |-value: Int 1 +// CHECK-NEXT: | `-IntegerLiteral {{.*}} 'int' 1 struct MultipleEmpty { char data[5]; @@ -93,12 +105,18 @@ MultipleEmpty global4; // CHECK: CXXRecordDecl {{.*}} struct FieldBitfieldOverlap definition // CHECK: |-FieldDecl {{.*}} a 'int' -// CHECK-NEXT: | `-IntegerLiteral {{.*}} 'int' 3 +// CHECK-NEXT: | `-ConstantExpr {{.*}} +// CHECK-NEXT: | |-value: Int 3 +// CHECK-NEXT: | `-IntegerLiteral {{.*}} 'int' 3 // CHECK-NEXT: |-FieldDecl {{.*}} p1 'Empty' // CHECK-NEXT: |-FieldDecl {{.*}} b 'int' -// 
CHECK-NEXT: | `-IntegerLiteral {{.*}} 'int' 6 +// CHECK-NEXT: | `-ConstantExpr {{.*}} +// CHECK-NEXT: | |-value: Int 6 +// CHECK-NEXT: | `-IntegerLiteral {{.*}} 'int' 6 // CHECK-NEXT: `-FieldDecl {{.*}} c 'int' -// CHECK-NEXT: `-IntegerLiteral {{.*}} 'int' 1 +// CHECK-NEXT: `-ConstantExpr {{.*}} +// CHECK-NEXT: |-value: Int 1 +// CHECK-NEXT: `-IntegerLiteral {{.*}} 'int' 1 struct FieldBitfieldOverlap { int a : 3; From 7c886d5d9265177e5dadb7ac5704cccffc3b95e0 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 10 Jan 2025 22:12:46 -0800 Subject: [PATCH 146/408] PassTimingInfo: test TheTimeInfo first. NFC TheTimeInfo is a member variable and is often non-null, allowing the caller `getPassTimer` to skip one check. --- llvm/lib/IR/PassTimingInfo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/IR/PassTimingInfo.cpp b/llvm/lib/IR/PassTimingInfo.cpp index 3816eff5c0f22..46db2c74a5c76 100644 --- a/llvm/lib/IR/PassTimingInfo.cpp +++ b/llvm/lib/IR/PassTimingInfo.cpp @@ -103,7 +103,7 @@ PassTimingInfo::~PassTimingInfo() { } void PassTimingInfo::init() { - if (!TimePassesIsEnabled || TheTimeInfo) + if (TheTimeInfo || !TimePassesIsEnabled) return; // Constructed the first time this is called, iff -time-passes is enabled. From 0384069d6ca5cd025cae414e65ab08f174fcc175 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 10 Jan 2025 23:03:59 -0800 Subject: [PATCH 147/408] [AArch64] Use GenericTable PrimaryKey to remove some SearchIndexes. NFC --- .../Target/AArch64/AArch64SystemOperands.td | 144 +++++++----------- 1 file changed, 54 insertions(+), 90 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/llvm/lib/Target/AArch64/AArch64SystemOperands.td index 355a9d2a0415a..df5db8fa514a1 100644 --- a/llvm/lib/Target/AArch64/AArch64SystemOperands.td +++ b/llvm/lib/Target/AArch64/AArch64SystemOperands.td @@ -61,6 +61,9 @@ def ATValues : GenericEnum { def ATsList : GenericTable { let FilterClass = "AT"; let Fields = ["Name", "Encoding", "Requires"]; + + let PrimaryKey = ["Encoding"]; + let PrimaryKeyName = "lookupATByEncoding"; } def lookupATByName : SearchIndex { @@ -68,11 +71,6 @@ def lookupATByName : SearchIndex { let Key = ["Name"]; } -def lookupATByEncoding : SearchIndex { - let Table = ATsList; - let Key = ["Encoding"]; -} - def : AT<"S1E1R", 0b000, 0b0111, 0b1000, 0b000>; def : AT<"S1E2R", 0b100, 0b0111, 0b1000, 0b000>; def : AT<"S1E3R", 0b110, 0b0111, 0b1000, 0b000>; @@ -114,6 +112,9 @@ def DBValues : GenericEnum { def DBsList : GenericTable { let FilterClass = "DB"; let Fields = ["Name", "Encoding"]; + + let PrimaryKey = ["Encoding"]; + let PrimaryKeyName = "lookupDBByEncoding"; } def lookupDBByName : SearchIndex { @@ -121,11 +122,6 @@ def lookupDBByName : SearchIndex { let Key = ["Name"]; } -def lookupDBByEncoding : SearchIndex { - let Table = DBsList; - let Key = ["Encoding"]; -} - def : DB<"oshld", 0x1>; def : DB<"oshst", 0x2>; def : DB<"osh", 0x3>; @@ -155,6 +151,9 @@ def DBnXSValues : GenericEnum { def DBnXSsList : GenericTable { let FilterClass = "DBnXS"; let Fields = ["Name", "Encoding", "ImmValue", "Requires"]; + + let PrimaryKey = ["Encoding"]; + let PrimaryKeyName = "lookupDBnXSByEncoding"; } def lookupDBnXSByName : SearchIndex { @@ -162,11 +161,6 @@ def lookupDBnXSByName : SearchIndex { let Key = ["Name"]; } -def lookupDBnXSByEncoding : SearchIndex { - let Table = DBnXSsList; - let Key = ["Encoding"]; -} - def lookupDBnXSByImmValue : SearchIndex { let Table = DBnXSsList; let Key = ["ImmValue"]; @@ -201,6 +195,9 @@ def DCValues : 
GenericEnum { def DCsList : GenericTable { let FilterClass = "DC"; let Fields = ["Name", "Encoding", "Requires"]; + + let PrimaryKey = ["Encoding"]; + let PrimaryKeyName = "lookupDCByEncoding"; } def lookupDCByName : SearchIndex { @@ -208,11 +205,6 @@ def lookupDCByName : SearchIndex { let Key = ["Name"]; } -def lookupDCByEncoding : SearchIndex { - let Table = DCsList; - let Key = ["Encoding"]; -} - def : DC<"ZVA", 0b011, 0b0111, 0b0100, 0b001>; def : DC<"IVAC", 0b000, 0b0111, 0b0110, 0b001>; def : DC<"ISW", 0b000, 0b0111, 0b0110, 0b010>; @@ -289,6 +281,9 @@ def ICValues : GenericEnum { def ICsList : GenericTable { let FilterClass = "IC"; let Fields = ["Name", "Encoding", "NeedsReg"]; + + let PrimaryKey = ["Encoding"]; + let PrimaryKeyName = "lookupICByEncoding"; } def lookupICByName : SearchIndex { @@ -296,11 +291,6 @@ def lookupICByName : SearchIndex { let Key = ["Name"]; } -def lookupICByEncoding : SearchIndex { - let Table = ICsList; - let Key = ["Encoding"]; -} - def : IC<"IALLUIS", 0b000, 0b0111, 0b0001, 0b000, 0>; def : IC<"IALLU", 0b000, 0b0111, 0b0101, 0b000, 0>; def : IC<"IVAU", 0b011, 0b0111, 0b0101, 0b001, 1>; @@ -324,6 +314,9 @@ def ISBValues : GenericEnum { def ISBsList : GenericTable { let FilterClass = "ISB"; let Fields = ["Name", "Encoding"]; + + let PrimaryKey = ["Encoding"]; + let PrimaryKeyName = "lookupISBByEncoding"; } def lookupISBByName : SearchIndex { @@ -331,11 +324,6 @@ def lookupISBByName : SearchIndex { let Key = ["Name"]; } -def lookupISBByEncoding : SearchIndex { - let Table = ISBsList; - let Key = ["Encoding"]; -} - def : ISB<"sy", 0xf>; //===----------------------------------------------------------------------===// @@ -359,6 +347,9 @@ def TSBValues : GenericEnum { def TSBsList : GenericTable { let FilterClass = "TSB"; let Fields = ["Name", "Encoding", "Requires"]; + + let PrimaryKey = ["Encoding"]; + let PrimaryKeyName = "lookupTSBByEncoding"; } def lookupTSBByName : SearchIndex { @@ -366,11 +357,6 @@ def lookupTSBByName : SearchIndex { let Key = ["Name"]; } -def lookupTSBByEncoding : SearchIndex { - let Table = TSBsList; - let Key = ["Encoding"]; -} - def : TSB<"csync", 0>; //===----------------------------------------------------------------------===// @@ -398,6 +384,9 @@ def PRFMValues : GenericEnum { def PRFMsList : GenericTable { let FilterClass = "PRFM"; let Fields = ["Name", "Encoding", "Requires"]; + + let PrimaryKey = ["Encoding"]; + let PrimaryKeyName = "lookupPRFMByEncoding"; } def lookupPRFMByName : SearchIndex { @@ -405,11 +394,6 @@ def lookupPRFMByName : SearchIndex { let Key = ["Name"]; } -def lookupPRFMByEncoding : SearchIndex { - let Table = PRFMsList; - let Key = ["Encoding"]; -} - def : PRFM<"pld", 0b00, "l1", 0b00, "keep", 0b0>; def : PRFM<"pld", 0b00, "l1", 0b00, "strm", 0b1>; def : PRFM<"pld", 0b00, "l2", 0b01, "keep", 0b0>; @@ -461,6 +445,9 @@ def SVEPRFMValues : GenericEnum { def SVEPRFMsList : GenericTable { let FilterClass = "SVEPRFM"; let Fields = ["Name", "Encoding", "Requires"]; + + let PrimaryKey = ["Encoding"]; + let PrimaryKeyName = "lookupSVEPRFMByEncoding"; } def lookupSVEPRFMByName : SearchIndex { @@ -468,11 +455,6 @@ def lookupSVEPRFMByName : SearchIndex { let Key = ["Name"]; } -def lookupSVEPRFMByEncoding : SearchIndex { - let Table = SVEPRFMsList; - let Key = ["Encoding"]; -} - let Requires = [{ {AArch64::FeatureSVE} }] in { def : SVEPRFM<"pldl1keep", 0x00>; def : SVEPRFM<"pldl1strm", 0x01>; @@ -545,6 +527,9 @@ def SVEPREDPATValues : GenericEnum { def SVEPREDPATsList : GenericTable { let FilterClass = "SVEPREDPAT"; let 
Fields = ["Name", "Encoding"]; + + let PrimaryKey = ["Encoding"]; + let PrimaryKeyName = "lookupSVEPREDPATByEncoding"; } def lookupSVEPREDPATByName : SearchIndex { @@ -552,11 +537,6 @@ def lookupSVEPREDPATByName : SearchIndex { let Key = ["Name"]; } -def lookupSVEPREDPATByEncoding : SearchIndex { - let Table = SVEPREDPATsList; - let Key = ["Encoding"]; -} - def : SVEPREDPAT<"pow2", 0x00>; def : SVEPREDPAT<"vl1", 0x01>; def : SVEPREDPAT<"vl2", 0x02>; @@ -594,6 +574,9 @@ def SVEVECLENSPECIFIERValues : GenericEnum { def SVEVECLENSPECIFIERsList : GenericTable { let FilterClass = "SVEVECLENSPECIFIER"; let Fields = ["Name", "Encoding"]; + + let PrimaryKey = ["Encoding"]; + let PrimaryKeyName = "lookupSVEVECLENSPECIFIERByEncoding"; } def lookupSVEVECLENSPECIFIERByName : SearchIndex { @@ -601,11 +584,6 @@ def lookupSVEVECLENSPECIFIERByName : SearchIndex { let Key = ["Name"]; } -def lookupSVEVECLENSPECIFIERByEncoding : SearchIndex { - let Table = SVEVECLENSPECIFIERsList; - let Key = ["Encoding"]; -} - def : SVEVECLENSPECIFIER<"vlx2", 0x0>; def : SVEVECLENSPECIFIER<"vlx4", 0x1>; @@ -664,6 +642,9 @@ def PStateImm0_15Values : GenericEnum { def PStateImm0_15sList : GenericTable { let FilterClass = "PStateImm0_15"; let Fields = ["Name", "Encoding", "Requires"]; + + let PrimaryKey = ["Encoding"]; + let PrimaryKeyName = "lookupPStateImm0_15ByEncoding"; } def lookupPStateImm0_15ByName : SearchIndex { @@ -671,11 +652,6 @@ def lookupPStateImm0_15ByName : SearchIndex { let Key = ["Name"]; } -def lookupPStateImm0_15ByEncoding : SearchIndex { - let Table = PStateImm0_15sList; - let Key = ["Encoding"]; -} - class PStateImm0_1 op1, bits<3> op2, bits<3> crm_high> { string Name = name; bits<9> Encoding; @@ -694,6 +670,9 @@ def PStateImm0_1Values : GenericEnum { def PStateImm0_1sList : GenericTable { let FilterClass = "PStateImm0_1"; let Fields = ["Name", "Encoding", "Requires"]; + + let PrimaryKey = ["Encoding"]; + let PrimaryKeyName = "lookupPStateImm0_1ByEncoding"; } def lookupPStateImm0_1ByName : SearchIndex { @@ -701,11 +680,6 @@ def lookupPStateImm0_1ByName : SearchIndex { let Key = ["Name"]; } -def lookupPStateImm0_1ByEncoding : SearchIndex { - let Table = PStateImm0_1sList; - let Key = ["Encoding"]; -} - // Name, Op1, Op2 def : PStateImm0_15<"SPSel", 0b000, 0b101>; def : PStateImm0_15<"DAIFSet", 0b011, 0b110>; @@ -753,6 +727,9 @@ def SVCRValues : GenericEnum { def SVCRsList : GenericTable { let FilterClass = "SVCR"; let Fields = ["Name", "Encoding", "Requires"]; + + let PrimaryKey = ["Encoding"]; + let PrimaryKeyName = "lookupSVCRByEncoding"; } def lookupSVCRByName : SearchIndex { @@ -760,11 +737,6 @@ def lookupSVCRByName : SearchIndex { let Key = ["Name"]; } -def lookupSVCRByEncoding : SearchIndex { - let Table = SVCRsList; - let Key = ["Encoding"]; -} - let Requires = [{ {AArch64::FeatureSME} }] in { def : SVCR<"SVCRSM", 0b001>; def : SVCR<"SVCRZA", 0b010>; @@ -790,6 +762,9 @@ def PSBValues : GenericEnum { def PSBsList : GenericTable { let FilterClass = "PSB"; let Fields = ["Name", "Encoding"]; + + let PrimaryKey = ["Encoding"]; + let PrimaryKeyName = "lookupPSBByEncoding"; } def lookupPSBByName : SearchIndex { @@ -797,11 +772,6 @@ def lookupPSBByName : SearchIndex { let Key = ["Name"]; } -def lookupPSBByEncoding : SearchIndex { - let Table = PSBsList; - let Key = ["Encoding"]; -} - def : PSB<"csync", 0x11>; //===----------------------------------------------------------------------===// @@ -823,6 +793,9 @@ def BTIValues : GenericEnum { def BTIsList : GenericTable { let FilterClass = "BTI"; let Fields = 
["Name", "Encoding"]; + + let PrimaryKey = ["Encoding"]; + let PrimaryKeyName = "lookupBTIByEncoding"; } def lookupBTIByName : SearchIndex { @@ -830,11 +803,6 @@ def lookupBTIByName : SearchIndex { let Key = ["Name"]; } -def lookupBTIByEncoding : SearchIndex { - let Table = BTIsList; - let Key = ["Encoding"]; -} - def : BTI<"c", 0b010>; def : BTI<"j", 0b100>; def : BTI<"jc", 0b110>; @@ -861,6 +829,9 @@ def TLBITable : GenericTable { let FilterClass = "TLBIEntry"; let CppTypeName = "TLBI"; let Fields = ["Name", "Encoding", "NeedsReg", "RequiresStr"]; + + let PrimaryKey = ["Encoding"]; + let PrimaryKeyName = "lookupTLBIByEncoding"; } def lookupTLBIByName : SearchIndex { @@ -868,11 +839,6 @@ def lookupTLBIByName : SearchIndex { let Key = ["Name"]; } -def lookupTLBIByEncoding : SearchIndex { - let Table = TLBITable; - let Key = ["Encoding"]; -} - multiclass TLBI op1, bits<4> crn, bits<4> crm, bits<3> op2, bit needsreg = 1> { def : TLBIEntry; @@ -2382,6 +2348,9 @@ def PHintValues : GenericEnum { def PHintsList : GenericTable { let FilterClass = "PHint"; let Fields = ["Name", "Encoding", "Requires"]; + + let PrimaryKey = ["Encoding"]; + let PrimaryKeyName = "lookupPHintByEncoding"; } def lookupPHintByName : SearchIndex { @@ -2389,11 +2358,6 @@ def lookupPHintByName : SearchIndex { let Key = ["Name"]; } -def lookupPHintByEncoding : SearchIndex { - let Table = PHintsList; - let Key = ["Encoding"]; -} - let Requires = [{ {AArch64::FeaturePCDPHINT} }] in { def KEEP : PHint<0b00, 0b000, 0b0000, 0b0000, 0b000, "keep">; def STRM : PHint<0b00, 0b000, 0b0000, 0b0000, 0b001, "strm">; From f3d3ec86d1a40a2c86d743384d272ebcd0a1cbd8 Mon Sep 17 00:00:00 2001 From: khaki3 <47756807+khaki3@users.noreply.github.com> Date: Fri, 10 Jan 2025 23:39:16 -0800 Subject: [PATCH 148/408] [flang][acc] Add a missing acc.delete generation for the copyin clause (#122539) We are missing the deletion part of the copyin clause after a region or in a destructor. This PR completes its implementation for data regions, compute regions, and global declarations. 
Example: ```f90 subroutine sub() real :: x(1:10) !$acc data copyin(x) !$acc end data end subroutine sub ``` We are getting the following: ```mlir %5 = acc.copyin varPtr(%2#0 : !fir.ref>) bounds(%4) -> !fir.ref> {name = "x"} acc.data dataOperands(%5 : !fir.ref>) { acc.terminator } return ``` With this PR, we'll get: ```mlir %5 = acc.copyin varPtr(%2#0 : !fir.ref>) bounds(%4) -> !fir.ref> {name = "x"} acc.data dataOperands(%5 : !fir.ref>) { acc.terminator } acc.delete accPtr(%5 : !fir.ref>) bounds(%4) {dataClause = #acc, name = "x"} return ``` --- flang/lib/Lower/OpenACC.cpp | 35 ++++++++++++++----- .../test/Lower/OpenACC/acc-data-operands.f90 | 13 ++++--- flang/test/Lower/OpenACC/acc-data.f90 | 3 ++ .../Lower/OpenACC/acc-declare-globals.f90 | 8 +++++ flang/test/Lower/OpenACC/acc-declare.f90 | 10 +++--- flang/test/Lower/OpenACC/acc-kernels-loop.f90 | 2 ++ flang/test/Lower/OpenACC/acc-kernels.f90 | 3 ++ .../test/Lower/OpenACC/acc-parallel-loop.f90 | 2 ++ flang/test/Lower/OpenACC/acc-parallel.f90 | 3 ++ flang/test/Lower/OpenACC/acc-serial-loop.f90 | 2 ++ flang/test/Lower/OpenACC/acc-serial.f90 | 3 ++ 11 files changed, 67 insertions(+), 17 deletions(-) diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp index 8155c36396b11..86d8a549331fb 100644 --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -2183,8 +2183,9 @@ static Op createComputeOp( mlir::Value ifCond; mlir::Value selfCond; llvm::SmallVector waitOperands, attachEntryOperands, - copyEntryOperands, copyoutEntryOperands, createEntryOperands, - dataClauseOperands, numGangs, numWorkers, vectorLength, async; + copyEntryOperands, copyinEntryOperands, copyoutEntryOperands, + createEntryOperands, dataClauseOperands, numGangs, numWorkers, + vectorLength, async; llvm::SmallVector numGangsDeviceTypes, numWorkersDeviceTypes, vectorLengthDeviceTypes, asyncDeviceTypes, asyncOnlyDeviceTypes, waitOperandsDeviceTypes, waitOnlyDeviceTypes; @@ -2321,6 +2322,7 @@ static Op createComputeOp( dataClauseOperands.end()); } else if (const auto *copyinClause = std::get_if(&clause.u)) { + auto crtDataStart = dataClauseOperands.size(); genDataOperandOperationsWithModifier( copyinClause, converter, semanticsContext, stmtCtx, @@ -2328,6 +2330,8 @@ static Op createComputeOp( dataClauseOperands, mlir::acc::DataClause::acc_copyin, mlir::acc::DataClause::acc_copyin_readonly, async, asyncDeviceTypes, asyncOnlyDeviceTypes); + copyinEntryOperands.append(dataClauseOperands.begin() + crtDataStart, + dataClauseOperands.end()); } else if (const auto *copyoutClause = std::get_if( &clause.u)) { @@ -2525,6 +2529,8 @@ static Op createComputeOp( // Create the exit operations after the region. 
genDataExitOperations( builder, copyEntryOperands, /*structured=*/true); + genDataExitOperations( + builder, copyinEntryOperands, /*structured=*/true); genDataExitOperations( builder, copyoutEntryOperands, /*structured=*/true); genDataExitOperations( @@ -2544,8 +2550,8 @@ static void genACCDataOp(Fortran::lower::AbstractConverter &converter, const Fortran::parser::AccClauseList &accClauseList) { mlir::Value ifCond; llvm::SmallVector attachEntryOperands, createEntryOperands, - copyEntryOperands, copyoutEntryOperands, dataClauseOperands, waitOperands, - async; + copyEntryOperands, copyinEntryOperands, copyoutEntryOperands, + dataClauseOperands, waitOperands, async; llvm::SmallVector asyncDeviceTypes, asyncOnlyDeviceTypes, waitOperandsDeviceTypes, waitOnlyDeviceTypes; llvm::SmallVector waitOperandsSegments; @@ -2604,6 +2610,7 @@ static void genACCDataOp(Fortran::lower::AbstractConverter &converter, dataClauseOperands.end()); } else if (const auto *copyinClause = std::get_if(&clause.u)) { + auto crtDataStart = dataClauseOperands.size(); genDataOperandOperationsWithModifier( copyinClause, converter, semanticsContext, stmtCtx, @@ -2611,6 +2618,8 @@ static void genACCDataOp(Fortran::lower::AbstractConverter &converter, dataClauseOperands, mlir::acc::DataClause::acc_copyin, mlir::acc::DataClause::acc_copyin_readonly, async, asyncDeviceTypes, asyncOnlyDeviceTypes); + copyinEntryOperands.append(dataClauseOperands.begin() + crtDataStart, + dataClauseOperands.end()); } else if (const auto *copyoutClause = std::get_if( &clause.u)) { @@ -2723,6 +2732,8 @@ static void genACCDataOp(Fortran::lower::AbstractConverter &converter, // Create the exit operations after the region. genDataExitOperations( builder, copyEntryOperands, /*structured=*/true); + genDataExitOperations( + builder, copyinEntryOperands, /*structured=*/true); genDataExitOperations( builder, copyoutEntryOperands, /*structured=*/true); genDataExitOperations( @@ -3691,7 +3702,8 @@ genDeclareInFunction(Fortran::lower::AbstractConverter &converter, mlir::Location loc, const Fortran::parser::AccClauseList &accClauseList) { llvm::SmallVector dataClauseOperands, copyEntryOperands, - createEntryOperands, copyoutEntryOperands, deviceResidentEntryOperands; + copyinEntryOperands, createEntryOperands, copyoutEntryOperands, + deviceResidentEntryOperands; Fortran::lower::StatementContext stmtCtx; fir::FirOpBuilder &builder = converter.getFirOpBuilder(); @@ -3729,12 +3741,15 @@ genDeclareInFunction(Fortran::lower::AbstractConverter &converter, /*structured=*/true, /*implicit=*/false); } else if (const auto *copyinClause = std::get_if(&clause.u)) { + auto crtDataStart = dataClauseOperands.size(); genDeclareDataOperandOperationsWithModifier( copyinClause, converter, semanticsContext, stmtCtx, Fortran::parser::AccDataModifier::Modifier::ReadOnly, dataClauseOperands, mlir::acc::DataClause::acc_copyin, mlir::acc::DataClause::acc_copyin_readonly); + copyinEntryOperands.append(dataClauseOperands.begin() + crtDataStart, + dataClauseOperands.end()); } else if (const auto *copyoutClause = std::get_if( &clause.u)) { @@ -3801,12 +3816,14 @@ genDeclareInFunction(Fortran::lower::AbstractConverter &converter, } openAccCtx.attachCleanup([&builder, loc, createEntryOperands, - copyEntryOperands, copyoutEntryOperands, - deviceResidentEntryOperands, declareToken]() { + copyEntryOperands, copyinEntryOperands, + copyoutEntryOperands, deviceResidentEntryOperands, + declareToken]() { llvm::SmallVector operands; operands.append(createEntryOperands); 
operands.append(deviceResidentEntryOperands); operands.append(copyEntryOperands); + operands.append(copyinEntryOperands); operands.append(copyoutEntryOperands); mlir::func::FuncOp funcOp = builder.getFunction(); @@ -3825,6 +3842,8 @@ genDeclareInFunction(Fortran::lower::AbstractConverter &converter, builder, deviceResidentEntryOperands, /*structured=*/true); genDataExitOperations( builder, copyEntryOperands, /*structured=*/true); + genDataExitOperations( + builder, copyinEntryOperands, /*structured=*/true); genDataExitOperations( builder, copyoutEntryOperands, /*structured=*/true); }); @@ -3848,7 +3867,7 @@ genDeclareInModule(Fortran::lower::AbstractConverter &converter, } else if (const auto *copyinClause = std::get_if(&clause.u)) { genGlobalCtorsWithModifier( + mlir::acc::CopyinOp, mlir::acc::DeleteOp>( converter, modBuilder, copyinClause, Fortran::parser::AccDataModifier::Modifier::ReadOnly, mlir::acc::DataClause::acc_copyin, diff --git a/flang/test/Lower/OpenACC/acc-data-operands.f90 b/flang/test/Lower/OpenACC/acc-data-operands.f90 index 5151e8f6a135c..4341ea687070b 100644 --- a/flang/test/Lower/OpenACC/acc-data-operands.f90 +++ b/flang/test/Lower/OpenACC/acc-data-operands.f90 @@ -35,6 +35,7 @@ subroutine acc_operand_array_section() ! CHECK: acc.data dataOperands(%[[COPYIN]], %[[COPYOUT_CREATE]] : !fir.ref>, !fir.ref>) { ! CHECK: acc.terminator ! CHECK: } +! CHECK: acc.delete accPtr(%[[COPYIN]] : !fir.ref>) bounds(%[[BOUND_1_50]]) {dataClause = #acc, name = "a(1:50)"} ! CHECK: acc.copyout accPtr(%[[COPYOUT_CREATE]] : !fir.ref>) bounds(%[[BOUND_51_100]]) to varPtr(%[[DECL]]#0 : !fir.ref>) {name = "a(51:100)"} ! Testing array sections of a derived-type component @@ -138,9 +139,9 @@ subroutine acc_operand_array_section_allocatable() ! CHECK: %[[LOAD_BOX_A_2:.*]] = fir.load %[[DECLA]]#1 : !fir.ref>>> ! CHECK: %[[C0:.*]] = arith.constant 0 : index ! CHECK: %[[DIMS0_2:.*]]:3 = fir.box_dims %[[LOAD_BOX_A_2]], %[[C0]] : (!fir.box>>, index) -> (index, index, index) -! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) upperbound(%[[UB]] : index) extent(%[[DIMS0_2]]#1 : index) stride(%[[DIMS0_1]]#2 : index) startIdx(%[[DIMS0_0]]#0 : index) {strideInBytes = true} +! CHECK: %[[BOUND_1_50:.*]] = acc.bounds lowerbound(%[[LB]] : index) upperbound(%[[UB]] : index) extent(%[[DIMS0_2]]#1 : index) stride(%[[DIMS0_1]]#2 : index) startIdx(%[[DIMS0_0]]#0 : index) {strideInBytes = true} ! CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[LOAD_BOX_A_0]] : (!fir.box>>) -> !fir.heap> -! CHECK: %[[COPYIN:.*]] = acc.copyin varPtr(%[[BOX_ADDR]] : !fir.heap>) bounds(%[[BOUND]]) -> !fir.heap> {name = "a(1:50)"} +! CHECK: %[[COPYIN:.*]] = acc.copyin varPtr(%[[BOX_ADDR]] : !fir.heap>) bounds(%[[BOUND_1_50]]) -> !fir.heap> {name = "a(1:50)"} ! CHECK: %[[LOAD_BOX_A_0:.*]] = fir.load %[[DECLA]]#0 : !fir.ref>>> ! CHECK: %[[LOAD_BOX_A_1:.*]] = fir.load %[[DECLA]]#1 : !fir.ref>>> ! CHECK: %[[C0:.*]] = arith.constant 0 : index @@ -154,13 +155,14 @@ subroutine acc_operand_array_section_allocatable() ! CHECK: %[[LOAD_BOX_A_2:.*]] = fir.load %[[DECLA]]#1 : !fir.ref>>> ! CHECK: %[[C0:.*]] = arith.constant 0 : index ! CHECK: %[[DIMS0_2:.*]]:3 = fir.box_dims %[[LOAD_BOX_A_2]], %[[C0]] : (!fir.box>>, index) -> (index, index, index) -! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) upperbound(%[[UB]] : index) extent(%[[DIMS0_2]]#1 : index) stride(%[[DIMS0_1]]#2 : index) startIdx(%[[DIMS0_0]]#0 : index) {strideInBytes = true} +! 
CHECK: %[[BOUND_51_100:.*]] = acc.bounds lowerbound(%[[LB]] : index) upperbound(%[[UB]] : index) extent(%[[DIMS0_2]]#1 : index) stride(%[[DIMS0_1]]#2 : index) startIdx(%[[DIMS0_0]]#0 : index) {strideInBytes = true} ! CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[LOAD_BOX_A_0]] : (!fir.box>>) -> !fir.heap> -! CHECK: %[[COPYOUT_CREATE:.*]] = acc.create varPtr(%[[BOX_ADDR]] : !fir.heap>) bounds(%[[BOUND]]) -> !fir.heap> {dataClause = #acc, name = "a(51:100)"} +! CHECK: %[[COPYOUT_CREATE:.*]] = acc.create varPtr(%[[BOX_ADDR]] : !fir.heap>) bounds(%[[BOUND_51_100]]) -> !fir.heap> {dataClause = #acc, name = "a(51:100)"} ! CHECK: acc.data dataOperands(%[[COPYIN]], %[[COPYOUT_CREATE]] : !fir.heap>, !fir.heap>) { ! CHECK: acc.terminator ! CHECK: } -! CHECK: acc.copyout accPtr(%[[COPYOUT_CREATE]] : !fir.heap>) bounds(%[[BOUND]]) to varPtr(%[[BOX_ADDR]] : !fir.heap>) {name = "a(51:100)"} +! CHECK: acc.delete accPtr(%[[COPYIN]] : !fir.heap>) bounds(%[[BOUND_1_50]]) {dataClause = #acc, name = "a(1:50)"} +! CHECK: acc.copyout accPtr(%[[COPYOUT_CREATE]] : !fir.heap>) bounds(%[[BOUND_51_100]]) to varPtr(%[[BOX_ADDR]] : !fir.heap>) {name = "a(51:100)"} ! Testing array sections on pointer array @@ -196,6 +198,7 @@ subroutine acc_operand_array_section_pointer() ! CHECK: acc.data dataOperands(%[[COPYIN]] : !fir.ptr>) { ! CHECK: acc.terminator ! CHECK: } +! CHECK: acc.delete accPtr(%[[COPYIN]] : !fir.ptr>) bounds(%[[BOUND]]) {dataClause = #acc, name = "p(1:50)"} end module diff --git a/flang/test/Lower/OpenACC/acc-data.f90 b/flang/test/Lower/OpenACC/acc-data.f90 index 6e0ecb9129061..074f4d1135f10 100644 --- a/flang/test/Lower/OpenACC/acc-data.f90 +++ b/flang/test/Lower/OpenACC/acc-data.f90 @@ -74,6 +74,9 @@ subroutine acc_data ! CHECK: acc.data dataOperands(%[[COPYIN_A]], %[[COPYIN_B]], %[[COPYIN_C]] : !fir.ref>, !fir.ref>, !fir.ref>) { ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} +! CHECK: acc.delete accPtr(%[[COPYIN_A]] : !fir.ref>) bounds(%{{.*}}, %{{.*}}) {dataClause = #acc, name = "a"} +! CHECK: acc.delete accPtr(%[[COPYIN_B]] : !fir.ref>) bounds(%{{.*}}, %{{.*}}) {dataClause = #acc, name = "b"} +! CHECK: acc.delete accPtr(%[[COPYIN_C]] : !fir.ref>) bounds(%{{.*}}, %{{.*}}) {dataClause = #acc, name = "c"} !$acc data copyout(a) copyout(zero: b) copyout(c) !$acc end data diff --git a/flang/test/Lower/OpenACC/acc-declare-globals.f90 b/flang/test/Lower/OpenACC/acc-declare-globals.f90 index 1c54010dc108a..b64bbbcc0d12f 100644 --- a/flang/test/Lower/OpenACC/acc-declare-globals.f90 +++ b/flang/test/Lower/OpenACC/acc-declare-globals.f90 @@ -39,6 +39,14 @@ module acc_declare_copyin_test ! CHECK: acc.terminator ! CHECK: } +! CHECK-LABEL: acc.global_dtor @_QMacc_declare_copyin_testEdata1_acc_dtor { +! CHECK: %[[GLOBAL_ADDR:.*]] = fir.address_of(@_QMacc_declare_copyin_testEdata1) {acc.declare = #acc.declare} : !fir.ref> +! CHECK: %[[DEVICEPTR:.*]] = acc.getdeviceptr varPtr(%[[GLOBAL_ADDR]] : !fir.ref>) -> !fir.ref> {dataClause = #acc, name = "data1", structured = false} +! CHECK: acc.declare_exit dataOperands(%[[DEVICEPTR]] : !fir.ref>) +! CHECK: acc.delete accPtr(%[[DEVICEPTR]] : !fir.ref>) {dataClause = #acc, name = "data1", structured = false} +! CHECK: acc.terminator +! 
CHECK: } + module acc_declare_device_resident_test integer, parameter :: n = 5000 integer, dimension(n) :: data1 diff --git a/flang/test/Lower/OpenACC/acc-declare.f90 b/flang/test/Lower/OpenACC/acc-declare.f90 index 0066e712fbdcc..f40216e2c9fe8 100644 --- a/flang/test/Lower/OpenACC/acc-declare.f90 +++ b/flang/test/Lower/OpenACC/acc-declare.f90 @@ -82,12 +82,14 @@ subroutine acc_declare_copyin() ! CHECK: %[[ADECL:.*]]:2 = hlfir.declare %[[A]](%{{.*}}) {acc.declare = #acc.declare, uniq_name = "_QMacc_declareFacc_declare_copyinEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) ! CHECK: %[[B:.*]] = fir.alloca !fir.array<10xi32> {bindc_name = "b", uniq_name = "_QMacc_declareFacc_declare_copyinEb"} ! CHECK: %[[BDECL:.*]]:2 = hlfir.declare %[[B]](%{{.*}}) {acc.declare = #acc.declare, uniq_name = "_QMacc_declareFacc_declare_copyinEb"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) -! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) extent(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index) -! CHECK: %[[COPYIN_A:.*]] = acc.copyin varPtr(%[[ADECL]]#0 : !fir.ref>) bounds(%[[BOUND]]) -> !fir.ref> {name = "a"} -! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) extent(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index) -! CHECK: %[[COPYIN_B:.*]] = acc.copyin varPtr(%[[BDECL]]#0 : !fir.ref>) bounds(%[[BOUND]]) -> !fir.ref> {dataClause = #acc, name = "b"} +! CHECK: %[[BOUND_A:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) extent(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index) +! CHECK: %[[COPYIN_A:.*]] = acc.copyin varPtr(%[[ADECL]]#0 : !fir.ref>) bounds(%[[BOUND_A]]) -> !fir.ref> {name = "a"} +! CHECK: %[[BOUND_B:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) extent(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index) +! CHECK: %[[COPYIN_B:.*]] = acc.copyin varPtr(%[[BDECL]]#0 : !fir.ref>) bounds(%[[BOUND_B]]) -> !fir.ref> {dataClause = #acc, name = "b"} ! CHECK: acc.declare_enter dataOperands(%[[COPYIN_A]], %[[COPYIN_B]] : !fir.ref>, !fir.ref>) ! CHECK: %{{.*}}:2 = fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%arg{{.*}} = %{{.*}}) -> (index, i32) +! CHECK: acc.delete accPtr(%[[COPYIN_A]] : !fir.ref>) bounds(%[[BOUND_A]]) {dataClause = #acc, name = "a"} +! CHECK: acc.delete accPtr(%[[COPYIN_B]] : !fir.ref>) bounds(%[[BOUND_B]]) {dataClause = #acc, name = "b"} subroutine acc_declare_copyout() integer :: a(100), i diff --git a/flang/test/Lower/OpenACC/acc-kernels-loop.f90 b/flang/test/Lower/OpenACC/acc-kernels-loop.f90 index e5791f0e5b392..388a14d278d6c 100644 --- a/flang/test/Lower/OpenACC/acc-kernels-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-kernels-loop.f90 @@ -345,6 +345,8 @@ subroutine acc_kernels_loop ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} +! CHECK: acc.delete accPtr(%[[COPYIN_A]] : !fir.ref>) bounds(%{{.*}}) {dataClause = #acc, name = "a"} +! CHECK: acc.delete accPtr(%[[COPYIN_B]] : !fir.ref>) bounds(%{{.*}}) {dataClause = #acc, name = "b"} !$acc kernels loop copyout(a) copyout(zero: b) DO i = 1, n diff --git a/flang/test/Lower/OpenACC/acc-kernels.f90 b/flang/test/Lower/OpenACC/acc-kernels.f90 index ff4f1d3b54591..7282fee689cad 100644 --- a/flang/test/Lower/OpenACC/acc-kernels.f90 +++ b/flang/test/Lower/OpenACC/acc-kernels.f90 @@ -214,6 +214,9 @@ subroutine acc_kernels ! 
CHECK: acc.kernels dataOperands(%[[COPYIN_A]], %[[COPYIN_B]], %[[COPYIN_C]] : !fir.ref>, !fir.ref>, !fir.ref>) { ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} +! CHECK: acc.delete accPtr(%[[COPYIN_A]] : !fir.ref>) bounds(%{{.*}}, %{{.*}}) {dataClause = #acc, name = "a"} +! CHECK: acc.delete accPtr(%[[COPYIN_B]] : !fir.ref>) bounds(%{{.*}}, %{{.*}}) {dataClause = #acc, name = "b"} +! CHECK: acc.delete accPtr(%[[COPYIN_C]] : !fir.ref>) bounds(%{{.*}}, %{{.*}}) {dataClause = #acc, name = "c"} !$acc kernels copyout(a) copyout(zero: b) copyout(c) !$acc end kernels diff --git a/flang/test/Lower/OpenACC/acc-parallel-loop.f90 b/flang/test/Lower/OpenACC/acc-parallel-loop.f90 index 48ceda0710e8d..c4fc21a307b32 100644 --- a/flang/test/Lower/OpenACC/acc-parallel-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-parallel-loop.f90 @@ -345,6 +345,8 @@ subroutine acc_parallel_loop ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} +! CHECK: acc.delete accPtr(%[[COPYIN_A]] : !fir.ref>) bounds(%{{.*}}) {dataClause = #acc, name = "a"} +! CHECK: acc.delete accPtr(%[[COPYIN_B]] : !fir.ref>) bounds(%{{.*}}) {dataClause = #acc, name = "b"} !$acc parallel loop copyout(a) copyout(zero: b) DO i = 1, n diff --git a/flang/test/Lower/OpenACC/acc-parallel.f90 b/flang/test/Lower/OpenACC/acc-parallel.f90 index 5197e2b0bee09..6f6e3208d753a 100644 --- a/flang/test/Lower/OpenACC/acc-parallel.f90 +++ b/flang/test/Lower/OpenACC/acc-parallel.f90 @@ -244,6 +244,9 @@ subroutine acc_parallel ! CHECK: acc.parallel dataOperands(%[[COPYIN_A]], %[[COPYIN_B]], %[[COPYIN_C]] : !fir.ref>, !fir.ref>, !fir.ref>) { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} +! CHECK: acc.delete accPtr(%[[COPYIN_A]] : !fir.ref>) bounds(%{{.*}}, %{{.*}}) {dataClause = #acc, name = "a"} +! CHECK: acc.delete accPtr(%[[COPYIN_B]] : !fir.ref>) bounds(%{{.*}}, %{{.*}}) {dataClause = #acc, name = "b"} +! CHECK: acc.delete accPtr(%[[COPYIN_C]] : !fir.ref>) bounds(%{{.*}}, %{{.*}}) {dataClause = #acc, name = "c"} !$acc parallel copyout(a) copyout(zero: b) copyout(c) !$acc end parallel diff --git a/flang/test/Lower/OpenACC/acc-serial-loop.f90 b/flang/test/Lower/OpenACC/acc-serial-loop.f90 index fa3b3f758908c..167c05986a546 100644 --- a/flang/test/Lower/OpenACC/acc-serial-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-serial-loop.f90 @@ -286,6 +286,8 @@ subroutine acc_serial_loop ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} +! CHECK: acc.delete accPtr(%[[COPYIN_A]] : !fir.ref>) bounds(%{{.*}}) {dataClause = #acc, name = "a"} +! CHECK: acc.delete accPtr(%[[COPYIN_B]] : !fir.ref>) bounds(%{{.*}}) {dataClause = #acc, name = "b"} !$acc serial loop copyout(a) copyout(zero: b) DO i = 1, n diff --git a/flang/test/Lower/OpenACC/acc-serial.f90 b/flang/test/Lower/OpenACC/acc-serial.f90 index 284f61976a46d..6f79e18c71442 100644 --- a/flang/test/Lower/OpenACC/acc-serial.f90 +++ b/flang/test/Lower/OpenACC/acc-serial.f90 @@ -188,6 +188,9 @@ subroutine acc_serial ! CHECK: acc.serial dataOperands(%[[COPYIN_A]], %[[COPYIN_B]], %[[COPYIN_C]] : !fir.ref>, !fir.ref>, !fir.ref>) { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} +! CHECK: acc.delete accPtr(%[[COPYIN_A]] : !fir.ref>) bounds(%{{.*}}, %{{.*}}) {dataClause = #acc, name = "a"} +! CHECK: acc.delete accPtr(%[[COPYIN_B]] : !fir.ref>) bounds(%{{.*}}, %{{.*}}) {dataClause = #acc, name = "b"} +! 
CHECK: acc.delete accPtr(%[[COPYIN_C]] : !fir.ref>) bounds(%{{.*}}, %{{.*}}) {dataClause = #acc, name = "c"} !$acc serial copyout(a) copyout(zero: b) copyout(c) !$acc end serial From a3e62d849f07d1e55e6d15465f4f3842ff4b3717 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 11 Jan 2025 00:11:42 -0800 Subject: [PATCH 149/408] [AST] Fix a warning This patch fixes: clang/unittests/AST/ASTImporterTest.cpp:3397:9: error: unused variable 'ToTU' [-Werror,-Wunused-variable] --- clang/unittests/AST/ASTImporterTest.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp index ef76ee80d7c7b..f2bfde9bed372 100644 --- a/clang/unittests/AST/ASTImporterTest.cpp +++ b/clang/unittests/AST/ASTImporterTest.cpp @@ -3394,7 +3394,6 @@ TEST_P(ASTImporterOptionSpecificTestBase, ImportBitfields) { ASSERT_TRUE(FromF->isBitField()); ASSERT_EQ(3u, FromF->getBitWidthValue()); auto *ToField = Import(FromF, Lang_CXX03); - auto *ToTU = ToField->getTranslationUnitDecl(); EXPECT_TRUE(ToField->isBitField()); EXPECT_EQ(3u, ToField->getBitWidthValue()); From 26d513d197e14b824dd9d353aff38af1925c3770 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 11 Jan 2025 00:17:40 -0800 Subject: [PATCH 150/408] [TableGen] Migrate away from PointerUnion::{is,get} (NFC) (#122569) Note that PointerUnion::{is,get} have been soft deprecated in PointerUnion.h: // FIXME: Replace the uses of is(), get() and dyn_cast() with // isa, cast and the llvm::dyn_cast --- mlir/lib/TableGen/Operator.cpp | 4 ++-- mlir/lib/TableGen/Pattern.cpp | 7 +++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/mlir/lib/TableGen/Operator.cpp b/mlir/lib/TableGen/Operator.cpp index 904cc6637d53f..c360c61afd27b 100644 --- a/mlir/lib/TableGen/Operator.cpp +++ b/mlir/lib/TableGen/Operator.cpp @@ -231,7 +231,7 @@ unsigned Operator::getNumVariableLengthOperands() const { } bool Operator::hasSingleVariadicArg() const { - return getNumArgs() == 1 && getArg(0).is() && + return getNumArgs() == 1 && isa(getArg(0)) && getOperand(0).isVariadic(); } @@ -829,7 +829,7 @@ void Operator::print(llvm::raw_ostream &os) const { if (auto *attr = llvm::dyn_cast_if_present(arg)) os << "[attribute] " << attr->name << '\n'; else - os << "[operand] " << arg.get()->name << '\n'; + os << "[operand] " << cast(arg)->name << '\n'; } } diff --git a/mlir/lib/TableGen/Pattern.cpp b/mlir/lib/TableGen/Pattern.cpp index ffa0c067b0285..ac8c49c72d384 100644 --- a/mlir/lib/TableGen/Pattern.cpp +++ b/mlir/lib/TableGen/Pattern.cpp @@ -254,8 +254,7 @@ std::string SymbolInfoMap::SymbolInfo::getVarTypeStr(StringRef name) const { switch (kind) { case Kind::Attr: { if (op) - return op->getArg(getArgIndex()) - .get() + return cast(op->getArg(getArgIndex())) ->attr.getStorageType() .str(); // TODO(suderman): Use a more exact type when available. @@ -305,7 +304,7 @@ std::string SymbolInfoMap::SymbolInfo::getValueAndRangeUse( } case Kind::Operand: { assert(index < 0); - auto *operand = op->getArg(getArgIndex()).get(); + auto *operand = cast(op->getArg(getArgIndex())); // If this operand is variadic and this SymbolInfo doesn't have a range // index, then return the full variadic operand_range. Otherwise, return // the value itself. @@ -447,7 +446,7 @@ bool SymbolInfoMap::bindOpArgument(DagNode node, StringRef symbol, } auto symInfo = - op.getArg(argIndex).is() + isa(op.getArg(argIndex)) ? 
SymbolInfo::getAttr(&op, argIndex) : SymbolInfo::getOperand(node, &op, argIndex, variadicSubIndex); From 212cba0ef37dd3b2a253c063240370de42fc67c1 Mon Sep 17 00:00:00 2001 From: Michael Clark Date: Sat, 11 Jan 2025 21:51:20 +1300 Subject: [PATCH 151/408] [X86] Correct the cdisp8 encoding for VSCATTER/VGATHER prefetch (#122051) during differential fuzzing, I found 8 more instructions with disp8 offset multiplier differences to binutils. somewhat sure there is a bug in the X86 LLVM disp8 offset multipliers for this subset of vector scatter and gather prefetch instructions. please check and refer to the previous pull request: https://github.com/llvm/llvm-project/pull/120340 these vector scatter and gather prefetch instructions also have an unusual k mask operand position but I have not addressed this with this patch as I am unsure how to change the Intel format in the tablegen file. ``` hex: 62 f2 fd 49 c6 4c 51 01 llvm: vgatherpf0dpd {k1}, zmmword ptr [rcx + 2*ymm2 + 4] ours: vgatherpf0dpd qword ptr * 8 [rcx + 2*ymm2 + 8] {k1} gnu: vgatherpf0dpd QWORD PTR [rcx+ymm2*2+0x8]{k1} hex: 62 f2 7d 49 c7 4c 51 01 llvm: vgatherpf0qps {k1}, ymmword ptr [rcx + 2*zmm2 + 8] ours: vgatherpf0qps dword ptr * 8 [rcx + 2*zmm2 + 4] {k1} gnu: vgatherpf0qps DWORD PTR [rcx+zmm2*2+0x4]{k1} hex: 62 f2 fd 49 c6 54 51 01 llvm: vgatherpf1dpd {k1}, zmmword ptr [rcx + 2*ymm2 + 4] ours: vgatherpf1dpd qword ptr * 8 [rcx + 2*ymm2 + 8] {k1} gnu: vgatherpf1dpd QWORD PTR [rcx+ymm2*2+0x8]{k1} hex: 62 f2 7d 49 c7 54 51 01 llvm: vgatherpf1qps {k1}, ymmword ptr [rcx + 2*zmm2 + 8] ours: vgatherpf1qps dword ptr * 8 [rcx + 2*zmm2 + 4] {k1} gnu: vgatherpf1qps DWORD PTR [rcx+zmm2*2+0x4]{k1} hex: 62 f2 fd 49 c6 6c 51 01 llvm: vscatterpf0dpd {k1}, zmmword ptr [rcx + 2*ymm2 + 4] ours: vscatterpf0dpd qword ptr * 8 [rcx + 2*ymm2 + 8] {k1} gnu: vscatterpf0dpd QWORD PTR [rcx+ymm2*2+0x8]{k1} hex: 62 f2 7d 49 c7 6c 51 01 llvm: vscatterpf0qps {k1}, ymmword ptr [rcx + 2*zmm2 + 8] ours: vscatterpf0qps dword ptr * 8 [rcx + 2*zmm2 + 4] {k1} gnu: vscatterpf0qps DWORD PTR [rcx+zmm2*2+0x4]{k1} hex: 62 f2 fd 49 c6 74 51 01 llvm: vscatterpf1dpd {k1}, zmmword ptr [rcx + 2*ymm2 + 4] ours: vscatterpf1dpd qword ptr * 8 [rcx + 2*ymm2 + 8] {k1} gnu: vscatterpf1dpd QWORD PTR [rcx+ymm2*2+0x8]{k1} hex: 62 f2 7d 49 c7 74 51 01 llvm: vscatterpf1qps {k1}, ymmword ptr [rcx + 2*zmm2 + 8] ours: vscatterpf1qps dword ptr * 8 [rcx + 2*zmm2 + 4] {k1} gnu: vscatterpf1qps DWORD PTR [rcx+zmm2*2+0x4]{k1} ``` --- llvm/lib/Target/X86/X86InstrAVX512.td | 16 +++---- llvm/test/MC/X86/avx512pf-64-att.s | 64 +++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index d6ca4b142afe0..abf016000fc8e 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -10388,10 +10388,10 @@ defm VGATHERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dps", VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm VGATHERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qps", - VK8WM, vz256mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>; + VK8WM, vz256mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm VGATHERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dpd", - VK8WM, vy512xmem>, EVEX_V512, REX_W, EVEX_CD8<32, CD8VT1>; + VK8WM, vy512xmem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; defm VGATHERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qpd", VK8WM, vz512mem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; @@ 
-10400,10 +10400,10 @@ defm VGATHERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dps", VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm VGATHERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qps", - VK8WM, vz256mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>; + VK8WM, vz256mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm VGATHERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dpd", - VK8WM, vy512xmem>, EVEX_V512, REX_W, EVEX_CD8<32, CD8VT1>; + VK8WM, vy512xmem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; defm VGATHERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qpd", VK8WM, vz512mem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; @@ -10412,10 +10412,10 @@ defm VSCATTERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dps VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm VSCATTERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qps", - VK8WM, vz256mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>; + VK8WM, vz256mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm VSCATTERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dpd", - VK8WM, vy512xmem>, EVEX_V512, REX_W, EVEX_CD8<32, CD8VT1>; + VK8WM, vy512xmem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; defm VSCATTERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qpd", VK8WM, vz512mem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; @@ -10424,10 +10424,10 @@ defm VSCATTERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dps VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm VSCATTERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qps", - VK8WM, vz256mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>; + VK8WM, vz256mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm VSCATTERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dpd", - VK8WM, vy512xmem>, EVEX_V512, REX_W, EVEX_CD8<32, CD8VT1>; + VK8WM, vy512xmem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd", VK8WM, vz512mem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; diff --git a/llvm/test/MC/X86/avx512pf-64-att.s b/llvm/test/MC/X86/avx512pf-64-att.s index bae7fb0f235cc..ef2d30ee21a20 100644 --- a/llvm/test/MC/X86/avx512pf-64-att.s +++ b/llvm/test/MC/X86/avx512pf-64-att.s @@ -63,3 +63,67 @@ vscatterpf0qpd (%r14,%zmm14){%k7} // CHECK: vscatterpf1qpd (%r15,%zmm13) {%k1} // CHECK: encoding: [0x62,0x92,0xfd,0x49,0xc7,0x34,0x2f] vscatterpf1qpd (%r15,%zmm13){%k1} + +// CHECK: vgatherpf0dpd 8(%rcx,%ymm2,2) {%k1} +// CHECK: encoding: [0x62,0xf2,0xfd,0x49,0xc6,0x4c,0x51,0x01] +vgatherpf0dpd 8(%rcx,%ymm2,2){%k1} + +// CHECK: vgatherpf0dps 4(%rcx,%zmm2,2) {%k1} +// CHECK: encoding: [0x62,0xf2,0x7d,0x49,0xc6,0x4c,0x51,0x01] +vgatherpf0dps 4(%rcx,%zmm2,2){%k1} + +// CHECK: vgatherpf0qpd 8(%rcx,%zmm2,2) {%k1} +// CHECK: encoding: [0x62,0xf2,0xfd,0x49,0xc7,0x4c,0x51,0x01] +vgatherpf0qpd 8(%rcx,%zmm2,2){%k1} + +// CHECK: vgatherpf0qps 4(%rcx,%zmm2,2) {%k1} +// CHECK: encoding: [0x62,0xf2,0x7d,0x49,0xc7,0x4c,0x51,0x01] +vgatherpf0qps 4(%rcx,%zmm2,2){%k1} + +// CHECK: vgatherpf1dpd 8(%rcx,%ymm2,2) {%k1} +// CHECK: encoding: [0x62,0xf2,0xfd,0x49,0xc6,0x54,0x51,0x01] +vgatherpf1dpd 8(%rcx,%ymm2,2){%k1} + +// CHECK: vgatherpf1dps 4(%rcx,%zmm2,2) {%k1} +// CHECK: encoding: [0x62,0xf2,0x7d,0x49,0xc6,0x54,0x51,0x01] +vgatherpf1dps 4(%rcx,%zmm2,2){%k1} + +// CHECK: vgatherpf1qpd 8(%rcx,%zmm2,2) {%k1} +// CHECK: encoding: [0x62,0xf2,0xfd,0x49,0xc7,0x54,0x51,0x01] +vgatherpf1qpd 8(%rcx,%zmm2,2){%k1} + +// CHECK: vgatherpf1qps 4(%rcx,%zmm2,2) 
{%k1} +// CHECK: encoding: [0x62,0xf2,0x7d,0x49,0xc7,0x54,0x51,0x01] +vgatherpf1qps 4(%rcx,%zmm2,2){%k1} + +// CHECK: vscatterpf0dpd 8(%rcx,%ymm2,2) {%k1} +// CHECK: encoding: [0x62,0xf2,0xfd,0x49,0xc6,0x6c,0x51,0x01] +vscatterpf0dpd 8(%rcx,%ymm2,2){%k1} + +// CHECK: vscatterpf0dps 4(%rcx,%zmm2,2) {%k1} +// CHECK: encoding: [0x62,0xf2,0x7d,0x49,0xc6,0x6c,0x51,0x01] +vscatterpf0dps 4(%rcx,%zmm2,2){%k1} + +// CHECK: vscatterpf0qpd 8(%rcx,%zmm2,2) {%k1} +// CHECK: encoding: [0x62,0xf2,0xfd,0x49,0xc7,0x6c,0x51,0x01] +vscatterpf0qpd 8(%rcx,%zmm2,2){%k1} + +// CHECK: vscatterpf0qps 4(%rcx,%zmm2,2) {%k1} +// CHECK: encoding: [0x62,0xf2,0x7d,0x49,0xc7,0x6c,0x51,0x01] +vscatterpf0qps 4(%rcx,%zmm2,2){%k1} + +// CHECK: vscatterpf1dpd 8(%rcx,%ymm2,2) {%k1} +// CHECK: encoding: [0x62,0xf2,0xfd,0x49,0xc6,0x74,0x51,0x01] +vscatterpf1dpd 8(%rcx,%ymm2,2){%k1} + +// CHECK: vscatterpf1dps 4(%rcx,%zmm2,2) {%k1} +// CHECK: encoding: [0x62,0xf2,0x7d,0x49,0xc6,0x74,0x51,0x01] +vscatterpf1dps 4(%rcx,%zmm2,2){%k1} + +// CHECK: vscatterpf1qpd 8(%rcx,%zmm2,2) {%k1} +// CHECK: encoding: [0x62,0xf2,0xfd,0x49,0xc7,0x74,0x51,0x01] +vscatterpf1qpd 8(%rcx,%zmm2,2){%k1} + +// CHECK: vscatterpf1qps 4(%rcx,%zmm2,2) {%k1} +// CHECK: encoding: [0x62,0xf2,0x7d,0x49,0xc7,0x74,0x51,0x01] +vscatterpf1qps 4(%rcx,%zmm2,2){%k1} From f38c40bff399fda9cbed522816f7ddd18f6c0918 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Sat, 11 Jan 2025 09:08:57 +0000 Subject: [PATCH 152/408] VT: teach isImpliedCondMatchingOperands about samesign (#122474) Move isImplied{True,False}ByMatchingCmp from CmpInst to ICmpInst, so that it can operate on CmpPredicate instead of CmpInst::Predicate, and teach it about samesign. There are two callers of this function, and we choose to migrate the one in ValueTracking, namely isImpliedCondMatchingOperands to CmpPredicate, hence teaching it about samesign, with visible test impact. --- llvm/include/llvm/IR/InstrTypes.h | 8 -------- llvm/include/llvm/IR/Instructions.h | 10 ++++++++++ llvm/include/llvm/SandboxIR/Instruction.h | 16 +++++++++------- llvm/lib/Analysis/ValueTracking.cpp | 9 ++++----- llvm/lib/IR/Instructions.cpp | 13 ++++++++++--- llvm/lib/Transforms/Scalar/NewGVN.cpp | 8 ++++---- .../ValueTracking/implied-condition-samesign.ll | 7 ++----- 7 files changed, 39 insertions(+), 32 deletions(-) diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h index e6332a16df7d5..7ad34e4f22339 100644 --- a/llvm/include/llvm/IR/InstrTypes.h +++ b/llvm/include/llvm/IR/InstrTypes.h @@ -967,14 +967,6 @@ class CmpInst : public Instruction { /// Determine if the predicate is false when comparing a value with itself. static bool isFalseWhenEqual(Predicate predicate); - /// Determine if Pred1 implies Pred2 is true when two compares have matching - /// operands. - static bool isImpliedTrueByMatchingCmp(Predicate Pred1, Predicate Pred2); - - /// Determine if Pred1 implies Pred2 is false when two compares have matching - /// operands. 
- static bool isImpliedFalseByMatchingCmp(Predicate Pred1, Predicate Pred2); - /// Methods for support type inquiry through isa, cast, and dyn_cast: static bool classof(const Instruction *I) { return I->getOpcode() == Instruction::ICmp || diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index a8df12a1282fc..59eb504098837 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -1266,6 +1266,16 @@ class ICmpInst: public CmpInst { return getFlippedSignednessPredicate(getPredicate()); } + /// Determine if Pred1 implies Pred2 is true when two compares have matching + /// operands. + static bool isImpliedTrueByMatchingCmp(CmpPredicate Pred1, + CmpPredicate Pred2); + + /// Determine if Pred1 implies Pred2 is false when two compares have matching + /// operands. + static bool isImpliedFalseByMatchingCmp(CmpPredicate Pred1, + CmpPredicate Pred2); + void setSameSign(bool B = true) { SubclassOptionalData = (SubclassOptionalData & ~SameSign) | (B * SameSign); } diff --git a/llvm/include/llvm/SandboxIR/Instruction.h b/llvm/include/llvm/SandboxIR/Instruction.h index 4d21c4d3da355..d7c1eda81c006 100644 --- a/llvm/include/llvm/SandboxIR/Instruction.h +++ b/llvm/include/llvm/SandboxIR/Instruction.h @@ -2511,13 +2511,6 @@ class CmpInst : public SingleLLVMInstructionImpl { WRAP_STATIC_PREDICATE(isOrdered); WRAP_STATIC_PREDICATE(isUnordered); - static bool isImpliedTrueByMatchingCmp(Predicate Pred1, Predicate Pred2) { - return llvm::CmpInst::isImpliedTrueByMatchingCmp(Pred1, Pred2); - } - static bool isImpliedFalseByMatchingCmp(Predicate Pred1, Predicate Pred2) { - return llvm::CmpInst::isImpliedFalseByMatchingCmp(Pred1, Pred2); - } - /// Method for support type inquiry through isa, cast, and dyn_cast: static bool classof(const Value *From) { return From->getSubclassID() == ClassID::ICmp || @@ -2554,6 +2547,15 @@ class ICmpInst : public CmpInst { WRAP_STATIC_PREDICATE(isGE); WRAP_STATIC_PREDICATE(isLE); + static bool isImpliedTrueByMatchingCmp(CmpPredicate Pred1, + CmpPredicate Pred2) { + return llvm::ICmpInst::isImpliedTrueByMatchingCmp(Pred1, Pred2); + } + static bool isImpliedFalseByMatchingCmp(CmpPredicate Pred1, + CmpPredicate Pred2) { + return llvm::ICmpInst::isImpliedFalseByMatchingCmp(Pred1, Pred2); + } + static auto predicates() { return llvm::ICmpInst::predicates(); } static bool compare(const APInt &LHS, const APInt &RHS, ICmpInst::Predicate Pred) { diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 4b246c013e96f..92338d33b27a4 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -9387,12 +9387,11 @@ isImpliedCondOperands(CmpInst::Predicate Pred, const Value *ALHS, /// Return true if "icmp1 LPred X, Y" implies "icmp2 RPred X, Y" is true. /// Return false if "icmp1 LPred X, Y" implies "icmp2 RPred X, Y" is false. /// Otherwise, return std::nullopt if we can't infer anything. 
-static std::optional -isImpliedCondMatchingOperands(CmpInst::Predicate LPred, - CmpInst::Predicate RPred) { - if (CmpInst::isImpliedTrueByMatchingCmp(LPred, RPred)) +static std::optional isImpliedCondMatchingOperands(CmpPredicate LPred, + CmpPredicate RPred) { + if (ICmpInst::isImpliedTrueByMatchingCmp(LPred, RPred)) return true; - if (CmpInst::isImpliedFalseByMatchingCmp(LPred, RPred)) + if (ICmpInst::isImpliedFalseByMatchingCmp(LPred, RPred)) return false; return std::nullopt; diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 2d6fe40f4c1de..49c148bb68a4d 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -3886,12 +3886,18 @@ bool CmpInst::isFalseWhenEqual(Predicate predicate) { } } -bool CmpInst::isImpliedTrueByMatchingCmp(Predicate Pred1, Predicate Pred2) { +bool ICmpInst::isImpliedTrueByMatchingCmp(CmpPredicate Pred1, + CmpPredicate Pred2) { // If the predicates match, then we know the first condition implies the // second is true. - if (Pred1 == Pred2) + if (CmpPredicate::getMatching(Pred1, Pred2)) return true; + if (Pred1.hasSameSign() && CmpInst::isSigned(Pred2)) + Pred1 = ICmpInst::getFlippedSignednessPredicate(Pred1); + else if (Pred2.hasSameSign() && CmpInst::isSigned(Pred1)) + Pred2 = ICmpInst::getFlippedSignednessPredicate(Pred2); + switch (Pred1) { default: break; @@ -3911,7 +3917,8 @@ bool CmpInst::isImpliedTrueByMatchingCmp(Predicate Pred1, Predicate Pred2) { return false; } -bool CmpInst::isImpliedFalseByMatchingCmp(Predicate Pred1, Predicate Pred2) { +bool ICmpInst::isImpliedFalseByMatchingCmp(CmpPredicate Pred1, + CmpPredicate Pred2) { return isImpliedTrueByMatchingCmp(Pred1, getInversePredicate(Pred2)); } diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp index 0cba8739441bc..3812e99508f73 100644 --- a/llvm/lib/Transforms/Scalar/NewGVN.cpp +++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp @@ -1964,15 +1964,15 @@ NewGVN::ExprResult NewGVN::performSymbolicCmpEvaluation(Instruction *I) const { if (PBranch->TrueEdge) { // If we know the previous predicate is true and we are in the true // edge then we may be implied true or false. 
-          if (CmpInst::isImpliedTrueByMatchingCmp(BranchPredicate,
-                                                  OurPredicate)) {
+          if (ICmpInst::isImpliedTrueByMatchingCmp(BranchPredicate,
+                                                   OurPredicate)) {
             return ExprResult::some(
                 createConstantExpression(ConstantInt::getTrue(CI->getType())),
                 PI);
           }
 
-          if (CmpInst::isImpliedFalseByMatchingCmp(BranchPredicate,
-                                                   OurPredicate)) {
+          if (ICmpInst::isImpliedFalseByMatchingCmp(BranchPredicate,
+                                                    OurPredicate)) {
             return ExprResult::some(
                 createConstantExpression(ConstantInt::getFalse(CI->getType())),
                 PI);
diff --git a/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll b/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll
index 042155ae2bb79..546ff2d77d86e 100644
--- a/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll
+++ b/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll
@@ -118,8 +118,7 @@ define i1 @sgt_implies_ge_via_assume(i32 %i, i32 %j) {
 ; CHECK-SAME: i32 [[I:%.*]], i32 [[J:%.*]]) {
 ; CHECK-NEXT:    [[I_SGT_J:%.*]] = icmp sgt i32 [[I]], [[J]]
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[I_SGT_J]])
-; CHECK-NEXT:    [[I_GE_J:%.*]] = icmp samesign uge i32 [[I]], [[J]]
-; CHECK-NEXT:    ret i1 [[I_GE_J]]
+; CHECK-NEXT:    ret i1 true
 ;
   %i.sgt.j = icmp sgt i32 %i, %j
   call void @llvm.assume(i1 %i.sgt.j)
@@ -134,9 +133,7 @@ define i32 @gt_implies_sge_dominating(i32 %a, i32 %len) {
 ; CHECK-NEXT:    [[A_GT_LEN:%.*]] = icmp samesign ugt i32 [[A]], [[LEN]]
 ; CHECK-NEXT:    br i1 [[A_GT_LEN]], label %[[TAKEN:.*]], label %[[END:.*]]
 ; CHECK:       [[TAKEN]]:
-; CHECK-NEXT:    [[A_SGE_LEN:%.*]] = icmp sge i32 [[A]], [[LEN]]
-; CHECK-NEXT:    [[RES:%.*]] = select i1 [[A_SGE_LEN]], i32 30, i32 0
-; CHECK-NEXT:    ret i32 [[RES]]
+; CHECK-NEXT:    ret i32 30
 ; CHECK:       [[END]]:
 ; CHECK-NEXT:    ret i32 -1
 ;

From 30bb1863892a235fe1d90eb6b496bd24b0c177d8 Mon Sep 17 00:00:00 2001
From: Niels Dekker
Date: Sat, 11 Jan 2025 10:15:17 +0100
Subject: [PATCH 153/408] [clang-tidy] Mention std::forward_list in
 container-size-empty doc (#120701)

Mentioned `std::forward_list` as an example of a container without `size()`.
---
 .../clang-tidy/readability/ContainerSizeEmptyCheck.h | 8 ++++----
 .../checks/readability/container-size-empty.rst     | 7 ++++---
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/clang-tools-extra/clang-tidy/readability/ContainerSizeEmptyCheck.h b/clang-tools-extra/clang-tidy/readability/ContainerSizeEmptyCheck.h
index 3aa4bdc496194..e449686f77566 100644
--- a/clang-tools-extra/clang-tidy/readability/ContainerSizeEmptyCheck.h
+++ b/clang-tools-extra/clang-tidy/readability/ContainerSizeEmptyCheck.h
@@ -19,10 +19,10 @@ namespace clang::tidy::readability {
 ///
 /// The emptiness of a container should be checked using the `empty()` method
 /// instead of the `size()`/`length()` method. It shows clearer intent to use
-/// `empty()`. Furthermore some containers may implement the `empty()` method
-/// but not implement the `size()` or `length()` method. Using `empty()`
-/// whenever possible makes it easier to switch to another container in the
-/// future.
+/// `empty()`. Furthermore some containers (for example, a `std::forward_list`)
+/// may implement the `empty()` method but not implement the `size()` or
+/// `length()` method. Using `empty()` whenever possible makes it easier to
+/// switch to another container in the future.
 class ContainerSizeEmptyCheck : public ClangTidyCheck {
 public:
   ContainerSizeEmptyCheck(StringRef Name, ClangTidyContext *Context);
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/container-size-empty.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/container-size-empty.rst
index 6a007f69767ab..43ad74f60dbe5 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/readability/container-size-empty.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/readability/container-size-empty.rst
@@ -9,9 +9,10 @@ with a call to ``empty()``.
 
 The emptiness of a container should be checked using the ``empty()`` method
 instead of the ``size()``/``length()`` method. It shows clearer intent to use
-``empty()``. Furthermore some containers may implement the ``empty()`` method
-but not implement the ``size()`` or ``length()`` method. Using ``empty()``
-whenever possible makes it easier to switch to another container in the future.
+``empty()``. Furthermore some containers (for example, a ``std::forward_list``)
+may implement the ``empty()`` method but not implement the ``size()`` or
+``length()`` method. Using ``empty()`` whenever possible makes it easier to
+switch to another container in the future.
 
 The check issues warning if a container has ``empty()`` and ``size()`` or
 ``length()`` methods matching following signatures:

From dc2963c8d77229ca1b20663beddef2323cc69a88 Mon Sep 17 00:00:00 2001
From: Tommy Chen
Date: Sat, 11 Jan 2025 18:04:19 +0800
Subject: [PATCH 154/408] [clang-tidy] exclude CXXParenListInitExpr from
 RedundantCastingCheck (#109741)

Exclude CXXParenListInitExpr from RedundantCastingCheck because it
produces false positives, and we currently cannot think of a case where
the check would report a true positive for CXXParenListInitExpr. If such
cases come up, the check can be improved by following the approach
already used for initListExpr.
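For illustration, here is a minimal sketch of the pattern that used to
trigger the false positive (the names `A`, `B`, and `f` are invented for
this sketch; the real coverage is in the updated redundant-casting.cpp
test below):

```c++
struct A {};
struct B : A {};

void f(A value) {
  // With -std=c++20 this static_cast performs parenthesized aggregate
  // initialization (a CXXParenListInitExpr), so the cast is not redundant
  // and the check must not suggest removing it.
  B b = static_cast<B>(value);
}
```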
Fixes #108846 --- .../readability/RedundantCastingCheck.cpp | 5 +++++ clang-tools-extra/docs/ReleaseNotes.rst | 5 +++++ .../readability/redundant-casting.cpp | 20 ++++++++++++++++--- 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/clang-tools-extra/clang-tidy/readability/RedundantCastingCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantCastingCheck.cpp index 4d5adbe02f525..768540e05c759 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantCastingCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantCastingCheck.cpp @@ -108,6 +108,10 @@ void RedundantCastingCheck::registerMatchers(MatchFinder *Finder) { auto BitfieldMemberExpr = memberExpr(member(fieldDecl(isBitField()))); + const ast_matchers::internal::VariadicDynCastAllOfMatcher< + Stmt, CXXParenListInitExpr> + cxxParenListInitExpr; // NOLINT(readability-identifier-naming) + Finder->addMatcher( explicitCastExpr( unless(hasCastKind(CK_ConstructorConversion)), @@ -117,6 +121,7 @@ void RedundantCastingCheck::registerMatchers(MatchFinder *Finder) { hasDestinationType(qualType().bind("dstType")), hasSourceExpression(anyOf( expr(unless(initListExpr()), unless(BitfieldMemberExpr), + unless(cxxParenListInitExpr()), hasType(qualType().bind("srcType"))) .bind("source"), initListExpr(unless(hasInit(1, expr())), diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 684ba77d8f0f5..9cdad8fb6b58f 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -360,6 +360,11 @@ Changes in existing checks case of the literal suffix in fixes and fixing false positive for implicit conversion of comparison result in C23. +- Improved :doc:`readability-redundant-casting + ` check + by addressing a false positive in aggregate initialization through + parenthesized list. + - Improved :doc:`readability-redundant-smartptr-get ` check to remove `->`, when redundant `get()` is removed. 
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-casting.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-casting.cpp index 30cac6bd5cca0..9c3c90bfaf459 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-casting.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-casting.cpp @@ -1,10 +1,18 @@ -// RUN: %check_clang_tidy -std=c++11-or-later %s readability-redundant-casting %t -- -- -fno-delayed-template-parsing -// RUN: %check_clang_tidy -std=c++11-or-later -check-suffix=,MACROS %s readability-redundant-casting %t -- \ +// RUN: %check_clang_tidy -std=c++11,c++14,c++17 %s readability-redundant-casting %t -- -- -fno-delayed-template-parsing +// RUN: %check_clang_tidy -std=c++11,c++14,c++17 -check-suffix=,MACROS %s readability-redundant-casting %t -- \ // RUN: -config='{CheckOptions: { readability-redundant-casting.IgnoreMacros: false }}' \ // RUN: -- -fno-delayed-template-parsing -// RUN: %check_clang_tidy -std=c++11-or-later -check-suffix=,ALIASES %s readability-redundant-casting %t -- \ +// RUN: %check_clang_tidy -std=c++11,c++14,c++17 -check-suffix=,ALIASES %s readability-redundant-casting %t -- \ // RUN: -config='{CheckOptions: { readability-redundant-casting.IgnoreTypeAliases: true }}' \ // RUN: -- -fno-delayed-template-parsing +// RUN: %check_clang_tidy -std=c++20 %s readability-redundant-casting %t -- \ +// RUN: -- -fno-delayed-template-parsing -D CXX_20=1 +// RUN: %check_clang_tidy -std=c++20 -check-suffix=,MACROS %s readability-redundant-casting %t -- \ +// RUN: -config='{CheckOptions: { readability-redundant-casting.IgnoreMacros: false }}' \ +// RUN: -- -fno-delayed-template-parsing -D CXX_20=1 +// RUN: %check_clang_tidy -std=c++20 -check-suffix=,ALIASES %s readability-redundant-casting %t -- \ +// RUN: -config='{CheckOptions: { readability-redundant-casting.IgnoreTypeAliases: true }}' \ +// RUN: -- -fno-delayed-template-parsing -D CXX_20=1 struct A {}; struct B : A {}; @@ -57,6 +65,12 @@ void testDiffrentTypesCast(B& value) { A& a7 = static_cast(value); } +#ifdef CXX_20 +void testParenListInitExpr(A value) { + B b = static_cast(value); +} +#endif + void testCastingWithAuto() { auto a = getA(); A& a8 = static_cast(a); From 35e89897a4086f5adbab10b4b90aa63ef5b35514 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 11 Jan 2025 02:06:33 -0800 Subject: [PATCH 155/408] [Dialect] Migrate away from PointerUnion::{is,get} (NFC) (#122568) Note that PointerUnion::{is,get} have been soft deprecated in PointerUnion.h: // FIXME: Replace the uses of is(), get() and dyn_cast() with // isa, cast and the llvm::dyn_cast --- mlir/include/mlir/Dialect/Async/IR/AsyncOps.td | 2 +- mlir/include/mlir/Dialect/EmitC/IR/EmitC.td | 2 +- mlir/include/mlir/Dialect/Func/IR/FuncOps.td | 4 ++-- mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td | 2 +- mlir/include/mlir/Dialect/Transform/IR/TransformOps.td | 2 +- mlir/lib/Dialect/Affine/IR/AffineOps.cpp | 2 +- mlir/test/lib/Dialect/Affine/TestReifyValueBounds.cpp | 2 +- mlir/test/lib/Dialect/Test/TestOpDefs.cpp | 4 ++-- mlir/test/lib/Dialect/Test/TestOps.td | 2 +- mlir/test/lib/Dialect/Test/TestTypes.cpp | 5 +++-- 10 files changed, 14 insertions(+), 13 deletions(-) diff --git a/mlir/include/mlir/Dialect/Async/IR/AsyncOps.td b/mlir/include/mlir/Dialect/Async/IR/AsyncOps.td index 33b6792175234..a08f5d6e714ef 100644 --- a/mlir/include/mlir/Dialect/Async/IR/AsyncOps.td +++ b/mlir/include/mlir/Dialect/Async/IR/AsyncOps.td @@ -256,7 +256,7 @@ def Async_CallOp : Async_Op<"call", 
/// Set the callee for this operation. void setCalleeFromCallable(CallInterfaceCallable callee) { - (*this)->setAttr("callee", callee.get()); + (*this)->setAttr("callee", cast(callee)); } }]; diff --git a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td index 744a0dc4770e6..b16f5a8619fe7 100644 --- a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td +++ b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td @@ -580,7 +580,7 @@ def EmitC_CallOp : EmitC_Op<"call", /// Set the callee for this operation. void setCalleeFromCallable(CallInterfaceCallable callee) { - (*this)->setAttr("callee", callee.get()); + (*this)->setAttr("callee", cast(callee)); } }]; diff --git a/mlir/include/mlir/Dialect/Func/IR/FuncOps.td b/mlir/include/mlir/Dialect/Func/IR/FuncOps.td index 211201802b08c..4da0efcb13ddf 100644 --- a/mlir/include/mlir/Dialect/Func/IR/FuncOps.td +++ b/mlir/include/mlir/Dialect/Func/IR/FuncOps.td @@ -97,7 +97,7 @@ def CallOp : Func_Op<"call", /// Set the callee for this operation. void setCalleeFromCallable(CallInterfaceCallable callee) { - (*this)->setAttr("callee", callee.get()); + (*this)->setAttr("callee", cast(callee)); } }]; @@ -168,7 +168,7 @@ def CallIndirectOp : Func_Op<"call_indirect", [ /// Set the callee for this operation. void setCalleeFromCallable(CallInterfaceCallable callee) { - setOperand(0, callee.get()); + setOperand(0, cast(callee)); } }]; diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td index a3aa53b1fcb85..b2281536aa40b 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -339,7 +339,7 @@ def LLVM_GEPOp : LLVM_Op<"getelementptr", [Pure, indices.push_back(value); else indices.push_back( - builder.getInt32(valueOrAttr.get().getInt())); + builder.getInt32(cast(valueOrAttr).getInt())); } Type baseElementType = op.getElemType(); llvm::Type *elementType = moduleTranslation.convertType(baseElementType); diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td index 1eebddca3df4d..77ed6b322451e 100644 --- a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td +++ b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td @@ -899,7 +899,7 @@ def IncludeOp : TransformDialectOp<"include", } void setCalleeFromCallable(::mlir::CallInterfaceCallable callee) { - setTargetAttr(callee.get()); + setTargetAttr(cast(callee)); } ::mlir::Operation::operand_range getArgOperands() { diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp index b45829bcf6d2c..aa2c2041f411f 100644 --- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp +++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp @@ -5069,7 +5069,7 @@ static OpFoldResult computeProduct(Location loc, OpBuilder &builder, if (maybeConst) { result = result * builder.getAffineConstantExpr(*maybeConst); } else { - dynamicPart.push_back(term.get()); + dynamicPart.push_back(cast(term)); result = result * builder.getAffineSymbolExpr(nDynamic++); } } diff --git a/mlir/test/lib/Dialect/Affine/TestReifyValueBounds.cpp b/mlir/test/lib/Dialect/Affine/TestReifyValueBounds.cpp index 2c954ffc4acef..891b3bab8629d 100644 --- a/mlir/test/lib/Dialect/Affine/TestReifyValueBounds.cpp +++ b/mlir/test/lib/Dialect/Affine/TestReifyValueBounds.cpp @@ -150,7 +150,7 @@ static LogicalResult testReifyValueBounds(FunctionOpInterface funcOp, return WalkResult::skip(); } Value constOp = rewriter.create( - op->getLoc(), 
cast(reified->get()).getInt()); + op->getLoc(), cast(cast(*reified)).getInt()); rewriter.replaceOp(op, constOp); return WalkResult::skip(); }); diff --git a/mlir/test/lib/Dialect/Test/TestOpDefs.cpp b/mlir/test/lib/Dialect/Test/TestOpDefs.cpp index b268e549b93ab..c6be26d0a44d9 100644 --- a/mlir/test/lib/Dialect/Test/TestOpDefs.cpp +++ b/mlir/test/lib/Dialect/Test/TestOpDefs.cpp @@ -1098,7 +1098,7 @@ CallInterfaceCallable TestCallAndStoreOp::getCallableForCallee() { } void TestCallAndStoreOp::setCalleeFromCallable(CallInterfaceCallable callee) { - setCalleeAttr(callee.get()); + setCalleeAttr(cast(callee)); } Operation::operand_range TestCallAndStoreOp::getArgOperands() { @@ -1117,7 +1117,7 @@ CallInterfaceCallable TestCallOnDeviceOp::getCallableForCallee() { } void TestCallOnDeviceOp::setCalleeFromCallable(CallInterfaceCallable callee) { - setCalleeAttr(callee.get()); + setCalleeAttr(cast(callee)); } Operation::operand_range TestCallOnDeviceOp::getArgOperands() { diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td index bafab155eb9d5..0b1f22b3ee932 100644 --- a/mlir/test/lib/Dialect/Test/TestOps.td +++ b/mlir/test/lib/Dialect/Test/TestOps.td @@ -566,7 +566,7 @@ def ConversionCallOp : TEST_Op<"conversion_call_op", } void $cppClass::setCalleeFromCallable(::mlir::CallInterfaceCallable callee) { - (*this)->setAttr("callee", callee.get()); + (*this)->setAttr("callee", cast(callee)); } }]; } diff --git a/mlir/test/lib/Dialect/Test/TestTypes.cpp b/mlir/test/lib/Dialect/Test/TestTypes.cpp index 6e31bb71d04d8..b822e019e09d2 100644 --- a/mlir/test/lib/Dialect/Test/TestTypes.cpp +++ b/mlir/test/lib/Dialect/Test/TestTypes.cpp @@ -295,8 +295,9 @@ TestTypeWithLayoutType::verifyEntries(DataLayoutEntryListRef params, for (DataLayoutEntryInterface entry : params) { // This is for testing purposes only, so assert well-formedness. 
assert(entry.isTypeEntry() && "unexpected identifier entry"); - assert(llvm::isa(entry.getKey().get()) && - "wrong type passed in"); + assert( + llvm::isa(llvm::cast(entry.getKey())) && + "wrong type passed in"); auto array = llvm::dyn_cast(entry.getValue()); assert(array && array.getValue().size() == 2 && "expected array of two elements"); From 4435b7d8d3df31d59402b6b106d8d45fd2ba0f93 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 11 Jan 2025 02:06:47 -0800 Subject: [PATCH 156/408] [flang] Migrate away from PointerUnion::{is,get} (NFC) (#122585) Note that PointerUnion::{is,get} have been soft deprecated in PointerUnion.h: // FIXME: Replace the uses of is(), get() and dyn_cast() with // isa, cast and the llvm::dyn_cast --- flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td | 3 ++- flang/include/flang/Optimizer/Dialect/FIROps.td | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td index 9a31ffa2e9471..6f886726b1283 100644 --- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td +++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td @@ -222,7 +222,8 @@ def cuf_KernelLaunchOp : cuf_Op<"kernel_launch", [CallOpInterface, } void setCalleeFromCallable(mlir::CallInterfaceCallable callee) { - (*this)->setAttr(getCalleeAttrName(), callee.get()); + (*this)->setAttr(getCalleeAttrName(), + llvm::cast(callee)); } mlir::FunctionType getFunctionType(); diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td index 01f588b3c8ba5..5f0f0b48e892b 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROps.td +++ b/flang/include/flang/Optimizer/Dialect/FIROps.td @@ -2488,8 +2488,9 @@ def fir_CallOp : fir_Op<"call", void setCalleeFromCallable(mlir::CallInterfaceCallable callee) { if (auto calling = (*this)->getAttrOfType(getCalleeAttrName())) - (*this)->setAttr(getCalleeAttrName(), callee.get()); - setOperand(0, callee.get()); + (*this)->setAttr(getCalleeAttrName(), + llvm::cast(callee)); + setOperand(0, llvm::cast(callee)); } }]; } From 642e493d4dc6d8f18900a22ed2ca6f638b69a2f8 Mon Sep 17 00:00:00 2001 From: Amr Hesham Date: Sat, 11 Jan 2025 11:48:05 +0100 Subject: [PATCH 157/408] [InstCombine] Convert fshl(x, 0, y) to shl(x, and(y, BitWidth - 1)) when BitWidth is pow2 (#122362) Convert `fshl(x, 0, y)` to `shl(x, and(y, BitWidth - 1))` when BitWidth is pow2 Alive2 proof: https://alive2.llvm.org/ce/z/3oTEop Fixes: #122235 --- .../InstCombine/InstCombineCalls.cpp | 9 +++ llvm/test/Transforms/InstCombine/fsh.ll | 58 +++++++++++++++++++ 2 files changed, 67 insertions(+) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index c55c40c88bc84..7454382412369 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -2229,6 +2229,15 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { return BitOp; } + // fshl(X, 0, Y) --> shl(X, and(Y, BitWidth - 1)) if bitwidth is a + // power-of-2 + if (IID == Intrinsic::fshl && isPowerOf2_32(BitWidth) && + match(Op1, m_ZeroInt())) { + Value *Op2 = II->getArgOperand(2); + Value *And = Builder.CreateAnd(Op2, ConstantInt::get(Ty, BitWidth - 1)); + return BinaryOperator::CreateShl(Op0, And); + } + // Left or right might be masked. 
   if (SimplifyDemandedInstructionBits(*II))
     return &CI;
 
diff --git a/llvm/test/Transforms/InstCombine/fsh.ll b/llvm/test/Transforms/InstCombine/fsh.ll
index 434cd810296d8..236c69e7a5bcb 100644
--- a/llvm/test/Transforms/InstCombine/fsh.ll
+++ b/llvm/test/Transforms/InstCombine/fsh.ll
@@ -1010,3 +1010,61 @@ define <2 x i32> @fshr_vec_zero_elem(<2 x i32> %x, <2 x i32> %y) {
   %fsh = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> )
   ret <2 x i32> %fsh
 }
+
+define i16 @fshl_i16_shl(i16 %x, i16 %y) {
+; CHECK-LABEL: @fshl_i16_shl(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = and i16 [[Y:%.*]], 15
+; CHECK-NEXT:    [[RES:%.*]] = shl i16 [[X:%.*]], [[TMP0]]
+; CHECK-NEXT:    ret i16 [[RES]]
+;
+entry:
+  %res = call i16 @llvm.fshl.i16(i16 %x, i16 0, i16 %y)
+  ret i16 %res
+}
+
+define i32 @fshl_i32_shl(i32 %x, i32 %y) {
+; CHECK-LABEL: @fshl_i32_shl(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = and i32 [[Y:%.*]], 31
+; CHECK-NEXT:    [[RES:%.*]] = shl i32 [[X:%.*]], [[TMP0]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+entry:
+  %res = call i32 @llvm.fshl.i32(i32 %x, i32 0, i32 %y)
+  ret i32 %res
+}
+
+define <2 x i16> @fshl_vi16_shl(<2 x i16> %x, <2 x i16> %y) {
+; CHECK-LABEL: @fshl_vi16_shl(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = and <2 x i16> [[Y:%.*]], splat (i16 15)
+; CHECK-NEXT:    [[RES:%.*]] = shl <2 x i16> [[X:%.*]], [[TMP0]]
+; CHECK-NEXT:    ret <2 x i16> [[RES]]
+;
+entry:
+  %res = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> %x, <2 x i16> zeroinitializer, <2 x i16> %y)
+  ret <2 x i16> %res
+}
+
+define i32 @fshr_i32_shl_negative_test(i32 %x, i32 %y) {
+; CHECK-LABEL: @fshr_i32_shl_negative_test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.fshr.i32(i32 [[X:%.*]], i32 0, i32 [[Y:%.*]])
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+entry:
+  %res = call i32 @llvm.fshr.i32(i32 %x, i32 0, i32 %y)
+  ret i32 %res
+}
+
+define <2 x i31> @fshl_vi31_shl_negative_test(<2 x i31> %x, <2 x i31> %y) {
+; CHECK-LABEL: @fshl_vi31_shl_negative_test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x i31> @llvm.fshl.v2i31(<2 x i31> [[X:%.*]], <2 x i31> zeroinitializer, <2 x i31> [[Y:%.*]])
+; CHECK-NEXT:    ret <2 x i31> [[RES]]
+;
+entry:
+  %res = call <2 x i31> @llvm.fshl.v2i31(<2 x i31> %x, <2 x i31> zeroinitializer, <2 x i31> %y)
+  ret <2 x i31> %res
+}

From 32bcd41adcc664f6d690efc9b7cd209ac9c65f68 Mon Sep 17 00:00:00 2001
From: Congcong Cai
Date: Sat, 11 Jan 2025 18:48:39 +0800
Subject: [PATCH 158/408] [clang-tidy] use correct template type in
 ``std::min`` and ``std::max`` when operand is integer literal for
 readability-use-std-min-max (#122296)

When comparing with an integer literal, integer promotion widens any
operand whose type is narrower than int to int or unsigned int. This let
the auto-fix produce a replacement that was correct but not the one a
user would expect. For example,

```c++
short a;
if ( a > 10 )
    a = 10;
```

is, after promotion,

```c++
short a;
if ( (int)a > 10 )
    a = (short)10;
```

which will be fixed as

```c++
short a;
a = std::max<int>(a, 10);
```

but it can instead be

```c++
short a;
a = std::max<short>(a, 10);
```

Fixed: #121676
---
 .../readability/UseStdMinMaxCheck.cpp        | 35 ++++++++++++-------
 clang-tools-extra/docs/ReleaseNotes.rst      |  4 +++
 .../checkers/readability/use-std-min-max.cpp | 21 +++++++++++
 3 files changed, 48 insertions(+), 12 deletions(-)

diff --git a/clang-tools-extra/clang-tidy/readability/UseStdMinMaxCheck.cpp b/clang-tools-extra/clang-tidy/readability/UseStdMinMaxCheck.cpp
index 179173502a8d0..6f6b8a853a91e 100644
--- a/clang-tools-extra/clang-tidy/readability/UseStdMinMaxCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/UseStdMinMaxCheck.cpp
@@ -79,6 +79,27 @@ static QualType getNonTemplateAlias(QualType QT) {
   return QT;
 }
 
+static QualType getReplacementCastType(const Expr *CondLhs, const Expr *CondRhs,
+                                       QualType ComparedType) {
+  QualType LhsType = CondLhs->getType();
+  QualType RhsType = CondRhs->getType();
+  QualType LhsCanonicalType =
+      LhsType.getCanonicalType().getNonReferenceType().getUnqualifiedType();
+  QualType RhsCanonicalType =
+      RhsType.getCanonicalType().getNonReferenceType().getUnqualifiedType();
+  QualType GlobalImplicitCastType;
+  if (LhsCanonicalType != RhsCanonicalType) {
+    if (llvm::isa<IntegerLiteral>(CondRhs)) {
+      GlobalImplicitCastType = getNonTemplateAlias(LhsType);
+    } else if (llvm::isa<IntegerLiteral>(CondLhs)) {
+      GlobalImplicitCastType = getNonTemplateAlias(RhsType);
+    } else {
+      GlobalImplicitCastType = getNonTemplateAlias(ComparedType);
+    }
+  }
+  return GlobalImplicitCastType;
+}
+
 static std::string createReplacement(const Expr *CondLhs, const Expr *CondRhs,
                                      const Expr *AssignLhs,
                                      const SourceManager &Source,
@@ -92,18 +113,8 @@ static std::string createReplacement(const Expr *CondLhs, const Expr *CondRhs,
   const llvm::StringRef AssignLhsStr = Lexer::getSourceText(
       Source.getExpansionRange(AssignLhs->getSourceRange()), Source, LO);
 
-  QualType GlobalImplicitCastType;
-  QualType LhsType = CondLhs->getType()
-                         .getCanonicalType()
-                         .getNonReferenceType()
-                         .getUnqualifiedType();
-  QualType RhsType = CondRhs->getType()
-                         .getCanonicalType()
-                         .getNonReferenceType()
-                         .getUnqualifiedType();
-  if (LhsType != RhsType) {
-    GlobalImplicitCastType = getNonTemplateAlias(BO->getLHS()->getType());
-  }
+  QualType GlobalImplicitCastType =
+      getReplacementCastType(CondLhs, CondRhs, BO->getLHS()->getType());
 
   return (AssignLhsStr + " = " + FunctionName +
           (!GlobalImplicitCastType.isNull()
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 9cdad8fb6b58f..d75a276729c58 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -369,6 +369,10 @@ Changes in existing checks
   <clang-tidy/checks/readability/redundant-smartptr-get>` check
   to remove `->`, when redundant `get()` is removed.
 
+- Improved :doc:`readability-use-std-min-max
+  <clang-tidy/checks/readability/use-std-min-max>` check to use correct template
+  type in ``std::min`` and ``std::max`` when operand is integer literal.
+
 
 Removed checks
 ^^^^^^^^^^^^^^
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/use-std-min-max.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/use-std-min-max.cpp
index 9c0e2eabda348..35ade8a7c6d37 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/use-std-min-max.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/use-std-min-max.cpp
@@ -252,3 +252,24 @@ void testVectorSizeType() {
   if (value < v.size())
     value = v.size();
 }
+
+namespace gh121676 {
+
+void useLeft() {
+  using U16 = unsigned short;
+  U16 I = 0;
+  // CHECK-MESSAGES: :[[@LINE+2]]:3: warning: use `std::max` instead of `<` [readability-use-std-min-max]
+  // CHECK-FIXES: I = std::max<U16>(I, 16U);
+  if (I < 16U)
+    I = 16U;
+}
+void useRight() {
+  using U16 = unsigned short;
+  U16 I = 0;
+  // CHECK-MESSAGES: :[[@LINE+2]]:3: warning: use `std::min` instead of `<` [readability-use-std-min-max]
+  // CHECK-FIXES: I = std::min<U16>(16U, I);
+  if (16U < I)
+    I = 16U;
+}
+
+} // namespace gh121676

From 77ef5a601ad3827316e412788f609e9141b51e83 Mon Sep 17 00:00:00 2001
From: Younan Zhang
Date: Sat, 11 Jan 2025 19:17:09 +0800
Subject: [PATCH 159/408] [Clang][NFC] Fix a test failure with mold linker
 (#122587)

Mold prefers the suffix '$' for symbols like PLT and GOT entries, so
exclude these symbols as well. Otherwise, this test will fail for
developers using mold-linked Clang.

Closes https://github.com/llvm/llvm-project/issues/76982
---
 clang/test/LibClang/symbols.test | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/test/LibClang/symbols.test b/clang/test/LibClang/symbols.test
index fd2ff8bc6cd42..e12b4c9991b81 100644
--- a/clang/test/LibClang/symbols.test
+++ b/clang/test/LibClang/symbols.test
@@ -1,6 +1,6 @@
 # Check that there are no unversioned clang symbols in libclang.so
 RUN: llvm-nm -Dj --defined-only %libclang | grep -v -e '@@LLVM_[0-9]\+$' | not grep '^clang'
 
-# Check that here are no local clang_ symbols (ignoring symbols with .cold or
+# Check that there are no local clang_ symbols (ignoring symbols with .cold or
 # .localalias suffxies.)
-RUN: llvm-nm %libclang | not grep '[a-z] clang_[^.]\+$'
+RUN: llvm-nm %libclang | not grep '[a-z] clang_[^.$]\+$'

From 876fa60f081ed66ad9645f955790198c3a96882c Mon Sep 17 00:00:00 2001
From: Mingjie Xu
Date: Sat, 11 Jan 2025 20:15:21 +0800
Subject: [PATCH 160/408] [TySan] Skip instrumentation for function
 declarations (#122488)

Skip function declarations for instrumentation.
Fixes https://github.com/llvm/llvm-project/issues/122467
---
 llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp | 2 ++
 llvm/test/Instrumentation/TypeSanitizer/basic.ll      | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
index 9cd81f3e6edb3..2ae8106218667 100644
--- a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
@@ -512,6 +512,8 @@ void collectMemAccessInfo(
 
 bool TypeSanitizer::sanitizeFunction(Function &F,
                                      const TargetLibraryInfo &TLI) {
+  if (F.isDeclaration())
+    return false;
   // This is required to prevent instrumenting call to __tysan_init from within
   // the module constructor.
if (&F == TysanCtorFunction.getCallee() || &F == TysanGlobalsSetTypeFunction) diff --git a/llvm/test/Instrumentation/TypeSanitizer/basic.ll b/llvm/test/Instrumentation/TypeSanitizer/basic.ll index 704c18800f19e..b40b64664502a 100644 --- a/llvm/test/Instrumentation/TypeSanitizer/basic.ll +++ b/llvm/test/Instrumentation/TypeSanitizer/basic.ll @@ -5,6 +5,8 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +declare i32 @declaration_only(i32 %a) sanitize_type + ;. ; CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @tysan.module_ctor, ptr null }] ; CHECK: @__tysan_v1_Simple_20C_2b_2b_20TBAA = linkonce_odr constant { i64, i64, [16 x i8] } { i64 2, i64 0, [16 x i8] c"Simple C++ TBAA\00" }, comdat From 1d58699f5ce1a79634ea81f576cd4975cb04f046 Mon Sep 17 00:00:00 2001 From: Amr Hesham Date: Sat, 11 Jan 2025 13:23:37 +0100 Subject: [PATCH 161/408] [SDPatternMatch] Add Matcher m_Undef (#122521) Add Matcher `m_Undef` Fixes: #122439 --- llvm/include/llvm/CodeGen/SDPatternMatch.h | 2 ++ llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h index 2ccefa33abbf8..4faa090901a6a 100644 --- a/llvm/include/llvm/CodeGen/SDPatternMatch.h +++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h @@ -138,6 +138,8 @@ struct Opcode_match { inline Opcode_match m_Opc(unsigned Opcode) { return Opcode_match(Opcode); } +inline Opcode_match m_Undef() { return Opcode_match(ISD::UNDEF); } + template struct NUses_match { Pattern P; diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp index 9b759ef19efe1..bf9c597d8ac5e 100644 --- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp +++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp @@ -474,6 +474,11 @@ TEST_F(SelectionDAGPatternMatchTest, matchConstants) { EXPECT_EQ(CC, ISD::SETULT); EXPECT_TRUE(sd_match(SetCC, m_Node(ISD::SETCC, m_Value(), m_Value(), m_SpecificCondCode(ISD::SETULT)))); + + SDValue UndefInt32VT = DAG->getUNDEF(Int32VT); + SDValue UndefVInt32VT = DAG->getUNDEF(VInt32VT); + EXPECT_TRUE(sd_match(UndefInt32VT, m_Undef())); + EXPECT_TRUE(sd_match(UndefVInt32VT, m_Undef())); } TEST_F(SelectionDAGPatternMatchTest, patternCombinators) { From d291e459093be6df542c1770b26ff2e1bf1e6949 Mon Sep 17 00:00:00 2001 From: macurtis-amd Date: Sat, 11 Jan 2025 07:27:19 -0600 Subject: [PATCH 162/408] [flang] Teach omp-map-info-finalization to reuse descriptor allocas (#122507) Internal testing shows improvements in some SPEC HPC benchmarks with this change. --- .../Optimizer/OpenMP/MapInfoFinalization.cpp | 17 +++++-- .../Transforms/omp-map-info-finalization.fir | 46 +++++++++++++++++++ 2 files changed, 58 insertions(+), 5 deletions(-) diff --git a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp index e823443958714..c63d2f4531a6f 100644 --- a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp +++ b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp @@ -66,10 +66,12 @@ class MapInfoFinalizationPass /// Tracks any intermediate function/subroutine local allocations we /// generate for the descriptors of box type dummy arguments, so that /// we can retrieve it for subsequent reuses within the functions - /// scope - std::map - localBoxAllocas; + /// scope. 
+ /// + /// descriptor defining op + /// | corresponding local alloca + /// | | + std::map localBoxAllocas; /// getMemberUserList gathers all users of a particular MapInfoOp that are /// other MapInfoOp's and places them into the mapMemberUsers list, which @@ -132,6 +134,11 @@ class MapInfoFinalizationPass if (!mlir::isa(descriptor.getType())) return descriptor; + mlir::Value &slot = localBoxAllocas[descriptor.getDefiningOp()]; + if (slot) { + return slot; + } + // The fir::BoxOffsetOp only works with !fir.ref> types, as // allowing it to access non-reference box operations can cause some // problematic SSA IR. However, in the case of assumed shape's the type @@ -147,7 +154,7 @@ class MapInfoFinalizationPass auto alloca = builder.create(loc, descriptor.getType()); builder.restoreInsertionPoint(insPt); builder.create(loc, descriptor, alloca); - return alloca; + return slot = alloca; } /// Function that generates a FIR operation accessing the descriptor's diff --git a/flang/test/Transforms/omp-map-info-finalization.fir b/flang/test/Transforms/omp-map-info-finalization.fir index 19e6dcad068cd..a7254bcddd523 100644 --- a/flang/test/Transforms/omp-map-info-finalization.fir +++ b/flang/test/Transforms/omp-map-info-finalization.fir @@ -296,3 +296,49 @@ func.func @alloca_dtype_map_op_block_add(%arg0 : !fir.ref>>>, !fir.box>>) map_clauses(to) capture(ByRef) -> !fir.ref>>> {{.*}} // CHECK: %[[TOP_PARENT_MAP:.*]] = omp.map.info var_ptr(%0#1 : !fir.ref>, !fir.type<[[REC_TY]]>) map_clauses(exit_release_or_enter_alloc) capture(ByRef) members(%6, %5, %14, %13 : [1], [1, 0], [1, 0, 2], [1, 0, 2, 0] : !fir.ref>>>>, !fir.llvm_ptr>>>, !fir.ref>>>, !fir.llvm_ptr>>) -> !fir.ref> {{{.*}} partial_map = true} // CHECK: omp.target map_entries(%[[TOP_PARENT_MAP]] -> %{{.*}}, %[[DESC_MAP_1]] -> %{{.*}}, %[[BASE_ADDR_MAP_1]] -> %{{.*}}, %[[DESC_MAP_2]] -> %{{.*}}, %[[BASE_ADDR_MAP_2]] -> %{{.*}} : !fir.ref>, !fir.ref>>>>, !fir.llvm_ptr>>>, !fir.ref>>>, !fir.llvm_ptr>>) { + +// ----- + +func.func @_QPreuse_alloca(%arg0: !fir.box> {fir.bindc_name = "a"}) { + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {uniq_name = "_QFreuse_allocaEa"} : (!fir.box>, !fir.dscope) -> (!fir.box>, !fir.box>) + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %2:3 = fir.box_dims %1#0, %c0 : (!fir.box>, index) -> (index, index, index) + %c0_0 = arith.constant 0 : index + %3 = arith.subi %2#1, %c1 : index + %4 = omp.map.bounds lower_bound(%c0_0 : index) upper_bound(%3 : index) extent(%2#1 : index) stride(%2#2 : index) start_idx(%c1 : index) {stride_in_bytes = true} + %5 = fir.box_addr %1#1 : (!fir.box>) -> !fir.ref> + %6 = omp.map.info var_ptr(%5 : !fir.ref>, f64) map_clauses(to) capture(ByRef) bounds(%4) -> !fir.ref> {name = "a"} + omp.target_data map_entries(%6 : !fir.ref>) { + %cst = arith.constant 0.000000e+00 : f64 + %c0_1 = arith.constant 0 : index + %7 = hlfir.designate %1#0 (%c0_1) : (!fir.box>, index) -> !fir.ref + hlfir.assign %cst to %7 : f64, !fir.ref + %c1_2 = arith.constant 1 : index + %c0_3 = arith.constant 0 : index + %8:3 = fir.box_dims %1#0, %c0_3 : (!fir.box>, index) -> (index, index, index) + %c0_4 = arith.constant 0 : index + %9 = arith.subi %8#1, %c1_2 : index + %10 = omp.map.bounds lower_bound(%c0_4 : index) upper_bound(%9 : index) extent(%8#1 : index) stride(%8#2 : index) start_idx(%c1_2 : index) {stride_in_bytes = true} + %11 = fir.box_addr %1#1 : (!fir.box>) -> !fir.ref> + %12 = omp.map.info var_ptr(%11 : !fir.ref>, f64) map_clauses(from) capture(ByRef) bounds(%10) 
-> !fir.ref> {name = "a"} + omp.target_update map_entries(%12 : !fir.ref>) + omp.terminator + } + return +} + +// CHECK-LABEL: @_QPreuse_alloca +// CHECK-NEXT: %[[ALLOCA:[0-9]+]] = fir.alloca !fir.box> +// CHECK-NOT: fir.alloca +// CHECK: %{{[0-9]+}} = omp.map.info var_ptr(%[[ALLOCA]] +// CHECK: %{{[0-9]+}} = omp.map.info var_ptr(%[[ALLOCA]] +// CHECK: omp.target_data map_entries +// CHECK: %{{[0-9]+}} = omp.map.info var_ptr(%[[ALLOCA]] +// CHECK: %{{[0-9]+}} = omp.map.info var_ptr(%[[ALLOCA]] +// CHECK: omp.target_update map_entries +// CHECK: omp.terminator +// CHECK: } +// CHECK: return + From b622cc67d0af9326b9e4f1f91d8be790c57dd86c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 11 Jan 2025 13:41:44 +0000 Subject: [PATCH 163/408] [X86] LowerCTPOP - check if the operand is a constant when collecting KnownBits Under certain circumstances, lowering of other instructions can result in computeKnownBits being able to detect a constant that it couldn't previously. Fixes #122580 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 2 ++ llvm/test/CodeGen/X86/pr122580.ll | 37 +++++++++++++++++++++++++ 2 files changed, 39 insertions(+) create mode 100644 llvm/test/CodeGen/X86/pr122580.ll diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index fbfcfc700ed62..596139d084570 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -32172,6 +32172,8 @@ static SDValue LowerCTPOP(SDValue N, const X86Subtarget &Subtarget, // allowing us to shift the active bits down if necessary to fit into the // special cases below. KnownBits Known = DAG.computeKnownBits(Op); + if (Known.isConstant()) + return DAG.getConstant(Known.getConstant().popcount(), DL, VT); unsigned LZ = Known.countMinLeadingZeros(); unsigned TZ = Known.countMinTrailingZeros(); assert((LZ + TZ) < Known.getBitWidth() && "Illegal shifted mask"); diff --git a/llvm/test/CodeGen/X86/pr122580.ll b/llvm/test/CodeGen/X86/pr122580.ll new file mode 100644 index 0000000000000..509729416cf57 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr122580.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s + +@g_180 = external global i8 +@g_1032 = external global [2 x i32] + +define i32 @PR122580(ptr %0) { +; CHECK-LABEL: PR122580: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq g_180@GOTPCREL(%rip), %rax +; CHECK-NEXT: cmpb $0, (%rax) +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: movl $878456583, %ecx # imm = 0x345C2F07 +; CHECK-NEXT: cmovnel %eax, %ecx +; CHECK-NEXT: movq g_1032@GOTPCREL(%rip), %rax +; CHECK-NEXT: movl $0, (%rax) +; CHECK-NEXT: movl %ecx, (%rdi) +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retq +entry: + %.b577 = load i1, ptr @g_180, align 4 + %1 = select i1 %.b577, i32 1, i32 878456582 + store i32 0, ptr @g_1032, align 4 + %.b576 = load i1, ptr @g_180, align 4 + %2 = select i1 %.b576, i32 1, i32 878456582 + %or542.1.i = or i32 %2, %1 + store i32 0, ptr @g_1032, align 4 + %.b575 = load i1, ptr @g_180, align 4 + %3 = select i1 %.b575, i32 1, i32 878456582 + %or542.2.i = or i32 %3, %or542.1.i + %or542.3.i = or i32 %or542.2.i, 1 + store i32 %or542.3.i, ptr %0, align 4 + %4 = load i32, ptr %0, align 4 + %div.i1796.i = sdiv i32 %4, 1096912269 + %5 = tail call i32 @llvm.ctpop.i32(i32 %div.i1796.i) + ret i32 %5 +} From b306eff56f950285e01c7accdb36f09d17506dcc Mon Sep 17 00:00:00 2001 From: William Moses Date: Sat, 11 Jan 2025 09:10:27 -0500 
Subject: [PATCH 164/408] [MLIR] Enable inlining for private symbols (#122572) The inlining code for llvm funcs seems to have needlessly forbidden inlining of private (e.g. non-cloning) symbols. --- .../LLVMIR/Transforms/InlinerInterfaceImpl.cpp | 2 -- mlir/test/Dialect/LLVMIR/inlining.mlir | 13 +++++++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp b/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp index dd20412ee7080..b3bed5ab5f412 100644 --- a/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp +++ b/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp @@ -663,8 +663,6 @@ struct LLVMInlinerInterface : public DialectInlinerInterface { bool isLegalToInline(Operation *call, Operation *callable, bool wouldBeCloned) const final { - if (!wouldBeCloned) - return false; if (!isa(call)) { LLVM_DEBUG(llvm::dbgs() << "Cannot inline: call is not an '" << LLVM::CallOp::getOperationName() << "' op\n"); diff --git a/mlir/test/Dialect/LLVMIR/inlining.mlir b/mlir/test/Dialect/LLVMIR/inlining.mlir index 0b7ca3f2bb048..edaac4da0b044 100644 --- a/mlir/test/Dialect/LLVMIR/inlining.mlir +++ b/mlir/test/Dialect/LLVMIR/inlining.mlir @@ -663,3 +663,16 @@ llvm.func @caller() { llvm.call @vararg_intrinrics() : () -> () llvm.return } + +// ----- + +llvm.func @private_func(%a : i32) -> i32 attributes {sym_visibility = "private"} { + llvm.return %a : i32 +} + +// CHECK-LABEL: func @caller +llvm.func @caller(%x : i32) -> i32 { + // CHECK-NOT: llvm.call @private_func + %z = llvm.call @private_func(%x) : (i32) -> (i32) + llvm.return %z : i32 +} From 38fcf62483907aa48325e60d1f685805e846c6ea Mon Sep 17 00:00:00 2001 From: William Moses Date: Sat, 11 Jan 2025 09:11:22 -0500 Subject: [PATCH 165/408] [MLIR] Import LLVM add flag to disable loadalldialects (#122574) Co-authored-by: Oleksandr "Alex" Zinenko --- mlir/include/mlir/Target/LLVMIR/Import.h | 5 ++++- mlir/lib/Target/LLVMIR/ModuleImport.cpp | 6 ++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/mlir/include/mlir/Target/LLVMIR/Import.h b/mlir/include/mlir/Target/LLVMIR/Import.h index 4a1242a8eb059..4aa8f2ab7d8ce 100644 --- a/mlir/include/mlir/Target/LLVMIR/Import.h +++ b/mlir/include/mlir/Target/LLVMIR/Import.h @@ -39,10 +39,13 @@ class ModuleOp; /// be imported without elements. If set, the option avoids the recursive /// traversal of composite type debug information, which can be expensive for /// adversarial inputs. +/// The `loadAllDialects` flag (default on) will load all dialects in the +/// context. OwningOpRef translateLLVMIRToModule(std::unique_ptr llvmModule, MLIRContext *context, bool emitExpensiveWarnings = true, - bool dropDICompositeTypeElements = false); + bool dropDICompositeTypeElements = false, + bool loadAllDialects = true); /// Translate the given LLVM data layout into an MLIR equivalent using the DLTI /// dialect. 
diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp index 2d8d7745eca9b..eba86f06d0905 100644 --- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp @@ -2325,7 +2325,8 @@ ModuleImport::translateLoopAnnotationAttr(const llvm::MDNode *node, OwningOpRef mlir::translateLLVMIRToModule(std::unique_ptr llvmModule, MLIRContext *context, bool emitExpensiveWarnings, - bool dropDICompositeTypeElements) { + bool dropDICompositeTypeElements, + bool loadAllDialects) { // Preload all registered dialects to allow the import to iterate the // registered LLVMImportDialectInterface implementations and query the // supported LLVM IR constructs before starting the translation. Assumes the @@ -2335,7 +2336,8 @@ mlir::translateLLVMIRToModule(std::unique_ptr llvmModule, LLVMDialect::getDialectNamespace())); assert(llvm::is_contained(context->getAvailableDialects(), DLTIDialect::getDialectNamespace())); - context->loadAllAvailableDialects(); + if (loadAllDialects) + context->loadAllAvailableDialects(); OwningOpRef module(ModuleOp::create(FileLineColLoc::get( StringAttr::get(context, llvmModule->getSourceFileName()), /*line=*/0, /*column=*/0))); From ba58d35019ae40641ae454472a43965a7bee5696 Mon Sep 17 00:00:00 2001 From: Janis Heims Date: Sat, 11 Jan 2025 14:18:05 +0000 Subject: [PATCH 166/408] [M68k] Use M68010 cpu as target for SR move test (#122452) Fixes the test introduced in #111145. It would also make sense to throw an error when the user attempts to use a move-from-sr on an unsupported architecture. Currently the encoder generates garbage instructions for a 68000 because the AsmMatcher is able to match the move against a MOV16rr --- llvm/test/MC/M68k/Data/Classes/MxMoveSR.s | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/MC/M68k/Data/Classes/MxMoveSR.s b/llvm/test/MC/M68k/Data/Classes/MxMoveSR.s index 03189311badb8..e999345f43fc3 100644 --- a/llvm/test/MC/M68k/Data/Classes/MxMoveSR.s +++ b/llvm/test/MC/M68k/Data/Classes/MxMoveSR.s @@ -1,4 +1,4 @@ -; RUN: llvm-mc -triple=m68k -mcpu=M68000 -show-encoding %s | FileCheck %s +; RUN: llvm-mc -triple=m68k -mcpu=M68010 -show-encoding %s | FileCheck %s ; CHECK: move.w %d1, %sr ; CHECK-SAME: encoding: [0x46,0xc1] From ae9bf17697d2245be707e93125f18d09eaf77aa9 Mon Sep 17 00:00:00 2001 From: Congcong Cai Date: Sat, 11 Jan 2025 22:46:04 +0800 Subject: [PATCH 167/408] [clang-tidy] remove never used IgnoreCase in option (#122573) --- .../clang-tidy/ClangTidyCheck.cpp | 15 ++++++------ clang-tools-extra/clang-tidy/ClangTidyCheck.h | 23 +++++++++---------- .../clang-tidy/ClangTidyOptionsTest.cpp | 7 ------ 3 files changed, 18 insertions(+), 27 deletions(-) diff --git a/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp b/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp index 4aa9fe228ee79..341343e90822b 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp @@ -163,9 +163,10 @@ void ClangTidyCheck::OptionsView::store( store(Options, LocalName, Value ? StringRef("true") : StringRef("false")); } -std::optional ClangTidyCheck::OptionsView::getEnumInt( - StringRef LocalName, ArrayRef Mapping, bool CheckGlobal, - bool IgnoreCase) const { +std::optional +ClangTidyCheck::OptionsView::getEnumInt(StringRef LocalName, + ArrayRef Mapping, + bool CheckGlobal) const { if (!CheckGlobal && Context->getOptionsCollector()) Context->getOptionsCollector()->insert((NamePrefix + LocalName).str()); auto Iter = CheckGlobal ? 
findPriorityOption(CheckOptions, NamePrefix, @@ -178,12 +179,10 @@ std::optional ClangTidyCheck::OptionsView::getEnumInt( StringRef Closest; unsigned EditDistance = 3; for (const auto &NameAndEnum : Mapping) { - if (IgnoreCase) { - if (Value.equals_insensitive(NameAndEnum.second)) - return NameAndEnum.first; - } else if (Value == NameAndEnum.second) { + if (Value == NameAndEnum.second) { return NameAndEnum.first; - } else if (Value.equals_insensitive(NameAndEnum.second)) { + } + if (Value.equals_insensitive(NameAndEnum.second)) { Closest = NameAndEnum.second; EditDistance = 0; continue; diff --git a/clang-tools-extra/clang-tidy/ClangTidyCheck.h b/clang-tools-extra/clang-tidy/ClangTidyCheck.h index 7427aa9bf48f8..037526a0bd9af 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyCheck.h +++ b/clang-tools-extra/clang-tidy/ClangTidyCheck.h @@ -333,9 +333,9 @@ class ClangTidyCheck : public ast_matchers::MatchFinder::MatchCallback { /// supply the mapping required to convert between ``T`` and a string. template std::enable_if_t, std::optional> - get(StringRef LocalName, bool IgnoreCase = false) const { + get(StringRef LocalName) const { if (std::optional ValueOr = - getEnumInt(LocalName, typeEraseMapping(), false, IgnoreCase)) + getEnumInt(LocalName, typeEraseMapping(), false)) return static_cast(*ValueOr); return std::nullopt; } @@ -353,9 +353,9 @@ class ClangTidyCheck : public ast_matchers::MatchFinder::MatchCallback { /// \ref clang::tidy::OptionEnumMapping must be specialized for ``T`` to /// supply the mapping required to convert between ``T`` and a string. template - std::enable_if_t, T> get(StringRef LocalName, T Default, - bool IgnoreCase = false) const { - return get(LocalName, IgnoreCase).value_or(Default); + std::enable_if_t, T> get(StringRef LocalName, + T Default) const { + return get(LocalName).value_or(Default); } /// Read a named option from the ``Context`` and parse it as an @@ -373,9 +373,9 @@ class ClangTidyCheck : public ast_matchers::MatchFinder::MatchCallback { /// supply the mapping required to convert between ``T`` and a string. template std::enable_if_t, std::optional> - getLocalOrGlobal(StringRef LocalName, bool IgnoreCase = false) const { + getLocalOrGlobal(StringRef LocalName) const { if (std::optional ValueOr = - getEnumInt(LocalName, typeEraseMapping(), true, IgnoreCase)) + getEnumInt(LocalName, typeEraseMapping(), true)) return static_cast(*ValueOr); return std::nullopt; } @@ -394,10 +394,9 @@ class ClangTidyCheck : public ast_matchers::MatchFinder::MatchCallback { /// \ref clang::tidy::OptionEnumMapping must be specialized for ``T`` to /// supply the mapping required to convert between ``T`` and a string. 
   template <typename T>
-  std::enable_if_t<std::is_enum_v<T>, T>
-  getLocalOrGlobal(StringRef LocalName, T Default,
-                   bool IgnoreCase = false) const {
-    return getLocalOrGlobal<T>(LocalName, IgnoreCase).value_or(Default);
+  std::enable_if_t<std::is_enum_v<T>, T> getLocalOrGlobal(StringRef LocalName,
+                                                          T Default) const {
+    return getLocalOrGlobal<T>(LocalName).value_or(Default);
   }
 
   /// Stores an option with the check-local name \p LocalName with
@@ -454,7 +453,7 @@ class ClangTidyCheck : public ast_matchers::MatchFinder::MatchCallback {
 
   std::optional<int64_t> getEnumInt(StringRef LocalName,
                                     ArrayRef<NameAndValue> Mapping,
-                                    bool CheckGlobal, bool IgnoreCase) const;
+                                    bool CheckGlobal) const;
 
   template <typename T>
   std::enable_if_t<std::is_enum_v<T>, std::vector<NameAndValue>>
diff --git a/clang-tools-extra/unittests/clang-tidy/ClangTidyOptionsTest.cpp b/clang-tools-extra/unittests/clang-tidy/ClangTidyOptionsTest.cpp
index af4f66ae3c54f..aaec0e6b50bbf 100644
--- a/clang-tools-extra/unittests/clang-tidy/ClangTidyOptionsTest.cpp
+++ b/clang-tools-extra/unittests/clang-tidy/ClangTidyOptionsTest.cpp
@@ -417,13 +417,6 @@ TEST(ValidConfiguration, ValidEnumOptions) {
   CHECK_VAL(TestCheck.getIntLocal<Colours>("Valid"), Colours::Red);
   CHECK_VAL(TestCheck.getIntGlobal<Colours>("GlobalValid"), Colours::Violet);
 
-  CHECK_VAL(
-      TestCheck.getIntLocal<Colours>("ValidWrongCase", /*IgnoreCase*/ true),
-      Colours::Red);
-  CHECK_VAL(TestCheck.getIntGlobal<Colours>("GlobalValidWrongCase",
-                                            /*IgnoreCase*/ true),
-            Colours::Violet);
-
   EXPECT_FALSE(TestCheck.getIntLocal<Colours>("ValidWrongCase").has_value());
   EXPECT_FALSE(TestCheck.getIntLocal<Colours>("NearMiss").has_value());
   EXPECT_FALSE(TestCheck.getIntGlobal<Colours>("GlobalInvalid").has_value());

From 0249554ee1ac49e6f1d93fa78a55971fc706f635 Mon Sep 17 00:00:00 2001
From: Congcong Cai
Date: Sat, 11 Jan 2025 22:51:47 +0800
Subject: [PATCH 168/408] [clang-tidy] fix incorrect configuration file path
 resolving when file paths contain `..` (#121323)

`makeAbsolute` does not normalize the path it produces. When the option
provider then walks up the parent folders of a file path that contains
`..`, the `..` component makes the walk step into the subfolder instead
of the parent folder, so configuration files can be picked up from the
wrong directories.
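As a minimal sketch of the failure mode (the paths below are invented for
illustration): for a file named `/proj/cfg/../code.cpp`, repeatedly taking
the parent path without normalization first visits `/proj/cfg/..` and then
`/proj/cfg`, so a `.clang-tidy` inside `cfg` is wrongly considered. The fix
normalizes the path before the walk:

```c++
#include "llvm/ADT/SmallString.h"
#include "llvm/Support/Path.h"

int main() {
  llvm::SmallString<128> Path("/proj/cfg/../code.cpp");
  // Fold "." and ".." components before walking parent directories.
  llvm::sys::path::remove_dots(Path, /*remove_dot_dot=*/true);
  // Path is now "/proj/code.cpp", whose parents are "/proj" and "/" only.
}
```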
--- .../clang-tidy/ClangTidyOptions.cpp | 32 ++++++++++++------- .../clang-tidy/ClangTidyOptions.h | 4 +++ clang-tools-extra/docs/ReleaseNotes.rst | 3 ++ .../Inputs/normalized-path/code.cpp | 0 .../normalized-path/error-config/.clang-tidy | 1 + .../infrastructure/normalized-path.test | 3 ++ 6 files changed, 32 insertions(+), 11 deletions(-) create mode 100644 clang-tools-extra/test/clang-tidy/infrastructure/Inputs/normalized-path/code.cpp create mode 100644 clang-tools-extra/test/clang-tidy/infrastructure/Inputs/normalized-path/error-config/.clang-tidy create mode 100644 clang-tools-extra/test/clang-tidy/infrastructure/normalized-path.test diff --git a/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp b/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp index e1d5df75f3e5a..8bac6f161fa05 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp @@ -12,6 +12,7 @@ #include "llvm/ADT/SmallString.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Errc.h" +#include "llvm/Support/ErrorOr.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBufferRef.h" #include "llvm/Support/Path.h" @@ -298,12 +299,11 @@ ConfigOptionsProvider::getRawOptions(llvm::StringRef FileName) { if (ConfigOptions.InheritParentConfig.value_or(false)) { LLVM_DEBUG(llvm::dbgs() << "Getting options for file " << FileName << "...\n"); - assert(FS && "FS must be set."); - llvm::SmallString<128> AbsoluteFilePath(FileName); - - if (!FS->makeAbsolute(AbsoluteFilePath)) { - addRawFileOptions(AbsoluteFilePath, RawOptions); + llvm::ErrorOr> AbsoluteFilePath = + getNormalizedAbsolutePath(FileName); + if (AbsoluteFilePath) { + addRawFileOptions(AbsoluteFilePath->str(), RawOptions); } } RawOptions.emplace_back(ConfigOptions, @@ -334,6 +334,17 @@ FileOptionsBaseProvider::FileOptionsBaseProvider( OverrideOptions(std::move(OverrideOptions)), ConfigHandlers(std::move(ConfigHandlers)) {} +llvm::ErrorOr> +FileOptionsBaseProvider::getNormalizedAbsolutePath(llvm::StringRef Path) { + assert(FS && "FS must be set."); + llvm::SmallString<128> NormalizedAbsolutePath = {Path}; + std::error_code Err = FS->makeAbsolute(NormalizedAbsolutePath); + if (Err) + return Err; + llvm::sys::path::remove_dots(NormalizedAbsolutePath, /*remove_dot_dot=*/true); + return NormalizedAbsolutePath; +} + void FileOptionsBaseProvider::addRawFileOptions( llvm::StringRef AbsolutePath, std::vector &CurOptions) { auto CurSize = CurOptions.size(); @@ -397,16 +408,15 @@ std::vector FileOptionsProvider::getRawOptions(StringRef FileName) { LLVM_DEBUG(llvm::dbgs() << "Getting options for file " << FileName << "...\n"); - assert(FS && "FS must be set."); - - llvm::SmallString<128> AbsoluteFilePath(FileName); - if (FS->makeAbsolute(AbsoluteFilePath)) + llvm::ErrorOr> AbsoluteFilePath = + getNormalizedAbsolutePath(FileName); + if (!AbsoluteFilePath) return {}; std::vector RawOptions = - DefaultOptionsProvider::getRawOptions(AbsoluteFilePath.str()); - addRawFileOptions(AbsoluteFilePath, RawOptions); + DefaultOptionsProvider::getRawOptions(AbsoluteFilePath->str()); + addRawFileOptions(AbsoluteFilePath->str(), RawOptions); OptionsSource CommandLineOptions(OverrideOptions, OptionsSourceTypeCheckCommandLineOption); diff --git a/clang-tools-extra/clang-tidy/ClangTidyOptions.h b/clang-tools-extra/clang-tidy/ClangTidyOptions.h index 568f60cf98b21..dd78c570d25d9 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyOptions.h +++ b/clang-tools-extra/clang-tidy/ClangTidyOptions.h @@ -10,6 +10,7 @@ #define 
LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CLANGTIDYOPTIONS_H #include "llvm/ADT/IntrusiveRefCntPtr.h" +#include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/ErrorOr.h" @@ -237,6 +238,9 @@ class FileOptionsBaseProvider : public DefaultOptionsProvider { void addRawFileOptions(llvm::StringRef AbsolutePath, std::vector &CurOptions); + llvm::ErrorOr> + getNormalizedAbsolutePath(llvm::StringRef AbsolutePath); + /// Try to read configuration files from \p Directory using registered /// \c ConfigHandlers. std::optional tryReadConfigFile(llvm::StringRef Directory); diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index d75a276729c58..9818ec9603bbc 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -117,6 +117,9 @@ Improvements to clang-tidy - Improved :program:`run-clang-tidy.py` script. Fixed minor shutdown noise happening on certain platforms when interrupting the script. +- Improved :program:`clang-tidy` by fixing incorrect configuration file path + resolving when file paths contain ``..``. + - Removed :program:`clang-tidy`'s global options for most of checks. All options are changed to local options except `IncludeStyle`, `StrictMode` and `IgnoreMacros`. Global scoped `StrictMode` and `IgnoreMacros` are deprecated diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/normalized-path/code.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/normalized-path/code.cpp new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/normalized-path/error-config/.clang-tidy b/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/normalized-path/error-config/.clang-tidy new file mode 100644 index 0000000000000..83bc4d519722b --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/normalized-path/error-config/.clang-tidy @@ -0,0 +1 @@ +InvalidYamlFormat diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/normalized-path.test b/clang-tools-extra/test/clang-tidy/infrastructure/normalized-path.test new file mode 100644 index 0000000000000..d14ad43bb8b13 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/infrastructure/normalized-path.test @@ -0,0 +1,3 @@ +// RUN: clang-tidy %S/Inputs/normalized-path/error-config/../code.cpp --verify-config 2>&1 | FileCheck %s + +// CHECK-NOT: Error parsing From 78953433a5a0e3551f4c698636fe46e2536a30d2 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 11 Jan 2025 15:00:47 +0000 Subject: [PATCH 169/408] [X86] vector popcnt tests - regenerate VPTERNLOG comments --- .../CodeGen/X86/vector-popcnt-128-ult-ugt.ll | 16 +- llvm/test/CodeGen/X86/vector-popcnt-128.ll | 32 +- .../CodeGen/X86/vector-popcnt-256-ult-ugt.ll | 16 +- llvm/test/CodeGen/X86/vector-popcnt-256.ll | 32 +- .../CodeGen/X86/vector-popcnt-512-ult-ugt.ll | 1520 ++++++++--------- llvm/test/CodeGen/X86/vector-popcnt-512.ll | 80 +- 6 files changed, 848 insertions(+), 848 deletions(-) diff --git a/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll index c3d5a4b32edbc..b5b9af543ed5c 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll @@ -50,7 +50,7 @@ define <16 x i8> @ugt_1_v16i8(<16 x i8> %0) { ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VPOPCNTDQ-NEXT: 
vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -62,7 +62,7 @@ define <16 x i8> @ugt_1_v16i8(<16 x i8> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_1_v16i8: @@ -1543,7 +1543,7 @@ define <8 x i16> @ugt_1_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -1555,7 +1555,7 @@ define <8 x i16> @ugt_1_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_1_v8i16: @@ -5806,7 +5806,7 @@ define <4 x i32> @ugt_1_v4i32(<4 x i32> %0) { ; BITALG_NOVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; BITALG_NOVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -5818,7 +5818,7 @@ define <4 x i32> @ugt_1_v4i32(<4 x i32> %0) { ; BITALG-NEXT: vpand %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; BITALG-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp ugt <4 x i32> %2, @@ -16832,7 +16832,7 @@ define <2 x i64> @ugt_1_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; BITALG_NOVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -16844,7 +16844,7 @@ define <2 x i64> @ugt_1_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpand %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; BITALG-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, diff --git a/llvm/test/CodeGen/X86/vector-popcnt-128.ll b/llvm/test/CodeGen/X86/vector-popcnt-128.ll index 58cacfb0485ec..e178ab0348a76 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-128.ll @@ -775,7 +775,7 @@ define <2 
x i64> @eq_1_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vpminuq %zmm1, %zmm0, %zmm1 ; BITALG_NOVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; BITALG_NOVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -787,7 +787,7 @@ define <2 x i64> @eq_1_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpminuq %xmm1, %xmm0, %xmm1 ; BITALG-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; BITALG-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp eq <2 x i64> %2, @@ -837,7 +837,7 @@ define <2 x i64> @ne_1_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -847,7 +847,7 @@ define <2 x i64> @ne_1_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ne_1_v2i64: @@ -954,7 +954,7 @@ define <4 x i32> @eq_1_v4i32(<4 x i32> %0) { ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vpminud %xmm1, %xmm0, %xmm1 ; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; BITALG_NOVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -966,7 +966,7 @@ define <4 x i32> @eq_1_v4i32(<4 x i32> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpminud %xmm1, %xmm0, %xmm1 ; BITALG-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; BITALG-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp eq <4 x i32> %2, @@ -1040,7 +1040,7 @@ define <4 x i32> @ne_1_v4i32(<4 x i32> %0) { ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -1050,7 +1050,7 @@ define <4 x i32> @ne_1_v4i32(<4 x i32> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ne_1_v4i32: @@ -1140,7 +1140,7 @@ define <8 x i16> @eq_1_v8i16(<8 x i16> %0) 
{ ; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vpminuw %xmm1, %xmm0, %xmm1 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -1152,7 +1152,7 @@ define <8 x i16> @eq_1_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vpminuw %xmm1, %xmm0, %xmm1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: eq_1_v8i16: @@ -1245,7 +1245,7 @@ define <8 x i16> @ne_1_v8i16(<8 x i16> %0) { ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; BITALG_NOVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -1254,7 +1254,7 @@ define <8 x i16> @ne_1_v8i16(<8 x i16> %0) { ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 ; BITALG-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; BITALG-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; BITALG-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) %3 = icmp ne <8 x i16> %2, @@ -1299,7 +1299,7 @@ define <16 x i8> @eq_1_v16i8(<16 x i8> %0) { ; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vpminub %xmm1, %xmm0, %xmm1 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -1311,7 +1311,7 @@ define <16 x i8> @eq_1_v16i8(<16 x i8> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vpminub %xmm1, %xmm0, %xmm1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: eq_1_v16i8: @@ -1374,7 +1374,7 @@ define <16 x i8> @ne_1_v16i8(<16 x i8> %0) { ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; BITALG_NOVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -1383,7 +1383,7 @@ define <16 x i8> @ne_1_v16i8(<16 x i8> %0) { ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; BITALG-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; BITALG-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; BITALG-NEXT: retq %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0) %3 = icmp ne <16 x i8> %2, diff --git a/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll 
b/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll index ab1922e3ad9a2..f72ad6d70522f 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll @@ -41,7 +41,7 @@ define <32 x i8> @ugt_1_v32i8(<32 x i8> %0) { ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512VPOPCNTDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -52,7 +52,7 @@ define <32 x i8> @ugt_1_v32i8(<32 x i8> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpternlogq $15, %ymm0, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpternlogq {{.*#+}} ymm0 = ~ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_1_v32i8: @@ -1034,7 +1034,7 @@ define <16 x i16> @ugt_1_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512VPOPCNTDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -1045,7 +1045,7 @@ define <16 x i16> @ugt_1_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpternlogq $15, %ymm0, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpternlogq {{.*#+}} ymm0 = ~ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_1_v16i16: @@ -3348,7 +3348,7 @@ define <8 x i32> @ugt_1_v8i32(<8 x i32> %0) { ; BITALG_NOVLX-NEXT: vpand %ymm1, %ymm0, %ymm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; BITALG_NOVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; BITALG_NOVLX-NEXT: retq ; @@ -3359,7 +3359,7 @@ define <8 x i32> @ugt_1_v8i32(<8 x i32> %0) { ; BITALG-NEXT: vpand %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpternlogq $15, %ymm0, %ymm0, %ymm0 +; BITALG-NEXT: vpternlogq {{.*#+}} ymm0 = ~ymm0 ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp ugt <8 x i32> %2, @@ -9423,7 +9423,7 @@ define <4 x i64> @ugt_1_v4i64(<4 x i64> %0) { ; BITALG_NOVLX-NEXT: vpand %ymm1, %ymm0, %ymm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; BITALG_NOVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; BITALG_NOVLX-NEXT: retq ; @@ -9434,7 +9434,7 @@ define <4 x i64> @ugt_1_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpand %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpternlogq $15, %ymm0, %ymm0, %ymm0 +; BITALG-NEXT: vpternlogq {{.*#+}} ymm0 = ~ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> 
%2, diff --git a/llvm/test/CodeGen/X86/vector-popcnt-256.ll b/llvm/test/CodeGen/X86/vector-popcnt-256.ll index a5ca4affdbc6b..6c45742730a62 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-256.ll @@ -507,7 +507,7 @@ define <4 x i64> @eq_1_v4i64(<4 x i64> %0) { ; BITALG_NOVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; BITALG_NOVLX-NEXT: vpminuq %zmm1, %zmm0, %zmm1 ; BITALG_NOVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; BITALG_NOVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; BITALG_NOVLX-NEXT: retq ; @@ -518,7 +518,7 @@ define <4 x i64> @eq_1_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpminuq %ymm1, %ymm0, %ymm1 ; BITALG-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpternlogq $15, %ymm0, %ymm0, %ymm0 +; BITALG-NEXT: vpternlogq {{.*#+}} ymm0 = ~ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp eq <4 x i64> %2, @@ -576,7 +576,7 @@ define <4 x i64> @ne_1_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] ; AVX512VPOPCNTDQ-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512VPOPCNTDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -585,7 +585,7 @@ define <4 x i64> @ne_1_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpternlogq $15, %ymm0, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpternlogq {{.*#+}} ymm0 = ~ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ne_1_v4i64: @@ -674,7 +674,7 @@ define <8 x i32> @eq_1_v8i32(<8 x i32> %0) { ; BITALG_NOVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; BITALG_NOVLX-NEXT: vpminud %ymm1, %ymm0, %ymm1 ; BITALG_NOVLX-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; BITALG_NOVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; BITALG_NOVLX-NEXT: retq ; @@ -685,7 +685,7 @@ define <8 x i32> @eq_1_v8i32(<8 x i32> %0) { ; BITALG-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpminud %ymm1, %ymm0, %ymm1 ; BITALG-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpternlogq $15, %ymm0, %ymm0, %ymm0 +; BITALG-NEXT: vpternlogq {{.*#+}} ymm0 = ~ymm0 ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp eq <8 x i32> %2, @@ -737,7 +737,7 @@ define <8 x i32> @ne_1_v8i32(<8 x i32> %0) { ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512VPOPCNTDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -746,7 +746,7 @@ define <8 x i32> @ne_1_v8i32(<8 x i32> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpternlogq $15, %ymm0, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpternlogq {{.*#+}} ymm0 = 
~ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ne_1_v8i32: @@ -820,7 +820,7 @@ define <16 x i16> @eq_1_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpminuw %ymm1, %ymm0, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512VPOPCNTDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -831,7 +831,7 @@ define <16 x i16> @eq_1_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpminuw %ymm1, %ymm0, %ymm1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpternlogq $15, %ymm0, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpternlogq {{.*#+}} ymm0 = ~ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: eq_1_v16i16: @@ -913,7 +913,7 @@ define <16 x i16> @ne_1_v16i16(<16 x i16> %0) { ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; BITALG_NOVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; BITALG_NOVLX-NEXT: retq ; @@ -921,7 +921,7 @@ define <16 x i16> @ne_1_v16i16(<16 x i16> %0) { ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 ; BITALG-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; BITALG-NEXT: vpternlogq $15, %ymm0, %ymm0, %ymm0 +; BITALG-NEXT: vpternlogq {{.*#+}} ymm0 = ~ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) %3 = icmp ne <16 x i16> %2, @@ -977,7 +977,7 @@ define <32 x i8> @eq_1_v32i8(<32 x i8> %0) { ; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpminub %ymm1, %ymm0, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512VPOPCNTDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -988,7 +988,7 @@ define <32 x i8> @eq_1_v32i8(<32 x i8> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpminub %ymm1, %ymm0, %ymm1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpternlogq $15, %ymm0, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpternlogq {{.*#+}} ymm0 = ~ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: eq_1_v32i8: @@ -1070,7 +1070,7 @@ define <32 x i8> @ne_1_v32i8(<32 x i8> %0) { ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; BITALG_NOVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; BITALG_NOVLX-NEXT: retq ; @@ -1078,7 +1078,7 @@ define <32 x i8> @ne_1_v32i8(<32 x i8> %0) { ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; BITALG-NEXT: vpternlogq $15, %ymm0, %ymm0, %ymm0 +; BITALG-NEXT: vpternlogq {{.*#+}} ymm0 = ~ymm0 ; BITALG-NEXT: retq %2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0) %3 = icmp ne <32 x i8> %2, diff --git 
a/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll index 182415f0ae5e2..828c97de3a079 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll @@ -19,12 +19,12 @@ define <64 x i8> @ugt_1_v64i8(<64 x i8> %0) { ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_1_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vptestmb %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 @@ -42,12 +42,12 @@ define <64 x i8> @ugt_1_v64i8(<64 x i8> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: retq ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_1_v64i8: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-BW-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm1, %zmm0, %zmm1 ; AVX512VPOPCNTDQ-BW-NEXT: vptestmb %zmm1, %zmm0, %k0 ; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2b %k0, %zmm0 @@ -82,7 +82,7 @@ define <64 x i8> @ult_2_v64i8(<64 x i8> %0) { ; ; AVX512BW-LABEL: ult_2_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vptestnmb %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 @@ -104,7 +104,7 @@ define <64 x i8> @ult_2_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_2_v64i8: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-BW-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm1, %zmm0, %zmm1 ; AVX512VPOPCNTDQ-BW-NEXT: vptestnmb %zmm1, %zmm0, %k0 ; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2b %k0, %zmm0 @@ -1045,12 +1045,12 @@ define <32 x i16> @ugt_1_v32i16(<32 x i16> %0) { ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_1_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vptestmw %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 @@ -1068,12 +1068,12 @@ define <32 x i16> @ugt_1_v32i16(<32 x i16> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: retq ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_1_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpternlogd $255, 
%zmm1, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-BW-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1 ; AVX512VPOPCNTDQ-BW-NEXT: vptestmw %zmm1, %zmm0, %k0 ; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2w %k0, %zmm0 @@ -1108,7 +1108,7 @@ define <32 x i16> @ult_2_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ult_2_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vptestnmw %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 @@ -1130,7 +1130,7 @@ define <32 x i16> @ult_2_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_2_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-BW-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1 ; AVX512VPOPCNTDQ-BW-NEXT: vptestnmw %zmm1, %zmm0, %k0 ; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2w %k0, %zmm0 @@ -3595,33 +3595,33 @@ define <32 x i16> @ult_15_v32i16(<32 x i16> %0) { define <16 x i32> @ugt_1_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_1_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_1_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vptestmd %zmm1, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_1_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_1_v16i32: ; BITALG: # %bb.0: -; BITALG-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; BITALG-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; BITALG-NEXT: vpaddd %zmm1, %zmm0, %zmm1 ; BITALG-NEXT: vptestmd %zmm1, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ugt <16 x i32> %2, @@ -3632,33 +3632,33 @@ define <16 x i32> @ugt_1_v16i32(<16 x i32> %0) { define <16 x i32> @ult_2_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_2_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1 ; AVX512F-NEXT: vptestnmd %zmm1, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_2_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vptestnmd %zmm1, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; 
AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_2_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_2_v16i32: ; BITALG: # %bb.0: -; BITALG-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; BITALG-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; BITALG-NEXT: vpaddd %zmm1, %zmm0, %zmm1 ; BITALG-NEXT: vptestnmd %zmm1, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ult <16 x i32> %2, @@ -3698,7 +3698,7 @@ define <16 x i32> @ugt_2_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_2_v16i32: @@ -3719,14 +3719,14 @@ define <16 x i32> @ugt_2_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_2_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_2_v16i32: @@ -3739,7 +3739,7 @@ define <16 x i32> @ugt_2_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ugt <16 x i32> %2, @@ -3779,7 +3779,7 @@ define <16 x i32> @ult_3_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_3_v16i32: @@ -3800,14 +3800,14 @@ define <16 x i32> @ult_3_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_3_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_3_v16i32: @@ -3820,7 +3820,7 @@ define <16 x i32> @ult_3_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ult <16 x i32> %2, @@ -3860,7 +3860,7 @@ define <16 x i32> @ugt_3_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_3_v16i32: @@ -3881,14 +3881,14 @@ define <16 x i32> @ugt_3_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_3_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_3_v16i32: @@ -3901,7 +3901,7 @@ define <16 x i32> @ugt_3_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ugt <16 x i32> %2, @@ -3941,7 +3941,7 @@ define <16 x i32> @ult_4_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_4_v16i32: @@ -3962,14 +3962,14 @@ define <16 x i32> @ult_4_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_4_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; 
BITALG-LABEL: ult_4_v16i32: @@ -3982,7 +3982,7 @@ define <16 x i32> @ult_4_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ult <16 x i32> %2, @@ -4022,7 +4022,7 @@ define <16 x i32> @ugt_4_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_4_v16i32: @@ -4043,14 +4043,14 @@ define <16 x i32> @ugt_4_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_4_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_4_v16i32: @@ -4063,7 +4063,7 @@ define <16 x i32> @ugt_4_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ugt <16 x i32> %2, @@ -4103,7 +4103,7 @@ define <16 x i32> @ult_5_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_5_v16i32: @@ -4124,14 +4124,14 @@ define <16 x i32> @ult_5_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_5_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_5_v16i32: @@ -4144,7 +4144,7 @@ define <16 x i32> @ult_5_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ult <16 x i32> %2, @@ -4184,7 +4184,7 @@ define <16 x i32> @ugt_5_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_5_v16i32: @@ -4205,14 +4205,14 @@ define <16 x i32> @ugt_5_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_5_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_5_v16i32: @@ -4225,7 +4225,7 @@ define <16 x i32> @ugt_5_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ugt <16 x i32> %2, @@ -4265,7 +4265,7 @@ define <16 x i32> @ult_6_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_6_v16i32: @@ -4286,14 +4286,14 @@ define <16 x i32> @ult_6_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_6_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_6_v16i32: @@ -4306,7 +4306,7 @@ define <16 x i32> @ult_6_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> 
@llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ult <16 x i32> %2, @@ -4346,7 +4346,7 @@ define <16 x i32> @ugt_6_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_6_v16i32: @@ -4367,14 +4367,14 @@ define <16 x i32> @ugt_6_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_6_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_6_v16i32: @@ -4387,7 +4387,7 @@ define <16 x i32> @ugt_6_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ugt <16 x i32> %2, @@ -4427,7 +4427,7 @@ define <16 x i32> @ult_7_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_7_v16i32: @@ -4448,14 +4448,14 @@ define <16 x i32> @ult_7_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_7_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_7_v16i32: @@ -4468,7 +4468,7 @@ define <16 x i32> @ult_7_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ult <16 x i32> %2, @@ -4508,7 +4508,7 @@ define <16 x i32> @ugt_7_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, 
%zmm0 ; AVX512F-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_7_v16i32: @@ -4529,14 +4529,14 @@ define <16 x i32> @ugt_7_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_7_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_7_v16i32: @@ -4549,7 +4549,7 @@ define <16 x i32> @ugt_7_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ugt <16 x i32> %2, @@ -4589,7 +4589,7 @@ define <16 x i32> @ult_8_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_8_v16i32: @@ -4610,14 +4610,14 @@ define <16 x i32> @ult_8_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_8_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_8_v16i32: @@ -4630,7 +4630,7 @@ define <16 x i32> @ult_8_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ult <16 x i32> %2, @@ -4670,7 +4670,7 @@ define <16 x i32> @ugt_8_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; 
AVX512BW-LABEL: ugt_8_v16i32: @@ -4691,14 +4691,14 @@ define <16 x i32> @ugt_8_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_8_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_8_v16i32: @@ -4711,7 +4711,7 @@ define <16 x i32> @ugt_8_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ugt <16 x i32> %2, @@ -4751,7 +4751,7 @@ define <16 x i32> @ult_9_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_9_v16i32: @@ -4772,14 +4772,14 @@ define <16 x i32> @ult_9_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_9_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_9_v16i32: @@ -4792,7 +4792,7 @@ define <16 x i32> @ult_9_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ult <16 x i32> %2, @@ -4832,7 +4832,7 @@ define <16 x i32> @ugt_9_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_9_v16i32: @@ -4853,14 +4853,14 @@ define <16 x i32> @ugt_9_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_9_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_9_v16i32: @@ -4873,7 +4873,7 @@ define <16 x i32> @ugt_9_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ugt <16 x i32> %2, @@ -4913,7 +4913,7 @@ define <16 x i32> @ult_10_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_10_v16i32: @@ -4934,14 +4934,14 @@ define <16 x i32> @ult_10_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_10_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_10_v16i32: @@ -4954,7 +4954,7 @@ define <16 x i32> @ult_10_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ult <16 x i32> %2, @@ -4994,7 +4994,7 @@ define <16 x i32> @ugt_10_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_10_v16i32: @@ -5015,14 +5015,14 @@ define <16 x i32> @ugt_10_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; 
AVX512VPOPCNTDQ-LABEL: ugt_10_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_10_v16i32: @@ -5035,7 +5035,7 @@ define <16 x i32> @ugt_10_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ugt <16 x i32> %2, @@ -5075,7 +5075,7 @@ define <16 x i32> @ult_11_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_11_v16i32: @@ -5096,14 +5096,14 @@ define <16 x i32> @ult_11_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_11_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_11_v16i32: @@ -5116,7 +5116,7 @@ define <16 x i32> @ult_11_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ult <16 x i32> %2, @@ -5156,7 +5156,7 @@ define <16 x i32> @ugt_11_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_11_v16i32: @@ -5177,14 +5177,14 @@ define <16 x i32> @ugt_11_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_11_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; 
AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_11_v16i32: @@ -5197,7 +5197,7 @@ define <16 x i32> @ugt_11_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ugt <16 x i32> %2, @@ -5237,7 +5237,7 @@ define <16 x i32> @ult_12_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_12_v16i32: @@ -5258,14 +5258,14 @@ define <16 x i32> @ult_12_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_12_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_12_v16i32: @@ -5278,7 +5278,7 @@ define <16 x i32> @ult_12_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ult <16 x i32> %2, @@ -5318,7 +5318,7 @@ define <16 x i32> @ugt_12_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_12_v16i32: @@ -5339,14 +5339,14 @@ define <16 x i32> @ugt_12_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_12_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_12_v16i32: @@ -5359,7 +5359,7 
@@ define <16 x i32> @ugt_12_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ugt <16 x i32> %2, @@ -5399,7 +5399,7 @@ define <16 x i32> @ult_13_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_13_v16i32: @@ -5420,14 +5420,14 @@ define <16 x i32> @ult_13_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_13_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_13_v16i32: @@ -5440,7 +5440,7 @@ define <16 x i32> @ult_13_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ult <16 x i32> %2, @@ -5480,7 +5480,7 @@ define <16 x i32> @ugt_13_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_13_v16i32: @@ -5501,14 +5501,14 @@ define <16 x i32> @ugt_13_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_13_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_13_v16i32: @@ -5521,7 +5521,7 @@ define <16 x i32> @ugt_13_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 
-; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ugt <16 x i32> %2, @@ -5561,7 +5561,7 @@ define <16 x i32> @ult_14_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_14_v16i32: @@ -5582,14 +5582,14 @@ define <16 x i32> @ult_14_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_14_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_14_v16i32: @@ -5602,7 +5602,7 @@ define <16 x i32> @ult_14_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ult <16 x i32> %2, @@ -5642,7 +5642,7 @@ define <16 x i32> @ugt_14_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_14_v16i32: @@ -5663,14 +5663,14 @@ define <16 x i32> @ugt_14_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_14_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_14_v16i32: @@ -5683,7 +5683,7 @@ define <16 x i32> @ugt_14_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ugt 
<16 x i32> %2, @@ -5723,7 +5723,7 @@ define <16 x i32> @ult_15_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_15_v16i32: @@ -5744,14 +5744,14 @@ define <16 x i32> @ult_15_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_15_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_15_v16i32: @@ -5764,7 +5764,7 @@ define <16 x i32> @ult_15_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ult <16 x i32> %2, @@ -5804,7 +5804,7 @@ define <16 x i32> @ugt_15_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_15_v16i32: @@ -5825,14 +5825,14 @@ define <16 x i32> @ugt_15_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_15_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_15_v16i32: @@ -5845,7 +5845,7 @@ define <16 x i32> @ugt_15_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ugt <16 x i32> %2, @@ -5885,7 +5885,7 @@ define <16 x i32> @ult_16_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_16_v16i32: @@ -5906,14 +5906,14 @@ define <16 x i32> @ult_16_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_16_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_16_v16i32: @@ -5926,7 +5926,7 @@ define <16 x i32> @ult_16_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ult <16 x i32> %2, @@ -5966,7 +5966,7 @@ define <16 x i32> @ugt_16_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_16_v16i32: @@ -5987,14 +5987,14 @@ define <16 x i32> @ugt_16_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_16_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_16_v16i32: @@ -6007,7 +6007,7 @@ define <16 x i32> @ugt_16_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ugt <16 x i32> %2, @@ -6047,7 +6047,7 @@ define <16 x i32> @ult_17_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: 
ult_17_v16i32: @@ -6068,14 +6068,14 @@ define <16 x i32> @ult_17_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_17_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_17_v16i32: @@ -6088,7 +6088,7 @@ define <16 x i32> @ult_17_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ult <16 x i32> %2, @@ -6128,7 +6128,7 @@ define <16 x i32> @ugt_17_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_17_v16i32: @@ -6149,14 +6149,14 @@ define <16 x i32> @ugt_17_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_17_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_17_v16i32: @@ -6169,7 +6169,7 @@ define <16 x i32> @ugt_17_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ugt <16 x i32> %2, @@ -6209,7 +6209,7 @@ define <16 x i32> @ult_18_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_18_v16i32: @@ -6230,14 +6230,14 @@ define <16 x i32> @ult_18_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_18_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_18_v16i32: @@ -6250,7 +6250,7 @@ define <16 x i32> @ult_18_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ult <16 x i32> %2, @@ -6290,7 +6290,7 @@ define <16 x i32> @ugt_18_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_18_v16i32: @@ -6311,14 +6311,14 @@ define <16 x i32> @ugt_18_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_18_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_18_v16i32: @@ -6331,7 +6331,7 @@ define <16 x i32> @ugt_18_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ugt <16 x i32> %2, @@ -6371,7 +6371,7 @@ define <16 x i32> @ult_19_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_19_v16i32: @@ -6392,14 +6392,14 @@ define <16 x i32> @ult_19_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; 
AVX512VPOPCNTDQ-LABEL: ult_19_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_19_v16i32: @@ -6412,7 +6412,7 @@ define <16 x i32> @ult_19_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ult <16 x i32> %2, @@ -6452,7 +6452,7 @@ define <16 x i32> @ugt_19_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_19_v16i32: @@ -6473,14 +6473,14 @@ define <16 x i32> @ugt_19_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_19_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_19_v16i32: @@ -6493,7 +6493,7 @@ define <16 x i32> @ugt_19_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ugt <16 x i32> %2, @@ -6533,7 +6533,7 @@ define <16 x i32> @ult_20_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_20_v16i32: @@ -6554,14 +6554,14 @@ define <16 x i32> @ult_20_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_20_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; 
AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_20_v16i32: @@ -6574,7 +6574,7 @@ define <16 x i32> @ult_20_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ult <16 x i32> %2, @@ -6614,7 +6614,7 @@ define <16 x i32> @ugt_20_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_20_v16i32: @@ -6635,14 +6635,14 @@ define <16 x i32> @ugt_20_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_20_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_20_v16i32: @@ -6655,7 +6655,7 @@ define <16 x i32> @ugt_20_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ugt <16 x i32> %2, @@ -6695,7 +6695,7 @@ define <16 x i32> @ult_21_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_21_v16i32: @@ -6716,14 +6716,14 @@ define <16 x i32> @ult_21_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_21_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_21_v16i32: @@ -6736,7 +6736,7 
@@ define <16 x i32> @ult_21_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ult <16 x i32> %2, @@ -6776,7 +6776,7 @@ define <16 x i32> @ugt_21_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_21_v16i32: @@ -6797,14 +6797,14 @@ define <16 x i32> @ugt_21_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_21_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_21_v16i32: @@ -6817,7 +6817,7 @@ define <16 x i32> @ugt_21_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ugt <16 x i32> %2, @@ -6857,7 +6857,7 @@ define <16 x i32> @ult_22_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_22_v16i32: @@ -6878,14 +6878,14 @@ define <16 x i32> @ult_22_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_22_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_22_v16i32: @@ -6898,7 +6898,7 @@ define <16 x i32> @ult_22_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 
-; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ult <16 x i32> %2, @@ -6938,7 +6938,7 @@ define <16 x i32> @ugt_22_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_22_v16i32: @@ -6959,14 +6959,14 @@ define <16 x i32> @ugt_22_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_22_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_22_v16i32: @@ -6979,7 +6979,7 @@ define <16 x i32> @ugt_22_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ugt <16 x i32> %2, @@ -7019,7 +7019,7 @@ define <16 x i32> @ult_23_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_23_v16i32: @@ -7040,14 +7040,14 @@ define <16 x i32> @ult_23_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_23_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_23_v16i32: @@ -7060,7 +7060,7 @@ define <16 x i32> @ult_23_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ult 
<16 x i32> %2, @@ -7100,7 +7100,7 @@ define <16 x i32> @ugt_23_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_23_v16i32: @@ -7121,14 +7121,14 @@ define <16 x i32> @ugt_23_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_23_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_23_v16i32: @@ -7141,7 +7141,7 @@ define <16 x i32> @ugt_23_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ugt <16 x i32> %2, @@ -7181,7 +7181,7 @@ define <16 x i32> @ult_24_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_24_v16i32: @@ -7202,14 +7202,14 @@ define <16 x i32> @ult_24_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_24_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_24_v16i32: @@ -7222,7 +7222,7 @@ define <16 x i32> @ult_24_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ult <16 x i32> %2, @@ -7262,7 +7262,7 @@ define <16 x i32> @ugt_24_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_24_v16i32: @@ -7283,14 +7283,14 @@ define <16 x i32> @ugt_24_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_24_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_24_v16i32: @@ -7303,7 +7303,7 @@ define <16 x i32> @ugt_24_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ugt <16 x i32> %2, @@ -7343,7 +7343,7 @@ define <16 x i32> @ult_25_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_25_v16i32: @@ -7364,14 +7364,14 @@ define <16 x i32> @ult_25_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_25_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_25_v16i32: @@ -7384,7 +7384,7 @@ define <16 x i32> @ult_25_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ult <16 x i32> %2, @@ -7424,7 +7424,7 @@ define <16 x i32> @ugt_25_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: 
ugt_25_v16i32: @@ -7445,14 +7445,14 @@ define <16 x i32> @ugt_25_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_25_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_25_v16i32: @@ -7465,7 +7465,7 @@ define <16 x i32> @ugt_25_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ugt <16 x i32> %2, @@ -7505,7 +7505,7 @@ define <16 x i32> @ult_26_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_26_v16i32: @@ -7526,14 +7526,14 @@ define <16 x i32> @ult_26_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_26_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_26_v16i32: @@ -7546,7 +7546,7 @@ define <16 x i32> @ult_26_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ult <16 x i32> %2, @@ -7586,7 +7586,7 @@ define <16 x i32> @ugt_26_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_26_v16i32: @@ -7607,14 +7607,14 @@ define <16 x i32> @ugt_26_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_26_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_26_v16i32: @@ -7627,7 +7627,7 @@ define <16 x i32> @ugt_26_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ugt <16 x i32> %2, @@ -7667,7 +7667,7 @@ define <16 x i32> @ult_27_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_27_v16i32: @@ -7688,14 +7688,14 @@ define <16 x i32> @ult_27_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_27_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_27_v16i32: @@ -7708,7 +7708,7 @@ define <16 x i32> @ult_27_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ult <16 x i32> %2, @@ -7748,7 +7748,7 @@ define <16 x i32> @ugt_27_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_27_v16i32: @@ -7769,14 +7769,14 @@ define <16 x i32> @ugt_27_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; 
AVX512VPOPCNTDQ-LABEL: ugt_27_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_27_v16i32: @@ -7789,7 +7789,7 @@ define <16 x i32> @ugt_27_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ugt <16 x i32> %2, @@ -7829,7 +7829,7 @@ define <16 x i32> @ult_28_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_28_v16i32: @@ -7850,14 +7850,14 @@ define <16 x i32> @ult_28_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_28_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_28_v16i32: @@ -7870,7 +7870,7 @@ define <16 x i32> @ult_28_v16i32(<16 x i32> %0) { ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ult <16 x i32> %2, @@ -7910,7 +7910,7 @@ define <16 x i32> @ugt_28_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_28_v16i32: @@ -7931,14 +7931,14 @@ define <16 x i32> @ugt_28_v16i32(<16 x i32> %0) { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_28_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; 
+; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ugt_28_v16i32:
@@ -7951,7 +7951,7 @@ define <16 x i32> @ugt_28_v16i32(<16 x i32> %0) {
 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
 ; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1
-; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0)
 %3 = icmp ugt <16 x i32> %2,
@@ -7991,7 +7991,7 @@ define <16 x i32> @ult_29_v16i32(<16 x i32> %0) {
 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1
-; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ult_29_v16i32:
@@ -8012,14 +8012,14 @@ define <16 x i32> @ult_29_v16i32(<16 x i32> %0) {
 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ult_29_v16i32:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ult_29_v16i32:
@@ -8032,7 +8032,7 @@ define <16 x i32> @ult_29_v16i32(<16 x i32> %0) {
 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
 ; BITALG-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1
-; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0)
 %3 = icmp ult <16 x i32> %2,
@@ -8072,7 +8072,7 @@ define <16 x i32> @ugt_29_v16i32(<16 x i32> %0) {
 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1
-; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ugt_29_v16i32:
@@ -8093,14 +8093,14 @@ define <16 x i32> @ugt_29_v16i32(<16 x i32> %0) {
 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ugt_29_v16i32:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ugt_29_v16i32:
@@ -8113,7 +8113,7 @@ define <16 x i32> @ugt_29_v16i32(<16 x i32> %0) {
 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
 ; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1
-; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0)
 %3 = icmp ugt <16 x i32> %2,
@@ -8153,7 +8153,7 @@ define <16 x i32> @ult_30_v16i32(<16 x i32> %0) {
 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1
-; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ult_30_v16i32:
@@ -8174,14 +8174,14 @@ define <16 x i32> @ult_30_v16i32(<16 x i32> %0) {
 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ult_30_v16i32:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ult_30_v16i32:
@@ -8194,7 +8194,7 @@ define <16 x i32> @ult_30_v16i32(<16 x i32> %0) {
 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
 ; BITALG-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1
-; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0)
 %3 = icmp ult <16 x i32> %2,
@@ -8234,7 +8234,7 @@ define <16 x i32> @ugt_30_v16i32(<16 x i32> %0) {
 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1
-; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ugt_30_v16i32:
@@ -8255,14 +8255,14 @@ define <16 x i32> @ugt_30_v16i32(<16 x i32> %0) {
 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ugt_30_v16i32:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ugt_30_v16i32:
@@ -8275,7 +8275,7 @@ define <16 x i32> @ugt_30_v16i32(<16 x i32> %0) {
 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
 ; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1
-; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0)
 %3 = icmp ugt <16 x i32> %2,
@@ -8315,7 +8315,7 @@ define <16 x i32> @ult_31_v16i32(<16 x i32> %0) {
 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1
-; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ult_31_v16i32:
@@ -8336,14 +8336,14 @@ define <16 x i32> @ult_31_v16i32(<16 x i32> %0) {
 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ult_31_v16i32:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ult_31_v16i32:
@@ -8356,7 +8356,7 @@ define <16 x i32> @ult_31_v16i32(<16 x i32> %0) {
 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
 ; BITALG-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1
-; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0)
 %3 = icmp ult <16 x i32> %2,
@@ -8367,33 +8367,33 @@ define <16 x i32> @ult_31_v16i32(<16 x i32> %0) {
 define <8 x i64> @ugt_1_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_1_v8i64:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1
 ; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1
 ; AVX512F-NEXT: vptestmq %zmm1, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ugt_1_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = -1
 ; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm1
 ; AVX512BW-NEXT: vptestmq %zmm1, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ugt_1_v8i64:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ugt_1_v8i64:
 ; BITALG: # %bb.0:
-; BITALG-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; BITALG-NEXT: vpternlogd {{.*#+}} zmm1 = -1
 ; BITALG-NEXT: vpaddq %zmm1, %zmm0, %zmm1
 ; BITALG-NEXT: vptestmq %zmm1, %zmm0, %k1
-; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0)
 %3 = icmp ugt <8 x i64> %2,
@@ -8404,33 +8404,33 @@ define <8 x i64> @ugt_1_v8i64(<8 x i64> %0) {
 define <8 x i64> @ult_2_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_2_v8i64:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1
 ; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1
 ; AVX512F-NEXT: vptestnmq %zmm1, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ult_2_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = -1
 ; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm1
 ; AVX512BW-NEXT: vptestnmq %zmm1, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ult_2_v8i64:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ult_2_v8i64:
 ; BITALG: # %bb.0:
-; BITALG-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; BITALG-NEXT: vpternlogd {{.*#+}} zmm1 = -1
 ; BITALG-NEXT: vpaddq %zmm1, %zmm0, %zmm1
 ; BITALG-NEXT: vptestnmq %zmm1, %zmm0, %k1
-; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0)
 %3 = icmp ult <8 x i64> %2,
@@ -8462,7 +8462,7 @@ define <8 x i64> @ugt_2_v8i64(<8 x i64> %0) {
 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ugt_2_v8i64:
@@ -8479,14 +8479,14 @@ define <8 x i64> @ugt_2_v8i64(<8 x i64> %0) {
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ugt_2_v8i64:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ugt_2_v8i64:
@@ -8495,7 +8495,7 @@ define <8 x i64> @ugt_2_v8i64(<8 x i64> %0) {
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0)
 %3 = icmp ugt <8 x i64> %2,
@@ -8527,7 +8527,7 @@ define <8 x i64> @ult_3_v8i64(<8 x i64> %0) {
 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ult_3_v8i64:
@@ -8544,14 +8544,14 @@ define <8 x i64> @ult_3_v8i64(<8 x i64> %0) {
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ult_3_v8i64:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ult_3_v8i64:
@@ -8560,7 +8560,7 @@ define <8 x i64> @ult_3_v8i64(<8 x i64> %0) {
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0)
 %3 = icmp ult <8 x i64> %2,
@@ -8592,7 +8592,7 @@ define <8 x i64> @ugt_3_v8i64(<8 x i64> %0) {
 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ugt_3_v8i64:
@@ -8609,14 +8609,14 @@ define <8 x i64> @ugt_3_v8i64(<8 x i64> %0) {
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ugt_3_v8i64:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ugt_3_v8i64:
@@ -8625,7 +8625,7 @@ define <8 x i64> @ugt_3_v8i64(<8 x i64> %0) {
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0)
 %3 = icmp ugt <8 x i64> %2,
@@ -8657,7 +8657,7 @@ define <8 x i64> @ult_4_v8i64(<8 x i64> %0) {
 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ult_4_v8i64:
@@ -8674,14 +8674,14 @@ define <8 x i64> @ult_4_v8i64(<8 x i64> %0) {
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ult_4_v8i64:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ult_4_v8i64:
@@ -8690,7 +8690,7 @@ define <8 x i64> @ult_4_v8i64(<8 x i64> %0) {
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0)
 %3 = icmp ult <8 x i64> %2,
@@ -8722,7 +8722,7 @@ define <8 x i64> @ugt_4_v8i64(<8 x i64> %0) {
 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ugt_4_v8i64:
@@ -8739,14 +8739,14 @@ define <8 x i64> @ugt_4_v8i64(<8 x i64> %0) {
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ugt_4_v8i64:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ugt_4_v8i64:
@@ -8755,7 +8755,7 @@ define <8 x i64> @ugt_4_v8i64(<8 x i64> %0) {
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0)
 %3 = icmp ugt <8 x i64> %2,
@@ -8787,7 +8787,7 @@ define <8 x i64> @ult_5_v8i64(<8 x i64> %0) {
 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ult_5_v8i64:
@@ -8804,14 +8804,14 @@ define <8 x i64> @ult_5_v8i64(<8 x i64> %0) {
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ult_5_v8i64:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ult_5_v8i64:
@@ -8820,7 +8820,7 @@ define <8 x i64> @ult_5_v8i64(<8 x i64> %0) {
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0)
 %3 = icmp ult <8 x i64> %2,
@@ -8852,7 +8852,7 @@ define <8 x i64> @ugt_5_v8i64(<8 x i64> %0) {
 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ugt_5_v8i64:
@@ -8869,14 +8869,14 @@ define <8 x i64> @ugt_5_v8i64(<8 x i64> %0) {
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ugt_5_v8i64:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ugt_5_v8i64:
@@ -8885,7 +8885,7 @@ define <8 x i64> @ugt_5_v8i64(<8 x i64> %0) {
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0)
 %3 = icmp ugt <8 x i64> %2,
@@ -8917,7 +8917,7 @@ define <8 x i64> @ult_6_v8i64(<8 x i64> %0) {
 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ult_6_v8i64:
@@ -8934,14 +8934,14 @@ define <8 x i64> @ult_6_v8i64(<8 x i64> %0) {
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ult_6_v8i64:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ult_6_v8i64:
@@ -8950,7 +8950,7 @@ define <8 x i64> @ult_6_v8i64(<8 x i64> %0) {
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0)
 %3 = icmp ult <8 x i64> %2,
@@ -8982,7 +8982,7 @@ define <8 x i64> @ugt_6_v8i64(<8 x i64> %0) {
 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ugt_6_v8i64:
@@ -8999,14 +8999,14 @@ define <8 x i64> @ugt_6_v8i64(<8 x i64> %0) {
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ugt_6_v8i64:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ugt_6_v8i64:
@@ -9015,7 +9015,7 @@ define <8 x i64> @ugt_6_v8i64(<8 x i64> %0) {
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0)
 %3 = icmp ugt <8 x i64> %2,
@@ -9047,7 +9047,7 @@ define <8 x i64> @ult_7_v8i64(<8 x i64> %0) {
 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ult_7_v8i64:
@@ -9064,14 +9064,14 @@ define <8 x i64> @ult_7_v8i64(<8 x i64> %0) {
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ult_7_v8i64:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ult_7_v8i64:
@@ -9080,7 +9080,7 @@ define <8 x i64> @ult_7_v8i64(<8 x i64> %0) {
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0)
 %3 = icmp ult <8 x i64> %2,
@@ -9112,7 +9112,7 @@ define <8 x i64> @ugt_7_v8i64(<8 x i64> %0) {
 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ugt_7_v8i64:
@@ -9129,14 +9129,14 @@ define <8 x i64> @ugt_7_v8i64(<8 x i64> %0) {
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ugt_7_v8i64:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ugt_7_v8i64:
@@ -9145,7 +9145,7 @@ define <8 x i64> @ugt_7_v8i64(<8 x i64> %0) {
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0)
 %3 = icmp ugt <8 x i64> %2,
@@ -9177,7 +9177,7 @@ define <8 x i64> @ult_8_v8i64(<8 x i64> %0) {
 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ult_8_v8i64:
@@ -9194,14 +9194,14 @@ define <8 x i64> @ult_8_v8i64(<8 x i64> %0) {
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ult_8_v8i64:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ult_8_v8i64:
@@ -9210,7 +9210,7 @@ define <8 x i64> @ult_8_v8i64(<8 x i64> %0) {
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0)
 %3 = icmp ult <8 x i64> %2,
@@ -9242,7 +9242,7 @@ define <8 x i64> @ugt_8_v8i64(<8 x i64> %0) {
 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ugt_8_v8i64:
@@ -9259,14 +9259,14 @@ define <8 x i64> @ugt_8_v8i64(<8 x i64> %0) {
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ugt_8_v8i64:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ugt_8_v8i64:
@@ -9275,7 +9275,7 @@ define <8 x i64> @ugt_8_v8i64(<8 x i64> %0) {
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0)
 %3 = icmp ugt <8 x i64> %2,
@@ -9307,7 +9307,7 @@ define <8 x i64> @ult_9_v8i64(<8 x i64> %0) {
 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ult_9_v8i64:
@@ -9324,14 +9324,14 @@ define <8 x i64> @ult_9_v8i64(<8 x i64> %0) {
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ult_9_v8i64:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ult_9_v8i64:
@@ -9340,7 +9340,7 @@ define <8 x i64> @ult_9_v8i64(<8 x i64> %0) {
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0)
 %3 = icmp ult <8 x i64> %2,
@@ -9372,7 +9372,7 @@ define <8 x i64> @ugt_9_v8i64(<8 x i64> %0) {
 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ugt_9_v8i64:
@@ -9389,14 +9389,14 @@ define <8 x i64> @ugt_9_v8i64(<8 x i64> %0) {
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ugt_9_v8i64:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ugt_9_v8i64:
@@ -9405,7 +9405,7 @@ define <8 x i64> @ugt_9_v8i64(<8 x i64> %0) {
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0)
 %3 = icmp ugt <8 x i64> %2,
@@ -9437,7 +9437,7 @@ define <8 x i64> @ult_10_v8i64(<8 x i64> %0) {
 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ult_10_v8i64:
@@ -9454,14 +9454,14 @@ define <8 x i64> @ult_10_v8i64(<8 x i64> %0) {
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ult_10_v8i64:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ult_10_v8i64:
@@ -9470,7 +9470,7 @@ define <8 x i64> @ult_10_v8i64(<8 x i64> %0) {
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0)
 %3 = icmp ult <8 x i64> %2,
@@ -9502,7 +9502,7 @@ define <8 x i64> @ugt_10_v8i64(<8 x i64> %0) {
 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ugt_10_v8i64:
@@ -9519,14 +9519,14 @@ define <8 x i64> @ugt_10_v8i64(<8 x i64> %0) {
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ugt_10_v8i64:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ugt_10_v8i64:
@@ -9535,7 +9535,7 @@ define <8 x i64> @ugt_10_v8i64(<8 x i64> %0) {
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0)
 %3 = icmp ugt <8 x i64> %2,
@@ -9567,7 +9567,7 @@ define <8 x i64> @ult_11_v8i64(<8 x i64> %0) {
 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ult_11_v8i64:
@@ -9584,14 +9584,14 @@ define <8 x i64> @ult_11_v8i64(<8 x i64> %0) {
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ult_11_v8i64:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ult_11_v8i64:
@@ -9600,7 +9600,7 @@ define <8 x i64> @ult_11_v8i64(<8 x i64> %0) {
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0)
 %3 = icmp ult <8 x i64> %2,
@@ -9632,7 +9632,7 @@ define <8 x i64> @ugt_11_v8i64(<8 x i64> %0) {
 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ugt_11_v8i64:
@@ -9649,14 +9649,14 @@ define <8 x i64> @ugt_11_v8i64(<8 x i64> %0) {
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ugt_11_v8i64:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ugt_11_v8i64:
@@ -9665,7 +9665,7 @@ define <8 x i64> @ugt_11_v8i64(<8 x i64> %0) {
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0)
 %3 = icmp ugt <8 x i64> %2,
@@ -9697,7 +9697,7 @@ define <8 x i64> @ult_12_v8i64(<8 x i64> %0) {
 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ult_12_v8i64:
@@ -9714,14 +9714,14 @@ define <8 x i64> @ult_12_v8i64(<8 x i64> %0) {
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ult_12_v8i64:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ult_12_v8i64:
@@ -9730,7 +9730,7 @@ define <8 x i64> @ult_12_v8i64(<8 x i64> %0) {
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0)
 %3 = icmp ult <8 x i64> %2,
@@ -9762,7 +9762,7 @@ define <8 x i64> @ugt_12_v8i64(<8 x i64> %0) {
 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ugt_12_v8i64:
@@ -9779,14 +9779,14 @@ define <8 x i64> @ugt_12_v8i64(<8 x i64> %0) {
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ugt_12_v8i64:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ugt_12_v8i64:
@@ -9795,7 +9795,7 @@ define <8 x i64> @ugt_12_v8i64(<8 x i64> %0) {
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0)
 %3 = icmp ugt <8 x i64> %2,
@@ -9827,7 +9827,7 @@ define <8 x i64> @ult_13_v8i64(<8 x i64> %0) {
 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ult_13_v8i64:
@@ -9844,14 +9844,14 @@ define <8 x i64> @ult_13_v8i64(<8 x i64> %0) {
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ult_13_v8i64:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ult_13_v8i64:
@@ -9860,7 +9860,7 @@ define <8 x i64> @ult_13_v8i64(<8 x i64> %0) {
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0)
 %3 = icmp ult <8 x i64> %2,
@@ -9892,7 +9892,7 @@ define <8 x i64> @ugt_13_v8i64(<8 x i64> %0) {
 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ugt_13_v8i64:
@@ -9909,14 +9909,14 @@ define <8 x i64> @ugt_13_v8i64(<8 x i64> %0) {
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ugt_13_v8i64:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ugt_13_v8i64:
@@ -9925,7 +9925,7 @@ define <8 x i64> @ugt_13_v8i64(<8 x i64> %0) {
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0)
 %3 = icmp ugt <8 x i64> %2,
@@ -9957,7 +9957,7 @@ define <8 x i64> @ult_14_v8i64(<8 x i64> %0) {
 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ult_14_v8i64:
@@ -9974,14 +9974,14 @@ define <8 x i64> @ult_14_v8i64(<8 x i64> %0) {
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ult_14_v8i64:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ult_14_v8i64:
@@ -9990,7 +9990,7 @@ define <8 x i64> @ult_14_v8i64(<8 x i64> %0) {
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0)
 %3 = icmp ult <8 x i64> %2,
@@ -10022,7 +10022,7 @@ define <8 x i64> @ugt_14_v8i64(<8 x i64> %0) {
 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ugt_14_v8i64:
@@ -10039,14 +10039,14 @@ define <8 x i64> @ugt_14_v8i64(<8 x i64> %0) {
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ugt_14_v8i64:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ugt_14_v8i64:
@@ -10055,7 +10055,7 @@ define <8 x i64> @ugt_14_v8i64(<8 x i64> %0) {
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0)
 %3 = icmp ugt <8 x i64> %2,
@@ -10087,7 +10087,7 @@ define <8 x i64> @ult_15_v8i64(<8 x i64> %0) {
 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ult_15_v8i64:
@@ -10104,14 +10104,14 @@ define <8 x i64> @ult_15_v8i64(<8 x i64> %0) {
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ult_15_v8i64:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ult_15_v8i64:
@@ -10120,7 +10120,7 @@ define <8 x i64> @ult_15_v8i64(<8 x i64> %0) {
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0)
 %3 = icmp ult <8 x i64> %2,
@@ -10152,7 +10152,7 @@ define <8 x i64> @ugt_15_v8i64(<8 x i64> %0) {
 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ugt_15_v8i64:
@@ -10169,14 +10169,14 @@ define <8 x i64> @ugt_15_v8i64(<8 x i64> %0) {
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ugt_15_v8i64:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ugt_15_v8i64:
@@ -10185,7 +10185,7 @@ define <8 x i64> @ugt_15_v8i64(<8 x i64> %0) {
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0)
 %3 = icmp ugt <8 x i64> %2,
@@ -10217,7 +10217,7 @@ define <8 x i64> @ult_16_v8i64(<8 x i64> %0) {
 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ult_16_v8i64:
@@ -10234,14 +10234,14 @@ define <8 x i64> @ult_16_v8i64(<8 x i64> %0) {
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ult_16_v8i64:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ult_16_v8i64:
@@ -10250,7 +10250,7 @@ define <8 x i64> @ult_16_v8i64(<8 x i64> %0) {
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0)
 %3 = icmp ult <8 x i64> %2,
@@ -10282,7 +10282,7 @@ define <8 x i64> @ugt_16_v8i64(<8 x i64> %0) {
 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ugt_16_v8i64:
@@ -10299,14 +10299,14 @@ define <8 x i64> @ugt_16_v8i64(<8 x i64> %0) {
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ugt_16_v8i64:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: ugt_16_v8i64:
@@ -10315,7 +10315,7 @@ define <8 x i64> @ugt_16_v8i64(<8 x i64> %0) {
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; BITALG-NEXT: retq
 %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0)
 %3 = icmp ugt <8 x i64> %2,
@@ -10347,7 +10347,7 @@ define <8 x i64> @ult_17_v8i64(<8 x i64> %0) {
 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: ult_17_v8i64:
ult_17_v8i64: @@ -10364,14 +10364,14 @@ define <8 x i64> @ult_17_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_17_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_17_v8i64: @@ -10380,7 +10380,7 @@ define <8 x i64> @ult_17_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -10412,7 +10412,7 @@ define <8 x i64> @ugt_17_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_17_v8i64: @@ -10429,14 +10429,14 @@ define <8 x i64> @ugt_17_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_17_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_17_v8i64: @@ -10445,7 +10445,7 @@ define <8 x i64> @ugt_17_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -10477,7 +10477,7 @@ define <8 x i64> @ult_18_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_18_v8i64: @@ -10494,14 +10494,14 @@ define <8 x i64> @ult_18_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; 
AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_18_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_18_v8i64: @@ -10510,7 +10510,7 @@ define <8 x i64> @ult_18_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -10542,7 +10542,7 @@ define <8 x i64> @ugt_18_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_18_v8i64: @@ -10559,14 +10559,14 @@ define <8 x i64> @ugt_18_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_18_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_18_v8i64: @@ -10575,7 +10575,7 @@ define <8 x i64> @ugt_18_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -10607,7 +10607,7 @@ define <8 x i64> @ult_19_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_19_v8i64: @@ -10624,14 +10624,14 @@ define <8 x i64> @ult_19_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_19_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; 
AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_19_v8i64: @@ -10640,7 +10640,7 @@ define <8 x i64> @ult_19_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -10672,7 +10672,7 @@ define <8 x i64> @ugt_19_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_19_v8i64: @@ -10689,14 +10689,14 @@ define <8 x i64> @ugt_19_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_19_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_19_v8i64: @@ -10705,7 +10705,7 @@ define <8 x i64> @ugt_19_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -10737,7 +10737,7 @@ define <8 x i64> @ult_20_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_20_v8i64: @@ -10754,14 +10754,14 @@ define <8 x i64> @ult_20_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_20_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; 
BITALG-LABEL: ult_20_v8i64: @@ -10770,7 +10770,7 @@ define <8 x i64> @ult_20_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -10802,7 +10802,7 @@ define <8 x i64> @ugt_20_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_20_v8i64: @@ -10819,14 +10819,14 @@ define <8 x i64> @ugt_20_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_20_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_20_v8i64: @@ -10835,7 +10835,7 @@ define <8 x i64> @ugt_20_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -10867,7 +10867,7 @@ define <8 x i64> @ult_21_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_21_v8i64: @@ -10884,14 +10884,14 @@ define <8 x i64> @ult_21_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_21_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_21_v8i64: @@ -10900,7 +10900,7 @@ define <8 x i64> @ult_21_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; 
BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -10932,7 +10932,7 @@ define <8 x i64> @ugt_21_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_21_v8i64: @@ -10949,14 +10949,14 @@ define <8 x i64> @ugt_21_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_21_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_21_v8i64: @@ -10965,7 +10965,7 @@ define <8 x i64> @ugt_21_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -10997,7 +10997,7 @@ define <8 x i64> @ult_22_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_22_v8i64: @@ -11014,14 +11014,14 @@ define <8 x i64> @ult_22_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_22_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_22_v8i64: @@ -11030,7 +11030,7 @@ define <8 x i64> @ult_22_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -11062,7 +11062,7 @@ define <8 x i64> 
@ugt_22_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_22_v8i64: @@ -11079,14 +11079,14 @@ define <8 x i64> @ugt_22_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_22_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_22_v8i64: @@ -11095,7 +11095,7 @@ define <8 x i64> @ugt_22_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -11127,7 +11127,7 @@ define <8 x i64> @ult_23_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_23_v8i64: @@ -11144,14 +11144,14 @@ define <8 x i64> @ult_23_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_23_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_23_v8i64: @@ -11160,7 +11160,7 @@ define <8 x i64> @ult_23_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -11192,7 +11192,7 @@ define <8 x i64> @ugt_23_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} 
{z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_23_v8i64: @@ -11209,14 +11209,14 @@ define <8 x i64> @ugt_23_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_23_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_23_v8i64: @@ -11225,7 +11225,7 @@ define <8 x i64> @ugt_23_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -11257,7 +11257,7 @@ define <8 x i64> @ult_24_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_24_v8i64: @@ -11274,14 +11274,14 @@ define <8 x i64> @ult_24_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_24_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_24_v8i64: @@ -11290,7 +11290,7 @@ define <8 x i64> @ult_24_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -11322,7 +11322,7 @@ define <8 x i64> @ugt_24_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_24_v8i64: @@ -11339,14 +11339,14 @@ define <8 x i64> @ugt_24_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw 
%zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_24_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_24_v8i64: @@ -11355,7 +11355,7 @@ define <8 x i64> @ugt_24_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -11387,7 +11387,7 @@ define <8 x i64> @ult_25_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_25_v8i64: @@ -11404,14 +11404,14 @@ define <8 x i64> @ult_25_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_25_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_25_v8i64: @@ -11420,7 +11420,7 @@ define <8 x i64> @ult_25_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -11452,7 +11452,7 @@ define <8 x i64> @ugt_25_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_25_v8i64: @@ -11469,14 +11469,14 @@ define <8 x i64> @ugt_25_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; 
AVX512VPOPCNTDQ-LABEL: ugt_25_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_25_v8i64: @@ -11485,7 +11485,7 @@ define <8 x i64> @ugt_25_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -11517,7 +11517,7 @@ define <8 x i64> @ult_26_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_26_v8i64: @@ -11534,14 +11534,14 @@ define <8 x i64> @ult_26_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_26_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_26_v8i64: @@ -11550,7 +11550,7 @@ define <8 x i64> @ult_26_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -11582,7 +11582,7 @@ define <8 x i64> @ugt_26_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_26_v8i64: @@ -11599,14 +11599,14 @@ define <8 x i64> @ugt_26_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_26_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} 
{z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_26_v8i64: @@ -11615,7 +11615,7 @@ define <8 x i64> @ugt_26_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -11647,7 +11647,7 @@ define <8 x i64> @ult_27_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_27_v8i64: @@ -11664,14 +11664,14 @@ define <8 x i64> @ult_27_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_27_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_27_v8i64: @@ -11680,7 +11680,7 @@ define <8 x i64> @ult_27_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -11712,7 +11712,7 @@ define <8 x i64> @ugt_27_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_27_v8i64: @@ -11729,14 +11729,14 @@ define <8 x i64> @ugt_27_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_27_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_27_v8i64: @@ -11745,7 +11745,7 @@ define <8 x i64> @ugt_27_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: 
vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -11777,7 +11777,7 @@ define <8 x i64> @ult_28_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_28_v8i64: @@ -11794,14 +11794,14 @@ define <8 x i64> @ult_28_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_28_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_28_v8i64: @@ -11810,7 +11810,7 @@ define <8 x i64> @ult_28_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -11842,7 +11842,7 @@ define <8 x i64> @ugt_28_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_28_v8i64: @@ -11859,14 +11859,14 @@ define <8 x i64> @ugt_28_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_28_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_28_v8i64: @@ -11875,7 +11875,7 @@ define <8 x i64> @ugt_28_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x 
i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -11907,7 +11907,7 @@ define <8 x i64> @ult_29_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_29_v8i64: @@ -11924,14 +11924,14 @@ define <8 x i64> @ult_29_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_29_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_29_v8i64: @@ -11940,7 +11940,7 @@ define <8 x i64> @ult_29_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -11972,7 +11972,7 @@ define <8 x i64> @ugt_29_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_29_v8i64: @@ -11989,14 +11989,14 @@ define <8 x i64> @ugt_29_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_29_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_29_v8i64: @@ -12005,7 +12005,7 @@ define <8 x i64> @ugt_29_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -12037,7 +12037,7 @@ define <8 x i64> @ult_30_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_30_v8i64: @@ -12054,14 +12054,14 @@ define <8 x i64> @ult_30_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_30_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_30_v8i64: @@ -12070,7 +12070,7 @@ define <8 x i64> @ult_30_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -12102,7 +12102,7 @@ define <8 x i64> @ugt_30_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_30_v8i64: @@ -12119,14 +12119,14 @@ define <8 x i64> @ugt_30_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_30_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_30_v8i64: @@ -12135,7 +12135,7 @@ define <8 x i64> @ugt_30_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -12167,7 +12167,7 @@ define <8 x i64> @ult_31_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_31_v8i64: @@ -12184,14 +12184,14 @@ define <8 x 
i64> @ult_31_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_31_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_31_v8i64: @@ -12200,7 +12200,7 @@ define <8 x i64> @ult_31_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -12232,7 +12232,7 @@ define <8 x i64> @ugt_31_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_31_v8i64: @@ -12249,14 +12249,14 @@ define <8 x i64> @ugt_31_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_31_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_31_v8i64: @@ -12265,7 +12265,7 @@ define <8 x i64> @ugt_31_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -12297,7 +12297,7 @@ define <8 x i64> @ult_32_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_32_v8i64: @@ -12314,14 +12314,14 @@ define <8 x i64> @ult_32_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} 
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_32_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_32_v8i64: @@ -12330,7 +12330,7 @@ define <8 x i64> @ult_32_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -12362,7 +12362,7 @@ define <8 x i64> @ugt_32_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_32_v8i64: @@ -12379,14 +12379,14 @@ define <8 x i64> @ugt_32_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_32_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_32_v8i64: @@ -12395,7 +12395,7 @@ define <8 x i64> @ugt_32_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -12427,7 +12427,7 @@ define <8 x i64> @ult_33_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_33_v8i64: @@ -12444,14 +12444,14 @@ define <8 x i64> @ult_33_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_33_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_33_v8i64: @@ -12460,7 +12460,7 @@ define <8 x i64> @ult_33_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -12492,7 +12492,7 @@ define <8 x i64> @ugt_33_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_33_v8i64: @@ -12509,14 +12509,14 @@ define <8 x i64> @ugt_33_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_33_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_33_v8i64: @@ -12525,7 +12525,7 @@ define <8 x i64> @ugt_33_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -12557,7 +12557,7 @@ define <8 x i64> @ult_34_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_34_v8i64: @@ -12574,14 +12574,14 @@ define <8 x i64> @ult_34_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_34_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_34_v8i64: @@ -12590,7 
+12590,7 @@ define <8 x i64> @ult_34_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -12622,7 +12622,7 @@ define <8 x i64> @ugt_34_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_34_v8i64: @@ -12639,14 +12639,14 @@ define <8 x i64> @ugt_34_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_34_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_34_v8i64: @@ -12655,7 +12655,7 @@ define <8 x i64> @ugt_34_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -12687,7 +12687,7 @@ define <8 x i64> @ult_35_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_35_v8i64: @@ -12704,14 +12704,14 @@ define <8 x i64> @ult_35_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_35_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_35_v8i64: @@ -12720,7 +12720,7 @@ define <8 x i64> @ult_35_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, 
%zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -12752,7 +12752,7 @@ define <8 x i64> @ugt_35_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_35_v8i64: @@ -12769,14 +12769,14 @@ define <8 x i64> @ugt_35_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_35_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_35_v8i64: @@ -12785,7 +12785,7 @@ define <8 x i64> @ugt_35_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -12817,7 +12817,7 @@ define <8 x i64> @ult_36_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_36_v8i64: @@ -12834,14 +12834,14 @@ define <8 x i64> @ult_36_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_36_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_36_v8i64: @@ -12850,7 +12850,7 @@ define <8 x i64> @ult_36_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -12882,7 +12882,7 @@ define <8 x i64> @ugt_36_v8i64(<8 x i64> %0) { ; 
AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_36_v8i64: @@ -12899,14 +12899,14 @@ define <8 x i64> @ugt_36_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_36_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_36_v8i64: @@ -12915,7 +12915,7 @@ define <8 x i64> @ugt_36_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -12947,7 +12947,7 @@ define <8 x i64> @ult_37_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_37_v8i64: @@ -12964,14 +12964,14 @@ define <8 x i64> @ult_37_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_37_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_37_v8i64: @@ -12980,7 +12980,7 @@ define <8 x i64> @ult_37_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -13012,7 +13012,7 @@ define <8 x i64> @ugt_37_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq 
{{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_37_v8i64: @@ -13029,14 +13029,14 @@ define <8 x i64> @ugt_37_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_37_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_37_v8i64: @@ -13045,7 +13045,7 @@ define <8 x i64> @ugt_37_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -13077,7 +13077,7 @@ define <8 x i64> @ult_38_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_38_v8i64: @@ -13094,14 +13094,14 @@ define <8 x i64> @ult_38_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_38_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_38_v8i64: @@ -13110,7 +13110,7 @@ define <8 x i64> @ult_38_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -13142,7 +13142,7 @@ define <8 x i64> @ugt_38_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_38_v8i64: @@ -13159,14 +13159,14 @@ define <8 x i64> @ugt_38_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: 
vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_38_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_38_v8i64: @@ -13175,7 +13175,7 @@ define <8 x i64> @ugt_38_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -13207,7 +13207,7 @@ define <8 x i64> @ult_39_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_39_v8i64: @@ -13224,14 +13224,14 @@ define <8 x i64> @ult_39_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_39_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_39_v8i64: @@ -13240,7 +13240,7 @@ define <8 x i64> @ult_39_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -13272,7 +13272,7 @@ define <8 x i64> @ugt_39_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_39_v8i64: @@ -13289,14 +13289,14 @@ define <8 x i64> @ugt_39_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_39_v8i64: ; AVX512VPOPCNTDQ: 
# %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_39_v8i64: @@ -13305,7 +13305,7 @@ define <8 x i64> @ugt_39_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -13337,7 +13337,7 @@ define <8 x i64> @ult_40_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_40_v8i64: @@ -13354,14 +13354,14 @@ define <8 x i64> @ult_40_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_40_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_40_v8i64: @@ -13370,7 +13370,7 @@ define <8 x i64> @ult_40_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -13402,7 +13402,7 @@ define <8 x i64> @ugt_40_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_40_v8i64: @@ -13419,14 +13419,14 @@ define <8 x i64> @ugt_40_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_40_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 
{%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_40_v8i64: @@ -13435,7 +13435,7 @@ define <8 x i64> @ugt_40_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -13467,7 +13467,7 @@ define <8 x i64> @ult_41_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_41_v8i64: @@ -13484,14 +13484,14 @@ define <8 x i64> @ult_41_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_41_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_41_v8i64: @@ -13500,7 +13500,7 @@ define <8 x i64> @ult_41_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -13532,7 +13532,7 @@ define <8 x i64> @ugt_41_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_41_v8i64: @@ -13549,14 +13549,14 @@ define <8 x i64> @ugt_41_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_41_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_41_v8i64: @@ -13565,7 +13565,7 @@ define <8 x i64> @ugt_41_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -13597,7 +13597,7 @@ define <8 x i64> @ult_42_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_42_v8i64: @@ -13614,14 +13614,14 @@ define <8 x i64> @ult_42_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_42_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_42_v8i64: @@ -13630,7 +13630,7 @@ define <8 x i64> @ult_42_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -13662,7 +13662,7 @@ define <8 x i64> @ugt_42_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_42_v8i64: @@ -13679,14 +13679,14 @@ define <8 x i64> @ugt_42_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_42_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_42_v8i64: @@ -13695,7 +13695,7 @@ define <8 x i64> @ugt_42_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 
x i64> %2, @@ -13727,7 +13727,7 @@ define <8 x i64> @ult_43_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_43_v8i64: @@ -13744,14 +13744,14 @@ define <8 x i64> @ult_43_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_43_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_43_v8i64: @@ -13760,7 +13760,7 @@ define <8 x i64> @ult_43_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -13792,7 +13792,7 @@ define <8 x i64> @ugt_43_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_43_v8i64: @@ -13809,14 +13809,14 @@ define <8 x i64> @ugt_43_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_43_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_43_v8i64: @@ -13825,7 +13825,7 @@ define <8 x i64> @ugt_43_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -13857,7 +13857,7 @@ define <8 x i64> @ult_44_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; 
AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_44_v8i64: @@ -13874,14 +13874,14 @@ define <8 x i64> @ult_44_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_44_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_44_v8i64: @@ -13890,7 +13890,7 @@ define <8 x i64> @ult_44_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -13922,7 +13922,7 @@ define <8 x i64> @ugt_44_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_44_v8i64: @@ -13939,14 +13939,14 @@ define <8 x i64> @ugt_44_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_44_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_44_v8i64: @@ -13955,7 +13955,7 @@ define <8 x i64> @ugt_44_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -13987,7 +13987,7 @@ define <8 x i64> @ult_45_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_45_v8i64: @@ -14004,14 +14004,14 @@ define <8 x i64> @ult_45_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: 
vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_45_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_45_v8i64: @@ -14020,7 +14020,7 @@ define <8 x i64> @ult_45_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -14052,7 +14052,7 @@ define <8 x i64> @ugt_45_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_45_v8i64: @@ -14069,14 +14069,14 @@ define <8 x i64> @ugt_45_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_45_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_45_v8i64: @@ -14085,7 +14085,7 @@ define <8 x i64> @ugt_45_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -14117,7 +14117,7 @@ define <8 x i64> @ult_46_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_46_v8i64: @@ -14134,14 +14134,14 @@ define <8 x i64> @ult_46_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} 
{z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_46_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_46_v8i64: @@ -14150,7 +14150,7 @@ define <8 x i64> @ult_46_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -14182,7 +14182,7 @@ define <8 x i64> @ugt_46_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_46_v8i64: @@ -14199,14 +14199,14 @@ define <8 x i64> @ugt_46_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_46_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_46_v8i64: @@ -14215,7 +14215,7 @@ define <8 x i64> @ugt_46_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -14247,7 +14247,7 @@ define <8 x i64> @ult_47_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_47_v8i64: @@ -14264,14 +14264,14 @@ define <8 x i64> @ult_47_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_47_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: 
vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_47_v8i64: @@ -14280,7 +14280,7 @@ define <8 x i64> @ult_47_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -14312,7 +14312,7 @@ define <8 x i64> @ugt_47_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_47_v8i64: @@ -14329,14 +14329,14 @@ define <8 x i64> @ugt_47_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_47_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_47_v8i64: @@ -14345,7 +14345,7 @@ define <8 x i64> @ugt_47_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -14377,7 +14377,7 @@ define <8 x i64> @ult_48_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_48_v8i64: @@ -14394,14 +14394,14 @@ define <8 x i64> @ult_48_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_48_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_48_v8i64: @@ -14410,7 +14410,7 @@ define <8 x i64> @ult_48_v8i64(<8 x i64> %0) { ; BITALG-NEXT: 
vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -14442,7 +14442,7 @@ define <8 x i64> @ugt_48_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_48_v8i64: @@ -14459,14 +14459,14 @@ define <8 x i64> @ugt_48_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_48_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_48_v8i64: @@ -14475,7 +14475,7 @@ define <8 x i64> @ugt_48_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -14507,7 +14507,7 @@ define <8 x i64> @ult_49_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_49_v8i64: @@ -14524,14 +14524,14 @@ define <8 x i64> @ult_49_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_49_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_49_v8i64: @@ -14540,7 +14540,7 @@ define <8 x i64> @ult_49_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; 
BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -14572,7 +14572,7 @@ define <8 x i64> @ugt_49_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_49_v8i64: @@ -14589,14 +14589,14 @@ define <8 x i64> @ugt_49_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_49_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_49_v8i64: @@ -14605,7 +14605,7 @@ define <8 x i64> @ugt_49_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -14637,7 +14637,7 @@ define <8 x i64> @ult_50_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_50_v8i64: @@ -14654,14 +14654,14 @@ define <8 x i64> @ult_50_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_50_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_50_v8i64: @@ -14670,7 +14670,7 @@ define <8 x i64> @ult_50_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -14702,7 +14702,7 @@ define <8 x i64> @ugt_50_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, 
%zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_50_v8i64: @@ -14719,14 +14719,14 @@ define <8 x i64> @ugt_50_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_50_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_50_v8i64: @@ -14735,7 +14735,7 @@ define <8 x i64> @ugt_50_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -14767,7 +14767,7 @@ define <8 x i64> @ult_51_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_51_v8i64: @@ -14784,14 +14784,14 @@ define <8 x i64> @ult_51_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_51_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_51_v8i64: @@ -14800,7 +14800,7 @@ define <8 x i64> @ult_51_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -14832,7 +14832,7 @@ define <8 x i64> @ugt_51_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_51_v8i64: 
@@ -14849,14 +14849,14 @@ define <8 x i64> @ugt_51_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_51_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_51_v8i64: @@ -14865,7 +14865,7 @@ define <8 x i64> @ugt_51_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -14897,7 +14897,7 @@ define <8 x i64> @ult_52_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_52_v8i64: @@ -14914,14 +14914,14 @@ define <8 x i64> @ult_52_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_52_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_52_v8i64: @@ -14930,7 +14930,7 @@ define <8 x i64> @ult_52_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -14962,7 +14962,7 @@ define <8 x i64> @ugt_52_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_52_v8i64: @@ -14979,14 +14979,14 @@ define <8 x i64> @ugt_52_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: 
vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_52_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_52_v8i64: @@ -14995,7 +14995,7 @@ define <8 x i64> @ugt_52_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -15027,7 +15027,7 @@ define <8 x i64> @ult_53_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_53_v8i64: @@ -15044,14 +15044,14 @@ define <8 x i64> @ult_53_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_53_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_53_v8i64: @@ -15060,7 +15060,7 @@ define <8 x i64> @ult_53_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -15092,7 +15092,7 @@ define <8 x i64> @ugt_53_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_53_v8i64: @@ -15109,14 +15109,14 @@ define <8 x i64> @ugt_53_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_53_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; 
AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_53_v8i64: @@ -15125,7 +15125,7 @@ define <8 x i64> @ugt_53_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -15157,7 +15157,7 @@ define <8 x i64> @ult_54_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_54_v8i64: @@ -15174,14 +15174,14 @@ define <8 x i64> @ult_54_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_54_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_54_v8i64: @@ -15190,7 +15190,7 @@ define <8 x i64> @ult_54_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -15222,7 +15222,7 @@ define <8 x i64> @ugt_54_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_54_v8i64: @@ -15239,14 +15239,14 @@ define <8 x i64> @ugt_54_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_54_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; 
BITALG-LABEL: ugt_54_v8i64: @@ -15255,7 +15255,7 @@ define <8 x i64> @ugt_54_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -15287,7 +15287,7 @@ define <8 x i64> @ult_55_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_55_v8i64: @@ -15304,14 +15304,14 @@ define <8 x i64> @ult_55_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_55_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_55_v8i64: @@ -15320,7 +15320,7 @@ define <8 x i64> @ult_55_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -15352,7 +15352,7 @@ define <8 x i64> @ugt_55_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_55_v8i64: @@ -15369,14 +15369,14 @@ define <8 x i64> @ugt_55_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_55_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_55_v8i64: @@ -15385,7 +15385,7 @@ define <8 x i64> @ugt_55_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; 
BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -15417,7 +15417,7 @@ define <8 x i64> @ult_56_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_56_v8i64: @@ -15434,14 +15434,14 @@ define <8 x i64> @ult_56_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_56_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_56_v8i64: @@ -15450,7 +15450,7 @@ define <8 x i64> @ult_56_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -15482,7 +15482,7 @@ define <8 x i64> @ugt_56_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_56_v8i64: @@ -15499,14 +15499,14 @@ define <8 x i64> @ugt_56_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_56_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_56_v8i64: @@ -15515,7 +15515,7 @@ define <8 x i64> @ugt_56_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -15547,7 +15547,7 @@ define <8 x i64> 
@ult_57_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_57_v8i64: @@ -15564,14 +15564,14 @@ define <8 x i64> @ult_57_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_57_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_57_v8i64: @@ -15580,7 +15580,7 @@ define <8 x i64> @ult_57_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -15612,7 +15612,7 @@ define <8 x i64> @ugt_57_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_57_v8i64: @@ -15629,14 +15629,14 @@ define <8 x i64> @ugt_57_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_57_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_57_v8i64: @@ -15645,7 +15645,7 @@ define <8 x i64> @ugt_57_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -15677,7 +15677,7 @@ define <8 x i64> @ult_58_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} 
{z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_58_v8i64: @@ -15694,14 +15694,14 @@ define <8 x i64> @ult_58_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_58_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_58_v8i64: @@ -15710,7 +15710,7 @@ define <8 x i64> @ult_58_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -15742,7 +15742,7 @@ define <8 x i64> @ugt_58_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_58_v8i64: @@ -15759,14 +15759,14 @@ define <8 x i64> @ugt_58_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_58_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_58_v8i64: @@ -15775,7 +15775,7 @@ define <8 x i64> @ugt_58_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -15807,7 +15807,7 @@ define <8 x i64> @ult_59_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_59_v8i64: @@ -15824,14 +15824,14 @@ define <8 x i64> @ult_59_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw 
%zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_59_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_59_v8i64: @@ -15840,7 +15840,7 @@ define <8 x i64> @ult_59_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -15872,7 +15872,7 @@ define <8 x i64> @ugt_59_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_59_v8i64: @@ -15889,14 +15889,14 @@ define <8 x i64> @ugt_59_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_59_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_59_v8i64: @@ -15905,7 +15905,7 @@ define <8 x i64> @ugt_59_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -15937,7 +15937,7 @@ define <8 x i64> @ult_60_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_60_v8i64: @@ -15954,14 +15954,14 @@ define <8 x i64> @ult_60_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; 
AVX512VPOPCNTDQ-LABEL: ult_60_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_60_v8i64: @@ -15970,7 +15970,7 @@ define <8 x i64> @ult_60_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -16002,7 +16002,7 @@ define <8 x i64> @ugt_60_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_60_v8i64: @@ -16019,14 +16019,14 @@ define <8 x i64> @ugt_60_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_60_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_60_v8i64: @@ -16035,7 +16035,7 @@ define <8 x i64> @ugt_60_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -16067,7 +16067,7 @@ define <8 x i64> @ult_61_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_61_v8i64: @@ -16084,14 +16084,14 @@ define <8 x i64> @ult_61_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_61_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} 
{z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_61_v8i64: @@ -16100,7 +16100,7 @@ define <8 x i64> @ult_61_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -16132,7 +16132,7 @@ define <8 x i64> @ugt_61_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_61_v8i64: @@ -16149,14 +16149,14 @@ define <8 x i64> @ugt_61_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_61_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_61_v8i64: @@ -16165,7 +16165,7 @@ define <8 x i64> @ugt_61_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -16197,7 +16197,7 @@ define <8 x i64> @ult_62_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_62_v8i64: @@ -16214,14 +16214,14 @@ define <8 x i64> @ult_62_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_62_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_62_v8i64: @@ -16230,7 +16230,7 @@ define <8 x i64> @ult_62_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: 
vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, @@ -16262,7 +16262,7 @@ define <8 x i64> @ugt_62_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_62_v8i64: @@ -16279,14 +16279,14 @@ define <8 x i64> @ugt_62_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_62_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ugt_62_v8i64: @@ -16295,7 +16295,7 @@ define <8 x i64> @ugt_62_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ugt <8 x i64> %2, @@ -16327,7 +16327,7 @@ define <8 x i64> @ult_63_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_63_v8i64: @@ -16344,14 +16344,14 @@ define <8 x i64> @ult_63_v8i64(<8 x i64> %0) { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_63_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ult_63_v8i64: @@ -16360,7 +16360,7 @@ define <8 x i64> @ult_63_v8i64(<8 x i64> %0) { ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x 
i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ult <8 x i64> %2, diff --git a/llvm/test/CodeGen/X86/vector-popcnt-512.ll b/llvm/test/CodeGen/X86/vector-popcnt-512.ll index efe3985d98d9e..0a5f16a0f635f 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-512.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-512.ll @@ -291,36 +291,36 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { define <8 x i64> @eq_1_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: eq_1_v8i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 ; AVX512F-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: eq_1_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: eq_1_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: eq_1_v8i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; BITALG-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; BITALG-NEXT: vpaddq %zmm1, %zmm0, %zmm1 ; BITALG-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp eq <8 x i64> %2, @@ -331,36 +331,36 @@ define <8 x i64> @eq_1_v8i64(<8 x i64> %0) { define <8 x i64> @ne_1_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ne_1_v8i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 ; AVX512F-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpleuq %zmm1, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ne_1_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpleuq %zmm1, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ne_1_v8i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpneqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ne_1_v8i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vpternlogd 
$255, %zmm1, %zmm1, %zmm1 +; BITALG-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; BITALG-NEXT: vpaddq %zmm1, %zmm0, %zmm1 ; BITALG-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpleuq %zmm1, %zmm0, %k1 -; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0) %3 = icmp ne <8 x i64> %2, @@ -371,36 +371,36 @@ define <8 x i64> @ne_1_v8i64(<8 x i64> %0) { define <16 x i32> @eq_1_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: eq_1_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1 ; AVX512F-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpnleud %zmm1, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: eq_1_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpnleud %zmm1, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: eq_1_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: eq_1_v16i32: ; BITALG: # %bb.0: -; BITALG-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; BITALG-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; BITALG-NEXT: vpaddd %zmm1, %zmm0, %zmm1 ; BITALG-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpnleud %zmm1, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp eq <16 x i32> %2, @@ -411,36 +411,36 @@ define <16 x i32> @eq_1_v16i32(<16 x i32> %0) { define <16 x i32> @ne_1_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ne_1_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1 ; AVX512F-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpcmpleud %zmm1, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ne_1_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpleud %zmm1, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ne_1_v16i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpneqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 -; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} 
zmm0 {%k1} {z} = -1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: ne_1_v16i32: ; BITALG: # %bb.0: -; BITALG-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; BITALG-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; BITALG-NEXT: vpaddd %zmm1, %zmm0, %zmm1 ; BITALG-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpcmpleud %zmm1, %zmm0, %k1 -; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; BITALG-NEXT: retq %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0) %3 = icmp ne <16 x i32> %2, @@ -462,12 +462,12 @@ define <32 x i16> @eq_1_v32i16(<32 x i16> %0) { ; AVX512F-NEXT: vpminuw %ymm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: eq_1_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 @@ -487,12 +487,12 @@ define <32 x i16> @eq_1_v32i16(<32 x i16> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpminuw %ymm2, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: retq ; ; AVX512VPOPCNTDQ-BW-LABEL: eq_1_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-BW-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1 ; AVX512VPOPCNTDQ-BW-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 @@ -529,7 +529,7 @@ define <32 x i16> @ne_1_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ne_1_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpleuw %zmm1, %zmm0, %k0 @@ -553,7 +553,7 @@ define <32 x i16> @ne_1_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ne_1_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-BW-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1 ; AVX512VPOPCNTDQ-BW-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpcmpleuw %zmm1, %zmm0, %k0 @@ -586,12 +586,12 @@ define <64 x i8> @eq_1_v64i8(<64 x i8> %0) { ; AVX512F-NEXT: vpminub %ymm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: eq_1_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 @@ -611,12 +611,12 @@ define <64 x i8> @eq_1_v64i8(<64 x i8> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpminub %ymm2, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 ; 
AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: retq ; ; AVX512VPOPCNTDQ-BW-LABEL: eq_1_v64i8: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-BW-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm1, %zmm0, %zmm1 ; AVX512VPOPCNTDQ-BW-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 @@ -653,7 +653,7 @@ define <64 x i8> @ne_1_v64i8(<64 x i8> %0) { ; ; AVX512BW-LABEL: ne_1_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpcmpleub %zmm1, %zmm0, %k0 @@ -677,7 +677,7 @@ define <64 x i8> @ne_1_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ne_1_v64i8: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-BW-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm1, %zmm0, %zmm1 ; AVX512VPOPCNTDQ-BW-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpcmpleub %zmm1, %zmm0, %k0 From 7b184687dd144de33c9f3e3f5c2d18212a5df9a9 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 11 Jan 2025 15:02:05 +0000 Subject: [PATCH 170/408] [X86] vselect-avx.ll - regenerate VPTERNLOG comments --- llvm/test/CodeGen/X86/vselect-avx.ll | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/X86/vselect-avx.ll b/llvm/test/CodeGen/X86/vselect-avx.ll index a2c3613ecca12..06474559644b0 100644 --- a/llvm/test/CodeGen/X86/vselect-avx.ll +++ b/llvm/test/CodeGen/X86/vselect-avx.ll @@ -128,7 +128,7 @@ define void @test3(<4 x i32> %induction30, ptr %tmp16, ptr %tmp17, <4 x i16> %t ; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512-NEXT: vpternlogq $226, %xmm2, %xmm0, %xmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm1 = xmm2 ^ (xmm0 & (xmm1 ^ xmm2)) ; AVX512-NEXT: vmovq %xmm0, (%rdi) ; AVX512-NEXT: vmovq %xmm1, (%rsi) ; AVX512-NEXT: vzeroupper @@ -283,7 +283,7 @@ define <4 x i64> @vselect_concat_split_v16i8(<4 x i64> %a, <4 x i64> %b, <4 x i6 ; AVX512-LABEL: vselect_concat_split_v16i8: ; AVX512: ## %bb.0: ; AVX512-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm2 -; AVX512-NEXT: vpternlogq $216, %ymm2, %ymm1, %ymm0 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm2 & (ymm0 ^ ymm1)) ; AVX512-NEXT: retq %a.bc = bitcast <4 x i64> %a to <32 x i8> %b.bc = bitcast <4 x i64> %b to <32 x i8> From 60788154981f52bb4595d2cdbe8e77d2f7ca0a48 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 11 Jan 2025 15:02:25 +0000 Subject: [PATCH 171/408] [X86] avx512-mask-op.ll - regenerate VPTERNLOG comments --- llvm/test/CodeGen/X86/avx512-mask-op.ll | 70 ++++++++++++------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll index 9e689341f7b88..8d98290ba29a6 100644 --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -467,7 +467,7 @@ define <4 x i32> @test4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1 ; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 ; KNL-NEXT: vpcmpgtq %zmm3, %zmm2, %k1 ; KNL-NEXT: vpcmpleq 
%zmm1, %zmm0, %k1 {%k1} -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq @@ -488,7 +488,7 @@ define <4 x i32> @test4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1 ; AVX512BW-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-NEXT: vpcmpgtq %zmm3, %zmm2, %k1 ; AVX512BW-NEXT: vpcmpleq %zmm1, %zmm0, %k1 {%k1} -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -529,7 +529,7 @@ define <2 x i64> @test5(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1 ; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 ; KNL-NEXT: vpcmpleq %zmm3, %zmm2, %k1 {%k1} -; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq @@ -549,7 +549,7 @@ define <2 x i64> @test5(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1 ; AVX512BW-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 ; AVX512BW-NEXT: vpcmpleq %zmm3, %zmm2, %k1 {%k1} -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -677,7 +677,7 @@ define <16 x i8> @test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) { ; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; KNL-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 ; KNL-NEXT: LBB17_3: -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq @@ -769,7 +769,7 @@ define <16 x i1> @test9(<16 x i1>%a, <16 x i1>%b, i32 %a1, i32 %b1) { ; KNL-NEXT: LBB18_3: ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq @@ -853,7 +853,7 @@ define <8 x i1> @test10(<8 x i1>%a, <8 x i1>%b, i32 %a1, i32 %b1) { ; KNL-NEXT: LBB19_3: ; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; KNL-NEXT: vpmovdw %zmm0, %ymm0 ; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0 ; KNL-NEXT: vzeroupper @@ -938,7 +938,7 @@ define <4 x i1> @test11(<4 x i1>%a, <4 x i1>%b, i32 %a1, i32 %b1) { ; KNL-NEXT: vpslld $31, %xmm0, %xmm0 ; KNL-NEXT: LBB20_3: ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq @@ -968,7 +968,7 @@ define <4 x i1> @test11(<4 x i1>%a, <4 x i1>%b, i32 %a1, i32 %b1) { ; AVX512BW-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512BW-NEXT: LBB20_3: ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; 
AVX512BW-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1081,7 +1081,7 @@ define <16 x i1> @test15(i32 %x, i32 %y) { ; KNL-NEXT: movl $1, %ecx ; KNL-NEXT: cmovgl %eax, %ecx ; KNL-NEXT: kmovw %ecx, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq @@ -1158,14 +1158,14 @@ define <64 x i8> @test16(i64 %x) { ; KNL-NEXT: kshiftlw $15, %k4, %k4 ; KNL-NEXT: kshiftrw $10, %k4, %k4 ; KNL-NEXT: korw %k4, %k0, %k4 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z} +; KNL-NEXT: vpternlogd {{.*#+}} zmm0 {%k3} {z} = -1 ; KNL-NEXT: vpmovdb %zmm0, %xmm0 -; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; KNL-NEXT: vpternlogd {{.*#+}} zmm1 {%k2} {z} = -1 ; KNL-NEXT: vpmovdb %zmm1, %xmm1 ; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z} +; KNL-NEXT: vpternlogd {{.*#+}} zmm1 {%k4} {z} = -1 ; KNL-NEXT: vpmovdb %zmm1, %xmm1 -; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; KNL-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1 ; KNL-NEXT: vpmovdb %zmm2, %xmm2 ; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1274,14 +1274,14 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; KNL-NEXT: kshiftlw $15, %k4, %k4 ; KNL-NEXT: kshiftrw $10, %k4, %k4 ; KNL-NEXT: korw %k4, %k0, %k4 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z} +; KNL-NEXT: vpternlogd {{.*#+}} zmm0 {%k3} {z} = -1 ; KNL-NEXT: vpmovdb %zmm0, %xmm0 -; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; KNL-NEXT: vpternlogd {{.*#+}} zmm1 {%k2} {z} = -1 ; KNL-NEXT: vpmovdb %zmm1, %xmm1 ; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z} +; KNL-NEXT: vpternlogd {{.*#+}} zmm1 {%k4} {z} = -1 ; KNL-NEXT: vpmovdb %zmm1, %xmm1 -; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; KNL-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1 ; KNL-NEXT: vpmovdb %zmm2, %xmm2 ; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1390,7 +1390,7 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ; KNL-NEXT: kshiftrw $9, %k0, %k0 ; KNL-NEXT: kshiftlw $7, %k2, %k1 ; KNL-NEXT: korw %k1, %k0, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; KNL-NEXT: vpmovdw %zmm0, %ymm0 ; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0 ; KNL-NEXT: vzeroupper @@ -2368,7 +2368,7 @@ define <8 x i64> @load_8i1(ptr %a) { ; KNL: ## %bb.0: ; KNL-NEXT: movzbl (%rdi), %eax ; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; KNL-NEXT: retq ; ; SKX-LABEL: load_8i1: @@ -2381,7 +2381,7 @@ define <8 x i64> @load_8i1(ptr %a) { ; AVX512BW: ## %bb.0: ; AVX512BW-NEXT: movzbl (%rdi), %eax ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: load_8i1: @@ -2405,7 +2405,7 @@ define <16 x i32> @load_16i1(ptr %a) { ; KNL-LABEL: load_16i1: ; KNL: ## %bb.0: ; KNL-NEXT: kmovw (%rdi), %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; KNL-NEXT: retq ; ; SKX-LABEL: load_16i1: @@ -2417,7 +2417,7 @@ 
define <16 x i32> @load_16i1(ptr %a) { ; AVX512BW-LABEL: load_16i1: ; AVX512BW: ## %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: load_16i1: @@ -2442,7 +2442,7 @@ define <2 x i16> @load_2i1(ptr %a) { ; KNL: ## %bb.0: ; KNL-NEXT: movzbl (%rdi), %eax ; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; KNL-NEXT: vpmovdw %zmm0, %ymm0 ; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0 ; KNL-NEXT: vzeroupper @@ -2488,7 +2488,7 @@ define <4 x i16> @load_4i1(ptr %a) { ; KNL: ## %bb.0: ; KNL-NEXT: movzbl (%rdi), %eax ; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; KNL-NEXT: vpmovdw %zmm0, %ymm0 ; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0 ; KNL-NEXT: vzeroupper @@ -2534,9 +2534,9 @@ define <32 x i16> @load_32i1(ptr %a) { ; KNL: ## %bb.0: ; KNL-NEXT: kmovw (%rdi), %k1 ; KNL-NEXT: kmovw 2(%rdi), %k2 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; KNL-NEXT: vpmovdw %zmm0, %ymm0 -; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; KNL-NEXT: vpternlogd {{.*#+}} zmm1 {%k2} {z} = -1 ; KNL-NEXT: vpmovdw %zmm1, %ymm1 ; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; KNL-NEXT: retq @@ -2582,14 +2582,14 @@ define <64 x i8> @load_64i1(ptr %a) { ; KNL-NEXT: kmovw 2(%rdi), %k2 ; KNL-NEXT: kmovw 4(%rdi), %k3 ; KNL-NEXT: kmovw 6(%rdi), %k4 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z} +; KNL-NEXT: vpternlogd {{.*#+}} zmm0 {%k3} {z} = -1 ; KNL-NEXT: vpmovdb %zmm0, %xmm0 -; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z} +; KNL-NEXT: vpternlogd {{.*#+}} zmm1 {%k4} {z} = -1 ; KNL-NEXT: vpmovdb %zmm1, %xmm1 ; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; KNL-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; KNL-NEXT: vpmovdb %zmm1, %xmm1 -; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} +; KNL-NEXT: vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1 ; KNL-NEXT: vpmovdb %zmm2, %xmm2 ; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -4750,7 +4750,7 @@ define void @ktest_6(<32 x i16> %w, <32 x i16> %x, <32 x i16> %y, <32 x i16> %z) ; KNL-NEXT: vpcmpeqw %ymm5, %ymm2, %ymm2 ; KNL-NEXT: vpcmpeqw %ymm5, %ymm3, %ymm3 ; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; KNL-NEXT: vpternlogq $200, %zmm1, %zmm0, %zmm2 +; KNL-NEXT: vpternlogq {{.*#+}} zmm2 = zmm0 & (zmm2 | zmm1) ; KNL-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; KNL-NEXT: vpor %ymm0, %ymm2, %ymm0 ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 @@ -4831,7 +4831,7 @@ define void @ktest_6(<32 x i16> %w, <32 x i16> %x, <32 x i16> %y, <32 x i16> %z) ; AVX512DQ-NEXT: vpcmpeqw %ymm5, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpcmpeqw %ymm5, %ymm3, %ymm3 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512DQ-NEXT: vpternlogq $200, %zmm1, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm0 & (zmm2 | zmm1) ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512DQ-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 @@ -4910,7 +4910,7 @@ define void @ktest_7(<64 x i8> %w, <64 x i8> %x, <64 x i8> %y, <64 x i8> %z) { ; KNL-NEXT: vpcmpeqb %ymm5, %ymm2, %ymm2 ; KNL-NEXT: vpcmpeqb %ymm5, %ymm3, %ymm3 ; 
KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; KNL-NEXT: vpternlogq $200, %zmm1, %zmm0, %zmm2 +; KNL-NEXT: vpternlogq {{.*#+}} zmm2 = zmm0 & (zmm2 | zmm1) ; KNL-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; KNL-NEXT: vpor %ymm0, %ymm2, %ymm0 ; KNL-NEXT: vpmovmskb %ymm0, %eax @@ -4989,7 +4989,7 @@ define void @ktest_7(<64 x i8> %w, <64 x i8> %x, <64 x i8> %y, <64 x i8> %z) { ; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm3, %ymm3 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512DQ-NEXT: vpternlogq $200, %zmm1, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm0 & (zmm2 | zmm1) ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512DQ-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512DQ-NEXT: vpmovmskb %ymm0, %eax From 70f37321de26bfbf2c5a0af5c274ab5ca43f8fed Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 11 Jan 2025 15:02:46 +0000 Subject: [PATCH 172/408] [X86] avx512-build-vector.ll - regenerate VPTERNLOG comments --- llvm/test/CodeGen/X86/avx512-build-vector.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/X86/avx512-build-vector.ll b/llvm/test/CodeGen/X86/avx512-build-vector.ll index b001ebf4d19b7..55478a2e93154 100644 --- a/llvm/test/CodeGen/X86/avx512-build-vector.ll +++ b/llvm/test/CodeGen/X86/avx512-build-vector.ll @@ -4,7 +4,7 @@ define <16 x i32> @test2(<16 x i32> %x) { ; CHECK-LABEL: test2: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; CHECK-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = add <16 x i32>, %x From 32351b5450119799ef33da92e87149467c622224 Mon Sep 17 00:00:00 2001 From: Congcong Cai Date: Sat, 11 Jan 2025 23:26:34 +0800 Subject: [PATCH 173/408] [clang-tidy][doc] fix incorrectly code snippet in release note (#122595) --- clang-tools-extra/docs/ReleaseNotes.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 9818ec9603bbc..835a0269a2733 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -195,7 +195,7 @@ Changes in existing checks the offending code with ``reinterpret_cast``, to more clearly express intent. - Improved :doc:`bugprone-dangling-handle - ` check to treat `std::span` as a + ` check to treat ``std::span`` as a handle class. - Improved :doc:`bugprone-exception-escape @@ -233,7 +233,7 @@ Changes in existing checks - Improved :doc:`bugprone-unchecked-optional-access ` to support - `bsl::optional` and `bdlb::NullableValue` from + ``bsl::optional`` and ``bdlb::NullableValue`` from _. - Improved :doc:`bugprone-unhandled-self-assignment @@ -370,7 +370,7 @@ Changes in existing checks - Improved :doc:`readability-redundant-smartptr-get ` check to - remove `->`, when redundant `get()` is removed. + remove ``->``, when redundant ``get()`` is removed. - Improved :doc:`readability-use-std-min-max ` check to use correct template From 1eed46960c217f9480865702f06fb730c7521e61 Mon Sep 17 00:00:00 2001 From: Marius Kamp Date: Sat, 11 Jan 2025 17:26:11 +0100 Subject: [PATCH 174/408] [AArch64] Eliminate Common Subexpression of CSEL by Reassociation (#121350) If we have a CSEL instruction that depends on the flags set by a (SUBS x c) instruction and the true and/or false expression is (add (add x y) -c), we can reassociate the latter expression to (add (SUBS x c) y) and save one instruction. 
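As a concrete illustration (distilled from the `test_eq0_sub_add_i32` case
in the tests added below, so the register numbers are just a detail of that
test), IR of the form

  %cmp = icmp eq i32 %x1, 0
  %add = add nuw i32 %x0, %x1
  %sub = sub i32 %add, 1
  %ret = select i1 %cmp, i32 0, i32 %sub

now lowers to a single SUBS whose flags and result feed both the ADD and
the CSEL:

  subs w8, w1, #1
  add  w8, w8, w0
  csel w0, wzr, w8, lo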
Proof for the basic transformation: https://alive2.llvm.org/ce/z/-337Pb We can extend this transformation for slightly different constants. For example, if we have (add (add x y) -(c-1)) and the comparison x <u c, we can rewrite the comparison to the equivalent x <=u c-1 and reassociate the expression to (add (SUBS x c-1) y); the condition-code adjustments for the individual cases are handled in the switch at the end of the new combine. diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +static SDValue reassociateCSELOperandsForCSE(SDNode *N, SelectionDAG &DAG) { + SDValue SubsNode = N->getOperand(3); + if (SubsNode.getOpcode() != AArch64ISD::SUBS || !SubsNode.hasOneUse()) + return SDValue(); + auto *CmpOpConst = dyn_cast<ConstantSDNode>(SubsNode.getOperand(1)); + if (!CmpOpConst) + return SDValue(); + + SDValue CmpOpOther = SubsNode.getOperand(0); + EVT VT = N->getValueType(0); + + // Get the operand that can be reassociated with the SUBS instruction. + auto GetReassociationOp = [&](SDValue Op, APInt ExpectedConst) { + if (Op.getOpcode() != ISD::ADD) + return SDValue(); + if (Op.getOperand(0).getOpcode() != ISD::ADD || + !Op.getOperand(0).hasOneUse()) + return SDValue(); + SDValue X = Op.getOperand(0).getOperand(0); + SDValue Y = Op.getOperand(0).getOperand(1); + if (X != CmpOpOther) + std::swap(X, Y); + if (X != CmpOpOther) + return SDValue(); + auto *AddOpConst = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + if (!AddOpConst || AddOpConst->getAPIntValue() != ExpectedConst) + return SDValue(); + return Y; + }; + + // Try the reassociation using the given constant and condition code. + auto Fold = [&](APInt NewCmpConst, AArch64CC::CondCode NewCC) { + APInt ExpectedConst = -NewCmpConst; + SDValue TReassocOp = GetReassociationOp(N->getOperand(0), ExpectedConst); + SDValue FReassocOp = GetReassociationOp(N->getOperand(1), ExpectedConst); + if (!TReassocOp && !FReassocOp) + return SDValue(); + + SDValue NewCmp = DAG.getNode(AArch64ISD::SUBS, SDLoc(SubsNode), + DAG.getVTList(VT, MVT_CC), CmpOpOther, + DAG.getConstant(NewCmpConst, SDLoc(CmpOpConst), + CmpOpConst->getValueType(0))); + + auto Reassociate = [&](SDValue ReassocOp, unsigned OpNum) { + if (!ReassocOp) + return N->getOperand(OpNum); + SDValue Res = DAG.getNode(ISD::ADD, SDLoc(N->getOperand(OpNum)), VT, + NewCmp.getValue(0), ReassocOp); + DAG.ReplaceAllUsesWith(N->getOperand(OpNum), Res); + return Res; + }; + + SDValue TValReassoc = Reassociate(TReassocOp, 0); + SDValue FValReassoc = Reassociate(FReassocOp, 1); + return DAG.getNode(AArch64ISD::CSEL, SDLoc(N), VT, TValReassoc, FValReassoc, + DAG.getConstant(NewCC, SDLoc(N->getOperand(2)), MVT_CC), + NewCmp.getValue(1)); + }; + + auto CC = static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2)); + + // First, try to eliminate the compare instruction by searching for a + // subtraction with the same constant. + if (SDValue R = Fold(CmpOpConst->getAPIntValue(), CC)) + return R; + + if ((CC == AArch64CC::EQ || CC == AArch64CC::NE) && !CmpOpConst->isZero()) + return SDValue(); + + // Next, search for a subtraction with a slightly different constant. By + // adjusting the condition code, we can still eliminate the compare + // instruction. Adjusting the constant is only valid if it does not result + // in signed/unsigned wrap for signed/unsigned comparisons, respectively. + // Since such comparisons are trivially true/false, we should not encounter + // them here but check for them nevertheless to be on the safe side. + auto CheckedFold = [&](bool Check, APInt NewCmpConst, + AArch64CC::CondCode NewCC) { + return Check ? 
Fold(NewCmpConst, NewCC) : SDValue(); + }; + switch (CC) { + case AArch64CC::EQ: + case AArch64CC::LS: + return CheckedFold(!CmpOpConst->getAPIntValue().isMaxValue(), + CmpOpConst->getAPIntValue() + 1, AArch64CC::LO); + case AArch64CC::NE: + case AArch64CC::HI: + return CheckedFold(!CmpOpConst->getAPIntValue().isMaxValue(), + CmpOpConst->getAPIntValue() + 1, AArch64CC::HS); + case AArch64CC::LO: + return CheckedFold(!CmpOpConst->getAPIntValue().isZero(), + CmpOpConst->getAPIntValue() - 1, AArch64CC::LS); + case AArch64CC::HS: + return CheckedFold(!CmpOpConst->getAPIntValue().isZero(), + CmpOpConst->getAPIntValue() - 1, AArch64CC::HI); + case AArch64CC::LT: + return CheckedFold(!CmpOpConst->getAPIntValue().isMinSignedValue(), + CmpOpConst->getAPIntValue() - 1, AArch64CC::LE); + case AArch64CC::LE: + return CheckedFold(!CmpOpConst->getAPIntValue().isMaxSignedValue(), + CmpOpConst->getAPIntValue() + 1, AArch64CC::LT); + case AArch64CC::GT: + return CheckedFold(!CmpOpConst->getAPIntValue().isMaxSignedValue(), + CmpOpConst->getAPIntValue() + 1, AArch64CC::GE); + case AArch64CC::GE: + return CheckedFold(!CmpOpConst->getAPIntValue().isMinSignedValue(), + CmpOpConst->getAPIntValue() - 1, AArch64CC::GT); + default: + return SDValue(); + } +} + // Optimize CSEL instructions static SDValue performCSELCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, @@ -24878,6 +24994,11 @@ static SDValue performCSELCombine(SDNode *N, if (SDValue R = foldCSELOfCSEL(N, DAG)) return R; + // Try to reassociate the true/false expressions so that we can do CSE with + // a SUBS instruction used to perform the comparison. + if (SDValue R = reassociateCSELOperandsForCSE(N, DAG)) + return R; + // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1 // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1 if (SDValue Folded = foldCSELofCTTZ(N, DAG)) diff --git a/llvm/test/CodeGen/AArch64/csel-cmp-cse.ll b/llvm/test/CodeGen/AArch64/csel-cmp-cse.ll new file mode 100644 index 0000000000000..d8904cc6e35e3 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/csel-cmp-cse.ll @@ -0,0 +1,773 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-unknown-unknown < %s | FileCheck %s + +declare void @use_i1(i1 %x) +declare void @use_i32(i32 %x) + +; Based on the IR generated for the `last` method of the type `slice` in Rust +define ptr @test_last_elem_from_ptr(ptr noundef readnone %x0, i64 noundef %x1) { +; CHECK-LABEL: test_last_elem_from_ptr: +; CHECK: // %bb.0: +; CHECK-NEXT: subs x8, x1, #1 +; CHECK-NEXT: add x8, x8, x0 +; CHECK-NEXT: csel x0, xzr, x8, lo +; CHECK-NEXT: ret + %cmp = icmp eq i64 %x1, 0 + %add.ptr = getelementptr inbounds nuw i8, ptr %x0, i64 %x1 + %add.ptr1 = getelementptr inbounds i8, ptr %add.ptr, i64 -1 + %retval.0 = select i1 %cmp, ptr null, ptr %add.ptr1 + ret ptr %retval.0 +} + +define i32 @test_eq0_sub_add_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_eq0_sub_add_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: subs w8, w1, #1 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, lo +; CHECK-NEXT: ret + %cmp = icmp eq i32 %x1, 0 + %add = add nuw i32 %x0, %x1 + %sub = sub i32 %add, 1 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_eq7_sub_add_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_eq7_sub_add_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: subs w8, w1, #7 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, eq +; CHECK-NEXT: ret + %cmp = icmp eq i32 %x1, 7 + %add = add nuw i32 %x0, %x1 + %sub = sub i32 
%add, 7 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_ule7_sub7_add_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_ule7_sub7_add_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: subs w8, w1, #7 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, ls +; CHECK-NEXT: ret + %cmp = icmp ule i32 %x1, 7 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, 7 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_ule7_sub8_add_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_ule7_sub8_add_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: subs w8, w1, #8 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, lo +; CHECK-NEXT: ret + %cmp = icmp ule i32 %x1, 7 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, 8 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_ule0_sub1_add_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_ule0_sub1_add_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: subs w8, w1, #1 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, lo +; CHECK-NEXT: ret + %cmp = icmp ule i32 %x1, 0 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, 1 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_ultminus2_subminus2_add_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_ultminus2_subminus2_add_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: adds w8, w1, #2 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, lo +; CHECK-NEXT: ret + %cmp = icmp ult i32 %x1, -2 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, -2 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_ultminus2_subminus3_add_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_ultminus2_subminus3_add_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: adds w8, w1, #3 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, ls +; CHECK-NEXT: ret + %cmp = icmp ult i32 %x1, -2 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, -3 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_ne0_sub_add_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_ne0_sub_add_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: subs w8, w1, #1 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, w8, wzr, hs +; CHECK-NEXT: ret + %cmp = icmp ne i32 %x1, 0 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, 1 + %ret = select i1 %cmp, i32 %sub, i32 0 + ret i32 %ret +} + +define i32 @test_ne7_sub_add_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_ne7_sub_add_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: subs w8, w1, #7 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, w8, wzr, ne +; CHECK-NEXT: ret + %cmp = icmp ne i32 %x1, 7 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, 7 + %ret = select i1 %cmp, i32 %sub, i32 0 + ret i32 %ret +} + +define i32 @test_ultminus1_sub_add_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_ultminus1_sub_add_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: adds w8, w1, #1 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, ne +; CHECK-NEXT: ret + %cmp = icmp ult i32 %x1, -1 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, -1 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_ugt7_sub7_add_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_ugt7_sub7_add_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: subs w8, w1, #7 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, hi +; CHECK-NEXT: ret + %cmp = icmp ugt i32 %x1, 7 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, 7 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_ugt7_sub8_add_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: 
test_ugt7_sub8_add_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: subs w8, w1, #8 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, hs +; CHECK-NEXT: ret + %cmp = icmp ugt i32 %x1, 7 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, 8 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_sle7_sub7_add_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_sle7_sub7_add_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: subs w8, w1, #7 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, le +; CHECK-NEXT: ret + %cmp = icmp sle i32 %x1, 7 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, 7 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_sle7_sub8_add_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_sle7_sub8_add_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: subs w8, w1, #8 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, lt +; CHECK-NEXT: ret + %cmp = icmp sle i32 %x1, 7 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, 8 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_slt8_sub8_add_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_slt8_sub8_add_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: subs w8, w1, #8 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, lt +; CHECK-NEXT: ret + %cmp = icmp slt i32 %x1, 8 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, 8 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_slt8_sub7_add_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_slt8_sub7_add_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: subs w8, w1, #7 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, le +; CHECK-NEXT: ret + %cmp = icmp slt i32 %x1, 8 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, 7 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_sltminus8_subminus8_add_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_sltminus8_subminus8_add_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: adds w8, w1, #8 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, lt +; CHECK-NEXT: ret + %cmp = icmp slt i32 %x1, -8 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, -8 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_sgtminus8_subminus8_add_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_sgtminus8_subminus8_add_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: adds w8, w1, #8 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, gt +; CHECK-NEXT: ret + %cmp = icmp sgt i32 %x1, -8 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, -8 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_sgtminus8_subminus7_add_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_sgtminus8_subminus7_add_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: adds w8, w1, #7 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, ge +; CHECK-NEXT: ret + %cmp = icmp sgt i32 %x1, -8 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, -7 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_eq0_sub_addcomm_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_eq0_sub_addcomm_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: subs w8, w1, #1 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, lo +; CHECK-NEXT: ret + %cmp = icmp eq i32 %x1, 0 + %add = add i32 %x1, %x0 + %sub = sub i32 %add, 1 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_eq0_subcomm_add_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_eq0_subcomm_add_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: subs w8, w1, #1 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel 
w0, wzr, w8, lo +; CHECK-NEXT: ret + %cmp = icmp eq i32 %x1, 0 + %add = add i32 %x0, %x1 + %sub = add i32 -1, %add + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_eq0_multi_use_sub_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_eq0_multi_use_sub_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: subs w8, w1, #1 +; CHECK-NEXT: add w0, w8, w0 +; CHECK-NEXT: csel w19, wzr, w0, lo +; CHECK-NEXT: bl use_i32 +; CHECK-NEXT: mov w0, w19 +; CHECK-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret + %cmp = icmp eq i32 %x1, 0 + %add = add nuw i32 %x0, %x1 + %sub = sub i32 %add, 1 + tail call void @use_i32(i32 %sub) + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +; Negative test +define i32 @test_eq0_multi_use_cmp_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_eq0_multi_use_cmp_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: add w8, w0, w1 +; CHECK-NEXT: cmp w1, #0 +; CHECK-NEXT: sub w8, w8, #1 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: csel w19, wzr, w8, eq +; CHECK-NEXT: bl use_i1 +; CHECK-NEXT: mov w0, w19 +; CHECK-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret + %cmp = icmp eq i32 %x1, 0 + tail call void @use_i1(i1 %cmp) + %add = add nuw i32 %x0, %x1 + %sub = sub i32 %add, 1 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +; Negative test +define i32 @test_eq0_multi_use_add_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_eq0_multi_use_add_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x30, [sp, #-32]! 
// 8-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -32 +; CHECK-NEXT: add w20, w0, w1 +; CHECK-NEXT: mov w19, w1 +; CHECK-NEXT: mov w0, w20 +; CHECK-NEXT: bl use_i32 +; CHECK-NEXT: sub w8, w20, #1 +; CHECK-NEXT: cmp w19, #0 +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: csel w0, wzr, w8, eq +; CHECK-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload +; CHECK-NEXT: ret + %cmp = icmp eq i32 %x1, 0 + %add = add nuw i32 %x0, %x1 + tail call void @use_i32(i32 %add) + %sub = sub i32 %add, 1 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +; Negative test +define i32 @test_eq1_sub_add_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_eq1_sub_add_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add w8, w0, w1 +; CHECK-NEXT: cmp w1, #1 +; CHECK-NEXT: sub w8, w8, #2 +; CHECK-NEXT: csel w0, wzr, w8, eq +; CHECK-NEXT: ret + %cmp = icmp eq i32 %x1, 1 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, 2 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +; Negative test +define i32 @test_ugtsmax_sub_add_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_ugtsmax_sub_add_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #-2147483648 // =0x80000000 +; CHECK-NEXT: add w9, w0, w1 +; CHECK-NEXT: cmp w1, #0 +; CHECK-NEXT: add w8, w9, w8 +; CHECK-NEXT: csel w0, wzr, w8, lt +; CHECK-NEXT: ret + %cmp = icmp ugt i32 %x1, 2147483647 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, 2147483648 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +; Negative test +define i32 @test_ult_nonconst_i32(i32 %x0, i32 %x1, i32 %x2) { +; CHECK-LABEL: test_ult_nonconst_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add w8, w0, w1 +; CHECK-NEXT: cmp w1, w2 +; CHECK-NEXT: sub w8, w8, w2 +; CHECK-NEXT: csel w0, wzr, w8, lo +; CHECK-NEXT: ret + %cmp = icmp ult i32 %x1, %x2 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, %x2 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +; Negative test +define i32 @test_eq_const_mismatch_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_eq_const_mismatch_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add w8, w0, w1 +; CHECK-NEXT: cmp w1, #0 +; CHECK-NEXT: sub w8, w8, #2 +; CHECK-NEXT: csel w0, wzr, w8, eq +; CHECK-NEXT: ret + %cmp = icmp eq i32 %x1, 0 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, 2 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +; Negative test +define i32 @test_ne_const_mismatch_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_ne_const_mismatch_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add w8, w0, w1 +; CHECK-NEXT: cmp w1, #0 +; CHECK-NEXT: sub w8, w8, #2 +; CHECK-NEXT: csel w0, w8, wzr, ne +; CHECK-NEXT: ret + %cmp = icmp ne i32 %x1, 0 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, 2 + %ret = select i1 %cmp, i32 %sub, i32 0 + ret i32 %ret +} + +; Negative test +define i32 @test_ult7_const_mismatch_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_ult7_const_mismatch_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add w8, w0, w1 +; CHECK-NEXT: cmp w1, #7 +; CHECK-NEXT: sub w8, w8, #8 +; CHECK-NEXT: csel w0, wzr, w8, lo +; CHECK-NEXT: ret + %cmp = icmp ult i32 %x1, 7 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, 8 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +; Negative test +define i32 @test_ule7_const_mismatch_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_ule7_const_mismatch_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add w8, w0, w1 +; CHECK-NEXT: cmp w1, #8 
+; CHECK-NEXT: sub w8, w8, #6 +; CHECK-NEXT: csel w0, wzr, w8, lo +; CHECK-NEXT: ret + %cmp = icmp ule i32 %x1, 7 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, 6 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +; Negative test +define i32 @test_ugt7_const_mismatch_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_ugt7_const_mismatch_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add w8, w0, w1 +; CHECK-NEXT: cmp w1, #7 +; CHECK-NEXT: sub w8, w8, #6 +; CHECK-NEXT: csel w0, wzr, w8, hi +; CHECK-NEXT: ret + %cmp = icmp ugt i32 %x1, 7 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, 6 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +; Negative test +define i32 @test_uge7_const_mismatch_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_uge7_const_mismatch_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add w8, w0, w1 +; CHECK-NEXT: cmp w1, #6 +; CHECK-NEXT: sub w8, w8, #8 +; CHECK-NEXT: csel w0, wzr, w8, hi +; CHECK-NEXT: ret + %cmp = icmp uge i32 %x1, 7 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, 8 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +; Negative test +define i32 @test_slt7_const_mismatch_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_slt7_const_mismatch_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add w8, w0, w1 +; CHECK-NEXT: cmp w1, #7 +; CHECK-NEXT: sub w8, w8, #8 +; CHECK-NEXT: csel w0, wzr, w8, lt +; CHECK-NEXT: ret + %cmp = icmp slt i32 %x1, 7 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, 8 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +; Negative test +define i32 @test_sle7_const_mismatch_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_sle7_const_mismatch_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add w8, w0, w1 +; CHECK-NEXT: cmp w1, #8 +; CHECK-NEXT: sub w8, w8, #6 +; CHECK-NEXT: csel w0, wzr, w8, lt +; CHECK-NEXT: ret + %cmp = icmp sle i32 %x1, 7 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, 6 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +; Negative test +define i32 @test_sgt7_const_mismatch_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_sgt7_const_mismatch_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add w8, w0, w1 +; CHECK-NEXT: cmp w1, #7 +; CHECK-NEXT: sub w8, w8, #6 +; CHECK-NEXT: csel w0, wzr, w8, gt +; CHECK-NEXT: ret + %cmp = icmp sgt i32 %x1, 7 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, 6 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +; Negative test +define i32 @test_sge7_const_mismatch_i32(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_sge7_const_mismatch_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add w8, w0, w1 +; CHECK-NEXT: cmp w1, #6 +; CHECK-NEXT: sub w8, w8, #8 +; CHECK-NEXT: csel w0, wzr, w8, gt +; CHECK-NEXT: ret + %cmp = icmp sge i32 %x1, 7 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, 8 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +; Negative test +define i32 @test_unrelated_add_i32(i32 %x0, i32 %x1, i32 %x2) { +; CHECK-LABEL: test_unrelated_add_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add w8, w0, w2 +; CHECK-NEXT: cmp w1, #0 +; CHECK-NEXT: sub w8, w8, #1 +; CHECK-NEXT: csel w0, wzr, w8, eq +; CHECK-NEXT: ret + %cmp = icmp eq i32 %x1, 0 + %add = add nuw i32 %x0, %x2 + %sub = sub i32 %add, 1 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +; Negative test +define i16 @test_eq0_sub_add_i16(i16 %x0, i16 %x1) { +; CHECK-LABEL: test_eq0_sub_add_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: add w8, w0, w1 +; CHECK-NEXT: tst w1, #0xffff +; CHECK-NEXT: sub w8, w8, #1 +; CHECK-NEXT: csel w0, wzr, w8, eq +; CHECK-NEXT: ret + %cmp = icmp eq i16 %x1, 0 + %add = add nuw i16 %x0, %x1 + %sub 
= sub i16 %add, 1 + %ret = select i1 %cmp, i16 0, i16 %sub + ret i16 %ret +} + +; Negative test +define i32 @test_ule_unsigned_overflow(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_ule_unsigned_overflow: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret + %cmp = icmp ule i32 %x1, -1 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, 0 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +; Negative test +define i32 @test_ugt_unsigned_overflow(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_ugt_unsigned_overflow: +; CHECK: // %bb.0: +; CHECK-NEXT: add w0, w0, w1 +; CHECK-NEXT: ret + %cmp = icmp ugt i32 %x1, -1 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, 0 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +; Negative test +define i32 @test_ult_unsigned_overflow(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_ult_unsigned_overflow: +; CHECK: // %bb.0: +; CHECK-NEXT: add w8, w0, w1 +; CHECK-NEXT: add w0, w8, #1 +; CHECK-NEXT: ret + %cmp = icmp ult i32 %x1, 0 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, -1 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +; Negative test +define i32 @test_uge_unsigned_overflow(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_uge_unsigned_overflow: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret + %cmp = icmp uge i32 %x1, 0 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, -1 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +; Negative test +define i32 @test_slt_signed_overflow(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_slt_signed_overflow: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #-2147483647 // =0x80000001 +; CHECK-NEXT: add w9, w0, w1 +; CHECK-NEXT: add w0, w9, w8 +; CHECK-NEXT: ret + %cmp = icmp slt i32 %x1, 2147483648 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, 2147483647 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +; Negative test +define i32 @test_sle_signed_overflow(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_sle_signed_overflow: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret + %cmp = icmp sle i32 %x1, 2147483647 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, 2147483648 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +; Negative test +define i32 @test_sgt_signed_overflow(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_sgt_signed_overflow: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #-2147483648 // =0x80000000 +; CHECK-NEXT: add w9, w0, w1 +; CHECK-NEXT: add w0, w9, w8 +; CHECK-NEXT: ret + %cmp = icmp sgt i32 %x1, 2147483647 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, 2147483648 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +; Negative test +define i32 @test_sge_signed_overflow(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_sge_signed_overflow: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret + %cmp = icmp sge i32 %x1, 2147483648 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, 2147483647 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +; Negative test +define i32 @test_eq0_bitwidth_mismatch(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_eq0_bitwidth_mismatch: +; CHECK: // %bb.0: +; CHECK-NEXT: add w8, w0, w1 +; CHECK-NEXT: tst w1, #0xffff +; CHECK-NEXT: sub w8, w8, #1 +; CHECK-NEXT: csel w0, wzr, w8, eq +; CHECK-NEXT: ret + %x1t = trunc i32 %x1 to i16 + %cmp = icmp eq i16 %x1t, 0 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, 1 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +; Negative test +define i32 @test_eq0_bitwidth_mismatch_2(i32 %x0, i64 %x1) { +; CHECK-LABEL: test_eq0_bitwidth_mismatch_2: +; 
CHECK: // %bb.0: +; CHECK-NEXT: add w8, w0, w1 +; CHECK-NEXT: cmp x1, #0 +; CHECK-NEXT: sub w8, w8, #1 +; CHECK-NEXT: csel w0, wzr, w8, eq +; CHECK-NEXT: ret + %x1t = trunc i64 %x1 to i32 + %cmp = icmp eq i64 %x1, 0 + %add = add i32 %x0, %x1t + %sub = sub i32 %add, 1 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} From ab9a80a3ad78f611fd06cd6f7215bd828809310c Mon Sep 17 00:00:00 2001 From: David Green Date: Sat, 11 Jan 2025 16:29:06 +0000 Subject: [PATCH 175/408] [DAG] Allow AssertZExt to scalarize. (#122463) With range and undef metadata on a call we can have vector AssertZExt generated on a target with no vector operations. The AssertZExt needs to scalarize to a normal `AssertZext tin, ValueType`. I have added AssertSext too, although I do not have a test case. Fixes #110374 --- llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 2 +- .../SelectionDAG/LegalizeVectorTypes.cpp | 8 +++- .../test/CodeGen/ARM/scalarize-assert-zext.ll | 46 +++++++++++++++++++ 3 files changed, 53 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/ARM/scalarize-assert-zext.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 571a710cc92a3..caaa40a64c7e1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -858,7 +858,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue ScalarizeVecRes_BUILD_VECTOR(SDNode *N); SDValue ScalarizeVecRes_EXTRACT_SUBVECTOR(SDNode *N); SDValue ScalarizeVecRes_FP_ROUND(SDNode *N); - SDValue ScalarizeVecRes_ExpOp(SDNode *N); + SDValue ScalarizeVecRes_UnaryOpWithExtraInput(SDNode *N); SDValue ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N); SDValue ScalarizeVecRes_LOAD(LoadSDNode *N); SDValue ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 780eba16c9c49..5117eb8d91dfb 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -58,7 +58,11 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::BUILD_VECTOR: R = ScalarizeVecRes_BUILD_VECTOR(N); break; case ISD::EXTRACT_SUBVECTOR: R = ScalarizeVecRes_EXTRACT_SUBVECTOR(N); break; case ISD::FP_ROUND: R = ScalarizeVecRes_FP_ROUND(N); break; - case ISD::FPOWI: R = ScalarizeVecRes_ExpOp(N); break; + case ISD::AssertZext: + case ISD::AssertSext: + case ISD::FPOWI: + R = ScalarizeVecRes_UnaryOpWithExtraInput(N); + break; case ISD::INSERT_VECTOR_ELT: R = ScalarizeVecRes_INSERT_VECTOR_ELT(N); break; case ISD::LOAD: R = ScalarizeVecRes_LOAD(cast(N));break; case ISD::SCALAR_TO_VECTOR: R = ScalarizeVecRes_SCALAR_TO_VECTOR(N); break; @@ -436,7 +440,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_FP_ROUND(SDNode *N) { N->getOperand(1)); } -SDValue DAGTypeLegalizer::ScalarizeVecRes_ExpOp(SDNode *N) { +SDValue DAGTypeLegalizer::ScalarizeVecRes_UnaryOpWithExtraInput(SDNode *N) { SDValue Op = GetScalarizedVector(N->getOperand(0)); return DAG.getNode(N->getOpcode(), SDLoc(N), Op.getValueType(), Op, N->getOperand(1)); diff --git a/llvm/test/CodeGen/ARM/scalarize-assert-zext.ll b/llvm/test/CodeGen/ARM/scalarize-assert-zext.ll new file mode 100644 index 0000000000000..5638bb4a39880 --- /dev/null +++ b/llvm/test/CodeGen/ARM/scalarize-assert-zext.ll @@ -0,0 +1,46 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc 
-mtriple=armv7-unknown-linux-musleabihf -mattr=-neon %s -o - | FileCheck %s + +declare fastcc noundef range(i16 0, 256) <4 x i16> @other() + +define void @test(ptr %0) #0 { +; CHECK-LABEL: test: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: bl other +; CHECK-NEXT: uxth r3, r3 +; CHECK-NEXT: uxth r2, r2 +; CHECK-NEXT: uxth r1, r1 +; CHECK-NEXT: uxth r0, r0 +; CHECK-NEXT: strb r3, [r4, #3] +; CHECK-NEXT: strb r2, [r4, #2] +; CHECK-NEXT: strb r1, [r4, #1] +; CHECK-NEXT: strb r0, [r4] +; CHECK-NEXT: pop {r4, pc} +entry: + %call = call fastcc <4 x i16> @other() + %t = trunc <4 x i16> %call to <4 x i8> + store <4 x i8> %t, ptr %0, align 1 + ret void +} + +define <4 x i16> @test2() #0 { +; CHECK-LABEL: test2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: bl other +; CHECK-NEXT: movw r1, #65408 +; CHECK-NEXT: and r0, r0, r1 +; CHECK-NEXT: and r2, r2, r1 +; CHECK-NEXT: mov r1, #0 +; CHECK-NEXT: mov r3, #0 +; CHECK-NEXT: pop {r11, pc} +entry: + %call = call fastcc <4 x i16> @other() + %a = and <4 x i16> %call, + ret <4 x i16> %a +} + From 9a9e41ca89f0d78705b60497ece9071b0a5a83f0 Mon Sep 17 00:00:00 2001 From: bernhardu Date: Sat, 11 Jan 2025 18:54:35 +0100 Subject: [PATCH 176/408] [win/asan] GetInstructionSize: Add test for `8D A4 24 ...`. (#119794) This adds a test line and updates a comment. --- compiler-rt/lib/interception/tests/interception_win_test.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/compiler-rt/lib/interception/tests/interception_win_test.cpp b/compiler-rt/lib/interception/tests/interception_win_test.cpp index 04d9a6766f65a..3a2d8b271113d 100644 --- a/compiler-rt/lib/interception/tests/interception_win_test.cpp +++ b/compiler-rt/lib/interception/tests/interception_win_test.cpp @@ -801,8 +801,8 @@ const struct InstructionSizeData { size_t size; // hold instruction size or 0 for failure, // e.g. on control instructions u8 instr[16]; - size_t rel_offset; // filled just for instructions with two operands - // and displacement length of four bytes. + size_t rel_offset; // adjustment for RIP-relative addresses when copying + // instructions during hooking via hotpatch or trampoline. const char *comment; } data[] = { // clang-format off @@ -858,6 +858,7 @@ const struct InstructionSizeData { { 5, {0x68, 0x71, 0x72, 0x73, 0x74}, 0, "68 XX XX XX XX : push imm32"}, { 5, {0xb8, 0x71, 0x72, 0x73, 0x74}, 0, "b8 XX XX XX XX : mov eax, XX XX XX XX"}, { 5, {0xB9, 0x71, 0x72, 0x73, 0x74}, 0, "b9 XX XX XX XX : mov ecx, XX XX XX XX"}, + { 7, {0x8D, 0xA4, 0x24, 0x73, 0x74, 0x75, 0x76}, 0, "8D A4 24 XX XX XX XX : lea esp, [esp + XX XX XX XX]"}, #if SANITIZER_WINDOWS_x64 // sorted list { 2, {0x40, 0x50}, 0, "40 50 : push rax"}, From 657fb4433e027722e8c9a5002d0c194ecd3f2956 Mon Sep 17 00:00:00 2001 From: Austin Kerbow Date: Sat, 11 Jan 2025 09:57:57 -0800 Subject: [PATCH 177/408] [AMDGPU] Add target hook to isGlobalMemoryObject (#112781) We want special handing for IGLP instructions in the scheduler but they should still be treated like they have side effects by other passes. Add a target hook to the ScheduleDAGInstrs DAG builder so that we have more control over this. 
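In outline, condensed from the diff below rather than written from scratch, the base implementation keeps the previous conservative test and AMDGPU overrides it to exempt its IGLP pseudo-instructions:

    // Default hook: calls, unmodeled side effects, and ordered memory
    // references that are not invariant loads remain scheduling barriers.
    bool TargetInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const {
      return MI->isCall() || MI->hasUnmodeledSideEffects() ||
             (MI->hasOrderedMemoryRef() && !MI->isDereferenceableInvariantLoad());
    }

    // AMDGPU override: SCHED_BARRIER, SCHED_GROUP_BARRIER and IGLP_OPT are
    // handled by the IGLP mutation itself, so the scheduler DAG builder no
    // longer treats them as barriers, while other passes still see their
    // unmodeled side effects.
    bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const {
      if (isIGLP(*MI))
        return false;
      return TargetInstrInfo::isGlobalMemoryObject(MI);
    }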
--- llvm/include/llvm/CodeGen/TargetInstrInfo.h | 4 +++ llvm/lib/CodeGen/ScheduleDAGInstrs.cpp | 10 ++---- llvm/lib/CodeGen/TargetInstrInfo.cpp | 5 +++ llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 21 ----------- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 12 ++++--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 7 ++++ llvm/lib/Target/AMDGPU/SIInstrInfo.h | 9 +++++ .../AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir | 6 ++-- .../CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll | 35 +++++++++++++++++++ .../CodeGen/AMDGPU/sched-barrier-pre-RA.mir | 22 ++++++------ 10 files changed, 85 insertions(+), 46 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h index 408adcd330b84..165af902e42d0 100644 --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -136,6 +136,10 @@ class TargetInstrInfo : public MCInstrInfo { const TargetRegisterInfo *TRI, const MachineFunction &MF) const; + /// Returns true if MI is an instruction we are unable to reason about + /// (like a call or something with unmodeled side effects). + virtual bool isGlobalMemoryObject(const MachineInstr *MI) const; + /// Return true if the instruction is trivially rematerializable, meaning it /// has no side effects and requires no operands that aren't always available. /// This means the only allowed uses are constants and unallocatable physical diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp index 6d3b3f34e8cab..8e3e06bf57153 100644 --- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -35,6 +35,7 @@ #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/ScheduleDFS.h" #include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/Config/llvm-config.h" @@ -547,12 +548,6 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) { } } -/// Returns true if MI is an instruction we are unable to reason about -/// (like a call or something with unmodeled side effects). -static inline bool isGlobalMemoryObject(MachineInstr *MI) { - return MI->isCall() || MI->hasUnmodeledSideEffects() || - (MI->hasOrderedMemoryRef() && !MI->isDereferenceableInvariantLoad()); -} void ScheduleDAGInstrs::addChainDependency (SUnit *SUa, SUnit *SUb, unsigned Latency) { @@ -899,8 +894,9 @@ void ScheduleDAGInstrs::buildSchedGraph(AAResults *AA, // isLoadFromStackSLot are not usable after stack slots are lowered to // actual addresses). + const TargetInstrInfo *TII = ST.getInstrInfo(); // This is a barrier event that acts as a pivotal node in the DAG. - if (isGlobalMemoryObject(&MI)) { + if (TII->isGlobalMemoryObject(&MI)) { // Become the barrier chain. 
if (BarrierChain) diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp index 38bd0b0ba4114..770b851f3607a 100644 --- a/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -1917,3 +1917,8 @@ bool TargetInstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB, } return true; } + +bool TargetInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const { + return MI->isCall() || MI->hasUnmodeledSideEffects() || + (MI->hasOrderedMemoryRef() && !MI->isDereferenceableInvariantLoad()); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp index ac01562e457f7..b5dd0d8b86331 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -239,23 +239,6 @@ class SchedGroup { } }; -// Remove all existing edges from a SCHED_BARRIER or SCHED_GROUP_BARRIER. -static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) { - assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER || - SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER || - SU.getInstr()->getOpcode() == AMDGPU::IGLP_OPT); - - while (!SU.Preds.empty()) - for (auto &P : SU.Preds) - SU.removePred(P); - - while (!SU.Succs.empty()) - for (auto &S : SU.Succs) - for (auto &SP : S.getSUnit()->Preds) - if (SP.getSUnit() == &SU) - S.getSUnit()->removePred(SP); -} - using SUToCandSGsPair = std::pair<SUnit *, SmallVector<int, 4>>; using SUsToCandSGsVec = SmallVector<SUToCandSGsPair, 4>; @@ -459,7 +442,6 @@ void PipelineSolver::makePipeline() { // Command line requested IGroupLP doesn't have SGBarr if (!SGBarr) continue; - resetEdges(*SGBarr, DAG); SG.link(*SGBarr, false); } } @@ -2611,7 +2593,6 @@ void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { initSchedGroupBarrierPipelineStage(R); FoundSB = true; } else if (Opc == AMDGPU::IGLP_OPT) { - resetEdges(*R, DAG); if (!FoundSB && !FoundIGLP) { FoundIGLP = true; ShouldApplyIGLP = initIGLPOpt(*R); @@ -2633,7 +2614,6 @@ void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) { assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER); // Remove all existing edges from the SCHED_BARRIER that were added due to the // instruction having side effects. - resetEdges(SchedBarrier, DAG); LLVM_DEBUG(dbgs() << "Building SchedGroup for SchedBarrier with Mask: " << MI.getOperand(0).getImm() << "\n"); auto InvertedMask = @@ -2691,7 +2671,6 @@ void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage( std::vector<SUnit>::reverse_iterator RIter) { // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due // to the instruction having side effects. - resetEdges(*RIter, DAG); MachineInstr &SGB = *RIter->getInstr(); assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER); int32_t SGMask = SGB.getOperand(0).getImm(); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 1c23b237eaf4b..f5bbc5482d347 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -188,6 +188,12 @@ static void getRegisterPressures( Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum(); } +// Return true if the instruction is mutually exclusive with all non-IGLP DAG +// mutations, requiring all other mutations to be disabled. 
+static bool isIGLPMutationOnly(unsigned Opcode) { + return Opcode == AMDGPU::SCHED_GROUP_BARRIER || Opcode == AMDGPU::IGLP_OPT; +} + void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop, const RegPressureTracker &RPTracker, @@ -1152,8 +1158,7 @@ bool GCNSchedStage::initGCNRegion() { StageID == GCNSchedStageID::ILPInitialSchedule) { for (auto &I : DAG) { Unsched.push_back(&I); - if (I.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER || - I.getOpcode() == AMDGPU::IGLP_OPT) + if (isIGLPMutationOnly(I.getOpcode())) DAG.RegionsWithIGLPInstrs[RegionIdx] = true; } } else { @@ -1894,8 +1899,7 @@ void GCNScheduleDAGMILive::updateRegionBoundaries( static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) { return any_of(*DAG, [](MachineBasicBlock::iterator MI) { - unsigned Opc = MI->getOpcode(); - return Opc == AMDGPU::SCHED_GROUP_BARRIER || Opc == AMDGPU::IGLP_OPT; + return isIGLPMutationOnly(MI->getOpcode()); }); } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index e6f333fbb8784..8fc32d9e60bf2 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -10051,3 +10051,10 @@ void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI, Op.setSubReg(AMDGPU::sub0); MI.addOperand(MachineOperand::CreateReg(NewVR, false, true)); } + +bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const { + if (isIGLP(*MI)) + return false; + + return TargetInstrInfo::isGlobalMemoryObject(MI); +} diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 8f9ca6141816d..d49939bf81b10 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -242,6 +242,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override; + bool isGlobalMemoryObject(const MachineInstr *MI) const override; + bool getMemOperandsWithOffsetWidth( const MachineInstr &LdSt, SmallVectorImpl &BaseOps, int64_t &Offset, @@ -968,6 +970,13 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { return get(Opcode).TSFlags & SIInstrFlags::TiedSourceNotRead; } + bool isIGLP(unsigned Opcode) const { + return Opcode == AMDGPU::SCHED_BARRIER || + Opcode == AMDGPU::SCHED_GROUP_BARRIER || Opcode == AMDGPU::IGLP_OPT; + } + + bool isIGLP(const MachineInstr &MI) const { return isIGLP(MI.getOpcode()); } + static unsigned getNonSoftWaitcntOpcode(unsigned Opcode) { switch (Opcode) { case AMDGPU::S_WAITCNT_soft: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir index bc4d35f5a1f9a..2707c2209e7c9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir @@ -25,9 +25,6 @@ ; GCN-NEXT: ; implicit-def: $vgpr79 ; GCN-NEXT: ; implicit-def: $vgpr80 ; GCN-NEXT: ; implicit-def: $vgpr91 - ; GCN-NEXT: ;;#ASMSTART - ; GCN-NEXT: s_waitcnt vmcnt(8) - ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ; kill: killed $sgpr16_sgpr17_sgpr18_sgpr19 ; GCN-NEXT: ; iglp_opt mask(0x00000002) ; GCN-NEXT: s_nop 1 @@ -477,6 +474,9 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[4:5], v[8:9], v[32:47] + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47] ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll index b532aa9cd7e86..08c0d15432915 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll @@ -285,6 +285,41 @@ entry: ret void } +define amdgpu_kernel void @test_iglp_opt_asm_sideeffect(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { +; GCN-LABEL: test_iglp_opt_asm_sideeffect: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xffc, v0 +; GCN-NEXT: ; iglp_opt mask(0x00000000) +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_add_u32_e32 v1, s0, v0 +; GCN-NEXT: ds_read_b32 v1, v1 +; GCN-NEXT: v_add_u32_e32 v0, s1, v0 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ds_write_b32 v0, v1 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ds_read_b32 v0, v2 offset:256 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ds_write_b32 v1, v0 offset:256 +; GCN-NEXT: s_endpgm +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %load.0.addr = getelementptr float, ptr addrspace(3) %in, i32 %idx + %load.0 = load float, ptr addrspace(3) %load.0.addr + %store.0.addr = getelementptr float, ptr addrspace(3) %out, i32 %idx + store float %load.0, ptr addrspace(3) %store.0.addr + call void asm sideeffect "", ""() #1 + call void @llvm.amdgcn.iglp.opt(i32 0) #1 + %load.1.addr = getelementptr float, ptr addrspace(3) %in, i32 64 + %load.1 = load float, ptr addrspace(3) %load.1.addr + %store.1.addr = getelementptr float, ptr addrspace(3) %out, i32 64 + store float %load.1, ptr addrspace(3) %store.1.addr + ret void +} declare void @llvm.amdgcn.iglp.opt(i32) #1 declare i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/llvm/test/CodeGen/AMDGPU/sched-barrier-pre-RA.mir b/llvm/test/CodeGen/AMDGPU/sched-barrier-pre-RA.mir index bdfc8227fdccb..7295506213c4b 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-barrier-pre-RA.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-barrier-pre-RA.mir @@ -96,10 +96,10 @@ body: | ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: SCHED_BARRIER 1 ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec - ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 %0:sreg_64_xexec_xnull = IMPLICIT_DEF @@ -163,19 +163,19 @@ body: | ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit 
$exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF - ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec - ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec - ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec - ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: SCHED_BARRIER 4 ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec - ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MUL_LO_U32_e64_1]], implicit [[V_MUL_LO_U32_e64_2]], implicit [[V_MFMA_F32_4X4X1F32_e64_4]] %0:sreg_64_xexec_xnull = IMPLICIT_DEF @@ -258,10 +258,10 @@ body: | ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) - ; CHECK-NEXT: 
[[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; CHECK-NEXT: SCHED_BARRIER 16 ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) @@ -290,10 +290,10 @@ body: | ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) - ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; CHECK-NEXT: SCHED_BARRIER 32 ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) @@ -354,9 +354,9 @@ body: | ; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DS_READ_U16_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3) ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_]], [[DS_READ_U16_gfx9_]], implicit $exec - ; CHECK-NEXT: [[DS_READ_U16_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3) ; CHECK-NEXT: DS_WRITE_B32 [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3) ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: [[DS_READ_U16_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3) ; CHECK-NEXT: SCHED_BARRIER 128 ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_1]], [[DS_READ_U16_gfx9_1]], implicit $exec ; CHECK-NEXT: dead [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF @@ -386,9 +386,9 @@ body: | ; CHECK: 
[[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DS_READ_U16_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3) ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_]], [[DS_READ_U16_gfx9_]], implicit $exec - ; CHECK-NEXT: [[DS_READ_U16_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3) ; CHECK-NEXT: DS_WRITE_B32 [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3) ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: [[DS_READ_U16_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3) ; CHECK-NEXT: SCHED_BARRIER 256 ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_1]], [[DS_READ_U16_gfx9_1]], implicit $exec ; CHECK-NEXT: dead [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF @@ -453,7 +453,6 @@ body: | ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF - ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec @@ -462,6 +461,7 @@ body: | ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) ; CHECK-NEXT: SCHED_BARRIER 12 ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: SCHED_BARRIER 8 ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) From 2914ba1c01fdc496082197abf7cd35e2af526634 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Sat, 11 Jan 2025 14:17:24 -0500 Subject: [PATCH 178/408] [libc++] Improve diagnostic when failing to parse the tzdb (#122125) Providing the character that we failed on is helpful for figuring out what's going wrong in the tzdb. 
--- libcxx/src/experimental/tzdb.cpp | 20 ++++++++++++++----- .../time.zone/time.zone.db/rules.pass.cpp | 15 +++++++++----- .../time.zone/time.zone.db/version.pass.cpp | 5 +++-- 3 files changed, 28 insertions(+), 12 deletions(-) diff --git a/libcxx/src/experimental/tzdb.cpp b/libcxx/src/experimental/tzdb.cpp index 638d45f69e033..f38f495c2d0bb 100644 --- a/libcxx/src/experimental/tzdb.cpp +++ b/libcxx/src/experimental/tzdb.cpp @@ -8,6 +8,7 @@ // For information see https://libcxx.llvm.org/DesignDocs/TimeZone.html +#include <__assert> #include #include #include @@ -97,14 +98,23 @@ static void __skip(istream& __input, string_view __suffix) { } static void __matches(istream& __input, char __expected) { - if (std::tolower(__input.get()) != __expected) - std::__throw_runtime_error((string("corrupt tzdb: expected character '") + __expected + '\'').c_str()); + _LIBCPP_ASSERT_INTERNAL(!std::isalpha(__expected) || std::islower(__expected), "lowercase characters only here!"); + char __c = __input.get(); + if (std::tolower(__c) != __expected) + std::__throw_runtime_error( + (string("corrupt tzdb: expected character '") + __expected + "', got '" + __c + "' instead").c_str()); } static void __matches(istream& __input, string_view __expected) { - for (auto __c : __expected) - if (std::tolower(__input.get()) != __c) - std::__throw_runtime_error((string("corrupt tzdb: expected string '") + string(__expected) + '\'').c_str()); + for (auto __c : __expected) { + _LIBCPP_ASSERT_INTERNAL(!std::isalpha(__c) || std::islower(__c), "lowercase strings only here!"); + char __actual = __input.get(); + if (std::tolower(__actual) != __c) + std::__throw_runtime_error( + (string("corrupt tzdb: expected character '") + __c + "' from string '" + string(__expected) + "', got '" + + __actual + "' instead") + .c_str()); + } } [[nodiscard]] static string __parse_string(istream& __input) { diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.db/rules.pass.cpp b/libcxx/test/libcxx/time/time.zone/time.zone.db/rules.pass.cpp index 7d9759320c535..237a206b3a95b 100644 --- a/libcxx/test/libcxx/time/time.zone/time.zone.db/rules.pass.cpp +++ b/libcxx/test/libcxx/time/time.zone/time.zone.db/rules.pass.cpp @@ -20,9 +20,10 @@ // ADDITIONAL_COMPILE_FLAGS: -I %{libcxx-dir}/src/experimental/include #include +#include #include -#include #include +#include #include #include "assert_macros.h" @@ -96,7 +97,7 @@ static void test_invalid() { test_exception("R r 0 mix", "corrupt tzdb: expected whitespace"); test_exception("R r 0 1", "corrupt tzdb: expected whitespace"); - test_exception("R r 0 1 X", "corrupt tzdb: expected character '-'"); + test_exception("R r 0 1 X", "corrupt tzdb: expected character '-', got 'X' instead"); test_exception("R r 0 1 -", "corrupt tzdb: expected whitespace"); @@ -106,13 +107,17 @@ static void test_invalid() { test_exception("R r 0 1 - Ja +", "corrupt tzdb weekday: invalid name"); test_exception("R r 0 1 - Ja 32", "corrupt tzdb day: value too large"); - test_exception("R r 0 1 - Ja l", "corrupt tzdb: expected string 'last'"); + test_exception( + "R r 0 1 - Ja l", + std::string{"corrupt tzdb: expected character 'a' from string 'last', got '"} + (char)EOF + "' instead"); test_exception("R r 0 1 - Ja last", "corrupt tzdb weekday: invalid name"); test_exception("R r 0 1 - Ja lastS", "corrupt tzdb weekday: invalid name"); test_exception("R r 0 1 - Ja S", "corrupt tzdb weekday: invalid name"); test_exception("R r 0 1 - Ja Su", "corrupt tzdb on: expected '>=' or '<='"); - test_exception("R r 0 1 - Ja Su>", "corrupt tzdb: 
expected character '='"); - test_exception("R r 0 1 - Ja Su<", "corrupt tzdb: expected character '='"); + test_exception( + "R r 0 1 - Ja Su>", std::string{"corrupt tzdb: expected character '=', got '"} + (char)EOF + "' instead"); + test_exception( + "R r 0 1 - Ja Su<", std::string{"corrupt tzdb: expected character '=', got '"} + (char)EOF + "' instead"); test_exception("R r 0 1 - Ja Su>=+", "corrupt tzdb: expected a non-zero digit"); test_exception("R r 0 1 - Ja Su>=0", "corrupt tzdb: expected a non-zero digit"); test_exception("R r 0 1 - Ja Su>=32", "corrupt tzdb day: value too large"); diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.db/version.pass.cpp b/libcxx/test/libcxx/time/time.zone/time.zone.db/version.pass.cpp index b4f32a1b6fd78..ca3a890f1fa54 100644 --- a/libcxx/test/libcxx/time/time.zone/time.zone.db/version.pass.cpp +++ b/libcxx/test/libcxx/time/time.zone/time.zone.db/version.pass.cpp @@ -18,9 +18,10 @@ // This is not part of the public tzdb interface. #include +#include #include -#include #include +#include #include "assert_macros.h" #include "concat_macros.h" @@ -60,7 +61,7 @@ static void test_exception(std::string_view input, [[maybe_unused]] std::string_ } int main(int, const char**) { - test_exception("", "corrupt tzdb: expected character '#'"); + test_exception("", std::string{"corrupt tzdb: expected character '#', got '"} + (char)EOF + "' instead"); test_exception("#version", "corrupt tzdb: expected whitespace"); test("#version \t ABCD", "ABCD"); test("#Version \t ABCD", "ABCD"); From 7f59b4e9982f92431f3069645dab6171363c3404 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sat, 11 Jan 2025 20:31:41 +0000 Subject: [PATCH 179/408] [VPlan] Skip non-induction phi recipes in legalizeAndOptimizeInductions. The body of the loop only applies to wide induction recipes, skip any other header phi recipes up-frond --- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 8f6fb07b1e4f2..878db522be1bd 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -592,9 +592,9 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { bool HasOnlyVectorVFs = !Plan.hasVF(ElementCount::getFixed(1)); VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi()); for (VPRecipeBase &Phi : HeaderVPBB->phis()) { - auto *PhiR = dyn_cast(&Phi); + auto *PhiR = dyn_cast(&Phi); if (!PhiR) - break; + continue; // Check if any uniform VPReplicateRecipes using the phi recipe are used by // ExtractFromEnd. Those must be replaced by a regular VPReplicateRecipe to @@ -641,9 +641,7 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { // Replace widened induction with scalar steps for users that only use // scalars. 
-    auto *WideIV = dyn_cast<VPWidenInductionRecipe>(&Phi);
-    if (!WideIV)
-      continue;
+    auto *WideIV = cast<VPWidenInductionRecipe>(&Phi);
     if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
           return U->usesScalars(WideIV);
         }))

From bfe93aedcc7d393c2697e66d6569baffb701ba6f Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Sat, 11 Jan 2025 13:06:36 -0800
Subject: [PATCH 180/408] [AMDGPU] Fix a warning

This patch fixes:

  llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp:255:18: error: private field
  'DAG' is not used [-Werror,-Wunused-private-field]

---
 llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index b5dd0d8b86331..bbd262748d680 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -252,7 +252,7 @@ using SUsToCandSGsVec = SmallVector<SUToCandSGsPair, 4>;
 // only be used for small sized problems or medium sized problems where an exact
 // solution is highly desired.
 class PipelineSolver {
-  ScheduleDAGMI *DAG;
+  [[maybe_unused]] ScheduleDAGMI *DAG;
 
   // Instructions that can be assigned to multiple SchedGroups
   DenseMap<int, SUsToCandSGsVec> SyncedInstrs;

From cc995ad064ffe22566270fe95e974a368c71ba22 Mon Sep 17 00:00:00 2001
From: goldsteinn <35538541+goldsteinn@users.noreply.github.com>
Date: Sat, 11 Jan 2025 15:10:42 -0600
Subject: [PATCH 181/408] [InstSimplify] Simplifying `(xor (sub C_Mask, X),
 C_Mask)` -> `X` (#122552)

- **[InstSimplify] Add tests for simplifying `(xor (sub C_Mask, X), C_Mask)`; NFC**
- **[InstSimplify] Simplifying `(xor (sub C_Mask, X), C_Mask)` -> `X`**

Helps address regressions with folding `clz(Pow2)`.

Proof: https://alive2.llvm.org/ce/z/zGwUBp

---
 llvm/lib/Analysis/InstructionSimplify.cpp     |  16 +++
 .../InstSimplify/subnuw-with-xor.ll           | 118 ++++++++++++++++++
 2 files changed, 134 insertions(+)
 create mode 100644 llvm/test/Transforms/InstSimplify/subnuw-with-xor.ll

diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index 999386c0a0491..d69747e30f884 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -871,6 +871,14 @@ static Value *simplifySubInst(Value *Op0, Value *Op1, bool IsNSW, bool IsNUW,
   if (Value *V = simplifyByDomEq(Instruction::Sub, Op0, Op1, Q, MaxRecurse))
     return V;
 
+  // (sub nuw C_Mask, (xor X, C_Mask)) -> X
+  if (IsNUW) {
+    Value *X;
+    if (match(Op1, m_Xor(m_Value(X), m_Specific(Op0))) &&
+        match(Op0, m_LowBitMask()))
+      return X;
+  }
+
   return nullptr;
 }
 
@@ -2540,6 +2548,14 @@ static Value *simplifyXorInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
   if (Value *V = simplifyByDomEq(Instruction::Xor, Op0, Op1, Q, MaxRecurse))
     return V;
 
+  // (xor (sub nuw C_Mask, X), C_Mask) -> X
+  {
+    Value *X;
+    if (match(Op0, m_NUWSub(m_Specific(Op1), m_Value(X))) &&
+        match(Op1, m_LowBitMask()))
+      return X;
+  }
+
   return nullptr;
 }
 
diff --git a/llvm/test/Transforms/InstSimplify/subnuw-with-xor.ll b/llvm/test/Transforms/InstSimplify/subnuw-with-xor.ll
new file mode 100644
index 0000000000000..a990e2f2ae394
--- /dev/null
+++ b/llvm/test/Transforms/InstSimplify/subnuw-with-xor.ll
@@ -0,0 +1,118 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=instsimplify -S | FileCheck %s
+
+define i8 @xor_w_sub_fail_missing_nuw(i8 range(i8 0, 16) %x) {
+; CHECK-LABEL: define i8 @xor_w_sub_fail_missing_nuw(
+; CHECK-SAME: i8 range(i8 0, 16) [[X:%.*]]) {
+; CHECK-NEXT:    [[XOR:%.*]] = xor i8 [[X]], 15
+; CHECK-NEXT:    [[R:%.*]] = sub nsw i8 15, [[XOR]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %xor = xor i8 %x, 15
+  %r = sub nsw i8 15, %xor
+  ret i8 %r
+}
+
+define i8 @xor_w_sub_fail_diff_values(i8 range(i8 0, 16) %x) {
+; CHECK-LABEL: define i8 @xor_w_sub_fail_diff_values(
+; CHECK-SAME: i8 range(i8 0, 16) [[X:%.*]]) {
+; CHECK-NEXT:    [[XOR:%.*]] = xor i8 [[X]], 15
+; CHECK-NEXT:    [[R:%.*]] = sub nuw nsw i8 31, [[XOR]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %xor = xor i8 %x, 15
+  %r = sub nsw nuw i8 31, %xor
+  ret i8 %r
+}
+
+define i8 @xor_w_sub_fail_diff_values2(i8 range(i8 0, 16) %x) {
+; CHECK-LABEL: define i8 @xor_w_sub_fail_diff_values2(
+; CHECK-SAME: i8 range(i8 0, 16) [[X:%.*]]) {
+; CHECK-NEXT:    [[XOR:%.*]] = xor i8 [[X]], 31
+; CHECK-NEXT:    [[R:%.*]] = sub nuw nsw i8 15, [[XOR]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %xor = xor i8 %x, 31
+  %r = sub nsw nuw i8 15, %xor
+  ret i8 %r
+}
+
+define i8 @xor_w_sub_fail_not_mask(i8 range(i8 0, 16) %x) {
+; CHECK-LABEL: define i8 @xor_w_sub_fail_not_mask(
+; CHECK-SAME: i8 range(i8 0, 16) [[X:%.*]]) {
+; CHECK-NEXT:    [[XOR:%.*]] = xor i8 [[X]], 30
+; CHECK-NEXT:    [[R:%.*]] = sub nuw nsw i8 30, [[XOR]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %xor = xor i8 %x, 30
+  %r = sub nsw nuw i8 30, %xor
+  ret i8 %r
+}
+
+define i8 @xor_w_sub_okay(i8 range(i8 0, 16) %x) {
+; CHECK-LABEL: define i8 @xor_w_sub_okay(
+; CHECK-SAME: i8 range(i8 0, 16) [[X:%.*]]) {
+; CHECK-NEXT:    ret i8 [[X]]
+;
+  %xor = xor i8 %x, 31
+  %r = sub nsw nuw i8 31, %xor
+  ret i8 %r
+}
+
+define i8 @sub_w_xor_fail_missing_nuw(i8 range(i8 0, 16) %x) {
+; CHECK-LABEL: define i8 @sub_w_xor_fail_missing_nuw(
+; CHECK-SAME: i8 range(i8 0, 16) [[X:%.*]]) {
+; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i8 15, [[X]]
+; CHECK-NEXT:    [[R:%.*]] = xor i8 [[SUB]], 15
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %sub = sub nsw i8 15, %x
+  %r = xor i8 %sub, 15
+  ret i8 %r
+}
+
+define i8 @sub_w_xor_fail_diff_values(i8 range(i8 0, 16) %x) {
+; CHECK-LABEL: define i8 @sub_w_xor_fail_diff_values(
+; CHECK-SAME: i8 range(i8 0, 16) [[X:%.*]]) {
+; CHECK-NEXT:    [[SUB:%.*]] = sub nuw nsw i8 15, [[X]]
+; CHECK-NEXT:    [[R:%.*]] = xor i8 [[SUB]], 31
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %sub = sub nsw nuw i8 15, %x
+  %r = xor i8 %sub, 31
+  ret i8 %r
+}
+
+define i8 @sub_w_sub_fail_diff_values2(i8 range(i8 0, 16) %x) {
+; CHECK-LABEL: define i8 @sub_w_sub_fail_diff_values2(
+; CHECK-SAME: i8 range(i8 0, 16) [[X:%.*]]) {
+; CHECK-NEXT:    [[SUB:%.*]] = sub nuw nsw i8 31, [[X]]
+; CHECK-NEXT:    [[R:%.*]] = xor i8 [[SUB]], 15
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %sub = sub nsw nuw i8 31, %x
+  %r = xor i8 %sub, 15
+  ret i8 %r
+}
+
+define i8 @sub_w_sub_fail_not_mask(i8 range(i8 0, 16) %x) {
+; CHECK-LABEL: define i8 @sub_w_sub_fail_not_mask(
+; CHECK-SAME: i8 range(i8 0, 16) [[X:%.*]]) {
+; CHECK-NEXT:    [[SUB:%.*]] = sub nuw nsw i8 30, [[X]]
+; CHECK-NEXT:    [[R:%.*]] = xor i8 [[SUB]], 30
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %sub = sub nsw nuw i8 30, %x
+  %r = xor i8 %sub, 30
+  ret i8 %r
+}
+
+define i8 @sub_w_sub_okay(i8 range(i8 0, 16) %x) {
+; CHECK-LABEL: define i8 @sub_w_sub_okay(
+; CHECK-SAME: i8 range(i8 0, 16) [[X:%.*]]) {
+; CHECK-NEXT:    ret i8 [[X]]
+;
+  %sub = sub nsw nuw i8 31, %x
+  %r = xor i8 %sub, 31
+  ret i8 %r
+}

From 17ef436e3df231fa45aa6010bf8ed41189380679 Mon Sep 17 00:00:00 2001
From: goldsteinn <35538541+goldsteinn@users.noreply.github.com>
Date: Sat, 11 Jan 2025 15:11:11 -0600
Subject: [PATCH 182/408] [ValueTracking] Take into account whether zero is
 poison when computing CR for `ct{t,l}z` (#122548)

---
 llvm/lib/Analysis/ValueTracking.cpp       | 15 +++++++++----
 llvm/test/Transforms/InstSimplify/call.ll |  8 ++------
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 92338d33b27a4..53da8d2776a22 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -9897,13 +9897,20 @@ static void setLimitsForBinOp(const BinaryOperator &BO, APInt &Lower,
   }
 }
 
-static ConstantRange getRangeForIntrinsic(const IntrinsicInst &II) {
+static ConstantRange getRangeForIntrinsic(const IntrinsicInst &II,
+                                          bool UseInstrInfo) {
   unsigned Width = II.getType()->getScalarSizeInBits();
   const APInt *C;
   switch (II.getIntrinsicID()) {
-  case Intrinsic::ctpop:
   case Intrinsic::ctlz:
-  case Intrinsic::cttz:
+  case Intrinsic::cttz: {
+    APInt Upper(Width, Width);
+    if (!UseInstrInfo || !match(II.getArgOperand(1), m_One()))
+      Upper += 1;
+    // Maximum of set/clear bits is the bit width.
+    return ConstantRange::getNonEmpty(APInt::getZero(Width), Upper);
+  }
+  case Intrinsic::ctpop:
     // Maximum of set/clear bits is the bit width.
     return ConstantRange::getNonEmpty(APInt::getZero(Width),
                                       APInt(Width, Width) + 1);
@@ -10094,7 +10101,7 @@ ConstantRange llvm::computeConstantRange(const Value *V, bool ForSigned,
       setLimitsForBinOp(*BO, Lower, Upper, IIQ, ForSigned);
       CR = ConstantRange::getNonEmpty(Lower, Upper);
     } else if (auto *II = dyn_cast<IntrinsicInst>(V))
-      CR = getRangeForIntrinsic(*II);
+      CR = getRangeForIntrinsic(*II, UseInstrInfo);
     else if (auto *SI = dyn_cast<SelectInst>(V)) {
       ConstantRange CRTrue = computeConstantRange(
           SI->getTrueValue(), ForSigned, UseInstrInfo, AC, CtxI, DT, Depth + 1);
diff --git a/llvm/test/Transforms/InstSimplify/call.ll b/llvm/test/Transforms/InstSimplify/call.ll
index 67d5c4dbfb2e7..910750adf9d42 100644
--- a/llvm/test/Transforms/InstSimplify/call.ll
+++ b/llvm/test/Transforms/InstSimplify/call.ll
@@ -1582,9 +1582,7 @@ define i1 @ctlz_i1_non_poison_eq_false(i1 %x) {
 
 define i1 @ctlz_i1_poison_eq_false(i1 %x) {
 ; CHECK-LABEL: @ctlz_i1_poison_eq_false(
-; CHECK-NEXT:    [[CT:%.*]] = call i1 @llvm.ctlz.i1(i1 [[X:%.*]], i1 true)
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i1 [[CT]], false
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %ct = call i1 @llvm.ctlz.i1(i1 %x, i1 true)
   %cmp = icmp eq i1 %ct, false
@@ -1604,9 +1602,7 @@ define i1 @cttz_i1_non_poison_eq_false(i1 %x) {
 
 define i1 @cttz_i1_poison_eq_false(i1 %x) {
 ; CHECK-LABEL: @cttz_i1_poison_eq_false(
-; CHECK-NEXT:    [[CT:%.*]] = call i1 @llvm.cttz.i1(i1 [[X:%.*]], i1 true)
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i1 [[CT]], false
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %ct = call i1 @llvm.cttz.i1(i1 %x, i1 true)
   %cmp = icmp eq i1 %ct, false

From 07ff786e39e2190449998d3af1000454dee501be Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Sat, 11 Jan 2025 13:15:30 -0800
Subject: [PATCH 183/408] [TableGen] Avoid repeated hash lookups (NFC) (#122586)

---
 llvm/utils/TableGen/Common/CodeGenSchedule.cpp | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp
index 06d82daebac0d..7f4230affca09 100644
--- a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp
@@ -258,11 +258,9 @@ void CodeGenSchedModels::checkSTIPredicates() const {
 
   // There cannot be multiple declarations with the same name.
   for (const Record *R : Records.getAllDerivedDefinitions("STIPredicateDecl")) {
     StringRef Name = R->getValueAsString("Name");
 
-    const auto It = Declarations.find(Name);
-    if (It == Declarations.end()) {
-      Declarations[Name] = R;
+    const auto [It, Inserted] = Declarations.try_emplace(Name, R);
+    if (Inserted)
       continue;
-    }
 
     PrintError(R->getLoc(), "STIPredicate " + Name + " multiply declared.");
     PrintFatalNote(It->second->getLoc(), "Previous declaration was here.");
@@ -417,9 +415,9 @@ void CodeGenSchedModels::collectSTIPredicates() {
   for (const Record *R : Records.getAllDerivedDefinitions("STIPredicate")) {
     const Record *Decl = R->getValueAsDef("Declaration");
 
-    const auto It = Decl2Index.find(Decl);
-    if (It == Decl2Index.end()) {
-      Decl2Index[Decl] = STIPredicates.size();
+    const auto [It, Inserted] =
+        Decl2Index.try_emplace(Decl, STIPredicates.size());
+    if (Inserted) {
       STIPredicateFunction Predicate(Decl);
       Predicate.addDefinition(R);
       STIPredicates.emplace_back(std::move(Predicate));

From a56eb7c9986456ae4b492fff79c3cf18d0ef8ad3 Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Sat, 11 Jan 2025 13:15:50 -0800
Subject: [PATCH 184/408] [Sema] Avoid repeated hash lookups (NFC) (#122588)

---
 clang/lib/Sema/SemaDecl.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 8724c20fd7b65..5b7275c316f74 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -15868,8 +15868,8 @@ static void diagnoseImplicitlyRetainedSelf(Sema &S) {
   llvm::DenseMap<const BlockDecl *, bool> EscapeInfo;
 
   auto IsOrNestedInEscapingBlock = [&](const BlockDecl *BD) {
-    if (EscapeInfo.count(BD))
-      return EscapeInfo[BD];
+    if (auto It = EscapeInfo.find(BD); It != EscapeInfo.end())
+      return It->second;
 
     bool R = false;
     const BlockDecl *CurBD = BD;

From 4f4e2abb1a5ff1225d32410fd02b732d077aa056 Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Sat, 11 Jan 2025 13:16:43 -0800
Subject: [PATCH 185/408] [mlir] Migrate away from PointerUnion::{is,get}
 (NFC) (#122591)

Note that PointerUnion::{is,get} have been soft deprecated in
PointerUnion.h:

  // FIXME: Replace the uses of is(), get() and dyn_cast() with
  //        isa<T>, cast<T> and the llvm::dyn_cast<T>

I'm not touching PointerUnion::dyn_cast for now because it's a bit
complicated; we could blindly migrate it to dyn_cast_if_present, but we
should probably use dyn_cast when the operand is known to be non-null.
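For readers unfamiliar with the two APIs, the migration pattern boils down to
the following minimal sketch. The payload types `A` and `B` are hypothetical
stand-ins for the real MLIR types (such as `Attribute` and `Value`) touched by
this patch:

```cpp
#include "llvm/ADT/PointerUnion.h"

struct A { int x; };
struct B { int y; };

// PU holds either an `A *` or a `B *`.
int payload(llvm::PointerUnion<A *, B *> PU) {
  // Soft-deprecated member API being migrated away from:
  //   if (PU.is<A *>())
  //     return PU.get<A *>()->x;
  // Free-function casting API used as the replacement:
  if (llvm::isa<A *>(PU))
    return llvm::cast<A *>(PU)->x;
  // dyn_cast_if_present also tolerates a null union:
  if (auto *Bp = llvm::dyn_cast_if_present<B *>(PU))
    return Bp->y;
  return 0;
}
```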
---
 mlir/examples/transform-opt/mlir-transform-opt.cpp       | 2 +-
 mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h     | 2 +-
 mlir/include/mlir/IR/Matchers.h                          | 2 +-
 mlir/include/mlir/IR/OpDefinition.h                      | 5 +++--
 mlir/include/mlir/Interfaces/DataLayoutInterfaces.td     | 2 +-
 mlir/include/mlir/Pass/AnalysisManager.h                 | 2 +-
 mlir/lib/Analysis/DataFlowFramework.cpp                  | 4 ++--
 mlir/lib/AsmParser/Parser.cpp                            | 2 +-
 mlir/lib/Bytecode/Writer/IRNumbering.h                   | 4 ++--
 mlir/lib/Tools/mlir-pdll-lsp-server/PDLLServer.cpp       | 4 ++--
 mlir/lib/Tools/tblgen-lsp-server/TableGenServer.cpp      | 8 ++++----
 mlir/lib/Transforms/Utils/FoldUtils.cpp                  | 2 +-
 mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp | 2 +-
 13 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/mlir/examples/transform-opt/mlir-transform-opt.cpp b/mlir/examples/transform-opt/mlir-transform-opt.cpp
index 65615fb25bff6..10e16096211ad 100644
--- a/mlir/examples/transform-opt/mlir-transform-opt.cpp
+++ b/mlir/examples/transform-opt/mlir-transform-opt.cpp
@@ -131,7 +131,7 @@ class DiagnosticHandlerWrapper {
     if (auto *ptr = handler.dyn_cast<ScopedDiagnosticHandler *>()) {
       delete ptr;
     } else {
-      delete handler.get<SourceMgrDiagnosticVerifierHandler *>();
+      delete cast<SourceMgrDiagnosticVerifierHandler *>(handler);
     }
   }
 
diff --git a/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h b/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h
index 507087d5575e9..387b9ee707179 100644
--- a/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h
+++ b/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h
@@ -37,7 +37,7 @@ class AbstractSparseLattice : public AnalysisState {
   AbstractSparseLattice(Value value) : AnalysisState(value) {}
 
   /// Return the value this lattice is located at.
-  Value getAnchor() const { return AnalysisState::getAnchor().get<Value>(); }
+  Value getAnchor() const { return cast<Value>(AnalysisState::getAnchor()); }
 
   /// Join the information contained in 'rhs' into this lattice. Returns
   /// if the value of the lattice changed.
diff --git a/mlir/include/mlir/IR/Matchers.h b/mlir/include/mlir/IR/Matchers.h
index d218206e50f8f..1dce055db1b4a 100644
--- a/mlir/include/mlir/IR/Matchers.h
+++ b/mlir/include/mlir/IR/Matchers.h
@@ -92,7 +92,7 @@ struct constant_op_binder {
     (void)result;
     assert(succeeded(result) && "expected ConstantLike op to be foldable");
 
-    if (auto attr = llvm::dyn_cast<AttrT>(foldedOp.front().get<Attribute>())) {
+    if (auto attr = llvm::dyn_cast<AttrT>(cast<Attribute>(foldedOp.front()))) {
       if (bind_value)
         *bind_value = attr;
       return true;
diff --git a/mlir/include/mlir/IR/OpDefinition.h b/mlir/include/mlir/IR/OpDefinition.h
index 59f094d669099..d91c573c03efe 100644
--- a/mlir/include/mlir/IR/OpDefinition.h
+++ b/mlir/include/mlir/IR/OpDefinition.h
@@ -272,8 +272,9 @@ class OpFoldResult : public PointerUnion<Attribute, Value> {
   void dump() const { llvm::errs() << *this << "\n"; }
 
   MLIRContext *getContext() const {
-    return is<Attribute>() ? get<Attribute>().getContext()
-                           : get<Value>().getContext();
+    PointerUnion<Attribute, Value> pu = *this;
+    return isa<Attribute>(pu) ? cast<Attribute>(pu).getContext()
+                              : cast<Value>(pu).getContext();
   }
 };
 
diff --git a/mlir/include/mlir/Interfaces/DataLayoutInterfaces.td b/mlir/include/mlir/Interfaces/DataLayoutInterfaces.td
index 3532116700af5..0d09b92928fe3 100644
--- a/mlir/include/mlir/Interfaces/DataLayoutInterfaces.td
+++ b/mlir/include/mlir/Interfaces/DataLayoutInterfaces.td
@@ -86,7 +86,7 @@ def DataLayoutEntryInterface : AttrInterface<"DataLayoutEntryInterface"> {
   let extraClassDeclaration = [{
     /// Returns `true` if the key of this entry is a type.
     bool isTypeEntry() {
-      return getKey().is<::mlir::Type>();
+      return llvm::isa<::mlir::Type>(getKey());
     }
   }];
 }
diff --git a/mlir/include/mlir/Pass/AnalysisManager.h b/mlir/include/mlir/Pass/AnalysisManager.h
index f9db26140259b..199ffee792bb5 100644
--- a/mlir/include/mlir/Pass/AnalysisManager.h
+++ b/mlir/include/mlir/Pass/AnalysisManager.h
@@ -262,7 +262,7 @@ struct NestedAnalysisMap {
   PassInstrumentor *getPassInstrumentor() const {
     if (auto *parent = getParent())
      return parent->getPassInstrumentor();
-    return parentOrInstrumentor.get<PassInstrumentor *>();
+    return cast<PassInstrumentor *>(parentOrInstrumentor);
   }
 
   /// The cached analyses for nested operations.
diff --git a/mlir/lib/Analysis/DataFlowFramework.cpp b/mlir/lib/Analysis/DataFlowFramework.cpp
index 7e83668c06765..d2742c6e4b966 100644
--- a/mlir/lib/Analysis/DataFlowFramework.cpp
+++ b/mlir/lib/Analysis/DataFlowFramework.cpp
@@ -84,7 +84,7 @@ void LatticeAnchor::print(raw_ostream &os) const {
     return value.print(os, OpPrintingFlags().skipRegions());
   }
 
-  return get<ProgramPoint *>()->print(os);
+  return llvm::cast<ProgramPoint *>(*this)->print(os);
 }
 
 Location LatticeAnchor::getLoc() const {
@@ -93,7 +93,7 @@ Location LatticeAnchor::getLoc() const {
   if (auto value = llvm::dyn_cast<Value>(*this))
     return value.getLoc();
 
-  ProgramPoint *pp = get<ProgramPoint *>();
+  ProgramPoint *pp = llvm::cast<ProgramPoint *>(*this);
   if (!pp->isBlockStart())
     return pp->getPrevOp()->getLoc();
   return pp->getBlock()->getParent()->getLoc();
diff --git a/mlir/lib/AsmParser/Parser.cpp b/mlir/lib/AsmParser/Parser.cpp
index e3db248164672..eccb3241012a2 100644
--- a/mlir/lib/AsmParser/Parser.cpp
+++ b/mlir/lib/AsmParser/Parser.cpp
@@ -2162,7 +2162,7 @@ OperationParser::parseTrailingLocationSpecifier(OpOrArgument opOrArgument) {
   if (auto *op = llvm::dyn_cast_if_present<Operation *>(opOrArgument))
     op->setLoc(directLoc);
   else
-    opOrArgument.get<BlockArgument>().setLoc(directLoc);
+    cast<BlockArgument>(opOrArgument).setLoc(directLoc);
   return success();
 }
 
diff --git a/mlir/lib/Bytecode/Writer/IRNumbering.h b/mlir/lib/Bytecode/Writer/IRNumbering.h
index eab75f50d2ee4..9b7ac0d3688e3 100644
--- a/mlir/lib/Bytecode/Writer/IRNumbering.h
+++ b/mlir/lib/Bytecode/Writer/IRNumbering.h
@@ -50,11 +50,11 @@ struct AttrTypeNumbering {
 };
 struct AttributeNumbering : public AttrTypeNumbering {
   AttributeNumbering(Attribute value) : AttrTypeNumbering(value) {}
-  Attribute getValue() const { return value.get<Attribute>(); }
+  Attribute getValue() const { return cast<Attribute>(value); }
 };
 struct TypeNumbering : public AttrTypeNumbering {
   TypeNumbering(Type value) : AttrTypeNumbering(value) {}
-  Type getValue() const { return value.get<Type>(); }
+  Type getValue() const { return cast<Type>(value); }
 };
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Tools/mlir-pdll-lsp-server/PDLLServer.cpp b/mlir/lib/Tools/mlir-pdll-lsp-server/PDLLServer.cpp
index 76b066feb6930..873ecb5b70e02 100644
--- a/mlir/lib/Tools/mlir-pdll-lsp-server/PDLLServer.cpp
+++ b/mlir/lib/Tools/mlir-pdll-lsp-server/PDLLServer.cpp
@@ -142,7 +142,7 @@ struct PDLIndexSymbol {
     const ast::Name *declName = decl->getName();
     return declName ? declName->getLoc() : decl->getLoc();
   }
-  return definition.get<const ods::Operation *>()->getLoc();
+  return cast<const ods::Operation *>(definition)->getLoc();
 }
 
 /// The main definition of the symbol.
@@ -470,7 +470,7 @@ PDLDocument::findHover(const lsp::URIForFile &uri,
   if (const auto *op =
           llvm::dyn_cast_if_present<const ods::Operation *>(symbol->definition))
     return buildHoverForOpName(op, hoverRange);
-  const auto *decl = symbol->definition.get<const ast::Decl *>();
+  const auto *decl = cast<const ast::Decl *>(symbol->definition);
   return findHover(decl, hoverRange);
 }
 
diff --git a/mlir/lib/Tools/tblgen-lsp-server/TableGenServer.cpp b/mlir/lib/Tools/tblgen-lsp-server/TableGenServer.cpp
index 5c59d94a061fa..6c4ec06fffb32 100644
--- a/mlir/lib/Tools/tblgen-lsp-server/TableGenServer.cpp
+++ b/mlir/lib/Tools/tblgen-lsp-server/TableGenServer.cpp
@@ -165,11 +165,11 @@ struct TableGenRecordSymbol : public TableGenIndexSymbol {
   ~TableGenRecordSymbol() override = default;
 
   static bool classof(const TableGenIndexSymbol *symbol) {
-    return symbol->definition.is<const Record *>();
+    return isa<const Record *>(symbol->definition);
   }
 
   /// Return the value of this symbol.
-  const Record *getValue() const { return definition.get<const Record *>(); }
+  const Record *getValue() const { return cast<const Record *>(definition); }
 };
 /// This class represents a single record value symbol.
 struct TableGenRecordValSymbol : public TableGenIndexSymbol {
@@ -178,12 +178,12 @@ struct TableGenRecordValSymbol : public TableGenIndexSymbol {
   ~TableGenRecordValSymbol() override = default;
 
   static bool classof(const TableGenIndexSymbol *symbol) {
-    return symbol->definition.is<const RecordVal *>();
+    return isa<const RecordVal *>(symbol->definition);
   }
 
   /// Return the value of this symbol.
   const RecordVal *getValue() const {
-    return definition.get<const RecordVal *>();
+    return cast<const RecordVal *>(definition);
   }
 
   /// The parent record of this symbol.
diff --git a/mlir/lib/Transforms/Utils/FoldUtils.cpp b/mlir/lib/Transforms/Utils/FoldUtils.cpp
index c43f439525526..e9adda0cd01db 100644
--- a/mlir/lib/Transforms/Utils/FoldUtils.cpp
+++ b/mlir/lib/Transforms/Utils/FoldUtils.cpp
@@ -260,7 +260,7 @@ OperationFolder::processFoldResults(Operation *op,
 
     // Check to see if there is a canonicalized version of this constant.
     auto res = op->getResult(i);
-    Attribute attrRepl = foldResults[i].get<Attribute>();
+    Attribute attrRepl = cast<Attribute>(foldResults[i]);
     if (auto *constOp =
             tryGetOrCreateConstant(uniquedConstants, dialect, attrRepl,
                                    res.getType(), erasedFoldedLocation)) {
diff --git a/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp b/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp
index 99f3569b767b1..969c560c99ab7 100644
--- a/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp
+++ b/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp
@@ -523,7 +523,7 @@ bool GreedyPatternRewriteDriver::processWorklist() {
         }
         // Materialize Attributes as SSA values.
         Operation *constOp = op->getDialect()->materializeConstant(
-            rewriter, ofr.get<Attribute>(), resultType, op->getLoc());
+            rewriter, cast<Attribute>(ofr), resultType, op->getLoc());
 
         if (!constOp) {
           // If materialization fails, cleanup any operations generated for

From eabf9313d47b8e25b2a2e4ee62dd70f79eacf1df Mon Sep 17 00:00:00 2001
From: Aiden Grossman
Date: Sat, 11 Jan 2025 14:04:03 -0800
Subject: [PATCH 186/408] [CI] Detect step failures in metrics job (#122564)

This patch makes the metrics job also detect failures in individual
steps. This is necessary now that we are setting continue-on-error in
the premerge jobs to prevent sending out unnecessary email, so we have
to inspect individual steps to detect which jobs actually fail.
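The step-level check amounts to the following standalone sketch (the
`job_succeeded` helper is hypothetical; it assumes a PyGithub-style job object
exposing `.conclusion` and `.steps`, as used by metrics.py):

```python
def job_succeeded(job) -> bool:
    """A job counts as green only if the job and all of its steps succeeded."""
    if job.conclusion != "success":
        return False
    # With continue-on-error set, the job-level conclusion can still be
    # "success" even though an individual step failed, so check each step.
    return all(step.conclusion == "success" for step in job.steps)
```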
---
 .ci/metrics/metrics.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/.ci/metrics/metrics.py b/.ci/metrics/metrics.py
index 55025e50d1081..cbff478b9ba15 100644
--- a/.ci/metrics/metrics.py
+++ b/.ci/metrics/metrics.py
@@ -80,6 +80,18 @@ def get_metrics(github_repo: github.Repository, workflows_to_track: dict[str, in
         completed_at = workflow_jobs[0].completed_at
 
         job_result = int(workflow_jobs[0].conclusion == "success")
+        if job_result:
+            # We still might want to mark the job as a failure if one of the steps
+            # failed. This is required due to us setting continue-on-error in
+            # the premerge pipeline to prevent sending emails while we are
+            # testing the infrastructure.
+            # TODO(boomanaiden154): Remove this once the premerge pipeline is no
+            # longer in a testing state and we can directly assert the workflow
+            # result.
+            for step in workflow_jobs[0].steps:
+                if step.conclusion != "success":
+                    job_result = 0
+                    break
 
         queue_time = started_at - created_at
         run_time = completed_at - started_at

From 2c7829e676dfd6a33f7c9955ea930f51aca37e20 Mon Sep 17 00:00:00 2001
From: Congcong Cai
Date: Sun, 12 Jan 2025 06:48:46 +0800
Subject: [PATCH 187/408] [clang-tidy][doc] combine the clang-tidy itself's
 change together in release note (#122594)

---
 clang-tools-extra/docs/ReleaseNotes.rst | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 835a0269a2733..5c7ba4333e381 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -108,22 +108,19 @@ Improvements to clang-query
 Improvements to clang-tidy
 --------------------------
 
-- Improved :program:`clang-tidy`'s `--verify-config` flag by adding support for
-  the configuration options of the `Clang Static Analyzer Checks
-  <https://clang.llvm.org/docs/analyzer/checkers.html>`_.
-
-- Improved :program:`clang-tidy` by accepting parameters file in command line.
-
 - Improved :program:`run-clang-tidy.py` script. Fixed minor shutdown noise
   happening on certain platforms when interrupting the script.
 
-- Improved :program:`clang-tidy` by fixing incorrect configuration file path
-  resolving when file paths contain ``..``.
-
-- Removed :program:`clang-tidy`'s global options for most of checks. All options
-  are changed to local options except `IncludeStyle`, `StrictMode` and
-  `IgnoreMacros`. Global scoped `StrictMode` and `IgnoreMacros` are deprecated
-  and will be removed in further releases.
+- Improved :program:`clang-tidy`:
+
+  - add support for `--verify-config` flag to check the configuration options of
+    the `Clang Static Analyzer Checks <https://clang.llvm.org/docs/analyzer/checkers.html>`_.
+  - accept parameters file in command line.
+  - fix incorrect configuration file path resolving when file paths contain ``..``.
+  - remove global options for most of checks. All options are changed to local
+    options except `IncludeStyle`, `StrictMode` and `IgnoreMacros`. Global scoped
+    `StrictMode` and `IgnoreMacros` are deprecated and will be removed in further
+    releases.
 
 .. csv-table::
    :header: "Check", "Options removed from global option"

From 5e4b41c1d534c90a4591b840a4f1b2f7bf59279f Mon Sep 17 00:00:00 2001
From: Roland McGrath
Date: Sat, 11 Jan 2025 17:24:37 -0800
Subject: [PATCH 188/408] [libc] Add compile tests for each public header
 (#122527)

This adds a test that consists of compiling `#include <...>`, pretty
much alone, for each public header file in each different language mode
(`-std=...` compiler switch) with -Werror and many warnings enabled.
There are several headers that have bugs when used alone, and many more
headers that have bugs in certain language modes. So for now, compiling
the new tests is gated on the cmake switch
-DLLVM_LIBC_BUILD_HEADER_TESTS=ON. When all the bugs are fixed, the
switch will be removed so future regressions don't land.

---
 libc/cmake/modules/LLVMLibCHeaderRules.cmake |  2 +
 libc/test/include/CMakeLists.txt             | 69 +++++++++++++++++++-
 libc/test/include/header-test-template.c     | 14 ++++
 3 files changed, 84 insertions(+), 1 deletion(-)
 create mode 100644 libc/test/include/header-test-template.c

diff --git a/libc/cmake/modules/LLVMLibCHeaderRules.cmake b/libc/cmake/modules/LLVMLibCHeaderRules.cmake
index 288e4dade0b47..ea8b76e0f6235 100644
--- a/libc/cmake/modules/LLVMLibCHeaderRules.cmake
+++ b/libc/cmake/modules/LLVMLibCHeaderRules.cmake
@@ -66,6 +66,7 @@ function(add_header target_name)
   set_target_properties(
     ${fq_target_name}
     PROPERTIES
+      HEADER_NAME ${dest_leaf_filename}
       HEADER_FILE_PATH ${dest_file}
       DEPS "${fq_deps_list}"
   )
@@ -164,6 +165,7 @@ function(add_gen_header target_name)
   set_target_properties(
     ${fq_target_name}
     PROPERTIES
+      HEADER_NAME ${ADD_GEN_HDR_GEN_HDR}
      HEADER_FILE_PATH ${out_file}
       DECLS_FILE_PATH "${decl_out_file}"
       DEPS "${fq_deps_list}"
   )
diff --git a/libc/test/include/CMakeLists.txt b/libc/test/include/CMakeLists.txt
index ba21a69a31a3b..24935cec048ba 100644
--- a/libc/test/include/CMakeLists.txt
+++ b/libc/test/include/CMakeLists.txt
@@ -422,7 +422,7 @@ add_libc_test(
     -Werror
   DEPENDS
     libc.include.llvm-libc-macros.math_function_macros
-)
+)
 
 add_libc_test(
   isfinite_c_test
@@ -483,3 +483,70 @@ add_libc_test(
   DEPENDS
     libc.include.llvm-libc-macros.math_function_macros
 )
+
+# Test `#include <...>` of each header in each available language mode.
+# This is gated on -DLLVM_LIBC_BUILD_HEADER_TESTS=ON until all the bugs
+# in headers are fixed so the tests all compile.
+set(TEST_STDC_VERSIONS 89;99;11;17;23)
+set(TEST_STDCXX_VERSIONS 03;11;14;17;20;23;26)
+
+function(add_header_test target_name source_file deps std_mode)
+  if(LLVM_LIBC_BUILD_HEADER_TESTS)
+    add_libc_test(
+      ${target_name}
+      C_TEST
+      HERMETIC_TEST_ONLY
+      SUITE
+        libc_include_tests
+      SRCS
+        ${source_file}
+      COMPILE_OPTIONS
+        -Werror
+        -Wsystem-headers
+        -Wall
+        -Wextra
+        -std=${std_mode}
+      DEPENDS
+        ${deps}
+    )
+  endif()
+endfunction()
+
+foreach(target ${TARGET_PUBLIC_HEADERS})
+  string(REPLACE "libc.include." "" header ${target})
+  get_target_property(HEADER_NAME ${target} HEADER_NAME)
+
+  set(test_stdc_file "${CMAKE_CURRENT_BINARY_DIR}/${header}_test.c")
+  configure_file(header-test-template.c ${test_stdc_file} @ONLY)
+  foreach(stdc_version ${TEST_STDC_VERSIONS})
+    add_header_test(
+      "${header}_c${stdc_version}_test"
+      ${test_stdc_file}
+      ${target}
+      "c${stdc_version}"
+    )
+    add_header_test(
+      "${header}_gnu${stdc_version}_test"
+      ${test_stdc_file}
+      ${target}
+      "gnu${stdc_version}"
+    )
+  endforeach()
+
+  set(test_stdcxx_file "${CMAKE_CURRENT_BINARY_DIR}/${header}_test.cpp")
+  configure_file(header-test-template.c ${test_stdcxx_file} @ONLY)
+  foreach(stdcxx_version ${TEST_STDCXX_VERSIONS})
+    add_header_test(
+      "${header}_cpp${stdcxx_version}_test"
+      ${test_stdcxx_file}
+      ${target}
+      "c++${stdcxx_version}"
+    )
+    add_header_test(
+      "${header}_gnucpp${stdcxx_version}_test"
+      ${test_stdcxx_file}
+      ${target}
+      "gnu++${stdcxx_version}"
+    )
+  endforeach()
+endforeach()
diff --git a/libc/test/include/header-test-template.c b/libc/test/include/header-test-template.c
new file mode 100644
index 0000000000000..6905b930f57db
--- /dev/null
+++ b/libc/test/include/header-test-template.c
@@ -0,0 +1,14 @@
+/*===-- Test for <@HEADER_NAME@> ----------------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#include <@HEADER_NAME@>
+
+int main(int argc, char **argv) {
+  (void)argc;
+  (void)argv;
+  return 0;
+}

From 4f6fabd11ad1a5de8e066adad28c8da4a615f8bb Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Sat, 11 Jan 2025 19:49:59 -0800
Subject: [PATCH 189/408] [Driver] Avoid repeated map lookups (NFC) (#122625)

---
 clang/lib/Driver/Driver.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 4d9492ea08f64..10df730744b08 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -2534,8 +2534,8 @@ enum {
 static unsigned PrintActions1(const Compilation &C, Action *A,
                               std::map<Action *, unsigned> &Ids,
                               Twine Indent = {}, int Kind = TopLevelAction) {
-  if (Ids.count(A)) // A was already visited.
-    return Ids[A];
+  if (auto It = Ids.find(A); It != Ids.end()) // A was already visited.
+    return It->second;
 
   std::string str;
   llvm::raw_string_ostream os(str);

From 5ee0a71df919a328c714e25f0935c21e586cc18b Mon Sep 17 00:00:00 2001
From: Daniel Paoliello
Date: Sat, 11 Jan 2025 21:30:17 -0800
Subject: [PATCH 190/408] [aarch64][win] Add support for import call
 optimization (equivalent to MSVC /d2ImportCallOptimization) (#121516)

This change implements import call optimization for AArch64 Windows
(equivalent to the undocumented MSVC `/d2ImportCallOptimization` flag).

Import call optimization adds additional data to the binary which can
be used by the Windows kernel loader to rewrite indirect calls to
imported functions as direct calls. It uses the same [Dynamic Value
Relocation Table mechanism that was leveraged on x64 to implement
`/d2GuardRetpoline`](https://techcommunity.microsoft.com/blog/windowsosplatform/mitigating-spectre-variant-2-with-retpoline-on-windows/295618).

The change to the obj file is to add a new `.impcall` section with the
following layout:

```cpp
// Per section that contains calls to imported functions:
//  uint32_t SectionSize: Size in bytes for information in this section.
//  uint32_t Section Number
// Per call to imported function in section:
//  uint32_t Kind: the kind of imported function.
//  uint32_t BranchOffset: the offset of the branch instruction in its
//                         parent section.
//  uint32_t TargetSymbolId: the symbol id of the called function.
```

NOTE: If the import call optimization feature is enabled, then the
`.impcall` section must be emitted, even if there are no calls to
imported functions.

The implementation is split across a few parts of LLVM:

* During AArch64 instruction selection, the `GlobalValue` for each call
  to a global is recorded into the Extra Information for that node.
* During lowering to machine instructions, the called global value for
  each call is noted in its containing `MachineFunction`.
* During AArch64 asm printing, if the import call optimization feature
  is enabled:
  - A (new) `.impcall` directive is emitted for each call to an
    imported function.
  - The `.impcall` section is emitted with its magic header (but is not
    filled in).
* During COFF object writing, the `.impcall` section is filled in based
  on each `.impcall` directive that was encountered.

The `.impcall` section can only be filled in when we are writing the
COFF object as it requires the actual section numbers, which are only
assigned at that point (i.e., they don't exist during asm printing).

I had tried to avoid using the Extra Information during instruction
selection and instead implement this either purely during asm printing
or in a `MachineFunctionPass` (as suggested [on the
forums](https://discourse.llvm.org/t/design-gathering-locations-of-instructions-to-emit-into-a-section/83729/3))
but this was not possible due to how loading and calling an imported
function works on AArch64. Specifically, they are emitted as `ADRP` +
`LDR` (to load the symbol) then a `BR` (to do the call), so at the point
when we have machine instructions, we would have to work backwards
through the instructions to discover what is being called.

An initial prototype did work by inspecting instructions; however, it
didn't correctly handle the case where the same function was called
twice in a row, which caused LLVM to elide the `ADRP` + `LDR` and reuse
the previously loaded address. Worse than that, sometimes for the
double-call case LLVM decided to spill the loaded address to the stack
and then reload it before making the second call. So, instead of trying
to implement logic to discover where the value in a register came from,
I instead recorded the symbol being called at the last place where it
was easy to do: instruction selection.
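As a rough illustration only, the records described above could be modeled in
C++ like this (the struct and field names here are ours, not LLVM's; the real
writer emits the data as raw little-endian 32-bit words rather than structs):

```cpp
#include <cstdint>
#include <vector>

// One .impcall entry: a call site inside a code section.
struct ImpCallEntry {
  uint32_t Kind;           // kind of imported function being called
  uint32_t BranchOffset;   // offset of the branch instruction in its section
  uint32_t TargetSymbolId; // symbol table index of the called function
};

// One per code section that contains calls to imported functions:
// a SectionSize/SectionNumber header followed by the entries.
struct ImpCallSectionRecord {
  uint32_t SectionSize;   // size in bytes of this record, header included
  uint32_t SectionNumber; // physical COFF section number of the code section
  std::vector<ImpCallEntry> Entries;
};
```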
---
 llvm/include/llvm/CodeGen/MIRYamlMapping.h    |  45 +++++--
 llvm/include/llvm/CodeGen/MachineFunction.h   |  25 ++++
 llvm/include/llvm/CodeGen/SelectionDAG.h      |  14 +++
 llvm/include/llvm/MC/MCObjectFileInfo.h       |   5 +
 llvm/include/llvm/MC/MCStreamer.h             |   8 ++
 llvm/include/llvm/MC/MCWinCOFFObjectWriter.h  |   1 +
 llvm/include/llvm/MC/MCWinCOFFStreamer.h      |   2 +
 llvm/lib/CodeGen/MIRParser/MIRParser.cpp      |  74 ++++++++++--
 llvm/lib/CodeGen/MIRPrinter.cpp               |  33 +++++-
 .../SelectionDAG/ScheduleDAGSDNodes.cpp       |   4 +
 llvm/lib/MC/MCAsmStreamer.cpp                 |  14 +++
 llvm/lib/MC/MCObjectFileInfo.cpp              |   5 +
 llvm/lib/MC/MCParser/COFFAsmParser.cpp        |  34 ++++++
 llvm/lib/MC/MCStreamer.cpp                    |   4 +
 llvm/lib/MC/MCWinCOFFStreamer.cpp             | 114 ++++++++++++++++++
 llvm/lib/MC/WinCOFFObjectWriter.cpp           |  27 +++--
 llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp |  72 +++++++++++
 .../Target/AArch64/AArch64ISelLowering.cpp    |  14 ++-
 .../win-import-call-optimization-nocalls.ll   |  18 +++
 .../AArch64/win-import-call-optimization.ll   |  48 ++++++++
 .../CodeGen/MIR/AArch64/called-globals.mir    |  61 ++++++++++
 .../CodeGen/MIR/X86/call-site-info-error1.mir |   2 +-
 .../CodeGen/MIR/X86/call-site-info-error2.mir |   2 +-
 .../MC/AArch64/win-import-call-optimization.s |  72 +++++++++++
 llvm/test/MC/COFF/bad-parse.s                 |  13 ++
 25 files changed, 673 insertions(+), 38 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/win-import-call-optimization-nocalls.ll
 create mode 100644 llvm/test/CodeGen/AArch64/win-import-call-optimization.ll
 create mode 100644 llvm/test/CodeGen/MIR/AArch64/called-globals.mir
 create mode 100644 llvm/test/MC/AArch64/win-import-call-optimization.s
 create mode 100644 llvm/test/MC/COFF/bad-parse.s

diff --git a/llvm/include/llvm/CodeGen/MIRYamlMapping.h b/llvm/include/llvm/CodeGen/MIRYamlMapping.h
index 09a6ca936fe1f..dbad3469d047d 100644
--- a/llvm/include/llvm/CodeGen/MIRYamlMapping.h
+++ b/llvm/include/llvm/CodeGen/MIRYamlMapping.h
@@ -457,6 +457,16 @@ template <> struct ScalarTraits {
   static QuotingType mustQuote(StringRef S) { return needsQuotes(S); }
 };
 
+/// Identifies call instruction location in machine function.
+struct MachineInstrLoc {
+  unsigned BlockNum;
+  unsigned Offset;
+
+  bool operator==(const MachineInstrLoc &Other) const {
+    return BlockNum == Other.BlockNum && Offset == Other.Offset;
+  }
+};
+
 /// Serializable representation of CallSiteInfo.
 struct CallSiteInfo {
   // Representation of call argument and register which is used to
@@ -470,16 +480,6 @@ struct CallSiteInfo {
     }
   };
 
-  /// Identifies call instruction location in machine function.
-  struct MachineInstrLoc {
-    unsigned BlockNum;
-    unsigned Offset;
-
-    bool operator==(const MachineInstrLoc &Other) const {
-      return BlockNum == Other.BlockNum && Offset == Other.Offset;
-    }
-  };
-
   MachineInstrLoc CallLocation;
   std::vector<ArgRegPair> ArgForwardingRegs;
 
@@ -595,6 +595,26 @@ template <> struct MappingTraits<CallSiteInfo> {
   }
 };
 
+struct CalledGlobal {
+  MachineInstrLoc CallSite;
+  StringValue Callee;
+  unsigned Flags;
+
+  bool operator==(const CalledGlobal &Other) const {
+    return CallSite == Other.CallSite && Callee == Other.Callee &&
+           Flags == Other.Flags;
+  }
+};
+
+template <> struct MappingTraits<CalledGlobal> {
+  static void mapping(IO &YamlIO, CalledGlobal &CG) {
+    YamlIO.mapRequired("bb", CG.CallSite.BlockNum);
+    YamlIO.mapRequired("offset", CG.CallSite.Offset);
+    YamlIO.mapRequired("callee", CG.Callee);
+    YamlIO.mapRequired("flags", CG.Flags);
+  }
+};
+
 } // end namespace yaml
 } // end namespace llvm
 
@@ -606,6 +626,7 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::FixedMachineStackObject)
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::CallSiteInfo)
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::MachineConstantPoolValue)
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::MachineJumpTable::Entry)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::CalledGlobal)
 
 namespace llvm {
 namespace yaml {
@@ -764,6 +785,7 @@ struct MachineFunction {
   std::vector<DebugValueSubstitution> DebugValueSubstitutions;
   MachineJumpTable JumpTableInfo;
   std::vector<StringValue> MachineMetadataNodes;
+  std::vector<CalledGlobal> CalledGlobals;
   BlockStringValue Body;
 };
 
@@ -822,6 +844,9 @@ template <> struct MappingTraits<MachineFunction> {
     if (!YamlIO.outputting() || !MF.MachineMetadataNodes.empty())
       YamlIO.mapOptional("machineMetadataNodes", MF.MachineMetadataNodes,
                          std::vector<StringValue>());
+    if (!YamlIO.outputting() || !MF.CalledGlobals.empty())
+      YamlIO.mapOptional("calledGlobals", MF.CalledGlobals,
+                         std::vector<CalledGlobal>());
     YamlIO.mapOptional("body", MF.Body, BlockStringValue());
   }
 };
diff --git a/llvm/include/llvm/CodeGen/MachineFunction.h b/llvm/include/llvm/CodeGen/MachineFunction.h
index d696add8a1af5..282aee2a69c4d 100644
--- a/llvm/include/llvm/CodeGen/MachineFunction.h
+++ b/llvm/include/llvm/CodeGen/MachineFunction.h
@@ -354,6 +354,11 @@ class LLVM_ABI MachineFunction {
   /// a table of valid targets for Windows EHCont Guard.
   std::vector<MCSymbol *> CatchretTargets;
 
+  /// Mapping of call instruction to the global value and target flags that it
+  /// calls, if applicable.
+  DenseMap<const MachineInstr *, std::pair<const GlobalValue *, unsigned>>
+      CalledGlobalsMap;
+
   /// \name Exception Handling
   /// \{
+ auto getCalledGlobals() const { + return llvm::make_range(CalledGlobalsMap.begin(), CalledGlobalsMap.end()); + } + /// \name Exception Handling /// \{ diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index ff7caec41855f..b31ad11c3ee0e 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -293,6 +293,7 @@ class SelectionDAG { MDNode *HeapAllocSite = nullptr; MDNode *PCSections = nullptr; MDNode *MMRA = nullptr; + std::pair CalledGlobal{}; bool NoMerge = false; }; /// Out-of-line extra information for SDNodes. @@ -2373,6 +2374,19 @@ class SelectionDAG { auto It = SDEI.find(Node); return It != SDEI.end() ? It->second.MMRA : nullptr; } + /// Set CalledGlobal to be associated with Node. + void addCalledGlobal(const SDNode *Node, const GlobalValue *GV, + unsigned OpFlags) { + SDEI[Node].CalledGlobal = {GV, OpFlags}; + } + /// Return CalledGlobal associated with Node, or a nullopt if none exists. + std::optional> + getCalledGlobal(const SDNode *Node) { + auto I = SDEI.find(Node); + return I != SDEI.end() + ? std::make_optional(std::move(I->second).CalledGlobal) + : std::nullopt; + } /// Set NoMergeSiteInfo to be associated with Node if NoMerge is true. void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge) { if (NoMerge) diff --git a/llvm/include/llvm/MC/MCObjectFileInfo.h b/llvm/include/llvm/MC/MCObjectFileInfo.h index e2a2c84e47910..fb575fe721015 100644 --- a/llvm/include/llvm/MC/MCObjectFileInfo.h +++ b/llvm/include/llvm/MC/MCObjectFileInfo.h @@ -73,6 +73,10 @@ class MCObjectFileInfo { /// to emit them into. MCSection *CompactUnwindSection = nullptr; + /// If import call optimization is supported by the target, this is the + /// section to emit import call data to. + MCSection *ImportCallSection = nullptr; + // Dwarf sections for debug info. If a target supports debug info, these must // be set. MCSection *DwarfAbbrevSection = nullptr; @@ -269,6 +273,7 @@ class MCObjectFileInfo { MCSection *getBSSSection() const { return BSSSection; } MCSection *getReadOnlySection() const { return ReadOnlySection; } MCSection *getLSDASection() const { return LSDASection; } + MCSection *getImportCallSection() const { return ImportCallSection; } MCSection *getCompactUnwindSection() const { return CompactUnwindSection; } MCSection *getDwarfAbbrevSection() const { return DwarfAbbrevSection; } MCSection *getDwarfInfoSection() const { return DwarfInfoSection; } diff --git a/llvm/include/llvm/MC/MCStreamer.h b/llvm/include/llvm/MC/MCStreamer.h index 21da4dac4872b..558b14cebfd3d 100644 --- a/llvm/include/llvm/MC/MCStreamer.h +++ b/llvm/include/llvm/MC/MCStreamer.h @@ -569,6 +569,14 @@ class MCStreamer { /// \param Symbol - Symbol the image relative relocation should point to. virtual void emitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset); + /// Emits the physical number of the section containing the given symbol as + /// assigned during object writing (i.e., this is not a runtime relocation). + virtual void emitCOFFSecNumber(MCSymbol const *Symbol); + + /// Emits the offset of the symbol from the beginning of the section during + /// object writing (i.e., this is not a runtime relocation). + virtual void emitCOFFSecOffset(MCSymbol const *Symbol); + /// Emits an lcomm directive with XCOFF csect information. /// /// \param LabelSym - Label on the block of storage. 
diff --git a/llvm/include/llvm/MC/MCWinCOFFObjectWriter.h b/llvm/include/llvm/MC/MCWinCOFFObjectWriter.h
index a4ede61e45099..13d8c7d060c9e 100644
--- a/llvm/include/llvm/MC/MCWinCOFFObjectWriter.h
+++ b/llvm/include/llvm/MC/MCWinCOFFObjectWriter.h
@@ -72,6 +72,7 @@ class WinCOFFObjectWriter final : public MCObjectWriter {
                         const MCFixup &Fixup, MCValue Target,
                         uint64_t &FixedValue) override;
   uint64_t writeObject(MCAssembler &Asm) override;
+  int getSectionNumber(const MCSection &Section) const;
 };
 
 /// Construct a new Win COFF writer instance.
diff --git a/llvm/include/llvm/MC/MCWinCOFFStreamer.h b/llvm/include/llvm/MC/MCWinCOFFStreamer.h
index 5c39d80538944..2425abe51e6dd 100644
--- a/llvm/include/llvm/MC/MCWinCOFFStreamer.h
+++ b/llvm/include/llvm/MC/MCWinCOFFStreamer.h
@@ -58,6 +58,8 @@ class MCWinCOFFStreamer : public MCObjectStreamer {
   void emitCOFFSectionIndex(MCSymbol const *Symbol) override;
   void emitCOFFSecRel32(MCSymbol const *Symbol, uint64_t Offset) override;
   void emitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) override;
+  void emitCOFFSecNumber(MCSymbol const *Symbol) override;
+  void emitCOFFSecOffset(MCSymbol const *Symbol) override;
   void emitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
                         Align ByteAlignment) override;
   void emitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
index e2543f883f91c..de2fe925c2d5c 100644
--- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
@@ -158,6 +158,9 @@ class MIRParserImpl {
                                 MachineFunction &MF,
                                 const yaml::MachineFunction &YMF);
 
+  bool parseCalledGlobals(PerFunctionMIParsingState &PFS, MachineFunction &MF,
+                          const yaml::MachineFunction &YMF);
+
 private:
   bool parseMDNode(PerFunctionMIParsingState &PFS, MDNode *&Node,
                    const yaml::StringValue &Source);
@@ -183,6 +186,9 @@ class MIRParserImpl {
   void setupDebugValueTracking(MachineFunction &MF,
                                PerFunctionMIParsingState &PFS,
                                const yaml::MachineFunction &YamlMF);
+
+  bool parseMachineInst(MachineFunction &MF, yaml::MachineInstrLoc MILoc,
+                        MachineInstr const *&MI);
 };
 
 } // end namespace llvm
@@ -457,24 +463,34 @@ bool MIRParserImpl::computeFunctionProperties(
   return false;
 }
 
+bool MIRParserImpl::parseMachineInst(MachineFunction &MF,
+                                     yaml::MachineInstrLoc MILoc,
+                                     MachineInstr const *&MI) {
+  if (MILoc.BlockNum >= MF.size()) {
+    return error(Twine(MF.getName()) +
+                 Twine(" instruction block out of range.") +
+                 " Unable to reference bb:" + Twine(MILoc.BlockNum));
+  }
+  auto BB = std::next(MF.begin(), MILoc.BlockNum);
+  if (MILoc.Offset >= BB->size())
+    return error(
+        Twine(MF.getName()) + Twine(" instruction offset out of range.") +
+        " Unable to reference instruction at bb: " + Twine(MILoc.BlockNum) +
+        " at offset:" + Twine(MILoc.Offset));
+  MI = &*std::next(BB->instr_begin(), MILoc.Offset);
+  return false;
+}
+
 bool MIRParserImpl::initializeCallSiteInfo(
     PerFunctionMIParsingState &PFS, const yaml::MachineFunction &YamlMF) {
   MachineFunction &MF = PFS.MF;
   SMDiagnostic Error;
   const TargetMachine &TM = MF.getTarget();
   for (auto &YamlCSInfo : YamlMF.CallSitesInfo) {
-    yaml::CallSiteInfo::MachineInstrLoc MILoc = YamlCSInfo.CallLocation;
-    if (MILoc.BlockNum >= MF.size())
-      return error(Twine(MF.getName()) +
-                   Twine(" call instruction block out of range.") +
-                   " Unable to reference bb:" + Twine(MILoc.BlockNum));
-    auto CallB = std::next(MF.begin(), MILoc.BlockNum);
-    if (MILoc.Offset >= CallB->size())
-      return error(Twine(MF.getName()) +
-                   Twine(" call instruction offset out of range.") +
-                   " Unable to reference instruction at bb: " +
-                   Twine(MILoc.BlockNum) + " at offset:" + Twine(MILoc.Offset));
-    auto CallI = std::next(CallB->instr_begin(), MILoc.Offset);
+    yaml::MachineInstrLoc MILoc = YamlCSInfo.CallLocation;
+    const MachineInstr *CallI;
+    if (parseMachineInst(MF, MILoc, CallI))
+      return true;
 
     if (!CallI->isCall(MachineInstr::IgnoreBundle))
       return error(Twine(MF.getName()) +
                    Twine(" call site info should reference call "
@@ -641,6 +657,9 @@ MIRParserImpl::initializeMachineFunction(const yaml::MachineFunction &YamlMF,
   if (initializeCallSiteInfo(PFS, YamlMF))
     return true;
 
+  if (parseCalledGlobals(PFS, MF, YamlMF))
+    return true;
+
   setupDebugValueTracking(MF, PFS, YamlMF);
 
   MF.getSubtarget().mirFileLoaded(MF);
@@ -1111,6 +1130,37 @@ bool MIRParserImpl::parseMachineMetadataNodes(
   return false;
 }
 
+bool MIRParserImpl::parseCalledGlobals(PerFunctionMIParsingState &PFS,
+                                       MachineFunction &MF,
+                                       const yaml::MachineFunction &YMF) {
+  Function &F = MF.getFunction();
+  for (const auto &YamlCG : YMF.CalledGlobals) {
+    yaml::MachineInstrLoc MILoc = YamlCG.CallSite;
+    const MachineInstr *CallI;
+    if (parseMachineInst(MF, MILoc, CallI))
+      return true;
+    if (!CallI->isCall(MachineInstr::IgnoreBundle))
+      return error(Twine(MF.getName()) +
+                   Twine(" called global should reference call "
+                         "instruction. Instruction at bb:") +
+                   Twine(MILoc.BlockNum) + " at offset:" + Twine(MILoc.Offset) +
+                   " is not a call instruction");
+
+    auto Callee =
+        F.getParent()->getValueSymbolTable().lookup(YamlCG.Callee.Value);
+    if (!Callee)
+      return error(YamlCG.Callee.SourceRange.Start,
+                   "use of undefined global '" + YamlCG.Callee.Value + "'");
+    if (!isa<GlobalValue>(Callee))
+      return error(YamlCG.Callee.SourceRange.Start,
+                   "use of non-global value '" + YamlCG.Callee.Value + "'");
+
+    MF.addCalledGlobal(CallI, {cast<GlobalValue>(Callee), YamlCG.Flags});
+  }
+
+  return false;
+}
+
 SMDiagnostic MIRParserImpl::diagFromMIStringDiag(const SMDiagnostic &Error,
                                                  SMRange SourceRange) {
   assert(SourceRange.isValid() && "Invalid source range");
diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp
index c8f6341c1224d..b8e41cc789856 100644
--- a/llvm/lib/CodeGen/MIRPrinter.cpp
+++ b/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -133,6 +133,9 @@ class MIRPrinter {
   void convertMachineMetadataNodes(yaml::MachineFunction &YMF,
                                    const MachineFunction &MF,
                                    MachineModuleSlotTracker &MST);
+  void convertCalledGlobals(yaml::MachineFunction &YMF,
+                            const MachineFunction &MF,
+                            MachineModuleSlotTracker &MST);
 
 private:
   void initRegisterMaskIds(const MachineFunction &MF);
@@ -269,6 +272,8 @@ void MIRPrinter::print(const MachineFunction &MF) {
   // function.
   convertMachineMetadataNodes(YamlMF, MF, MST);
 
+  convertCalledGlobals(YamlMF, MF, MST);
+
   yaml::Output Out(OS);
   if (!SimplifyMIR)
     Out.setWriteDefaultValues(true);
@@ -555,7 +560,7 @@ void MIRPrinter::convertCallSiteObjects(yaml::MachineFunction &YMF,
   const auto *TRI = MF.getSubtarget().getRegisterInfo();
   for (auto CSInfo : MF.getCallSitesInfo()) {
     yaml::CallSiteInfo YmlCS;
-    yaml::CallSiteInfo::MachineInstrLoc CallLocation;
+    yaml::MachineInstrLoc CallLocation;
 
     // Prepare instruction position.
     MachineBasicBlock::const_instr_iterator CallI =
         CSInfo.first->getIterator();
@@ -596,6 +601,32 @@ void MIRPrinter::convertMachineMetadataNodes(yaml::MachineFunction &YMF,
   }
 }
 
+void MIRPrinter::convertCalledGlobals(yaml::MachineFunction &YMF,
+                                      const MachineFunction &MF,
+                                      MachineModuleSlotTracker &MST) {
+  for (const auto [CallInst, CG] : MF.getCalledGlobals()) {
+    // If the call instruction was dropped, then we don't need to print it.
+    auto BB = CallInst->getParent();
+    if (BB) {
+      yaml::MachineInstrLoc CallSite;
+      CallSite.BlockNum = CallInst->getParent()->getNumber();
+      CallSite.Offset = std::distance(CallInst->getParent()->instr_begin(),
+                                      CallInst->getIterator());
+
+      yaml::CalledGlobal YamlCG{CallSite, CG.first->getName().str(), CG.second};
+      YMF.CalledGlobals.push_back(YamlCG);
+    }
+  }
+
+  // Sort by position of call instructions.
+  llvm::sort(YMF.CalledGlobals.begin(), YMF.CalledGlobals.end(),
+             [](yaml::CalledGlobal A, yaml::CalledGlobal B) {
+               if (A.CallSite.BlockNum == B.CallSite.BlockNum)
+                 return A.CallSite.Offset < B.CallSite.Offset;
+               return A.CallSite.BlockNum < B.CallSite.BlockNum;
+             });
+}
+
 void MIRPrinter::convert(yaml::MachineFunction &MF,
                          const MachineConstantPool &ConstantPool) {
   unsigned ID = 0;
diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index dff7243b0a99c..bafe26ff7d6b7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -908,6 +908,10 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
       It->setMMRAMetadata(MF, MMRA);
     }
 
+    if (auto CalledGlobal = DAG->getCalledGlobal(Node))
+      if (CalledGlobal->first)
+        MF.addCalledGlobal(MI, *CalledGlobal);
+
     return MI;
   };
 
diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp
index 01fe11ed20501..dd8058c6d5cd8 100644
--- a/llvm/lib/MC/MCAsmStreamer.cpp
+++ b/llvm/lib/MC/MCAsmStreamer.cpp
@@ -209,6 +209,8 @@ class MCAsmStreamer final : public MCStreamer {
   void emitCOFFSectionIndex(MCSymbol const *Symbol) override;
   void emitCOFFSecRel32(MCSymbol const *Symbol, uint64_t Offset) override;
   void emitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) override;
+  void emitCOFFSecNumber(MCSymbol const *Symbol) override;
+  void emitCOFFSecOffset(MCSymbol const *Symbol) override;
   void emitXCOFFLocalCommonSymbol(MCSymbol *LabelSym, uint64_t Size,
                                   MCSymbol *CsectSym, Align Alignment) override;
   void emitXCOFFSymbolLinkageWithVisibility(MCSymbol *Symbol,
@@ -893,6 +895,18 @@ void MCAsmStreamer::emitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) {
   EmitEOL();
 }
 
+void MCAsmStreamer::emitCOFFSecNumber(MCSymbol const *Symbol) {
+  OS << "\t.secnum\t";
+  Symbol->print(OS, MAI);
+  EmitEOL();
+}
+
+void MCAsmStreamer::emitCOFFSecOffset(MCSymbol const *Symbol) {
+  OS << "\t.secoffset\t";
+  Symbol->print(OS, MAI);
+  EmitEOL();
+}
+
 // We need an XCOFF-specific version of this directive as the AIX syntax
 // requires a QualName argument identifying the csect name and storage mapping
 // class to appear before the alignment if we are specifying it.
diff --git a/llvm/lib/MC/MCObjectFileInfo.cpp b/llvm/lib/MC/MCObjectFileInfo.cpp
index f37e138edc36b..150e38a94db6a 100644
--- a/llvm/lib/MC/MCObjectFileInfo.cpp
+++ b/llvm/lib/MC/MCObjectFileInfo.cpp
@@ -596,6 +596,11 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) {
                             COFF::IMAGE_SCN_MEM_READ);
   }
 
+  if (T.getArch() == Triple::aarch64) {
+    ImportCallSection =
+        Ctx->getCOFFSection(".impcall", COFF::IMAGE_SCN_LNK_INFO);
+  }
+
   // Debug info.
   COFFDebugSymbolsSection =
       Ctx->getCOFFSection(".debug$S", (COFF::IMAGE_SCN_MEM_DISCARDABLE |
diff --git a/llvm/lib/MC/MCParser/COFFAsmParser.cpp b/llvm/lib/MC/MCParser/COFFAsmParser.cpp
index 4d95a72085283..dd5ce9964a194 100644
--- a/llvm/lib/MC/MCParser/COFFAsmParser.cpp
+++ b/llvm/lib/MC/MCParser/COFFAsmParser.cpp
@@ -70,6 +70,8 @@ class COFFAsmParser : public MCAsmParserExtension {
     addDirectiveHandler<&COFFAsmParser::parseDirectiveSymbolAttribute>(
         ".weak_anti_dep");
     addDirectiveHandler<&COFFAsmParser::parseDirectiveCGProfile>(".cg_profile");
+    addDirectiveHandler<&COFFAsmParser::parseDirectiveSecNum>(".secnum");
+    addDirectiveHandler<&COFFAsmParser::parseDirectiveSecOffset>(".secoffset");
 
     // Win64 EH directives.
     addDirectiveHandler<&COFFAsmParser::parseSEHDirectiveStartProc>(
@@ -126,6 +128,8 @@ class COFFAsmParser : public MCAsmParserExtension {
   bool parseDirectiveLinkOnce(StringRef, SMLoc);
   bool parseDirectiveRVA(StringRef, SMLoc);
   bool parseDirectiveCGProfile(StringRef, SMLoc);
+  bool parseDirectiveSecNum(StringRef, SMLoc);
+  bool parseDirectiveSecOffset(StringRef, SMLoc);
 
   // Win64 EH directives.
   bool parseSEHDirectiveStartProc(StringRef, SMLoc);
@@ -577,6 +581,36 @@ bool COFFAsmParser::parseDirectiveSymIdx(StringRef, SMLoc) {
   return false;
 }
 
+bool COFFAsmParser::parseDirectiveSecNum(StringRef, SMLoc) {
+  StringRef SymbolID;
+  if (getParser().parseIdentifier(SymbolID))
+    return TokError("expected identifier in directive");
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  MCSymbol *Symbol = getContext().getOrCreateSymbol(SymbolID);
+
+  Lex();
+  getStreamer().emitCOFFSecNumber(Symbol);
+  return false;
+}
+
+bool COFFAsmParser::parseDirectiveSecOffset(StringRef, SMLoc) {
+  StringRef SymbolID;
+  if (getParser().parseIdentifier(SymbolID))
+    return TokError("expected identifier in directive");
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  MCSymbol *Symbol = getContext().getOrCreateSymbol(SymbolID);
+
+  Lex();
+  getStreamer().emitCOFFSecOffset(Symbol);
+  return false;
+}
+
 /// ::= [ identifier ]
 bool COFFAsmParser::parseCOMDATType(COFF::COMDATType &Type) {
   StringRef TypeId = getTok().getIdentifier();
diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp
index ccf65df150e78..e690723c0e502 100644
--- a/llvm/lib/MC/MCStreamer.cpp
+++ b/llvm/lib/MC/MCStreamer.cpp
@@ -1023,6 +1023,10 @@ void MCStreamer::emitCOFFSecRel32(MCSymbol const *Symbol, uint64_t Offset) {}
 
 void MCStreamer::emitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) {}
 
+void MCStreamer::emitCOFFSecNumber(MCSymbol const *Symbol) {}
+
+void MCStreamer::emitCOFFSecOffset(MCSymbol const *Symbol) {}
+
 /// EmitRawText - If this file is backed by an assembly streamer, this dumps
 /// the specified string in the output .s file. This capability is
 /// indicated by the hasRawTextSupport() predicate.
diff --git a/llvm/lib/MC/MCWinCOFFStreamer.cpp b/llvm/lib/MC/MCWinCOFFStreamer.cpp
index 395d4db3103d7..8fd46bc8b0255 100644
--- a/llvm/lib/MC/MCWinCOFFStreamer.cpp
+++ b/llvm/lib/MC/MCWinCOFFStreamer.cpp
@@ -29,6 +29,7 @@
 #include "llvm/MC/MCSectionCOFF.h"
 #include "llvm/MC/MCSymbolCOFF.h"
 #include "llvm/MC/MCTargetOptions.h"
+#include "llvm/MC/MCValue.h"
 #include "llvm/MC/MCWinCOFFObjectWriter.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -43,6 +44,91 @@ using namespace llvm;
 
 #define DEBUG_TYPE "WinCOFFStreamer"
 
+/// MCExpr that represents the physical number for the section that contains
+/// a symbol.
+class MCCOFFSectionNumberTargetExpr final : public MCTargetExpr {
+  const MCSymbol &SectionSymbol;
+  const WinCOFFObjectWriter &Writer;
+
+  MCCOFFSectionNumberTargetExpr(const MCSymbol &SectionSymbol_,
+                                const WinCOFFObjectWriter &Writer_)
+      : SectionSymbol(SectionSymbol_), Writer(Writer_) {}
+
+public:
+  static MCCOFFSectionNumberTargetExpr *
+  create(const MCSymbol &SectionSymbol, const WinCOFFObjectWriter &Writer,
+         MCContext &Ctx) {
+    return new (Ctx) MCCOFFSectionNumberTargetExpr(SectionSymbol, Writer);
+  }
+
+  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override {
+    OS << ":secnum:";
+    SectionSymbol.print(OS, MAI);
+  }
+
+  bool evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm,
+                                 const MCFixup *Fixup) const override {
+    auto sectionNumber = Writer.getSectionNumber(SectionSymbol.getSection());
+    assert(sectionNumber != 0 &&
+           "Containing section was not assigned a number");
+    Res = MCValue::get(sectionNumber);
+    return true;
+  }
+
+  void visitUsedExpr(MCStreamer &Streamer) const override {
+    // Contains no sub-expressions.
+  }
+
+  MCFragment *findAssociatedFragment() const override {
+    return SectionSymbol.getFragment();
+  }
+
+  void fixELFSymbolsInTLSFixups(MCAssembler &) const override {
+    llvm_unreachable("Not supported for ELF");
+  }
+};
+
+/// MCExpr that represents the offset to a symbol from the beginning of its
+/// section.
+class MCCOFFSectionOffsetTargetExpr final : public MCTargetExpr {
+  const MCSymbol &Symbol;
+
+  MCCOFFSectionOffsetTargetExpr(const MCSymbol &Symbol_) : Symbol(Symbol_) {}
+
+public:
+  static MCCOFFSectionOffsetTargetExpr *create(const MCSymbol &Symbol,
+                                               MCContext &Ctx) {
+    return new (Ctx) MCCOFFSectionOffsetTargetExpr(Symbol);
+  }
+
+  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override {
+    OS << ":secoffset:";
+    Symbol.print(OS, MAI);
+  }
+
+  bool evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm,
+                                 const MCFixup *Fixup) const override {
+    uint64_t CallsiteOffset = 0;
+    if (!Asm->getSymbolOffset(Symbol, CallsiteOffset)) {
+      return true;
+    }
+    Res = MCValue::get(CallsiteOffset);
+    return true;
+  }
+
+  void visitUsedExpr(MCStreamer &Streamer) const override {
+    // Contains no sub-expressions.
+  }
+
+  MCFragment *findAssociatedFragment() const override {
+    return Symbol.getFragment();
+  }
+
+  void fixELFSymbolsInTLSFixups(MCAssembler &) const override {
+    llvm_unreachable("Not supported for ELF");
+  }
+};
+
 MCWinCOFFStreamer::MCWinCOFFStreamer(MCContext &Context,
                                      std::unique_ptr<MCAsmBackend> MAB,
                                      std::unique_ptr<MCCodeEmitter> CE,
@@ -280,6 +366,34 @@ void MCWinCOFFStreamer::emitCOFFImgRel32(const MCSymbol *Symbol,
   DF->appendContents(4, 0);
 }
 
+void MCWinCOFFStreamer::emitCOFFSecNumber(MCSymbol const *Symbol) {
+  visitUsedSymbol(*Symbol);
+  MCDataFragment *DF = getOrCreateDataFragment();
+  // Create Symbol for section number.
+  const MCExpr *MCE = MCCOFFSectionNumberTargetExpr::create(
+      *Symbol, this->getWriter(), getContext());
+  // Build the relocation.
+  MCFixup Fixup = MCFixup::create(DF->getContents().size(), MCE, FK_Data_4);
+  // Record the relocation.
+  DF->getFixups().push_back(Fixup);
+  // Emit 4 bytes (zeros) to the object file.
+  DF->appendContents(4, 0);
+}
+
+void MCWinCOFFStreamer::emitCOFFSecOffset(MCSymbol const *Symbol) {
+  visitUsedSymbol(*Symbol);
+  MCDataFragment *DF = getOrCreateDataFragment();
+  // Create Symbol for section offset.
+  const MCExpr *MCE =
+      MCCOFFSectionOffsetTargetExpr::create(*Symbol, getContext());
+  // Build the relocation.
+  MCFixup Fixup = MCFixup::create(DF->getContents().size(), MCE, FK_Data_4);
+  // Record the relocation.
+  DF->getFixups().push_back(Fixup);
+  // Emit 4 bytes (zeros) to the object file.
+  DF->appendContents(4, 0);
+}
+
 void MCWinCOFFStreamer::emitCommonSymbol(MCSymbol *S, uint64_t Size,
                                          Align ByteAlignment) {
   auto *Symbol = cast<MCSymbolCOFF>(S);
diff --git a/llvm/lib/MC/WinCOFFObjectWriter.cpp b/llvm/lib/MC/WinCOFFObjectWriter.cpp
index 09d2b08e43050..39e02d0522bcf 100644
--- a/llvm/lib/MC/WinCOFFObjectWriter.cpp
+++ b/llvm/lib/MC/WinCOFFObjectWriter.cpp
@@ -163,6 +163,7 @@ class llvm::WinCOFFWriter {
                         const MCFixup &Fixup, MCValue Target,
                         uint64_t &FixedValue);
   uint64_t writeObject(MCAssembler &Asm);
+  int getSectionNumber(const MCSection &Section) const;
 
 private:
   COFFSymbol *createSymbol(StringRef Name);
@@ -818,6 +819,15 @@ void WinCOFFWriter::executePostLayoutBinding(MCAssembler &Asm) {
     if (!Symbol.isTemporary() ||
         cast<MCSymbolCOFF>(Symbol).getClass() == COFF::IMAGE_SYM_CLASS_STATIC)
       defineSymbol(Asm, Symbol);
+
+  UseBigObj = Sections.size() > COFF::MaxNumberOfSections16;
+  Header.NumberOfSections = Sections.size();
+  Header.NumberOfSymbols = 0;
+  if (Sections.size() > INT32_MAX)
+    report_fatal_error(
+        "PE COFF object files can't have more than 2147483647 sections");
+
+  assignSectionNumbers();
 }
 
 void WinCOFFWriter::recordRelocation(MCAssembler &Asm,
@@ -980,16 +990,7 @@ static std::time_t getTime() {
 uint64_t WinCOFFWriter::writeObject(MCAssembler &Asm) {
   uint64_t StartOffset = W.OS.tell();
 
-  if (Sections.size() > INT32_MAX)
-    report_fatal_error(
-        "PE COFF object files can't have more than 2147483647 sections");
-
-  UseBigObj = Sections.size() > COFF::MaxNumberOfSections16;
-  Header.NumberOfSections = Sections.size();
-  Header.NumberOfSymbols = 0;
-
   setWeakDefaultNames();
-  assignSectionNumbers();
   if (Mode != DwoOnly)
     createFileSymbols(Asm);
@@ -1143,6 +1144,10 @@ uint64_t WinCOFFWriter::writeObject(MCAssembler &Asm) {
   return W.OS.tell() - StartOffset;
 }
 
+int WinCOFFWriter::getSectionNumber(const MCSection &Section) const {
+  return SectionMap.at(&Section)->Number;
+}
+
 //------------------------------------------------------------------------------
 // WinCOFFObjectWriter class implementation
 
@@ -1194,6 +1199,10 @@ uint64_t WinCOFFObjectWriter::writeObject(MCAssembler &Asm) {
   return TotalSize;
 }
 
+int WinCOFFObjectWriter::getSectionNumber(const MCSection &Section) const {
+  return ObjWriter->getSectionNumber(Section);
+}
+
 MCWinCOFFObjectTargetWriter::MCWinCOFFObjectTargetWriter(unsigned Machine_)
     : Machine(Machine_) {}
 
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 9d9d9889b3858..27e65d60122fd 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -24,6 +24,7 @@
 #include "MCTargetDesc/AArch64TargetStreamer.h"
 #include "TargetInfo/AArch64TargetInfo.h"
 #include "Utils/AArch64BaseInfo.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
@@ -77,6 +78,11 @@ static cl::opt<PtrauthCheckMode> PtrauthAuthChecks(
     cl::desc("Check pointer authentication auth/resign failures"),
     cl::init(Default));
 
+static cl::opt<bool> EnableImportCallOptimization(
+    "aarch64-win-import-call-optimization", cl::Hidden,
+    cl::desc("Enable import call optimization for AArch64 Windows"),
+    cl::init(false));
+
 #define DEBUG_TYPE "asm-printer"
 
 namespace {
@@ -89,6 +95,8 @@ class AArch64AsmPrinter : public AsmPrinter {
 #ifndef NDEBUG
   unsigned InstsEmitted;
 #endif
+  DenseMap<MCSection *, std::vector<std::pair<MCSymbol *, MCSymbol *>>>
+      SectionToImportedFunctionCalls;
 
 public:
   AArch64AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
@@ -293,6 +301,11 @@ class AArch64AsmPrinter : public AsmPrinter {
                                   MCSymbol *LazyPointer) override;
   void emitMachOIFuncStubHelperBody(Module &M, const GlobalIFunc &GI,
                                     MCSymbol *LazyPointer) override;
+
+  /// Checks if this instruction is part of a sequence that is eligible for
+  /// import call optimization and, if so, records it to be emitted in the
+  /// import call section.
+  void recordIfImportCall(const MachineInstr *BranchInst);
 };
 
 } // end anonymous namespace
@@ -930,6 +943,38 @@ void AArch64AsmPrinter::emitEndOfAsmFile(Module &M) {
   // Emit stack and fault map information.
   FM.serializeToFaultMapSection();
 
+  // If import call optimization is enabled, emit the appropriate section.
+  // We do this whether or not we recorded any import calls.
+  if (EnableImportCallOptimization && TT.isOSBinFormatCOFF()) {
+    OutStreamer->switchSection(getObjFileLowering().getImportCallSection());
+
+    // Section always starts with some magic.
+    constexpr char ImpCallMagic[12] = "Imp_Call_V1";
+    OutStreamer->emitBytes(StringRef{ImpCallMagic, sizeof(ImpCallMagic)});
+
+    // Layout of this section is:
+    // Per section that contains calls to imported functions:
+    //  uint32_t SectionSize: Size in bytes for information in this section.
+    //  uint32_t Section Number
+    //  Per call to imported function in section:
+    //   uint32_t Kind: the kind of imported function.
+    //   uint32_t BranchOffset: the offset of the branch instruction in its
+    //                          parent section.
+    //   uint32_t TargetSymbolId: the symbol id of the called function.
+    for (auto &[Section, CallsToImportedFuncs] :
+         SectionToImportedFunctionCalls) {
+      unsigned SectionSize =
+          sizeof(uint32_t) * (2 + 3 * CallsToImportedFuncs.size());
+      OutStreamer->emitInt32(SectionSize);
+      OutStreamer->emitCOFFSecNumber(Section->getBeginSymbol());
+      for (auto &[CallsiteSymbol, CalledSymbol] : CallsToImportedFuncs) {
+        // Kind is always IMAGE_REL_ARM64_DYNAMIC_IMPORT_CALL (0x13).
+ OutStreamer->emitInt32(0x13); + OutStreamer->emitCOFFSecOffset(CallsiteSymbol); + OutStreamer->emitCOFFSymbolIndex(CalledSymbol); + } + } + } } void AArch64AsmPrinter::emitLOHs() { @@ -2703,6 +2748,7 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { case AArch64::TCRETURNriALL: { emitPtrauthTailCallHardening(MI); + recordIfImportCall(MI); MCInst TmpInst; TmpInst.setOpcode(AArch64::BR); TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg())); @@ -2714,6 +2760,7 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { MCOperand Dest; MCInstLowering.lowerOperand(MI->getOperand(0), Dest); + recordIfImportCall(MI); MCInst TmpInst; TmpInst.setOpcode(AArch64::B); TmpInst.addOperand(Dest); @@ -3044,6 +3091,14 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { TS->emitARM64WinCFISaveAnyRegQPX(MI->getOperand(0).getImm(), -MI->getOperand(2).getImm()); return; + + case AArch64::BLR: + case AArch64::BR: + recordIfImportCall(MI); + MCInst TmpInst; + MCInstLowering.Lower(MI, TmpInst); + EmitToStreamer(*OutStreamer, TmpInst); + return; } // Finally, do the automated lowerings for everything else. @@ -3052,6 +3107,23 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { EmitToStreamer(*OutStreamer, TmpInst); } +void AArch64AsmPrinter::recordIfImportCall( + const llvm::MachineInstr *BranchInst) { + if (!EnableImportCallOptimization || + !TM.getTargetTriple().isOSBinFormatCOFF()) + return; + + auto [GV, OpFlags] = BranchInst->getMF()->tryGetCalledGlobal(BranchInst); + if (GV && GV->hasDLLImportStorageClass()) { + auto *CallSiteSymbol = MMI->getContext().createNamedTempSymbol("impcall"); + OutStreamer->emitLabel(CallSiteSymbol); + + auto *CalledSymbol = MCInstLowering.GetGlobalValueSymbol(GV, OpFlags); + SectionToImportedFunctionCalls[OutStreamer->getCurrentSectionOnly()] + .push_back({CallSiteSymbol, CalledSymbol}); + } +} + void AArch64AsmPrinter::emitMachOIFuncStubBody(Module &M, const GlobalIFunc &GI, MCSymbol *LazyPointer) { // _ifunc: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 469c6d7caf458..d9877fef1437c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -9450,12 +9450,14 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol // node so that legalize doesn't hack it. 
+  const GlobalValue *CalledGlobal = nullptr;
+  unsigned OpFlags = 0;
   if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
-    auto GV = G->getGlobal();
-    unsigned OpFlags =
-        Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine());
+    CalledGlobal = G->getGlobal();
+    OpFlags = Subtarget->classifyGlobalFunctionReference(CalledGlobal,
+                                                         getTargetMachine());
     if (OpFlags & AArch64II::MO_GOT) {
-      Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
+      Callee = DAG.getTargetGlobalAddress(CalledGlobal, DL, PtrVT, 0, OpFlags);
       Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
     } else {
       const GlobalValue *GV = G->getGlobal();
@@ -9575,6 +9577,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
   DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
   DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
+  if (CalledGlobal)
+    DAG.addCalledGlobal(Ret.getNode(), CalledGlobal, OpFlags);
   return Ret;
 }
 
@@ -9586,6 +9590,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
   DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
   InGlue = Chain.getValue(1);
   DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
+  if (CalledGlobal)
+    DAG.addCalledGlobal(Chain.getNode(), CalledGlobal, OpFlags);
 
   uint64_t CalleePopBytes =
       DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
diff --git a/llvm/test/CodeGen/AArch64/win-import-call-optimization-nocalls.ll b/llvm/test/CodeGen/AArch64/win-import-call-optimization-nocalls.ll
new file mode 100644
index 0000000000000..81d6d6369dcbf
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/win-import-call-optimization-nocalls.ll
@@ -0,0 +1,18 @@
+; RUN: llc -mtriple=aarch64-pc-windows-msvc -aarch64-win-import-call-optimization < %s | FileCheck %s
+
+define dso_local void @normal_call() local_unnamed_addr {
+entry:
+  call void @a()
+  ret void
+}
+; CHECK-LABEL: normal_call:
+; CHECK: bl a
+
+declare void @a() local_unnamed_addr
+
+; Even if there are no calls to imported functions, we still need to emit the
+; .impcall section.
+
+; CHECK-LABEL: .section .impcall,"yi"
+; CHECK-NEXT: .asciz "Imp_Call_V1"
+; CHECK-NOT: .secnum
diff --git a/llvm/test/CodeGen/AArch64/win-import-call-optimization.ll b/llvm/test/CodeGen/AArch64/win-import-call-optimization.ll
new file mode 100644
index 0000000000000..6bb118ba1e159
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/win-import-call-optimization.ll
@@ -0,0 +1,48 @@
+; RUN: llc -mtriple=aarch64-pc-windows-msvc -aarch64-win-import-call-optimization < %s | FileCheck %s --check-prefix=CHECK-ENABLED
+; RUN: llc -mtriple=aarch64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK-DISABLED
+
+; CHECK-DISABLED-NOT: .section .impcall
+
+define dso_local void @normal_call() local_unnamed_addr section "nc_sect" {
+entry:
+  call void @a()
+  call void @a()
+  ret void
+}
+; CHECK-ENABLED-LABEL: normal_call:
+; CHECK-ENABLED: adrp [[ADRPREG:x[0-9]+]], __imp_a
+; CHECK-ENABLED-NEXT: ldr [[LDRREG:x[0-9]+]], [[[ADRPREG]], :lo12:__imp_a]
+; CHECK-ENABLED-NEXT: .Limpcall0:
+; CHECK-ENABLED-NEXT: blr [[LDRREG]]
+; CHECK-ENABLED-NEXT: .Limpcall1:
+; CHECK-ENABLED-NEXT: blr [[LDRREG]]
+
+define dso_local void @tail_call() local_unnamed_addr section "tc_sect" {
+entry:
+  tail call void @b()
+  ret void
+}
+; CHECK-ENABLED-LABEL: tail_call:
+; CHECK-ENABLED: adrp [[ADRPREG:x[0-9]+]], __imp_b
+; CHECK-ENABLED-NEXT: ldr [[LDRREG:x[0-9]+]], [[[ADRPREG]], :lo12:__imp_b]
+; CHECK-ENABLED-NEXT: .Limpcall2:
+; CHECK-ENABLED-NEXT: br [[LDRREG]]
+
+declare dllimport void @a() local_unnamed_addr
+declare dllimport void @b() local_unnamed_addr
+
+; CHECK-ENABLED-LABEL: .section .impcall,"yi"
+; CHECK-ENABLED-NEXT: .asciz "Imp_Call_V1"
+; CHECK-ENABLED-NEXT: .word 32
+; CHECK-ENABLED-NEXT: .secnum nc_sect
+; CHECK-ENABLED-NEXT: .word 19
+; CHECK-ENABLED-NEXT: .secoffset .Limpcall0
+; CHECK-ENABLED-NEXT: .symidx __imp_a
+; CHECK-ENABLED-NEXT: .word 19
+; CHECK-ENABLED-NEXT: .secoffset .Limpcall1
+; CHECK-ENABLED-NEXT: .symidx __imp_a
+; CHECK-ENABLED-NEXT: .word 20
+; CHECK-ENABLED-NEXT: .secnum tc_sect
+; CHECK-ENABLED-NEXT: .word 19
+; CHECK-ENABLED-NEXT: .secoffset .Limpcall2
+; CHECK-ENABLED-NEXT: .symidx __imp_b
diff --git a/llvm/test/CodeGen/MIR/AArch64/called-globals.mir b/llvm/test/CodeGen/MIR/AArch64/called-globals.mir
new file mode 100644
index 0000000000000..cf0f0a23e2d91
--- /dev/null
+++ b/llvm/test/CodeGen/MIR/AArch64/called-globals.mir
@@ -0,0 +1,61 @@
+# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass none -o - %s | FileCheck %s
+
+--- |
+  declare dllimport void @callee_func() local_unnamed_addr
+
+  define dso_local void @caller() local_unnamed_addr {
+  entry:
+    call void @callee_func()
+    call void @callee_func()
+    ret void
+  }
+...
+--- +name: caller +stack: + - { id: 0, name: '', type: spill-slot, offset: -8, size: 8, alignment: 8, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -16, size: 8, alignment: 8, + stack-id: default, callee-saved-register: '$x19', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +calledGlobals: + - bb: 0 + offset: 7 + callee: callee_func + flags: 144 + - bb: 0 + offset: 8 + callee: callee_func + flags: 144 +body: | + bb.0.entry: + liveins: $x19, $lr + + early-clobber $sp = frame-setup STRXpre killed $x19, $sp, -16 :: (store (s64) into %stack.1) + frame-setup SEH_SaveReg_X 19, -16 + frame-setup STRXui killed $lr, $sp, 1 :: (store (s64) into %stack.0) + frame-setup SEH_SaveReg 30, 8 + frame-setup SEH_PrologEnd + $x19 = ADRP target-flags(aarch64-page, aarch64-got, aarch64-dllimport) @callee_func + renamable $x19 = LDRXui killed $x19, target-flags(aarch64-pageoff, aarch64-got, aarch64-nc, aarch64-dllimport) @callee_func + BLR renamable $x19, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + BLR killed renamable $x19, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + frame-destroy SEH_EpilogStart + $lr = frame-destroy LDRXui $sp, 1 :: (load (s64) from %stack.0) + frame-destroy SEH_SaveReg 30, 8 + early-clobber $sp, $x19 = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.1) + frame-destroy SEH_SaveReg_X 19, -16 + frame-destroy SEH_EpilogEnd + RET undef $lr +... + +# CHECK-LABEL: calledGlobals: +# CHECK-NEXT: - bb: 0 +# CHECK-NEXT: offset: 7 +# CHECK-NEXT: callee: callee_func +# CHECK-NEXT: flags: 144 +# CHECK-NEXT: - bb: 0 +# CHECK-NEXT: offset: 8 +# CHECK-NEXT: callee: callee_func +# CHECK-NEXT: flags: 144 diff --git a/llvm/test/CodeGen/MIR/X86/call-site-info-error1.mir b/llvm/test/CodeGen/MIR/X86/call-site-info-error1.mir index 096a80f77dbb6..e4dab779216a8 100644 --- a/llvm/test/CodeGen/MIR/X86/call-site-info-error1.mir +++ b/llvm/test/CodeGen/MIR/X86/call-site-info-error1.mir @@ -1,5 +1,5 @@ # RUN: not llc -mtriple=x86_64-- -run-pass none -debug-entry-values %s -o - 2>&1 | FileCheck %s -# CHECK: baa call instruction block out of range. Unable to reference bb:1 +# CHECK: baa instruction block out of range. Unable to reference bb:1 --- | define dso_local i32 @baa(i32 %a) local_unnamed_addr { entry: diff --git a/llvm/test/CodeGen/MIR/X86/call-site-info-error2.mir b/llvm/test/CodeGen/MIR/X86/call-site-info-error2.mir index bd5b2451a8d76..183610b326eeb 100644 --- a/llvm/test/CodeGen/MIR/X86/call-site-info-error2.mir +++ b/llvm/test/CodeGen/MIR/X86/call-site-info-error2.mir @@ -1,5 +1,5 @@ # RUN: not llc -mtriple=x86_64-- -run-pass none -debug-entry-values %s -o - 2>&1 | FileCheck %s -# CHECK: baa call instruction offset out of range. Unable to reference instruction at bb: 0 at offset:1 +# CHECK: baa instruction offset out of range. 
Unable to reference instruction at bb: 0 at offset:1 --- | define dso_local i32 @baa(i32 %a) local_unnamed_addr { entry: diff --git a/llvm/test/MC/AArch64/win-import-call-optimization.s b/llvm/test/MC/AArch64/win-import-call-optimization.s new file mode 100644 index 0000000000000..f26e17b9b62cc --- /dev/null +++ b/llvm/test/MC/AArch64/win-import-call-optimization.s @@ -0,0 +1,72 @@ +// RUN: llvm-mc -triple aarch64-windows-msvc -filetype obj -o %t.obj %s +// RUN: llvm-readobj --sections --sd --relocs %t.obj | FileCheck %s + +.section nc_sect,"xr" +normal_call: + str x30, [sp, #-16]! // 8-byte Folded Spill + adrp x8, __imp_a + ldr x8, [x8, :lo12:__imp_a] +.Limpcall0: + blr x8 + ldr x30, [sp], #16 // 8-byte Folded Reload + ret + +.section tc_sect,"xr" +tail_call: + adrp x8, __imp_b + ldr x8, [x8, :lo12:__imp_b] +.Limpcall1: + br x8 + +.section .impcall,"yi" +.asciz "Imp_Call_V1" +.word 20 +.secnum nc_sect +.word 19 +.secoffset .Limpcall0 +.symidx __imp_a +.word 20 +.secnum tc_sect +.word 19 +.secoffset .Limpcall1 +.symidx __imp_b + +// CHECK-LABEL: Name: .impcall (2E 69 6D 70 63 61 6C 6C) +// CHECK-NEXT: VirtualSize: 0x0 +// CHECK-NEXT: VirtualAddress: 0x0 +// CHECK-NEXT: RawDataSize: 52 +// CHECK-NEXT: PointerToRawData: 0x150 +// CHECK-NEXT: PointerToRelocations: 0x0 +// CHECK-NEXT: PointerToLineNumbers: 0x0 +// CHECK-NEXT: RelocationCount: 0 +// CHECK-NEXT: LineNumberCount: 0 +// CHECK-NEXT: Characteristics [ +// CHECK-NEXT: IMAGE_SCN_ALIGN_4BYTES +// CHECK-NEXT: IMAGE_SCN_LNK_INFO +// CHECK-NEXT: ] +// CHECK-NEXT: SectionData ( +// CHECK-NEXT: 0000: 496D705F 43616C6C 5F563100 14000000 |Imp_Call_V1.....| +// CHECK-NEXT: 0010: +// CHECK-SAME: [[#%.2X,NCSECT:]]000000 +// CHECK-SAME: 13000000 +// CHECK-SAME: [[#%.2X,NCOFFSET:]]000000 +// CHECK-SAME: [[#%.2X,NCSYM:]]000000 +// CHECK-NEXT: 0020: +// CHECK-SAME: 14000000 +// CHECK-SAME: [[#%.2X,TCSECT:]]000000 +// CHECK-SAME: 13000000 +// CHECK-SAME: [[#%.2X,TCOFFSET:]]000000 +// CHECK-NEXT: 0030: +// CHECK-SAME: [[#%.2X,TCSYM:]]000000 +// CHECK-NEXT: ) + +// CHECK-LABEL: Relocations [ +// CHECK-NEXT: Section ([[#%u,NCSECT]]) nc_sect { +// CHECK-NEXT: 0x[[#%x,NCOFFSET - 8]] IMAGE_REL_ARM64_PAGEBASE_REL21 __imp_a ([[#%u,NCSYM]]) +// CHECK-NEXT: 0x[[#%x,NCOFFSET - 4]] IMAGE_REL_ARM64_PAGEOFFSET_12L __imp_a ([[#%u,NCSYM]]) +// CHECK-NEXT: } +// CHECK-NEXT: Section ([[#%u,TCSECT]]) tc_sect { +// CHECK-NEXT: 0x[[#%x,TCOFFSET - 8]] IMAGE_REL_ARM64_PAGEBASE_REL21 __imp_b ([[#%u,TCSYM]]) +// CHECK-NEXT: 0x[[#%x,TCOFFSET - 4]] IMAGE_REL_ARM64_PAGEOFFSET_12L __imp_b ([[#%u,TCSYM]]) +// CHECK-NEXT: } +// CHECK-NEXT: ] diff --git a/llvm/test/MC/COFF/bad-parse.s b/llvm/test/MC/COFF/bad-parse.s new file mode 100644 index 0000000000000..2491f41abeb4e --- /dev/null +++ b/llvm/test/MC/COFF/bad-parse.s @@ -0,0 +1,13 @@ +// RUN: not llvm-mc -filetype=obj -triple i386-pc-win32 %s 2>&1 | FileCheck %s + + .data + +// CHECK: [[@LINE+1]]:{{[0-9]+}}: error: expected identifier in directive + .secnum +// CHECK: [[@LINE+1]]:{{[0-9]+}}: error: unexpected token in directive + .secnum section extra + +// CHECK: [[@LINE+1]]:{{[0-9]+}}: error: expected identifier in directive + .secoffset +// CHECK: [[@LINE+1]]:{{[0-9]+}}: error: unexpected token in directive + .secoffset section extra From d997a722c194feec5f3a94dec5acdce59ac5e55b Mon Sep 17 00:00:00 2001 From: Daniel Paoliello Date: Sat, 11 Jan 2025 21:56:59 -0800 Subject: [PATCH 191/408] Fix build break in MIRPrinter (#122630) --- llvm/lib/CodeGen/MIRPrinter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index b8e41cc789856..b6da495590fe1 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -604,7 +604,7 @@ void MIRPrinter::convertMachineMetadataNodes(yaml::MachineFunction &YMF, void MIRPrinter::convertCalledGlobals(yaml::MachineFunction &YMF, const MachineFunction &MF, MachineModuleSlotTracker &MST) { - for (const auto [CallInst, CG] : MF.getCalledGlobals()) { + for (const auto &[CallInst, CG] : MF.getCalledGlobals()) { // If the call instruction was dropped, then we don't need to print it. auto BB = CallInst->getParent(); if (BB) { From 42da12063f49e8d52e63dcb36d25b55ed3688a26 Mon Sep 17 00:00:00 2001 From: Kareem Ergawy Date: Sun, 12 Jan 2025 07:46:58 +0100 Subject: [PATCH 192/408] [flang][OpenMP] Extend delayed privatization for `omp.simd` (#122156) Adds support for delayed privatization for `simd` directives. This PR includes PFT down to LLVM IR lowering. --- flang/lib/Lower/OpenMP/OpenMP.cpp | 8 +- flang/test/Lower/OpenMP/order-clause.f90 | 6 +- .../Lower/OpenMP/parallel-private-clause.f90 | 32 +++-- flang/test/Lower/OpenMP/simd.f90 | 28 ++--- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 7 +- mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 2 +- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 25 +++- .../Target/LLVMIR/openmp-simd-private.mlir | 117 ++++++++++++++++++ mlir/test/Target/LLVMIR/openmp-todo.mlir | 19 --- 9 files changed, 182 insertions(+), 62 deletions(-) create mode 100644 mlir/test/Target/LLVMIR/openmp-simd-private.mlir diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index cd4b25a17722c..c71fd598d5c8a 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -2144,11 +2144,10 @@ static void genStandaloneSimd(lower::AbstractConverter &converter, genSimdClauses(converter, semaCtx, item->clauses, loc, simdClauseOps, simdReductionSyms); - // TODO: Support delayed privatization. DataSharingProcessor dsp(converter, semaCtx, item->clauses, eval, /*shouldCollectPreDeterminedSymbols=*/true, - /*useDelayedPrivatization=*/false, symTable); - dsp.processStep1(); + enableDelayedPrivatization, symTable); + dsp.processStep1(&simdClauseOps); mlir::omp::LoopNestOperands loopNestClauseOps; llvm::SmallVector iv; @@ -2156,7 +2155,8 @@ static void genStandaloneSimd(lower::AbstractConverter &converter, loopNestClauseOps, iv); EntryBlockArgs simdArgs; - // TODO: Add private syms and vars. 
+ simdArgs.priv.syms = dsp.getDelayedPrivSymbols(); + simdArgs.priv.vars = simdClauseOps.privateVars; simdArgs.reduction.syms = simdReductionSyms; simdArgs.reduction.vars = simdClauseOps.reductionVars; auto simdOp = diff --git a/flang/test/Lower/OpenMP/order-clause.f90 b/flang/test/Lower/OpenMP/order-clause.f90 index 717d9740c56f8..a30d82979021d 100644 --- a/flang/test/Lower/OpenMP/order-clause.f90 +++ b/flang/test/Lower/OpenMP/order-clause.f90 @@ -4,15 +4,15 @@ !CHECK-LABEL: func.func @_QPsimd_order() { subroutine simd_order - !CHECK: omp.simd order(reproducible:concurrent) { + !CHECK: omp.simd order(reproducible:concurrent) private({{.*}}) { !$omp simd order(concurrent) do i = 1, 10 end do - !CHECK: omp.simd order(reproducible:concurrent) { + !CHECK: omp.simd order(reproducible:concurrent) private({{.*}}) { !$omp simd order(reproducible:concurrent) do i = 1, 10 end do - !CHECK: omp.simd order(unconstrained:concurrent) { + !CHECK: omp.simd order(unconstrained:concurrent) private({{.*}}) { !$omp simd order(unconstrained:concurrent) do i = 1, 10 end do diff --git a/flang/test/Lower/OpenMP/parallel-private-clause.f90 b/flang/test/Lower/OpenMP/parallel-private-clause.f90 index 8dc843fc9d96a..7114314df05d3 100644 --- a/flang/test/Lower/OpenMP/parallel-private-clause.f90 +++ b/flang/test/Lower/OpenMP/parallel-private-clause.f90 @@ -5,6 +5,23 @@ ! RUN: bbc --use-desc-for-alloc=false -fopenmp -emit-hlfir %s -o - \ ! RUN: | FileCheck %s --check-prefix=FIRDialect +! FIRDialect: omp.private {type = private} @_QFsimd_loop_1Er_private_ref_box_heap_f32 {{.*}} alloc { +! FIRDialect: [[R:%.*]] = fir.alloca !fir.box> {bindc_name = "r", pinned, uniq_name = "{{.*}}Er"} +! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref>> +! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref>> +! FIRDialect: [[R_DECL:%.*]]:2 = hlfir.declare [[R]] {fortran_attrs = #fir.var_attrs, uniq_name = "{{.*}}r"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) +! FIRDialect: omp.yield([[R_DECL]]#0 : !fir.ref>>) +! FIRDialect: } dealloc { +! FIRDialect: ^bb0([[R_DECL:%.*]]: !fir.ref>>): +! FIRDialect: {{%.*}} = fir.load [[R_DECL]] : !fir.ref>> +! FIRDialect: fir.if {{%.*}} { +! FIRDialect: [[LD:%.*]] = fir.load [[R_DECL]] : !fir.ref>> +! FIRDialect: [[AD:%.*]] = fir.box_addr [[LD]] : (!fir.box>) -> !fir.heap +! FIRDialect: fir.freemem [[AD]] : !fir.heap +! FIRDialect: fir.store {{%.*}} to [[R_DECL]] : !fir.ref>> +! FIRDialect: omp.yield +! FIRDialect: } + !FIRDialect: omp.private {type = private} @[[DERIVED_PRIVATIZER:_QFprivate_clause_derived_typeEt_private_ref_rec__QFprivate_clause_derived_typeTmy_type]] : !fir.ref}>> alloc { !FIRDialect: ^bb0(%{{.*}}: !fir.ref}>>): !FIRDialect: %[[PRIV_ALLOC:.*]] = fir.alloca !fir.type<_QFprivate_clause_derived_typeTmy_type{t_i:i32,t_arr:!fir.array<5xi32>}> {bindc_name = "t", pinned, uniq_name = "_QFprivate_clause_derived_typeEt"} @@ -246,7 +263,6 @@ subroutine parallel_pointer() !$omp end parallel end subroutine parallel_pointer - !FIRDialect-LABEL: func @_QPsimple_loop_1() subroutine simple_loop_1 integer :: i @@ -354,20 +370,17 @@ subroutine simple_loop_3 ! FIRDialect: omp.terminator end subroutine + !CHECK-LABEL: func @_QPsimd_loop_1() subroutine simd_loop_1 integer :: i real, allocatable :: r; - ! FIRDialect: [[R:%.*]] = fir.alloca !fir.box> {bindc_name = "r", pinned, uniq_name = "{{.*}}Er"} - ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref>> - ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref>> - ! 
FIRDialect: [[R_DECL:%.*]]:2 = hlfir.declare [[R]] {fortran_attrs = #fir.var_attrs, uniq_name = "{{.*}}r"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) ! FIRDialect: %[[LB:.*]] = arith.constant 1 : i32 ! FIRDialect: %[[UB:.*]] = arith.constant 9 : i32 ! FIRDialect: %[[STEP:.*]] = arith.constant 1 : i32 - ! FIRDialect: omp.simd { + ! FIRDialect: omp.simd private({{.*}}) { ! FIRDialect-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { !$OMP SIMD PRIVATE(r) do i=1, 9 @@ -378,10 +391,5 @@ subroutine simd_loop_1 end do !$OMP END SIMD ! FIRDialect: omp.yield - ! FIRDialect: {{%.*}} = fir.load [[R_DECL]]#0 : !fir.ref>> - ! FIRDialect: fir.if {{%.*}} { - ! FIRDialect: [[LD:%.*]] = fir.load [[R_DECL]]#0 : !fir.ref>> - ! FIRDialect: [[AD:%.*]] = fir.box_addr [[LD]] : (!fir.box>) -> !fir.heap - ! FIRDialect: fir.freemem [[AD]] : !fir.heap - ! FIRDialect: fir.store {{%.*}} to [[R_DECL]]#0 : !fir.ref>> + end subroutine diff --git a/flang/test/Lower/OpenMP/simd.f90 b/flang/test/Lower/OpenMP/simd.f90 index d92f06cebfdbe..0345ace24aaa0 100644 --- a/flang/test/Lower/OpenMP/simd.f90 +++ b/flang/test/Lower/OpenMP/simd.f90 @@ -13,7 +13,7 @@ subroutine simd ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK-NEXT: %[[UB:.*]] = arith.constant 9 : i32 ! CHECK-NEXT: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK-NEXT: omp.simd { + ! CHECK-NEXT: omp.simd private({{.*}}) { ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i=1, 9 ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref @@ -33,7 +33,7 @@ subroutine simd_with_if_clause(n, threshold) ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[UB:.*]] = fir.load %[[ARG_N]]#0 ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.simd if(%[[COND:.*]]) { + ! CHECK: omp.simd if(%[[COND:.*]]) private({{.*}}) { ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i = 1, n ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref @@ -52,7 +52,7 @@ subroutine simd_with_simdlen_clause(n, threshold) ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[UB:.*]] = fir.load %[[ARG_N]]#0 ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.simd simdlen(2) { + ! CHECK: omp.simd simdlen(2) private({{.*}}) { ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i = 1, n ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref @@ -72,7 +72,7 @@ subroutine simd_with_simdlen_clause_from_param(n, threshold) ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[UB:.*]] = fir.load %[[ARG_N]]#0 ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.simd simdlen(2) { + ! CHECK: omp.simd simdlen(2) private({{.*}}) { ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i = 1, n ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref @@ -92,7 +92,7 @@ subroutine simd_with_simdlen_clause_from_expr_from_param(n, threshold) ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[UB:.*]] = fir.load %[[ARG_N]]#0 ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.simd simdlen(6) { + ! CHECK: omp.simd simdlen(6) private({{.*}}) { ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i = 1, n ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref @@ -111,7 +111,7 @@ subroutine simd_with_safelen_clause(n, threshold) ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! 
CHECK: %[[UB:.*]] = fir.load %[[ARG_N]]#0 ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.simd safelen(2) { + ! CHECK: omp.simd safelen(2) private({{.*}}) { ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i = 1, n ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref @@ -131,7 +131,7 @@ subroutine simd_with_safelen_clause_from_expr_from_param(n, threshold) ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[UB:.*]] = fir.load %[[ARG_N]]#0 ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.simd safelen(6) { + ! CHECK: omp.simd safelen(6) private({{.*}}) { ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i = 1, n ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref @@ -150,7 +150,7 @@ subroutine simd_with_simdlen_safelen_clause(n, threshold) ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[UB:.*]] = fir.load %[[ARG_N]]#0 ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.simd safelen(2) simdlen(1) { + ! CHECK: omp.simd safelen(2) simdlen(1) private({{.*}}) { ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i = 1, n ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref @@ -171,7 +171,7 @@ subroutine simd_with_collapse_clause(n) ! CHECK: %[[LOWER_J:.*]] = arith.constant 1 : i32 ! CHECK: %[[UPPER_J:.*]] = fir.load %[[PARAM_ARG:.*]] : !fir.ref ! CHECK: %[[STEP_J:.*]] = arith.constant 1 : i32 - ! CHECK: omp.simd { + ! CHECK: omp.simd private({{.*}}) { ! CHECK-NEXT: omp.loop_nest (%[[ARG_0:.*]], %[[ARG_1:.*]]) : i32 = ( ! CHECK-SAME: %[[LOWER_I]], %[[LOWER_J]]) to ( ! CHECK-SAME: %[[UPPER_I]], %[[UPPER_J]]) inclusive step ( @@ -235,7 +235,7 @@ subroutine simd_with_nontemporal_clause(n) !CHECK: %[[LB:.*]] = arith.constant 1 : i32 !CHECK: %[[UB:.*]] = fir.load %{{.*}}#0 : !fir.ref !CHECK: %[[STEP:.*]] = arith.constant 1 : i32 - !CHECK: omp.simd nontemporal(%[[A_DECL]]#1, %[[C_DECL]]#1 : !fir.ref, !fir.ref) { + !CHECK: omp.simd nontemporal(%[[A_DECL]]#1, %[[C_DECL]]#1 : !fir.ref, !fir.ref) private({{.*}}) { !CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { !$OMP SIMD NONTEMPORAL(A, C) do i = 1, n @@ -249,16 +249,14 @@ subroutine lastprivate_with_simd !CHECK: %[[VAR_SUM:.*]] = fir.alloca f32 {bindc_name = "sum", uniq_name = "_QFlastprivate_with_simdEsum"} !CHECK: %[[VAR_SUM_DECLARE:.*]]:2 = hlfir.declare %[[VAR_SUM]] {{.*}} -!CHECK: %[[VAR_SUM_PINNED:.*]] = fir.alloca f32 {bindc_name = "sum", pinned, uniq_name = "_QFlastprivate_with_simdEsum"} -!CHECK: %[[VAR_SUM_PINNED_DECLARE:.*]]:2 = hlfir.declare %[[VAR_SUM_PINNED]] {{.*}} - implicit none integer :: i real :: sum -!CHECK: omp.simd { +!CHECK: omp.simd private(@_QFlastprivate_with_simdEsum_private_ref_f32 %[[VAR_SUM_DECLARE]]#0 -> %[[VAR_SUM_PINNED:.*]], @{{.*}}) { !CHECK: omp.loop_nest (%[[ARG:.*]]) : i32 = ({{.*}} to ({{.*}}) inclusive step ({{.*}}) { +!CHECK: %[[VAR_SUM_PINNED_DECLARE:.*]]:2 = hlfir.declare %[[VAR_SUM_PINNED]] {{.*}} !CHECK: %[[ADD_RESULT:.*]] = arith.addi {{.*}} !CHECK: %[[ADD_RESULT_CONVERT:.*]] = fir.convert %[[ADD_RESULT]] : (i32) -> f32 !CHECK: hlfir.assign %[[ADD_RESULT_CONVERT]] to %[[VAR_SUM_PINNED_DECLARE]]#0 : f32, !fir.ref @@ -283,7 +281,7 @@ subroutine simd_with_reduction_clause ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK-NEXT: %[[UB:.*]] = arith.constant 9 : i32 ! CHECK-NEXT: %[[STEP:.*]] = arith.constant 1 : i32 - ! 
CHECK-NEXT: omp.simd reduction(@[[REDUCER]] %[[X:.*]]#0 -> %[[X_RED:.*]] : !fir.ref) { + ! CHECK-NEXT: omp.simd private({{.*}}) reduction(@[[REDUCER]] %[[X:.*]]#0 -> %[[X_RED:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { !$omp simd reduction(+:x) do i=1, 9 diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 1edf47dff8c4a..2b57a8dce3de5 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -5203,12 +5203,7 @@ void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop, Function *F = CanonicalLoop->getFunction(); // Define where if branch should be inserted - Instruction *SplitBefore; - if (Instruction::classof(IfCond)) { - SplitBefore = dyn_cast(IfCond); - } else { - SplitBefore = CanonicalLoop->getPreheader()->getTerminator(); - } + Instruction *SplitBefore = CanonicalLoop->getPreheader()->getTerminator(); // TODO: We should not rely on pass manager. Currently we use pass manager // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 3d62b3218869e..ca7e08e9f18b5 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -2116,7 +2116,7 @@ void SimdOp::build(OpBuilder &builder, OperationState &state, makeArrayAttr(ctx, clauses.alignments), clauses.ifExpr, /*linear_vars=*/{}, /*linear_step_vars=*/{}, clauses.nontemporalVars, clauses.order, clauses.orderMod, - /*private_vars=*/{}, /*private_syms=*/nullptr, + clauses.privateVars, makeArrayAttr(ctx, clauses.privateSyms), clauses.reductionVars, makeDenseBoolArrayAttr(ctx, clauses.reductionByref), makeArrayAttr(ctx, clauses.reductionSyms), clauses.safelen, diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index a364098e0bd8a..d6112fa9af118 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -273,7 +273,6 @@ static LogicalResult checkImplementationStatus(Operation &op) { .Case([&](omp::SimdOp op) { checkLinear(op, result); checkNontemporal(op, result); - checkPrivate(op, result); checkReduction(op, result); }) .Case privateBlockArgs = + cast(*simdOp).getPrivateBlockArgs(); + SmallVector mlirPrivateVars; + SmallVector llvmPrivateVars; + SmallVector privateDecls; + mlirPrivateVars.reserve(privateBlockArgs.size()); + llvmPrivateVars.reserve(privateBlockArgs.size()); + collectPrivatizationDecls(simdOp, privateDecls); + + for (mlir::Value privateVar : simdOp.getPrivateVars()) + mlirPrivateVars.push_back(privateVar); + + llvm::OpenMPIRBuilder::InsertPointTy allocaIP = + findAllocaInsertPoint(builder, moduleTranslation); llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); + llvm::Expected afterAllocas = allocatePrivateVars( + builder, moduleTranslation, privateBlockArgs, privateDecls, + mlirPrivateVars, llvmPrivateVars, allocaIP); + if (handleError(afterAllocas, opInst).failed()) + return failure(); + // Generator of the canonical loop body. 
SmallVector loopInfos; SmallVector bodyInsertPoints; @@ -2331,7 +2350,9 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder, order, simdlen, safelen); builder.restoreIP(afterIP); - return success(); + + return cleanupPrivateVars(builder, moduleTranslation, simdOp.getLoc(), + llvmPrivateVars, privateDecls); } /// Convert an Atomic Ordering attribute to llvm::AtomicOrdering. diff --git a/mlir/test/Target/LLVMIR/openmp-simd-private.mlir b/mlir/test/Target/LLVMIR/openmp-simd-private.mlir new file mode 100644 index 0000000000000..09d76f8edd007 --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-simd-private.mlir @@ -0,0 +1,117 @@ +// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s + +omp.private {type = private} @i_privatizer : !llvm.ptr alloc { +^bb0(%arg0: !llvm.ptr): + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x i32 {bindc_name = "i", pinned} : (i64) -> !llvm.ptr + omp.yield(%1 : !llvm.ptr) +} + +// CHECK-LABEL: test_loop_var_privatization() +// Original (non-privatized) allocation for `i`. +// CHECK: %{{.*}} = alloca i32, i64 1, align 4 +// CHECK: %[[DUMMY:.*]] = alloca float, i64 1, align 4 +// CHECK: %[[PRIV_I:.*]] = alloca i32, i64 1, align 4 +// CHECK: br label %[[AFTER_ALLOC:.*]] + +// CHECK: [[AFTER_ALLOC]]: +// CHECK: br label %[[ENTRY:.*]] + +// CHECK: [[ENTRY]]: +// CHECK: br label %[[OMP_LOOP_PREHEADER:.*]] + +// CHECK: [[OMP_LOOP_PREHEADER]]: +// CHECK: br label %[[OMP_LOOP_HEADER:.*]] + +// CHECK: [[OMP_LOOP_HEADER]]: +// CHECK: %[[OMP_LOOP_IV:.*]] = phi i32 [ 0, %[[OMP_LOOP_PREHEADER]] ], [ %[[OMP_LOOP_NEXT:.*]], %[[OMP_LOOP_INC:.*]] ] +// CHECK: br label %[[OMP_LOOP_COND:.*]] + +// CHECK: [[OMP_LOOP_COND]]: +// CHECK: %[[OMP_LOOP_CMP:.*]] = icmp ult i32 %[[OMP_LOOP_IV]], 10 +// CHECK: br i1 %[[OMP_LOOP_CMP]], label %[[OMP_LOOP_BODY:.*]], label %[[OMP_LOOP_EXIT:.*]] + +// CHECK: [[OMP_LOOP_BODY]]: +// CHECK: %[[IV_UPDATE:.*]] = mul i32 %[[OMP_LOOP_IV]], 1 +// CHECK: %[[IV_UPDATE_2:.*]] = add i32 %[[IV_UPDATE]], 1 +// CHECK: br label %[[OMP_SIMD_REGION:.*]] + +// CHECK: [[OMP_SIMD_REGION]]: +// CHECK: store i32 %[[IV_UPDATE_2]], ptr %[[PRIV_I]], align 4 +// CHECK: %[[DUMMY_VAL:.*]] = load float, ptr %[[DUMMY]], align 4 +// CHECK: %[[PRIV_I_VAL:.*]] = load i32, ptr %[[PRIV_I]], align 4 +// CHECK: %[[PRIV_I_VAL_FLT:.*]] = sitofp i32 %[[PRIV_I_VAL]] to float +// CHECK: %[[DUMMY_VAL_UPDATE:.*]] = fadd {{.*}} float %[[DUMMY_VAL]], %[[PRIV_I_VAL_FLT]] +// CHECK: store float %[[DUMMY_VAL_UPDATE]], ptr %[[DUMMY]], align 4, !llvm.access.group !1 +// CHECK: br label %[[OMP_REGION_CONT:.*]] + +// CHECK: [[OMP_REGION_CONT]]: +// CHECK: br label %[[OMP_LOOP_INC:.*]] + +// CHECK: [[OMP_LOOP_INC]]: +// CHECK: %[[OMP_LOOP_NEXT:.*]] = add nuw i32 %[[OMP_LOOP_IV]], 1 +// CHECK: br label %[[OMP_LOOP_HEADER]] + +// CHECK: [[OMP_LOOP_EXIT]]: + + +llvm.func @test_loop_var_privatization() attributes {fir.internal_name = "_QPtest_private_clause"} { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr + %3 = llvm.alloca %0 x f32 {bindc_name = "dummy"} : (i64) -> !llvm.ptr + %4 = llvm.mlir.constant(10 : i32) : i32 + %5 = llvm.mlir.constant(1 : i32) : i32 + omp.simd private(@i_privatizer %1 -> %arg0 : !llvm.ptr) { + omp.loop_nest (%arg1) : i32 = (%5) to (%4) inclusive step (%5) { + llvm.store %arg1, %arg0 : i32, !llvm.ptr + %8 = llvm.load %3 : !llvm.ptr -> f32 + %9 = llvm.load %arg0 : !llvm.ptr -> i32 + %10 = llvm.sitofp %9 : i32 to f32 + %11 = llvm.fadd %8, %10 {fastmathFlags = #llvm.fastmath} 
: f32 + llvm.store %11, %3 : f32, !llvm.ptr + omp.yield + } + } + llvm.return +} + +omp.private {type = private} @dummy_privatizer : !llvm.ptr alloc { +^bb0(%arg0: !llvm.ptr): + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x f32 {bindc_name = "dummy", pinned} : (i64) -> !llvm.ptr + omp.yield(%1 : !llvm.ptr) +} + +// CHECK-LABEL: test_private_clause() +// Original (non-privatized) allocation for `i`. +// CHECK: %{{.*}} = alloca i32, i64 1, align 4 +// Original (non-privatized) allocation for `dummy`. +// CHECK: %{{.*}} = alloca float, i64 1, align 4 +// CHECK: %[[PRIV_DUMMY:.*]] = alloca float, i64 1, align 4 +// CHECK: %[[PRIV_I:.*]] = alloca i32, i64 1, align 4 + +// CHECK: omp.simd.region: +// CHECK-NOT: br label +// CHECK: store i32 %{{.*}}, ptr %[[PRIV_I]], align 4 +// CHECK: %{{.*}} = load float, ptr %[[PRIV_DUMMY]], align 4 +// CHECK: store float %{{.*}}, ptr %[[PRIV_DUMMY]], align 4 + +llvm.func @test_private_clause() attributes {fir.internal_name = "_QPtest_private_clause"} { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr + %3 = llvm.alloca %0 x f32 {bindc_name = "dummy"} : (i64) -> !llvm.ptr + %4 = llvm.mlir.constant(10 : i32) : i32 + %5 = llvm.mlir.constant(1 : i32) : i32 + omp.simd private(@dummy_privatizer %3 -> %arg0, @i_privatizer %1 -> %arg1 : !llvm.ptr, !llvm.ptr) { + omp.loop_nest (%arg2) : i32 = (%5) to (%4) inclusive step (%5) { + llvm.store %arg2, %arg1 : i32, !llvm.ptr + %8 = llvm.load %arg0 : !llvm.ptr -> f32 + %9 = llvm.load %arg1 : !llvm.ptr -> i32 + %10 = llvm.sitofp %9 : i32 to f32 + %11 = llvm.fadd %8, %10 {fastmathFlags = #llvm.fastmath} : f32 + llvm.store %11, %arg0 : f32, !llvm.ptr + omp.yield + } + } + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir index 83a0990d63162..f8c9d911e3034 100644 --- a/mlir/test/Target/LLVMIR/openmp-todo.mlir +++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir @@ -156,25 +156,6 @@ llvm.func @simd_nontemporal(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) { // ----- -omp.private {type = private} @x.privatizer : !llvm.ptr alloc { -^bb0(%arg0: !llvm.ptr): - %0 = llvm.mlir.constant(1 : i32) : i32 - %1 = llvm.alloca %0 x i32 : (i32) -> !llvm.ptr - omp.yield(%1 : !llvm.ptr) -} -llvm.func @simd_private(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) { - // expected-error@below {{not yet implemented: Unhandled clause privatization in omp.simd operation}} - // expected-error@below {{LLVM Translation failed for operation: omp.simd}} - omp.simd private(@x.privatizer %x -> %arg0 : !llvm.ptr) { - omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) { - omp.yield - } - } - llvm.return -} - -// ----- - omp.declare_reduction @add_f32 : f32 init { ^bb0(%arg: f32): From a5364444bd0dbbd60e83ee6cf11adabd72f4df54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Barnab=C3=A1s=20P=C5=91cze?= Date: Sun, 12 Jan 2025 10:07:52 +0100 Subject: [PATCH 193/408] [clang-tidy] performance-unnecessary-copy-initialization: Consider static functions (#119974) Static member functions can be considered the same way as free functions are, so do that. 
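
To make the new behavior concrete, here is a minimal, hypothetical illustration (names such as `Registry` are illustrative and not taken from the patch) of the pattern the check now diagnoses: copy-initializing a const local from a zero-argument static member function that returns a const reference, exactly as it already did for free functions.

```c++
#include <string>

struct Registry {
  // A zero-argument static member returning a reference to a long-lived
  // object behaves like a free function for aliasing purposes.
  static const std::string &defaultName() {
    static const std::string Name = "default";
    return Name;
  }
};

void use() {
  // Now flagged: the copy is unnecessary; the suggested fix is
  // `const auto &Name = Registry::defaultName();`.
  const auto Name = Registry::defaultName();
  (void)Name.size();
}
```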
---
 .../UnnecessaryCopyInitialization.cpp         | 21 +++++++++-------
 clang-tools-extra/docs/ReleaseNotes.rst       |  4 ++++
 .../unnecessary-copy-initialization.cpp       | 24 +++++++++++++++++++
 3 files changed, 40 insertions(+), 9 deletions(-)

diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.cpp b/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.cpp
index 034894c11bf2c..dc2e8a38a3e97 100644
--- a/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.cpp
+++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.cpp
@@ -104,15 +104,18 @@ AST_MATCHER_FUNCTION_P(StatementMatcher,
                        hasArgument(0, hasType(ReceiverType)))));
 }
 
+AST_MATCHER(CXXMethodDecl, isStatic) { return Node.isStatic(); }
+
 AST_MATCHER_FUNCTION(StatementMatcher, isConstRefReturningFunctionCall) {
-  // Only allow initialization of a const reference from a free function if it
-  // has no arguments. Otherwise it could return an alias to one of its
-  // arguments and the arguments need to be checked for const use as well.
-  return callExpr(callee(functionDecl(returns(hasCanonicalType(
-                                          matchers::isReferenceToConst())))
-                      .bind(FunctionDeclId)),
-                  argumentCountIs(0), unless(callee(cxxMethodDecl())))
-      .bind(InitFunctionCallId);
+  // Only allow initialization of a const reference from a free function or
+  // static member function if it has no arguments. Otherwise it could return
+  // an alias to one of its arguments and the arguments need to be checked
+  // for const use as well.
+  return callExpr(argumentCountIs(0),
+                  callee(functionDecl(returns(hasCanonicalType(matchers::isReferenceToConst())),
+                                      unless(cxxMethodDecl(unless(isStatic()))))
+                             .bind(FunctionDeclId)))
+      .bind(InitFunctionCallId);
 }
 
 AST_MATCHER_FUNCTION_P(StatementMatcher, initializerReturnsReferenceToConst,
@@ -232,7 +235,7 @@ UnnecessaryCopyInitialization::UnnecessaryCopyInitialization(
           Options.get("ExcludedContainerTypes", ""))) {}
 
 void UnnecessaryCopyInitialization::registerMatchers(MatchFinder *Finder) {
-  auto LocalVarCopiedFrom = [this](const internal::Matcher<Stmt> &CopyCtorArg) {
+  auto LocalVarCopiedFrom = [this](const ast_matchers::internal::Matcher<Stmt> &CopyCtorArg) {
     return compoundStmt(
                forEachDescendant(
                    declStmt(
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 5c7ba4333e381..08156325369e6 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -339,6 +339,10 @@ Changes in existing checks
  ` check to fix a crash when
   an argument type is declared but not defined.
 
+- Improved :doc:`performance-unnecessary-copy-initialization
+  <clang-tidy/checks/performance/unnecessary-copy-initialization>` check
+  to consider static member functions the same way as free functions.
+
 - Improved :doc:`readability-container-contains
   <clang-tidy/checks/readability/container-contains>` check to
   let it work on any class that has a ``contains`` method. Fix some false negatives in the
diff --git a/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-copy-initialization.cpp b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-copy-initialization.cpp
index d02bb98cf583c..b5325776f54c6 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-copy-initialization.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-copy-initialization.cpp
@@ -28,6 +28,8 @@ struct ExpensiveToCopyType {
   template <typename A>
   const A &templatedAccessor() const;
   operator int() const; // Implicit conversion to int.
+
+  static const ExpensiveToCopyType &instance();
 };
 
 template <typename T>
@@ -100,6 +102,28 @@ void PositiveFunctionCall() {
   VarCopyConstructed.constMethod();
 }
 
+void PositiveStaticMethodCall() {
+  const auto AutoAssigned = ExpensiveToCopyType::instance();
+  // CHECK-MESSAGES: [[@LINE-1]]:14: warning: the const qualified variable 'AutoAssigned' is copy-constructed from a const reference; consider making it a const reference [performance-unnecessary-copy-initialization]
+  // CHECK-FIXES: const auto& AutoAssigned = ExpensiveToCopyType::instance();
+  AutoAssigned.constMethod();
+
+  const auto AutoCopyConstructed(ExpensiveToCopyType::instance());
+  // CHECK-MESSAGES: [[@LINE-1]]:14: warning: the const qualified variable 'AutoCopyConstructed'
+  // CHECK-FIXES: const auto& AutoCopyConstructed(ExpensiveToCopyType::instance());
+  AutoCopyConstructed.constMethod();
+
+  const ExpensiveToCopyType VarAssigned = ExpensiveToCopyType::instance();
+  // CHECK-MESSAGES: [[@LINE-1]]:29: warning: the const qualified variable 'VarAssigned'
+  // CHECK-FIXES: const ExpensiveToCopyType& VarAssigned = ExpensiveToCopyType::instance();
+  VarAssigned.constMethod();
+
+  const ExpensiveToCopyType VarCopyConstructed(ExpensiveToCopyType::instance());
+  // CHECK-MESSAGES: [[@LINE-1]]:29: warning: the const qualified variable 'VarCopyConstructed'
+  // CHECK-FIXES: const ExpensiveToCopyType& VarCopyConstructed(ExpensiveToCopyType::instance());
+  VarCopyConstructed.constMethod();
+}
+
 void PositiveMethodCallConstReferenceParam(const ExpensiveToCopyType &Obj) {
   const auto AutoAssigned = Obj.reference();
   // CHECK-MESSAGES: [[@LINE-1]]:14: warning: the const qualified variable 'AutoAssigned'

From fdfe7e7fabc85ed7293ca6f5f234d41812644584 Mon Sep 17 00:00:00 2001
From: Sharjeel Khan
Date: Sun, 12 Jan 2025 01:36:00 -0800
Subject: [PATCH 194/408] [Driver] Error when using msan on Android (#122540)

Msan is not supported on Android as mentioned in google/sanitizers#1381.
We proactively emit an error saying it is unsupported, to fix
android/ndk#1958.
---
 clang/lib/Driver/ToolChains/Linux.cpp | 4 +++-
 clang/test/Driver/fsanitize.c         | 5 +++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp
index 1c56355136df8..c2a85be819816 100644
--- a/clang/lib/Driver/ToolChains/Linux.cpp
+++ b/clang/lib/Driver/ToolChains/Linux.cpp
@@ -823,6 +823,7 @@ SanitizerMask Linux::getSupportedSanitizers() const {
   const bool IsRISCV64 = getTriple().getArch() == llvm::Triple::riscv64;
   const bool IsSystemZ = getTriple().getArch() == llvm::Triple::systemz;
   const bool IsHexagon = getTriple().getArch() == llvm::Triple::hexagon;
+  const bool IsAndroid = getTriple().isAndroid();
   SanitizerMask Res = ToolChain::getSupportedSanitizers();
   Res |= SanitizerKind::Address;
   Res |= SanitizerKind::PointerCompare;
@@ -831,7 +832,6 @@ SanitizerMask Linux::getSupportedSanitizers() const {
   Res |= SanitizerKind::Fuzzer;
   Res |= SanitizerKind::FuzzerNoLink;
   Res |= SanitizerKind::KernelAddress;
-  Res |= SanitizerKind::Memory;
   Res |= SanitizerKind::Vptr;
   Res |= SanitizerKind::SafeStack;
   if (IsX86_64 || IsMIPS64 || IsAArch64 || IsLoongArch64)
@@ -857,6 +857,8 @@ SanitizerMask Linux::getSupportedSanitizers() const {
   }
   if (IsX86_64)
     Res |= SanitizerKind::NumericalStability;
+  if (!IsAndroid)
+    Res |= SanitizerKind::Memory;
 
   // Work around "Cannot represent a difference across sections".
if (getTriple().getArch() == llvm::Triple::ppc64) diff --git a/clang/test/Driver/fsanitize.c b/clang/test/Driver/fsanitize.c index 1d3caec748d77..429dc51b3356d 100644 --- a/clang/test/Driver/fsanitize.c +++ b/clang/test/Driver/fsanitize.c @@ -399,6 +399,11 @@ // RUN: %clang --target=arm-linux-androideabi %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ANDROID-NO-ASAN // CHECK-ANDROID-NO-ASAN: "-mrelocation-model" "pic" +// RUN: not %clang --target=aarch64-linux-android -fsanitize=memory %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-MSAN-ANDROID +// RUN: not %clang --target=i386-linux-android -fsanitize=memory %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-MSAN-ANDROID +// RUN: not %clang --target=x86_64-linux-android -fsanitize=memory %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-MSAN-ANDROID +// CHECK-MSAN-ANDROID: unsupported option '-fsanitize=memory' for target + // RUN: %clang --target=x86_64-linux-gnu %s -fsanitize=undefined -### 2>&1 | FileCheck %s --check-prefix=CHECK-RECOVER-UBSAN // RUN: %clang --target=x86_64-linux-gnu %s -fsanitize=undefined -fsanitize-recover -### 2>&1 | FileCheck %s --check-prefix=CHECK-RECOVER-UBSAN // RUN: %clang --target=x86_64-linux-gnu %s -fsanitize=undefined -fsanitize-recover=all -### 2>&1 | FileCheck %s --check-prefix=CHECK-RECOVER-UBSAN From 8ebc35f8d041f097a2b973b455dc3533420af6bf Mon Sep 17 00:00:00 2001 From: MichelleCDjunaidi Date: Sun, 12 Jan 2025 18:04:40 +0800 Subject: [PATCH 195/408] [clang-tidy] Create bugprone-incorrect-enable-shared-from-this check (#102299) This checks that classes/structs inheriting from ``std::enable_shared_from_this`` does so with public inheritance, so it prevents crashes due to ``std::make_shared`` and ``shared_from_this()`` getting called when the internal weak pointer was not initialized (e.g. due to private inheritance). 
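
For context, a minimal sketch of the failure mode described above (the `Widget` name is hypothetical; the check's own documentation added below carries the fuller example): `std::make_shared` can only seed the hidden `weak_ptr` inside `enable_shared_from_this` when that base is public and unambiguous, so with non-public inheritance `shared_from_this()` has nothing to lock.

```c++
#include <memory>

class Widget : public std::enable_shared_from_this<Widget> {
public:
  // Safe only because the base above is public, so make_shared was able to
  // initialize the internal weak_ptr that shared_from_this() locks.
  std::shared_ptr<Widget> self() { return shared_from_this(); }
};

int main() {
  auto w = std::make_shared<Widget>();
  auto s = w->self(); // OK: two owners of the same control block
  // Had Widget used private inheritance (`class Widget :
  // std::enable_shared_from_this<Widget>`), this call would throw
  // std::bad_weak_ptr in C++17 (undefined behavior before C++17).
  return s.use_count() == 2 ? 0 : 1;
}
```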
---
 .../bugprone/BugproneTidyModule.cpp           |   3 +
 .../clang-tidy/bugprone/CMakeLists.txt        |   5 +
 .../IncorrectEnableSharedFromThisCheck.cpp    |  65 +++++++
 .../IncorrectEnableSharedFromThisCheck.h      |  35 ++++
 clang-tools-extra/docs/ReleaseNotes.rst       |   7 +
 .../incorrect-enable-shared-from-this.rst     |  34 ++++
 .../docs/clang-tidy/checks/list.rst           |   1 +
 .../incorrect-enable-shared-from-this.cpp     | 180 ++++++++++++++++++
 8 files changed, 330 insertions(+)
 create mode 100644 clang-tools-extra/clang-tidy/bugprone/IncorrectEnableSharedFromThisCheck.cpp
 create mode 100644 clang-tools-extra/clang-tidy/bugprone/IncorrectEnableSharedFromThisCheck.h
 create mode 100644 clang-tools-extra/docs/clang-tidy/checks/bugprone/incorrect-enable-shared-from-this.rst
 create mode 100644 clang-tools-extra/test/clang-tidy/checkers/bugprone/incorrect-enable-shared-from-this.cpp

diff --git a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
index b27616f3dcc65..c5f0b5b28418f 100644
--- a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
@@ -33,6 +33,7 @@
 #include "InaccurateEraseCheck.h"
 #include "IncDecInConditionsCheck.h"
 #include "IncorrectEnableIfCheck.h"
+#include "IncorrectEnableSharedFromThisCheck.h"
 #include "IncorrectRoundingsCheck.h"
 #include "InfiniteLoopCheck.h"
 #include "IntegerDivisionCheck.h"
@@ -144,6 +145,8 @@ class BugproneModule : public ClangTidyModule {
         "bugprone-inaccurate-erase");
     CheckFactories.registerCheck<IncorrectEnableIfCheck>(
         "bugprone-incorrect-enable-if");
+    CheckFactories.registerCheck<IncorrectEnableSharedFromThisCheck>(
+        "bugprone-incorrect-enable-shared-from-this");
     CheckFactories.registerCheck<ReturnConstRefFromParameterCheck>(
         "bugprone-return-const-ref-from-parameter");
     CheckFactories.registerCheck<IncorrectRoundingsCheck>(
diff --git a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
index 8bd5646c5fe05..e8309c68b7fca 100644
--- a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
+++ b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
@@ -27,6 +27,11 @@ add_clang_library(clangTidyBugproneModule STATIC
   ForwardingReferenceOverloadCheck.cpp
   ImplicitWideningOfMultiplicationResultCheck.cpp
   InaccurateEraseCheck.cpp
+  IncorrectEnableIfCheck.cpp
+  IncorrectEnableSharedFromThisCheck.cpp
+  ReturnConstRefFromParameterCheck.cpp
+  SuspiciousStringviewDataUsageCheck.cpp
+  SwitchMissingDefaultCaseCheck.cpp
   IncDecInConditionsCheck.cpp
   IncorrectEnableIfCheck.cpp
   IncorrectRoundingsCheck.cpp
diff --git a/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableSharedFromThisCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableSharedFromThisCheck.cpp
new file mode 100644
index 0000000000000..425e46cf6c88c
--- /dev/null
+++ b/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableSharedFromThisCheck.cpp
@@ -0,0 +1,65 @@
+//===--- IncorrectEnableSharedFromThisCheck.cpp - clang-tidy --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "IncorrectEnableSharedFromThisCheck.h" +#include "clang/AST/ASTContext.h" +#include "clang/AST/DeclCXX.h" +#include "clang/ASTMatchers/ASTMatchFinder.h" + +using namespace clang::ast_matchers; + +namespace clang::tidy::bugprone { + +void IncorrectEnableSharedFromThisCheck::registerMatchers(MatchFinder *Finder) { + const auto EnableSharedFromThis = + cxxRecordDecl(hasName("enable_shared_from_this"), isInStdNamespace()); + const auto QType = hasCanonicalType(hasDeclaration( + cxxRecordDecl( + anyOf(EnableSharedFromThis.bind("enable_rec"), + cxxRecordDecl(hasAnyBase(cxxBaseSpecifier( + isPublic(), hasType(hasCanonicalType( + hasDeclaration(EnableSharedFromThis)))))))) + .bind("base_rec"))); + Finder->addMatcher( + cxxRecordDecl( + unless(isExpansionInSystemHeader()), + hasDirectBase(cxxBaseSpecifier(unless(isPublic()), hasType(QType)) + .bind("base"))) + .bind("derived"), + this); +} + +void IncorrectEnableSharedFromThisCheck::check( + const MatchFinder::MatchResult &Result) { + const auto *BaseSpec = Result.Nodes.getNodeAs("base"); + const auto *Base = Result.Nodes.getNodeAs("base_rec"); + const auto *Derived = Result.Nodes.getNodeAs("derived"); + const bool IsEnableSharedFromThisDirectBase = + Result.Nodes.getNodeAs("enable_rec") == Base; + const bool HasWrittenAccessSpecifier = + BaseSpec->getAccessSpecifierAsWritten() != AS_none; + const auto ReplacementRange = CharSourceRange( + SourceRange(BaseSpec->getBeginLoc()), HasWrittenAccessSpecifier); + const llvm::StringRef Replacement = + HasWrittenAccessSpecifier ? "public" : "public "; + const FixItHint Hint = + IsEnableSharedFromThisDirectBase + ? FixItHint::CreateReplacement(ReplacementRange, Replacement) + : FixItHint(); + diag(Derived->getLocation(), + "%2 is not publicly inheriting from " + "%select{%1 which inherits from |}0'std::enable_shared_" + "from_this', " + "which will cause unintended behaviour " + "when using 'shared_from_this'; make the inheritance " + "public", + DiagnosticIDs::Warning) + << IsEnableSharedFromThisDirectBase << Base << Derived << Hint; +} + +} // namespace clang::tidy::bugprone diff --git a/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableSharedFromThisCheck.h b/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableSharedFromThisCheck.h new file mode 100644 index 0000000000000..987c56059259b --- /dev/null +++ b/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableSharedFromThisCheck.h @@ -0,0 +1,35 @@ +//===--- IncorrectEnableSharedFromThisCheck.h - clang-tidy ------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_INCORRECTENABLESHAREDFROMTHISCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_INCORRECTENABLESHAREDFROMTHISCHECK_H + +#include "../ClangTidyCheck.h" + +namespace clang::tidy::bugprone { + +/// Detect classes or structs that do not publicly inherit from +/// ``std::enable_shared_from_this``, because unintended behavior will +/// otherwise occur when calling ``shared_from_this``. 
+/// +/// For the user-facing documentation see: +/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/incorrect-enable-shared-from-this.html +class IncorrectEnableSharedFromThisCheck : public ClangTidyCheck { +public: + IncorrectEnableSharedFromThisCheck(StringRef Name, ClangTidyContext *Context) + : ClangTidyCheck(Name, Context) {} + void registerMatchers(ast_matchers::MatchFinder *Finder) override; + void check(const ast_matchers::MatchFinder::MatchResult &Result) override; + bool isLanguageVersionSupported(const LangOptions &LangOpts) const override { + return LangOpts.CPlusPlus11; + } +}; + +} // namespace clang::tidy::bugprone + +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_INCORRECTENABLESHAREDFROMTHISCHECK_H diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 08156325369e6..375de831f0e11 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -145,6 +145,13 @@ New checks Warns about code that tries to cast between pointers by means of ``std::bit_cast`` or ``memcpy``. +- New :doc:`bugprone-incorrect-enable-shared-from-this + <clang-tidy/checks/bugprone/incorrect-enable-shared-from-this>` check. + + Detect classes or structs that do not publicly inherit from + ``std::enable_shared_from_this``, because unintended behavior will + otherwise occur when calling ``shared_from_this``. + - New :doc:`bugprone-nondeterministic-pointer-iteration-order <clang-tidy/checks/bugprone/nondeterministic-pointer-iteration-order>` check. diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/incorrect-enable-shared-from-this.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/incorrect-enable-shared-from-this.rst new file mode 100644 index 0000000000000..cc9e7be70f6ea --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/incorrect-enable-shared-from-this.rst @@ -0,0 +1,34 @@ +.. title:: clang-tidy - bugprone-incorrect-enable-shared-from-this + +bugprone-incorrect-enable-shared-from-this +========================================== + +Detect classes or structs that do not publicly inherit from +``std::enable_shared_from_this``, because unintended behavior will +otherwise occur when calling ``shared_from_this``. + +Consider the following code: + +.. code-block:: c++ + + #include <memory> + + // private inheritance + class BadExample : std::enable_shared_from_this<BadExample> { + + // ``shared_from_this`` unintended behaviour + // `libstdc++` implementation returns uninitialized ``weak_ptr`` + public: + BadExample* foo() { return shared_from_this().get(); } + void bar() { return; } + }; + + void using_not_public() { + auto bad_example = std::make_shared<BadExample>(); + auto* b_ex = bad_example->foo(); + b_ex->bar(); + } + +Using the `libstdc++` implementation, ``shared_from_this`` will throw +``std::bad_weak_ptr``. When ``using_not_public()`` is called, this code will +crash without exception handling. 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst index e8f9b4e829634..7b9b905ef7671 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/list.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst @@ -101,6 +101,7 @@ Clang-Tidy Checks :doc:`bugprone-inaccurate-erase `, "Yes" :doc:`bugprone-inc-dec-in-conditions `, :doc:`bugprone-incorrect-enable-if `, "Yes" + :doc:`bugprone-incorrect-enable-shared-from-this `, "Yes" :doc:`bugprone-incorrect-roundings `, :doc:`bugprone-infinite-loop `, :doc:`bugprone-integer-division `, diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/incorrect-enable-shared-from-this.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/incorrect-enable-shared-from-this.cpp new file mode 100644 index 0000000000000..d9048ef359281 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/incorrect-enable-shared-from-this.cpp @@ -0,0 +1,180 @@ +// RUN: %check_clang_tidy -std=c++11-or-later %s bugprone-incorrect-enable-shared-from-this %t + +// NOLINTBEGIN +namespace std { + template class enable_shared_from_this {}; +} //namespace std +// NOLINTEND + +class BadClassExample : std::enable_shared_from_this {}; +// CHECK-MESSAGES: :[[@LINE-1]]:7: warning: 'BadClassExample' is not publicly inheriting from 'std::enable_shared_from_this', which will cause unintended behaviour when using 'shared_from_this'; make the inheritance public [bugprone-incorrect-enable-shared-from-this] +// CHECK-FIXES: public std::enable_shared_from_this + +class BadClass2Example : private std::enable_shared_from_this {}; +// CHECK-MESSAGES: :[[@LINE-1]]:7: warning: 'BadClass2Example' is not publicly inheriting from 'std::enable_shared_from_this', which will cause unintended behaviour when using 'shared_from_this'; make the inheritance public [bugprone-incorrect-enable-shared-from-this] +// CHECK-FIXES: public std::enable_shared_from_this + +struct BadStructExample : private std::enable_shared_from_this {}; +// CHECK-MESSAGES: :[[@LINE-1]]:8: warning: 'BadStructExample' is not publicly inheriting from 'std::enable_shared_from_this', which will cause unintended behaviour when using 'shared_from_this'; make the inheritance public [bugprone-incorrect-enable-shared-from-this] +// CHECK-FIXES: public std::enable_shared_from_this + +class GoodClassExample : public std::enable_shared_from_this {}; + +struct GoodStructExample : public std::enable_shared_from_this {}; + +struct GoodStruct2Example : std::enable_shared_from_this {}; + +class dummy_class1 {}; +class dummy_class2 {}; + +class BadMultiClassExample : std::enable_shared_from_this, dummy_class1 {}; +// CHECK-MESSAGES: :[[@LINE-1]]:7: warning: 'BadMultiClassExample' is not publicly inheriting from 'std::enable_shared_from_this', which will cause unintended behaviour when using 'shared_from_this'; make the inheritance public [bugprone-incorrect-enable-shared-from-this] +// CHECK-FIXES: public std::enable_shared_from_this, dummy_class1 + +class BadMultiClass2Example : dummy_class1, std::enable_shared_from_this, dummy_class2 {}; +// CHECK-MESSAGES: :[[@LINE-1]]:7: warning: 'BadMultiClass2Example' is not publicly inheriting from 'std::enable_shared_from_this', which will cause unintended behaviour when using 'shared_from_this'; make the inheritance public [bugprone-incorrect-enable-shared-from-this] +// CHECK-FIXES: dummy_class1, public std::enable_shared_from_this, dummy_class2 + +class BadMultiClass3Example : dummy_class1, dummy_class2, 
std::enable_shared_from_this {}; +// CHECK-MESSAGES: :[[@LINE-1]]:7: warning: 'BadMultiClass3Example' is not publicly inheriting from 'std::enable_shared_from_this', which will cause unintended behaviour when using 'shared_from_this'; make the inheritance public [bugprone-incorrect-enable-shared-from-this] +// CHECK-FIXES: dummy_class1, dummy_class2, public std::enable_shared_from_this + +class ClassBase : public std::enable_shared_from_this {}; +class PrivateInheritClassBase : private ClassBase {}; +// CHECK-MESSAGES: :[[@LINE-1]]:7: warning: 'PrivateInheritClassBase' is not publicly inheriting from 'ClassBase' which inherits from 'std::enable_shared_from_this', which will cause unintended behaviour when using 'shared_from_this'; make the inheritance public [bugprone-incorrect-enable-shared-from-this] + +class DefaultInheritClassBase : ClassBase {}; +// CHECK-MESSAGES: :[[@LINE-1]]:7: warning: 'DefaultInheritClassBase' is not publicly inheriting from 'ClassBase' which inherits from 'std::enable_shared_from_this', which will cause unintended behaviour when using 'shared_from_this'; make the inheritance public [bugprone-incorrect-enable-shared-from-this] + +class PublicInheritClassBase : public ClassBase {}; + +struct StructBase : public std::enable_shared_from_this {}; +struct PrivateInheritStructBase : private StructBase {}; +// CHECK-MESSAGES: :[[@LINE-1]]:8: warning: 'PrivateInheritStructBase' is not publicly inheriting from 'StructBase' which inherits from 'std::enable_shared_from_this', which will cause unintended behaviour when using 'shared_from_this'; make the inheritance public [bugprone-incorrect-enable-shared-from-this] + +struct DefaultInheritStructBase : StructBase {}; + +struct PublicInheritStructBase : StructBase {}; + +//alias the template itself +template using esft_template = std::enable_shared_from_this; + +class PrivateAliasTemplateClassBase : private esft_template {}; +// CHECK-MESSAGES: :[[@LINE-1]]:7: warning: 'PrivateAliasTemplateClassBase' is not publicly inheriting from 'std::enable_shared_from_this', which will cause unintended behaviour when using 'shared_from_this'; make the inheritance public [bugprone-incorrect-enable-shared-from-this] +// CHECK-FIXES: class PrivateAliasTemplateClassBase : public esft_template {}; + +class DefaultAliasTemplateClassBase : esft_template {}; +// CHECK-MESSAGES: :[[@LINE-1]]:7: warning: 'DefaultAliasTemplateClassBase' is not publicly inheriting from 'std::enable_shared_from_this', which will cause unintended behaviour when using 'shared_from_this'; make the inheritance public [bugprone-incorrect-enable-shared-from-this] +// CHECK-FIXES: class DefaultAliasTemplateClassBase : public esft_template {}; + +class PublicAliasTemplateClassBase : public esft_template {}; + +struct PrivateAliasTemplateStructBase : private esft_template {}; +// CHECK-MESSAGES: :[[@LINE-1]]:8: warning: 'PrivateAliasTemplateStructBase' is not publicly inheriting from 'std::enable_shared_from_this', which will cause unintended behaviour when using 'shared_from_this'; make the inheritance public [bugprone-incorrect-enable-shared-from-this] +// CHECK-FIXES: struct PrivateAliasTemplateStructBase : public esft_template {}; + +struct DefaultAliasTemplateStructBase : esft_template {}; + +struct PublicAliasTemplateStructBase : public esft_template {}; + +//alias with specific instance +using esft = std::enable_shared_from_this; +class PrivateAliasClassBase : private esft {}; +// CHECK-MESSAGES: :[[@LINE-1]]:7: warning: 'PrivateAliasClassBase' is not publicly 
inheriting from 'std::enable_shared_from_this', which will cause unintended behaviour when using 'shared_from_this'; make the inheritance public [bugprone-incorrect-enable-shared-from-this] +// CHECK-FIXES: class PrivateAliasClassBase : public esft {}; + +class DefaultAliasClassBase : esft {}; +// CHECK-MESSAGES: :[[@LINE-1]]:7: warning: 'DefaultAliasClassBase' is not publicly inheriting from 'std::enable_shared_from_this', which will cause unintended behaviour when using 'shared_from_this'; make the inheritance public [bugprone-incorrect-enable-shared-from-this] +// CHECK-FIXES: class DefaultAliasClassBase : public esft {}; + +class PublicAliasClassBase : public esft {}; + +struct PrivateAliasStructBase : private esft {}; +// CHECK-MESSAGES: :[[@LINE-1]]:8: warning: 'PrivateAliasStructBase' is not publicly inheriting from 'std::enable_shared_from_this', which will cause unintended behaviour when using 'shared_from_this'; make the inheritance public [bugprone-incorrect-enable-shared-from-this] +// CHECK-FIXES: struct PrivateAliasStructBase : public esft {}; + +struct DefaultAliasStructBase : esft {}; + +struct PublicAliasStructBase : public esft {}; + +//we can only typedef a specific instance of the template +typedef std::enable_shared_from_this EnableSharedFromThis; +class PrivateTypedefClassBase : private EnableSharedFromThis {}; +// CHECK-MESSAGES: :[[@LINE-1]]:7: warning: 'PrivateTypedefClassBase' is not publicly inheriting from 'std::enable_shared_from_this', which will cause unintended behaviour when using 'shared_from_this'; make the inheritance public [bugprone-incorrect-enable-shared-from-this] +// CHECK-FIXES: class PrivateTypedefClassBase : public EnableSharedFromThis {}; + +class DefaultTypedefClassBase : EnableSharedFromThis {}; +// CHECK-MESSAGES: :[[@LINE-1]]:7: warning: 'DefaultTypedefClassBase' is not publicly inheriting from 'std::enable_shared_from_this', which will cause unintended behaviour when using 'shared_from_this'; make the inheritance public [bugprone-incorrect-enable-shared-from-this] +// CHECK-FIXES: class DefaultTypedefClassBase : public EnableSharedFromThis {}; + +class PublicTypedefClassBase : public EnableSharedFromThis {}; + +struct PrivateTypedefStructBase : private EnableSharedFromThis {}; +// CHECK-MESSAGES: :[[@LINE-1]]:8: warning: 'PrivateTypedefStructBase' is not publicly inheriting from 'std::enable_shared_from_this', which will cause unintended behaviour when using 'shared_from_this'; make the inheritance public [bugprone-incorrect-enable-shared-from-this] +// CHECK-FIXES: struct PrivateTypedefStructBase : public EnableSharedFromThis {}; + +struct DefaultTypedefStructBase : EnableSharedFromThis {}; + +struct PublicTypedefStructBase : public EnableSharedFromThis {}; + +#define PRIVATE_ESFT_CLASS(ClassName) \ + class ClassName: private std::enable_shared_from_this { \ + }; + +PRIVATE_ESFT_CLASS(PrivateEsftClass); +// CHECK-MESSAGES: :[[@LINE-1]]:20: warning: 'PrivateEsftClass' is not publicly inheriting from 'std::enable_shared_from_this', which will cause unintended behaviour when using 'shared_from_this'; make the inheritance public [bugprone-incorrect-enable-shared-from-this] + +#define DEFAULT_ESFT_CLASS(ClassName) \ + class ClassName: std::enable_shared_from_this { \ + }; + +DEFAULT_ESFT_CLASS(DefaultEsftClass); +// CHECK-MESSAGES: :[[@LINE-1]]:20: warning: 'DefaultEsftClass' is not publicly inheriting from 'std::enable_shared_from_this', which will cause unintended behaviour when using 'shared_from_this'; make the inheritance public 
[bugprone-incorrect-enable-shared-from-this] + +#define PUBLIC_ESFT_CLASS(ClassName) \ + class ClassName: public std::enable_shared_from_this { \ + }; + +PUBLIC_ESFT_CLASS(PublicEsftClass); + +#define PRIVATE_ESFT_STRUCT(StructName) \ + struct StructName: private std::enable_shared_from_this { \ + }; + +PRIVATE_ESFT_STRUCT(PrivateEsftStruct); +// CHECK-MESSAGES: :[[@LINE-1]]:21: warning: 'PrivateEsftStruct' is not publicly inheriting from 'std::enable_shared_from_this', which will cause unintended behaviour when using 'shared_from_this'; make the inheritance public [bugprone-incorrect-enable-shared-from-this] + +#define DEFAULT_ESFT_STRUCT(StructName) \ + struct StructName: std::enable_shared_from_this { \ + }; + +DEFAULT_ESFT_STRUCT(DefaultEsftStruct); + +#define PUBLIC_ESFT_STRUCT(StructName) \ + struct StructName: std::enable_shared_from_this { \ + }; + +PUBLIC_ESFT_STRUCT(PublicEsftStruct); + +struct A : std::enable_shared_from_this {}; +#define MACRO_A A + +class B : MACRO_A {}; +// CHECK-MESSAGES: :[[@LINE-1]]:7: warning: 'B' is not publicly inheriting from 'A' which inherits from 'std::enable_shared_from_this', which will cause unintended behaviour when using 'shared_from_this'; make the inheritance public [bugprone-incorrect-enable-shared-from-this] + +class C : private MACRO_A {}; +// CHECK-MESSAGES: :[[@LINE-1]]:7: warning: 'C' is not publicly inheriting from 'A' which inherits from 'std::enable_shared_from_this', which will cause unintended behaviour when using 'shared_from_this'; make the inheritance public [bugprone-incorrect-enable-shared-from-this] + +class D : public MACRO_A {}; + +#define MACRO_PARAM(CLASS) std::enable_shared_from_this + +class E : MACRO_PARAM(E) {}; +// CHECK-MESSAGES: :[[@LINE-1]]:7: warning: 'E' is not publicly inheriting from 'std::enable_shared_from_this', which will cause unintended behaviour when using 'shared_from_this'; make the inheritance public [bugprone-incorrect-enable-shared-from-this] +// CHECK-FIXES: class E : public MACRO_PARAM(E) {}; + +class F : private MACRO_PARAM(F) {}; +// CHECK-MESSAGES: :[[@LINE-1]]:7: warning: 'F' is not publicly inheriting from 'std::enable_shared_from_this', which will cause unintended behaviour when using 'shared_from_this'; make the inheritance public [bugprone-incorrect-enable-shared-from-this] +// CHECK-FIXES: class F : public MACRO_PARAM(F) {}; + +class G : public MACRO_PARAM(G) {}; From 753295835544f16b4e180576aa020680231c4318 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Sun, 12 Jan 2025 10:05:24 +0000 Subject: [PATCH 196/408] [gn build] Port 8ebc35f8d041 --- .../gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn index 670f24c242a89..223f273853d4f 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn @@ -41,6 +41,7 @@ static_library("bugprone") { "InaccurateEraseCheck.cpp", "IncDecInConditionsCheck.cpp", "IncorrectEnableIfCheck.cpp", + "IncorrectEnableSharedFromThisCheck.cpp", "IncorrectRoundingsCheck.cpp", "InfiniteLoopCheck.cpp", "IntegerDivisionCheck.cpp", From 6422546e996c769dda39a681da090fe28870a376 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Sun, 12 Jan 2025 15:17:12 +0100 Subject: [PATCH 197/408] [mlir][LLVM] Fix conversion of non-standard MLIR float types (#122634) Certain 
non-standard float types were directly passed through in the LLVM type converter, resulting in invalid IR or failed assertions: ``` mlir-opt: mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp:638: FailureOr mlir::LLVMTypeConverter::convertVectorType(VectorType) const: Assertion `LLVM::isCompatibleVectorType(vectorType) && "expected vector type compatible with the LLVM dialect"' failed. ``` The LLVM type converter should not define invalid type conversion rules for such types. If there is no type conversion rule, conversion patterns will not apply to ops with such operand types. --- .../Conversion/LLVMCommon/TypeConverter.cpp | 10 ++++- .../Conversion/ArithToLLVM/arith-to-llvm.mlir | 45 +++++++++++++++++++ .../Conversion/FuncToLLVM/func-to-llvm.mlir | 8 ++++ 3 files changed, 62 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp index 72799e42cf3fd..64bdb248dff43 100644 --- a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp +++ b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp @@ -294,13 +294,21 @@ Type LLVMTypeConverter::convertIntegerType(IntegerType type) const { } Type LLVMTypeConverter::convertFloatType(FloatType type) const { + // Valid LLVM float types are used directly. + if (LLVM::isCompatibleType(type)) + return type; + + // F4, F6, F8 types are converted to integer types with the same bit width. if (type.isFloat8E5M2() || type.isFloat8E4M3() || type.isFloat8E4M3FN() || type.isFloat8E5M2FNUZ() || type.isFloat8E4M3FNUZ() || type.isFloat8E4M3B11FNUZ() || type.isFloat8E3M4() || type.isFloat4E2M1FN() || type.isFloat6E2M3FN() || type.isFloat6E3M2FN() || type.isFloat8E8M0FNU()) return IntegerType::get(&getContext(), type.getWidth()); - return type; + + // Other floating-point types: A custom type conversion rule must be + // specified by the user. + return Type(); } // Convert a `ComplexType` to an LLVM type. The result is a complex number diff --git a/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir b/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir index a9dcc0a16b3db..1dabacfd8a47c 100644 --- a/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir +++ b/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir @@ -37,6 +37,8 @@ func.func @vector_ops(%arg0: vector<4xf32>, %arg1: vector<4xi1>, %arg2: vector<4 return %1 : vector<4xf32> } +// ----- + // CHECK-LABEL: @ops func.func @ops(f32, f32, i32, i32, f64) -> (f32, i32) { ^bb0(%arg0: f32, %arg1: f32, %arg2: i32, %arg3: i32, %arg4: f64): @@ -84,9 +86,14 @@ func.func @ops(f32, f32, i32, i32, f64) -> (f32, i32) { %20 = arith.shrsi %arg2, %arg3 : i32 // CHECK: = llvm.lshr %arg2, %arg3 : i32 %21 = arith.shrui %arg2, %arg3 : i32 +// CHECK: arith.constant 2.000000e+00 : tf32 + // There is no type conversion rule for tf32. + %22 = arith.constant 2.0 : tf32 return %0, %10 : f32, i32 } +// ----- + // Checking conversion of index types to integers using i1, assuming no target // system would have a 1-bit address space. Otherwise, we would have had to // make this test dependent on the pointer size on the target system. 
@@ -99,6 +106,8 @@ func.func @index_cast(%arg0: index, %arg1: i1) { return } +// ----- + // CHECK-LABEL: @vector_index_cast func.func @vector_index_cast(%arg0: vector<2xindex>, %arg1: vector<2xi1>) { // CHECK: = llvm.trunc %{{.*}} : vector<2xi{{.*}}> to vector<2xi1> @@ -108,6 +117,8 @@ func.func @vector_index_cast(%arg0: vector<2xindex>, %arg1: vector<2xi1>) { return } +// ----- + func.func @index_castui(%arg0: index, %arg1: i1) { // CHECK: = llvm.trunc %0 : i{{.*}} to i1 %0 = arith.index_castui %arg0: index to i1 @@ -116,6 +127,8 @@ func.func @index_castui(%arg0: index, %arg1: i1) { return } +// ----- + // CHECK-LABEL: @vector_index_castui func.func @vector_index_castui(%arg0: vector<2xindex>, %arg1: vector<2xi1>) { // CHECK: = llvm.trunc %{{.*}} : vector<2xi{{.*}}> to vector<2xi1> @@ -125,6 +138,8 @@ func.func @vector_index_castui(%arg0: vector<2xindex>, %arg1: vector<2xi1>) { return } +// ----- + // Checking conversion of signed integer types to floating point. // CHECK-LABEL: @sitofp func.func @sitofp(%arg0 : i32, %arg1 : i64) { @@ -139,6 +154,8 @@ func.func @sitofp(%arg0 : i32, %arg1 : i64) { return } +// ----- + // Checking conversion of integer vectors to floating point vector types. // CHECK-LABEL: @sitofp_vector func.func @sitofp_vector(%arg0 : vector<2xi16>, %arg1 : vector<2xi32>, %arg2 : vector<2xi64>) { @@ -157,6 +174,8 @@ func.func @sitofp_vector(%arg0 : vector<2xi16>, %arg1 : vector<2xi32>, %arg2 : v return } +// ----- + // Checking conversion of unsigned integer types to floating point. // CHECK-LABEL: @uitofp func.func @uitofp(%arg0 : i32, %arg1 : i64) { @@ -171,6 +190,8 @@ func.func @uitofp(%arg0 : i32, %arg1 : i64) { return } +// ----- + // Checking conversion of integer types to floating point. // CHECK-LABEL: @fpext func.func @fpext(%arg0 : f16, %arg1 : f32) { @@ -183,6 +204,8 @@ func.func @fpext(%arg0 : f16, %arg1 : f32) { return } +// ----- + // Checking conversion of integer types to floating point. // CHECK-LABEL: @fpext func.func @fpext_vector(%arg0 : vector<2xf16>, %arg1 : vector<2xf32>) { @@ -195,6 +218,8 @@ func.func @fpext_vector(%arg0 : vector<2xf16>, %arg1 : vector<2xf32>) { return } +// ----- + // Checking conversion of floating point to integer types. // CHECK-LABEL: @fptosi func.func @fptosi(%arg0 : f32, %arg1 : f64) { @@ -209,6 +234,8 @@ func.func @fptosi(%arg0 : f32, %arg1 : f64) { return } +// ----- + // Checking conversion of floating point vectors to integer vector types. // CHECK-LABEL: @fptosi_vector func.func @fptosi_vector(%arg0 : vector<2xf16>, %arg1 : vector<2xf32>, %arg2 : vector<2xf64>) { @@ -227,6 +254,8 @@ func.func @fptosi_vector(%arg0 : vector<2xf16>, %arg1 : vector<2xf32>, %arg2 : v return } +// ----- + // Checking conversion of floating point to integer types. // CHECK-LABEL: @fptoui func.func @fptoui(%arg0 : f32, %arg1 : f64) { @@ -241,6 +270,8 @@ func.func @fptoui(%arg0 : f32, %arg1 : f64) { return } +// ----- + // Checking conversion of floating point vectors to integer vector types. // CHECK-LABEL: @fptoui_vector func.func @fptoui_vector(%arg0 : vector<2xf16>, %arg1 : vector<2xf32>, %arg2 : vector<2xf64>) { @@ -259,6 +290,8 @@ func.func @fptoui_vector(%arg0 : vector<2xf16>, %arg1 : vector<2xf32>, %arg2 : v return } +// ----- + // Checking conversion of integer vectors to floating point vector types. 
// CHECK-LABEL: @uitofp_vector func.func @uitofp_vector(%arg0 : vector<2xi16>, %arg1 : vector<2xi32>, %arg2 : vector<2xi64>) { @@ -277,6 +310,8 @@ func.func @uitofp_vector(%arg0 : vector<2xi16>, %arg1 : vector<2xi32>, %arg2 : v return } +// ----- + // Checking conversion of integer types to floating point. // CHECK-LABEL: @fptrunc func.func @fptrunc(%arg0 : f32, %arg1 : f64) { @@ -289,6 +324,8 @@ return } +// ----- + // Checking conversion of integer types to floating point. // CHECK-LABEL: @fptrunc func.func @fptrunc_vector(%arg0 : vector<2xf32>, %arg1 : vector<2xf64>) { @@ -301,6 +338,8 @@ return } +// ----- + // CHECK-LABEL: experimental_constrained_fptrunc func.func @experimental_constrained_fptrunc(%arg0 : f64) { // CHECK-NEXT: = llvm.intr.experimental.constrained.fptrunc {{.*}} tonearest ignore : f64 to f32 @@ -316,6 +355,8 @@ return } +// ----- + // Check sign and zero extension and truncation of integers. // CHECK-LABEL: @integer_extension_and_truncation func.func @integer_extension_and_truncation(%arg0 : i3) { @@ -328,6 +369,8 @@ return } +// ----- + // CHECK-LABEL: @integer_cast_0d_vector func.func @integer_cast_0d_vector(%arg0 : vector) { // CHECK: %[[ARG0:.*]] = builtin.unrealized_conversion_cast @@ -340,6 +383,8 @@ return } +// ----- + // CHECK-LABEL: func @fcmp(%arg0: f32, %arg1: f32) { func.func @fcmp(f32, f32) -> () { ^bb0(%arg0: f32, %arg1: f32): diff --git a/mlir/test/Conversion/FuncToLLVM/func-to-llvm.mlir b/mlir/test/Conversion/FuncToLLVM/func-to-llvm.mlir index 8396e5ad8ade1..22ac6eae73f53 100644 --- a/mlir/test/Conversion/FuncToLLVM/func-to-llvm.mlir +++ b/mlir/test/Conversion/FuncToLLVM/func-to-llvm.mlir @@ -555,6 +555,14 @@ func.func @index_arg(%arg0: index) -> index { return %arg1 : index } +// There is no type conversion rule for tf32, so vector<1xtf32> and, therefore, +// the func op cannot be converted. +// CHECK: func.func @non_convertible_arg_type({{.*}}: vector<1xtf32>) +// CHECK: llvm.return +func.func @non_convertible_arg_type(%arg: vector<1xtf32>) { + return +} + module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%toplevel_module: !transform.any_op {transform.readonly}) { %func = transform.structured.match ops{["func.func"]} in %toplevel_module From d080f78772acf9de4961b89062c02fdd5f82186a Mon Sep 17 00:00:00 2001 From: TilakChad <49703944+TilakChad@users.noreply.github.com> Date: Sun, 12 Jan 2025 20:27:04 +0545 Subject: [PATCH 198/408] [Clang] Fixed a crash when __PRETTY_FUNCTION__ or __FUNCSIG__ (clang-cl) appears in the trailing return type of the lambda (#122611) The (function) type of the lambda is null while parsing the trailing return type; it is filled in only once the lambda body is entered. So, resolving `__PRETTY_FUNCTION__` before the lambda body is entered causes the crash. Fixes #121274. 
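A reduced reproducer (mirroring the regression test added below) that previously crashed the compiler:

```
void foo() {
  // The lambda's function type is still null at this point, so ComputeName
  // used to crash; with this patch it returns an empty name instead.
  []() -> decltype(static_cast<const char *>(__PRETTY_FUNCTION__)) {
    return nullptr;
  }();
}
```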
--- clang/docs/ReleaseNotes.rst | 1 + clang/lib/AST/Expr.cpp | 10 +++++++++- clang/test/SemaCXX/crash-GH121274.cpp | 15 +++++++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 clang/test/SemaCXX/crash-GH121274.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 197b3692b8a18..a14fb189c8e13 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -926,6 +926,7 @@ Bug Fixes to C++ Support (`LWG3929 `__.) (#GH121278) - Clang now identifies unexpanded parameter packs within the type constraint on a non-type template parameter. (#GH88866) - Fixed an issue while resolving type of expression indexing into a pack of values of non-dependent type (#GH121242) +- Fixed a crash when __PRETTY_FUNCTION__ or __FUNCSIG__ (clang-cl) appears in the trailing return type of the lambda (#GH121274) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index 5331357b5d1fe..f6a4ed970cb23 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -774,7 +774,15 @@ std::string PredefinedExpr::ComputeName(PredefinedIdentKind IK, const FunctionDecl *Decl = FD; if (const FunctionDecl* Pattern = FD->getTemplateInstantiationPattern()) Decl = Pattern; - const FunctionType *AFT = Decl->getType()->getAs<FunctionType>(); + + // Bail out if the type of the function has not been set yet. + // This can notably happen in the trailing return type of a lambda + // expression. + const Type *Ty = Decl->getType().getTypePtrOrNull(); + if (!Ty) + return ""; + + const FunctionType *AFT = Ty->getAs<FunctionType>(); const FunctionProtoType *FT = nullptr; if (FD->hasWrittenPrototype()) FT = dyn_cast<FunctionProtoType>(AFT); diff --git a/clang/test/SemaCXX/crash-GH121274.cpp b/clang/test/SemaCXX/crash-GH121274.cpp new file mode 100644 index 0000000000000..28677a0949bf9 --- /dev/null +++ b/clang/test/SemaCXX/crash-GH121274.cpp @@ -0,0 +1,15 @@ +// RUN: %clang_cc1 -std=c++11 -verify %s +// expected-no-diagnostics + +// Do not crash when __PRETTY_FUNCTION__ appears in the trailing return type of the lambda +void foo() { + []() -> decltype(static_cast<const char *>(__PRETTY_FUNCTION__)) { + return nullptr; + }(); + +#ifdef MS + []() -> decltype(static_cast<const char *>(__FUNCSIG__)) { + return nullptr; + }(); +#endif +} From 26b4a0ac7ed3f04f10bd1c043e2cf9c52da7fc47 Mon Sep 17 00:00:00 2001 From: eleviant <56861949+eleviant@users.noreply.github.com> Date: Sun, 12 Jan 2025 16:18:26 +0100 Subject: [PATCH 199/408] Add 'unifiedlto' option to gold plugin (#121336) The option allows using full LTO when linking bitcode files compiled with the unified LTO pipeline. 
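For example (illustrative invocations; the plugin path and input files depend on the local setup):

```
ld.gold -plugin LLVMgold.so -plugin-opt=unifiedlto a.bc b.bc -o a.out
ld.gold -plugin LLVMgold.so -plugin-opt=unifiedlto -plugin-opt=thinlto a.bc b.bc -o a.out
```

The first link runs the regular (full) LTO pipeline and the second runs the ThinLTO pipeline over the same unified-LTO bitcode.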
--- .../tools/gold/X86/Inputs/unified-lto-foo.ll | 26 +++++++++ llvm/test/tools/gold/X86/unified-lto.ll | 57 +++++++++++++++++++ llvm/tools/gold/gold-plugin.cpp | 12 +++- 3 files changed, 94 insertions(+), 1 deletion(-) create mode 100644 llvm/test/tools/gold/X86/Inputs/unified-lto-foo.ll create mode 100644 llvm/test/tools/gold/X86/unified-lto.ll diff --git a/llvm/test/tools/gold/X86/Inputs/unified-lto-foo.ll b/llvm/test/tools/gold/X86/Inputs/unified-lto-foo.ll new file mode 100644 index 0000000000000..cd6cb16cd332c --- /dev/null +++ b/llvm/test/tools/gold/X86/Inputs/unified-lto-foo.ll @@ -0,0 +1,26 @@ +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-linux-gnu" + +@_g = dso_local local_unnamed_addr global i32 0, align 4 +@llvm.compiler.used = appending global [1 x ptr] [ptr @_g], section "llvm.metadata" + +define dso_local i32 @foo(i32 noundef %0) #0 { + %2 = add nsw i32 %0, 42 + store i32 %2, ptr @_g, align 4 + ret i32 %2 +} + +attributes #0 = { noinline } + + +!llvm.module.flags = !{!0, !1} + +!0 = !{i32 1, !"EnableSplitLTOUnit", i32 1} +!1 = !{i32 1, !"UnifiedLTO", i32 1} + +^0 = module: (path: "unified-lto-foo.o", hash: (1995435377, 1643957463, 3737953841, 2630641917, 1043992145)) +^1 = gv: (name: "foo", summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 3, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 1, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), refs: (writeonly ^2)))) ; guid = 6699318081062747564 +^2 = gv: (name: "_g", summaries: (variable: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), varFlags: (readonly: 1, writeonly: 1, constant: 0)))) ; guid = 9713702464056781075 +^3 = gv: (name: "llvm.compiler.used", summaries: (variable: (module: ^0, flags: (linkage: appending, visibility: default, notEligibleToImport: 0, live: 1, dsoLocal: 0, canAutoHide: 0), varFlags: (readonly: 0, writeonly: 0, constant: 0), refs: (^3)))) ; guid = 9610627770985738006 +^4 = flags: 520 +^5 = blockcount: 0 diff --git a/llvm/test/tools/gold/X86/unified-lto.ll b/llvm/test/tools/gold/X86/unified-lto.ll new file mode 100644 index 0000000000000..e5030e863a64a --- /dev/null +++ b/llvm/test/tools/gold/X86/unified-lto.ll @@ -0,0 +1,57 @@ +; Check that we can use full LTO with gold plugin when inputs +; are compiled using unified LTO pipeline +; RUN: llvm-as %s -o %t.bc +; RUN: llvm-as %p/Inputs/unified-lto-foo.ll -o %t-foo.bc +; RUN: %gold -plugin %llvmshlibdir/LLVMgold%shlibext \ +; RUN: -m elf_x86_64 \ +; RUN: -plugin-opt=unifiedlto \ +; RUN: -plugin-opt=save-temps \ +; RUN: -u main \ +; RUN: %t.bc %t-foo.bc \ +; RUN: -o %t-out +; RUN: llvm-dis %t-out.0.5.precodegen.bc -o - | FileCheck %s + +; Check thin LTO as well +; RUN: %gold -plugin %llvmshlibdir/LLVMgold%shlibext \ +; RUN: -m elf_x86_64 \ +; RUN: -plugin-opt=unifiedlto \ +; RUN: -plugin-opt=thinlto \ +; RUN: -plugin-opt=save-temps \ +; RUN: -u main \ +; RUN: %t.bc %t-foo.bc \ +; RUN: -o %t-out +; RUN: llvm-dis %t.bc.5.precodegen.bc -o - | FileCheck %s --check-prefix=THIN + +; Constant propagation is not supported by thin LTO. 
+; With full LTO we fold argument into constant 43 +; CHECK: define dso_local noundef i32 @main() +; CHECK-NEXT: tail call fastcc void @foo() +; CHECK-NEXT: ret i32 43 + +; CHECK: define internal fastcc void @foo() +; CHECK-NEXT: store i32 43, ptr @_g, align 4 + +; ThinLTO doesn't import foo, because the latter has noinline attribute +; THIN: define dso_local i32 @main() +; THIN-NEXT: %1 = tail call i32 @foo(i32 noundef 1) + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-linux-gnu" + +define dso_local i32 @main() { + %1 = tail call i32 @foo(i32 noundef 1) + ret i32 %1 +} + +declare i32 @foo(i32 noundef) + +!llvm.module.flags = !{!0, !1} + +!0 = !{i32 1, !"EnableSplitLTOUnit", i32 1} +!1 = !{i32 1, !"UnifiedLTO", i32 1} + +^0 = module: (path: "unified-lto.o", hash: (2850108895, 1189778381, 479678006, 1191715608, 4095117687)) +^1 = gv: (name: "foo") ; guid = 6699318081062747564 +^2 = gv: (name: "main", summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 2, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 0, noInline: 0, alwaysInline: 0, noUnwind: 1, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^1, tail: 1))))) ; guid = 15822663052811949562 +^3 = flags: 520 +^4 = blockcount: 0 diff --git a/llvm/tools/gold/gold-plugin.cpp b/llvm/tools/gold/gold-plugin.cpp index 6d0021c85f20f..ac2e9d4252aa3 100644 --- a/llvm/tools/gold/gold-plugin.cpp +++ b/llvm/tools/gold/gold-plugin.cpp @@ -154,6 +154,8 @@ namespace options { static std::string extra_library_path; static std::string triple; static std::string mcpu; + // Tells plugin to use unified lto + static bool unifiedlto = false; // When the thinlto plugin option is specified, only read the function // the information from intermediate files and write a combined // global index for the ThinLTO backends. @@ -248,6 +250,8 @@ namespace options { TheOutputType = OT_DISABLE; } else if (opt == "emit-asm") { TheOutputType = OT_ASM_ONLY; + } else if (opt == "unifiedlto") { + unifiedlto = true; } else if (opt == "thinlto") { thinlto = true; } else if (opt == "thinlto-index-only") { @@ -893,6 +897,7 @@ static std::unique_ptr createLTO(IndexWriteCallback OnIndexWrite, Conf.OptLevel = options::OptLevel; Conf.PTO.LoopVectorization = options::OptLevel > 1; Conf.PTO.SLPVectorization = options::OptLevel > 1; + Conf.PTO.UnifiedLTO = options::unifiedlto; Conf.AlwaysEmitRegularLTOObj = !options::obj_path.empty(); if (options::thinlto_index_only) { @@ -972,8 +977,13 @@ static std::unique_ptr createLTO(IndexWriteCallback OnIndexWrite, Conf.TimeTraceEnabled = !options::time_trace_file.empty(); Conf.TimeTraceGranularity = options::time_trace_granularity; + LTO::LTOKind ltoKind = LTO::LTOK_Default; + if (options::unifiedlto) + ltoKind = + options::thinlto ? LTO::LTOK_UnifiedThin : LTO::LTOK_UnifiedRegular; return std::make_unique(std::move(Conf), Backend, - options::ParallelCodeGenParallelismLevel); + options::ParallelCodeGenParallelismLevel, + ltoKind); } // Write empty files that may be expected by a distributed build From 66badf224ade6e78d5da005f6a9819092fd8767b Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Sun, 12 Jan 2025 15:19:29 +0000 Subject: [PATCH 200/408] VT: teach a special-case optz about samesign (#122590) There is a narrow special-case in isImpliedCondICmps that can benefit from being taught about samesign. 
Since it costs us nothing to implement it, teach it about samesign, for completeness. This patch marks the completion of the effort to teach ValueTracking about samesign. --- llvm/lib/Analysis/ValueTracking.cpp | 6 ++++-- .../ValueTracking/implied-condition-samesign.ll | 10 ++-------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 53da8d2776a22..0e50fc60ce792 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -9495,7 +9495,8 @@ isImpliedCondICmps(const ICmpInst *LHS, CmpPredicate RPred, const Value *R0, // must be positive if X >= Y and no overflow". // Take SGT as an example: L0:x > L1:y and C >= 0 // ==> R0:(x -nsw y) < R1:(-C) is false - if ((LPred == ICmpInst::ICMP_SGT || LPred == ICmpInst::ICMP_SGE) && + if ((CmpPredicate::getMatching(LPred, ICmpInst::ICMP_SGT) || + CmpPredicate::getMatching(LPred, ICmpInst::ICMP_SGE)) && match(R0, m_NSWSub(m_Specific(L0), m_Specific(L1)))) { if (match(R1, m_NonPositive()) && isImpliedCondMatchingOperands(LPred, RPred) == false) @@ -9504,7 +9505,8 @@ isImpliedCondICmps(const ICmpInst *LHS, CmpPredicate RPred, const Value *R0, // Take SLT as an example: L0:x < L1:y and C <= 0 // ==> R0:(x -nsw y) < R1:(-C) is true - if ((LPred == ICmpInst::ICMP_SLT || LPred == ICmpInst::ICMP_SLE) && + if ((CmpPredicate::getMatching(LPred, ICmpInst::ICMP_SLT) || + CmpPredicate::getMatching(LPred, ICmpInst::ICMP_SLE)) && match(R0, m_NSWSub(m_Specific(L0), m_Specific(L1)))) { if (match(R1, m_NonNegative()) && isImpliedCondMatchingOperands(LPred, RPred) == true) diff --git a/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll b/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll index 546ff2d77d86e..35cfadaa2965a 100644 --- a/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll +++ b/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll @@ -207,10 +207,7 @@ define i32 @gt_sub_nsw(i32 %x, i32 %y) { ; CHECK: [[TAKEN]]: ; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[X]], [[Y]] ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[SUB]], 1 -; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[SUB]], -1 -; CHECK-NEXT: [[ABSCOND:%.*]] = icmp samesign ult i32 [[SUB]], -1 -; CHECK-NEXT: [[ABS:%.*]] = select i1 [[ABSCOND]], i32 [[NEG]], i32 [[ADD]] -; CHECK-NEXT: ret i32 [[ABS]] +; CHECK-NEXT: ret i32 [[ADD]] ; CHECK: [[END]]: ; CHECK-NEXT: ret i32 0 ; @@ -239,10 +236,7 @@ define i32 @ge_sub_nsw(i32 %x, i32 %y) { ; CHECK: [[TAKEN]]: ; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[X]], [[Y]] ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[SUB]], 1 -; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[SUB]], -1 -; CHECK-NEXT: [[ABSCOND:%.*]] = icmp samesign ult i32 [[SUB]], -1 -; CHECK-NEXT: [[ABS:%.*]] = select i1 [[ABSCOND]], i32 [[NEG]], i32 [[ADD]] -; CHECK-NEXT: ret i32 [[ABS]] +; CHECK-NEXT: ret i32 [[ADD]] ; CHECK: [[END]]: ; CHECK-NEXT: ret i32 0 ; From 0d352b2ea767e043b47d78bfdbd6820356628314 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Sun, 12 Jan 2025 10:51:12 -0500 Subject: [PATCH 201/408] [Clang] Use `-targets=host-x86_64-unknown-linux-gnu` as bundler target (#122627) This is a preliminary patch to support generic targets when using `--offload-compress`. 
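With this change the bundler is invoked with the normalized host triple, e.g. (illustrative, matching the updated tests):

```
clang-offload-bundler -type=o -targets=host-x86_64-unknown-linux-gnu,hipv4-amdgcn-amd-amdhsa--gfx906 ...
```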
--- clang/lib/Driver/ToolChains/HIPUtility.cpp | 2 +- clang/test/Driver/cuda-arch-translation.cu | 34 +++++++++---------- clang/test/Driver/hip-code-object-version.hip | 8 ++--- clang/test/Driver/hip-target-id.hip | 6 ++-- clang/test/Driver/hipspv-toolchain.hip | 2 +- clang/test/Driver/linker-wrapper.c | 8 ++--- .../ClangLinkerWrapper.cpp | 2 +- 7 files changed, 31 insertions(+), 31 deletions(-) diff --git a/clang/lib/Driver/ToolChains/HIPUtility.cpp b/clang/lib/Driver/ToolChains/HIPUtility.cpp index 3f81c3cb0f80e..bfb6ec7a01058 100644 --- a/clang/lib/Driver/ToolChains/HIPUtility.cpp +++ b/clang/lib/Driver/ToolChains/HIPUtility.cpp @@ -291,7 +291,7 @@ void HIP::constructHIPFatbinCommand(Compilation &C, const JobAction &JA, // ToDo: Remove the dummy host binary entry which is required by // clang-offload-bundler. - std::string BundlerTargetArg = "-targets=host-x86_64-unknown-linux"; + std::string BundlerTargetArg = "-targets=host-x86_64-unknown-linux-gnu"; // AMDGCN: // For code object version 2 and 3, the offload kind in bundle ID is 'hip' // for backward compatibility. For code object version 4 and greater, the diff --git a/clang/test/Driver/cuda-arch-translation.cu b/clang/test/Driver/cuda-arch-translation.cu index a0ae16452692b..e4f83740a92eb 100644 --- a/clang/test/Driver/cuda-arch-translation.cu +++ b/clang/test/Driver/cuda-arch-translation.cu @@ -81,20 +81,20 @@ // SM61:--image=profile=sm_61{{.*}} // SM62:--image=profile=sm_62{{.*}} // SM70:--image=profile=sm_70{{.*}} -// GFX600:-targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx600 -// GFX601:-targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx601 -// GFX602:-targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx602 -// GFX700:-targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx700 -// GFX701:-targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx701 -// GFX702:-targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx702 -// GFX703:-targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx703 -// GFX704:-targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx704 -// GFX705:-targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx705 -// GFX801:-targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx801 -// GFX802:-targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx802 -// GFX803:-targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx803 -// GFX805:-targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx805 -// GFX810:-targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx810 -// GFX900:-targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx900 -// GFX902:-targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx902 -// SPIRV:-targets=host-x86_64-unknown-linux,hip-spirv64-amd-amdhsa--amdgcnspirv +// GFX600:-targets=host-x86_64-unknown-linux-gnu,hipv4-amdgcn-amd-amdhsa--gfx600 +// GFX601:-targets=host-x86_64-unknown-linux-gnu,hipv4-amdgcn-amd-amdhsa--gfx601 +// GFX602:-targets=host-x86_64-unknown-linux-gnu,hipv4-amdgcn-amd-amdhsa--gfx602 +// GFX700:-targets=host-x86_64-unknown-linux-gnu,hipv4-amdgcn-amd-amdhsa--gfx700 +// GFX701:-targets=host-x86_64-unknown-linux-gnu,hipv4-amdgcn-amd-amdhsa--gfx701 +// GFX702:-targets=host-x86_64-unknown-linux-gnu,hipv4-amdgcn-amd-amdhsa--gfx702 +// GFX703:-targets=host-x86_64-unknown-linux-gnu,hipv4-amdgcn-amd-amdhsa--gfx703 +// GFX704:-targets=host-x86_64-unknown-linux-gnu,hipv4-amdgcn-amd-amdhsa--gfx704 +// 
GFX705:-targets=host-x86_64-unknown-linux-gnu,hipv4-amdgcn-amd-amdhsa--gfx705 +// GFX801:-targets=host-x86_64-unknown-linux-gnu,hipv4-amdgcn-amd-amdhsa--gfx801 +// GFX802:-targets=host-x86_64-unknown-linux-gnu,hipv4-amdgcn-amd-amdhsa--gfx802 +// GFX803:-targets=host-x86_64-unknown-linux-gnu,hipv4-amdgcn-amd-amdhsa--gfx803 +// GFX805:-targets=host-x86_64-unknown-linux-gnu,hipv4-amdgcn-amd-amdhsa--gfx805 +// GFX810:-targets=host-x86_64-unknown-linux-gnu,hipv4-amdgcn-amd-amdhsa--gfx810 +// GFX900:-targets=host-x86_64-unknown-linux-gnu,hipv4-amdgcn-amd-amdhsa--gfx900 +// GFX902:-targets=host-x86_64-unknown-linux-gnu,hipv4-amdgcn-amd-amdhsa--gfx902 +// SPIRV:-targets=host-x86_64-unknown-linux-gnu,hip-spirv64-amd-amdhsa--amdgcnspirv diff --git a/clang/test/Driver/hip-code-object-version.hip b/clang/test/Driver/hip-code-object-version.hip index 9d0afaeaa967d..30d8644dff54c 100644 --- a/clang/test/Driver/hip-code-object-version.hip +++ b/clang/test/Driver/hip-code-object-version.hip @@ -7,7 +7,7 @@ // V4: "-mcode-object-version=4" // V4: "-mllvm" "--amdhsa-code-object-version=4" -// V4: "-targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx906" +// V4: "-targets=host-x86_64-unknown-linux-gnu,hipv4-amdgcn-amd-amdhsa--gfx906" // Check bundle ID for code object version 5. @@ -18,7 +18,7 @@ // V5: "-mcode-object-version=5" // V5: "-mllvm" "--amdhsa-code-object-version=5" -// V5: "-targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx906" +// V5: "-targets=host-x86_64-unknown-linux-gnu,hipv4-amdgcn-amd-amdhsa--gfx906" // Check bundle ID for code object version 6. @@ -30,7 +30,7 @@ // V6: warning: code object v6 is still in development and not ready for production use yet; use at your own risk // V6: "-mcode-object-version=6" // V6: "-mllvm" "--amdhsa-code-object-version=6" -// V6: "-targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx906" +// V6: "-targets=host-x86_64-unknown-linux-gnu,hipv4-amdgcn-amd-amdhsa--gfx906" // Check bundle ID for code object version default @@ -39,7 +39,7 @@ // RUN: --offload-arch=gfx906 -nogpuinc -nogpulib \ // RUN: %s 2>&1 | FileCheck -check-prefix=VD %s -// VD: "-targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx906" +// VD: "-targets=host-x86_64-unknown-linux-gnu,hipv4-amdgcn-amd-amdhsa--gfx906" // Check invalid code object version option. diff --git a/clang/test/Driver/hip-target-id.hip b/clang/test/Driver/hip-target-id.hip index cf2ea0dc25daa..fee430fe08c8d 100644 --- a/clang/test/Driver/hip-target-id.hip +++ b/clang/test/Driver/hip-target-id.hip @@ -43,7 +43,7 @@ // CHECK-SAME: "-plugin-opt=-mattr=-sramecc,+xnack" // CHECK: {{"[^"]*clang-offload-bundler[^"]*"}} -// CHECK-SAME: "-targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx908:sramecc+:xnack+,hipv4-amdgcn-amd-amdhsa--gfx908:sramecc-:xnack+" +// CHECK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hipv4-amdgcn-amd-amdhsa--gfx908:sramecc+:xnack+,hipv4-amdgcn-amd-amdhsa--gfx908:sramecc-:xnack+" // Check canonicalization and repeating of target ID. 
@@ -54,7 +54,7 @@ // RUN: --offload-arch=fiji \ // RUN: --no-offload-new-driver --rocm-path=%S/Inputs/rocm \ // RUN: %s 2>&1 | FileCheck -check-prefix=FIJI %s -// FIJI: "-targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx803" +// FIJI: "-targets=host-x86_64-unknown-linux-gnu,hipv4-amdgcn-amd-amdhsa--gfx803" // RUN: not %clang -### --target=x86_64-linux-gnu \ // RUN: -x hip \ @@ -65,4 +65,4 @@ // RUN: --offload-arch=gfx906 \ // RUN: --no-offload-new-driver --rocm-path=%S/Inputs/rocm \ // RUN: %s 2>&1 | FileCheck -check-prefix=MULTI %s -// MULTI: "-targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx900:xnack+,hipv4-amdgcn-amd-amdhsa--gfx900:xnack-,hipv4-amdgcn-amd-amdhsa--gfx906,hipv4-amdgcn-amd-amdhsa--gfx908:sramecc+,hipv4-amdgcn-amd-amdhsa--gfx908:sramecc-" +// MULTI: "-targets=host-x86_64-unknown-linux-gnu,hipv4-amdgcn-amd-amdhsa--gfx900:xnack+,hipv4-amdgcn-amd-amdhsa--gfx900:xnack-,hipv4-amdgcn-amd-amdhsa--gfx906,hipv4-amdgcn-amd-amdhsa--gfx908:sramecc+,hipv4-amdgcn-amd-amdhsa--gfx908:sramecc-" diff --git a/clang/test/Driver/hipspv-toolchain.hip b/clang/test/Driver/hipspv-toolchain.hip index a6c0166e8dce1..b2187acbcd5ab 100644 --- a/clang/test/Driver/hipspv-toolchain.hip +++ b/clang/test/Driver/hipspv-toolchain.hip @@ -24,7 +24,7 @@ // CHECK-SAME: [[LOWER_BC]] "-o" "[[SPIRV_OUT:.*out]]" // CHECK: {{".*clang-offload-bundler"}} "-type=o" "-bundle-align=4096" -// CHECK-SAME: "-targets=host-x86_64-unknown-linux,hip-spirv64----generic" +// CHECK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-spirv64----generic" // CHECK-SAME: "-input={{.*}}" "-input=[[SPIRV_OUT]]" "-output=[[BUNDLE:.*hipfb]]" // CHECK: [[CLANG]] "-cc1" "-triple" {{".*"}} "-aux-triple" "spirv64" diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c index 470af4d5d70ca..f416ee5f4463b 100644 --- a/clang/test/Driver/linker-wrapper.c +++ b/clang/test/Driver/linker-wrapper.c @@ -30,7 +30,7 @@ __attribute__((visibility("protected"), used)) int x; // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run --device-debug -O0 \ // RUN: --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=NVPTX-LINK-DEBUG -// NVPTX-LINK-DEBUG: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 -O2 -flto {{.*}}.o {{.*}}.o -g +// NVPTX-LINK-DEBUG: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 -O2 -flto {{.*}}.o {{.*}}.o -g // RUN: clang-offload-packager -o %t.out \ // RUN: --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 \ @@ -93,7 +93,7 @@ __attribute__((visibility("protected"), used)) int x; // CUDA: clang{{.*}} -o [[IMG_SM70:.+]] --target=nvptx64-nvidia-cuda -march=sm_70 // CUDA: clang{{.*}} -o [[IMG_SM52:.+]] --target=nvptx64-nvidia-cuda -march=sm_52 -// CUDA: fatbinary{{.*}}-64 --create {{.*}}.fatbin --image=profile=sm_70,file=[[IMG_SM70]] --image=profile=sm_52,file=[[IMG_SM52]] +// CUDA: fatbinary{{.*}}-64 --create {{.*}}.fatbin --image=profile=sm_70,file=[[IMG_SM70]] --image=profile=sm_52,file=[[IMG_SM52]] // CUDA: usr/bin/ld{{.*}} {{.*}}.openmp.image.{{.*}}.o {{.*}}.cuda.image.{{.*}}.o // RUN: clang-offload-packager -o %t.out \ @@ -120,7 +120,7 @@ __attribute__((visibility("protected"), used)) int x; // HIP: clang{{.*}} -o [[IMG_GFX90A:.+]] --target=amdgcn-amd-amdhsa -mcpu=gfx90a // HIP: clang{{.*}} -o [[IMG_GFX908:.+]] --target=amdgcn-amd-amdhsa -mcpu=gfx908 -// HIP: clang-offload-bundler{{.*}}-type=o -bundle-align=4096 -compress -compression-level=6 
-targets=host-x86_64-unknown-linux,hip-amdgcn-amd-amdhsa--gfx90a,hip-amdgcn-amd-amdhsa--gfx908 -input=/dev/null -input=[[IMG_GFX90A]] -input=[[IMG_GFX908]] -output={{.*}}.hipfb +// HIP: clang-offload-bundler{{.*}}-type=o -bundle-align=4096 -compress -compression-level=6 -targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa--gfx90a,hip-amdgcn-amd-amdhsa--gfx908 -input=/dev/null -input=[[IMG_GFX90A]] -input=[[IMG_GFX908]] -output={{.*}}.hipfb // RUN: clang-offload-packager -o %t.out \ // RUN: --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 \ @@ -211,7 +211,7 @@ __attribute__((visibility("protected"), used)) int x; // RUN: %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=RELOCATABLE-LINK-HIP // RELOCATABLE-LINK-HIP: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -// RELOCATABLE-LINK-HIP: clang-offload-bundler{{.*}} -type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux,hip-amdgcn-amd-amdhsa--gfx90a -input=/dev/null -input={{.*}} -output={{.*}} +// RELOCATABLE-LINK-HIP: clang-offload-bundler{{.*}} -type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa--gfx90a -input=/dev/null -input={{.*}} -output={{.*}} // RELOCATABLE-LINK-HIP: /usr/bin/ld.lld{{.*}}-r // RELOCATABLE-LINK-HIP: llvm-objcopy{{.*}}a.out --remove-section .llvm.offloading diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index 9fba63b195bc1..c92590581a645 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -441,7 +441,7 @@ fatbinary(ArrayRef> InputFiles, CmdArgs.push_back( Args.MakeArgString(Twine("-compression-level=") + Arg->getValue())); - SmallVector Targets = {"-targets=host-x86_64-unknown-linux"}; + SmallVector Targets = {"-targets=host-x86_64-unknown-linux-gnu"}; for (const auto &[File, Arch] : InputFiles) Targets.push_back(Saver.save("hip-amdgcn-amd-amdhsa--" + Arch)); CmdArgs.push_back(Saver.save(llvm::join(Targets, ","))); From 4f7dc1b55ae5b8ed1a36dd941ef4f9920bfdac8d Mon Sep 17 00:00:00 2001 From: Ruhung <143302514+Ruhung@users.noreply.github.com> Date: Sun, 12 Jan 2025 23:51:58 +0800 Subject: [PATCH 202/408] [InstCombine] Fold (add (add A, 1), (sext (icmp ne A, 0))) to call umax(A, 1) (#122491) Transform (add (add A, 1), (sext (icmp ne A, 0))) into call umax(A, 1). Fixes #121853. 
Alive2: https://alive2.llvm.org/ce/z/TweTan --- .../InstCombine/InstCombineAddSub.cpp | 9 ++ .../Transforms/InstCombine/add-sext-icmp.ll | 139 ++++++++++++++++++ 2 files changed, 148 insertions(+) create mode 100644 llvm/test/Transforms/InstCombine/add-sext-icmp.ll diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 73876d00e73a7..658bbbc569766 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -1775,6 +1775,15 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) { } } + // (add (add A, 1), (sext (icmp ne A, 0))) => call umax(A, 1) + if (match(LHS, m_Add(m_Value(A), m_One())) && + match(RHS, m_OneUse(m_SExt(m_OneUse(m_SpecificICmp( + ICmpInst::ICMP_NE, m_Specific(A), m_ZeroInt())))))) { + Value *OneConst = ConstantInt::get(A->getType(), 1); + Value *UMax = Builder.CreateBinaryIntrinsic(Intrinsic::umax, A, OneConst); + return replaceInstUsesWith(I, UMax); + } + if (Instruction *Ashr = foldAddToAshr(I)) return Ashr; diff --git a/llvm/test/Transforms/InstCombine/add-sext-icmp.ll b/llvm/test/Transforms/InstCombine/add-sext-icmp.ll new file mode 100644 index 0000000000000..26bfc56690cb3 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/add-sext-icmp.ll @@ -0,0 +1,139 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=instcombine < %s | FileCheck %s + +; The pattern: +; add(add A, 1), (sext(icmp ne A, 0)) +; is transformed into: +; umax(A, 1) + +define i32 @add_sext_icmp(i32 %A) { +; CHECK-LABEL: define i32 @add_sext_icmp( +; CHECK-SAME: i32 [[A:%.*]]) { +; CHECK-NEXT: [[ADD2:%.*]] = call i32 @llvm.umax.i32(i32 [[A]], i32 1) +; CHECK-NEXT: ret i32 [[ADD2]] +; + %add1 = add i32 %A, 1 + %icmp = icmp ne i32 %A, 0 + %sext = sext i1 %icmp to i32 + %add2 = add i32 %add1, %sext + ret i32 %add2 +} + +define i32 @add_sext_icmp_commutative(i32 %A) { +; CHECK-LABEL: define i32 @add_sext_icmp_commutative( +; CHECK-SAME: i32 [[A:%.*]]) { +; CHECK-NEXT: [[ADD2:%.*]] = call i32 @llvm.umax.i32(i32 [[A]], i32 1) +; CHECK-NEXT: ret i32 [[ADD2]] +; + %add1 = add i32 %A, 1 + %icmp = icmp ne i32 %A, 0 + %sext = sext i1 %icmp to i32 + %add2 = add i32 %sext, %add1 + ret i32 %add2 +} + +; Negative test + +define i32 @add_sext_icmp_negative_constant(i32 %A) { +; CHECK-LABEL: define i32 @add_sext_icmp_negative_constant( +; CHECK-SAME: i32 [[A:%.*]]) { +; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[A]], 2 +; CHECK-NEXT: [[ICMP:%.*]] = icmp ne i32 [[A]], 0 +; CHECK-NEXT: [[SEXT:%.*]] = sext i1 [[ICMP]] to i32 +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD1]], [[SEXT]] +; CHECK-NEXT: ret i32 [[ADD2]] +; + %add1 = add i32 %A, 2 + %icmp = icmp ne i32 %A, 0 + %sext = sext i1 %icmp to i32 + %add2 = add i32 %add1, %sext + ret i32 %add2 +} + +define i32 @add_sext_icmp_negative_pred(i32 %A) { +; CHECK-LABEL: define i32 @add_sext_icmp_negative_pred( +; CHECK-SAME: i32 [[A:%.*]]) { +; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[A]], 1 +; CHECK-NEXT: [[ICMP:%.*]] = icmp eq i32 [[A]], 0 +; CHECK-NEXT: [[SEXT:%.*]] = sext i1 [[ICMP]] to i32 +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD1]], [[SEXT]] +; CHECK-NEXT: ret i32 [[ADD2]] +; + %add1 = add i32 %A, 1 + %icmp = icmp eq i32 %A, 0 + %sext = sext i1 %icmp to i32 + %add2 = add i32 %add1, %sext + ret i32 %add2 +} + +; multi-use test + +define i32 @add_sext_icmp_multi_use_add2(i32 %A) { +; CHECK-LABEL: define i32 @add_sext_icmp_multi_use_add2( +; CHECK-SAME: i32 
[[A:%.*]]) {
+; CHECK-NEXT:    [[ADD2:%.*]] = call i32 @llvm.umax.i32(i32 [[A]], i32 1)
+; CHECK-NEXT:    call void @use(i32 [[ADD2]])
+; CHECK-NEXT:    ret i32 [[ADD2]]
+;
+  %add1 = add i32 %A, 1
+  %icmp = icmp ne i32 %A, 0
+  %sext = sext i1 %icmp to i32
+  %add2 = add i32 %add1, %sext
+  call void @use(i32 %add2)
+  ret i32 %add2
+}
+
+define i32 @add_sext_icmp_multi_use_sext(i32 %A) {
+; CHECK-LABEL: define i32 @add_sext_icmp_multi_use_sext(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT:    [[ADD1:%.*]] = add i32 [[A]], 1
+; CHECK-NEXT:    [[ICMP:%.*]] = icmp ne i32 [[A]], 0
+; CHECK-NEXT:    [[SEXT:%.*]] = sext i1 [[ICMP]] to i32
+; CHECK-NEXT:    call void @use(i32 [[SEXT]])
+; CHECK-NEXT:    [[ADD2:%.*]] = add i32 [[ADD1]], [[SEXT]]
+; CHECK-NEXT:    ret i32 [[ADD2]]
+;
+  %add1 = add i32 %A, 1
+  %icmp = icmp ne i32 %A, 0
+  %sext = sext i1 %icmp to i32
+  call void @use(i32 %sext)
+  %add2 = add i32 %add1, %sext
+  ret i32 %add2
+}
+
+define i32 @add_sext_icmp_multi_use_icmp(i32 %A) {
+; CHECK-LABEL: define i32 @add_sext_icmp_multi_use_icmp(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT:    [[ADD1:%.*]] = add i32 [[A]], 1
+; CHECK-NEXT:    [[ICMP:%.*]] = icmp ne i32 [[A]], 0
+; CHECK-NEXT:    call void @use(i1 [[ICMP]])
+; CHECK-NEXT:    [[SEXT:%.*]] = sext i1 [[ICMP]] to i32
+; CHECK-NEXT:    [[ADD2:%.*]] = add i32 [[ADD1]], [[SEXT]]
+; CHECK-NEXT:    ret i32 [[ADD2]]
+;
+  %add1 = add i32 %A, 1
+  %icmp = icmp ne i32 %A, 0
+  call void @use(i1 %icmp)
+  %sext = sext i1 %icmp to i32
+  %add2 = add i32 %add1, %sext
+  ret i32 %add2
+}
+
+define i32 @add_sext_icmp_multi_use_add1(i32 %A) {
+; CHECK-LABEL: define i32 @add_sext_icmp_multi_use_add1(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT:    [[ADD1:%.*]] = add i32 [[A]], 1
+; CHECK-NEXT:    call void @use(i32 [[ADD1]])
+; CHECK-NEXT:    [[ADD2:%.*]] = call i32 @llvm.umax.i32(i32 [[A]], i32 1)
+; CHECK-NEXT:    ret i32 [[ADD2]]
+;
+  %add1 = add i32 %A, 1
+  call void @use(i32 %add1)
+  %icmp = icmp ne i32 %A, 0
+  %sext = sext i1 %icmp to i32
+  %add2 = add i32 %add1, %sext
+  ret i32 %add2
+}
+
+declare void @use(i32)
+

From be6c752e157638849f1f59f7e2b7ecbe11a022fe Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sun, 12 Jan 2025 15:59:05 +0000
Subject: [PATCH 203/408] [X86] X86FixupVectorConstantsPass - use VPMOVSX/ZX extensions for PS/PD domain moves (#122601)

For targets with free domain moves, or AVX512 support, allow the use of
VPMOVSX/ZX extension loads to reduce the load sizes.

I've limited this to extension to i32/i64 types as we're mostly
interested in shuffle mask loading here, but we could include i16 types
as well just as easily.
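
As an illustration (mirroring the test updates below), an 8 x i32 permute
mask such as [1,13,11,14,7,10,1,6] has all elements in signed-byte range,
so the full-width 32-byte constant-pool load

  vmovaps   mask(%rip), %ymm1    # ymm1 = [1,13,11,14,7,10,1,6]

can be replaced by an 8-byte load that sign-extends each byte to a dword

  vpmovsxbd mask(%rip), %ymm1    # ymm1 = [1,13,11,14,7,10,1,6]

with the constant itself stored as eight i8 elements ("mask" is an
illustrative label; the tests reference the constant pool directly).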
Inspired by a regression on #122485 --- .../Target/X86/X86FixupVectorConstants.cpp | 119 +- llvm/test/CodeGen/X86/avx512-build-vector.ll | 2 +- llvm/test/CodeGen/X86/avx512-mask-op.ll | 4 +- .../X86/avx512-shuffles/partial_permute.ll | 215 ++-- .../CodeGen/X86/avx512-shuffles/permute.ll | 144 +-- llvm/test/CodeGen/X86/avx512vl-intrinsics.ll | 12 +- llvm/test/CodeGen/X86/combine-or.ll | 15 +- llvm/test/CodeGen/X86/combine-sub-usat.ll | 36 +- .../CodeGen/X86/expand-vp-cast-intrinsics.ll | 2 +- llvm/test/CodeGen/X86/extract-concat.ll | 4 +- .../X86/insert-into-constant-vector.ll | 4 +- llvm/test/CodeGen/X86/isel-buildvector-avx.ll | 13 +- llvm/test/CodeGen/X86/matrix-multiply.ll | 8 +- llvm/test/CodeGen/X86/nontemporal-4.ll | 32 +- llvm/test/CodeGen/X86/pr29112.ll | 20 +- llvm/test/CodeGen/X86/pr46532.ll | 2 +- llvm/test/CodeGen/X86/pr78109.ll | 8 +- llvm/test/CodeGen/X86/pr97968.ll | 3 +- llvm/test/CodeGen/X86/sse41-intrinsics-x86.ll | 12 +- llvm/test/CodeGen/X86/vec_fp_to_int.ll | 182 ++- llvm/test/CodeGen/X86/vec_minmax_sint.ll | 288 ++++- llvm/test/CodeGen/X86/vec_minmax_uint.ll | 288 ++++- llvm/test/CodeGen/X86/vector-compress.ll | 18 +- .../CodeGen/X86/vector-half-conversions.ll | 4 +- .../vector-interleaved-load-i32-stride-6.ll | 32 +- .../vector-interleaved-load-i64-stride-3.ll | 8 +- .../vector-interleaved-load-i64-stride-4.ll | 8 +- .../vector-interleaved-store-i32-stride-2.ll | 32 +- .../vector-interleaved-store-i32-stride-3.ll | 32 +- .../vector-interleaved-store-i32-stride-4.ll | 24 +- .../vector-interleaved-store-i32-stride-5.ll | 16 +- .../vector-interleaved-store-i32-stride-6.ll | 16 +- .../vector-interleaved-store-i64-stride-2.ll | 16 +- .../vector-interleaved-store-i64-stride-3.ll | 16 +- llvm/test/CodeGen/X86/vector-lzcnt-128.ll | 80 +- llvm/test/CodeGen/X86/vector-lzcnt-256.ll | 112 +- llvm/test/CodeGen/X86/vector-popcnt-128.ll | 44 +- llvm/test/CodeGen/X86/vector-popcnt-256.ll | 76 +- .../test/CodeGen/X86/vector-shuffle-128-v4.ll | 4 +- .../test/CodeGen/X86/vector-shuffle-256-v4.ll | 20 +- .../test/CodeGen/X86/vector-shuffle-256-v8.ll | 512 +++++--- .../CodeGen/X86/vector-shuffle-512-v16.ll | 24 +- .../CodeGen/X86/vector-shuffle-512-v64.ll | 2 +- .../test/CodeGen/X86/vector-shuffle-512-v8.ll | 1032 ++++++----------- .../test/CodeGen/X86/vector-shuffle-avx512.ll | 27 +- .../X86/vector-shuffle-combining-avx.ll | 27 +- .../X86/vector-shuffle-combining-avx2.ll | 47 +- .../X86/vector-shuffle-combining-avx512f.ll | 106 +- llvm/test/CodeGen/X86/vector-tzcnt-128.ll | 124 +- llvm/test/CodeGen/X86/vector-tzcnt-256.ll | 172 ++- llvm/test/CodeGen/X86/vselect-avx.ll | 2 +- llvm/test/CodeGen/X86/widen_fadd.ll | 2 +- llvm/test/CodeGen/X86/widen_fdiv.ll | 2 +- llvm/test/CodeGen/X86/widen_fmul.ll | 2 +- llvm/test/CodeGen/X86/widen_fsub.ll | 2 +- .../CodeGen/X86/x86-interleaved-access.ll | 4 +- 56 files changed, 2352 insertions(+), 1706 deletions(-) diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp index 68a4a0be3a1db..7390cc5805452 100644 --- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp +++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp @@ -338,6 +338,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, bool HasDQI = ST->hasDQI(); bool HasBWI = ST->hasBWI(); bool HasVLX = ST->hasVLX(); + bool MultiDomain = ST->hasAVX512() || ST->hasNoDomainDelayMov(); struct FixupEntry { int Op; @@ -401,47 +402,107 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, case X86::VMOVAPDrm: 
case X86::VMOVAPSrm: case X86::VMOVUPDrm: - case X86::VMOVUPSrm: - return FixupConstant({{X86::VMOVSSrm, 1, 32, rebuildZeroUpperCst}, - {X86::VBROADCASTSSrm, 1, 32, rebuildSplatCst}, - {X86::VMOVSDrm, 1, 64, rebuildZeroUpperCst}, - {X86::VMOVDDUPrm, 1, 64, rebuildSplatCst}}, - 128, 1); + case X86::VMOVUPSrm: { + FixupEntry Fixups[] = { + {MultiDomain ? X86::VPMOVSXBQrm : 0, 2, 8, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXBQrm : 0, 2, 8, rebuildZExtCst}, + {X86::VMOVSSrm, 1, 32, rebuildZeroUpperCst}, + {X86::VBROADCASTSSrm, 1, 32, rebuildSplatCst}, + {MultiDomain ? X86::VPMOVSXBDrm : 0, 4, 8, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXBDrm : 0, 4, 8, rebuildZExtCst}, + {MultiDomain ? X86::VPMOVSXWQrm : 0, 2, 16, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXWQrm : 0, 2, 16, rebuildZExtCst}, + {X86::VMOVSDrm, 1, 64, rebuildZeroUpperCst}, + {X86::VMOVDDUPrm, 1, 64, rebuildSplatCst}, + {MultiDomain ? X86::VPMOVSXWDrm : 0, 4, 16, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXWDrm : 0, 4, 16, rebuildZExtCst}, + {MultiDomain ? X86::VPMOVSXDQrm : 0, 2, 32, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXDQrm : 0, 2, 32, rebuildZExtCst}}; + return FixupConstant(Fixups, 128, 1); + } case X86::VMOVAPDYrm: case X86::VMOVAPSYrm: case X86::VMOVUPDYrm: - case X86::VMOVUPSYrm: - return FixupConstant({{X86::VBROADCASTSSYrm, 1, 32, rebuildSplatCst}, - {X86::VBROADCASTSDYrm, 1, 64, rebuildSplatCst}, - {X86::VBROADCASTF128rm, 1, 128, rebuildSplatCst}}, - 256, 1); + case X86::VMOVUPSYrm: { + FixupEntry Fixups[] = { + {X86::VBROADCASTSSYrm, 1, 32, rebuildSplatCst}, + {HasAVX2 && MultiDomain ? X86::VPMOVSXBQYrm : 0, 4, 8, rebuildSExtCst}, + {HasAVX2 && MultiDomain ? X86::VPMOVZXBQYrm : 0, 4, 8, rebuildZExtCst}, + {X86::VBROADCASTSDYrm, 1, 64, rebuildSplatCst}, + {HasAVX2 && MultiDomain ? X86::VPMOVSXBDYrm : 0, 8, 8, rebuildSExtCst}, + {HasAVX2 && MultiDomain ? X86::VPMOVZXBDYrm : 0, 8, 8, rebuildZExtCst}, + {HasAVX2 && MultiDomain ? X86::VPMOVSXWQYrm : 0, 4, 16, rebuildSExtCst}, + {HasAVX2 && MultiDomain ? X86::VPMOVZXWQYrm : 0, 4, 16, rebuildZExtCst}, + {X86::VBROADCASTF128rm, 1, 128, rebuildSplatCst}, + {HasAVX2 && MultiDomain ? X86::VPMOVSXWDYrm : 0, 8, 16, rebuildSExtCst}, + {HasAVX2 && MultiDomain ? X86::VPMOVZXWDYrm : 0, 8, 16, rebuildZExtCst}, + {HasAVX2 && MultiDomain ? X86::VPMOVSXDQYrm : 0, 4, 32, rebuildSExtCst}, + {HasAVX2 && MultiDomain ? X86::VPMOVZXDQYrm : 0, 4, 32, + rebuildZExtCst}}; + return FixupConstant(Fixups, 256, 1); + } case X86::VMOVAPDZ128rm: case X86::VMOVAPSZ128rm: case X86::VMOVUPDZ128rm: - case X86::VMOVUPSZ128rm: - return FixupConstant({{X86::VMOVSSZrm, 1, 32, rebuildZeroUpperCst}, - {X86::VBROADCASTSSZ128rm, 1, 32, rebuildSplatCst}, - {X86::VMOVSDZrm, 1, 64, rebuildZeroUpperCst}, - {X86::VMOVDDUPZ128rm, 1, 64, rebuildSplatCst}}, - 128, 1); + case X86::VMOVUPSZ128rm: { + FixupEntry Fixups[] = { + {MultiDomain ? X86::VPMOVSXBQZ128rm : 0, 2, 8, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXBQZ128rm : 0, 2, 8, rebuildZExtCst}, + {X86::VMOVSSZrm, 1, 32, rebuildZeroUpperCst}, + {X86::VBROADCASTSSZ128rm, 1, 32, rebuildSplatCst}, + {MultiDomain ? X86::VPMOVSXBDZ128rm : 0, 4, 8, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXBDZ128rm : 0, 4, 8, rebuildZExtCst}, + {MultiDomain ? X86::VPMOVSXWQZ128rm : 0, 2, 16, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXWQZ128rm : 0, 2, 16, rebuildZExtCst}, + {X86::VMOVSDZrm, 1, 64, rebuildZeroUpperCst}, + {X86::VMOVDDUPZ128rm, 1, 64, rebuildSplatCst}, + {MultiDomain ? X86::VPMOVSXWDZ128rm : 0, 4, 16, rebuildSExtCst}, + {MultiDomain ? 
X86::VPMOVZXWDZ128rm : 0, 4, 16, rebuildZExtCst}, + {MultiDomain ? X86::VPMOVSXDQZ128rm : 0, 2, 32, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXDQZ128rm : 0, 2, 32, rebuildZExtCst}}; + return FixupConstant(Fixups, 128, 1); + } case X86::VMOVAPDZ256rm: case X86::VMOVAPSZ256rm: case X86::VMOVUPDZ256rm: - case X86::VMOVUPSZ256rm: - return FixupConstant( - {{X86::VBROADCASTSSZ256rm, 1, 32, rebuildSplatCst}, - {X86::VBROADCASTSDZ256rm, 1, 64, rebuildSplatCst}, - {X86::VBROADCASTF32X4Z256rm, 1, 128, rebuildSplatCst}}, - 256, 1); + case X86::VMOVUPSZ256rm: { + FixupEntry Fixups[] = { + {X86::VBROADCASTSSZ256rm, 1, 32, rebuildSplatCst}, + {MultiDomain ? X86::VPMOVSXBQZ256rm : 0, 4, 8, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXBQZ256rm : 0, 4, 8, rebuildZExtCst}, + {X86::VBROADCASTSDZ256rm, 1, 64, rebuildSplatCst}, + {MultiDomain ? X86::VPMOVSXBDZ256rm : 0, 8, 8, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXBDZ256rm : 0, 8, 8, rebuildZExtCst}, + {MultiDomain ? X86::VPMOVSXWQZ256rm : 0, 4, 16, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXWQZ256rm : 0, 4, 16, rebuildZExtCst}, + {X86::VBROADCASTF32X4Z256rm, 1, 128, rebuildSplatCst}, + {MultiDomain ? X86::VPMOVSXWDZ256rm : 0, 8, 16, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXWDZ256rm : 0, 8, 16, rebuildZExtCst}, + {MultiDomain ? X86::VPMOVSXDQZ256rm : 0, 4, 32, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXDQZ256rm : 0, 4, 32, rebuildZExtCst}}; + return FixupConstant(Fixups, 256, 1); + } case X86::VMOVAPDZrm: case X86::VMOVAPSZrm: case X86::VMOVUPDZrm: - case X86::VMOVUPSZrm: - return FixupConstant({{X86::VBROADCASTSSZrm, 1, 32, rebuildSplatCst}, - {X86::VBROADCASTSDZrm, 1, 64, rebuildSplatCst}, - {X86::VBROADCASTF32X4Zrm, 1, 128, rebuildSplatCst}, - {X86::VBROADCASTF64X4Zrm, 1, 256, rebuildSplatCst}}, - 512, 1); + case X86::VMOVUPSZrm: { + FixupEntry Fixups[] = { + {X86::VBROADCASTSSZrm, 1, 32, rebuildSplatCst}, + {X86::VBROADCASTSDZrm, 1, 64, rebuildSplatCst}, + {MultiDomain ? X86::VPMOVSXBQZrm : 0, 8, 8, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXBQZrm : 0, 8, 8, rebuildZExtCst}, + {X86::VBROADCASTF32X4Zrm, 1, 128, rebuildSplatCst}, + {MultiDomain ? X86::VPMOVSXBDZrm : 0, 16, 8, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXBDZrm : 0, 16, 8, rebuildZExtCst}, + {MultiDomain ? X86::VPMOVSXWQZrm : 0, 8, 16, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXWQZrm : 0, 8, 16, rebuildZExtCst}, + {X86::VBROADCASTF64X4Zrm, 1, 256, rebuildSplatCst}, + {MultiDomain ? X86::VPMOVSXWDZrm : 0, 16, 16, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXWDZrm : 0, 16, 16, rebuildZExtCst}, + {MultiDomain ? X86::VPMOVSXDQZrm : 0, 8, 32, rebuildSExtCst}, + {MultiDomain ? 
X86::VPMOVZXDQZrm : 0, 8, 32, rebuildZExtCst}}; + return FixupConstant(Fixups, 512, 1); + } /* Integer Loads */ case X86::MOVDQArm: case X86::MOVDQUrm: { diff --git a/llvm/test/CodeGen/X86/avx512-build-vector.ll b/llvm/test/CodeGen/X86/avx512-build-vector.ll index 55478a2e93154..b21a0c4e36c2b 100644 --- a/llvm/test/CodeGen/X86/avx512-build-vector.ll +++ b/llvm/test/CodeGen/X86/avx512-build-vector.ll @@ -15,7 +15,7 @@ define <16 x float> @test3(<4 x float> %a) { ; CHECK-LABEL: test3: ; CHECK: ## %bb.0: ; CHECK-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15] +; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15] ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1 ; CHECK-NEXT: vmovaps %zmm1, %zmm0 diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll index 8d98290ba29a6..8aa898f3ec576 100644 --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -1060,12 +1060,12 @@ define i32 @test13_crash(i32 %x, i32 %y) { define <4 x i1> @test14() { ; CHECK-LABEL: test14: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,0,1] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,1,0,1] ; CHECK-NEXT: retq ; ; X86-LABEL: test14: ; X86: ## %bb.0: -; X86-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,0,1] +; X86-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,1,0,1] ; X86-NEXT: retl %a = bitcast i16 21845 to <16 x i1> %b = extractelement <16 x i1> %a, i32 2 diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll index 5078130f18077..5d901a8a380a9 100644 --- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll @@ -903,7 +903,7 @@ define <8 x i16> @test_16xi16_to_8xi16_E84C94EF(<16 x i16> %vec) { define <4 x i32> @test_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec) { ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4,0,3,2] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,0,3,2] ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper @@ -1001,7 +1001,7 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i define <4 x i32> @test_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec) { ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [5,3,2,5] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [5,3,2,5] ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper @@ -1189,7 +1189,7 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> define <8 x i32> @test_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [1,13,11,14,7,10,1,6] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,13,11,14,7,10,1,6] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq @@ -1283,7 +1283,7 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x define <8 x i32> @test_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [14,5,7,7,10,3,9,3] +; CHECK-NEXT: 
vpmovsxbd {{.*#+}} ymm1 = [14,5,7,7,10,3,9,3] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq @@ -1321,7 +1321,7 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x define <4 x i32> @test_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,12,4,6,4,12] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,12,4,6,4,12] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper @@ -1424,7 +1424,7 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x define <4 x i32> @test_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,0,0,13] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,0,0,13] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper @@ -1465,7 +1465,7 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,6,0,1,2,4,4] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [7,0,6,0,1,2,4,4] ; CHECK-NEXT: vpermps 32(%rdi), %ymm0, %ymm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp @@ -1768,7 +1768,7 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32 define <4 x i32> @test_16xi32_to_4xi32_perm_mask9(<16 x i32> %vec) { ; CHECK-FAST-LABEL: test_16xi32_to_4xi32_perm_mask9: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm1 = [12,9,4,10] +; CHECK-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [12,9,4,10] ; CHECK-FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-FAST-NEXT: vzeroupper @@ -2050,7 +2050,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i define <4 x i64> @test_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec) { ; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mask3: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [6,0,0,7] +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [6,0,0,7] ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq @@ -2185,7 +2185,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i define <4 x i64> @test_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec) { ; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mask6: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,3] +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,6,5,3] ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq @@ -2711,7 +2711,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64> define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(ptr %vp) { ; CHECK-FAST-LABEL: test_8xi64_to_2xi64_perm_mem_mask0: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm0 = [4,1] +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,1] ; CHECK-FAST-NEXT: vpermpd (%rdi), %zmm0, %zmm0 ; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-FAST-NEXT: vzeroupper @@ -2847,7 +2847,7 @@ 
define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 -; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [1,3,5,0] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,3,5,0] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} @@ -2863,7 +2863,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,0] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,3,5,0] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} @@ -2879,7 +2879,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 -; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [3,2,7,0] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [3,2,7,0] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} @@ -2895,7 +2895,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,2,7,0] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,2,7,0] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} @@ -2910,7 +2910,7 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec define <4 x float> @test_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec) { ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,3,5,2] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,3,5,2] ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper @@ -2922,7 +2922,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 -; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [3,3,5,2] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [3,3,5,2] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} @@ -2938,7 +2938,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,3,5,2] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,3,5,2] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} @@ -2954,7 +2954,7 @@ define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: 
; CHECK-NEXT: vmovaps 16(%rdi), %xmm1 -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [2,6,0,1] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,6,0,1] ; CHECK-NEXT: vpermi2ps (%rdi), %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <8 x float>, ptr %vp @@ -2965,7 +2965,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 -; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [2,6,0,1] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,0,1] ; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 @@ -2982,7 +2982,7 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [2,6,0,1] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [2,6,0,1] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 ; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z} @@ -2999,7 +2999,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 -; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [2,7,7,2] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,7,7,2] ; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 @@ -3016,7 +3016,7 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [2,7,7,2] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [2,7,7,2] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 ; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z} @@ -3033,7 +3033,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps (%rdi), %xmm2 -; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [3,1,3,7] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [3,1,3,7] ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 @@ -3050,7 +3050,7 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps (%rdi), %xmm2 -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,1,3,7] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,1,3,7] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z} @@ -3067,7 +3067,7 @@ define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps (%rdi), %xmm1 -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [1,3,5,3] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,3,5,3] ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <8 x float>, ptr %vp @@ -3078,7 +3078,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: 
vmovaps (%rdi), %xmm2 -; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [1,3,5,3] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,3,5,3] ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 @@ -3095,7 +3095,7 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps (%rdi), %xmm2 -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [1,3,5,3] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,3,5,3] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z} @@ -3111,7 +3111,7 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 define <8 x float> @test_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec) { ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,4,12,10,8,2,11,7] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,12,10,8,2,11,7] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq @@ -3122,7 +3122,7 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,4,12,10,8,2,11,7] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,4,12,10,8,2,11,7] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %ymm4, %ymm2, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} @@ -3137,7 +3137,7 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,12,10,8,2,11,7] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,4,12,10,8,2,11,7] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -3152,7 +3152,7 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [10,12,3,12,4,15,1,14] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [10,12,3,12,4,15,1,14] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %ymm4, %ymm2, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} @@ -3167,7 +3167,7 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [10,12,3,12,4,15,1,14] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [10,12,3,12,4,15,1,14] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -3182,7 +3182,7 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = 
[0,4,8,9,6,1,4,4] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,4,8,9,6,1,4,4] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %ymm4, %ymm2, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} @@ -3197,7 +3197,7 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,8,9,6,1,4,4] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,4,8,9,6,1,4,4] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -3211,7 +3211,7 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask2(<16 x float> %v define <8 x float> @test_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec) { ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [12,14,9,0,12,4,5,8] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [12,14,9,0,12,4,5,8] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq @@ -3222,7 +3222,7 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [12,14,9,0,12,4,5,8] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [12,14,9,0,12,4,5,8] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %ymm4, %ymm2, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} @@ -3237,7 +3237,7 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [12,14,9,0,12,4,5,8] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [12,14,9,0,12,4,5,8] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -3251,7 +3251,7 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask3(<16 x float> %v define <4 x float> @test_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec) { ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4,8,9,10] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,8,9,10] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper @@ -3263,7 +3263,7 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [4,8,9,10] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [4,8,9,10] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} @@ -3279,7 +3279,7 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [4,8,9,10] +; 
CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,8,9,10] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -3295,7 +3295,7 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec ; CHECK-FAST-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm3 = [8,6,10,6] +; CHECK-FAST-NEXT: vpmovsxbd {{.*#+}} xmm3 = [8,6,10,6] ; CHECK-FAST-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-FAST-NEXT: vcmpeqps %xmm4, %xmm2, %k1 ; CHECK-FAST-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} @@ -3307,7 +3307,7 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm3 ; CHECK-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm4 = [0,6,2,6] +; CHECK-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,6,2,6] ; CHECK-FAST-PERLANE-NEXT: vpermi2ps %xmm0, %xmm3, %xmm4 ; CHECK-FAST-PERLANE-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; CHECK-FAST-PERLANE-NEXT: vcmpeqps %xmm0, %xmm2, %k1 @@ -3323,7 +3323,7 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %mask) { ; CHECK-FAST-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm2 = [8,6,10,6] +; CHECK-FAST-NEXT: vpmovsxbd {{.*#+}} xmm2 = [8,6,10,6] ; CHECK-FAST-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-FAST-NEXT: vcmpeqps %xmm3, %xmm1, %k1 ; CHECK-FAST-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -3335,7 +3335,7 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %v ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm2 ; CHECK-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm3 -; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm0 = [0,6,2,6] +; CHECK-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,2,6] ; CHECK-FAST-PERLANE-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-FAST-PERLANE-NEXT: vcmpeqps %xmm4, %xmm1, %k1 ; CHECK-FAST-PERLANE-NEXT: vpermi2ps %xmm3, %xmm2, %xmm0 {%k1} {z} @@ -3381,7 +3381,7 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask2(<16 x float> %v define <4 x float> @test_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec) { ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [10,2,11,6] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [10,2,11,6] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper @@ -3393,7 +3393,7 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [10,2,11,6] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [10,2,11,6] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} @@ -3409,7 +3409,7 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask3: 
; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [10,2,11,6] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [10,2,11,6] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -3425,7 +3425,7 @@ define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps (%rdi), %ymm1 -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,6,7,11,5,10,0,4] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [7,6,7,11,5,10,0,4] ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp @@ -3436,7 +3436,7 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp, <8 x ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps (%rdi), %ymm2 -; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [7,6,7,11,5,10,0,4] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [7,6,7,11,5,10,0,4] ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 @@ -3453,7 +3453,7 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp, <8 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps (%rdi), %ymm2 -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,7,11,5,10,0,4] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [7,6,7,11,5,10,0,4] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1 ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z} @@ -3470,7 +3470,7 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask1(ptr %vp, <8 x ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps (%rdi), %ymm2 -; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [11,0,9,0,7,14,0,8] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [11,0,9,0,7,14,0,8] ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 @@ -3487,7 +3487,7 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1(ptr %vp, <8 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps (%rdi), %ymm2 -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [11,0,9,0,7,14,0,8] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [11,0,9,0,7,14,0,8] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1 ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z} @@ -3504,7 +3504,7 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 x ; CHECK-FAST-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [9,5,2,3,2,8,8,1] +; CHECK-FAST-NEXT: vpmovsxbd {{.*#+}} ymm3 = [9,5,2,3,2,8,8,1] ; CHECK-FAST-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3 ; CHECK-FAST-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-FAST-NEXT: vcmpeqps %ymm2, %ymm1, %k1 @@ -3514,7 +3514,7 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 x ; CHECK-FAST-PERLANE-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2: ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2 -; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm3 = [1,13,10,11,10,0,0,9] +; CHECK-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,13,10,11,10,0,0,9] ; CHECK-FAST-PERLANE-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm3 ; 
CHECK-FAST-PERLANE-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-FAST-PERLANE-NEXT: vcmpeqps %ymm2, %ymm1, %k1 @@ -3531,7 +3531,7 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 ; CHECK-FAST-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1] +; CHECK-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1] ; CHECK-FAST-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-FAST-NEXT: vcmpeqps %ymm3, %ymm0, %k1 ; CHECK-FAST-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z} @@ -3541,7 +3541,7 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2: ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2 -; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = [1,13,10,11,10,0,0,9] +; CHECK-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,13,10,11,10,0,0,9] ; CHECK-FAST-PERLANE-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-FAST-PERLANE-NEXT: vcmpeqps %ymm3, %ymm0, %k1 ; CHECK-FAST-PERLANE-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z} @@ -3558,7 +3558,7 @@ define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps 32(%rdi), %ymm1 -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,5,3,3,11,4,12,9] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [7,5,3,3,11,4,12,9] ; CHECK-NEXT: vpermi2ps (%rdi), %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp @@ -3569,7 +3569,7 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp, <8 x ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [7,5,3,3,11,4,12,9] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [7,5,3,3,11,4,12,9] ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 @@ -3586,7 +3586,7 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp, <8 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,5,3,3,11,4,12,9] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [7,5,3,3,11,4,12,9] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z} @@ -3603,7 +3603,7 @@ define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vpermpd $231, 32(%rdi), %ymm1 # ymm1 = mem[3,1,2,3] -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [0,6,7,3] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,7,3] ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -3615,7 +3615,7 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vpermpd $231, 32(%rdi), %ymm2 # ymm2 = mem[3,1,2,3] -; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [0,6,7,3] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,6,7,3] ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 @@ -3633,7 +3633,7 @@ define <4 x float> 
@test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vpermpd $231, 32(%rdi), %ymm2 # ymm2 = mem[3,1,2,3] -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [0,6,7,3] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,6,7,3] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z} @@ -3651,8 +3651,7 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,10,6,15,0,10,6,15] -; CHECK-NEXT: # ymm3 = mem[0,1,0,1] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,10,6,15,0,0,0,0] ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 @@ -3670,8 +3669,7 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,10,6,15,0,10,6,15] -; CHECK-NEXT: # ymm1 = mem[0,1,0,1] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,10,6,15,0,0,0,0] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z} @@ -3726,7 +3724,7 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,3,15,9] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,3,15,9] ; CHECK-NEXT: vmovaps (%rdi), %ymm0 ; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -3739,7 +3737,7 @@ define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) { define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,3,15,9] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,3,15,9] ; CHECK-NEXT: vmovaps (%rdi), %ymm3 ; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 @@ -3757,7 +3755,7 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,3,15,9] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,3,15,9] ; CHECK-NEXT: vmovaps (%rdi), %ymm1 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 @@ -3917,8 +3915,7 @@ define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, define <4 x double> @test_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec) { ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [7,3,7,3] -; CHECK-NEXT: # ymm1 = mem[0,1,0,1] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,3,7,3] ; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq @@ -3929,8 +3926,7 @@ define <4 x 
double> @test_masked_8xdouble_to_4xdouble_perm_mask0(<8 x double> %v ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [7,3,7,3] -; CHECK-NEXT: # ymm3 = mem[0,1,0,1] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [7,3,7,3] ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqpd %ymm4, %ymm2, %k1 ; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} @@ -3945,8 +3941,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask0(<8 x double> %v define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [7,3,7,3] -; CHECK-NEXT: # ymm2 = mem[0,1,0,1] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [7,3,7,3] ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -3961,7 +3956,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask1(<8 x double> %v ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [2,0,7,6] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [2,0,7,6] ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqpd %ymm4, %ymm2, %k1 ; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} @@ -3976,7 +3971,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask1(<8 x double> %v define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask1(<8 x double> %vec, <4 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [2,0,7,6] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [2,0,7,6] ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -4016,7 +4011,7 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask2(<8 x double> define <4 x double> @test_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec) { ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,1,4] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,1,4] ; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq @@ -4027,7 +4022,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask3(<8 x double> %v ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [0,2,1,4] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,2,1,4] ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqpd %ymm4, %ymm2, %k1 ; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} @@ -4042,7 +4037,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask3(<8 x double> %v define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec, <4 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [0,2,1,4] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,2,1,4] ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -4056,7 +4051,7 @@ define <4 x double> 
@test_masked_z_8xdouble_to_4xdouble_perm_mask3(<8 x double>
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vmovapd {{.*#+}} xmm3 = [1,5]
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,5]
 ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm3, %zmm0
 ; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
@@ -4082,7 +4077,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask4(<8 x double> %v
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vmovapd {{.*#+}} xmm2 = [1,5]
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm2 = [1,5]
 ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm2, %zmm0
 ; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
@@ -4106,7 +4101,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask5(<8 x double> %v
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask5:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [2,6,2,2]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [2,6,2,2]
 ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
 ; CHECK-NEXT: vcmpeqpd %ymm4, %ymm2, %k1
 ; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1}
@@ -4121,7 +4116,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask5(<8 x double> %v
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask5:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [2,6,2,2]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [2,6,2,2]
 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
@@ -4135,7 +4130,7 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask5(<8 x double>
 define <4 x double> @test_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec) {
 ; CHECK-FAST-LABEL: test_8xdouble_to_4xdouble_perm_mask6:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [5,0,7,0]
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [5,0,7,0]
 ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm1, %zmm0
 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-FAST-NEXT: retq
@@ -4153,7 +4148,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask6(<8 x double> %v
 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
 ; CHECK-FAST: # %bb.0:
 ; CHECK-FAST-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [5,0,7,0]
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [5,0,7,0]
 ; CHECK-FAST-NEXT: vxorpd %xmm4, %xmm4, %xmm4
 ; CHECK-FAST-NEXT: vcmpeqpd %ymm4, %ymm2, %k1
 ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1}
@@ -4178,7 +4173,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask6(<8 x double> %v
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [5,0,7,0]
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [5,0,7,0]
 ; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
 ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
@@ -4202,7 +4197,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask7(<8 x double> %v
 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7:
 ; CHECK-FAST: # %bb.0:
 ; CHECK-FAST-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [3,5,0,6]
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [3,5,0,6]
 ; CHECK-FAST-NEXT: vxorpd %xmm4, %xmm4, %xmm4
 ; CHECK-FAST-NEXT: vcmpeqpd %ymm4, %ymm2, %k1
 ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1}
@@ -4227,7 +4222,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask7(<8 x double> %v
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask7:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [3,5,0,6]
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [3,5,0,6]
 ; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
 ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
@@ -4250,7 +4245,7 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask7(<8 x double>
 define <2 x double> @test_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec) {
 ; CHECK-FAST-LABEL: test_8xdouble_to_2xdouble_perm_mask0:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0]
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,6]
 ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm1, %zmm0
 ; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-FAST-NEXT: vzeroupper
@@ -4269,7 +4264,7 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask0(<8 x double> %v
 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0:
 ; CHECK-FAST: # %bb.0:
 ; CHECK-FAST-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; CHECK-FAST-NEXT: vmovapd {{.*#+}} xmm3 = [0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0]
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm3 = [0,6]
 ; CHECK-FAST-NEXT: vxorpd %xmm4, %xmm4, %xmm4
 ; CHECK-FAST-NEXT: vcmpeqpd %xmm4, %xmm2, %k1
 ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1}
@@ -4295,7 +4290,7 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask0(<8 x double> %v
 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vmovapd {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0]
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,6]
 ; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-FAST-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
 ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
@@ -4320,7 +4315,7 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask1(<8 x double> %v
 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask1:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; CHECK-NEXT: vmovapd {{.*#+}} xmm3 = [3,7]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,7]
 ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
 ; CHECK-NEXT: vcmpeqpd %xmm4, %xmm2, %k1
 ; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1}
@@ -4336,7 +4331,7 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask1(<8 x double> %v
 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask1(<8 x double> %vec, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd {{.*#+}} xmm2 = [3,7]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm2 = [3,7]
 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
@@ -4352,7 +4347,7 @@ define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask0:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vmovapd (%rdi), %ymm1
-; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [1,6,7,2]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,6,7,2]
 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm1, %ymm0
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, ptr %vp
@@ -4363,7 +4358,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp, <4
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask0:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
-; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [1,6,7,2]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,6,7,2]
 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
@@ -4380,7 +4375,7 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp,
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
-; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [1,6,7,2]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,6,7,2]
 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
@@ -4397,7 +4392,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp, <4
 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK-FAST: # %bb.0:
 ; CHECK-FAST-NEXT: vbroadcastsd 32(%rdi), %ymm2
-; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [7,0,6,2]
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [7,0,6,2]
 ; CHECK-FAST-NEXT: vpermi2pd (%rdi), %ymm2, %ymm3
 ; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
@@ -4422,7 +4417,7 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp,
 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK-FAST: # %bb.0:
 ; CHECK-FAST-NEXT: vbroadcastsd 32(%rdi), %ymm2
-; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [7,0,6,2]
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,0,6,2]
 ; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
 ; CHECK-FAST-NEXT: vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
@@ -4447,7 +4442,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp, <4
 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2:
 ; CHECK-FAST: # %bb.0:
 ; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2
-; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [1,2,3,4]
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,2,3,4]
 ; CHECK-FAST-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
 ; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
@@ -4473,7 +4468,7 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp,
 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2:
 ; CHECK-FAST: # %bb.0:
 ; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2
-; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [1,2,3,4]
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,2,3,4]
 ; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
 ; CHECK-FAST-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
@@ -4499,7 +4494,7 @@ define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp) {
 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask3:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vmovapd (%rdi), %ymm1
-; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [4,2,1,0]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [4,2,1,0]
 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm1, %ymm0
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, ptr %vp
@@ -4510,7 +4505,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask3:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
-; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [4,2,1,0]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [4,2,1,0]
 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
@@ -4527,7 +4522,7 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp,
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
-; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [4,2,1,0]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,2,1,0]
 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
@@ -4544,7 +4539,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask4(ptr %vp, <4
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask4:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
-; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [2,4,1,5]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [2,4,1,5]
 ; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm3
 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
@@ -4561,7 +4556,7 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4(ptr %vp,
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
-; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [2,4,1,5]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [2,4,1,5]
 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
 ; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
@@ -4610,7 +4605,7 @@ define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp) {
 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask6:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vmovapd 32(%rdi), %ymm1
-; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [0,2,4,1]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,2,4,1]
 ; CHECK-NEXT: vpermi2pd (%rdi), %ymm1, %ymm0
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, ptr %vp
@@ -4621,7 +4616,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp, <4
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask6:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
-; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [0,2,4,1]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,2,4,1]
 ; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm3
 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
@@ -4638,7 +4633,7 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp,
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
-; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,4,1]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,1]
 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
 ; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/permute.ll
index 2103ab87a17ad..367e28eb7364e 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/permute.ll
@@ -512,7 +512,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask3(ptr %vp, <32 x i16> %mask
 define <8 x i32> @test_8xi32_perm_mask0(<8 x i32> %vec) {
 ; CHECK-LABEL: test_8xi32_perm_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [4,2,0,6,7,2,3,6]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,2,0,6,7,2,3,6]
 ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
 ; CHECK-NEXT: retq
 %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32>
@@ -599,7 +599,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask
 define <8 x i32> @test_8xi32_perm_mask3(<8 x i32> %vec) {
 ; CHECK-LABEL: test_8xi32_perm_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [3,0,3,1,0,4,5,0]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [3,0,3,1,0,4,5,0]
 ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
 ; CHECK-NEXT: retq
 %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32>
@@ -634,7 +634,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask
 define <8 x i32> @test_8xi32_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_8xi32_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [3,7,4,3,5,2,0,5]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,7,4,3,5,2,0,5]
 ; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0
 ; CHECK-NEXT: retq
 %vec = load <8 x i32>, ptr %vp
@@ -728,7 +728,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask2(ptr %vp, <8 x i32> %mask) {
 define <8 x i32> @test_8xi32_perm_mem_mask3(ptr %vp) {
 ; CHECK-LABEL: test_8xi32_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [6,0,0,7,3,7,7,5]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [6,0,0,7,3,7,7,5]
 ; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0
 ; CHECK-NEXT: retq
 %vec = load <8 x i32>, ptr %vp
@@ -766,7 +766,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask3(ptr %vp, <8 x i32> %mask) {
 define <16 x i32> @test_16xi32_perm_mask0(<16 x i32> %vec) {
 ; CHECK-LABEL: test_16xi32_perm_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7]
 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: retq
 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32>
@@ -853,7 +853,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %
 define <16 x i32> @test_16xi32_perm_mask3(<16 x i32> %vec) {
 ; CHECK-LABEL: test_16xi32_perm_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12]
 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: retq
 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32>
@@ -888,7 +888,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %
 define <16 x i32> @test_16xi32_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_16xi32_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6]
 ; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0
 ; CHECK-NEXT: retq
 %vec = load <16 x i32>, ptr %vp
@@ -982,7 +982,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask2(ptr %vp, <16 x i32> %mask
 define <16 x i32> @test_16xi32_perm_mem_mask3(ptr %vp) {
 ; CHECK-LABEL: test_16xi32_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1]
 ; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0
 ; CHECK-NEXT: retq
 %vec = load <16 x i32>, ptr %vp
@@ -1254,7 +1254,7 @@ define <4 x i64> @test_masked_z_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> %mask) {
 define <8 x i64> @test_8xi64_perm_mask0(<8 x i64> %vec) {
 ; CHECK-LABEL: test_8xi64_perm_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6]
 ; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: retq
 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32>
@@ -1421,7 +1421,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %
 define <8 x i64> @test_8xi64_perm_mask6(<8 x i64> %vec) {
 ; CHECK-LABEL: test_8xi64_perm_mask6:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7]
 ; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: retq
 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32>
@@ -1480,7 +1480,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %
 define <8 x i64> @test_8xi64_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_8xi64_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3]
 ; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0
 ; CHECK-NEXT: retq
 %vec = load <8 x i64>, ptr %vp
@@ -1661,7 +1661,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask5(ptr %vp, <8 x i64> %mas
 define <8 x i64> @test_8xi64_perm_mem_mask6(ptr %vp) {
 ; CHECK-LABEL: test_8xi64_perm_mem_mask6:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6]
 ; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0
 ; CHECK-NEXT: retq
 %vec = load <8 x i64>, ptr %vp
@@ -1725,7 +1725,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask7(ptr %vp, <8 x i64> %mas
 define <8 x float> @test_8xfloat_perm_mask0(<8 x float> %vec) {
 ; CHECK-LABEL: test_8xfloat_perm_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [3,4,2,4,1,2,3,4]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [3,4,2,4,1,2,3,4]
 ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
 ; CHECK-NEXT: retq
 %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32>
@@ -1734,7 +1734,7 @@ define <8 x float> @test_8xfloat_perm_mask0(<8 x float> %vec) {
 define <8 x float> @test_masked_8xfloat_perm_mask0(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_perm_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [3,4,2,4,1,2,3,4]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [3,4,2,4,1,2,3,4]
 ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
 ; CHECK-NEXT: vcmpeqps %ymm4, %ymm2, %k1
 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1}
@@ -1749,7 +1749,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask0(<8 x float> %vec, <8 x float>
 define <8 x float> @test_masked_z_8xfloat_perm_mask0(<8 x float> %vec, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4]
 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
@@ -1762,7 +1762,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mask0(<8 x float> %vec, <8 x floa
 define <8 x float> @test_masked_8xfloat_perm_mask1(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_perm_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [4,2,1,0,6,0,5,1]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,2,1,0,6,0,5,1]
 ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
 ; CHECK-NEXT: vcmpeqps %ymm4, %ymm2, %k1
 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1}
@@ -1777,7 +1777,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask1(<8 x float> %vec, <8 x float>
 define <8 x float> @test_masked_z_8xfloat_perm_mask1(<8 x float> %vec, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1]
 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
@@ -1790,7 +1790,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mask1(<8 x float> %vec, <8 x floa
 define <8 x float> @test_masked_8xfloat_perm_mask2(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_perm_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [2,5,5,5,4,6,0,5]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,5,5,5,4,6,0,5]
 ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
 ; CHECK-NEXT: vcmpeqps %ymm4, %ymm2, %k1
 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1}
@@ -1805,7 +1805,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask2(<8 x float> %vec, <8 x float>
 define <8 x float> @test_masked_z_8xfloat_perm_mask2(<8 x float> %vec, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5]
 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
@@ -1818,7 +1818,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mask2(<8 x float> %vec, <8 x floa
 define <8 x float> @test_8xfloat_perm_mask3(<8 x float> %vec) {
 ; CHECK-LABEL: test_8xfloat_perm_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,5,2,5,5,5,1,6]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,5,2,5,5,5,1,6]
 ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
 ; CHECK-NEXT: retq
 %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32>
@@ -1827,7 +1827,7 @@ define <8 x float> @test_8xfloat_perm_mask3(<8 x float> %vec) {
 define <8 x float> @test_masked_8xfloat_perm_mask3(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_perm_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,5,2,5,5,5,1,6]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,5,2,5,5,5,1,6]
 ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
 ; CHECK-NEXT: vcmpeqps %ymm4, %ymm2, %k1
 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1}
@@ -1842,7 +1842,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask3(<8 x float> %vec, <8 x float>
 define <8 x float> @test_masked_z_8xfloat_perm_mask3(<8 x float> %vec, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6]
 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
@@ -1855,7 +1855,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mask3(<8 x float> %vec, <8 x floa
 define <8 x float> @test_8xfloat_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_8xfloat_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [5,2,1,6,4,2,4,0]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [5,2,1,6,4,2,4,0]
 ; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0
 ; CHECK-NEXT: retq
 %vec = load <8 x float>, ptr %vp
@@ -1865,7 +1865,7 @@ define <8 x float> @test_8xfloat_perm_mem_mask0(ptr %vp) {
 define <8 x float> @test_masked_8xfloat_perm_mem_mask0(ptr %vp, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0]
 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
 ; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1}
@@ -1880,7 +1880,7 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask0(ptr %vp, <8 x float> %vec
 define <8 x float> @test_masked_z_8xfloat_perm_mem_mask0(ptr %vp, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0]
 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1
 ; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
@@ -1895,7 +1895,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask0(ptr %vp, <8 x float> %m
 define <8 x float> @test_masked_8xfloat_perm_mem_mask1(ptr %vp, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,7,4,0,6,6,6]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,7,4,0,6,6,6]
 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
 ; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1}
@@ -1910,7 +1910,7 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask1(ptr %vp, <8 x float> %vec
 define <8 x float> @test_masked_z_8xfloat_perm_mem_mask1(ptr %vp, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6]
 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1
 ; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
@@ -1925,7 +1925,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask1(ptr %vp, <8 x float> %m
 define <8 x float> @test_masked_8xfloat_perm_mem_mask2(ptr %vp, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4]
 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
 ; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1}
@@ -1940,7 +1940,7 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask2(ptr %vp, <8 x float> %vec
 define <8 x float> @test_masked_z_8xfloat_perm_mem_mask2(ptr %vp, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4]
 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1
 ; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
@@ -1955,7 +1955,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask2(ptr %vp, <8 x float> %m
 define <8 x float> @test_8xfloat_perm_mem_mask3(ptr %vp) {
 ; CHECK-LABEL: test_8xfloat_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [5,7,0,6,4,2,3,0]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [5,7,0,6,4,2,3,0]
 ; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0
 ; CHECK-NEXT: retq
 %vec = load <8 x float>, ptr %vp
@@ -1965,7 +1965,7 @@ define <8 x float> @test_8xfloat_perm_mem_mask3(ptr %vp) {
 define <8 x float> @test_masked_8xfloat_perm_mem_mask3(ptr %vp, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0]
 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
 ; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1}
@@ -1980,7 +1980,7 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask3(ptr %vp, <8 x float> %vec
 define <8 x float> @test_masked_z_8xfloat_perm_mem_mask3(ptr %vp, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0]
 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1
 ; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
@@ -1995,7 +1995,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask3(ptr %vp, <8 x float> %m
 define <16 x float> @test_16xfloat_perm_mask0(<16 x float> %vec) {
 ; CHECK-LABEL: test_16xfloat_perm_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7]
 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: retq
 %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32>
@@ -2004,7 +2004,7 @@ define <16 x float> @test_16xfloat_perm_mask0(<16 x float> %vec) {
 define <16 x float> @test_masked_16xfloat_perm_mask0(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_perm_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7]
 ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
 ; CHECK-NEXT: vcmpeqps %zmm4, %zmm2, %k1
 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1}
@@ -2019,7 +2019,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask0(<16 x float> %vec, <16 x fl
 define <16 x float> @test_masked_z_16xfloat_perm_mask0(<16 x float> %vec, <16 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_perm_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7]
 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1
 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
@@ -2032,7 +2032,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask0(<16 x float> %vec, <16 x
 define <16 x float> @test_masked_16xfloat_perm_mask1(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_perm_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1]
 ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
 ; CHECK-NEXT: vcmpeqps %zmm4, %zmm2, %k1
 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1}
@@ -2047,7 +2047,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask1(<16 x float> %vec, <16 x fl
 define <16 x float> @test_masked_z_16xfloat_perm_mask1(<16 x float> %vec, <16 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_perm_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1]
 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1
 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
@@ -2060,7 +2060,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask1(<16 x float> %vec, <16 x
 define <16 x float> @test_masked_16xfloat_perm_mask2(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_perm_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11]
 ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
 ; CHECK-NEXT: vcmpeqps %zmm4, %zmm2, %k1
 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1}
@@ -2075,7 +2075,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask2(<16 x float> %vec, <16 x fl
 define <16 x float> @test_masked_z_16xfloat_perm_mask2(<16 x float> %vec, <16 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_perm_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11]
 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1
 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
@@ -2088,7 +2088,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask2(<16 x float> %vec, <16 x
 define <16 x float> @test_16xfloat_perm_mask3(<16 x float> %vec) {
 ; CHECK-LABEL: test_16xfloat_perm_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3]
 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: retq
 %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32>
@@ -2097,7 +2097,7 @@ define <16 x float> @test_16xfloat_perm_mask3(<16 x float> %vec) {
 define <16 x float> @test_masked_16xfloat_perm_mask3(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_perm_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3]
 ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
 ; CHECK-NEXT: vcmpeqps %zmm4, %zmm2, %k1
 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1}
@@ -2112,7 +2112,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask3(<16 x float> %vec, <16 x fl
 define <16 x float> @test_masked_z_16xfloat_perm_mask3(<16 x float> %vec, <16 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_perm_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3]
 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1
 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
@@ -2125,7 +2125,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask3(<16 x float> %vec, <16 x
 define <16 x float> @test_16xfloat_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_16xfloat_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1]
 ; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, ptr %vp
@@ -2135,7 +2135,7 @@ define <16 x float> @test_16xfloat_perm_mem_mask0(ptr %vp) {
 define <16 x float> @test_masked_16xfloat_perm_mem_mask0(ptr %vp, <16 x float> %vec2, <16 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1]
 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1
 ; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1}
@@ -2150,7 +2150,7 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask0(ptr %vp, <16 x float> %
 define <16 x float> @test_masked_z_16xfloat_perm_mem_mask0(ptr %vp, <16 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1]
 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqps %zmm2, %zmm0, %k1
 ; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
@@ -2165,7 +2165,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask0(ptr %vp, <16 x float>
 define <16 x float> @test_masked_16xfloat_perm_mem_mask1(ptr %vp, <16 x float> %vec2, <16 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4]
 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1
 ; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1}
@@ -2180,7 +2180,7 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask1(ptr %vp, <16 x float> %
 define <16 x float> @test_masked_z_16xfloat_perm_mem_mask1(ptr %vp, <16 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4]
 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqps %zmm2, %zmm0, %k1
 ; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
@@ -2195,7 +2195,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask1(ptr %vp, <16 x float>
 define <16 x float> @test_masked_16xfloat_perm_mem_mask2(ptr %vp, <16 x float> %vec2, <16 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5]
 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1
 ; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1}
@@ -2210,7 +2210,7 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask2(ptr %vp, <16 x float> %
 define <16 x float> @test_masked_z_16xfloat_perm_mem_mask2(ptr %vp, <16 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5]
 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqps %zmm2, %zmm0, %k1
 ; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
@@ -2225,7 +2225,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask2(ptr %vp, <16 x float>
 define <16 x float> @test_16xfloat_perm_mem_mask3(ptr %vp) {
 ; CHECK-LABEL: test_16xfloat_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0]
 ; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, ptr %vp
@@ -2235,7 +2235,7 @@ define <16 x float> @test_16xfloat_perm_mem_mask3(ptr %vp) {
 define <16 x float> @test_masked_16xfloat_perm_mem_mask3(ptr %vp, <16 x float> %vec2, <16 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0]
 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1
 ; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1}
@@ -2250,7 +2250,7 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask3(ptr %vp, <16 x float> %
 define <16 x float> @test_masked_z_16xfloat_perm_mem_mask3(ptr %vp, <16 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0]
 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqps %zmm2, %zmm0, %k1
 ; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
@@ -2515,7 +2515,7 @@ define <4 x double> @test_masked_z_4xdouble_perm_mem_mask3(ptr %vp, <4 x double>
 define <8 x double> @test_8xdouble_perm_mask0(<8 x double> %vec) {
 ; CHECK-LABEL: test_8xdouble_perm_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4]
 ; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: retq
 %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32>
@@ -2524,7 +2524,7 @@ define <8 x double> @test_8xdouble_perm_mask0(<8 x double> %vec) {
 define <8 x double> @test_masked_8xdouble_perm_mask0(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_perm_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4]
 ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
 ; CHECK-NEXT: vcmpeqpd %zmm4, %zmm2, %k1
 ; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1}
@@ -2539,7 +2539,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask0(<8 x double> %vec, <8 x dou
 define <8 x double> @test_masked_z_8xdouble_perm_mask0(<8 x double> %vec, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4]
 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqpd %zmm3, %zmm1, %k1
 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
@@ -2578,7 +2578,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mask1(<8 x double> %vec, <8
 define <8 x double> @test_masked_8xdouble_perm_mask2(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_perm_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7]
 ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
 ; CHECK-NEXT: vcmpeqpd %zmm4, %zmm2, %k1
 ; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1}
@@ -2593,7 +2593,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask2(<8 x double> %vec, <8 x dou
 define <8 x double> @test_masked_z_8xdouble_perm_mask2(<8 x double> %vec, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7]
 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqpd %zmm3, %zmm1, %k1
 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
@@ -2640,7 +2640,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mask3(<8 x double> %vec, <8
 define <8 x double> @test_masked_8xdouble_perm_mask4(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_perm_mask4:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1]
 ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
 ; CHECK-NEXT: vcmpeqpd %zmm4, %zmm2, %k1
 ; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1}
@@ -2655,7 +2655,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask4(<8 x double> %vec, <8 x dou
 define <8 x double> @test_masked_z_8xdouble_perm_mask4(<8 x double> %vec, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_mask4:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1]
 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqpd %zmm3, %zmm1, %k1
 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
@@ -2694,7 +2694,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mask5(<8 x double> %vec, <8
 define <8 x double> @test_8xdouble_perm_mask6(<8 x double> %vec) {
 ; CHECK-LABEL: test_8xdouble_perm_mask6:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2]
 ; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: retq
 %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32>
@@ -2703,7 +2703,7 @@ define <8 x double> @test_8xdouble_perm_mask6(<8 x double> %vec) {
 define <8 x double> @test_masked_8xdouble_perm_mask6(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_perm_mask6:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2]
 ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
 ; CHECK-NEXT: vcmpeqpd %zmm4, %zmm2, %k1
 ; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1}
@@ -2718,7 +2718,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask6(<8 x double> %vec, <8 x dou
 define <8 x double> @test_masked_z_8xdouble_perm_mask6(<8 x double> %vec, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_mask6:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2]
 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqpd %zmm3, %zmm1, %k1
 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
@@ -2757,7 +2757,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mask7(<8 x double> %vec, <8
 define <8 x double> @test_8xdouble_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_8xdouble_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1]
 ; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, ptr %vp
@@ -2767,7 +2767,7 @@ define <8 x double> @test_8xdouble_perm_mem_mask0(ptr %vp) {
 define <8 x double> @test_masked_8xdouble_perm_mem_mask0(ptr %vp, <8 x double> %vec2, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1]
 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqpd %zmm3, %zmm1, %k1
 ; CHECK-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1}
@@ -2782,7 +2782,7 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask0(ptr %vp, <8 x double> %
 define <8 x double> @test_masked_z_8xdouble_perm_mem_mask0(ptr %vp, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1]
 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqpd %zmm2, %zmm0, %k1
 ; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
@@ -2825,7 +2825,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask1(ptr %vp, <8 x dou
 define <8 x double> @test_masked_8xdouble_perm_mem_mask2(ptr %vp, <8 x double> %vec2, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5]
 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqpd %zmm3, %zmm1, %k1
 ; CHECK-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1}
@@ -2840,7 +2840,7 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask2(ptr %vp, <8 x double> %
 define <8 x double> @test_masked_z_8xdouble_perm_mem_mask2(ptr %vp, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5]
 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqpd %zmm2, %zmm0, %k1
 ; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
@@ -2892,7 +2892,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask3(ptr %vp, <8 x dou
 define <8 x double> @test_masked_8xdouble_perm_mem_mask4(ptr %vp, <8 x double> %vec2, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask4:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0]
 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqpd %zmm3, %zmm1, %k1
 ; CHECK-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1}
@@ -2907,7 +2907,7 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask4(ptr %vp, <8 x double> %
 define <8 x double> @test_masked_z_8xdouble_perm_mem_mask4(ptr %vp, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask4:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0]
 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqpd %zmm2, %zmm0, %k1
 ; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
@@ -2950,7 +2950,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask5(ptr %vp, <8 x dou
 define <8 x double> @test_8xdouble_perm_mem_mask6(ptr %vp) {
 ; CHECK-LABEL: test_8xdouble_perm_mem_mask6:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5]
 ; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, ptr %vp
@@ -2960,7 +2960,7 @@ define <8 x double> @test_8xdouble_perm_mem_mask6(ptr %vp) {
 define <8 x double> @test_masked_8xdouble_perm_mem_mask6(ptr %vp, <8 x double> %vec2, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask6:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5]
 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqpd %zmm3, %zmm1, %k1
 ; CHECK-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1}
@@ -2975,7 +2975,7 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask6(ptr %vp, <8 x double> %
 define <8 x double> @test_masked_z_8xdouble_perm_mem_mask6(ptr %vp, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask6:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5]
 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqpd %zmm2, %zmm0, %k1
 ; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
index 6de09c745b32a..0973824fbb0ef 100644
--- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -7031,18 +7031,18 @@ define <4 x double> @test_mask_vfmadd256_pd_rmkz(<4 x double> %a0, <4 x double>
 define <8 x i32> @combine_vpermi2d_vpermps(<16 x i32> noundef %a) {
 ; X86-LABEL: combine_vpermi2d_vpermps:
 ; X86: # %bb.0:
-; X86-NEXT: vmovaps {{.*#+}} ymm1 = [14,13,6,3,5,15,0,1]
-; X86-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x0d,A,A,A,A]
-; X86-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-NEXT: vpmovsxbd {{.*#+}} ymm1 = [14,13,6,3,5,15,0,1]
+; X86-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0x0d,A,A,A,A]
+; X86-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
 ; X86-NEXT: vpermps %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x16,0xc0]
 ; X86-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-LABEL: combine_vpermi2d_vpermps:
 ; X64: # %bb.0:
-; X64-NEXT: vmovaps {{.*#+}} ymm1 = [14,13,6,3,5,15,0,1]
-; X64-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x0d,A,A,A,A]
-; X64-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; X64-NEXT: vpmovsxbd {{.*#+}} ymm1 = [14,13,6,3,5,15,0,1]
+; X64-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0x0d,A,A,A,A]
+; X64-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
 ; X64-NEXT: vpermps %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x16,0xc0]
 ; X64-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; X64-NEXT: retq # encoding: [0xc3]
diff --git a/llvm/test/CodeGen/X86/combine-or.ll b/llvm/test/CodeGen/X86/combine-or.ll
index 4060355495eb3..d9c6d7053be74 100644
--- a/llvm/test/CodeGen/X86/combine-or.ll
+++ b/llvm/test/CodeGen/X86/combine-or.ll
@@ -29,11 +29,16 @@ define <2 x i64> @or_zext_v2i32(<2 x i32> %a0) {
 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,4294967295]
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: or_zext_v2i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [4294967295,4294967295]
-; AVX-NEXT: # xmm0 = mem[0,0]
-; AVX-NEXT: retq
+; AVX1-LABEL: or_zext_v2i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967295,0,4294967295,0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: or_zext_v2i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = [4294967295,4294967295]
+; AVX2-NEXT: # xmm0 = mem[0,0]
+; AVX2-NEXT: retq
 %1 = zext <2 x i32> %a0 to <2 x i64>
 %2 = or <2 x i64> %1,
 ret <2 x i64> %2
diff --git a/llvm/test/CodeGen/X86/combine-sub-usat.ll b/llvm/test/CodeGen/X86/combine-sub-usat.ll
index 13d5c9f185645..b70e3fcd779c5 100644
--- a/llvm/test/CodeGen/X86/combine-sub-usat.ll
+++ b/llvm/test/CodeGen/X86/combine-sub-usat.ll
@@ -52,10 +52,20 @@ define <8 x i16> @combine_constfold_v8i16() {
 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,0,254,0,65534,0,0,0]
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: combine_constfold_v8i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,254,0,65534,0,0,0]
-; AVX-NEXT: retq
+; AVX1-LABEL: combine_constfold_v8i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,254,0,65534,0,0,0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_constfold_v8i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,254,0,65534,0,0,0]
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: combine_constfold_v8i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = [0,254,65534,0]
+; AVX512-NEXT: retq
 %res = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> , <8 x i16> )
 ret <8 x i16> %res
 }
@@ -66,10 +76,20 @@ define <8 x i16> @combine_constfold_undef_v8i16() {
 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,0,0,0,65534,0,0,0]
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: combine_constfold_undef_v8i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,65534,0,0,0]
-; AVX-NEXT: retq
+; AVX1-LABEL: combine_constfold_undef_v8i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,65534,0,0,0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_constfold_undef_v8i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,65534,0,0,0]
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: combine_constfold_undef_v8i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = [0,65534]
+; AVX512-NEXT: retq
 %res = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> , <8 x i16> )
 ret <8 x i16> %res
 }
diff --git a/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll
index 0a52dfff71eda..dea29b7b5b93d 100644
--- a/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll
@@ -532,7 +532,7 @@ define <2 x half> @vfptrunc_v2f16_v2f64(<2 x double> %a, <2 x i1> %m, i32 zeroex
 ; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
 ; AVX512-NEXT: callq __truncdfhf2@PLT
 ; AVX512-NEXT: vpbroadcastw %xmm0, %xmm1
-; AVX512-NEXT: vmovss {{.*#+}} xmm0 = [4,0,0,0]
+; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,0]
 ; AVX512-NEXT: vpermi2ps (%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
 ; AVX512-NEXT: addq $40, %rsp
 ; AVX512-NEXT: .cfi_def_cfa_offset 8
diff --git a/llvm/test/CodeGen/X86/extract-concat.ll b/llvm/test/CodeGen/X86/extract-concat.ll
index e7415dcf229f4..f12693469a3f6 100644
--- a/llvm/test/CodeGen/X86/extract-concat.ll
+++ b/llvm/test/CodeGen/X86/extract-concat.ll
@@ -84,9 +84,9 @@ define <16 x i64> @catcat(<4 x i64> %x) {
 ; AVX512F-LABEL: catcat:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1]
+; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1]
 ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm2
-; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [2,2,2,2,3,3,3,3]
+; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [2,2,2,2,3,3,3,3]
 ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm1
 ; AVX512F-NEXT: vmovaps %zmm2, %zmm0
 ; AVX512F-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
index 6e41e1bb87eb2..c44945ac2d929 100644
--- a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
+++ b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
@@ -436,9 +436,9 @@ define <8 x i64> @elt5_v8i64(i64 %x) {
 ;
 ; X86-AVX512F-LABEL: elt5_v8i64:
 ; X86-AVX512F: # %bb.0:
-; X86-AVX512F-NEXT: vmovaps {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0]
+; X86-AVX512F-NEXT: vpmovsxbq {{.*#+}} ymm0 = [42,1,2,3]
 ; X86-AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX512F-NEXT: vmovss {{.*#+}} xmm2 = [4,0,0,0]
+; X86-AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm2 = [4,0]
 ; X86-AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
 ; X86-AVX512F-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1
 ; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/isel-buildvector-avx.ll b/llvm/test/CodeGen/X86/isel-buildvector-avx.ll
index 91abfff2a3424..a9297f016521d 100644
--- a/llvm/test/CodeGen/X86/isel-buildvector-avx.ll
+++ b/llvm/test/CodeGen/X86/isel-buildvector-avx.ll
@@ -43,10 +43,15 @@ define <8 x float> @test_vector_v8f32() {
 }
 define <4 x i64> @test_vector_v4i64() {
-; AVX-ALL-LABEL: test_vector_v4i64:
-; AVX-ALL: # %bb.0:
-; AVX-ALL-NEXT: vmovaps {{.*#+}} ymm0 = [23430,24650,1,12]
-; AVX-ALL-NEXT: retq
+; AVX-LABEL: test_vector_v4i64:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [23430,24650,1,12]
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_vector_v4i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovsxwq {{.*#+}} ymm0 = [23430,24650,1,12]
+; AVX512-NEXT: retq
 ret <4 x i64>
 }
diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll
index ed668c6ef4b04..bdc1ff4c157e4 100644
--- a/llvm/test/CodeGen/X86/matrix-multiply.ll
+++ b/llvm/test/CodeGen/X86/matrix-multiply.ll
@@ -394,7 +394,7 @@ define <9 x float> @test_mul3x3_f32(<9 x float> %a0, <9 x float> %a1) nounwind {
 ; AVX512F-NEXT: vaddss %xmm1, %xmm2, %xmm1
 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm1[0],xmm0[3]
 ; AVX512F-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm2
-; AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = [0,1,2,4,5,6,16,17,18,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,4,5,6,16,17,18,0,0,0,0,0,0,0]
 ; AVX512F-NEXT: vpermi2ps %zmm1, %zmm2, %zmm0
 ; AVX512F-NEXT: retq
 ;
@@ -453,7 +453,7 @@ define <9 x float> @test_mul3x3_f32(<9 x float> %a0, <9 x float> %a1) nounwind {
 ; AVX512VL-NEXT: vaddss %xmm1, %xmm2, %xmm1
 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm1[0],xmm0[3]
 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm3, %zmm2
-; AVX512VL-NEXT: vmovaps {{.*#+}} zmm0 = [0,1,2,4,5,6,16,17,18,u,u,u,u,u,u,u]
+; AVX512VL-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,4,5,6,16,17,18,0,0,0,0,0,0,0]
 ; AVX512VL-NEXT: vpermi2ps %zmm1, %zmm2, %zmm0
 ; AVX512VL-NEXT: retq
 entry:
@@ -762,7 +762,7 @@ define <9 x double> @test_mul3x3_f64(<9 x double> %a0, <9 x double> %a1) nounwin
 ; AVX512F-NEXT: vmulsd %xmm1, %xmm8, %xmm1
 ; AVX512F-NEXT: vaddsd %xmm1, %xmm2, %xmm1
 ; AVX512F-NEXT: vinsertf64x4 $1, %ymm4, %zmm3, %zmm2
-; AVX512F-NEXT: vmovapd {{.*#+}} zmm3 = [0,1,2,4,5,6,8,9]
+; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,4,5,6,8,9]
 ; AVX512F-NEXT: vpermi2pd %zmm0, %zmm2, %zmm3
 ; AVX512F-NEXT: vmovsd %xmm1, 64(%rdi)
 ; AVX512F-NEXT: vmovapd %zmm3, (%rdi)
@@ -818,7 +818,7 @@ define <9 x double> @test_mul3x3_f64(<9 x double> %a0, <9 x double> %a1) nounwin
 ; AVX512VL-NEXT: vmulsd %xmm3, %xmm8, %xmm3
 ; AVX512VL-NEXT: vaddsd %xmm3, %xmm2, %xmm2
 ; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1
-; AVX512VL-NEXT: vmovapd {{.*#+}} zmm3 = [0,1,2,4,5,6,8,9]
+; AVX512VL-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,4,5,6,8,9]
 ; AVX512VL-NEXT: vpermi2pd %zmm0, %zmm1, %zmm3
 ; AVX512VL-NEXT: vmovsd %xmm2, 64(%rdi)
 ; AVX512VL-NEXT: vmovapd %zmm3, (%rdi)
diff --git a/llvm/test/CodeGen/X86/nontemporal-4.ll b/llvm/test/CodeGen/X86/nontemporal-4.ll
index c1eff891a9487..3d86174e45103 100644
--- a/llvm/test/CodeGen/X86/nontemporal-4.ll
+++ b/llvm/test/CodeGen/X86/nontemporal-4.ll
@@ -659,9 +659,9 @@ define void @test_constant_v4i64_align16(ptr %dst) nounwind {
 ;
 ; AVX512-LABEL: test_constant_v4i64_align16:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551614,18446744073709551613]
+; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [18446744073709551614,18446744073709551613]
 ; AVX512-NEXT: vmovntps %xmm0, 16(%rdi)
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255]
+; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,18446744073709551615]
 ; AVX512-NEXT: vmovntps %xmm0, (%rdi)
 ; AVX512-NEXT: retq
 store <4 x i64> , ptr %dst, align 16, !nontemporal !1
@@ -687,9 +687,9 @@ define void @test_constant_v8i32_align16(ptr %dst) nounwind {
 ;
 ; AVX512-LABEL: test_constant_v8i32_align16:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [4294967292,4294967291,4294967290,4294967289]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967292,4294967291,4294967290,4294967289]
 ; AVX512-NEXT: vmovntps %xmm0, 16(%rdi)
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [0,4294967295,4294967294,4294967293]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4294967295,4294967294,4294967293]
 ; AVX512-NEXT: vmovntps %xmm0, (%rdi)
 ; AVX512-NEXT: retq
 store <8 x i32> , ptr %dst, align 16, !nontemporal !1
@@ -1408,13 +1408,13 @@ define void @test_constant_v8i64_align16(ptr %dst) nounwind {
 ;
 ; AVX512-LABEL: test_constant_v8i64_align16:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551614,18446744073709551613]
+; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [18446744073709551614,18446744073709551613]
 ; AVX512-NEXT: vmovntps %xmm0, 16(%rdi)
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255]
+; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,18446744073709551615]
 ; AVX512-NEXT: vmovntps %xmm0, (%rdi)
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551610,18446744073709551609]
+; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [18446744073709551610,18446744073709551609]
 ; AVX512-NEXT: vmovntps %xmm0, 48(%rdi)
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551612,18446744073709551611]
+; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [18446744073709551612,18446744073709551611]
 ; AVX512-NEXT: vmovntps %xmm0, 32(%rdi)
 ; AVX512-NEXT: retq
 store <8 x i64> , ptr %dst, align 16, !nontemporal !1
@@ -1448,13 +1448,13 @@ define void @test_constant_v16i32_align16(ptr %dst) nounwind {
 ;
 ; AVX512-LABEL: test_constant_v16i32_align16:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [4294967292,4294967291,4294967290,4294967289]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967292,4294967291,4294967290,4294967289]
 ; AVX512-NEXT: vmovntps %xmm0, 16(%rdi)
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [0,4294967295,4294967294,4294967293]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4294967295,4294967294,4294967293]
 ; AVX512-NEXT: vmovntps %xmm0, (%rdi)
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [4294967284,4294967283,4294967282,4294967281]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967284,4294967283,4294967282,4294967281]
 ; AVX512-NEXT: vmovntps %xmm0, 48(%rdi)
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [4294967288,4294967287,4294967286,4294967285]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967288,4294967287,4294967286,4294967285]
 ; AVX512-NEXT: vmovntps %xmm0, 32(%rdi)
 ; AVX512-NEXT: retq
 store <16 x i32> , ptr %dst, align 16, !nontemporal !1
@@ -1634,9 +1634,9 @@ define void @test_constant_v8i64_align32(ptr %dst) nounwind {
 ;
 ; AVX512-LABEL: test_constant_v8i64_align32:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551612,18446744073709551611,18446744073709551610,18446744073709551609]
+; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [18446744073709551612,18446744073709551611,18446744073709551610,18446744073709551609]
 ; AVX512-NEXT: vmovntps %ymm0, 32(%rdi)
-; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551614,18446744073709551613]
+; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551614,18446744073709551613]
 ; AVX512-NEXT: vmovntps %ymm0, (%rdi)
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
@@ -1668,9 +1668,9 @@ define void @test_constant_v16i32_align32(ptr %dst) nounwind {
 ;
 ; AVX512-LABEL: test_constant_v16i32_align32:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [4294967288,4294967287,4294967286,4294967285,4294967284,4294967283,4294967282,4294967281]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4294967288,4294967287,4294967286,4294967285,4294967284,4294967283,4294967282,4294967281]
 ; AVX512-NEXT: vmovntps %ymm0, 32(%rdi)
-; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,4294967294,4294967293,4294967292,4294967291,4294967290,4294967289]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4294967295,4294967294,4294967293,4294967292,4294967291,4294967290,4294967289]
 ; AVX512-NEXT: vmovntps %ymm0, (%rdi)
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/pr29112.ll b/llvm/test/CodeGen/X86/pr29112.ll
index b099e0c399404..2e5c6f047292c 100644
--- a/llvm/test/CodeGen/X86/pr29112.ll
+++ b/llvm/test/CodeGen/X86/pr29112.ll
@@ -11,29 +11,29 @@ define <4 x float> @bar(ptr %a1p, ptr %a2p, <4 x float> %a3, <4 x float> %a4, <1
 ; CHECK-NEXT: subq $136, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 144
 ; CHECK-NEXT: vmovaps %xmm1, %xmm13
-; CHECK-NEXT: vmovaps {{.*#+}} xmm5 = [3,20,1,17]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,20,1,17]
 ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm5
 ; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
 ; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,1,2,3]
-; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [4,21,1,17,4,21,5,21]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4,21,1,17,4,21,5,21]
 ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm0
 ; CHECK-NEXT: vmovaps %zmm0, %zmm6
 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; CHECK-NEXT: vmovaps {{.*#+}} xmm4 = [4,20,1,27]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,20,1,27]
 ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm4
-; CHECK-NEXT: vmovaps {{.*#+}} ymm7 = [5,20,1,19,5,20,5,23]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm7 = [5,20,1,19,5,20,5,23]
 ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm7
-; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [4,20,1,19,4,20,5,23]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4,20,1,19,4,20,5,23]
 ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm0
-; CHECK-NEXT: vmovaps {{.*#+}} xmm12 = [4,28,1,17]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm12 = [4,28,1,17]
 ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm12
-; CHECK-NEXT: vmovaps {{.*#+}} ymm8 = [5,20,1,17,5,20,5,21]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm8 = [5,20,1,17,5,20,5,21]
 ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm8
-; CHECK-NEXT: vmovaps {{.*#+}} xmm9 = [4,30,1,22]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,30,1,22]
 ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm9
-; CHECK-NEXT: vmovaps {{.*#+}} ymm10 = [4,22,1,17,4,22,5,21]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm10 = [4,22,1,17,4,22,5,21]
 ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm10
-; CHECK-NEXT: vmovaps {{.*#+}} ymm11 = [4,20,3,18,4,20,7,22]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm11 = [4,20,3,18,4,20,7,22]
 ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm11
 ; CHECK-NEXT: vaddps %xmm10, %xmm11, %xmm2
 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
diff --git a/llvm/test/CodeGen/X86/pr46532.ll b/llvm/test/CodeGen/X86/pr46532.ll
index cbc677229ede6..c798e74a0b231 100644
--- a/llvm/test/CodeGen/X86/pr46532.ll
+++ b/llvm/test/CodeGen/X86/pr46532.ll
@@ -7,7 +7,7 @@ define void @WhileWithLoopInvariantOperation.21() {
 ; CHECK-NEXT: movq (%rax), %rax
 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT: vmovaps %xmm0, 32(%rax)
-; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = [4294967295,4294967295,0,0]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm0 = [18446744073709551615,0]
 ; CHECK-NEXT: vmaskmovps %ymm0, %ymm0, (%rax)
 while.1.body.preheader:
 %0 = load ptr, ptr undef, align 8, !invariant.load !0, !dereferenceable !1, !align !2
diff --git a/llvm/test/CodeGen/X86/pr78109.ll b/llvm/test/CodeGen/X86/pr78109.ll
index 26586cba37658..78b4885319b76 100644
--- a/llvm/test/CodeGen/X86/pr78109.ll
+++ b/llvm/test/CodeGen/X86/pr78109.ll
@@ -10,15 +10,11 @@ define <4 x i32> @PR78109() {
 ; SSE: # %bb.0:
 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,1,0,1]
 ; SSE-NEXT: retq
-;
-; AVX-LABEL: PR78109:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [0,1,0,1]
-; AVX-NEXT: # xmm0 = mem[0,0]
-; AVX-NEXT: retq
 %shuffle.1 = shufflevector <4 x i32> , <4 x i32> zeroinitializer, <4 x i32> ; <0, 0, 7, 7>
 %shift = lshr <4 x i32> %shuffle.1, ; <0, 0, 3, 7>
 %shuffle.2 = shufflevector <4 x i32> %shift, <4 x i32> zeroinitializer, <4 x i32> ; <3, 3, 0, 0>
 %shuffle.3 = shufflevector <4 x i32> %shuffle.2, <4 x i32> , <4 x i32> ; <0, 1, 0, 1>
 ret <4 x i32> %shuffle.3
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX: {{.*}}
diff --git a/llvm/test/CodeGen/X86/pr97968.ll b/llvm/test/CodeGen/X86/pr97968.ll
index c8a0536ac4316..ca5c63cdc1c2e 100644
--- a/llvm/test/CodeGen/X86/pr97968.ll
+++ b/llvm/test/CodeGen/X86/pr97968.ll
@@ -4,8 +4,7 @@ define <2 x i32> @PR97968(<16 x i32> %a0) {
 ; CHECK-LABEL: PR97968:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovddup {{.*#+}} xmm1 = [2,7,2,7]
-; CHECK-NEXT: # xmm1 = mem[0,0]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [2,7,2,7]
 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/sse41-intrinsics-x86.ll b/llvm/test/CodeGen/X86/sse41-intrinsics-x86.ll
index dfd17ffaed0b2..7c22330d7804b 100644
--- a/llvm/test/CodeGen/X86/sse41-intrinsics-x86.ll
+++ b/llvm/test/CodeGen/X86/sse41-intrinsics-x86.ll
@@ -201,9 +201,9 @@ define <8 x i16> @test_x86_sse41_packusdw_fold() {
 ;
 ; X86-AVX512-LABEL: test_x86_sse41_packusdw_fold:
 ; X86-AVX512: ## %bb.0:
-; X86-AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,65535,65535,0,0]
-; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
-; X86-AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,4294967295,0]
+; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0x05,A,A,A,A]
+; X86-AVX512-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
 ; X86-AVX512-NEXT: retl ## encoding: [0xc3]
 ;
 ; X64-SSE-LABEL: test_x86_sse41_packusdw_fold:
@@ -222,9 +222,9 @@ define <8 x i16> @test_x86_sse41_packusdw_fold() {
 ;
 ; X64-AVX512-LABEL: test_x86_sse41_packusdw_fold:
 ; X64-AVX512: ## %bb.0:
-; X64-AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,65535,65535,0,0]
-; X64-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
-; X64-AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; X64-AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,4294967295,0]
+; X64-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0x05,A,A,A,A]
+; X64-AVX512-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
 ; X64-AVX512-NEXT: retq ## encoding: [0xc3]
 %res = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> zeroinitializer, <4 x i32> )
 ret <8 x i16> %res
diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll
index a0e9f33483b69..df2dc77dc1259 100644
--- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll
@@ -1896,10 +1896,15 @@ define <2 x i64> @fptosi_2f64_to_2i64_const() {
 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,18446744073709551615]
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: fptosi_2f64_to_2i64_const:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,18446744073709551615]
-; AVX-NEXT: retq
+; VEX-LABEL: fptosi_2f64_to_2i64_const:
+; VEX: # %bb.0:
+; VEX-NEXT: vmovaps {{.*#+}} xmm0 = [1,18446744073709551615]
+; VEX-NEXT: retq
+;
+; AVX512-LABEL: fptosi_2f64_to_2i64_const:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,18446744073709551615]
+; AVX512-NEXT: retq
 %cvt = fptosi <2 x double> to <2 x i64>
 ret <2 x i64> %cvt
 }
@@ -1910,10 +1915,15 @@ define <4 x i32> @fptosi_2f64_to_2i32_const() {
 ; SSE-NEXT: movsd {{.*#+}} xmm0 = [4294967295,1,0,0]
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: fptosi_2f64_to_2i32_const:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4294967295,1,0,0]
-;
AVX-NEXT: retq +; VEX-LABEL: fptosi_2f64_to_2i32_const: +; VEX: # %bb.0: +; VEX-NEXT: vmovsd {{.*#+}} xmm0 = [4294967295,1,0,0] +; VEX-NEXT: retq +; +; AVX512-LABEL: fptosi_2f64_to_2i32_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967295,1,0,0] +; AVX512-NEXT: retq %cvt = fptosi <2 x double> to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> ret <4 x i32> %ext @@ -1926,10 +1936,15 @@ define <4 x i64> @fptosi_4f64_to_4i64_const() { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [2,18446744073709551613] ; SSE-NEXT: retq ; -; AVX-LABEL: fptosi_4f64_to_4i64_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,18446744073709551613] -; AVX-NEXT: retq +; VEX-LABEL: fptosi_4f64_to_4i64_const: +; VEX: # %bb.0: +; VEX-NEXT: vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,18446744073709551613] +; VEX-NEXT: retq +; +; AVX512-LABEL: fptosi_4f64_to_4i64_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,18446744073709551615,2,18446744073709551613] +; AVX512-NEXT: retq %cvt = fptosi <4 x double> to <4 x i64> ret <4 x i64> %cvt } @@ -1940,10 +1955,15 @@ define <4 x i32> @fptosi_4f64_to_4i32_const() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,1,4294967294,3] ; SSE-NEXT: retq ; -; AVX-LABEL: fptosi_4f64_to_4i32_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,1,4294967294,3] -; AVX-NEXT: retq +; VEX-LABEL: fptosi_4f64_to_4i32_const: +; VEX: # %bb.0: +; VEX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,1,4294967294,3] +; VEX-NEXT: retq +; +; AVX512-LABEL: fptosi_4f64_to_4i32_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967295,1,4294967294,3] +; AVX512-NEXT: retq %cvt = fptosi <4 x double> to <4 x i32> ret <4 x i32> %cvt } @@ -1954,10 +1974,15 @@ define <2 x i64> @fptoui_2f64_to_2i64_const() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [2,4] ; SSE-NEXT: retq ; -; AVX-LABEL: fptoui_2f64_to_2i64_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [2,4] -; AVX-NEXT: retq +; VEX-LABEL: fptoui_2f64_to_2i64_const: +; VEX: # %bb.0: +; VEX-NEXT: vmovaps {{.*#+}} xmm0 = [2,4] +; VEX-NEXT: retq +; +; AVX512-LABEL: fptoui_2f64_to_2i64_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [2,4] +; AVX512-NEXT: retq %cvt = fptoui <2 x double> to <2 x i64> ret <2 x i64> %cvt } @@ -1968,10 +1993,15 @@ define <4 x i32> @fptoui_2f64_to_2i32_const(<2 x double> %a) { ; SSE-NEXT: movsd {{.*#+}} xmm0 = [2,4,0,0] ; SSE-NEXT: retq ; -; AVX-LABEL: fptoui_2f64_to_2i32_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [2,4,0,0] -; AVX-NEXT: retq +; VEX-LABEL: fptoui_2f64_to_2i32_const: +; VEX: # %bb.0: +; VEX-NEXT: vmovsd {{.*#+}} xmm0 = [2,4,0,0] +; VEX-NEXT: retq +; +; AVX512-LABEL: fptoui_2f64_to_2i32_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,4,0,0] +; AVX512-NEXT: retq %cvt = fptoui <2 x double> to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> ret <4 x i32> %ext @@ -1984,10 +2014,15 @@ define <4 x i64> @fptoui_4f64_to_4i64_const(<4 x double> %a) { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [6,8] ; SSE-NEXT: retq ; -; AVX-LABEL: fptoui_4f64_to_4i64_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [2,4,6,8] -; AVX-NEXT: retq +; VEX-LABEL: fptoui_4f64_to_4i64_const: +; VEX: # %bb.0: +; VEX-NEXT: vmovaps {{.*#+}} ymm0 = [2,4,6,8] +; VEX-NEXT: retq +; +; AVX512-LABEL: fptoui_4f64_to_4i64_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [2,4,6,8] +; AVX512-NEXT: retq %cvt = 
fptoui <4 x double> to <4 x i64> ret <4 x i64> %cvt } @@ -1998,10 +2033,15 @@ define <4 x i32> @fptoui_4f64_to_4i32_const(<4 x double> %a) { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [2,4,6,8] ; SSE-NEXT: retq ; -; AVX-LABEL: fptoui_4f64_to_4i32_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [2,4,6,8] -; AVX-NEXT: retq +; VEX-LABEL: fptoui_4f64_to_4i32_const: +; VEX: # %bb.0: +; VEX-NEXT: vmovaps {{.*#+}} xmm0 = [2,4,6,8] +; VEX-NEXT: retq +; +; AVX512-LABEL: fptoui_4f64_to_4i32_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,4,6,8] +; AVX512-NEXT: retq %cvt = fptoui <4 x double> to <4 x i32> ret <4 x i32> %cvt } @@ -2012,10 +2052,15 @@ define <4 x i32> @fptosi_4f32_to_4i32_const() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,4294967295,2,3] ; SSE-NEXT: retq ; -; AVX-LABEL: fptosi_4f32_to_4i32_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,4294967295,2,3] -; AVX-NEXT: retq +; VEX-LABEL: fptosi_4f32_to_4i32_const: +; VEX: # %bb.0: +; VEX-NEXT: vmovaps {{.*#+}} xmm0 = [1,4294967295,2,3] +; VEX-NEXT: retq +; +; AVX512-LABEL: fptosi_4f32_to_4i32_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,4294967295,2,3] +; AVX512-NEXT: retq %cvt = fptosi <4 x float> to <4 x i32> ret <4 x i32> %cvt } @@ -2027,10 +2072,15 @@ define <4 x i64> @fptosi_4f32_to_4i64_const() { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [2,3] ; SSE-NEXT: retq ; -; AVX-LABEL: fptosi_4f32_to_4i64_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,3] -; AVX-NEXT: retq +; VEX-LABEL: fptosi_4f32_to_4i64_const: +; VEX: # %bb.0: +; VEX-NEXT: vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,3] +; VEX-NEXT: retq +; +; AVX512-LABEL: fptosi_4f32_to_4i64_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,18446744073709551615,2,3] +; AVX512-NEXT: retq %cvt = fptosi <4 x float> to <4 x i64> ret <4 x i64> %cvt } @@ -2042,10 +2092,15 @@ define <8 x i32> @fptosi_8f32_to_8i32_const(<8 x float> %a) { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [6,4294967288,2,4294967295] ; SSE-NEXT: retq ; -; AVX-LABEL: fptosi_8f32_to_8i32_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,4294967295,2,3,6,4294967288,2,4294967295] -; AVX-NEXT: retq +; VEX-LABEL: fptosi_8f32_to_8i32_const: +; VEX: # %bb.0: +; VEX-NEXT: vmovaps {{.*#+}} ymm0 = [1,4294967295,2,3,6,4294967288,2,4294967295] +; VEX-NEXT: retq +; +; AVX512-LABEL: fptosi_8f32_to_8i32_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,4294967295,2,3,6,4294967288,2,4294967295] +; AVX512-NEXT: retq %cvt = fptosi <8 x float> to <8 x i32> ret <8 x i32> %cvt } @@ -2056,10 +2111,15 @@ define <4 x i32> @fptoui_4f32_to_4i32_const(<4 x float> %a) { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,2,4,6] ; SSE-NEXT: retq ; -; AVX-LABEL: fptoui_4f32_to_4i32_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,2,4,6] -; AVX-NEXT: retq +; VEX-LABEL: fptoui_4f32_to_4i32_const: +; VEX: # %bb.0: +; VEX-NEXT: vmovaps {{.*#+}} xmm0 = [1,2,4,6] +; VEX-NEXT: retq +; +; AVX512-LABEL: fptoui_4f32_to_4i32_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,2,4,6] +; AVX512-NEXT: retq %cvt = fptoui <4 x float> to <4 x i32> ret <4 x i32> %cvt } @@ -2071,10 +2131,15 @@ define <4 x i64> @fptoui_4f32_to_4i64_const() { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [4,8] ; SSE-NEXT: retq ; -; AVX-LABEL: fptoui_4f32_to_4i64_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8] -; AVX-NEXT: retq +; VEX-LABEL: fptoui_4f32_to_4i64_const: +; VEX: # %bb.0: +; VEX-NEXT: vmovaps 
{{.*#+}} ymm0 = [1,2,4,8] +; VEX-NEXT: retq +; +; AVX512-LABEL: fptoui_4f32_to_4i64_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,2,4,8] +; AVX512-NEXT: retq %cvt = fptoui <4 x float> to <4 x i64> ret <4 x i64> %cvt } @@ -2086,10 +2151,15 @@ define <8 x i32> @fptoui_8f32_to_8i32_const(<8 x float> %a) { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [8,6,4,1] ; SSE-NEXT: retq ; -; AVX-LABEL: fptoui_8f32_to_8i32_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,6,8,6,4,1] -; AVX-NEXT: retq +; VEX-LABEL: fptoui_8f32_to_8i32_const: +; VEX: # %bb.0: +; VEX-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,6,8,6,4,1] +; VEX-NEXT: retq +; +; AVX512-LABEL: fptoui_8f32_to_8i32_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,2,4,6,8,6,4,1] +; AVX512-NEXT: retq %cvt = fptoui <8 x float> to <8 x i32> ret <8 x i32> %cvt } diff --git a/llvm/test/CodeGen/X86/vec_minmax_sint.ll b/llvm/test/CodeGen/X86/vec_minmax_sint.ll index ade250ac827c5..853e29b8acfcd 100644 --- a/llvm/test/CodeGen/X86/vec_minmax_sint.ll +++ b/llvm/test/CodeGen/X86/vec_minmax_sint.ll @@ -1541,10 +1541,20 @@ define <2 x i64> @max_gt_v2i64c() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551615,7] ; SSE-NEXT: retq ; -; AVX-LABEL: max_gt_v2i64c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_gt_v2i64c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_gt_v2i64c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_gt_v2i64c: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [18446744073709551615,7] +; AVX512-NEXT: retq %1 = insertelement <2 x i64> , i64 -7, i32 0 %2 = insertelement <2 x i64> , i64 -1, i32 0 %3 = icmp sgt <2 x i64> %1, %2 @@ -1559,10 +1569,20 @@ define <4 x i64> @max_gt_v4i64c() { ; SSE-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: max_gt_v4i64c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_gt_v4i64c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_gt_v4i64c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_gt_v4i64c: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] +; AVX512-NEXT: retq %1 = insertelement <4 x i64> , i64 -7, i32 0 %2 = insertelement <4 x i64> , i64 -1, i32 0 %3 = icmp sgt <4 x i64> %1, %2 @@ -1576,10 +1596,20 @@ define <4 x i32> @max_gt_v4i32c() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] ; SSE-NEXT: retq ; -; AVX-LABEL: max_gt_v4i32c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_gt_v4i32c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_gt_v4i32c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_gt_v4i32c: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967295,4294967295,7,7] +; AVX512-NEXT: retq %1 = insertelement <4 x i32> , i32 -7, i32 0 %2 = insertelement <4 x i32> , i32 -1, i32 0 %3 = icmp sgt <4 x i32> %1, %2 
@@ -1594,10 +1624,20 @@ define <8 x i32> @max_gt_v8i32c() { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [7,5,5,7] ; SSE-NEXT: retq ; -; AVX-LABEL: max_gt_v8i32c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_gt_v8i32c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_gt_v8i32c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_gt_v8i32c: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] +; AVX512-NEXT: retq %1 = insertelement <8 x i32> , i32 -7, i32 0 %2 = insertelement <8 x i32> , i32 -1, i32 0 %3 = icmp sgt <8 x i32> %1, %2 @@ -1663,10 +1703,20 @@ define <2 x i64> @max_ge_v2i64c() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551615,7] ; SSE-NEXT: retq ; -; AVX-LABEL: max_ge_v2i64c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_ge_v2i64c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_ge_v2i64c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_ge_v2i64c: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [18446744073709551615,7] +; AVX512-NEXT: retq %1 = insertelement <2 x i64> , i64 -7, i32 0 %2 = insertelement <2 x i64> , i64 -1, i32 0 %3 = icmp sge <2 x i64> %1, %2 @@ -1681,10 +1731,20 @@ define <4 x i64> @max_ge_v4i64c() { ; SSE-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: max_ge_v4i64c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_ge_v4i64c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_ge_v4i64c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_ge_v4i64c: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] +; AVX512-NEXT: retq %1 = insertelement <4 x i64> , i64 -7, i32 0 %2 = insertelement <4 x i64> , i64 -1, i32 0 %3 = icmp sge <4 x i64> %1, %2 @@ -1698,10 +1758,20 @@ define <4 x i32> @max_ge_v4i32c() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] ; SSE-NEXT: retq ; -; AVX-LABEL: max_ge_v4i32c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_ge_v4i32c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_ge_v4i32c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_ge_v4i32c: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967295,4294967295,7,7] +; AVX512-NEXT: retq %1 = insertelement <4 x i32> , i32 -7, i32 0 %2 = insertelement <4 x i32> , i32 -1, i32 0 %3 = icmp sge <4 x i32> %1, %2 @@ -1716,10 +1786,20 @@ define <8 x i32> @max_ge_v8i32c() { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [7,5,5,7] ; SSE-NEXT: retq ; -; AVX-LABEL: max_ge_v8i32c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 
= [4294967295,4294967293,4294967293,4294967295,7,5,5,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_ge_v8i32c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_ge_v8i32c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_ge_v8i32c: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] +; AVX512-NEXT: retq %1 = insertelement <8 x i32> , i32 -7, i32 0 %2 = insertelement <8 x i32> , i32 -1, i32 0 %3 = icmp sge <8 x i32> %1, %2 @@ -1785,10 +1865,20 @@ define <2 x i64> @min_lt_v2i64c() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551609,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_lt_v2i64c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_lt_v2i64c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_lt_v2i64c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_lt_v2i64c: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [18446744073709551609,1] +; AVX512-NEXT: retq %1 = insertelement <2 x i64> , i64 -7, i32 0 %2 = insertelement <2 x i64> , i64 -1, i32 0 %3 = icmp slt <2 x i64> %1, %2 @@ -1803,10 +1893,20 @@ define <4 x i64> @min_lt_v4i64c() { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_lt_v4i64c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_lt_v4i64c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_lt_v4i64c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_lt_v4i64c: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] +; AVX512-NEXT: retq %1 = insertelement <4 x i64> , i64 -7, i32 0 %2 = insertelement <4 x i64> , i64 -1, i32 0 %3 = icmp slt <4 x i64> %1, %2 @@ -1820,10 +1920,20 @@ define <4 x i32> @min_lt_v4i32c() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_lt_v4i32c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_lt_v4i32c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_lt_v4i32c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_lt_v4i32c: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967289,4294967289,1,1] +; AVX512-NEXT: retq %1 = insertelement <4 x i32> , i32 -7, i32 0 %2 = insertelement <4 x i32> , i32 -1, i32 0 %3 = icmp slt <4 x i32> %1, %2 @@ -1838,10 +1948,20 @@ define <8 x i32> @min_lt_v8i32c() { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,3,3,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_lt_v8i32c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_lt_v8i32c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = 
[4294967289,4294967291,4294967291,4294967289,1,3,3,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_lt_v8i32c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_lt_v8i32c: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] +; AVX512-NEXT: retq %1 = insertelement <8 x i32> , i32 -7, i32 0 %2 = insertelement <8 x i32> , i32 -1, i32 0 %3 = icmp slt <8 x i32> %1, %2 @@ -1907,10 +2027,20 @@ define <2 x i64> @min_le_v2i64c() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551609,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_le_v2i64c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_le_v2i64c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_le_v2i64c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_le_v2i64c: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [18446744073709551609,1] +; AVX512-NEXT: retq %1 = insertelement <2 x i64> , i64 -7, i32 0 %2 = insertelement <2 x i64> , i64 -1, i32 0 %3 = icmp sle <2 x i64> %1, %2 @@ -1925,10 +2055,20 @@ define <4 x i64> @min_le_v4i64c() { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_le_v4i64c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_le_v4i64c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_le_v4i64c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_le_v4i64c: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] +; AVX512-NEXT: retq %1 = insertelement <4 x i64> , i64 -7, i32 0 %2 = insertelement <4 x i64> , i64 -1, i32 0 %3 = icmp sle <4 x i64> %1, %2 @@ -1942,10 +2082,20 @@ define <4 x i32> @min_le_v4i32c() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_le_v4i32c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_le_v4i32c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_le_v4i32c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_le_v4i32c: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967289,4294967289,1,1] +; AVX512-NEXT: retq %1 = insertelement <4 x i32> , i32 -7, i32 0 %2 = insertelement <4 x i32> , i32 -1, i32 0 %3 = icmp sle <4 x i32> %1, %2 @@ -1960,10 +2110,20 @@ define <8 x i32> @min_le_v8i32c() { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,3,3,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_le_v8i32c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_le_v8i32c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_le_v8i32c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = 
[4294967289,4294967291,4294967291,4294967289,1,3,3,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_le_v8i32c: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] +; AVX512-NEXT: retq %1 = insertelement <8 x i32> , i32 -7, i32 0 %2 = insertelement <8 x i32> , i32 -1, i32 0 %3 = icmp sle <8 x i32> %1, %2 diff --git a/llvm/test/CodeGen/X86/vec_minmax_uint.ll b/llvm/test/CodeGen/X86/vec_minmax_uint.ll index 3ddc882adf0af..9b4da3f9b817f 100644 --- a/llvm/test/CodeGen/X86/vec_minmax_uint.ll +++ b/llvm/test/CodeGen/X86/vec_minmax_uint.ll @@ -1653,10 +1653,20 @@ define <2 x i64> @max_gt_v2i64c() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551615,7] ; SSE-NEXT: retq ; -; AVX-LABEL: max_gt_v2i64c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_gt_v2i64c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_gt_v2i64c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_gt_v2i64c: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [18446744073709551615,7] +; AVX512-NEXT: retq %1 = insertelement <2 x i64> , i64 -7, i32 0 %2 = insertelement <2 x i64> , i64 -1, i32 0 %3 = icmp ugt <2 x i64> %1, %2 @@ -1671,10 +1681,20 @@ define <4 x i64> @max_gt_v4i64c() { ; SSE-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: max_gt_v4i64c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_gt_v4i64c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_gt_v4i64c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_gt_v4i64c: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] +; AVX512-NEXT: retq %1 = insertelement <4 x i64> , i64 -7, i32 0 %2 = insertelement <4 x i64> , i64 -1, i32 0 %3 = icmp ugt <4 x i64> %1, %2 @@ -1688,10 +1708,20 @@ define <4 x i32> @max_gt_v4i32c() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] ; SSE-NEXT: retq ; -; AVX-LABEL: max_gt_v4i32c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_gt_v4i32c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_gt_v4i32c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_gt_v4i32c: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967295,4294967295,7,7] +; AVX512-NEXT: retq %1 = insertelement <4 x i32> , i32 -7, i32 0 %2 = insertelement <4 x i32> , i32 -1, i32 0 %3 = icmp ugt <4 x i32> %1, %2 @@ -1706,10 +1736,20 @@ define <8 x i32> @max_gt_v8i32c() { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [7,5,5,7] ; SSE-NEXT: retq ; -; AVX-LABEL: max_gt_v8i32c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_gt_v8i32c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_gt_v8i32c: +; AVX2: # %bb.0: +; 
AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_gt_v8i32c: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] +; AVX512-NEXT: retq %1 = insertelement <8 x i32> , i32 -7, i32 0 %2 = insertelement <8 x i32> , i32 -1, i32 0 %3 = icmp ugt <8 x i32> %1, %2 @@ -1775,10 +1815,20 @@ define <2 x i64> @max_ge_v2i64c() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551615,7] ; SSE-NEXT: retq ; -; AVX-LABEL: max_ge_v2i64c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_ge_v2i64c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_ge_v2i64c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_ge_v2i64c: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [18446744073709551615,7] +; AVX512-NEXT: retq %1 = insertelement <2 x i64> , i64 -7, i32 0 %2 = insertelement <2 x i64> , i64 -1, i32 0 %3 = icmp uge <2 x i64> %1, %2 @@ -1793,10 +1843,20 @@ define <4 x i64> @max_ge_v4i64c() { ; SSE-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: max_ge_v4i64c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_ge_v4i64c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_ge_v4i64c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_ge_v4i64c: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] +; AVX512-NEXT: retq %1 = insertelement <4 x i64> , i64 -7, i32 0 %2 = insertelement <4 x i64> , i64 -1, i32 0 %3 = icmp uge <4 x i64> %1, %2 @@ -1810,10 +1870,20 @@ define <4 x i32> @max_ge_v4i32c() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] ; SSE-NEXT: retq ; -; AVX-LABEL: max_ge_v4i32c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_ge_v4i32c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_ge_v4i32c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_ge_v4i32c: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967295,4294967295,7,7] +; AVX512-NEXT: retq %1 = insertelement <4 x i32> , i32 -7, i32 0 %2 = insertelement <4 x i32> , i32 -1, i32 0 %3 = icmp uge <4 x i32> %1, %2 @@ -1828,10 +1898,20 @@ define <8 x i32> @max_ge_v8i32c() { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [7,5,5,7] ; SSE-NEXT: retq ; -; AVX-LABEL: max_ge_v8i32c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_ge_v8i32c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_ge_v8i32c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_ge_v8i32c: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = 
[4294967295,4294967293,4294967293,4294967295,7,5,5,7] +; AVX512-NEXT: retq %1 = insertelement <8 x i32> , i32 -7, i32 0 %2 = insertelement <8 x i32> , i32 -1, i32 0 %3 = icmp uge <8 x i32> %1, %2 @@ -1897,10 +1977,20 @@ define <2 x i64> @min_lt_v2i64c() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551609,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_lt_v2i64c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_lt_v2i64c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_lt_v2i64c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_lt_v2i64c: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [18446744073709551609,1] +; AVX512-NEXT: retq %1 = insertelement <2 x i64> , i64 -7, i32 0 %2 = insertelement <2 x i64> , i64 -1, i32 0 %3 = icmp ult <2 x i64> %1, %2 @@ -1915,10 +2005,20 @@ define <4 x i64> @min_lt_v4i64c() { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_lt_v4i64c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_lt_v4i64c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_lt_v4i64c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_lt_v4i64c: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] +; AVX512-NEXT: retq %1 = insertelement <4 x i64> , i64 -7, i32 0 %2 = insertelement <4 x i64> , i64 -1, i32 0 %3 = icmp ult <4 x i64> %1, %2 @@ -1932,10 +2032,20 @@ define <4 x i32> @min_lt_v4i32c() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_lt_v4i32c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_lt_v4i32c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_lt_v4i32c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_lt_v4i32c: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967289,4294967289,1,1] +; AVX512-NEXT: retq %1 = insertelement <4 x i32> , i32 -7, i32 0 %2 = insertelement <4 x i32> , i32 -1, i32 0 %3 = icmp ult <4 x i32> %1, %2 @@ -1950,10 +2060,20 @@ define <8 x i32> @min_lt_v8i32c() { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,3,3,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_lt_v8i32c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_lt_v8i32c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_lt_v8i32c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_lt_v8i32c: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] +; AVX512-NEXT: retq %1 = insertelement <8 x i32> , i32 -7, i32 0 %2 = insertelement <8 x i32> , i32 -1, i32 0 %3 = icmp ult <8 x i32> %1, %2 
@@ -2019,10 +2139,20 @@ define <2 x i64> @min_le_v2i64c() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551609,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_le_v2i64c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_le_v2i64c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_le_v2i64c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_le_v2i64c: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [18446744073709551609,1] +; AVX512-NEXT: retq %1 = insertelement <2 x i64> , i64 -7, i32 0 %2 = insertelement <2 x i64> , i64 -1, i32 0 %3 = icmp ule <2 x i64> %1, %2 @@ -2037,10 +2167,20 @@ define <4 x i64> @min_le_v4i64c() { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_le_v4i64c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_le_v4i64c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_le_v4i64c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_le_v4i64c: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] +; AVX512-NEXT: retq %1 = insertelement <4 x i64> , i64 -7, i32 0 %2 = insertelement <4 x i64> , i64 -1, i32 0 %3 = icmp ule <4 x i64> %1, %2 @@ -2054,10 +2194,20 @@ define <4 x i32> @min_le_v4i32c() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_le_v4i32c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_le_v4i32c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_le_v4i32c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_le_v4i32c: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967289,4294967289,1,1] +; AVX512-NEXT: retq %1 = insertelement <4 x i32> , i32 -7, i32 0 %2 = insertelement <4 x i32> , i32 -1, i32 0 %3 = icmp ule <4 x i32> %1, %2 @@ -2072,10 +2222,20 @@ define <8 x i32> @min_le_v8i32c() { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,3,3,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_le_v8i32c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_le_v8i32c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_le_v8i32c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_le_v8i32c: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] +; AVX512-NEXT: retq %1 = insertelement <8 x i32> , i32 -7, i32 0 %2 = insertelement <8 x i32> , i32 -1, i32 0 %3 = icmp ule <8 x i32> %1, %2 diff --git a/llvm/test/CodeGen/X86/vector-compress.ll b/llvm/test/CodeGen/X86/vector-compress.ll index 17b98b5ebcaea..94a1792cb8985 100644 --- a/llvm/test/CodeGen/X86/vector-compress.ll +++
b/llvm/test/CodeGen/X86/vector-compress.ll @@ -903,10 +903,20 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i } define <4 x i32> @test_compress_all_const() { -; CHECK-LABEL: test_compress_all_const: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = [5,9,0,0] -; CHECK-NEXT: retq +; AVX2-LABEL: test_compress_all_const: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = [5,9,0,0] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: test_compress_all_const: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,9,0,0] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: test_compress_all_const: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,9,0,0] +; AVX512VL-NEXT: retq %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> , <4 x i1> , <4 x i32> undef) diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll index 62ee0b298ba91..54acd012d1fe4 100644 --- a/llvm/test/CodeGen/X86/vector-half-conversions.ll +++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll @@ -3163,7 +3163,7 @@ define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind { ; AVX512F-NEXT: callq __truncdfhf2@PLT ; AVX512F-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-NEXT: vmovss {{.*#+}} xmm1 = [16,0,0,0] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm1 = [16,0] ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -3185,7 +3185,7 @@ define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind { ; AVX512-FASTLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX512-FASTLANE-NEXT: callq __truncdfhf2@PLT ; AVX512-FASTLANE-NEXT: vpbroadcastw %xmm0, %xmm1 -; AVX512-FASTLANE-NEXT: vmovss {{.*#+}} xmm0 = [4,0,0,0] +; AVX512-FASTLANE-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,0] ; AVX512-FASTLANE-NEXT: vpermi2ps (%rsp), %xmm1, %xmm0 # 16-byte Folded Reload ; AVX512-FASTLANE-NEXT: addq $40, %rsp ; AVX512-FASTLANE-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll index b7e46e51064c0..3ba41ad07ce83 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll @@ -174,11 +174,11 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpinsrd $1, %r10d, %xmm4, %xmm4 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = [4,2,0,0] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0] ; AVX512-NEXT: vmovaps 32(%rdi), %ymm5 ; AVX512-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vpermps %ymm5, %ymm2, %ymm2 -; AVX512-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0] ; AVX512-NEXT: vpermps %ymm5, %ymm6, %ymm5 ; AVX512-NEXT: vmovq %xmm3, (%rsi) ; AVX512-NEXT: vmovq %xmm1, (%rdx) @@ -203,11 +203,11 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm2 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,5,0,0] ; AVX512-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm5 -; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm1 = [4,2,0,0] +; 
AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,2,0,0] ; AVX512-FCP-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX512-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] ; AVX512-FCP-NEXT: vpermps %ymm3, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0] ; AVX512-FCP-NEXT: vpermps %ymm3, %ymm6, %ymm3 ; AVX512-FCP-NEXT: vmovq %xmm0, (%rsi) ; AVX512-FCP-NEXT: vmovq %xmm4, (%rdx) @@ -234,11 +234,11 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpinsrd $1, %r10d, %xmm4, %xmm4 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm2 = [4,2,0,0] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0] ; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm5 ; AVX512DQ-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-NEXT: vpermps %ymm5, %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0] ; AVX512DQ-NEXT: vpermps %ymm5, %ymm6, %ymm5 ; AVX512DQ-NEXT: vmovq %xmm3, (%rsi) ; AVX512DQ-NEXT: vmovq %xmm1, (%rdx) @@ -263,11 +263,11 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm2 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,5,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm5 -; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm1 = [4,2,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,2,0,0] ; AVX512DQ-FCP-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX512DQ-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermps %ymm3, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0] ; AVX512DQ-FCP-NEXT: vpermps %ymm3, %ymm6, %ymm3 ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rsi) ; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rdx) @@ -294,11 +294,11 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpinsrd $1, %r10d, %xmm4, %xmm4 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512BW-NEXT: vmovsd {{.*#+}} xmm2 = [4,2,0,0] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0] ; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm5 ; AVX512BW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vpermps %ymm5, %ymm2, %ymm2 -; AVX512BW-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0] ; AVX512BW-NEXT: vpermps %ymm5, %ymm6, %ymm5 ; AVX512BW-NEXT: vmovq %xmm3, (%rsi) ; AVX512BW-NEXT: vmovq %xmm1, (%rdx) @@ -323,11 +323,11 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm2 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,5,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm5 -; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm1 = [4,2,0,0] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,2,0,0] ; AVX512BW-FCP-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX512BW-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] ; AVX512BW-FCP-NEXT: vpermps %ymm3, %ymm1, %ymm1 -; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0] ; AVX512BW-FCP-NEXT: vpermps %ymm3, %ymm6, %ymm3 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi) ; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rdx) @@ -354,11 +354,11 
@@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpinsrd $1, %r10d, %xmm4, %xmm4 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm2 = [4,2,0,0] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0] ; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm5 ; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-NEXT: vpermps %ymm5, %ymm2, %ymm2 -; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0] ; AVX512DQ-BW-NEXT: vpermps %ymm5, %ymm6, %ymm5 ; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rsi) ; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx) @@ -383,11 +383,11 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,5,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm1 = [4,2,0,0] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,2,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX512DQ-BW-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermps %ymm3, %ymm1, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermps %ymm3, %ymm6, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rdx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll index 6f534ee9cdf0b..93a84e30412d6 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll @@ -96,7 +96,7 @@ define void @load_i64_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512-FCP-LABEL: load_i64_stride3_vf2: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovaps {{.*#+}} xmm0 = [1,4] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,4] ; AVX512-FCP-NEXT: vmovaps (%rdi), %zmm1 ; AVX512-FCP-NEXT: vpermpd %zmm1, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vpermpd {{.*#+}} zmm1 = zmm1[0,3,2,3,4,7,6,7] @@ -122,7 +122,7 @@ define void @load_i64_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-FCP-LABEL: load_i64_stride3_vf2: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} xmm0 = [1,4] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,4] ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vpermpd %zmm1, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vpermpd {{.*#+}} zmm1 = zmm1[0,3,2,3,4,7,6,7] @@ -148,7 +148,7 @@ define void @load_i64_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-FCP-LABEL: load_i64_stride3_vf2: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} xmm0 = [1,4] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,4] ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vpermpd %zmm1, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vpermpd {{.*#+}} zmm1 = zmm1[0,3,2,3,4,7,6,7] @@ -174,7 +174,7 @@ define void @load_i64_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-FCP-LABEL: load_i64_stride3_vf2: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} xmm0 = [1,4] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,4] ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %zmm1 ; 
AVX512DQ-BW-FCP-NEXT: vpermpd %zmm1, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermpd {{.*#+}} zmm1 = zmm1[0,3,2,3,4,7,6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll index 6716d97b3f07c..0c7c3f4b16646 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll @@ -120,7 +120,7 @@ define void @load_i64_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512-FCP-LABEL: load_i64_stride4_vf2: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,4] ; AVX512-FCP-NEXT: vpermpd (%rdi), %zmm0, %zmm0 ; AVX512-FCP-NEXT: vmovaps (%rdi), %xmm1 ; AVX512-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] @@ -154,7 +154,7 @@ define void @load_i64_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-FCP-LABEL: load_i64_stride4_vf2: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,4] ; AVX512DQ-FCP-NEXT: vpermpd (%rdi), %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] @@ -188,7 +188,7 @@ define void @load_i64_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-FCP-LABEL: load_i64_stride4_vf2: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,4] ; AVX512BW-FCP-NEXT: vpermpd (%rdi), %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %xmm1 ; AVX512BW-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] @@ -222,7 +222,7 @@ define void @load_i64_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-FCP-LABEL: load_i64_stride4_vf2: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,4] ; AVX512DQ-BW-FCP-NEXT: vpermpd (%rdi), %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll index e333e47219116..47526e960328e 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll @@ -186,7 +186,7 @@ define void @store_i32_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512: # %bb.0: ; AVX512-NEXT: vmovaps (%rdi), %xmm0 ; AVX512-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 -; AVX512-NEXT: vmovaps {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX512-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovaps %ymm0, (%rdx) ; AVX512-NEXT: vzeroupper @@ -196,7 +196,7 @@ define void @store_i32_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovaps {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX512-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512-FCP-NEXT: vmovaps %ymm0, (%rdx) ; AVX512-FCP-NEXT: vzeroupper @@ -206,7 +206,7 @@ define void 
@store_i32_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovaps {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX512DQ-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovaps %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper @@ -216,7 +216,7 @@ define void @store_i32_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX512DQ-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512DQ-FCP-NEXT: vmovaps %ymm0, (%rdx) ; AVX512DQ-FCP-NEXT: vzeroupper @@ -226,7 +226,7 @@ define void @store_i32_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovaps (%rdi), %xmm0 ; AVX512BW-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovaps {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX512BW-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vmovaps %ymm0, (%rdx) ; AVX512BW-NEXT: vzeroupper @@ -236,7 +236,7 @@ define void @store_i32_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX512BW-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512BW-FCP-NEXT: vmovaps %ymm0, (%rdx) ; AVX512BW-FCP-NEXT: vzeroupper @@ -246,7 +246,7 @@ define void @store_i32_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vmovaps {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX512DQ-BW-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512DQ-BW-NEXT: vmovaps %ymm0, (%rdx) ; AVX512DQ-BW-NEXT: vzeroupper @@ -256,7 +256,7 @@ define void @store_i32_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX512DQ-BW-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm0, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vzeroupper @@ -348,7 +348,7 @@ define void @store_i32_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512: # %bb.0: ; AVX512-NEXT: vmovaps (%rdi), %ymm0 ; AVX512-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] ; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovaps %zmm0, (%rdx) ; AVX512-NEXT: vzeroupper @@ -358,7 +358,7 @@ define void @store_i32_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm0 ; AVX512-FCP-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovaps {{.*#+}} zmm1 = 
[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] ; AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovaps %zmm0, (%rdx) ; AVX512-FCP-NEXT: vzeroupper @@ -368,7 +368,7 @@ define void @store_i32_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 ; AVX512DQ-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovaps {{.*#+}} zmm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] ; AVX512DQ-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovaps %zmm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper @@ -378,7 +378,7 @@ define void @store_i32_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] ; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rdx) ; AVX512DQ-FCP-NEXT: vzeroupper @@ -388,7 +388,7 @@ define void @store_i32_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovaps (%rdi), %ymm0 ; AVX512BW-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] ; AVX512BW-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovaps %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper @@ -398,7 +398,7 @@ define void @store_i32_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm0 ; AVX512BW-FCP-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] ; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%rdx) ; AVX512BW-FCP-NEXT: vzeroupper @@ -408,7 +408,7 @@ define void @store_i32_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %ymm0 ; AVX512DQ-BW-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] ; AVX512DQ-BW-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%rdx) ; AVX512DQ-BW-NEXT: vzeroupper @@ -418,7 +418,7 @@ define void @store_i32_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] ; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll index 7037a2864654f..f9228707182f7 100644 --- 
a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll @@ -100,7 +100,7 @@ define void @store_i32_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,1,3,5,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,1,3,5,0,0] ; AVX512-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vmovlps %xmm1, 16(%rcx) @@ -115,7 +115,7 @@ define void @store_i32_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; AVX512-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-FCP-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,1,3,5,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,1,3,5,0,0] ; AVX512-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-FCP-NEXT: vmovlps %xmm1, 16(%rcx) @@ -130,7 +130,7 @@ define void @store_i32_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; AVX512DQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX512DQ-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,1,3,5,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,1,3,5,0,0] ; AVX512DQ-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512DQ-NEXT: vmovlps %xmm1, 16(%rcx) @@ -145,7 +145,7 @@ define void @store_i32_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; AVX512DQ-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512DQ-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,1,3,5,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,1,3,5,0,0] ; AVX512DQ-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512DQ-FCP-NEXT: vmovlps %xmm1, 16(%rcx) @@ -160,7 +160,7 @@ define void @store_i32_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX512BW-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,1,3,5,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,1,3,5,0,0] ; AVX512BW-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vmovlps %xmm1, 16(%rcx) @@ -175,7 +175,7 @@ define void @store_i32_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; AVX512BW-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512BW-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,1,3,5,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,1,3,5,0,0] ; AVX512BW-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512BW-FCP-NEXT: vmovlps %xmm1, 16(%rcx) @@ -190,7 +190,7 @@ define void @store_i32_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; AVX512DQ-BW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512DQ-BW-NEXT: 
vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX512DQ-BW-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,1,3,5,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,1,3,5,0,0] ; AVX512DQ-BW-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512DQ-BW-NEXT: vmovlps %xmm1, 16(%rcx) @@ -205,7 +205,7 @@ define void @store_i32_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; AVX512DQ-BW-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512DQ-BW-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,1,3,5,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,1,3,5,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, 16(%rcx) @@ -321,7 +321,7 @@ define void @store_i32_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovaps (%rdi), %xmm0 ; AVX512-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0] ; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx) ; AVX512-NEXT: vmovaps %ymm0, (%rcx) @@ -333,7 +333,7 @@ define void @store_i32_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-FCP-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0] ; AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx) ; AVX512-FCP-NEXT: vmovaps %ymm0, (%rcx) @@ -345,7 +345,7 @@ define void @store_i32_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0] ; AVX512DQ-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx) ; AVX512DQ-NEXT: vmovaps %ymm0, (%rcx) @@ -357,7 +357,7 @@ define void @store_i32_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx) ; AVX512DQ-FCP-NEXT: vmovaps %ymm0, (%rcx) @@ -369,7 +369,7 @@ define void @store_i32_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovaps (%rdi), %xmm0 ; AVX512BW-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0] ; AVX512BW-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: 
vextractf32x4 $2, %zmm0, 32(%rcx) ; AVX512BW-NEXT: vmovaps %ymm0, (%rcx) @@ -381,7 +381,7 @@ define void @store_i32_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx) ; AVX512BW-FCP-NEXT: vmovaps %ymm0, (%rcx) @@ -393,7 +393,7 @@ define void @store_i32_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx) ; AVX512DQ-BW-NEXT: vmovaps %ymm0, (%rcx) @@ -405,7 +405,7 @@ define void @store_i32_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm0, (%rcx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll index 4beed72f22e33..22040e0cdb791 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll @@ -117,7 +117,7 @@ define void @store_i32_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; AVX512-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7] ; AVX512-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512-FCP-NEXT: vmovaps %ymm0, (%r8) ; AVX512-FCP-NEXT: vzeroupper @@ -147,7 +147,7 @@ define void @store_i32_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; AVX512DQ-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512DQ-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7] ; AVX512DQ-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512DQ-FCP-NEXT: vmovaps %ymm0, (%r8) ; AVX512DQ-FCP-NEXT: vzeroupper @@ -177,7 +177,7 @@ define void @store_i32_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; AVX512BW-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512BW-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} 
ymm1 = [0,2,4,6,1,3,5,7] ; AVX512BW-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512BW-FCP-NEXT: vmovaps %ymm0, (%r8) ; AVX512BW-FCP-NEXT: vzeroupper @@ -207,7 +207,7 @@ define void @store_i32_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; AVX512DQ-BW-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512DQ-BW-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7] ; AVX512DQ-BW-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm0, (%r8) ; AVX512DQ-BW-FCP-NEXT: vzeroupper @@ -346,7 +346,7 @@ define void @store_i32_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] ; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovaps %zmm0, (%r8) ; AVX512-NEXT: vzeroupper @@ -359,7 +359,7 @@ define void @store_i32_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-FCP-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] ; AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovaps %zmm0, (%r8) ; AVX512-FCP-NEXT: vzeroupper @@ -372,7 +372,7 @@ define void @store_i32_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] ; AVX512DQ-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovaps %zmm0, (%r8) ; AVX512DQ-NEXT: vzeroupper @@ -385,7 +385,7 @@ define void @store_i32_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] ; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%r8) ; AVX512DQ-FCP-NEXT: vzeroupper @@ -398,7 +398,7 @@ define void @store_i32_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] ; AVX512BW-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovaps %zmm0, (%r8) ; AVX512BW-NEXT: vzeroupper @@ -411,7 +411,7 @@ define void @store_i32_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: 
vinsertf128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] ; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%r8) ; AVX512BW-FCP-NEXT: vzeroupper @@ -424,7 +424,7 @@ define void @store_i32_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] ; AVX512DQ-BW-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%r8) ; AVX512DQ-BW-NEXT: vzeroupper @@ -437,7 +437,7 @@ define void @store_i32_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] ; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%r8) ; AVX512DQ-BW-FCP-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll index b6914ec197300..07d8a370a5f93 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll @@ -142,7 +142,7 @@ define void @store_i32_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX512-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5,7,9,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5,7,9,0,0,0,0,0,0] ; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm1 ; AVX512-NEXT: vmovlps %xmm1, 32(%r9) @@ -161,7 +161,7 @@ define void @store_i32_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX512-FCP-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5,7,9,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5,7,9,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vextractf32x4 $2, %zmm0, %xmm1 ; AVX512-FCP-NEXT: vmovlps %xmm1, 32(%r9) @@ -180,7 +180,7 @@ define void @store_i32_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX512DQ-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5,7,9,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5,7,9,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vextractf32x4 $2, %zmm0, %xmm1 ; AVX512DQ-NEXT: 
vmovlps %xmm1, 32(%r9) @@ -199,7 +199,7 @@ define void @store_i32_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX512DQ-FCP-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5,7,9,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5,7,9,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vextractf32x4 $2, %zmm0, %xmm1 ; AVX512DQ-FCP-NEXT: vmovlps %xmm1, 32(%r9) @@ -218,7 +218,7 @@ define void @store_i32_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX512BW-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5,7,9,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5,7,9,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm1 ; AVX512BW-NEXT: vmovlps %xmm1, 32(%r9) @@ -237,7 +237,7 @@ define void @store_i32_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX512BW-FCP-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5,7,9,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5,7,9,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vextractf32x4 $2, %zmm0, %xmm1 ; AVX512BW-FCP-NEXT: vmovlps %xmm1, 32(%r9) @@ -256,7 +256,7 @@ define void @store_i32_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX512DQ-BW-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5,7,9,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5,7,9,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vextractf32x4 $2, %zmm0, %xmm1 ; AVX512DQ-BW-NEXT: vmovlps %xmm1, 32(%r9) @@ -275,7 +275,7 @@ define void @store_i32_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX512DQ-BW-FCP-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5,7,9,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5,7,9,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vextractf32x4 $2, %zmm0, %xmm1 ; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, 32(%r9) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll index 9d53325ed7c56..78b07e5671e5a 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll @@ -152,7 +152,7 @@ define void @store_i32_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: vinsertf32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = 
[0,2,4,6,8,10,1,3,5,7,9,11,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0] ; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: vextractf32x4 $2, %zmm0, 32(%rax) ; AVX512-NEXT: vmovaps %ymm0, (%rax) @@ -173,7 +173,7 @@ define void @store_i32_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vinsertf32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0] ; AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vextractf32x4 $2, %zmm0, 32(%rax) ; AVX512-FCP-NEXT: vmovaps %ymm0, (%rax) @@ -194,7 +194,7 @@ define void @store_i32_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512DQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vinsertf32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0] ; AVX512DQ-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vextractf32x4 $2, %zmm0, 32(%rax) ; AVX512DQ-NEXT: vmovaps %ymm0, (%rax) @@ -215,7 +215,7 @@ define void @store_i32_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512DQ-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vinsertf32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vextractf32x4 $2, %zmm0, 32(%rax) ; AVX512DQ-FCP-NEXT: vmovaps %ymm0, (%rax) @@ -236,7 +236,7 @@ define void @store_i32_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512BW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vinsertf32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0] ; AVX512BW-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, 32(%rax) ; AVX512BW-NEXT: vmovaps %ymm0, (%rax) @@ -257,7 +257,7 @@ define void @store_i32_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512BW-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinsertf32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vextractf32x4 $2, %zmm0, 32(%rax) ; AVX512BW-FCP-NEXT: vmovaps %ymm0, (%rax) @@ -278,7 +278,7 @@ define void @store_i32_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512DQ-BW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinsertf32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = 
[0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vextractf32x4 $2, %zmm0, 32(%rax) ; AVX512DQ-BW-NEXT: vmovaps %ymm0, (%rax) @@ -299,7 +299,7 @@ define void @store_i32_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512DQ-BW-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinsertf32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vextractf32x4 $2, %zmm0, 32(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm0, (%rax) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll index 67b447ed5d014..1061c08dcceda 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll @@ -225,7 +225,7 @@ define void @store_i64_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512: # %bb.0: ; AVX512-NEXT: vmovaps (%rdi), %ymm0 ; AVX512-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] ; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovaps %zmm0, (%rdx) ; AVX512-NEXT: vzeroupper @@ -235,7 +235,7 @@ define void @store_i64_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm0 ; AVX512-FCP-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] ; AVX512-FCP-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovaps %zmm0, (%rdx) ; AVX512-FCP-NEXT: vzeroupper @@ -245,7 +245,7 @@ define void @store_i64_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 ; AVX512DQ-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] ; AVX512DQ-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovaps %zmm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper @@ -255,7 +255,7 @@ define void @store_i64_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] ; AVX512DQ-FCP-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rdx) ; AVX512DQ-FCP-NEXT: vzeroupper @@ -265,7 +265,7 @@ define void @store_i64_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovaps (%rdi), %ymm0 ; AVX512BW-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] ; AVX512BW-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovaps %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper @@ -275,7 +275,7 @@ define void @store_i64_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovaps 
(%rdi), %ymm0 ; AVX512BW-FCP-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] ; AVX512BW-FCP-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%rdx) ; AVX512BW-FCP-NEXT: vzeroupper @@ -285,7 +285,7 @@ define void @store_i64_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %ymm0 ; AVX512DQ-BW-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] ; AVX512DQ-BW-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%rdx) ; AVX512DQ-BW-NEXT: vzeroupper @@ -295,7 +295,7 @@ define void @store_i64_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] ; AVX512DQ-BW-FCP-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll index a01d4de0027f4..fe39c769c3545 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll @@ -94,7 +94,7 @@ define void @store_i64_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovaps (%rdi), %xmm0 ; AVX512-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,1,3,5,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,1,3,5,0,0] ; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx) ; AVX512-NEXT: vmovaps %ymm0, (%rcx) @@ -106,7 +106,7 @@ define void @store_i64_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-FCP-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,1,3,5,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,1,3,5,0,0] ; AVX512-FCP-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx) ; AVX512-FCP-NEXT: vmovaps %ymm0, (%rcx) @@ -118,7 +118,7 @@ define void @store_i64_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,1,3,5,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,1,3,5,0,0] ; AVX512DQ-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx) ; AVX512DQ-NEXT: vmovaps %ymm0, (%rcx) @@ -130,7 +130,7 @@ define void @store_i64_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,1,3,5,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,1,3,5,0,0] ; 
AVX512DQ-FCP-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx) ; AVX512DQ-FCP-NEXT: vmovaps %ymm0, (%rcx) @@ -142,7 +142,7 @@ define void @store_i64_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovaps (%rdi), %xmm0 ; AVX512BW-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,1,3,5,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,1,3,5,0,0] ; AVX512BW-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx) ; AVX512BW-NEXT: vmovaps %ymm0, (%rcx) @@ -154,7 +154,7 @@ define void @store_i64_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,1,3,5,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,1,3,5,0,0] ; AVX512BW-FCP-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx) ; AVX512BW-FCP-NEXT: vmovaps %ymm0, (%rcx) @@ -166,7 +166,7 @@ define void @store_i64_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,1,3,5,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,1,3,5,0,0] ; AVX512DQ-BW-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx) ; AVX512DQ-BW-NEXT: vmovaps %ymm0, (%rcx) @@ -178,7 +178,7 @@ define void @store_i64_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,1,3,5,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,1,3,5,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm0, (%rcx) diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll index 5d02bb8b05f18..cfb5fac2fd7aa 100644 --- a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll @@ -1815,16 +1815,26 @@ define <2 x i64> @foldv2i64() nounwind { ; SSE-NEXT: movss {{.*#+}} xmm0 = [55,0,0,0] ; SSE-NEXT: retq ; -; NOBW-LABEL: foldv2i64: -; NOBW: # %bb.0: -; NOBW-NEXT: vmovss {{.*#+}} xmm0 = [55,0,0,0] -; NOBW-NEXT: retq +; AVX1OR2-LABEL: foldv2i64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vmovss {{.*#+}} xmm0 = [55,0,0,0] +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: foldv2i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm0 = [55,0] +; AVX512VL-NEXT: retq ; ; AVX512VLBWDQ-LABEL: foldv2i64: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovss {{.*#+}} xmm0 = [55,0,0,0] +; AVX512VLBWDQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = [55,0] ; AVX512VLBWDQ-NEXT: retq ; +; AVX512-LABEL: foldv2i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [55,0] +; AVX512-NEXT: retq +; ; X86-SSE-LABEL: foldv2i64: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = [55,0,0,0] @@ -1839,16 +1849,26 @@ define <2 x i64> @foldv2i64u() nounwind { ; SSE-NEXT: movss {{.*#+}} xmm0 = [55,0,0,0] ; SSE-NEXT: retq 
; -; NOBW-LABEL: foldv2i64u: -; NOBW: # %bb.0: -; NOBW-NEXT: vmovss {{.*#+}} xmm0 = [55,0,0,0] -; NOBW-NEXT: retq +; AVX1OR2-LABEL: foldv2i64u: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vmovss {{.*#+}} xmm0 = [55,0,0,0] +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: foldv2i64u: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm0 = [55,0] +; AVX512VL-NEXT: retq ; ; AVX512VLBWDQ-LABEL: foldv2i64u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovss {{.*#+}} xmm0 = [55,0,0,0] +; AVX512VLBWDQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = [55,0] ; AVX512VLBWDQ-NEXT: retq ; +; AVX512-LABEL: foldv2i64u: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [55,0] +; AVX512-NEXT: retq +; ; X86-SSE-LABEL: foldv2i64u: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = [55,0,0,0] @@ -1863,16 +1883,26 @@ define <4 x i32> @foldv4i32() nounwind { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [23,0,32,24] ; SSE-NEXT: retq ; -; NOBW-LABEL: foldv4i32: -; NOBW: # %bb.0: -; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] -; NOBW-NEXT: retq +; AVX1OR2-LABEL: foldv4i32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: foldv4i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm0 = [23,0,32,24] +; AVX512VL-NEXT: retq ; ; AVX512VLBWDQ-LABEL: foldv4i32: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] +; AVX512VLBWDQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [23,0,32,24] ; AVX512VLBWDQ-NEXT: retq ; +; AVX512-LABEL: foldv4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [23,0,32,24] +; AVX512-NEXT: retq +; ; X86-SSE-LABEL: foldv4i32: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [23,0,32,24] @@ -1887,16 +1917,26 @@ define <4 x i32> @foldv4i32u() nounwind { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [23,0,32,24] ; SSE-NEXT: retq ; -; NOBW-LABEL: foldv4i32u: -; NOBW: # %bb.0: -; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] -; NOBW-NEXT: retq +; AVX1OR2-LABEL: foldv4i32u: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: foldv4i32u: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm0 = [23,0,32,24] +; AVX512VL-NEXT: retq ; ; AVX512VLBWDQ-LABEL: foldv4i32u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] +; AVX512VLBWDQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [23,0,32,24] ; AVX512VLBWDQ-NEXT: retq ; +; AVX512-LABEL: foldv4i32u: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [23,0,32,24] +; AVX512-NEXT: retq +; ; X86-SSE-LABEL: foldv4i32u: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [23,0,32,24] diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-256.ll b/llvm/test/CodeGen/X86/vector-lzcnt-256.ll index 8a0d9a6134cea..db363493e2dac 100644 --- a/llvm/test/CodeGen/X86/vector-lzcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-256.ll @@ -1128,10 +1128,30 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { } define <4 x i64> @foldv4i64() nounwind { -; X64-LABEL: foldv4i64: -; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] -; X64-NEXT: retq +; AVX1-LABEL: foldv4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: foldv4i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm0 = [55,0,64,56] +; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: foldv4i64: +; 
AVX512VLBWDQ: # %bb.0: +; AVX512VLBWDQ-NEXT: vpmovsxbq {{.*#+}} ymm0 = [55,0,64,56] +; AVX512VLBWDQ-NEXT: retq +; +; AVX512-LABEL: foldv4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [55,0,64,56] +; AVX512-NEXT: retq ; ; X86-AVX-LABEL: foldv4i64: ; X86-AVX: # %bb.0: @@ -1142,10 +1162,30 @@ define <4 x i64> @foldv4i64() nounwind { } define <4 x i64> @foldv4i64u() nounwind { -; X64-LABEL: foldv4i64u: -; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] -; X64-NEXT: retq +; AVX1-LABEL: foldv4i64u: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv4i64u: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: foldv4i64u: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm0 = [55,0,64,56] +; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: foldv4i64u: +; AVX512VLBWDQ: # %bb.0: +; AVX512VLBWDQ-NEXT: vpmovsxbq {{.*#+}} ymm0 = [55,0,64,56] +; AVX512VLBWDQ-NEXT: retq +; +; AVX512-LABEL: foldv4i64u: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [55,0,64,56] +; AVX512-NEXT: retq ; ; X86-AVX-LABEL: foldv4i64u: ; X86-AVX: # %bb.0: @@ -1156,10 +1196,30 @@ define <4 x i64> @foldv4i64u() nounwind { } define <8 x i32> @foldv8i32() nounwind { -; X64-LABEL: foldv8i32: -; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] -; X64-NEXT: retq +; AVX1-LABEL: foldv8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: foldv8i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: foldv8i32: +; AVX512VLBWDQ: # %bb.0: +; AVX512VLBWDQ-NEXT: vpmovsxbd {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; AVX512VLBWDQ-NEXT: retq +; +; AVX512-LABEL: foldv8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; AVX512-NEXT: retq ; ; X86-AVX-LABEL: foldv8i32: ; X86-AVX: # %bb.0: @@ -1170,10 +1230,30 @@ define <8 x i32> @foldv8i32() nounwind { } define <8 x i32> @foldv8i32u() nounwind { -; X64-LABEL: foldv8i32u: -; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] -; X64-NEXT: retq +; AVX1-LABEL: foldv8i32u: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv8i32u: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: foldv8i32u: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: foldv8i32u: +; AVX512VLBWDQ: # %bb.0: +; AVX512VLBWDQ-NEXT: vpmovsxbd {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; AVX512VLBWDQ-NEXT: retq +; +; AVX512-LABEL: foldv8i32u: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; AVX512-NEXT: retq ; ; X86-AVX-LABEL: foldv8i32u: ; X86-AVX: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-popcnt-128.ll b/llvm/test/CodeGen/X86/vector-popcnt-128.ll index e178ab0348a76..741d70a369022 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-128.ll @@ -629,19 +629,29 @@ define <2 x i64> @foldv2i64() nounwind { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,64] ; SSE-NEXT: retq ; -; AVX-LABEL: 
foldv2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,64] -; AVX-NEXT: retq +; AVX1OR2-LABEL: foldv2i64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vmovaps {{.*#+}} xmm0 = [1,64] +; AVX1OR2-NEXT: retq +; +; XOP-LABEL: foldv2i64: +; XOP: # %bb.0: +; XOP-NEXT: vmovaps {{.*#+}} xmm0 = [1,64] +; XOP-NEXT: retq +; +; AVX512-LABEL: foldv2i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,64] +; AVX512-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv2i64: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [1,64] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,64] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv2i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [1,64] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,64] ; BITALG-NEXT: retq %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> ) ret <2 x i64> %out @@ -653,19 +663,29 @@ define <4 x i32> @foldv4i32() nounwind { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,32,0,8] ; SSE-NEXT: retq ; -; AVX-LABEL: foldv4i32: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,32,0,8] -; AVX-NEXT: retq +; AVX1OR2-LABEL: foldv4i32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vmovaps {{.*#+}} xmm0 = [1,32,0,8] +; AVX1OR2-NEXT: retq +; +; XOP-LABEL: foldv4i32: +; XOP: # %bb.0: +; XOP-NEXT: vmovaps {{.*#+}} xmm0 = [1,32,0,8] +; XOP-NEXT: retq +; +; AVX512-LABEL: foldv4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,32,0,8] +; AVX512-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv4i32: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [1,32,0,8] +; BITALG_NOVLX-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,32,0,8] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv4i32: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [1,32,0,8] +; BITALG-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,32,0,8] ; BITALG-NEXT: retq %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> ) ret <4 x i32> %out diff --git a/llvm/test/CodeGen/X86/vector-popcnt-256.ll b/llvm/test/CodeGen/X86/vector-popcnt-256.ll index 6c45742730a62..701b9622089db 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-256.ll @@ -409,19 +409,79 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { } define <4 x i64> @foldv4i64() nounwind { -; ALL-LABEL: foldv4i64: -; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [1,64,0,8] -; ALL-NEXT: retq +; AVX1-LABEL: foldv4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,64,0,8] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [1,64,0,8] +; AVX2-NEXT: retq +; +; XOP-LABEL: foldv4i64: +; XOP: # %bb.0: +; XOP-NEXT: vmovaps {{.*#+}} ymm0 = [1,64,0,8] +; XOP-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: foldv4i64: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,64,0,8] +; AVX512VPOPCNTDQ-NEXT: retq +; +; AVX512VPOPCNTDQVL-LABEL: foldv4i64: +; AVX512VPOPCNTDQVL: # %bb.0: +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,64,0,8] +; AVX512VPOPCNTDQVL-NEXT: retq +; +; BITALG_NOVLX-LABEL: foldv4i64: +; BITALG_NOVLX: # %bb.0: +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,64,0,8] +; BITALG_NOVLX-NEXT: retq +; +; BITALG-LABEL: foldv4i64: +; BITALG: # %bb.0: +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,64,0,8] +; BITALG-NEXT: retq %out = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> ) ret <4 x i64> %out } define <8 x i32> @foldv8i32() nounwind { -; ALL-LABEL: foldv8i32: -; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [1,32,0,8,16,3,2,3] -; ALL-NEXT: retq +; 
AVX1-LABEL: foldv8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,32,0,8,16,3,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [1,32,0,8,16,3,2,3] +; AVX2-NEXT: retq +; +; XOP-LABEL: foldv8i32: +; XOP: # %bb.0: +; XOP-NEXT: vmovaps {{.*#+}} ymm0 = [1,32,0,8,16,3,2,3] +; XOP-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: foldv8i32: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,32,0,8,16,3,2,3] +; AVX512VPOPCNTDQ-NEXT: retq +; +; AVX512VPOPCNTDQVL-LABEL: foldv8i32: +; AVX512VPOPCNTDQVL: # %bb.0: +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,32,0,8,16,3,2,3] +; AVX512VPOPCNTDQVL-NEXT: retq +; +; BITALG_NOVLX-LABEL: foldv8i32: +; BITALG_NOVLX: # %bb.0: +; BITALG_NOVLX-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,32,0,8,16,3,2,3] +; BITALG_NOVLX-NEXT: retq +; +; BITALG-LABEL: foldv8i32: +; BITALG: # %bb.0: +; BITALG-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,32,0,8,16,3,2,3] +; BITALG-NEXT: retq %out = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> ) ret <8 x i32> %out } diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll index 025518d1a101e..a79b109feec72 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -2516,7 +2516,7 @@ define <4 x float> @shuffle_mem_v4f32_0624(<4 x float> %a0, ptr %a1) { ; ; AVX512VL-LABEL: shuffle_mem_v4f32_0624: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovaps {{.*#+}} xmm1 = [4,2,6,0] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,2,6,0] ; AVX512VL-NEXT: vpermt2ps (%rdi), %xmm1, %xmm0 ; AVX512VL-NEXT: retq %1 = load <4 x float>, ptr %a1 @@ -2540,7 +2540,7 @@ define <4 x float> @shuffle_mem_v4f32_4760(<4 x float> %a0, ptr %a1) { ; ; AVX512VL-LABEL: shuffle_mem_v4f32_4760: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,3,2,4] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,3,2,4] ; AVX512VL-NEXT: vpermt2ps (%rdi), %xmm1, %xmm0 ; AVX512VL-NEXT: retq %1 = load <4 x float>, ptr %a1 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll index 919869df6c262..81f79f3b1399a 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -332,7 +332,7 @@ define <4 x double> @shuffle_v4f64_0423(<4 x double> %a, <4 x double> %b) { ; ; AVX512VL-FAST-LABEL: shuffle_v4f64_0423: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [0,4,2,3] +; AVX512VL-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,4,2,3] ; AVX512VL-FAST-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> @@ -349,7 +349,7 @@ define <4 x double> @shuffle_v4f64_0462(<4 x double> %a, <4 x double> %b) { ; ; AVX512VL-LABEL: shuffle_v4f64_0462: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovapd {{.*#+}} ymm2 = [0,4,6,2] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,4,6,2] ; AVX512VL-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> @@ -461,7 +461,7 @@ define <4 x double> @shuffle_v4f64_1054(<4 x double> %a, <4 x double> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v4f64_1054: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm2 = [1,0,5,4] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,0,5,4] ; AVX512VL-FAST-ALL-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -489,7 
+489,7 @@ define <4 x double> @shuffle_v4f64_3254(<4 x double> %a, <4 x double> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v4f64_3254: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm2 = [3,2,5,4] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [3,2,5,4] ; AVX512VL-FAST-ALL-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -517,7 +517,7 @@ define <4 x double> @shuffle_v4f64_3276(<4 x double> %a, <4 x double> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v4f64_3276: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm2 = [3,2,7,6] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [3,2,7,6] ; AVX512VL-FAST-ALL-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -545,7 +545,7 @@ define <4 x double> @shuffle_v4f64_1076(<4 x double> %a, <4 x double> %b) { ; ; AVX512VL-FAST-LABEL: shuffle_v4f64_1076: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [1,0,7,6] +; AVX512VL-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,0,7,6] ; AVX512VL-FAST-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> @@ -569,7 +569,7 @@ define <4 x double> @shuffle_v4f64_0415(<4 x double> %a, <4 x double> %b) { ; ; AVX512VL-LABEL: shuffle_v4f64_0415: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovapd {{.*#+}} ymm2 = [0,4,1,5] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,4,1,5] ; AVX512VL-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> @@ -586,7 +586,7 @@ define <4 x double> @shuffle_v4f64_2741(<4 x double> %a, <4 x double> %b) { ; ; AVX512VL-LABEL: shuffle_v4f64_2741: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovapd {{.*#+}} ymm2 = [2,7,4,1] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [2,7,4,1] ; AVX512VL-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> @@ -675,7 +675,7 @@ define <4 x double> @shuffle_v4f64_0456(<4 x double> %a, <4 x double> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v4f64_0456: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm2 = [4,0,1,2] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [4,0,1,2] ; AVX512VL-FAST-ALL-NEXT: vpermi2pd %ymm0, %ymm1, %ymm2 ; AVX512VL-FAST-ALL-NEXT: vmovapd %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq @@ -762,7 +762,7 @@ define <4 x double> @shuffle_v4f64_0044(<4 x double> %a, <4 x double> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v4f64_0044: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm2 = [0,0,4,4] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,4] ; AVX512VL-FAST-ALL-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll index b815cef63bfd4..bd78dbded0705 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -56,7 +56,7 @@ define <8 x float> @shuffle_v8f32_00000010(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_00000010: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,0,1] ; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -103,7 +103,7 @@ define <8 x float> @shuffle_v8f32_00000200(<8 x float> %a, 
<8 x float> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_00000200: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] ; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -150,7 +150,7 @@ define <8 x float> @shuffle_v8f32_00003000(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_00003000: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,3,0] ; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -172,11 +172,17 @@ define <8 x float> @shuffle_v8f32_00040000(<8 x float> %a, <8 x float> %b) { ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[2,0],ymm0[4,4],ymm1[6,4] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8f32_00040000: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,0,4] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8f32_00040000: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,0,4] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8f32_00040000: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4] +; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -189,11 +195,17 @@ define <8 x float> @shuffle_v8f32_00500000(<8 x float> %a, <8 x float> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,4,4] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8f32_00500000: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,5,0] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8f32_00500000: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,5,0] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8f32_00500000: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,5] +; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -206,11 +218,17 @@ define <8 x float> @shuffle_v8f32_06000000(<8 x float> %a, <8 x float> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,0,0,4,4,4,4] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8f32_06000000: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovsd {{.*#+}} xmm1 = [0,6,0,0] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8f32_06000000: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = [0,6,0,0] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8f32_06000000: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,6,0,0] +; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -223,11 +241,17 @@ define <8 x float> @shuffle_v8f32_70000000(<8 x float> %a, <8 x float> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,0,0,4,4,4,4] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8f32_70000000: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovss {{.*#+}} xmm1 = [7,0,0,0] -; 
AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8f32_70000000: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [7,0,0,0] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8f32_70000000: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,0] +; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -249,11 +273,17 @@ define <8 x float> @shuffle_v8f32_00112233(<8 x float> %a, <8 x float> %b) { ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8f32_00112233: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8f32_00112233: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8f32_00112233: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] +; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -292,7 +322,7 @@ define <8 x float> @shuffle_v8f32_00001111(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_00001111: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] ; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -364,7 +394,7 @@ define <8 x float> @shuffle_v8f32_08084c4c(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_08084c4c: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,8,0,8,4,12,4,12] +; AVX512VL-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,8,0,8,4,12,4,12] ; AVX512VL-FAST-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -426,7 +456,7 @@ define <8 x float> @shuffle_v8f32_08192a3b(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-LABEL: shuffle_v8f32_08192a3b: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [0,8,1,9,2,10,3,11] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,8,1,9,2,10,3,11] ; AVX512VL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -472,7 +502,7 @@ define <8 x float> @shuffle_v8f32_08991abb(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-LABEL: shuffle_v8f32_08991abb: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [8,0,1,1,9,2,3,3] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,0,1,1,9,2,3,3] ; AVX512VL-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovaps %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -503,7 +533,7 @@ define <8 x float> @shuffle_v8f32_091b2d3f(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_091b2d3f: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [0,9,1,11,2,13,3,15] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,9,1,11,2,13,3,15] ; AVX512VL-FAST-ALL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -547,7 +577,7 @@ define <8 x float> @shuffle_v8f32_09ab1def(<8 x float> %a, <8 x float> %b) { ; ; 
AVX512VL-LABEL: shuffle_v8f32_09ab1def: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [8,1,2,3,9,5,6,7] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,1,2,3,9,5,6,7] ; AVX512VL-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovaps %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -924,7 +954,7 @@ define <8 x float> @shuffle_v8f32_c348cda0(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-LABEL: shuffle_v8f32_c348cda0: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [4,11,12,0,4,5,2,8] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4,11,12,0,4,5,2,8] ; AVX512VL-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovaps %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -972,7 +1002,7 @@ define <8 x float> @shuffle_v8f32_f511235a(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-LABEL: shuffle_v8f32_f511235a: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [15,5,1,1,2,3,5,10] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [15,5,1,1,2,3,5,10] ; AVX512VL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -1108,7 +1138,7 @@ define <8 x float> @shuffle_v8f32_76543210(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_76543210: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] ; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1136,7 +1166,7 @@ define <8 x float> @shuffle_v8f32_3210ba98(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_3210ba98: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [3,2,1,0,11,10,9,8] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,2,1,0,11,10,9,8] ; AVX512VL-FAST-ALL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1164,7 +1194,7 @@ define <8 x float> @shuffle_v8f32_3210fedc(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_3210fedc: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12] +; AVX512VL-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12] ; AVX512VL-FAST-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -1186,7 +1216,7 @@ define <8 x float> @shuffle_v8f32_7654fedc(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_7654fedc: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [7,6,5,4,15,14,13,12] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [7,6,5,4,15,14,13,12] ; AVX512VL-FAST-ALL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1214,7 +1244,7 @@ define <8 x float> @shuffle_v8f32_fedc7654(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_fedc7654: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [7,6,5,4,15,14,13,12] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [7,6,5,4,15,14,13,12] ; AVX512VL-FAST-ALL-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2 ; AVX512VL-FAST-ALL-NEXT: vmovaps %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq @@ -1250,7 +1280,7 @@ define <8 x float> @PR21138(<8 x float> %truc, <8 x float> %tchose) { ; ; AVX512VL-FAST-ALL-LABEL: PR21138: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} 
ymm2 = [1,3,5,7,9,11,13,15] ; AVX512VL-FAST-ALL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1278,7 +1308,7 @@ define <8 x float> @shuffle_v8f32_ba987654(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_ba987654: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12] +; AVX512VL-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12] ; AVX512VL-FAST-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2 ; AVX512VL-FAST-NEXT: vmovaps %ymm2, %ymm0 ; AVX512VL-FAST-NEXT: retq @@ -1301,7 +1331,7 @@ define <8 x float> @shuffle_v8f32_ba983210(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_ba983210: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [3,2,1,0,11,10,9,8] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,2,1,0,11,10,9,8] ; AVX512VL-FAST-ALL-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2 ; AVX512VL-FAST-ALL-NEXT: vmovaps %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq @@ -1356,7 +1386,7 @@ define <8 x float> @shuffle_v8f32_084c195d(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_084c195d: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [0,8,4,12,1,9,5,13] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,8,4,12,1,9,5,13] ; AVX512VL-FAST-ALL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1404,7 +1434,7 @@ define <8 x float> @shuffle_v8f32_089abcde(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-LABEL: shuffle_v8f32_089abcde: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [8,0,1,2,3,4,5,6] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,0,1,2,3,4,5,6] ; AVX512VL-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovaps %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -1427,7 +1457,7 @@ define <8 x float> @shuffle_v8f32_0189abcd(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_0189abcd: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm2 = [4,0,1,2] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [4,0,1,2] ; AVX512VL-FAST-ALL-NEXT: vpermi2pd %ymm0, %ymm1, %ymm2 ; AVX512VL-FAST-ALL-NEXT: vmovapd %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq @@ -1459,7 +1489,7 @@ define <8 x float> @shuffle_v8f32_01289abc(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-LABEL: shuffle_v8f32_01289abc: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [8,9,10,0,1,2,3,4] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,9,10,0,1,2,3,4] ; AVX512VL-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovaps %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -1599,7 +1629,7 @@ define <8 x float> @shuffle_mem_v8f32_8BA0CFE4(<8 x float> %a0, ptr %a1) { ; ; AVX512VL-FAST-LABEL: shuffle_mem_v8f32_8BA0CFE4: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,3,2,8,4,7,6,12] +; AVX512VL-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,3,2,8,4,7,6,12] ; AVX512VL-FAST-NEXT: vpermt2ps (%rdi), %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %1 = load <8 x float>, ptr %a1 @@ -1656,7 +1686,7 @@ define <8 x i32> @shuffle_v8i32_00000010(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_00000010: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,0,1] ; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1703,7 +1733,7 @@ define <8 x i32> @shuffle_v8i32_00000200(<8 x i32> %a, <8 x 
i32> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_00000200: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] ; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1750,7 +1780,7 @@ define <8 x i32> @shuffle_v8i32_00003000(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_00003000: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,3,0] ; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1772,11 +1802,17 @@ define <8 x i32> @shuffle_v8i32_00040000(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[2,0],ymm0[4,4],ymm1[6,4] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_00040000: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,0,4] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8i32_00040000: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,0,4] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i32_00040000: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4] +; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -1789,11 +1825,17 @@ define <8 x i32> @shuffle_v8i32_00500000(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,4,4] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_00500000: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,5,0] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8i32_00500000: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,5,0] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i32_00500000: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,5] +; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -1806,11 +1848,17 @@ define <8 x i32> @shuffle_v8i32_06000000(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,0,0,4,4,4,4] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_06000000: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovsd {{.*#+}} xmm1 = [0,6,0,0] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8i32_06000000: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = [0,6,0,0] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i32_06000000: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,6,0,0] +; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -1823,11 +1871,17 @@ define <8 x i32> @shuffle_v8i32_70000000(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,0,0,4,4,4,4] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_70000000: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovss {{.*#+}} xmm1 = [7,0,0,0] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; 
AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8i32_70000000: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [7,0,0,0] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i32_70000000: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,0] +; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -1880,7 +1934,7 @@ define <8 x i32> @shuffle_v8i32_00112233(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_00112233: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] ; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1927,7 +1981,7 @@ define <8 x i32> @shuffle_v8i32_00001111(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_00001111: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] ; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -2329,11 +2383,17 @@ define <8 x i32> @shuffle_v8i32_00015444(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,5,4,4,4] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_00015444: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,1,5,4,4,4] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8i32_00015444: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,1,5,4,4,4] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i32_00015444: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,5,4,4,4] +; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2344,11 +2404,17 @@ define <8 x i32> @shuffle_v8i32_00204644(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,6,4,4] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_00204644: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,2,0,4,6,4,4] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8i32_00204644: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,2,0,4,6,4,4] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i32_00204644: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,2,0,4,6,4,4] +; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2359,11 +2425,17 @@ define <8 x i32> @shuffle_v8i32_03004474(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,4,7,4] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_03004474: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,3,0,0,4,4,7,4] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8i32_03004474: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,3,0,0,4,4,7,4] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: 
shuffle_v8i32_03004474: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,3,0,0,4,4,7,4] +; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2374,11 +2446,17 @@ define <8 x i32> @shuffle_v8i32_10004444(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,4,4,4,4] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_10004444: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [1,0,0,0,4,4,4,4] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8i32_10004444: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [1,0,0,0,4,4,4,4] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i32_10004444: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,0,0,0,4,4,4,4] +; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2389,11 +2467,17 @@ define <8 x i32> @shuffle_v8i32_22006446(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,4,4,6] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_22006446: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [2,2,0,0,6,4,4,6] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8i32_22006446: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [2,2,0,0,6,4,4,6] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i32_22006446: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,0,0,6,4,4,6] +; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2404,11 +2488,17 @@ define <8 x i32> @shuffle_v8i32_33307474(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,4,7,4] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_33307474: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [3,3,3,0,7,4,7,4] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8i32_33307474: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [3,3,3,0,7,4,7,4] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i32_33307474: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [3,3,3,0,7,4,7,4] +; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2419,11 +2509,17 @@ define <8 x i32> @shuffle_v8i32_32104567(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_32104567: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [3,2,1,0,4,5,6,7] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8i32_32104567: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [3,2,1,0,4,5,6,7] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i32_32104567: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [3,2,1,0,4,5,6,7] +; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; 
AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2434,11 +2530,17 @@ define <8 x i32> @shuffle_v8i32_00236744(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,6,7,4,4] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_00236744: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,2,3,6,7,4,4] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8i32_00236744: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,2,3,6,7,4,4] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i32_00236744: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,2,3,6,7,4,4] +; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2449,11 +2551,17 @@ define <8 x i32> @shuffle_v8i32_00226644(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,6,6,4,4] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_00226644: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,2,2,6,6,4,4] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8i32_00226644: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,2,2,6,6,4,4] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i32_00226644: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,2,2,6,6,4,4] +; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2464,11 +2572,17 @@ define <8 x i32> @shuffle_v8i32_10324567(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_10324567: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [1,0,3,2,4,5,6,7] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8i32_10324567: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [1,0,3,2,4,5,6,7] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i32_10324567: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,0,3,2,4,5,6,7] +; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2479,11 +2593,17 @@ define <8 x i32> @shuffle_v8i32_11334567(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_11334567: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [1,1,3,3,4,5,6,7] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8i32_11334567: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [1,1,3,3,4,5,6,7] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i32_11334567: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,1,3,3,4,5,6,7] +; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2494,11 +2614,17 @@ define <8 x i32> 
@shuffle_v8i32_01235467(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,7] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_01235467: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,1,2,3,5,4,6,7] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8i32_01235467: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,1,2,3,5,4,6,7] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i32_01235467: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,5,4,6,7] +; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2509,11 +2635,17 @@ define <8 x i32> @shuffle_v8i32_01235466(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,6] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_01235466: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,1,2,3,5,4,6,6] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8i32_01235466: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,1,2,3,5,4,6,6] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i32_01235466: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,5,4,6,6] +; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2524,11 +2656,17 @@ define <8 x i32> @shuffle_v8i32_002u6u44(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,u,6,u,4,4] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_002u6u44: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,2,u,6,u,4,4] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8i32_002u6u44: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,2,u,6,u,4,4] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i32_002u6u44: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,2,0,6,0,4,4] +; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2539,11 +2677,17 @@ define <8 x i32> @shuffle_v8i32_00uu66uu(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,u,u,6,6,u,u] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_00uu66uu: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,u,u,6,6,u,u] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8i32_00uu66uu: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,u,u,6,6,u,u] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i32_00uu66uu: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,6,6,0,0] +; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2554,11 +2698,17 @@ define <8 x i32> @shuffle_v8i32_103245uu(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,u,u] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: 
shuffle_v8i32_103245uu: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [1,0,3,2,4,5,u,u] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8i32_103245uu: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [1,0,3,2,4,5,u,u] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i32_103245uu: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,0,3,2,4,5,0,0] +; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2569,11 +2719,17 @@ define <8 x i32> @shuffle_v8i32_1133uu67(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,u,u,6,7] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_1133uu67: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [1,1,3,3,u,u,6,7] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8i32_1133uu67: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [1,1,3,3,u,u,6,7] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i32_1133uu67: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,1,3,3,0,0,6,7] +; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2584,11 +2740,17 @@ define <8 x i32> @shuffle_v8i32_0uu354uu(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,u,u,3,5,4,u,u] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_0uu354uu: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,u,u,3,5,4,u,u] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8i32_0uu354uu: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,u,u,3,5,4,u,u] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i32_0uu354uu: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,3,5,4,0,0] +; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2599,11 +2761,17 @@ define <8 x i32> @shuffle_v8i32_uuu3uu66(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[u,u,u,3,u,u,6,6] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_uuu3uu66: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [u,u,u,3,u,u,6,6] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8i32_uuu3uu66: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [u,u,u,3,u,u,6,6] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i32_uuu3uu66: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,3,0,0,6,6] +; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2781,7 +2949,7 @@ define <8 x i32> @shuffle_v8i32_76543210(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_76543210: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] ; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, 
%ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -3713,7 +3881,7 @@ define <8 x float> @broadcast_concat_crash(<4 x float> %x, <4 x float> %y, float ; AVX512VL-FAST: # %bb.0: # %entry ; AVX512VL-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} xmm1 = [1,4,3,3] +; AVX512VL-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,4,3,3] ; AVX512VL-FAST-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1 ; AVX512VL-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-FAST-NEXT: retq @@ -3847,7 +4015,7 @@ define <8 x float> @lowhalf_v8f32(<8 x float> %x, <8 x float> %y) { ; ; AVX512VL-LABEL: lowhalf_v8f32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovaps {{.*#+}} xmm2 = [2,14,3,14] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,14,3,14] ; AVX512VL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %r = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll index d3b04878dc06d..b1efb416014b0 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll @@ -292,7 +292,7 @@ define <16 x i32> @shuffle_v16i32_01_02_03_16_05_06_07_20_09_10_11_24_13_14_15_2 define <16 x float> @shuffle_v16f32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01(<16 x float> %a) { ; ALL-LABEL: shuffle_v16f32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} zmm1 = [2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1] +; ALL-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,5,0,0,7,0,10,1,0,5,0,4,7,0,10,1] ; ALL-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %c = shufflevector <16 x float> %a, <16 x float> poison, <16 x i32> @@ -302,7 +302,7 @@ define <16 x float> @shuffle_v16f32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01(< define <16 x i32> @shuffle_v16i32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01(<16 x i32> %a) { ; ALL-LABEL: shuffle_v16i32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} zmm1 = [2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1] +; ALL-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,5,0,0,7,0,10,1,0,5,0,4,7,0,10,1] ; ALL-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %c = shufflevector <16 x i32> %a, <16 x i32> poison, <16 x i32> @@ -322,7 +322,7 @@ define <16 x i32> @shuffle_v16i32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_1 define <16 x float> @shuffle_v16f32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18(<16 x float> %a, <16 x float> %b) { ; ALL-LABEL: shuffle_v16f32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24] +; ALL-NEXT: vpmovsxbd {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24] ; ALL-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 ; ALL-NEXT: retq %c = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> @@ -339,7 +339,7 @@ define <16 x i32> @shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_0 ; ; FAST-LABEL: shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04: ; FAST: # %bb.0: -; FAST-NEXT: vmovaps {{.*#+}} zmm1 = [11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4] +; FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4] ; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; FAST-NEXT: retq %1 = shufflevector <16 x i32> %a, <16 x i32> poison, <16 x i32> @@ -355,7 +355,7 @@ define <16 x float> @shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05 ; ; FAST-LABEL: 
shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04: ; FAST: # %bb.0: -; FAST-NEXT: vmovaps {{.*#+}} zmm1 = [11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4] +; FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4] ; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; FAST-NEXT: retq %1 = shufflevector <16 x float> %a, <16 x float> poison, <16 x i32> @@ -365,7 +365,7 @@ define <16 x float> @shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05 define <16 x float> @shuffle_v16f32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18(<16 x float> %a, ptr %b) { ; ALL-LABEL: shuffle_v16f32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} zmm1 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24] +; ALL-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24] ; ALL-NEXT: vpermt2ps (%rdi), %zmm1, %zmm0 ; ALL-NEXT: retq %c = load <16 x float>, ptr %b @@ -382,7 +382,7 @@ define <16 x float> @shuffle_v16f32_load_08_11_10_00_12_15_14_04(<16 x float> %a ; ; FAST-LABEL: shuffle_v16f32_load_08_11_10_00_12_15_14_04: ; FAST: # %bb.0: -; FAST-NEXT: vmovaps {{.*#+}} zmm1 = [0,3,2,16,4,7,6,20,8,11,10,24,12,15,14,28] +; FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,3,2,16,4,7,6,20,8,11,10,24,12,15,14,28] ; FAST-NEXT: vpermt2ps (%rdi), %zmm1, %zmm0 ; FAST-NEXT: retq %1 = load <16 x float>, ptr %a1 @@ -421,7 +421,7 @@ define <8 x i32> @test_v16i32_1_3_5_7_9_11_13_15(<16 x i32> %v) { ; ; FAST-LABEL: test_v16i32_1_3_5_7_9_11_13_15: ; FAST: # %bb.0: -; FAST-NEXT: vmovaps {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15] +; FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15] ; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; FAST-NEXT: retq @@ -441,7 +441,7 @@ define <4 x i32> @test_v16i32_0_1_2_12 (<16 x i32> %v) { ; ; FAST-LABEL: test_v16i32_0_1_2_12: ; FAST: # %bb.0: -; FAST-NEXT: vmovaps {{.*#+}} xmm1 = [0,1,2,12] +; FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,1,2,12] ; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; FAST-NEXT: vzeroupper @@ -455,7 +455,7 @@ define <4 x i32> @test_v16i32_0_1_2_12 (<16 x i32> %v) { define <4 x i32> @test_v16i32_0_4_8_12(<16 x i32> %v) { ; ALL-LABEL: test_v16i32_0_4_8_12: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} xmm1 = [0,4,8,12] +; ALL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,8,12] ; ALL-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; ALL-NEXT: vzeroupper @@ -478,7 +478,7 @@ define <8 x float> @shuffle_v16f32_extract_256(ptr %RET, ptr %a) { define <8 x float> @test_v16f32_0_1_2_3_4_6_7_10 (<16 x float> %v) { ; ALL-LABEL: test_v16f32_0_1_2_3_4_6_7_10: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,1,2,3,4,6,7,10] +; ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,4,6,7,10] ; ALL-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; ALL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; ALL-NEXT: retq @@ -490,7 +490,7 @@ define <8 x float> @test_v16f32_0_1_2_3_4_6_7_10 (<16 x float> %v) { define <4 x float> @test_v16f32_0_1_3_6 (<16 x float> %v) { ; ALL-LABEL: test_v16f32_0_1_3_6: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} xmm1 = [0,1,3,6] +; ALL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,1,3,6] ; ALL-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; ALL-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll index 62327fb2618ce..4fe50a60b67fa 100644 --- 
a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll @@ -121,7 +121,7 @@ define <64 x i8> @shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<64 x i8> %a) { ; ; AVX512DQ-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovss {{.*#+}} xmm1 = [255,0,0,0] +; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] ; AVX512DQ-NEXT: vandps %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll index 172ba83d30415..38fdf0b23970a 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll @@ -44,97 +44,61 @@ define <8 x double> @shuffle_v8f64_44444444_bc(<8 x i64> %a, <8 x i64> %b) { } define <8 x double> @shuffle_v8f64_00000010(<8 x double> %a, <8 x double> %b) { -; AVX512F-LABEL: shuffle_v8f64_00000010: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8f64_00000010: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8f64_00000010: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_00000200(<8 x double> %a, <8 x double> %b) { -; AVX512F-LABEL: shuffle_v8f64_00000200: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,2,0,0] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8f64_00000200: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8f64_00000200: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,2,0,0] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_00003000(<8 x double> %a, <8 x double> %b) { -; AVX512F-LABEL: shuffle_v8f64_00003000: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,3,0,0,0] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8f64_00003000: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8f64_00003000: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,3,0,0,0] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_00040000(<8 x double> %a, <8 x double> %b) { -; AVX512F-LABEL: shuffle_v8f64_00040000: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8f64_00040000: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,4,0] -; AVX512F-32-NEXT: vpermpd 
%zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8f64_00040000: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,0,4] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_00500000(<8 x double> %a, <8 x double> %b) { -; AVX512F-LABEL: shuffle_v8f64_00500000: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,5,0] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8f64_00500000: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,5,0,0,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8f64_00500000: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,5,0] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) { -; AVX512F-LABEL: shuffle_v8f64_06000000: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8f64_06000000: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,6,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8f64_06000000: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,6] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } @@ -142,7 +106,7 @@ define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_70000000: ; ALL: # %bb.0: -; ALL-NEXT: vmovss {{.*#+}} xmm1 = [7,0,0,0] +; ALL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,0] ; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> @@ -159,33 +123,21 @@ define <8 x double> @shuffle_v8f64_01014545(<8 x double> %a, <8 x double> %b) { } define <8 x double> @shuffle_v8f64_00112233(<8 x double> %a, <8 x double> %b) { -; AVX512F-LABEL: shuffle_v8f64_00112233: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8f64_00112233: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,1,0,1,0,2,0,2,0,3,0,3,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8f64_00112233: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_00001111(<8 x double> %a, <8 x double> %b) { -; AVX512F-LABEL: shuffle_v8f64_00001111: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8f64_00001111: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0] -; 
AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8f64_00001111: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } @@ -218,171 +170,106 @@ define <8 x double> @shuffle_v8f64_08080808(<8 x double> %a, <8 x double> %b) { } define <8 x double> @shuffle_v8f64_08084c4c(<8 x double> %a, <8 x double> %b) { -; AVX512F-LABEL: shuffle_v8f64_08084c4c: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [0,8,0,8,4,12,4,12] -; AVX512F-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8f64_08084c4c: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [0,0,8,0,0,0,8,0,4,0,12,0,4,0,12,0] -; AVX512F-32-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8f64_08084c4c: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,0,8,4,12,4,12] +; ALL-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_8823cc67(<8 x double> %a, <8 x double> %b) { -; AVX512F-LABEL: shuffle_v8f64_8823cc67: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [0,0,10,11,4,4,14,15] -; AVX512F-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vmovapd %zmm2, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8f64_8823cc67: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [0,0,0,0,10,0,11,0,4,0,4,0,14,0,15,0] -; AVX512F-32-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 -; AVX512F-32-NEXT: vmovapd %zmm2, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8f64_8823cc67: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,10,11,4,4,14,15] +; ALL-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 +; ALL-NEXT: vmovapd %zmm2, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_9832dc76(<8 x double> %a, <8 x double> %b) { -; AVX512F-LABEL: shuffle_v8f64_9832dc76: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [1,0,11,10,5,4,15,14] -; AVX512F-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vmovapd %zmm2, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8f64_9832dc76: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [1,0,0,0,11,0,10,0,5,0,4,0,15,0,14,0] -; AVX512F-32-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 -; AVX512F-32-NEXT: vmovapd %zmm2, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8f64_9832dc76: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [1,0,11,10,5,4,15,14] +; ALL-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 +; ALL-NEXT: vmovapd %zmm2, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_9810dc54(<8 x double> %a, <8 x double> %b) { -; AVX512F-LABEL: shuffle_v8f64_9810dc54: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [1,0,9,8,5,4,13,12] -; AVX512F-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vmovapd %zmm2, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8f64_9810dc54: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [1,0,0,0,9,0,8,0,5,0,4,0,13,0,12,0] -; AVX512F-32-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 -; 
AVX512F-32-NEXT: vmovapd %zmm2, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8f64_9810dc54: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [1,0,9,8,5,4,13,12] +; ALL-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 +; ALL-NEXT: vmovapd %zmm2, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_08194c5d(<8 x double> %a, <8 x double> %b) { -; AVX512F-LABEL: shuffle_v8f64_08194c5d: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [0,8,1,9,4,12,5,13] -; AVX512F-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8f64_08194c5d: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [0,0,8,0,1,0,9,0,4,0,12,0,5,0,13,0] -; AVX512F-32-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8f64_08194c5d: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,4,12,5,13] +; ALL-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_2a3b6e7f(<8 x double> %a, <8 x double> %b) { -; AVX512F-LABEL: shuffle_v8f64_2a3b6e7f: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [2,10,3,11,6,14,7,15] -; AVX512F-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8f64_2a3b6e7f: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [2,0,10,0,3,0,11,0,6,0,14,0,7,0,15,0] -; AVX512F-32-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8f64_2a3b6e7f: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [2,10,3,11,6,14,7,15] +; ALL-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_08192a3b(<8 x double> %a, <8 x double> %b) { -; AVX512F-LABEL: shuffle_v8f64_08192a3b: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] -; AVX512F-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8f64_08192a3b: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [0,0,8,0,1,0,9,0,2,0,10,0,3,0,11,0] -; AVX512F-32-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8f64_08192a3b: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] +; ALL-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_08991abb(<8 x double> %a, <8 x double> %b) { -; AVX512F-LABEL: shuffle_v8f64_08991abb: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [8,0,1,1,9,2,3,3] -; AVX512F-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vmovapd %zmm2, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8f64_08991abb: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [8,0,0,0,1,0,1,0,9,0,2,0,3,0,3,0] -; AVX512F-32-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 -; AVX512F-32-NEXT: vmovapd %zmm2, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8f64_08991abb: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [8,0,1,1,9,2,3,3] +; ALL-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 +; ALL-NEXT: vmovapd %zmm2, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, 
<8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_091b2d3f(<8 x double> %a, <8 x double> %b) { -; AVX512F-LABEL: shuffle_v8f64_091b2d3f: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [0,9,1,11,2,13,3,15] -; AVX512F-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8f64_091b2d3f: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [0,0,9,0,1,0,11,0,2,0,13,0,3,0,15,0] -; AVX512F-32-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8f64_091b2d3f: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,9,1,11,2,13,3,15] +; ALL-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_09ab1def(<8 x double> %a, <8 x double> %b) { -; AVX512F-LABEL: shuffle_v8f64_09ab1def: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [8,1,2,3,9,5,6,7] -; AVX512F-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vmovapd %zmm2, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8f64_09ab1def: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [8,0,1,0,2,0,3,0,9,0,5,0,6,0,7,0] -; AVX512F-32-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 -; AVX512F-32-NEXT: vmovapd %zmm2, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8f64_09ab1def: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [8,1,2,3,9,5,6,7] +; ALL-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 +; ALL-NEXT: vmovapd %zmm2, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } @@ -505,145 +392,91 @@ define <8 x double> @shuffle_v8f64_10225466(<8 x double> %a, <8 x double> %b) { } define <8 x double> @shuffle_v8f64_00015444(<8 x double> %a, <8 x double> %b) { -; AVX512F-LABEL: shuffle_v8f64_00015444: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,1,5,4,4,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8f64_00015444: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,5,0,4,0,4,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8f64_00015444: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,1,5,4,4,4] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_00204644(<8 x double> %a, <8 x double> %b) { -; AVX512F-LABEL: shuffle_v8f64_00204644: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,0,4,6,4,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8f64_00204644: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,0,0,4,0,6,0,4,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8f64_00204644: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,2,0,4,6,4,4] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_03004474(<8 x double> %a, <8 x double> %b) { -; AVX512F-LABEL: shuffle_v8f64_03004474: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = 
[0,3,0,0,4,4,7,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8f64_03004474: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,3,0,0,0,0,0,4,0,4,0,7,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8f64_03004474: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,3,0,0,4,4,7,4] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_10004444(<8 x double> %a, <8 x double> %b) { -; AVX512F-LABEL: shuffle_v8f64_10004444: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,0,0,4,4,4,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8f64_10004444: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,0,0,0,0,0,0,4,0,4,0,4,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8f64_10004444: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [1,0,0,0,4,4,4,4] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_22006446(<8 x double> %a, <8 x double> %b) { -; AVX512F-LABEL: shuffle_v8f64_22006446: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [2,2,0,0,6,4,4,6] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8f64_22006446: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [2,0,2,0,0,0,0,0,6,0,4,0,4,0,6,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8f64_22006446: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [2,2,0,0,6,4,4,6] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_33307474(<8 x double> %a, <8 x double> %b) { -; AVX512F-LABEL: shuffle_v8f64_33307474: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [3,3,3,0,7,4,7,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8f64_33307474: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [3,0,3,0,3,0,0,0,7,0,4,0,7,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8f64_33307474: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [3,3,3,0,7,4,7,4] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_32104567(<8 x double> %a, <8 x double> %b) { -; AVX512F-LABEL: shuffle_v8f64_32104567: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [3,2,1,0,4,5,6,7] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8f64_32104567: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [3,0,2,0,1,0,0,0,4,0,5,0,6,0,7,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8f64_32104567: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [3,2,1,0,4,5,6,7] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: 
ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_00236744(<8 x double> %a, <8 x double> %b) { -; AVX512F-LABEL: shuffle_v8f64_00236744: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,3,6,7,4,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8f64_00236744: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,3,0,6,0,7,0,4,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8f64_00236744: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,2,3,6,7,4,4] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_00226644(<8 x double> %a, <8 x double> %b) { -; AVX512F-LABEL: shuffle_v8f64_00226644: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,2,6,6,4,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8f64_00226644: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,2,0,6,0,6,0,4,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8f64_00226644: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,2,2,6,6,4,4] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } @@ -685,33 +518,21 @@ define <8 x double> @shuffle_v8f64_01235466(<8 x double> %a, <8 x double> %b) { } define <8 x double> @shuffle_v8f64_002u6u44(<8 x double> %a, <8 x double> %b) { -; AVX512F-LABEL: shuffle_v8f64_002u6u44: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,u,6,u,4,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8f64_002u6u44: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,u,u,6,0,u,u,4,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8f64_002u6u44: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,2,0,6,0,4,4] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_00uu66uu(<8 x double> %a, <8 x double> %b) { -; AVX512F-LABEL: shuffle_v8f64_00uu66uu: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,u,u,6,6,u,u] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8f64_00uu66uu: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,u,u,u,u,6,0,6,0,u,u,u,u] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8f64_00uu66uu: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,6,6,0,0] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } @@ -753,35 +574,22 @@ define <8 x double> @shuffle_v8f64_uuu3uu66(<8 x double> %a, <8 x double> %b) { } define <8 x double> @shuffle_v8f64_c348cda0(<8 x double> %a, <8 x double> %b) { -; AVX512F-LABEL: shuffle_v8f64_c348cda0: -; AVX512F: # %bb.0: -; AVX512F-NEXT: 
vmovapd {{.*#+}} zmm2 = [4,11,12,0,4,5,2,8] -; AVX512F-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vmovapd %zmm2, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8f64_c348cda0: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [4,0,11,0,12,0,0,0,4,0,5,0,2,0,8,0] -; AVX512F-32-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 -; AVX512F-32-NEXT: vmovapd %zmm2, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8f64_c348cda0: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,11,12,0,4,5,2,8] +; ALL-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 +; ALL-NEXT: vmovapd %zmm2, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_f511235a(<8 x double> %a, <8 x double> %b) { -; AVX512F-LABEL: shuffle_v8f64_f511235a: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [15,5,1,1,2,3,5,10] -; AVX512F-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8f64_f511235a: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [15,0,5,0,1,0,1,0,2,0,3,0,5,0,10,0] -; AVX512F-32-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8f64_f511235a: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [15,5,1,1,2,3,5,10] +; ALL-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } @@ -862,97 +670,61 @@ define <8 x i64> @shuffle_v8i64_66666666(<8 x i64> %a, <8 x i64> %b) { } define <8 x i64> @shuffle_v8i64_00000010(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_00000010: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_00000010: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_00000010: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_00000200(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_00000200: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,2,0,0] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_00000200: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_00000200: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,2,0,0] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_00003000(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_00003000: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,3,0,0,0] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_00003000: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0] -; AVX512F-32-NEXT: vpermpd %zmm0, 
%zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_00003000: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,3,0,0,0] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_00040000(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_00040000: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_00040000: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_00040000: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,0,4] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_00500000(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_00500000: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,5,0] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_00500000: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,5,0,0,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_00500000: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,5,0] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_06000000(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_06000000: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_06000000: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,6,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_06000000: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,6] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } @@ -960,7 +732,7 @@ define <8 x i64> @shuffle_v8i64_06000000(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_70000000: ; ALL: # %bb.0: -; ALL-NEXT: vmovss {{.*#+}} xmm1 = [7,0,0,0] +; ALL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,0] ; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -995,33 +767,21 @@ define <8 x i64> @shuffle_v8i64_01014545_mem(ptr %ptr, <8 x i64> %b) { } define <8 x i64> @shuffle_v8i64_00112233(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_00112233: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_00112233: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,1,0,1,0,2,0,2,0,3,0,3,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_00112233: +; ALL: # %bb.0: +; ALL-NEXT: 
vpmovsxbq {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_00001111(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_00001111: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_00001111: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_00001111: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } @@ -1278,305 +1038,191 @@ define <8 x i64> @shuffle_v8i64_10225466(<8 x i64> %a, <8 x i64> %b) { } define <8 x i64> @shuffle_v8i64_00015444(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_00015444: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,1,5,4,4,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_00015444: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,5,0,4,0,4,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_00015444: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,1,5,4,4,4] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_00204644(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_00204644: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,0,4,6,4,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_00204644: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,0,0,4,0,6,0,4,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_00204644: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,2,0,4,6,4,4] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_03004474(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_03004474: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,3,0,0,4,4,7,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_03004474: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,3,0,0,0,0,0,4,0,4,0,7,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_03004474: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,3,0,0,4,4,7,4] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_10004444(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_10004444: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,0,0,4,4,4,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: 
retq -; -; AVX512F-32-LABEL: shuffle_v8i64_10004444: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,0,0,0,0,0,0,4,0,4,0,4,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_10004444: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [1,0,0,0,4,4,4,4] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_22006446(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_22006446: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [2,2,0,0,6,4,4,6] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_22006446: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [2,0,2,0,0,0,0,0,6,0,4,0,4,0,6,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_22006446: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [2,2,0,0,6,4,4,6] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_33307474(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_33307474: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [3,3,3,0,7,4,7,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_33307474: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [3,0,3,0,3,0,0,0,7,0,4,0,7,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_33307474: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [3,3,3,0,7,4,7,4] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_32104567(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_32104567: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [3,2,1,0,4,5,6,7] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_32104567: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [3,0,2,0,1,0,0,0,4,0,5,0,6,0,7,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_32104567: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [3,2,1,0,4,5,6,7] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_00236744(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_00236744: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,3,6,7,4,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_00236744: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,3,0,6,0,7,0,4,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_00236744: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,2,3,6,7,4,4] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_00226644(<8 x i64> %a, <8 x 
i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_00226644: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,2,6,6,4,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_00226644: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,2,0,6,0,6,0,4,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_00226644: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,2,2,6,6,4,4] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_10324567(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_10324567: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,3,2,4,5,6,7] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_10324567: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,0,0,3,0,2,0,4,0,5,0,6,0,7,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_10324567: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [1,0,3,2,4,5,6,7] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_11334567(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_11334567: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [1,1,3,3,4,5,6,7] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_11334567: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,1,0,3,0,3,0,4,0,5,0,6,0,7,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_11334567: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [1,1,3,3,4,5,6,7] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_01235467(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_01235467: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,1,2,3,5,4,6,7] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_01235467: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,0,2,0,3,0,5,0,4,0,6,0,7,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_01235467: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,5,4,6,7] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_01235466(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_01235466: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,1,2,3,5,4,6,6] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_01235466: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,0,2,0,3,0,5,0,4,0,6,0,6,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_01235466: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,5,4,6,6] +; ALL-NEXT: 
vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_002u6u44(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_002u6u44: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,u,6,u,4,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_002u6u44: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,u,u,6,0,u,u,4,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_002u6u44: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,2,0,6,0,4,4] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_00uu66uu(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_00uu66uu: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,u,u,6,6,u,u] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_00uu66uu: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,u,u,u,u,6,0,6,0,u,u,u,u] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_00uu66uu: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,6,6,0,0] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_103245uu(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_103245uu: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,3,2,4,5,u,u] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_103245uu: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,0,0,3,0,2,0,4,0,5,0,u,u,u,u] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_103245uu: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [1,0,3,2,4,5,0,0] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_1133uu67(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_1133uu67: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [1,1,3,3,u,u,6,7] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_1133uu67: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,1,0,3,0,3,0,u,u,u,u,6,0,7,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_1133uu67: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [1,1,3,3,0,0,6,7] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_0uu354uu(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_0uu354uu: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,u,u,3,5,4,u,u] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_0uu354uu: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,u,u,u,u,3,0,5,0,4,0,u,u,u,u] -; 
AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_0uu354uu: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,3,5,4,0,0] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_uuu3uu66(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_uuu3uu66: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [u,u,u,3,u,u,6,6] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_uuu3uu66: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [u,u,u,u,u,u,3,0,u,u,u,u,6,0,6,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_uuu3uu66: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,3,0,0,6,6] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } @@ -2131,79 +1777,49 @@ define <8 x double> @shuffle_v2f64_v8f64_01010101(<2 x double> %a) { ;FIXME: compressp define <4 x double> @test_v8f64_2346 (<8 x double> %v) { -; AVX512F-LABEL: test_v8f64_2346: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} ymm1 = [2,3,4,6] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: test_v8f64_2346: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} ymm1 = [2,0,3,0,4,0,6,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: test_v8f64_2346: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [2,3,4,6] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; ALL-NEXT: ret{{[l|q]}} %res = shufflevector <8 x double> %v, <8 x double> poison, <4 x i32> ret <4 x double> %res } ;FIXME: compressp define <2 x double> @test_v8f64_34 (<8 x double> %v) { -; AVX512F-LABEL: test_v8f64_34: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} xmm1 = [3,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: test_v8f64_34: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} xmm1 = [3,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512F-32-NEXT: vzeroupper -; AVX512F-32-NEXT: retl +; ALL-LABEL: test_v8f64_34: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [3,4] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; ALL-NEXT: vzeroupper +; ALL-NEXT: ret{{[l|q]}} %res = shufflevector <8 x double> %v, <8 x double> poison, <2 x i32> ret <2 x double> %res } ; FIXME: vpcompress define <4 x i64> @test_v8i64_1257 (<8 x i64> %v) { -; AVX512F-LABEL: test_v8i64_1257: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,5,7] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: test_v8i64_1257: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} ymm1 = [1,0,2,0,5,0,7,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: # kill: def $ymm0 killed $ymm0 killed 
$zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: test_v8i64_1257: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,2,5,7] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; ALL-NEXT: ret{{[l|q]}} %res = shufflevector <8 x i64> %v, <8 x i64> poison, <4 x i32> ret <4 x i64> %res } define <2 x i64> @test_v8i64_2_5 (<8 x i64> %v) { -; AVX512F-LABEL: test_v8i64_2_5: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} xmm1 = [2,5] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: test_v8i64_2_5: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} xmm1 = [2,0,5,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512F-32-NEXT: vzeroupper -; AVX512F-32-NEXT: retl +; ALL-LABEL: test_v8i64_2_5: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [2,5] +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; ALL-NEXT: vzeroupper +; ALL-NEXT: ret{{[l|q]}} %res = shufflevector <8 x i64> %v, <8 x i64> poison, <2 x i32> ret <2 x i64> %res } diff --git a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll index 3fd73319e8577..545a9d3e314a2 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll @@ -38,7 +38,7 @@ define <8 x float> @expand1(<4 x float> %a ) { ; AVX512F-LABEL: expand1: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vmovaps {{.*#+}} ymm1 = [16,0,18,1,20,2,22,3] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm1 = [16,0,18,1,20,2,22,3] ; AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -114,7 +114,7 @@ define <8 x float> @expand5(<4 x float> %a ) { ; AVX512-FAST: # %bb.0: ; AVX512-FAST-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX512-FAST-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX512-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [8,0,10,0,12,0,14,0] +; AVX512-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [8,10,12,14] ; AVX512-FAST-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; AVX512-FAST-NEXT: ret{{[l|q]}} ; @@ -245,7 +245,7 @@ define <16 x float> @expand12(<8 x float> %a) { ; CHECK-LABEL: expand12: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [0,16,2,16,4,16,6,16,0,16,1,16,2,16,3,16] +; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,16,2,16,4,16,6,16,0,16,1,16,2,16,3,16] ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1 ; CHECK-NEXT: vmovaps %zmm1, %zmm0 @@ -278,7 +278,7 @@ define <8 x float> @expand14(<4 x float> %a) { ; AVX512F-LABEL: expand14: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vmovaps {{.*#+}} ymm1 = [16,17,0,19,1,21,22,23] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm1 = [16,17,0,19,1,21,22,23] ; AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -300,8 +300,7 @@ define <8 x float> @expand15(<4 x float> %a) { ; AVX512-FAST-LABEL: expand15: ; AVX512-FAST: # %bb.0: ; AVX512-FAST-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [1,0,0,0,1,0,0,0] -; AVX512-FAST-NEXT: # ymm1 = 
mem[0,1,0,1] +; AVX512-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,1,0] ; AVX512-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512-FAST-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6,7] ; AVX512-FAST-NEXT: ret{{[l|q]}} @@ -481,13 +480,13 @@ define <8 x float> @test_masked_permps_v8f32(ptr %vp, <8 x float> %vec2) { ; X86-AVX512-LABEL: test_masked_permps_v8f32: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: vmovaps {{.*#+}} ymm1 = [15,14,11,3,15,14,6,7] +; X86-AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [15,14,11,3,15,14,6,7] ; X86-AVX512-NEXT: vpermt2ps (%eax), %ymm1, %ymm0 ; X86-AVX512-NEXT: retl ; ; X64-AVX512-LABEL: test_masked_permps_v8f32: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovaps {{.*#+}} ymm1 = [15,14,11,3,15,14,6,7] +; X64-AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [15,14,11,3,15,14,6,7] ; X64-AVX512-NEXT: vpermt2ps (%rdi), %ymm1, %ymm0 ; X64-AVX512-NEXT: retq ; @@ -496,7 +495,7 @@ define <8 x float> @test_masked_permps_v8f32(ptr %vp, <8 x float> %vec2) { ; X86-AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512F-NEXT: vmovaps (%eax), %ymm1 -; X86-AVX512F-NEXT: vmovaps {{.*#+}} ymm2 = [23,22,19,3,23,22,6,7] +; X86-AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm2 = [23,22,19,3,23,22,6,7] ; X86-AVX512F-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 ; X86-AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; X86-AVX512F-NEXT: retl @@ -505,7 +504,7 @@ define <8 x float> @test_masked_permps_v8f32(ptr %vp, <8 x float> %vec2) { ; X64-AVX512F: # %bb.0: ; X64-AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; X64-AVX512F-NEXT: vmovaps (%rdi), %ymm1 -; X64-AVX512F-NEXT: vmovaps {{.*#+}} ymm2 = [23,22,19,3,23,22,6,7] +; X64-AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm2 = [23,22,19,3,23,22,6,7] ; X64-AVX512F-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 ; X64-AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; X64-AVX512F-NEXT: retq @@ -519,26 +518,26 @@ define <16 x float> @test_masked_permps_v16f32(ptr %vp, <16 x float> %vec2) { ; X86-AVX512-LABEL: test_masked_permps_v16f32: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [31,29,27,3,30,28,6,7,23,22,19,11,23,13,19,15] +; X86-AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [31,29,27,3,30,28,6,7,23,22,19,11,23,13,19,15] ; X86-AVX512-NEXT: vpermt2ps (%eax), %zmm1, %zmm0 ; X86-AVX512-NEXT: retl ; ; X64-AVX512-LABEL: test_masked_permps_v16f32: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [31,29,27,3,30,28,6,7,23,22,19,11,23,13,19,15] +; X64-AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [31,29,27,3,30,28,6,7,23,22,19,11,23,13,19,15] ; X64-AVX512-NEXT: vpermt2ps (%rdi), %zmm1, %zmm0 ; X64-AVX512-NEXT: retq ; ; X86-AVX512F-LABEL: test_masked_permps_v16f32: ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [31,29,27,3,30,28,6,7,23,22,19,11,23,13,19,15] +; X86-AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm1 = [31,29,27,3,30,28,6,7,23,22,19,11,23,13,19,15] ; X86-AVX512F-NEXT: vpermt2ps (%eax), %zmm1, %zmm0 ; X86-AVX512F-NEXT: retl ; ; X64-AVX512F-LABEL: test_masked_permps_v16f32: ; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [31,29,27,3,30,28,6,7,23,22,19,11,23,13,19,15] +; X64-AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm1 = [31,29,27,3,30,28,6,7,23,22,19,11,23,13,19,15] ; X64-AVX512F-NEXT: vpermt2ps (%rdi), %zmm1, %zmm0 ; X64-AVX512F-NEXT: retq %vec = load <16 x float>, ptr %vp diff --git 
a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll index 05071064fc60e..79602a18693db 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll @@ -140,7 +140,7 @@ define <8 x float> @combine_vpermilvar_vperm2f128_zero_8f32(<8 x float> %a0) { ; AVX512-LABEL: combine_vpermilvar_vperm2f128_zero_8f32: ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512-NEXT: vmovaps {{.*#+}} ymm1 = [16,17,18,19,3,2,1,0] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [16,17,18,19,3,2,1,0] ; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 ; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] @@ -405,7 +405,7 @@ define void @PR39483() { ; X86-AVX512: # %bb.0: # %entry ; X86-AVX512-NEXT: vmovups 0, %zmm0 ; X86-AVX512-NEXT: vmovups 64, %ymm1 -; X86-AVX512-NEXT: vmovaps {{.*#+}} ymm2 = [2,5,8,11,14,17,20,23] +; X86-AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [2,5,8,11,14,17,20,23] ; X86-AVX512-NEXT: vpermi2ps %zmm1, %zmm0, %zmm2 ; X86-AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X86-AVX512-NEXT: vmulps %ymm0, %ymm2, %ymm1 @@ -446,7 +446,7 @@ define void @PR39483() { ; X64-AVX512: # %bb.0: # %entry ; X64-AVX512-NEXT: vmovups 0, %zmm0 ; X64-AVX512-NEXT: vmovups 64, %ymm1 -; X64-AVX512-NEXT: vmovaps {{.*#+}} ymm2 = [2,5,8,11,14,17,20,23] +; X64-AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [2,5,8,11,14,17,20,23] ; X64-AVX512-NEXT: vpermi2ps %zmm1, %zmm0, %zmm2 ; X64-AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmulps %ymm0, %ymm2, %ymm1 @@ -521,19 +521,18 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4 -; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm3 = [1,0,2,0,8,0,9,0] +; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,2,8,9] ; X86-AVX512-NEXT: vpermi2pd %zmm2, %zmm1, %zmm3 -; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm5 = [0,0,10,0,2,0,9,0] +; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,10,2,9] ; X86-AVX512-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1] ; X86-AVX512-NEXT: vpermt2pd %zmm4, %zmm5, %zmm6 ; X86-AVX512-NEXT: vmovapd %ymm6, (%edx) -; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm4 = [0,0,3,0,10,0,1,0] +; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,3,10,1] ; X86-AVX512-NEXT: vpermi2pd %zmm0, %zmm3, %zmm4 ; X86-AVX512-NEXT: vmovapd %ymm4, (%ecx) -; X86-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [3,0,11,0,3,0,11,0] -; X86-AVX512-NEXT: # ymm3 = mem[0,1,0,1] +; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [3,11,0,0] ; X86-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm3 -; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm0 = [2,0,8,0,9,0,3,0] +; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [2,8,9,3] ; X86-AVX512-NEXT: vpermi2pd %zmm3, %zmm2, %zmm0 ; X86-AVX512-NEXT: vmovapd %ymm0, (%eax) ; X86-AVX512-NEXT: vzeroupper @@ -589,18 +588,18 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n ; X64-AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; X64-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 -; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm4 = [1,2,8,9] +; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,2,8,9] ; X64-AVX512-NEXT: vpermi2pd %zmm2, %zmm1, %zmm4 -; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm5 = [0,10,2,9] +; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,10,2,9] ; 
X64-AVX512-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1] ; X64-AVX512-NEXT: vpermt2pd %zmm3, %zmm5, %zmm6 ; X64-AVX512-NEXT: vmovapd %ymm6, (%rdi) -; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm3 = [0,3,10,1] +; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,3,10,1] ; X64-AVX512-NEXT: vpermi2pd %zmm0, %zmm4, %zmm3 ; X64-AVX512-NEXT: vmovapd %ymm3, (%rsi) -; X64-AVX512-NEXT: vmovapd {{.*#+}} xmm3 = [3,11] +; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,11] ; X64-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm3 -; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm0 = [2,8,9,3] +; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [2,8,9,3] ; X64-AVX512-NEXT: vpermi2pd %zmm3, %zmm2, %zmm0 ; X64-AVX512-NEXT: vmovapd %ymm0, (%rdx) ; X64-AVX512-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll index 5488683fa860c..9e82c84fe5520 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll @@ -114,11 +114,17 @@ define <4 x i64> @combine_permq_pshufb_as_vmovdqa(<4 x i64> %a0) { } define <8 x i32> @combine_as_vpermd(<8 x i32> %a0) { -; CHECK-LABEL: combine_as_vpermd: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [4,5,4,5,6,7,0,7] -; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: ret{{[l|q]}} +; AVX2-LABEL: combine_as_vpermd: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [4,5,4,5,6,7,0,7] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: combine_as_vpermd: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,5,4,5,6,7,0,7] +; AVX512-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: ret{{[l|q]}} %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> %2 = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> ) %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <8 x i32> @@ -126,11 +132,17 @@ define <8 x i32> @combine_as_vpermd(<8 x i32> %a0) { } define <8 x float> @combine_as_vpermps(<8 x float> %a0) { -; CHECK-LABEL: combine_as_vpermps: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [6,4,7,5,1,u,4,7] -; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: ret{{[l|q]}} +; AVX2-LABEL: combine_as_vpermps: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [6,4,7,5,1,u,4,7] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: combine_as_vpermps: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,4,7,5,1,0,4,7] +; AVX512-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: ret{{[l|q]}} %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> %2 = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> ) %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> @@ -764,10 +776,15 @@ define <32 x i8> @combine_pshufb_pshufb_or_pshufb(<32 x i8> %a0) { } define <8 x i32> @constant_fold_permd() { -; CHECK-LABEL: constant_fold_permd: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1] -; CHECK-NEXT: ret{{[l|q]}} +; AVX2-LABEL: constant_fold_permd: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1] +; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: constant_fold_permd: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1] +; AVX512-NEXT: ret{{[l|q]}} %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> , <8 x i32> ) ret <8 x i32> %1 } @@ -843,7 +860,7 @@ define internal fastcc <8 x float> @PR34577(<8 x float> %inp0, <8 x float> %inp1 ; 
AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1] ; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] -; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [23,18,7,2,20,u,3,2] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [23,18,7,2,20,0,3,2] ; AVX512-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512-NEXT: ret{{[l|q]}} diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll index 29806cd25fe3f..d64d2d8a638c6 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll @@ -33,42 +33,42 @@ define <8 x double> @combine_permvar_8f64_identity(<8 x double> %x0, <8 x double define <8 x double> @combine_permvar_8f64_identity_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) { ; X86-AVX512F-LABEL: combine_permvar_8f64_identity_mask: ; X86-AVX512F: # %bb.0: -; X86-AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] +; X86-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0] ; X86-AVX512F-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-AVX512F-NEXT: kmovw %eax, %k1 ; X86-AVX512F-NEXT: vpermpd %zmm0, %zmm2, %zmm1 {%k1} -; X86-AVX512F-NEXT: vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0] +; X86-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] ; X86-AVX512F-NEXT: vpermpd %zmm1, %zmm0, %zmm1 {%k1} ; X86-AVX512F-NEXT: vmovapd %zmm1, %zmm0 ; X86-AVX512F-NEXT: retl ; ; X86-AVX512BW-LABEL: combine_permvar_8f64_identity_mask: ; X86-AVX512BW: # %bb.0: -; X86-AVX512BW-NEXT: vmovapd {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] +; X86-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0] ; X86-AVX512BW-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-AVX512BW-NEXT: kmovd %eax, %k1 ; X86-AVX512BW-NEXT: vpermpd %zmm0, %zmm2, %zmm1 {%k1} -; X86-AVX512BW-NEXT: vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0] +; X86-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] ; X86-AVX512BW-NEXT: vpermpd %zmm1, %zmm0, %zmm1 {%k1} ; X86-AVX512BW-NEXT: vmovapd %zmm1, %zmm0 ; X86-AVX512BW-NEXT: retl ; ; X64-AVX512F-LABEL: combine_permvar_8f64_identity_mask: ; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0] +; X64-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0] ; X64-AVX512F-NEXT: kmovw %edi, %k1 ; X64-AVX512F-NEXT: vpermpd %zmm0, %zmm2, %zmm1 {%k1} -; X64-AVX512F-NEXT: vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] +; X64-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] ; X64-AVX512F-NEXT: vpermpd %zmm1, %zmm0, %zmm1 {%k1} ; X64-AVX512F-NEXT: vmovapd %zmm1, %zmm0 ; X64-AVX512F-NEXT: retq ; ; X64-AVX512BW-LABEL: combine_permvar_8f64_identity_mask: ; X64-AVX512BW: # %bb.0: -; X64-AVX512BW-NEXT: vmovapd {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0] +; X64-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0] ; X64-AVX512BW-NEXT: kmovd %edi, %k1 ; X64-AVX512BW-NEXT: vpermpd %zmm0, %zmm2, %zmm1 {%k1} -; X64-AVX512BW-NEXT: vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] +; X64-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] ; X64-AVX512BW-NEXT: vpermpd %zmm1, %zmm0, %zmm1 {%k1} ; X64-AVX512BW-NEXT: vmovapd %zmm1, %zmm0 ; X64-AVX512BW-NEXT: retq @@ -151,7 +151,7 @@ define <8 x double> @combine_vpermt2var_8f64_identity(<8 x double> %x0, <8 x dou define <8 x double> @combine_vpermt2var_8f64_identity_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) { ; 
X86-AVX512F-LABEL: combine_vpermt2var_8f64_identity_mask: ; X86-AVX512F: # %bb.0: -; X86-AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] +; X86-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] ; X86-AVX512F-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-AVX512F-NEXT: kmovw %eax, %k1 ; X86-AVX512F-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z} @@ -160,7 +160,7 @@ define <8 x double> @combine_vpermt2var_8f64_identity_mask(<8 x double> %x0, <8 ; ; X86-AVX512BW-LABEL: combine_vpermt2var_8f64_identity_mask: ; X86-AVX512BW: # %bb.0: -; X86-AVX512BW-NEXT: vmovapd {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] +; X86-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] ; X86-AVX512BW-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-AVX512BW-NEXT: kmovd %eax, %k1 ; X86-AVX512BW-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z} @@ -169,7 +169,7 @@ define <8 x double> @combine_vpermt2var_8f64_identity_mask(<8 x double> %x0, <8 ; ; X64-AVX512F-LABEL: combine_vpermt2var_8f64_identity_mask: ; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; X64-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] ; X64-AVX512F-NEXT: kmovw %edi, %k1 ; X64-AVX512F-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z} ; X64-AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z} @@ -177,7 +177,7 @@ define <8 x double> @combine_vpermt2var_8f64_identity_mask(<8 x double> %x0, <8 ; ; X64-AVX512BW-LABEL: combine_vpermt2var_8f64_identity_mask: ; X64-AVX512BW: # %bb.0: -; X64-AVX512BW-NEXT: vmovapd {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; X64-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] ; X64-AVX512BW-NEXT: kmovd %edi, %k1 ; X64-AVX512BW-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z} ; X64-AVX512BW-NEXT: vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z} @@ -298,7 +298,7 @@ define <16 x float> @combine_vpermt2var_16f32_identity(<16 x float> %x0, <16 x f define <16 x float> @combine_vpermt2var_16f32_identity_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) { ; X86-LABEL: combine_vpermt2var_16f32_identity_mask: ; X86: # %bb.0: -; X86-NEXT: vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; X86-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z} ; X86-NEXT: vpermps %zmm0, %zmm1, %zmm0 {%k1} {z} @@ -306,7 +306,7 @@ define <16 x float> @combine_vpermt2var_16f32_identity_mask(<16 x float> %x0, <1 ; ; X64-AVX512F-LABEL: combine_vpermt2var_16f32_identity_mask: ; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; X64-AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X64-AVX512F-NEXT: kmovw %edi, %k1 ; X64-AVX512F-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z} ; X64-AVX512F-NEXT: vpermps %zmm0, %zmm1, %zmm0 {%k1} {z} @@ -314,7 +314,7 @@ define <16 x float> @combine_vpermt2var_16f32_identity_mask(<16 x float> %x0, <1 ; ; X64-AVX512BW-LABEL: combine_vpermt2var_16f32_identity_mask: ; X64-AVX512BW: # %bb.0: -; X64-AVX512BW-NEXT: vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; X64-AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X64-AVX512BW-NEXT: kmovd %edi, %k1 ; X64-AVX512BW-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z} ; X64-AVX512BW-NEXT: vpermps %zmm0, %zmm1, %zmm0 {%k1} {z} @@ -790,34 +790,22 @@ define <16 x i32> @vpermt2var_vpermi2var_16i32_as_unpckldq(<16 x i32> %a0, <16 x } define <8 x double> 
@combine_vpermi2var_8f64_as_vpermpd(<8 x double> %x0, <8 x double> %x1) { -; X86-LABEL: combine_vpermi2var_8f64_as_vpermpd: -; X86: # %bb.0: -; X86-NEXT: vmovaps {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] -; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; X86-NEXT: retl -; -; X64-LABEL: combine_vpermi2var_8f64_as_vpermpd: -; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] -; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; X64-NEXT: retq +; CHECK-LABEL: combine_vpermi2var_8f64_as_vpermpd: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} %res0 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> , <8 x double> %x1, i8 -1) %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %res0, <8 x i64> , <8 x double> %res0, i8 -1) ret <8 x double> %res1 } define <8 x i64> @combine_vpermt2var_8i64_as_vpermq(<8 x i64> %x0, <8 x i64> %x1) { -; X86-LABEL: combine_vpermt2var_8i64_as_vpermq: -; X86: # %bb.0: -; X86-NEXT: vmovaps {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] -; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; X86-NEXT: retl -; -; X64-LABEL: combine_vpermt2var_8i64_as_vpermq: -; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] -; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; X64-NEXT: retq +; CHECK-LABEL: combine_vpermt2var_8i64_as_vpermq: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> , <8 x i64> %x0, <8 x i64> %x1, i8 -1) %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> , <8 x i64> %res0, <8 x i64> %res0, i8 -1) ret <8 x i64> %res1 @@ -826,7 +814,7 @@ define <8 x i64> @combine_vpermt2var_8i64_as_vpermq(<8 x i64> %x0, <8 x i64> %x1 define <16 x float> @combine_vpermi2var_16f32_as_vpermps(<16 x float> %x0, <16 x float> %x1) { ; CHECK-LABEL: combine_vpermi2var_16f32_as_vpermps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9] +; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> , <16 x float> %x1, i16 -1) @@ -837,7 +825,7 @@ define <16 x float> @combine_vpermi2var_16f32_as_vpermps(<16 x float> %x0, <16 x define <16 x i32> @combine_vpermt2var_16i32_as_vpermd(<16 x i32> %x0, <16 x i32> %x1) { ; CHECK-LABEL: combine_vpermt2var_16i32_as_vpermd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9] +; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> , <16 x i32> %x0, <16 x i32> %x1, i16 -1) @@ -864,19 +852,12 @@ define <16 x i32> @combine_vpermt2var_16i32_as_vpsllq(<16 x i32> %x0) { } define <8 x double> @combine_vpermi2var_vpermt2var_8f64_as_vperm2(<8 x double> %x0, <8 x double> %x1) { -; X86-LABEL: combine_vpermi2var_vpermt2var_8f64_as_vperm2: -; X86: # %bb.0: -; X86-NEXT: vmovapd {{.*#+}} zmm2 = [4,0,14,0,3,0,12,0,7,0,8,0,0,0,15,0] -; X86-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 -; X86-NEXT: vmovapd %zmm2, %zmm0 -; X86-NEXT: retl -; -; X64-LABEL: 
combine_vpermi2var_vpermt2var_8f64_as_vperm2: -; X64: # %bb.0: -; X64-NEXT: vmovapd {{.*#+}} zmm2 = [4,14,3,12,7,8,0,15] -; X64-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 -; X64-NEXT: vmovapd %zmm2, %zmm0 -; X64-NEXT: retq +; CHECK-LABEL: combine_vpermi2var_vpermt2var_8f64_as_vperm2: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,14,3,12,7,8,0,15] +; CHECK-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 +; CHECK-NEXT: vmovapd %zmm2, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} %res0 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> , <8 x double> %x1, i8 -1) %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> , <8 x double> %res0, <8 x double> %res0, i8 -1) ret <8 x double> %res1 @@ -915,19 +896,12 @@ define <16 x i32> @combine_vpermi2var_vpermt2var_16i32_as_vpermd(<16 x i32> %x0, } define <8 x double> @combine_vpermi2var_vpermvar_8f64_as_vperm2_zero(<8 x double> %x0) { -; X86-LABEL: combine_vpermi2var_vpermvar_8f64_as_vperm2_zero: -; X86: # %bb.0: -; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; X86-NEXT: vmovapd {{.*#+}} zmm2 = [8,0,3,0,10,0,11,0,1,0,7,0,14,0,5,0] -; X86-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 -; X86-NEXT: retl -; -; X64-LABEL: combine_vpermi2var_vpermvar_8f64_as_vperm2_zero: -; X64: # %bb.0: -; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; X64-NEXT: vmovapd {{.*#+}} zmm2 = [8,3,10,11,1,7,14,5] -; X64-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 -; X64-NEXT: retq +; CHECK-LABEL: combine_vpermi2var_vpermvar_8f64_as_vperm2_zero: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = [8,3,10,11,1,7,14,5] +; CHECK-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} %res0 = shufflevector <8 x double> %x0, <8 x double> zeroinitializer, <8 x i32> %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %res0, <8 x i64> ) ret <8 x double> %1 @@ -937,7 +911,7 @@ define <16 x float> @combine_vpermi2var_vpermvar_16f32_as_vperm2_zero(<16 x floa ; CHECK-LABEL: combine_vpermi2var_vpermvar_16f32_as_vperm2_zero: ; CHECK: # %bb.0: ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [0,13,1,12,4,9,22,12,4,25,26,9,5,29,30,8] +; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,1,12,4,9,22,12,4,25,26,9,5,29,30,8] ; CHECK-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %res0 = shufflevector <16 x float> %x0, <16 x float> zeroinitializer, <16 x i32> diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll index 882b816370478..ad73bb6886b9f 100644 --- a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll @@ -1765,29 +1765,44 @@ define <2 x i64> @foldv2i64() nounwind { ; SSE-NEXT: movss {{.*#+}} xmm0 = [8,0,0,0] ; SSE-NEXT: retq ; -; AVX-LABEL: foldv2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] -; AVX-NEXT: retq +; AVX1-LABEL: foldv2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: foldv2i64: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,0] +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: foldv2i64: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,0] +; AVX512CD-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv2i64: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = 
[8,0] ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: foldv2i64: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,0] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv2i64: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,0] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv2i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,0] ; BITALG-NEXT: retq ; ; X86-SSE-LABEL: foldv2i64: @@ -1804,29 +1819,44 @@ define <2 x i64> @foldv2i64u() nounwind { ; SSE-NEXT: movss {{.*#+}} xmm0 = [8,0,0,0] ; SSE-NEXT: retq ; -; AVX-LABEL: foldv2i64u: -; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] -; AVX-NEXT: retq +; AVX1-LABEL: foldv2i64u: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv2i64u: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: foldv2i64u: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,0] +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: foldv2i64u: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,0] +; AVX512CD-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv2i64u: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,0] ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: foldv2i64u: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,0] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv2i64u: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,0] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv2i64u: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,0] ; BITALG-NEXT: retq ; ; X86-SSE-LABEL: foldv2i64u: @@ -1843,29 +1873,44 @@ define <4 x i32> @foldv4i32() nounwind { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0] ; SSE-NEXT: retq ; -; AVX-LABEL: foldv4i32: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] -; AVX-NEXT: retq +; AVX1-LABEL: foldv4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: foldv4i32: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,32] +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: foldv4i32: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,32] +; AVX512CD-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv4i32: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,32] ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: foldv4i32: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,32] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv4i32: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,32] ; 
BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv4i32: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,32] ; BITALG-NEXT: retq ; ; X86-SSE-LABEL: foldv4i32: @@ -1882,29 +1927,44 @@ define <4 x i32> @foldv4i32u() nounwind { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0] ; SSE-NEXT: retq ; -; AVX-LABEL: foldv4i32u: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] -; AVX-NEXT: retq +; AVX1-LABEL: foldv4i32u: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv4i32u: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: foldv4i32u: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,32] +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: foldv4i32u: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,32] +; AVX512CD-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv4i32u: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,32] ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: foldv4i32u: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,32] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv4i32u: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,32] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv4i32u: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,32] ; BITALG-NEXT: retq ; ; X86-SSE-LABEL: foldv4i32u: diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll index 5bcdf0e22a5ae..3c35f7b7fb751 100644 --- a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll @@ -1140,19 +1140,44 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { } define <4 x i64> @foldv4i64() nounwind { -; AVX-LABEL: foldv4i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] -; AVX-NEXT: retq +; AVX1-LABEL: foldv4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: foldv4i64: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vpmovsxbq {{.*#+}} ymm0 = [8,0,64,0] +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: foldv4i64: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vpmovsxbq {{.*#+}} ymm0 = [8,0,64,0] +; AVX512CD-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: foldv4i64: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} ymm0 = [8,0,64,0] +; AVX512VPOPCNTDQ-NEXT: retq +; +; AVX512VPOPCNTDQVL-LABEL: foldv4i64: +; AVX512VPOPCNTDQVL: # %bb.0: +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm0 = [8,0,64,0] +; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv4i64: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} ymm0 = [8,0,64,0] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv4i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm0 = [8,0,64,0] ; BITALG-NEXT: retq ; ; X86-AVX-LABEL: foldv4i64: @@ -1164,19 +1189,44 @@ define <4 x i64> @foldv4i64() 
nounwind { } define <4 x i64> @foldv4i64u() nounwind { -; AVX-LABEL: foldv4i64u: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] -; AVX-NEXT: retq +; AVX1-LABEL: foldv4i64u: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv4i64u: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: foldv4i64u: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vpmovsxbq {{.*#+}} ymm0 = [8,0,64,0] +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: foldv4i64u: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vpmovsxbq {{.*#+}} ymm0 = [8,0,64,0] +; AVX512CD-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: foldv4i64u: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} ymm0 = [8,0,64,0] +; AVX512VPOPCNTDQ-NEXT: retq +; +; AVX512VPOPCNTDQVL-LABEL: foldv4i64u: +; AVX512VPOPCNTDQVL: # %bb.0: +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm0 = [8,0,64,0] +; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv4i64u: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} ymm0 = [8,0,64,0] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv4i64u: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm0 = [8,0,64,0] ; BITALG-NEXT: retq ; ; X86-AVX-LABEL: foldv4i64u: @@ -1188,19 +1238,99 @@ define <4 x i64> @foldv4i64u() nounwind { } define <8 x i32> @foldv8i32() nounwind { -; ALL-LABEL: foldv8i32: -; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] -; ALL-NEXT: ret{{[l|q]}} +; AVX1-LABEL: foldv8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: foldv8i32: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vpmovsxbd {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: foldv8i32: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vpmovsxbd {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; AVX512CD-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: foldv8i32: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vpmovsxbd {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; AVX512VPOPCNTDQ-NEXT: retq +; +; AVX512VPOPCNTDQVL-LABEL: foldv8i32: +; AVX512VPOPCNTDQVL: # %bb.0: +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbd {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; AVX512VPOPCNTDQVL-NEXT: retq +; +; BITALG_NOVLX-LABEL: foldv8i32: +; BITALG_NOVLX: # %bb.0: +; BITALG_NOVLX-NEXT: vpmovsxbd {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; BITALG_NOVLX-NEXT: retq +; +; BITALG-LABEL: foldv8i32: +; BITALG: # %bb.0: +; BITALG-NEXT: vpmovsxbd {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; BITALG-NEXT: retq +; +; X86-AVX-LABEL: foldv8i32: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; X86-AVX-NEXT: retl %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> , i1 0) ret <8 x i32> %out } define <8 x i32> @foldv8i32u() nounwind { -; ALL-LABEL: foldv8i32u: -; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] -; ALL-NEXT: ret{{[l|q]}} +; AVX1-LABEL: foldv8i32u: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv8i32u: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: foldv8i32u: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vpmovsxbd {{.*#+}} ymm0 = 
[8,0,32,0,16,0,3,3] +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: foldv8i32u: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vpmovsxbd {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; AVX512CD-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: foldv8i32u: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vpmovsxbd {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; AVX512VPOPCNTDQ-NEXT: retq +; +; AVX512VPOPCNTDQVL-LABEL: foldv8i32u: +; AVX512VPOPCNTDQVL: # %bb.0: +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbd {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; AVX512VPOPCNTDQVL-NEXT: retq +; +; BITALG_NOVLX-LABEL: foldv8i32u: +; BITALG_NOVLX: # %bb.0: +; BITALG_NOVLX-NEXT: vpmovsxbd {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; BITALG_NOVLX-NEXT: retq +; +; BITALG-LABEL: foldv8i32u: +; BITALG: # %bb.0: +; BITALG-NEXT: vpmovsxbd {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; BITALG-NEXT: retq +; +; X86-AVX-LABEL: foldv8i32u: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; X86-AVX-NEXT: retl %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> , i1 -1) ret <8 x i32> %out } @@ -1245,3 +1375,5 @@ declare <4 x i64> @llvm.cttz.v4i64(<4 x i64>, i1) declare <8 x i32> @llvm.cttz.v8i32(<8 x i32>, i1) declare <16 x i16> @llvm.cttz.v16i16(<16 x i16>, i1) declare <32 x i8> @llvm.cttz.v32i8(<32 x i8>, i1) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX: {{.*}} diff --git a/llvm/test/CodeGen/X86/vselect-avx.ll b/llvm/test/CodeGen/X86/vselect-avx.ll index 06474559644b0..d07b7b574eba7 100644 --- a/llvm/test/CodeGen/X86/vselect-avx.ll +++ b/llvm/test/CodeGen/X86/vselect-avx.ll @@ -373,7 +373,7 @@ define void @vselect_concat_splat() { ; AVX512: ## %bb.0: ## %entry ; AVX512-NEXT: vmovups (%rax), %ymm0 ; AVX512-NEXT: vmovups (%rax), %xmm1 -; AVX512-NEXT: vmovaps {{.*#+}} ymm2 = [0,3,6,9,1,4,7,10] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,1,4,7,10] ; AVX512-NEXT: vmovaps %ymm2, %ymm3 ; AVX512-NEXT: vpermi2ps %ymm1, %ymm0, %ymm3 ; AVX512-NEXT: vmovups 32, %xmm4 diff --git a/llvm/test/CodeGen/X86/widen_fadd.ll b/llvm/test/CodeGen/X86/widen_fadd.ll index be249ddc0cca3..e2c36393da2f6 100644 --- a/llvm/test/CodeGen/X86/widen_fadd.ll +++ b/llvm/test/CodeGen/X86/widen_fadd.ll @@ -303,7 +303,7 @@ define void @widen_fadd_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) { ; AVX512VL-NEXT: vpermi2pd %zmm6, %zmm4, %zmm5 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm3, %zmm2, %zmm2 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,4,6] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,6] ; AVX512VL-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1 ; AVX512VL-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm0 ; AVX512VL-NEXT: vmovupd %zmm0, (%rdx) diff --git a/llvm/test/CodeGen/X86/widen_fdiv.ll b/llvm/test/CodeGen/X86/widen_fdiv.ll index e4c9278478a5b..4e5695500fbff 100644 --- a/llvm/test/CodeGen/X86/widen_fdiv.ll +++ b/llvm/test/CodeGen/X86/widen_fdiv.ll @@ -251,7 +251,7 @@ define void @widen_fdiv_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) { ; AVX512VL-NEXT: vpermi2pd %zmm6, %zmm4, %zmm5 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm3, %zmm2, %zmm2 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,4,6] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,6] ; AVX512VL-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1 ; AVX512VL-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm0 ; AVX512VL-NEXT: vmovupd %zmm0, (%rdx) diff --git a/llvm/test/CodeGen/X86/widen_fmul.ll b/llvm/test/CodeGen/X86/widen_fmul.ll index 9aa9d63e7fcbd..fc099e7c68969 100644 --- 
a/llvm/test/CodeGen/X86/widen_fmul.ll +++ b/llvm/test/CodeGen/X86/widen_fmul.ll @@ -303,7 +303,7 @@ define void @widen_fmul_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) { ; AVX512VL-NEXT: vpermi2pd %zmm6, %zmm4, %zmm5 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm3, %zmm2, %zmm2 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,4,6] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,6] ; AVX512VL-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1 ; AVX512VL-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm0 ; AVX512VL-NEXT: vmovupd %zmm0, (%rdx) diff --git a/llvm/test/CodeGen/X86/widen_fsub.ll b/llvm/test/CodeGen/X86/widen_fsub.ll index 60e54ab71abcf..3256d5c6f5e3f 100644 --- a/llvm/test/CodeGen/X86/widen_fsub.ll +++ b/llvm/test/CodeGen/X86/widen_fsub.ll @@ -303,7 +303,7 @@ define void @widen_fsub_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) { ; AVX512VL-NEXT: vpermi2pd %zmm6, %zmm4, %zmm5 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm3, %zmm2, %zmm2 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,4,6] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,6] ; AVX512VL-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1 ; AVX512VL-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm0 ; AVX512VL-NEXT: vmovupd %zmm0, (%rdx) diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll index cf2b1117581dd..3d49edbb7bd8d 100644 --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -1652,7 +1652,7 @@ define void @splat2_v4f64_load_store(ptr %s, ptr %d) nounwind { ; AVX512-LABEL: splat2_v4f64_load_store: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovups (%rdi), %ymm0 -; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] ; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovups %zmm0, (%rsi) ; AVX512-NEXT: vzeroupper @@ -1689,7 +1689,7 @@ define void @splat2_v4i64_load_store(ptr %s, ptr %d) nounwind { ; AVX512-LABEL: splat2_v4i64_load_store: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovups (%rdi), %ymm0 -; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] ; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovups %zmm0, (%rsi) ; AVX512-NEXT: vzeroupper From b91d5af1ac3ad2c18b1dfde2061a6ac1d638e6e4 Mon Sep 17 00:00:00 2001 From: Twice Date: Mon, 13 Jan 2025 00:02:41 +0800 Subject: [PATCH 204/408] [MLIR][Vector] Allow any strided memref for one-element vector.load in lowering vector.gather (#122437) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In `Gather1DToConditionalLoads`, we currently check whether the stride of the most minor dimension of the input memref is 1; if it is not, the rewrite pattern is not applied. However, according to the verification of `vector.load` here: https://github.com/llvm/llvm-project/blob/4e32271e8b304eb018c69f74c16edd1668fcdaf3/mlir/lib/Dialect/Vector/IR/VectorOps.cpp#L4971-L4975 if the output vector type of `vector.load` contains only one element, the stride requirement on the input memref can be ignored, i.e. the input memref may have any strided layout attribute in that case. We can therefore allow more cases in lowering `vector.gather` by relaxing this check.
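For illustration, here is a minimal sketch of the kind of IR this relaxation admits; it mirrors the test case added in this patch, and only the function name is illustrative:

```mlir
// A gather that reads a single-element vector from a memref whose most
// minor dimension has a non-unit stride (2). Previously the pattern bailed
// out on the stride check; with this change it lowers to a conditional
// one-element vector.load.
func.func @strided_gather_one_element(%base: memref<4xf32, strided<[2]>>,
    %v: vector<1xindex>, %mask: vector<1xi1>, %pass_thru: vector<1xf32>) -> vector<1xf32> {
  %c0 = arith.constant 0 : index
  %0 = vector.gather %base[%c0][%v], %mask, %pass_thru
    : memref<4xf32, strided<[2]>>, vector<1xindex>, vector<1xi1>, vector<1xf32> into vector<1xf32>
  return %0 : vector<1xf32>
}
```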
As shown in the test case attached in this patch [here](https://github.com/llvm/llvm-project/blob/1933fbad58302814ccce5991a9320c0967f3571b/mlir/test/Dialect/Vector/vector-gather-lowering.mlir#L151), a `vector.gather` on a memref with a non-trivial stride can now be lowered successfully when the result vector contains only one element. --------- Signed-off-by: PragmaTwice Co-authored-by: Andrzej Warzyński --- .../Vector/Transforms/LowerVectorGather.cpp | 4 ++- .../Vector/vector-gather-lowering.mlir | 28 +++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorGather.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorGather.cpp index f1a5aa7664d2f..3b38505becd18 100644 --- a/mlir/lib/Dialect/Vector/Transforms/LowerVectorGather.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorGather.cpp @@ -206,10 +206,12 @@ struct Gather1DToConditionalLoads : OpRewritePattern<vector::GatherOp> { Value base = op.getBase(); // vector.load requires the most minor memref dim to have unit stride + // (unless reading exactly 1 element) if (auto memType = dyn_cast<MemRefType>(base.getType())) { if (auto stridesAttr = dyn_cast_if_present<StridedLayoutAttr>(memType.getLayout())) { - if (stridesAttr.getStrides().back() != 1) + if (stridesAttr.getStrides().back() != 1 && + resultTy.getNumElements() != 1) return failure(); } } diff --git a/mlir/test/Dialect/Vector/vector-gather-lowering.mlir b/mlir/test/Dialect/Vector/vector-gather-lowering.mlir index 5ad3a23e0ba15..20e9400ed698d 100644 --- a/mlir/test/Dialect/Vector/vector-gather-lowering.mlir +++ b/mlir/test/Dialect/Vector/vector-gather-lowering.mlir @@ -136,6 +136,34 @@ func.func @gather_tensor_1d(%base: tensor<?xf32>, %v: vector<2xindex>, %mask: ve return %0 : vector<2xf32> } +// CHECK-LABEL: @gather_memref_non_unit_stride_read_1_element +// CHECK: %[[MASK:.*]] = vector.extract %arg2[0] : i1 from vector<1xi1> +// CHECK: %[[IDX:.*]] = vector.extract %arg1[0] : index from vector<1xindex> +// CHECK: %[[RET:.*]] = scf.if %[[MASK]] -> (vector<1xf32>) { +// CHECK: %[[VEC:.*]] = vector.load %arg0[%[[IDX]]] : memref<4xf32, strided<[2]>>, vector<1xf32> +// CHECK: %[[VAL:.*]] = vector.extract %[[VEC]][0] : f32 from vector<1xf32> +// CHECK: %[[RES:.*]] = vector.insert %[[VAL]], %arg3 [0] : f32 into vector<1xf32> +// CHECK: scf.yield %[[RES]] : vector<1xf32> +// CHECK: } else { +// CHECK: scf.yield %arg3 : vector<1xf32> +// CHECK: } +// CHECK: return %[[RET]] : vector<1xf32> +func.func @gather_memref_non_unit_stride_read_1_element(%base: memref<4xf32, strided<[2]>>, %v: vector<1xindex>, %mask: vector<1xi1>, %pass_thru: vector<1xf32>) -> vector<1xf32> { + %c0 = arith.constant 0 : index + %0 = vector.gather %base[%c0][%v], %mask, %pass_thru : memref<4xf32, strided<[2]>>, vector<1xindex>, vector<1xi1>, vector<1xf32> into vector<1xf32> + return %0 : vector<1xf32> +} + +// CHECK-LABEL: @gather_memref_non_unit_stride_read_more_than_1_element +// CHECK: %[[CONST:.*]] = arith.constant 0 : index +// CHECK: %[[RET:.*]] = vector.gather %arg0[%[[CONST]]] [%arg1], %arg2, %arg3 : memref<4xf32, strided<[2]>>, vector<2xindex>, vector<2xi1>, vector<2xf32> into vector<2xf32> +// CHECK: return %[[RET]] : vector<2xf32> +func.func @gather_memref_non_unit_stride_read_more_than_1_element(%base: memref<4xf32, strided<[2]>>, %v: vector<2xindex>, %mask: vector<2xi1>, %pass_thru: vector<2xf32>) -> vector<2xf32> { + %c0 = arith.constant 0 : index + %0 = vector.gather %base[%c0][%v], %mask, %pass_thru : memref<4xf32, strided<[2]>>, vector<2xindex>, vector<2xi1>, vector<2xf32> into 
vector<2xf32> + return %0 : vector<2xf32> +} + // CHECK-LABEL: @gather_tensor_2d // CHECK: scf.if // CHECK: tensor.extract From d047dbd95ed3ef4ace54eaa7c32fe3954317f926 Mon Sep 17 00:00:00 2001 From: eleviant <56861949+eleviant@users.noreply.github.com> Date: Sun, 12 Jan 2025 17:31:45 +0100 Subject: [PATCH 205/408] Add function merger to be run during LTO link with gold plugin (#121343) Patch adds 'merge-functions' plugin option for this purpose. --- .../gold/X86/Inputs/merge-functions-foo.ll | 25 ++++++++++ llvm/test/tools/gold/X86/merge-functions.ll | 49 +++++++++++++++++++ llvm/tools/gold/gold-plugin.cpp | 6 +++ 3 files changed, 80 insertions(+) create mode 100644 llvm/test/tools/gold/X86/Inputs/merge-functions-foo.ll create mode 100644 llvm/test/tools/gold/X86/merge-functions.ll diff --git a/llvm/test/tools/gold/X86/Inputs/merge-functions-foo.ll b/llvm/test/tools/gold/X86/Inputs/merge-functions-foo.ll new file mode 100644 index 0000000000000..f2bcff89868e1 --- /dev/null +++ b/llvm/test/tools/gold/X86/Inputs/merge-functions-foo.ll @@ -0,0 +1,25 @@ +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-linux-gnu" + +@_g = dso_local global i32 0, align 4 +@llvm.compiler.used = appending global [1 x ptr] [ptr @_g], section "llvm.metadata" + +define dso_local i32 @foo(i32 noundef %0) #0 { + %2 = add nsw i32 %0, 42 + store i32 %2, ptr @_g, align 4 + ret i32 %2 +} + +attributes #0 = { noinline } + +!llvm.module.flags = !{!0, !1} + +!0 = !{i32 1, !"ThinLTO", i32 0} +!1 = !{i32 1, !"EnableSplitLTOUnit", i32 1} + +^0 = module: (path: "/tmp/func2.o", hash: (0, 0, 0, 0, 0)) +^1 = gv: (name: "foo", summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 1, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 3, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 1, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), refs: (^3)))) ; guid = 6699318081062747564 +^2 = gv: (name: "llvm.compiler.used", summaries: (variable: (module: ^0, flags: (linkage: appending, visibility: default, notEligibleToImport: 1, live: 1, dsoLocal: 0, canAutoHide: 0), varFlags: (readonly: 0, writeonly: 0, constant: 0), refs: (^3)))) ; guid = 9610627770985738006 +^3 = gv: (name: "_g", summaries: (variable: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 1, live: 0, dsoLocal: 1, canAutoHide: 0), varFlags: (readonly: 1, writeonly: 1, constant: 0)))) ; guid = 9713702464056781075 +^4 = flags: 8 +^5 = blockcount: 0 diff --git a/llvm/test/tools/gold/X86/merge-functions.ll b/llvm/test/tools/gold/X86/merge-functions.ll new file mode 100644 index 0000000000000..d4a49b1c40b47 --- /dev/null +++ b/llvm/test/tools/gold/X86/merge-functions.ll @@ -0,0 +1,49 @@ +; RUN: llvm-as %s -o %t.bc +; RUN: llvm-as %p/Inputs/merge-functions-foo.ll -o %t-foo.bc +; RUN: %gold -plugin %llvmshlibdir/LLVMgold%shlibext \ +; RUN: -m elf_x86_64 \ +; RUN: -plugin-opt=merge-functions \ +; RUN: -plugin-opt=save-temps \ +; RUN: -u main \ +; RUN: %t.bc %t-foo.bc \ +; RUN: -o %t-out +; RUN: llvm-dis %t-out.0.5.precodegen.bc -o - | FileCheck %s + +; Check that we've merged foo and bar +; CHECK: define dso_local noundef i32 @main() +; CHECK-NEXT: tail call fastcc void @bar() +; CHECK-NEXT: tail call fastcc void @bar() + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = 
"x86_64-pc-linux-gnu" + +@_g = external local_unnamed_addr global i32, align 4 + +define dso_local i32 @bar(i32 noundef %0) #0 { + %2 = add nsw i32 %0, 42 + store i32 %2, ptr @_g, align 4 + ret i32 %2 +} + +define dso_local noundef i32 @main() { + %1 = tail call i32 @foo(i32 noundef 1) + %2 = tail call i32 @bar(i32 noundef 1) + ret i32 0 +} + +declare i32 @foo(i32 noundef) local_unnamed_addr #2 + +attributes #0 = { noinline } + +!llvm.module.flags = !{!0, !1} + +!0 = !{i32 1, !"ThinLTO", i32 0} +!1 = !{i32 1, !"EnableSplitLTOUnit", i32 1} + +^0 = module: (path: "merge-functions.o", hash: (0, 0, 0, 0, 0)) +^1 = gv: (name: "foo") ; guid = 6699318081062747564 +^2 = gv: (name: "_g") ; guid = 9713702464056781075 +^3 = gv: (name: "main", summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 1, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 3, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 0, noInline: 0, alwaysInline: 0, noUnwind: 1, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^1, tail: 1), (callee: ^4, tail: 1))))) ; guid = 15822663052811949562 +^4 = gv: (name: "bar", summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 1, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 3, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 1, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), refs: (^2)))) ; guid = 16434608426314478903 +^5 = flags: 8 +^6 = blockcount: 0 diff --git a/llvm/tools/gold/gold-plugin.cpp b/llvm/tools/gold/gold-plugin.cpp index ac2e9d4252aa3..ae965e6f486aa 100644 --- a/llvm/tools/gold/gold-plugin.cpp +++ b/llvm/tools/gold/gold-plugin.cpp @@ -224,6 +224,9 @@ namespace options { static std::string cs_profile_path; static bool cs_pgo_gen = false; + // When true, MergeFunctions pass is used in LTO link pipeline. + static bool merge_functions = false; + // Time trace options. static std::string time_trace_file; static unsigned time_trace_granularity = 500; @@ -292,6 +295,8 @@ namespace options { sample_profile = std::string(opt); } else if (opt == "cs-profile-generate") { cs_pgo_gen = true; + } else if (opt == "merge-functions") { + merge_functions = true; } else if (opt.consume_front("cs-profile-path=")) { cs_profile_path = std::string(opt); } else if (opt == "new-pass-manager") { @@ -897,6 +902,7 @@ static std::unique_ptr createLTO(IndexWriteCallback OnIndexWrite, Conf.OptLevel = options::OptLevel; Conf.PTO.LoopVectorization = options::OptLevel > 1; Conf.PTO.SLPVectorization = options::OptLevel > 1; + Conf.PTO.MergeFunctions = options::merge_functions; Conf.PTO.UnifiedLTO = options::unifiedlto; Conf.AlwaysEmitRegularLTOObj = !options::obj_path.empty(); From 9dee7c44491635ec9037b90050bcdbd3d5291e38 Mon Sep 17 00:00:00 2001 From: vfdev Date: Sun, 12 Jan 2025 18:56:49 +0100 Subject: [PATCH 206/408] Added free-threading CPython mode support in MLIR Python bindings (#107103) Related to https://github.com/llvm/llvm-project/issues/105522 Description: This PR is a joint work with Peter Hawkins (@hawkinsp) originally done by myself for pybind11 and then reworked to nanobind based on Peter's branch: https://github.com/hawkinsp/llvm-project/tree/nbdev . 
- Added free-threading CPython mode support for MLIR Python bindings - Added a test which can reveal data races when cpython and LLVM/MLIR compiled with TSAN Context: - Related to https://github.com/google/jax/issues/23073 Co-authored-by: Peter Hawkins --- mlir/cmake/modules/AddMLIRPython.cmake | 21 +- mlir/docs/Bindings/Python.md | 40 ++ mlir/lib/Bindings/Python/Globals.h | 12 +- mlir/lib/Bindings/Python/IRCore.cpp | 31 +- mlir/lib/Bindings/Python/IRModule.cpp | 18 +- mlir/lib/Bindings/Python/IRModule.h | 1 + mlir/lib/Bindings/Python/MainModule.cpp | 9 +- mlir/python/requirements.txt | 2 +- mlir/test/python/multithreaded_tests.py | 531 ++++++++++++++++++++++++ 9 files changed, 649 insertions(+), 16 deletions(-) create mode 100644 mlir/test/python/multithreaded_tests.py diff --git a/mlir/cmake/modules/AddMLIRPython.cmake b/mlir/cmake/modules/AddMLIRPython.cmake index 717a503468a85..0679db9cf93e1 100644 --- a/mlir/cmake/modules/AddMLIRPython.cmake +++ b/mlir/cmake/modules/AddMLIRPython.cmake @@ -668,12 +668,31 @@ function(add_mlir_python_extension libname extname) elseif(ARG_PYTHON_BINDINGS_LIBRARY STREQUAL "nanobind") nanobind_add_module(${libname} NB_DOMAIN mlir + FREE_THREADED ${ARG_SOURCES} ) if (LLVM_COMPILER_IS_GCC_COMPATIBLE OR CLANG_CL) # Avoids warnings from upstream nanobind. - target_compile_options(nanobind-static + set(nanobind_target "nanobind-static") + if (NOT TARGET ${nanobind_target}) + # Get correct nanobind target name: nanobind-static-ft or something else + # It is set by nanobind_add_module function according to the passed options + get_property(all_targets DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY BUILDSYSTEM_TARGETS) + + # Iterate over the list of targets + foreach(target ${all_targets}) + # Check if the target name matches the given string + if("${target}" MATCHES "nanobind-") + set(nanobind_target "${target}") + endif() + endforeach() + + if (NOT TARGET ${nanobind_target}) + message(FATAL_ERROR "Could not find nanobind target to set compile options to") + endif() + endif() + target_compile_options(${nanobind_target} PRIVATE -Wno-cast-qual -Wno-zero-length-array diff --git a/mlir/docs/Bindings/Python.md b/mlir/docs/Bindings/Python.md index 32df3310d811d..b8bd0f507a510 100644 --- a/mlir/docs/Bindings/Python.md +++ b/mlir/docs/Bindings/Python.md @@ -1187,3 +1187,43 @@ or nanobind and utilities to connect to the rest of Python API. The bindings can be located in a separate module or in the same module as attributes and types, and loaded along with the dialect. + +## Free-threading (No-GIL) support + +Free-threading or no-GIL support refers to CPython interpreter (>=3.13) with Global Interpreter Lock made optional. For details on the topic, please check [PEP-703](https://peps.python.org/pep-0703/) and this [Python free-threading guide](https://py-free-threading.github.io/). + +MLIR Python bindings are free-threading compatible with exceptions (discussed below) in the following sense: it is safe to work in multiple threads with **independent** contexts. 
Below we show an example code of safe usage: + +```python +# python3.13t example.py +import concurrent.futures + +import mlir.dialects.arith as arith +from mlir.ir import Context, Location, Module, IntegerType, InsertionPoint + + +def func(py_value): + with Context() as ctx: + module = Module.create(loc=Location.file("foo.txt", 0, 0)) + + dtype = IntegerType.get_signless(64) + with InsertionPoint(module.body), Location.name("a"): + arith.constant(dtype, py_value) + + return module + + +num_workers = 8 +with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor: + futures = [] + for i in range(num_workers): + futures.append(executor.submit(func, i)) + assert len(list(f.result() for f in futures)) == num_workers +``` + +The exceptions to the free-threading compatibility: +- IR printing is unsafe, e.g. when using `PassManager` with `PassManager.enable_ir_printing()` which calls thread-unsafe `llvm::raw_ostream`. +- Usage of `Location.emit_error` is unsafe (due to thread-unsafe `llvm::raw_ostream`). +- Usage of `Module.dump` is unsafe (due to thread-unsafe `llvm::raw_ostream`). +- Usage of `mlir.dialects.transform.interpreter` is unsafe. +- Usage of `mlir.dialects.gpu` and `gpu-module-to-binary` is unsafe. \ No newline at end of file diff --git a/mlir/lib/Bindings/Python/Globals.h b/mlir/lib/Bindings/Python/Globals.h index 0ec522d14f74b..826a34a535176 100644 --- a/mlir/lib/Bindings/Python/Globals.h +++ b/mlir/lib/Bindings/Python/Globals.h @@ -24,6 +24,7 @@ namespace mlir { namespace python { /// Globals that are always accessible once the extension has been initialized. +/// Methods of this class are thread-safe. class PyGlobals { public: PyGlobals(); @@ -37,12 +38,18 @@ class PyGlobals { /// Get and set the list of parent modules to search for dialect /// implementation classes. - std::vector &getDialectSearchPrefixes() { + std::vector getDialectSearchPrefixes() { + nanobind::ft_lock_guard lock(mutex); return dialectSearchPrefixes; } void setDialectSearchPrefixes(std::vector newValues) { + nanobind::ft_lock_guard lock(mutex); dialectSearchPrefixes.swap(newValues); } + void addDialectSearchPrefix(std::string value) { + nanobind::ft_lock_guard lock(mutex); + dialectSearchPrefixes.push_back(std::move(value)); + } /// Loads a python module corresponding to the given dialect namespace. /// No-ops if the module has already been loaded or is not found. Raises @@ -109,6 +116,9 @@ class PyGlobals { private: static PyGlobals *instance; + + nanobind::ft_mutex mutex; + /// Module name prefixes to search under for dialect implementation modules. std::vector dialectSearchPrefixes; /// Map of dialect namespace to external dialect class object. diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp index 453d4f7c7e8bc..463ebdebb3f3f 100644 --- a/mlir/lib/Bindings/Python/IRCore.cpp +++ b/mlir/lib/Bindings/Python/IRCore.cpp @@ -243,9 +243,15 @@ static MlirBlock createBlock(const nb::sequence &pyArgTypes, /// Wrapper for the global LLVM debugging flag. struct PyGlobalDebugFlag { - static void set(nb::object &o, bool enable) { mlirEnableGlobalDebug(enable); } + static void set(nb::object &o, bool enable) { + nb::ft_lock_guard lock(mutex); + mlirEnableGlobalDebug(enable); + } - static bool get(const nb::object &) { return mlirIsGlobalDebugEnabled(); } + static bool get(const nb::object &) { + nb::ft_lock_guard lock(mutex); + return mlirIsGlobalDebugEnabled(); + } static void bind(nb::module_ &m) { // Debug flags. 
@@ -255,6 +261,7 @@ struct PyGlobalDebugFlag { .def_static( "set_types", [](const std::string &type) { + nb::ft_lock_guard lock(mutex); mlirSetGlobalDebugType(type.c_str()); }, "types"_a, "Sets specific debug types to be produced by LLVM") @@ -263,11 +270,17 @@ struct PyGlobalDebugFlag { pointers.reserve(types.size()); for (const std::string &str : types) pointers.push_back(str.c_str()); + nb::ft_lock_guard lock(mutex); mlirSetGlobalDebugTypes(pointers.data(), pointers.size()); }); } + +private: + static nb::ft_mutex mutex; }; +nb::ft_mutex PyGlobalDebugFlag::mutex; + struct PyAttrBuilderMap { static bool dunderContains(const std::string &attributeKind) { return PyGlobals::get().lookupAttributeBuilder(attributeKind).has_value(); @@ -606,6 +619,7 @@ class PyOpOperandIterator { PyMlirContext::PyMlirContext(MlirContext context) : context(context) { nb::gil_scoped_acquire acquire; + nb::ft_lock_guard lock(live_contexts_mutex); auto &liveContexts = getLiveContexts(); liveContexts[context.ptr] = this; } @@ -615,7 +629,10 @@ PyMlirContext::~PyMlirContext() { // forContext method, which always puts the associated handle into // liveContexts. nb::gil_scoped_acquire acquire; - getLiveContexts().erase(context.ptr); + { + nb::ft_lock_guard lock(live_contexts_mutex); + getLiveContexts().erase(context.ptr); + } mlirContextDestroy(context); } @@ -632,6 +649,7 @@ nb::object PyMlirContext::createFromCapsule(nb::object capsule) { PyMlirContextRef PyMlirContext::forContext(MlirContext context) { nb::gil_scoped_acquire acquire; + nb::ft_lock_guard lock(live_contexts_mutex); auto &liveContexts = getLiveContexts(); auto it = liveContexts.find(context.ptr); if (it == liveContexts.end()) { @@ -647,12 +665,17 @@ PyMlirContextRef PyMlirContext::forContext(MlirContext context) { return PyMlirContextRef(it->second, std::move(pyRef)); } +nb::ft_mutex PyMlirContext::live_contexts_mutex; + PyMlirContext::LiveContextMap &PyMlirContext::getLiveContexts() { static LiveContextMap liveContexts; return liveContexts; } -size_t PyMlirContext::getLiveCount() { return getLiveContexts().size(); } +size_t PyMlirContext::getLiveCount() { + nb::ft_lock_guard lock(live_contexts_mutex); + return getLiveContexts().size(); +} size_t PyMlirContext::getLiveOperationCount() { return liveOperations.size(); } diff --git a/mlir/lib/Bindings/Python/IRModule.cpp b/mlir/lib/Bindings/Python/IRModule.cpp index f7bf77e5a7e04..e600f1bbd4493 100644 --- a/mlir/lib/Bindings/Python/IRModule.cpp +++ b/mlir/lib/Bindings/Python/IRModule.cpp @@ -38,8 +38,11 @@ PyGlobals::PyGlobals() { PyGlobals::~PyGlobals() { instance = nullptr; } bool PyGlobals::loadDialectModule(llvm::StringRef dialectNamespace) { - if (loadedDialectModules.contains(dialectNamespace)) - return true; + { + nb::ft_lock_guard lock(mutex); + if (loadedDialectModules.contains(dialectNamespace)) + return true; + } // Since re-entrancy is possible, make a copy of the search prefixes. std::vector localSearchPrefixes = dialectSearchPrefixes; nb::object loaded = nb::none(); @@ -62,12 +65,14 @@ bool PyGlobals::loadDialectModule(llvm::StringRef dialectNamespace) { return false; // Note: Iterator cannot be shared from prior to loading, since re-entrancy // may have occurred, which may do anything. 
+ nb::ft_lock_guard lock(mutex); loadedDialectModules.insert(dialectNamespace); return true; } void PyGlobals::registerAttributeBuilder(const std::string &attributeKind, nb::callable pyFunc, bool replace) { + nb::ft_lock_guard lock(mutex); nb::object &found = attributeBuilderMap[attributeKind]; if (found && !replace) { throw std::runtime_error((llvm::Twine("Attribute builder for '") + @@ -81,6 +86,7 @@ void PyGlobals::registerAttributeBuilder(const std::string &attributeKind, void PyGlobals::registerTypeCaster(MlirTypeID mlirTypeID, nb::callable typeCaster, bool replace) { + nb::ft_lock_guard lock(mutex); nb::object &found = typeCasterMap[mlirTypeID]; if (found && !replace) throw std::runtime_error("Type caster is already registered with caster: " + @@ -90,6 +96,7 @@ void PyGlobals::registerTypeCaster(MlirTypeID mlirTypeID, void PyGlobals::registerValueCaster(MlirTypeID mlirTypeID, nb::callable valueCaster, bool replace) { + nb::ft_lock_guard lock(mutex); nb::object &found = valueCasterMap[mlirTypeID]; if (found && !replace) throw std::runtime_error("Value caster is already registered: " + @@ -99,6 +106,7 @@ void PyGlobals::registerValueCaster(MlirTypeID mlirTypeID, void PyGlobals::registerDialectImpl(const std::string &dialectNamespace, nb::object pyClass) { + nb::ft_lock_guard lock(mutex); nb::object &found = dialectClassMap[dialectNamespace]; if (found) { throw std::runtime_error((llvm::Twine("Dialect namespace '") + @@ -110,6 +118,7 @@ void PyGlobals::registerDialectImpl(const std::string &dialectNamespace, void PyGlobals::registerOperationImpl(const std::string &operationName, nb::object pyClass, bool replace) { + nb::ft_lock_guard lock(mutex); nb::object &found = operationClassMap[operationName]; if (found && !replace) { throw std::runtime_error((llvm::Twine("Operation '") + operationName + @@ -121,6 +130,7 @@ void PyGlobals::registerOperationImpl(const std::string &operationName, std::optional PyGlobals::lookupAttributeBuilder(const std::string &attributeKind) { + nb::ft_lock_guard lock(mutex); const auto foundIt = attributeBuilderMap.find(attributeKind); if (foundIt != attributeBuilderMap.end()) { assert(foundIt->second && "attribute builder is defined"); @@ -133,6 +143,7 @@ std::optional PyGlobals::lookupTypeCaster(MlirTypeID mlirTypeID, MlirDialect dialect) { // Try to load dialect module. (void)loadDialectModule(unwrap(mlirDialectGetNamespace(dialect))); + nb::ft_lock_guard lock(mutex); const auto foundIt = typeCasterMap.find(mlirTypeID); if (foundIt != typeCasterMap.end()) { assert(foundIt->second && "type caster is defined"); @@ -145,6 +156,7 @@ std::optional PyGlobals::lookupValueCaster(MlirTypeID mlirTypeID, MlirDialect dialect) { // Try to load dialect module. (void)loadDialectModule(unwrap(mlirDialectGetNamespace(dialect))); + nb::ft_lock_guard lock(mutex); const auto foundIt = valueCasterMap.find(mlirTypeID); if (foundIt != valueCasterMap.end()) { assert(foundIt->second && "value caster is defined"); @@ -158,6 +170,7 @@ PyGlobals::lookupDialectClass(const std::string &dialectNamespace) { // Make sure dialect module is loaded. 
if (!loadDialectModule(dialectNamespace)) return std::nullopt; + nb::ft_lock_guard lock(mutex); const auto foundIt = dialectClassMap.find(dialectNamespace); if (foundIt != dialectClassMap.end()) { assert(foundIt->second && "dialect class is defined"); @@ -175,6 +188,7 @@ PyGlobals::lookupOperationClass(llvm::StringRef operationName) { if (!loadDialectModule(dialectNamespace)) return std::nullopt; + nb::ft_lock_guard lock(mutex); auto foundIt = operationClassMap.find(operationName); if (foundIt != operationClassMap.end()) { assert(foundIt->second && "OpView is defined"); diff --git a/mlir/lib/Bindings/Python/IRModule.h b/mlir/lib/Bindings/Python/IRModule.h index 8fb32a225e65f..f5fbb6c61b57e 100644 --- a/mlir/lib/Bindings/Python/IRModule.h +++ b/mlir/lib/Bindings/Python/IRModule.h @@ -260,6 +260,7 @@ class PyMlirContext { // Note that this holds a handle, which does not imply ownership. // Mappings will be removed when the context is destructed. using LiveContextMap = llvm::DenseMap; + static nanobind::ft_mutex live_contexts_mutex; static LiveContextMap &getLiveContexts(); // Interns all live modules associated with this context. Modules tracked diff --git a/mlir/lib/Bindings/Python/MainModule.cpp b/mlir/lib/Bindings/Python/MainModule.cpp index 7c4064262012e..6f49431006605 100644 --- a/mlir/lib/Bindings/Python/MainModule.cpp +++ b/mlir/lib/Bindings/Python/MainModule.cpp @@ -30,12 +30,8 @@ NB_MODULE(_mlir, m) { .def_prop_rw("dialect_search_modules", &PyGlobals::getDialectSearchPrefixes, &PyGlobals::setDialectSearchPrefixes) - .def( - "append_dialect_search_prefix", - [](PyGlobals &self, std::string moduleName) { - self.getDialectSearchPrefixes().push_back(std::move(moduleName)); - }, - "module_name"_a) + .def("append_dialect_search_prefix", &PyGlobals::addDialectSearchPrefix, + "module_name"_a) .def( "_check_dialect_module_loaded", [](PyGlobals &self, const std::string &dialectNamespace) { @@ -76,7 +72,6 @@ NB_MODULE(_mlir, m) { nanobind::cast(opClass.attr("OPERATION_NAME")); PyGlobals::get().registerOperationImpl(operationName, opClass, replace); - // Dict-stuff the new opClass by name onto the dialect class. nb::object opClassName = opClass.attr("__name__"); dialectClass.attr(opClassName) = opClass; diff --git a/mlir/python/requirements.txt b/mlir/python/requirements.txt index f240d6ef944ec..259e679f510f7 100644 --- a/mlir/python/requirements.txt +++ b/mlir/python/requirements.txt @@ -2,4 +2,4 @@ nanobind>=2.4, <3.0 numpy>=1.19.5, <=2.1.2 pybind11>=2.10.0, <=2.13.6 PyYAML>=5.4.0, <=6.0.1 -ml_dtypes>=0.1.0, <=0.5.0 # provides several NumPy dtype extensions, including the bf16 +ml_dtypes>=0.5.0, <=0.6.0 # provides several NumPy dtype extensions, including the bf16 diff --git a/mlir/test/python/multithreaded_tests.py b/mlir/test/python/multithreaded_tests.py new file mode 100644 index 0000000000000..2df75e2e1b90c --- /dev/null +++ b/mlir/test/python/multithreaded_tests.py @@ -0,0 +1,531 @@ +# RUN: %PYTHON %s +""" +This script generates multi-threaded tests to check free-threading mode using CPython compiled with TSAN. +Tests can be run using pytest: +```bash +python3.13t -mpytest -vvv multithreaded_tests.py +``` + +IMPORTANT. Running tests are not checking the correctness, but just the execution of the tests in multi-threaded context +and passing if no warnings reported by TSAN and failing otherwise. 
+ + +Details on the generated tests and execution: +1) Multi-threaded execution: all generated tests are executed independently by +a pool of threads, running each test multiple times, see @multi_threaded for details + +2) Tests generation: we use existing tests: test/python/ir/*.py, +test/python/dialects/*.py, etc to generate multi-threaded tests. +In details, we perform the following: +a) we define a list of source tests to be used to generate multi-threaded tests, see `TEST_MODULES`. +b) we define `TestAllMultiThreaded` class and add existing tests to the class. See `add_existing_tests` method. +c) for each test file, we copy and modify it: test/python/ir/affine_expr.py -> /tmp/ir/affine_expr.py. +In order to import the test file as python module, we remove all executing functions, like +`@run` or `run(testMethod)`. See `copy_and_update` and `add_existing_tests` methods for details. + + +Observed warnings reported by TSAN. + +CPython and free-threading known data-races: +1) ctypes related races: https://github.com/python/cpython/issues/127945 +2) LLVM related data-races, llvm::raw_ostream is not thread-safe +- mlir pass manager +- dialects/transform_interpreter.py +- ir/diagnostic_handler.py +- ir/module.py +3) Dialect gpu module-to-binary method is unsafe +""" +import concurrent.futures +import gc +import importlib.util +import os +import sys +import threading +import tempfile +import unittest + +from contextlib import contextmanager +from functools import partial +from pathlib import Path +from typing import Optional + +import mlir.dialects.arith as arith +from mlir.dialects import transform +from mlir.ir import Context, Location, Module, IntegerType, InsertionPoint + + +def import_from_path(module_name: str, file_path: Path): + spec = importlib.util.spec_from_file_location(module_name, file_path) + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + spec.loader.exec_module(module) + return module + + +def copy_and_update(src_filepath: Path, dst_filepath: Path): + # We should remove all calls like `run(testMethod)` + with open(src_filepath, "r") as reader, open(dst_filepath, "w") as writer: + while True: + src_line = reader.readline() + if len(src_line) == 0: + break + skip_lines = [ + "run(", + "@run", + "@constructAndPrintInModule", + "run_apply_patterns(", + "@run_apply_patterns", + "@test_in_context", + "@construct_and_print_in_module", + ] + if any(src_line.startswith(line) for line in skip_lines): + continue + writer.write(src_line) + + +# Helper run functions +# They are copied from the test modules (e.g. 
run function in execution_engine.py) +def run(test_function): + # Generic run tests function used by dialects and ir test modules + test_function() + + +def run_with_context_and_location(test_function): + # run tests function with a context and a location + # used by the following test modules: + # - dialects/transform_gpu_ext, + # - dialects/vector + # - dialects/gpu/* + with Context(), Location.unknown(): + test_function() + return test_function + + +def run_with_insertion_point_and_context_arg(test_function): + # run tests function used by dialects/index_dialect test module + with Context() as ctx, Location.unknown(): + module = Module.create() + with InsertionPoint(module.body): + test_function(ctx) + + +def run_with_insertion_point(test_function): + # Used by a lot of dialects test modules + with Context(), Location.unknown(): + module = Module.create() + with InsertionPoint(module.body): + test_function() + return test_function + + +def run_with_insertion_point_and_module_arg(test_function): + # Used by dialects/transform test module + with Context(), Location.unknown(): + module = Module.create() + with InsertionPoint(module.body): + test_function(module) + return test_function + + +def run_with_insertion_point_all_unreg_dialects(test_function): + # Used by dialects/cf test module + with Context() as ctx, Location.unknown(): + ctx.allow_unregistered_dialects = True + module = Module.create() + with InsertionPoint(module.body): + test_function() + return test_function + + +def run_apply_patterns(test_function): + # Used by dialects/transform_tensor_ext test module + with Context(), Location.unknown(): + module = Module.create() + with InsertionPoint(module.body): + sequence = transform.SequenceOp( + transform.FailurePropagationMode.Propagate, + [], + transform.AnyOpType.get(), + ) + with InsertionPoint(sequence.body): + apply = transform.ApplyPatternsOp(sequence.bodyTarget) + with InsertionPoint(apply.patterns): + test_function() + transform.YieldOp() + print(module) + return test_function + + +def run_transform_tensor_ext(test_function): + # Used by test modules: + # - dialects/transform_gpu_ext + # - dialects/transform_sparse_tensor_ext + # - dialects/transform_tensor_ext + with Context(), Location.unknown(): + module = Module.create() + with InsertionPoint(module.body): + sequence = transform.SequenceOp( + transform.FailurePropagationMode.Propagate, + [], + transform.AnyOpType.get(), + ) + with InsertionPoint(sequence.body): + test_function(sequence.bodyTarget) + transform.YieldOp() + print(module) + return test_function + + +def run_transform_structured_ext(test_function): + # Used by dialects/transform_structured_ext test module + with Context(), Location.unknown(): + module = Module.create() + with InsertionPoint(module.body): + test_function() + module.operation.verify() + print(module) + return test_function + + +def run_construct_and_print_in_module(test_function): + # Used by test modules: + # - integration/dialects/pdl + # - integration/dialects/transform + with Context(), Location.unknown(): + module = Module.create() + with InsertionPoint(module.body): + module = test_function(module) + if module is not None: + print(module) + return test_function + + +TEST_MODULES = [ + ("execution_engine", run), + ("pass_manager", run), + ("dialects/affine", run_with_insertion_point), + ("dialects/func", run_with_insertion_point), + ("dialects/arith_dialect", run), + ("dialects/arith_llvm", run), + ("dialects/async_dialect", run), + ("dialects/builtin", run), + ("dialects/cf", 
run_with_insertion_point_all_unreg_dialects), + ("dialects/complex_dialect", run), + ("dialects/func", run_with_insertion_point), + ("dialects/index_dialect", run_with_insertion_point_and_context_arg), + ("dialects/llvm", run_with_insertion_point), + ("dialects/math_dialect", run), + ("dialects/memref", run), + ("dialects/ml_program", run_with_insertion_point), + ("dialects/nvgpu", run_with_insertion_point), + ("dialects/nvvm", run_with_insertion_point), + ("dialects/ods_helpers", run), + ("dialects/openmp_ops", run_with_insertion_point), + ("dialects/pdl_ops", run_with_insertion_point), + # ("dialects/python_test", run), # TODO: Need to pass pybind11 or nanobind argv + ("dialects/quant", run), + ("dialects/rocdl", run_with_insertion_point), + ("dialects/scf", run_with_insertion_point), + ("dialects/shape", run), + ("dialects/spirv_dialect", run), + ("dialects/tensor", run), + # ("dialects/tosa", ), # Nothing to test + ("dialects/transform_bufferization_ext", run_with_insertion_point), + # ("dialects/transform_extras", ), # Needs a more complicated execution schema + ("dialects/transform_gpu_ext", run_transform_tensor_ext), + ( + "dialects/transform_interpreter", + run_with_context_and_location, + ["print_", "transform_options", "failed", "include"], + ), + ( + "dialects/transform_loop_ext", + run_with_insertion_point, + ["loopOutline"], + ), + ("dialects/transform_memref_ext", run_with_insertion_point), + ("dialects/transform_nvgpu_ext", run_with_insertion_point), + ("dialects/transform_sparse_tensor_ext", run_transform_tensor_ext), + ("dialects/transform_structured_ext", run_transform_structured_ext), + ("dialects/transform_tensor_ext", run_transform_tensor_ext), + ( + "dialects/transform_vector_ext", + run_apply_patterns, + ["configurable_patterns"], + ), + ("dialects/transform", run_with_insertion_point_and_module_arg), + ("dialects/vector", run_with_context_and_location), + ("dialects/gpu/dialect", run_with_context_and_location), + ("dialects/gpu/module-to-binary-nvvm", run_with_context_and_location), + ("dialects/gpu/module-to-binary-rocdl", run_with_context_and_location), + ("dialects/linalg/ops", run), + # TO ADD: No proper tests in this dialects/linalg/opsdsl/* + # ("dialects/linalg/opsdsl/*", ...), + ("dialects/sparse_tensor/dialect", run), + ("dialects/sparse_tensor/passes", run), + ("integration/dialects/pdl", run_construct_and_print_in_module), + ("integration/dialects/transform", run_construct_and_print_in_module), + ("integration/dialects/linalg/opsrun", run), + ("ir/affine_expr", run), + ("ir/affine_map", run), + ("ir/array_attributes", run), + ("ir/attributes", run), + ("ir/blocks", run), + ("ir/builtin_types", run), + ("ir/context_managers", run), + ("ir/debug", run), + ("ir/diagnostic_handler", run), + ("ir/dialects", run), + ("ir/exception", run), + ("ir/insertion_point", run), + ("ir/integer_set", run), + ("ir/location", run), + ("ir/module", run), + ("ir/operation", run), + ("ir/symbol_table", run), + ("ir/value", run), +] + +TESTS_TO_SKIP = [ + "test_execution_engine__testNanoTime_multi_threaded", # testNanoTime can't run in multiple threads, even with GIL + "test_execution_engine__testSharedLibLoad_multi_threaded", # testSharedLibLoad can't run in multiple threads, even with GIL + "test_dialects_arith_dialect__testArithValue_multi_threaded", # RuntimeError: Value caster is already registered: .ArithValue'>, even with GIL + "test_ir_dialects__testAppendPrefixSearchPath_multi_threaded", # PyGlobals::setDialectSearchPrefixes is not thread-safe, even with GIL. 
Strange usage of static PyGlobals vs python exposed _cext.globals + "test_ir_value__testValueCasters_multi_threaded", # RuntimeError: Value caster is already registered: .dont_cast_int, even with GIL + # tests indirectly calling thread-unsafe llvm::raw_ostream + "test_execution_engine__testInvalidModule_multi_threaded", # mlirExecutionEngineCreate calls thread-unsafe llvm::raw_ostream + "test_pass_manager__testPrintIrAfterAll_multi_threaded", # IRPrinterInstrumentation::runAfterPass calls thread-unsafe llvm::raw_ostream + "test_pass_manager__testPrintIrBeforeAndAfterAll_multi_threaded", # IRPrinterInstrumentation::runBeforePass calls thread-unsafe llvm::raw_ostream + "test_pass_manager__testPrintIrLargeLimitElements_multi_threaded", # IRPrinterInstrumentation::runAfterPass calls thread-unsafe llvm::raw_ostream + "test_pass_manager__testPrintIrTree_multi_threaded", # IRPrinterInstrumentation::runAfterPass calls thread-unsafe llvm::raw_ostream + "test_pass_manager__testRunPipeline_multi_threaded", # PrintOpStatsPass::printSummary calls thread-unsafe llvm::raw_ostream + "test_dialects_transform_interpreter__include_multi_threaded", # mlir::transform::PrintOp::apply(mlir::transform::TransformRewriter...) calls thread-unsafe llvm::raw_ostream + "test_dialects_transform_interpreter__transform_options_multi_threaded", # mlir::transform::PrintOp::apply(mlir::transform::TransformRewriter...) calls thread-unsafe llvm::raw_ostream + "test_dialects_transform_interpreter__print_self_multi_threaded", # mlir::transform::PrintOp::apply(mlir::transform::TransformRewriter...) call thread-unsafe llvm::raw_ostream + "test_ir_diagnostic_handler__testDiagnosticCallbackException_multi_threaded", # mlirEmitError calls thread-unsafe llvm::raw_ostream + "test_ir_module__testParseSuccess_multi_threaded", # mlirOperationDump calls thread-unsafe llvm::raw_ostream + # False-positive TSAN detected race in llvm::RuntimeDyldELF::registerEHFrames() + # Details: https://github.com/llvm/llvm-project/pull/107103/files#r1905726947 + "test_execution_engine__testCapsule_multi_threaded", + "test_execution_engine__testDumpToObjectFile_multi_threaded", +] + +TESTS_TO_XFAIL = [ + # execution_engine tests: + # - ctypes related data-races: https://github.com/python/cpython/issues/127945 + "test_execution_engine__testBF16Memref_multi_threaded", + "test_execution_engine__testBasicCallback_multi_threaded", + "test_execution_engine__testComplexMemrefAdd_multi_threaded", + "test_execution_engine__testComplexUnrankedMemrefAdd_multi_threaded", + "test_execution_engine__testDynamicMemrefAdd2D_multi_threaded", + "test_execution_engine__testF16MemrefAdd_multi_threaded", + "test_execution_engine__testF8E5M2Memref_multi_threaded", + "test_execution_engine__testInvokeFloatAdd_multi_threaded", + "test_execution_engine__testInvokeVoid_multi_threaded", # a ctypes race + "test_execution_engine__testMemrefAdd_multi_threaded", + "test_execution_engine__testRankedMemRefCallback_multi_threaded", + "test_execution_engine__testRankedMemRefWithOffsetCallback_multi_threaded", + "test_execution_engine__testUnrankedMemRefCallback_multi_threaded", + "test_execution_engine__testUnrankedMemRefWithOffsetCallback_multi_threaded", + # dialects tests + "test_dialects_memref__testSubViewOpInferReturnTypeExtensiveSlicing_multi_threaded", # Related to ctypes data races + "test_dialects_transform_interpreter__print_other_multi_threaded", # Fatal Python error: Aborted or mlir::transform::PrintOp::apply(mlir::transform::TransformRewriter...) 
is not thread-safe + "test_dialects_gpu_module-to-binary-rocdl__testGPUToASMBin_multi_threaded", # Due to global llvm-project/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp::GCNTrackers variable mutation + "test_dialects_gpu_module-to-binary-nvvm__testGPUToASMBin_multi_threaded", + "test_dialects_gpu_module-to-binary-nvvm__testGPUToLLVMBin_multi_threaded", + "test_dialects_gpu_module-to-binary-rocdl__testGPUToLLVMBin_multi_threaded", + # integration tests + "test_integration_dialects_linalg_opsrun__test_elemwise_builtin_multi_threaded", # Related to ctypes data races + "test_integration_dialects_linalg_opsrun__test_elemwise_generic_multi_threaded", # Related to ctypes data races + "test_integration_dialects_linalg_opsrun__test_fill_builtin_multi_threaded", # ctypes + "test_integration_dialects_linalg_opsrun__test_fill_generic_multi_threaded", # ctypes + "test_integration_dialects_linalg_opsrun__test_fill_rng_builtin_multi_threaded", # ctypes + "test_integration_dialects_linalg_opsrun__test_fill_rng_generic_multi_threaded", # ctypes + "test_integration_dialects_linalg_opsrun__test_max_pooling_builtin_multi_threaded", # ctypes + "test_integration_dialects_linalg_opsrun__test_max_pooling_generic_multi_threaded", # ctypes + "test_integration_dialects_linalg_opsrun__test_min_pooling_builtin_multi_threaded", # ctypes + "test_integration_dialects_linalg_opsrun__test_min_pooling_generic_multi_threaded", # ctypes +] + + +def add_existing_tests(test_modules, test_prefix: str = "_original_test"): + def decorator(test_cls): + this_folder = Path(__file__).parent.absolute() + test_cls.output_folder = tempfile.TemporaryDirectory() + output_folder = Path(test_cls.output_folder.name) + + for test_mod_info in test_modules: + # test_mod_info is a tuple of size 2 or 3: + # (test_module_str, run_test_function) or (test_module_str, run_test_function, test_name_patterns_list) + # For example: + # - ("ir/value", run) or + # - ("dialects/transform_loop_ext", run_with_insertion_point, ["loopOutline"]) + assert isinstance(test_mod_info, tuple) and len(test_mod_info) in (2, 3) + if len(test_mod_info) == 2: + test_module_name, exec_fn = test_mod_info + test_pattern = None + else: + test_module_name, exec_fn, test_pattern = test_mod_info + + src_filepath = this_folder / f"{test_module_name}.py" + dst_filepath = (output_folder / f"{test_module_name}.py").absolute() + if not dst_filepath.parent.exists(): + dst_filepath.parent.mkdir(parents=True) + copy_and_update(src_filepath, dst_filepath) + test_mod = import_from_path(test_module_name, dst_filepath) + for attr_name in dir(test_mod): + is_test_fn = test_pattern is None and attr_name.startswith("test") + is_test_fn |= test_pattern is not None and any( + [p in attr_name for p in test_pattern] + ) + if is_test_fn: + obj = getattr(test_mod, attr_name) + if callable(obj): + test_name = f"{test_prefix}_{test_module_name.replace('/', '_')}__{attr_name}" + + def wrapped_test_fn( + self, *args, __test_fn__=obj, __exec_fn__=exec_fn, **kwargs + ): + __exec_fn__(__test_fn__) + + setattr(test_cls, test_name, wrapped_test_fn) + return test_cls + + return decorator + + +@contextmanager +def _capture_output(fp): + # Inspired from jax test_utils.py capture_stderr method + # ``None`` means nothing has not been captured yet. 
+ captured = None + + def get_output() -> str: + if captured is None: + raise ValueError("get_output() called while the context is active.") + return captured + + with tempfile.NamedTemporaryFile(mode="w+", encoding="utf-8") as f: + original_fd = os.dup(fp.fileno()) + os.dup2(f.fileno(), fp.fileno()) + try: + yield get_output + finally: + # Python also has its own buffers, make sure everything is flushed. + fp.flush() + os.fsync(fp.fileno()) + f.seek(0) + captured = f.read() + os.dup2(original_fd, fp.fileno()) + + +capture_stdout = partial(_capture_output, sys.stdout) +capture_stderr = partial(_capture_output, sys.stderr) + + +def multi_threaded( + num_workers: int, + num_runs: int = 5, + skip_tests: Optional[list[str]] = None, + xfail_tests: Optional[list[str]] = None, + test_prefix: str = "_original_test", + multithreaded_test_postfix: str = "_multi_threaded", +): + """Decorator that runs a test in a multi-threaded environment.""" + + def decorator(test_cls): + for name, test_fn in test_cls.__dict__.copy().items(): + if not (name.startswith(test_prefix) and callable(test_fn)): + continue + + name = f"test{name[len(test_prefix):]}" + if skip_tests is not None: + if any( + test_name.replace(multithreaded_test_postfix, "") in name + for test_name in skip_tests + ): + continue + + def multi_threaded_test_fn(self, *args, __test_fn__=test_fn, **kwargs): + with capture_stdout(), capture_stderr() as get_output: + barrier = threading.Barrier(num_workers) + + def closure(): + barrier.wait() + for _ in range(num_runs): + __test_fn__(self, *args, **kwargs) + + with concurrent.futures.ThreadPoolExecutor( + max_workers=num_workers + ) as executor: + futures = [] + for _ in range(num_workers): + futures.append(executor.submit(closure)) + # We should call future.result() to re-raise an exception if test has + # failed + assert len(list(f.result() for f in futures)) == num_workers + + gc.collect() + assert Context._get_live_count() == 0 + + captured = get_output() + if len(captured) > 0 and "ThreadSanitizer" in captured: + raise RuntimeError( + f"ThreadSanitizer reported warnings:\n{captured}" + ) + + test_new_name = f"{name}{multithreaded_test_postfix}" + if xfail_tests is not None and test_new_name in xfail_tests: + multi_threaded_test_fn = unittest.expectedFailure( + multi_threaded_test_fn + ) + + setattr(test_cls, test_new_name, multi_threaded_test_fn) + + return test_cls + + return decorator + + +@multi_threaded( + num_workers=10, + num_runs=20, + skip_tests=TESTS_TO_SKIP, + xfail_tests=TESTS_TO_XFAIL, +) +@add_existing_tests(test_modules=TEST_MODULES, test_prefix="_original_test") +class TestAllMultiThreaded(unittest.TestCase): + @classmethod + def tearDownClass(cls): + if hasattr(cls, "output_folder"): + cls.output_folder.cleanup() + + def _original_test_create_context(self): + with Context() as ctx: + print(ctx._get_live_count()) + print(ctx._get_live_module_count()) + print(ctx._get_live_operation_count()) + print(ctx._get_live_operation_objects()) + print(ctx._get_context_again() is ctx) + print(ctx._clear_live_operations()) + + def _original_test_create_module_with_consts(self): + py_values = [123, 234, 345] + with Context() as ctx: + module = Module.create(loc=Location.file("foo.txt", 0, 0)) + + dtype = IntegerType.get_signless(64) + with InsertionPoint(module.body), Location.name("a"): + arith.constant(dtype, py_values[0]) + + with InsertionPoint(module.body), Location.name("b"): + arith.constant(dtype, py_values[1]) + + with InsertionPoint(module.body), Location.name("c"): + 
arith.constant(dtype, py_values[2])
+
+
+if __name__ == "__main__":
+    # Do not run the tests on CPython with GIL
+    if hasattr(sys, "_is_gil_enabled") and not sys._is_gil_enabled():
+        unittest.main()

From 1d2eea962ac9724350f025f4c70808d42a435289 Mon Sep 17 00:00:00 2001
From: CHANDRA GHALE
Date: Sun, 12 Jan 2025 23:38:00 +0530
Subject: [PATCH 207/408] [OpenMP] codegen support for masked combined
 construct masked taskloop simd (#121916)

Added codegen support for the combined masked construct `masked taskloop
simd`.
Added implementation for `EmitOMPMaskedTaskLoopSimdDirective`.

Co-authored-by: Chandra Ghale
---
 clang/lib/CodeGen/CGStmt.cpp                  |  3 +-
 clang/lib/CodeGen/CGStmtOpenMP.cpp            | 12 +++++
 clang/lib/CodeGen/CodeGenFunction.h           |  2 +
 .../OpenMP/masked_taskloop_simd_codegen.c     | 49 +++++++++++++++++++
 4 files changed, 65 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/OpenMP/masked_taskloop_simd_codegen.c

diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index c87ec899798aa..ee10e586d9250 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -339,7 +339,8 @@ void CodeGenFunction::EmitStmt(const Stmt *S, ArrayRef<const Attr *> Attrs) {
         cast<OMPMaskedTaskLoopDirective>(*S));
     break;
   case Stmt::OMPMaskedTaskLoopSimdDirectiveClass:
-    llvm_unreachable("masked taskloop simd directive not supported yet.");
+    EmitOMPMaskedTaskLoopSimdDirective(
+        cast<OMPMaskedTaskLoopSimdDirective>(*S));
     break;
   case Stmt::OMPParallelMasterTaskLoopDirectiveClass:
     EmitOMPParallelMasterTaskLoopDirective(
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index 4e7b0dcd31ecf..94daf059edba0 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -7994,6 +7994,18 @@ void CodeGenFunction::EmitOMPMasterTaskLoopSimdDirective(
   CGM.getOpenMPRuntime().emitMasterRegion(*this, CodeGen, S.getBeginLoc());
 }
 
+void CodeGenFunction::EmitOMPMaskedTaskLoopSimdDirective(
+    const OMPMaskedTaskLoopSimdDirective &S) {
+  auto &&CodeGen = [this, &S](CodeGenFunction &CGF, PrePostActionTy &Action) {
+    Action.Enter(CGF);
+    EmitOMPTaskLoopBasedDirective(S);
+  };
+  auto LPCRegion =
+      CGOpenMPRuntime::LastprivateConditionalRAII::disable(*this, S);
+  OMPLexicalScope Scope(*this, S);
+  CGM.getOpenMPRuntime().emitMaskedRegion(*this, CodeGen, S.getBeginLoc());
+}
+
 void CodeGenFunction::EmitOMPParallelMasterTaskLoopDirective(
     const OMPParallelMasterTaskLoopDirective &S) {
   auto &&CodeGen = [this, &S](CodeGenFunction &CGF, PrePostActionTy &Action) {
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 23ddc869277a8..86328db345508 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -3872,6 +3872,8 @@ class CodeGenFunction : public CodeGenTypeCache {
   void EmitOMPMasterTaskLoopDirective(const OMPMasterTaskLoopDirective &S);
   void EmitOMPMasterTaskLoopSimdDirective(const OMPMasterTaskLoopSimdDirective &S);
+  void
+  EmitOMPMaskedTaskLoopSimdDirective(const OMPMaskedTaskLoopSimdDirective &S);
   void EmitOMPParallelMasterTaskLoopDirective(
       const OMPParallelMasterTaskLoopDirective &S);
   void EmitOMPParallelMaskedTaskLoopDirective(
diff --git a/clang/test/OpenMP/masked_taskloop_simd_codegen.c b/clang/test/OpenMP/masked_taskloop_simd_codegen.c
new file mode 100644
index 0000000000000..f786bc582beb2
--- /dev/null
+++ b/clang/test/OpenMP/masked_taskloop_simd_codegen.c
@@ -0,0 +1,49 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --prefix-filecheck-ir-name _ --version 5
+// 
RUN: %clang_cc1 -triple x86_64-unknown-unknown -fopenmp -fopenmp-version=52 -x c -emit-llvm %s -o - | FileCheck %s +// expected-no-diagnostics +#define N 100 +void masked_taskloop_simd(){ + #pragma omp masked taskloop simd + for( int i = 0; i < N; i++) + ; + +} + +int main() +{ + masked_taskloop_simd(); +} +// CHECK-LABEL: define dso_local void @masked_taskloop_simd( +// CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 1 +// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]]) +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_masked(ptr @[[GLOB1]], i32 [[TMP0]], i32 0) +// CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[TMP2]], label %[[OMP_IF_THEN:.*]], label %[[OMP_IF_END:.*]] +// CHECK: [[OMP_IF_THEN]]: +// CHECK-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) +// CHECK-NEXT: [[TMP3:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i64 80, i64 0, ptr @.omp_task_entry.) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP3]], i32 0, i32 0 +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP4]], i32 0, i32 5 +// CHECK-NEXT: store i64 0, ptr [[TMP5]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 6 +// CHECK-NEXT: store i64 99, ptr [[TMP6]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 7 +// CHECK-NEXT: store i64 1, ptr [[TMP7]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 9 +// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP8]], i8 0, i64 8, i1 false) +// CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 +// CHECK-NEXT: call void @__kmpc_taskloop(ptr @[[GLOB1]], i32 [[TMP0]], ptr [[TMP3]], i32 1, ptr [[TMP5]], ptr [[TMP6]], i64 [[TMP9]], i32 1, i32 0, i64 0, ptr null) +// CHECK-NEXT: call void @__kmpc_end_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) +// CHECK-NEXT: call void @__kmpc_end_masked(ptr @[[GLOB1]], i32 [[TMP0]]) +// CHECK-NEXT: br label %[[OMP_IF_END]] +// CHECK: [[OMP_IF_END]]: +// CHECK-NEXT: ret void +// +// CHECK-LABEL: define dso_local i32 @main( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @masked_taskloop_simd() +// CHECK-NEXT: ret i32 0 From 3f1486f08e0dd64136fb7f50e38cd618dd0255d2 Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Sun, 12 Jan 2025 18:30:42 +0000 Subject: [PATCH 208/408] Revert "Added free-threading CPython mode support in MLIR Python bindings (#107103)" Breaks on 3.8, rolling back to avoid breakage while fixing. This reverts commit 9dee7c44491635ec9037b90050bcdbd3d5291e38. 
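One plausible culprit, stated here as an assumption since the commit does not
name the failing construct: the test file deleted below annotates parameters
with built-in generics such as `Optional[list[str]]`, and `list[str]` is only
subscriptable from Python 3.9 (PEP 585), so merely importing such a module
fails on 3.8. A minimal sketch:

```python
from typing import Optional


# Python 3.8 evaluates annotations at function-definition time, and
# list[str] is not subscriptable until Python 3.9, so importing this
# module raises: TypeError: 'type' object is not subscriptable
def collect(skip_tests: Optional[list[str]] = None) -> None:
    pass
```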
--- mlir/cmake/modules/AddMLIRPython.cmake | 21 +- mlir/docs/Bindings/Python.md | 40 -- mlir/lib/Bindings/Python/Globals.h | 12 +- mlir/lib/Bindings/Python/IRCore.cpp | 31 +- mlir/lib/Bindings/Python/IRModule.cpp | 18 +- mlir/lib/Bindings/Python/IRModule.h | 1 - mlir/lib/Bindings/Python/MainModule.cpp | 9 +- mlir/python/requirements.txt | 2 +- mlir/test/python/multithreaded_tests.py | 531 ------------------------ 9 files changed, 16 insertions(+), 649 deletions(-) delete mode 100644 mlir/test/python/multithreaded_tests.py diff --git a/mlir/cmake/modules/AddMLIRPython.cmake b/mlir/cmake/modules/AddMLIRPython.cmake index 0679db9cf93e1..717a503468a85 100644 --- a/mlir/cmake/modules/AddMLIRPython.cmake +++ b/mlir/cmake/modules/AddMLIRPython.cmake @@ -668,31 +668,12 @@ function(add_mlir_python_extension libname extname) elseif(ARG_PYTHON_BINDINGS_LIBRARY STREQUAL "nanobind") nanobind_add_module(${libname} NB_DOMAIN mlir - FREE_THREADED ${ARG_SOURCES} ) if (LLVM_COMPILER_IS_GCC_COMPATIBLE OR CLANG_CL) # Avoids warnings from upstream nanobind. - set(nanobind_target "nanobind-static") - if (NOT TARGET ${nanobind_target}) - # Get correct nanobind target name: nanobind-static-ft or something else - # It is set by nanobind_add_module function according to the passed options - get_property(all_targets DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY BUILDSYSTEM_TARGETS) - - # Iterate over the list of targets - foreach(target ${all_targets}) - # Check if the target name matches the given string - if("${target}" MATCHES "nanobind-") - set(nanobind_target "${target}") - endif() - endforeach() - - if (NOT TARGET ${nanobind_target}) - message(FATAL_ERROR "Could not find nanobind target to set compile options to") - endif() - endif() - target_compile_options(${nanobind_target} + target_compile_options(nanobind-static PRIVATE -Wno-cast-qual -Wno-zero-length-array diff --git a/mlir/docs/Bindings/Python.md b/mlir/docs/Bindings/Python.md index b8bd0f507a510..32df3310d811d 100644 --- a/mlir/docs/Bindings/Python.md +++ b/mlir/docs/Bindings/Python.md @@ -1187,43 +1187,3 @@ or nanobind and utilities to connect to the rest of Python API. The bindings can be located in a separate module or in the same module as attributes and types, and loaded along with the dialect. - -## Free-threading (No-GIL) support - -Free-threading or no-GIL support refers to CPython interpreter (>=3.13) with Global Interpreter Lock made optional. For details on the topic, please check [PEP-703](https://peps.python.org/pep-0703/) and this [Python free-threading guide](https://py-free-threading.github.io/). - -MLIR Python bindings are free-threading compatible with exceptions (discussed below) in the following sense: it is safe to work in multiple threads with **independent** contexts. 
Below we show an example code of safe usage: - -```python -# python3.13t example.py -import concurrent.futures - -import mlir.dialects.arith as arith -from mlir.ir import Context, Location, Module, IntegerType, InsertionPoint - - -def func(py_value): - with Context() as ctx: - module = Module.create(loc=Location.file("foo.txt", 0, 0)) - - dtype = IntegerType.get_signless(64) - with InsertionPoint(module.body), Location.name("a"): - arith.constant(dtype, py_value) - - return module - - -num_workers = 8 -with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor: - futures = [] - for i in range(num_workers): - futures.append(executor.submit(func, i)) - assert len(list(f.result() for f in futures)) == num_workers -``` - -The exceptions to the free-threading compatibility: -- IR printing is unsafe, e.g. when using `PassManager` with `PassManager.enable_ir_printing()` which calls thread-unsafe `llvm::raw_ostream`. -- Usage of `Location.emit_error` is unsafe (due to thread-unsafe `llvm::raw_ostream`). -- Usage of `Module.dump` is unsafe (due to thread-unsafe `llvm::raw_ostream`). -- Usage of `mlir.dialects.transform.interpreter` is unsafe. -- Usage of `mlir.dialects.gpu` and `gpu-module-to-binary` is unsafe. \ No newline at end of file diff --git a/mlir/lib/Bindings/Python/Globals.h b/mlir/lib/Bindings/Python/Globals.h index 826a34a535176..0ec522d14f74b 100644 --- a/mlir/lib/Bindings/Python/Globals.h +++ b/mlir/lib/Bindings/Python/Globals.h @@ -24,7 +24,6 @@ namespace mlir { namespace python { /// Globals that are always accessible once the extension has been initialized. -/// Methods of this class are thread-safe. class PyGlobals { public: PyGlobals(); @@ -38,18 +37,12 @@ class PyGlobals { /// Get and set the list of parent modules to search for dialect /// implementation classes. - std::vector getDialectSearchPrefixes() { - nanobind::ft_lock_guard lock(mutex); + std::vector &getDialectSearchPrefixes() { return dialectSearchPrefixes; } void setDialectSearchPrefixes(std::vector newValues) { - nanobind::ft_lock_guard lock(mutex); dialectSearchPrefixes.swap(newValues); } - void addDialectSearchPrefix(std::string value) { - nanobind::ft_lock_guard lock(mutex); - dialectSearchPrefixes.push_back(std::move(value)); - } /// Loads a python module corresponding to the given dialect namespace. /// No-ops if the module has already been loaded or is not found. Raises @@ -116,9 +109,6 @@ class PyGlobals { private: static PyGlobals *instance; - - nanobind::ft_mutex mutex; - /// Module name prefixes to search under for dialect implementation modules. std::vector dialectSearchPrefixes; /// Map of dialect namespace to external dialect class object. diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp index 463ebdebb3f3f..453d4f7c7e8bc 100644 --- a/mlir/lib/Bindings/Python/IRCore.cpp +++ b/mlir/lib/Bindings/Python/IRCore.cpp @@ -243,15 +243,9 @@ static MlirBlock createBlock(const nb::sequence &pyArgTypes, /// Wrapper for the global LLVM debugging flag. struct PyGlobalDebugFlag { - static void set(nb::object &o, bool enable) { - nb::ft_lock_guard lock(mutex); - mlirEnableGlobalDebug(enable); - } + static void set(nb::object &o, bool enable) { mlirEnableGlobalDebug(enable); } - static bool get(const nb::object &) { - nb::ft_lock_guard lock(mutex); - return mlirIsGlobalDebugEnabled(); - } + static bool get(const nb::object &) { return mlirIsGlobalDebugEnabled(); } static void bind(nb::module_ &m) { // Debug flags. 
@@ -261,7 +255,6 @@ struct PyGlobalDebugFlag { .def_static( "set_types", [](const std::string &type) { - nb::ft_lock_guard lock(mutex); mlirSetGlobalDebugType(type.c_str()); }, "types"_a, "Sets specific debug types to be produced by LLVM") @@ -270,17 +263,11 @@ struct PyGlobalDebugFlag { pointers.reserve(types.size()); for (const std::string &str : types) pointers.push_back(str.c_str()); - nb::ft_lock_guard lock(mutex); mlirSetGlobalDebugTypes(pointers.data(), pointers.size()); }); } - -private: - static nb::ft_mutex mutex; }; -nb::ft_mutex PyGlobalDebugFlag::mutex; - struct PyAttrBuilderMap { static bool dunderContains(const std::string &attributeKind) { return PyGlobals::get().lookupAttributeBuilder(attributeKind).has_value(); @@ -619,7 +606,6 @@ class PyOpOperandIterator { PyMlirContext::PyMlirContext(MlirContext context) : context(context) { nb::gil_scoped_acquire acquire; - nb::ft_lock_guard lock(live_contexts_mutex); auto &liveContexts = getLiveContexts(); liveContexts[context.ptr] = this; } @@ -629,10 +615,7 @@ PyMlirContext::~PyMlirContext() { // forContext method, which always puts the associated handle into // liveContexts. nb::gil_scoped_acquire acquire; - { - nb::ft_lock_guard lock(live_contexts_mutex); - getLiveContexts().erase(context.ptr); - } + getLiveContexts().erase(context.ptr); mlirContextDestroy(context); } @@ -649,7 +632,6 @@ nb::object PyMlirContext::createFromCapsule(nb::object capsule) { PyMlirContextRef PyMlirContext::forContext(MlirContext context) { nb::gil_scoped_acquire acquire; - nb::ft_lock_guard lock(live_contexts_mutex); auto &liveContexts = getLiveContexts(); auto it = liveContexts.find(context.ptr); if (it == liveContexts.end()) { @@ -665,17 +647,12 @@ PyMlirContextRef PyMlirContext::forContext(MlirContext context) { return PyMlirContextRef(it->second, std::move(pyRef)); } -nb::ft_mutex PyMlirContext::live_contexts_mutex; - PyMlirContext::LiveContextMap &PyMlirContext::getLiveContexts() { static LiveContextMap liveContexts; return liveContexts; } -size_t PyMlirContext::getLiveCount() { - nb::ft_lock_guard lock(live_contexts_mutex); - return getLiveContexts().size(); -} +size_t PyMlirContext::getLiveCount() { return getLiveContexts().size(); } size_t PyMlirContext::getLiveOperationCount() { return liveOperations.size(); } diff --git a/mlir/lib/Bindings/Python/IRModule.cpp b/mlir/lib/Bindings/Python/IRModule.cpp index e600f1bbd4493..f7bf77e5a7e04 100644 --- a/mlir/lib/Bindings/Python/IRModule.cpp +++ b/mlir/lib/Bindings/Python/IRModule.cpp @@ -38,11 +38,8 @@ PyGlobals::PyGlobals() { PyGlobals::~PyGlobals() { instance = nullptr; } bool PyGlobals::loadDialectModule(llvm::StringRef dialectNamespace) { - { - nb::ft_lock_guard lock(mutex); - if (loadedDialectModules.contains(dialectNamespace)) - return true; - } + if (loadedDialectModules.contains(dialectNamespace)) + return true; // Since re-entrancy is possible, make a copy of the search prefixes. std::vector localSearchPrefixes = dialectSearchPrefixes; nb::object loaded = nb::none(); @@ -65,14 +62,12 @@ bool PyGlobals::loadDialectModule(llvm::StringRef dialectNamespace) { return false; // Note: Iterator cannot be shared from prior to loading, since re-entrancy // may have occurred, which may do anything. 
- nb::ft_lock_guard lock(mutex); loadedDialectModules.insert(dialectNamespace); return true; } void PyGlobals::registerAttributeBuilder(const std::string &attributeKind, nb::callable pyFunc, bool replace) { - nb::ft_lock_guard lock(mutex); nb::object &found = attributeBuilderMap[attributeKind]; if (found && !replace) { throw std::runtime_error((llvm::Twine("Attribute builder for '") + @@ -86,7 +81,6 @@ void PyGlobals::registerAttributeBuilder(const std::string &attributeKind, void PyGlobals::registerTypeCaster(MlirTypeID mlirTypeID, nb::callable typeCaster, bool replace) { - nb::ft_lock_guard lock(mutex); nb::object &found = typeCasterMap[mlirTypeID]; if (found && !replace) throw std::runtime_error("Type caster is already registered with caster: " + @@ -96,7 +90,6 @@ void PyGlobals::registerTypeCaster(MlirTypeID mlirTypeID, void PyGlobals::registerValueCaster(MlirTypeID mlirTypeID, nb::callable valueCaster, bool replace) { - nb::ft_lock_guard lock(mutex); nb::object &found = valueCasterMap[mlirTypeID]; if (found && !replace) throw std::runtime_error("Value caster is already registered: " + @@ -106,7 +99,6 @@ void PyGlobals::registerValueCaster(MlirTypeID mlirTypeID, void PyGlobals::registerDialectImpl(const std::string &dialectNamespace, nb::object pyClass) { - nb::ft_lock_guard lock(mutex); nb::object &found = dialectClassMap[dialectNamespace]; if (found) { throw std::runtime_error((llvm::Twine("Dialect namespace '") + @@ -118,7 +110,6 @@ void PyGlobals::registerDialectImpl(const std::string &dialectNamespace, void PyGlobals::registerOperationImpl(const std::string &operationName, nb::object pyClass, bool replace) { - nb::ft_lock_guard lock(mutex); nb::object &found = operationClassMap[operationName]; if (found && !replace) { throw std::runtime_error((llvm::Twine("Operation '") + operationName + @@ -130,7 +121,6 @@ void PyGlobals::registerOperationImpl(const std::string &operationName, std::optional PyGlobals::lookupAttributeBuilder(const std::string &attributeKind) { - nb::ft_lock_guard lock(mutex); const auto foundIt = attributeBuilderMap.find(attributeKind); if (foundIt != attributeBuilderMap.end()) { assert(foundIt->second && "attribute builder is defined"); @@ -143,7 +133,6 @@ std::optional PyGlobals::lookupTypeCaster(MlirTypeID mlirTypeID, MlirDialect dialect) { // Try to load dialect module. (void)loadDialectModule(unwrap(mlirDialectGetNamespace(dialect))); - nb::ft_lock_guard lock(mutex); const auto foundIt = typeCasterMap.find(mlirTypeID); if (foundIt != typeCasterMap.end()) { assert(foundIt->second && "type caster is defined"); @@ -156,7 +145,6 @@ std::optional PyGlobals::lookupValueCaster(MlirTypeID mlirTypeID, MlirDialect dialect) { // Try to load dialect module. (void)loadDialectModule(unwrap(mlirDialectGetNamespace(dialect))); - nb::ft_lock_guard lock(mutex); const auto foundIt = valueCasterMap.find(mlirTypeID); if (foundIt != valueCasterMap.end()) { assert(foundIt->second && "value caster is defined"); @@ -170,7 +158,6 @@ PyGlobals::lookupDialectClass(const std::string &dialectNamespace) { // Make sure dialect module is loaded. 
if (!loadDialectModule(dialectNamespace)) return std::nullopt; - nb::ft_lock_guard lock(mutex); const auto foundIt = dialectClassMap.find(dialectNamespace); if (foundIt != dialectClassMap.end()) { assert(foundIt->second && "dialect class is defined"); @@ -188,7 +175,6 @@ PyGlobals::lookupOperationClass(llvm::StringRef operationName) { if (!loadDialectModule(dialectNamespace)) return std::nullopt; - nb::ft_lock_guard lock(mutex); auto foundIt = operationClassMap.find(operationName); if (foundIt != operationClassMap.end()) { assert(foundIt->second && "OpView is defined"); diff --git a/mlir/lib/Bindings/Python/IRModule.h b/mlir/lib/Bindings/Python/IRModule.h index f5fbb6c61b57e..8fb32a225e65f 100644 --- a/mlir/lib/Bindings/Python/IRModule.h +++ b/mlir/lib/Bindings/Python/IRModule.h @@ -260,7 +260,6 @@ class PyMlirContext { // Note that this holds a handle, which does not imply ownership. // Mappings will be removed when the context is destructed. using LiveContextMap = llvm::DenseMap; - static nanobind::ft_mutex live_contexts_mutex; static LiveContextMap &getLiveContexts(); // Interns all live modules associated with this context. Modules tracked diff --git a/mlir/lib/Bindings/Python/MainModule.cpp b/mlir/lib/Bindings/Python/MainModule.cpp index 6f49431006605..7c4064262012e 100644 --- a/mlir/lib/Bindings/Python/MainModule.cpp +++ b/mlir/lib/Bindings/Python/MainModule.cpp @@ -30,8 +30,12 @@ NB_MODULE(_mlir, m) { .def_prop_rw("dialect_search_modules", &PyGlobals::getDialectSearchPrefixes, &PyGlobals::setDialectSearchPrefixes) - .def("append_dialect_search_prefix", &PyGlobals::addDialectSearchPrefix, - "module_name"_a) + .def( + "append_dialect_search_prefix", + [](PyGlobals &self, std::string moduleName) { + self.getDialectSearchPrefixes().push_back(std::move(moduleName)); + }, + "module_name"_a) .def( "_check_dialect_module_loaded", [](PyGlobals &self, const std::string &dialectNamespace) { @@ -72,6 +76,7 @@ NB_MODULE(_mlir, m) { nanobind::cast(opClass.attr("OPERATION_NAME")); PyGlobals::get().registerOperationImpl(operationName, opClass, replace); + // Dict-stuff the new opClass by name onto the dialect class. nb::object opClassName = opClass.attr("__name__"); dialectClass.attr(opClassName) = opClass; diff --git a/mlir/python/requirements.txt b/mlir/python/requirements.txt index 259e679f510f7..f240d6ef944ec 100644 --- a/mlir/python/requirements.txt +++ b/mlir/python/requirements.txt @@ -2,4 +2,4 @@ nanobind>=2.4, <3.0 numpy>=1.19.5, <=2.1.2 pybind11>=2.10.0, <=2.13.6 PyYAML>=5.4.0, <=6.0.1 -ml_dtypes>=0.5.0, <=0.6.0 # provides several NumPy dtype extensions, including the bf16 +ml_dtypes>=0.1.0, <=0.5.0 # provides several NumPy dtype extensions, including the bf16 diff --git a/mlir/test/python/multithreaded_tests.py b/mlir/test/python/multithreaded_tests.py deleted file mode 100644 index 2df75e2e1b90c..0000000000000 --- a/mlir/test/python/multithreaded_tests.py +++ /dev/null @@ -1,531 +0,0 @@ -# RUN: %PYTHON %s -""" -This script generates multi-threaded tests to check free-threading mode using CPython compiled with TSAN. -Tests can be run using pytest: -```bash -python3.13t -mpytest -vvv multithreaded_tests.py -``` - -IMPORTANT. Running tests are not checking the correctness, but just the execution of the tests in multi-threaded context -and passing if no warnings reported by TSAN and failing otherwise. 
- - -Details on the generated tests and execution: -1) Multi-threaded execution: all generated tests are executed independently by -a pool of threads, running each test multiple times, see @multi_threaded for details - -2) Tests generation: we use existing tests: test/python/ir/*.py, -test/python/dialects/*.py, etc to generate multi-threaded tests. -In details, we perform the following: -a) we define a list of source tests to be used to generate multi-threaded tests, see `TEST_MODULES`. -b) we define `TestAllMultiThreaded` class and add existing tests to the class. See `add_existing_tests` method. -c) for each test file, we copy and modify it: test/python/ir/affine_expr.py -> /tmp/ir/affine_expr.py. -In order to import the test file as python module, we remove all executing functions, like -`@run` or `run(testMethod)`. See `copy_and_update` and `add_existing_tests` methods for details. - - -Observed warnings reported by TSAN. - -CPython and free-threading known data-races: -1) ctypes related races: https://github.com/python/cpython/issues/127945 -2) LLVM related data-races, llvm::raw_ostream is not thread-safe -- mlir pass manager -- dialects/transform_interpreter.py -- ir/diagnostic_handler.py -- ir/module.py -3) Dialect gpu module-to-binary method is unsafe -""" -import concurrent.futures -import gc -import importlib.util -import os -import sys -import threading -import tempfile -import unittest - -from contextlib import contextmanager -from functools import partial -from pathlib import Path -from typing import Optional - -import mlir.dialects.arith as arith -from mlir.dialects import transform -from mlir.ir import Context, Location, Module, IntegerType, InsertionPoint - - -def import_from_path(module_name: str, file_path: Path): - spec = importlib.util.spec_from_file_location(module_name, file_path) - module = importlib.util.module_from_spec(spec) - sys.modules[module_name] = module - spec.loader.exec_module(module) - return module - - -def copy_and_update(src_filepath: Path, dst_filepath: Path): - # We should remove all calls like `run(testMethod)` - with open(src_filepath, "r") as reader, open(dst_filepath, "w") as writer: - while True: - src_line = reader.readline() - if len(src_line) == 0: - break - skip_lines = [ - "run(", - "@run", - "@constructAndPrintInModule", - "run_apply_patterns(", - "@run_apply_patterns", - "@test_in_context", - "@construct_and_print_in_module", - ] - if any(src_line.startswith(line) for line in skip_lines): - continue - writer.write(src_line) - - -# Helper run functions -# They are copied from the test modules (e.g. 
run function in execution_engine.py) -def run(test_function): - # Generic run tests function used by dialects and ir test modules - test_function() - - -def run_with_context_and_location(test_function): - # run tests function with a context and a location - # used by the following test modules: - # - dialects/transform_gpu_ext, - # - dialects/vector - # - dialects/gpu/* - with Context(), Location.unknown(): - test_function() - return test_function - - -def run_with_insertion_point_and_context_arg(test_function): - # run tests function used by dialects/index_dialect test module - with Context() as ctx, Location.unknown(): - module = Module.create() - with InsertionPoint(module.body): - test_function(ctx) - - -def run_with_insertion_point(test_function): - # Used by a lot of dialects test modules - with Context(), Location.unknown(): - module = Module.create() - with InsertionPoint(module.body): - test_function() - return test_function - - -def run_with_insertion_point_and_module_arg(test_function): - # Used by dialects/transform test module - with Context(), Location.unknown(): - module = Module.create() - with InsertionPoint(module.body): - test_function(module) - return test_function - - -def run_with_insertion_point_all_unreg_dialects(test_function): - # Used by dialects/cf test module - with Context() as ctx, Location.unknown(): - ctx.allow_unregistered_dialects = True - module = Module.create() - with InsertionPoint(module.body): - test_function() - return test_function - - -def run_apply_patterns(test_function): - # Used by dialects/transform_tensor_ext test module - with Context(), Location.unknown(): - module = Module.create() - with InsertionPoint(module.body): - sequence = transform.SequenceOp( - transform.FailurePropagationMode.Propagate, - [], - transform.AnyOpType.get(), - ) - with InsertionPoint(sequence.body): - apply = transform.ApplyPatternsOp(sequence.bodyTarget) - with InsertionPoint(apply.patterns): - test_function() - transform.YieldOp() - print(module) - return test_function - - -def run_transform_tensor_ext(test_function): - # Used by test modules: - # - dialects/transform_gpu_ext - # - dialects/transform_sparse_tensor_ext - # - dialects/transform_tensor_ext - with Context(), Location.unknown(): - module = Module.create() - with InsertionPoint(module.body): - sequence = transform.SequenceOp( - transform.FailurePropagationMode.Propagate, - [], - transform.AnyOpType.get(), - ) - with InsertionPoint(sequence.body): - test_function(sequence.bodyTarget) - transform.YieldOp() - print(module) - return test_function - - -def run_transform_structured_ext(test_function): - # Used by dialects/transform_structured_ext test module - with Context(), Location.unknown(): - module = Module.create() - with InsertionPoint(module.body): - test_function() - module.operation.verify() - print(module) - return test_function - - -def run_construct_and_print_in_module(test_function): - # Used by test modules: - # - integration/dialects/pdl - # - integration/dialects/transform - with Context(), Location.unknown(): - module = Module.create() - with InsertionPoint(module.body): - module = test_function(module) - if module is not None: - print(module) - return test_function - - -TEST_MODULES = [ - ("execution_engine", run), - ("pass_manager", run), - ("dialects/affine", run_with_insertion_point), - ("dialects/func", run_with_insertion_point), - ("dialects/arith_dialect", run), - ("dialects/arith_llvm", run), - ("dialects/async_dialect", run), - ("dialects/builtin", run), - ("dialects/cf", 
run_with_insertion_point_all_unreg_dialects), - ("dialects/complex_dialect", run), - ("dialects/func", run_with_insertion_point), - ("dialects/index_dialect", run_with_insertion_point_and_context_arg), - ("dialects/llvm", run_with_insertion_point), - ("dialects/math_dialect", run), - ("dialects/memref", run), - ("dialects/ml_program", run_with_insertion_point), - ("dialects/nvgpu", run_with_insertion_point), - ("dialects/nvvm", run_with_insertion_point), - ("dialects/ods_helpers", run), - ("dialects/openmp_ops", run_with_insertion_point), - ("dialects/pdl_ops", run_with_insertion_point), - # ("dialects/python_test", run), # TODO: Need to pass pybind11 or nanobind argv - ("dialects/quant", run), - ("dialects/rocdl", run_with_insertion_point), - ("dialects/scf", run_with_insertion_point), - ("dialects/shape", run), - ("dialects/spirv_dialect", run), - ("dialects/tensor", run), - # ("dialects/tosa", ), # Nothing to test - ("dialects/transform_bufferization_ext", run_with_insertion_point), - # ("dialects/transform_extras", ), # Needs a more complicated execution schema - ("dialects/transform_gpu_ext", run_transform_tensor_ext), - ( - "dialects/transform_interpreter", - run_with_context_and_location, - ["print_", "transform_options", "failed", "include"], - ), - ( - "dialects/transform_loop_ext", - run_with_insertion_point, - ["loopOutline"], - ), - ("dialects/transform_memref_ext", run_with_insertion_point), - ("dialects/transform_nvgpu_ext", run_with_insertion_point), - ("dialects/transform_sparse_tensor_ext", run_transform_tensor_ext), - ("dialects/transform_structured_ext", run_transform_structured_ext), - ("dialects/transform_tensor_ext", run_transform_tensor_ext), - ( - "dialects/transform_vector_ext", - run_apply_patterns, - ["configurable_patterns"], - ), - ("dialects/transform", run_with_insertion_point_and_module_arg), - ("dialects/vector", run_with_context_and_location), - ("dialects/gpu/dialect", run_with_context_and_location), - ("dialects/gpu/module-to-binary-nvvm", run_with_context_and_location), - ("dialects/gpu/module-to-binary-rocdl", run_with_context_and_location), - ("dialects/linalg/ops", run), - # TO ADD: No proper tests in this dialects/linalg/opsdsl/* - # ("dialects/linalg/opsdsl/*", ...), - ("dialects/sparse_tensor/dialect", run), - ("dialects/sparse_tensor/passes", run), - ("integration/dialects/pdl", run_construct_and_print_in_module), - ("integration/dialects/transform", run_construct_and_print_in_module), - ("integration/dialects/linalg/opsrun", run), - ("ir/affine_expr", run), - ("ir/affine_map", run), - ("ir/array_attributes", run), - ("ir/attributes", run), - ("ir/blocks", run), - ("ir/builtin_types", run), - ("ir/context_managers", run), - ("ir/debug", run), - ("ir/diagnostic_handler", run), - ("ir/dialects", run), - ("ir/exception", run), - ("ir/insertion_point", run), - ("ir/integer_set", run), - ("ir/location", run), - ("ir/module", run), - ("ir/operation", run), - ("ir/symbol_table", run), - ("ir/value", run), -] - -TESTS_TO_SKIP = [ - "test_execution_engine__testNanoTime_multi_threaded", # testNanoTime can't run in multiple threads, even with GIL - "test_execution_engine__testSharedLibLoad_multi_threaded", # testSharedLibLoad can't run in multiple threads, even with GIL - "test_dialects_arith_dialect__testArithValue_multi_threaded", # RuntimeError: Value caster is already registered: .ArithValue'>, even with GIL - "test_ir_dialects__testAppendPrefixSearchPath_multi_threaded", # PyGlobals::setDialectSearchPrefixes is not thread-safe, even with GIL. 
Strange usage of static PyGlobals vs python exposed _cext.globals - "test_ir_value__testValueCasters_multi_threaded", # RuntimeError: Value caster is already registered: .dont_cast_int, even with GIL - # tests indirectly calling thread-unsafe llvm::raw_ostream - "test_execution_engine__testInvalidModule_multi_threaded", # mlirExecutionEngineCreate calls thread-unsafe llvm::raw_ostream - "test_pass_manager__testPrintIrAfterAll_multi_threaded", # IRPrinterInstrumentation::runAfterPass calls thread-unsafe llvm::raw_ostream - "test_pass_manager__testPrintIrBeforeAndAfterAll_multi_threaded", # IRPrinterInstrumentation::runBeforePass calls thread-unsafe llvm::raw_ostream - "test_pass_manager__testPrintIrLargeLimitElements_multi_threaded", # IRPrinterInstrumentation::runAfterPass calls thread-unsafe llvm::raw_ostream - "test_pass_manager__testPrintIrTree_multi_threaded", # IRPrinterInstrumentation::runAfterPass calls thread-unsafe llvm::raw_ostream - "test_pass_manager__testRunPipeline_multi_threaded", # PrintOpStatsPass::printSummary calls thread-unsafe llvm::raw_ostream - "test_dialects_transform_interpreter__include_multi_threaded", # mlir::transform::PrintOp::apply(mlir::transform::TransformRewriter...) calls thread-unsafe llvm::raw_ostream - "test_dialects_transform_interpreter__transform_options_multi_threaded", # mlir::transform::PrintOp::apply(mlir::transform::TransformRewriter...) calls thread-unsafe llvm::raw_ostream - "test_dialects_transform_interpreter__print_self_multi_threaded", # mlir::transform::PrintOp::apply(mlir::transform::TransformRewriter...) call thread-unsafe llvm::raw_ostream - "test_ir_diagnostic_handler__testDiagnosticCallbackException_multi_threaded", # mlirEmitError calls thread-unsafe llvm::raw_ostream - "test_ir_module__testParseSuccess_multi_threaded", # mlirOperationDump calls thread-unsafe llvm::raw_ostream - # False-positive TSAN detected race in llvm::RuntimeDyldELF::registerEHFrames() - # Details: https://github.com/llvm/llvm-project/pull/107103/files#r1905726947 - "test_execution_engine__testCapsule_multi_threaded", - "test_execution_engine__testDumpToObjectFile_multi_threaded", -] - -TESTS_TO_XFAIL = [ - # execution_engine tests: - # - ctypes related data-races: https://github.com/python/cpython/issues/127945 - "test_execution_engine__testBF16Memref_multi_threaded", - "test_execution_engine__testBasicCallback_multi_threaded", - "test_execution_engine__testComplexMemrefAdd_multi_threaded", - "test_execution_engine__testComplexUnrankedMemrefAdd_multi_threaded", - "test_execution_engine__testDynamicMemrefAdd2D_multi_threaded", - "test_execution_engine__testF16MemrefAdd_multi_threaded", - "test_execution_engine__testF8E5M2Memref_multi_threaded", - "test_execution_engine__testInvokeFloatAdd_multi_threaded", - "test_execution_engine__testInvokeVoid_multi_threaded", # a ctypes race - "test_execution_engine__testMemrefAdd_multi_threaded", - "test_execution_engine__testRankedMemRefCallback_multi_threaded", - "test_execution_engine__testRankedMemRefWithOffsetCallback_multi_threaded", - "test_execution_engine__testUnrankedMemRefCallback_multi_threaded", - "test_execution_engine__testUnrankedMemRefWithOffsetCallback_multi_threaded", - # dialects tests - "test_dialects_memref__testSubViewOpInferReturnTypeExtensiveSlicing_multi_threaded", # Related to ctypes data races - "test_dialects_transform_interpreter__print_other_multi_threaded", # Fatal Python error: Aborted or mlir::transform::PrintOp::apply(mlir::transform::TransformRewriter...) 
is not thread-safe - "test_dialects_gpu_module-to-binary-rocdl__testGPUToASMBin_multi_threaded", # Due to global llvm-project/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp::GCNTrackers variable mutation - "test_dialects_gpu_module-to-binary-nvvm__testGPUToASMBin_multi_threaded", - "test_dialects_gpu_module-to-binary-nvvm__testGPUToLLVMBin_multi_threaded", - "test_dialects_gpu_module-to-binary-rocdl__testGPUToLLVMBin_multi_threaded", - # integration tests - "test_integration_dialects_linalg_opsrun__test_elemwise_builtin_multi_threaded", # Related to ctypes data races - "test_integration_dialects_linalg_opsrun__test_elemwise_generic_multi_threaded", # Related to ctypes data races - "test_integration_dialects_linalg_opsrun__test_fill_builtin_multi_threaded", # ctypes - "test_integration_dialects_linalg_opsrun__test_fill_generic_multi_threaded", # ctypes - "test_integration_dialects_linalg_opsrun__test_fill_rng_builtin_multi_threaded", # ctypes - "test_integration_dialects_linalg_opsrun__test_fill_rng_generic_multi_threaded", # ctypes - "test_integration_dialects_linalg_opsrun__test_max_pooling_builtin_multi_threaded", # ctypes - "test_integration_dialects_linalg_opsrun__test_max_pooling_generic_multi_threaded", # ctypes - "test_integration_dialects_linalg_opsrun__test_min_pooling_builtin_multi_threaded", # ctypes - "test_integration_dialects_linalg_opsrun__test_min_pooling_generic_multi_threaded", # ctypes -] - - -def add_existing_tests(test_modules, test_prefix: str = "_original_test"): - def decorator(test_cls): - this_folder = Path(__file__).parent.absolute() - test_cls.output_folder = tempfile.TemporaryDirectory() - output_folder = Path(test_cls.output_folder.name) - - for test_mod_info in test_modules: - # test_mod_info is a tuple of size 2 or 3: - # (test_module_str, run_test_function) or (test_module_str, run_test_function, test_name_patterns_list) - # For example: - # - ("ir/value", run) or - # - ("dialects/transform_loop_ext", run_with_insertion_point, ["loopOutline"]) - assert isinstance(test_mod_info, tuple) and len(test_mod_info) in (2, 3) - if len(test_mod_info) == 2: - test_module_name, exec_fn = test_mod_info - test_pattern = None - else: - test_module_name, exec_fn, test_pattern = test_mod_info - - src_filepath = this_folder / f"{test_module_name}.py" - dst_filepath = (output_folder / f"{test_module_name}.py").absolute() - if not dst_filepath.parent.exists(): - dst_filepath.parent.mkdir(parents=True) - copy_and_update(src_filepath, dst_filepath) - test_mod = import_from_path(test_module_name, dst_filepath) - for attr_name in dir(test_mod): - is_test_fn = test_pattern is None and attr_name.startswith("test") - is_test_fn |= test_pattern is not None and any( - [p in attr_name for p in test_pattern] - ) - if is_test_fn: - obj = getattr(test_mod, attr_name) - if callable(obj): - test_name = f"{test_prefix}_{test_module_name.replace('/', '_')}__{attr_name}" - - def wrapped_test_fn( - self, *args, __test_fn__=obj, __exec_fn__=exec_fn, **kwargs - ): - __exec_fn__(__test_fn__) - - setattr(test_cls, test_name, wrapped_test_fn) - return test_cls - - return decorator - - -@contextmanager -def _capture_output(fp): - # Inspired from jax test_utils.py capture_stderr method - # ``None`` means nothing has not been captured yet. 
- captured = None - - def get_output() -> str: - if captured is None: - raise ValueError("get_output() called while the context is active.") - return captured - - with tempfile.NamedTemporaryFile(mode="w+", encoding="utf-8") as f: - original_fd = os.dup(fp.fileno()) - os.dup2(f.fileno(), fp.fileno()) - try: - yield get_output - finally: - # Python also has its own buffers, make sure everything is flushed. - fp.flush() - os.fsync(fp.fileno()) - f.seek(0) - captured = f.read() - os.dup2(original_fd, fp.fileno()) - - -capture_stdout = partial(_capture_output, sys.stdout) -capture_stderr = partial(_capture_output, sys.stderr) - - -def multi_threaded( - num_workers: int, - num_runs: int = 5, - skip_tests: Optional[list[str]] = None, - xfail_tests: Optional[list[str]] = None, - test_prefix: str = "_original_test", - multithreaded_test_postfix: str = "_multi_threaded", -): - """Decorator that runs a test in a multi-threaded environment.""" - - def decorator(test_cls): - for name, test_fn in test_cls.__dict__.copy().items(): - if not (name.startswith(test_prefix) and callable(test_fn)): - continue - - name = f"test{name[len(test_prefix):]}" - if skip_tests is not None: - if any( - test_name.replace(multithreaded_test_postfix, "") in name - for test_name in skip_tests - ): - continue - - def multi_threaded_test_fn(self, *args, __test_fn__=test_fn, **kwargs): - with capture_stdout(), capture_stderr() as get_output: - barrier = threading.Barrier(num_workers) - - def closure(): - barrier.wait() - for _ in range(num_runs): - __test_fn__(self, *args, **kwargs) - - with concurrent.futures.ThreadPoolExecutor( - max_workers=num_workers - ) as executor: - futures = [] - for _ in range(num_workers): - futures.append(executor.submit(closure)) - # We should call future.result() to re-raise an exception if test has - # failed - assert len(list(f.result() for f in futures)) == num_workers - - gc.collect() - assert Context._get_live_count() == 0 - - captured = get_output() - if len(captured) > 0 and "ThreadSanitizer" in captured: - raise RuntimeError( - f"ThreadSanitizer reported warnings:\n{captured}" - ) - - test_new_name = f"{name}{multithreaded_test_postfix}" - if xfail_tests is not None and test_new_name in xfail_tests: - multi_threaded_test_fn = unittest.expectedFailure( - multi_threaded_test_fn - ) - - setattr(test_cls, test_new_name, multi_threaded_test_fn) - - return test_cls - - return decorator - - -@multi_threaded( - num_workers=10, - num_runs=20, - skip_tests=TESTS_TO_SKIP, - xfail_tests=TESTS_TO_XFAIL, -) -@add_existing_tests(test_modules=TEST_MODULES, test_prefix="_original_test") -class TestAllMultiThreaded(unittest.TestCase): - @classmethod - def tearDownClass(cls): - if hasattr(cls, "output_folder"): - cls.output_folder.cleanup() - - def _original_test_create_context(self): - with Context() as ctx: - print(ctx._get_live_count()) - print(ctx._get_live_module_count()) - print(ctx._get_live_operation_count()) - print(ctx._get_live_operation_objects()) - print(ctx._get_context_again() is ctx) - print(ctx._clear_live_operations()) - - def _original_test_create_module_with_consts(self): - py_values = [123, 234, 345] - with Context() as ctx: - module = Module.create(loc=Location.file("foo.txt", 0, 0)) - - dtype = IntegerType.get_signless(64) - with InsertionPoint(module.body), Location.name("a"): - arith.constant(dtype, py_values[0]) - - with InsertionPoint(module.body), Location.name("b"): - arith.constant(dtype, py_values[1]) - - with InsertionPoint(module.body), Location.name("c"): - 
arith.constant(dtype, py_values[2])
-
-
-if __name__ == "__main__":
-    # Do not run the tests on CPython with GIL
-    if hasattr(sys, "_is_gil_enabled") and not sys._is_gil_enabled():
-        unittest.main()

From 43fdd6e81d8ea42d1de8fd96fe45957dee54cb32 Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Sun, 12 Jan 2025 11:06:42 -0800
Subject: [PATCH 209/408] [memprof] Migrate away from PointerUnion::is (NFC)
 (#122622)

Note that PointerUnion::is has been soft deprecated in PointerUnion.h:

  // FIXME: Replace the uses of is(), get() and dyn_cast() with
  //        isa<T>, cast<T> and the llvm::dyn_cast<T>

In this patch, I'm calling call().getBase() for an instance of
PointerUnion.  call() alone would return an instance of IndexCall,
which wraps PointerUnion.  Note that isa<> cannot directly accept an
instance of IndexCall, at least without defining CastInfo.

I'm not touching PointerUnion::dyn_cast for now because it's a bit
complicated; we could blindly migrate it to dyn_cast_if_present, but we
should probably use dyn_cast when the operand is known to be non-null.

---
 llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 3d0e5cb5f97b4..61a8f4a448bbd 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -3610,7 +3610,7 @@ IndexCallsiteContextGraph::cloneFunctionForCallsite(
   // Confirm this matches the CloneNo provided by the caller, which is based on
   // the number of function clones we have.
   assert(CloneNo ==
-         (Call.call().is<AllocInfo *>()
+         (isa<AllocInfo *>(Call.call().getBase())
               ? Call.call().dyn_cast<AllocInfo *>()->Versions.size()
               : Call.call().dyn_cast<CallsiteInfo *>()->Clones.size()));
   // Walk all the instructions in this function.
Create a new version for

From fd87188c2bbbdd283b286daea2b9bd5c6605f7e1 Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Sun, 12 Jan 2025 11:06:56 -0800
Subject: [PATCH 210/408] [wasm] Avoid repeated hash lookups (NFC) (#122626)

---
 lld/wasm/SymbolTable.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lld/wasm/SymbolTable.cpp b/lld/wasm/SymbolTable.cpp
index f57359083d242..6309cf35fe552 100644
--- a/lld/wasm/SymbolTable.cpp
+++ b/lld/wasm/SymbolTable.cpp
@@ -1009,8 +1009,8 @@ void SymbolTable::handleWeakUndefines() {
 }
 
 DefinedFunction *SymbolTable::createUndefinedStub(const WasmSignature &sig) {
-  if (stubFunctions.count(sig))
-    return stubFunctions[sig];
+  if (auto it = stubFunctions.find(sig); it != stubFunctions.end())
+    return it->second;
   LLVM_DEBUG(dbgs() << "createUndefinedStub: " << toString(sig) << "\n");
   auto *sym = reinterpret_cast<DefinedFunction *>(make<SymbolUnion>());
   sym->isUsedInRegularObj = true;

From 16aa400a2780ab21f73722875734440643f276c3 Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Sun, 12 Jan 2025 11:07:07 -0800
Subject: [PATCH 211/408] [ELF] Avoid repeated hash lookups (NFC) (#122628)

---
 lld/ELF/Arch/ARM.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp
index 29a72d35af666..de6e45c6cc65c 100644
--- a/lld/ELF/Arch/ARM.cpp
+++ b/lld/ELF/Arch/ARM.cpp
@@ -1320,8 +1320,9 @@ void elf::processArmCmseSymbols(Ctx &ctx) {
     MutableArrayRef<Symbol *> syms = file->getMutableSymbols();
     for (size_t i = 0, e = syms.size(); i != e; ++i) {
       StringRef symName = syms[i]->getName();
-      if (ctx.symtab->cmseSymMap.count(symName))
-        syms[i] = ctx.symtab->cmseSymMap[symName].acleSeSym;
+      auto it = ctx.symtab->cmseSymMap.find(symName);
+      if (it != ctx.symtab->cmseSymMap.end())
+        syms[i] = it->second.acleSeSym;
     }
   });
 }
@@ -1370,8 +1371,9 @@ void ArmCmseSGSection::addSGVeneer(Symbol *acleSeSym, Symbol *sym) {
   // Only secure symbols with values equal to that of it's non-secure
   // counterpart needs to be in the .gnu.sgstubs section.
   std::unique_ptr<ArmCmseSGVeneer> ss;
-  if (ctx.symtab->cmseImportLib.count(sym->getName())) {
-    Defined *impSym = ctx.symtab->cmseImportLib[sym->getName()];
+  auto it = ctx.symtab->cmseImportLib.find(sym->getName());
+  if (it != ctx.symtab->cmseImportLib.end()) {
+    Defined *impSym = it->second;
     ss = std::make_unique<ArmCmseSGVeneer>(sym, acleSeSym, impSym->value);
   } else {
     ss = std::make_unique<ArmCmseSGVeneer>(sym, acleSeSym);

From 1afba19913253dda865a8e57b37b9f4dabead1ac Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Sun, 12 Jan 2025 19:29:09 +0000
Subject: [PATCH 212/408] [VPlan] Try to narrow wide and replicating recipes
 to uniform recipes.

Use the existing VPlan-based analysis to identify recipes that only have
their first lane demanded and transform them to uniform replicate
recipes.

This simplifies the generated code in some places and prepares for
fixing https://github.com/llvm/llvm-project/issues/122496.
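In outline, the narrowing step does the following (a simplified sketch of the
loop added below, with the recipe-kind and predication guards abbreviated; the
full conditions are in the diff):

```cpp
// Sketch: for each recipe reachable from an induction phi whose lanes
// beyond the first are never demanded, substitute a uniform
// (single-lane) replicate recipe and rewire all users to it.
auto Users = collectUsersRecursively(PhiR);
for (VPUser *U : reverse(Users)) {
  auto *Def = dyn_cast<VPSingleDefRecipe>(U);
  if (!Def || Def->getNumUsers() == 0 || !Def->getUnderlyingValue())
    continue;
  // Bail out unless no lane other than the first can be observed.
  if (!vputils::isUniformAfterVectorization(Def) &&
      !vputils::onlyFirstLaneUsed(Def))
    continue;
  auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
                                      Def->operands(), /*IsUniform*/ true);
  Clone->insertAfter(Def);
  Def->replaceAllUsesWith(Clone);
}
```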
--- .../Transforms/Vectorize/VPlanTransforms.cpp | 27 ++++++++++++++++++- .../AArch64/divs-with-scalable-vfs.ll | 8 +++--- .../LoopVectorize/X86/gep-use-outside-loop.ll | 6 ----- .../LoopVectorize/scalable-assume.ll | 12 ++++----- 4 files changed, 35 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 878db522be1bd..f8e878c04d1f3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -596,11 +596,36 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { if (!PhiR) continue; + // Try to narrow wide and replicating recipes to uniform recipes, based on + // VPlan analysis. + // TODO: Apply to all recipes in the future, to replace legacy uniformity + // analysis. + auto Users = collectUsersRecursively(PhiR); + for (VPUser *U : reverse(Users)) { + auto *Def = dyn_cast(U); + auto *RepR = dyn_cast(U); + // Skip recipes that shouldn't be narrowed. + if (!isa(Def) || !Def || + Def->getNumUsers() == 0 || !Def->getUnderlyingValue() || + (RefR && (RepR->isUniform() || RepRr->isPredicated()))) + continue; + + // Skip recipes that may have other lanes than their first used. + if (!vputils::isUniformAfterVectorization(Def) && + !vputils::onlyFirstLaneUsed(Def)) + continue; + + auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(), + Def->operands(), /*IsUniform*/ true); + Clone->insertAfter(Def); + Def->replaceAllUsesWith(Clone); + } + // Check if any uniform VPReplicateRecipes using the phi recipe are used by // ExtractFromEnd. Those must be replaced by a regular VPReplicateRecipe to // ensure the final value is available. // TODO: Remove once uniformity analysis is done on VPlan. - for (VPUser *U : collectUsersRecursively(PhiR)) { + for (VPUser *U : Users) { auto *ExitIRI = dyn_cast(U); VPValue *Op; if (!ExitIRI || !match(ExitIRI->getOperand(0), diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll index 9e5c6e1527c55..2c37593be7861 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll @@ -132,8 +132,6 @@ define void @sdiv_feeding_gep_predicated(ptr %dst, i32 %x, i64 %M, i64 %conv6, i ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[M]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[CONV6]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] @@ -142,9 +140,9 @@ define void @sdiv_feeding_gep_predicated(ptr %dst, i32 %x, i64 %M, i64 %conv6, i ; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP22:%.*]] = icmp ule [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP23:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP22]], zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = select [[TMP23]], [[BROADCAST_SPLAT2]], splat (i64 1) -; CHECK-NEXT: [[TMP25:%.*]] = sdiv [[BROADCAST_SPLAT]], [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = extractelement 
[[TMP25]], i32 0 +; CHECK-NEXT: [[TMP24:%.*]] = extractelement [[TMP23]], i32 0 +; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[CONV6]], i64 1 +; CHECK-NEXT: [[TMP26:%.*]] = sdiv i64 [[M]], [[TMP25]] ; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32 ; CHECK-NEXT: [[TMP28:%.*]] = mul i64 [[TMP26]], [[CONV61]] ; CHECK-NEXT: [[TMP29:%.*]] = sub i64 [[TMP21]], [[TMP28]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/gep-use-outside-loop.ll b/llvm/test/Transforms/LoopVectorize/X86/gep-use-outside-loop.ll index a90b4df3999f2..e22b090f6c0d0 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/gep-use-outside-loop.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/gep-use-outside-loop.ll @@ -12,18 +12,12 @@ define void @gep_use_in_dead_block(ptr noalias %dst, ptr %src) { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[TMP4]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP5]], align 2 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i16> [[WIDE_LOAD]], splat (i16 10) ; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i1> [[TMP6]], splat (i1 true) ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr [[TMP8]], i32 0 ; CHECK-NEXT: call void @llvm.masked.store.v4i16.p0(<4 x i16> zeroinitializer, ptr [[TMP12]], i32 2, <4 x i1> [[TMP7]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/scalable-assume.ll b/llvm/test/Transforms/LoopVectorize/scalable-assume.ll index f4c099ff9aa11..358293f123454 100644 --- a/llvm/test/Transforms/LoopVectorize/scalable-assume.ll +++ b/llvm/test/Transforms/LoopVectorize/scalable-assume.ll @@ -3,12 +3,12 @@ define void @test1(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) { ; CHECK-LABEL: @test1( ; CHECK: vector.body: -; CHECK: [[FCMP1:%.*]] = fcmp ogt -; CHECK-NEXT: [[FCMP2:%.*]] = fcmp ogt -; CHECK-NEXT: [[FCMP1L0:%.*]] = extractelement [[FCMP1]], i32 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[FCMP1L0]]) -; CHECK-NEXT: [[FCMP2L0:%.*]] = extractelement [[FCMP2]], i32 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[FCMP2L0]]) +; CHECK: [[E1:%.*]] = extractelement {{.+}}, i32 0 +; CHECK-NEXT: [[FCMP1:%.*]] = fcmp ogt float [[E1]] +; CHECK-NEXT: [[E2:%.*]] = extractelement {{.+}}, i32 0 +; CHECK-NEXT: [[FCMP2:%.*]] = fcmp ogt float [[E2]] +; CHECK-NEXT: tail call void @llvm.assume(i1 [[FCMP1]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[FCMP2]]) entry: br label %for.body From 0ebb3ac7c92c4c1c44e7f3d17832d75ec5a42a67 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 12 Jan 2025 19:37:45 +0000 Subject: [PATCH 213/408] Revert "[VPlan] Try to narrow wide and replicating recipes to uniform recipes." This reverts commit 1afba19913253dda865a8e57b37b9f4dabead1ac. 
Typo breaking the build --- .../Transforms/Vectorize/VPlanTransforms.cpp | 27 +------------------ .../AArch64/divs-with-scalable-vfs.ll | 8 +++--- .../LoopVectorize/X86/gep-use-outside-loop.ll | 6 +++++ .../LoopVectorize/scalable-assume.ll | 12 ++++----- 4 files changed, 18 insertions(+), 35 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index f8e878c04d1f3..878db522be1bd 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -596,36 +596,11 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { if (!PhiR) continue; - // Try to narrow wide and replicating recipes to uniform recipes, based on - // VPlan analysis. - // TODO: Apply to all recipes in the future, to replace legacy uniformity - // analysis. - auto Users = collectUsersRecursively(PhiR); - for (VPUser *U : reverse(Users)) { - auto *Def = dyn_cast(U); - auto *RepR = dyn_cast(U); - // Skip recipes that shouldn't be narrowed. - if (!isa(Def) || !Def || - Def->getNumUsers() == 0 || !Def->getUnderlyingValue() || - (RefR && (RepR->isUniform() || RepRr->isPredicated()))) - continue; - - // Skip recipes that may have other lanes than their first used. - if (!vputils::isUniformAfterVectorization(Def) && - !vputils::onlyFirstLaneUsed(Def)) - continue; - - auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(), - Def->operands(), /*IsUniform*/ true); - Clone->insertAfter(Def); - Def->replaceAllUsesWith(Clone); - } - // Check if any uniform VPReplicateRecipes using the phi recipe are used by // ExtractFromEnd. Those must be replaced by a regular VPReplicateRecipe to // ensure the final value is available. // TODO: Remove once uniformity analysis is done on VPlan. 
- for (VPUser *U : Users) { + for (VPUser *U : collectUsersRecursively(PhiR)) { auto *ExitIRI = dyn_cast(U); VPValue *Op; if (!ExitIRI || !match(ExitIRI->getOperand(0), diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll index 2c37593be7861..9e5c6e1527c55 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll @@ -132,6 +132,8 @@ define void @sdiv_feeding_gep_predicated(ptr %dst, i32 %x, i64 %M, i64 %conv6, i ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[M]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[CONV6]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] @@ -140,9 +142,9 @@ define void @sdiv_feeding_gep_predicated(ptr %dst, i32 %x, i64 %M, i64 %conv6, i ; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP22:%.*]] = icmp ule [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP23:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP22]], zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = extractelement [[TMP23]], i32 0 -; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[CONV6]], i64 1 -; CHECK-NEXT: [[TMP26:%.*]] = sdiv i64 [[M]], [[TMP25]] +; CHECK-NEXT: [[TMP24:%.*]] = select [[TMP23]], [[BROADCAST_SPLAT2]], splat (i64 1) +; CHECK-NEXT: [[TMP25:%.*]] = sdiv [[BROADCAST_SPLAT]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = extractelement [[TMP25]], i32 0 ; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32 ; CHECK-NEXT: [[TMP28:%.*]] = mul i64 [[TMP26]], [[CONV61]] ; CHECK-NEXT: [[TMP29:%.*]] = sub i64 [[TMP21]], [[TMP28]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/gep-use-outside-loop.ll b/llvm/test/Transforms/LoopVectorize/X86/gep-use-outside-loop.ll index e22b090f6c0d0..a90b4df3999f2 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/gep-use-outside-loop.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/gep-use-outside-loop.ll @@ -12,12 +12,18 @@ define void @gep_use_in_dead_block(ptr noalias %dst, ptr %src) { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[TMP4]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP5]], align 2 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i16> [[WIDE_LOAD]], splat (i16 10) ; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i1> [[TMP6]], splat (i1 true) ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[DST]], i64 
[[TMP3]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr [[TMP8]], i32 0 ; CHECK-NEXT: call void @llvm.masked.store.v4i16.p0(<4 x i16> zeroinitializer, ptr [[TMP12]], i32 2, <4 x i1> [[TMP7]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/scalable-assume.ll b/llvm/test/Transforms/LoopVectorize/scalable-assume.ll index 358293f123454..f4c099ff9aa11 100644 --- a/llvm/test/Transforms/LoopVectorize/scalable-assume.ll +++ b/llvm/test/Transforms/LoopVectorize/scalable-assume.ll @@ -3,12 +3,12 @@ define void @test1(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) { ; CHECK-LABEL: @test1( ; CHECK: vector.body: -; CHECK: [[E1:%.*]] = extractelement {{.+}}, i32 0 -; CHECK-NEXT: [[FCMP1:%.*]] = fcmp ogt float [[E1]] -; CHECK-NEXT: [[E2:%.*]] = extractelement {{.+}}, i32 0 -; CHECK-NEXT: [[FCMP2:%.*]] = fcmp ogt float [[E2]] -; CHECK-NEXT: tail call void @llvm.assume(i1 [[FCMP1]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[FCMP2]]) +; CHECK: [[FCMP1:%.*]] = fcmp ogt +; CHECK-NEXT: [[FCMP2:%.*]] = fcmp ogt +; CHECK-NEXT: [[FCMP1L0:%.*]] = extractelement [[FCMP1]], i32 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[FCMP1L0]]) +; CHECK-NEXT: [[FCMP2L0:%.*]] = extractelement [[FCMP2]], i32 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[FCMP2L0]]) entry: br label %for.body From 3ff1d0198575282ad585c891b2e61e904a7d8c5a Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 12 Jan 2025 20:10:27 +0000 Subject: [PATCH 214/408] Recommit "[VPlan] Try to narrow wide and replicating recipes to uniform recipes." This reverts commit 0ebb3ac7c92c4c1c44e7f3d17832d75ec5a42a67. Re-applies commit with typos fixed. --- .../Transforms/Vectorize/VPlanTransforms.cpp | 27 ++++++++++++++++++- .../AArch64/divs-with-scalable-vfs.ll | 8 +++--- .../LoopVectorize/X86/gep-use-outside-loop.ll | 6 ----- .../LoopVectorize/scalable-assume.ll | 12 ++++----- 4 files changed, 35 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 878db522be1bd..f440bf2eb0223 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -596,11 +596,36 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { if (!PhiR) continue; + // Try to narrow wide and replicating recipes to uniform recipes, based on + // VPlan analysis. + // TODO: Apply to all recipes in the future, to replace legacy uniformity + // analysis. + auto Users = collectUsersRecursively(PhiR); + for (VPUser *U : reverse(Users)) { + auto *Def = dyn_cast(U); + auto *RepR = dyn_cast(U); + // Skip recipes that shouldn't be narrowed. + if (!Def || !isa(Def) || + Def->getNumUsers() == 0 || !Def->getUnderlyingValue() || + (RepR && (RepR->isUniform() || RepR->isPredicated()))) + continue; + + // Skip recipes that may have other lanes than their first used. + if (!vputils::isUniformAfterVectorization(Def) && + !vputils::onlyFirstLaneUsed(Def)) + continue; + + auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(), + Def->operands(), /*IsUniform*/ true); + Clone->insertAfter(Def); + Def->replaceAllUsesWith(Clone); + } + // Check if any uniform VPReplicateRecipes using the phi recipe are used by // ExtractFromEnd. Those must be replaced by a regular VPReplicateRecipe to // ensure the final value is available. // TODO: Remove once uniformity analysis is done on VPlan. 
- for (VPUser *U : collectUsersRecursively(PhiR)) { + for (VPUser *U : Users) { auto *ExitIRI = dyn_cast(U); VPValue *Op; if (!ExitIRI || !match(ExitIRI->getOperand(0), diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll index 9e5c6e1527c55..2c37593be7861 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll @@ -132,8 +132,6 @@ define void @sdiv_feeding_gep_predicated(ptr %dst, i32 %x, i64 %M, i64 %conv6, i ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[M]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[CONV6]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] @@ -142,9 +140,9 @@ define void @sdiv_feeding_gep_predicated(ptr %dst, i32 %x, i64 %M, i64 %conv6, i ; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP22:%.*]] = icmp ule [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP23:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP22]], zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = select [[TMP23]], [[BROADCAST_SPLAT2]], splat (i64 1) -; CHECK-NEXT: [[TMP25:%.*]] = sdiv [[BROADCAST_SPLAT]], [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = extractelement [[TMP25]], i32 0 +; CHECK-NEXT: [[TMP24:%.*]] = extractelement [[TMP23]], i32 0 +; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[CONV6]], i64 1 +; CHECK-NEXT: [[TMP26:%.*]] = sdiv i64 [[M]], [[TMP25]] ; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32 ; CHECK-NEXT: [[TMP28:%.*]] = mul i64 [[TMP26]], [[CONV61]] ; CHECK-NEXT: [[TMP29:%.*]] = sub i64 [[TMP21]], [[TMP28]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/gep-use-outside-loop.ll b/llvm/test/Transforms/LoopVectorize/X86/gep-use-outside-loop.ll index a90b4df3999f2..e22b090f6c0d0 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/gep-use-outside-loop.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/gep-use-outside-loop.ll @@ -12,18 +12,12 @@ define void @gep_use_in_dead_block(ptr noalias %dst, ptr %src) { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[TMP4]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP5]], align 2 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i16> [[WIDE_LOAD]], splat (i16 10) ; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i1> [[TMP6]], splat (i1 true) ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[DST]], i64 
[[TMP3]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr [[TMP8]], i32 0 ; CHECK-NEXT: call void @llvm.masked.store.v4i16.p0(<4 x i16> zeroinitializer, ptr [[TMP12]], i32 2, <4 x i1> [[TMP7]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/scalable-assume.ll b/llvm/test/Transforms/LoopVectorize/scalable-assume.ll index f4c099ff9aa11..358293f123454 100644 --- a/llvm/test/Transforms/LoopVectorize/scalable-assume.ll +++ b/llvm/test/Transforms/LoopVectorize/scalable-assume.ll @@ -3,12 +3,12 @@ define void @test1(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) { ; CHECK-LABEL: @test1( ; CHECK: vector.body: -; CHECK: [[FCMP1:%.*]] = fcmp ogt -; CHECK-NEXT: [[FCMP2:%.*]] = fcmp ogt -; CHECK-NEXT: [[FCMP1L0:%.*]] = extractelement [[FCMP1]], i32 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[FCMP1L0]]) -; CHECK-NEXT: [[FCMP2L0:%.*]] = extractelement [[FCMP2]], i32 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[FCMP2L0]]) +; CHECK: [[E1:%.*]] = extractelement {{.+}}, i32 0 +; CHECK-NEXT: [[FCMP1:%.*]] = fcmp ogt float [[E1]] +; CHECK-NEXT: [[E2:%.*]] = extractelement {{.+}}, i32 0 +; CHECK-NEXT: [[FCMP2:%.*]] = fcmp ogt float [[E2]] +; CHECK-NEXT: tail call void @llvm.assume(i1 [[FCMP1]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[FCMP2]]) entry: br label %for.body From f5a35a31bfe6cbc16bec0c130f2bb3632dbf1fbf Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 12 Jan 2025 20:55:20 +0000 Subject: [PATCH 215/408] [LV] Add test cases with incorrect IV live-outs. Add test cases for https://github.com/llvm/llvm-project/issues/122496 and https://github.com/llvm/llvm-project/issues/122602. --- .../LoopVectorize/iv_outside_user.ll | 185 ++++++++++++++++++ 1 file changed, 185 insertions(+) diff --git a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll index 482f731afd813..6b0c677b56d2c 100644 --- a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll +++ b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll @@ -1174,3 +1174,188 @@ e.exit: %res = phi i32 [ %iv.next, %loop ] ret i32 %res } + +; Test case for https://github.com/llvm/llvm-project/issues/122496. +; FIXME: Currently an incorrect live-out is used. 
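; A concrete view of the bad value (a sketch, assuming VF=2 as in the VEC
; output below): in the last vector iteration the lanes are {126, 127}, but
; the increment is scalarized to lane 0, so the middle block is fed
; zext(126 + 1) = 127 instead of the last-lane value zext(127 + 1) = 128.
; The fix in a later patch in this series extracts element 1 instead.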
+define i32 @iv_ext_used_outside( ptr %dst) { +; VEC-LABEL: define i32 @iv_ext_used_outside( +; VEC-SAME: ptr [[DST:%.*]]) { +; VEC-NEXT: [[ENTRY:.*]]: +; VEC-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; VEC: [[VECTOR_PH]]: +; VEC-NEXT: br label %[[VECTOR_BODY:.*]] +; VEC: [[VECTOR_BODY]]: +; VEC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VEC-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16 +; VEC-NEXT: [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0 +; VEC-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i16 [[TMP0]] +; VEC-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0 +; VEC-NEXT: store <2 x i32> zeroinitializer, ptr [[TMP2]], align 4 +; VEC-NEXT: [[TMP3:%.*]] = add nuw nsw i16 [[TMP0]], 1 +; VEC-NEXT: [[TMP4:%.*]] = zext nneg i16 [[TMP3]] to i32 +; VEC-NEXT: [[TMP5:%.*]] = zext nneg i16 [[TMP3]] to i32 +; VEC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; VEC-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128 +; VEC-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} +; VEC: [[MIDDLE_BLOCK]]: +; VEC-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; VEC: [[SCALAR_PH]]: +; VEC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 128, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; VEC-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 128, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; VEC-NEXT: br label %[[LOOP:.*]] +; VEC: [[LOOP]]: +; VEC-NEXT: [[IV_1:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] +; VEC-NEXT: [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[IV_1_EXT:%.*]], %[[LOOP]] ] +; VEC-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i16 [[IV_1]] +; VEC-NEXT: store i32 0, ptr [[GEP]], align 4 +; VEC-NEXT: [[IV_1_NEXT]] = add nuw nsw i16 [[IV_1]], 1 +; VEC-NEXT: [[IV_1_EXT]] = zext nneg i16 [[IV_1_NEXT]] to i32 +; VEC-NEXT: [[EC:%.*]] = icmp samesign ult i16 [[IV_1]], 128 +; VEC-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT]], {{!llvm.loop ![0-9]+}} +; VEC: [[EXIT]]: +; VEC-NEXT: [[IV_1_EXT_LCSSA:%.*]] = phi i32 [ [[IV_1_EXT]], %[[LOOP]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ] +; VEC-NEXT: ret i32 [[IV_1_EXT_LCSSA]] +; +; INTERLEAVE-LABEL: define i32 @iv_ext_used_outside( +; INTERLEAVE-SAME: ptr [[DST:%.*]]) { +; INTERLEAVE-NEXT: [[ENTRY:.*]]: +; INTERLEAVE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; INTERLEAVE: [[VECTOR_PH]]: +; INTERLEAVE-NEXT: br label %[[VECTOR_BODY:.*]] +; INTERLEAVE: [[VECTOR_BODY]]: +; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16 +; INTERLEAVE-NEXT: [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0 +; INTERLEAVE-NEXT: [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 1 +; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i16 [[TMP0]] +; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i16 [[TMP1]] +; INTERLEAVE-NEXT: store i32 0, ptr [[TMP2]], align 4 +; INTERLEAVE-NEXT: store i32 0, ptr [[TMP3]], align 4 +; INTERLEAVE-NEXT: [[TMP4:%.*]] = add nuw nsw i16 [[TMP1]], 1 +; INTERLEAVE-NEXT: [[TMP5:%.*]] = zext nneg i16 [[TMP4]] to i32 +; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; INTERLEAVE-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128 +; INTERLEAVE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], 
{{!llvm.loop ![0-9]+}} +; INTERLEAVE: [[MIDDLE_BLOCK]]: +; INTERLEAVE-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; INTERLEAVE: [[SCALAR_PH]]: +; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 128, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; INTERLEAVE-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 128, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; INTERLEAVE-NEXT: br label %[[LOOP:.*]] +; INTERLEAVE: [[LOOP]]: +; INTERLEAVE-NEXT: [[IV_1:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] +; INTERLEAVE-NEXT: [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[IV_1_EXT:%.*]], %[[LOOP]] ] +; INTERLEAVE-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i16 [[IV_1]] +; INTERLEAVE-NEXT: store i32 0, ptr [[GEP]], align 4 +; INTERLEAVE-NEXT: [[IV_1_NEXT]] = add nuw nsw i16 [[IV_1]], 1 +; INTERLEAVE-NEXT: [[IV_1_EXT]] = zext nneg i16 [[IV_1_NEXT]] to i32 +; INTERLEAVE-NEXT: [[EC:%.*]] = icmp samesign ult i16 [[IV_1]], 128 +; INTERLEAVE-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT]], {{!llvm.loop ![0-9]+}} +; INTERLEAVE: [[EXIT]]: +; INTERLEAVE-NEXT: [[IV_1_EXT_LCSSA:%.*]] = phi i32 [ [[IV_1_EXT]], %[[LOOP]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ] +; INTERLEAVE-NEXT: ret i32 [[IV_1_EXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv.1 = phi i16 [ 0, %entry ], [ %iv.1.next, %loop ] + %iv.2 = phi i32 [ 0, %entry ], [ %iv.1.ext, %loop ] + %gep = getelementptr inbounds nuw i32, ptr %dst, i16 %iv.1 + store i32 0, ptr %gep, align 4 + %iv.1.next = add nuw nsw i16 %iv.1, 1 + %iv.1.ext = zext nneg i16 %iv.1.next to i32 + %ec = icmp samesign ult i16 %iv.1, 128 + br i1 %ec, label %loop, label %exit + +exit: + %iv.1.ext.lcssa = phi i32 [ %iv.1.ext, %loop ] + ret i32 %iv.1.ext.lcssa +} + +; Test case for https://github.com/llvm/llvm-project/issues/122602. +; FIXME: Currently an incorrect live-out is used. 
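; A concrete view of the bad value (a sketch; trip count 2, VF=2): %iv.2
; takes the lane values {2, 1} and the scalar loop exits with
; %iv.1.next = (1 - 1) + 1 = 1, but the scalarized increment below uses
; lane 0, feeding the middle block (2 - 1) + 1 = 2. The fix in a later patch
; in this series starts from lane 1 instead.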
+define i64 @test_iv_increment_incremented(ptr %dst) { +; VEC-LABEL: define i64 @test_iv_increment_incremented( +; VEC-SAME: ptr [[DST:%.*]]) { +; VEC-NEXT: [[ENTRY:.*]]: +; VEC-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; VEC: [[VECTOR_PH]]: +; VEC-NEXT: br label %[[VECTOR_BODY:.*]] +; VEC: [[VECTOR_BODY]]: +; VEC-NEXT: [[TMP0:%.*]] = getelementptr i16, ptr [[DST]], i64 3 +; VEC-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[TMP0]], i32 0 +; VEC-NEXT: [[TMP2:%.*]] = getelementptr i16, ptr [[TMP1]], i32 -1 +; VEC-NEXT: store <2 x i16> splat (i16 1), ptr [[TMP2]], align 2 +; VEC-NEXT: [[TMP3:%.*]] = add i64 2, -1 +; VEC-NEXT: [[TMP4:%.*]] = add i64 [[TMP3]], 1 +; VEC-NEXT: [[TMP5:%.*]] = add i64 [[TMP3]], 1 +; VEC-NEXT: br label %[[MIDDLE_BLOCK:.*]] +; VEC: [[MIDDLE_BLOCK]]: +; VEC-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; VEC: [[SCALAR_PH]]: +; VEC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1, %[[MIDDLE_BLOCK]] ], [ 3, %[[ENTRY]] ] +; VEC-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 0, %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ] +; VEC-NEXT: br label %[[LOOP:.*]] +; VEC: [[LOOP]]: +; VEC-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] +; VEC-NEXT: [[IV_2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ] +; VEC-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[DST]], i64 [[IV_1]] +; VEC-NEXT: store i16 1, ptr [[GEP]], align 2 +; VEC-NEXT: [[IV_2_NEXT]] = add i64 [[IV_2]], -1 +; VEC-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_2_NEXT]], 0 +; VEC-NEXT: [[IV_1_NEXT]] = add i64 [[IV_2_NEXT]], 1 +; VEC-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}} +; VEC: [[EXIT]]: +; VEC-NEXT: [[IV_1_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_1_NEXT]], %[[LOOP]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ] +; VEC-NEXT: ret i64 [[IV_1_NEXT_LCSSA]] +; +; INTERLEAVE-LABEL: define i64 @test_iv_increment_incremented( +; INTERLEAVE-SAME: ptr [[DST:%.*]]) { +; INTERLEAVE-NEXT: [[ENTRY:.*]]: +; INTERLEAVE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; INTERLEAVE: [[VECTOR_PH]]: +; INTERLEAVE-NEXT: br label %[[VECTOR_BODY:.*]] +; INTERLEAVE: [[VECTOR_BODY]]: +; INTERLEAVE-NEXT: [[TMP0:%.*]] = getelementptr i16, ptr [[DST]], i64 3 +; INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[DST]], i64 2 +; INTERLEAVE-NEXT: store i16 1, ptr [[TMP0]], align 2 +; INTERLEAVE-NEXT: store i16 1, ptr [[TMP1]], align 2 +; INTERLEAVE-NEXT: [[TMP2:%.*]] = add i64 1, -1 +; INTERLEAVE-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], 1 +; INTERLEAVE-NEXT: br label %[[MIDDLE_BLOCK:.*]] +; INTERLEAVE: [[MIDDLE_BLOCK]]: +; INTERLEAVE-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; INTERLEAVE: [[SCALAR_PH]]: +; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1, %[[MIDDLE_BLOCK]] ], [ 3, %[[ENTRY]] ] +; INTERLEAVE-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 0, %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ] +; INTERLEAVE-NEXT: br label %[[LOOP:.*]] +; INTERLEAVE: [[LOOP]]: +; INTERLEAVE-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] +; INTERLEAVE-NEXT: [[IV_2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ] +; INTERLEAVE-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[DST]], i64 [[IV_1]] +; INTERLEAVE-NEXT: store i16 1, ptr [[GEP]], align 2 +; INTERLEAVE-NEXT: [[IV_2_NEXT]] = add i64 [[IV_2]], -1 +; INTERLEAVE-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_2_NEXT]], 0 +; INTERLEAVE-NEXT: [[IV_1_NEXT]] = add i64 
[[IV_2_NEXT]], 1 +; INTERLEAVE-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}} +; INTERLEAVE: [[EXIT]]: +; INTERLEAVE-NEXT: [[IV_1_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_1_NEXT]], %[[LOOP]] ], [ [[TMP3]], %[[MIDDLE_BLOCK]] ] +; INTERLEAVE-NEXT: ret i64 [[IV_1_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv.1 = phi i64 [ 3, %entry ], [ %iv.1.next, %loop ] + %iv.2 = phi i64 [ 2, %entry ], [ %iv.2.next, %loop ] + %gep = getelementptr i16, ptr %dst, i64 %iv.1 + store i16 1, ptr %gep, align 2 + %iv.2.next = add i64 %iv.2, -1 + %ec = icmp eq i64 %iv.2.next, 0 + %iv.1.next = add i64 %iv.2.next, 1 + br i1 %ec, label %exit, label %loop + +exit: + ret i64 %iv.1.next +} From 5c0aa31c3cb448065f12ede53e4dd54a9a98f650 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sun, 12 Jan 2025 13:17:49 -0800 Subject: [PATCH 216/408] -ftime-report: Move FrontendTimer closer to TimeTraceScope ... to improve consistency and make "Clang time report" cover `FrontendAction::BeginSourceFile` and `FrontendAction::EndSourceFile`. --- clang/include/clang/Frontend/CompilerInstance.h | 2 -- clang/lib/Frontend/CompilerInstance.cpp | 3 --- clang/lib/Frontend/FrontendAction.cpp | 7 +------ clang/tools/driver/cc1_main.cpp | 4 ++++ 4 files changed, 5 insertions(+), 11 deletions(-) diff --git a/clang/include/clang/Frontend/CompilerInstance.h b/clang/include/clang/Frontend/CompilerInstance.h index 4a79b8d107171..8b539dfc92960 100644 --- a/clang/include/clang/Frontend/CompilerInstance.h +++ b/clang/include/clang/Frontend/CompilerInstance.h @@ -632,8 +632,6 @@ class CompilerInstance : public ModuleLoader { llvm::TimerGroup &getTimerGroup() const { return *timerGroup; } - bool hasFrontendTimer() const { return (bool)FrontendTimer; } - llvm::Timer &getFrontendTimer() const { assert(FrontendTimer && "Compiler instance has no frontend timer!"); return *FrontendTimer; diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp index b00a4ac075776..c11c857ea0606 100644 --- a/clang/lib/Frontend/CompilerInstance.cpp +++ b/clang/lib/Frontend/CompilerInstance.cpp @@ -1038,9 +1038,6 @@ bool CompilerInstance::ExecuteAction(FrontendAction &Act) { << LLVM_VERSION_STRING << " default target " << llvm::sys::getDefaultTargetTriple() << "\n"; - if (getCodeGenOpts().TimePasses) - createFrontendTimer(); - if (getFrontendOpts().ShowStats || !getFrontendOpts().StatsFile.empty()) llvm::EnableStatistics(false); diff --git a/clang/lib/Frontend/FrontendAction.cpp b/clang/lib/Frontend/FrontendAction.cpp index 9a50e7453eb61..9f789f093f55d 100644 --- a/clang/lib/Frontend/FrontendAction.cpp +++ b/clang/lib/Frontend/FrontendAction.cpp @@ -1069,12 +1069,7 @@ bool FrontendAction::BeginSourceFile(CompilerInstance &CI, llvm::Error FrontendAction::Execute() { CompilerInstance &CI = getCompilerInstance(); - - if (CI.hasFrontendTimer()) { - llvm::TimeRegion Timer(CI.getFrontendTimer()); - ExecuteAction(); - } - else ExecuteAction(); + ExecuteAction(); // If we are supposed to rebuild the global module index, do so now unless // there were any module-build failures. diff --git a/clang/tools/driver/cc1_main.cpp b/clang/tools/driver/cc1_main.cpp index d14058ff2c723..26b5e78cfb4b5 100644 --- a/clang/tools/driver/cc1_main.cpp +++ b/clang/tools/driver/cc1_main.cpp @@ -283,6 +283,10 @@ int cc1_main(ArrayRef Argv, const char *Argv0, void *MainAddr) { // Execute the frontend actions. 
{ llvm::TimeTraceScope TimeScope("ExecuteCompiler"); + bool TimePasses = Clang->getCodeGenOpts().TimePasses; + if (TimePasses) + Clang->createFrontendTimer(); + llvm::TimeRegion Timer(TimePasses ? &Clang->getFrontendTimer() : nullptr); Success = ExecuteCompilerInvocation(Clang.get()); } From b4ce29ab31b29ca926704c160e3909298ddf2b2b Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Sun, 12 Jan 2025 21:35:44 +0000 Subject: [PATCH 217/408] [AArch64][Clang] Add support for __arm_agnostic("sme_za_state") (#121788) This adds support for parsing the attribute and codegen to map it to "aarch64_za_state_agnostic" LLVM IR attribute. This attribute is described in the Arm C Language Extensions (ACLE) document: https://github.com/ARM-software/acle/blob/main/main/acle.md#__arm_agnostic --- clang/include/clang/AST/Type.h | 13 +++-- clang/include/clang/Basic/Attr.td | 7 +++ clang/include/clang/Basic/AttrDocs.td | 26 ++++++++++ .../clang/Basic/DiagnosticSemaKinds.td | 5 ++ clang/lib/AST/ItaniumMangle.cpp | 14 +++--- clang/lib/AST/TypePrinter.cpp | 2 + clang/lib/CodeGen/CGCall.cpp | 2 + clang/lib/Sema/SemaARM.cpp | 8 ++++ clang/lib/Sema/SemaType.cpp | 48 ++++++++++++++++++- .../sme-intrinsics/aarch64-sme-attrs.cpp | 24 ++++++++++ .../CodeGenCXX/aarch64-mangle-sme-atts.cpp | 10 ++++ clang/test/Sema/aarch64-sme-func-attrs.c | 21 ++++++++ 12 files changed, 169 insertions(+), 11 deletions(-) diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 09c98f642852f..78677df578c4b 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -4593,9 +4593,14 @@ class FunctionType : public Type { SME_ZT0Shift = 5, SME_ZT0Mask = 0b111 << SME_ZT0Shift, + // A bit to tell whether a function is agnostic about sme ZA state. + SME_AgnosticZAStateShift = 8, + SME_AgnosticZAStateMask = 1 << SME_AgnosticZAStateShift, + SME_AttributeMask = - 0b111'111'11 // We can't support more than 8 bits because of - // the bitmask in FunctionTypeExtraBitfields. + 0b1'111'111'11 // We can't support more than 9 bits because of + // the bitmask in FunctionTypeArmAttributes + // and ExtProtoInfo. }; enum ArmStateValue : unsigned { @@ -4620,7 +4625,7 @@ class FunctionType : public Type { struct alignas(void *) FunctionTypeArmAttributes { /// Any AArch64 SME ACLE type attributes that need to be propagated /// on declarations and function pointers. 
- unsigned AArch64SMEAttributes : 8; + unsigned AArch64SMEAttributes : 9; FunctionTypeArmAttributes() : AArch64SMEAttributes(SME_NormalFunction) {} }; @@ -5188,7 +5193,7 @@ class FunctionProtoType final FunctionType::ExtInfo ExtInfo; unsigned Variadic : 1; unsigned HasTrailingReturn : 1; - unsigned AArch64SMEAttributes : 8; + unsigned AArch64SMEAttributes : 9; Qualifiers TypeQuals; RefQualifierKind RefQualifier = RQ_None; ExceptionSpecInfo ExceptionSpec; diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 5039c20d8b73b..c0632aaa51625 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -2877,6 +2877,13 @@ def ArmPreserves : TypeAttr, TargetSpecificAttr { let Documentation = [ArmPreservesDocs]; } +def ArmAgnostic : TypeAttr, TargetSpecificAttr { + let Spellings = [RegularKeyword<"__arm_agnostic">]; + let Args = [VariadicStringArgument<"AgnosticArgs">]; + let Subjects = SubjectList<[HasFunctionProto], ErrorDiag>; + let Documentation = [ArmAgnosticDocs]; +} + def ArmLocallyStreaming : InheritableAttr, TargetSpecificAttr { let Spellings = [RegularKeyword<"__arm_locally_streaming">]; let Subjects = SubjectList<[Function], ErrorDiag>; diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 506fe38eb882b..953ff9a700e51 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -7635,6 +7635,32 @@ The attributes ``__arm_in(S)``, ``__arm_out(S)``, ``__arm_inout(S)`` and }]; } +def ArmAgnosticDocs : Documentation { + let Category = DocCatArmSmeAttributes; + let Content = [{ +The ``__arm_agnostic`` keyword applies to prototyped function types and +affects the function's calling convention for a given state S. This +attribute allows the user to describe a function that preserves S, without +requiring the function to share S with its callers and without making +the assumption that S exists. + +If a function has the ``__arm_agnostic(S)`` attribute and calls a function +without this attribute, then the function's object code will contain code +to preserve state S. Otherwise, the function's object code will be the same +as if it did not have the attribute. + +The attribute takes string arguments to describe state S. The supported +states are: + +* ``"sme_za_state"`` for state enabled by PSTATE.ZA, such as ZA and ZT0. + +The attribute ``__arm_agnostic("sme_za_state")`` cannot be used in conjunction +with ``__arm_in(S)``, ``__arm_out(S)``, ``__arm_inout(S)`` or +``__arm_preserves(S)`` where state S describes state enabled by PSTATE.ZA, +such as "za" or "zt0". 
+ }]; +} + def ArmSmeLocallyStreamingDocs : Documentation { let Category = DocCatArmSmeAttributes; let Content = [{ diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index d4e897868f1a9..f04381a32a415 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -3835,6 +3835,9 @@ def err_sme_unimplemented_za_save_restore : Error< "call to a function that shares state other than 'za' from a " "function that has live 'za' state requires a spill/fill of ZA, which is not yet " "implemented">; +def err_sme_unsupported_agnostic_new : Error< + "__arm_agnostic(\"sme_za_state\") is not supported together with " + "__arm_new(\"za\") or __arm_new(\"zt0\")">; def note_sme_use_preserves_za : Note< "add '__arm_preserves(\"za\")' to the callee if it preserves ZA">; def err_sme_definition_using_sm_in_non_sme_target : Error< @@ -3851,6 +3854,8 @@ def warn_sme_locally_streaming_has_vl_args_returns : Warning< "%select{returning|passing}0 a VL-dependent argument %select{from|to}0 a locally streaming function is undefined" " behaviour when the streaming and non-streaming vector lengths are different at runtime">, InGroup, DefaultIgnore; +def err_conflicting_attributes_arm_agnostic : Error< + "__arm_agnostic(\"sme_za_state\") cannot share ZA state with its caller">; def err_conflicting_attributes_arm_state : Error< "conflicting attributes for state '%0'">; def err_unknown_arm_state : Error< diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 47aa9b40dab84..1dd936cf4fb51 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -3585,13 +3585,15 @@ void CXXNameMangler::mangleSMEAttrs(unsigned SMEAttrs) { else if (SMEAttrs & FunctionType::SME_PStateSMCompatibleMask) Bitmask |= AAPCSBitmaskSME::ArmStreamingCompatibleBit; - // TODO: Must represent __arm_agnostic("sme_za_state") - - Bitmask |= encodeAAPCSZAState(FunctionType::getArmZAState(SMEAttrs)) - << AAPCSBitmaskSME::ZA_Shift; + if (SMEAttrs & FunctionType::SME_AgnosticZAStateMask) + Bitmask |= AAPCSBitmaskSME::ArmAgnosticSMEZAStateBit; + else { + Bitmask |= encodeAAPCSZAState(FunctionType::getArmZAState(SMEAttrs)) + << AAPCSBitmaskSME::ZA_Shift; - Bitmask |= encodeAAPCSZAState(FunctionType::getArmZT0State(SMEAttrs)) - << AAPCSBitmaskSME::ZT0_Shift; + Bitmask |= encodeAAPCSZAState(FunctionType::getArmZT0State(SMEAttrs)) + << AAPCSBitmaskSME::ZT0_Shift; + } Out << "Lj" << static_cast(Bitmask) << "EE"; } diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp index 7caebfb061a50..9590145ffbd5a 100644 --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -1000,6 +1000,8 @@ void TypePrinter::printFunctionProtoAfter(const FunctionProtoType *T, OS << " __arm_streaming_compatible"; if (SMEBits & FunctionType::SME_PStateSMEnabledMask) OS << " __arm_streaming"; + if (SMEBits & FunctionType::SME_AgnosticZAStateMask) + OS << "__arm_agnostic(\"sme_za_state\")"; if (FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_Preserves) OS << " __arm_preserves(\"za\")"; if (FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_In) diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 473675d78c66f..0fde4d8ee296b 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -1779,6 +1779,8 @@ static void AddAttributesFromFunctionProtoType(ASTContext &Ctx, FuncAttrs.addAttribute("aarch64_pstate_sm_enabled"); if 
(SMEBits & FunctionType::SME_PStateSMCompatibleMask) FuncAttrs.addAttribute("aarch64_pstate_sm_compatible"); + if (SMEBits & FunctionType::SME_AgnosticZAStateMask) + FuncAttrs.addAttribute("aarch64_za_state_agnostic"); // ZA if (FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_Preserves) diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp index eafd43eb979ba..db418d80e0e09 100644 --- a/clang/lib/Sema/SemaARM.cpp +++ b/clang/lib/Sema/SemaARM.cpp @@ -1337,6 +1337,14 @@ void SemaARM::CheckSMEFunctionDefAttributes(const FunctionDecl *FD) { bool UsesZA = Attr && Attr->isNewZA(); bool UsesZT0 = Attr && Attr->isNewZT0(); + if (UsesZA || UsesZT0) { + if (const auto *FPT = FD->getType()->getAs()) { + FunctionProtoType::ExtProtoInfo EPI = FPT->getExtProtoInfo(); + if (EPI.AArch64SMEAttributes & FunctionType::SME_AgnosticZAStateMask) + Diag(FD->getLocation(), diag::err_sme_unsupported_agnostic_new); + } + } + if (FD->hasAttr()) { if (FD->getReturnType()->isSizelessVectorType()) Diag(FD->getLocation(), diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index d46fbca9ee976..e3ec327c1b364 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -161,6 +161,7 @@ static void diagnoseBadTypeAttribute(Sema &S, const ParsedAttr &attr, case ParsedAttr::AT_ArmIn: \ case ParsedAttr::AT_ArmOut: \ case ParsedAttr::AT_ArmInOut: \ + case ParsedAttr::AT_ArmAgnostic: \ case ParsedAttr::AT_AnyX86NoCallerSavedRegisters: \ case ParsedAttr::AT_AnyX86NoCfCheck: \ CALLING_CONV_ATTRS_CASELIST @@ -7745,6 +7746,40 @@ static bool checkMutualExclusion(TypeProcessingState &state, return true; } +static bool handleArmAgnosticAttribute(Sema &S, + FunctionProtoType::ExtProtoInfo &EPI, + ParsedAttr &Attr) { + if (!Attr.getNumArgs()) { + S.Diag(Attr.getLoc(), diag::err_missing_arm_state) << Attr; + Attr.setInvalid(); + return true; + } + + for (unsigned I = 0; I < Attr.getNumArgs(); ++I) { + StringRef StateName; + SourceLocation LiteralLoc; + if (!S.checkStringLiteralArgumentAttr(Attr, I, StateName, &LiteralLoc)) + return true; + + if (StateName != "sme_za_state") { + S.Diag(LiteralLoc, diag::err_unknown_arm_state) << StateName; + Attr.setInvalid(); + return true; + } + + if (EPI.AArch64SMEAttributes & + (FunctionType::SME_ZAMask | FunctionType::SME_ZT0Mask)) { + S.Diag(Attr.getLoc(), diag::err_conflicting_attributes_arm_agnostic); + Attr.setInvalid(); + return true; + } + + EPI.setArmSMEAttribute(FunctionType::SME_AgnosticZAStateMask); + } + + return false; +} + static bool handleArmStateAttribute(Sema &S, FunctionProtoType::ExtProtoInfo &EPI, ParsedAttr &Attr, @@ -7775,6 +7810,12 @@ static bool handleArmStateAttribute(Sema &S, return true; } + if (EPI.AArch64SMEAttributes & FunctionType::SME_AgnosticZAStateMask) { + S.Diag(LiteralLoc, diag::err_conflicting_attributes_arm_agnostic); + Attr.setInvalid(); + return true; + } + // __arm_in(S), __arm_out(S), __arm_inout(S) and __arm_preserves(S) // are all mutually exclusive for the same S, so check if there are // conflicting attributes. 
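// For reference, the source-level conflicts that handleArmAgnosticAttribute
// and handleArmStateAttribute reject, in either attribute order (a sketch;
// the function names are illustrative, and the exact diagnostics are
// exercised by the Sema tests further below):
//
//   void f(void) __arm_agnostic("sme_za_state") __arm_inout("za");  // error
//   void g(void) __arm_inout("za") __arm_agnostic("sme_za_state"); // error
//   void h(void) __arm_streaming __arm_agnostic("sme_za_state");   // OK:
//   // streaming-mode attributes describe PSTATE.SM, not PSTATE.ZA state.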
@@ -7925,7 +7966,8 @@ static bool handleFunctionTypeAttr(TypeProcessingState &state, ParsedAttr &attr, attr.getKind() == ParsedAttr::AT_ArmPreserves || attr.getKind() == ParsedAttr::AT_ArmIn || attr.getKind() == ParsedAttr::AT_ArmOut || - attr.getKind() == ParsedAttr::AT_ArmInOut) { + attr.getKind() == ParsedAttr::AT_ArmInOut || + attr.getKind() == ParsedAttr::AT_ArmAgnostic) { if (S.CheckAttrTarget(attr)) return true; @@ -7976,6 +8018,10 @@ static bool handleFunctionTypeAttr(TypeProcessingState &state, ParsedAttr &attr, if (handleArmStateAttribute(S, EPI, attr, FunctionType::ARM_InOut)) return true; break; + case ParsedAttr::AT_ArmAgnostic: + if (handleArmAgnosticAttribute(S, EPI, attr)) + return true; + break; default: llvm_unreachable("Unsupported attribute"); } diff --git a/clang/test/CodeGen/AArch64/sme-intrinsics/aarch64-sme-attrs.cpp b/clang/test/CodeGen/AArch64/sme-intrinsics/aarch64-sme-attrs.cpp index 9885ac45e6a0e..54762c8b41412 100644 --- a/clang/test/CodeGen/AArch64/sme-intrinsics/aarch64-sme-attrs.cpp +++ b/clang/test/CodeGen/AArch64/sme-intrinsics/aarch64-sme-attrs.cpp @@ -15,6 +15,7 @@ int streaming_compatible_decl(void) __arm_streaming_compatible; int shared_za_decl(void) __arm_inout("za"); int preserves_za_decl(void) __arm_preserves("za"); int private_za_decl(void); +int agnostic_za_decl(void) __arm_agnostic("sme_za_state"); // == FUNCTION DEFINITIONS == @@ -130,6 +131,27 @@ __arm_new("za") int new_za_callee() { // CHECK: declare i32 @private_za_decl() +// CHECK-LABEL: @agnostic_za_caller() +// CHECK-SAME: #[[ZA_AGNOSTIC:[0-9]+]] +// CHECK: call i32 @normal_callee() +// +int agnostic_za_caller() __arm_agnostic("sme_za_state") { + return normal_callee(); +} + +// CHECK-LABEL: @agnostic_za_callee() +// CHECK: call i32 @agnostic_za_decl() #[[ZA_AGNOSTIC_CALL:[0-9]+]] +// +int agnostic_za_callee() { + return agnostic_za_decl(); +} + +// CHECK-LABEL: @agnostic_za_callee_live_za() +// CHECK: call i32 @agnostic_za_decl() #[[ZA_AGNOSTIC_CALL]] +// +int agnostic_za_callee_live_za() __arm_inout("za") { + return agnostic_za_decl(); +} // Ensure that the attributes are correctly propagated to function types // and also to callsites. 
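// A sketch of that propagation through an indirect call (the function name
// is illustrative; the pointer syntax mirrors the mangling tests below).
// Because __arm_agnostic("sme_za_state") is part of the function type, the
// indirect callsite is also expected to carry "aarch64_za_state_agnostic":
//
//   int call_agnostic_ptr(int (*fn)() __arm_agnostic("sme_za_state")) {
//     return fn();
//   }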
@@ -289,12 +311,14 @@ int test_variadic_template() __arm_inout("za") { // CHECK: attributes #[[ZA_PRESERVED]] = { mustprogress noinline nounwind "aarch64_preserves_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } // CHECK: attributes #[[ZA_PRESERVED_DECL]] = { "aarch64_preserves_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } // CHECK: attributes #[[ZA_NEW]] = { mustprogress noinline nounwind "aarch64_new_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[ZA_AGNOSTIC]] = { mustprogress noinline nounwind "aarch64_za_state_agnostic" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } // CHECK: attributes #[[NORMAL_DEF]] = { mustprogress noinline nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } // CHECK: attributes #[[SM_ENABLED_CALL]] = { "aarch64_pstate_sm_enabled" } // CHECK: attributes #[[SM_COMPATIBLE_CALL]] = { "aarch64_pstate_sm_compatible" } // CHECK: attributes #[[SM_BODY_CALL]] = { "aarch64_pstate_sm_body" } // CHECK: attributes #[[ZA_SHARED_CALL]] = { "aarch64_inout_za" } // CHECK: attributes #[[ZA_PRESERVED_CALL]] = { "aarch64_preserves_za" } +// CHECK: attributes #[[ZA_AGNOSTIC_CALL]] = { "aarch64_za_state_agnostic" } // CHECK: attributes #[[NOUNWIND_CALL]] = { nounwind } // CHECK: attributes #[[NOUNWIND_SM_ENABLED_CALL]] = { nounwind "aarch64_pstate_sm_enabled" } // CHECK: attributes #[[NOUNWIND_SM_COMPATIBLE_CALL]] = { nounwind "aarch64_pstate_sm_compatible" } diff --git a/clang/test/CodeGenCXX/aarch64-mangle-sme-atts.cpp b/clang/test/CodeGenCXX/aarch64-mangle-sme-atts.cpp index 09db59ac621a2..3612c03a86efd 100644 --- a/clang/test/CodeGenCXX/aarch64-mangle-sme-atts.cpp +++ b/clang/test/CodeGenCXX/aarch64-mangle-sme-atts.cpp @@ -45,6 +45,16 @@ __arm_new("zt0") void fn_zt0_out(int (*foo)() __arm_out("zt0")) { foo(); } // CHECK: define dso_local void @_Z12fn_zt0_inoutP11__SME_ATTRSIFivELj192EE( __arm_new("zt0") void fn_zt0_inout(int (*foo)() __arm_inout("zt0")) { foo(); } +// +// __arm_agnostic("sme_za_state") Attribute +// + +// CHECK: define dso_local void @_Z24fn_sme_za_state_agnosticP11__SME_ATTRSIFvvELj4EE( +void fn_sme_za_state_agnostic(void (*foo)() __arm_agnostic("sme_za_state")) { foo(); } + +// CHECK: define dso_local void @_Z34fn_sme_za_state_streaming_agnosticP11__SME_ATTRSIFvvELj5EE( +void fn_sme_za_state_streaming_agnostic(void (*foo)() __arm_streaming __arm_agnostic("sme_za_state")) { foo(); } + // // Streaming-mode, ZA & ZT0 Attributes // diff --git a/clang/test/Sema/aarch64-sme-func-attrs.c b/clang/test/Sema/aarch64-sme-func-attrs.c index 0c263eb2610cf..1543e990dd042 100644 --- a/clang/test/Sema/aarch64-sme-func-attrs.c +++ b/clang/test/Sema/aarch64-sme-func-attrs.c @@ -9,6 +9,7 @@ void sme_arm_streaming_compatible(void) __arm_streaming_compatible; __arm_new("za") void sme_arm_new_za(void) {} void sme_arm_shared_za(void) __arm_inout("za"); void sme_arm_preserves_za(void) __arm_preserves("za"); +void sme_arm_agnostic(void) __arm_agnostic("sme_za_state"); __arm_new("za") void sme_arm_streaming_new_za(void) __arm_streaming {} void sme_arm_streaming_shared_za(void) __arm_streaming __arm_inout("za"); @@ -88,6 +89,26 @@ fptrty7 invalid_streaming_func() { return streaming_ptr_invalid; } // expected-error@+1 {{'__arm_streaming' only applies to function types; type here is 'void ()'}} void function_no_prototype() 
__arm_streaming; +// expected-cpp-error@+2 {{__arm_agnostic("sme_za_state") cannot share ZA state with its caller}} +// expected-error@+1 {{__arm_agnostic("sme_za_state") cannot share ZA state with its caller}} +void sme_arm_agnostic_shared_za_zt0(void) __arm_agnostic("sme_za_state") __arm_inout("zt0") {} + +// expected-cpp-error@+2 {{__arm_agnostic("sme_za_state") cannot share ZA state with its caller}} +// expected-error@+1 {{__arm_agnostic("sme_za_state") cannot share ZA state with its caller}} +void sme_arm_agnostic_shared_za_za(void) __arm_agnostic("sme_za_state") __arm_inout("za") {} + +// expected-cpp-error@+2 {{__arm_agnostic("sme_za_state") cannot share ZA state with its caller}} +// expected-error@+1 {{__arm_agnostic("sme_za_state") cannot share ZA state with its caller}} +void sme_arm_agnostic_shared_za_za_rev(void) __arm_inout("za") __arm_agnostic("sme_za_state") {} + +// expected-cpp-error@+2 {{__arm_agnostic("sme_za_state") is not supported together with __arm_new("za") or __arm_new("zt0")}} +// expected-error@+1 {{__arm_agnostic("sme_za_state") is not supported together with __arm_new("za") or __arm_new("zt0")}} +__arm_new("zt0") void sme_arm_agnostic_arm_new_zt0(void) __arm_agnostic("sme_za_state") {} + +// expected-cpp-error@+2 {{__arm_agnostic("sme_za_state") is not supported together with __arm_new("za") or __arm_new("zt0")}} +// expected-error@+1 {{__arm_agnostic("sme_za_state") is not supported together with __arm_new("za") or __arm_new("zt0")}} +__arm_new("za") void sme_arm_agnostic_arm_new_za(void) __arm_agnostic("sme_za_state") {} + // // Check for incorrect conversions of function pointers with the attributes // From 8df64ed77727ab9b7540819f2fe64379e88a50be Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 12 Jan 2025 22:03:21 +0000 Subject: [PATCH 218/408] [LV] Don't consider IV increments uniform if exit value is used outside. In some cases, there might be a chain of uniform instructions producing the exit value. To generate correct code in all cases, consider the IV increment not uniform, if there are users outside the loop. Instead, let VPlan narrow the IV, if possible using the logic from 3ff1d01985752. Test case from #122602 verified with Alive2: https://alive2.llvm.org/ce/z/bA4EGj Fixes https://github.com/llvm/llvm-project/issues/122496. Fixes https://github.com/llvm/llvm-project/issues/122602. --- .../Transforms/Vectorize/LoopVectorize.cpp | 2 +- .../Transforms/Vectorize/VPlanTransforms.cpp | 22 ----- .../LoopVectorize/X86/uniform-phi.ll | 2 - .../LoopVectorize/iv_outside_user.ll | 17 ++-- .../LoopVectorize/scalable-iv-outside-user.ll | 90 +++++++++++++++++++ 5 files changed, 101 insertions(+), 32 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/scalable-iv-outside-user.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index b017b61a45a0c..d32a463a996c4 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3806,7 +3806,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { // uniform after vectorization. 
bool UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool { auto *I = cast(U); - return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || + return I == Ind || Worklist.count(I) || IsVectorizedMemAccessUse(I, IndUpdate); }); if (!UniformIndUpdate) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index f440bf2eb0223..545d277d7aa01 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -621,28 +621,6 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { Def->replaceAllUsesWith(Clone); } - // Check if any uniform VPReplicateRecipes using the phi recipe are used by - // ExtractFromEnd. Those must be replaced by a regular VPReplicateRecipe to - // ensure the final value is available. - // TODO: Remove once uniformity analysis is done on VPlan. - for (VPUser *U : Users) { - auto *ExitIRI = dyn_cast(U); - VPValue *Op; - if (!ExitIRI || !match(ExitIRI->getOperand(0), - m_VPInstruction( - m_VPValue(Op), m_VPValue()))) - continue; - auto *RepR = dyn_cast(Op); - if (!RepR || !RepR->isUniform()) - continue; - assert(!RepR->isPredicated() && "RepR must not be predicated"); - Instruction *I = RepR->getUnderlyingInstr(); - auto *Clone = - new VPReplicateRecipe(I, RepR->operands(), /*IsUniform*/ false); - Clone->insertAfter(RepR); - RepR->replaceAllUsesWith(Clone); - } - // Replace wide pointer inductions which have only their scalars used by // PtrAdd(IndStart, ScalarIVSteps (0, Step)). if (auto *PtrIV = dyn_cast(&Phi)) { diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniform-phi.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform-phi.ll index f0641154f85c7..5e97627e7688b 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/uniform-phi.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/uniform-phi.ll @@ -51,8 +51,6 @@ for.end: ; CHECK-LABEL: goo ; Check %indvars.iv and %indvars.iv.next are uniform instructions even if they are used outside of loop. -; CHECK-DAG: LV: Found uniform instruction: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] -; CHECK-DAG: LV: Found uniform instruction: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ; CHECK-DAG: LV: Found uniform instruction: %exitcond = icmp eq i64 %indvars.iv, 1599 define i64 @goo(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) #0 { diff --git a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll index 6b0c677b56d2c..3e61546da2ceb 100644 --- a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll +++ b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll @@ -1176,7 +1176,6 @@ e.exit: } ; Test case for https://github.com/llvm/llvm-project/issues/122496. -; FIXME: Currently an incorrect live-out is used. 
define i32 @iv_ext_used_outside( ptr %dst) { ; VEC-LABEL: define i32 @iv_ext_used_outside( ; VEC-SAME: ptr [[DST:%.*]]) { @@ -1186,15 +1185,19 @@ define i32 @iv_ext_used_outside( ptr %dst) { ; VEC-NEXT: br label %[[VECTOR_BODY:.*]] ; VEC: [[VECTOR_BODY]]: ; VEC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VEC-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; VEC-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16 ; VEC-NEXT: [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0 ; VEC-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i16 [[TMP0]] ; VEC-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0 ; VEC-NEXT: store <2 x i32> zeroinitializer, ptr [[TMP2]], align 4 -; VEC-NEXT: [[TMP3:%.*]] = add nuw nsw i16 [[TMP0]], 1 +; VEC-NEXT: [[TMP5:%.*]] = add nuw nsw <2 x i16> [[VEC_IND]], splat (i16 1) +; VEC-NEXT: [[TMP3:%.*]] = extractelement <2 x i16> [[TMP5]], i32 0 ; VEC-NEXT: [[TMP4:%.*]] = zext nneg i16 [[TMP3]] to i32 -; VEC-NEXT: [[TMP5:%.*]] = zext nneg i16 [[TMP3]] to i32 +; VEC-NEXT: [[TMP8:%.*]] = extractelement <2 x i16> [[TMP5]], i32 1 +; VEC-NEXT: [[TMP7:%.*]] = zext nneg i16 [[TMP8]] to i32 ; VEC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; VEC-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2) ; VEC-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128 ; VEC-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; VEC: [[MIDDLE_BLOCK]]: @@ -1213,7 +1216,7 @@ define i32 @iv_ext_used_outside( ptr %dst) { ; VEC-NEXT: [[EC:%.*]] = icmp samesign ult i16 [[IV_1]], 128 ; VEC-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT]], {{!llvm.loop ![0-9]+}} ; VEC: [[EXIT]]: -; VEC-NEXT: [[IV_1_EXT_LCSSA:%.*]] = phi i32 [ [[IV_1_EXT]], %[[LOOP]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ] +; VEC-NEXT: [[IV_1_EXT_LCSSA:%.*]] = phi i32 [ [[IV_1_EXT]], %[[LOOP]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] ; VEC-NEXT: ret i32 [[IV_1_EXT_LCSSA]] ; ; INTERLEAVE-LABEL: define i32 @iv_ext_used_outside( @@ -1274,7 +1277,6 @@ exit: } ; Test case for https://github.com/llvm/llvm-project/issues/122602. -; FIXME: Currently an incorrect live-out is used. 
define i64 @test_iv_increment_incremented(ptr %dst) { ; VEC-LABEL: define i64 @test_iv_increment_incremented( ; VEC-SAME: ptr [[DST:%.*]]) { @@ -1288,8 +1290,9 @@ define i64 @test_iv_increment_incremented(ptr %dst) { ; VEC-NEXT: [[TMP2:%.*]] = getelementptr i16, ptr [[TMP1]], i32 -1 ; VEC-NEXT: store <2 x i16> splat (i16 1), ptr [[TMP2]], align 2 ; VEC-NEXT: [[TMP3:%.*]] = add i64 2, -1 +; VEC-NEXT: [[TMP5:%.*]] = add i64 1, -1 ; VEC-NEXT: [[TMP4:%.*]] = add i64 [[TMP3]], 1 -; VEC-NEXT: [[TMP5:%.*]] = add i64 [[TMP3]], 1 +; VEC-NEXT: [[TMP6:%.*]] = add i64 [[TMP5]], 1 ; VEC-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VEC: [[MIDDLE_BLOCK]]: ; VEC-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -1307,7 +1310,7 @@ define i64 @test_iv_increment_incremented(ptr %dst) { ; VEC-NEXT: [[IV_1_NEXT]] = add i64 [[IV_2_NEXT]], 1 ; VEC-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}} ; VEC: [[EXIT]]: -; VEC-NEXT: [[IV_1_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_1_NEXT]], %[[LOOP]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ] +; VEC-NEXT: [[IV_1_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_1_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ] ; VEC-NEXT: ret i64 [[IV_1_NEXT_LCSSA]] ; ; INTERLEAVE-LABEL: define i64 @test_iv_increment_incremented( diff --git a/llvm/test/Transforms/LoopVectorize/scalable-iv-outside-user.ll b/llvm/test/Transforms/LoopVectorize/scalable-iv-outside-user.ll new file mode 100644 index 0000000000000..d88b0583ffc04 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/scalable-iv-outside-user.ll @@ -0,0 +1,90 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -scalable-vectorization=on -force-target-supports-scalable-vectors=true -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=2 -S %s | FileCheck %s + +define i32 @iv_live_out_wide(ptr %dst) { +; CHECK-LABEL: define i32 @iv_live_out_wide( +; CHECK-SAME: ptr [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[STEP_1:%.*]] = sext i8 0 to i32 +; CHECK-NEXT: [[STEP_2:%.*]] = add nsw i32 [[STEP_1]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 2000, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 4 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 2000, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 2000, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.stepvector.nxv2i32() +; CHECK-NEXT: [[TMP8:%.*]] = mul [[TMP7]], splat (i32 1) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP8]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[STEP_2]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = 
phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 2 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i64 [[TMP13]] +; CHECK-NEXT: store zeroinitializer, ptr [[TMP11]], align 2 +; CHECK-NEXT: store zeroinitializer, ptr [[TMP14]], align 2 +; CHECK-NEXT: [[TMP15:%.*]] = add [[BROADCAST_SPLAT2]], [[STEP_ADD]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP6]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[STEP_ADD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], 2 +; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP18]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement [[TMP15]], i32 [[TMP19]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 2000, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[E_EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[IV]] +; CHECK-NEXT: store i16 0, ptr [[GEP_DST]], align 2 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[STEP_2]], [[IV]] +; CHECK-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[IV_NEXT]], 2000 +; CHECK-NEXT: br i1 [[CMP_I]], label %[[LOOP]], label %[[E_EXIT]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[E_EXIT]]: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[IV_NEXT]], %[[LOOP]] ], [ [[TMP20]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + %step.1 = sext i8 0 to i32 + %step.2 = add nsw i32 %step.1, 1 + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep.dst = getelementptr inbounds i16, ptr %dst, i32 %iv + store i16 0, ptr %gep.dst, align 2 + %iv.next = add i32 %step.2, %iv + %cmp.i = icmp slt i32 %iv.next, 2000 + br i1 %cmp.i, label %loop, label %e.exit + +e.exit: + %res = phi i32 [ %iv.next, %loop ] + ret i32 %res +} +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +;. From 08028d68a90bbc47464562a745e33fa10256a7d3 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Sun, 12 Jan 2025 22:06:28 +0000 Subject: [PATCH 219/408] [Clang] Fix buildbot failure introduced by #121788 Silences 'enumeration not handled in switch' warning, which causes buildbot failures with -Werror. 
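For reference, a reduced sketch of the failure mode; the enum and function
names here are illustrative, not the actual Clang code. A switch that covers
every enumerator and has no default case triggers the warning as soon as a
new kind is added without a matching case, and -Werror turns that into a
build failure:

  enum AttrKind { Streaming, StreamingCompatible, Agnostic };

  const char *spelling(AttrKind K) {
    switch (K) { // warning: enumeration value 'Agnostic' not handled in switch
    case Streaming:
      return "__arm_streaming";
    case StreamingCompatible:
      return "__arm_streaming_compatible";
    }
    return ""; // adding 'case Agnostic:' above silences the warning
  }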
--- clang/lib/AST/TypePrinter.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp index 9590145ffbd5a..a850410ffc846 100644 --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -2002,6 +2002,7 @@ void TypePrinter::printAttributedAfter(const AttributedType *T, case attr::CmseNSCall: case attr::AnnotateType: case attr::WebAssemblyFuncref: + case attr::ArmAgnostic: case attr::ArmStreaming: case attr::ArmStreamingCompatible: case attr::ArmIn: From 0e51b54b7ac02b0920e20b8ccae26b32bd6b6982 Mon Sep 17 00:00:00 2001 From: Justin Bogner Date: Sun, 12 Jan 2025 18:52:20 -0700 Subject: [PATCH 220/408] [DirectX] Implement the resource.store.rawbuffer intrinsic (#121282) This introduces `@llvm.dx.resource.store.rawbuffer` and generalizes the buffer store docs under DirectX/DXILResources. Fixes #106188 --- llvm/docs/DirectX/DXILResources.rst | 114 ++++++++++++-- llvm/include/llvm/IR/IntrinsicsDirectX.td | 4 + llvm/lib/Target/DirectX/DXIL.td | 20 +++ llvm/lib/Target/DirectX/DXILOpLowering.cpp | 82 ++++++---- llvm/test/CodeGen/DirectX/BufferStore-sm61.ll | 126 +++++++++++++++ .../CodeGen/DirectX/RawBufferStore-error64.ll | 20 +++ llvm/test/CodeGen/DirectX/RawBufferStore.ll | 144 ++++++++++++++++++ 7 files changed, 469 insertions(+), 41 deletions(-) create mode 100644 llvm/test/CodeGen/DirectX/BufferStore-sm61.ll create mode 100644 llvm/test/CodeGen/DirectX/RawBufferStore-error64.ll create mode 100644 llvm/test/CodeGen/DirectX/RawBufferStore.ll diff --git a/llvm/docs/DirectX/DXILResources.rst b/llvm/docs/DirectX/DXILResources.rst index 857d29e48363b..80e3c2c11153d 100644 --- a/llvm/docs/DirectX/DXILResources.rst +++ b/llvm/docs/DirectX/DXILResources.rst @@ -491,26 +491,28 @@ Examples: i32 %byte_offset, i32 0) -Texture and Typed Buffer Stores -------------------------------- +Stores +------ -*relevant types: Textures and TypedBuffer* +*relevant types: Textures and Buffer* -The `TextureStore`_ and `BufferStore`_ DXIL operations always write all four -32-bit components to a texture or a typed buffer. While both operations include -a mask parameter, it is specified that the mask must cover all components when -used with these types. +The `TextureStore`_, `BufferStore`_, and `RawBufferStore`_ DXIL operations +write four components to a texture or a buffer. These include a mask argument +that is used when fewer than 4 components are written, but notably this only +takes on the contiguous x, xy, xyz, and xyzw values. -The store operations that we define as intrinsics behave similarly, and will -only accept writes to the whole of the contained type. This differs from the -loads above, but this makes sense to do from a semantics preserving point of -view. Thus, texture and buffer stores may only operate on 4-element vectors of -types that are 32-bits or fewer, such as ``<4 x i32>``, ``<4 x float>``, and -``<4 x half>``, and 2 element vectors of 64-bit types like ``<2 x double>`` and -``<2 x i64>``. +We define the LLVM store intrinsics to accept vectors when storing multiple +components rather than using `undef` and a mask, but otherwise match the DXIL +ops fairly closely. -.. _BufferStore: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#bufferstore .. _TextureStore: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#texturestore +.. _BufferStore: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#bufferstore +.. 
_RawBufferStore: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#rawbufferstore + +For TypedBuffer, we only need one coordinate, and we must always write a vector +since partial writes aren't possible. Similarly to the load operations +described above, we handle 64-bit types specially and only handle 2-element +vectors rather than 4. Examples: @@ -548,3 +550,85 @@ Examples: target("dx.TypedBuffer", f16, 1, 0) %buf, i32 %index, <4 x f16> %data) call void @llvm.dx.resource.store.typedbuffer.tdx.Buffer_v2f64_1_0_0t( target("dx.TypedBuffer", f64, 1, 0) %buf, i32 %index, <2 x f64> %data) + +For RawBuffer, we need two indices and we accept scalars and vectors of 4 or +fewer elements. Note that we do allow vectors of 4 64-bit elements here. + +Examples: + +.. list-table:: ``@llvm.dx.resource.store.rawbuffer`` + :header-rows: 1 + + * - Argument + - + - Type + - Description + * - Return value + - + - ``void`` + - + * - ``%buffer`` + - 0 + - ``target(dx.RawBuffer, ...)`` + - The buffer to store into + * - ``%index`` + - 1 + - ``i32`` + - Index into the buffer + * - ``%offset`` + - 2 + - ``i32`` + - Byte offset into structured buffer elements + * - ``%data`` + - 3 + - Scalar or vector + - The data to store + +Examples: + +.. code-block:: llvm + + ; float + call void @llvm.dx.resource.store.rawbuffer.tdx.RawBuffer_f32_1_0_0t.f32( + target("dx.RawBuffer", float, 1, 0, 0) %buffer, + i32 %index, i32 0, float %data) + call void @llvm.dx.resource.store.rawbuffer.tdx.RawBuffer_i8_1_0_0t.f32( + target("dx.RawBuffer", i8, 1, 0, 0) %buffer, + i32 %index, i32 0, float %data) + + ; float4 + call void @llvm.dx.resource.store.rawbuffer.tdx.RawBuffer_v4f32_1_0_0t.v4f32( + target("dx.RawBuffer", <4 x float>, 1, 0, 0) %buffer, + i32 %index, i32 0, <4 x float> %data) + call void @llvm.dx.resource.store.rawbuffer.tdx.RawBuffer_i8_1_0_0t.v4f32( + target("dx.RawBuffer", i8, 1, 0, 0) %buffer, + i32 %index, i32 0, <4 x float> %data) + + ; struct S0 { float4 f; int4 i; } + call void @llvm.dx.resource.store.rawbuffer.v4f32( + target("dx.RawBuffer", { <4 x float>, <4 x i32> }, 1, 0, 0) %buffer, + i32 %index, i32 0, <4 x float> %data0) + call void @llvm.dx.resource.store.rawbuffer.v4i32( + target("dx.RawBuffer", { <4 x float>, <4 x i32> }, 1, 0, 0) %buffer, + i32 %index, i32 16, <4 x i32> %data1) + + ; struct Q { float4 f; int3 i; } + ; struct R { int z; S x; } + call void @llvm.dx.resource.store.rawbuffer.i32( + target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0) + %buffer, + i32 %index, i32 0, i32 %data0) + call void @llvm.dx.resource.store.rawbuffer.v4f32( + target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0) + %buffer, + i32 %index, i32 4, <4 x float> %data1) + call void @llvm.dx.resource.store.rawbuffer.v3f16( + target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0) + %buffer, + i32 %index, i32 20, <3 x half> %data2) + + ; byteaddressbuf.Store + call void @llvm.dx.resource.store.rawbuffer.tdx.RawBuffer_i8_1_0_0t.v4f64( + target("dx.RawBuffer", i8, 1, 0, 0) %buffer, + i32 %index, i32 0, <4 x double> %data) + diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index ef48af5b42dbf..2a56ba78ce88e 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -40,6 +40,10 @@ def int_dx_resource_load_rawbuffer : DefaultAttrsIntrinsic<[llvm_any_ty, llvm_i1_ty], [llvm_any_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadMem]>; +def int_dx_resource_store_rawbuffer + : 
DefaultAttrsIntrinsic< + [], [llvm_any_ty, llvm_i32_ty, llvm_i32_ty, llvm_any_ty], + [IntrWriteMem]>; def int_dx_resource_updatecounter : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_any_ty, llvm_i8_ty], diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 62b5b704e99eb..6fdd83c4dc877 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -909,6 +909,26 @@ def RawBufferLoad : DXILOp<139, rawBufferLoad> { let stages = [Stages]; } +def RawBufferStore : DXILOp<140, rawBufferStore> { + let Doc = "writes to a RWByteAddressBuffer or RWStructuredBuffer"; + // Handle, Coord0, Coord1, Val0, Val1, Val2, Val3, Mask, Alignment + let arguments = [ + HandleTy, Int32Ty, Int32Ty, OverloadTy, OverloadTy, OverloadTy, OverloadTy, + Int8Ty, Int32Ty + ]; + let result = VoidTy; + let overloads = [ + Overloads, + Overloads + ]; + let stages = [Stages]; +} + def Dot4AddI8Packed : DXILOp<163, dot4AddPacked> { let Doc = "signed dot product of 4 x i8 vectors packed into i32, with " "accumulate to i32"; diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp index f43815bf21166..0c245c1a43d31 100644 --- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp +++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp @@ -616,7 +616,10 @@ class OpLowerer { return false; } - [[nodiscard]] bool lowerTypedBufferStore(Function &F) { + [[nodiscard]] bool lowerBufferStore(Function &F, bool IsRaw) { + Triple TT(Triple(M.getTargetTriple())); + VersionTuple DXILVersion = TT.getDXILVersion(); + const DataLayout &DL = F.getDataLayout(); IRBuilder<> &IRB = OpBuilder.getIRB(); Type *Int8Ty = IRB.getInt8Ty(); Type *Int32Ty = IRB.getInt32Ty(); @@ -627,51 +630,75 @@ class OpLowerer { Value *Handle = createTmpHandleCast(CI->getArgOperand(0), OpBuilder.getHandleType()); Value *Index0 = CI->getArgOperand(1); - Value *Index1 = UndefValue::get(Int32Ty); - // For typed stores, the mask must always cover all four elements. - Constant *Mask = ConstantInt::get(Int8Ty, 0xF); + Value *Index1 = IsRaw ? CI->getArgOperand(2) : UndefValue::get(Int32Ty); + + Value *Data = CI->getArgOperand(IsRaw ? 3 : 2); + Type *DataTy = Data->getType(); + Type *ScalarTy = DataTy->getScalarType(); - Value *Data = CI->getArgOperand(2); - auto *DataTy = dyn_cast(Data->getType()); - if (!DataTy || DataTy->getNumElements() != 4) + uint64_t NumElements = + DL.getTypeSizeInBits(DataTy) / DL.getTypeSizeInBits(ScalarTy); + Value *Mask = ConstantInt::get(Int8Ty, ~(~0U << NumElements)); + + // TODO: check that we only have vector or scalar... + if (!IsRaw && NumElements != 4) return make_error( "typedBufferStore data must be a vector of 4 elements", inconvertibleErrorCode()); + else if (NumElements > 4) + return make_error( + "rawBufferStore data must have at most 4 elements", + inconvertibleErrorCode()); - // Since we're post-scalarizer, we likely have a vector that's constructed - // solely for the argument of the store. If so, just use the scalar values - // from before they're inserted into the temporary. 
std::array DataElements{nullptr, nullptr, nullptr, nullptr}; - auto *IEI = dyn_cast(Data); - while (IEI) { - auto *IndexOp = dyn_cast(IEI->getOperand(2)); - if (!IndexOp) - break; - size_t IndexVal = IndexOp->getZExtValue(); - assert(IndexVal < 4 && "Too many elements for buffer store"); - DataElements[IndexVal] = IEI->getOperand(1); - IEI = dyn_cast(IEI->getOperand(0)); + if (DataTy == ScalarTy) + DataElements[0] = Data; + else { + // Since we're post-scalarizer, if we see a vector here it's likely + // constructed solely for the argument of the store. Just use the scalar + // values from before they're inserted into the temporary. + auto *IEI = dyn_cast(Data); + while (IEI) { + auto *IndexOp = dyn_cast(IEI->getOperand(2)); + if (!IndexOp) + break; + size_t IndexVal = IndexOp->getZExtValue(); + assert(IndexVal < 4 && "Too many elements for buffer store"); + DataElements[IndexVal] = IEI->getOperand(1); + IEI = dyn_cast(IEI->getOperand(0)); + } } // If for some reason we weren't able to forward the arguments from the - // scalarizer artifact, then we need to actually extract elements from the - // vector. - for (int I = 0, E = 4; I != E; ++I) + // scalarizer artifact, then we may need to actually extract elements from + // the vector. + for (int I = 0, E = NumElements; I < E; ++I) if (DataElements[I] == nullptr) DataElements[I] = IRB.CreateExtractElement(Data, ConstantInt::get(Int32Ty, I)); + // For any elements beyond the length of the vector, fill up with undef. + for (int I = NumElements, E = 4; I < E; ++I) + if (DataElements[I] == nullptr) + DataElements[I] = UndefValue::get(ScalarTy); - std::array Args{ + dxil::OpCode Op = OpCode::BufferStore; + SmallVector Args{ Handle, Index0, Index1, DataElements[0], DataElements[1], DataElements[2], DataElements[3], Mask}; + if (IsRaw && DXILVersion >= VersionTuple(1, 2)) { + Op = OpCode::RawBufferStore; + // RawBufferStore requires the alignment + Args.push_back( + ConstantInt::get(Int32Ty, DL.getPrefTypeAlign(ScalarTy).value())); + } Expected OpCall = - OpBuilder.tryCreateOp(OpCode::BufferStore, Args, CI->getName()); + OpBuilder.tryCreateOp(Op, Args, CI->getName()); if (Error E = OpCall.takeError()) return E; CI->eraseFromParent(); // Clean up any leftover `insertelement`s - IEI = dyn_cast(Data); + auto *IEI = dyn_cast(Data); while (IEI && IEI->use_empty()) { InsertElementInst *Tmp = IEI; IEI = dyn_cast(IEI->getOperand(0)); @@ -776,11 +803,14 @@ class OpLowerer { HasErrors |= lowerTypedBufferLoad(F, /*HasCheckBit=*/true); break; case Intrinsic::dx_resource_store_typedbuffer: - HasErrors |= lowerTypedBufferStore(F); + HasErrors |= lowerBufferStore(F, /*IsRaw=*/false); break; case Intrinsic::dx_resource_load_rawbuffer: HasErrors |= lowerRawBufferLoad(F); break; + case Intrinsic::dx_resource_store_rawbuffer: + HasErrors |= lowerBufferStore(F, /*IsRaw=*/true); + break; case Intrinsic::dx_resource_updatecounter: HasErrors |= lowerUpdateCounter(F); break; diff --git a/llvm/test/CodeGen/DirectX/BufferStore-sm61.ll b/llvm/test/CodeGen/DirectX/BufferStore-sm61.ll new file mode 100644 index 0000000000000..1916cdf374455 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/BufferStore-sm61.ll @@ -0,0 +1,126 @@ +; RUN: opt -S -dxil-op-lower %s | FileCheck %s +; Before SM6.2 ByteAddressBuffer and StructuredBuffer lower to bufferStore. 
+ +target triple = "dxil-pc-shadermodel6.1-compute" + +; CHECK-LABEL: define void @storef32_struct +define void @storef32_struct(i32 %index, float %data) { + %buffer = call target("dx.RawBuffer", float, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: call void @dx.op.bufferStore.f32(i32 69, %dx.types.Handle %{{.*}}, i32 %index, i32 0, float %data, float undef, float undef, float undef, i8 1) + call void @llvm.dx.resource.store.rawbuffer.f32( + target("dx.RawBuffer", float, 1, 0, 0) %buffer, + i32 %index, i32 0, float %data) + + ret void +} + +; CHECK-LABEL: define void @storef32_byte +define void @storef32_byte(i32 %offset, float %data) { + %buffer = call target("dx.RawBuffer", i8, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: call void @dx.op.bufferStore.f32(i32 69, %dx.types.Handle %{{.*}}, i32 %offset, i32 0, float %data, float undef, float undef, float undef, i8 1) + call void @llvm.dx.resource.store.rawbuffer.f32( + target("dx.RawBuffer", i8, 1, 0, 0) %buffer, + i32 %offset, i32 0, float %data) + + ret void +} + +; CHECK-LABEL: define void @storev4f32_struct +define void @storev4f32_struct(i32 %index, <4 x float> %data) { + %buffer = call target("dx.RawBuffer", <4 x float>, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_v4f32_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: [[DATA0:%.*]] = extractelement <4 x float> %data, i32 0 + ; CHECK: [[DATA1:%.*]] = extractelement <4 x float> %data, i32 1 + ; CHECK: [[DATA2:%.*]] = extractelement <4 x float> %data, i32 2 + ; CHECK: [[DATA3:%.*]] = extractelement <4 x float> %data, i32 3 + ; CHECK: call void @dx.op.bufferStore.f32(i32 69, %dx.types.Handle %{{.*}}, i32 %index, i32 0, float [[DATA0]], float [[DATA1]], float [[DATA2]], float [[DATA3]], i8 15) + call void @llvm.dx.resource.store.rawbuffer.v4f32( + target("dx.RawBuffer", <4 x float>, 1, 0, 0) %buffer, + i32 %index, i32 0, <4 x float> %data) + + ret void +} + +; CHECK-LABEL: define void @storev4f32_byte +define void @storev4f32_byte(i32 %offset, <4 x float> %data) { + %buffer = call target("dx.RawBuffer", i8, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: [[DATA0:%.*]] = extractelement <4 x float> %data, i32 0 + ; CHECK: [[DATA1:%.*]] = extractelement <4 x float> %data, i32 1 + ; CHECK: [[DATA2:%.*]] = extractelement <4 x float> %data, i32 2 + ; CHECK: [[DATA3:%.*]] = extractelement <4 x float> %data, i32 3 + ; CHECK: call void @dx.op.bufferStore.f32(i32 69, %dx.types.Handle %{{.*}}, i32 %offset, i32 0, float [[DATA0]], float [[DATA1]], float [[DATA2]], float [[DATA3]], i8 15) + call void @llvm.dx.resource.store.rawbuffer.v4f32( + target("dx.RawBuffer", i8, 1, 0, 0) %buffer, + i32 %offset, i32 0, <4 x float> %data) + + ret void +} + +; CHECK-LABEL: define void @storeelements +define void @storeelements(i32 %index, <4 x float> %data0, <4 x i32> %data1) { + %buffer = call target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_sl_v4f32v4i32s_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: [[DATA0_0:%.*]] = extractelement <4 x float> %data0, i32 0 + ; CHECK: [[DATA0_1:%.*]] = extractelement <4 x float> %data0, i32 1 + ; CHECK: [[DATA0_2:%.*]] = extractelement <4 x float> %data0, i32 2 + ; CHECK: [[DATA0_3:%.*]] = extractelement <4 x float> %data0, i32 3 + ; CHECK: call 
void @dx.op.bufferStore.f32(i32 69, %dx.types.Handle %{{.*}}, i32 %index, i32 0, float [[DATA0_0]], float [[DATA0_1]], float [[DATA0_2]], float [[DATA0_3]], i8 15) + call void @llvm.dx.resource.store.rawbuffer.v4f32( + target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 1, 0, 0) %buffer, + i32 %index, i32 0, <4 x float> %data0) + + ; CHECK: [[DATA1_0:%.*]] = extractelement <4 x i32> %data1, i32 0 + ; CHECK: [[DATA1_1:%.*]] = extractelement <4 x i32> %data1, i32 1 + ; CHECK: [[DATA1_2:%.*]] = extractelement <4 x i32> %data1, i32 2 + ; CHECK: [[DATA1_3:%.*]] = extractelement <4 x i32> %data1, i32 3 + ; CHECK: call void @dx.op.bufferStore.i32(i32 69, %dx.types.Handle %{{.*}}, i32 %index, i32 16, i32 [[DATA1_0]], i32 [[DATA1_1]], i32 [[DATA1_2]], i32 [[DATA1_3]], i8 15) + call void @llvm.dx.resource.store.rawbuffer.v4i32( + target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 1, 0, 0) %buffer, + i32 %index, i32 16, <4 x i32> %data1) + + ret void +} + +; CHECK-LABEL: define void @storenested +define void @storenested(i32 %index, i32 %data0, <4 x float> %data1, <3 x half> %data2) { + %buffer = call + target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0) + @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: call void @dx.op.bufferStore.i32(i32 69, %dx.types.Handle %{{.*}}, i32 %index, i32 0, i32 %data0, i32 undef, i32 undef, i32 undef, i8 1) + call void @llvm.dx.resource.store.rawbuffer.i32( + target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0) %buffer, + i32 %index, i32 0, i32 %data0) + + ; CHECK: [[DATA1_0:%.*]] = extractelement <4 x float> %data1, i32 0 + ; CHECK: [[DATA1_1:%.*]] = extractelement <4 x float> %data1, i32 1 + ; CHECK: [[DATA1_2:%.*]] = extractelement <4 x float> %data1, i32 2 + ; CHECK: [[DATA1_3:%.*]] = extractelement <4 x float> %data1, i32 3 + ; CHECK: call void @dx.op.bufferStore.f32(i32 69, %dx.types.Handle %{{.*}}, i32 %index, i32 4, float [[DATA1_0]], float [[DATA1_1]], float [[DATA1_2]], float [[DATA1_3]], i8 15) + call void @llvm.dx.resource.store.rawbuffer.v4f32( + target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0) %buffer, + i32 %index, i32 4, <4 x float> %data1) + + ; CHECK: [[DATA2_0:%.*]] = extractelement <3 x half> %data2, i32 0 + ; CHECK: [[DATA2_1:%.*]] = extractelement <3 x half> %data2, i32 1 + ; CHECK: [[DATA2_2:%.*]] = extractelement <3 x half> %data2, i32 2 + ; CHECK: call void @dx.op.bufferStore.f16(i32 69, %dx.types.Handle %{{.*}}, i32 %index, i32 20, half [[DATA2_0]], half [[DATA2_1]], half [[DATA2_2]], half undef, i8 7) + call void @llvm.dx.resource.store.rawbuffer.v3f16( + target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0) %buffer, + i32 %index, i32 20, <3 x half> %data2) + + ret void +} diff --git a/llvm/test/CodeGen/DirectX/RawBufferStore-error64.ll b/llvm/test/CodeGen/DirectX/RawBufferStore-error64.ll new file mode 100644 index 0000000000000..a883a0bbc29fd --- /dev/null +++ b/llvm/test/CodeGen/DirectX/RawBufferStore-error64.ll @@ -0,0 +1,20 @@ +; We use llc for this test so that we don't abort after the first error. 
+; RUN: not llc %s -o /dev/null 2>&1 | FileCheck %s + +target triple = "dxil-pc-shadermodel6.2-compute" + +; Can't store 64 bit types directly until SM6.3 (byteaddressbuf.Store) +; CHECK: error: +; CHECK-SAME: in function storev4f64_byte +; CHECK-SAME: Cannot create RawBufferStore operation: Invalid overload type +define void @storev4f64_byte(i32 %offset, <4 x double> %data) "hlsl.export" { + %buffer = call target("dx.RawBuffer", i8, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + call void @llvm.dx.resource.store.rawbuffer.v4i64( + target("dx.RawBuffer", i8, 1, 0, 0) %buffer, + i32 %offset, i32 0, <4 x double> %data) + + ret void +} diff --git a/llvm/test/CodeGen/DirectX/RawBufferStore.ll b/llvm/test/CodeGen/DirectX/RawBufferStore.ll new file mode 100644 index 0000000000000..96824d5ee5a4a --- /dev/null +++ b/llvm/test/CodeGen/DirectX/RawBufferStore.ll @@ -0,0 +1,144 @@ +; RUN: opt -S -dxil-op-lower %s | FileCheck %s + +target triple = "dxil-pc-shadermodel6.6-compute" + +; CHECK-LABEL: define void @storef32_struct +define void @storef32_struct(i32 %index, float %data) { + %buffer = call target("dx.RawBuffer", float, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %buffer_annot, i32 %index, i32 0, float %data, float undef, float undef, float undef, i8 1, i32 4) + call void @llvm.dx.resource.store.rawbuffer.f32( + target("dx.RawBuffer", float, 1, 0, 0) %buffer, + i32 %index, i32 0, float %data) + + ret void +} + +; CHECK-LABEL: define void @storef32_byte +define void @storef32_byte(i32 %offset, float %data) { + %buffer = call target("dx.RawBuffer", i8, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %buffer_annot, i32 %offset, i32 0, float %data, float undef, float undef, float undef, i8 1, i32 4) + call void @llvm.dx.resource.store.rawbuffer.f32( + target("dx.RawBuffer", i8, 1, 0, 0) %buffer, + i32 %offset, i32 0, float %data) + + ret void +} + +; CHECK-LABEL: define void @storev4f32_struct +define void @storev4f32_struct(i32 %index, <4 x float> %data) { + %buffer = call target("dx.RawBuffer", <4 x float>, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_v4f32_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: [[DATA0:%.*]] = extractelement <4 x float> %data, i32 0 + ; CHECK: [[DATA1:%.*]] = extractelement <4 x float> %data, i32 1 + ; CHECK: [[DATA2:%.*]] = extractelement <4 x float> %data, i32 2 + ; CHECK: [[DATA3:%.*]] = extractelement <4 x float> %data, i32 3 + ; CHECK: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %buffer_annot, i32 %index, i32 0, float [[DATA0]], float [[DATA1]], float [[DATA2]], float [[DATA3]], i8 15, i32 4) + call void @llvm.dx.resource.store.rawbuffer.v4f32( + target("dx.RawBuffer", <4 x float>, 1, 0, 0) %buffer, + i32 %index, i32 0, <4 x float> %data) + + ret void +} + +; CHECK-LABEL: define void @storev4f32_byte +define void @storev4f32_byte(i32 %offset, <4 x float> %data) { + %buffer = call target("dx.RawBuffer", i8, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: [[DATA0:%.*]] = extractelement <4 x float> %data, i32 0 + ; CHECK: [[DATA1:%.*]] = extractelement <4 x float> %data, i32 1 + ; CHECK: [[DATA2:%.*]] = 
extractelement <4 x float> %data, i32 2 + ; CHECK: [[DATA3:%.*]] = extractelement <4 x float> %data, i32 3 + ; CHECK: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %buffer_annot, i32 %offset, i32 0, float [[DATA0]], float [[DATA1]], float [[DATA2]], float [[DATA3]], i8 15, i32 4) + call void @llvm.dx.resource.store.rawbuffer.v4f32( + target("dx.RawBuffer", i8, 1, 0, 0) %buffer, + i32 %offset, i32 0, <4 x float> %data) + + ret void +} + +; CHECK-LABEL: define void @storeelements +define void @storeelements(i32 %index, <4 x float> %data0, <4 x i32> %data1) { + %buffer = call target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_sl_v4f32v4i32s_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: [[DATA0_0:%.*]] = extractelement <4 x float> %data0, i32 0 + ; CHECK: [[DATA0_1:%.*]] = extractelement <4 x float> %data0, i32 1 + ; CHECK: [[DATA0_2:%.*]] = extractelement <4 x float> %data0, i32 2 + ; CHECK: [[DATA0_3:%.*]] = extractelement <4 x float> %data0, i32 3 + ; CHECK: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %buffer_annot, i32 %index, i32 0, float [[DATA0_0]], float [[DATA0_1]], float [[DATA0_2]], float [[DATA0_3]], i8 15, i32 4) + call void @llvm.dx.resource.store.rawbuffer.v4f32( + target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 1, 0, 0) %buffer, + i32 %index, i32 0, <4 x float> %data0) + + ; CHECK: [[DATA1_0:%.*]] = extractelement <4 x i32> %data1, i32 0 + ; CHECK: [[DATA1_1:%.*]] = extractelement <4 x i32> %data1, i32 1 + ; CHECK: [[DATA1_2:%.*]] = extractelement <4 x i32> %data1, i32 2 + ; CHECK: [[DATA1_3:%.*]] = extractelement <4 x i32> %data1, i32 3 + ; CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %buffer_annot, i32 %index, i32 16, i32 [[DATA1_0]], i32 [[DATA1_1]], i32 [[DATA1_2]], i32 [[DATA1_3]], i8 15, i32 4) + call void @llvm.dx.resource.store.rawbuffer.v4i32( + target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 1, 0, 0) %buffer, + i32 %index, i32 16, <4 x i32> %data1) + + ret void +} + +; CHECK-LABEL: define void @storenested +define void @storenested(i32 %index, i32 %data0, <4 x float> %data1, <3 x half> %data2) { + %buffer = call + target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0) + @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %buffer_annot, i32 %index, i32 0, i32 %data0, i32 undef, i32 undef, i32 undef, i8 1, i32 4) + call void @llvm.dx.resource.store.rawbuffer.i32( + target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0) %buffer, + i32 %index, i32 0, i32 %data0) + + ; CHECK: [[DATA1_0:%.*]] = extractelement <4 x float> %data1, i32 0 + ; CHECK: [[DATA1_1:%.*]] = extractelement <4 x float> %data1, i32 1 + ; CHECK: [[DATA1_2:%.*]] = extractelement <4 x float> %data1, i32 2 + ; CHECK: [[DATA1_3:%.*]] = extractelement <4 x float> %data1, i32 3 + ; CHECK: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %buffer_annot, i32 %index, i32 4, float [[DATA1_0]], float [[DATA1_1]], float [[DATA1_2]], float [[DATA1_3]], i8 15, i32 4) + call void @llvm.dx.resource.store.rawbuffer.v4f32( + target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0) %buffer, + i32 %index, i32 4, <4 x float> %data1) + + ; CHECK: [[DATA2_0:%.*]] = extractelement <3 x half> %data2, i32 0 + ; CHECK: [[DATA2_1:%.*]] = extractelement <3 x half> %data2, i32 1 + ; CHECK: [[DATA2_2:%.*]] = extractelement <3 x half> %data2, i32 2 + ; CHECK: call void 
@dx.op.rawBufferStore.f16(i32 140, %dx.types.Handle %buffer_annot, i32 %index, i32 20, half [[DATA2_0]], half [[DATA2_1]], half [[DATA2_2]], half undef, i8 7, i32 2)
+  call void @llvm.dx.resource.store.rawbuffer.v3f16(
+      target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0) %buffer,
+      i32 %index, i32 20, <3 x half> %data2)
+
+  ret void
+}
+
+; byteaddressbuf.Store
+; CHECK-LABEL: define void @storev4f64_byte
+define void @storev4f64_byte(i32 %offset, <4 x double> %data) {
+  %buffer = call target("dx.RawBuffer", i8, 1, 0, 0)
+      @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; CHECK: [[DATA0:%.*]] = extractelement <4 x double> %data, i32 0
+  ; CHECK: [[DATA1:%.*]] = extractelement <4 x double> %data, i32 1
+  ; CHECK: [[DATA2:%.*]] = extractelement <4 x double> %data, i32 2
+  ; CHECK: [[DATA3:%.*]] = extractelement <4 x double> %data, i32 3
+  ; CHECK: call void @dx.op.rawBufferStore.f64(i32 140, %dx.types.Handle %buffer_annot, i32 %offset, i32 0, double [[DATA0]], double [[DATA1]], double [[DATA2]], double [[DATA3]], i8 15, i32 8)
+  call void @llvm.dx.resource.store.rawbuffer.v4i64(
+      target("dx.RawBuffer", i8, 1, 0, 0) %buffer,
+      i32 %offset, i32 0, <4 x double> %data)
+
+  ret void
+}

From acbd822879f7727127926c25e1b47f5017f962c5 Mon Sep 17 00:00:00 2001
From: Bill Hoffman
Date: Sun, 12 Jan 2025 21:20:20 -0500
Subject: [PATCH 221/408] Fix print module manifest file for macos (#122370)

This commit fixes -print-library-module-manifest-path on macOS. Previously
it only worked on Linux, because on macOS the library and header files are
installed in different locations: the module manifest sits next to the
libraries, and the search function was not looking in both places. A test
is included.
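Concretely, with the macOS-style layout that the test below constructs
(illustrative paths, mirroring the test's Inputs tree):

  <root>/usr/lib/libc++.so             # or libc++.a
  <root>/usr/lib/libc++.modules.json   # manifest next to the libraries
  <root>/usr/lib/clang/20/             # resource directory (headers)

the resource directory is <root>/usr/lib/clang/20, so appending "..", ".."
to it (as the Driver.cpp hunk below does) lands in <root>/usr/lib, where the
manifest actually lives.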
---
 clang/lib/Driver/Driver.cpp                   |  5 ++++
 ...les-print-library-module-manifest-path.cpp | 26 +++++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 10df730744b08..9a947f32283c3 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -6398,6 +6398,11 @@ std::string Driver::GetFilePath(StringRef Name, const ToolChain &TC) const {
   if (auto P = SearchPaths(TC.getFilePaths()))
     return *P;
 
+  SmallString<128> R2(ResourceDir);
+  llvm::sys::path::append(R2, "..", "..", Name);
+  if (llvm::sys::fs::exists(Twine(R2)))
+    return std::string(R2);
+
   return std::string(Name);
 }
 
diff --git a/clang/test/Driver/modules-print-library-module-manifest-path.cpp b/clang/test/Driver/modules-print-library-module-manifest-path.cpp
index 3ba2709ad95cc..8d17fe1549e34 100644
--- a/clang/test/Driver/modules-print-library-module-manifest-path.cpp
+++ b/clang/test/Driver/modules-print-library-module-manifest-path.cpp
@@ -18,6 +18,28 @@
 // RUN:   --target=x86_64-linux-gnu 2>&1 \
 // RUN:   | FileCheck libcxx.cpp
 
+// On macOS the directory structure is different: the library and the
+// libc++.modules.json file live directly in lib, while the headers are
+// in the clang/ver directory, which is the resource directory, so the
+// search must look in both places.
+// RUN: mkdir -p %t/Inputs/usr/lib/clang/20
+// RUN: touch %t/Inputs/usr/lib/libc++.so
+// RUN: touch %t/Inputs/usr/lib/libc++.modules.json
+// RUN: %clang -print-library-module-manifest-path \
+// RUN:   -stdlib=libc++ \
+// RUN:   -resource-dir=%t/Inputs/usr/lib/clang/20 \
+// RUN:   --target=arm64-apple-darwin24.1.0 2>&1 \
+// RUN:   | FileCheck libcxx.cpp.macos
+
+// RUN: rm %t/Inputs/usr/lib/libc++.so
+// RUN: touch %t/Inputs/usr/lib/libc++.a
+// RUN: touch %t/Inputs/usr/lib/libc++.modules.json
+// RUN: %clang -print-library-module-manifest-path \
+// RUN:   -stdlib=libc++ \
+// RUN:   -resource-dir=%t/Inputs/usr/lib/clang/20 \
+// RUN:   --target=arm64-apple-darwin24.1.0 2>&1 \
+// RUN:   | FileCheck libcxx.cpp.macos
+
 // RUN: rm %t/Inputs/usr/lib/x86_64-linux-gnu/libc++.so
 // RUN: touch %t/Inputs/usr/lib/x86_64-linux-gnu/libc++.a
 // RUN: %clang -print-library-module-manifest-path \
@@ -40,6 +62,10 @@
 
 // CHECK: {{.*}}/Inputs/usr/lib/x86_64-linux-gnu{{/|\\}}libc++.modules.json
 
+//--- libcxx.cpp.macos
+
+// CHECK: {{.*}}libc++.modules.json
+
 //--- libcxx-no-shared-lib.cpp
 
 // Note this might find a different path depending whether search path

From 59bba39a692fd371d0dd0e6baba49a414bf7d855 Mon Sep 17 00:00:00 2001
From: Pengcheng Wang
Date: Mon, 13 Jan 2025 11:28:24 +0800
Subject: [PATCH 222/408] [RISCV] Rework memcpy test (#120364)

Use descriptive names and add more cases.
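For reference, the cases in the reworked test all follow the same shape: a
fixed-length llvm.memcpy that the backend expands inline, e.g. (distilled
from the cases below):

  define void @unaligned_memcpy8(ptr nocapture %dest, ptr %src) nounwind {
  entry:
    tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 8, i1 false)
    ret void
  }

Without +unaligned-scalar-mem the RV32/RV64 backends expand this as byte
loads and stores (lbu/sb); with the feature, or when the pointers carry an
explicit `align 8`, wider lw/sw and ld/sd pairs are emitted instead. The
aligned_*/unaligned_* names together with the RV32/RV64, *-FAST, and *-BOTH
check prefixes make that matrix explicit.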
--- llvm/test/CodeGen/RISCV/memcpy.ll | 1180 +++++++++++++++++++++-------- 1 file changed, 878 insertions(+), 302 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/memcpy.ll b/llvm/test/CodeGen/RISCV/memcpy.ll index 1ab3722080f70..ce47476de9ce8 100644 --- a/llvm/test/CodeGen/RISCV/memcpy.ll +++ b/llvm/test/CodeGen/RISCV/memcpy.ll @@ -7,406 +7,935 @@ ; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32-FAST ; RUN: llc < %s -mtriple=riscv64 -mattr=+unaligned-scalar-mem \ ; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64-FAST -%struct.x = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 } -@src = external dso_local global %struct.x -@dst = external dso_local global %struct.x +; ---------------------------------------------------------------------- +; Fully unaligned cases -@.str1 = private unnamed_addr constant [31 x i8] c"DHRYSTONE PROGRAM, SOME STRING\00", align 1 -@.str2 = private unnamed_addr constant [36 x i8] c"DHRYSTONE PROGRAM, SOME STRING BLAH\00", align 1 -@.str3 = private unnamed_addr constant [24 x i8] c"DHRYSTONE PROGRAM, SOME\00", align 1 -@.str4 = private unnamed_addr constant [18 x i8] c"DHRYSTONE PROGR \00", align 1 -@.str5 = private unnamed_addr constant [7 x i8] c"DHRYST\00", align 1 -@.str6 = private unnamed_addr constant [14 x i8] c"/tmp/rmXXXXXX\00", align 1 -@spool.splbuf = internal global [512 x i8] zeroinitializer, align 16 +define void @unaligned_memcpy0(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: unaligned_memcpy0: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: unaligned_memcpy0: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 0, i1 false) + ret void +} -define i32 @t0() { -; RV32-LABEL: t0: +define void @unaligned_memcpy1(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: unaligned_memcpy1: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lbu a1, 0(a1) +; RV32-BOTH-NEXT: sb a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: unaligned_memcpy1: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: lbu a1, 0(a1) +; RV64-BOTH-NEXT: sb a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 1, i1 false) + ret void +} + +define void @unaligned_memcpy2(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy2: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lui a0, %hi(src) -; RV32-NEXT: lw a1, %lo(src)(a0) -; RV32-NEXT: lui a2, %hi(dst) -; RV32-NEXT: addi a0, a0, %lo(src) -; RV32-NEXT: sw a1, %lo(dst)(a2) -; RV32-NEXT: lw a1, 4(a0) -; RV32-NEXT: lh a3, 8(a0) -; RV32-NEXT: lbu a0, 10(a0) -; RV32-NEXT: addi a2, a2, %lo(dst) -; RV32-NEXT: sw a1, 4(a2) -; RV32-NEXT: sh a3, 8(a2) -; RV32-NEXT: sb a0, 10(a2) -; RV32-NEXT: li a0, 0 +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) ; RV32-NEXT: ret ; -; RV64-LABEL: t0: +; RV64-LABEL: unaligned_memcpy2: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lui a0, %hi(src) -; RV64-NEXT: lui a1, %hi(dst) -; RV64-NEXT: ld a2, %lo(src)(a0) -; RV64-NEXT: addi a0, a0, %lo(src) -; RV64-NEXT: lh a3, 8(a0) -; RV64-NEXT: lbu a0, 10(a0) -; RV64-NEXT: sd a2, %lo(dst)(a1) -; RV64-NEXT: addi a1, a1, %lo(dst) -; RV64-NEXT: sh a3, 8(a1) -; RV64-NEXT: sb a0, 10(a1) -; RV64-NEXT: li a0, 0 +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) ; RV64-NEXT: ret ; -; RV32-FAST-LABEL: t0: +; RV32-FAST-LABEL: unaligned_memcpy2: ; RV32-FAST: # 
%bb.0: # %entry -; RV32-FAST-NEXT: lui a0, %hi(src) -; RV32-FAST-NEXT: lw a1, %lo(src)(a0) -; RV32-FAST-NEXT: addi a0, a0, %lo(src) -; RV32-FAST-NEXT: lw a2, 4(a0) -; RV32-FAST-NEXT: lw a0, 7(a0) -; RV32-FAST-NEXT: lui a3, %hi(dst) -; RV32-FAST-NEXT: sw a1, %lo(dst)(a3) -; RV32-FAST-NEXT: addi a1, a3, %lo(dst) -; RV32-FAST-NEXT: sw a0, 7(a1) -; RV32-FAST-NEXT: sw a2, 4(a1) -; RV32-FAST-NEXT: li a0, 0 +; RV32-FAST-NEXT: lh a1, 0(a1) +; RV32-FAST-NEXT: sh a1, 0(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: t0: +; RV64-FAST-LABEL: unaligned_memcpy2: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a0, %hi(src) -; RV64-FAST-NEXT: ld a1, %lo(src)(a0) -; RV64-FAST-NEXT: addi a0, a0, %lo(src) -; RV64-FAST-NEXT: lw a0, 7(a0) -; RV64-FAST-NEXT: lui a2, %hi(dst) -; RV64-FAST-NEXT: sd a1, %lo(dst)(a2) -; RV64-FAST-NEXT: addi a1, a2, %lo(dst) -; RV64-FAST-NEXT: sw a0, 7(a1) -; RV64-FAST-NEXT: li a0, 0 +; RV64-FAST-NEXT: lh a1, 0(a1) +; RV64-FAST-NEXT: sh a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - call void @llvm.memcpy.p0.p0.i32(ptr align 8 @dst, ptr align 8 @src, i32 11, i1 false) - ret i32 0 + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 2, i1 false) + ret void } -define void @t1(ptr nocapture %C) nounwind { -; RV32-LABEL: t1: +define void @unaligned_memcpy3(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy3: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lui a1, %hi(.L.str1) -; RV32-NEXT: addi a1, a1, %lo(.L.str1) -; RV32-NEXT: li a2, 31 -; RV32-NEXT: tail memcpy +; RV32-NEXT: lbu a2, 2(a1) +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: ret ; -; RV64-LABEL: t1: +; RV64-LABEL: unaligned_memcpy3: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lui a1, %hi(.L.str1) -; RV64-NEXT: addi a1, a1, %lo(.L.str1) -; RV64-NEXT: li a2, 31 -; RV64-NEXT: tail memcpy +; RV64-NEXT: lbu a2, 2(a1) +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: ret ; -; RV32-FAST-LABEL: t1: +; RV32-FAST-LABEL: unaligned_memcpy3: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lui a1, 1141 -; RV32-FAST-NEXT: lui a2, 300325 -; RV32-FAST-NEXT: lui a3, 132181 -; RV32-FAST-NEXT: lui a4, 340483 -; RV32-FAST-NEXT: lui a5, 267556 -; RV32-FAST-NEXT: lui a6, 337154 -; RV32-FAST-NEXT: addi a1, a1, -439 -; RV32-FAST-NEXT: sw a1, 27(a0) -; RV32-FAST-NEXT: lui a1, 320757 -; RV32-FAST-NEXT: addi a2, a2, 1107 -; RV32-FAST-NEXT: addi a3, a3, -689 -; RV32-FAST-NEXT: addi a4, a4, -947 -; RV32-FAST-NEXT: sw a4, 16(a0) -; RV32-FAST-NEXT: sw a3, 20(a0) -; RV32-FAST-NEXT: sw a2, 24(a0) -; RV32-FAST-NEXT: lui a2, 365861 -; RV32-FAST-NEXT: addi a3, a5, 1871 -; RV32-FAST-NEXT: addi a4, a6, 69 -; RV32-FAST-NEXT: addi a1, a1, 1107 -; RV32-FAST-NEXT: addi a2, a2, -1980 -; RV32-FAST-NEXT: sw a2, 0(a0) -; RV32-FAST-NEXT: sw a1, 4(a0) -; RV32-FAST-NEXT: sw a4, 8(a0) -; RV32-FAST-NEXT: sw a3, 12(a0) +; RV32-FAST-NEXT: lbu a2, 2(a1) +; RV32-FAST-NEXT: sb a2, 2(a0) +; RV32-FAST-NEXT: lh a1, 0(a1) +; RV32-FAST-NEXT: sh a1, 0(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: t1: +; RV64-FAST-LABEL: unaligned_memcpy3: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a1, %hi(.L.str1) -; RV64-FAST-NEXT: addi a2, a1, %lo(.L.str1) -; RV64-FAST-NEXT: ld a3, 23(a2) -; RV64-FAST-NEXT: ld a1, %lo(.L.str1)(a1) -; RV64-FAST-NEXT: ld a4, 8(a2) -; RV64-FAST-NEXT: ld a2, 16(a2) -; RV64-FAST-NEXT: sd a3, 23(a0) -; RV64-FAST-NEXT: sd a1, 0(a0) -; 
RV64-FAST-NEXT: sd a4, 8(a0) -; RV64-FAST-NEXT: sd a2, 16(a0) +; RV64-FAST-NEXT: lbu a2, 2(a1) +; RV64-FAST-NEXT: sb a2, 2(a0) +; RV64-FAST-NEXT: lh a1, 0(a1) +; RV64-FAST-NEXT: sh a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str1, i64 31, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false) ret void } -define void @t2(ptr nocapture %C) nounwind { -; RV32-BOTH-LABEL: t2: -; RV32-BOTH: # %bb.0: # %entry -; RV32-BOTH-NEXT: lui a1, %hi(.L.str2) -; RV32-BOTH-NEXT: addi a1, a1, %lo(.L.str2) -; RV32-BOTH-NEXT: li a2, 36 -; RV32-BOTH-NEXT: tail memcpy +define void @unaligned_memcpy4(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy4: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 3(a1) +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: lbu a2, 2(a1) +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: ret ; -; RV64-LABEL: t2: +; RV64-LABEL: unaligned_memcpy4: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lui a1, %hi(.L.str2) -; RV64-NEXT: addi a1, a1, %lo(.L.str2) -; RV64-NEXT: li a2, 36 -; RV64-NEXT: tail memcpy +; RV64-NEXT: lbu a2, 3(a1) +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: lbu a2, 2(a1) +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy4: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: t2: +; RV64-FAST-LABEL: unaligned_memcpy4: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a1, %hi(.L.str2) -; RV64-FAST-NEXT: lui a2, 1156 -; RV64-FAST-NEXT: ld a3, %lo(.L.str2)(a1) -; RV64-FAST-NEXT: addi a2, a2, 332 -; RV64-FAST-NEXT: addi a1, a1, %lo(.L.str2) -; RV64-FAST-NEXT: sw a2, 32(a0) -; RV64-FAST-NEXT: ld a2, 8(a1) -; RV64-FAST-NEXT: ld a4, 16(a1) -; RV64-FAST-NEXT: ld a1, 24(a1) -; RV64-FAST-NEXT: sd a3, 0(a0) -; RV64-FAST-NEXT: sd a2, 8(a0) -; RV64-FAST-NEXT: sd a4, 16(a0) -; RV64-FAST-NEXT: sd a1, 24(a0) +; RV64-FAST-NEXT: lw a1, 0(a1) +; RV64-FAST-NEXT: sw a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str2, i64 36, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 4, i1 false) ret void } -define void @t3(ptr nocapture %C) nounwind { -; RV32-LABEL: t3: +define void @unaligned_memcpy7(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy7: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lui a1, %hi(.L.str3) -; RV32-NEXT: addi a1, a1, %lo(.L.str3) -; RV32-NEXT: li a2, 24 -; RV32-NEXT: tail memcpy +; RV32-NEXT: lbu a2, 6(a1) +; RV32-NEXT: sb a2, 6(a0) +; RV32-NEXT: lbu a2, 5(a1) +; RV32-NEXT: sb a2, 5(a0) +; RV32-NEXT: lbu a2, 4(a1) +; RV32-NEXT: sb a2, 4(a0) +; RV32-NEXT: lbu a2, 3(a1) +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: lbu a2, 2(a1) +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: ret ; -; RV64-LABEL: t3: +; RV64-LABEL: unaligned_memcpy7: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lui a1, %hi(.L.str3) -; RV64-NEXT: addi a1, a1, %lo(.L.str3) -; RV64-NEXT: li a2, 24 -; RV64-NEXT: tail memcpy +; RV64-NEXT: lbu a2, 6(a1) +; RV64-NEXT: sb a2, 6(a0) +; RV64-NEXT: lbu a2, 5(a1) +; RV64-NEXT: sb a2, 5(a0) +; RV64-NEXT: lbu a2, 4(a1) +; RV64-NEXT: sb a2, 4(a0) +; RV64-NEXT: lbu a2, 3(a1) +; 
RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: lbu a2, 2(a1) +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: ret ; -; RV32-FAST-LABEL: t3: +; RV32-FAST-LABEL: unaligned_memcpy7: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lui a1, 1109 -; RV32-FAST-NEXT: lui a2, 340483 -; RV32-FAST-NEXT: lui a3, 267556 -; RV32-FAST-NEXT: lui a4, 337154 -; RV32-FAST-NEXT: lui a5, 320757 -; RV32-FAST-NEXT: addi a1, a1, -689 -; RV32-FAST-NEXT: addi a2, a2, -947 -; RV32-FAST-NEXT: sw a2, 16(a0) -; RV32-FAST-NEXT: sw a1, 20(a0) -; RV32-FAST-NEXT: lui a1, 365861 -; RV32-FAST-NEXT: addi a2, a3, 1871 -; RV32-FAST-NEXT: addi a3, a4, 69 -; RV32-FAST-NEXT: addi a4, a5, 1107 -; RV32-FAST-NEXT: addi a1, a1, -1980 +; RV32-FAST-NEXT: lw a2, 3(a1) +; RV32-FAST-NEXT: sw a2, 3(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy7: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: lw a2, 3(a1) +; RV64-FAST-NEXT: sw a2, 3(a0) +; RV64-FAST-NEXT: lw a1, 0(a1) +; RV64-FAST-NEXT: sw a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 7, i1 false) + ret void +} + +define void @unaligned_memcpy8(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 7(a1) +; RV32-NEXT: sb a2, 7(a0) +; RV32-NEXT: lbu a2, 6(a1) +; RV32-NEXT: sb a2, 6(a0) +; RV32-NEXT: lbu a2, 5(a1) +; RV32-NEXT: sb a2, 5(a0) +; RV32-NEXT: lbu a2, 4(a1) +; RV32-NEXT: sb a2, 4(a0) +; RV32-NEXT: lbu a2, 3(a1) +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: lbu a2, 2(a1) +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: unaligned_memcpy8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 7(a1) +; RV64-NEXT: sb a2, 7(a0) +; RV64-NEXT: lbu a2, 6(a1) +; RV64-NEXT: sb a2, 6(a0) +; RV64-NEXT: lbu a2, 5(a1) +; RV64-NEXT: sb a2, 5(a0) +; RV64-NEXT: lbu a2, 4(a1) +; RV64-NEXT: sb a2, 4(a0) +; RV64-NEXT: lbu a2, 3(a1) +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: lbu a2, 2(a1) +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy8: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) ; RV32-FAST-NEXT: sw a1, 0(a0) -; RV32-FAST-NEXT: sw a4, 4(a0) -; RV32-FAST-NEXT: sw a3, 8(a0) -; RV32-FAST-NEXT: sw a2, 12(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: t3: +; RV64-FAST-LABEL: unaligned_memcpy8: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a1, %hi(.L.str3) -; RV64-FAST-NEXT: ld a2, %lo(.L.str3)(a1) -; RV64-FAST-NEXT: addi a1, a1, %lo(.L.str3) -; RV64-FAST-NEXT: ld a3, 8(a1) -; RV64-FAST-NEXT: ld a1, 16(a1) -; RV64-FAST-NEXT: sd a2, 0(a0) -; RV64-FAST-NEXT: sd a3, 8(a0) -; RV64-FAST-NEXT: sd a1, 16(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str3, i64 24, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 8, i1 false) ret void } -define void @t4(ptr nocapture %C) nounwind { -; RV32-LABEL: t4: +define void @unaligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy15: ; RV32: # %bb.0: # %entry -; 
RV32-NEXT: lui a1, %hi(.L.str4) -; RV32-NEXT: addi a1, a1, %lo(.L.str4) -; RV32-NEXT: li a2, 18 -; RV32-NEXT: tail memcpy +; RV32-NEXT: lbu a2, 14(a1) +; RV32-NEXT: sb a2, 14(a0) +; RV32-NEXT: lbu a2, 13(a1) +; RV32-NEXT: sb a2, 13(a0) +; RV32-NEXT: lbu a2, 12(a1) +; RV32-NEXT: sb a2, 12(a0) +; RV32-NEXT: lbu a2, 11(a1) +; RV32-NEXT: sb a2, 11(a0) +; RV32-NEXT: lbu a2, 10(a1) +; RV32-NEXT: sb a2, 10(a0) +; RV32-NEXT: lbu a2, 9(a1) +; RV32-NEXT: sb a2, 9(a0) +; RV32-NEXT: lbu a2, 8(a1) +; RV32-NEXT: sb a2, 8(a0) +; RV32-NEXT: lbu a2, 7(a1) +; RV32-NEXT: sb a2, 7(a0) +; RV32-NEXT: lbu a2, 6(a1) +; RV32-NEXT: sb a2, 6(a0) +; RV32-NEXT: lbu a2, 5(a1) +; RV32-NEXT: sb a2, 5(a0) +; RV32-NEXT: lbu a2, 4(a1) +; RV32-NEXT: sb a2, 4(a0) +; RV32-NEXT: lbu a2, 3(a1) +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: lbu a2, 2(a1) +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: ret ; -; RV64-LABEL: t4: +; RV64-LABEL: unaligned_memcpy15: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lui a1, %hi(.L.str4) -; RV64-NEXT: addi a1, a1, %lo(.L.str4) -; RV64-NEXT: li a2, 18 -; RV64-NEXT: tail memcpy +; RV64-NEXT: lbu a2, 14(a1) +; RV64-NEXT: sb a2, 14(a0) +; RV64-NEXT: lbu a2, 13(a1) +; RV64-NEXT: sb a2, 13(a0) +; RV64-NEXT: lbu a2, 12(a1) +; RV64-NEXT: sb a2, 12(a0) +; RV64-NEXT: lbu a2, 11(a1) +; RV64-NEXT: sb a2, 11(a0) +; RV64-NEXT: lbu a2, 10(a1) +; RV64-NEXT: sb a2, 10(a0) +; RV64-NEXT: lbu a2, 9(a1) +; RV64-NEXT: sb a2, 9(a0) +; RV64-NEXT: lbu a2, 8(a1) +; RV64-NEXT: sb a2, 8(a0) +; RV64-NEXT: lbu a2, 7(a1) +; RV64-NEXT: sb a2, 7(a0) +; RV64-NEXT: lbu a2, 6(a1) +; RV64-NEXT: sb a2, 6(a0) +; RV64-NEXT: lbu a2, 5(a1) +; RV64-NEXT: sb a2, 5(a0) +; RV64-NEXT: lbu a2, 4(a1) +; RV64-NEXT: sb a2, 4(a0) +; RV64-NEXT: lbu a2, 3(a1) +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: lbu a2, 2(a1) +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: ret ; -; RV32-FAST-LABEL: t4: +; RV32-FAST-LABEL: unaligned_memcpy15: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: li a1, 32 -; RV32-FAST-NEXT: lui a2, 132388 -; RV32-FAST-NEXT: lui a3, 337154 -; RV32-FAST-NEXT: lui a4, 320757 -; RV32-FAST-NEXT: sh a1, 16(a0) -; RV32-FAST-NEXT: lui a1, 365861 -; RV32-FAST-NEXT: addi a2, a2, 1871 -; RV32-FAST-NEXT: addi a3, a3, 69 -; RV32-FAST-NEXT: addi a4, a4, 1107 -; RV32-FAST-NEXT: addi a1, a1, -1980 +; RV32-FAST-NEXT: lw a2, 11(a1) +; RV32-FAST-NEXT: sw a2, 11(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) ; RV32-FAST-NEXT: sw a1, 0(a0) -; RV32-FAST-NEXT: sw a4, 4(a0) -; RV32-FAST-NEXT: sw a3, 8(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy15: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: ld a2, 7(a1) +; RV64-FAST-NEXT: sd a2, 7(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 15, i1 false) + ret void +} + +define void @unaligned_memcpy16(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy16: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 15(a1) +; RV32-NEXT: sb a2, 15(a0) +; RV32-NEXT: lbu a2, 14(a1) +; RV32-NEXT: sb a2, 14(a0) +; RV32-NEXT: lbu a2, 13(a1) +; RV32-NEXT: sb a2, 13(a0) +; RV32-NEXT: lbu a2, 12(a1) +; RV32-NEXT: sb a2, 12(a0) +; RV32-NEXT: lbu a2, 11(a1) +; 
RV32-NEXT: sb a2, 11(a0) +; RV32-NEXT: lbu a2, 10(a1) +; RV32-NEXT: sb a2, 10(a0) +; RV32-NEXT: lbu a2, 9(a1) +; RV32-NEXT: sb a2, 9(a0) +; RV32-NEXT: lbu a2, 8(a1) +; RV32-NEXT: sb a2, 8(a0) +; RV32-NEXT: lbu a2, 7(a1) +; RV32-NEXT: sb a2, 7(a0) +; RV32-NEXT: lbu a2, 6(a1) +; RV32-NEXT: sb a2, 6(a0) +; RV32-NEXT: lbu a2, 5(a1) +; RV32-NEXT: sb a2, 5(a0) +; RV32-NEXT: lbu a2, 4(a1) +; RV32-NEXT: sb a2, 4(a0) +; RV32-NEXT: lbu a2, 3(a1) +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: lbu a2, 2(a1) +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: unaligned_memcpy16: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 15(a1) +; RV64-NEXT: sb a2, 15(a0) +; RV64-NEXT: lbu a2, 14(a1) +; RV64-NEXT: sb a2, 14(a0) +; RV64-NEXT: lbu a2, 13(a1) +; RV64-NEXT: sb a2, 13(a0) +; RV64-NEXT: lbu a2, 12(a1) +; RV64-NEXT: sb a2, 12(a0) +; RV64-NEXT: lbu a2, 11(a1) +; RV64-NEXT: sb a2, 11(a0) +; RV64-NEXT: lbu a2, 10(a1) +; RV64-NEXT: sb a2, 10(a0) +; RV64-NEXT: lbu a2, 9(a1) +; RV64-NEXT: sb a2, 9(a0) +; RV64-NEXT: lbu a2, 8(a1) +; RV64-NEXT: sb a2, 8(a0) +; RV64-NEXT: lbu a2, 7(a1) +; RV64-NEXT: sb a2, 7(a0) +; RV64-NEXT: lbu a2, 6(a1) +; RV64-NEXT: sb a2, 6(a0) +; RV64-NEXT: lbu a2, 5(a1) +; RV64-NEXT: sb a2, 5(a0) +; RV64-NEXT: lbu a2, 4(a1) +; RV64-NEXT: sb a2, 4(a0) +; RV64-NEXT: lbu a2, 3(a1) +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: lbu a2, 2(a1) +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy16: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 12(a1) ; RV32-FAST-NEXT: sw a2, 12(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: t4: +; RV64-FAST-LABEL: unaligned_memcpy16: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a1, %hi(.L.str4) -; RV64-FAST-NEXT: ld a2, %lo(.L.str4)(a1) -; RV64-FAST-NEXT: addi a1, a1, %lo(.L.str4) -; RV64-FAST-NEXT: ld a1, 8(a1) -; RV64-FAST-NEXT: li a3, 32 -; RV64-FAST-NEXT: sd a2, 0(a0) -; RV64-FAST-NEXT: sd a1, 8(a0) -; RV64-FAST-NEXT: sh a3, 16(a0) +; RV64-FAST-NEXT: ld a2, 8(a1) +; RV64-FAST-NEXT: sd a2, 8(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str4, i64 18, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 16, i1 false) ret void } -define void @t5(ptr nocapture %C) nounwind { -; RV32-LABEL: t5: +define void @unaligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy31: ; RV32: # %bb.0: # %entry -; RV32-NEXT: li a1, 84 -; RV32-NEXT: li a2, 83 -; RV32-NEXT: li a3, 89 -; RV32-NEXT: li a4, 82 -; RV32-NEXT: li a5, 72 -; RV32-NEXT: li a6, 68 +; RV32-NEXT: lbu a2, 30(a1) +; RV32-NEXT: sb a2, 30(a0) +; RV32-NEXT: lbu a2, 29(a1) +; RV32-NEXT: sb a2, 29(a0) +; RV32-NEXT: lbu a2, 28(a1) +; RV32-NEXT: sb a2, 28(a0) +; RV32-NEXT: lbu a2, 27(a1) +; RV32-NEXT: sb a2, 27(a0) +; RV32-NEXT: lbu a2, 26(a1) +; RV32-NEXT: sb a2, 26(a0) +; RV32-NEXT: lbu a2, 25(a1) +; RV32-NEXT: sb a2, 25(a0) +; RV32-NEXT: lbu a2, 24(a1) +; RV32-NEXT: sb a2, 24(a0) +; RV32-NEXT: lbu a2, 23(a1) +; RV32-NEXT: sb a2, 23(a0) +; RV32-NEXT: lbu a2, 22(a1) +; RV32-NEXT: sb a2, 22(a0) +; 
RV32-NEXT: lbu a2, 21(a1) +; RV32-NEXT: sb a2, 21(a0) +; RV32-NEXT: lbu a2, 20(a1) +; RV32-NEXT: sb a2, 20(a0) +; RV32-NEXT: lbu a2, 19(a1) +; RV32-NEXT: sb a2, 19(a0) +; RV32-NEXT: lbu a2, 18(a1) +; RV32-NEXT: sb a2, 18(a0) +; RV32-NEXT: lbu a2, 17(a1) +; RV32-NEXT: sb a2, 17(a0) +; RV32-NEXT: lbu a2, 16(a1) +; RV32-NEXT: sb a2, 16(a0) +; RV32-NEXT: lbu a2, 15(a1) +; RV32-NEXT: sb a2, 15(a0) +; RV32-NEXT: lbu a2, 14(a1) +; RV32-NEXT: sb a2, 14(a0) +; RV32-NEXT: lbu a2, 13(a1) +; RV32-NEXT: sb a2, 13(a0) +; RV32-NEXT: lbu a2, 12(a1) +; RV32-NEXT: sb a2, 12(a0) +; RV32-NEXT: lbu a2, 11(a1) +; RV32-NEXT: sb a2, 11(a0) +; RV32-NEXT: lbu a2, 10(a1) +; RV32-NEXT: sb a2, 10(a0) +; RV32-NEXT: lbu a2, 9(a1) +; RV32-NEXT: sb a2, 9(a0) +; RV32-NEXT: lbu a2, 8(a1) +; RV32-NEXT: sb a2, 8(a0) +; RV32-NEXT: lbu a2, 7(a1) +; RV32-NEXT: sb a2, 7(a0) +; RV32-NEXT: lbu a2, 6(a1) +; RV32-NEXT: sb a2, 6(a0) +; RV32-NEXT: lbu a2, 5(a1) +; RV32-NEXT: sb a2, 5(a0) +; RV32-NEXT: lbu a2, 4(a1) ; RV32-NEXT: sb a2, 4(a0) -; RV32-NEXT: sb a1, 5(a0) -; RV32-NEXT: sb zero, 6(a0) -; RV32-NEXT: sb a6, 0(a0) -; RV32-NEXT: sb a5, 1(a0) -; RV32-NEXT: sb a4, 2(a0) -; RV32-NEXT: sb a3, 3(a0) +; RV32-NEXT: lbu a2, 3(a1) +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: lbu a2, 2(a1) +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) ; RV32-NEXT: ret ; -; RV64-LABEL: t5: +; RV64-LABEL: unaligned_memcpy31: ; RV64: # %bb.0: # %entry -; RV64-NEXT: li a1, 84 -; RV64-NEXT: li a2, 83 -; RV64-NEXT: li a3, 89 -; RV64-NEXT: li a4, 82 -; RV64-NEXT: li a5, 72 -; RV64-NEXT: li a6, 68 +; RV64-NEXT: lbu a2, 30(a1) +; RV64-NEXT: sb a2, 30(a0) +; RV64-NEXT: lbu a2, 29(a1) +; RV64-NEXT: sb a2, 29(a0) +; RV64-NEXT: lbu a2, 28(a1) +; RV64-NEXT: sb a2, 28(a0) +; RV64-NEXT: lbu a2, 27(a1) +; RV64-NEXT: sb a2, 27(a0) +; RV64-NEXT: lbu a2, 26(a1) +; RV64-NEXT: sb a2, 26(a0) +; RV64-NEXT: lbu a2, 25(a1) +; RV64-NEXT: sb a2, 25(a0) +; RV64-NEXT: lbu a2, 24(a1) +; RV64-NEXT: sb a2, 24(a0) +; RV64-NEXT: lbu a2, 23(a1) +; RV64-NEXT: sb a2, 23(a0) +; RV64-NEXT: lbu a2, 22(a1) +; RV64-NEXT: sb a2, 22(a0) +; RV64-NEXT: lbu a2, 21(a1) +; RV64-NEXT: sb a2, 21(a0) +; RV64-NEXT: lbu a2, 20(a1) +; RV64-NEXT: sb a2, 20(a0) +; RV64-NEXT: lbu a2, 19(a1) +; RV64-NEXT: sb a2, 19(a0) +; RV64-NEXT: lbu a2, 18(a1) +; RV64-NEXT: sb a2, 18(a0) +; RV64-NEXT: lbu a2, 17(a1) +; RV64-NEXT: sb a2, 17(a0) +; RV64-NEXT: lbu a2, 16(a1) +; RV64-NEXT: sb a2, 16(a0) +; RV64-NEXT: lbu a2, 15(a1) +; RV64-NEXT: sb a2, 15(a0) +; RV64-NEXT: lbu a2, 14(a1) +; RV64-NEXT: sb a2, 14(a0) +; RV64-NEXT: lbu a2, 13(a1) +; RV64-NEXT: sb a2, 13(a0) +; RV64-NEXT: lbu a2, 12(a1) +; RV64-NEXT: sb a2, 12(a0) +; RV64-NEXT: lbu a2, 11(a1) +; RV64-NEXT: sb a2, 11(a0) +; RV64-NEXT: lbu a2, 10(a1) +; RV64-NEXT: sb a2, 10(a0) +; RV64-NEXT: lbu a2, 9(a1) +; RV64-NEXT: sb a2, 9(a0) +; RV64-NEXT: lbu a2, 8(a1) +; RV64-NEXT: sb a2, 8(a0) +; RV64-NEXT: lbu a2, 7(a1) +; RV64-NEXT: sb a2, 7(a0) +; RV64-NEXT: lbu a2, 6(a1) +; RV64-NEXT: sb a2, 6(a0) +; RV64-NEXT: lbu a2, 5(a1) +; RV64-NEXT: sb a2, 5(a0) +; RV64-NEXT: lbu a2, 4(a1) ; RV64-NEXT: sb a2, 4(a0) -; RV64-NEXT: sb a1, 5(a0) -; RV64-NEXT: sb zero, 6(a0) -; RV64-NEXT: sb a6, 0(a0) -; RV64-NEXT: sb a5, 1(a0) -; RV64-NEXT: sb a4, 2(a0) -; RV64-NEXT: sb a3, 3(a0) +; RV64-NEXT: lbu a2, 3(a1) +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: lbu a2, 2(a1) +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) +; 
RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy31: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 27(a1) +; RV32-FAST-NEXT: sw a2, 27(a0) +; RV32-FAST-NEXT: lw a2, 24(a1) +; RV32-FAST-NEXT: sw a2, 24(a0) +; RV32-FAST-NEXT: lw a2, 20(a1) +; RV32-FAST-NEXT: sw a2, 20(a0) +; RV32-FAST-NEXT: lw a2, 16(a1) +; RV32-FAST-NEXT: sw a2, 16(a0) +; RV32-FAST-NEXT: lw a2, 12(a1) +; RV32-FAST-NEXT: sw a2, 12(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy31: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: ld a2, 23(a1) +; RV64-FAST-NEXT: sd a2, 23(a0) +; RV64-FAST-NEXT: ld a2, 16(a1) +; RV64-FAST-NEXT: sd a2, 16(a0) +; RV64-FAST-NEXT: ld a2, 8(a1) +; RV64-FAST-NEXT: sd a2, 8(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 31, i1 false) + ret void +} + +; ---------------------------------------------------------------------- +; Fully aligned cases + +define void @aligned_memcpy0(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy0: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy0: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 0, i1 false) + ret void +} + +define void @aligned_memcpy1(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy1: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lbu a1, 0(a1) +; RV32-BOTH-NEXT: sb a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy1: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: lbu a1, 0(a1) +; RV64-BOTH-NEXT: sb a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 1, i1 false) + ret void +} + +define void @aligned_memcpy2(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy2: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lh a1, 0(a1) +; RV32-BOTH-NEXT: sh a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy2: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: lh a1, 0(a1) +; RV64-BOTH-NEXT: sh a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 2, i1 false) + ret void +} + +define void @aligned_memcpy3(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy3: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lbu a2, 2(a1) +; RV32-BOTH-NEXT: sb a2, 2(a0) +; RV32-BOTH-NEXT: lh a1, 0(a1) +; RV32-BOTH-NEXT: sh a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy3: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: lbu a2, 2(a1) +; RV64-BOTH-NEXT: sb a2, 2(a0) +; RV64-BOTH-NEXT: lh a1, 0(a1) +; RV64-BOTH-NEXT: sh a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 3, i1 false) + ret void +} + +define void @aligned_memcpy4(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy4: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lw a1, 0(a1) +; RV32-BOTH-NEXT: sw a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy4: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: lw a1, 0(a1) +; 
RV64-BOTH-NEXT: sw a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 4, i1 false) + ret void +} + +define void @aligned_memcpy7(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: aligned_memcpy7: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 6(a1) +; RV32-NEXT: sb a2, 6(a0) +; RV32-NEXT: lh a2, 4(a1) +; RV32-NEXT: sh a2, 4(a0) +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: sw a1, 0(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: aligned_memcpy7: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 6(a1) +; RV64-NEXT: sb a2, 6(a0) +; RV64-NEXT: lh a2, 4(a1) +; RV64-NEXT: sh a2, 4(a0) +; RV64-NEXT: lw a1, 0(a1) +; RV64-NEXT: sw a1, 0(a0) ; RV64-NEXT: ret ; -; RV32-FAST-LABEL: t5: +; RV32-FAST-LABEL: aligned_memcpy7: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lui a1, 1349 -; RV32-FAST-NEXT: addi a1, a1, 857 -; RV32-FAST-NEXT: sw a1, 3(a0) -; RV32-FAST-NEXT: lui a1, 365861 -; RV32-FAST-NEXT: addi a1, a1, -1980 +; RV32-FAST-NEXT: lw a2, 3(a1) +; RV32-FAST-NEXT: sw a2, 3(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) ; RV32-FAST-NEXT: sw a1, 0(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: t5: +; RV64-FAST-LABEL: aligned_memcpy7: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a1, 1349 -; RV64-FAST-NEXT: addi a1, a1, 857 -; RV64-FAST-NEXT: sw a1, 3(a0) -; RV64-FAST-NEXT: lui a1, 365861 -; RV64-FAST-NEXT: addi a1, a1, -1980 +; RV64-FAST-NEXT: lw a2, 3(a1) +; RV64-FAST-NEXT: sw a2, 3(a0) +; RV64-FAST-NEXT: lw a1, 0(a1) ; RV64-FAST-NEXT: sw a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str5, i64 7, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 7, i1 false) + ret void +} + +define void @aligned_memcpy8(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy8: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lw a2, 4(a1) +; RV32-BOTH-NEXT: sw a2, 4(a0) +; RV32-BOTH-NEXT: lw a1, 0(a1) +; RV32-BOTH-NEXT: sw a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy8: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: ld a1, 0(a1) +; RV64-BOTH-NEXT: sd a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 8, i1 false) + ret void +} + +define void @aligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: aligned_memcpy15: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 14(a1) +; RV32-NEXT: sb a2, 14(a0) +; RV32-NEXT: lh a2, 12(a1) +; RV32-NEXT: sh a2, 12(a0) +; RV32-NEXT: lw a2, 8(a1) +; RV32-NEXT: sw a2, 8(a0) +; RV32-NEXT: lw a2, 4(a1) +; RV32-NEXT: sw a2, 4(a0) +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: sw a1, 0(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: aligned_memcpy15: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 14(a1) +; RV64-NEXT: sb a2, 14(a0) +; RV64-NEXT: lh a2, 12(a1) +; RV64-NEXT: sh a2, 12(a0) +; RV64-NEXT: lw a2, 8(a1) +; RV64-NEXT: sw a2, 8(a0) +; RV64-NEXT: ld a1, 0(a1) +; RV64-NEXT: sd a1, 0(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: aligned_memcpy15: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 11(a1) +; RV32-FAST-NEXT: sw a2, 11(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: aligned_memcpy15: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: ld a2, 7(a1) +; RV64-FAST-NEXT: sd a2, 7(a0) +; RV64-FAST-NEXT: 
ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 15, i1 false) ret void } -define void @t6() nounwind { -; RV32-LABEL: t6: +define void @aligned_memcpy16(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy16: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lw a2, 12(a1) +; RV32-BOTH-NEXT: sw a2, 12(a0) +; RV32-BOTH-NEXT: lw a2, 8(a1) +; RV32-BOTH-NEXT: sw a2, 8(a0) +; RV32-BOTH-NEXT: lw a2, 4(a1) +; RV32-BOTH-NEXT: sw a2, 4(a0) +; RV32-BOTH-NEXT: lw a1, 0(a1) +; RV32-BOTH-NEXT: sw a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy16: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: ld a2, 8(a1) +; RV64-BOTH-NEXT: sd a2, 8(a0) +; RV64-BOTH-NEXT: ld a1, 0(a1) +; RV64-BOTH-NEXT: sd a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 16, i1 false) + ret void +} + +define void @aligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: aligned_memcpy31: ; RV32: # %bb.0: # %entry -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: lui a0, %hi(spool.splbuf) -; RV32-NEXT: addi a0, a0, %lo(spool.splbuf) -; RV32-NEXT: lui a1, %hi(.L.str6) -; RV32-NEXT: addi a1, a1, %lo(.L.str6) -; RV32-NEXT: li a2, 14 -; RV32-NEXT: call memcpy -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: lbu a2, 30(a1) +; RV32-NEXT: sb a2, 30(a0) +; RV32-NEXT: lh a2, 28(a1) +; RV32-NEXT: sh a2, 28(a0) +; RV32-NEXT: lw a2, 24(a1) +; RV32-NEXT: sw a2, 24(a0) +; RV32-NEXT: lw a2, 20(a1) +; RV32-NEXT: sw a2, 20(a0) +; RV32-NEXT: lw a2, 16(a1) +; RV32-NEXT: sw a2, 16(a0) +; RV32-NEXT: lw a2, 12(a1) +; RV32-NEXT: sw a2, 12(a0) +; RV32-NEXT: lw a2, 8(a1) +; RV32-NEXT: sw a2, 8(a0) +; RV32-NEXT: lw a2, 4(a1) +; RV32-NEXT: sw a2, 4(a0) +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: sw a1, 0(a0) ; RV32-NEXT: ret ; -; RV64-LABEL: t6: +; RV64-LABEL: aligned_memcpy31: ; RV64: # %bb.0: # %entry -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: lui a0, %hi(spool.splbuf) -; RV64-NEXT: addi a0, a0, %lo(spool.splbuf) -; RV64-NEXT: lui a1, %hi(.L.str6) -; RV64-NEXT: addi a1, a1, %lo(.L.str6) -; RV64-NEXT: li a2, 14 -; RV64-NEXT: call memcpy -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: lbu a2, 30(a1) +; RV64-NEXT: sb a2, 30(a0) +; RV64-NEXT: lh a2, 28(a1) +; RV64-NEXT: sh a2, 28(a0) +; RV64-NEXT: lw a2, 24(a1) +; RV64-NEXT: sw a2, 24(a0) +; RV64-NEXT: ld a2, 16(a1) +; RV64-NEXT: sd a2, 16(a0) +; RV64-NEXT: ld a2, 8(a1) +; RV64-NEXT: sd a2, 8(a0) +; RV64-NEXT: ld a1, 0(a1) +; RV64-NEXT: sd a1, 0(a0) ; RV64-NEXT: ret ; -; RV32-FAST-LABEL: t6: +; RV32-FAST-LABEL: aligned_memcpy31: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lui a0, %hi(spool.splbuf) -; RV32-FAST-NEXT: li a1, 88 -; RV32-FAST-NEXT: sh a1, %lo(spool.splbuf+12)(a0) -; RV32-FAST-NEXT: lui a1, 361862 -; RV32-FAST-NEXT: addi a1, a1, -1960 -; RV32-FAST-NEXT: sw a1, %lo(spool.splbuf+8)(a0) -; RV32-FAST-NEXT: lui a1, 362199 -; RV32-FAST-NEXT: addi a1, a1, 559 -; RV32-FAST-NEXT: sw a1, %lo(spool.splbuf+4)(a0) -; RV32-FAST-NEXT: lui a1, 460503 -; RV32-FAST-NEXT: addi a1, a1, 1071 -; RV32-FAST-NEXT: sw a1, %lo(spool.splbuf)(a0) +; RV32-FAST-NEXT: lw a2, 27(a1) +; RV32-FAST-NEXT: sw a2, 27(a0) +; RV32-FAST-NEXT: lw a2, 24(a1) +; RV32-FAST-NEXT: sw a2, 24(a0) +; RV32-FAST-NEXT: lw 
a2, 20(a1) +; RV32-FAST-NEXT: sw a2, 20(a0) +; RV32-FAST-NEXT: lw a2, 16(a1) +; RV32-FAST-NEXT: sw a2, 16(a0) +; RV32-FAST-NEXT: lw a2, 12(a1) +; RV32-FAST-NEXT: sw a2, 12(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: t6: +; RV64-FAST-LABEL: aligned_memcpy31: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a0, %hi(.L.str6) -; RV64-FAST-NEXT: ld a1, %lo(.L.str6)(a0) -; RV64-FAST-NEXT: addi a0, a0, %lo(.L.str6) -; RV64-FAST-NEXT: ld a0, 6(a0) -; RV64-FAST-NEXT: lui a2, %hi(spool.splbuf) -; RV64-FAST-NEXT: sd a1, %lo(spool.splbuf)(a2) -; RV64-FAST-NEXT: sd a0, %lo(spool.splbuf+6)(a2) +; RV64-FAST-NEXT: ld a2, 23(a1) +; RV64-FAST-NEXT: sd a2, 23(a0) +; RV64-FAST-NEXT: ld a2, 16(a1) +; RV64-FAST-NEXT: sd a2, 16(a0) +; RV64-FAST-NEXT: ld a2, 8(a1) +; RV64-FAST-NEXT: sd a2, 8(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - call void @llvm.memcpy.p0.p0.i64(ptr @spool.splbuf, ptr @.str6, i64 14, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 31, i1 false) ret void } -%struct.Foo = type { i32, i32, i32, i32 } +; ------------------------------------------------------------------------ +; A few partially aligned cases -define void @t7(ptr nocapture %a, ptr nocapture %b) nounwind { -; RV32-BOTH-LABEL: t7: + +define void @memcpy16_align4(ptr nocapture %dest, ptr nocapture %src) nounwind { +; RV32-BOTH-LABEL: memcpy16_align4: ; RV32-BOTH: # %bb.0: # %entry ; RV32-BOTH-NEXT: lw a2, 12(a1) ; RV32-BOTH-NEXT: sw a2, 12(a0) @@ -418,7 +947,7 @@ define void @t7(ptr nocapture %a, ptr nocapture %b) nounwind { ; RV32-BOTH-NEXT: sw a1, 0(a0) ; RV32-BOTH-NEXT: ret ; -; RV64-LABEL: t7: +; RV64-LABEL: memcpy16_align4: ; RV64: # %bb.0: # %entry ; RV64-NEXT: lw a2, 12(a1) ; RV64-NEXT: sw a2, 12(a0) @@ -430,7 +959,7 @@ define void @t7(ptr nocapture %a, ptr nocapture %b) nounwind { ; RV64-NEXT: sw a1, 0(a0) ; RV64-NEXT: ret ; -; RV64-FAST-LABEL: t7: +; RV64-FAST-LABEL: memcpy16_align4: ; RV64-FAST: # %bb.0: # %entry ; RV64-FAST-NEXT: ld a2, 8(a1) ; RV64-FAST-NEXT: sd a2, 8(a0) @@ -438,11 +967,58 @@ define void @t7(ptr nocapture %a, ptr nocapture %b) nounwind { ; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i32(ptr align 4 %a, ptr align 4 %b, i32 16, i1 false) + tail call void @llvm.memcpy.p0.p0.i32(ptr align 4 %dest, ptr align 4 %src, i32 16, i1 false) ret void } +define i32 @memcpy11_align8(ptr nocapture %dest, ptr %src) { +; RV32-LABEL: memcpy11_align8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 10(a1) +; RV32-NEXT: sb a2, 10(a0) +; RV32-NEXT: lh a2, 8(a1) +; RV32-NEXT: sh a2, 8(a0) +; RV32-NEXT: lw a2, 4(a1) +; RV32-NEXT: sw a2, 4(a0) +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: sw a1, 0(a0) +; RV32-NEXT: li a0, 0 +; RV32-NEXT: ret +; +; RV64-LABEL: memcpy11_align8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 10(a1) +; RV64-NEXT: sb a2, 10(a0) +; RV64-NEXT: lh a2, 8(a1) +; RV64-NEXT: sh a2, 8(a0) +; RV64-NEXT: ld a1, 0(a1) +; RV64-NEXT: sd a1, 0(a0) +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: memcpy11_align8: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 7(a1) +; RV32-FAST-NEXT: sw a2, 7(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: li a0, 
0 +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: memcpy11_align8: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: lw a2, 7(a1) +; RV64-FAST-NEXT: sw a2, 7(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: li a0, 0 +; RV64-FAST-NEXT: ret +entry: + call void @llvm.memcpy.p0.p0.i32(ptr align 8 %dest, ptr align 8 %src, i32 11, i1 false) + ret i32 0 +} + declare void @llvm.memcpy.p0.p0.i32(ptr nocapture, ptr nocapture, i32, i1) nounwind declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounwind -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; RV64-BOTH: {{.*}} From 4637c777463248d42fbdc383f324310522ce85d2 Mon Sep 17 00:00:00 2001 From: Pengcheng Wang Date: Mon, 13 Jan 2025 11:36:37 +0800 Subject: [PATCH 223/408] Revert "[RISCV] Rework memcpy test" (#122662) Reverts llvm/llvm-project#120364 The test should be updated due to some recent changes. --- llvm/test/CodeGen/RISCV/memcpy.ll | 1180 ++++++++--------------------- 1 file changed, 302 insertions(+), 878 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/memcpy.ll b/llvm/test/CodeGen/RISCV/memcpy.ll index ce47476de9ce8..1ab3722080f70 100644 --- a/llvm/test/CodeGen/RISCV/memcpy.ll +++ b/llvm/test/CodeGen/RISCV/memcpy.ll @@ -7,935 +7,406 @@ ; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32-FAST ; RUN: llc < %s -mtriple=riscv64 -mattr=+unaligned-scalar-mem \ ; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64-FAST +%struct.x = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 } -; ---------------------------------------------------------------------- -; Fully unaligned cases +@src = external dso_local global %struct.x +@dst = external dso_local global %struct.x -define void @unaligned_memcpy0(ptr nocapture %dest, ptr %src) nounwind { -; RV32-BOTH-LABEL: unaligned_memcpy0: -; RV32-BOTH: # %bb.0: # %entry -; RV32-BOTH-NEXT: ret -; -; RV64-BOTH-LABEL: unaligned_memcpy0: -; RV64-BOTH: # %bb.0: # %entry -; RV64-BOTH-NEXT: ret -entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 0, i1 false) - ret void -} - -define void @unaligned_memcpy1(ptr nocapture %dest, ptr %src) nounwind { -; RV32-BOTH-LABEL: unaligned_memcpy1: -; RV32-BOTH: # %bb.0: # %entry -; RV32-BOTH-NEXT: lbu a1, 0(a1) -; RV32-BOTH-NEXT: sb a1, 0(a0) -; RV32-BOTH-NEXT: ret -; -; RV64-BOTH-LABEL: unaligned_memcpy1: -; RV64-BOTH: # %bb.0: # %entry -; RV64-BOTH-NEXT: lbu a1, 0(a1) -; RV64-BOTH-NEXT: sb a1, 0(a0) -; RV64-BOTH-NEXT: ret -entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 1, i1 false) - ret void -} - -define void @unaligned_memcpy2(ptr nocapture %dest, ptr %src) nounwind { -; RV32-LABEL: unaligned_memcpy2: -; RV32: # %bb.0: # %entry -; RV32-NEXT: lbu a2, 1(a1) -; RV32-NEXT: sb a2, 1(a0) -; RV32-NEXT: lbu a1, 0(a1) -; RV32-NEXT: sb a1, 0(a0) -; RV32-NEXT: ret -; -; RV64-LABEL: unaligned_memcpy2: -; RV64: # %bb.0: # %entry -; RV64-NEXT: lbu a2, 1(a1) -; RV64-NEXT: sb a2, 1(a0) -; RV64-NEXT: lbu a1, 0(a1) -; RV64-NEXT: sb a1, 0(a0) -; RV64-NEXT: ret -; -; RV32-FAST-LABEL: unaligned_memcpy2: -; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lh a1, 0(a1) -; RV32-FAST-NEXT: sh a1, 0(a0) -; RV32-FAST-NEXT: ret -; -; RV64-FAST-LABEL: unaligned_memcpy2: -; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lh a1, 0(a1) -; RV64-FAST-NEXT: sh a1, 0(a0) -; RV64-FAST-NEXT: ret -entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 2, i1 false) - ret void -} - -define void @unaligned_memcpy3(ptr nocapture %dest, 
ptr %src) nounwind { -; RV32-LABEL: unaligned_memcpy3: -; RV32: # %bb.0: # %entry -; RV32-NEXT: lbu a2, 2(a1) -; RV32-NEXT: sb a2, 2(a0) -; RV32-NEXT: lbu a2, 1(a1) -; RV32-NEXT: sb a2, 1(a0) -; RV32-NEXT: lbu a1, 0(a1) -; RV32-NEXT: sb a1, 0(a0) -; RV32-NEXT: ret -; -; RV64-LABEL: unaligned_memcpy3: -; RV64: # %bb.0: # %entry -; RV64-NEXT: lbu a2, 2(a1) -; RV64-NEXT: sb a2, 2(a0) -; RV64-NEXT: lbu a2, 1(a1) -; RV64-NEXT: sb a2, 1(a0) -; RV64-NEXT: lbu a1, 0(a1) -; RV64-NEXT: sb a1, 0(a0) -; RV64-NEXT: ret -; -; RV32-FAST-LABEL: unaligned_memcpy3: -; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lbu a2, 2(a1) -; RV32-FAST-NEXT: sb a2, 2(a0) -; RV32-FAST-NEXT: lh a1, 0(a1) -; RV32-FAST-NEXT: sh a1, 0(a0) -; RV32-FAST-NEXT: ret -; -; RV64-FAST-LABEL: unaligned_memcpy3: -; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lbu a2, 2(a1) -; RV64-FAST-NEXT: sb a2, 2(a0) -; RV64-FAST-NEXT: lh a1, 0(a1) -; RV64-FAST-NEXT: sh a1, 0(a0) -; RV64-FAST-NEXT: ret -entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false) - ret void -} - -define void @unaligned_memcpy4(ptr nocapture %dest, ptr %src) nounwind { -; RV32-LABEL: unaligned_memcpy4: -; RV32: # %bb.0: # %entry -; RV32-NEXT: lbu a2, 3(a1) -; RV32-NEXT: sb a2, 3(a0) -; RV32-NEXT: lbu a2, 2(a1) -; RV32-NEXT: sb a2, 2(a0) -; RV32-NEXT: lbu a2, 1(a1) -; RV32-NEXT: sb a2, 1(a0) -; RV32-NEXT: lbu a1, 0(a1) -; RV32-NEXT: sb a1, 0(a0) -; RV32-NEXT: ret -; -; RV64-LABEL: unaligned_memcpy4: -; RV64: # %bb.0: # %entry -; RV64-NEXT: lbu a2, 3(a1) -; RV64-NEXT: sb a2, 3(a0) -; RV64-NEXT: lbu a2, 2(a1) -; RV64-NEXT: sb a2, 2(a0) -; RV64-NEXT: lbu a2, 1(a1) -; RV64-NEXT: sb a2, 1(a0) -; RV64-NEXT: lbu a1, 0(a1) -; RV64-NEXT: sb a1, 0(a0) -; RV64-NEXT: ret -; -; RV32-FAST-LABEL: unaligned_memcpy4: -; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lw a1, 0(a1) -; RV32-FAST-NEXT: sw a1, 0(a0) -; RV32-FAST-NEXT: ret -; -; RV64-FAST-LABEL: unaligned_memcpy4: -; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lw a1, 0(a1) -; RV64-FAST-NEXT: sw a1, 0(a0) -; RV64-FAST-NEXT: ret -entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 4, i1 false) - ret void -} +@.str1 = private unnamed_addr constant [31 x i8] c"DHRYSTONE PROGRAM, SOME STRING\00", align 1 +@.str2 = private unnamed_addr constant [36 x i8] c"DHRYSTONE PROGRAM, SOME STRING BLAH\00", align 1 +@.str3 = private unnamed_addr constant [24 x i8] c"DHRYSTONE PROGRAM, SOME\00", align 1 +@.str4 = private unnamed_addr constant [18 x i8] c"DHRYSTONE PROGR \00", align 1 +@.str5 = private unnamed_addr constant [7 x i8] c"DHRYST\00", align 1 +@.str6 = private unnamed_addr constant [14 x i8] c"/tmp/rmXXXXXX\00", align 1 +@spool.splbuf = internal global [512 x i8] zeroinitializer, align 16 -define void @unaligned_memcpy7(ptr nocapture %dest, ptr %src) nounwind { -; RV32-LABEL: unaligned_memcpy7: +define i32 @t0() { +; RV32-LABEL: t0: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lbu a2, 6(a1) -; RV32-NEXT: sb a2, 6(a0) -; RV32-NEXT: lbu a2, 5(a1) -; RV32-NEXT: sb a2, 5(a0) -; RV32-NEXT: lbu a2, 4(a1) -; RV32-NEXT: sb a2, 4(a0) -; RV32-NEXT: lbu a2, 3(a1) -; RV32-NEXT: sb a2, 3(a0) -; RV32-NEXT: lbu a2, 2(a1) -; RV32-NEXT: sb a2, 2(a0) -; RV32-NEXT: lbu a2, 1(a1) -; RV32-NEXT: sb a2, 1(a0) -; RV32-NEXT: lbu a1, 0(a1) -; RV32-NEXT: sb a1, 0(a0) -; RV32-NEXT: ret -; -; RV64-LABEL: unaligned_memcpy7: -; RV64: # %bb.0: # %entry -; RV64-NEXT: lbu a2, 6(a1) -; RV64-NEXT: sb a2, 6(a0) -; RV64-NEXT: lbu a2, 5(a1) -; RV64-NEXT: sb a2, 5(a0) -; RV64-NEXT: lbu a2, 4(a1) -; RV64-NEXT: sb a2, 
4(a0) -; RV64-NEXT: lbu a2, 3(a1) -; RV64-NEXT: sb a2, 3(a0) -; RV64-NEXT: lbu a2, 2(a1) -; RV64-NEXT: sb a2, 2(a0) -; RV64-NEXT: lbu a2, 1(a1) -; RV64-NEXT: sb a2, 1(a0) -; RV64-NEXT: lbu a1, 0(a1) -; RV64-NEXT: sb a1, 0(a0) -; RV64-NEXT: ret -; -; RV32-FAST-LABEL: unaligned_memcpy7: -; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lw a2, 3(a1) -; RV32-FAST-NEXT: sw a2, 3(a0) -; RV32-FAST-NEXT: lw a1, 0(a1) -; RV32-FAST-NEXT: sw a1, 0(a0) -; RV32-FAST-NEXT: ret -; -; RV64-FAST-LABEL: unaligned_memcpy7: -; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lw a2, 3(a1) -; RV64-FAST-NEXT: sw a2, 3(a0) -; RV64-FAST-NEXT: lw a1, 0(a1) -; RV64-FAST-NEXT: sw a1, 0(a0) -; RV64-FAST-NEXT: ret -entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 7, i1 false) - ret void -} - -define void @unaligned_memcpy8(ptr nocapture %dest, ptr %src) nounwind { -; RV32-LABEL: unaligned_memcpy8: -; RV32: # %bb.0: # %entry -; RV32-NEXT: lbu a2, 7(a1) -; RV32-NEXT: sb a2, 7(a0) -; RV32-NEXT: lbu a2, 6(a1) -; RV32-NEXT: sb a2, 6(a0) -; RV32-NEXT: lbu a2, 5(a1) -; RV32-NEXT: sb a2, 5(a0) -; RV32-NEXT: lbu a2, 4(a1) -; RV32-NEXT: sb a2, 4(a0) -; RV32-NEXT: lbu a2, 3(a1) -; RV32-NEXT: sb a2, 3(a0) -; RV32-NEXT: lbu a2, 2(a1) -; RV32-NEXT: sb a2, 2(a0) -; RV32-NEXT: lbu a2, 1(a1) -; RV32-NEXT: sb a2, 1(a0) -; RV32-NEXT: lbu a1, 0(a1) -; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: lui a0, %hi(src) +; RV32-NEXT: lw a1, %lo(src)(a0) +; RV32-NEXT: lui a2, %hi(dst) +; RV32-NEXT: addi a0, a0, %lo(src) +; RV32-NEXT: sw a1, %lo(dst)(a2) +; RV32-NEXT: lw a1, 4(a0) +; RV32-NEXT: lh a3, 8(a0) +; RV32-NEXT: lbu a0, 10(a0) +; RV32-NEXT: addi a2, a2, %lo(dst) +; RV32-NEXT: sw a1, 4(a2) +; RV32-NEXT: sh a3, 8(a2) +; RV32-NEXT: sb a0, 10(a2) +; RV32-NEXT: li a0, 0 ; RV32-NEXT: ret ; -; RV64-LABEL: unaligned_memcpy8: +; RV64-LABEL: t0: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lbu a2, 7(a1) -; RV64-NEXT: sb a2, 7(a0) -; RV64-NEXT: lbu a2, 6(a1) -; RV64-NEXT: sb a2, 6(a0) -; RV64-NEXT: lbu a2, 5(a1) -; RV64-NEXT: sb a2, 5(a0) -; RV64-NEXT: lbu a2, 4(a1) -; RV64-NEXT: sb a2, 4(a0) -; RV64-NEXT: lbu a2, 3(a1) -; RV64-NEXT: sb a2, 3(a0) -; RV64-NEXT: lbu a2, 2(a1) -; RV64-NEXT: sb a2, 2(a0) -; RV64-NEXT: lbu a2, 1(a1) -; RV64-NEXT: sb a2, 1(a0) -; RV64-NEXT: lbu a1, 0(a1) -; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: lui a0, %hi(src) +; RV64-NEXT: lui a1, %hi(dst) +; RV64-NEXT: ld a2, %lo(src)(a0) +; RV64-NEXT: addi a0, a0, %lo(src) +; RV64-NEXT: lh a3, 8(a0) +; RV64-NEXT: lbu a0, 10(a0) +; RV64-NEXT: sd a2, %lo(dst)(a1) +; RV64-NEXT: addi a1, a1, %lo(dst) +; RV64-NEXT: sh a3, 8(a1) +; RV64-NEXT: sb a0, 10(a1) +; RV64-NEXT: li a0, 0 ; RV64-NEXT: ret ; -; RV32-FAST-LABEL: unaligned_memcpy8: +; RV32-FAST-LABEL: t0: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lw a2, 4(a1) -; RV32-FAST-NEXT: sw a2, 4(a0) -; RV32-FAST-NEXT: lw a1, 0(a1) -; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: lui a0, %hi(src) +; RV32-FAST-NEXT: lw a1, %lo(src)(a0) +; RV32-FAST-NEXT: addi a0, a0, %lo(src) +; RV32-FAST-NEXT: lw a2, 4(a0) +; RV32-FAST-NEXT: lw a0, 7(a0) +; RV32-FAST-NEXT: lui a3, %hi(dst) +; RV32-FAST-NEXT: sw a1, %lo(dst)(a3) +; RV32-FAST-NEXT: addi a1, a3, %lo(dst) +; RV32-FAST-NEXT: sw a0, 7(a1) +; RV32-FAST-NEXT: sw a2, 4(a1) +; RV32-FAST-NEXT: li a0, 0 ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: unaligned_memcpy8: +; RV64-FAST-LABEL: t0: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: ld a1, 0(a1) -; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: lui a0, %hi(src) +; RV64-FAST-NEXT: ld a1, %lo(src)(a0) +; RV64-FAST-NEXT: addi a0, a0, 
%lo(src) +; RV64-FAST-NEXT: lw a0, 7(a0) +; RV64-FAST-NEXT: lui a2, %hi(dst) +; RV64-FAST-NEXT: sd a1, %lo(dst)(a2) +; RV64-FAST-NEXT: addi a1, a2, %lo(dst) +; RV64-FAST-NEXT: sw a0, 7(a1) +; RV64-FAST-NEXT: li a0, 0 ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 8, i1 false) - ret void + call void @llvm.memcpy.p0.p0.i32(ptr align 8 @dst, ptr align 8 @src, i32 11, i1 false) + ret i32 0 } -define void @unaligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind { -; RV32-LABEL: unaligned_memcpy15: +define void @t1(ptr nocapture %C) nounwind { +; RV32-LABEL: t1: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lbu a2, 14(a1) -; RV32-NEXT: sb a2, 14(a0) -; RV32-NEXT: lbu a2, 13(a1) -; RV32-NEXT: sb a2, 13(a0) -; RV32-NEXT: lbu a2, 12(a1) -; RV32-NEXT: sb a2, 12(a0) -; RV32-NEXT: lbu a2, 11(a1) -; RV32-NEXT: sb a2, 11(a0) -; RV32-NEXT: lbu a2, 10(a1) -; RV32-NEXT: sb a2, 10(a0) -; RV32-NEXT: lbu a2, 9(a1) -; RV32-NEXT: sb a2, 9(a0) -; RV32-NEXT: lbu a2, 8(a1) -; RV32-NEXT: sb a2, 8(a0) -; RV32-NEXT: lbu a2, 7(a1) -; RV32-NEXT: sb a2, 7(a0) -; RV32-NEXT: lbu a2, 6(a1) -; RV32-NEXT: sb a2, 6(a0) -; RV32-NEXT: lbu a2, 5(a1) -; RV32-NEXT: sb a2, 5(a0) -; RV32-NEXT: lbu a2, 4(a1) -; RV32-NEXT: sb a2, 4(a0) -; RV32-NEXT: lbu a2, 3(a1) -; RV32-NEXT: sb a2, 3(a0) -; RV32-NEXT: lbu a2, 2(a1) -; RV32-NEXT: sb a2, 2(a0) -; RV32-NEXT: lbu a2, 1(a1) -; RV32-NEXT: sb a2, 1(a0) -; RV32-NEXT: lbu a1, 0(a1) -; RV32-NEXT: sb a1, 0(a0) -; RV32-NEXT: ret +; RV32-NEXT: lui a1, %hi(.L.str1) +; RV32-NEXT: addi a1, a1, %lo(.L.str1) +; RV32-NEXT: li a2, 31 +; RV32-NEXT: tail memcpy ; -; RV64-LABEL: unaligned_memcpy15: +; RV64-LABEL: t1: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lbu a2, 14(a1) -; RV64-NEXT: sb a2, 14(a0) -; RV64-NEXT: lbu a2, 13(a1) -; RV64-NEXT: sb a2, 13(a0) -; RV64-NEXT: lbu a2, 12(a1) -; RV64-NEXT: sb a2, 12(a0) -; RV64-NEXT: lbu a2, 11(a1) -; RV64-NEXT: sb a2, 11(a0) -; RV64-NEXT: lbu a2, 10(a1) -; RV64-NEXT: sb a2, 10(a0) -; RV64-NEXT: lbu a2, 9(a1) -; RV64-NEXT: sb a2, 9(a0) -; RV64-NEXT: lbu a2, 8(a1) -; RV64-NEXT: sb a2, 8(a0) -; RV64-NEXT: lbu a2, 7(a1) -; RV64-NEXT: sb a2, 7(a0) -; RV64-NEXT: lbu a2, 6(a1) -; RV64-NEXT: sb a2, 6(a0) -; RV64-NEXT: lbu a2, 5(a1) -; RV64-NEXT: sb a2, 5(a0) -; RV64-NEXT: lbu a2, 4(a1) -; RV64-NEXT: sb a2, 4(a0) -; RV64-NEXT: lbu a2, 3(a1) -; RV64-NEXT: sb a2, 3(a0) -; RV64-NEXT: lbu a2, 2(a1) -; RV64-NEXT: sb a2, 2(a0) -; RV64-NEXT: lbu a2, 1(a1) -; RV64-NEXT: sb a2, 1(a0) -; RV64-NEXT: lbu a1, 0(a1) -; RV64-NEXT: sb a1, 0(a0) -; RV64-NEXT: ret +; RV64-NEXT: lui a1, %hi(.L.str1) +; RV64-NEXT: addi a1, a1, %lo(.L.str1) +; RV64-NEXT: li a2, 31 +; RV64-NEXT: tail memcpy ; -; RV32-FAST-LABEL: unaligned_memcpy15: +; RV32-FAST-LABEL: t1: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lw a2, 11(a1) -; RV32-FAST-NEXT: sw a2, 11(a0) -; RV32-FAST-NEXT: lw a2, 8(a1) -; RV32-FAST-NEXT: sw a2, 8(a0) -; RV32-FAST-NEXT: lw a2, 4(a1) -; RV32-FAST-NEXT: sw a2, 4(a0) -; RV32-FAST-NEXT: lw a1, 0(a1) -; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: lui a1, 1141 +; RV32-FAST-NEXT: lui a2, 300325 +; RV32-FAST-NEXT: lui a3, 132181 +; RV32-FAST-NEXT: lui a4, 340483 +; RV32-FAST-NEXT: lui a5, 267556 +; RV32-FAST-NEXT: lui a6, 337154 +; RV32-FAST-NEXT: addi a1, a1, -439 +; RV32-FAST-NEXT: sw a1, 27(a0) +; RV32-FAST-NEXT: lui a1, 320757 +; RV32-FAST-NEXT: addi a2, a2, 1107 +; RV32-FAST-NEXT: addi a3, a3, -689 +; RV32-FAST-NEXT: addi a4, a4, -947 +; RV32-FAST-NEXT: sw a4, 16(a0) +; RV32-FAST-NEXT: sw a3, 20(a0) +; RV32-FAST-NEXT: sw a2, 24(a0) +; 
RV32-FAST-NEXT: lui a2, 365861 +; RV32-FAST-NEXT: addi a3, a5, 1871 +; RV32-FAST-NEXT: addi a4, a6, 69 +; RV32-FAST-NEXT: addi a1, a1, 1107 +; RV32-FAST-NEXT: addi a2, a2, -1980 +; RV32-FAST-NEXT: sw a2, 0(a0) +; RV32-FAST-NEXT: sw a1, 4(a0) +; RV32-FAST-NEXT: sw a4, 8(a0) +; RV32-FAST-NEXT: sw a3, 12(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: unaligned_memcpy15: +; RV64-FAST-LABEL: t1: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: ld a2, 7(a1) -; RV64-FAST-NEXT: sd a2, 7(a0) -; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: lui a1, %hi(.L.str1) +; RV64-FAST-NEXT: addi a2, a1, %lo(.L.str1) +; RV64-FAST-NEXT: ld a3, 23(a2) +; RV64-FAST-NEXT: ld a1, %lo(.L.str1)(a1) +; RV64-FAST-NEXT: ld a4, 8(a2) +; RV64-FAST-NEXT: ld a2, 16(a2) +; RV64-FAST-NEXT: sd a3, 23(a0) ; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: sd a4, 8(a0) +; RV64-FAST-NEXT: sd a2, 16(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 15, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str1, i64 31, i1 false) ret void } -define void @unaligned_memcpy16(ptr nocapture %dest, ptr %src) nounwind { -; RV32-LABEL: unaligned_memcpy16: -; RV32: # %bb.0: # %entry -; RV32-NEXT: lbu a2, 15(a1) -; RV32-NEXT: sb a2, 15(a0) -; RV32-NEXT: lbu a2, 14(a1) -; RV32-NEXT: sb a2, 14(a0) -; RV32-NEXT: lbu a2, 13(a1) -; RV32-NEXT: sb a2, 13(a0) -; RV32-NEXT: lbu a2, 12(a1) -; RV32-NEXT: sb a2, 12(a0) -; RV32-NEXT: lbu a2, 11(a1) -; RV32-NEXT: sb a2, 11(a0) -; RV32-NEXT: lbu a2, 10(a1) -; RV32-NEXT: sb a2, 10(a0) -; RV32-NEXT: lbu a2, 9(a1) -; RV32-NEXT: sb a2, 9(a0) -; RV32-NEXT: lbu a2, 8(a1) -; RV32-NEXT: sb a2, 8(a0) -; RV32-NEXT: lbu a2, 7(a1) -; RV32-NEXT: sb a2, 7(a0) -; RV32-NEXT: lbu a2, 6(a1) -; RV32-NEXT: sb a2, 6(a0) -; RV32-NEXT: lbu a2, 5(a1) -; RV32-NEXT: sb a2, 5(a0) -; RV32-NEXT: lbu a2, 4(a1) -; RV32-NEXT: sb a2, 4(a0) -; RV32-NEXT: lbu a2, 3(a1) -; RV32-NEXT: sb a2, 3(a0) -; RV32-NEXT: lbu a2, 2(a1) -; RV32-NEXT: sb a2, 2(a0) -; RV32-NEXT: lbu a2, 1(a1) -; RV32-NEXT: sb a2, 1(a0) -; RV32-NEXT: lbu a1, 0(a1) -; RV32-NEXT: sb a1, 0(a0) -; RV32-NEXT: ret +define void @t2(ptr nocapture %C) nounwind { +; RV32-BOTH-LABEL: t2: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lui a1, %hi(.L.str2) +; RV32-BOTH-NEXT: addi a1, a1, %lo(.L.str2) +; RV32-BOTH-NEXT: li a2, 36 +; RV32-BOTH-NEXT: tail memcpy ; -; RV64-LABEL: unaligned_memcpy16: +; RV64-LABEL: t2: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lbu a2, 15(a1) -; RV64-NEXT: sb a2, 15(a0) -; RV64-NEXT: lbu a2, 14(a1) -; RV64-NEXT: sb a2, 14(a0) -; RV64-NEXT: lbu a2, 13(a1) -; RV64-NEXT: sb a2, 13(a0) -; RV64-NEXT: lbu a2, 12(a1) -; RV64-NEXT: sb a2, 12(a0) -; RV64-NEXT: lbu a2, 11(a1) -; RV64-NEXT: sb a2, 11(a0) -; RV64-NEXT: lbu a2, 10(a1) -; RV64-NEXT: sb a2, 10(a0) -; RV64-NEXT: lbu a2, 9(a1) -; RV64-NEXT: sb a2, 9(a0) -; RV64-NEXT: lbu a2, 8(a1) -; RV64-NEXT: sb a2, 8(a0) -; RV64-NEXT: lbu a2, 7(a1) -; RV64-NEXT: sb a2, 7(a0) -; RV64-NEXT: lbu a2, 6(a1) -; RV64-NEXT: sb a2, 6(a0) -; RV64-NEXT: lbu a2, 5(a1) -; RV64-NEXT: sb a2, 5(a0) -; RV64-NEXT: lbu a2, 4(a1) -; RV64-NEXT: sb a2, 4(a0) -; RV64-NEXT: lbu a2, 3(a1) -; RV64-NEXT: sb a2, 3(a0) -; RV64-NEXT: lbu a2, 2(a1) -; RV64-NEXT: sb a2, 2(a0) -; RV64-NEXT: lbu a2, 1(a1) -; RV64-NEXT: sb a2, 1(a0) -; RV64-NEXT: lbu a1, 0(a1) -; RV64-NEXT: sb a1, 0(a0) -; RV64-NEXT: ret -; -; RV32-FAST-LABEL: unaligned_memcpy16: -; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lw a2, 12(a1) -; RV32-FAST-NEXT: sw a2, 12(a0) -; RV32-FAST-NEXT: lw a2, 8(a1) -; RV32-FAST-NEXT: sw 
a2, 8(a0) -; RV32-FAST-NEXT: lw a2, 4(a1) -; RV32-FAST-NEXT: sw a2, 4(a0) -; RV32-FAST-NEXT: lw a1, 0(a1) -; RV32-FAST-NEXT: sw a1, 0(a0) -; RV32-FAST-NEXT: ret +; RV64-NEXT: lui a1, %hi(.L.str2) +; RV64-NEXT: addi a1, a1, %lo(.L.str2) +; RV64-NEXT: li a2, 36 +; RV64-NEXT: tail memcpy ; -; RV64-FAST-LABEL: unaligned_memcpy16: +; RV64-FAST-LABEL: t2: ; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: lui a1, %hi(.L.str2) +; RV64-FAST-NEXT: lui a2, 1156 +; RV64-FAST-NEXT: ld a3, %lo(.L.str2)(a1) +; RV64-FAST-NEXT: addi a2, a2, 332 +; RV64-FAST-NEXT: addi a1, a1, %lo(.L.str2) +; RV64-FAST-NEXT: sw a2, 32(a0) ; RV64-FAST-NEXT: ld a2, 8(a1) +; RV64-FAST-NEXT: ld a4, 16(a1) +; RV64-FAST-NEXT: ld a1, 24(a1) +; RV64-FAST-NEXT: sd a3, 0(a0) ; RV64-FAST-NEXT: sd a2, 8(a0) -; RV64-FAST-NEXT: ld a1, 0(a1) -; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: sd a4, 16(a0) +; RV64-FAST-NEXT: sd a1, 24(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 16, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str2, i64 36, i1 false) ret void } -define void @unaligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind { -; RV32-LABEL: unaligned_memcpy31: +define void @t3(ptr nocapture %C) nounwind { +; RV32-LABEL: t3: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lbu a2, 30(a1) -; RV32-NEXT: sb a2, 30(a0) -; RV32-NEXT: lbu a2, 29(a1) -; RV32-NEXT: sb a2, 29(a0) -; RV32-NEXT: lbu a2, 28(a1) -; RV32-NEXT: sb a2, 28(a0) -; RV32-NEXT: lbu a2, 27(a1) -; RV32-NEXT: sb a2, 27(a0) -; RV32-NEXT: lbu a2, 26(a1) -; RV32-NEXT: sb a2, 26(a0) -; RV32-NEXT: lbu a2, 25(a1) -; RV32-NEXT: sb a2, 25(a0) -; RV32-NEXT: lbu a2, 24(a1) -; RV32-NEXT: sb a2, 24(a0) -; RV32-NEXT: lbu a2, 23(a1) -; RV32-NEXT: sb a2, 23(a0) -; RV32-NEXT: lbu a2, 22(a1) -; RV32-NEXT: sb a2, 22(a0) -; RV32-NEXT: lbu a2, 21(a1) -; RV32-NEXT: sb a2, 21(a0) -; RV32-NEXT: lbu a2, 20(a1) -; RV32-NEXT: sb a2, 20(a0) -; RV32-NEXT: lbu a2, 19(a1) -; RV32-NEXT: sb a2, 19(a0) -; RV32-NEXT: lbu a2, 18(a1) -; RV32-NEXT: sb a2, 18(a0) -; RV32-NEXT: lbu a2, 17(a1) -; RV32-NEXT: sb a2, 17(a0) -; RV32-NEXT: lbu a2, 16(a1) -; RV32-NEXT: sb a2, 16(a0) -; RV32-NEXT: lbu a2, 15(a1) -; RV32-NEXT: sb a2, 15(a0) -; RV32-NEXT: lbu a2, 14(a1) -; RV32-NEXT: sb a2, 14(a0) -; RV32-NEXT: lbu a2, 13(a1) -; RV32-NEXT: sb a2, 13(a0) -; RV32-NEXT: lbu a2, 12(a1) -; RV32-NEXT: sb a2, 12(a0) -; RV32-NEXT: lbu a2, 11(a1) -; RV32-NEXT: sb a2, 11(a0) -; RV32-NEXT: lbu a2, 10(a1) -; RV32-NEXT: sb a2, 10(a0) -; RV32-NEXT: lbu a2, 9(a1) -; RV32-NEXT: sb a2, 9(a0) -; RV32-NEXT: lbu a2, 8(a1) -; RV32-NEXT: sb a2, 8(a0) -; RV32-NEXT: lbu a2, 7(a1) -; RV32-NEXT: sb a2, 7(a0) -; RV32-NEXT: lbu a2, 6(a1) -; RV32-NEXT: sb a2, 6(a0) -; RV32-NEXT: lbu a2, 5(a1) -; RV32-NEXT: sb a2, 5(a0) -; RV32-NEXT: lbu a2, 4(a1) -; RV32-NEXT: sb a2, 4(a0) -; RV32-NEXT: lbu a2, 3(a1) -; RV32-NEXT: sb a2, 3(a0) -; RV32-NEXT: lbu a2, 2(a1) -; RV32-NEXT: sb a2, 2(a0) -; RV32-NEXT: lbu a2, 1(a1) -; RV32-NEXT: sb a2, 1(a0) -; RV32-NEXT: lbu a1, 0(a1) -; RV32-NEXT: sb a1, 0(a0) -; RV32-NEXT: ret +; RV32-NEXT: lui a1, %hi(.L.str3) +; RV32-NEXT: addi a1, a1, %lo(.L.str3) +; RV32-NEXT: li a2, 24 +; RV32-NEXT: tail memcpy ; -; RV64-LABEL: unaligned_memcpy31: +; RV64-LABEL: t3: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lbu a2, 30(a1) -; RV64-NEXT: sb a2, 30(a0) -; RV64-NEXT: lbu a2, 29(a1) -; RV64-NEXT: sb a2, 29(a0) -; RV64-NEXT: lbu a2, 28(a1) -; RV64-NEXT: sb a2, 28(a0) -; RV64-NEXT: lbu a2, 27(a1) -; RV64-NEXT: sb a2, 27(a0) -; RV64-NEXT: lbu a2, 26(a1) -; RV64-NEXT: sb a2, 
26(a0) -; RV64-NEXT: lbu a2, 25(a1) -; RV64-NEXT: sb a2, 25(a0) -; RV64-NEXT: lbu a2, 24(a1) -; RV64-NEXT: sb a2, 24(a0) -; RV64-NEXT: lbu a2, 23(a1) -; RV64-NEXT: sb a2, 23(a0) -; RV64-NEXT: lbu a2, 22(a1) -; RV64-NEXT: sb a2, 22(a0) -; RV64-NEXT: lbu a2, 21(a1) -; RV64-NEXT: sb a2, 21(a0) -; RV64-NEXT: lbu a2, 20(a1) -; RV64-NEXT: sb a2, 20(a0) -; RV64-NEXT: lbu a2, 19(a1) -; RV64-NEXT: sb a2, 19(a0) -; RV64-NEXT: lbu a2, 18(a1) -; RV64-NEXT: sb a2, 18(a0) -; RV64-NEXT: lbu a2, 17(a1) -; RV64-NEXT: sb a2, 17(a0) -; RV64-NEXT: lbu a2, 16(a1) -; RV64-NEXT: sb a2, 16(a0) -; RV64-NEXT: lbu a2, 15(a1) -; RV64-NEXT: sb a2, 15(a0) -; RV64-NEXT: lbu a2, 14(a1) -; RV64-NEXT: sb a2, 14(a0) -; RV64-NEXT: lbu a2, 13(a1) -; RV64-NEXT: sb a2, 13(a0) -; RV64-NEXT: lbu a2, 12(a1) -; RV64-NEXT: sb a2, 12(a0) -; RV64-NEXT: lbu a2, 11(a1) -; RV64-NEXT: sb a2, 11(a0) -; RV64-NEXT: lbu a2, 10(a1) -; RV64-NEXT: sb a2, 10(a0) -; RV64-NEXT: lbu a2, 9(a1) -; RV64-NEXT: sb a2, 9(a0) -; RV64-NEXT: lbu a2, 8(a1) -; RV64-NEXT: sb a2, 8(a0) -; RV64-NEXT: lbu a2, 7(a1) -; RV64-NEXT: sb a2, 7(a0) -; RV64-NEXT: lbu a2, 6(a1) -; RV64-NEXT: sb a2, 6(a0) -; RV64-NEXT: lbu a2, 5(a1) -; RV64-NEXT: sb a2, 5(a0) -; RV64-NEXT: lbu a2, 4(a1) -; RV64-NEXT: sb a2, 4(a0) -; RV64-NEXT: lbu a2, 3(a1) -; RV64-NEXT: sb a2, 3(a0) -; RV64-NEXT: lbu a2, 2(a1) -; RV64-NEXT: sb a2, 2(a0) -; RV64-NEXT: lbu a2, 1(a1) -; RV64-NEXT: sb a2, 1(a0) -; RV64-NEXT: lbu a1, 0(a1) -; RV64-NEXT: sb a1, 0(a0) -; RV64-NEXT: ret +; RV64-NEXT: lui a1, %hi(.L.str3) +; RV64-NEXT: addi a1, a1, %lo(.L.str3) +; RV64-NEXT: li a2, 24 +; RV64-NEXT: tail memcpy ; -; RV32-FAST-LABEL: unaligned_memcpy31: +; RV32-FAST-LABEL: t3: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lw a2, 27(a1) -; RV32-FAST-NEXT: sw a2, 27(a0) -; RV32-FAST-NEXT: lw a2, 24(a1) -; RV32-FAST-NEXT: sw a2, 24(a0) -; RV32-FAST-NEXT: lw a2, 20(a1) -; RV32-FAST-NEXT: sw a2, 20(a0) -; RV32-FAST-NEXT: lw a2, 16(a1) +; RV32-FAST-NEXT: lui a1, 1109 +; RV32-FAST-NEXT: lui a2, 340483 +; RV32-FAST-NEXT: lui a3, 267556 +; RV32-FAST-NEXT: lui a4, 337154 +; RV32-FAST-NEXT: lui a5, 320757 +; RV32-FAST-NEXT: addi a1, a1, -689 +; RV32-FAST-NEXT: addi a2, a2, -947 ; RV32-FAST-NEXT: sw a2, 16(a0) -; RV32-FAST-NEXT: lw a2, 12(a1) -; RV32-FAST-NEXT: sw a2, 12(a0) -; RV32-FAST-NEXT: lw a2, 8(a1) -; RV32-FAST-NEXT: sw a2, 8(a0) -; RV32-FAST-NEXT: lw a2, 4(a1) -; RV32-FAST-NEXT: sw a2, 4(a0) -; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 20(a0) +; RV32-FAST-NEXT: lui a1, 365861 +; RV32-FAST-NEXT: addi a2, a3, 1871 +; RV32-FAST-NEXT: addi a3, a4, 69 +; RV32-FAST-NEXT: addi a4, a5, 1107 +; RV32-FAST-NEXT: addi a1, a1, -1980 ; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: sw a4, 4(a0) +; RV32-FAST-NEXT: sw a3, 8(a0) +; RV32-FAST-NEXT: sw a2, 12(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: unaligned_memcpy31: +; RV64-FAST-LABEL: t3: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: ld a2, 23(a1) -; RV64-FAST-NEXT: sd a2, 23(a0) -; RV64-FAST-NEXT: ld a2, 16(a1) -; RV64-FAST-NEXT: sd a2, 16(a0) -; RV64-FAST-NEXT: ld a2, 8(a1) -; RV64-FAST-NEXT: sd a2, 8(a0) -; RV64-FAST-NEXT: ld a1, 0(a1) -; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: lui a1, %hi(.L.str3) +; RV64-FAST-NEXT: ld a2, %lo(.L.str3)(a1) +; RV64-FAST-NEXT: addi a1, a1, %lo(.L.str3) +; RV64-FAST-NEXT: ld a3, 8(a1) +; RV64-FAST-NEXT: ld a1, 16(a1) +; RV64-FAST-NEXT: sd a2, 0(a0) +; RV64-FAST-NEXT: sd a3, 8(a0) +; RV64-FAST-NEXT: sd a1, 16(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 31, i1 
false) - ret void -} - -; ---------------------------------------------------------------------- -; Fully aligned cases - -define void @aligned_memcpy0(ptr nocapture %dest, ptr %src) nounwind { -; RV32-BOTH-LABEL: aligned_memcpy0: -; RV32-BOTH: # %bb.0: # %entry -; RV32-BOTH-NEXT: ret -; -; RV64-BOTH-LABEL: aligned_memcpy0: -; RV64-BOTH: # %bb.0: # %entry -; RV64-BOTH-NEXT: ret -entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 0, i1 false) - ret void -} - -define void @aligned_memcpy1(ptr nocapture %dest, ptr %src) nounwind { -; RV32-BOTH-LABEL: aligned_memcpy1: -; RV32-BOTH: # %bb.0: # %entry -; RV32-BOTH-NEXT: lbu a1, 0(a1) -; RV32-BOTH-NEXT: sb a1, 0(a0) -; RV32-BOTH-NEXT: ret -; -; RV64-BOTH-LABEL: aligned_memcpy1: -; RV64-BOTH: # %bb.0: # %entry -; RV64-BOTH-NEXT: lbu a1, 0(a1) -; RV64-BOTH-NEXT: sb a1, 0(a0) -; RV64-BOTH-NEXT: ret -entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 1, i1 false) - ret void -} - -define void @aligned_memcpy2(ptr nocapture %dest, ptr %src) nounwind { -; RV32-BOTH-LABEL: aligned_memcpy2: -; RV32-BOTH: # %bb.0: # %entry -; RV32-BOTH-NEXT: lh a1, 0(a1) -; RV32-BOTH-NEXT: sh a1, 0(a0) -; RV32-BOTH-NEXT: ret -; -; RV64-BOTH-LABEL: aligned_memcpy2: -; RV64-BOTH: # %bb.0: # %entry -; RV64-BOTH-NEXT: lh a1, 0(a1) -; RV64-BOTH-NEXT: sh a1, 0(a0) -; RV64-BOTH-NEXT: ret -entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 2, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str3, i64 24, i1 false) ret void } -define void @aligned_memcpy3(ptr nocapture %dest, ptr %src) nounwind { -; RV32-BOTH-LABEL: aligned_memcpy3: -; RV32-BOTH: # %bb.0: # %entry -; RV32-BOTH-NEXT: lbu a2, 2(a1) -; RV32-BOTH-NEXT: sb a2, 2(a0) -; RV32-BOTH-NEXT: lh a1, 0(a1) -; RV32-BOTH-NEXT: sh a1, 0(a0) -; RV32-BOTH-NEXT: ret -; -; RV64-BOTH-LABEL: aligned_memcpy3: -; RV64-BOTH: # %bb.0: # %entry -; RV64-BOTH-NEXT: lbu a2, 2(a1) -; RV64-BOTH-NEXT: sb a2, 2(a0) -; RV64-BOTH-NEXT: lh a1, 0(a1) -; RV64-BOTH-NEXT: sh a1, 0(a0) -; RV64-BOTH-NEXT: ret -entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 3, i1 false) - ret void -} - -define void @aligned_memcpy4(ptr nocapture %dest, ptr %src) nounwind { -; RV32-BOTH-LABEL: aligned_memcpy4: -; RV32-BOTH: # %bb.0: # %entry -; RV32-BOTH-NEXT: lw a1, 0(a1) -; RV32-BOTH-NEXT: sw a1, 0(a0) -; RV32-BOTH-NEXT: ret -; -; RV64-BOTH-LABEL: aligned_memcpy4: -; RV64-BOTH: # %bb.0: # %entry -; RV64-BOTH-NEXT: lw a1, 0(a1) -; RV64-BOTH-NEXT: sw a1, 0(a0) -; RV64-BOTH-NEXT: ret -entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 4, i1 false) - ret void -} - -define void @aligned_memcpy7(ptr nocapture %dest, ptr %src) nounwind { -; RV32-LABEL: aligned_memcpy7: +define void @t4(ptr nocapture %C) nounwind { +; RV32-LABEL: t4: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lbu a2, 6(a1) -; RV32-NEXT: sb a2, 6(a0) -; RV32-NEXT: lh a2, 4(a1) -; RV32-NEXT: sh a2, 4(a0) -; RV32-NEXT: lw a1, 0(a1) -; RV32-NEXT: sw a1, 0(a0) -; RV32-NEXT: ret +; RV32-NEXT: lui a1, %hi(.L.str4) +; RV32-NEXT: addi a1, a1, %lo(.L.str4) +; RV32-NEXT: li a2, 18 +; RV32-NEXT: tail memcpy ; -; RV64-LABEL: aligned_memcpy7: +; RV64-LABEL: t4: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lbu a2, 6(a1) -; RV64-NEXT: sb a2, 6(a0) -; RV64-NEXT: lh a2, 4(a1) -; RV64-NEXT: sh a2, 4(a0) -; RV64-NEXT: lw a1, 0(a1) -; RV64-NEXT: sw a1, 0(a0) -; RV64-NEXT: ret +; RV64-NEXT: lui a1, %hi(.L.str4) +; RV64-NEXT: addi a1, a1, 
%lo(.L.str4) +; RV64-NEXT: li a2, 18 +; RV64-NEXT: tail memcpy ; -; RV32-FAST-LABEL: aligned_memcpy7: +; RV32-FAST-LABEL: t4: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lw a2, 3(a1) -; RV32-FAST-NEXT: sw a2, 3(a0) -; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: li a1, 32 +; RV32-FAST-NEXT: lui a2, 132388 +; RV32-FAST-NEXT: lui a3, 337154 +; RV32-FAST-NEXT: lui a4, 320757 +; RV32-FAST-NEXT: sh a1, 16(a0) +; RV32-FAST-NEXT: lui a1, 365861 +; RV32-FAST-NEXT: addi a2, a2, 1871 +; RV32-FAST-NEXT: addi a3, a3, 69 +; RV32-FAST-NEXT: addi a4, a4, 1107 +; RV32-FAST-NEXT: addi a1, a1, -1980 ; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: sw a4, 4(a0) +; RV32-FAST-NEXT: sw a3, 8(a0) +; RV32-FAST-NEXT: sw a2, 12(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: aligned_memcpy7: +; RV64-FAST-LABEL: t4: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lw a2, 3(a1) -; RV64-FAST-NEXT: sw a2, 3(a0) -; RV64-FAST-NEXT: lw a1, 0(a1) -; RV64-FAST-NEXT: sw a1, 0(a0) +; RV64-FAST-NEXT: lui a1, %hi(.L.str4) +; RV64-FAST-NEXT: ld a2, %lo(.L.str4)(a1) +; RV64-FAST-NEXT: addi a1, a1, %lo(.L.str4) +; RV64-FAST-NEXT: ld a1, 8(a1) +; RV64-FAST-NEXT: li a3, 32 +; RV64-FAST-NEXT: sd a2, 0(a0) +; RV64-FAST-NEXT: sd a1, 8(a0) +; RV64-FAST-NEXT: sh a3, 16(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 7, i1 false) - ret void -} - -define void @aligned_memcpy8(ptr nocapture %dest, ptr %src) nounwind { -; RV32-BOTH-LABEL: aligned_memcpy8: -; RV32-BOTH: # %bb.0: # %entry -; RV32-BOTH-NEXT: lw a2, 4(a1) -; RV32-BOTH-NEXT: sw a2, 4(a0) -; RV32-BOTH-NEXT: lw a1, 0(a1) -; RV32-BOTH-NEXT: sw a1, 0(a0) -; RV32-BOTH-NEXT: ret -; -; RV64-BOTH-LABEL: aligned_memcpy8: -; RV64-BOTH: # %bb.0: # %entry -; RV64-BOTH-NEXT: ld a1, 0(a1) -; RV64-BOTH-NEXT: sd a1, 0(a0) -; RV64-BOTH-NEXT: ret -entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 8, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str4, i64 18, i1 false) ret void } -define void @aligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind { -; RV32-LABEL: aligned_memcpy15: +define void @t5(ptr nocapture %C) nounwind { +; RV32-LABEL: t5: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lbu a2, 14(a1) -; RV32-NEXT: sb a2, 14(a0) -; RV32-NEXT: lh a2, 12(a1) -; RV32-NEXT: sh a2, 12(a0) -; RV32-NEXT: lw a2, 8(a1) -; RV32-NEXT: sw a2, 8(a0) -; RV32-NEXT: lw a2, 4(a1) -; RV32-NEXT: sw a2, 4(a0) -; RV32-NEXT: lw a1, 0(a1) -; RV32-NEXT: sw a1, 0(a0) +; RV32-NEXT: li a1, 84 +; RV32-NEXT: li a2, 83 +; RV32-NEXT: li a3, 89 +; RV32-NEXT: li a4, 82 +; RV32-NEXT: li a5, 72 +; RV32-NEXT: li a6, 68 +; RV32-NEXT: sb a2, 4(a0) +; RV32-NEXT: sb a1, 5(a0) +; RV32-NEXT: sb zero, 6(a0) +; RV32-NEXT: sb a6, 0(a0) +; RV32-NEXT: sb a5, 1(a0) +; RV32-NEXT: sb a4, 2(a0) +; RV32-NEXT: sb a3, 3(a0) ; RV32-NEXT: ret ; -; RV64-LABEL: aligned_memcpy15: +; RV64-LABEL: t5: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lbu a2, 14(a1) -; RV64-NEXT: sb a2, 14(a0) -; RV64-NEXT: lh a2, 12(a1) -; RV64-NEXT: sh a2, 12(a0) -; RV64-NEXT: lw a2, 8(a1) -; RV64-NEXT: sw a2, 8(a0) -; RV64-NEXT: ld a1, 0(a1) -; RV64-NEXT: sd a1, 0(a0) +; RV64-NEXT: li a1, 84 +; RV64-NEXT: li a2, 83 +; RV64-NEXT: li a3, 89 +; RV64-NEXT: li a4, 82 +; RV64-NEXT: li a5, 72 +; RV64-NEXT: li a6, 68 +; RV64-NEXT: sb a2, 4(a0) +; RV64-NEXT: sb a1, 5(a0) +; RV64-NEXT: sb zero, 6(a0) +; RV64-NEXT: sb a6, 0(a0) +; RV64-NEXT: sb a5, 1(a0) +; RV64-NEXT: sb a4, 2(a0) +; RV64-NEXT: sb a3, 3(a0) ; RV64-NEXT: ret ; -; RV32-FAST-LABEL: aligned_memcpy15: 
+; RV32-FAST-LABEL: t5: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lw a2, 11(a1) -; RV32-FAST-NEXT: sw a2, 11(a0) -; RV32-FAST-NEXT: lw a2, 8(a1) -; RV32-FAST-NEXT: sw a2, 8(a0) -; RV32-FAST-NEXT: lw a2, 4(a1) -; RV32-FAST-NEXT: sw a2, 4(a0) -; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: lui a1, 1349 +; RV32-FAST-NEXT: addi a1, a1, 857 +; RV32-FAST-NEXT: sw a1, 3(a0) +; RV32-FAST-NEXT: lui a1, 365861 +; RV32-FAST-NEXT: addi a1, a1, -1980 ; RV32-FAST-NEXT: sw a1, 0(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: aligned_memcpy15: +; RV64-FAST-LABEL: t5: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: ld a2, 7(a1) -; RV64-FAST-NEXT: sd a2, 7(a0) -; RV64-FAST-NEXT: ld a1, 0(a1) -; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: lui a1, 1349 +; RV64-FAST-NEXT: addi a1, a1, 857 +; RV64-FAST-NEXT: sw a1, 3(a0) +; RV64-FAST-NEXT: lui a1, 365861 +; RV64-FAST-NEXT: addi a1, a1, -1980 +; RV64-FAST-NEXT: sw a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 15, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str5, i64 7, i1 false) ret void } -define void @aligned_memcpy16(ptr nocapture %dest, ptr %src) nounwind { -; RV32-BOTH-LABEL: aligned_memcpy16: -; RV32-BOTH: # %bb.0: # %entry -; RV32-BOTH-NEXT: lw a2, 12(a1) -; RV32-BOTH-NEXT: sw a2, 12(a0) -; RV32-BOTH-NEXT: lw a2, 8(a1) -; RV32-BOTH-NEXT: sw a2, 8(a0) -; RV32-BOTH-NEXT: lw a2, 4(a1) -; RV32-BOTH-NEXT: sw a2, 4(a0) -; RV32-BOTH-NEXT: lw a1, 0(a1) -; RV32-BOTH-NEXT: sw a1, 0(a0) -; RV32-BOTH-NEXT: ret -; -; RV64-BOTH-LABEL: aligned_memcpy16: -; RV64-BOTH: # %bb.0: # %entry -; RV64-BOTH-NEXT: ld a2, 8(a1) -; RV64-BOTH-NEXT: sd a2, 8(a0) -; RV64-BOTH-NEXT: ld a1, 0(a1) -; RV64-BOTH-NEXT: sd a1, 0(a0) -; RV64-BOTH-NEXT: ret -entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 16, i1 false) - ret void -} - -define void @aligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind { -; RV32-LABEL: aligned_memcpy31: +define void @t6() nounwind { +; RV32-LABEL: t6: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lbu a2, 30(a1) -; RV32-NEXT: sb a2, 30(a0) -; RV32-NEXT: lh a2, 28(a1) -; RV32-NEXT: sh a2, 28(a0) -; RV32-NEXT: lw a2, 24(a1) -; RV32-NEXT: sw a2, 24(a0) -; RV32-NEXT: lw a2, 20(a1) -; RV32-NEXT: sw a2, 20(a0) -; RV32-NEXT: lw a2, 16(a1) -; RV32-NEXT: sw a2, 16(a0) -; RV32-NEXT: lw a2, 12(a1) -; RV32-NEXT: sw a2, 12(a0) -; RV32-NEXT: lw a2, 8(a1) -; RV32-NEXT: sw a2, 8(a0) -; RV32-NEXT: lw a2, 4(a1) -; RV32-NEXT: sw a2, 4(a0) -; RV32-NEXT: lw a1, 0(a1) -; RV32-NEXT: sw a1, 0(a0) +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: lui a0, %hi(spool.splbuf) +; RV32-NEXT: addi a0, a0, %lo(spool.splbuf) +; RV32-NEXT: lui a1, %hi(.L.str6) +; RV32-NEXT: addi a1, a1, %lo(.L.str6) +; RV32-NEXT: li a2, 14 +; RV32-NEXT: call memcpy +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; -; RV64-LABEL: aligned_memcpy31: +; RV64-LABEL: t6: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lbu a2, 30(a1) -; RV64-NEXT: sb a2, 30(a0) -; RV64-NEXT: lh a2, 28(a1) -; RV64-NEXT: sh a2, 28(a0) -; RV64-NEXT: lw a2, 24(a1) -; RV64-NEXT: sw a2, 24(a0) -; RV64-NEXT: ld a2, 16(a1) -; RV64-NEXT: sd a2, 16(a0) -; RV64-NEXT: ld a2, 8(a1) -; RV64-NEXT: sd a2, 8(a0) -; RV64-NEXT: ld a1, 0(a1) -; RV64-NEXT: sd a1, 0(a0) +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: lui a0, %hi(spool.splbuf) +; RV64-NEXT: addi a0, a0, 
%lo(spool.splbuf) +; RV64-NEXT: lui a1, %hi(.L.str6) +; RV64-NEXT: addi a1, a1, %lo(.L.str6) +; RV64-NEXT: li a2, 14 +; RV64-NEXT: call memcpy +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret ; -; RV32-FAST-LABEL: aligned_memcpy31: +; RV32-FAST-LABEL: t6: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lw a2, 27(a1) -; RV32-FAST-NEXT: sw a2, 27(a0) -; RV32-FAST-NEXT: lw a2, 24(a1) -; RV32-FAST-NEXT: sw a2, 24(a0) -; RV32-FAST-NEXT: lw a2, 20(a1) -; RV32-FAST-NEXT: sw a2, 20(a0) -; RV32-FAST-NEXT: lw a2, 16(a1) -; RV32-FAST-NEXT: sw a2, 16(a0) -; RV32-FAST-NEXT: lw a2, 12(a1) -; RV32-FAST-NEXT: sw a2, 12(a0) -; RV32-FAST-NEXT: lw a2, 8(a1) -; RV32-FAST-NEXT: sw a2, 8(a0) -; RV32-FAST-NEXT: lw a2, 4(a1) -; RV32-FAST-NEXT: sw a2, 4(a0) -; RV32-FAST-NEXT: lw a1, 0(a1) -; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: lui a0, %hi(spool.splbuf) +; RV32-FAST-NEXT: li a1, 88 +; RV32-FAST-NEXT: sh a1, %lo(spool.splbuf+12)(a0) +; RV32-FAST-NEXT: lui a1, 361862 +; RV32-FAST-NEXT: addi a1, a1, -1960 +; RV32-FAST-NEXT: sw a1, %lo(spool.splbuf+8)(a0) +; RV32-FAST-NEXT: lui a1, 362199 +; RV32-FAST-NEXT: addi a1, a1, 559 +; RV32-FAST-NEXT: sw a1, %lo(spool.splbuf+4)(a0) +; RV32-FAST-NEXT: lui a1, 460503 +; RV32-FAST-NEXT: addi a1, a1, 1071 +; RV32-FAST-NEXT: sw a1, %lo(spool.splbuf)(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: aligned_memcpy31: +; RV64-FAST-LABEL: t6: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: ld a2, 23(a1) -; RV64-FAST-NEXT: sd a2, 23(a0) -; RV64-FAST-NEXT: ld a2, 16(a1) -; RV64-FAST-NEXT: sd a2, 16(a0) -; RV64-FAST-NEXT: ld a2, 8(a1) -; RV64-FAST-NEXT: sd a2, 8(a0) -; RV64-FAST-NEXT: ld a1, 0(a1) -; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: lui a0, %hi(.L.str6) +; RV64-FAST-NEXT: ld a1, %lo(.L.str6)(a0) +; RV64-FAST-NEXT: addi a0, a0, %lo(.L.str6) +; RV64-FAST-NEXT: ld a0, 6(a0) +; RV64-FAST-NEXT: lui a2, %hi(spool.splbuf) +; RV64-FAST-NEXT: sd a1, %lo(spool.splbuf)(a2) +; RV64-FAST-NEXT: sd a0, %lo(spool.splbuf+6)(a2) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 31, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr @spool.splbuf, ptr @.str6, i64 14, i1 false) ret void } -; ------------------------------------------------------------------------ -; A few partially aligned cases +%struct.Foo = type { i32, i32, i32, i32 } - -define void @memcpy16_align4(ptr nocapture %dest, ptr nocapture %src) nounwind { -; RV32-BOTH-LABEL: memcpy16_align4: +define void @t7(ptr nocapture %a, ptr nocapture %b) nounwind { +; RV32-BOTH-LABEL: t7: ; RV32-BOTH: # %bb.0: # %entry ; RV32-BOTH-NEXT: lw a2, 12(a1) ; RV32-BOTH-NEXT: sw a2, 12(a0) @@ -947,7 +418,7 @@ define void @memcpy16_align4(ptr nocapture %dest, ptr nocapture %src) nounwind { ; RV32-BOTH-NEXT: sw a1, 0(a0) ; RV32-BOTH-NEXT: ret ; -; RV64-LABEL: memcpy16_align4: +; RV64-LABEL: t7: ; RV64: # %bb.0: # %entry ; RV64-NEXT: lw a2, 12(a1) ; RV64-NEXT: sw a2, 12(a0) @@ -959,7 +430,7 @@ define void @memcpy16_align4(ptr nocapture %dest, ptr nocapture %src) nounwind { ; RV64-NEXT: sw a1, 0(a0) ; RV64-NEXT: ret ; -; RV64-FAST-LABEL: memcpy16_align4: +; RV64-FAST-LABEL: t7: ; RV64-FAST: # %bb.0: # %entry ; RV64-FAST-NEXT: ld a2, 8(a1) ; RV64-FAST-NEXT: sd a2, 8(a0) @@ -967,58 +438,11 @@ define void @memcpy16_align4(ptr nocapture %dest, ptr nocapture %src) nounwind { ; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i32(ptr align 4 %dest, ptr align 4 %src, i32 16, i1 false) + tail call 
void @llvm.memcpy.p0.p0.i32(ptr align 4 %a, ptr align 4 %b, i32 16, i1 false) ret void } -define i32 @memcpy11_align8(ptr nocapture %dest, ptr %src) { -; RV32-LABEL: memcpy11_align8: -; RV32: # %bb.0: # %entry -; RV32-NEXT: lbu a2, 10(a1) -; RV32-NEXT: sb a2, 10(a0) -; RV32-NEXT: lh a2, 8(a1) -; RV32-NEXT: sh a2, 8(a0) -; RV32-NEXT: lw a2, 4(a1) -; RV32-NEXT: sw a2, 4(a0) -; RV32-NEXT: lw a1, 0(a1) -; RV32-NEXT: sw a1, 0(a0) -; RV32-NEXT: li a0, 0 -; RV32-NEXT: ret -; -; RV64-LABEL: memcpy11_align8: -; RV64: # %bb.0: # %entry -; RV64-NEXT: lbu a2, 10(a1) -; RV64-NEXT: sb a2, 10(a0) -; RV64-NEXT: lh a2, 8(a1) -; RV64-NEXT: sh a2, 8(a0) -; RV64-NEXT: ld a1, 0(a1) -; RV64-NEXT: sd a1, 0(a0) -; RV64-NEXT: li a0, 0 -; RV64-NEXT: ret -; -; RV32-FAST-LABEL: memcpy11_align8: -; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lw a2, 7(a1) -; RV32-FAST-NEXT: sw a2, 7(a0) -; RV32-FAST-NEXT: lw a2, 4(a1) -; RV32-FAST-NEXT: sw a2, 4(a0) -; RV32-FAST-NEXT: lw a1, 0(a1) -; RV32-FAST-NEXT: sw a1, 0(a0) -; RV32-FAST-NEXT: li a0, 0 -; RV32-FAST-NEXT: ret -; -; RV64-FAST-LABEL: memcpy11_align8: -; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lw a2, 7(a1) -; RV64-FAST-NEXT: sw a2, 7(a0) -; RV64-FAST-NEXT: ld a1, 0(a1) -; RV64-FAST-NEXT: sd a1, 0(a0) -; RV64-FAST-NEXT: li a0, 0 -; RV64-FAST-NEXT: ret -entry: - call void @llvm.memcpy.p0.p0.i32(ptr align 8 %dest, ptr align 8 %src, i32 11, i1 false) - ret i32 0 -} - declare void @llvm.memcpy.p0.p0.i32(ptr nocapture, ptr nocapture, i32, i1) nounwind declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounwind +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; RV64-BOTH: {{.*}} From 681c4a2068702f7483608b89e7a7e9235faf6bd9 Mon Sep 17 00:00:00 2001 From: Pengcheng Wang Date: Mon, 13 Jan 2025 11:28:24 +0800 Subject: [PATCH 224/408] Reapply "[RISCV] Rework memcpy test (#120364)" Use descriptive names and add more cases. This recommits 59bba39 which was reverted in 4637c77. 
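For reference, the new naming scheme encodes the alignment and copy size
directly in the test name, replacing the old t0..t7 names whose
Dhrystone-string inputs obscured which alignment/size combination each
case exercised. A representative pair of cases, excerpted verbatim from
the reworked test in the diff below (shown here only to illustrate the
scheme; the diff itself is authoritative):

    ; Fully unaligned, 15-byte copy: no align attributes on either pointer.
    define void @unaligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind {
    entry:
      tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 15, i1 false)
      ret void
    }

    ; Same 15-byte copy, but with both pointers known 8-byte aligned.
    define void @aligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind {
    entry:
      tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 15, i1 false)
      ret void
    }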
--- llvm/test/CodeGen/RISCV/memcpy.ll | 913 ++++++++++++++++++++---------- 1 file changed, 615 insertions(+), 298 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/memcpy.ll b/llvm/test/CodeGen/RISCV/memcpy.ll index 1ab3722080f70..447fc26b0106e 100644 --- a/llvm/test/CodeGen/RISCV/memcpy.ll +++ b/llvm/test/CodeGen/RISCV/memcpy.ll @@ -7,406 +7,676 @@ ; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32-FAST ; RUN: llc < %s -mtriple=riscv64 -mattr=+unaligned-scalar-mem \ ; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64-FAST -%struct.x = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 } -@src = external dso_local global %struct.x -@dst = external dso_local global %struct.x +; ---------------------------------------------------------------------- +; Fully unaligned cases -@.str1 = private unnamed_addr constant [31 x i8] c"DHRYSTONE PROGRAM, SOME STRING\00", align 1 -@.str2 = private unnamed_addr constant [36 x i8] c"DHRYSTONE PROGRAM, SOME STRING BLAH\00", align 1 -@.str3 = private unnamed_addr constant [24 x i8] c"DHRYSTONE PROGRAM, SOME\00", align 1 -@.str4 = private unnamed_addr constant [18 x i8] c"DHRYSTONE PROGR \00", align 1 -@.str5 = private unnamed_addr constant [7 x i8] c"DHRYST\00", align 1 -@.str6 = private unnamed_addr constant [14 x i8] c"/tmp/rmXXXXXX\00", align 1 -@spool.splbuf = internal global [512 x i8] zeroinitializer, align 16 +define void @unaligned_memcpy0(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: unaligned_memcpy0: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: unaligned_memcpy0: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 0, i1 false) + ret void +} -define i32 @t0() { -; RV32-LABEL: t0: +define void @unaligned_memcpy1(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: unaligned_memcpy1: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lbu a1, 0(a1) +; RV32-BOTH-NEXT: sb a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: unaligned_memcpy1: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: lbu a1, 0(a1) +; RV64-BOTH-NEXT: sb a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 1, i1 false) + ret void +} + +define void @unaligned_memcpy2(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy2: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lui a0, %hi(src) -; RV32-NEXT: lw a1, %lo(src)(a0) -; RV32-NEXT: lui a2, %hi(dst) -; RV32-NEXT: addi a0, a0, %lo(src) -; RV32-NEXT: sw a1, %lo(dst)(a2) -; RV32-NEXT: lw a1, 4(a0) -; RV32-NEXT: lh a3, 8(a0) -; RV32-NEXT: lbu a0, 10(a0) -; RV32-NEXT: addi a2, a2, %lo(dst) -; RV32-NEXT: sw a1, 4(a2) -; RV32-NEXT: sh a3, 8(a2) -; RV32-NEXT: sb a0, 10(a2) -; RV32-NEXT: li a0, 0 +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) ; RV32-NEXT: ret ; -; RV64-LABEL: t0: +; RV64-LABEL: unaligned_memcpy2: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lui a0, %hi(src) -; RV64-NEXT: lui a1, %hi(dst) -; RV64-NEXT: ld a2, %lo(src)(a0) -; RV64-NEXT: addi a0, a0, %lo(src) -; RV64-NEXT: lh a3, 8(a0) -; RV64-NEXT: lbu a0, 10(a0) -; RV64-NEXT: sd a2, %lo(dst)(a1) -; RV64-NEXT: addi a1, a1, %lo(dst) -; RV64-NEXT: sh a3, 8(a1) -; RV64-NEXT: sb a0, 10(a1) -; RV64-NEXT: li a0, 0 +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) ; RV64-NEXT: ret ; -; RV32-FAST-LABEL: t0: +; RV32-FAST-LABEL: unaligned_memcpy2: ; RV32-FAST: # 
%bb.0: # %entry -; RV32-FAST-NEXT: lui a0, %hi(src) -; RV32-FAST-NEXT: lw a1, %lo(src)(a0) -; RV32-FAST-NEXT: addi a0, a0, %lo(src) -; RV32-FAST-NEXT: lw a2, 4(a0) -; RV32-FAST-NEXT: lw a0, 7(a0) -; RV32-FAST-NEXT: lui a3, %hi(dst) -; RV32-FAST-NEXT: sw a1, %lo(dst)(a3) -; RV32-FAST-NEXT: addi a1, a3, %lo(dst) -; RV32-FAST-NEXT: sw a0, 7(a1) -; RV32-FAST-NEXT: sw a2, 4(a1) -; RV32-FAST-NEXT: li a0, 0 +; RV32-FAST-NEXT: lh a1, 0(a1) +; RV32-FAST-NEXT: sh a1, 0(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: t0: +; RV64-FAST-LABEL: unaligned_memcpy2: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a0, %hi(src) -; RV64-FAST-NEXT: ld a1, %lo(src)(a0) -; RV64-FAST-NEXT: addi a0, a0, %lo(src) -; RV64-FAST-NEXT: lw a0, 7(a0) -; RV64-FAST-NEXT: lui a2, %hi(dst) -; RV64-FAST-NEXT: sd a1, %lo(dst)(a2) -; RV64-FAST-NEXT: addi a1, a2, %lo(dst) -; RV64-FAST-NEXT: sw a0, 7(a1) -; RV64-FAST-NEXT: li a0, 0 +; RV64-FAST-NEXT: lh a1, 0(a1) +; RV64-FAST-NEXT: sh a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - call void @llvm.memcpy.p0.p0.i32(ptr align 8 @dst, ptr align 8 @src, i32 11, i1 false) - ret i32 0 + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 2, i1 false) + ret void } -define void @t1(ptr nocapture %C) nounwind { -; RV32-LABEL: t1: +define void @unaligned_memcpy3(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy3: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lui a1, %hi(.L.str1) -; RV32-NEXT: addi a1, a1, %lo(.L.str1) -; RV32-NEXT: li a2, 31 +; RV32-NEXT: lbu a2, 2(a1) +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: unaligned_memcpy3: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 2(a1) +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy3: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lbu a2, 2(a1) +; RV32-FAST-NEXT: sb a2, 2(a0) +; RV32-FAST-NEXT: lh a1, 0(a1) +; RV32-FAST-NEXT: sh a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy3: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: lbu a2, 2(a1) +; RV64-FAST-NEXT: sb a2, 2(a0) +; RV64-FAST-NEXT: lh a1, 0(a1) +; RV64-FAST-NEXT: sh a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false) + ret void +} + +define void @unaligned_memcpy4(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy4: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 3(a1) +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: lbu a2, 2(a1) +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: unaligned_memcpy4: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 3(a1) +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: lbu a2, 2(a1) +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy4: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy4: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: lw a1, 0(a1) +; RV64-FAST-NEXT: sw a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 4, i1 
false) + ret void +} + +define void @unaligned_memcpy7(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy7: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 6(a1) +; RV32-NEXT: sb a2, 6(a0) +; RV32-NEXT: lbu a2, 5(a1) +; RV32-NEXT: sb a2, 5(a0) +; RV32-NEXT: lbu a2, 4(a1) +; RV32-NEXT: sb a2, 4(a0) +; RV32-NEXT: lbu a2, 3(a1) +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: lbu a2, 2(a1) +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: unaligned_memcpy7: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 6(a1) +; RV64-NEXT: sb a2, 6(a0) +; RV64-NEXT: lbu a2, 5(a1) +; RV64-NEXT: sb a2, 5(a0) +; RV64-NEXT: lbu a2, 4(a1) +; RV64-NEXT: sb a2, 4(a0) +; RV64-NEXT: lbu a2, 3(a1) +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: lbu a2, 2(a1) +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy7: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 3(a1) +; RV32-FAST-NEXT: sw a2, 3(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy7: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: lw a2, 3(a1) +; RV64-FAST-NEXT: sw a2, 3(a0) +; RV64-FAST-NEXT: lw a1, 0(a1) +; RV64-FAST-NEXT: sw a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 7, i1 false) + ret void +} + +define void @unaligned_memcpy8(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 7(a1) +; RV32-NEXT: sb a2, 7(a0) +; RV32-NEXT: lbu a2, 6(a1) +; RV32-NEXT: sb a2, 6(a0) +; RV32-NEXT: lbu a2, 5(a1) +; RV32-NEXT: sb a2, 5(a0) +; RV32-NEXT: lbu a2, 4(a1) +; RV32-NEXT: sb a2, 4(a0) +; RV32-NEXT: lbu a2, 3(a1) +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: lbu a2, 2(a1) +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: unaligned_memcpy8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 7(a1) +; RV64-NEXT: sb a2, 7(a0) +; RV64-NEXT: lbu a2, 6(a1) +; RV64-NEXT: sb a2, 6(a0) +; RV64-NEXT: lbu a2, 5(a1) +; RV64-NEXT: sb a2, 5(a0) +; RV64-NEXT: lbu a2, 4(a1) +; RV64-NEXT: sb a2, 4(a0) +; RV64-NEXT: lbu a2, 3(a1) +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: lbu a2, 2(a1) +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy8: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy8: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 8, i1 false) + ret void +} + +define void @unaligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy15: +; RV32: # %bb.0: # %entry +; RV32-NEXT: li a2, 15 ; RV32-NEXT: tail memcpy ; -; RV64-LABEL: t1: +; RV64-LABEL: unaligned_memcpy15: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lui a1, %hi(.L.str1) -; RV64-NEXT: addi a1, a1, %lo(.L.str1) -; RV64-NEXT: li a2, 31 +; 
RV64-NEXT: li a2, 15 ; RV64-NEXT: tail memcpy ; -; RV32-FAST-LABEL: t1: +; RV32-FAST-LABEL: unaligned_memcpy15: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lui a1, 1141 -; RV32-FAST-NEXT: lui a2, 300325 -; RV32-FAST-NEXT: lui a3, 132181 -; RV32-FAST-NEXT: lui a4, 340483 -; RV32-FAST-NEXT: lui a5, 267556 -; RV32-FAST-NEXT: lui a6, 337154 -; RV32-FAST-NEXT: addi a1, a1, -439 -; RV32-FAST-NEXT: sw a1, 27(a0) -; RV32-FAST-NEXT: lui a1, 320757 -; RV32-FAST-NEXT: addi a2, a2, 1107 -; RV32-FAST-NEXT: addi a3, a3, -689 -; RV32-FAST-NEXT: addi a4, a4, -947 -; RV32-FAST-NEXT: sw a4, 16(a0) -; RV32-FAST-NEXT: sw a3, 20(a0) -; RV32-FAST-NEXT: sw a2, 24(a0) -; RV32-FAST-NEXT: lui a2, 365861 -; RV32-FAST-NEXT: addi a3, a5, 1871 -; RV32-FAST-NEXT: addi a4, a6, 69 -; RV32-FAST-NEXT: addi a1, a1, 1107 -; RV32-FAST-NEXT: addi a2, a2, -1980 -; RV32-FAST-NEXT: sw a2, 0(a0) -; RV32-FAST-NEXT: sw a1, 4(a0) -; RV32-FAST-NEXT: sw a4, 8(a0) -; RV32-FAST-NEXT: sw a3, 12(a0) +; RV32-FAST-NEXT: lw a2, 11(a1) +; RV32-FAST-NEXT: sw a2, 11(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: t1: +; RV64-FAST-LABEL: unaligned_memcpy15: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a1, %hi(.L.str1) -; RV64-FAST-NEXT: addi a2, a1, %lo(.L.str1) -; RV64-FAST-NEXT: ld a3, 23(a2) -; RV64-FAST-NEXT: ld a1, %lo(.L.str1)(a1) -; RV64-FAST-NEXT: ld a4, 8(a2) -; RV64-FAST-NEXT: ld a2, 16(a2) -; RV64-FAST-NEXT: sd a3, 23(a0) +; RV64-FAST-NEXT: ld a2, 7(a1) +; RV64-FAST-NEXT: sd a2, 7(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) ; RV64-FAST-NEXT: sd a1, 0(a0) -; RV64-FAST-NEXT: sd a4, 8(a0) -; RV64-FAST-NEXT: sd a2, 16(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str1, i64 31, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 15, i1 false) ret void } -define void @t2(ptr nocapture %C) nounwind { -; RV32-BOTH-LABEL: t2: -; RV32-BOTH: # %bb.0: # %entry -; RV32-BOTH-NEXT: lui a1, %hi(.L.str2) -; RV32-BOTH-NEXT: addi a1, a1, %lo(.L.str2) -; RV32-BOTH-NEXT: li a2, 36 -; RV32-BOTH-NEXT: tail memcpy +define void @unaligned_memcpy16(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy16: +; RV32: # %bb.0: # %entry +; RV32-NEXT: li a2, 16 +; RV32-NEXT: tail memcpy ; -; RV64-LABEL: t2: +; RV64-LABEL: unaligned_memcpy16: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lui a1, %hi(.L.str2) -; RV64-NEXT: addi a1, a1, %lo(.L.str2) -; RV64-NEXT: li a2, 36 +; RV64-NEXT: li a2, 16 ; RV64-NEXT: tail memcpy ; -; RV64-FAST-LABEL: t2: +; RV32-FAST-LABEL: unaligned_memcpy16: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 12(a1) +; RV32-FAST-NEXT: sw a2, 12(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy16: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a1, %hi(.L.str2) -; RV64-FAST-NEXT: lui a2, 1156 -; RV64-FAST-NEXT: ld a3, %lo(.L.str2)(a1) -; RV64-FAST-NEXT: addi a2, a2, 332 -; RV64-FAST-NEXT: addi a1, a1, %lo(.L.str2) -; RV64-FAST-NEXT: sw a2, 32(a0) ; RV64-FAST-NEXT: ld a2, 8(a1) -; RV64-FAST-NEXT: ld a4, 16(a1) -; RV64-FAST-NEXT: ld a1, 24(a1) -; RV64-FAST-NEXT: sd a3, 0(a0) ; RV64-FAST-NEXT: sd a2, 8(a0) -; RV64-FAST-NEXT: sd a4, 16(a0) -; RV64-FAST-NEXT: sd 
a1, 24(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str2, i64 36, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 16, i1 false) ret void } -define void @t3(ptr nocapture %C) nounwind { -; RV32-LABEL: t3: +define void @unaligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy31: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lui a1, %hi(.L.str3) -; RV32-NEXT: addi a1, a1, %lo(.L.str3) -; RV32-NEXT: li a2, 24 +; RV32-NEXT: li a2, 31 ; RV32-NEXT: tail memcpy ; -; RV64-LABEL: t3: +; RV64-LABEL: unaligned_memcpy31: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lui a1, %hi(.L.str3) -; RV64-NEXT: addi a1, a1, %lo(.L.str3) -; RV64-NEXT: li a2, 24 +; RV64-NEXT: li a2, 31 ; RV64-NEXT: tail memcpy ; -; RV32-FAST-LABEL: t3: +; RV32-FAST-LABEL: unaligned_memcpy31: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lui a1, 1109 -; RV32-FAST-NEXT: lui a2, 340483 -; RV32-FAST-NEXT: lui a3, 267556 -; RV32-FAST-NEXT: lui a4, 337154 -; RV32-FAST-NEXT: lui a5, 320757 -; RV32-FAST-NEXT: addi a1, a1, -689 -; RV32-FAST-NEXT: addi a2, a2, -947 +; RV32-FAST-NEXT: lw a2, 27(a1) +; RV32-FAST-NEXT: sw a2, 27(a0) +; RV32-FAST-NEXT: lw a2, 24(a1) +; RV32-FAST-NEXT: sw a2, 24(a0) +; RV32-FAST-NEXT: lw a2, 20(a1) +; RV32-FAST-NEXT: sw a2, 20(a0) +; RV32-FAST-NEXT: lw a2, 16(a1) ; RV32-FAST-NEXT: sw a2, 16(a0) -; RV32-FAST-NEXT: sw a1, 20(a0) -; RV32-FAST-NEXT: lui a1, 365861 -; RV32-FAST-NEXT: addi a2, a3, 1871 -; RV32-FAST-NEXT: addi a3, a4, 69 -; RV32-FAST-NEXT: addi a4, a5, 1107 -; RV32-FAST-NEXT: addi a1, a1, -1980 -; RV32-FAST-NEXT: sw a1, 0(a0) -; RV32-FAST-NEXT: sw a4, 4(a0) -; RV32-FAST-NEXT: sw a3, 8(a0) +; RV32-FAST-NEXT: lw a2, 12(a1) ; RV32-FAST-NEXT: sw a2, 12(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: t3: +; RV64-FAST-LABEL: unaligned_memcpy31: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a1, %hi(.L.str3) -; RV64-FAST-NEXT: ld a2, %lo(.L.str3)(a1) -; RV64-FAST-NEXT: addi a1, a1, %lo(.L.str3) -; RV64-FAST-NEXT: ld a3, 8(a1) -; RV64-FAST-NEXT: ld a1, 16(a1) -; RV64-FAST-NEXT: sd a2, 0(a0) -; RV64-FAST-NEXT: sd a3, 8(a0) -; RV64-FAST-NEXT: sd a1, 16(a0) +; RV64-FAST-NEXT: ld a2, 23(a1) +; RV64-FAST-NEXT: sd a2, 23(a0) +; RV64-FAST-NEXT: ld a2, 16(a1) +; RV64-FAST-NEXT: sd a2, 16(a0) +; RV64-FAST-NEXT: ld a2, 8(a1) +; RV64-FAST-NEXT: sd a2, 8(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str3, i64 24, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 31, i1 false) + ret void +} + +; ---------------------------------------------------------------------- +; Fully aligned cases + +define void @aligned_memcpy0(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy0: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy0: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 0, i1 false) + ret void +} + +define void @aligned_memcpy1(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy1: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lbu a1, 0(a1) +; 
RV32-BOTH-NEXT: sb a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy1: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: lbu a1, 0(a1) +; RV64-BOTH-NEXT: sb a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 1, i1 false) + ret void +} + +define void @aligned_memcpy2(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy2: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lh a1, 0(a1) +; RV32-BOTH-NEXT: sh a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy2: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: lh a1, 0(a1) +; RV64-BOTH-NEXT: sh a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 2, i1 false) + ret void +} + +define void @aligned_memcpy3(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy3: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lbu a2, 2(a1) +; RV32-BOTH-NEXT: sb a2, 2(a0) +; RV32-BOTH-NEXT: lh a1, 0(a1) +; RV32-BOTH-NEXT: sh a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy3: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: lbu a2, 2(a1) +; RV64-BOTH-NEXT: sb a2, 2(a0) +; RV64-BOTH-NEXT: lh a1, 0(a1) +; RV64-BOTH-NEXT: sh a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 3, i1 false) ret void } -define void @t4(ptr nocapture %C) nounwind { -; RV32-LABEL: t4: +define void @aligned_memcpy4(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy4: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lw a1, 0(a1) +; RV32-BOTH-NEXT: sw a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy4: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: lw a1, 0(a1) +; RV64-BOTH-NEXT: sw a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 4, i1 false) + ret void +} + +define void @aligned_memcpy7(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: aligned_memcpy7: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lui a1, %hi(.L.str4) -; RV32-NEXT: addi a1, a1, %lo(.L.str4) -; RV32-NEXT: li a2, 18 -; RV32-NEXT: tail memcpy +; RV32-NEXT: lbu a2, 6(a1) +; RV32-NEXT: sb a2, 6(a0) +; RV32-NEXT: lh a2, 4(a1) +; RV32-NEXT: sh a2, 4(a0) +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: sw a1, 0(a0) +; RV32-NEXT: ret ; -; RV64-LABEL: t4: +; RV64-LABEL: aligned_memcpy7: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lui a1, %hi(.L.str4) -; RV64-NEXT: addi a1, a1, %lo(.L.str4) -; RV64-NEXT: li a2, 18 -; RV64-NEXT: tail memcpy +; RV64-NEXT: lbu a2, 6(a1) +; RV64-NEXT: sb a2, 6(a0) +; RV64-NEXT: lh a2, 4(a1) +; RV64-NEXT: sh a2, 4(a0) +; RV64-NEXT: lw a1, 0(a1) +; RV64-NEXT: sw a1, 0(a0) +; RV64-NEXT: ret ; -; RV32-FAST-LABEL: t4: +; RV32-FAST-LABEL: aligned_memcpy7: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: li a1, 32 -; RV32-FAST-NEXT: lui a2, 132388 -; RV32-FAST-NEXT: lui a3, 337154 -; RV32-FAST-NEXT: lui a4, 320757 -; RV32-FAST-NEXT: sh a1, 16(a0) -; RV32-FAST-NEXT: lui a1, 365861 -; RV32-FAST-NEXT: addi a2, a2, 1871 -; RV32-FAST-NEXT: addi a3, a3, 69 -; RV32-FAST-NEXT: addi a4, a4, 1107 -; RV32-FAST-NEXT: addi a1, a1, -1980 +; RV32-FAST-NEXT: lw a2, 3(a1) +; RV32-FAST-NEXT: sw a2, 3(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) ; RV32-FAST-NEXT: sw a1, 0(a0) -; RV32-FAST-NEXT: sw a4, 4(a0) -; RV32-FAST-NEXT: sw a3, 8(a0) -; RV32-FAST-NEXT: sw a2, 12(a0) ; RV32-FAST-NEXT: ret ; -; 
RV64-FAST-LABEL: t4: +; RV64-FAST-LABEL: aligned_memcpy7: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a1, %hi(.L.str4) -; RV64-FAST-NEXT: ld a2, %lo(.L.str4)(a1) -; RV64-FAST-NEXT: addi a1, a1, %lo(.L.str4) -; RV64-FAST-NEXT: ld a1, 8(a1) -; RV64-FAST-NEXT: li a3, 32 -; RV64-FAST-NEXT: sd a2, 0(a0) -; RV64-FAST-NEXT: sd a1, 8(a0) -; RV64-FAST-NEXT: sh a3, 16(a0) +; RV64-FAST-NEXT: lw a2, 3(a1) +; RV64-FAST-NEXT: sw a2, 3(a0) +; RV64-FAST-NEXT: lw a1, 0(a1) +; RV64-FAST-NEXT: sw a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str4, i64 18, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 7, i1 false) + ret void +} + +define void @aligned_memcpy8(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy8: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lw a2, 4(a1) +; RV32-BOTH-NEXT: sw a2, 4(a0) +; RV32-BOTH-NEXT: lw a1, 0(a1) +; RV32-BOTH-NEXT: sw a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy8: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: ld a1, 0(a1) +; RV64-BOTH-NEXT: sd a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 8, i1 false) ret void } -define void @t5(ptr nocapture %C) nounwind { -; RV32-LABEL: t5: +define void @aligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: aligned_memcpy15: ; RV32: # %bb.0: # %entry -; RV32-NEXT: li a1, 84 -; RV32-NEXT: li a2, 83 -; RV32-NEXT: li a3, 89 -; RV32-NEXT: li a4, 82 -; RV32-NEXT: li a5, 72 -; RV32-NEXT: li a6, 68 -; RV32-NEXT: sb a2, 4(a0) -; RV32-NEXT: sb a1, 5(a0) -; RV32-NEXT: sb zero, 6(a0) -; RV32-NEXT: sb a6, 0(a0) -; RV32-NEXT: sb a5, 1(a0) -; RV32-NEXT: sb a4, 2(a0) -; RV32-NEXT: sb a3, 3(a0) +; RV32-NEXT: lbu a2, 14(a1) +; RV32-NEXT: sb a2, 14(a0) +; RV32-NEXT: lh a2, 12(a1) +; RV32-NEXT: sh a2, 12(a0) +; RV32-NEXT: lw a2, 8(a1) +; RV32-NEXT: sw a2, 8(a0) +; RV32-NEXT: lw a2, 4(a1) +; RV32-NEXT: sw a2, 4(a0) +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: sw a1, 0(a0) ; RV32-NEXT: ret ; -; RV64-LABEL: t5: +; RV64-LABEL: aligned_memcpy15: ; RV64: # %bb.0: # %entry -; RV64-NEXT: li a1, 84 -; RV64-NEXT: li a2, 83 -; RV64-NEXT: li a3, 89 -; RV64-NEXT: li a4, 82 -; RV64-NEXT: li a5, 72 -; RV64-NEXT: li a6, 68 -; RV64-NEXT: sb a2, 4(a0) -; RV64-NEXT: sb a1, 5(a0) -; RV64-NEXT: sb zero, 6(a0) -; RV64-NEXT: sb a6, 0(a0) -; RV64-NEXT: sb a5, 1(a0) -; RV64-NEXT: sb a4, 2(a0) -; RV64-NEXT: sb a3, 3(a0) +; RV64-NEXT: lbu a2, 14(a1) +; RV64-NEXT: sb a2, 14(a0) +; RV64-NEXT: lh a2, 12(a1) +; RV64-NEXT: sh a2, 12(a0) +; RV64-NEXT: lw a2, 8(a1) +; RV64-NEXT: sw a2, 8(a0) +; RV64-NEXT: ld a1, 0(a1) +; RV64-NEXT: sd a1, 0(a0) ; RV64-NEXT: ret ; -; RV32-FAST-LABEL: t5: +; RV32-FAST-LABEL: aligned_memcpy15: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lui a1, 1349 -; RV32-FAST-NEXT: addi a1, a1, 857 -; RV32-FAST-NEXT: sw a1, 3(a0) -; RV32-FAST-NEXT: lui a1, 365861 -; RV32-FAST-NEXT: addi a1, a1, -1980 +; RV32-FAST-NEXT: lw a2, 11(a1) +; RV32-FAST-NEXT: sw a2, 11(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) ; RV32-FAST-NEXT: sw a1, 0(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: t5: +; RV64-FAST-LABEL: aligned_memcpy15: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a1, 1349 -; RV64-FAST-NEXT: addi a1, a1, 857 -; RV64-FAST-NEXT: sw a1, 3(a0) -; RV64-FAST-NEXT: lui a1, 365861 -; RV64-FAST-NEXT: 
addi a1, a1, -1980 -; RV64-FAST-NEXT: sw a1, 0(a0) +; RV64-FAST-NEXT: ld a2, 7(a1) +; RV64-FAST-NEXT: sd a2, 7(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str5, i64 7, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 15, i1 false) ret void } -define void @t6() nounwind { -; RV32-LABEL: t6: +define void @aligned_memcpy16(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy16: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lw a2, 12(a1) +; RV32-BOTH-NEXT: sw a2, 12(a0) +; RV32-BOTH-NEXT: lw a2, 8(a1) +; RV32-BOTH-NEXT: sw a2, 8(a0) +; RV32-BOTH-NEXT: lw a2, 4(a1) +; RV32-BOTH-NEXT: sw a2, 4(a0) +; RV32-BOTH-NEXT: lw a1, 0(a1) +; RV32-BOTH-NEXT: sw a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy16: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: ld a2, 8(a1) +; RV64-BOTH-NEXT: sd a2, 8(a0) +; RV64-BOTH-NEXT: ld a1, 0(a1) +; RV64-BOTH-NEXT: sd a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 16, i1 false) + ret void +} + +define void @aligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: aligned_memcpy31: ; RV32: # %bb.0: # %entry -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: lui a0, %hi(spool.splbuf) -; RV32-NEXT: addi a0, a0, %lo(spool.splbuf) -; RV32-NEXT: lui a1, %hi(.L.str6) -; RV32-NEXT: addi a1, a1, %lo(.L.str6) -; RV32-NEXT: li a2, 14 -; RV32-NEXT: call memcpy -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret +; RV32-NEXT: li a2, 31 +; RV32-NEXT: tail memcpy ; -; RV64-LABEL: t6: +; RV64-LABEL: aligned_memcpy31: ; RV64: # %bb.0: # %entry -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: lui a0, %hi(spool.splbuf) -; RV64-NEXT: addi a0, a0, %lo(spool.splbuf) -; RV64-NEXT: lui a1, %hi(.L.str6) -; RV64-NEXT: addi a1, a1, %lo(.L.str6) -; RV64-NEXT: li a2, 14 -; RV64-NEXT: call memcpy -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: lbu a2, 30(a1) +; RV64-NEXT: sb a2, 30(a0) +; RV64-NEXT: lh a2, 28(a1) +; RV64-NEXT: sh a2, 28(a0) +; RV64-NEXT: lw a2, 24(a1) +; RV64-NEXT: sw a2, 24(a0) +; RV64-NEXT: ld a2, 16(a1) +; RV64-NEXT: sd a2, 16(a0) +; RV64-NEXT: ld a2, 8(a1) +; RV64-NEXT: sd a2, 8(a0) +; RV64-NEXT: ld a1, 0(a1) +; RV64-NEXT: sd a1, 0(a0) ; RV64-NEXT: ret ; -; RV32-FAST-LABEL: t6: +; RV32-FAST-LABEL: aligned_memcpy31: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lui a0, %hi(spool.splbuf) -; RV32-FAST-NEXT: li a1, 88 -; RV32-FAST-NEXT: sh a1, %lo(spool.splbuf+12)(a0) -; RV32-FAST-NEXT: lui a1, 361862 -; RV32-FAST-NEXT: addi a1, a1, -1960 -; RV32-FAST-NEXT: sw a1, %lo(spool.splbuf+8)(a0) -; RV32-FAST-NEXT: lui a1, 362199 -; RV32-FAST-NEXT: addi a1, a1, 559 -; RV32-FAST-NEXT: sw a1, %lo(spool.splbuf+4)(a0) -; RV32-FAST-NEXT: lui a1, 460503 -; RV32-FAST-NEXT: addi a1, a1, 1071 -; RV32-FAST-NEXT: sw a1, %lo(spool.splbuf)(a0) +; RV32-FAST-NEXT: lw a2, 27(a1) +; RV32-FAST-NEXT: sw a2, 27(a0) +; RV32-FAST-NEXT: lw a2, 24(a1) +; RV32-FAST-NEXT: sw a2, 24(a0) +; RV32-FAST-NEXT: lw a2, 20(a1) +; RV32-FAST-NEXT: sw a2, 20(a0) +; RV32-FAST-NEXT: lw a2, 16(a1) +; RV32-FAST-NEXT: sw a2, 16(a0) +; RV32-FAST-NEXT: lw a2, 12(a1) +; RV32-FAST-NEXT: sw a2, 12(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) 
+; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: t6: +; RV64-FAST-LABEL: aligned_memcpy31: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a0, %hi(.L.str6) -; RV64-FAST-NEXT: ld a1, %lo(.L.str6)(a0) -; RV64-FAST-NEXT: addi a0, a0, %lo(.L.str6) -; RV64-FAST-NEXT: ld a0, 6(a0) -; RV64-FAST-NEXT: lui a2, %hi(spool.splbuf) -; RV64-FAST-NEXT: sd a1, %lo(spool.splbuf)(a2) -; RV64-FAST-NEXT: sd a0, %lo(spool.splbuf+6)(a2) +; RV64-FAST-NEXT: ld a2, 23(a1) +; RV64-FAST-NEXT: sd a2, 23(a0) +; RV64-FAST-NEXT: ld a2, 16(a1) +; RV64-FAST-NEXT: sd a2, 16(a0) +; RV64-FAST-NEXT: ld a2, 8(a1) +; RV64-FAST-NEXT: sd a2, 8(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - call void @llvm.memcpy.p0.p0.i64(ptr @spool.splbuf, ptr @.str6, i64 14, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 31, i1 false) ret void } -%struct.Foo = type { i32, i32, i32, i32 } +; ------------------------------------------------------------------------ +; A few partially aligned cases + -define void @t7(ptr nocapture %a, ptr nocapture %b) nounwind { -; RV32-BOTH-LABEL: t7: +define void @memcpy16_align4(ptr nocapture %dest, ptr nocapture %src) nounwind { +; RV32-BOTH-LABEL: memcpy16_align4: ; RV32-BOTH: # %bb.0: # %entry ; RV32-BOTH-NEXT: lw a2, 12(a1) ; RV32-BOTH-NEXT: sw a2, 12(a0) @@ -418,7 +688,7 @@ define void @t7(ptr nocapture %a, ptr nocapture %b) nounwind { ; RV32-BOTH-NEXT: sw a1, 0(a0) ; RV32-BOTH-NEXT: ret ; -; RV64-LABEL: t7: +; RV64-LABEL: memcpy16_align4: ; RV64: # %bb.0: # %entry ; RV64-NEXT: lw a2, 12(a1) ; RV64-NEXT: sw a2, 12(a0) @@ -430,7 +700,7 @@ define void @t7(ptr nocapture %a, ptr nocapture %b) nounwind { ; RV64-NEXT: sw a1, 0(a0) ; RV64-NEXT: ret ; -; RV64-FAST-LABEL: t7: +; RV64-FAST-LABEL: memcpy16_align4: ; RV64-FAST: # %bb.0: # %entry ; RV64-FAST-NEXT: ld a2, 8(a1) ; RV64-FAST-NEXT: sd a2, 8(a0) @@ -438,11 +708,58 @@ define void @t7(ptr nocapture %a, ptr nocapture %b) nounwind { ; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i32(ptr align 4 %a, ptr align 4 %b, i32 16, i1 false) + tail call void @llvm.memcpy.p0.p0.i32(ptr align 4 %dest, ptr align 4 %src, i32 16, i1 false) ret void } +define i32 @memcpy11_align8(ptr nocapture %dest, ptr %src) { +; RV32-LABEL: memcpy11_align8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 10(a1) +; RV32-NEXT: sb a2, 10(a0) +; RV32-NEXT: lh a2, 8(a1) +; RV32-NEXT: sh a2, 8(a0) +; RV32-NEXT: lw a2, 4(a1) +; RV32-NEXT: sw a2, 4(a0) +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: sw a1, 0(a0) +; RV32-NEXT: li a0, 0 +; RV32-NEXT: ret +; +; RV64-LABEL: memcpy11_align8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 10(a1) +; RV64-NEXT: sb a2, 10(a0) +; RV64-NEXT: lh a2, 8(a1) +; RV64-NEXT: sh a2, 8(a0) +; RV64-NEXT: ld a1, 0(a1) +; RV64-NEXT: sd a1, 0(a0) +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: memcpy11_align8: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 7(a1) +; RV32-FAST-NEXT: sw a2, 7(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: li a0, 0 +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: memcpy11_align8: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: lw a2, 7(a1) +; RV64-FAST-NEXT: sw a2, 7(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) +; 
RV64-FAST-NEXT: li a0, 0
+; RV64-FAST-NEXT: ret
+entry:
+  call void @llvm.memcpy.p0.p0.i32(ptr align 8 %dest, ptr align 8 %src, i32 11, i1 false)
+  ret i32 0
+}
+
 declare void @llvm.memcpy.p0.p0.i32(ptr nocapture, ptr nocapture, i32, i1) nounwind
 declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounwind
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; RV64-BOTH: {{.*}}

From 77e6f434ec79db025aa9c7d193179727f1d63714 Mon Sep 17 00:00:00 2001
From: Sameer Sahasrabuddhe
Date: Mon, 13 Jan 2025 09:54:57 +0530
Subject: [PATCH 225/408] [SPIRV] convergence anchor intrinsic does not have a
 parent token (#122230)

---
 llvm/include/llvm/IR/IntrinsicInst.h           |  6 +++---
 .../SPIRVConvergenceRegionAnalysis.cpp         | 20 ++++++-------------
 2 files changed, 9 insertions(+), 17 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
index 3436216d478e3..6ccbb6b185c7d 100644
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -1873,13 +1873,13 @@ class ConvergenceControlInst : public IntrinsicInst {
     return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
   }
 
-  bool isAnchor() {
+  bool isAnchor() const {
     return getIntrinsicID() == Intrinsic::experimental_convergence_anchor;
   }
-  bool isEntry() {
+  bool isEntry() const {
     return getIntrinsicID() == Intrinsic::experimental_convergence_entry;
   }
-  bool isLoop() {
+  bool isLoop() const {
     return getIntrinsicID() == Intrinsic::experimental_convergence_loop;
   }
 };
diff --git a/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp b/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp
index cc6daf7ef3442..c23a6c3e8bbe8 100644
--- a/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp
@@ -56,20 +56,12 @@ getConvergenceTokenInternal(BasicBlockType *BB) {
                 "Output type must be an intrinsic instruction.");
 
   for (auto &I : *BB) {
-    if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
-      switch (II->getIntrinsicID()) {
-      case Intrinsic::experimental_convergence_entry:
-      case Intrinsic::experimental_convergence_loop:
-        return II;
-      case Intrinsic::experimental_convergence_anchor: {
-        auto Bundle = II->getOperandBundle(LLVMContext::OB_convergencectrl);
-        assert(Bundle->Inputs.size() == 1 &&
-               Bundle->Inputs[0]->getType()->isTokenTy());
-        auto TII = dyn_cast<IntrinsicInst>(Bundle->Inputs[0].get());
-        assert(TII != nullptr);
-        return TII;
-      }
-      }
+    if (auto *CI = dyn_cast<ConvergenceControlInst>(&I)) {
+      // Make sure that the anchor or entry intrinsics did not reach here with a
+      // parent token. This should have failed the verifier.
+      assert(CI->isLoop() ||
+             !CI->getOperandBundle(LLVMContext::OB_convergencectrl));
+      return CI;
     }
 
     if (auto *CI = dyn_cast<CallInst>(&I)) {

From f15da5fb7845979889b73f3f47bb126617e823cf Mon Sep 17 00:00:00 2001
From: Shilei Tian
Date: Sun, 12 Jan 2025 23:40:25 -0500
Subject: [PATCH 226/408] [AMDGPU] Fix an invalid cast in
 `AMDGPULateCodeGenPrepare::visitLoadInst` (#122494)

Fixes: SWDEV-507695

---
 .../AMDGPU/AMDGPULateCodeGenPrepare.cpp       |  7 +++-
 .../CodeGen/AMDGPU/invalid-cast-load-i1.ll    | 37 +++++++++++++++++++
 2 files changed, 42 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 830b50307f837..f4e651ec477d3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -464,8 +464,11 @@ bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
   NewLd->setMetadata(LLVMContext::MD_range, nullptr);
 
   unsigned ShAmt = Adjust * 8;
-  auto *NewVal = IRB.CreateBitCast(
-      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
+  Value *NewVal = IRB.CreateBitCast(
+      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt),
+                      DL.typeSizeEqualsStoreSize(LI.getType()) ? IntNTy
+                                                               : LI.getType()),
+      LI.getType());
   LI.replaceAllUsesWith(NewVal);
   DeadInsts.emplace_back(&LI);
 
diff --git a/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll b/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll
new file mode 100644
index 0000000000000..621187100f323
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s -o - | FileCheck %s
+
+define amdgpu_kernel void @load_idx_idy(ptr addrspace(4) %disp, ptr %g) {
+; CHECK-LABEL: load_idx_idy:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dword s6, s[4:5], 0x4
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_lshr_b32 s4, s6, 16
+; CHECK-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x10000
+; CHECK-NEXT:    s_lshl_b64 s[4:5], s[4:5], 6
+; CHECK-NEXT:    s_add_u32 s0, s0, s4
+; CHECK-NEXT:    s_addc_u32 s1, s1, s5
+; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:4
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    s_endpgm
+entry:
+  %disp1 = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %gep_y = getelementptr i8, ptr addrspace(4) %disp1, i64 6
+  %L = load i1, ptr addrspace(4) %gep_y, align 1
+  %idxprom = sext i1 %L to i64
+  %gep0 = getelementptr <32 x i16>, ptr addrspace(4) %disp, i64 %idxprom
+  %gep1 = getelementptr i8, ptr addrspace(4) %gep0, i64 4
+  %L1 = load i8, ptr addrspace(4) %gep1
+  store i8 %L1, ptr %g
+  ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef nonnull align 4 ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0
+
+attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

From 7bf1cb702bdbec4b054a65cb58244596e8cd4138 Mon Sep 17 00:00:00 2001
From: Akshat Oke
Date: Mon, 13 Jan 2025 10:11:40 +0530
Subject: [PATCH 227/408] [AMDGPU][NewPM] Port AMDGPURemoveIncompatibleFunctions
 to NPM (#122261)

---
 llvm/lib/Target/AMDGPU/AMDGPU.h               |  2 +-
 llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def |  1 +
 .../AMDGPURemoveIncompatibleFunctions.cpp     | 53 +++++++++++++------
 .../AMDGPURemoveIncompatibleFunctions.h       | 26 +++++++++
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |  6 ++-
 .../AMDGPU/remove-incompatible-functions.ll   |  8 +++
 .../AMDGPU/remove-incompatible-s-time.ll      | 10 ++++
 .../remove-incompatible-wave32-feature.ll     |  8 +++
 8 files changed, 96 insertions(+), 18 deletions(-)
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.h

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index ad5ee75f0c5d1..78667e628ec1e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -380,7 +380,7 @@ extern char &AMDGPUAnnotateUniformValuesLegacyPassID;
 void initializeAMDGPUCodeGenPreparePass(PassRegistry&);
 extern char &AMDGPUCodeGenPrepareID;
 
-void initializeAMDGPURemoveIncompatibleFunctionsPass(PassRegistry &);
+void initializeAMDGPURemoveIncompatibleFunctionsLegacyPass(PassRegistry &);
 extern char &AMDGPURemoveIncompatibleFunctionsID;
 
 void initializeAMDGPULateCodeGenPrepareLegacyPass(PassRegistry &);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 182e825a59a41..da594be992cb4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -26,6 +26,7 @@ MODULE_PASS("amdgpu-perf-hint",
             AMDGPUPerfHintAnalysisPass(
               *static_cast<const GCNTargetMachine *>(this)))
 MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
+MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this))
 MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
 #undef MODULE_PASS
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp b/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp
index 3a87070a326c2..e2e5c57397d02 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp
@@ -12,6 +12,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "AMDGPURemoveIncompatibleFunctions.h"
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
@@ -33,25 +34,16 @@ namespace {
 
 using Generation = AMDGPUSubtarget::Generation;
 
-class AMDGPURemoveIncompatibleFunctions : public ModulePass {
+class AMDGPURemoveIncompatibleFunctions {
 public:
-  static char ID;
-
   AMDGPURemoveIncompatibleFunctions(const TargetMachine *TM = nullptr)
-      : ModulePass(ID), TM(TM) {
+      : TM(TM) {
     assert(TM && "No TargetMachine!");
   }
-
-  StringRef getPassName() const override {
-    return "AMDGPU Remove Incompatible Functions";
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {}
-
   /// Checks a single function, returns true if the function must be deleted.
   bool checkFunction(Function &F);
 
-  bool runOnModule(Module &M) override {
+  bool run(Module &M) {
     assert(TM->getTargetTriple().isAMDGCN());
 
     SmallVector<Function *, 4> FnsToDelete;
@@ -71,6 +63,28 @@ class AMDGPURemoveIncompatibleFunctions : public ModulePass {
   const TargetMachine *TM = nullptr;
 };
 
+class AMDGPURemoveIncompatibleFunctionsLegacy : public ModulePass {
+public:
+  static char ID;
+
+  AMDGPURemoveIncompatibleFunctionsLegacy(const TargetMachine *TM)
+      : ModulePass(ID), TM(TM) {}
+
+  bool runOnModule(Module &M) override {
+    AMDGPURemoveIncompatibleFunctions Pass(TM);
+    return Pass.run(M);
+  }
+
+  StringRef getPassName() const override {
+    return "AMDGPU Remove Incompatible Functions";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {}
+
+private:
+  const TargetMachine *TM = nullptr;
+};
+
 StringRef getFeatureName(unsigned Feature) {
   for (const SubtargetFeatureKV &KV : AMDGPUFeatureKV)
     if (Feature == KV.Value)
@@ -131,6 +145,15 @@ void reportFunctionRemoved(Function &F, unsigned Feature) {
 }
 } // end anonymous namespace
 
+PreservedAnalyses
+AMDGPURemoveIncompatibleFunctionsPass::run(Module &M,
+                                           ModuleAnalysisManager &MAM) {
+  AMDGPURemoveIncompatibleFunctions Impl(TM);
+  if (Impl.run(M))
+    return PreservedAnalyses::none();
+  return PreservedAnalyses::all();
+}
+
 bool AMDGPURemoveIncompatibleFunctions::checkFunction(Function &F) {
   if (F.isDeclaration())
     return false;
@@ -182,12 +205,12 @@ bool AMDGPURemoveIncompatibleFunctions::checkFunction(Function &F) {
   return false;
 }
 
-INITIALIZE_PASS(AMDGPURemoveIncompatibleFunctions, DEBUG_TYPE,
+INITIALIZE_PASS(AMDGPURemoveIncompatibleFunctionsLegacy, DEBUG_TYPE,
                 "AMDGPU Remove Incompatible Functions", false, false)
 
-char AMDGPURemoveIncompatibleFunctions::ID = 0;
+char AMDGPURemoveIncompatibleFunctionsLegacy::ID = 0;
 
 ModulePass *
 llvm::createAMDGPURemoveIncompatibleFunctionsPass(const TargetMachine *TM) {
-  return new AMDGPURemoveIncompatibleFunctions(TM);
+  return new AMDGPURemoveIncompatibleFunctionsLegacy(TM);
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.h b/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.h
new file mode 100644
index 0000000000000..e4c858588ece8
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.h
@@ -0,0 +1,26 @@
+//===- AMDGPURemoveIncompatibleFunctions.h ----------------------*- C++- *-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_REMOVEINCOMPATIBLEFUNCTIONS_H
+#define LLVM_LIB_TARGET_AMDGPU_REMOVEINCOMPATIBLEFUNCTIONS_H
+
+#include "llvm/IR/PassManager.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class AMDGPURemoveIncompatibleFunctionsPass
+    : public PassInfoMixin<AMDGPURemoveIncompatibleFunctionsPass> {
+  const TargetMachine *TM;
+
+public:
+  AMDGPURemoveIncompatibleFunctionsPass(const TargetMachine &TM) : TM(&TM) {}
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM);
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_REMOVEINCOMPATIBLEFUNCTIONS_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 0c9d7d00a8a4a..6058f9709c38c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -23,6 +23,7 @@
 #include "AMDGPUISelDAGToDAG.h"
 #include "AMDGPUMacroFusion.h"
 #include "AMDGPUPerfHintAnalysis.h"
+#include "AMDGPURemoveIncompatibleFunctions.h"
 #include "AMDGPUSplitModule.h"
 #include "AMDGPUTargetObjectFile.h"
 #include "AMDGPUTargetTransformInfo.h"
@@ -507,7 +508,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUPromoteAllocaToVectorPass(*PR);
   initializeAMDGPUCodeGenPreparePass(*PR);
   initializeAMDGPULateCodeGenPrepareLegacyPass(*PR);
-  initializeAMDGPURemoveIncompatibleFunctionsPass(*PR);
+  initializeAMDGPURemoveIncompatibleFunctionsLegacyPass(*PR);
   initializeAMDGPULowerModuleLDSLegacyPass(*PR);
   initializeAMDGPULowerBufferFatPointersPass(*PR);
   initializeAMDGPUReserveWWMRegsPass(*PR);
@@ -1925,7 +1926,8 @@ AMDGPUCodeGenPassBuilder::AMDGPUCodeGenPassBuilder(
 }
 
 void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const {
-  // TODO: Missing AMDGPURemoveIncompatibleFunctions
+  if (RemoveIncompatibleFunctions && TM.getTargetTriple().isAMDGCN())
+    addPass(AMDGPURemoveIncompatibleFunctionsPass(TM));
   addPass(AMDGPUPrintfRuntimeBindingPass());
 
   if (LowerCtorDtor)
diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-functions.ll b/llvm/test/CodeGen/AMDGPU/remove-incompatible-functions.ll
index e0b694ee58f0e..0359bb7183974 100644
--- a/llvm/test/CodeGen/AMDGPU/remove-incompatible-functions.ll
+++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-functions.ll
@@ -4,11 +4,19 @@
 ; RUN:   FileCheck --check-prefix=WARN-GFX7 %s < %t
 ; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s
 
+; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=bonaire -stop-after=amdgpu-remove-incompatible-functions\
+; RUN:   -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX7,IR %s
+; RUN: FileCheck --check-prefix=WARN-GFX7 %s < %t
+
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -stop-after=amdgpu-remove-incompatible-functions\
 ; RUN:   -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX8,IR %s
 ; RUN: FileCheck --check-prefix=WARN-GFX8 %s < %t
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s
 
+; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=fiji -stop-after=amdgpu-remove-incompatible-functions\
+; RUN:   -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX8,IR %s
+; RUN: FileCheck --check-prefix=WARN-GFX8 %s < %t
+
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -stop-after=amdgpu-remove-incompatible-functions\
 ; RUN:   -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t |
FileCheck -check-prefixes=GFX9,GFX906,IR %s ; RUN: FileCheck --check-prefix=WARN-GFX906 %s < %t diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll b/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll index 32fed3ba22c59..676ba1480e6d2 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll @@ -4,11 +4,21 @@ ; RUN: FileCheck -allow-empty --check-prefixes=WARN-REALTIME,WARN-MEMTIME %s < %t ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s +; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1030 -stop-after=amdgpu-remove-incompatible-functions\ +; RUN: -pass-remarks=amdgpu-remove-incompatible-functions %s -o - 2>%t | FileCheck -check-prefixes=COMPATIBLE,REALTIME,MEMTIME %s +; RUN: FileCheck -allow-empty --check-prefixes=WARN-REALTIME,WARN-MEMTIME %s < %t +; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s + ; RUN: llc -mtriple=amdgcn -mcpu=gfx1102 -stop-after=amdgpu-remove-incompatible-functions\ ; RUN: -pass-remarks=amdgpu-remove-incompatible-functions %s -o - 2>%t | FileCheck -check-prefixes=INCOMPATIBLE,NOREALTIME,NOMEMTIME %s ; RUN: FileCheck --check-prefixes=WARN-NOREALTIME,WARN-NOMEMTIME %s < %t ; RUN: llc -mtriple=amdgcn -mcpu=gfx1102 -verify-machineinstrs < %s +; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1102 -stop-after=amdgpu-remove-incompatible-functions\ +; RUN: -pass-remarks=amdgpu-remove-incompatible-functions %s -o - 2>%t | FileCheck -check-prefixes=INCOMPATIBLE,NOREALTIME,NOMEMTIME %s +; RUN: FileCheck --check-prefixes=WARN-NOREALTIME,WARN-NOMEMTIME %s < %t +; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1102 -verify-machineinstrs < %s + ; Note: This test checks the IR, but also has a run line to codegen the file just to check we ; do not crash when trying to select those functions. 
diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave32-feature.ll b/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave32-feature.ll index 406c953a06d97..75a388eb1229b 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave32-feature.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave32-feature.ll @@ -12,10 +12,18 @@ ; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX10 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s +; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1011 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=amdgpu-remove-incompatible-functions\ +; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1011 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s + ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=amdgpu-remove-incompatible-functions\ ; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX11 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s +; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=amdgpu-remove-incompatible-functions\ +; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s + ; WARN-GFX906: removing function 'needs_wavefrontsize32': +wavefrontsize32 is not supported on the current target ; WARN-GFX906-NOT: not supported From ecf4f95c4f55eea0830659654fa264189773a423 Mon Sep 17 00:00:00 2001 From: wldfngrs Date: Mon, 13 Jan 2025 05:46:53 +0100 Subject: [PATCH 228/408] [libc][math][c23] Add tanf16 function (#121018) - Implementation of tan for 16-bit floating point inputs. 
- Exhaustive tests across the 16-bit input range --- libc/config/linux/x86_64/entrypoints.txt | 1 + libc/docs/headers/math/index.rst | 2 +- libc/include/math.yaml | 7 ++ libc/src/math/CMakeLists.txt | 1 + libc/src/math/generic/CMakeLists.txt | 19 ++++ libc/src/math/generic/sincosf16_utils.h | 25 +++-- libc/src/math/generic/tanf16.cpp | 115 +++++++++++++++++++++++ libc/src/math/generic/tanpif16.cpp | 2 +- libc/src/math/tanf16.h | 21 +++++ libc/test/src/math/CMakeLists.txt | 11 +++ libc/test/src/math/cosf16_test.cpp | 2 +- libc/test/src/math/smoke/CMakeLists.txt | 11 +++ libc/test/src/math/smoke/tanf16_test.cpp | 34 +++++++ libc/test/src/math/tanf16_test.cpp | 40 ++++++++ 14 files changed, 275 insertions(+), 16 deletions(-) create mode 100644 libc/src/math/generic/tanf16.cpp create mode 100644 libc/src/math/tanf16.h create mode 100644 libc/test/src/math/smoke/tanf16_test.cpp create mode 100644 libc/test/src/math/tanf16_test.cpp diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index e7b049c0a6638..723853b2230ae 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -721,6 +721,7 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.sinhf16 libc.src.math.sinpif16 libc.src.math.sqrtf16 + libc.src.math.tanf16 libc.src.math.tanhf16 libc.src.math.tanpif16 libc.src.math.totalorderf16 diff --git a/libc/docs/headers/math/index.rst b/libc/docs/headers/math/index.rst index 2808165ad539b..8548e4a5773bc 100644 --- a/libc/docs/headers/math/index.rst +++ b/libc/docs/headers/math/index.rst @@ -346,7 +346,7 @@ Higher Math Functions +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | sqrt | |check| | |check| | |check| | |check| | |check| | 7.12.7.10 | F.10.4.10 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| tan | |check| | |check| | | | | 7.12.4.7 | F.10.1.7 | +| tan | |check| | |check| | | |check| | | 7.12.4.7 | F.10.1.7 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | tanh | |check| | | | |check| | | 7.12.5.6 | F.10.2.6 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ diff --git a/libc/include/math.yaml b/libc/include/math.yaml index 831d045745677..3a660a59d3605 100644 --- a/libc/include/math.yaml +++ b/libc/include/math.yaml @@ -2418,6 +2418,13 @@ functions: return_type: float arguments: - type: float + - name: tanf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 - name: tanhf standards: - stdc diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index e4e2c49642f2d..fe5ebd793b40a 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -501,6 +501,7 @@ add_math_entrypoint_object(sqrtf128) add_math_entrypoint_object(tan) add_math_entrypoint_object(tanf) +add_math_entrypoint_object(tanf16) add_math_entrypoint_object(tanh) add_math_entrypoint_object(tanhf) diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index 382f5b362e2eb..0e57051807b33 100644 --- 
a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -661,6 +661,25 @@ add_entrypoint_object(
     ${libc_opt_high_flag}
 )
 
+add_entrypoint_object(
+  tanf16
+  SRCS
+    tanf16.cpp
+  HDRS
+    ../tanf16.h
+  DEPENDS
+    .sincosf16_utils
+    libc.hdr.errno_macros
+    libc.hdr.fenv_macros
+    libc.src.__support.FPUtil.cast
+    libc.src.__support.FPUtil.fenv_impl
+    libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.FPUtil.except_value_utils
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.macros.optimization
+    libc.src.__support.macros.properties.types
+)
+
 add_entrypoint_object(
   tanpif16
   SRCS
diff --git a/libc/src/math/generic/sincosf16_utils.h b/libc/src/math/generic/sincosf16_utils.h
index 87b1dde560c5e..133896b5de7a3 100644
--- a/libc/src/math/generic/sincosf16_utils.h
+++ b/libc/src/math/generic/sincosf16_utils.h
@@ -47,24 +47,23 @@ LIBC_INLINE int32_t range_reduction_sincospif16(float x, float &y) {
 
 // Recall, range reduction:
 // k = round(x * 32/pi)
-// y = x * 32/pi - k
 //
-// The constant 0x1.45f306dc9c883p3 is 32/pi rounded to double-precision.
-// 32/pi is generated by Sollya with the following commands:
-// > display = hexadecimal;
-// > round(32/pi, D, RN);
-//
-// The precision choice of 'double' is to minimize rounding errors
-// in this initial scaling step, preserving enough bits so errors accumulated
-// while computing the subtraction: y = x * 32/pi - round(x * 32/pi)
+// The precision choice of 'double' in the following function is to minimize
+// rounding errors in this initial scaling step,
+// preserving enough bits so errors accumulated while computing the subtraction:
+// y = x * 32/pi - round(x * 32/pi)
 // are beyond the least-significant bit of single-precision used during
 // further intermediate computation.
 LIBC_INLINE int32_t range_reduction_sincosf16(float x, float &y) {
-  double prod = x * 0x1.45f306dc9c883p3;
-  double kf = fputil::nearest_integer(prod);
-  y = static_cast<float>(prod - kf);
+  // Generated by Sollya with:
+  // > D(32/pi);
+  constexpr double THIRTYTWO_OVER_PI = 0x1.45f306dc9c883p3;
 
-  return static_cast<int32_t>(kf);
+  double prod = x * THIRTYTWO_OVER_PI;
+  double kd = fputil::nearest_integer(prod);
+  y = static_cast<float>(prod - kd);
+
+  return static_cast<int32_t>(kd);
 }
 
 static LIBC_INLINE void sincosf16_poly_eval(int32_t k, float y, float &sin_k,
diff --git a/libc/src/math/generic/tanf16.cpp b/libc/src/math/generic/tanf16.cpp
new file mode 100644
index 0000000000000..48aa51e456a8a
--- /dev/null
+++ b/libc/src/math/generic/tanf16.cpp
@@ -0,0 +1,115 @@
+//===-- Half-precision tan(x) function ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception.
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/tanf16.h"
+#include "hdr/errno_macros.h"
+#include "hdr/fenv_macros.h"
+#include "sincosf16_utils.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/cast.h"
+#include "src/__support/FPUtil/except_value_utils.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/macros/optimization.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+constexpr size_t N_EXCEPTS = 9;
+
+constexpr fputil::ExceptValues<float16, N_EXCEPTS> TANF16_EXCEPTS{{
+    // (input, RZ output, RU offset, RD offset, RN offset)
+    {0x2894, 0x2894, 1, 0, 1},
+    {0x3091, 0x3099, 1, 0, 0},
+    {0x3098, 0x30a0, 1, 0, 0},
+    {0x55ed, 0x3911, 1, 0, 0},
+    {0x607b, 0xc638, 0, 1, 1},
+    {0x674e, 0x3b7d, 1, 0, 0},
+    {0x6807, 0x4014, 1, 0, 1},
+    {0x6f4d, 0xbe19, 0, 1, 1},
+    {0x7330, 0xcb62, 0, 1, 0},
+}};
+
+LLVM_LIBC_FUNCTION(float16, tanf16, (float16 x)) {
+  using FPBits = fputil::FPBits<float16>;
+  FPBits xbits(x);
+
+  uint16_t x_u = xbits.uintval();
+  uint16_t x_abs = x_u & 0x7fff;
+  bool x_sign = x_u >> 15;
+  float xf = x;
+
+  // Handle exceptional values
+  if (auto r = TANF16_EXCEPTS.lookup_odd(x_abs, x_sign);
+      LIBC_UNLIKELY(r.has_value()))
+    return r.value();
+
+  // |x| <= 0x1.d1p-5
+  if (LIBC_UNLIKELY(x_abs <= 0x2b44)) {
+    // |x| <= 0x1.398p-11
+    if (LIBC_UNLIKELY(x_abs <= 0x10e6)) {
+      // tan(+/-0) = +/-0
+      if (LIBC_UNLIKELY(x_abs == 0))
+        return x;
+
+      int rounding = fputil::quick_get_round();
+
+      // Exhaustive tests show that, when:
+      // x > 0, and rounding upward or
+      // x < 0, and rounding downward then,
+      // tan(x) = x * 2^-11 + x
+      if ((xbits.is_pos() && rounding == FE_UPWARD) ||
+          (xbits.is_neg() && rounding == FE_DOWNWARD))
+        return fputil::cast<float16>(fputil::multiply_add(xf, 0x1.0p-11f, xf));
+      return x;
+    }
+
+    float xsq = xf * xf;
+
+    // Degree-6 minimax odd polynomial of tan(x) generated by Sollya with:
+    // > P = fpminimax(tan(x)/x, [|0, 2, 4, 6|], [|1, SG...|], [0, pi/32]);
+    float result = fputil::polyeval(xsq, 0x1p0f, 0x1.555556p-2f, 0x1.110ee4p-3f,
+                                    0x1.be80f6p-5f);
+
+    return fputil::cast<float16>(xf * result);
+  }
+
+  // tan(+/-inf) = NaN, and tan(NaN) = NaN
+  if (LIBC_UNLIKELY(x_abs >= 0x7c00)) {
+    // x = +/-inf
+    if (x_abs == 0x7c00) {
+      fputil::set_errno_if_required(EDOM);
+      fputil::raise_except_if_required(FE_INVALID);
+    }
+
+    return x + FPBits::quiet_nan().get_val();
+  }
+
+  // Range reduction:
+  // For |x| > pi/32, we perform range reduction as follows:
+  // Find k and y such that:
+  //   x = (k + y) * pi/32;
+  //   k is an integer, |y| < 0.5
+  //
+  // This is done by performing:
+  //   k = round(x * 32/pi)
+  //   y = x * 32/pi - k
+  //
+  // Once k and y are computed, we then deduce the answer by the formula:
+  //   tan(x) = sin(x) / cos(x)
+  //          = (sin_y * cos_k + cos_y * sin_k) / (cos_y * cos_k - sin_y * sin_k)
+  float sin_k, cos_k, sin_y, cosm1_y;
+  sincosf16_eval(xf, sin_k, cos_k, sin_y, cosm1_y);
+
+  // Note that, cosm1_y = cos_y - 1:
+  using fputil::multiply_add;
+  return fputil::cast<float16>(
+      multiply_add(sin_y, cos_k, multiply_add(cosm1_y, sin_k, sin_k)) /
+      multiply_add(sin_y, -sin_k, multiply_add(cosm1_y, cos_k, cos_k)));
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/tanpif16.cpp b/libc/src/math/generic/tanpif16.cpp
index 67635536ee319..cf4f9917d4537 100644
--- a/libc/src/math/generic/tanpif16.cpp
+++ b/libc/src/math/generic/tanpif16.cpp
@@ -79,7 +79,7 @@ LLVM_LIBC_FUNCTION(float16, tanpif16, (float16 x)) {
   //   k = round(x * 32)
   //   y = x * 32 - k
// - // Once k and y are computed, we then deduce the answer by tthe formula: + // Once k and y are computed, we then deduce the answer by the formula: // tan(x) = sin(x) / cos(x) // = (sin_y * cos_k + cos_y * sin_k) / (cos_y * cos_k - sin_y * sin_k) float xf = x; diff --git a/libc/src/math/tanf16.h b/libc/src/math/tanf16.h new file mode 100644 index 0000000000000..bf1b61e9837f7 --- /dev/null +++ b/libc/src/math/tanf16.h @@ -0,0 +1,21 @@ +//===-- Implementation header for tanf16 ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_TANF16_H +#define LLVM_LIBC_SRC_MATH_TANF16_H + +#include "src/__support/macros/config.h" +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE_DECL { + +float16 tanf16(float16 x); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_MATH_TANF16_H diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt index 16e7d4957ba11..ae8518ee4b4cc 100644 --- a/libc/test/src/math/CMakeLists.txt +++ b/libc/test/src/math/CMakeLists.txt @@ -190,6 +190,17 @@ add_fp_unittest( libc.src.__support.FPUtil.fp_bits ) +add_fp_unittest( + tanf16_test + NEED_MPFR + SUITE + libc-math-unittests + SRCS + tanf16_test.cpp + DEPENDS + libc.src.math.tanf16 +) + add_fp_unittest( tanpif16_test NEED_MPFR diff --git a/libc/test/src/math/cosf16_test.cpp b/libc/test/src/math/cosf16_test.cpp index 9e4687f0325c4..b744e7817e4ba 100644 --- a/libc/test/src/math/cosf16_test.cpp +++ b/libc/test/src/math/cosf16_test.cpp @@ -17,7 +17,7 @@ namespace mpfr = LIBC_NAMESPACE::testing::mpfr; // Range: [0, Inf] static constexpr uint16_t POS_START = 0x0000U; -static constexpr uint16_t POS_STOP = 0x7c00u; +static constexpr uint16_t POS_STOP = 0x7c00U; // Range: [-Inf, 0] static constexpr uint16_t NEG_START = 0x8000U; diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index 31f85a3ecfd27..e23e7f41222d4 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -121,6 +121,17 @@ add_fp_unittest( libc.src.__support.FPUtil.fp_bits ) +add_fp_unittest( + tanf16_test + SUITE + libc-math-smoke-tests + SRCS + tanf16_test.cpp + DEPENDS + libc.src.errno.errno + libc.src.math.tanf16 +) + add_fp_unittest( tanpif16_test SUITE diff --git a/libc/test/src/math/smoke/tanf16_test.cpp b/libc/test/src/math/smoke/tanf16_test.cpp new file mode 100644 index 0000000000000..39d1182ba891e --- /dev/null +++ b/libc/test/src/math/smoke/tanf16_test.cpp @@ -0,0 +1,34 @@ +//===-- Unittests for tanf16 ----------------------------------------------===// +// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception. 
+// +//===----------------------------------------------------------------------===// + +#include "src/errno/libc_errno.h" +#include "src/math/tanf16.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" + +using LlvmLibcTanf16Test = LIBC_NAMESPACE::testing::FPTest; + +TEST_F(LlvmLibcTanf16Test, SpecialNumbers) { + LIBC_NAMESPACE::libc_errno = 0; + + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::tanf16(aNaN)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ(zero, LIBC_NAMESPACE::tanf16(zero)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ(neg_zero, LIBC_NAMESPACE::tanf16(neg_zero)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::tanf16(inf)); + EXPECT_MATH_ERRNO(EDOM); + + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::tanf16(neg_inf)); + EXPECT_MATH_ERRNO(EDOM); +} diff --git a/libc/test/src/math/tanf16_test.cpp b/libc/test/src/math/tanf16_test.cpp new file mode 100644 index 0000000000000..f2e874182efc1 --- /dev/null +++ b/libc/test/src/math/tanf16_test.cpp @@ -0,0 +1,40 @@ +//===-- Exhaustive test for tanf16 ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/tanf16.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" +#include "utils/MPFRWrapper/MPFRUtils.h" + +using LlvmLibcTanf16Test = LIBC_NAMESPACE::testing::FPTest; + +namespace mpfr = LIBC_NAMESPACE::testing::mpfr; + +// Range: [0, Inf] +static constexpr uint16_t POS_START = 0x0000U; +static constexpr uint16_t POS_STOP = 0x7c00U; + +// Range: [-Inf, 0] +static constexpr uint16_t NEG_START = 0x8000U; +static constexpr uint16_t NEG_STOP = 0xfc00U; + +TEST_F(LlvmLibcTanf16Test, PositiveRange) { + for (uint16_t v = POS_START; v <= POS_STOP; ++v) { + float16 x = FPBits(v).get_val(); + EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tan, x, + LIBC_NAMESPACE::tanf16(x), 0.5); + } +} + +TEST_F(LlvmLibcTanf16Test, NegativeRange) { + for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) { + float16 x = FPBits(v).get_val(); + EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tan, x, + LIBC_NAMESPACE::tanf16(x), 0.5); + } +} From f431f93a775d2a18ff67bc53a634a81a4580bbee Mon Sep 17 00:00:00 2001 From: Akshat Oke Date: Mon, 13 Jan 2025 10:38:24 +0530 Subject: [PATCH 229/408] [CodeGen][NewPM] Use proper NPM AtomicExpandPass in AMDGPU (#122086) `PassRegistry.def` already has this entry, but the dummy definition was being pulled instead. I couldn't reproduce the build failures that FIXME referenced, maybe the Dummy pass getting in the way was part of the cause. 
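To make the shadowing concrete, here is a self-contained sketch of the
X-macro pattern these `.def` registries use (toy code, not the real LLVM
definitions; only the entry name is taken from the hunk below):

```
#include <iostream>

// In-tree, entries like the one below live in MachinePassRegistry.def and
// are stamped out by whatever macro definition the including file provides;
// DUMMY_FUNCTION_PASS entries synthesize placeholders for passes that are
// not yet ported to the new pass manager.
#define DUMMY_FUNCTION_PASS(NAME, PASS_NAME)                                   \
  struct PASS_NAME {                                                           \
    static const char *name() { return NAME " (dummy placeholder)"; }          \
  };

DUMMY_FUNCTION_PASS("atomic-expand", AtomicExpandPass) // the deleted entry
#undef DUMMY_FUNCTION_PASS

int main() {
  // A consumer that expanded the dummy entry sees a placeholder under the
  // same name as the real pass, so the wrong definition can get "pulled":
  std::cout << AtomicExpandPass::name() << '\n';
}
```

With the dummy entry deleted, the only `AtomicExpandPass` left is the real
one declared in `llvm/CodeGen/AtomicExpand.h`, so the AMDGPU builder can
schedule it directly (`addPass(AtomicExpandPass(&TM))` in the hunk below).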
--- llvm/include/llvm/Passes/MachinePassRegistry.def | 1 - llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def index 29763995e8b51..8a43197d2d45e 100644 --- a/llvm/include/llvm/Passes/MachinePassRegistry.def +++ b/llvm/include/llvm/Passes/MachinePassRegistry.def @@ -194,7 +194,6 @@ MACHINE_FUNCTION_PASS_WITH_PARAMS( #ifndef DUMMY_FUNCTION_PASS #define DUMMY_FUNCTION_PASS(NAME, PASS_NAME) #endif -DUMMY_FUNCTION_PASS("atomic-expand", AtomicExpandPass) #undef DUMMY_FUNCTION_PASS #ifndef DUMMY_MACHINE_MODULE_PASS diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 6058f9709c38c..f8b60630bb7f6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -49,6 +49,7 @@ #include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/Analysis/UniformityAnalysis.h" +#include "llvm/CodeGen/AtomicExpand.h" #include "llvm/CodeGen/DeadMachineInstructionElim.h" #include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" @@ -1957,8 +1958,7 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const { (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) addPass(AMDGPUAtomicOptimizerPass(TM, AMDGPUAtomicOptimizerStrategy)); - // FIXME: Adding atomic-expand manages to break -passes=atomic-expand - // addPass(AtomicExpandPass(TM)); + addPass(AtomicExpandPass(&TM)); if (TM.getOptLevel() > CodeGenOptLevel::None) { addPass(AMDGPUPromoteAllocaPass(TM)); From b86153919652302982356990d93fda4ad9ddae30 Mon Sep 17 00:00:00 2001 From: Shourya Goel Date: Mon, 13 Jan 2025 11:23:36 +0530 Subject: [PATCH 230/408] [libc][complex] fix compiler support matrix for cfloat128 (#122593) Before this patch, [godbolt](https://godbolt.org/z/6PPsvv9qd) failed to compile `cfloat128` with `-ffreestanding` but with the patch, the compilation succeeds, [godbolt](https://godbolt.org/z/4M8zzejss). Fixes: #122500 cc: @nickdesaulniers --- libc/include/llvm-libc-types/cfloat128.h | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/libc/include/llvm-libc-types/cfloat128.h b/libc/include/llvm-libc-types/cfloat128.h index f76a0c1c2f5af..83fad87910137 100644 --- a/libc/include/llvm-libc-types/cfloat128.h +++ b/libc/include/llvm-libc-types/cfloat128.h @@ -18,22 +18,24 @@ // // TODO: Update the complex variant of C23 `_Float128` type detection again when // clang supports it. -#if defined(__STDC_IEC_60559_COMPLEX__) && !defined(__clang__) -#if !defined(__cplusplus) -#define LIBC_TYPES_HAS_CFLOAT128 -typedef _Complex _Float128 cfloat128; -#elif defined(__GNUC__) && __GNUC__ >= 13 -#define LIBC_TYPES_HAS_CFLOAT128 -typedef _Complex _Float128 cfloat128; -#endif -#elif __clang_major__ >= 11 && \ +#ifdef __clang__ +#if (__clang_major__ >= 11) && \ (defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__)) // Use _Complex __float128 type. 
clang uses __SIZEOF_FLOAT128__ or __FLOAT128__ // macro to notify the availability of __float128 type: // https://reviews.llvm.org/D15120 #define LIBC_TYPES_HAS_CFLOAT128 typedef _Complex __float128 cfloat128; -#elif (LDBL_MANT_DIG == 113) +#endif +#elif defined(__GNUC__) +#if (defined(__STDC_IEC_60559_COMPLEX__) || defined(__SIZEOF_FLOAT128__)) && \ + (__GNUC__ >= 13 || (!defined(__cplusplus))) +#define LIBC_TYPES_HAS_CFLOAT128 +typedef _Complex _Float128 cfloat128; +#endif +#endif + +#if !defined(LIBC_TYPES_HAS_CFLOAT128) && (LDBL_MANT_DIG == 113) #define LIBC_TYPES_HAS_CFLOAT128 #define LIBC_TYPES_CFLOAT128_IS_COMPLEX_LONG_DOUBLE typedef _Complex long double cfloat128; From 6f558e0e124012fd00927d6d42545649bd7e0dcd Mon Sep 17 00:00:00 2001 From: CHANDRA GHALE Date: Mon, 13 Jan 2025 11:42:13 +0530 Subject: [PATCH 231/408] [OpenMP] codegen support for masked combined construct masked taskloop (#121914) Added codegen support for combined masked constructs `masked taskloop.` Added implementation for `EmitOMPMaskedTaskLoopDirective`. --------- Co-authored-by: Chandra Ghale --- clang/lib/CodeGen/CGStmt.cpp | 2 +- clang/lib/CodeGen/CGStmtOpenMP.cpp | 12 +++++ clang/lib/CodeGen/CodeGenFunction.h | 1 + clang/test/OpenMP/masked_taskloop_codegen.c | 50 +++++++++++++++++++++ 4 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 clang/test/OpenMP/masked_taskloop_codegen.c diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp index ee10e586d9250..f9258a396b7d0 100644 --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -332,7 +332,7 @@ void CodeGenFunction::EmitStmt(const Stmt *S, ArrayRef Attrs) { EmitOMPMasterTaskLoopDirective(cast(*S)); break; case Stmt::OMPMaskedTaskLoopDirectiveClass: - llvm_unreachable("masked taskloop directive not supported yet."); + EmitOMPMaskedTaskLoopDirective(cast(*S)); break; case Stmt::OMPMasterTaskLoopSimdDirectiveClass: EmitOMPMasterTaskLoopSimdDirective( diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index 94daf059edba0..2b4ca65e169a6 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -7982,6 +7982,18 @@ void CodeGenFunction::EmitOMPMasterTaskLoopDirective( CGM.getOpenMPRuntime().emitMasterRegion(*this, CodeGen, S.getBeginLoc()); } +void CodeGenFunction::EmitOMPMaskedTaskLoopDirective( + const OMPMaskedTaskLoopDirective &S) { + auto &&CodeGen = [this, &S](CodeGenFunction &CGF, PrePostActionTy &Action) { + Action.Enter(CGF); + EmitOMPTaskLoopBasedDirective(S); + }; + auto LPCRegion = + CGOpenMPRuntime::LastprivateConditionalRAII::disable(*this, S); + OMPLexicalScope Scope(*this, S, std::nullopt, /*EmitPreInitStmt=*/false); + CGM.getOpenMPRuntime().emitMaskedRegion(*this, CodeGen, S.getBeginLoc()); +} + void CodeGenFunction::EmitOMPMasterTaskLoopSimdDirective( const OMPMasterTaskLoopSimdDirective &S) { auto &&CodeGen = [this, &S](CodeGenFunction &CGF, PrePostActionTy &Action) { diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 86328db345508..311f2ae94d046 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -3870,6 +3870,7 @@ class CodeGenFunction : public CodeGenTypeCache { void EmitOMPTaskLoopDirective(const OMPTaskLoopDirective &S); void EmitOMPTaskLoopSimdDirective(const OMPTaskLoopSimdDirective &S); void EmitOMPMasterTaskLoopDirective(const OMPMasterTaskLoopDirective &S); + void EmitOMPMaskedTaskLoopDirective(const OMPMaskedTaskLoopDirective &S); void 
EmitOMPMasterTaskLoopSimdDirective(const OMPMasterTaskLoopSimdDirective &S); void diff --git a/clang/test/OpenMP/masked_taskloop_codegen.c b/clang/test/OpenMP/masked_taskloop_codegen.c new file mode 100644 index 0000000000000..26f54c1797bbe --- /dev/null +++ b/clang/test/OpenMP/masked_taskloop_codegen.c @@ -0,0 +1,50 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --prefix-filecheck-ir-name _ --version 5 +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fopenmp -fopenmp-version=52 -x c -emit-llvm %s -o - | FileCheck %s +// expected-no-diagnostics +#define N 100 +void masked_taskloop(){ + #pragma omp masked taskloop + for( int i = 0; i < N; i++) + ; + +} + +int main() +{ + masked_taskloop(); +} +// CHECK-LABEL: define dso_local void @masked_taskloop( +// CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 1 +// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]]) +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_masked(ptr @[[GLOB1]], i32 [[TMP0]], i32 0) +// CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[TMP2]], label %[[OMP_IF_THEN:.*]], label %[[OMP_IF_END:.*]] +// CHECK: [[OMP_IF_THEN]]: +// CHECK-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) +// CHECK-NEXT: [[TMP3:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i64 80, i64 0, ptr @.omp_task_entry.) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP3]], i32 0, i32 0 +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP4]], i32 0, i32 5 +// CHECK-NEXT: store i64 0, ptr [[TMP5]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 6 +// CHECK-NEXT: store i64 99, ptr [[TMP6]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 7 +// CHECK-NEXT: store i64 1, ptr [[TMP7]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 9 +// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP8]], i8 0, i64 8, i1 false) +// CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 +// CHECK-NEXT: call void @__kmpc_taskloop(ptr @[[GLOB1]], i32 [[TMP0]], ptr [[TMP3]], i32 1, ptr [[TMP5]], ptr [[TMP6]], i64 [[TMP9]], i32 1, i32 0, i64 0, ptr null) +// CHECK-NEXT: call void @__kmpc_end_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) +// CHECK-NEXT: call void @__kmpc_end_masked(ptr @[[GLOB1]], i32 [[TMP0]]) +// CHECK-NEXT: br label %[[OMP_IF_END]] +// CHECK: [[OMP_IF_END]]: +// CHECK-NEXT: ret void +// +// CHECK-LABEL: define dso_local i32 @main( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @masked_taskloop() +// CHECK-NEXT: ret i32 0 + From 36c3466aef6c8bfde0ddc736b8403e2c45f5e1c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Fournier?= Date: Mon, 13 Jan 2025 08:21:07 +0100 Subject: [PATCH 232/408] [mlir][linalg] Fix neutral elt for softmax (#118952) The decomposition of `linalg.softmax` uses `maxnumf`, but the identity element that is used in the generated code is the one for `maximumf`. They are not the same, as the identity for `maxnumf` is `NaN`, while the one of `maximumf` is `-Infty`. 
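The difference is easy to demonstrate outside MLIR (hedged sketch: C++'s
`std::fmax` follows the same IEEE-754 maxNum rule as `arith.maxnumf`, and
`-3.40282347E+38`, the constant the old decomposition emitted, is `-FLT_MAX`):

```
// Minimal check of the two neutral elements; standalone, no MLIR needed.
#include <cassert>
#include <cfloat>
#include <cmath>

int main() {
  const float qnan = std::nanf("");
  // maxnum treats NaN as "missing data", so NaN is its identity:
  assert(std::fmax(qnan, 2.0f) == 2.0f);
  assert(std::fmax(qnan, -INFINITY) == -INFINITY);
  // -FLT_MAX is only neutral for the NaN-propagating "maximum" flavour
  // restricted to finite values; for maxnum it gives the wrong answer
  // as soon as -infinity shows up:
  assert(std::fmax(-FLT_MAX, -INFINITY) == -FLT_MAX); // not -INFINITY
  return 0;
}
```

In short, the generated `linalg.fill` was seeding a `maxnumf` reduction with
the (finite) `maximumf` identity.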
This is wrong and prevents the maxnumf from being folded. Related to #114595, which fixed the folder for maxnumf. --- mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 2 +- mlir/test/Dialect/Linalg/transform-op-decompose.mlir | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp index 8973e87c063b3..c13b663dbf05b 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp @@ -2890,7 +2890,7 @@ FailureOr> SoftmaxOp::decomposeOperation(OpBuilder &b) { dims.erase(dims.begin() + reductionDim); // Step 1: Compute max along dim. Value outputReduce = b.create(loc, dims, elementType); - Value neutralForMaxF = arith::getIdentityValue(arith::AtomicRMWKind::maximumf, + Value neutralForMaxF = arith::getIdentityValue(arith::AtomicRMWKind::maxnumf, elementType, b, loc, /*useOnlyFiniteValue=*/true); Value neutralForMaxFInit = diff --git a/mlir/test/Dialect/Linalg/transform-op-decompose.mlir b/mlir/test/Dialect/Linalg/transform-op-decompose.mlir index 2e211d2fa7dbe..72acf43361f50 100644 --- a/mlir/test/Dialect/Linalg/transform-op-decompose.mlir +++ b/mlir/test/Dialect/Linalg/transform-op-decompose.mlir @@ -210,7 +210,7 @@ func.func @softmax(%arg0: tensor<2x16x32xf32>, %dst: tensor<2x16x32xf32>) -> ten // CHECK-LABEL: func.func @softmax( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: tensor<2x16x32xf32>, %[[DST:[a-zA-Z0-9_]+]]: tensor<2x16x32xf32>) -> tensor<2x16x32xf32> { // CHECK-DAG: %[[D1:.+]] = tensor.empty() : tensor<2x16xf32> -// CHECK-DAG: %[[CST:.+]] = arith.constant -3.40282347E+38 : f32 +// CHECK-DAG: %[[CST:.+]] = arith.constant 0xFFC00000 : f32 // CHECK: %[[D2:.+]] = linalg.fill ins(%[[CST]] : f32) outs(%[[D1]] : tensor<2x16xf32>) -> tensor<2x16xf32> // CHECK: %[[D3:.+]] = linalg.generic {indexing_maps = [#[[$MAP]], #[[$MAP1]]], iterator_types = ["parallel", // CHECK-SAME: "parallel", "reduction"]} ins(%[[ARG0]] : tensor<2x16x32xf32>) outs(%[[D2]] : tensor<2x16xf32>) { From 76af93fbea6c5f0f3558824ce8ebfafcf80a94d4 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 12 Jan 2025 23:50:58 -0800 Subject: [PATCH 233/408] Partially revert "[TableGen] Avoid repeated hash lookups (NFC) (#122586)" This partially reverts commit 07ff786e39e2190449998d3af1000454dee501be. The hunk being reverted in this patch seems to break: tools/llvm-gsymutil/ARM_AArch64/macho-merged-funcs-dwarf.yaml under LLVM_ENABLE_EXPENSIVE_CHECKS. 
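For reference, the hunk below swaps a single `try_emplace` probe back to the
older find-then-insert idiom. The two forms look interchangeable for this
code, which is what makes the failure puzzling; a standalone rendering of
both (hedged: `std::map` stands in for `DenseMap` so the sketch builds
without LLVM headers):

```
#include <cassert>
#include <map>
#include <vector>

int main() {
  std::map<const void *, unsigned> Decl2Index; // DenseMap in the real code
  std::vector<int> STIPredicates;              // stand-in predicate list
  int Rec = 0;
  const void *Decl = &Rec;

  // Idiom the code reverts to: explicit find, then a separate insertion.
  auto It = Decl2Index.find(Decl);
  if (It == Decl2Index.end()) {
    Decl2Index[Decl] = STIPredicates.size();
    STIPredicates.push_back(Rec);
  }

  // Idiom being reverted: one probe that inserts only when the key is new.
  const auto [It2, Inserted] =
      Decl2Index.try_emplace(Decl, STIPredicates.size());
  assert(!Inserted && It2->second == 0); // key already present; value kept
}
```

Since no functional difference is apparent here, the hunk is reverted
conservatively until the expensive-checks failure is understood.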
--- llvm/utils/TableGen/Common/CodeGenSchedule.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp index 7f4230affca09..1fe322c88bb0f 100644 --- a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp +++ b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp @@ -415,9 +415,9 @@ void CodeGenSchedModels::collectSTIPredicates() { for (const Record *R : Records.getAllDerivedDefinitions("STIPredicate")) { const Record *Decl = R->getValueAsDef("Declaration"); - const auto [It, Inserted] = - Decl2Index.try_emplace(Decl, STIPredicates.size()); - if (Inserted) { + const auto It = Decl2Index.find(Decl); + if (It == Decl2Index.end()) { + Decl2Index[Decl] = STIPredicates.size(); STIPredicateFunction Predicate(Decl); Predicate.addDefinition(R); STIPredicates.emplace_back(std::move(Predicate)); From 56a37a3c767b037143235bf34d180be85de7fd53 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Mon, 13 Jan 2025 16:11:31 +0800 Subject: [PATCH 234/408] [SLPVectorizer] Refactor HorizontalReduction::createOp (NFC) (#121549) This patch simplifies select-based integer min/max reductions by utilizing `llvm::getMinMaxReductionPredicate`, and generates intrinsic-based min/max reductions by utilizing `llvm::getMinMaxReductionIntrinsicOp`. --- .../Transforms/Vectorize/SLPVectorizer.cpp | 35 ++++++------------- 1 file changed, 10 insertions(+), 25 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 8a6fbd808de35..e3487b5015342 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -19431,38 +19431,23 @@ class HorizontalReduction { return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS, Name); } - case RecurKind::FMax: - return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS); - case RecurKind::FMin: - return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS); - case RecurKind::FMaximum: - return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS); - case RecurKind::FMinimum: - return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS); case RecurKind::SMax: - if (UseSelect) { - Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name); - return Builder.CreateSelect(Cmp, LHS, RHS, Name); - } - return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS); case RecurKind::SMin: - if (UseSelect) { - Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name); - return Builder.CreateSelect(Cmp, LHS, RHS, Name); - } - return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS); case RecurKind::UMax: - if (UseSelect) { - Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name); - return Builder.CreateSelect(Cmp, LHS, RHS, Name); - } - return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS); case RecurKind::UMin: if (UseSelect) { - Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name); + CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(Kind); + Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name); return Builder.CreateSelect(Cmp, LHS, RHS, Name); } - return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS); + [[fallthrough]]; + case RecurKind::FMax: + case RecurKind::FMin: + case RecurKind::FMaximum: + case RecurKind::FMinimum: { + Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(Kind); + return Builder.CreateBinaryIntrinsic(Id, LHS, RHS); + } default: llvm_unreachable("Unknown reduction operation."); } From 
4f96fb5fb349b0030f9c14b4fe389cebc3069702 Mon Sep 17 00:00:00 2001 From: Akshat Oke Date: Mon, 13 Jan 2025 14:14:13 +0530 Subject: [PATCH 235/408] Reapply "Spiller: Detach legacy pass and supply analyses instead (#119181)" (#122665) Makes Inline Spiller amenable to the new PM. This reapplies commit a531800344dc54e9c197a13b22e013f919f3f5e1 reverted because of two unused private members reported on sanitizer bots. --- llvm/include/llvm/CodeGen/Spiller.h | 16 ++++++++++-- llvm/lib/CodeGen/InlineSpiller.cpp | 40 +++++++++++------------------ llvm/lib/CodeGen/RegAllocBasic.cpp | 16 ++++++++---- llvm/lib/CodeGen/RegAllocGreedy.cpp | 4 ++- llvm/lib/CodeGen/RegAllocPBQP.cpp | 5 +++- 5 files changed, 47 insertions(+), 34 deletions(-) diff --git a/llvm/include/llvm/CodeGen/Spiller.h b/llvm/include/llvm/CodeGen/Spiller.h index 51ad36bc6b1f8..3132cefeb6c68 100644 --- a/llvm/include/llvm/CodeGen/Spiller.h +++ b/llvm/include/llvm/CodeGen/Spiller.h @@ -19,6 +19,10 @@ class MachineFunction; class MachineFunctionPass; class VirtRegMap; class VirtRegAuxInfo; +class LiveIntervals; +class LiveStacks; +class MachineDominatorTree; +class MachineBlockFrequencyInfo; /// Spiller interface. /// @@ -41,12 +45,20 @@ class Spiller { virtual ArrayRef getReplacedRegs() = 0; virtual void postOptimization() {} + + struct RequiredAnalyses { + LiveIntervals &LIS; + LiveStacks &LSS; + MachineDominatorTree &MDT; + const MachineBlockFrequencyInfo &MBFI; + }; }; /// Create and return a spiller that will insert spill code directly instead /// of deferring though VirtRegMap. -Spiller *createInlineSpiller(MachineFunctionPass &Pass, MachineFunction &MF, - VirtRegMap &VRM, VirtRegAuxInfo &VRAI); +Spiller *createInlineSpiller(const Spiller::RequiredAnalyses &Analyses, + MachineFunction &MF, VirtRegMap &VRM, + VirtRegAuxInfo &VRAI); } // end namespace llvm diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp index 64f290f5930a1..f6681540e2286 100644 --- a/llvm/lib/CodeGen/InlineSpiller.cpp +++ b/llvm/lib/CodeGen/InlineSpiller.cpp @@ -75,7 +75,6 @@ RestrictStatepointRemat("restrict-statepoint-remat", cl::desc("Restrict remat for statepoint operands")); namespace { - class HoistSpillHelper : private LiveRangeEdit::Delegate { MachineFunction &MF; LiveIntervals &LIS; @@ -128,15 +127,11 @@ class HoistSpillHelper : private LiveRangeEdit::Delegate { DenseMap &SpillsToIns); public: - HoistSpillHelper(MachineFunctionPass &pass, MachineFunction &mf, - VirtRegMap &vrm) - : MF(mf), LIS(pass.getAnalysis().getLIS()), - LSS(pass.getAnalysis().getLS()), - MDT(pass.getAnalysis().getDomTree()), + HoistSpillHelper(const Spiller::RequiredAnalyses &Analyses, + MachineFunction &mf, VirtRegMap &vrm) + : MF(mf), LIS(Analyses.LIS), LSS(Analyses.LSS), MDT(Analyses.MDT), VRM(vrm), MRI(mf.getRegInfo()), TII(*mf.getSubtarget().getInstrInfo()), - TRI(*mf.getSubtarget().getRegisterInfo()), - MBFI( - pass.getAnalysis().getMBFI()), + TRI(*mf.getSubtarget().getRegisterInfo()), MBFI(Analyses.MBFI), IPA(LIS, mf.getNumBlockIDs()) {} void addToMergeableSpills(MachineInstr &Spill, int StackSlot, @@ -150,12 +145,10 @@ class InlineSpiller : public Spiller { MachineFunction &MF; LiveIntervals &LIS; LiveStacks &LSS; - MachineDominatorTree &MDT; VirtRegMap &VRM; MachineRegisterInfo &MRI; const TargetInstrInfo &TII; const TargetRegisterInfo &TRI; - const MachineBlockFrequencyInfo &MBFI; // Variables that are valid during spill(), but used by multiple methods. 
LiveRangeEdit *Edit = nullptr; @@ -190,16 +183,12 @@ class InlineSpiller : public Spiller { ~InlineSpiller() override = default; public: - InlineSpiller(MachineFunctionPass &Pass, MachineFunction &MF, VirtRegMap &VRM, - VirtRegAuxInfo &VRAI) - : MF(MF), LIS(Pass.getAnalysis().getLIS()), - LSS(Pass.getAnalysis().getLS()), - MDT(Pass.getAnalysis().getDomTree()), - VRM(VRM), MRI(MF.getRegInfo()), TII(*MF.getSubtarget().getInstrInfo()), - TRI(*MF.getSubtarget().getRegisterInfo()), - MBFI( - Pass.getAnalysis().getMBFI()), - HSpiller(Pass, MF, VRM), VRAI(VRAI) {} + InlineSpiller(const Spiller::RequiredAnalyses &Analyses, MachineFunction &MF, + VirtRegMap &VRM, VirtRegAuxInfo &VRAI) + : MF(MF), LIS(Analyses.LIS), LSS(Analyses.LSS), VRM(VRM), + MRI(MF.getRegInfo()), TII(*MF.getSubtarget().getInstrInfo()), + TRI(*MF.getSubtarget().getRegisterInfo()), HSpiller(Analyses, MF, VRM), + VRAI(VRAI) {} void spill(LiveRangeEdit &) override; ArrayRef getSpilledRegs() override { return RegsToSpill; } @@ -237,10 +226,11 @@ Spiller::~Spiller() = default; void Spiller::anchor() {} -Spiller *llvm::createInlineSpiller(MachineFunctionPass &Pass, - MachineFunction &MF, VirtRegMap &VRM, - VirtRegAuxInfo &VRAI) { - return new InlineSpiller(Pass, MF, VRM, VRAI); +Spiller * +llvm::createInlineSpiller(const InlineSpiller::RequiredAnalyses &Analyses, + MachineFunction &MF, VirtRegMap &VRM, + VirtRegAuxInfo &VRAI) { + return new InlineSpiller(Analyses, MF, VRM, VRAI); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/CodeGen/RegAllocBasic.cpp b/llvm/lib/CodeGen/RegAllocBasic.cpp index c05aa1e40e477..f3f34f890be11 100644 --- a/llvm/lib/CodeGen/RegAllocBasic.cpp +++ b/llvm/lib/CodeGen/RegAllocBasic.cpp @@ -22,6 +22,7 @@ #include "llvm/CodeGen/LiveRegMatrix.h" #include "llvm/CodeGen/LiveStacks.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/Passes.h" @@ -187,6 +188,7 @@ void RABasic::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.addRequired(); AU.addPreserved(); + AU.addRequired(); AU.addRequiredID(MachineDominatorsID); AU.addPreservedID(MachineDominatorsID); AU.addRequired(); @@ -310,16 +312,20 @@ bool RABasic::runOnMachineFunction(MachineFunction &mf) { << "********** Function: " << mf.getName() << '\n'); MF = &mf; + auto &MBFI = getAnalysis().getMBFI(); + auto &LiveStks = getAnalysis().getLS(); + auto &MDT = getAnalysis().getDomTree(); + RegAllocBase::init(getAnalysis().getVRM(), getAnalysis().getLIS(), getAnalysis().getLRM()); - VirtRegAuxInfo VRAI( - *MF, *LIS, *VRM, getAnalysis().getLI(), - getAnalysis().getMBFI(), - &getAnalysis().getPSI()); + VirtRegAuxInfo VRAI(*MF, *LIS, *VRM, + getAnalysis().getLI(), MBFI, + &getAnalysis().getPSI()); VRAI.calculateSpillWeightsAndHints(); - SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM, VRAI)); + SpillerInstance.reset( + createInlineSpiller({*LIS, LiveStks, MDT, MBFI}, *MF, *VRM, VRAI)); allocatePhysRegs(); postOptimization(); diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index b94992c20b119..66e9cf546b837 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -2750,6 +2750,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { Bundles = &getAnalysis().getEdgeBundles(); SpillPlacer = &getAnalysis().getResult(); DebugVars = &getAnalysis().getLDV(); + 
auto &LSS = getAnalysis().getLS(); initializeCSRCost(); @@ -2770,7 +2771,8 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { getAnalysis().getAdvisor(*MF, *this); VRAI = std::make_unique(*MF, *LIS, *VRM, *Loops, *MBFI); - SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM, *VRAI)); + SpillerInstance.reset( + createInlineSpiller({*LIS, LSS, *DomTree, *MBFI}, *MF, *VRM, *VRAI)); VRAI->calculateSpillWeightsAndHints(); diff --git a/llvm/lib/CodeGen/RegAllocPBQP.cpp b/llvm/lib/CodeGen/RegAllocPBQP.cpp index 696c312e4ba00..e230a1be95c9f 100644 --- a/llvm/lib/CodeGen/RegAllocPBQP.cpp +++ b/llvm/lib/CodeGen/RegAllocPBQP.cpp @@ -794,6 +794,9 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) { MachineBlockFrequencyInfo &MBFI = getAnalysis().getMBFI(); + auto &LiveStks = getAnalysis().getLS(); + auto &MDT = getAnalysis().getDomTree(); + VirtRegMap &VRM = getAnalysis().getVRM(); PBQPVirtRegAuxInfo VRAI( @@ -807,7 +810,7 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) { VirtRegAuxInfo DefaultVRAI( MF, LIS, VRM, getAnalysis().getLI(), MBFI); std::unique_ptr VRegSpiller( - createInlineSpiller(*this, MF, VRM, DefaultVRAI)); + createInlineSpiller({LIS, LiveStks, MDT, MBFI}, MF, VRM, DefaultVRAI)); MF.getRegInfo().freezeReservedRegs(); From b270525f730be6e7196667925f5a9bfa153262e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20K=C3=A9ri?= Date: Mon, 13 Jan 2025 09:46:45 +0100 Subject: [PATCH 236/408] [clang][ASTImporter] Not using primary context in lookup table (#118466) `ASTImporterLookupTable` did use the `getPrimaryContext` function to get the declaration context of the inserted items. This is problematic because the primary context can change during import of AST items, most likely if a definition of a previously not defined class is imported. (For any record the primary context is the definition if there is one.) The use of primary context is really not important, only for namespaces because these can be re-opened and lookup in one namespace block is not enough. This special search is now moved into ASTImporter instead of relying on the lookup table. --- clang/lib/AST/ASTImporter.cpp | 24 +++- clang/lib/AST/ASTImporterLookupTable.cpp | 20 +-- clang/unittests/AST/ASTImporterTest.cpp | 152 ++++++++++++++++++++++- 3 files changed, 181 insertions(+), 15 deletions(-) diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index 26d33b0d94795..dec4c7221bc77 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -3165,6 +3165,7 @@ ExpectedDecl ASTNodeImporter::VisitRecordDecl(RecordDecl *D) { if (Error Err = ImportImplicitMethods(DCXX, FoundCXX)) return std::move(Err); } + // FIXME: We can return FoundDef here. } PrevDecl = FoundRecord->getMostRecentDecl(); break; @@ -9064,9 +9065,26 @@ ASTImporter::findDeclsInToCtx(DeclContext *DC, DeclarationName Name) { // We can diagnose this only if we search in the redecl context. DeclContext *ReDC = DC->getRedeclContext(); if (SharedState->getLookupTable()) { - ASTImporterLookupTable::LookupResult LookupResult = - SharedState->getLookupTable()->lookup(ReDC, Name); - return FoundDeclsTy(LookupResult.begin(), LookupResult.end()); + if (ReDC->isNamespace()) { + // Namespaces can be reopened. + // Lookup table does not handle this, we must search here in all linked + // namespaces. 
+ FoundDeclsTy Result; + SmallVector NSChain = + getCanonicalForwardRedeclChain( + dyn_cast(ReDC)); + for (auto *D : NSChain) { + ASTImporterLookupTable::LookupResult LookupResult = + SharedState->getLookupTable()->lookup(dyn_cast(D), + Name); + Result.append(LookupResult.begin(), LookupResult.end()); + } + return Result; + } else { + ASTImporterLookupTable::LookupResult LookupResult = + SharedState->getLookupTable()->lookup(ReDC, Name); + return FoundDeclsTy(LookupResult.begin(), LookupResult.end()); + } } else { DeclContext::lookup_result NoloadLookupResult = ReDC->noload_lookup(Name); FoundDeclsTy Result(NoloadLookupResult.begin(), NoloadLookupResult.end()); diff --git a/clang/lib/AST/ASTImporterLookupTable.cpp b/clang/lib/AST/ASTImporterLookupTable.cpp index 07d39dcee2583..4ed3198d7ea62 100644 --- a/clang/lib/AST/ASTImporterLookupTable.cpp +++ b/clang/lib/AST/ASTImporterLookupTable.cpp @@ -115,8 +115,9 @@ void ASTImporterLookupTable::remove(DeclContext *DC, NamedDecl *ND) { #ifndef NDEBUG if (!EraseResult) { std::string Message = - llvm::formatv("Trying to remove not contained Decl '{0}' of type {1}", - Name.getAsString(), DC->getDeclKindName()) + llvm::formatv( + "Trying to remove not contained Decl '{0}' of type {1} from a {2}", + Name.getAsString(), ND->getDeclKindName(), DC->getDeclKindName()) .str(); llvm_unreachable(Message.c_str()); } @@ -125,18 +126,18 @@ void ASTImporterLookupTable::remove(DeclContext *DC, NamedDecl *ND) { void ASTImporterLookupTable::add(NamedDecl *ND) { assert(ND); - DeclContext *DC = ND->getDeclContext()->getPrimaryContext(); + DeclContext *DC = ND->getDeclContext(); add(DC, ND); - DeclContext *ReDC = DC->getRedeclContext()->getPrimaryContext(); + DeclContext *ReDC = DC->getRedeclContext(); if (DC != ReDC) add(ReDC, ND); } void ASTImporterLookupTable::remove(NamedDecl *ND) { assert(ND); - DeclContext *DC = ND->getDeclContext()->getPrimaryContext(); + DeclContext *DC = ND->getDeclContext(); remove(DC, ND); - DeclContext *ReDC = DC->getRedeclContext()->getPrimaryContext(); + DeclContext *ReDC = DC->getRedeclContext(); if (DC != ReDC) remove(ReDC, ND); } @@ -161,7 +162,7 @@ void ASTImporterLookupTable::updateForced(NamedDecl *ND, DeclContext *OldDC) { ASTImporterLookupTable::LookupResult ASTImporterLookupTable::lookup(DeclContext *DC, DeclarationName Name) const { - auto DCI = LookupTable.find(DC->getPrimaryContext()); + auto DCI = LookupTable.find(DC); if (DCI == LookupTable.end()) return {}; @@ -178,7 +179,7 @@ bool ASTImporterLookupTable::contains(DeclContext *DC, NamedDecl *ND) const { } void ASTImporterLookupTable::dump(DeclContext *DC) const { - auto DCI = LookupTable.find(DC->getPrimaryContext()); + auto DCI = LookupTable.find(DC); if (DCI == LookupTable.end()) llvm::errs() << "empty\n"; const auto &FoundNameMap = DCI->second; @@ -196,8 +197,7 @@ void ASTImporterLookupTable::dump(DeclContext *DC) const { void ASTImporterLookupTable::dump() const { for (const auto &Entry : LookupTable) { DeclContext *DC = Entry.first; - StringRef Primary = DC->getPrimaryContext() ? 
" primary" : ""; - llvm::errs() << "== DC:" << cast(DC) << Primary << "\n"; + llvm::errs() << "== DC:" << cast(DC) << "\n"; dump(DC); } } diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp index f2bfde9bed372..a0aaad6082d8c 100644 --- a/clang/unittests/AST/ASTImporterTest.cpp +++ b/clang/unittests/AST/ASTImporterTest.cpp @@ -6052,7 +6052,7 @@ TEST_P(ASTImporterLookupTableTest, EnumConstantDecl) { EXPECT_EQ(*Res.begin(), A); } -TEST_P(ASTImporterLookupTableTest, LookupSearchesInTheWholeRedeclChain) { +TEST_P(ASTImporterLookupTableTest, LookupSearchesInActualNamespaceOnly) { TranslationUnitDecl *ToTU = getToTuDecl( R"( namespace N { @@ -6062,7 +6062,9 @@ TEST_P(ASTImporterLookupTableTest, LookupSearchesInTheWholeRedeclChain) { } )", Lang_CXX03); - auto *N1 = + auto *N1 = FirstDeclMatcher().match( + ToTU, namespaceDecl(hasName("N"))); + auto *N2 = LastDeclMatcher().match(ToTU, namespaceDecl(hasName("N"))); auto *A = FirstDeclMatcher().match(ToTU, varDecl(hasName("A"))); DeclarationName Name = A->getDeclName(); @@ -6071,6 +6073,7 @@ TEST_P(ASTImporterLookupTableTest, LookupSearchesInTheWholeRedeclChain) { auto Res = LT.lookup(N1, Name); ASSERT_EQ(Res.size(), 1u); EXPECT_EQ(*Res.begin(), A); + EXPECT_TRUE(LT.lookup(N2, Name).empty()); } TEST_P(ASTImporterOptionSpecificTestBase, @@ -10170,6 +10173,151 @@ TEST_P(ImportTemplateParmDeclDefaultValue, FromD, FromDInherited); } +TEST_P(ASTImporterOptionSpecificTestBase, ImportIntoReopenedNamespaceNoMatch1) { + const char *ToCode = + R"( + namespace a { + } + namespace a { + struct X { int A; }; + } + )"; + Decl *ToTU = getToTuDecl(ToCode, Lang_CXX11); + const char *Code = + R"( + namespace a { + struct X { char A; }; + } + )"; + Decl *FromTU = getTuDecl(Code, Lang_CXX11); + auto *FromX = FirstDeclMatcher().match( + FromTU, cxxRecordDecl(hasName("X"))); + auto *ImportedX = Import(FromX, Lang_CXX11); + EXPECT_FALSE(ImportedX); +} + +TEST_P(ASTImporterOptionSpecificTestBase, ImportIntoReopenedNamespaceNoMatch2) { + const char *ToCode = + R"( + namespace a { + struct X { int A; }; + } + namespace a { + } + )"; + Decl *ToTU = getToTuDecl(ToCode, Lang_CXX11); + const char *Code = + R"( + namespace a { + struct X { char A; }; + } + )"; + Decl *FromTU = getTuDecl(Code, Lang_CXX11); + auto *FromX = FirstDeclMatcher().match( + FromTU, cxxRecordDecl(hasName("X"))); + auto *ImportedX = Import(FromX, Lang_CXX11); + EXPECT_FALSE(ImportedX); +} + +TEST_P(ASTImporterOptionSpecificTestBase, ImportIntoReopenedNamespaceMatch1) { + const char *ToCode = + R"( + namespace a { + } + namespace a { + struct X { int A; }; + } + )"; + Decl *ToTU = getToTuDecl(ToCode, Lang_CXX11); + const char *Code = + R"( + namespace a { + struct X { int A; }; + } + )"; + Decl *FromTU = getTuDecl(Code, Lang_CXX11); + auto *FromX = FirstDeclMatcher().match( + FromTU, cxxRecordDecl(hasName("X"))); + auto *ToX = FirstDeclMatcher().match( + ToTU, cxxRecordDecl(hasName("X"))); + auto *ImportedX = Import(FromX, Lang_CXX11); + EXPECT_EQ(ImportedX, ToX); +} + +TEST_P(ASTImporterOptionSpecificTestBase, ImportIntoReopenedNamespaceMatch2) { + const char *ToCode = + R"( + namespace a { + struct X { int A; }; + } + namespace a { + } + )"; + Decl *ToTU = getToTuDecl(ToCode, Lang_CXX11); + const char *Code = + R"( + namespace a { + struct X { int A; }; + } + )"; + Decl *FromTU = getTuDecl(Code, Lang_CXX11); + auto *FromX = FirstDeclMatcher().match( + FromTU, cxxRecordDecl(hasName("X"))); + auto *ToX = FirstDeclMatcher().match( + ToTU, cxxRecordDecl(hasName("X"))); 
+ auto *ImportedX = Import(FromX, Lang_CXX11); + EXPECT_EQ(ImportedX, ToX); +} + +TEST_P(ASTImporterLookupTableTest, PrimaryDCChangeAtImport) { + const char *ToCode = + R"( + template + struct X; + )"; + Decl *ToTU = getToTuDecl(ToCode, Lang_CXX11); + auto *ToX = FirstDeclMatcher().match( + ToTU, classTemplateDecl(hasName("X"))); + NamedDecl *ToParm = ToX->getTemplateParameters()->getParam(0); + DeclContext *OldPrimaryDC = ToX->getTemplatedDecl()->getPrimaryContext(); + ASSERT_EQ(ToParm->getDeclContext(), ToX->getTemplatedDecl()); + ASSERT_EQ(SharedStatePtr->getLookupTable() + ->lookup(ToX->getTemplatedDecl(), ToParm->getDeclName()) + .size(), + 1u); + ASSERT_TRUE(SharedStatePtr->getLookupTable()->contains( + ToX->getTemplatedDecl(), ToParm)); + + const char *Code = + R"( + template + struct X; + template + struct X {}; + )"; + Decl *FromTU = getTuDecl(Code, Lang_CXX11); + auto *FromX = LastDeclMatcher().match( + FromTU, classTemplateDecl(hasName("X"))); + + auto *ImportedX = Import(FromX, Lang_CXX11); + + EXPECT_TRUE(ImportedX); + EXPECT_EQ(ImportedX->getTemplateParameters()->getParam(0)->getDeclContext(), + ImportedX->getTemplatedDecl()); + + // ToX did not change at the import. + // Verify that primary context has changed after import of class definition. + DeclContext *NewPrimaryDC = ToX->getTemplatedDecl()->getPrimaryContext(); + EXPECT_NE(OldPrimaryDC, NewPrimaryDC); + // The lookup table should not be different than it was before. + EXPECT_EQ(SharedStatePtr->getLookupTable() + ->lookup(ToX->getTemplatedDecl(), ToParm->getDeclName()) + .size(), + 1u); + EXPECT_TRUE(SharedStatePtr->getLookupTable()->contains( + ToX->getTemplatedDecl(), ToParm)); +} + TEST_P(ASTImporterOptionSpecificTestBase, ExistingUndeclaredImportDeclaredFriend) { Decl *ToTU = getToTuDecl( From 1b199d19902a752433c397377567ff381261e94a Mon Sep 17 00:00:00 2001 From: David Spickett Date: Mon, 13 Jan 2025 09:05:18 +0000 Subject: [PATCH 237/408] [ci] Handle the case where all reported tests pass but the build is still a failure (#120264) In this build: https://buildkite.com/llvm-project/github-pull-requests/builds/126961 The builds actually failed, probably because prerequisite of a test suite failed to build. However they still ran other tests and all those passed. This meant that the test reports were green even though the build was red. On some level this is technically correct, but it is very misleading in practice. So I've also passed the build script's return code, as it was when we entered the on exit handler, to the generator, so that when this happens again, the report will draw the viewer's attention to the overall failure. There will be a link in the report to the build's log file, so the next step to investigate is clear. It would be nice to say "tests failed and there was some other build error", but we cannot tell what the non-zero return code was caused by. Could be either. The script handles the following situations now: | Have Result Files? | Tests reported failed? | Return code | Report | |--------------------|------------------------|-------------|-----------------------------------------------------------------------------| | Yes | No | 0 | Success style report. | | Yes | Yes | 0 | Shouldn't happen, but if it did, failure style report showing the failures. | | Yes | No | 1 | Failure style report, showing no failures but noting that the build failed. | | Yes | Yes | 1 | Failure style report, showing the test failures. | | No | ? | 0 | No test report, success shown in the normal build display. 
| | No | ? | 1 | No test report, failure shown in the normal build display. | --- .ci/generate_test_report.py | 102 +++++++++++++++++++++++++++++++----- .ci/monolithic-linux.sh | 4 +- .ci/monolithic-windows.sh | 4 +- 3 files changed, 94 insertions(+), 16 deletions(-) diff --git a/.ci/generate_test_report.py b/.ci/generate_test_report.py index ff601a0cde106..6f2137e7803bb 100644 --- a/.ci/generate_test_report.py +++ b/.ci/generate_test_report.py @@ -19,12 +19,13 @@ def junit_from_xml(xml): class TestReports(unittest.TestCase): def test_title_only(self): - self.assertEqual(_generate_report("Foo", []), ("", "success")) + self.assertEqual(_generate_report("Foo", 0, []), ("", "success")) def test_no_tests_in_testsuite(self): self.assertEqual( _generate_report( "Foo", + 1, [ junit_from_xml( dedent( @@ -45,6 +46,7 @@ def test_no_failures(self): self.assertEqual( _generate_report( "Foo", + 0, [ junit_from_xml( dedent( @@ -70,10 +72,51 @@ def test_no_failures(self): ), ) + def test_no_failures_build_failed(self): + self.assertEqual( + _generate_report( + "Foo", + 1, + [ + junit_from_xml( + dedent( + """\ + + + + + + """ + ) + ) + ], + buildkite_info={ + "BUILDKITE_ORGANIZATION_SLUG": "organization_slug", + "BUILDKITE_PIPELINE_SLUG": "pipeline_slug", + "BUILDKITE_BUILD_NUMBER": "build_number", + "BUILDKITE_JOB_ID": "job_id", + }, + ), + ( + dedent( + """\ + # Foo + + * 1 test passed + + All tests passed but another part of the build **failed**. + + [Download](https://buildkite.com/organizations/organization_slug/pipelines/pipeline_slug/builds/build_number/jobs/job_id/download.txt) the build's log file to see the details.""" + ), + "error", + ), + ) + def test_report_single_file_single_testsuite(self): self.assertEqual( _generate_report( "Foo", + 1, [ junit_from_xml( dedent( @@ -166,6 +209,7 @@ def test_report_single_file_multiple_testsuites(self): self.assertEqual( _generate_report( "ABC and DEF", + 1, [ junit_from_xml( dedent( @@ -198,6 +242,7 @@ def test_report_multiple_files_multiple_testsuites(self): self.assertEqual( _generate_report( "ABC and DEF", + 1, [ junit_from_xml( dedent( @@ -238,6 +283,7 @@ def test_report_dont_list_failures(self): self.assertEqual( _generate_report( "Foo", + 1, [ junit_from_xml( dedent( @@ -272,6 +318,7 @@ def test_report_dont_list_failures_link_to_log(self): self.assertEqual( _generate_report( "Foo", + 1, [ junit_from_xml( dedent( @@ -312,6 +359,7 @@ def test_report_size_limit(self): self.assertEqual( _generate_report( "Foo", + 1, [ junit_from_xml( dedent( @@ -351,12 +399,18 @@ def test_report_size_limit(self): # and output will not be. def _generate_report( title, + return_code, junit_objects, size_limit=1024 * 1024, list_failures=True, buildkite_info=None, ): if not junit_objects: + # Note that we do not post an empty report, therefore we can ignore a + # non-zero return code in situations like this. + # + # If we were going to post a report, then yes, it would be misleading + # to say we succeeded when the final return code was non-zero. return ("", "success") failures = {} @@ -385,7 +439,11 @@ def _generate_report( if not tests_run: return ("", None) - style = "error" if tests_failed else "success" + style = "success" + # Either tests failed, or all tests passed but something failed to build. 
+ if tests_failed or return_code != 0: + style = "error" + report = [f"# {title}", ""] tests_passed = tests_run - tests_skipped - tests_failed @@ -400,17 +458,17 @@ def plural(num_tests): if tests_failed: report.append(f"* {tests_failed} {plural(tests_failed)} failed") - if not list_failures: - if buildkite_info is not None: - log_url = ( - "https://buildkite.com/organizations/{BUILDKITE_ORGANIZATION_SLUG}/" - "pipelines/{BUILDKITE_PIPELINE_SLUG}/builds/{BUILDKITE_BUILD_NUMBER}/" - "jobs/{BUILDKITE_JOB_ID}/download.txt".format(**buildkite_info) - ) - download_text = f"[Download]({log_url})" - else: - download_text = "Download" + if buildkite_info is not None: + log_url = ( + "https://buildkite.com/organizations/{BUILDKITE_ORGANIZATION_SLUG}/" + "pipelines/{BUILDKITE_PIPELINE_SLUG}/builds/{BUILDKITE_BUILD_NUMBER}/" + "jobs/{BUILDKITE_JOB_ID}/download.txt".format(**buildkite_info) + ) + download_text = f"[Download]({log_url})" + else: + download_text = "Download" + if not list_failures: report.extend( [ "", @@ -435,11 +493,23 @@ def plural(num_tests): "", ] ) + elif return_code != 0: + # No tests failed but the build was in a failed state. Bring this to the user's + # attention. + report.extend( + [ + "", + "All tests passed but another part of the build **failed**.", + "", + f"{download_text} the build's log file to see the details.", + ] + ) report = "\n".join(report) if len(report.encode("utf-8")) > size_limit: return _generate_report( title, + return_code, junit_objects, size_limit, list_failures=False, @@ -449,9 +519,10 @@ def plural(num_tests): return report, style -def generate_report(title, junit_files, buildkite_info): +def generate_report(title, return_code, junit_files, buildkite_info): return _generate_report( title, + return_code, [JUnitXml.fromfile(p) for p in junit_files], buildkite_info=buildkite_info, ) @@ -463,6 +534,7 @@ def generate_report(title, junit_files, buildkite_info): "title", help="Title of the test report, without Markdown formatting." ) parser.add_argument("context", help="Annotation context to write to.") + parser.add_argument("return_code", help="The build's return code.", type=int) parser.add_argument("junit_files", help="Paths to JUnit report files.", nargs="*") args = parser.parse_args() @@ -477,7 +549,9 @@ def generate_report(title, junit_files, buildkite_info): if len(buildkite_info) != len(env_var_names): buildkite_info = None - report, style = generate_report(args.title, args.junit_files, buildkite_info) + report, style = generate_report( + args.title, args.return_code, args.junit_files, buildkite_info + ) if report: p = subprocess.Popen( diff --git a/.ci/monolithic-linux.sh b/.ci/monolithic-linux.sh index 4bfebd5f75279..55741bc831046 100755 --- a/.ci/monolithic-linux.sh +++ b/.ci/monolithic-linux.sh @@ -29,6 +29,8 @@ if [[ -n "${CLEAR_CACHE:-}" ]]; then fi function at-exit { + retcode=$? + mkdir -p artifacts ccache --print-stats > artifacts/ccache_stats.txt @@ -37,7 +39,7 @@ function at-exit { if command -v buildkite-agent 2>&1 >/dev/null then python3 "${MONOREPO_ROOT}"/.ci/generate_test_report.py ":linux: Linux x64 Test Results" \ - "linux-x64-test-results" "${BUILD_DIR}"/test-results.*.xml + "linux-x64-test-results" $retcode "${BUILD_DIR}"/test-results.*.xml fi } trap at-exit EXIT diff --git a/.ci/monolithic-windows.sh b/.ci/monolithic-windows.sh index 25cdd2f419f47..68303a3ea153a 100755 --- a/.ci/monolithic-windows.sh +++ b/.ci/monolithic-windows.sh @@ -28,6 +28,8 @@ fi sccache --zero-stats function at-exit { + retcode=$? 
+ mkdir -p artifacts sccache --show-stats >> artifacts/sccache_stats.txt @@ -36,7 +38,7 @@ function at-exit { if command -v buildkite-agent 2>&1 >/dev/null then python "${MONOREPO_ROOT}"/.ci/generate_test_report.py ":windows: Windows x64 Test Results" \ - "windows-x64-test-results" "${BUILD_DIR}"/test-results.*.xml + "windows-x64-test-results" $retcode "${BUILD_DIR}"/test-results.*.xml fi } trap at-exit EXIT From d03f35f9b6d031d6a9375d90ccf7cc285f8e4b79 Mon Sep 17 00:00:00 2001 From: xiaoleis-nv <99947620+xiaoleis-nv@users.noreply.github.com> Date: Mon, 13 Jan 2025 17:33:05 +0800 Subject: [PATCH 238/408] [MLIR][NVVM] Fix the datatype error for nvvm.mma.sync when the operand is bf16 (#122664) The PR fixes the datatype error for `nvvm.mma.sync` when the operand is `bf16`. This operation originally requires the A/B type to be `f16x2` for the `bf16` MMA. However, it violates the NVVM intrinsic [[here](https://github.com/xiaoleis-nv/llvm-project/blob/372044ee09d39942925824f8f335aef40bfe92f0/llvm/include/llvm/IR/IntrinsicsNVVM.td#L119)], where the A/B operand type should be `i32`. This is a bug, and there are no tests in MLIR that cover this datatype. ``` // mma bf16 -> s32 @ m16n8k16/m16n8k8 !eq(gft,"m16n8k16:a:bf16") : !listsplat(llvm_i32_ty, 4), !eq(gft,"m16n8k16:b:bf16") : !listsplat(llvm_i32_ty, 2), !eq(gft,"m16n8k8:a:bf16") : !listsplat(llvm_i32_ty, 2), !eq(gft,"m16n8k8:b:bf16") : [llvm_i32_ty], ``` This PR addresses this bug and adds tests to guarantee correctness. Co-authored-by: Xiaolei Shi --- mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 4 ++-- mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 7 ++++++- mlir/test/Dialect/LLVMIR/nvvm.mlir | 23 +++++++++++++++++++++ mlir/test/Target/LLVMIR/nvvmir.mlir | 12 +++++++++++ 4 files changed, 43 insertions(+), 3 deletions(-) diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 0b9097e9bbca2..04042903e343e 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -1699,8 +1699,8 @@ def NVVM_MmaOp : NVVM_Op<"mma.sync", [AttrSizedOperandSegments]> { | f16 | .m8n8k4 | row/col | row/col | 2x f16x2 | 2x f16x2 | 4x f16x2 or 8xf32 | | | .m16n8k8 | row | col | 2x f16x2 | 1x f16x2 | 2x f16x2 or 4 f32 | | | .m16n8k16 | row | col | 4x f16x2 | 2x f16x2 | 2x f16x2 or 4 f32 | - | bf16 | .m16n8k8 | row | col | 2x f16x2 | 1x f16x2 | 2x f16x2 or 4 f32 | - | | .m16n8k16 | row | col | 4x f16x2 | 2x f16x2 | 2x f16x2 or 4 f32 | + | bf16 | .m16n8k8 | row | col | 2x i32 | 1x i32 | 4x f32 | + | | .m16n8k16 | row | col | 4x i32 | 2x i32 | 4x f32 | | tf32 | .m16n8k4 | row | col | 2x i32 | 1x i32 | 4x f32 | | | .m16n8k8 | row | col | 4x i32 | 2x i32 | 2x f16x2 or 4 f32 | | u8/s8 | .m8n8k16 | row | col | 1x i32 | 1x i32 | 2x i32 | diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index 838159d676545..d8fde3e765ac4 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -445,8 +445,13 @@ LogicalResult MmaOp::verify() { expectedResult.push_back(LLVM::LLVMStructType::getLiteral( context, {f32Ty, f32Ty, f32Ty, f32Ty})); break; - case MMATypes::f16: case MMATypes::bf16: + kFactor = 8; + multiplicandFragType = i32Ty; + expectedResult.push_back(LLVM::LLVMStructType::getLiteral( + context, {f32Ty, f32Ty, f32Ty, f32Ty})); + break; + case MMATypes::f16: kFactor = 8; multiplicandFragType = f16x2Ty; expectedResult.push_back(f16x2x2StructTy); diff --git 
a/mlir/test/Dialect/LLVMIR/nvvm.mlir b/mlir/test/Dialect/LLVMIR/nvvm.mlir index a7bdceba01c1e..4c3b6648a41c0 100644 --- a/mlir/test/Dialect/LLVMIR/nvvm.mlir +++ b/mlir/test/Dialect/LLVMIR/nvvm.mlir @@ -163,6 +163,29 @@ func.func @nvvm_mma_m8n8k4_f16_f16(%a0 : vector<2xf16>, %a1 : vector<2xf16>, llvm.return %0 : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> } +// CHECK-LABEL: @nvvm_mma_m16n8k8_bf16_bf16 +func.func @nvvm_mma_m16n8k8_bf16_bf16(%a0 : i32, %a1 : i32, %b0 : i32, + %c0 : f32, %c1 : f32, %c2 : f32, %c3 : f32) { + // CHECK: nvvm.mma.sync A[{{.*}}] B[{{.*}}] C[{{.*}}] {layoutA = #nvvm.mma_layout, layoutB = #nvvm.mma_layout, multiplicandAPtxType = #nvvm.mma_type, multiplicandBPtxType = #nvvm.mma_type, shape = #nvvm.shape} : (i32, i32, f32) -> !llvm.struct<(f32, f32, f32, f32)> + %0 = nvvm.mma.sync A[%a0, %a1] B[%b0] C[%c0, %c1, %c2, %c3] + {layoutA = #nvvm.mma_layout, layoutB = #nvvm.mma_layout, + multiplicandAPtxType = #nvvm.mma_type, multiplicandBPtxType = #nvvm.mma_type, + shape = #nvvm.shape} : (i32, i32, f32) -> !llvm.struct<(f32, f32, f32, f32)> + llvm.return %0 : !llvm.struct<(f32, f32, f32, f32)> +} + +// CHECK-LABEL: @nvvm_mma_m16n8k16_bf16_bf16 +func.func @nvvm_mma_m16n8k16_bf16_bf16(%a0 : i32, %a1 : i32, %a2 : i32, %a3 : i32, + %b0 : i32, %b1 : i32, + %c0 : f32, %c1 : f32, %c2 : f32, %c3 : f32) { + // CHECK: nvvm.mma.sync A[{{.*}}] B[{{.*}}] C[{{.*}}] {layoutA = #nvvm.mma_layout, layoutB = #nvvm.mma_layout, multiplicandAPtxType = #nvvm.mma_type, multiplicandBPtxType = #nvvm.mma_type, shape = #nvvm.shape} : (i32, i32, f32) -> !llvm.struct<(f32, f32, f32, f32)> + %0 = nvvm.mma.sync A[%a0, %a1, %a2, %a3] B[%b0, %b1] C[%c0, %c1, %c2, %c3] + {layoutA = #nvvm.mma_layout, layoutB = #nvvm.mma_layout, + multiplicandAPtxType = #nvvm.mma_type, multiplicandBPtxType = #nvvm.mma_type, + shape = #nvvm.shape} : (i32, i32, f32) -> !llvm.struct<(f32, f32, f32, f32)> + llvm.return %0 : !llvm.struct<(f32, f32, f32, f32)> +} + // CHECK-LABEL: @nvvm_mma_m8n8k16_s8_s8 func.func @nvvm_mma_m8n8k16_s8_s8(%a0 : i32, %b0 : i32, %c0 : i32, %c1 : i32) { diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir index 2d7710e7cbf27..09e98765413f0 100644 --- a/mlir/test/Target/LLVMIR/nvvmir.mlir +++ b/mlir/test/Target/LLVMIR/nvvmir.mlir @@ -291,6 +291,18 @@ llvm.func @nvvm_mma_m16n8k16_f16_f16(%a0 : vector<2xf16>, %a1 : vector<2xf16>, llvm.return %0 : !llvm.struct<(vector<2xf16>, vector<2xf16>)> } +// CHECK-LABEL: @nvvm_mma_m16n8k16_bf16_bf16 +llvm.func @nvvm_mma_m16n8k16_bf16_bf16(%a0 : i32, %a1 : i32, %a2 : i32, %a3 : i32, + %b0 : i32, %b1 : i32, + %c0 : f32, %c1 : f32, %c2 : f32, %c3 : f32) -> !llvm.struct<(f32, f32, f32, f32)> { + // CHECK: call { float, float, float, float } @llvm.nvvm.mma.m16n8k16.row.col.bf16 + %0 = nvvm.mma.sync A[%a0, %a1, %a2, %a3] B[%b0, %b1] C[%c0, %c1, %c2, %c3] + {layoutA = #nvvm.mma_layout, layoutB = #nvvm.mma_layout, + multiplicandAPtxType = #nvvm.mma_type, multiplicandBPtxType = #nvvm.mma_type, + shape = #nvvm.shape} : (i32, i32, f32) -> !llvm.struct<(f32, f32, f32, f32)> + llvm.return %0 : !llvm.struct<(f32, f32, f32, f32)> +} + // f32 return type, f16 accumulate type // CHECK-LABEL: @nvvm_mma_m16n8k16_f32_f16 llvm.func @nvvm_mma_m16n8k16_f32_f16(%a0 : vector<2xf16>, %a1 : vector<2xf16>, From e2a071ece58790f8dd4886e998033cab82e906fb Mon Sep 17 00:00:00 2001 From: Oliver Stannard Date: Mon, 13 Jan 2025 09:55:08 +0000 Subject: [PATCH 239/408] [MachineCP] Correctly handle register masks and sub-registers (#122472) 
When passing an instruction with a register mask, the machine copy propagation pass was dropping the information about some copy instructions which define a register which is preserved by the mask, because that register overlaps a register which is partially clobbered by it. This resulted in a miscompilation for AArch64, because this caused a live copy to be considered dead. The fix is to clobber register masks by finding the set of reg units which is preserved by the mask, and clobbering all units not in that set. --- llvm/lib/CodeGen/MachineCopyPropagation.cpp | 136 ++++++++++-------- .../CodeGen/AArch64/machine-cp-sub-reg.mir | 32 ++++- 2 files changed, 111 insertions(+), 57 deletions(-) diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index 49ce4b660c3ae..d2579e2d1b44c 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -164,67 +164,91 @@ class CopyTracker { Copies.erase(Unit); } - /// Clobber a single register, removing it from the tracker's copy maps. - void clobberRegister(MCRegister Reg, const TargetRegisterInfo &TRI, - const TargetInstrInfo &TII, bool UseCopyInstr) { - for (MCRegUnit Unit : TRI.regunits(Reg)) { - auto I = Copies.find(Unit); - if (I != Copies.end()) { - // When we clobber the source of a copy, we need to clobber everything - // it defined. - markRegsUnavailable(I->second.DefRegs, TRI); - // When we clobber the destination of a copy, we need to clobber the - // whole register it defined. - if (MachineInstr *MI = I->second.MI) { - std::optional CopyOperands = - isCopyInstr(*MI, TII, UseCopyInstr); - - MCRegister Def = CopyOperands->Destination->getReg().asMCReg(); - MCRegister Src = CopyOperands->Source->getReg().asMCReg(); - - markRegsUnavailable(Def, TRI); - - // Since we clobber the destination of a copy, the semantic of Src's - // "DefRegs" to contain Def is no longer effectual. We will also need - // to remove the record from the copy maps that indicates Src defined - // Def. Failing to do so might cause the target to miss some - // opportunities to further eliminate redundant copy instructions. - // Consider the following sequence during the - // ForwardCopyPropagateBlock procedure: - // L1: r0 = COPY r9 <- TrackMI - // L2: r0 = COPY r8 <- TrackMI (Remove r9 defined r0 from tracker) - // L3: use r0 <- Remove L2 from MaybeDeadCopies - // L4: early-clobber r9 <- Clobber r9 (L2 is still valid in tracker) - // L5: r0 = COPY r8 <- Remove NopCopy - for (MCRegUnit SrcUnit : TRI.regunits(Src)) { - auto SrcCopy = Copies.find(SrcUnit); - if (SrcCopy != Copies.end() && SrcCopy->second.LastSeenUseInCopy) { - // If SrcCopy defines multiple values, we only need - // to erase the record for Def in DefRegs. - for (auto itr = SrcCopy->second.DefRegs.begin(); - itr != SrcCopy->second.DefRegs.end(); itr++) { - if (*itr == Def) { - SrcCopy->second.DefRegs.erase(itr); - // If DefReg becomes empty after removal, we can remove the - // SrcCopy from the tracker's copy maps. We only remove those - // entries solely record the Def is defined by Src. If an - // entry also contains the definition record of other Def' - // registers, it cannot be cleared. - if (SrcCopy->second.DefRegs.empty() && !SrcCopy->second.MI) { - Copies.erase(SrcCopy); - } - break; + /// Clobber a single register unit, removing it from the tracker's copy maps. 
+ void clobberRegUnit(MCRegUnit Unit, const TargetRegisterInfo &TRI, + const TargetInstrInfo &TII, bool UseCopyInstr) { + auto I = Copies.find(Unit); + if (I != Copies.end()) { + // When we clobber the source of a copy, we need to clobber everything + // it defined. + markRegsUnavailable(I->second.DefRegs, TRI); + // When we clobber the destination of a copy, we need to clobber the + // whole register it defined. + if (MachineInstr *MI = I->second.MI) { + std::optional CopyOperands = + isCopyInstr(*MI, TII, UseCopyInstr); + + MCRegister Def = CopyOperands->Destination->getReg().asMCReg(); + MCRegister Src = CopyOperands->Source->getReg().asMCReg(); + + markRegsUnavailable(Def, TRI); + + // Since we clobber the destination of a copy, the semantic of Src's + // "DefRegs" to contain Def is no longer effectual. We will also need + // to remove the record from the copy maps that indicates Src defined + // Def. Failing to do so might cause the target to miss some + // opportunities to further eliminate redundant copy instructions. + // Consider the following sequence during the + // ForwardCopyPropagateBlock procedure: + // L1: r0 = COPY r9 <- TrackMI + // L2: r0 = COPY r8 <- TrackMI (Remove r9 defined r0 from tracker) + // L3: use r0 <- Remove L2 from MaybeDeadCopies + // L4: early-clobber r9 <- Clobber r9 (L2 is still valid in tracker) + // L5: r0 = COPY r8 <- Remove NopCopy + for (MCRegUnit SrcUnit : TRI.regunits(Src)) { + auto SrcCopy = Copies.find(SrcUnit); + if (SrcCopy != Copies.end() && SrcCopy->second.LastSeenUseInCopy) { + // If SrcCopy defines multiple values, we only need + // to erase the record for Def in DefRegs. + for (auto itr = SrcCopy->second.DefRegs.begin(); + itr != SrcCopy->second.DefRegs.end(); itr++) { + if (*itr == Def) { + SrcCopy->second.DefRegs.erase(itr); + // If DefReg becomes empty after removal, we can remove the + // SrcCopy from the tracker's copy maps. We only remove those + // entries solely record the Def is defined by Src. If an + // entry also contains the definition record of other Def' + // registers, it cannot be cleared. + if (SrcCopy->second.DefRegs.empty() && !SrcCopy->second.MI) { + Copies.erase(SrcCopy); } + break; } } } } - // Now we can erase the copy. - Copies.erase(I); } + // Now we can erase the copy. + Copies.erase(I); } } + /// Clobber a single register, removing it from the tracker's copy maps. + void clobberRegister(MCRegister Reg, const TargetRegisterInfo &TRI, + const TargetInstrInfo &TII, bool UseCopyInstr) { + for (MCRegUnit Unit : TRI.regunits(Reg)) { + clobberRegUnit(Unit, TRI, TII, UseCopyInstr); + } + } + + /// Clobber all registers which are not preserved by RegMask, removing them + /// from the tracker's copy maps. + void clobberRegistersExceptMask(const MachineOperand *RegMask, + const TargetRegisterInfo &TRI, + const TargetInstrInfo &TII, + bool UseCopyInstr) { + BitVector SafeRegUnits(TRI.getNumRegUnits()); + + for (unsigned SafeReg = 0, E = TRI.getNumRegs(); SafeReg < E; ++SafeReg) + if (!RegMask->clobbersPhysReg(SafeReg)) + for (auto SafeUnit : TRI.regunits(SafeReg)) + SafeRegUnits.set(SafeUnit); + + for (unsigned Unit = 0, E = TRI.getNumRegUnits(); Unit < E; ++Unit) + if (!SafeRegUnits.test(Unit)) + clobberRegUnit(Unit, TRI, TII, UseCopyInstr); + } + /// Track copy's src users, and return false if that can't be done. /// We can only track if we have a COPY instruction which source is /// the same as the Reg. 
@@ -960,6 +984,10 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) { // a large set of registers. Treat clobbered registers the same way as // defined registers. if (RegMask) { + // Invalidate all entries in the copy map which are not preserved by this + // register mask. + Tracker.clobberRegistersExceptMask(RegMask, *TRI, *TII, UseCopyInstr); + // Erase any MaybeDeadCopies whose destination register is clobbered. for (SmallSetVector<MachineInstr *, 8>::iterator DI = MaybeDeadCopies.begin(); @@ -978,10 +1006,6 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) { LLVM_DEBUG(dbgs() << "MCP: Removing copy due to regmask clobbering: "; MaybeDead->dump()); - // Make sure we invalidate any entries in the copy maps before erasing - // the instruction. - Tracker.clobberRegister(Reg, *TRI, *TII, UseCopyInstr); - // erase() will return the next valid iterator pointing to the next // element after the erased one. DI = MaybeDeadCopies.erase(DI); diff --git a/llvm/test/CodeGen/AArch64/machine-cp-sub-reg.mir b/llvm/test/CodeGen/AArch64/machine-cp-sub-reg.mir index 5b379c2bd5629..e7865569c75bd 100644 --- a/llvm/test/CodeGen/AArch64/machine-cp-sub-reg.mir +++ b/llvm/test/CodeGen/AArch64/machine-cp-sub-reg.mir @@ -1,5 +1,16 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 -# RUN: llc -o - %s --run-pass=machine-cp -mcp-use-is-copy-instr -mtriple=arm64-apple-macos --verify-machineinstrs | FileCheck %s +# RUN: llc -o - %s --run-pass=machine-cp -mcp-use-is-copy-instr -mtriple=arm64-apple-macos | FileCheck %s + +--- | + declare void @foo() + + define void @test() { + unreachable + } + define void @test2() { + unreachable + } +... --- name: test @@ -30,3 +41,22 @@ body: | RET undef $lr, implicit $x0 ... +--- +name: test2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $q14, $d29, $x0, $x1 + ; CHECK-LABEL: name: test2 + ; CHECK: liveins: $q14, $d29, $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $d8 = COPY killed renamable $d29 + ; CHECK-NEXT: BL @foo, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: renamable $b0 = SMAXVv8i8v killed renamable $d8, implicit-def $q0 + ; CHECK-NEXT: RET_ReallyLR implicit $b0 + renamable $q8 = COPY renamable $q14 + renamable $d8 = COPY killed renamable $d29 + BL @foo, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + renamable $b0 = SMAXVv8i8v killed renamable $d8, implicit-def $q0 + RET_ReallyLR implicit $b0 +...
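Editorial aside (not part of the patch): the following is a minimal, self-contained sketch of the unit-level idea behind the new `clobberRegistersExceptMask` above. It assumes LLVM's `MachineOperand::clobbersPhysReg` and `TargetRegisterInfo::regunits` APIs; the helper name `collectPreservedUnits` is invented for this illustration.

```cpp
#include "llvm/ADT/BitVector.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"

using namespace llvm;

// Sketch: compute the set of register units a register mask preserves.
// Working at the unit level is the point of the fix: a unit stays live if
// any register containing it is preserved, even when another register that
// overlaps the same unit is partially clobbered by the mask.
static BitVector collectPreservedUnits(const MachineOperand &RegMask,
                                       const TargetRegisterInfo &TRI) {
  BitVector Preserved(TRI.getNumRegUnits());
  for (unsigned Reg = 1, E = TRI.getNumRegs(); Reg < E; ++Reg)
    if (!RegMask.clobbersPhysReg(Reg))
      for (MCRegUnit Unit : TRI.regunits(MCRegister(Reg)))
        Preserved.set(Unit);
  return Preserved; // every unit left unset here must be clobbered
}
```

Every unit not in the result is then invalidated one unit at a time, which is what the new `clobberRegUnit` hook makes possible.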
From 16923da241377b05cf485dcca07f2b00df6bf500 Mon Sep 17 00:00:00 2001 From: xtex Date: Mon, 13 Jan 2025 10:12:23 +0000 Subject: [PATCH 240/408] Revert "[clang] Canonicalize absolute paths in dependency file" (#121638) Reverts llvm/llvm-project#117458 https://github.com/llvm/llvm-project/pull/117458#issuecomment-2568804774 https://github.com/ninja-build/ninja/issues/2528 --- clang/include/clang/Frontend/Utils.h | 1 - clang/lib/Frontend/DependencyFile.cpp | 20 +++---------------- clang/test/Frontend/dependency-gen-symlink.c | 2 +- .../dependency-gen-windows-duplicates.c | 2 +- clang/test/VFS/external-names.c | 2 +- 5 files changed, 6 insertions(+), 21 deletions(-) diff --git a/clang/include/clang/Frontend/Utils.h b/clang/include/clang/Frontend/Utils.h index 8ed17179c9824..604e42067a3f1 100644 --- a/clang/include/clang/Frontend/Utils.h +++ b/clang/include/clang/Frontend/Utils.h @@ -120,7 +120,6 @@ class DependencyFileGenerator : public DependencyCollector { private: void outputDependencyFile(DiagnosticsEngine &Diags); - llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> FS; std::string OutputFile; std::vector<std::string> Targets; bool IncludeSystemHeaders; diff --git a/clang/lib/Frontend/DependencyFile.cpp b/clang/lib/Frontend/DependencyFile.cpp index 8a36d835d82b3..15fa7de35df97 100644 --- a/clang/lib/Frontend/DependencyFile.cpp +++ b/clang/lib/Frontend/DependencyFile.cpp @@ -23,10 +23,8 @@ #include "llvm/ADT/StringSet.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" -#include "llvm/Support/VirtualFileSystem.h" #include "llvm/Support/raw_ostream.h" #include -#include using namespace clang; @@ -238,7 +236,6 @@ void DependencyFileGenerator::attachToPreprocessor(Preprocessor &PP) { PP.SetSuppressIncludeNotFoundError(true); DependencyCollector::attachToPreprocessor(PP); - FS = PP.getFileManager().getVirtualFileSystemPtr(); } bool DependencyFileGenerator::sawDependency(StringRef Filename, bool FromModule, @@ -315,22 +312,11 @@ void DependencyFileGenerator::finishedMainFile(DiagnosticsEngine &Diags) { /// https://msdn.microsoft.com/en-us/library/dd9y37ha.aspx for NMake info, /// https://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx /// for Windows file-naming info. -static void printFilename(raw_ostream &OS, llvm::vfs::FileSystem *FS, - StringRef Filename, +static void PrintFilename(raw_ostream &OS, StringRef Filename, DependencyOutputFormat OutputFormat) { // Convert filename to platform native path llvm::SmallString<256> NativePath; llvm::sys::path::native(Filename.str(), NativePath); - // Resolve absolute path. Make and Ninja canonicalize paths - // without checking for symbolic links in the path, for performance concerns. - // If there is something like `/bin/../lib64` -> `/usr/lib64` - // (where `/bin` links to `/usr/bin`), Make will see them as `/lib64`. - if (FS != nullptr && llvm::sys::path::is_absolute(NativePath)) { - llvm::SmallString<256> NativePathTmp = NativePath; - std::error_code EC = FS->getRealPath(NativePathTmp, NativePath); - if (EC) - NativePath = NativePathTmp; - } if (OutputFormat == DependencyOutputFormat::NMake) { // Add quotes if needed.
These are the characters listed as "special" to @@ -414,7 +400,7 @@ void DependencyFileGenerator::outputDependencyFile(llvm::raw_ostream &OS) { Columns = 2; } OS << ' '; - printFilename(OS, FS.get(), File, OutputFormat); + PrintFilename(OS, File, OutputFormat); Columns += N + 1; } OS << '\n'; @@ -425,7 +411,7 @@ void DependencyFileGenerator::outputDependencyFile(llvm::raw_ostream &OS) { for (auto I = Files.begin(), E = Files.end(); I != E; ++I) { if (Index++ == InputFileIndex) continue; - printFilename(OS, FS.get(), *I, OutputFormat); + PrintFilename(OS, *I, OutputFormat); OS << ":\n"; } } diff --git a/clang/test/Frontend/dependency-gen-symlink.c b/clang/test/Frontend/dependency-gen-symlink.c index 15664a46b90c8..2fa339ad2abf2 100644 --- a/clang/test/Frontend/dependency-gen-symlink.c +++ b/clang/test/Frontend/dependency-gen-symlink.c @@ -15,7 +15,7 @@ // CHECK: dependency-gen-symlink.c.o // CHECK: dependency-gen-symlink.c // CHECK: a/header.h -// CHECK-NOT: b/header.h +// CHECK: b/header.h // CHECK-NOT: with-header-guard.h #include "a/header.h" #include "b/header.h" diff --git a/clang/test/Frontend/dependency-gen-windows-duplicates.c b/clang/test/Frontend/dependency-gen-windows-duplicates.c index 0ecc23226fb9c..abd351377dc33 100644 --- a/clang/test/Frontend/dependency-gen-windows-duplicates.c +++ b/clang/test/Frontend/dependency-gen-windows-duplicates.c @@ -9,7 +9,7 @@ // RUN: %clang -MD -MF - %t.dir/test.c -fsyntax-only -I %t.dir/subdir | FileCheck %s // CHECK: test.o: // CHECK-NEXT: \test.c -// CHECK-NEXT: \subdir\x.h +// CHECK-NEXT: \SubDir\X.h // File x.h must appear only once (case insensitive check). // CHECK-NOT: {{\\|/}}{{x|X}}.{{h|H}} diff --git a/clang/test/VFS/external-names.c b/clang/test/VFS/external-names.c index dd0b5eb501840..5b7c443b36e56 100644 --- a/clang/test/VFS/external-names.c +++ b/clang/test/VFS/external-names.c @@ -47,4 +47,4 @@ // RUN: %clang_cc1 -D REINCLUDE -I %t -ivfsoverlay %t.yaml -Eonly %s -MTfoo -dependency-file %t.dep // RUN: cat %t.dep | FileCheck --check-prefix=CHECK-DEP %s -// CHECK-DEP: Inputs{{..?}}external-names.h +// CHECK-DEP-NOT: Inputs From c2979c58d49bf3c7dc892ed9fb49cdca389130ee Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Mon, 13 Jan 2025 11:24:02 +0100 Subject: [PATCH 241/408] [Clang] Add release note for pointer overflow optimization change (#122462) Add a release note for optimization change related to pointer overflow checks. I've put this in the breaking changes section to give it the best chance of being seen. --- clang/docs/ReleaseNotes.rst | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index a14fb189c8e13..8f4adbcd70518 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -58,6 +58,29 @@ code bases. containing strict-aliasing violations. The new default behavior can be disabled using ``-fno-pointer-tbaa``. +- Clang will now more aggressively use undefined behavior on pointer addition + overflow for optimization purposes. For example, a check like + ``ptr + unsigned_offset < ptr`` will now optimize to ``false``, because + ``ptr + unsigned_offset`` will cause undefined behavior if it overflows (or + advances past the end of the object). + + Previously, ``ptr + unsigned_offset < ptr`` was optimized (by both Clang and + GCC) to ``(ssize_t)unsigned_offset < 0``. This also results in an incorrect + overflow check, but in a way that is less apparent when only testing with + pointers in the low half of the address space. 
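Editorial aside (not part of the diff): a compilable illustration of the overflow check this release note describes; the note's own recommended rewrites continue below. The function names wraps_ub and wraps_ok are invented for this sketch.

```cpp
#include <cstddef>
#include <cstdint>

// Undefined behavior: if ptr + n overflows or advances past the end of the
// object, the addition itself is UB, so the comparison may now be folded
// to false by the optimizer.
bool wraps_ub(const char *ptr, std::size_t n) {
  return ptr + n < ptr; // may be optimized to "return false;"
}

// Well-defined rewrite: perform the addition on integers, not pointers.
bool wraps_ok(const char *ptr, std::size_t n) {
  return (std::uintptr_t)ptr + n < (std::uintptr_t)ptr;
}
```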
+ + To avoid pointer addition overflow, it is necessary to perform the addition + on integers, for example using + ``(uintptr_t)ptr + unsigned_offset < (uintptr_t)ptr``. Sometimes, it is also + possible to rewrite checks by only comparing the offset. For example, + ``ptr + offset < end_ptr && ptr + offset >= ptr`` can be written as + ``offset < (uintptr_t)(end_ptr - ptr)``. + + Undefined behavior due to pointer addition overflow can be reliably detected + using ``-fsanitize=pointer-overflow``. It is also possible to use + ``-fno-strict-overflow`` to opt-in to a language dialect where signed integer + and pointer overflow are well-defined. + C/C++ Language Potentially Breaking Changes ------------------------------------------- From b5987157e86b3ef87b8ed95f737e0a016974c793 Mon Sep 17 00:00:00 2001 From: Kareem Ergawy Date: Mon, 13 Jan 2025 11:27:23 +0100 Subject: [PATCH 242/408] [flang][OpenMP] Fix `omp-declarative-allocate-align.f90` expectations (#122675) The test was effectively a no-op since we used `//` instead of `!` for `RUN` and `CHECK` lines. Also, we have to specify the proper OpenMP version. --- .../test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 b/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 index d0ed0cbb4c831..8daf20e1ae400 100644 --- a/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 +++ b/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 @@ -1,10 +1,10 @@ ! This test checks lowering of OpenMP allocate Directive with align clause. -// RUN: not flang -fc1 -emit-fir -fopenmp %s 2>&1 | FileCheck %s +! RUN: not %flang_fc1 -emit-fir -fopenmp -fopenmp-version=51 %s 2>&1 | FileCheck %s program main integer :: x - // CHECK: not yet implemented: OpenMPDeclarativeAllocate + ! CHECK: not yet implemented: OpenMPDeclarativeAllocate !$omp allocate(x) align(32) end From a3b3c26048e1e9397cf412b07f09f82fe49e351e Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Mon, 13 Jan 2025 10:30:55 +0000 Subject: [PATCH 243/408] [TableGen] Use assert instead of PrintFatalError in TGLexer. NFC. (#122303) Do not use the PrintFatalError diagnostic machinery for conditions that can never happen with any input. 
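Editorial aside (not from the patch): a minimal sketch of the convention this change enforces. Conditions reachable from user-written .td input get a real diagnostic through `PrintFatalError`, while conditions that would indicate a bug in TableGen itself become assertions; `handleDirective` is a hypothetical example function.

```cpp
#include <cassert>
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/TableGen/Error.h"

enum class PrepKind { Ifdef, Else, Endif };

void handleDirective(llvm::StringRef Word, PrepKind TopOfStack) {
  // Reachable from any malformed input file: report it as a user error.
  if (Word != "ifdef" && Word != "else" && Word != "endif")
    llvm::PrintFatalError("unsupported preprocessing directive '" +
                          llvm::Twine(Word) + "'");
  // Invariant maintained by the caller, impossible for any input: an
  // assert documents it without paying for a diagnostic in release builds.
  assert((TopOfStack == PrepKind::Ifdef || TopOfStack == PrepKind::Else) &&
         "invalid preprocessor control on the stack");
}
```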
--- llvm/lib/TableGen/TGLexer.cpp | 63 +++++++++++++---------------------- llvm/lib/TableGen/TGLexer.h | 5 ++- 2 files changed, 25 insertions(+), 43 deletions(-) diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp index e23aec6efba59..c423023077cd8 100644 --- a/llvm/lib/TableGen/TGLexer.cpp +++ b/llvm/lib/TableGen/TGLexer.cpp @@ -235,8 +235,7 @@ tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) { return tgtok::dot; case '\r': - PrintFatalError("getNextChar() must never return '\r'"); - return tgtok::Error; + llvm_unreachable("getNextChar() must never return '\r'"); case ' ': case '\t': @@ -664,11 +663,10 @@ bool TGLexer::prepExitInclude(bool IncludeStackMustBeEmpty) { PrepIncludeStack.pop_back(); if (IncludeStackMustBeEmpty) { - if (!PrepIncludeStack.empty()) - PrintFatalError("preprocessor include stack is not empty"); + assert(PrepIncludeStack.empty() && + "preprocessor include stack is not empty"); } else { - if (PrepIncludeStack.empty()) - PrintFatalError("preprocessor include stack is empty"); + assert(!PrepIncludeStack.empty() && "preprocessor include stack is empty"); } return true; @@ -718,27 +716,25 @@ tgtok::TokKind TGLexer::prepIsDirective() const { return tgtok::Error; } -bool TGLexer::prepEatPreprocessorDirective(tgtok::TokKind Kind) { +void TGLexer::prepEatPreprocessorDirective(tgtok::TokKind Kind) { TokStart = CurPtr; - for (const auto [PKind, PWord] : PreprocessorDirs) + for (const auto [PKind, PWord] : PreprocessorDirs) { if (PKind == Kind) { // Advance CurPtr to the end of the preprocessing word. CurPtr += PWord.size(); - return true; + return; } + } - PrintFatalError("unsupported preprocessing token in " - "prepEatPreprocessorDirective()"); - return false; + llvm_unreachable( + "unsupported preprocessing token in prepEatPreprocessorDirective()"); } tgtok::TokKind TGLexer::lexPreprocessor(tgtok::TokKind Kind, bool ReturnNextLiveToken) { // We must be looking at a preprocessing directive. Eat it! 
- if (!prepEatPreprocessorDirective(Kind)) - PrintFatalError("lexPreprocessor() called for unknown " - "preprocessor directive"); + prepEatPreprocessorDirective(Kind); if (Kind == tgtok::Ifdef || Kind == tgtok::Ifndef) { StringRef MacroName = prepLexMacroName(); @@ -820,11 +816,9 @@ tgtok::TokKind TGLexer::lexPreprocessor(tgtok::TokKind Kind, auto &IfdefOrElseEntry = PrepIncludeStack.back().back(); - if (IfdefOrElseEntry.Kind != tgtok::Ifdef && - IfdefOrElseEntry.Kind != tgtok::Else) { - PrintFatalError("invalid preprocessor control on the stack"); - return tgtok::Error; - } + assert((IfdefOrElseEntry.Kind == tgtok::Ifdef || + IfdefOrElseEntry.Kind == tgtok::Else) && + "invalid preprocessor control on the stack"); if (!prepSkipDirectiveEnd()) return ReturnError(CurPtr, "only comments are supported after #endif"); @@ -852,21 +846,17 @@ tgtok::TokKind TGLexer::lexPreprocessor(tgtok::TokKind Kind, return ReturnError(CurPtr, "only comments are supported after #define NAME"); - if (!ReturnNextLiveToken) { - PrintFatalError("#define must be ignored during the lines skipping"); - return tgtok::Error; - } + assert(ReturnNextLiveToken && + "#define must be ignored during the lines skipping"); return LexToken(); } - PrintFatalError("preprocessing directive is not supported"); - return tgtok::Error; + llvm_unreachable("preprocessing directive is not supported"); } bool TGLexer::prepSkipRegion(bool MustNeverBeFalse) { - if (!MustNeverBeFalse) - PrintFatalError("invalid recursion."); + assert(MustNeverBeFalse && "invalid recursion."); do { // Skip all symbols to the line end. @@ -902,20 +892,17 @@ bool TGLexer::prepSkipRegion(bool MustNeverBeFalse) { if (ProcessedKind == tgtok::Error) return false; - if (Kind != ProcessedKind) - PrintFatalError("prepIsDirective() and lexPreprocessor() " - "returned different token kinds"); + assert(Kind == ProcessedKind && "prepIsDirective() and lexPreprocessor() " + "returned different token kinds"); // If this preprocessing directive enables tokens processing, // then return to the lexPreprocessor() and get to the next token. // We can move from line-skipping mode to processing tokens only // due to #else or #endif. if (prepIsProcessingEnabled()) { - if (Kind != tgtok::Else && Kind != tgtok::Endif) { - PrintFatalError("tokens processing was enabled by an unexpected " - "preprocessing directive"); - return false; - } + assert((Kind == tgtok::Else || Kind == tgtok::Endif) && + "tokens processing was enabled by an unexpected preprocessing " + "directive"); return true; } @@ -1053,10 +1040,6 @@ bool TGLexer::prepIsProcessingEnabled() { } void TGLexer::prepReportPreprocessorStackError() { - if (PrepIncludeStack.back().empty()) - PrintFatalError("prepReportPreprocessorStackError() called with " - "empty control stack"); - auto &PrepControl = PrepIncludeStack.back().back(); PrintError(CurBuf.end(), "reached EOF without matching #endif"); PrintError(PrepControl.SrcPos, "the latest preprocessor control is here"); diff --git a/llvm/lib/TableGen/TGLexer.h b/llvm/lib/TableGen/TGLexer.h index f8b32dc5377f5..bac583c4e33a1 100644 --- a/llvm/lib/TableGen/TGLexer.h +++ b/llvm/lib/TableGen/TGLexer.h @@ -347,14 +347,13 @@ class TGLexer { tgtok::TokKind prepIsDirective() const; // Given a preprocessing token kind, adjusts CurPtr to the end - // of the preprocessing directive word. Returns true, unless - // an unsupported token kind is passed in. + // of the preprocessing directive word. 
// // We use look-ahead prepIsDirective() and prepEatPreprocessorDirective() // to avoid adjusting CurPtr before we are sure that '#' is followed // by a preprocessing directive. If it is not, then we fall back to // tgtok::paste interpretation of '#'. - bool prepEatPreprocessorDirective(tgtok::TokKind Kind); + void prepEatPreprocessorDirective(tgtok::TokKind Kind); // The main "exit" point from the token parsing to preprocessor. // From 7e01a322f850e86be9eefde8ae5a30e532d22cfa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20K=C3=A9ri?= Date: Mon, 13 Jan 2025 11:35:38 +0100 Subject: [PATCH 244/408] [clang][ASTImporter] Fix unused variable warning (NFC) (#122686) --- clang/unittests/AST/ASTImporterTest.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp index a0aaad6082d8c..791248e7a394f 100644 --- a/clang/unittests/AST/ASTImporterTest.cpp +++ b/clang/unittests/AST/ASTImporterTest.cpp @@ -10182,7 +10182,7 @@ TEST_P(ASTImporterOptionSpecificTestBase, ImportIntoReopenedNamespaceNoMatch1) { struct X { int A; }; } )"; - Decl *ToTU = getToTuDecl(ToCode, Lang_CXX11); + getToTuDecl(ToCode, Lang_CXX11); const char *Code = R"( namespace a { @@ -10205,7 +10205,7 @@ TEST_P(ASTImporterOptionSpecificTestBase, ImportIntoReopenedNamespaceNoMatch2) { namespace a { } )"; - Decl *ToTU = getToTuDecl(ToCode, Lang_CXX11); + getToTuDecl(ToCode, Lang_CXX11); const char *Code = R"( namespace a { From 7e2eb0f83e1cf6861c8fd1f038a88a8ddd851c34 Mon Sep 17 00:00:00 2001 From: Durgadoss R Date: Mon, 13 Jan 2025 16:17:42 +0530 Subject: [PATCH 245/408] [NVPTX] Add float to tf32 conversion intrinsics (#121507) This patch adds the missing variants of float to tf32 conversion intrinsics, with their corresponding lit tests. 
PTX Spec link: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt Signed-off-by: Durgadoss R --- llvm/include/llvm/IR/IntrinsicsNVVM.td | 10 ++++ llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 17 ++++++ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 5 -- llvm/test/CodeGen/NVPTX/convert-sm89.ll | 7 +++ llvm/test/CodeGen/NVPTX/convert-sm90.ll | 68 ++++++++++++++++++++++++ 5 files changed, 102 insertions(+), 5 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/convert-sm90.ll diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index ae04a130bc825..00a76018d8415 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -1438,6 +1438,16 @@ let TargetPrefix = "nvvm" in { def int_nvvm_f2tf32_rna : ClangBuiltin<"__nvvm_f2tf32_rna">, Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_f2tf32_rna_satfinite : ClangBuiltin<"__nvvm_f2tf32_rna_satfinite">, + Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_f2tf32_rn : ClangBuiltin<"__nvvm_f2tf32_rn">, + Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_f2tf32_rn_relu : ClangBuiltin<"__nvvm_f2tf32_rn_relu">, + Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_f2tf32_rz : ClangBuiltin<"__nvvm_f2tf32_rz">, + Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_f2tf32_rz_relu : ClangBuiltin<"__nvvm_f2tf32_rz_relu">, + Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>; def int_nvvm_ff_to_e4m3x2_rn : ClangBuiltin<"__nvvm_ff_to_e4m3x2_rn">, Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>; diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index c3e72d6ce3a3f..6a95d9ebef6c7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -725,6 +725,23 @@ let hasSideEffects = false in { def CVT_f16x2_e4m3x2 : CVT_f16x2_fp8<"e4m3">; def CVT_f16x2_e5m2x2 : CVT_f16x2_fp8<"e5m2">; + + // Float to TF32 conversions + multiclass CVT_TO_TF32<string Modifier, list<Predicate> Preds = [hasPTX<78>, hasSM<90>]> { + defvar Intr = !cast<Intrinsic>("int_nvvm_f2tf32_" # !subst(".", "_", Modifier)); + + def NAME : NVPTXInst<(outs Int32Regs:$dst), (ins Float32Regs:$src), + "cvt."
# Modifier # ".tf32.f32 \t$dst, $src;", + [(set i32:$dst, (Intr f32:$src))]>, + Requires<Preds>; + } + + defm CVT_to_tf32_rn : CVT_TO_TF32<"rn">; + defm CVT_to_tf32_rz : CVT_TO_TF32<"rz">; + defm CVT_to_tf32_rn_relu : CVT_TO_TF32<"rn.relu">; + defm CVT_to_tf32_rz_relu : CVT_TO_TF32<"rz.relu">; + defm CVT_to_tf32_rna : CVT_TO_TF32<"rna", [hasPTX<70>, hasSM<80>]>; + defm CVT_to_tf32_rna_satf : CVT_TO_TF32<"rna.satfinite", [hasPTX<81>, hasSM<89>]>; } def fpround_oneuse : PatFrag<(ops node:$a), (fpround node:$a), [{ diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 22339ebc5484f..4f144cc641080 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1722,11 +1722,6 @@ def : Pat<(int_nvvm_f2bf16_rz f32:$a), (CVT_bf16_f32 $a, CvtRZ)>; def : Pat<(int_nvvm_f2bf16_rz_relu f32:$a), (CVT_bf16_f32 $a, CvtRZ_RELU)>; -def CVT_tf32_f32 : - NVPTXInst<(outs Int32Regs:$dest), (ins Float32Regs:$a), - "cvt.rna.tf32.f32 \t$dest, $a;", - [(set i32:$dest, (int_nvvm_f2tf32_rna f32:$a))]>; - def INT_NVVM_LOHI_I2D : F_MATH_2<"mov.b64 \t$dst, {{$src0, $src1}};", Float64Regs, Int32Regs, Int32Regs, int_nvvm_lohi_i2d>; diff --git a/llvm/test/CodeGen/NVPTX/convert-sm89.ll b/llvm/test/CodeGen/NVPTX/convert-sm89.ll index 5d0576aebbe08..30fd76f5a31c2 100644 --- a/llvm/test/CodeGen/NVPTX/convert-sm89.ll +++ b/llvm/test/CodeGen/NVPTX/convert-sm89.ll @@ -84,3 +84,10 @@ define <2 x half> @cvt_rn_relu_f16x2_e5m2x2(i16 %in) { %val = call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn.relu(i16 %in); ret <2 x half> %val } + +; CHECK-LABEL: cvt_rna_satfinite_tf32_f32 +define i32 @cvt_rna_satfinite_tf32_f32(float %f1) { +; CHECK: cvt.rna.satfinite.tf32.f32 + %val = call i32 @llvm.nvvm.f2tf32.rna.satfinite(float %f1) + ret i32 %val +} diff --git a/llvm/test/CodeGen/NVPTX/convert-sm90.ll b/llvm/test/CodeGen/NVPTX/convert-sm90.ll new file mode 100644 index 0000000000000..5f610e0e91f88 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/convert-sm90.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78| FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-12.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78| %ptxas-verify -arch=sm_90 %} + +declare i32 @llvm.nvvm.f2tf32.rn(float %f1) +declare i32 @llvm.nvvm.f2tf32.rn.relu(float %f1) +declare i32 @llvm.nvvm.f2tf32.rz(float %f1) +declare i32 @llvm.nvvm.f2tf32.rz.relu(float %f1) + +define i32 @cvt_rn_tf32_f32(float %f1) { +; CHECK-LABEL: cvt_rn_tf32_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_tf32_f32_param_0]; +; CHECK-NEXT: cvt.rn.tf32.f32 %r1, %f1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call i32 @llvm.nvvm.f2tf32.rn(float %f1) + ret i32 %val +} + +define i32 @cvt_rn_relu_tf32_f32(float %f1) { +; CHECK-LABEL: cvt_rn_relu_tf32_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_relu_tf32_f32_param_0]; +; CHECK-NEXT: cvt.rn.relu.tf32.f32 %r1, %f1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call i32 @llvm.nvvm.f2tf32.rn.relu(float %f1) + ret i32 %val +} + +define i32 @cvt_rz_tf32_f32(float %f1) { +; CHECK-LABEL: cvt_rz_tf32_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<2>; +;
CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_tf32_f32_param_0]; +; CHECK-NEXT: cvt.rz.tf32.f32 %r1, %f1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call i32 @llvm.nvvm.f2tf32.rz(float %f1) + ret i32 %val +} + +define i32 @cvt_rz_relu_tf32_f32(float %f1) { +; CHECK-LABEL: cvt_rz_relu_tf32_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_relu_tf32_f32_param_0]; +; CHECK-NEXT: cvt.rz.relu.tf32.f32 %r1, %f1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call i32 @llvm.nvvm.f2tf32.rz.relu(float %f1) + ret i32 %val +} From f136c800b60dbfacdbb645e7e92acba52e2f279f Mon Sep 17 00:00:00 2001 From: vfdev Date: Mon, 13 Jan 2025 12:00:31 +0100 Subject: [PATCH 246/408] Enabled freethreading support in MLIR python bindings (#122684) Reland reverted https://github.com/llvm/llvm-project/pull/107103 with the fixes for Python 3.8 cc @jpienaar Co-authored-by: Peter Hawkins --- mlir/cmake/modules/AddMLIRPython.cmake | 21 +- mlir/docs/Bindings/Python.md | 40 ++ .../python/StandaloneExtensionPybind11.cpp | 4 +- mlir/lib/Bindings/Python/Globals.h | 12 +- mlir/lib/Bindings/Python/IRCore.cpp | 31 +- mlir/lib/Bindings/Python/IRModule.cpp | 18 +- mlir/lib/Bindings/Python/IRModule.h | 1 + mlir/lib/Bindings/Python/MainModule.cpp | 9 +- mlir/python/requirements.txt | 3 +- mlir/test/python/multithreaded_tests.py | 518 ++++++++++++++++++ 10 files changed, 640 insertions(+), 17 deletions(-) create mode 100644 mlir/test/python/multithreaded_tests.py diff --git a/mlir/cmake/modules/AddMLIRPython.cmake b/mlir/cmake/modules/AddMLIRPython.cmake index 717a503468a85..0679db9cf93e1 100644 --- a/mlir/cmake/modules/AddMLIRPython.cmake +++ b/mlir/cmake/modules/AddMLIRPython.cmake @@ -668,12 +668,31 @@ function(add_mlir_python_extension libname extname) elseif(ARG_PYTHON_BINDINGS_LIBRARY STREQUAL "nanobind") nanobind_add_module(${libname} NB_DOMAIN mlir + FREE_THREADED ${ARG_SOURCES} ) if (LLVM_COMPILER_IS_GCC_COMPATIBLE OR CLANG_CL) # Avoids warnings from upstream nanobind. - target_compile_options(nanobind-static + set(nanobind_target "nanobind-static") + if (NOT TARGET ${nanobind_target}) + # Get correct nanobind target name: nanobind-static-ft or something else + # It is set by nanobind_add_module function according to the passed options + get_property(all_targets DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY BUILDSYSTEM_TARGETS) + + # Iterate over the list of targets + foreach(target ${all_targets}) + # Check if the target name matches the given string + if("${target}" MATCHES "nanobind-") + set(nanobind_target "${target}") + endif() + endforeach() + + if (NOT TARGET ${nanobind_target}) + message(FATAL_ERROR "Could not find nanobind target to set compile options to") + endif() + endif() + target_compile_options(${nanobind_target} PRIVATE -Wno-cast-qual -Wno-zero-length-array diff --git a/mlir/docs/Bindings/Python.md b/mlir/docs/Bindings/Python.md index 32df3310d811d..b8bd0f507a510 100644 --- a/mlir/docs/Bindings/Python.md +++ b/mlir/docs/Bindings/Python.md @@ -1187,3 +1187,43 @@ or nanobind and utilities to connect to the rest of Python API. The bindings can be located in a separate module or in the same module as attributes and types, and loaded along with the dialect. 
+ +## Free-threading (No-GIL) support + +Free-threading or no-GIL support refers to CPython interpreter (>=3.13) with Global Interpreter Lock made optional. For details on the topic, please check [PEP-703](https://peps.python.org/pep-0703/) and this [Python free-threading guide](https://py-free-threading.github.io/). + +MLIR Python bindings are free-threading compatible with exceptions (discussed below) in the following sense: it is safe to work in multiple threads with **independent** contexts. Below we show an example code of safe usage: + +```python +# python3.13t example.py +import concurrent.futures + +import mlir.dialects.arith as arith +from mlir.ir import Context, Location, Module, IntegerType, InsertionPoint + + +def func(py_value): + with Context() as ctx: + module = Module.create(loc=Location.file("foo.txt", 0, 0)) + + dtype = IntegerType.get_signless(64) + with InsertionPoint(module.body), Location.name("a"): + arith.constant(dtype, py_value) + + return module + + +num_workers = 8 +with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor: + futures = [] + for i in range(num_workers): + futures.append(executor.submit(func, i)) + assert len(list(f.result() for f in futures)) == num_workers +``` + +The exceptions to the free-threading compatibility: +- IR printing is unsafe, e.g. when using `PassManager` with `PassManager.enable_ir_printing()` which calls thread-unsafe `llvm::raw_ostream`. +- Usage of `Location.emit_error` is unsafe (due to thread-unsafe `llvm::raw_ostream`). +- Usage of `Module.dump` is unsafe (due to thread-unsafe `llvm::raw_ostream`). +- Usage of `mlir.dialects.transform.interpreter` is unsafe. +- Usage of `mlir.dialects.gpu` and `gpu-module-to-binary` is unsafe. \ No newline at end of file diff --git a/mlir/examples/standalone/python/StandaloneExtensionPybind11.cpp b/mlir/examples/standalone/python/StandaloneExtensionPybind11.cpp index 397db4c20e743..dd3c4c2945cca 100644 --- a/mlir/examples/standalone/python/StandaloneExtensionPybind11.cpp +++ b/mlir/examples/standalone/python/StandaloneExtensionPybind11.cpp @@ -12,9 +12,11 @@ #include "Standalone-c/Dialects.h" #include "mlir/Bindings/Python/PybindAdaptors.h" +namespace py = pybind11; + using namespace mlir::python::adaptors; -PYBIND11_MODULE(_standaloneDialectsPybind11, m) { +PYBIND11_MODULE(_standaloneDialectsPybind11, m, py::mod_gil_not_used()) { //===--------------------------------------------------------------------===// // standalone dialect //===--------------------------------------------------------------------===// diff --git a/mlir/lib/Bindings/Python/Globals.h b/mlir/lib/Bindings/Python/Globals.h index 0ec522d14f74b..826a34a535176 100644 --- a/mlir/lib/Bindings/Python/Globals.h +++ b/mlir/lib/Bindings/Python/Globals.h @@ -24,6 +24,7 @@ namespace mlir { namespace python { /// Globals that are always accessible once the extension has been initialized. +/// Methods of this class are thread-safe. class PyGlobals { public: PyGlobals(); @@ -37,12 +38,18 @@ class PyGlobals { /// Get and set the list of parent modules to search for dialect /// implementation classes. 
- std::vector<std::string> &getDialectSearchPrefixes() { + std::vector<std::string> getDialectSearchPrefixes() { + nanobind::ft_lock_guard lock(mutex); return dialectSearchPrefixes; } void setDialectSearchPrefixes(std::vector<std::string> newValues) { + nb::ft_lock_guard lock(mutex); dialectSearchPrefixes.swap(newValues); } + void addDialectSearchPrefix(std::string value) { + nb::ft_lock_guard lock(mutex); + dialectSearchPrefixes.push_back(std::move(value)); + } /// Loads a python module corresponding to the given dialect namespace. /// No-ops if the module has already been loaded or is not found. Raises @@ -109,6 +116,9 @@ class PyGlobals { private: static PyGlobals *instance; + + nanobind::ft_mutex mutex; + /// Module name prefixes to search under for dialect implementation modules. std::vector<std::string> dialectSearchPrefixes; /// Map of dialect namespace to external dialect class object. diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp index 453d4f7c7e8bc..463ebdebb3f3f 100644 --- a/mlir/lib/Bindings/Python/IRCore.cpp +++ b/mlir/lib/Bindings/Python/IRCore.cpp @@ -243,9 +243,15 @@ static MlirBlock createBlock(const nb::sequence &pyArgTypes, /// Wrapper for the global LLVM debugging flag. struct PyGlobalDebugFlag { - static void set(nb::object &o, bool enable) { mlirEnableGlobalDebug(enable); } + static void set(nb::object &o, bool enable) { + nb::ft_lock_guard lock(mutex); + mlirEnableGlobalDebug(enable); + } - static bool get(const nb::object &) { return mlirIsGlobalDebugEnabled(); } + static bool get(const nb::object &) { + nb::ft_lock_guard lock(mutex); + return mlirIsGlobalDebugEnabled(); + } static void bind(nb::module_ &m) { // Debug flags. @@ -255,6 +261,7 @@ struct PyGlobalDebugFlag { .def_static( "set_types", [](const std::string &type) { + nb::ft_lock_guard lock(mutex); mlirSetGlobalDebugType(type.c_str()); }, "types"_a, "Sets specific debug types to be produced by LLVM") @@ -263,11 +270,17 @@ struct PyGlobalDebugFlag { pointers.reserve(types.size()); for (const std::string &str : types) pointers.push_back(str.c_str()); + nb::ft_lock_guard lock(mutex); mlirSetGlobalDebugTypes(pointers.data(), pointers.size()); }); } + +private: + static nb::ft_mutex mutex; }; +nb::ft_mutex PyGlobalDebugFlag::mutex; + struct PyAttrBuilderMap { static bool dunderContains(const std::string &attributeKind) { return PyGlobals::get().lookupAttributeBuilder(attributeKind).has_value(); @@ -606,6 +619,7 @@ class PyOpOperandIterator { PyMlirContext::PyMlirContext(MlirContext context) : context(context) { nb::gil_scoped_acquire acquire; + nb::ft_lock_guard lock(live_contexts_mutex); auto &liveContexts = getLiveContexts(); liveContexts[context.ptr] = this; } @@ -615,7 +629,10 @@ PyMlirContext::~PyMlirContext() { // forContext method, which always puts the associated handle into
nb::gil_scoped_acquire acquire; - getLiveContexts().erase(context.ptr); + { + nb::ft_lock_guard lock(live_contexts_mutex); + getLiveContexts().erase(context.ptr); + } mlirContextDestroy(context); } @@ -632,6 +649,7 @@ nb::object PyMlirContext::createFromCapsule(nb::object capsule) { PyMlirContextRef PyMlirContext::forContext(MlirContext context) { nb::gil_scoped_acquire acquire; + nb::ft_lock_guard lock(live_contexts_mutex); auto &liveContexts = getLiveContexts(); auto it = liveContexts.find(context.ptr); if (it == liveContexts.end()) { @@ -647,12 +665,17 @@ PyMlirContextRef PyMlirContext::forContext(MlirContext context) { return PyMlirContextRef(it->second, std::move(pyRef)); } +nb::ft_mutex PyMlirContext::live_contexts_mutex; + PyMlirContext::LiveContextMap &PyMlirContext::getLiveContexts() { static LiveContextMap liveContexts; return liveContexts; } -size_t PyMlirContext::getLiveCount() { return getLiveContexts().size(); } +size_t PyMlirContext::getLiveCount() { + nb::ft_lock_guard lock(live_contexts_mutex); + return getLiveContexts().size(); +} size_t PyMlirContext::getLiveOperationCount() { return liveOperations.size(); } diff --git a/mlir/lib/Bindings/Python/IRModule.cpp b/mlir/lib/Bindings/Python/IRModule.cpp index f7bf77e5a7e04..e600f1bbd4493 100644 --- a/mlir/lib/Bindings/Python/IRModule.cpp +++ b/mlir/lib/Bindings/Python/IRModule.cpp @@ -38,8 +38,11 @@ PyGlobals::PyGlobals() { PyGlobals::~PyGlobals() { instance = nullptr; } bool PyGlobals::loadDialectModule(llvm::StringRef dialectNamespace) { - if (loadedDialectModules.contains(dialectNamespace)) - return true; + { + nb::ft_lock_guard lock(mutex); + if (loadedDialectModules.contains(dialectNamespace)) + return true; + } // Since re-entrancy is possible, make a copy of the search prefixes. std::vector localSearchPrefixes = dialectSearchPrefixes; nb::object loaded = nb::none(); @@ -62,12 +65,14 @@ bool PyGlobals::loadDialectModule(llvm::StringRef dialectNamespace) { return false; // Note: Iterator cannot be shared from prior to loading, since re-entrancy // may have occurred, which may do anything. 
+ nb::ft_lock_guard lock(mutex); loadedDialectModules.insert(dialectNamespace); return true; } void PyGlobals::registerAttributeBuilder(const std::string &attributeKind, nb::callable pyFunc, bool replace) { + nb::ft_lock_guard lock(mutex); nb::object &found = attributeBuilderMap[attributeKind]; if (found && !replace) { throw std::runtime_error((llvm::Twine("Attribute builder for '") + @@ -81,6 +86,7 @@ void PyGlobals::registerAttributeBuilder(const std::string &attributeKind, void PyGlobals::registerTypeCaster(MlirTypeID mlirTypeID, nb::callable typeCaster, bool replace) { + nb::ft_lock_guard lock(mutex); nb::object &found = typeCasterMap[mlirTypeID]; if (found && !replace) throw std::runtime_error("Type caster is already registered with caster: " + @@ -90,6 +96,7 @@ void PyGlobals::registerTypeCaster(MlirTypeID mlirTypeID, void PyGlobals::registerValueCaster(MlirTypeID mlirTypeID, nb::callable valueCaster, bool replace) { + nb::ft_lock_guard lock(mutex); nb::object &found = valueCasterMap[mlirTypeID]; if (found && !replace) throw std::runtime_error("Value caster is already registered: " + @@ -99,6 +106,7 @@ void PyGlobals::registerValueCaster(MlirTypeID mlirTypeID, void PyGlobals::registerDialectImpl(const std::string &dialectNamespace, nb::object pyClass) { + nb::ft_lock_guard lock(mutex); nb::object &found = dialectClassMap[dialectNamespace]; if (found) { throw std::runtime_error((llvm::Twine("Dialect namespace '") + @@ -110,6 +118,7 @@ void PyGlobals::registerDialectImpl(const std::string &dialectNamespace, void PyGlobals::registerOperationImpl(const std::string &operationName, nb::object pyClass, bool replace) { + nb::ft_lock_guard lock(mutex); nb::object &found = operationClassMap[operationName]; if (found && !replace) { throw std::runtime_error((llvm::Twine("Operation '") + operationName + @@ -121,6 +130,7 @@ void PyGlobals::registerOperationImpl(const std::string &operationName, std::optional PyGlobals::lookupAttributeBuilder(const std::string &attributeKind) { + nb::ft_lock_guard lock(mutex); const auto foundIt = attributeBuilderMap.find(attributeKind); if (foundIt != attributeBuilderMap.end()) { assert(foundIt->second && "attribute builder is defined"); @@ -133,6 +143,7 @@ std::optional PyGlobals::lookupTypeCaster(MlirTypeID mlirTypeID, MlirDialect dialect) { // Try to load dialect module. (void)loadDialectModule(unwrap(mlirDialectGetNamespace(dialect))); + nb::ft_lock_guard lock(mutex); const auto foundIt = typeCasterMap.find(mlirTypeID); if (foundIt != typeCasterMap.end()) { assert(foundIt->second && "type caster is defined"); @@ -145,6 +156,7 @@ std::optional PyGlobals::lookupValueCaster(MlirTypeID mlirTypeID, MlirDialect dialect) { // Try to load dialect module. (void)loadDialectModule(unwrap(mlirDialectGetNamespace(dialect))); + nb::ft_lock_guard lock(mutex); const auto foundIt = valueCasterMap.find(mlirTypeID); if (foundIt != valueCasterMap.end()) { assert(foundIt->second && "value caster is defined"); @@ -158,6 +170,7 @@ PyGlobals::lookupDialectClass(const std::string &dialectNamespace) { // Make sure dialect module is loaded. 
if (!loadDialectModule(dialectNamespace)) return std::nullopt; + nb::ft_lock_guard lock(mutex); const auto foundIt = dialectClassMap.find(dialectNamespace); if (foundIt != dialectClassMap.end()) { assert(foundIt->second && "dialect class is defined"); @@ -175,6 +188,7 @@ PyGlobals::lookupOperationClass(llvm::StringRef operationName) { if (!loadDialectModule(dialectNamespace)) return std::nullopt; + nb::ft_lock_guard lock(mutex); auto foundIt = operationClassMap.find(operationName); if (foundIt != operationClassMap.end()) { assert(foundIt->second && "OpView is defined"); diff --git a/mlir/lib/Bindings/Python/IRModule.h b/mlir/lib/Bindings/Python/IRModule.h index 8fb32a225e65f..f5fbb6c61b57e 100644 --- a/mlir/lib/Bindings/Python/IRModule.h +++ b/mlir/lib/Bindings/Python/IRModule.h @@ -260,6 +260,7 @@ class PyMlirContext { // Note that this holds a handle, which does not imply ownership. // Mappings will be removed when the context is destructed. using LiveContextMap = llvm::DenseMap; + static nanobind::ft_mutex live_contexts_mutex; static LiveContextMap &getLiveContexts(); // Interns all live modules associated with this context. Modules tracked diff --git a/mlir/lib/Bindings/Python/MainModule.cpp b/mlir/lib/Bindings/Python/MainModule.cpp index 7c4064262012e..6f49431006605 100644 --- a/mlir/lib/Bindings/Python/MainModule.cpp +++ b/mlir/lib/Bindings/Python/MainModule.cpp @@ -30,12 +30,8 @@ NB_MODULE(_mlir, m) { .def_prop_rw("dialect_search_modules", &PyGlobals::getDialectSearchPrefixes, &PyGlobals::setDialectSearchPrefixes) - .def( - "append_dialect_search_prefix", - [](PyGlobals &self, std::string moduleName) { - self.getDialectSearchPrefixes().push_back(std::move(moduleName)); - }, - "module_name"_a) + .def("append_dialect_search_prefix", &PyGlobals::addDialectSearchPrefix, + "module_name"_a) .def( "_check_dialect_module_loaded", [](PyGlobals &self, const std::string &dialectNamespace) { @@ -76,7 +72,6 @@ NB_MODULE(_mlir, m) { nanobind::cast(opClass.attr("OPERATION_NAME")); PyGlobals::get().registerOperationImpl(operationName, opClass, replace); - // Dict-stuff the new opClass by name onto the dialect class. nb::object opClassName = opClass.attr("__name__"); dialectClass.attr(opClassName) = opClass; diff --git a/mlir/python/requirements.txt b/mlir/python/requirements.txt index f240d6ef944ec..1a0075e829aef 100644 --- a/mlir/python/requirements.txt +++ b/mlir/python/requirements.txt @@ -2,4 +2,5 @@ nanobind>=2.4, <3.0 numpy>=1.19.5, <=2.1.2 pybind11>=2.10.0, <=2.13.6 PyYAML>=5.4.0, <=6.0.1 -ml_dtypes>=0.1.0, <=0.5.0 # provides several NumPy dtype extensions, including the bf16 +ml_dtypes>=0.1.0, <=0.6.0; python_version<"3.13" # provides several NumPy dtype extensions, including the bf16 +ml_dtypes>=0.5.0, <=0.6.0; python_version>="3.13" \ No newline at end of file diff --git a/mlir/test/python/multithreaded_tests.py b/mlir/test/python/multithreaded_tests.py new file mode 100644 index 0000000000000..6e1a668346872 --- /dev/null +++ b/mlir/test/python/multithreaded_tests.py @@ -0,0 +1,518 @@ +# RUN: %PYTHON %s +""" +This script generates multi-threaded tests to check free-threading mode using CPython compiled with TSAN. +Tests can be run using pytest: +```bash +python3.13t -mpytest -vvv multithreaded_tests.py +``` + +IMPORTANT. Running tests are not checking the correctness, but just the execution of the tests in multi-threaded context +and passing if no warnings reported by TSAN and failing otherwise. 
+ + +Details on the generated tests and execution: +1) Multi-threaded execution: all generated tests are executed independently by +a pool of threads, running each test multiple times, see @multi_threaded for details + +2) Tests generation: we use existing tests: test/python/ir/*.py, +test/python/dialects/*.py, etc to generate multi-threaded tests. +In details, we perform the following: +a) we define a list of source tests to be used to generate multi-threaded tests, see `TEST_MODULES`. +b) we define `TestAllMultiThreaded` class and add existing tests to the class. See `add_existing_tests` method. +c) for each test file, we copy and modify it: test/python/ir/affine_expr.py -> /tmp/ir/affine_expr.py. +In order to import the test file as python module, we remove all executing functions, like +`@run` or `run(testMethod)`. See `copy_and_update` and `add_existing_tests` methods for details. + + +Observed warnings reported by TSAN. + +CPython and free-threading known data-races: +1) ctypes related races: https://github.com/python/cpython/issues/127945 +2) LLVM related data-races, llvm::raw_ostream is not thread-safe +- mlir pass manager +- dialects/transform_interpreter.py +- ir/diagnostic_handler.py +- ir/module.py +3) Dialect gpu module-to-binary method is unsafe +""" +import concurrent.futures +import gc +import importlib.util +import os +import sys +import threading +import tempfile +import unittest + +from contextlib import contextmanager +from functools import partial +from pathlib import Path +from typing import Optional, List + +import mlir.dialects.arith as arith +from mlir.dialects import transform +from mlir.ir import Context, Location, Module, IntegerType, InsertionPoint + + +def import_from_path(module_name: str, file_path: Path): + spec = importlib.util.spec_from_file_location(module_name, file_path) + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + spec.loader.exec_module(module) + return module + + +def copy_and_update(src_filepath: Path, dst_filepath: Path): + # We should remove all calls like `run(testMethod)` + with open(src_filepath, "r") as reader, open(dst_filepath, "w") as writer: + while True: + src_line = reader.readline() + if len(src_line) == 0: + break + skip_lines = [ + "run(", + "@run", + "@constructAndPrintInModule", + "run_apply_patterns(", + "@run_apply_patterns", + "@test_in_context", + "@construct_and_print_in_module", + ] + if any(src_line.startswith(line) for line in skip_lines): + continue + writer.write(src_line) + + +# Helper run functions +def run(f): + f() + + +def run_with_context_and_location(f): + print("\nTEST:", f.__name__) + with Context(), Location.unknown(): + f() + return f + + +def run_with_insertion_point(f): + print("\nTEST:", f.__name__) + with Context() as ctx, Location.unknown(): + module = Module.create() + with InsertionPoint(module.body): + f(ctx) + print(module) + + +def run_with_insertion_point_v2(f): + print("\nTEST:", f.__name__) + with Context(), Location.unknown(): + module = Module.create() + with InsertionPoint(module.body): + f() + print(module) + return f + + +def run_with_insertion_point_v3(f): + with Context(), Location.unknown(): + module = Module.create() + with InsertionPoint(module.body): + print("\nTEST:", f.__name__) + f(module) + print(module) + return f + + +def run_with_insertion_point_v4(f): + print("\nTEST:", f.__name__) + with Context() as ctx, Location.unknown(): + ctx.allow_unregistered_dialects = True + module = Module.create() + with InsertionPoint(module.body): + f() + 
return f + + +def run_apply_patterns(f): + with Context(), Location.unknown(): + module = Module.create() + with InsertionPoint(module.body): + sequence = transform.SequenceOp( + transform.FailurePropagationMode.Propagate, + [], + transform.AnyOpType.get(), + ) + with InsertionPoint(sequence.body): + apply = transform.ApplyPatternsOp(sequence.bodyTarget) + with InsertionPoint(apply.patterns): + f() + transform.YieldOp() + print("\nTEST:", f.__name__) + print(module) + return f + + +def run_transform_tensor_ext(f): + print("\nTEST:", f.__name__) + with Context(), Location.unknown(): + module = Module.create() + with InsertionPoint(module.body): + sequence = transform.SequenceOp( + transform.FailurePropagationMode.Propagate, + [], + transform.AnyOpType.get(), + ) + with InsertionPoint(sequence.body): + f(sequence.bodyTarget) + transform.YieldOp() + print(module) + return f + + +def run_transform_structured_ext(f): + with Context(), Location.unknown(): + module = Module.create() + with InsertionPoint(module.body): + print("\nTEST:", f.__name__) + f() + module.operation.verify() + print(module) + return f + + +def run_construct_and_print_in_module(f): + print("\nTEST:", f.__name__) + with Context(), Location.unknown(): + module = Module.create() + with InsertionPoint(module.body): + module = f(module) + if module is not None: + print(module) + return f + + +TEST_MODULES = [ + ("execution_engine", run), + ("pass_manager", run), + ("dialects/affine", run_with_insertion_point_v2), + ("dialects/func", run_with_insertion_point_v2), + ("dialects/arith_dialect", run), + ("dialects/arith_llvm", run), + ("dialects/async_dialect", run), + ("dialects/builtin", run), + ("dialects/cf", run_with_insertion_point_v4), + ("dialects/complex_dialect", run), + ("dialects/func", run_with_insertion_point_v2), + ("dialects/index_dialect", run_with_insertion_point), + ("dialects/llvm", run_with_insertion_point_v2), + ("dialects/math_dialect", run), + ("dialects/memref", run), + ("dialects/ml_program", run_with_insertion_point_v2), + ("dialects/nvgpu", run_with_insertion_point_v2), + ("dialects/nvvm", run_with_insertion_point_v2), + ("dialects/ods_helpers", run), + ("dialects/openmp_ops", run_with_insertion_point_v2), + ("dialects/pdl_ops", run_with_insertion_point_v2), + # ("dialects/python_test", run), # TODO: Need to pass pybind11 or nanobind argv + ("dialects/quant", run), + ("dialects/rocdl", run_with_insertion_point_v2), + ("dialects/scf", run_with_insertion_point_v2), + ("dialects/shape", run), + ("dialects/spirv_dialect", run), + ("dialects/tensor", run), + # ("dialects/tosa", ), # Nothing to test + ("dialects/transform_bufferization_ext", run_with_insertion_point_v2), + # ("dialects/transform_extras", ), # Needs a more complicated execution schema + ("dialects/transform_gpu_ext", run_transform_tensor_ext), + ( + "dialects/transform_interpreter", + run_with_context_and_location, + ["print_", "transform_options", "failed", "include"], + ), + ( + "dialects/transform_loop_ext", + run_with_insertion_point_v2, + ["loopOutline"], + ), + ("dialects/transform_memref_ext", run_with_insertion_point_v2), + ("dialects/transform_nvgpu_ext", run_with_insertion_point_v2), + ("dialects/transform_sparse_tensor_ext", run_transform_tensor_ext), + ("dialects/transform_structured_ext", run_transform_structured_ext), + ("dialects/transform_tensor_ext", run_transform_tensor_ext), + ( + "dialects/transform_vector_ext", + run_apply_patterns, + ["configurable_patterns"], + ), + ("dialects/transform", run_with_insertion_point_v3), + 
("dialects/vector", run_with_context_and_location), + ("dialects/gpu/dialect", run_with_context_and_location), + ("dialects/gpu/module-to-binary-nvvm", run_with_context_and_location), + ("dialects/gpu/module-to-binary-rocdl", run_with_context_and_location), + ("dialects/linalg/ops", run), + # TO ADD: No proper tests in this dialects/linalg/opsdsl/* + # ("dialects/linalg/opsdsl/*", ...), + ("dialects/sparse_tensor/dialect", run), + ("dialects/sparse_tensor/passes", run), + ("integration/dialects/pdl", run_construct_and_print_in_module), + ("integration/dialects/transform", run_construct_and_print_in_module), + ("integration/dialects/linalg/opsrun", run), + ("ir/affine_expr", run), + ("ir/affine_map", run), + ("ir/array_attributes", run), + ("ir/attributes", run), + ("ir/blocks", run), + ("ir/builtin_types", run), + ("ir/context_managers", run), + ("ir/debug", run), + ("ir/diagnostic_handler", run), + ("ir/dialects", run), + ("ir/exception", run), + ("ir/insertion_point", run), + ("ir/integer_set", run), + ("ir/location", run), + ("ir/module", run), + ("ir/operation", run), + ("ir/symbol_table", run), + ("ir/value", run), +] + +TESTS_TO_SKIP = [ + "test_execution_engine__testNanoTime_multi_threaded", # testNanoTime can't run in multiple threads, even with GIL + "test_execution_engine__testSharedLibLoad_multi_threaded", # testSharedLibLoad can't run in multiple threads, even with GIL + "test_dialects_arith_dialect__testArithValue_multi_threaded", # RuntimeError: Value caster is already registered: .ArithValue'>, even with GIL + "test_ir_dialects__testAppendPrefixSearchPath_multi_threaded", # PyGlobals::setDialectSearchPrefixes is not thread-safe, even with GIL. Strange usage of static PyGlobals vs python exposed _cext.globals + "test_ir_value__testValueCasters_multi_threaded", # RuntimeError: Value caster is already registered: .dont_cast_int, even with GIL + # tests indirectly calling thread-unsafe llvm::raw_ostream + "test_execution_engine__testInvalidModule_multi_threaded", # mlirExecutionEngineCreate calls thread-unsafe llvm::raw_ostream + "test_pass_manager__testPrintIrAfterAll_multi_threaded", # IRPrinterInstrumentation::runAfterPass calls thread-unsafe llvm::raw_ostream + "test_pass_manager__testPrintIrBeforeAndAfterAll_multi_threaded", # IRPrinterInstrumentation::runBeforePass calls thread-unsafe llvm::raw_ostream + "test_pass_manager__testPrintIrLargeLimitElements_multi_threaded", # IRPrinterInstrumentation::runAfterPass calls thread-unsafe llvm::raw_ostream + "test_pass_manager__testPrintIrTree_multi_threaded", # IRPrinterInstrumentation::runAfterPass calls thread-unsafe llvm::raw_ostream + "test_pass_manager__testRunPipeline_multi_threaded", # PrintOpStatsPass::printSummary calls thread-unsafe llvm::raw_ostream + "test_dialects_transform_interpreter__include_multi_threaded", # mlir::transform::PrintOp::apply(mlir::transform::TransformRewriter...) calls thread-unsafe llvm::raw_ostream + "test_dialects_transform_interpreter__transform_options_multi_threaded", # mlir::transform::PrintOp::apply(mlir::transform::TransformRewriter...) calls thread-unsafe llvm::raw_ostream + "test_dialects_transform_interpreter__print_self_multi_threaded", # mlir::transform::PrintOp::apply(mlir::transform::TransformRewriter...) 
call thread-unsafe llvm::raw_ostream + "test_ir_diagnostic_handler__testDiagnosticCallbackException_multi_threaded", # mlirEmitError calls thread-unsafe llvm::raw_ostream + "test_ir_module__testParseSuccess_multi_threaded", # mlirOperationDump calls thread-unsafe llvm::raw_ostream + # False-positive TSAN detected race in llvm::RuntimeDyldELF::registerEHFrames() + # Details: https://github.com/llvm/llvm-project/pull/107103/files#r1905726947 + "test_execution_engine__testCapsule_multi_threaded", + "test_execution_engine__testDumpToObjectFile_multi_threaded", +] + +TESTS_TO_XFAIL = [ + # execution_engine tests: + # - ctypes related data-races: https://github.com/python/cpython/issues/127945 + "test_execution_engine__testBF16Memref_multi_threaded", + "test_execution_engine__testBasicCallback_multi_threaded", + "test_execution_engine__testComplexMemrefAdd_multi_threaded", + "test_execution_engine__testComplexUnrankedMemrefAdd_multi_threaded", + "test_execution_engine__testDynamicMemrefAdd2D_multi_threaded", + "test_execution_engine__testF16MemrefAdd_multi_threaded", + "test_execution_engine__testF8E5M2Memref_multi_threaded", + "test_execution_engine__testInvokeFloatAdd_multi_threaded", + "test_execution_engine__testInvokeVoid_multi_threaded", # a ctypes race + "test_execution_engine__testMemrefAdd_multi_threaded", + "test_execution_engine__testRankedMemRefCallback_multi_threaded", + "test_execution_engine__testRankedMemRefWithOffsetCallback_multi_threaded", + "test_execution_engine__testUnrankedMemRefCallback_multi_threaded", + "test_execution_engine__testUnrankedMemRefWithOffsetCallback_multi_threaded", + # dialects tests + "test_dialects_memref__testSubViewOpInferReturnTypeExtensiveSlicing_multi_threaded", # Related to ctypes data races + "test_dialects_transform_interpreter__print_other_multi_threaded", # Fatal Python error: Aborted or mlir::transform::PrintOp::apply(mlir::transform::TransformRewriter...) 
is not thread-safe + "test_dialects_gpu_module-to-binary-rocdl__testGPUToASMBin_multi_threaded", # Due to global llvm-project/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp::GCNTrackers variable mutation + "test_dialects_gpu_module-to-binary-nvvm__testGPUToASMBin_multi_threaded", + "test_dialects_gpu_module-to-binary-nvvm__testGPUToLLVMBin_multi_threaded", + "test_dialects_gpu_module-to-binary-rocdl__testGPUToLLVMBin_multi_threaded", + # integration tests + "test_integration_dialects_linalg_opsrun__test_elemwise_builtin_multi_threaded", # Related to ctypes data races + "test_integration_dialects_linalg_opsrun__test_elemwise_generic_multi_threaded", # Related to ctypes data races + "test_integration_dialects_linalg_opsrun__test_fill_builtin_multi_threaded", # ctypes + "test_integration_dialects_linalg_opsrun__test_fill_generic_multi_threaded", # ctypes + "test_integration_dialects_linalg_opsrun__test_fill_rng_builtin_multi_threaded", # ctypes + "test_integration_dialects_linalg_opsrun__test_fill_rng_generic_multi_threaded", # ctypes + "test_integration_dialects_linalg_opsrun__test_max_pooling_builtin_multi_threaded", # ctypes + "test_integration_dialects_linalg_opsrun__test_max_pooling_generic_multi_threaded", # ctypes + "test_integration_dialects_linalg_opsrun__test_min_pooling_builtin_multi_threaded", # ctypes + "test_integration_dialects_linalg_opsrun__test_min_pooling_generic_multi_threaded", # ctypes +] + + +def add_existing_tests(test_modules, test_prefix: str = "_original_test"): + def decorator(test_cls): + this_folder = Path(__file__).parent.absolute() + test_cls.output_folder = tempfile.TemporaryDirectory() + output_folder = Path(test_cls.output_folder.name) + + for test_mod_info in test_modules: + assert isinstance(test_mod_info, tuple) and len(test_mod_info) in (2, 3) + if len(test_mod_info) == 2: + test_module_name, exec_fn = test_mod_info + test_pattern = None + else: + test_module_name, exec_fn, test_pattern = test_mod_info + + src_filepath = this_folder / f"{test_module_name}.py" + dst_filepath = (output_folder / f"{test_module_name}.py").absolute() + if not dst_filepath.parent.exists(): + dst_filepath.parent.mkdir(parents=True) + copy_and_update(src_filepath, dst_filepath) + test_mod = import_from_path(test_module_name, dst_filepath) + for attr_name in dir(test_mod): + is_test_fn = test_pattern is None and attr_name.startswith("test") + is_test_fn |= test_pattern is not None and any( + [p in attr_name for p in test_pattern] + ) + if is_test_fn: + obj = getattr(test_mod, attr_name) + if callable(obj): + test_name = f"{test_prefix}_{test_module_name.replace('/', '_')}__{attr_name}" + + def wrapped_test_fn( + self, *args, __test_fn__=obj, __exec_fn__=exec_fn, **kwargs + ): + __exec_fn__(__test_fn__) + + setattr(test_cls, test_name, wrapped_test_fn) + return test_cls + + return decorator + + +@contextmanager +def _capture_output(fp): + # Inspired from jax test_utils.py capture_stderr method + # ``None`` means nothing has not been captured yet. + captured = None + + def get_output() -> str: + if captured is None: + raise ValueError("get_output() called while the context is active.") + return captured + + with tempfile.NamedTemporaryFile(mode="w+", encoding="utf-8") as f: + original_fd = os.dup(fp.fileno()) + os.dup2(f.fileno(), fp.fileno()) + try: + yield get_output + finally: + # Python also has its own buffers, make sure everything is flushed. 
+ fp.flush() + os.fsync(fp.fileno()) + f.seek(0) + captured = f.read() + os.dup2(original_fd, fp.fileno()) + + +capture_stdout = partial(_capture_output, sys.stdout) +capture_stderr = partial(_capture_output, sys.stderr) + + +def multi_threaded( + num_workers: int, + num_runs: int = 5, + skip_tests: Optional[List[str]] = None, + xfail_tests: Optional[List[str]] = None, + test_prefix: str = "_original_test", + multithreaded_test_postfix: str = "_multi_threaded", +): + """Decorator that runs a test in a multi-threaded environment.""" + + def decorator(test_cls): + for name, test_fn in test_cls.__dict__.copy().items(): + if not (name.startswith(test_prefix) and callable(test_fn)): + continue + + name = f"test{name[len(test_prefix):]}" + if skip_tests is not None: + if any( + test_name.replace(multithreaded_test_postfix, "") in name + for test_name in skip_tests + ): + continue + + def multi_threaded_test_fn(self, *args, __test_fn__=test_fn, **kwargs): + with capture_stdout(), capture_stderr() as get_output: + barrier = threading.Barrier(num_workers) + + def closure(): + barrier.wait() + for _ in range(num_runs): + __test_fn__(self, *args, **kwargs) + + with concurrent.futures.ThreadPoolExecutor( + max_workers=num_workers + ) as executor: + futures = [] + for _ in range(num_workers): + futures.append(executor.submit(closure)) + # We should call future.result() to re-raise an exception if test has + # failed + assert len(list(f.result() for f in futures)) == num_workers + + gc.collect() + assert Context._get_live_count() == 0 + + captured = get_output() + if len(captured) > 0 and "ThreadSanitizer" in captured: + raise RuntimeError( + f"ThreadSanitizer reported warnings:\n{captured}" + ) + + test_new_name = f"{name}{multithreaded_test_postfix}" + if xfail_tests is not None and test_new_name in xfail_tests: + multi_threaded_test_fn = unittest.expectedFailure( + multi_threaded_test_fn + ) + + setattr(test_cls, test_new_name, multi_threaded_test_fn) + + return test_cls + + return decorator + + +@multi_threaded( + num_workers=10, + num_runs=20, + skip_tests=TESTS_TO_SKIP, + xfail_tests=TESTS_TO_XFAIL, +) +@add_existing_tests(test_modules=TEST_MODULES, test_prefix="_original_test") +class TestAllMultiThreaded(unittest.TestCase): + @classmethod + def tearDownClass(cls): + if hasattr(cls, "output_folder"): + cls.output_folder.cleanup() + + def _original_test_create_context(self): + with Context() as ctx: + print(ctx._get_live_count()) + print(ctx._get_live_module_count()) + print(ctx._get_live_operation_count()) + print(ctx._get_live_operation_objects()) + print(ctx._get_context_again() is ctx) + print(ctx._clear_live_operations()) + + def _original_test_create_module_with_consts(self): + py_values = [123, 234, 345] + with Context() as ctx: + module = Module.create(loc=Location.file("foo.txt", 0, 0)) + + dtype = IntegerType.get_signless(64) + with InsertionPoint(module.body), Location.name("a"): + arith.constant(dtype, py_values[0]) + + with InsertionPoint(module.body), Location.name("b"): + arith.constant(dtype, py_values[1]) + + with InsertionPoint(module.body), Location.name("c"): + arith.constant(dtype, py_values[2]) + + +if __name__ == "__main__": + # Do not run the tests on CPython with GIL + if hasattr(sys, "_is_gil_enabled") and not sys._is_gil_enabled(): + unittest.main() From d2ba364440662b1025b2fbc00066eaae8b369cb1 Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Mon, 13 Jan 2025 12:03:06 +0100 Subject: [PATCH 247/408] Fix an unused-variable warning in release build. 
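The binding is referenced only by the assert that follows it, so release
(NDEBUG) builds, where assert expands to nothing, warned about an unused
variable. Marking it [[maybe_unused]] silences the warning while keeping
the check in asserts-enabled builds. A minimal, self-contained sketch of
the pattern (the type and names below are illustrative, not the TableGen
ones):

  #include <cassert>

  struct Entry {
    int Kind = 0;
  };

  void checkTop(const Entry &E) {
    // Referenced only by the assert below; with NDEBUG the assert expands
    // to nothing and -Wunused-variable would otherwise fire.
    [[maybe_unused]] const Entry &Top = E;
    assert(Top.Kind == 0 && "unexpected entry kind");
  }

The alternative of guarding the declaration with #ifndef NDEBUG would
duplicate the expression; the attribute keeps a single definition.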
---
 llvm/lib/TableGen/TGLexer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp
index c423023077cd8..983242ade0fe5 100644
--- a/llvm/lib/TableGen/TGLexer.cpp
+++ b/llvm/lib/TableGen/TGLexer.cpp
@@ -814,7 +814,7 @@ tgtok::TokKind TGLexer::lexPreprocessor(tgtok::TokKind Kind,
   if (PrepIncludeStack.back().empty())
     return ReturnError(TokStart, "#endif without #ifdef");
 
-  auto &IfdefOrElseEntry = PrepIncludeStack.back().back();
+  [[maybe_unused]] auto &IfdefOrElseEntry = PrepIncludeStack.back().back();
 
   assert((IfdefOrElseEntry.Kind == tgtok::Ifdef ||
           IfdefOrElseEntry.Kind == tgtok::Else) &&

From d7e79663e77e05ed4e7580be1dca00d7ef3b12c5 Mon Sep 17 00:00:00 2001
From: Haojian Wu
Date: Mon, 13 Jan 2025 12:04:21 +0100
Subject: [PATCH 248/408] Remove an extra trailing `` in Modules.rst, NFC

---
 clang/docs/Modules.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/docs/Modules.rst b/clang/docs/Modules.rst
index 06294e3c58a4f..69a45b7fd9ace 100644
--- a/clang/docs/Modules.rst
+++ b/clang/docs/Modules.rst
@@ -152,7 +152,7 @@ first include path that would refer to the current file. ``#include_next``
 is interpreted as if the current file had been found in that path. If this
 search finds a file named by a module map, the ``#include_next``
 directive is translated into an import, just like for a ``#include``
-directive.``
+directive.
 
 Module maps
 -----------

From 171d3edd0507422f64cc11b33dac7b7f2b703f76 Mon Sep 17 00:00:00 2001
From: quic_hchandel <165007698+hchandel@users.noreply.github.com>
Date: Mon, 13 Jan 2025 16:36:05 +0530
Subject: [PATCH 249/408] [RISCV] Add Qualcomm uC Xqciint (Interrupts)
 extension (#122256)

This extension adds eleven instructions to accelerate interrupt
servicing. The current spec can be found at:
https://github.com/quic/riscv-unified-db/releases/latest

This patch adds assembler-only support.
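Of the eleven instructions, the two non-compressed forms qc.setinti and
qc.clrinti take the new uimm10 operand: a plain 10-bit unsigned
immediate, i.e. exactly the values 0 through 1023, with no alignment or
non-zero restriction (see the new Match_InvalidUImm10 diagnostic below).
A self-contained sketch of that range check, as a hypothetical
standalone helper rather than the LLVM IsUImm<10>() predicate itself:

  #include <cstdint>

  // Accepts exactly the range of the new uimm10 operand: [0, 1023],
  // i.e. [0, (1 << 10) - 1], with no alignment or non-zero restriction.
  constexpr bool isUImm10(int64_t Imm) {
    return Imm >= 0 && Imm <= (int64_t{1} << 10) - 1;
  }

  static_assert(isUImm10(0) && isUImm10(1023), "bounds are inclusive");
  static_assert(!isUImm10(1024) && !isUImm10(-1),
                "values outside [0, 1023] are rejected");

Out-of-range immediates produce the "immediate must be an integer in the
range [0, 1023]" error exercised by the new xqciint-invalid.s test.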
--------- Co-authored-by: Harsh Chandel --- .../Driver/print-supported-extensions-riscv.c | 1 + llvm/docs/RISCVUsage.rst | 3 + llvm/docs/ReleaseNotes.md | 2 + .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 3 + .../RISCV/Disassembler/RISCVDisassembler.cpp | 4 + .../Target/RISCV/MCTargetDesc/RISCVBaseInfo.h | 1 + llvm/lib/Target/RISCV/RISCVFeatures.td | 8 ++ llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 1 + llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td | 64 +++++++++++ llvm/lib/TargetParser/RISCVISAInfo.cpp | 4 +- llvm/test/CodeGen/RISCV/attributes.ll | 2 + llvm/test/MC/RISCV/xqciint-invalid.s | 105 ++++++++++++++++++ llvm/test/MC/RISCV/xqciint-valid.s | 81 ++++++++++++++ .../TargetParser/RISCVISAInfoTest.cpp | 3 +- 14 files changed, 279 insertions(+), 3 deletions(-) create mode 100644 llvm/test/MC/RISCV/xqciint-invalid.s create mode 100644 llvm/test/MC/RISCV/xqciint-valid.s diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c index a8d9fcd8569cf..b28e0a07dad24 100644 --- a/clang/test/Driver/print-supported-extensions-riscv.c +++ b/clang/test/Driver/print-supported-extensions-riscv.c @@ -196,6 +196,7 @@ // CHECK-NEXT: xqcicm 0.2 'Xqcicm' (Qualcomm uC Conditional Move Extension) // CHECK-NEXT: xqcics 0.2 'Xqcics' (Qualcomm uC Conditional Select Extension) // CHECK-NEXT: xqcicsr 0.2 'Xqcicsr' (Qualcomm uC CSR Extension) +// CHECK-NEXT: xqciint 0.2 'Xqciint' (Qualcomm uC Interrupts Extension) // CHECK-NEXT: xqcilsm 0.2 'Xqcilsm' (Qualcomm uC Load Store Multiple Extension) // CHECK-NEXT: xqcisls 0.2 'Xqcisls' (Qualcomm uC Scaled Load Store Extension) // CHECK-EMPTY: diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index 0dc63f34806b4..a1df0f7d686e6 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -447,6 +447,9 @@ The current vendor extensions supported are: ``experimental-Xqcicsr`` LLVM implements `version 0.2 of the Qualcomm uC CSR extension specification `__ by Qualcomm. All instructions are prefixed with `qc.` as described in the specification. These instructions are only available for riscv32. +``experimental-Xqciint`` + LLVM implements `version 0.2 of the Qualcomm uC Interrupts extension specification `__ by Qualcomm. All instructions are prefixed with `qc.` as described in the specification. These instructions are only available for riscv32. + ``experimental-Xqcilsm`` LLVM implements `version 0.2 of the Qualcomm uC Load Store Multiple extension specification `__ by Qualcomm. All instructions are prefixed with `qc.` as described in the specification. These instructions are only available for riscv32. diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index a3febf27ae833..d1032138a9db0 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -235,6 +235,8 @@ Changes to the RISC-V Backend extension. * Adds experimental assembler support for the Qualcomm uC 'Xqcicm` (Conditonal Move) extension. +* Adds experimental assembler support for the Qualcomm uC 'Xqciint` (Interrupts) + extension. * Added ``Sdext`` and ``Sdtrig`` extensions. 
Changes to the WebAssembly Backend diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 2205c67c2d21b..8177280044bf4 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -717,6 +717,7 @@ struct RISCVOperand final : public MCParsedAsmOperand { bool isUImm6() const { return IsUImm<6>(); } bool isUImm7() const { return IsUImm<7>(); } bool isUImm8() const { return IsUImm<8>(); } + bool isUImm10() const { return IsUImm<10>(); } bool isUImm11() const { return IsUImm<11>(); } bool isUImm16() const { return IsUImm<16>(); } bool isUImm20() const { return IsUImm<20>(); } @@ -1590,6 +1591,8 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return generateImmOutOfRangeError( Operands, ErrorInfo, -(1 << 9), (1 << 9) - 16, "immediate must be a multiple of 16 bytes and non-zero in the range"); + case Match_InvalidUImm10: + return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 10) - 1); case Match_InvalidUImm11: return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 11) - 1); case Match_InvalidSImm12: diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index a490910154eb4..971ef90c63327 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -700,6 +700,8 @@ DecodeStatus RISCVDisassembler::getInstruction32(MCInst &MI, uint64_t &Size, "Qualcomm uC Conditional Load Immediate custom opcode table"); TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXqcicm, DecoderTableXqcicm32, "Qualcomm uC Conditional Move custom opcode table"); + TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXqciint, DecoderTableXqciint32, + "Qualcomm uC Interrupts custom opcode table"); TRY_TO_DECODE(true, DecoderTable32, "RISCV32 table"); return MCDisassembler::Fail; @@ -732,6 +734,8 @@ DecodeStatus RISCVDisassembler::getInstruction16(MCInst &MI, uint64_t &Size, TRY_TO_DECODE_FEATURE( RISCV::FeatureVendorXqcicm, DecoderTableXqcicm16, "Qualcomm uC Conditional Move custom 16bit opcode table"); + TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXqciint, DecoderTableXqciint16, + "Qualcomm uC Interrupts custom 16bit opcode table"); TRY_TO_DECODE_AND_ADD_SP(STI.hasFeature(RISCV::FeatureVendorXwchc), DecoderTableXwchc16, "WCH QingKe XW custom opcode table"); diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index 7048e40822342..ab04b09a7ad15 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -313,6 +313,7 @@ enum OperandType : unsigned { OPERAND_UIMM8_LSB000, OPERAND_UIMM8_GE32, OPERAND_UIMM9_LSB000, + OPERAND_UIMM10, OPERAND_UIMM10_LSB00_NONZERO, OPERAND_UIMM11, OPERAND_UIMM12, diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 01bc5387e672e..f721d7148526b 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1302,6 +1302,14 @@ def HasVendorXqcicm AssemblerPredicate<(all_of FeatureVendorXqcicm), "'Xqcicm' (Qualcomm uC Conditional Move Extension)">; +def FeatureVendorXqciint + : RISCVExperimentalExtension<0, 2, "Qualcomm uC Interrupts Extension", + [FeatureStdExtZca]>; +def HasVendorXqciint + : Predicate<"Subtarget->hasVendorXqciint()">, + AssemblerPredicate<(all_of FeatureVendorXqciint), + 
"'Xqciint' (Qualcomm uC Interrupts Extension)">; + //===----------------------------------------------------------------------===// // LLVM specific features and extensions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index f24940795e433..1f7e8d87a11b0 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -2473,6 +2473,7 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, CASE_OPERAND_UIMM(6) CASE_OPERAND_UIMM(7) CASE_OPERAND_UIMM(8) + CASE_OPERAND_UIMM(10) CASE_OPERAND_UIMM(12) CASE_OPERAND_UIMM(20) // clang-format on diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 6f15646852f91..ce8c0c0a3d4e5 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -28,6 +28,8 @@ def uimm5gt3 : RISCVOp, ImmLeaf; + def uimm11 : RISCVUImmLeafOp<11>; //===----------------------------------------------------------------------===// @@ -166,6 +168,36 @@ class QCIMVCCI funct3, string opcodestr, DAGOperand immType> let rs2 = imm; } +let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in +class QCIRVInst16CI_RS1 funct5, string OpcodeStr> + : RVInst16CI<0b000, 0b10, (outs), (ins GPRNoX0:$rs1), OpcodeStr, "$rs1"> { + bits<5> rs1; + + let Inst{12} = 0b1; + let Inst{11-7} = rs1; + let Inst{6-2} = funct5{4-0}; +} + +let hasSideEffects = 1 in +class QCIRVInst16CI_NONE funct5, string OpcodeStr> + : RVInst16CI<0b000, 0b10, (outs), (ins), OpcodeStr, ""> { + let Inst{12} = 0b1; + let Inst{11-7} = funct5; + let Inst{6-2} = 0b00100; +} + +let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in +class QCIInt_IMM funct1, string opcodestr> + : RVInstIBase<0b000, OPC_SYSTEM, (outs), (ins uimm10:$imm10), opcodestr, + "$imm10"> { + bits<10> imm10; + + let rd = 0; + let rs1 = imm10{4-0}; + let Inst{31-25} = {0b110011, funct1}; + let Inst{24-20} = imm10{9-5}; +} + //===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// @@ -312,6 +344,38 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in def QC_MVGEUI : QCIMVCCI<0b111, "qc.mvgeui", uimm5>; } // Predicates = [HasVendorXqcicm, IsRV32], DecoderNamespace = "Xqcicm" +let Predicates = [HasVendorXqciint, IsRV32], DecoderNamespace = "Xqciint" in { + let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in + def QC_C_DIR : RVInst16CI<0b000, 0b10, (outs GPRNoX0:$rd), (ins), + "qc.c.dir", "$rd"> { + bits<5> rd; + + let Inst{12} = 0b1; + let Inst{11-7} = rd; + let Inst{6-2} = 0b00000; + } + + def QC_SETINTI : QCIInt_IMM<0b0, "qc.setinti">; + def QC_CLRINTI : QCIInt_IMM<0b1, "qc.clrinti">; + + def QC_C_EIR : QCIRVInst16CI_RS1<0b00001, "qc.c.eir">; + def QC_C_SETINT : QCIRVInst16CI_RS1<0b00010, "qc.c.setint">; + def QC_C_CLRINT : QCIRVInst16CI_RS1<0b00011, "qc.c.clrint">; + + let mayLoad = 0, mayStore = 0 in { + def QC_C_DI : QCIRVInst16CI_NONE<0b10110, "qc.c.di">; + def QC_C_EI : QCIRVInst16CI_NONE<0b10111, "qc.c.ei">; + } // mayLoad =0, mayStore = 0 + + let mayLoad = 1, mayStore = 1 in { + def QC_C_MIENTER : QCIRVInst16CI_NONE<0b10000, "qc.c.mienter">; + def QC_C_MIENTER_NEST : QCIRVInst16CI_NONE<0b10001, "qc.c.mienter.nest">; + } // mayLoad = 1, mayStore = 1 + + let mayLoad = 1, mayStore = 1, isReturn = 1, isTerminator = 1 in + def QC_C_MILEAVERET : 
QCIRVInst16CI_NONE<0b10100, "qc.c.mileaveret">; +} // Predicates = [HasVendorXqciint, IsRV32], DecoderNamespace = "Xqciint" + //===----------------------------------------------------------------------===// // Aliases //===----------------------------------------------------------------------===// diff --git a/llvm/lib/TargetParser/RISCVISAInfo.cpp b/llvm/lib/TargetParser/RISCVISAInfo.cpp index d6e1eac0d85af..1995931abfe41 100644 --- a/llvm/lib/TargetParser/RISCVISAInfo.cpp +++ b/llvm/lib/TargetParser/RISCVISAInfo.cpp @@ -742,8 +742,8 @@ Error RISCVISAInfo::checkDependency() { bool HasZvl = MinVLen != 0; bool HasZcmt = Exts.count("zcmt") != 0; static constexpr StringLiteral XqciExts[] = { - {"xqcia"}, {"xqciac"}, {"xqcicli"}, {"xqcicm"}, - {"xqcics"}, {"xqcicsr"}, {"xqcilsm"}, {"xqcisls"}}; + {"xqcia"}, {"xqciac"}, {"xqcicli"}, {"xqcicm"}, {"xqcics"}, + {"xqcicsr"}, {"xqciint"}, {"xqcilsm"}, {"xqcisls"}}; if (HasI && HasE) return getIncompatibleError("i", "e"); diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll index c0fcc6f611111..a09261609d844 100644 --- a/llvm/test/CodeGen/RISCV/attributes.ll +++ b/llvm/test/CodeGen/RISCV/attributes.ll @@ -87,6 +87,7 @@ ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicm %s -o - | FileCheck --check-prefix=RV32XQCICM %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcics %s -o - | FileCheck --check-prefix=RV32XQCICS %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicsr %s -o - | FileCheck --check-prefix=RV32XQCICSR %s +; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqciint %s -o - | FileCheck --check-prefix=RV32XQCIINT %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcilsm %s -o - | FileCheck --check-prefix=RV32XQCILSM %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcisls %s -o - | FileCheck --check-prefix=RV32XQCISLS %s ; RUN: llc -mtriple=riscv32 -mattr=+zaamo %s -o - | FileCheck --check-prefix=RV32ZAAMO %s @@ -401,6 +402,7 @@ ; RV32XQCICM: .attribute 5, "rv32i2p1_zca1p0_xqcicm0p2" ; RV32XQCICS: .attribute 5, "rv32i2p1_xqcics0p2" ; RV32XQCICSR: .attribute 5, "rv32i2p1_xqcicsr0p2" +; RV32XQCIINT: .attribute 5, "rv32i2p1_zca1p0_xqciint0p2" ; RV32XQCILSM: .attribute 5, "rv32i2p1_xqcilsm0p2" ; RV32XQCISLS: .attribute 5, "rv32i2p1_xqcisls0p2" ; RV32ZAAMO: .attribute 5, "rv32i2p1_zaamo1p0" diff --git a/llvm/test/MC/RISCV/xqciint-invalid.s b/llvm/test/MC/RISCV/xqciint-invalid.s new file mode 100644 index 0000000000000..e748109f41d82 --- /dev/null +++ b/llvm/test/MC/RISCV/xqciint-invalid.s @@ -0,0 +1,105 @@ +# Xqciint - Qualcomm uC Interrupts extension +# RUN: not llvm-mc -triple riscv32 -mattr=+experimental-xqciint < %s 2>&1 \ +# RUN: | FileCheck -check-prefixes=CHECK,CHECK-IMM %s +# RUN: not llvm-mc -triple riscv32 -mattr=-experimental-xqciint < %s 2>&1 \ +# RUN: | FileCheck -check-prefixes=CHECK,CHECK-EXT %s + +# CHECK-IMM: :[[@LINE+1]]:12: error: immediate must be an integer in the range [0, 1023] +qc.setinti 1025 + +# CHECK: :[[@LINE+1]]:16: error: invalid operand for instruction +qc.setinti 11, 12 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.setinti + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciint' (Qualcomm uC Interrupts Extension) +qc.setinti 10 + + +# CHECK-IMM: :[[@LINE+1]]:12: error: immediate must be an integer in the range [0, 1023] +qc.clrinti 2000 + +# CHECK: :[[@LINE+1]]:16: error: invalid operand for instruction +qc.clrinti 22, x4 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.clrinti + +# 
CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciint' (Qualcomm uC Interrupts Extension) +qc.clrinti 8 + + +# CHECK: :[[@LINE+1]]:13: error: invalid operand for instruction +qc.c.clrint 22 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.c.clrint + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciint' (Qualcomm uC Interrupts Extension) +qc.c.clrint x8 + + +# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction +qc.c.di 22 + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciint' (Qualcomm uC Interrupts Extension) +qc.c.di + + +# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction +qc.c.dir 22 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.c.dir + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciint' (Qualcomm uC Interrupts Extension) +qc.c.dir x8 + + +# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction +qc.c.ei 22 + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciint' (Qualcomm uC Interrupts Extension) +qc.c.ei + + +# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction +qc.c.eir 22 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.c.eir + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciint' (Qualcomm uC Interrupts Extension) +qc.c.eir x8 + + +# CHECK: :[[@LINE+1]]:19: error: invalid operand for instruction +qc.c.mienter.nest 22 + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciint' (Qualcomm uC Interrupts Extension) +qc.c.mienter.nest + + +# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction +qc.c.mienter 22 + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciint' (Qualcomm uC Interrupts Extension) +qc.c.mienter + + +# CHECK: :[[@LINE+1]]:17: error: invalid operand for instruction +qc.c.mileaveret 22 + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciint' (Qualcomm uC Interrupts Extension) +qc.c.mileaveret + + +# CHECK: :[[@LINE+1]]:13: error: invalid operand for instruction +qc.c.setint 22 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.c.setint + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciint' (Qualcomm uC Interrupts Extension) +qc.c.setint x8 diff --git a/llvm/test/MC/RISCV/xqciint-valid.s b/llvm/test/MC/RISCV/xqciint-valid.s new file mode 100644 index 0000000000000..c05a402b5b14a --- /dev/null +++ b/llvm/test/MC/RISCV/xqciint-valid.s @@ -0,0 +1,81 @@ +# Xqciint - Qualcomm uC Interrupts extension +# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqciint -riscv-no-aliases -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s +# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqciint < %s \ +# RUN: | llvm-objdump --mattr=+experimental-xqciint -M no-aliases --no-print-imm-hex -d - \ +# RUN: | FileCheck -check-prefix=CHECK-INST %s +# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqciint -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s +# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqciint < %s \ +# RUN: | llvm-objdump --mattr=+experimental-xqciint --no-print-imm-hex -d - \ +# RUN: | FileCheck -check-prefix=CHECK-INST %s + +# CHECK-INST: qc.setinti 500 +# CHECK-ENC: encoding: [0x73,0x00,0xfa,0xcc] +qc.setinti 500 + +# CHECK-INST: qc.setinti 0 +# CHECK-ENC: 
encoding: [0x73,0x00,0x00,0xcc]
+qc.setinti 0
+
+# CHECK-INST: qc.setinti 1023
+# CHECK-ENC: encoding: [0x73,0x80,0xff,0xcd]
+qc.setinti 1023
+
+
+# CHECK-INST: qc.clrinti 500
+# CHECK-ENC: encoding: [0x73,0x00,0xfa,0xce]
+qc.clrinti 500
+
+# CHECK-INST: qc.clrinti 1023
+# CHECK-ENC: encoding: [0x73,0x80,0xff,0xcf]
+qc.clrinti 1023
+
+# CHECK-INST: qc.clrinti 0
+# CHECK-ENC: encoding: [0x73,0x00,0x00,0xce]
+qc.clrinti 0
+
+
+# CHECK-INST: qc.c.clrint a0
+# CHECK-ENC: encoding: [0x0e,0x15]
+qc.c.clrint x10
+
+
+# CHECK-INST: qc.c.di
+# CHECK-ENC: encoding: [0x12,0x1b]
+qc.c.di
+
+
+# CHECK-INST: qc.c.dir a0
+# CHECK-ENC: encoding: [0x02,0x15]
+qc.c.dir x10
+
+
+# CHECK-INST: qc.c.ei
+# CHECK-ENC: encoding: [0x92,0x1b]
+qc.c.ei
+
+
+# CHECK-INST: qc.c.eir a0
+# CHECK-ENC: encoding: [0x06,0x15]
+qc.c.eir x10
+
+
+# CHECK-INST: qc.c.mienter.nest
+# CHECK-ENC: encoding: [0x92,0x18]
+qc.c.mienter.nest
+
+
+# CHECK-INST: qc.c.mienter
+# CHECK-ENC: encoding: [0x12,0x18]
+qc.c.mienter
+
+
+# CHECK-INST: qc.c.mileaveret
+# CHECK-ENC: encoding: [0x12,0x1a]
+qc.c.mileaveret
+
+
+# CHECK-INST: qc.c.setint a0
+# CHECK-ENC: encoding: [0x0a,0x15]
+qc.c.setint x10
diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
index 3955d36fce896..3a7ea4550d417 100644
--- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
+++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
@@ -656,7 +656,7 @@ TEST(ParseArchString, RejectsConflictingExtensions) {
   for (StringRef Input :
        {"rv64i_xqcisls0p2", "rv64i_xqcia0p2", "rv64i_xqciac0p2",
        "rv64i_xqcicsr0p2", "rv64i_xqcilsm0p2", "rv64i_xqcicm0p2",
-       "rv64i_xqcics0p2", "rv64i_xqcicli0p2"}) {
+       "rv64i_xqcics0p2", "rv64i_xqcicli0p2", "rv64i_xqciint0p2"}) {
     EXPECT_THAT(
         toString(RISCVISAInfo::parseArchString(Input, true).takeError()),
         ::testing::EndsWith(" is only supported for 'rv32'"));
@@ -1121,6 +1121,7 @@ Experimental extensions
     xqcicm               0.2
     xqcics               0.2
     xqcicsr              0.2
+    xqciint              0.2
     xqcilsm              0.2
     xqcisls              0.2
 
From 795e35a653b977bf637d1d049423adc8a63cd20d Mon Sep 17 00:00:00 2001
From: Sam Tebbs
Date: Mon, 13 Jan 2025 11:20:35 +0000
Subject: [PATCH 250/408] Reland "[LoopVectorizer] Add support for partial
 reductions" with non-phi operand fix. (#121744)

This relands the reverted #120721 with a fix for cases where neither
reduction operand is the reduction phi. Only
63114239cc8d26225a0ef9920baacfc7cc00fc58 is new on top of the reverted
PR.
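For context, the canonical shape this targets is an
extend-multiply-accumulate reduction like the dotp loops in the new
tests; in C++ terms, roughly the sketch below (illustrative source, not
code taken from the tests). The i8 inputs are extended to i32 and
multiplied, but with llvm.experimental.vector.partial.reduce.add the
reduction phi can stay at a fraction of the input VF, e.g. a <4 x i32>
accumulator fed by <16 x i32> products, which AArch64 can lower with
udot/sdot when the dotprod feature is available:

  #include <cstddef>
  #include <cstdint>

  // Extend -> multiply -> accumulate: the per-iteration inputs (i8) are
  // narrower than the accumulator (i32), so the vectorizer can use a wide
  // input VF while keeping the reduction phi at a quarter of that width.
  int32_t dotp(const uint8_t *a, const uint8_t *b, size_t n) {
    int32_t sum = 0;
    for (size_t i = 0; i < n; ++i)
      sum += int32_t(a[i]) * int32_t(b[i]); // zext, zext, mul, add
    return sum;
  }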
--------- Co-authored-by: Nicholas Guy --- .../llvm/Analysis/TargetTransformInfo.h | 44 + .../llvm/Analysis/TargetTransformInfoImpl.h | 9 + llvm/lib/Analysis/TargetTransformInfo.cpp | 18 + .../AArch64/AArch64TargetTransformInfo.h | 63 + .../Transforms/Vectorize/LoopVectorize.cpp | 141 +- .../Transforms/Vectorize/VPRecipeBuilder.h | 59 +- llvm/lib/Transforms/Vectorize/VPlan.h | 63 +- .../Transforms/Vectorize/VPlanAnalysis.cpp | 8 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 80 +- llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 + .../AArch64/fully-unrolled-cost.ll | 20 +- .../partial-reduce-dot-product-epilogue.ll | 213 ++ .../partial-reduce-dot-product-mixed.ll | 206 ++ .../partial-reduce-dot-product-neon.ll | 1375 +++++++++++ .../AArch64/partial-reduce-dot-product.ll | 2164 +++++++++++++++++ .../AArch64/partial-reduce-no-dotprod.ll | 61 + .../LoopVectorize/AArch64/vplan-printing.ll | 94 + 17 files changed, 4588 insertions(+), 31 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 752313ab15858..fe13fc676e303 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -211,6 +211,12 @@ typedef TargetTransformInfo TTI; /// for IR-level transformations. class TargetTransformInfo { public: + enum PartialReductionExtendKind { PR_None, PR_SignExtend, PR_ZeroExtend }; + + /// Get the kind of extension that an instruction represents. + static PartialReductionExtendKind + getPartialReductionExtendKind(Instruction *I); + /// Construct a TTI object using a type implementing the \c Concept /// API below. /// @@ -1280,6 +1286,20 @@ class TargetTransformInfo { /// \return if target want to issue a prefetch in address space \p AS. bool shouldPrefetchAddressSpace(unsigned AS) const; + /// \return The cost of a partial reduction, which is a reduction from a + /// vector to another vector with fewer elements of larger size. They are + /// represented by the llvm.experimental.partial.reduce.add intrinsic, which + /// takes an accumulator and a binary operation operand that itself is fed by + /// two extends. An example of an operation that uses a partial reduction is a + /// dot product, which reduces two vectors to another of 4 times fewer and 4 + /// times larger elements. + InstructionCost + getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, + Type *AccumType, ElementCount VF, + PartialReductionExtendKind OpAExtend, + PartialReductionExtendKind OpBExtend, + std::optional BinOp = std::nullopt) const; + /// \return The maximum interleave factor that any transform should try to /// perform for this target. This number depends on the level of parallelism /// and the number of execution units in the CPU. @@ -2107,6 +2127,20 @@ class TargetTransformInfo::Concept { /// \return if target want to issue a prefetch in address space \p AS. 
virtual bool shouldPrefetchAddressSpace(unsigned AS) const = 0; + /// \return The cost of a partial reduction, which is a reduction from a + /// vector to another vector with fewer elements of larger size. They are + /// represented by the llvm.experimental.partial.reduce.add intrinsic, which + /// takes an accumulator and a binary operation operand that itself is fed by + /// two extends. An example of an operation that uses a partial reduction is a + /// dot product, which reduces two vectors to another of 4 times fewer and 4 + /// times larger elements. + virtual InstructionCost + getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, + Type *AccumType, ElementCount VF, + PartialReductionExtendKind OpAExtend, + PartialReductionExtendKind OpBExtend, + std::optional BinOp) const = 0; + virtual unsigned getMaxInterleaveFactor(ElementCount VF) = 0; virtual InstructionCost getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, @@ -2786,6 +2820,16 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.shouldPrefetchAddressSpace(AS); } + InstructionCost getPartialReductionCost( + unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, + ElementCount VF, PartialReductionExtendKind OpAExtend, + PartialReductionExtendKind OpBExtend, + std::optional BinOp = std::nullopt) const override { + return Impl.getPartialReductionCost(Opcode, InputTypeA, InputTypeB, + AccumType, VF, OpAExtend, OpBExtend, + BinOp); + } + unsigned getMaxInterleaveFactor(ElementCount VF) override { return Impl.getMaxInterleaveFactor(VF); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 9c74b2a0c31df..7ac3063ca9a37 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -585,6 +585,15 @@ class TargetTransformInfoImplBase { bool enableWritePrefetching() const { return false; } bool shouldPrefetchAddressSpace(unsigned AS) const { return !AS; } + InstructionCost + getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, + Type *AccumType, ElementCount VF, + TTI::PartialReductionExtendKind OpAExtend, + TTI::PartialReductionExtendKind OpBExtend, + std::optional BinOp = std::nullopt) const { + return InstructionCost::getInvalid(); + } + unsigned getMaxInterleaveFactor(ElementCount VF) const { return 1; } InstructionCost getArithmeticInstrCost( diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index b32dffa9f0fe8..df42dc2746daf 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -863,6 +863,15 @@ bool TargetTransformInfo::shouldPrefetchAddressSpace(unsigned AS) const { return TTIImpl->shouldPrefetchAddressSpace(AS); } +InstructionCost TargetTransformInfo::getPartialReductionCost( + unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, + ElementCount VF, PartialReductionExtendKind OpAExtend, + PartialReductionExtendKind OpBExtend, std::optional BinOp) const { + return TTIImpl->getPartialReductionCost(Opcode, InputTypeA, InputTypeB, + AccumType, VF, OpAExtend, OpBExtend, + BinOp); +} + unsigned TargetTransformInfo::getMaxInterleaveFactor(ElementCount VF) const { return TTIImpl->getMaxInterleaveFactor(VF); } @@ -974,6 +983,15 @@ InstructionCost TargetTransformInfo::getShuffleCost( return Cost; } +TargetTransformInfo::PartialReductionExtendKind 
+TargetTransformInfo::getPartialReductionExtendKind(Instruction *I) { + if (isa(I)) + return PR_SignExtend; + if (isa(I)) + return PR_ZeroExtend; + return PR_None; +} + TTI::CastContextHint TargetTransformInfo::getCastContextHint(const Instruction *I) { if (!I) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 214fb4e352eeb..8e7e590c173ff 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -23,6 +23,7 @@ #include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/IR/Function.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/Support/InstructionCost.h" #include #include @@ -357,6 +358,68 @@ class AArch64TTIImpl : public BasicTTIImplBase { return BaseT::isLegalNTLoad(DataType, Alignment); } + InstructionCost + getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, + Type *AccumType, ElementCount VF, + TTI::PartialReductionExtendKind OpAExtend, + TTI::PartialReductionExtendKind OpBExtend, + std::optional BinOp) const { + + InstructionCost Invalid = InstructionCost::getInvalid(); + InstructionCost Cost(TTI::TCC_Basic); + + if (Opcode != Instruction::Add) + return Invalid; + + if (InputTypeA != InputTypeB) + return Invalid; + + EVT InputEVT = EVT::getEVT(InputTypeA); + EVT AccumEVT = EVT::getEVT(AccumType); + + if (VF.isScalable() && !ST->isSVEorStreamingSVEAvailable()) + return Invalid; + if (VF.isFixed() && (!ST->isNeonAvailable() || !ST->hasDotProd())) + return Invalid; + + if (InputEVT == MVT::i8) { + switch (VF.getKnownMinValue()) { + default: + return Invalid; + case 8: + if (AccumEVT == MVT::i32) + Cost *= 2; + else if (AccumEVT != MVT::i64) + return Invalid; + break; + case 16: + if (AccumEVT == MVT::i64) + Cost *= 2; + else if (AccumEVT != MVT::i32) + return Invalid; + break; + } + } else if (InputEVT == MVT::i16) { + // FIXME: Allow i32 accumulator but increase cost, as we would extend + // it to i64. + if (VF.getKnownMinValue() != 8 || AccumEVT != MVT::i64) + return Invalid; + } else + return Invalid; + + // AArch64 supports lowering mixed extensions to a usdot but only if the + // i8mm or sve/streaming features are available. + if (OpAExtend == TTI::PR_None || OpBExtend == TTI::PR_None || + (OpAExtend != OpBExtend && !ST->hasMatMulInt8() && + !ST->isSVEorStreamingSVEAvailable())) + return Invalid; + + if (!BinOp || *BinOp != Instruction::Mul) + return Invalid; + + return Cost; + } + bool enableOrderedReductions() const { return true; } InstructionCost getInterleavedMemoryOpCost( diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index d32a463a996c4..0a13ce902795e 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7531,6 +7531,10 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan, } continue; } + // The VPlan-based cost model is more accurate for partial reduction and + // comparing against the legacy cost isn't desirable. + if (isa(&R)) + return true; if (Instruction *UI = GetInstructionForCost(&R)) SeenInstrs.insert(UI); } @@ -8751,6 +8755,105 @@ VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I, return Recipe; } +/// Find all possible partial reductions in the loop and track all of those that +/// are valid so recipes can be formed later. +void VPRecipeBuilder::collectScaledReductions(VFRange &Range) { + // Find all possible partial reductions. 
+ SmallVector, 1> + PartialReductionChains; + for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) + if (std::optional> Pair = + getScaledReduction(Phi, RdxDesc, Range)) + PartialReductionChains.push_back(*Pair); + + // A partial reduction is invalid if any of its extends are used by + // something that isn't another partial reduction. This is because the + // extends are intended to be lowered along with the reduction itself. + + // Build up a set of partial reduction bin ops for efficient use checking. + SmallSet PartialReductionBinOps; + for (const auto &[PartialRdx, _] : PartialReductionChains) + PartialReductionBinOps.insert(PartialRdx.BinOp); + + auto ExtendIsOnlyUsedByPartialReductions = + [&PartialReductionBinOps](Instruction *Extend) { + return all_of(Extend->users(), [&](const User *U) { + return PartialReductionBinOps.contains(U); + }); + }; + + // Check if each use of a chain's two extends is a partial reduction + // and only add those that don't have non-partial reduction users. + for (auto Pair : PartialReductionChains) { + PartialReductionChain Chain = Pair.first; + if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) && + ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB)) + ScaledReductionExitInstrs.insert(std::make_pair(Chain.Reduction, Pair)); + } +} + +std::optional> +VPRecipeBuilder::getScaledReduction(PHINode *PHI, + const RecurrenceDescriptor &Rdx, + VFRange &Range) { + // TODO: Allow scaling reductions when predicating. The select at + // the end of the loop chooses between the phi value and most recent + // reduction result, both of which have different VFs to the active lane + // mask when scaling. + if (CM.blockNeedsPredicationForAnyReason(Rdx.getLoopExitInstr()->getParent())) + return std::nullopt; + + auto *Update = dyn_cast(Rdx.getLoopExitInstr()); + if (!Update) + return std::nullopt; + + Value *Op = Update->getOperand(0); + Value *PhiOp = Update->getOperand(1); + if (Op == PHI) { + Op = Update->getOperand(1); + PhiOp = Update->getOperand(0); + } + if (PhiOp != PHI) + return std::nullopt; + + auto *BinOp = dyn_cast(Op); + if (!BinOp || !BinOp->hasOneUse()) + return std::nullopt; + + using namespace llvm::PatternMatch; + Value *A, *B; + if (!match(BinOp->getOperand(0), m_ZExtOrSExt(m_Value(A))) || + !match(BinOp->getOperand(1), m_ZExtOrSExt(m_Value(B)))) + return std::nullopt; + + Instruction *ExtA = cast(BinOp->getOperand(0)); + Instruction *ExtB = cast(BinOp->getOperand(1)); + + TTI::PartialReductionExtendKind OpAExtend = + TargetTransformInfo::getPartialReductionExtendKind(ExtA); + TTI::PartialReductionExtendKind OpBExtend = + TargetTransformInfo::getPartialReductionExtendKind(ExtB); + + PartialReductionChain Chain(Rdx.getLoopExitInstr(), ExtA, ExtB, BinOp); + + unsigned TargetScaleFactor = + PHI->getType()->getPrimitiveSizeInBits().getKnownScalarFactor( + A->getType()->getPrimitiveSizeInBits()); + + if (LoopVectorizationPlanner::getDecisionAndClampRange( + [&](ElementCount VF) { + InstructionCost Cost = TTI->getPartialReductionCost( + Update->getOpcode(), A->getType(), B->getType(), PHI->getType(), + VF, OpAExtend, OpBExtend, + std::make_optional(BinOp->getOpcode())); + return Cost.isValid(); + }, + Range)) + return std::make_pair(Chain, TargetScaleFactor); + + return std::nullopt; +} + VPRecipeBase * VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, ArrayRef Operands, @@ -8775,9 +8878,14 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, Legal->getReductionVars().find(Phi)->second; assert(RdxDesc.getRecurrenceStartValue() 
== Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); - PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, - CM.isInLoopReduction(Phi), - CM.useOrderedReductions(RdxDesc)); + + // If the PHI is used by a partial reduction, set the scale factor. + std::optional> Pair = + getScaledReductionForInstr(RdxDesc.getLoopExitInstr()); + unsigned ScaleFactor = Pair ? Pair->second : 1; + PhiRecipe = new VPReductionPHIRecipe( + Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi), + CM.useOrderedReductions(RdxDesc), ScaleFactor); } else { // TODO: Currently fixed-order recurrences are modeled as chains of // first-order recurrences. If there are no users of the intermediate @@ -8809,6 +8917,9 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, if (isa(Instr) || isa(Instr)) return tryToWidenMemory(Instr, Operands, Range); + if (getScaledReductionForInstr(Instr)) + return tryToCreatePartialReduction(Instr, Operands); + if (!shouldWiden(Instr, Range)) return nullptr; @@ -8829,6 +8940,21 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, return tryToWiden(Instr, Operands, VPBB); } +VPRecipeBase * +VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, + ArrayRef Operands) { + assert(Operands.size() == 2 && + "Unexpected number of operands for partial reduction"); + + VPValue *BinOp = Operands[0]; + VPValue *Phi = Operands[1]; + if (isa(BinOp->getDefiningRecipe())) + std::swap(BinOp, Phi); + + return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp, Phi, + Reduction); +} + void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF) { assert(OrigLoop->isInnermost() && "Inner loop expected."); @@ -9252,7 +9378,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None; addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL); - VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder); + VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE, + Builder); // --------------------------------------------------------------------------- // Pre-construction: record ingredients whose recipes we'll need to further @@ -9298,6 +9425,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { bool NeedsBlends = BB != HeaderBB && !BB->phis().empty(); return Legal->blockNeedsPredication(BB) || NeedsBlends; }); + + RecipeBuilder.collectScaledReductions(Range); + auto *MiddleVPBB = Plan->getMiddleBlock(); VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi(); for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { @@ -9521,7 +9651,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { // Collect mapping of IR header phis to header phi recipes, to be used in // addScalarResumePhis. 
- VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder); + VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE, + Builder); for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { if (isa(&R)) continue; diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index 5d4a3b555981c..cf653e2d3e658 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -21,8 +21,28 @@ namespace llvm { class LoopVectorizationLegality; class LoopVectorizationCostModel; class TargetLibraryInfo; +class TargetTransformInfo; struct HistogramInfo; +/// A chain of instructions that form a partial reduction. +/// Designed to match: reduction_bin_op (bin_op (extend (A), (extend (B))), +/// accumulator). +struct PartialReductionChain { + PartialReductionChain(Instruction *Reduction, Instruction *ExtendA, + Instruction *ExtendB, Instruction *BinOp) + : Reduction(Reduction), ExtendA(ExtendA), ExtendB(ExtendB), BinOp(BinOp) { + } + /// The top-level binary operation that forms the reduction to a scalar + /// after the loop body. + Instruction *Reduction; + /// The extension of each of the inner binary operation's operands. + Instruction *ExtendA; + Instruction *ExtendB; + + /// The binary operation using the extends that is then reduced. + Instruction *BinOp; +}; + /// Helper class to create VPRecipies from IR instructions. class VPRecipeBuilder { /// The VPlan new recipes are added to. @@ -34,6 +54,9 @@ class VPRecipeBuilder { /// Target Library Info. const TargetLibraryInfo *TLI; + // Target Transform Info. + const TargetTransformInfo *TTI; + /// The legality analysis. LoopVectorizationLegality *Legal; @@ -63,6 +86,11 @@ class VPRecipeBuilder { /// created. SmallVector PhisToFix; + /// The set of reduction exit instructions that will be scaled to + /// a smaller VF via partial reductions, paired with the scaling factor. + DenseMap> + ScaledReductionExitInstrs; + /// Check if \p I can be widened at the start of \p Range and possibly /// decrease the range such that the returned value holds for the entire \p /// Range. The function should not be called for memory instructions or calls. @@ -111,13 +139,35 @@ class VPRecipeBuilder { VPHistogramRecipe *tryToWidenHistogram(const HistogramInfo *HI, ArrayRef Operands); + /// Examines reduction operations to see if the target can use a cheaper + /// operation with a wider per-iteration input VF and narrower PHI VF. + /// Returns null if no scaled reduction was found, otherwise a pair with a + /// struct containing reduction information and the scaling factor between the + /// number of elements in the input and output. + std::optional> + getScaledReduction(PHINode *PHI, const RecurrenceDescriptor &Rdx, + VFRange &Range); + public: VPRecipeBuilder(VPlan &Plan, Loop *OrigLoop, const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, LoopVectorizationLegality *Legal, LoopVectorizationCostModel &CM, PredicatedScalarEvolution &PSE, VPBuilder &Builder) - : Plan(Plan), OrigLoop(OrigLoop), TLI(TLI), Legal(Legal), CM(CM), - PSE(PSE), Builder(Builder) {} + : Plan(Plan), OrigLoop(OrigLoop), TLI(TLI), TTI(TTI), Legal(Legal), + CM(CM), PSE(PSE), Builder(Builder) {} + + std::optional> + getScaledReductionForInstr(const Instruction *ExitInst) { + auto It = ScaledReductionExitInstrs.find(ExitInst); + return It == ScaledReductionExitInstrs.end() + ? 
std::nullopt + : std::make_optional(It->second); + } + + /// Find all possible partial reductions in the loop and track all of those + /// that are valid so recipes can be formed later. + void collectScaledReductions(VFRange &Range); /// Create and return a widened recipe for \p I if one can be created within /// the given VF \p Range. @@ -125,6 +175,11 @@ class VPRecipeBuilder { ArrayRef Operands, VFRange &Range, VPBasicBlock *VPBB); + /// Create and return a partial reduction recipe for a reduction instruction + /// along with binary operation and reduction phi operands. + VPRecipeBase *tryToCreatePartialReduction(Instruction *Reduction, + ArrayRef Operands); + /// Set the recipe created for given ingredient. void setRecipe(Instruction *I, VPRecipeBase *R) { assert(!Ingredient2Recipe.contains(I) && diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index cfbb4ad32d681..1da185f9cfdf4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -883,6 +883,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { case VPRecipeBase::VPWidenPointerInductionSC: case VPRecipeBase::VPReductionPHISC: case VPRecipeBase::VPScalarCastSC: + case VPRecipeBase::VPPartialReductionSC: return true; case VPRecipeBase::VPBranchOnMaskSC: case VPRecipeBase::VPInterleaveSC: @@ -2384,23 +2385,28 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe, /// The phi is part of an ordered reduction. Requires IsInLoop to be true. bool IsOrdered; + /// When expanding the reduction PHI, the plan's VF element count is divided + /// by this factor to form the reduction phi's VF. + unsigned VFScaleFactor = 1; + public: /// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p /// RdxDesc. VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc, VPValue &Start, bool IsInLoop = false, - bool IsOrdered = false) + bool IsOrdered = false, unsigned VFScaleFactor = 1) : VPHeaderPHIRecipe(VPDef::VPReductionPHISC, Phi, &Start), - RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered) { + RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered), + VFScaleFactor(VFScaleFactor) { assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop"); } ~VPReductionPHIRecipe() override = default; VPReductionPHIRecipe *clone() override { - auto *R = - new VPReductionPHIRecipe(cast(getUnderlyingInstr()), RdxDesc, - *getOperand(0), IsInLoop, IsOrdered); + auto *R = new VPReductionPHIRecipe(cast(getUnderlyingInstr()), + RdxDesc, *getOperand(0), IsInLoop, + IsOrdered, VFScaleFactor); R->addOperand(getBackedgeValue()); return R; } @@ -2431,6 +2437,51 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe, bool isInLoop() const { return IsInLoop; } }; +/// A recipe for forming partial reductions. In the loop, an accumulator and +/// vector operand are added together and passed to the next iteration as the +/// next accumulator. After the loop body, the accumulator is reduced to a +/// scalar value. 
+class VPPartialReductionRecipe : public VPSingleDefRecipe { + unsigned Opcode; + +public: + VPPartialReductionRecipe(Instruction *ReductionInst, VPValue *Op0, + VPValue *Op1) + : VPPartialReductionRecipe(ReductionInst->getOpcode(), Op0, Op1, + ReductionInst) {} + VPPartialReductionRecipe(unsigned Opcode, VPValue *Op0, VPValue *Op1, + Instruction *ReductionInst = nullptr) + : VPSingleDefRecipe(VPDef::VPPartialReductionSC, + ArrayRef({Op0, Op1}), ReductionInst), + Opcode(Opcode) { + assert(isa(getOperand(1)->getDefiningRecipe()) && + "Unexpected operand order for partial reduction recipe"); + } + ~VPPartialReductionRecipe() override = default; + + VPPartialReductionRecipe *clone() override { + return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1)); + } + + VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC) + + /// Generate the reduction in the loop. + void execute(VPTransformState &State) override; + + /// Return the cost of this VPPartialReductionRecipe. + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override; + + /// Get the binary op's opcode. + unsigned getOpcode() const { return Opcode; } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif +}; + /// A recipe for vectorizing a phi-node as a sequence of mask-based select /// instructions. class VPBlendRecipe : public VPSingleDefRecipe { @@ -2640,7 +2691,7 @@ class VPReductionRecipe : public VPSingleDefRecipe { return R && classof(R); } - /// Generate the reduction in the loop + /// Generate the reduction in the loop. void execute(VPTransformState &State) override; /// Return the cost of VPReductionRecipe. diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 35497a7431f76..8fea2c6fd33b6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -231,10 +231,10 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { [](const auto *R) { return R->getScalarType(); }) .Case( - [this](const VPRecipeBase *R) { - return inferScalarType(R->getOperand(0)); - }) + VPReverseVectorPointerRecipe, VPWidenCanonicalIVRecipe, + VPPartialReductionRecipe>([this](const VPRecipeBase *R) { + return inferScalarType(R->getOperand(0)); + }) .Case( diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index e54df8bdeac55..4057a51155ece 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -277,6 +277,72 @@ InstructionCost VPRecipeBase::computeCost(ElementCount VF, llvm_unreachable("subclasses should implement computeCost"); } +InstructionCost +VPPartialReductionRecipe::computeCost(ElementCount VF, + VPCostContext &Ctx) const { + std::optional Opcode = std::nullopt; + VPRecipeBase *BinOpR = getOperand(0)->getDefiningRecipe(); + if (auto *WidenR = dyn_cast(BinOpR)) + Opcode = std::make_optional(WidenR->getOpcode()); + + VPRecipeBase *ExtAR = BinOpR->getOperand(0)->getDefiningRecipe(); + VPRecipeBase *ExtBR = BinOpR->getOperand(1)->getDefiningRecipe(); + + auto *PhiType = Ctx.Types.inferScalarType(getOperand(1)); + auto *InputTypeA = Ctx.Types.inferScalarType(ExtAR ? ExtAR->getOperand(0) + : BinOpR->getOperand(0)); + auto *InputTypeB = Ctx.Types.inferScalarType(ExtBR ? 
ExtBR->getOperand(0) + : BinOpR->getOperand(1)); + + auto GetExtendKind = [](VPRecipeBase *R) { + // The extend could come from outside the plan. + if (!R) + return TargetTransformInfo::PR_None; + auto *WidenCastR = dyn_cast(R); + if (!WidenCastR) + return TargetTransformInfo::PR_None; + if (WidenCastR->getOpcode() == Instruction::CastOps::ZExt) + return TargetTransformInfo::PR_ZeroExtend; + if (WidenCastR->getOpcode() == Instruction::CastOps::SExt) + return TargetTransformInfo::PR_SignExtend; + return TargetTransformInfo::PR_None; + }; + + return Ctx.TTI.getPartialReductionCost(getOpcode(), InputTypeA, InputTypeB, + PhiType, VF, GetExtendKind(ExtAR), + GetExtendKind(ExtBR), Opcode); +} + +void VPPartialReductionRecipe::execute(VPTransformState &State) { + State.setDebugLocFrom(getDebugLoc()); + auto &Builder = State.Builder; + + assert(getOpcode() == Instruction::Add && + "Unhandled partial reduction opcode"); + + Value *BinOpVal = State.get(getOperand(0)); + Value *PhiVal = State.get(getOperand(1)); + assert(PhiVal && BinOpVal && "Phi and Mul must be set"); + + Type *RetTy = PhiVal->getType(); + + CallInst *V = Builder.CreateIntrinsic( + RetTy, Intrinsic::experimental_vector_partial_reduce_add, + {PhiVal, BinOpVal}, nullptr, "partial.reduce"); + + State.set(this, V); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "PARTIAL-REDUCE "; + printAsOperand(O, SlotTracker); + O << " = " << Instruction::getOpcodeName(getOpcode()) << " "; + printOperands(O, SlotTracker); +} +#endif + FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const { assert(OpType == OperationType::FPMathOp && "recipe doesn't have fast math flags"); @@ -3356,6 +3422,10 @@ void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent, void VPReductionPHIRecipe::execute(VPTransformState &State) { auto &Builder = State.Builder; + // If this phi is fed by a scaled reduction then it should output a + // vector with fewer elements than the VF. + ElementCount VF = State.VF.divideCoefficientBy(VFScaleFactor); + // Reductions do not have to start at zero. They can start with // any loop invariant values. VPValue *StartVPV = getStartValue(); @@ -3366,8 +3436,8 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { // stage #1: We create a new vector PHI node with no incoming edges. We'll use // this value when we vectorize all of the instructions that use the PHI. bool ScalarPHI = State.VF.isScalar() || IsInLoop; - Type *VecTy = ScalarPHI ? StartV->getType() - : VectorType::get(StartV->getType(), State.VF); + Type *VecTy = + ScalarPHI ? StartV->getType() : VectorType::get(StartV->getType(), VF); BasicBlock *HeaderBB = State.CFG.PrevBB; assert(State.CurrentParentLoop->getHeader() == HeaderBB && @@ -3417,13 +3487,13 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { // Create start and identity vector values for the reduction in the // preheader. // TODO: Introduce recipes in VPlan preheader to create initial values. 
- Iden = Builder.CreateVectorSplat(State.VF, Iden); + Iden = Builder.CreateVectorSplat(VF, Iden); IRBuilderBase::InsertPointGuard IPBuilder(Builder); Builder.SetInsertPoint(VectorPH->getTerminator()); Constant *Zero = Builder.getInt32(0); StartV = Builder.CreateInsertElement(Iden, StartV, Zero); } else { - Iden = Builder.CreateVectorSplat(State.VF, Iden); + Iden = Builder.CreateVectorSplat(VF, Iden); } } } @@ -3441,6 +3511,8 @@ void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent, printAsOperand(O, SlotTracker); O << " = phi "; printOperands(O, SlotTracker); + if (VFScaleFactor != 1) + O << " (VF scaled by 1/" << VFScaleFactor << ")"; } #endif diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 957a602091c73..7aaf4002b8b3e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -329,6 +329,7 @@ class VPDef { VPInterleaveSC, VPReductionEVLSC, VPReductionSC, + VPPartialReductionSC, VPReplicateSC, VPScalarCastSC, VPScalarIVStepsSC, diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll index 1cfb507a74344..c3e8c895fce24 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll @@ -11,10 +11,10 @@ define i64 @test(ptr %a, ptr %b) #0 { ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16 ; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK: Cost for VF 8: 26 +; CHECK: Cost for VF 8: 30 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK: Cost for VF 16: 48 +; CHECK: Cost for VF 16: 56 ; CHECK: LV: Selecting VF: 16 entry: br label %for.body @@ -31,8 +31,8 @@ for.body: ; preds = %entry, %for.body %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %i.iv %1 = load i8, ptr %arrayidx2, align 1 %conv3 = zext i8 %1 to i64 - %mul = mul nuw nsw i64 %conv3, %conv - %add = add i64 %mul, %sum + %div = udiv i64 %conv3, %conv + %add = add i64 %div, %sum %i.iv.next = add nuw nsw i64 %i.iv, 1 %exitcond.not = icmp eq i64 %i.iv.next, 16 br i1 %exitcond.not, label %exit, label %for.body @@ -45,11 +45,11 @@ define i64 @test_external_iv_user(ptr %a, ptr %b) #0 { ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16 ; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK: Cost for VF 8: 26 +; CHECK: Cost for VF 8: 30 ; CHECK-NEXT: Cost of 1 for VF 16: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK: Cost for VF 16: 49 +; CHECK: Cost for VF 16: 57 ; CHECK: LV: Selecting VF: vscale x 2 entry: br label %for.body @@ -64,8 +64,8 @@ for.body: ; preds = %entry, %for.body %arrayidx2 = getelementptr inbounds nuw i8, ptr %b, i64 
%i.iv.next %1 = load i8, ptr %arrayidx2, align 1 %conv3 = zext i8 %1 to i64 - %mul = mul nuw nsw i64 %conv3, %conv - %add = add i64 %sum, %mul + %div = udiv i64 %conv3, %conv + %add = add i64 %sum, %div %exitcond.not = icmp eq i64 %i.iv.next, 16 br i1 %exitcond.not, label %exit, label %for.body @@ -82,11 +82,11 @@ define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 { ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ] ; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16 ; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK: Cost for VF 8: 27 +; CHECK: Cost for VF 8: 24 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK: Cost for VF 16: 48 +; CHECK: Cost for VF 16: 42 ; CHECK: LV: Selecting VF: 16 entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll new file mode 100644 index 0000000000000..5cc00daab7ce5 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll @@ -0,0 +1,213 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -mattr=+dotprod -passes=loop-vectorize -force-vector-interleave=1 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +define i32 @dotp(ptr %a, ptr %b) #0 { +; CHECK-LABEL: define i32 @dotp( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: iter.check: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> 
@llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP9]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP13]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP15]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0 +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ [[TMP18]], [[VEC_EPILOG_PH]] ], [ [[TMP27:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX2]], 0 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP21]], align 1 +; CHECK-NEXT: [[TMP22:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP19]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP23]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 4 x i8>, ptr [[TMP24]], align 1 +; CHECK-NEXT: [[TMP25:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD5]] to <vscale x 4 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = mul <vscale x 4 x i32> [[TMP25]], [[TMP22]] +; CHECK-NEXT: [[TMP27]] = add <vscale x 4 x i32> [[TMP26]], [[VEC_PHI3]] +; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX2]], [[TMP17]] +; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP27]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b
to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + +define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 { +; CHECK-LABEL: define void @dotp_small_epilogue_vf( +; CHECK-SAME: i64 [[IDX_NEG:%.*]], i8 [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: iter.check: +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 1, [[IDX_NEG]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[ENTRY:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16 +; CHECK-NEXT: [[IV_NEXT:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = sext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i64> +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr null, align 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <16 x i8> poison, i8 [[TMP2]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT2]], <16 x i8> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[BROADCAST_SPLAT3]] to <16 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i64> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP4]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[IV_NEXT]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[ADD:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[IV_NEXT]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[IND_END6:%.*]] = add i64 [[IDX_NEG]], [[IV_NEXT]] +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[IV_NEXT]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT]], [[WHILE_BODY]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[ACCUM:%.*]] = phi i64 [ [[ADD]], [[WHILE_BODY]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[N_MOD_VF4:%.*]] = urem i64 [[TMP0]], 8 +; CHECK-NEXT: [[N_VEC5:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF4]] +; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[IDX_NEG]], [[N_VEC5]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <8 x i8> 
poison, i8 [[A]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT7]], <8 x i8> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = sext <8 x i8> [[BROADCAST_SPLAT8]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <1 x i64> zeroinitializer, i64 [[ACCUM]], i32 0 +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[IV]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI10:%.*]] = phi <1 x i64> [ [[TMP8]], [[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr null, align 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <8 x i8> poison, i8 [[TMP9]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT11]], <8 x i8> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = sext <8 x i8> [[BROADCAST_SPLAT12]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = mul <8 x i64> [[TMP10]], [[TMP7]] +; CHECK-NEXT: [[PARTIAL_REDUCE13]] = call <1 x i64> @llvm.experimental.vector.partial.reduce.add.v1i64.v8i64(<1 x i64> [[VEC_PHI10]], <8 x i64> [[TMP11]]) +; CHECK-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX9]], 8 +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC5]] +; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> [[PARTIAL_REDUCE13]]) +; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]] +; CHECK-NEXT: br i1 [[CMP_N15]], label [[WHILE_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IDX_NEG]], [[ITER_CHECK:%.*]] ], [ [[IND_END6]], [[WHILE_BODY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL16:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[IV_NEXT]], [[WHILE_BODY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX17:%.*]] = phi i64 [ [[TMP13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[ADD]], [[WHILE_BODY]] ] +; CHECK-NEXT: br label [[WHILE_BODY1:%.*]] +; CHECK: while.body: +; CHECK-NEXT: [[IV_NEG:%.*]] = phi i64 [ [[IV_NEG_NEXT:%.*]], [[WHILE_BODY1]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[IV_NEXT1:%.*]], [[WHILE_BODY1]] ], [ [[BC_RESUME_VAL16]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[ACCUM1:%.*]] = phi i64 [ [[ADD1:%.*]], [[WHILE_BODY1]] ], [ [[BC_MERGE_RDX17]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[IV_NEG_NEXT]] = add i64 [[IV_NEG]], 1 +; CHECK-NEXT: [[EXT_A:%.*]] = sext i8 [[A]] to i64 +; CHECK-NEXT: [[IV_NEXT1]] = add i64 [[IV1]], 1 +; CHECK-NEXT: [[B:%.*]] = load i8, ptr null, align 1 +; CHECK-NEXT: [[EXT_B:%.*]] = sext i8 [[B]] to i64 +; CHECK-NEXT: [[MUL:%.*]] = mul i64 [[EXT_B]], [[EXT_A]] +; CHECK-NEXT: [[ADD1]] = add i64 [[MUL]], [[ACCUM1]] +; CHECK-NEXT: [[CMP_IV_NEG:%.*]] = icmp ugt i64 [[IV_NEG]], 0 +; CHECK-NEXT: [[CMP_IV:%.*]] = icmp ne i64 [[IV1]], -1 +; CHECK-NEXT: [[EXITCOND:%.*]] = and i1 [[CMP_IV_NEG]], [[CMP_IV]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_BODY1]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: while.end.loopexit: +; CHECK-NEXT: [[RESULT:%.*]] = phi i64 [ [[ADD1]], [[WHILE_BODY1]] ], [ 
[[ADD]], [[MIDDLE_BLOCK]] ], [ [[TMP13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret void +; +entry: + br label %while.body + +while.body: ; preds = %while.body, %entry + %iv.neg = phi i64 [ %iv.neg.next, %while.body ], [ %idx.neg, %entry ] + %iv = phi i64 [ %iv.next, %while.body ], [ 0, %entry ] + %accum = phi i64 [ %add, %while.body ], [ 0, %entry ] + %iv.neg.next = add i64 %iv.neg, 1 + %ext.a = sext i8 %a to i64 + %iv.next = add i64 %iv, 1 + %b = load i8, ptr null, align 1 + %ext.b = sext i8 %b to i64 + %mul = mul i64 %ext.b, %ext.a + %add = add i64 %mul, %accum + %cmp.iv.neg = icmp ugt i64 %iv.neg, 0 + %cmp.iv = icmp ne i64 %iv, -1 + %exitcond = and i1 %cmp.iv.neg, %cmp.iv + br i1 %exitcond, label %while.body, label %while.end.loopexit + +while.end.loopexit: ; preds = %while.body + %result = phi i64 [ %add, %while.body ] + ret void +} + +attributes #0 = { vscale_range(1,16) "target-features"="+sve" } +attributes #1 = { "target-cpu"="apple-m1" } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll new file mode 100644 index 0000000000000..74db8683d5df8 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll @@ -0,0 +1,206 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+i8mm,+dotprod -S < %s | FileCheck %s +; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+dotprod -S < %s | FileCheck %s --check-prefix=CHECK-NOI8MM + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +define i32 @dotp_z_s(ptr %a, ptr %b) #0 { +; CHECK-LABEL: define i32 @dotp_z_s( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-NEXT: 
[[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) +; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] +; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-NOI8MM-LABEL: define i32 @dotp_z_s( +; CHECK-NOI8MM-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NOI8MM-NEXT: entry: +; CHECK-NOI8MM-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NOI8MM: vector.ph: +; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NOI8MM: vector.body: +; CHECK-NOI8MM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NOI8MM-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-NOI8MM-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 +; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]] +; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) +; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]]) +; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NOI8MM-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NOI8MM: 
middle.block: +; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] +; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-NOI8MM-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = sext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + +define i32 @dotp_s_z(ptr %a, ptr %b) #0 { +; CHECK-LABEL: define i32 @dotp_s_z( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) +; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] +; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 
x i32> [[BIN_RDX]]) +; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-NOI8MM-LABEL: define i32 @dotp_s_z( +; CHECK-NOI8MM-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NOI8MM-NEXT: entry: +; CHECK-NOI8MM-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NOI8MM: vector.ph: +; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NOI8MM: vector.body: +; CHECK-NOI8MM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NOI8MM-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-NOI8MM-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 +; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]] +; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) +; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]]) +; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NOI8MM-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NOI8MM: middle.block: +; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] +; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-NOI8MM-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = sext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + 
br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + +!7 = distinct !{!7, !8, !9, !10} +!8 = !{!"llvm.loop.mustprogress"} +!9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} +!10 = !{!"llvm.loop.vectorize.enable", i1 true} +attributes #0 = { vscale_range(1,16) "target-features"="+sve" } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll new file mode 100644 index 0000000000000..c66695f1b50f0 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll @@ -0,0 +1,1375 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1 +; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED +; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -force-vector-interleave=1 -vectorizer-maximize-bandwidth -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +define i32 @dotp(ptr %a, ptr %b) { +; CHECK-INTERLEAVE1-LABEL: define i32 @dotp( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], 
label [[SCALAR_PH]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @dotp( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-MAXBW-LABEL: define i32 @dotp( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; 
CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]]) +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-MAXBW-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-MAXBW-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + +define i32 @not_dotp_different_types(ptr %a, ptr %b) { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_different_types( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP69:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 +; 
CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 +; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]] +; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]] +; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]] +; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]] +; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]] +; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]] +; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]] +; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]] +; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]] +; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = load i16, ptr [[TMP19]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = load i16, ptr [[TMP20]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = load i16, ptr [[TMP21]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = load i16, ptr [[TMP22]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = load i16, ptr [[TMP23]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP40:%.*]] = load i16, ptr [[TMP24]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP41:%.*]] = load i16, ptr [[TMP25]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = load i16, ptr [[TMP26]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = load i16, ptr [[TMP27]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = load i16, ptr [[TMP28]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = load i16, ptr [[TMP29]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP46:%.*]] = load i16, ptr [[TMP30]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP47:%.*]] = load i16, ptr [[TMP31]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP48:%.*]] = load i16, ptr [[TMP32]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP49:%.*]] = load i16, ptr [[TMP33]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP50:%.*]] = load i16, ptr [[TMP34]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP51:%.*]] = insertelement <16 x i16> poison, i16 [[TMP35]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP52:%.*]] = insertelement <16 x i16> [[TMP51]], i16 [[TMP36]], i32 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP53:%.*]] = insertelement <16 x i16> [[TMP52]], i16 [[TMP37]], i32 2 +; 
CHECK-INTERLEAVE1-NEXT: [[TMP54:%.*]] = insertelement <16 x i16> [[TMP53]], i16 [[TMP38]], i32 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP55:%.*]] = insertelement <16 x i16> [[TMP54]], i16 [[TMP39]], i32 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP56:%.*]] = insertelement <16 x i16> [[TMP55]], i16 [[TMP40]], i32 5 +; CHECK-INTERLEAVE1-NEXT: [[TMP57:%.*]] = insertelement <16 x i16> [[TMP56]], i16 [[TMP41]], i32 6 +; CHECK-INTERLEAVE1-NEXT: [[TMP58:%.*]] = insertelement <16 x i16> [[TMP57]], i16 [[TMP42]], i32 7 +; CHECK-INTERLEAVE1-NEXT: [[TMP59:%.*]] = insertelement <16 x i16> [[TMP58]], i16 [[TMP43]], i32 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP60:%.*]] = insertelement <16 x i16> [[TMP59]], i16 [[TMP44]], i32 9 +; CHECK-INTERLEAVE1-NEXT: [[TMP61:%.*]] = insertelement <16 x i16> [[TMP60]], i16 [[TMP45]], i32 10 +; CHECK-INTERLEAVE1-NEXT: [[TMP62:%.*]] = insertelement <16 x i16> [[TMP61]], i16 [[TMP46]], i32 11 +; CHECK-INTERLEAVE1-NEXT: [[TMP63:%.*]] = insertelement <16 x i16> [[TMP62]], i16 [[TMP47]], i32 12 +; CHECK-INTERLEAVE1-NEXT: [[TMP64:%.*]] = insertelement <16 x i16> [[TMP63]], i16 [[TMP48]], i32 13 +; CHECK-INTERLEAVE1-NEXT: [[TMP65:%.*]] = insertelement <16 x i16> [[TMP64]], i16 [[TMP49]], i32 14 +; CHECK-INTERLEAVE1-NEXT: [[TMP66:%.*]] = insertelement <16 x i16> [[TMP65]], i16 [[TMP50]], i32 15 +; CHECK-INTERLEAVE1-NEXT: [[TMP67:%.*]] = zext <16 x i16> [[TMP66]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP68:%.*]] = mul <16 x i32> [[TMP67]], [[TMP18]] +; CHECK-INTERLEAVE1-NEXT: [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_different_types( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP137:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP138:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 +; CHECK-INTERLEAVED-NEXT: 
[[TMP15:%.*]] = add i64 [[INDEX]], 15 +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 17 +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 18 +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 19 +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 20 +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 21 +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 22 +; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 23 +; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = add i64 [[INDEX]], 24 +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], 25 +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = add i64 [[INDEX]], 26 +; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 27 +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = add i64 [[INDEX]], 28 +; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = add i64 [[INDEX]], 29 +; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = add i64 [[INDEX]], 30 +; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 31 +; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP32]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[TMP32]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP33]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP34]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]] +; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]] +; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]] +; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]] +; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]] +; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]] +; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP16]] +; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]] +; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP18]] +; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP19]] +; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP20]] +; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = getelementptr i8, ptr [[B]], i64 
[[TMP21]] +; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP22]] +; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP23]] +; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP24]] +; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP25]] +; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP26]] +; CHECK-INTERLEAVED-NEXT: [[TMP64:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP27]] +; CHECK-INTERLEAVED-NEXT: [[TMP65:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP28]] +; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP29]] +; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP30]] +; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP31]] +; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = load i16, ptr [[TMP37]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = load i16, ptr [[TMP38]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP71:%.*]] = load i16, ptr [[TMP39]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = load i16, ptr [[TMP40]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = load i16, ptr [[TMP41]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP74:%.*]] = load i16, ptr [[TMP42]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP75:%.*]] = load i16, ptr [[TMP43]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP76:%.*]] = load i16, ptr [[TMP44]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP77:%.*]] = load i16, ptr [[TMP45]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP78:%.*]] = load i16, ptr [[TMP46]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP79:%.*]] = load i16, ptr [[TMP47]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP80:%.*]] = load i16, ptr [[TMP48]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP81:%.*]] = load i16, ptr [[TMP49]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP82:%.*]] = load i16, ptr [[TMP50]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP83:%.*]] = load i16, ptr [[TMP51]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP84:%.*]] = load i16, ptr [[TMP52]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP85:%.*]] = insertelement <16 x i16> poison, i16 [[TMP69]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP86:%.*]] = insertelement <16 x i16> [[TMP85]], i16 [[TMP70]], i32 1 +; CHECK-INTERLEAVED-NEXT: [[TMP87:%.*]] = insertelement <16 x i16> [[TMP86]], i16 [[TMP71]], i32 2 +; CHECK-INTERLEAVED-NEXT: [[TMP88:%.*]] = insertelement <16 x i16> [[TMP87]], i16 [[TMP72]], i32 3 +; CHECK-INTERLEAVED-NEXT: [[TMP89:%.*]] = insertelement <16 x i16> [[TMP88]], i16 [[TMP73]], i32 4 +; CHECK-INTERLEAVED-NEXT: [[TMP90:%.*]] = insertelement <16 x i16> [[TMP89]], i16 [[TMP74]], i32 5 +; CHECK-INTERLEAVED-NEXT: [[TMP91:%.*]] = insertelement <16 x i16> [[TMP90]], i16 [[TMP75]], i32 6 +; CHECK-INTERLEAVED-NEXT: [[TMP92:%.*]] = insertelement <16 x i16> [[TMP91]], i16 [[TMP76]], i32 7 +; CHECK-INTERLEAVED-NEXT: [[TMP93:%.*]] = insertelement <16 x i16> [[TMP92]], i16 [[TMP77]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[TMP94:%.*]] = insertelement <16 x i16> [[TMP93]], i16 [[TMP78]], i32 9 +; CHECK-INTERLEAVED-NEXT: [[TMP95:%.*]] = insertelement <16 x i16> [[TMP94]], i16 [[TMP79]], i32 10 +; CHECK-INTERLEAVED-NEXT: [[TMP96:%.*]] = insertelement <16 x i16> [[TMP95]], i16 [[TMP80]], i32 11 +; CHECK-INTERLEAVED-NEXT: [[TMP97:%.*]] = insertelement <16 x i16> [[TMP96]], i16 [[TMP81]], i32 12 +; CHECK-INTERLEAVED-NEXT: [[TMP98:%.*]] = insertelement <16 x i16> [[TMP97]], i16 [[TMP82]], i32 13 +; CHECK-INTERLEAVED-NEXT: [[TMP99:%.*]] = insertelement <16 x i16> 
[[TMP98]], i16 [[TMP83]], i32 14 +; CHECK-INTERLEAVED-NEXT: [[TMP100:%.*]] = insertelement <16 x i16> [[TMP99]], i16 [[TMP84]], i32 15 +; CHECK-INTERLEAVED-NEXT: [[TMP101:%.*]] = load i16, ptr [[TMP53]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP102:%.*]] = load i16, ptr [[TMP54]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP103:%.*]] = load i16, ptr [[TMP55]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP104:%.*]] = load i16, ptr [[TMP56]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP105:%.*]] = load i16, ptr [[TMP57]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP106:%.*]] = load i16, ptr [[TMP58]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP107:%.*]] = load i16, ptr [[TMP59]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP108:%.*]] = load i16, ptr [[TMP60]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP109:%.*]] = load i16, ptr [[TMP61]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP110:%.*]] = load i16, ptr [[TMP62]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP111:%.*]] = load i16, ptr [[TMP63]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP112:%.*]] = load i16, ptr [[TMP64]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP113:%.*]] = load i16, ptr [[TMP65]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP114:%.*]] = load i16, ptr [[TMP66]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP115:%.*]] = load i16, ptr [[TMP67]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP116:%.*]] = load i16, ptr [[TMP68]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP117:%.*]] = insertelement <16 x i16> poison, i16 [[TMP101]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP118:%.*]] = insertelement <16 x i16> [[TMP117]], i16 [[TMP102]], i32 1 +; CHECK-INTERLEAVED-NEXT: [[TMP119:%.*]] = insertelement <16 x i16> [[TMP118]], i16 [[TMP103]], i32 2 +; CHECK-INTERLEAVED-NEXT: [[TMP120:%.*]] = insertelement <16 x i16> [[TMP119]], i16 [[TMP104]], i32 3 +; CHECK-INTERLEAVED-NEXT: [[TMP121:%.*]] = insertelement <16 x i16> [[TMP120]], i16 [[TMP105]], i32 4 +; CHECK-INTERLEAVED-NEXT: [[TMP122:%.*]] = insertelement <16 x i16> [[TMP121]], i16 [[TMP106]], i32 5 +; CHECK-INTERLEAVED-NEXT: [[TMP123:%.*]] = insertelement <16 x i16> [[TMP122]], i16 [[TMP107]], i32 6 +; CHECK-INTERLEAVED-NEXT: [[TMP124:%.*]] = insertelement <16 x i16> [[TMP123]], i16 [[TMP108]], i32 7 +; CHECK-INTERLEAVED-NEXT: [[TMP125:%.*]] = insertelement <16 x i16> [[TMP124]], i16 [[TMP109]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[TMP126:%.*]] = insertelement <16 x i16> [[TMP125]], i16 [[TMP110]], i32 9 +; CHECK-INTERLEAVED-NEXT: [[TMP127:%.*]] = insertelement <16 x i16> [[TMP126]], i16 [[TMP111]], i32 10 +; CHECK-INTERLEAVED-NEXT: [[TMP128:%.*]] = insertelement <16 x i16> [[TMP127]], i16 [[TMP112]], i32 11 +; CHECK-INTERLEAVED-NEXT: [[TMP129:%.*]] = insertelement <16 x i16> [[TMP128]], i16 [[TMP113]], i32 12 +; CHECK-INTERLEAVED-NEXT: [[TMP130:%.*]] = insertelement <16 x i16> [[TMP129]], i16 [[TMP114]], i32 13 +; CHECK-INTERLEAVED-NEXT: [[TMP131:%.*]] = insertelement <16 x i16> [[TMP130]], i16 [[TMP115]], i32 14 +; CHECK-INTERLEAVED-NEXT: [[TMP132:%.*]] = insertelement <16 x i16> [[TMP131]], i16 [[TMP116]], i32 15 +; CHECK-INTERLEAVED-NEXT: [[TMP133:%.*]] = zext <16 x i16> [[TMP100]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP134:%.*]] = zext <16 x i16> [[TMP132]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP135:%.*]] = mul <16 x i32> [[TMP133]], [[TMP35]] +; CHECK-INTERLEAVED-NEXT: [[TMP136:%.*]] = mul <16 x i32> [[TMP134]], [[TMP36]] +; CHECK-INTERLEAVED-NEXT: [[TMP137]] = add <16 x i32> [[TMP135]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = 
add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP139:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP139]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; +; CHECK-MAXBW-LABEL: define i32 @not_dotp_different_types( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP69:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 +; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 +; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 +; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]] +; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]] +; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]] +; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] +; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]] +; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]] +; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]] +; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]] +; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]] +; CHECK-MAXBW-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]] +; CHECK-MAXBW-NEXT: [[TMP35:%.*]] = load i16, ptr [[TMP19]], align 2 +; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = load i16, ptr [[TMP20]], align 2 +; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = load i16, ptr [[TMP21]], align 2 +; CHECK-MAXBW-NEXT: [[TMP38:%.*]] 
= load i16, ptr [[TMP22]], align 2 +; CHECK-MAXBW-NEXT: [[TMP39:%.*]] = load i16, ptr [[TMP23]], align 2 +; CHECK-MAXBW-NEXT: [[TMP40:%.*]] = load i16, ptr [[TMP24]], align 2 +; CHECK-MAXBW-NEXT: [[TMP41:%.*]] = load i16, ptr [[TMP25]], align 2 +; CHECK-MAXBW-NEXT: [[TMP42:%.*]] = load i16, ptr [[TMP26]], align 2 +; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = load i16, ptr [[TMP27]], align 2 +; CHECK-MAXBW-NEXT: [[TMP44:%.*]] = load i16, ptr [[TMP28]], align 2 +; CHECK-MAXBW-NEXT: [[TMP45:%.*]] = load i16, ptr [[TMP29]], align 2 +; CHECK-MAXBW-NEXT: [[TMP46:%.*]] = load i16, ptr [[TMP30]], align 2 +; CHECK-MAXBW-NEXT: [[TMP47:%.*]] = load i16, ptr [[TMP31]], align 2 +; CHECK-MAXBW-NEXT: [[TMP48:%.*]] = load i16, ptr [[TMP32]], align 2 +; CHECK-MAXBW-NEXT: [[TMP49:%.*]] = load i16, ptr [[TMP33]], align 2 +; CHECK-MAXBW-NEXT: [[TMP50:%.*]] = load i16, ptr [[TMP34]], align 2 +; CHECK-MAXBW-NEXT: [[TMP51:%.*]] = insertelement <16 x i16> poison, i16 [[TMP35]], i32 0 +; CHECK-MAXBW-NEXT: [[TMP52:%.*]] = insertelement <16 x i16> [[TMP51]], i16 [[TMP36]], i32 1 +; CHECK-MAXBW-NEXT: [[TMP53:%.*]] = insertelement <16 x i16> [[TMP52]], i16 [[TMP37]], i32 2 +; CHECK-MAXBW-NEXT: [[TMP54:%.*]] = insertelement <16 x i16> [[TMP53]], i16 [[TMP38]], i32 3 +; CHECK-MAXBW-NEXT: [[TMP55:%.*]] = insertelement <16 x i16> [[TMP54]], i16 [[TMP39]], i32 4 +; CHECK-MAXBW-NEXT: [[TMP56:%.*]] = insertelement <16 x i16> [[TMP55]], i16 [[TMP40]], i32 5 +; CHECK-MAXBW-NEXT: [[TMP57:%.*]] = insertelement <16 x i16> [[TMP56]], i16 [[TMP41]], i32 6 +; CHECK-MAXBW-NEXT: [[TMP58:%.*]] = insertelement <16 x i16> [[TMP57]], i16 [[TMP42]], i32 7 +; CHECK-MAXBW-NEXT: [[TMP59:%.*]] = insertelement <16 x i16> [[TMP58]], i16 [[TMP43]], i32 8 +; CHECK-MAXBW-NEXT: [[TMP60:%.*]] = insertelement <16 x i16> [[TMP59]], i16 [[TMP44]], i32 9 +; CHECK-MAXBW-NEXT: [[TMP61:%.*]] = insertelement <16 x i16> [[TMP60]], i16 [[TMP45]], i32 10 +; CHECK-MAXBW-NEXT: [[TMP62:%.*]] = insertelement <16 x i16> [[TMP61]], i16 [[TMP46]], i32 11 +; CHECK-MAXBW-NEXT: [[TMP63:%.*]] = insertelement <16 x i16> [[TMP62]], i16 [[TMP47]], i32 12 +; CHECK-MAXBW-NEXT: [[TMP64:%.*]] = insertelement <16 x i16> [[TMP63]], i16 [[TMP48]], i32 13 +; CHECK-MAXBW-NEXT: [[TMP65:%.*]] = insertelement <16 x i16> [[TMP64]], i16 [[TMP49]], i32 14 +; CHECK-MAXBW-NEXT: [[TMP66:%.*]] = insertelement <16 x i16> [[TMP65]], i16 [[TMP50]], i32 15 +; CHECK-MAXBW-NEXT: [[TMP67:%.*]] = zext <16 x i16> [[TMP66]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP68:%.*]] = mul <16 x i32> [[TMP67]], [[TMP18]] +; CHECK-MAXBW-NEXT: [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-MAXBW-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i16, ptr %gep.b, align 2 + %ext.b = zext i16 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + +define i32 
@not_dotp_not_loop_carried(ptr %a, ptr %b) { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_loop_carried( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP7]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_loop_carried( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP7]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]] +;
CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; +; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_loop_carried( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP7]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-MAXBW-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %mul, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + +define i32 @not_dotp_not_phi(ptr %a, ptr %b) { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_phi( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +;
CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_phi( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; +; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_phi( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] =
zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %ext.b + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + +define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { +; CHECK-INTERLEAVE1-LABEL: define i32 @dotp_unrolled( +; CHECK-INTERLEAVE1-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16 +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16 +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], 
i64 [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; 
CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @dotp_unrolled( +; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16 +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16 +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2 +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr 
[[TMP14]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]]) +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]]) +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) +; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]]) +; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]]) +; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]]) +; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]] +; 
CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-MAXBW-LABEL: define i32 @dotp_unrolled( +; CHECK-MAXBW-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16 +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16 +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1 +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3 +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1 +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1 +; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]]) +; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 +; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1 +; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> 
[[TMP18]], [[TMP20]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]]) +; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1 +; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1 +; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) +; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1 +; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1 +; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]]) +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]]) +; CHECK-MAXBW-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]]) +; CHECK-MAXBW-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]]) +; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum3 = phi i32 [ 0, %entry ], [ %add.a3, %for.body ] + %accum2 = phi i32 [ 0, %entry ], [ %add.a2, %for.body ] + %accum1 = phi i32 [ 0, %entry ], [ %add.a1, %for.body ] + %accum0 = phi i32 [ 0, %entry ], [ %add.a0, %for.body ] + %gep.a0 = getelementptr inbounds i8, ptr %a, i64 %iv + %gep.b0 = getelementptr inbounds i8, ptr %b, i64 %iv + %offset.1 = or disjoint i64 %iv, 1 + %gep.a1 = getelementptr inbounds i8, ptr %a, i64 %offset.1 + %gep.b1 = getelementptr inbounds i8, ptr %b, i64 %offset.1 + %offset.2 = or disjoint i64 %iv, 2 + %gep.a2 = getelementptr inbounds i8, ptr %a, i64 %offset.2 + %gep.b2 = getelementptr inbounds i8, ptr %b, i64 %offset.2 + %offset.3 = or disjoint i64 %iv, 3 + %gep.a3 = getelementptr inbounds i8, ptr %a, i64 %offset.3 + %gep.b3 = getelementptr inbounds i8, ptr %b, i64 %offset.3 + %load.a0 = load i8, ptr %gep.a0, align 1 + %ext.a0 = sext i8 %load.a0 to i32 + %load.b0 
= load i8, ptr %gep.b0, align 1 + %ext.b0 = sext i8 %load.b0 to i32 + %mul.a0 = mul nsw i32 %ext.b0, %ext.a0 + %add.a0 = add nsw i32 %mul.a0, %accum0 + %load.a1 = load i8, ptr %gep.a1, align 1 + %ext.a1 = sext i8 %load.a1 to i32 + %load.b1 = load i8, ptr %gep.b1, align 1 + %ext.b1 = sext i8 %load.b1 to i32 + %mul.a1 = mul nsw i32 %ext.a1, %ext.b1 + %add.a1 = add nsw i32 %mul.a1, %accum1 + %load.a2 = load i8, ptr %gep.a2, align 1 + %ext.a2 = sext i8 %load.a2 to i32 + %load.b2 = load i8, ptr %gep.b2, align 1 + %ext.b2 = sext i8 %load.b2 to i32 + %mul.a2 = mul nsw i32 %ext.a2, %ext.b2 + %add.a2 = add nsw i32 %mul.a2, %accum2 + %load.a3 = load i8, ptr %gep.a3, align 1 + %ext.a3 = sext i8 %load.a3 to i32 + %load.b3 = load i8, ptr %gep.b3, align 1 + %ext.b3 = sext i8 %load.b3 to i32 + %mul.a3 = mul nsw i32 %ext.a3, %ext.b3 + %add.a3 = add nsw i32 %mul.a3, %accum3 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %num_in + br i1 %exitcond.not, label %exit, label %for.body + +exit: ; preds = %for.body + %result0 = add nsw i32 %add.a0, %add.a1 + %result1 = add nsw i32 %add.a2, %add.a3 + %result = add nsw i32 %result0, %result1 + ret i32 %result +} + +define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated( +; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16 +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul nsw <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; 
CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated( +; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 32 +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 32 +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP9]], [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; 
CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated( +; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16 +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul nsw <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]]) +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = sext i8 %load.a to i32 + %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = sext i8 %load.b to i32 + %mul = mul nsw i32 %ext.b, %ext.a + %add = add nsw i32 %mul, %accum + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %exit, label %for.body + +exit: ; preds = %for.body + ret i32 %add +} + +define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated_pragma( +; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 15 +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 
[[N_RND_UP]], 16 +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP180:%.*]], [[PRED_LOAD_CONTINUE62]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 +; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated_pragma( +; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 15 +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16 +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer,
[[VECTOR_PH]] ], [ [[TMP180:%.*]], [[PRED_LOAD_CONTINUE62]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; +; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated_pragma( +; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 15 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16 +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ] +; CHECK-MAXBW-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP180:%.*]], [[PRED_LOAD_CONTINUE62]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 +; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = add i64
[[INDEX]], 15 +; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 +; CHECK-MAXBW-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr inbounds i8, ptr %b, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = sext i8 %load.a to i32 + %gep.a2 = getelementptr inbounds i8, ptr %a, i64 %iv + %load.b = load i8, ptr %gep.a2, align 1 + %ext.b = sext i8 %load.b to i32 + %mul = mul nsw i32 %ext.b, %ext.a + %add = add nsw i32 %mul, %accum + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !7 + +exit: ; preds = %for.body + ret i32 %add +} + +define i32 @not_dotp_extend_user(ptr %a, ptr %b) { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_extend_user( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15 +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ 
[[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: [[RESULT:%.*]] = add i32 [[ADD_LCSSA]], [[EXT_B_LCSSA]] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[RESULT]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_extend_user( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]] +; 
CHECK-INTERLEAVED-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP14]] = add <16 x i32> [[TMP12]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP14]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = extractelement <16 x i32> [[TMP10]], i32 15 +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: [[RESULT:%.*]] = add i32 [[ADD_LCSSA]], [[EXT_B_LCSSA]] +; CHECK-INTERLEAVED-NEXT: ret i32 [[RESULT]] +; +; CHECK-MAXBW-LABEL: define i32 @not_dotp_extend_user( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x 
i32> +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]]) +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15 +; CHECK-MAXBW-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: [[RESULT:%.*]] = add i32 [[ADD_LCSSA]], [[EXT_B_LCSSA]] +; CHECK-MAXBW-NEXT: ret i32 [[RESULT]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + %result = add i32 %add, %ext.b + ret i32 %result +} + +!7 = distinct !{!7, !8, !9, !10} +!8 = !{!"llvm.loop.mustprogress"} +!9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} +!10 = !{!"llvm.loop.vectorize.enable", i1 true} diff --git 
a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
new file mode 100644
index 0000000000000..af2a7b966f700
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -0,0 +1,2164 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1
+; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -vectorizer-maximize-bandwidth -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+define i32 @dotp(ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @dotp(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY1:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP14:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP17]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP21]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul <vscale x 4 x i32> [[TMP18]], [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add <vscale x 4 x i32> [[TMP13]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVE1: middle.block:
+; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]])
+; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_PH]]
+; CHECK-INTERLEAVE1: scalar.ph:
+; CHECK-INTERLEAVE1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH:%.*]] ]
+; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP27]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: for.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[ADD:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
+; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-INTERLEAVE1: for.exit:
+; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[VECTOR_BODY]] ], [ [[TMP27]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @dotp(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-INTERLEAVED-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY1:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP23:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP24:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP14]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP20]], i64 [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP21]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP28]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP28]], i64 [[TMP27]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP18]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD3]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul <vscale x 4 x i32> [[TMP19]], [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul <vscale x 4 x i32> [[TMP29]], [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add <vscale x 4 x i32> [[TMP30]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add <vscale x 4 x i32> [[TMP22]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP25]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP24]], [[TMP23]]
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_PH]]
+; CHECK-INTERLEAVED: scalar.ph:
+; CHECK-INTERLEAVED-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH:%.*]] ]
+; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: for.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[ADD:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
+; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
+; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
+; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
+; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-INTERLEAVED: for.exit:
+; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[VECTOR_BODY]] ], [ [[TMP16]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]]
+;
+; CHECK-MAXBW-LABEL: define i32 @dotp(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP14]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP15]], align 1
+; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP20]], [[TMP13]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI1]], <vscale x 8 x i32> [[TMP22]])
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %gep.a = getelementptr i8, ptr %a, i64 %iv
+ %load.a = load i8, ptr %gep.a, align 1
+ %ext.a = zext i8 %load.a to i32
+ %gep.b = getelementptr i8, ptr %b, i64 %iv
+ %load.b = load i8, ptr %gep.b, align 1
+ %ext.b = zext i8 %load.b to i32
+ %mul = mul i32 %ext.b, %ext.a
+ %add = add i32 %mul, %accum
+ %iv.next = add i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 0
+ br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit: ; preds = %for.body
+ ret i32 %add
+}
+
+define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_different_types(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP69:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+;
CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 +; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]] +; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]] +; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]] +; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]] +; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]] +; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]] +; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]] +; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]] +; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]] +; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = load i16, ptr [[TMP19]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = load i16, ptr [[TMP20]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = load i16, ptr [[TMP21]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = load i16, ptr [[TMP22]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP41:%.*]] = load i16, ptr [[TMP23]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = load i16, ptr [[TMP24]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = load i16, ptr [[TMP25]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = load i16, ptr [[TMP26]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = load i16, ptr [[TMP27]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP46:%.*]] = load i16, ptr [[TMP28]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP55:%.*]] = load i16, ptr [[TMP29]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP56:%.*]] = load i16, ptr 
[[TMP30]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP47:%.*]] = load i16, ptr [[TMP31]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP48:%.*]] = load i16, ptr [[TMP32]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP49:%.*]] = load i16, ptr [[TMP33]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP50:%.*]] = load i16, ptr [[TMP34]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP51:%.*]] = insertelement <16 x i16> poison, i16 [[TMP35]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP52:%.*]] = insertelement <16 x i16> [[TMP51]], i16 [[TMP36]], i32 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP53:%.*]] = insertelement <16 x i16> [[TMP52]], i16 [[TMP37]], i32 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP54:%.*]] = insertelement <16 x i16> [[TMP53]], i16 [[TMP38]], i32 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP57:%.*]] = insertelement <16 x i16> [[TMP54]], i16 [[TMP41]], i32 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP58:%.*]] = insertelement <16 x i16> [[TMP57]], i16 [[TMP42]], i32 5 +; CHECK-INTERLEAVE1-NEXT: [[TMP59:%.*]] = insertelement <16 x i16> [[TMP58]], i16 [[TMP43]], i32 6 +; CHECK-INTERLEAVE1-NEXT: [[TMP60:%.*]] = insertelement <16 x i16> [[TMP59]], i16 [[TMP44]], i32 7 +; CHECK-INTERLEAVE1-NEXT: [[TMP61:%.*]] = insertelement <16 x i16> [[TMP60]], i16 [[TMP45]], i32 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP96:%.*]] = insertelement <16 x i16> [[TMP61]], i16 [[TMP46]], i32 9 +; CHECK-INTERLEAVE1-NEXT: [[TMP97:%.*]] = insertelement <16 x i16> [[TMP96]], i16 [[TMP55]], i32 10 +; CHECK-INTERLEAVE1-NEXT: [[TMP62:%.*]] = insertelement <16 x i16> [[TMP97]], i16 [[TMP56]], i32 11 +; CHECK-INTERLEAVE1-NEXT: [[TMP63:%.*]] = insertelement <16 x i16> [[TMP62]], i16 [[TMP47]], i32 12 +; CHECK-INTERLEAVE1-NEXT: [[TMP64:%.*]] = insertelement <16 x i16> [[TMP63]], i16 [[TMP48]], i32 13 +; CHECK-INTERLEAVE1-NEXT: [[TMP65:%.*]] = insertelement <16 x i16> [[TMP64]], i16 [[TMP49]], i32 14 +; CHECK-INTERLEAVE1-NEXT: [[TMP66:%.*]] = insertelement <16 x i16> [[TMP65]], i16 [[TMP50]], i32 15 +; CHECK-INTERLEAVE1-NEXT: [[TMP67:%.*]] = zext <16 x i16> [[TMP66]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP68:%.*]] = mul <16 x i32> [[TMP67]], [[TMP18]] +; CHECK-INTERLEAVE1-NEXT: [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP70]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]]) +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP71]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; 
CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i16, ptr [[GEP_B]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i16 [[LOAD_B]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP71]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_different_types( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP137:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP138:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 17 +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 18 +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 19 +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 20 +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 21 +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 22 +; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 23 +; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = add i64 [[INDEX]], 24 +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], 25 +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = add i64 [[INDEX]], 26 +; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 27 +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = add i64 [[INDEX]], 28 +; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = add i64 [[INDEX]], 29 +; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] 
= add i64 [[INDEX]], 30 +; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 31 +; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP32]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[TMP32]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP33]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP34]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]] +; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]] +; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]] +; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]] +; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]] +; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]] +; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP16]] +; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]] +; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP18]] +; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP19]] +; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP20]] +; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP21]] +; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP22]] +; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP23]] +; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP24]] +; CHECK-INTERLEAVED-NEXT: [[TMP64:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP25]] +; CHECK-INTERLEAVED-NEXT: [[TMP65:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP26]] +; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP27]] +; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP28]] +; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP29]] +; CHECK-INTERLEAVED-NEXT: [[TMP139:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP30]] +; CHECK-INTERLEAVED-NEXT: [[TMP140:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP31]] +; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = load i16, ptr [[TMP39]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = load 
i16, ptr [[TMP40]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP71:%.*]] = load i16, ptr [[TMP41]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = load i16, ptr [[TMP42]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = load i16, ptr [[TMP43]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP74:%.*]] = load i16, ptr [[TMP44]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP75:%.*]] = load i16, ptr [[TMP45]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP76:%.*]] = load i16, ptr [[TMP46]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP77:%.*]] = load i16, ptr [[TMP47]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP78:%.*]] = load i16, ptr [[TMP48]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP79:%.*]] = load i16, ptr [[TMP49]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP80:%.*]] = load i16, ptr [[TMP50]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP81:%.*]] = load i16, ptr [[TMP51]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP82:%.*]] = load i16, ptr [[TMP52]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP83:%.*]] = load i16, ptr [[TMP53]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP84:%.*]] = load i16, ptr [[TMP54]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP85:%.*]] = insertelement <16 x i16> poison, i16 [[TMP69]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP86:%.*]] = insertelement <16 x i16> [[TMP85]], i16 [[TMP70]], i32 1 +; CHECK-INTERLEAVED-NEXT: [[TMP87:%.*]] = insertelement <16 x i16> [[TMP86]], i16 [[TMP71]], i32 2 +; CHECK-INTERLEAVED-NEXT: [[TMP88:%.*]] = insertelement <16 x i16> [[TMP87]], i16 [[TMP72]], i32 3 +; CHECK-INTERLEAVED-NEXT: [[TMP89:%.*]] = insertelement <16 x i16> [[TMP88]], i16 [[TMP73]], i32 4 +; CHECK-INTERLEAVED-NEXT: [[TMP90:%.*]] = insertelement <16 x i16> [[TMP89]], i16 [[TMP74]], i32 5 +; CHECK-INTERLEAVED-NEXT: [[TMP91:%.*]] = insertelement <16 x i16> [[TMP90]], i16 [[TMP75]], i32 6 +; CHECK-INTERLEAVED-NEXT: [[TMP92:%.*]] = insertelement <16 x i16> [[TMP91]], i16 [[TMP76]], i32 7 +; CHECK-INTERLEAVED-NEXT: [[TMP93:%.*]] = insertelement <16 x i16> [[TMP92]], i16 [[TMP77]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[TMP94:%.*]] = insertelement <16 x i16> [[TMP93]], i16 [[TMP78]], i32 9 +; CHECK-INTERLEAVED-NEXT: [[TMP95:%.*]] = insertelement <16 x i16> [[TMP94]], i16 [[TMP79]], i32 10 +; CHECK-INTERLEAVED-NEXT: [[TMP96:%.*]] = insertelement <16 x i16> [[TMP95]], i16 [[TMP80]], i32 11 +; CHECK-INTERLEAVED-NEXT: [[TMP97:%.*]] = insertelement <16 x i16> [[TMP96]], i16 [[TMP81]], i32 12 +; CHECK-INTERLEAVED-NEXT: [[TMP98:%.*]] = insertelement <16 x i16> [[TMP97]], i16 [[TMP82]], i32 13 +; CHECK-INTERLEAVED-NEXT: [[TMP99:%.*]] = insertelement <16 x i16> [[TMP98]], i16 [[TMP83]], i32 14 +; CHECK-INTERLEAVED-NEXT: [[TMP100:%.*]] = insertelement <16 x i16> [[TMP99]], i16 [[TMP84]], i32 15 +; CHECK-INTERLEAVED-NEXT: [[TMP101:%.*]] = load i16, ptr [[TMP55]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP102:%.*]] = load i16, ptr [[TMP56]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP103:%.*]] = load i16, ptr [[TMP57]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP104:%.*]] = load i16, ptr [[TMP58]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP105:%.*]] = load i16, ptr [[TMP59]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP106:%.*]] = load i16, ptr [[TMP60]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP107:%.*]] = load i16, ptr [[TMP61]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP108:%.*]] = load i16, ptr [[TMP62]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP109:%.*]] = load i16, ptr [[TMP63]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP110:%.*]] = load i16, ptr [[TMP64]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP111:%.*]] = load i16, ptr [[TMP65]], align 2 +; 
CHECK-INTERLEAVED-NEXT: [[TMP112:%.*]] = load i16, ptr [[TMP66]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP113:%.*]] = load i16, ptr [[TMP67]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP114:%.*]] = load i16, ptr [[TMP68]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP115:%.*]] = load i16, ptr [[TMP139]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP116:%.*]] = load i16, ptr [[TMP140]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP117:%.*]] = insertelement <16 x i16> poison, i16 [[TMP101]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP118:%.*]] = insertelement <16 x i16> [[TMP117]], i16 [[TMP102]], i32 1 +; CHECK-INTERLEAVED-NEXT: [[TMP119:%.*]] = insertelement <16 x i16> [[TMP118]], i16 [[TMP103]], i32 2 +; CHECK-INTERLEAVED-NEXT: [[TMP120:%.*]] = insertelement <16 x i16> [[TMP119]], i16 [[TMP104]], i32 3 +; CHECK-INTERLEAVED-NEXT: [[TMP121:%.*]] = insertelement <16 x i16> [[TMP120]], i16 [[TMP105]], i32 4 +; CHECK-INTERLEAVED-NEXT: [[TMP122:%.*]] = insertelement <16 x i16> [[TMP121]], i16 [[TMP106]], i32 5 +; CHECK-INTERLEAVED-NEXT: [[TMP123:%.*]] = insertelement <16 x i16> [[TMP122]], i16 [[TMP107]], i32 6 +; CHECK-INTERLEAVED-NEXT: [[TMP124:%.*]] = insertelement <16 x i16> [[TMP123]], i16 [[TMP108]], i32 7 +; CHECK-INTERLEAVED-NEXT: [[TMP125:%.*]] = insertelement <16 x i16> [[TMP124]], i16 [[TMP109]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[TMP126:%.*]] = insertelement <16 x i16> [[TMP125]], i16 [[TMP110]], i32 9 +; CHECK-INTERLEAVED-NEXT: [[TMP127:%.*]] = insertelement <16 x i16> [[TMP126]], i16 [[TMP111]], i32 10 +; CHECK-INTERLEAVED-NEXT: [[TMP128:%.*]] = insertelement <16 x i16> [[TMP127]], i16 [[TMP112]], i32 11 +; CHECK-INTERLEAVED-NEXT: [[TMP129:%.*]] = insertelement <16 x i16> [[TMP128]], i16 [[TMP113]], i32 12 +; CHECK-INTERLEAVED-NEXT: [[TMP130:%.*]] = insertelement <16 x i16> [[TMP129]], i16 [[TMP114]], i32 13 +; CHECK-INTERLEAVED-NEXT: [[TMP131:%.*]] = insertelement <16 x i16> [[TMP130]], i16 [[TMP115]], i32 14 +; CHECK-INTERLEAVED-NEXT: [[TMP132:%.*]] = insertelement <16 x i16> [[TMP131]], i16 [[TMP116]], i32 15 +; CHECK-INTERLEAVED-NEXT: [[TMP133:%.*]] = zext <16 x i16> [[TMP100]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP134:%.*]] = zext <16 x i16> [[TMP132]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP135:%.*]] = mul <16 x i32> [[TMP133]], [[TMP35]] +; CHECK-INTERLEAVED-NEXT: [[TMP136:%.*]] = mul <16 x i32> [[TMP134]], [[TMP36]] +; CHECK-INTERLEAVED-NEXT: [[TMP137]] = add <16 x i32> [[TMP135]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP141:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP141]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP138]], [[TMP137]] +; CHECK-INTERLEAVED-NEXT: [[TMP142:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-MAXBW-LABEL: define i32 @not_dotp_different_types( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 
+; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP138:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 +; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 +; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP32]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP33]], align 1 +; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; CHECK-MAXBW-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; CHECK-MAXBW-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]] +; CHECK-MAXBW-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]] +; CHECK-MAXBW-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]] +; CHECK-MAXBW-NEXT: [[TMP46:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] +; CHECK-MAXBW-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]] +; CHECK-MAXBW-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]] +; CHECK-MAXBW-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]] +; CHECK-MAXBW-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]] +; CHECK-MAXBW-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]] +; CHECK-MAXBW-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]] +; CHECK-MAXBW-NEXT: [[TMP101:%.*]] = load i16, ptr [[TMP37]], align 2 +; CHECK-MAXBW-NEXT: [[TMP102:%.*]] = load i16, ptr [[TMP38]], align 2 +; CHECK-MAXBW-NEXT: [[TMP103:%.*]] = load i16, ptr [[TMP39]], align 2 +; CHECK-MAXBW-NEXT: [[TMP104:%.*]] = load i16, ptr [[TMP40]], align 2 +; CHECK-MAXBW-NEXT: [[TMP105:%.*]] = load i16, ptr [[TMP41]], align 2 +; CHECK-MAXBW-NEXT: [[TMP106:%.*]] = load i16, ptr [[TMP42]], align 2 +; CHECK-MAXBW-NEXT: [[TMP107:%.*]] = load i16, ptr [[TMP43]], align 2 +; CHECK-MAXBW-NEXT: [[TMP108:%.*]] = load i16, ptr [[TMP44]], align 2 +; CHECK-MAXBW-NEXT: [[TMP109:%.*]] = load i16, ptr [[TMP45]], align 2 +; CHECK-MAXBW-NEXT: [[TMP110:%.*]] = load i16, ptr [[TMP46]], align 2 +; CHECK-MAXBW-NEXT: [[TMP111:%.*]] = load i16, ptr [[TMP47]], align 2 +; CHECK-MAXBW-NEXT: [[TMP112:%.*]] = load i16, ptr [[TMP48]], align 2 +; CHECK-MAXBW-NEXT: [[TMP113:%.*]] = load i16, ptr [[TMP49]], 
align 2 +; CHECK-MAXBW-NEXT: [[TMP114:%.*]] = load i16, ptr [[TMP50]], align 2 +; CHECK-MAXBW-NEXT: [[TMP115:%.*]] = load i16, ptr [[TMP51]], align 2 +; CHECK-MAXBW-NEXT: [[TMP116:%.*]] = load i16, ptr [[TMP52]], align 2 +; CHECK-MAXBW-NEXT: [[TMP117:%.*]] = insertelement <16 x i16> poison, i16 [[TMP101]], i32 0 +; CHECK-MAXBW-NEXT: [[TMP118:%.*]] = insertelement <16 x i16> [[TMP117]], i16 [[TMP102]], i32 1 +; CHECK-MAXBW-NEXT: [[TMP119:%.*]] = insertelement <16 x i16> [[TMP118]], i16 [[TMP103]], i32 2 +; CHECK-MAXBW-NEXT: [[TMP120:%.*]] = insertelement <16 x i16> [[TMP119]], i16 [[TMP104]], i32 3 +; CHECK-MAXBW-NEXT: [[TMP121:%.*]] = insertelement <16 x i16> [[TMP120]], i16 [[TMP105]], i32 4 +; CHECK-MAXBW-NEXT: [[TMP122:%.*]] = insertelement <16 x i16> [[TMP121]], i16 [[TMP106]], i32 5 +; CHECK-MAXBW-NEXT: [[TMP123:%.*]] = insertelement <16 x i16> [[TMP122]], i16 [[TMP107]], i32 6 +; CHECK-MAXBW-NEXT: [[TMP124:%.*]] = insertelement <16 x i16> [[TMP123]], i16 [[TMP108]], i32 7 +; CHECK-MAXBW-NEXT: [[TMP125:%.*]] = insertelement <16 x i16> [[TMP124]], i16 [[TMP109]], i32 8 +; CHECK-MAXBW-NEXT: [[TMP126:%.*]] = insertelement <16 x i16> [[TMP125]], i16 [[TMP110]], i32 9 +; CHECK-MAXBW-NEXT: [[TMP127:%.*]] = insertelement <16 x i16> [[TMP126]], i16 [[TMP111]], i32 10 +; CHECK-MAXBW-NEXT: [[TMP128:%.*]] = insertelement <16 x i16> [[TMP127]], i16 [[TMP112]], i32 11 +; CHECK-MAXBW-NEXT: [[TMP129:%.*]] = insertelement <16 x i16> [[TMP128]], i16 [[TMP113]], i32 12 +; CHECK-MAXBW-NEXT: [[TMP130:%.*]] = insertelement <16 x i16> [[TMP129]], i16 [[TMP114]], i32 13 +; CHECK-MAXBW-NEXT: [[TMP131:%.*]] = insertelement <16 x i16> [[TMP130]], i16 [[TMP115]], i32 14 +; CHECK-MAXBW-NEXT: [[TMP132:%.*]] = insertelement <16 x i16> [[TMP131]], i16 [[TMP116]], i32 15 +; CHECK-MAXBW-NEXT: [[TMP134:%.*]] = zext <16 x i16> [[TMP132]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP136:%.*]] = mul <16 x i32> [[TMP134]], [[TMP36]] +; CHECK-MAXBW-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[TMP139:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-MAXBW-NEXT: br i1 [[TMP139]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i16, ptr %gep.b, align 2 + %ext.b = zext i16 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + +define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_loop_carried( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; 
CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP16]] = mul <vscale x 8 x i32> [[TMP15]], [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[VECTOR_RECUR]], <vscale x 8 x i32> [[TMP16]], i32 -1)
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = add <vscale x 8 x i32> [[TMP16]], [[TMP17]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_loop_carried(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP17]], i64 [[TMP20]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP18]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP21]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = mul <vscale x 8 x i32> [[TMP22]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[TMP25]] = mul <vscale x 8 x i32> [[TMP23]], [[TMP16]]
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[TMP24]], <vscale x 8 x i32> [[TMP25]], i32 -1)
+; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = add <vscale x 8 x i32> [[TMP25]], [[TMP26]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_loop_carried(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
+; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP18]], align 1
+; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[TMP25]] = mul <vscale x 8 x i32> [[TMP23]], [[TMP16]]
+; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[VECTOR_RECUR]], <vscale x 8 x i32> [[TMP25]], i32 -1)
+; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = add <vscale x 8 x i32> [[TMP25]], [[TMP26]]
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %accum = phi i32 [ 0, %entry ], [ %mul, %for.body ]
+ %gep.a = getelementptr i8, ptr %a, i64 %iv
+ %load.a = load i8, ptr %gep.a, align 1
+ %ext.a = zext i8 %load.a to i32
+ %gep.b = getelementptr i8, ptr %b, i64 %iv
+ %load.b = load i8, ptr %gep.b, align 1
+ %ext.b = zext i8 %load.b to i32
+ %mul = mul i32 %ext.b, %ext.a
+ %add = add i32 %mul, %accum
+ %iv.next = add i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 0
+ br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit: ; preds = %for.body
+ ret i32 %add
+}
+
+define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_phi(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul <vscale x 8 x i32> [[TMP15]], [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP17]] = add <vscale x 8 x i32> [[TMP16]], [[TMP15]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_phi(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP16]], i64 [[TMP19]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP20]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul <vscale x 8 x i32> [[TMP22]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[TMP21]] = add <vscale x 8 x i32> [[TMP30]], [[TMP22]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_phi(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP15]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP18]], align 1
+; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = mul <vscale x 8 x i32> [[TMP19]], [[TMP14]]
+; CHECK-MAXBW-NEXT: [[TMP21]] = add <vscale x 8 x i32> [[TMP20]], [[TMP19]]
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %gep.a = getelementptr i8, ptr %a, i64 %iv
+ %load.a = load i8, ptr %gep.a, align 1
+ %ext.a = zext i8 %load.a to i32
+ %gep.b = getelementptr i8, ptr %b, i64 %iv
+ %load.b = load i8, ptr %gep.b, align 1
+ %ext.b = zext i8 %load.b to i32
+ %mul = mul i32 %ext.b, %ext.a
+ %add = add i32 %mul, %ext.b
+ %iv.next = add i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 0
+ br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit: ; preds = %for.body
+ ret i32 %add
+}
+
+define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @dotp_unrolled(
+; CHECK-INTERLEAVE1-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = mul i64 [[TMP13]], 4
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP15]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT:
[[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], 4 +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP18]] +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = mul i64 [[TMP20]], 4 +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP41:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI2:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI3:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP14]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = sext [[WIDE_LOAD4]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = mul nsw [[TMP21]], [[TMP36]] +; CHECK-INTERLEAVE1-NEXT: [[TMP23]] = add [[TMP38]], [[VEC_PHI3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP17]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sext [[WIDE_LOAD5]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP19]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = sext [[WIDE_LOAD6]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = mul nsw [[TMP25]], [[TMP42]] +; CHECK-INTERLEAVE1-NEXT: [[TMP30]] = add [[TMP28]], [[VEC_PHI2]] +; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD7:%.*]] = load , ptr [[TMP22]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = sext [[WIDE_LOAD7]] to +; 
CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load , ptr [[TMP24]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = sext [[WIDE_LOAD8]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = mul nsw [[TMP31]], [[TMP33]] +; CHECK-INTERLEAVE1-NEXT: [[TMP35]] = add [[TMP34]], [[VEC_PHI1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load , ptr [[TMP27]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = sext [[WIDE_LOAD9]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD10:%.*]] = load , ptr [[TMP29]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = sext [[WIDE_LOAD10]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP40:%.*]] = mul nsw [[TMP37]], [[TMP39]] +; CHECK-INTERLEAVE1-NEXT: [[TMP41]] = add [[TMP40]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP26]] +; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @dotp_unrolled( +; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = mul i64 [[TMP13]], 8 +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP15]] +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], 8 +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP18]] +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = mul i64 [[TMP34]], 8 +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP64:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP65:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI4:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI5:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP49:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI6:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI7:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr 
[[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2 +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul i64 [[TMP56]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP20]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load , ptr [[TMP21]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext [[WIDE_LOAD8]] to +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 [[TMP26]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load , ptr [[TMP14]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load , ptr [[TMP72]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext [[WIDE_LOAD9]] to +; CHECK-INTERLEAVED-NEXT: [[TMP82:%.*]] = sext [[WIDE_LOAD10]] to +; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul nsw [[TMP28]], [[TMP66]] +; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw [[TMP82]], [[TMP23]] +; CHECK-INTERLEAVED-NEXT: [[TMP50]] = add [[TMP30]], [[VEC_PHI6]] +; CHECK-INTERLEAVED-NEXT: [[TMP33]] = add [[TMP31]], [[VEC_PHI7]] +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = mul i64 [[TMP35]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 [[TMP36]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load , ptr [[TMP17]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load , ptr [[TMP37]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = sext [[WIDE_LOAD11]] to +; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = sext [[WIDE_LOAD12]] to +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = mul i64 [[TMP41]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP42]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD13:%.*]] = load , ptr [[TMP19]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD14:%.*]] = load , ptr [[TMP43]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = sext [[WIDE_LOAD13]] to +; 
CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = sext [[WIDE_LOAD14]] to +; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = mul nsw [[TMP38]], [[TMP44]] +; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = mul nsw [[TMP39]], [[TMP45]] +; CHECK-INTERLEAVED-NEXT: [[TMP48]] = add [[TMP46]], [[VEC_PHI4]] +; CHECK-INTERLEAVED-NEXT: [[TMP49]] = add [[TMP47]], [[VEC_PHI5]] +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = mul i64 [[TMP51]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP52]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load , ptr [[TMP22]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD16:%.*]] = load , ptr [[TMP53]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = sext [[WIDE_LOAD15]] to +; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = sext [[WIDE_LOAD16]] to +; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = mul i64 [[TMP57]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 [[TMP58]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD17:%.*]] = load , ptr [[TMP24]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD18:%.*]] = load , ptr [[TMP59]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = sext [[WIDE_LOAD17]] to +; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = sext [[WIDE_LOAD18]] to +; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = mul nsw [[TMP54]], [[TMP60]] +; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = mul nsw [[TMP55]], [[TMP61]] +; CHECK-INTERLEAVED-NEXT: [[TMP64]] = add [[TMP62]], [[VEC_PHI2]] +; CHECK-INTERLEAVED-NEXT: [[TMP65]] = add [[TMP63]], [[VEC_PHI3]] +; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = mul i64 [[TMP67]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 [[TMP68]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD19:%.*]] = load , ptr [[TMP27]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD20:%.*]] = load , ptr [[TMP69]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = sext [[WIDE_LOAD19]] to +; CHECK-INTERLEAVED-NEXT: [[TMP71:%.*]] = sext [[WIDE_LOAD20]] to +; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP74:%.*]] = mul i64 [[TMP73]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP75:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 [[TMP74]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load , ptr [[TMP29]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD22:%.*]] = load , ptr [[TMP75]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP76:%.*]] = sext [[WIDE_LOAD21]] to +; CHECK-INTERLEAVED-NEXT: [[TMP77:%.*]] = sext [[WIDE_LOAD22]] to +; CHECK-INTERLEAVED-NEXT: [[TMP78:%.*]] = mul nsw [[TMP70]], [[TMP76]] +; CHECK-INTERLEAVED-NEXT: [[TMP79:%.*]] = mul nsw [[TMP71]], [[TMP77]] +; CHECK-INTERLEAVED-NEXT: [[TMP80]] = add [[TMP78]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP81]] = add [[TMP79]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP40]] +; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 
[[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; +; CHECK-MAXBW-LABEL: define i32 @dotp_unrolled( +; CHECK-MAXBW-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP1]] +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI4:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE16:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI5:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE17:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI6:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI7:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE11:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP6]], 1 +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = or disjoint i64 [[TMP6]], 2 +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]] +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] +; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = or disjoint i64 [[TMP6]], 3 +; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]] +; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]] +; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP18]], align 1 +; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD9:%.*]] = load , ptr [[TMP24]], align 1 +; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = sext [[WIDE_LOAD9]] to +; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = mul nsw [[TMP29]], [[TMP23]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE11]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI7]], [[TMP31]]) +; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD12:%.*]] = load , ptr [[TMP32]], align 1 +; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = sext [[WIDE_LOAD12]] to +; CHECK-MAXBW-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr 
[[TMP11]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD14:%.*]] = load , ptr [[TMP38]], align 1 +; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = sext [[WIDE_LOAD14]] to +; CHECK-MAXBW-NEXT: [[TMP45:%.*]] = mul nsw [[TMP37]], [[TMP43]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI6]], [[TMP45]]) +; CHECK-MAXBW-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD18:%.*]] = load , ptr [[TMP46]], align 1 +; CHECK-MAXBW-NEXT: [[TMP51:%.*]] = sext [[WIDE_LOAD18]] to +; CHECK-MAXBW-NEXT: [[TMP52:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD20:%.*]] = load , ptr [[TMP52]], align 1 +; CHECK-MAXBW-NEXT: [[TMP57:%.*]] = sext [[WIDE_LOAD20]] to +; CHECK-MAXBW-NEXT: [[TMP59:%.*]] = mul nsw [[TMP51]], [[TMP57]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE17]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI5]], [[TMP59]]) +; CHECK-MAXBW-NEXT: [[TMP60:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD24:%.*]] = load , ptr [[TMP60]], align 1 +; CHECK-MAXBW-NEXT: [[TMP65:%.*]] = sext [[WIDE_LOAD24]] to +; CHECK-MAXBW-NEXT: [[TMP66:%.*]] = getelementptr inbounds i8, ptr [[TMP17]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD26:%.*]] = load , ptr [[TMP66]], align 1 +; CHECK-MAXBW-NEXT: [[TMP71:%.*]] = sext [[WIDE_LOAD26]] to +; CHECK-MAXBW-NEXT: [[TMP73:%.*]] = mul nsw [[TMP65]], [[TMP71]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE16]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI4]], [[TMP73]]) +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP74:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum3 = phi i32 [ 0, %entry ], [ %add.a3, %for.body ] + %accum2 = phi i32 [ 0, %entry ], [ %add.a2, %for.body ] + %accum1 = phi i32 [ 0, %entry ], [ %add.a1, %for.body ] + %accum0 = phi i32 [ 0, %entry ], [ %add.a0, %for.body ] + %gep.a0 = getelementptr inbounds i8, ptr %a, i64 %iv + %gep.b0 = getelementptr inbounds i8, ptr %b, i64 %iv + %offset.1 = or disjoint i64 %iv, 1 + %gep.a1 = getelementptr inbounds i8, ptr %a, i64 %offset.1 + %gep.b1 = getelementptr inbounds i8, ptr %b, i64 %offset.1 + %offset.2 = or disjoint i64 %iv, 2 + %gep.a2 = getelementptr inbounds i8, ptr %a, i64 %offset.2 + %gep.b2 = getelementptr inbounds i8, ptr %b, i64 %offset.2 + %offset.3 = or disjoint i64 %iv, 3 + %gep.a3 = getelementptr inbounds i8, ptr %a, i64 %offset.3 + %gep.b3 = getelementptr inbounds i8, ptr %b, i64 %offset.3 + %load.a0 = load i8, ptr %gep.a0, align 1 + %ext.a0 = sext i8 %load.a0 to i32 + %load.b0 = load i8, ptr %gep.b0, align 1 + %ext.b0 = sext i8 %load.b0 to i32 + %mul.a0 = mul nsw i32 %ext.b0, %ext.a0 + %add.a0 = add nsw i32 %mul.a0, %accum0 + %load.a1 = load i8, ptr %gep.a1, align 1 + %ext.a1 = sext i8 %load.a1 to i32 + %load.b1 = load i8, ptr %gep.b1, align 1 + %ext.b1 = sext i8 %load.b1 to i32 + %mul.a1 = mul nsw i32 %ext.a1, %ext.b1 + %add.a1 = add nsw i32 %mul.a1, %accum1 + %load.a2 = load i8, ptr %gep.a2, align 1 + %ext.a2 = sext i8 %load.a2 to i32 + %load.b2 = load i8, ptr %gep.b2, align 1 + %ext.b2 = sext i8 %load.b2 to i32 + %mul.a2 = mul nsw i32 %ext.a2, %ext.b2 + %add.a2 = add nsw i32 %mul.a2, 
%accum2 + %load.a3 = load i8, ptr %gep.a3, align 1 + %ext.a3 = sext i8 %load.a3 to i32 + %load.b3 = load i8, ptr %gep.b3, align 1 + %ext.b3 = sext i8 %load.b3 to i32 + %mul.a3 = mul nsw i32 %ext.a3, %ext.b3 + %add.a3 = add nsw i32 %mul.a3, %accum3 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %num_in + br i1 %exitcond.not, label %exit, label %for.body + +exit: ; preds = %for.body + %result0 = add nsw i32 %add.a0, %add.a1 + %result1 = add nsw i32 %add.a2, %add.a3 + %result = add nsw i32 %result0, %result1 + ret i32 %result +} + +define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated( +; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP7]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP10]], 4 +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = mul i64 [[TMP11]], 4 +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP5]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = sext [[WIDE_LOAD1]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul nsw [[TMP12]], [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add [[TMP13]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP14]]) +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_PH]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated( +; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = call i64 
@llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul i64 [[TMP8]], 8 +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP14]] +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP15]], 8 +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[TMP17]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP7]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP18]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = sext [[WIDE_LOAD3]] to +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext [[WIDE_LOAD4]] to +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw [[TMP19]], [[TMP12]] +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul nsw [[TMP20]], [[TMP25]] +; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add [[TMP21]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add [[TMP22]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add [[TMP24]], [[TMP23]] +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_PH]] +; +; 
CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated( +; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP8]], align 1 +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP15]], align 1 +; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = sext [[WIDE_LOAD4]] to +; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul nsw [[TMP20]], [[TMP13]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE5]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI1]], [[TMP22]]) +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = sext i8 %load.a to i32 + %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = sext i8 %load.b to i32 + %mul = mul nsw i32 %ext.b, %ext.a + %add = add nsw i32 %mul, %accum + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %exit, label %for.body + +exit: ; preds = %for.body + ret i32 %add +} + +define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) #0 { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated_pragma( +; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = sub i64 [[TMP10]], 1 
+; CHECK-INTERLEAVE1-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP11]]
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP10]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[TMP0]], i64 0
+; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP5]], i32 1, [[ACTIVE_LANE_MASK]], poison)
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext [[WIDE_MASKED_LOAD]] to
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP8]], i32 1, [[ACTIVE_LANE_MASK]], poison)
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = sext [[WIDE_MASKED_LOAD1]] to
+; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = mul nsw [[TMP16]], [[TMP13]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = add [[TMP17]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP19]] = select [[ACTIVE_LANE_MASK]], [[TMP18]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]]
+; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP2]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = extractelement [[TMP20]], i32 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated_pragma(
+; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = sub i64 [[TMP10]], 1
+; CHECK-INTERLEAVED-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP11]]
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[TMP0]], i64 0
+; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP5]], i32 1, [[ACTIVE_LANE_MASK]], poison)
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext [[WIDE_MASKED_LOAD]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP8]], i32 1, [[ACTIVE_LANE_MASK]], poison)
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = sext [[WIDE_MASKED_LOAD1]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul nsw [[TMP16]], [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = add [[TMP17]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[TMP19]] = select [[ACTIVE_LANE_MASK]], [[TMP18]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP2]])
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = extractelement [[TMP20]], i32 0
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated_pragma(
+; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-MAXBW-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP2]]
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
+; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]], poison)
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_MASKED_LOAD]] to
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
+; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP15]], i32 1, [[ACTIVE_LANE_MASK]], poison)
+; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = sext [[WIDE_MASKED_LOAD1]] to
+; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw [[TMP16]], [[TMP13]]
+; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = add [[TMP17]], [[VEC_PHI]]
+; CHECK-MAXBW-NEXT: [[TMP19]] = select [[ACTIVE_LANE_MASK]], [[TMP18]], [[VEC_PHI]]
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
+; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = extractelement [[TMP20]], i32 0
+; CHECK-MAXBW-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %gep.a = getelementptr inbounds i8, ptr %b, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %ext.a = sext i8 %load.a to i32
+  %gep.a2 = getelementptr inbounds i8, ptr %a, i64 %iv
+  %load.b = load i8, ptr %gep.a2, align 1
+  %ext.b = sext i8 %load.b to i32
+  %mul = mul nsw i32 %ext.b, %ext.a
+  %add = add nsw i32 %mul, %accum
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !7
+
+exit:                                             ; preds = %for.body
+  ret i32 %add
+}
+
+define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_extend_user(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = mul i64 [[TMP8]], 4
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP10]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP11]], 4
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP7]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD1]] to
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul [[TMP12]], [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add [[TMP13]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_extend_user(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-INTERLEAVED-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP7]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP14]], 8
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP11]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD2]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP9]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP18]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = zext [[WIDE_LOAD3]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext [[WIDE_LOAD4]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul [[TMP19]], [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul [[TMP20]], [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add [[TMP21]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add [[TMP22]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_extend_user(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP8]], align 1
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD2]] to
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP14]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP15]], align 1
+; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = zext [[WIDE_LOAD4]] to
+; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul [[TMP20]], [[TMP13]]
+; CHECK-MAXBW-NEXT: [[TMP24]] = add [[TMP22]], [[VEC_PHI1]]
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %gep.a = getelementptr i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %ext.a = zext i8 %load.a to i32
+  %gep.b = getelementptr i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %ext.b = zext i8 %load.b to i32
+  %mul = mul i32 %ext.b, %ext.a
+  %add = add i32 %mul, %accum
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 0
+  br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit:                                         ; preds = %for.body
+  %result = add i32 %add, %ext.b
+  ret i32 %result
+}
+
+define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i64 @dotp_cost_disagreement(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP11]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP12]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD1]] to
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = mul nuw nsw [[TMP13]], [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP15]] = add [[VEC_PHI]], [[TMP14]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+;
+; CHECK-INTERLEAVED-LABEL: define i64 @dotp_cost_disagreement(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i64 [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP11]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD2]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add nuw nsw i64 [[TMP6]], 1
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP15]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP15]], i64 [[TMP18]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP16]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext [[WIDE_LOAD3]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = zext [[WIDE_LOAD4]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul nuw nsw [[TMP20]], [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = mul nuw nsw [[TMP21]], [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add [[VEC_PHI]], [[TMP22]]
+; CHECK-INTERLEAVED-NEXT: [[TMP25]] = add [[VEC_PHI1]], [[TMP23]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+;
+; CHECK-MAXBW-LABEL: define i64 @dotp_cost_disagreement(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]]
+; CHECK-MAXBW-NEXT:
br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 1 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 1 +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP10]] +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP11]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD1]] to +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = mul nuw nsw [[TMP13]], [[TMP9]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.experimental.vector.partial.reduce.add.nxv1i64.nxv8i64( [[VEC_PHI]], [[TMP14]]) +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.nxv1i64( [[PARTIAL_REDUCE]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] + %sum = phi i64 [ 0, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds nuw i8, ptr %a, i64 %i.iv + %0 = load i8, ptr %arrayidx, align 1 + %conv = zext i8 %0 to i64 + %i.iv.next = add nuw nsw i64 %i.iv, 1 + %arrayidx2 = getelementptr inbounds nuw i8, ptr %b, i64 %i.iv.next + %1 = load i8, ptr %arrayidx2, align 1 + %conv3 = zext i8 %1 to i64 + %mul = mul nuw nsw i64 %conv3, %conv + %add = add i64 %sum, %mul + %exitcond.not = icmp eq i64 %i.iv.next, 16 + br i1 %exitcond.not, label %exit, label %for.body + +exit: ; preds = %for.body + ret i64 %add +} + +define void @not_dotp_not_phi2(ptr %matrix, i32 %n) #0 { +; CHECK-INTERLEAVE1-LABEL: define void @not_dotp_not_phi2( +; CHECK-INTERLEAVE1-SAME: ptr [[MATRIX:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP]], label [[FOR_PREHEADER:%.*]], label [[EXIT:%.*]] +; CHECK-INTERLEAVE1: for.preheader: +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr null, align 1 +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A1:%.*]] = load i8, ptr inttoptr (i64 1 to ptr), align 1 +; 
CHECK-INTERLEAVE1-NEXT: [[A_EXT:%.*]] = sext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[A_EXT1:%.*]] = sext i8 [[LOAD_A1]] to i32 +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_PREHEADER]] ] +; CHECK-INTERLEAVE1-NEXT: [[PTR:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[FOR_BODY]] ], [ [[MATRIX]], [[FOR_PREHEADER]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[ADD_1:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_PREHEADER]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[PTR]], i64 1 +; CHECK-INTERLEAVE1-NEXT: [[GEP_B1:%.*]] = getelementptr i8, ptr [[PTR]], i64 2 +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[B_EXT:%.*]] = sext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] +; CHECK-INTERLEAVE1-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B1:%.*]] = load i8, ptr [[GEP_B1]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[B_EXT1:%.*]] = sext i8 [[LOAD_B1]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[A_EXT1]], [[B_EXT1]] +; CHECK-INTERLEAVE1-NEXT: [[ADD_1]] = add i32 [[MUL_1]], [[ADD]] +; CHECK-INTERLEAVE1-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[PTR]], i64 16 +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]] +; +; CHECK-INTERLEAVED-LABEL: define void @not_dotp_not_phi2( +; CHECK-INTERLEAVED-SAME: ptr [[MATRIX:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP]], label [[FOR_PREHEADER:%.*]], label [[EXIT:%.*]] +; CHECK-INTERLEAVED: for.preheader: +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr null, align 1 +; CHECK-INTERLEAVED-NEXT: [[LOAD_A1:%.*]] = load i8, ptr inttoptr (i64 1 to ptr), align 1 +; CHECK-INTERLEAVED-NEXT: [[A_EXT:%.*]] = sext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVED-NEXT: [[A_EXT1:%.*]] = sext i8 [[LOAD_A1]] to i32 +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2 +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2 +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[N_VEC]], 16 +; CHECK-INTERLEAVED-NEXT: [[IND_END1:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP1]] +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 
[[OFFSET_IDX]], 16 +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[NEXT_GEP3]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[NEXT_GEP3]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = load i8, ptr [[TMP4]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = sext i8 [[TMP8]] to i32 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = sext i8 [[TMP9]] to i32 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul nsw i32 [[A_EXT]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul nsw i32 [[A_EXT]], [[TMP11]] +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add i32 [[TMP12]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = add i32 [[TMP13]], [[VEC_PHI2]] +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = load i8, ptr [[TMP6]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = load i8, ptr [[TMP7]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext i8 [[TMP16]] to i32 +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = sext i8 [[TMP17]] to i32 +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul nsw i32 [[A_EXT1]], [[TMP18]] +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw i32 [[A_EXT1]], [[TMP19]] +; CHECK-INTERLEAVED-NEXT: [[TMP22]] = add i32 [[TMP20]], [[TMP14]] +; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add i32 [[TMP21]], [[TMP15]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; +; CHECK-MAXBW-LABEL: define void @not_dotp_not_phi2( +; CHECK-MAXBW-SAME: ptr [[MATRIX:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[FOR_PREHEADER:%.*]], label [[EXIT:%.*]] +; CHECK-MAXBW: for.preheader: +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr null, align 1 +; CHECK-MAXBW-NEXT: [[LOAD_A1:%.*]] = load i8, ptr inttoptr (i64 1 to ptr), align 1 +; CHECK-MAXBW-NEXT: [[A_EXT:%.*]] = sext i8 [[LOAD_A]] to i32 +; CHECK-MAXBW-NEXT: [[A_EXT1:%.*]] = sext i8 [[LOAD_A1]] to i32 +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_PREHEADER]] ] +; CHECK-MAXBW-NEXT: [[PTR:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[FOR_BODY]] ], [ [[MATRIX]], [[FOR_PREHEADER]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[ADD_1:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_PREHEADER]] ] +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[PTR]], i64 1 +; CHECK-MAXBW-NEXT: [[GEP_B1:%.*]] = getelementptr i8, ptr [[PTR]], i64 2 +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-MAXBW-NEXT: [[B_EXT:%.*]] = sext i8 [[LOAD_B]] to i32 +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] +; CHECK-MAXBW-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[LOAD_B1:%.*]] = load i8, ptr [[GEP_B1]], align 1 +; CHECK-MAXBW-NEXT: [[B_EXT1:%.*]] = sext i8 [[LOAD_B1]] to i32 +; CHECK-MAXBW-NEXT: [[MUL_1:%.*]] = 
mul nsw i32 [[A_EXT1]], [[B_EXT1]] +; CHECK-MAXBW-NEXT: [[ADD_1]] = add i32 [[MUL_1]], [[ADD]] +; CHECK-MAXBW-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[PTR]], i64 16 +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]] +; +entry: + %cmp = icmp sgt i32 %n, 0 + br i1 %cmp, label %for.preheader, label %exit + +for.preheader: ; preds = %entry + %load.a = load i8, ptr inttoptr (i64 0 to ptr), align 1 + %load.a1 = load i8, ptr inttoptr (i64 1 to ptr), align 1 + %a.ext = sext i8 %load.a to i32 + %a.ext1 = sext i8 %load.a1 to i32 + br label %for.body + +for.body: ; preds = %for.preheader, %for.body + %iv = phi i32 [ %iv.next, %for.body ], [ 0, %for.preheader ] + %ptr = phi ptr [ %scevgep, %for.body ], [ %matrix, %for.preheader ] + %accum = phi i32 [ %add.1, %for.body ], [ 0, %for.preheader ] + %gep.b = getelementptr i8, ptr %ptr, i64 1 + %gep.b1 = getelementptr i8, ptr %ptr, i64 2 + %load.b = load i8, ptr %gep.b, align 1 + %b.ext = sext i8 %load.b to i32 + %mul = mul nsw i32 %a.ext, %b.ext + %add = add i32 %mul, %accum + %load.b1 = load i8, ptr %gep.b1, align 1 + %b.ext1 = sext i8 %load.b1 to i32 + %mul.1 = mul nsw i32 %a.ext1, %b.ext1 + %add.1 = add i32 %mul.1, %add + %scevgep = getelementptr i8, ptr %ptr, i64 16 + %iv.next = add nuw nsw i32 %iv, 1 + %exitcond.not = icmp eq i32 %iv.next, %n + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + %add.1.lcssa = phi i32 [ %add.1, %for.body ] + %add.float = sitofp i32 %add.1.lcssa to float + br label %exit + +exit: ; preds = %for.exit, %entry + %result = phi float [ 0.000000e+00, %entry ], [ %add.float, %for.exit ] + store float %result, ptr %matrix, align 4 + ret void +} + +define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 { +; CHECK-INTERLEAVE1-LABEL: define i64 @not_dotp_ext_outside_plan( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]] +; CHECK-INTERLEAVE1: for.ph: +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i16 [[B]] to i64 +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[EXT_B]], i64 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i32 0 +; 
CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul nuw nsw <8 x i64> [[TMP3]], [[BROADCAST_SPLAT]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-INTERLEAVED-LABEL: define i64 @not_dotp_ext_outside_plan( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]] +; CHECK-INTERLEAVED: for.ph: +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = zext i16 [[B]] to i64 +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16 +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[EXT_B]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[WIDE_LOAD2]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul nuw nsw <8 x i64> [[TMP4]], [[BROADCAST_SPLAT]] +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul nuw nsw <8 x i64> [[TMP5]], [[BROADCAST_SPLAT]] +; CHECK-INTERLEAVED-NEXT: [[TMP8]] = add <8 x i64> [[TMP6]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <8 x i64> [[TMP7]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = 
icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]] +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-MAXBW-LABEL: define i64 @not_dotp_ext_outside_plan( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0 +; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]] +; CHECK-MAXBW: for.ph: +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i16 [[B]] to i64 +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EXT_B]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP7]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 2 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = mul nuw nsw [[TMP9]], [[BROADCAST_SPLAT]] +; CHECK-MAXBW-NEXT: [[TMP11]] = add [[TMP10]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64( [[TMP11]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; +entry: + %cmp = icmp eq i64 %n, 0 + br i1 %cmp, label %exit, label %for.ph + +for.ph: ; preds = %entry + %ext.b = zext i16 %b to i64 + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %iv = phi i64 [ 0, %for.ph ], [ %iv.next, %for.body ] + %accum = phi i64 [ 0, %for.ph ], [ %add, %for.body ] + %gep.a = getelementptr inbounds nuw i16, ptr %a, i64 
%iv + %load.a = load i16, ptr %gep.a, align 2 + %ext.a = zext i16 %load.a to i64 + %mul = mul nuw nsw i64 %ext.a, %ext.b + %add = add i64 %mul, %accum + %iv.next = add nuw nsw i64 %iv, 1 + %cmp.1 = icmp eq i64 %iv.next, %n + br i1 %cmp.1, label %exit, label %for.body + +exit: ; preds = %for.cond.cleanup.loopexit, %entry + %result = phi i64 [ 0, %entry ], [ %add, %for.body ] + ret i64 %result +} + +define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { +; CHECK-INTERLEAVE1-LABEL: define i64 @not_dotp_ext_outside_plan2( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]] +; CHECK-INTERLEAVE1: for.ph: +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i16 [[B]] to i64 +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[EXT_B]], i64 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul nuw nsw <8 x i64> [[BROADCAST_SPLAT]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-INTERLEAVED-LABEL: define i64 @not_dotp_ext_outside_plan2( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]] +; CHECK-INTERLEAVED: for.ph: +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = zext i16 [[B]] to i64 +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16 +; CHECK-INTERLEAVED-NEXT: br i1 
[[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[EXT_B]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[WIDE_LOAD2]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul nuw nsw <8 x i64> [[BROADCAST_SPLAT]], [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul nuw nsw <8 x i64> [[BROADCAST_SPLAT]], [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[TMP8]] = add <8 x i64> [[TMP6]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <8 x i64> [[TMP7]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]] +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-MAXBW-LABEL: define i64 @not_dotp_ext_outside_plan2( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0 +; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]] +; CHECK-MAXBW: for.ph: +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i16 [[B]] to i64 +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; 
CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EXT_B]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP7]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 2 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = mul nuw nsw [[BROADCAST_SPLAT]], [[TMP9]] +; CHECK-MAXBW-NEXT: [[TMP11]] = add [[TMP10]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64( [[TMP11]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; +entry: + %cmp = icmp eq i64 %n, 0 + br i1 %cmp, label %exit, label %for.ph + +for.ph: ; preds = %entry + %ext.b = zext i16 %b to i64 + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %iv = phi i64 [ 0, %for.ph ], [ %iv.next, %for.body ] + %accum = phi i64 [ 0, %for.ph ], [ %add, %for.body ] + %gep.a = getelementptr inbounds nuw i16, ptr %a, i64 %iv + %load.a = load i16, ptr %gep.a, align 2 + %ext.a = zext i16 %load.a to i64 + %mul = mul nuw nsw i64 %ext.b, %ext.a + %add = add i64 %mul, %accum + %iv.next = add nuw nsw i64 %iv, 1 + %cmp.1 = icmp eq i64 %iv.next, %n + br i1 %cmp.1, label %exit, label %for.body + +exit: ; preds = %for.cond.cleanup.loopexit, %entry + %result = phi i64 [ 0, %entry ], [ %add, %for.body ] + ret i64 %result +} + +!7 = distinct !{!7, !8, !9, !10} +!8 = !{!"llvm.loop.mustprogress"} +!9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} +!10 = !{!"llvm.loop.vectorize.enable", i1 true} +attributes #0 = { vscale_range(1,16) "target-features"="+sve" } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll new file mode 100644 index 0000000000000..f24b115ab9f99 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll @@ -0,0 +1,61 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon -S < %s | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +define i32 @not_dotp(ptr %a, ptr %b) { +; CHECK-LABEL: define i32 @not_dotp( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: 
[[ENTRY:.*]]: +; CHECK-NEXT: br i1 true, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP14]] = add <16 x i32> [[TMP12]], [[VEC_PHI1]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll new file mode 100644 index 0000000000000..5dd9f8ff97cca --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll @@ -0,0 +1,94 @@ +; REQUIRES: asserts +; RUN: opt -mattr=+neon,+dotprod -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-interleave=1 -disable-output %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +; Tests for printing VPlans that are enabled under AArch64 + +define i32 @print_partial_reduction(ptr %a, ptr %b) { +; CHECK: VPlan 'Initial VPlan for VF={8,16},UF>=1' { +; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF 
+; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count +; CHECK-NEXT: Live-in ir<0> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi ir<0>, ir<[[REDUCE:%.+]]> (VF scaled by 1/4) +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> +; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[PTR_A:%.+]]> = vector-pointer ir<%gep.a> +; CHECK-NEXT: WIDEN ir<%load.a> = load vp<[[PTR_A]]> +; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32 +; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[PTR_B:%.+]]> = vector-pointer ir<%gep.b> +; CHECK-NEXT: WIDEN ir<%load.b> = load vp<[[PTR_B]]> +; CHECK-NEXT: WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32 +; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a> +; CHECK-NEXT: PARTIAL-REDUCE ir<[[REDUCE]]> = add ir<%mul>, ir<[[ACC]]> +; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<[[ACC]]>, ir<[[REDUCE]]> +; CHECK-NEXT: EMIT vp<[[EXTRACT:%.+]]> = extract-from-end vp<[[RED_RESULT]]>, ir<1> +; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<0>, vp<%1> +; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> +; CHECK-NEXT: Successor(s): ir-bb, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: EMIT vp<%bc.resume.val> = resume-phi vp<[[VEC_TC]]>, ir<0> +; CHECK-NEXT: EMIT vp<%bc.merge.rdx> = resume-phi vp<[[RED_RESULT]]>, ir<0> +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] +; CHECK-NEXT: IR %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] (extra operand: vp<%bc.merge.rdx> from scalar.ph) +; CHECK-NEXT: IR %gep.a = getelementptr i8, ptr %a, i64 %iv +; CHECK-NEXT: IR %load.a = load i8, ptr %gep.a, align 1 +; CHECK-NEXT: IR %ext.a = zext i8 %load.a to i32 +; CHECK-NEXT: IR %gep.b = getelementptr i8, ptr %b, i64 %iv +; CHECK-NEXT: IR %load.b = load i8, ptr %gep.b, align 1 +; CHECK-NEXT: IR %ext.b = zext i8 %load.b to i32 +; CHECK-NEXT: IR %mul = mul i32 %ext.b, %ext.a +; CHECK-NEXT: IR %add = add i32 %mul, %accum +; CHECK-NEXT: IR %iv.next = add i64 %iv, 1 +; CHECK-NEXT: IR %exitcond.not = icmp eq i64 %iv.next, 0 +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[EXTRACT]]> from middle.block) +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, 
%accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret i32 %add +} From 5315f3f8cb8f562ec39f57f2fce79c8e017595f9 Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Mon, 13 Jan 2025 11:24:05 +0000 Subject: [PATCH 251/408] Handle leading underscores in update_cc_test_checks.py (#121800) For some ABIs `update_cc_test_checks.py` is unable to generate tests because of the mismatch between the mangled function names reported by clang's `-ast-dump` and the function names in LLVM IR. This patch fixes it by stripping the leading underscore from the mangled name for global functions if the data layout string says they have one. --- .../Inputs/c-symbol-mangling.c | 1 - .../Inputs/c-symbol-mangling.c.expected | 16 +++++++++- llvm/utils/UpdateTestChecks/common.py | 16 ++++++++++ llvm/utils/update_cc_test_checks.py | 30 +++++++++++++++---- 4 files changed, 55 insertions(+), 8 deletions(-) diff --git a/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c b/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c index 018f992640065..58feddeb6bea0 100644 --- a/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c +++ b/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c @@ -18,7 +18,6 @@ // UTC_ARGS: --enable #ifdef __arm__ -/// FIXME: UTC does not find this function, but can find all others. typedef __attribute__((neon_vector_type(8))) __INT8_TYPE__ int8x8_t; int8x8_t test_vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c) { return a + b + c; } diff --git a/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c.expected b/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c.expected index 5d514f9d64c02..e17ce61db9c2b 100644 --- a/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c.expected +++ b/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c.expected @@ -18,8 +18,22 @@ // UTC_ARGS: --enable #ifdef __arm__ -/// FIXME: UTC does not find this function, but can find all others.
typedef __attribute__((neon_vector_type(8))) __INT8_TYPE__ int8x8_t; +// THUMB-DARWIN-LABEL: @test_vaba_s8( +// THUMB-DARWIN-NEXT: entry: +// THUMB-DARWIN-NEXT: [[A_ADDR:%.*]] = alloca <8 x i8>, align 8 +// THUMB-DARWIN-NEXT: [[B_ADDR:%.*]] = alloca <8 x i8>, align 8 +// THUMB-DARWIN-NEXT: [[C_ADDR:%.*]] = alloca <8 x i8>, align 8 +// THUMB-DARWIN-NEXT: store <8 x i8> [[A:%.*]], ptr [[A_ADDR]], align 8 +// THUMB-DARWIN-NEXT: store <8 x i8> [[B:%.*]], ptr [[B_ADDR]], align 8 +// THUMB-DARWIN-NEXT: store <8 x i8> [[C:%.*]], ptr [[C_ADDR]], align 8 +// THUMB-DARWIN-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[A_ADDR]], align 8 +// THUMB-DARWIN-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[B_ADDR]], align 8 +// THUMB-DARWIN-NEXT: [[ADD:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]] +// THUMB-DARWIN-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[C_ADDR]], align 8 +// THUMB-DARWIN-NEXT: [[ADD1:%.*]] = add <8 x i8> [[ADD]], [[TMP2]] +// THUMB-DARWIN-NEXT: ret <8 x i8> [[ADD1]] +// int8x8_t test_vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c) { return a + b + c; } diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py index e1cc02e1a608c..1a875c2b523e4 100644 --- a/llvm/utils/UpdateTestChecks/common.py +++ b/llvm/utils/UpdateTestChecks/common.py @@ -557,6 +557,10 @@ def invoke_tool(exe, cmd_args, ir, preprocess_cmd=None, verbose=False): UTC_AVOID = "NOTE: Do not autogenerate" UNUSED_NOTE = "NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:" +DATA_LAYOUT_RE = re.compile( + r"target\s+datalayout\s+=\s+\"(?P<layout>.+)\"$", flags=(re.M | re.S) +) + OPT_FUNCTION_RE = re.compile( r"^(\s*;\s*Function\sAttrs:\s(?P<attrs>[\w\s():,]+?))?\s*define\s+(?P<funcdef_attrs_and_ret>[^@]*)@(?P<func>[\w.$-]+?)\s*" r"(?P<args_and_sig>\((\)|(.*?[\w.-]+?)\))[^{]*\{)\n(?P<body>.*?)^\}$", @@ -651,6 +655,18 @@ def get_triple_from_march(march): return "x86" +def get_globals_name_prefix(raw_tool_output): + m = DATA_LAYOUT_RE.search(raw_tool_output) + if not m: + return None + data_layout = m.group("layout") + idx = data_layout.find("m:") + if idx < 0: + return None + ch = data_layout[idx + 2] + return "_" if ch == "o" or ch == "x" else None + + def apply_filters(line, filters): has_filter = False for f in filters: diff --git a/llvm/utils/update_cc_test_checks.py b/llvm/utils/update_cc_test_checks.py index 3ffb07ddf6ad8..7a4796eaabb3b 100755 --- a/llvm/utils/update_cc_test_checks.py +++ b/llvm/utils/update_cc_test_checks.py @@ -34,7 +34,7 @@ } -def get_line2func_list(args, clang_args): +def get_line2func_list(args, clang_args, globals_name_prefix): ret = collections.defaultdict(list) # Use clang's JSON AST dump to get the mangled name json_dump_args = [args.clang] + clang_args + ["-fsyntax-only", "-o", "-"] @@ -122,6 +122,14 @@ def parse_clang_ast_json(node, loc, search): if search is None: search = spell mangled = node.get("mangledName", spell) + # Clang's AST dump includes the globals prefix, but when Clang emits + # LLVM IR this is not included and instead added as part of the asm + # output. Strip it from the mangled name of globals when needed + # (see DataLayout::getGlobalPrefix()).
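# A rough sketch of what the new check buys us (an illustrative aside, not
# part of the patch; the data layout strings below are made-up examples).
# The "m:" component of a data layout string selects the mangling scheme,
# and get_globals_name_prefix() above maps "m:o" (Mach-O) and "m:x"
# (Windows x86 COFF) to a "_" prefix: on those targets the AST dump reports
# a C function foo() as "_foo" while the IR function is still named @foo.
#
#   get_globals_name_prefix('target datalayout = "e-m:o-i64:64-n32:64-S128"')
#   # -> "_"   (strip it before matching AST names against IR names)
#   get_globals_name_prefix('target datalayout = "e-m:e-i64:64-n32:64-S128"')
#   # -> None  (ELF mangling: the names already match)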
+ if globals_name_prefix: + storage = node.get("storageClass", None) + if storage != "static" and mangled[0] == globals_name_prefix: + mangled = mangled[1:] ret[int(line) - 1].append((spell, mangled, search)) ast = json.loads(stdout) @@ -249,10 +257,10 @@ def config(): return args, parser -def get_function_body(builder, args, filename, clang_args, extra_commands, prefixes): +def get_function_body( + builder, args, filename, clang_args, extra_commands, prefixes, raw_tool_output +): # TODO Clean up duplication of asm/common build_function_body_dictionary - # Invoke external tool and extract function bodies. - raw_tool_output = common.invoke_tool(args.clang, clang_args, filename) for extra_command in extra_commands: extra_args = shlex.split(extra_command) with tempfile.NamedTemporaryFile() as f: @@ -383,13 +391,23 @@ def main(): common.debug("Extracted clang cmd: clang {}".format(clang_args)) common.debug("Extracted FileCheck prefixes: {}".format(prefixes)) + # Invoke external tool and extract function bodies. + raw_tool_output = common.invoke_tool(ti.args.clang, clang_args, ti.path) get_function_body( - builder, ti.args, ti.path, clang_args, extra_commands, prefixes + builder, + ti.args, + ti.path, + clang_args, + extra_commands, + prefixes, + raw_tool_output, ) # Invoke clang -Xclang -ast-dump=json to get mapping from start lines to # mangled names. Forward all clang args for now. - for k, v in get_line2func_list(ti.args, clang_args).items(): + for k, v in get_line2func_list( + ti.args, clang_args, common.get_globals_name_prefix(raw_tool_output) + ).items(): line2func_list[k].extend(v) func_dict = builder.finish_and_get_func_dict() From 6c5941b09fca487efc5000c82bbce6054bf36a7c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 10 Jan 2025 17:52:06 +0000 Subject: [PATCH 252/408] [X86] subvectorwise-store-of-vector-splat.ll - regenerate VPTERNLOG comments --- .../subvectorwise-store-of-vector-splat.ll | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll index f1fd05565c47e..df8a85fd07258 100644 --- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll +++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll @@ -389,7 +389,7 @@ define void @vec128_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec128_v2i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rdx) @@ -452,7 +452,7 @@ define void @vec128_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec128_v2f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rdx) @@ -599,7 +599,7 @@ define void @vec128_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. 
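; Aside: the regenerated comments in the hunks below can be checked by hand.
; VPTERNLOGQ's immediate is a 3-input truth table: bit (4*A + 2*B + C) of
; the immediate gives the result bit for source bits A, B, C. The old "$15"
; immediate is 0x0F = 0b00001111, set exactly for the entries with A = 0,
; so the result is ~A whatever B and C are. With all three operands tied to
; xmm0, that is precisely the new "xmm0 = ~xmm0" form.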
; AVX512-LABEL: vec128_v4i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rdx) @@ -694,7 +694,7 @@ define void @vec128_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; AVX512-LABEL: vec128_v8i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rdx) @@ -1003,7 +1003,7 @@ define void @vec256_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec256_v2i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -1079,7 +1079,7 @@ define void @vec256_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec256_v2f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -1355,7 +1355,7 @@ define void @vec256_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec256_v4i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -1550,7 +1550,7 @@ define void @vec256_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; AVX512-LABEL: vec256_v8i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -2170,7 +2170,7 @@ define void @vec384_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec384_v2i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -2258,7 +2258,7 @@ define void @vec384_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. 
; AVX512-LABEL: vec384_v2f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -2722,7 +2722,7 @@ define void @vec384_v3i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; AVX512-LABEL: vec384_v3i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vpextrb $2, %xmm0, 2(%rsi) ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: movw %ax, (%rsi) @@ -3006,7 +3006,7 @@ define void @vec384_v3i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec384_v3i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rsi) ; AVX512-NEXT: vmovd %xmm0, (%rsi) ; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rdx) @@ -3664,7 +3664,7 @@ define void @vec384_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec384_v4i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -3983,7 +3983,7 @@ define void @vec384_v6i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; AVX512-LABEL: vec384_v6i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rsi) ; AVX512-NEXT: vmovd %xmm0, (%rsi) ; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rdx) @@ -4420,7 +4420,7 @@ define void @vec384_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; AVX512-LABEL: vec384_v8i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -5444,7 +5444,7 @@ define void @vec512_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec512_v2i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -5540,7 +5540,7 @@ define void @vec512_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec512_v2f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -5965,7 +5965,7 @@ define void @vec512_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. 
; AVX512-LABEL: vec512_v4i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -6363,7 +6363,7 @@ define void @vec512_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; AVX512-LABEL: vec512_v8i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) From b605dab7a8352158ee0d399b8c3433f9a8b495a3 Mon Sep 17 00:00:00 2001 From: Eisuke Kawashima Date: Mon, 13 Jan 2025 21:00:35 +0900 Subject: [PATCH 253/408] [Polly] Use "is" instead of "==" to check for None (#94021) From PEP8 (https://peps.python.org/pep-0008/#programming-recommendations): > Comparisons to singletons like None should always be done with is or is not, never the equality operators. --- polly/lib/External/isl/interface/python.cc | 2 +- polly/lib/External/isl/libisl-gdb.py | 4 ++-- polly/lib/External/isl/python/isl.py.top | 4 ++-- polly/test/lit.site.cfg.in | 2 +- polly/utils/pyscop/isl.py | 8 ++++---- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/polly/lib/External/isl/interface/python.cc b/polly/lib/External/isl/interface/python.cc index e4a8288631297..b60bf315ca703 100644 --- a/polly/lib/External/isl/interface/python.cc +++ b/polly/lib/External/isl/interface/python.cc @@ -347,7 +347,7 @@ static void print_persistent_callback_failure_check(int indent, printf(fmt, 0); printf(", '%s') and ", callback_name.c_str()); printf(fmt, 0); - printf(".%s['exc_info'] != None:\n", callback_name.c_str()); + printf(".%s['exc_info'] is not None:\n", callback_name.c_str()); print_indent(indent, " exc_info = "); printf(fmt, 0); printf(".%s['exc_info'][0]\n", callback_name.c_str()); diff --git a/polly/lib/External/isl/libisl-gdb.py b/polly/lib/External/isl/libisl-gdb.py index bf01bc583d15d..bdd3949cf89c0 100644 --- a/polly/lib/External/isl/libisl-gdb.py +++ b/polly/lib/External/isl/libisl-gdb.py @@ -70,7 +70,7 @@ def invoke(self, arg, from_tty): arg = gdb.parse_and_eval(arg) printer = str_lookup_function(arg) - if printer == None: + if printer is None: print("No isl printer for this type") return @@ -90,7 +90,7 @@ def str_lookup_function(val): lookup_tag = val.type.target() regex = re.compile("^isl_(.*)$") - if lookup_tag == None: + if lookup_tag is None: return None m = regex.match(str(lookup_tag)) diff --git a/polly/lib/External/isl/python/isl.py.top b/polly/lib/External/isl/python/isl.py.top index d041315d4e11d..9dc47a1a83251 100644 --- a/polly/lib/External/isl/python/isl.py.top +++ b/polly/lib/External/isl/python/isl.py.top @@ -3,7 +3,7 @@ from ctypes import * from ctypes.util import find_library isl_dyld_library_path = os.environ.get('ISL_DYLD_LIBRARY_PATH') -if isl_dyld_library_path != None: +if isl_dyld_library_path is not None: os.environ['DYLD_LIBRARY_PATH'] = isl_dyld_library_path try: isl = cdll.LoadLibrary(isl_dlname) @@ -29,7 +29,7 @@ class Context: @staticmethod def getDefaultInstance(): - if Context.defaultInstance == None: + if Context.defaultInstance is None: Context.defaultInstance = Context() return Context.defaultInstance diff --git a/polly/test/lit.site.cfg.in b/polly/test/lit.site.cfg.in index d8a0b6ae3a3b2..f22063e796def 100644 --- 
a/polly/test/lit.site.cfg.in +++ b/polly/test/lit.site.cfg.in @@ -14,7 +14,7 @@ config.extra_paths = "@POLLY_TEST_EXTRA_PATHS@".split(";") ## Check the current platform with regex import re EAT_ERR_ON_X86 = ' ' -if (re.match(r'^x86_64*', '@LLVM_TARGET_TRIPLE@') == None) : +if (re.match(r'^x86_64*', '@LLVM_TARGET_TRIPLE@') is None) : EAT_ERR_ON_X86 = '|| echo \"error is eaten\"' for arch in config.targets_to_build.split(): diff --git a/polly/utils/pyscop/isl.py b/polly/utils/pyscop/isl.py index 5eaf7798e20b9..c06b7bca28042 100644 --- a/polly/utils/pyscop/isl.py +++ b/polly/utils/pyscop/isl.py @@ -24,7 +24,7 @@ def from_ptr(ptr): @staticmethod def getDefaultInstance(): - if Context.defaultInstance == None: + if Context.defaultInstance is None: Context.defaultInstance = Context() return Context.defaultInstance @@ -33,12 +33,12 @@ def getDefaultInstance(): class IslObject: def __init__(self, string="", ctx=None, ptr=None): self.initialize_isl_methods() - if ptr != None: + if ptr is not None: self.ptr = ptr self.ctx = self.get_isl_method("get_ctx")(self) return - if ctx == None: + if ctx is None: ctx = Context.getDefaultInstance() self.ctx = ctx @@ -236,7 +236,7 @@ class Printer: FORMAT_EXT_POLYLIB = 6 def __init__(self, ctx=None): - if ctx == None: + if ctx is None: ctx = Context.getDefaultInstance() self.ctx = ctx From ca92bdfa3ef8f9a1cc97167fc96601f8bd7b436b Mon Sep 17 00:00:00 2001 From: Eisuke Kawashima Date: Mon, 13 Jan 2025 21:03:04 +0900 Subject: [PATCH 254/408] [cross-project-tests] Use "is" instead of "==" to check for None (#94016) From PEP8 (https://peps.python.org/pep-0008/#programming-recommendations): > Comparisons to singletons like None should always be done with is or is not, never the equality operators. --- .../debuginfo-tests/dexter/dex/command/ParseCommand.py | 2 +- .../dex/debugger/DebuggerControllers/ConditionalController.py | 4 ++-- .../dex/debugger/DebuggerControllers/ControllerHelpers.py | 2 +- .../debuginfo-tests/dexter/dex/debugger/Debuggers.py | 2 +- .../dexter/dex/debugger/visualstudio/VisualStudio.py | 2 +- .../debuginfo-tests/dexter/dex/tools/test/Tool.py | 2 +- cross-project-tests/lit.cfg.py | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/command/ParseCommand.py b/cross-project-tests/debuginfo-tests/dexter/dex/command/ParseCommand.py index 29d7867e80867..4b086e14d4050 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/command/ParseCommand.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/command/ParseCommand.py @@ -98,7 +98,7 @@ def _build_command( def label_to_line(label_name: str) -> int: line = labels.get(label_name, None) - if line != None: + if line is not None: return line raise format_unresolved_label_err(label_name, raw_text, path.base, lineno) diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerControllers/ConditionalController.py b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerControllers/ConditionalController.py index a7d6b570b55e8..ac3054c3a0edf 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerControllers/ConditionalController.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerControllers/ConditionalController.py @@ -62,7 +62,7 @@ def __init__( self.finish_on_remove = finish_on_remove def has_conditions(self): - return self.expression != None + return self.expression is not None def get_conditional_expression_list(self): conditional_list = [] @@ -76,7 +76,7 @@ def 
add_hit(self): self.current_hit_count += 1 def should_be_removed(self): - if self.max_hit_count == None: + if self.max_hit_count is None: return False return self.current_hit_count >= self.max_hit_count diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerControllers/ControllerHelpers.py b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerControllers/ControllerHelpers.py index 3e5a7b919d703..a4ca5ae0158e9 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerControllers/ControllerHelpers.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerControllers/ControllerHelpers.py @@ -39,7 +39,7 @@ def update_step_watches(step_info, watches, commands): for watch in towatch: loc = step_info.current_location if ( - loc.path != None + loc.path is not None and os.path.exists(loc.path) and os.path.samefile(watch.path, loc.path) and have_hit_line(watch, loc) diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/Debuggers.py b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/Debuggers.py index 1b0d4d5871cbe..67b715af78698 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/Debuggers.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/Debuggers.py @@ -183,7 +183,7 @@ def handle_debugger_tool_options(context, defaults): # noqa if options.debugger == "lldb": _warn_meaningless_option(context, "--show-debugger") - if options.source_root_dir != None: + if options.source_root_dir is not None: if not os.path.isabs(options.source_root_dir): raise ToolArgumentError( f'--source-root-dir: expected absolute path, got "{options.source_root_dir}"' diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py index a6752274efac2..a7f12cde1f047 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py @@ -256,7 +256,7 @@ def delete_breakpoints(self, ids): for bp in self._debugger.Breakpoints: # We're looking at the user-set breakpoints so there should be no # Parent. - assert bp.Parent == None + assert bp.Parent is None this_vsbp = VSBreakpoint( PurePath(bp.File), bp.FileLine, bp.FileColumn, bp.Condition ) diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py b/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py index f07641041254b..c366062cec7a9 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py @@ -150,7 +150,7 @@ def _get_results_path(self, test_name): """Returns the path to the test results directory for the test denoted by test_name. """ - assert self.context.options.results_directory != None + assert self.context.options.results_directory is not None return os.path.join( self.context.options.results_directory, self._get_results_basename(test_name), diff --git a/cross-project-tests/lit.cfg.py b/cross-project-tests/lit.cfg.py index 9935fe6a199da..c2a8bcef26cbf 100644 --- a/cross-project-tests/lit.cfg.py +++ b/cross-project-tests/lit.cfg.py @@ -51,7 +51,7 @@ def get_required_attr(config, attr_name): attr_value = getattr(config, attr_name, None) - if attr_value == None: + if attr_value is None: lit_config.fatal( "No attribute %r in test configuration! 
You may need to run " "tests from your build directory or add this attribute " From 5609724c2e5b59130c6fcfc128120777282c8d9a Mon Sep 17 00:00:00 2001 From: Eisuke Kawashima Date: Mon, 13 Jan 2025 21:05:10 +0900 Subject: [PATCH 255/408] [Polly] Fix invalid escape sequences (#94037) These generate a SyntaxWarning since Python 3.12. --- polly/test/update_check.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/polly/test/update_check.py b/polly/test/update_check.py index 88d95c247c063..a973c72ff4e78 100644 --- a/polly/test/update_check.py +++ b/polly/test/update_check.py @@ -222,7 +222,12 @@ def classyfier2(lines): line = i.__next__() -replrepl = {"{{": "{{[{][{]}}", "}}": "{{[}][}]}}", "[[": "{{\[\[}}", "]]": "{{\]\]}}"} +replrepl = { + "{{": "{{[{][{]}}", + "}}": "{{[}][}]}}", + "[[": r"{{\[\[}}", + "]]": r"{{\]\]}}", +} replre = re.compile("|".join(re.escape(k) for k in replrepl.keys())) @@ -452,7 +457,7 @@ def main(): checkre = re.compile( r"^\s*\;\s*(" + "|".join([re.escape(s) for s in checkprefixes]) - + ")(\-NEXT|\-DAG|\-NOT|\-LABEL|\-SAME)?\s*\:" + + r")(\-NEXT|\-DAG|\-NOT|\-LABEL|\-SAME)?\s*\:" ) firstcheckline = None firstnoncommentline = None From 3efe83291f07dcf2423065e63b826407d1ec2609 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Mon, 13 Jan 2025 09:20:50 +0000 Subject: [PATCH 256/408] [AArch64] Fix chain for calls from agnostic-ZA functions. The lowering code was using the wrong chain value, which meant that the 'smstart' after the call from streaming agnostic-ZA functions -> non-streaming private-ZA functions was incorrectly removed from the DAG. --- .../Target/AArch64/AArch64ISelLowering.cpp | 2 +- llvm/test/CodeGen/AArch64/sme-agnostic-za.ll | 118 ++++++++++++++++++ 2 files changed, 119 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d9877fef1437c..278dd95cd969d 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -9664,7 +9664,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, DAG.getConstant(0, DL, MVT::i64)); TPIDR2.Uses++; } else if (RequiresSaveAllZA) { - Result = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Chain, + Result = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Result, /*IsSave=*/false); } diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll index 97522b9a319c0..1f68815411097 100644 --- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll +++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll @@ -82,3 +82,121 @@ define i64 @shared_caller_agnostic_callee(i64 %v) nounwind "aarch64_inout_za" "a %res = call i64 @agnostic_decl(i64 %v) ret i64 %res } + +; agnostic-ZA + streaming -> private-ZA + non-streaming +define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nounwind "aarch64_za_state_agnostic" "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: streaming_agnostic_caller_nonstreaming_private_za_callee: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-112]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov x9, x0 +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: bl __arm_get_current_vg +; CHECK-NEXT: str x0, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: mov x0, x9 +; CHECK-NEXT: add x29, sp, #64 +; CHECK-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: bl __arm_sme_state_size +; CHECK-NEXT: sub sp, sp, x0 +; CHECK-NEXT: mov x20, sp +; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: bl __arm_sme_save +; CHECK-NEXT: smstop sm +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: bl private_za_decl +; CHECK-NEXT: mov x1, x0 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: bl __arm_sme_restore +; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: bl __arm_sme_save +; CHECK-NEXT: smstop sm +; CHECK-NEXT: mov x0, x1 +; CHECK-NEXT: bl private_za_decl +; CHECK-NEXT: mov x1, x0 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: bl __arm_sme_restore +; CHECK-NEXT: mov x0, x1 +; CHECK-NEXT: sub sp, x29, #64 +; CHECK-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #112 // 16-byte Folded Reload +; CHECK-NEXT: ret + %res = call i64 @private_za_decl(i64 %v) + %res2 = call i64 @private_za_decl(i64 %res) + ret i64 %res2 +} + +; agnostic-ZA + streaming-compatible -> private-ZA + non-streaming +define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nounwind "aarch64_za_state_agnostic" "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: streaming_compatible_agnostic_caller_nonstreaming_private_za_callee: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-112]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov x9, x0 +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: bl __arm_get_current_vg +; CHECK-NEXT: str x0, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: mov x0, x9 +; CHECK-NEXT: add x29, sp, #64 +; CHECK-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: bl __arm_sme_state_size +; CHECK-NEXT: sub sp, sp, x0 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __arm_sme_save +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x20, x0, #0x1 +; CHECK-NEXT: tbz w20, #0, .LBB5_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB5_2: +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: bl private_za_decl +; CHECK-NEXT: mov x2, x0 +; CHECK-NEXT: tbz w20, #0, .LBB5_4 +; CHECK-NEXT: // %bb.3: +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB5_4: +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __arm_sme_restore +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __arm_sme_save +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x20, x0, #0x1 +; CHECK-NEXT: tbz w20, #0, .LBB5_6 +; CHECK-NEXT: // %bb.5: +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB5_6: +; CHECK-NEXT: mov x0, x2 +; CHECK-NEXT: bl private_za_decl +; CHECK-NEXT: mov x1, x0 +; CHECK-NEXT: tbz w20, #0, .LBB5_8 +; CHECK-NEXT: // %bb.7: +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB5_8: +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __arm_sme_restore +; CHECK-NEXT: mov x0, x1 +; CHECK-NEXT: sub sp, x29, #64 +; CHECK-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #112 // 16-byte Folded Reload +; CHECK-NEXT: ret + %res = call i64 @private_za_decl(i64 %v) + %res2 = call i64 @private_za_decl(i64 %res) + ret i64 %res2 +} From 5eb9acff2845f08741e70be4a5fc1d6afc644375 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Mon, 13 Jan 2025 13:11:39 +0100 Subject: [PATCH 257/408] [Polly] Revert changes to isl Python code This partially reverts b605dab7a8352158ee0d399b8c3433f9a8b495a3, dropping the changes to isl. This is an external library, so we shouldn't modify it unless strictly necessary. 
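As an aside on the rule these changes revolve around: the "is None" cleanups earlier in this series (#94021, #94016) and the revert above all turn on the PEP 8 rationale quoted in those patches — `==` dispatches to a user-defined `__eq__`, while `is` tests object identity. A minimal, self-contained Python sketch of the difference (the class name is hypothetical, invented only for this illustration):

    class AlwaysEqual:
        """Hypothetical class whose __eq__ claims equality with anything."""

        def __eq__(self, other):
            return True

    x = AlwaysEqual()
    print(x == None)  # True  -- the overridden __eq__ intercepts the test
    print(x is None)  # False -- identity comparison cannot be overridden

Because `is` cannot be fooled by operator overloading, comparisons against singletons such as `None` are only reliable with `is`/`is not`.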
--- polly/lib/External/isl/interface/python.cc | 2 +- polly/lib/External/isl/libisl-gdb.py | 4 ++-- polly/lib/External/isl/python/isl.py.top | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/polly/lib/External/isl/interface/python.cc b/polly/lib/External/isl/interface/python.cc index b60bf315ca703..e4a8288631297 100644 --- a/polly/lib/External/isl/interface/python.cc +++ b/polly/lib/External/isl/interface/python.cc @@ -347,7 +347,7 @@ static void print_persistent_callback_failure_check(int indent, printf(fmt, 0); printf(", '%s') and ", callback_name.c_str()); printf(fmt, 0); - printf(".%s['exc_info'] is not None:\n", callback_name.c_str()); + printf(".%s['exc_info'] != None:\n", callback_name.c_str()); print_indent(indent, " exc_info = "); printf(fmt, 0); printf(".%s['exc_info'][0]\n", callback_name.c_str()); diff --git a/polly/lib/External/isl/libisl-gdb.py b/polly/lib/External/isl/libisl-gdb.py index bdd3949cf89c0..bf01bc583d15d 100644 --- a/polly/lib/External/isl/libisl-gdb.py +++ b/polly/lib/External/isl/libisl-gdb.py @@ -70,7 +70,7 @@ def invoke(self, arg, from_tty): arg = gdb.parse_and_eval(arg) printer = str_lookup_function(arg) - if printer is None: + if printer == None: print("No isl printer for this type") return @@ -90,7 +90,7 @@ def str_lookup_function(val): lookup_tag = val.type.target() regex = re.compile("^isl_(.*)$") - if lookup_tag is None: + if lookup_tag == None: return None m = regex.match(str(lookup_tag)) diff --git a/polly/lib/External/isl/python/isl.py.top b/polly/lib/External/isl/python/isl.py.top index 9dc47a1a83251..d041315d4e11d 100644 --- a/polly/lib/External/isl/python/isl.py.top +++ b/polly/lib/External/isl/python/isl.py.top @@ -3,7 +3,7 @@ from ctypes import * from ctypes.util import find_library isl_dyld_library_path = os.environ.get('ISL_DYLD_LIBRARY_PATH') -if isl_dyld_library_path is not None: +if isl_dyld_library_path != None: os.environ['DYLD_LIBRARY_PATH'] = isl_dyld_library_path try: isl = cdll.LoadLibrary(isl_dlname) @@ -29,7 +29,7 @@ class Context: @staticmethod def getDefaultInstance(): - if Context.defaultInstance is None: + if Context.defaultInstance == None: Context.defaultInstance = Context() return Context.defaultInstance From a1a3e019d7adbacaa848bee12020e4d9a8401c02 Mon Sep 17 00:00:00 2001 From: Eisuke Kawashima Date: Mon, 13 Jan 2025 21:15:22 +0900 Subject: [PATCH 258/408] [cross-project-tests] Fix invalid escape sequences (#94031) These generate a SyntaxWarning with Python 3.12. --- .../debuginfo-tests/dexter/dex/command/ParseCommand.py | 4 ++-- cross-project-tests/lit.cfg.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/command/ParseCommand.py b/cross-project-tests/debuginfo-tests/dexter/dex/command/ParseCommand.py index 4b086e14d4050..4496fdf3cb0e8 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/command/ParseCommand.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/command/ParseCommand.py @@ -128,7 +128,7 @@ def get_address_object(address_name: str, offset: int = 0): def _search_line_for_cmd_start(line: str, start: int, valid_commands: dict) -> int: - """Scan `line` for a string matching any key in `valid_commands`. + r"""Scan `line` for a string matching any key in `valid_commands`. Start searching from `start`. Commands escaped with `\` (E.g. `\DexLabel('a')`) are ignored. 
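The two escape-sequence fixes in this series (#94037 and #94031) hinge on CPython's handling of unrecognized string escapes; a small self-contained Python sketch of the behavior — the compiled source strings are hypothetical, written only for this demonstration:

    import warnings

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        compile('bad = "\\d"', "<demo>", "exec")  # compiled source: bad = "\d"

    # SyntaxWarning on Python 3.12+; older versions emit a DeprecationWarning.
    print(caught[0].category.__name__)

    compile('good = r"\\d"', "<demo>", "exec")    # raw string: no warning

Prefixing the literal with `r`, as these patches do, makes the backslash literal and silences the warning without changing the regular expression's meaning.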
@@ -543,7 +543,7 @@ def test_parse_share_line(self): def test_parse_escaped(self): """Escaped commands are ignored.""" - lines = ['words \MockCmd("IGNORED") words words words\n'] + lines = ['words \\MockCmd("IGNORED") words words words\n'] values = self._find_all_mock_values_in_lines(lines) diff --git a/cross-project-tests/lit.cfg.py b/cross-project-tests/lit.cfg.py index c2a8bcef26cbf..66fdd63632885 100644 --- a/cross-project-tests/lit.cfg.py +++ b/cross-project-tests/lit.cfg.py @@ -223,7 +223,7 @@ def can_target_host(): xcode_lldb_vers = subprocess.check_output(["xcrun", "lldb", "--version"]).decode( "utf-8" ) - match = re.search("lldb-(\d+)", xcode_lldb_vers) + match = re.search(r"lldb-(\d+)", xcode_lldb_vers) if match: apple_lldb_vers = int(match.group(1)) if apple_lldb_vers < 1000: @@ -247,7 +247,7 @@ def get_gdb_version_string(): if len(gdb_vers_lines) < 1: print("Unkown GDB version format (too few lines)", file=sys.stderr) return None - match = re.search("GNU gdb \(.*?\) ((\d|\.)+)", gdb_vers_lines[0].strip()) + match = re.search(r"GNU gdb \(.*?\) ((\d|\.)+)", gdb_vers_lines[0].strip()) if match is None: print(f"Unkown GDB version format: {gdb_vers_lines[0]}", file=sys.stderr) return None @@ -261,7 +261,7 @@ def get_clang_default_dwarf_version_string(triple): # Get the flags passed by the driver and look for -dwarf-version. cmd = f'{llvm_config.use_llvm_tool("clang")} -g -xc -c - -v -### --target={triple}' stderr = subprocess.run(cmd.split(), stderr=subprocess.PIPE).stderr.decode() - match = re.search("-dwarf-version=(\d+)", stderr) + match = re.search(r"-dwarf-version=(\d+)", stderr) if match is None: print("Cannot determine default dwarf version", file=sys.stderr) return None From 73b0e8a191f89fd785a052795c1c855d4119119e Mon Sep 17 00:00:00 2001 From: Akshat Oke Date: Mon, 13 Jan 2025 17:52:30 +0530 Subject: [PATCH 259/408] [AMDGPU][NewPM] Port AMDGPUOpenCLEnqueuedBlockLowering to NPM (#122434) --- llvm/lib/Target/AMDGPU/AMDGPU.h | 6 +-- .../AMDGPUOpenCLEnqueuedBlockLowering.cpp | 37 ++++++++++++++----- .../AMDGPUOpenCLEnqueuedBlockLowering.h | 23 ++++++++++++ llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 1 + .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 7 ++-- llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll | 1 + 6 files changed, 60 insertions(+), 15 deletions(-) create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.h diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 78667e628ec1e..400c5f219cc70 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -444,9 +444,9 @@ void initializeAMDGPUExternalAAWrapperPass(PassRegistry&); void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &); -ModulePass *createAMDGPUOpenCLEnqueuedBlockLoweringPass(); -void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &); -extern char &AMDGPUOpenCLEnqueuedBlockLoweringID; +ModulePass *createAMDGPUOpenCLEnqueuedBlockLoweringLegacyPass(); +void initializeAMDGPUOpenCLEnqueuedBlockLoweringLegacyPass(PassRegistry &); +extern char &AMDGPUOpenCLEnqueuedBlockLoweringLegacyID; void initializeGCNNSAReassignPass(PassRegistry &); extern char &GCNNSAReassignID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp index 4f5ca08b46c13..fbd15ad176e3b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp @@ -31,6 +31,7 @@ // 
//===----------------------------------------------------------------------===// +#include "AMDGPUOpenCLEnqueuedBlockLowering.h" #include "AMDGPU.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SmallString.h" @@ -48,11 +49,16 @@ using namespace llvm; namespace { /// Lower enqueued blocks. -class AMDGPUOpenCLEnqueuedBlockLowering : public ModulePass { +class AMDGPUOpenCLEnqueuedBlockLowering { +public: + bool run(Module &M); +}; + +class AMDGPUOpenCLEnqueuedBlockLoweringLegacy : public ModulePass { public: static char ID; - explicit AMDGPUOpenCLEnqueuedBlockLowering() : ModulePass(ID) {} + explicit AMDGPUOpenCLEnqueuedBlockLoweringLegacy() : ModulePass(ID) {} private: bool runOnModule(Module &M) override; @@ -60,19 +66,32 @@ class AMDGPUOpenCLEnqueuedBlockLowering : public ModulePass { } // end anonymous namespace -char AMDGPUOpenCLEnqueuedBlockLowering::ID = 0; +char AMDGPUOpenCLEnqueuedBlockLoweringLegacy::ID = 0; -char &llvm::AMDGPUOpenCLEnqueuedBlockLoweringID = - AMDGPUOpenCLEnqueuedBlockLowering::ID; +char &llvm::AMDGPUOpenCLEnqueuedBlockLoweringLegacyID = + AMDGPUOpenCLEnqueuedBlockLoweringLegacy::ID; -INITIALIZE_PASS(AMDGPUOpenCLEnqueuedBlockLowering, DEBUG_TYPE, +INITIALIZE_PASS(AMDGPUOpenCLEnqueuedBlockLoweringLegacy, DEBUG_TYPE, "Lower OpenCL enqueued blocks", false, false) -ModulePass* llvm::createAMDGPUOpenCLEnqueuedBlockLoweringPass() { - return new AMDGPUOpenCLEnqueuedBlockLowering(); +ModulePass *llvm::createAMDGPUOpenCLEnqueuedBlockLoweringLegacyPass() { + return new AMDGPUOpenCLEnqueuedBlockLoweringLegacy(); +} + +bool AMDGPUOpenCLEnqueuedBlockLoweringLegacy::runOnModule(Module &M) { + AMDGPUOpenCLEnqueuedBlockLowering Impl; + return Impl.run(M); +} + +PreservedAnalyses +AMDGPUOpenCLEnqueuedBlockLoweringPass::run(Module &M, ModuleAnalysisManager &) { + AMDGPUOpenCLEnqueuedBlockLowering Impl; + if (Impl.run(M)) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); } -bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) { +bool AMDGPUOpenCLEnqueuedBlockLowering::run(Module &M) { DenseSet Callers; auto &C = M.getContext(); bool Changed = false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.h new file mode 100644 index 0000000000000..16ed7c18d8523 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.h @@ -0,0 +1,23 @@ +//===- AMDGPUOpenCLEnqueuedBlockLowering.h -----------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_OPENCLENQUEUEDBLOCKLOWERING_H +#define LLVM_LIB_TARGET_AMDGPU_OPENCLENQUEUEDBLOCKLOWERING_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { +class AMDGPUOpenCLEnqueuedBlockLoweringPass + : public PassInfoMixin { +public: + AMDGPUOpenCLEnqueuedBlockLoweringPass() = default; + PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM); +}; +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_OPENCLENQUEUEDBLOCKLOWERING_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index da594be992cb4..6f322074ba74c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -21,6 +21,7 @@ MODULE_PASS("amdgpu-lower-buffer-fat-pointers", AMDGPULowerBufferFatPointersPass(*this)) MODULE_PASS("amdgpu-lower-ctor-dtor", AMDGPUCtorDtorLoweringPass()) MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this)) +MODULE_PASS("amdgpu-lower-enqueued-block", AMDGPUOpenCLEnqueuedBlockLoweringPass()) MODULE_PASS("amdgpu-lower-module-lds", AMDGPULowerModuleLDSPass(*this)) MODULE_PASS("amdgpu-perf-hint", AMDGPUPerfHintAnalysisPass( diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index f8b60630bb7f6..6d4547dbc82c3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -22,6 +22,7 @@ #include "AMDGPUIGroupLP.h" #include "AMDGPUISelDAGToDAG.h" #include "AMDGPUMacroFusion.h" +#include "AMDGPUOpenCLEnqueuedBlockLowering.h" #include "AMDGPUPerfHintAnalysis.h" #include "AMDGPURemoveIncompatibleFunctions.h" #include "AMDGPUSplitModule.h" @@ -501,7 +502,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPULowerKernelArgumentsPass(*PR); initializeAMDGPUPromoteKernelArgumentsPass(*PR); initializeAMDGPULowerKernelAttributesPass(*PR); - initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR); + initializeAMDGPUOpenCLEnqueuedBlockLoweringLegacyPass(*PR); initializeAMDGPUPostLegalizerCombinerPass(*PR); initializeAMDGPUPreLegalizerCombinerPass(*PR); initializeAMDGPURegBankCombinerPass(*PR); @@ -1175,7 +1176,7 @@ void AMDGPUPassConfig::addIRPasses() { addPass(createR600OpenCLImageTypeLoweringPass()); // Replace OpenCL enqueued block function pointers with global variables. - addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass()); + addPass(createAMDGPUOpenCLEnqueuedBlockLoweringLegacyPass()); // Lower LDS accesses to global memory pass if address sanitizer is enabled. 
if (EnableSwLowerLDS) @@ -1944,7 +1945,7 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const { addPass(AMDGPUAlwaysInlinePass()); addPass(AlwaysInlinerPass()); - // TODO: Missing OpenCLEnqueuedBlockLowering + addPass(AMDGPUOpenCLEnqueuedBlockLoweringPass()); // Runs before PromoteAlloca so the latter can account for function uses if (EnableLowerModuleLDS) diff --git a/llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll b/llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll index 9391b50c04a5f..d7c8e47f98883 100644 --- a/llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll +++ b/llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --include-generated-funcs ; RUN: opt -data-layout=A5 -amdgpu-lower-enqueued-block -S < %s | FileCheck %s +; RUN: opt -data-layout=A5 -mtriple=amdgcn -passes=amdgpu-lower-enqueued-block -S < %s | FileCheck %s %struct.ndrange_t = type { i32 } %opencl.queue_t = type opaque From 82b9eb1086d45caf74ff3d5dfa519631c247eb14 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Mon, 13 Jan 2025 12:31:29 +0000 Subject: [PATCH 260/408] [Flang][OpenMP] Support teams reductions lowering (#122683) This patch adds PFT to MLIR lowering of teams reductions. Since there is still no MLIR to LLVM IR translation implemented, compilation of programs including these constructs will still trigger not-yet-implemented errors. --- flang/lib/Lower/OpenMP/OpenMP.cpp | 33 ++++++++++++++----- .../Lower/OpenMP/Todo/reduction-teams.f90 | 12 ------- flang/test/Lower/OpenMP/reduction-teams.f90 | 18 ++++++++++ 3 files changed, 42 insertions(+), 21 deletions(-) delete mode 100644 flang/test/Lower/OpenMP/Todo/reduction-teams.f90 create mode 100644 flang/test/Lower/OpenMP/reduction-teams.f90 diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index c71fd598d5c8a..8a1029426d30c 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -1336,19 +1336,18 @@ static void genWorkshareClauses(lower::AbstractConverter &converter, cp.processNowait(clauseOps); } -static void genTeamsClauses(lower::AbstractConverter &converter, - semantics::SemanticsContext &semaCtx, - lower::StatementContext &stmtCtx, - const List &clauses, mlir::Location loc, - mlir::omp::TeamsOperands &clauseOps) { +static void genTeamsClauses( + lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx, + lower::StatementContext &stmtCtx, const List &clauses, + mlir::Location loc, mlir::omp::TeamsOperands &clauseOps, + llvm::SmallVectorImpl &reductionSyms) { ClauseProcessor cp(converter, semaCtx, clauses); cp.processAllocate(clauseOps); cp.processIf(llvm::omp::Directive::OMPD_teams, clauseOps); cp.processNumTeams(stmtCtx, clauseOps); cp.processThreadLimit(stmtCtx, clauseOps); + cp.processReduction(loc, clauseOps, reductionSyms); // TODO Support delayed privatization. 
- - cp.processTODO(loc, llvm::omp::Directive::OMPD_teams); } static void genWsloopClauses( @@ -2015,13 +2014,29 @@ genTeamsOp(lower::AbstractConverter &converter, lower::SymMap &symTable, mlir::Location loc, const ConstructQueue &queue, ConstructQueue::const_iterator item) { lower::StatementContext stmtCtx; + mlir::omp::TeamsOperands clauseOps; - genTeamsClauses(converter, semaCtx, stmtCtx, item->clauses, loc, clauseOps); + llvm::SmallVector reductionSyms; + genTeamsClauses(converter, semaCtx, stmtCtx, item->clauses, loc, clauseOps, + reductionSyms); + + EntryBlockArgs args; + // TODO: Add private syms and vars. + args.reduction.syms = reductionSyms; + args.reduction.vars = clauseOps.reductionVars; + + auto genRegionEntryCB = [&](mlir::Operation *op) { + genEntryBlock(converter.getFirOpBuilder(), args, op->getRegion(0)); + bindEntryBlockArgs( + converter, llvm::cast(op), args); + return llvm::to_vector(args.getSyms()); + }; return genOpWithBody( OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval, llvm::omp::Directive::OMPD_teams) - .setClauses(&item->clauses), + .setClauses(&item->clauses) + .setGenRegionEntryCb(genRegionEntryCB), queue, item, clauseOps); } diff --git a/flang/test/Lower/OpenMP/Todo/reduction-teams.f90 b/flang/test/Lower/OpenMP/Todo/reduction-teams.f90 deleted file mode 100644 index db4839593c7e7..0000000000000 --- a/flang/test/Lower/OpenMP/Todo/reduction-teams.f90 +++ /dev/null @@ -1,12 +0,0 @@ -! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s -! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s - -! CHECK: not yet implemented: Unhandled clause REDUCTION in TEAMS construct -subroutine reduction_teams() - integer :: i - i = 0 - - !$omp teams reduction(+:i) - i = i + 1 - !$omp end teams -end subroutine reduction_teams diff --git a/flang/test/Lower/OpenMP/reduction-teams.f90 b/flang/test/Lower/OpenMP/reduction-teams.f90 new file mode 100644 index 0000000000000..6997e774c2d42 --- /dev/null +++ b/flang/test/Lower/OpenMP/reduction-teams.f90 @@ -0,0 +1,18 @@ +! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s +! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s + +! CHECK: omp.declare_reduction @[[RED:.*]] : i32 init { + +! CHECK: func.func @_QPreduction_teams() { +subroutine reduction_teams() + integer :: i + i = 0 + + ! CHECK: omp.teams reduction(@[[RED]] %{{.*}}#0 -> %[[PRIV_I:.*]] : !fir.ref) { + !$omp teams reduction(+:i) + ! CHECK: %[[DECL_I:.*]]:2 = hlfir.declare %[[PRIV_I]] + ! CHECK: %{{.*}} = fir.load %[[DECL_I]]#0 : !fir.ref + ! 
CHECK: hlfir.assign %{{.*}} to %[[DECL_I]]#0 : i32, !fir.ref + i = i + 1 + !$omp end teams +end subroutine reduction_teams From e9a55770dcee48a3c28b71720db383762049a778 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 13 Jan 2025 19:35:56 +0700 Subject: [PATCH 261/408] AMDGPU: Add gfx9 run line to scalar_to_vector test (#122659) --- llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll | 104 +++++++++++++++++-- 1 file changed, 93 insertions(+), 11 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll index e8f86a6ce63ff..949e6f38e9b42 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=SI -; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX89,VI +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs -| FileCheck %s --check-prefixes=GFX89,GFX9 ; XXX - Why the packing? define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { @@ -43,6 +44,27 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr add ; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm +; +; GFX9-LABEL: scalar_to_vector_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff0000 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_endpgm %tmp1 = load i32, ptr addrspace(1) %in, align 4 %bc = bitcast i32 %tmp1 to <2 x i16> %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> @@ -90,6 +112,27 @@ define amdgpu_kernel void @scalar_to_vector_v2f32(ptr addrspace(1) %out, ptr add ; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm +; +; GFX9-LABEL: scalar_to_vector_v2f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff0000 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_endpgm %tmp1 = load float, ptr addrspace(1) %in, align 4 %bc = bitcast float %tmp1 to <2 x i16> %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> @@ -130,6 +173,23 @@ define amdgpu_kernel void @scalar_to_vector_v4i16() { ; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX9-LABEL: scalar_to_vector_v4i16: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_lshl_b32 s1, s0, 8 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s0, 0xffff +; GFX9-NEXT: s_lshl_b32 s0, s0, 16 +; GFX9-NEXT: s_or_b32 s0, s1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm bb: %tmp = load <2 x i8>, ptr addrspace(1) undef, align 1 %tmp1 = shufflevector <2 x i8> %tmp, <2 x i8> zeroinitializer, <8 x i32> @@ -176,6 +236,28 @@ define amdgpu_kernel void @scalar_to_vector_v4f16() { ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX9-LABEL: scalar_to_vector_v4f16: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_lshl_b32 s1, s0, 8 +; GFX9-NEXT: s_or_b32 s0, s1, s0 +; GFX9-NEXT: s_and_b32 s1, s0, 0xff00 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX9-NEXT: s_or_b32 s1, s4, s1 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_and_b32 s4, s1, 0xffff +; GFX9-NEXT: s_lshl_b32 s1, s1, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s1 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm bb: %load = load half, ptr addrspace(1) undef, align 1 %tmp = bitcast half %load to <2 x i8> @@ -235,16 +317,16 @@ define amdgpu_kernel void @scalar_to_vector_test6(ptr addrspace(1) %out, i8 zero ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: scalar_to_vector_test6: -; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[4:5], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; VI-NEXT: s_endpgm +; GFX89-LABEL: scalar_to_vector_test6: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX89-NEXT: s_mov_b32 s3, 0xf000 +; GFX89-NEXT: s_mov_b32 s2, -1 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: v_mov_b32_e32 v0, s6 +; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX89-NEXT: s_endpgm %newvec0 = insertelement <4 x i8> undef, i8 %val, i32 0 %bc = bitcast <4 x i8> %newvec0 to <2 x half> store <2 x half> %bc, ptr addrspace(1) %out From f4598194b5d65f6bafa987a2e55c46d03c5d0052 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 13 Jan 2025 19:38:58 +0700 Subject: [PATCH 262/408] DAG: Fold bitcast of scalar_to_vector to anyext (#122660) scalar_to_vector is difficult to make appear and test, but I found one case where this makes an observable difference. 
It fires more often than this in the test suite, but most of them have no net result in the final code. This helps reduce regressions in a future commit. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 8 +++++ llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll | 30 +++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index da3c834417d6b..02b79c67af3ee 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -16012,6 +16012,14 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { if (SDValue CombineLD = CombineConsecutiveLoads(N0.getNode(), VT)) return CombineLD; + // int_vt (bitcast (vec_vt (scalar_to_vector elt_vt:x))) + // => int_vt (any_extend elt_vt:x) + if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isScalarInteger()) { + SDValue SrcScalar = N0.getOperand(0); + if (SrcScalar.getValueType().isScalarInteger()) + return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, SrcScalar); + } + // Remove double bitcasts from shuffles - this is often a legacy of // XformToShuffleWithZero being used to combine bitmaskings (of // float vectors bitcast to integer vectors) into shuffles. diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll index 949e6f38e9b42..e14666cdac5c2 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -332,3 +332,33 @@ define amdgpu_kernel void @scalar_to_vector_test6(ptr addrspace(1) %out, i8 zero store <2 x half> %bc, ptr addrspace(1) %out ret void } + +; bitcast (scalar_to_vector x) -> any_extend x +define i64 @bitcast_combine_scalar_to_vector_v4i16(i16 %arg) { +; SI-LABEL: bitcast_combine_scalar_to_vector_v4i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xff00, v0 +; SI-NEXT: v_bfe_u32 v0, v0, 8, 8 +; SI-NEXT: v_or_b32_e32 v2, v0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: bitcast_combine_scalar_to_vector_v4i16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_and_b32_e32 v1, 0xffffff00, v0 +; GFX89-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX89-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX89-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX89-NEXT: s_setpc_b64 s[30:31] + %arg.cast = bitcast i16 %arg to <2 x i8> + %tmp1 = shufflevector <2 x i8> %arg.cast, <2 x i8> poison, <8 x i32> + %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> poison, <8 x i32> + %cast = bitcast <8 x i8> %tmp2 to i64 + ret i64 %cast +} From 41a94de75caacb979070ec7a010dfe3c4e9f116f Mon Sep 17 00:00:00 2001 From: Maksim Ivanov Date: Mon, 13 Jan 2025 13:42:22 +0100 Subject: [PATCH 263/408] [clang] Refactor attr diagnostics to use %select (#122473) A cleanup follow-up to #118501 and #118567. 
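For readers unfamiliar with the mechanism this cleanup switches to: a `%select{a|b|...}N` directive in a diagnostic's message text substitutes the N-th `|`-separated alternative, selected by the integer argument streamed into the diagnostic at position N. A rough Python model of that substitution — the helper and the abridged list are illustrative only, not Clang's implementation:

    # Illustrative, abridged list; in Clang the index comes from the
    # AttributeDeclKind enumerator streamed into the diagnostic.
    DECL_KINDS = ["functions", "unions", "virtual functions"]

    def format_select(alternatives: list[str], index: int) -> str:
        # %select{a|b|c}N substitutes alternatives[N] into the message.
        return alternatives[index]

    print(format_select(DECL_KINDS, 2))  # -> "virtual functions"

Each enumerator added to `AttributeDeclKind` in this patch therefore has to line up positionally with the alternative appended to the `%select` list in the `.td` file.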
--- clang/examples/Attribute/Attribute.cpp | 9 +++++---- .../CallSuperAttribute/CallSuperAttrInfo.cpp | 5 +++-- clang/include/clang/Basic/DiagnosticSemaKinds.td | 9 ++++++++- clang/include/clang/Sema/ParsedAttr.h | 7 +++++++ clang/lib/Parse/ParseDecl.cpp | 5 +++-- clang/lib/Sema/SemaDeclAttr.cpp | 12 ++++++------ clang/lib/Sema/SemaSwift.cpp | 4 ++-- clang/lib/Sema/SemaType.cpp | 9 +++++---- 8 files changed, 39 insertions(+), 21 deletions(-) diff --git a/clang/examples/Attribute/Attribute.cpp b/clang/examples/Attribute/Attribute.cpp index 3b90724ad2220..625f1645afbff 100644 --- a/clang/examples/Attribute/Attribute.cpp +++ b/clang/examples/Attribute/Attribute.cpp @@ -42,8 +42,8 @@ struct ExampleAttrInfo : public ParsedAttrInfo { const Decl *D) const override { // This attribute appertains to functions only. if (!isa(D)) { - S.Diag(Attr.getLoc(), diag::warn_attribute_wrong_decl_type_str) - << Attr << Attr.isRegularKeywordAttribute() << "functions"; + S.Diag(Attr.getLoc(), diag::warn_attribute_wrong_decl_type) + << Attr << Attr.isRegularKeywordAttribute() << ExpectedFunction; return false; } return true; @@ -99,8 +99,9 @@ struct ExampleAttrInfo : public ParsedAttrInfo { const Stmt *St) const override { // This attribute appertains to for loop statements only. if (!isa(St)) { - S.Diag(Attr.getLoc(), diag::warn_attribute_wrong_decl_type_str) - << Attr << Attr.isRegularKeywordAttribute() << "for loop statements"; + S.Diag(Attr.getLoc(), diag::warn_attribute_wrong_decl_type) + << Attr << Attr.isRegularKeywordAttribute() + << ExpectedForLoopStatement; return false; } return true; diff --git a/clang/examples/CallSuperAttribute/CallSuperAttrInfo.cpp b/clang/examples/CallSuperAttribute/CallSuperAttrInfo.cpp index 12d4c311586e6..f206a84ab1311 100644 --- a/clang/examples/CallSuperAttribute/CallSuperAttrInfo.cpp +++ b/clang/examples/CallSuperAttribute/CallSuperAttrInfo.cpp @@ -168,8 +168,9 @@ struct CallSuperAttrInfo : public ParsedAttrInfo { const Decl *D) const override { const auto *TheMethod = dyn_cast_or_null(D); if (!TheMethod || !TheMethod->isVirtual()) { - S.Diag(Attr.getLoc(), diag::warn_attribute_wrong_decl_type_str) - << Attr << Attr.isRegularKeywordAttribute() << "virtual functions"; + S.Diag(Attr.getLoc(), diag::warn_attribute_wrong_decl_type) + << Attr << Attr.isRegularKeywordAttribute() + << ExpectedVirtualFunction; return false; } MarkedMethods.insert(TheMethod); diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index f04381a32a415..8be4f946dce1c 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -3799,7 +3799,14 @@ def warn_attribute_wrong_decl_type : Warning< "|types and namespaces" "|variables, functions and classes" "|kernel functions" - "|non-K&R-style functions}2">, + "|non-K&R-style functions" + "|for loop statements" + "|virtual functions" + "|parameters and implicit object parameters" + "|non-member functions" + "|functions, classes, or enumerations" + "|classes" + "|typedefs}2">, InGroup; def err_attribute_wrong_decl_type : Error; def warn_type_attribute_wrong_type : Warning< diff --git a/clang/include/clang/Sema/ParsedAttr.h b/clang/include/clang/Sema/ParsedAttr.h index 4fa5fbdb5a7f6..e1faab205f647 100644 --- a/clang/include/clang/Sema/ParsedAttr.h +++ b/clang/include/clang/Sema/ParsedAttr.h @@ -1099,6 +1099,13 @@ enum AttributeDeclKind { ExpectedFunctionVariableOrClass, ExpectedKernelFunction, ExpectedFunctionWithProtoType, + 
ExpectedForLoopStatement, + ExpectedVirtualFunction, + ExpectedParameterOrImplicitObjectParameter, + ExpectedNonMemberFunction, + ExpectedFunctionOrClassOrEnum, + ExpectedClass, + ExpectedTypedef, }; inline const StreamingDiagnostic &operator<<(const StreamingDiagnostic &DB, diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index 7f3f6d568e28c..f136d5007e8a5 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -24,6 +24,7 @@ #include "clang/Parse/RAIIObjectsForParser.h" #include "clang/Sema/EnterExpressionEvaluationContext.h" #include "clang/Sema/Lookup.h" +#include "clang/Sema/ParsedAttr.h" #include "clang/Sema/ParsedTemplate.h" #include "clang/Sema/Scope.h" #include "clang/Sema/SemaCUDA.h" @@ -3708,9 +3709,9 @@ void Parser::ParseDeclarationSpecifiers( continue; if (PA.getKind() == ParsedAttr::AT_LifetimeBound) - Diag(PA.getLoc(), diag::err_attribute_wrong_decl_type_str) + Diag(PA.getLoc(), diag::err_attribute_wrong_decl_type) << PA << PA.isRegularKeywordAttribute() - << "parameters and implicit object parameters"; + << ExpectedParameterOrImplicitObjectParameter; else Diag(PA.getLoc(), diag::err_attribute_not_type_attr) << PA << PA.isRegularKeywordAttribute(); diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index bb4d33560b93b..c1663f2d15c88 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -1868,8 +1868,8 @@ static void handleNakedAttr(Sema &S, Decl *D, const ParsedAttr &AL) { // This form is not allowed to be written on a member function (static or // nonstatic) when in Microsoft compatibility mode. if (S.getLangOpts().MSVCCompat && isa(D)) { - S.Diag(AL.getLoc(), diag::err_attribute_wrong_decl_type_str) - << AL << AL.isRegularKeywordAttribute() << "non-member functions"; + S.Diag(AL.getLoc(), diag::err_attribute_wrong_decl_type) + << AL << AL.isRegularKeywordAttribute() << ExpectedNonMemberFunction; return; } } @@ -2761,9 +2761,9 @@ static void handleWarnUnusedResult(Sema &S, Decl *D, const ParsedAttr &AL) { // The standard attribute cannot be applied to variable declarations such // as a function pointer. if (isa(D)) - S.Diag(AL.getLoc(), diag::warn_attribute_wrong_decl_type_str) + S.Diag(AL.getLoc(), diag::warn_attribute_wrong_decl_type) << AL << AL.isRegularKeywordAttribute() - << "functions, classes, or enumerations"; + << ExpectedFunctionOrClassOrEnum; // If this is spelled as the standard C++17 attribute, but not in C++17, // warn about using it as an extension. 
If there are attribute arguments, @@ -5555,8 +5555,8 @@ static void handleNullableTypeAttr(Sema &S, Decl *D, const ParsedAttr &AL) { if (auto *CRD = dyn_cast(D); !CRD || !(CRD->isClass() || CRD->isStruct())) { - S.Diag(AL.getRange().getBegin(), diag::err_attribute_wrong_decl_type_str) - << AL << AL.isRegularKeywordAttribute() << "classes"; + S.Diag(AL.getRange().getBegin(), diag::err_attribute_wrong_decl_type) + << AL << AL.isRegularKeywordAttribute() << ExpectedClass; return; } diff --git a/clang/lib/Sema/SemaSwift.cpp b/clang/lib/Sema/SemaSwift.cpp index 24fdfb8e57dc3..fe72d6c85c37a 100644 --- a/clang/lib/Sema/SemaSwift.cpp +++ b/clang/lib/Sema/SemaSwift.cpp @@ -650,8 +650,8 @@ void SemaSwift::handleNewType(Decl *D, const ParsedAttr &AL) { } if (!isa(D)) { - Diag(AL.getLoc(), diag::warn_attribute_wrong_decl_type_str) - << AL << AL.isRegularKeywordAttribute() << "typedefs"; + Diag(AL.getLoc(), diag::warn_attribute_wrong_decl_type) + << AL << AL.isRegularKeywordAttribute() << ExpectedTypedef; return; } diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index e3ec327c1b364..2ccf5a8e1d6f3 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -7983,8 +7983,9 @@ static bool handleFunctionTypeAttr(TypeProcessingState &state, ParsedAttr &attr, if (!FnTy) { // SME ACLE attributes are not supported on K&R-style unprototyped C // functions. - S.Diag(attr.getLoc(), diag::warn_attribute_wrong_decl_type) << - attr << attr.isRegularKeywordAttribute() << ExpectedFunctionWithProtoType; + S.Diag(attr.getLoc(), diag::warn_attribute_wrong_decl_type) + << attr << attr.isRegularKeywordAttribute() + << ExpectedFunctionWithProtoType; attr.setInvalid(); return false; } @@ -8676,9 +8677,9 @@ static void HandleLifetimeBoundAttr(TypeProcessingState &State, CurType, CurType); return; } - State.getSema().Diag(Attr.getLoc(), diag::err_attribute_wrong_decl_type_str) + State.getSema().Diag(Attr.getLoc(), diag::err_attribute_wrong_decl_type) << Attr << Attr.isRegularKeywordAttribute() - << "parameters and implicit object parameters"; + << ExpectedParameterOrImplicitObjectParameter; } static void HandleLifetimeCaptureByAttr(TypeProcessingState &State, From 3397950f2d21426c7520d114a12588128906a897 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Mon, 13 Jan 2025 20:58:38 +0800 Subject: [PATCH 264/408] [LV] Fix FindLastIV reduction for epilogue vectorization. (#120395) Following 0e528ac404e13ed2d952a2d83aaf8383293c851e, this patch adjusts the resume value of VPReductionPHIRecipe for FindLastIV reductions. Replacing the resume value with: ResumeValue = ResumeValue == StartValue ? SentinelValue : ResumeValue; This addressed the correctness issue when the start value might not be less than the minimum value of a monotonically increasing induction variable. Thanks Florian Hahn for the help. 
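A compact Python model of the adjustment described above — a sketch assuming the max-with-sentinel lowering that the FindLastIV tests below exercise (the i64 signed minimum as the sentinel, start value 3); the helper name is hypothetical:

    INT64_MIN = -2**63  # sentinel used by the FindLastIV reduction

    def epilogue_resume(main_result, start):
        # The main vector loop hands back `start` when it found no match.
        # Feeding `start` into the epilogue's max reduction would be wrong
        # whenever `start` is not below every possible induction value, so
        # it is first mapped back to the sentinel.
        return INT64_MIN if main_result == start else main_result

    # With start = 3 and the epilogue's only match at index 1:
    print(max(epilogue_resume(3, 3), 1))  # 1, the correct last index
    print(max(3, 1))                      # 3, the stale start value winning

This mirrors the `icmp eq` + `select` emitted in `preparePlanForEpilogueVectorLoop` and checked in the test diff below.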
--------- Co-authored-by: Florian Hahn --- .../Transforms/Vectorize/LoopVectorize.cpp | 27 +++++++++++++++++++ .../LoopVectorize/epilog-iv-select-cmp.ll | 8 ++++-- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 0a13ce902795e..ee352c0b12302 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7691,6 +7691,20 @@ static void fixReductionScalarResumeWhenVectorizingEpilog( "AnyOf expected to start by comparing main resume value to original " "start value"); MainResumeValue = Cmp->getOperand(0); + } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind( + RdxDesc.getRecurrenceKind())) { + using namespace llvm::PatternMatch; + Value *Cmp, *OrigResumeV; + bool IsExpectedPattern = + match(MainResumeValue, m_Select(m_OneUse(m_Value(Cmp)), + m_Specific(RdxDesc.getSentinelValue()), + m_Value(OrigResumeV))) && + match(Cmp, + m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV), + m_Specific(RdxDesc.getRecurrenceStartValue()))); + assert(IsExpectedPattern && "Unexpected reduction resume pattern"); + (void)IsExpectedPattern; + MainResumeValue = OrigResumeV; } PHINode *MainResumePhi = cast(MainResumeValue); @@ -10413,6 +10427,19 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, cast(ResumeV)->getParent()->getFirstNonPHI()); ResumeV = Builder.CreateICmpNE(ResumeV, RdxDesc.getRecurrenceStartValue()); + } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) { + // VPReductionPHIRecipe for FindLastIV reductions requires an adjustment + // to the resume value. The resume value is adjusted to the sentinel + // value when the final value from the main vector loop equals the start + // value. This ensures correctness when the start value might not be + // less than the minimum value of a monotonically increasing induction + // variable. 
+      IRBuilder<> Builder(
+          cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI());
+      Value *Cmp =
+          Builder.CreateICmpEQ(ResumeV, RdxDesc.getRecurrenceStartValue());
+      ResumeV =
+          Builder.CreateSelect(Cmp, RdxDesc.getSentinelValue(), ResumeV);
     }
   } else {
     // Retrieve the induction resume values for wide inductions from
diff --git a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll
index 052b4a10e9c8d..06f0f05889116 100644
--- a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll
@@ -40,7 +40,9 @@ define i64 @select_icmp_const(ptr %a, i64 %n) {
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
 ; CHECK:       [[VEC_EPILOG_PH]]:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 3, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX1:%.*]] = phi i64 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 3, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[BC_MERGE_RDX1]], 3
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = select i1 [[TMP14]], i64 -9223372036854775808, i64 [[BC_MERGE_RDX1]]
 ; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[N]], 4
 ; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0
@@ -144,7 +146,9 @@ define i64 @select_fcmp_const_fast(ptr %a, i64 %n) {
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
 ; CHECK:       [[VEC_EPILOG_PH]]:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 2, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX1:%.*]] = phi i64 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 2, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[BC_MERGE_RDX1]], 2
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = select i1 [[TMP14]], i64 -9223372036854775808, i64 [[BC_MERGE_RDX1]]
 ; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[N]], 4
 ; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0

From 21e58ee9f7de60a7e9202ad3f424ec3ad5a6fce5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Don=C3=A1t=20Nagy?=
Date: Mon, 13 Jan 2025 14:04:28 +0100
Subject: [PATCH 265/408] [NFC][analyzer][docs] Migrate 'annotations.html' to
 RST (#122246)

This commit migrates the contents of 'annotations.html' in the old
HTML-based documentation of the Clang static analyzer to the new RST-based
documentation.

During this conversion I reordered the sections of this documentation file
by placing the section "Custom Assertion Handlers" as a subsection of
"Annotations to Enhance Generic Checks". (The primary motivation was that
Sphinx complained about inconsistent section levels; with this change, the
sections describing individual annotations all remain on the same level.)
Apart from this change and the format conversion, I didn't review, validate
or edit the contents of this documentation file, because I think it would be
better to place any additional changes in separate commits.
---
 clang/docs/LanguageExtensions.rst             |   8 +-
 clang/docs/UsersManual.rst                    |   6 +-
 .../images/example_attribute_nonnull.png      | Bin
 .../images/example_cf_returns_retained.png    | Bin
 .../images/example_ns_returns_retained.png    | Bin
 clang/docs/analyzer/user-docs.rst             |   1 +
 clang/docs/analyzer/user-docs/Annotations.rst | 689 ++++++++++++++++
 clang/docs/analyzer/user-docs/FAQ.rst         |   6 +-
 clang/include/clang/Basic/AttrDocs.td         |   2 +-
 clang/www/analyzer/annotations.html           | 766 +-----------------
 10 files changed, 708 insertions(+), 770 deletions(-)
 rename clang/{www => docs}/analyzer/images/example_attribute_nonnull.png (100%)
 rename clang/{www => docs}/analyzer/images/example_cf_returns_retained.png (100%)
 rename clang/{www => docs}/analyzer/images/example_ns_returns_retained.png (100%)
 create mode 100644 clang/docs/analyzer/user-docs/Annotations.rst

diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index e020710c7aa4f..2eb0777dbdc6c 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -2137,8 +2137,8 @@ method; it specifies that the method expects its ``self`` parameter to have a
   - (void) bar __attribute__((ns_consumes_self));
   - (void) baz:(id) __attribute__((ns_consumed)) x;

-Further examples of these attributes are available in the static analyzer's `list of annotations for analysis
-`_.
+Further examples of these attributes are available in the static analyzer's
+`list of annotations for analysis `__.

 Query for these features with ``__has_attribute(ns_consumed)``,
 ``__has_attribute(ns_returns_retained)``, etc.
@@ -4792,8 +4792,8 @@ Extensions for Static Analysis

 Clang supports additional attributes that are useful for documenting program
 invariants and rules for static analysis tools, such as the `Clang Static
 Analyzer `_. These attributes are documented
-in the analyzer's `list of source-level annotations
-`_.
+in the analyzer's `list of annotations for analysis
+`__.

 Extensions for Dynamic Analysis

diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst
index 4de288250f3ad..260e84910c6f7 100644
--- a/clang/docs/UsersManual.rst
+++ b/clang/docs/UsersManual.rst
@@ -1364,10 +1364,8 @@ Controlling Static Analyzer Diagnostics
 While not strictly part of the compiler, the diagnostics from Clang's
 `static analyzer `_ can also be
 influenced by the user via changes to the source code. See the available
-`annotations `_ and the
-analyzer's `FAQ
-page `_ for more
-information.
+`annotations `_ and the analyzer's
+`FAQ page `_ for more information.
 .. _usersmanual-precompiled-headers:

diff --git a/clang/www/analyzer/images/example_attribute_nonnull.png b/clang/docs/analyzer/images/example_attribute_nonnull.png
similarity index 100%
rename from clang/www/analyzer/images/example_attribute_nonnull.png
rename to clang/docs/analyzer/images/example_attribute_nonnull.png
diff --git a/clang/www/analyzer/images/example_cf_returns_retained.png b/clang/docs/analyzer/images/example_cf_returns_retained.png
similarity index 100%
rename from clang/www/analyzer/images/example_cf_returns_retained.png
rename to clang/docs/analyzer/images/example_cf_returns_retained.png
diff --git a/clang/www/analyzer/images/example_ns_returns_retained.png b/clang/docs/analyzer/images/example_ns_returns_retained.png
similarity index 100%
rename from clang/www/analyzer/images/example_ns_returns_retained.png
rename to clang/docs/analyzer/images/example_ns_returns_retained.png
diff --git a/clang/docs/analyzer/user-docs.rst b/clang/docs/analyzer/user-docs.rst
index dd53ae143148c..e265f033a2c54 100644
--- a/clang/docs/analyzer/user-docs.rst
+++ b/clang/docs/analyzer/user-docs.rst
@@ -12,4 +12,5 @@ Contents:
    user-docs/FilingBugs
    user-docs/CrossTranslationUnit
    user-docs/TaintAnalysisConfiguration
+   user-docs/Annotations
    user-docs/FAQ
diff --git a/clang/docs/analyzer/user-docs/Annotations.rst b/clang/docs/analyzer/user-docs/Annotations.rst
new file mode 100644
index 0000000000000..d87e8f4df99c3
--- /dev/null
+++ b/clang/docs/analyzer/user-docs/Annotations.rst
@@ -0,0 +1,689 @@
+==================
+Source Annotations
+==================
+
+The Clang frontend supports several source-level annotations in the form of
+`GCC-style attributes `_
+and pragmas that can help make the Clang Static Analyzer more useful.
+These annotations can help suppress false positives as well as enhance the
+analyzer's ability to find bugs.
+
+This page gives a practical overview of such annotations. For more technical
+specifics regarding Clang-specific annotations, please see Clang's list of
+`language extensions `_.
+Details of "standard" GCC attributes (that Clang also supports) can
+be found in the `GCC manual `_, with the
+majority of the relevant attributes being in the section on
+`function attributes `_.
+
+Note that attributes that are labeled **Clang-specific** are not
+recognized by GCC. Their use can be conditioned using preprocessor macros
+(examples included on this page).
+
+.. contents::
+   :local:
+
+Annotations to Enhance Generic Checks
+_____________________________________
+
+Null Pointer Checking
+#####################
+
+Attribute 'nonnull'
+-------------------
+
+The analyzer recognizes the GCC attribute 'nonnull', which indicates that a
+function expects that a given function parameter is not a null pointer.
+Specific details of the syntax of using the 'nonnull' attribute can be found
+in `GCC's documentation `_.
+
+Both the Clang compiler and GCC will flag warnings for simple cases where a
+null pointer is directly being passed to a function with a 'nonnull'
+parameter (e.g., as a constant). The analyzer extends this checking by using
+its deeper symbolic analysis to track which pointer values are potentially
+null and then flag warnings when they are passed in a function call via a
+'nonnull' parameter.
+
+**Example**
+
+.. code-block:: c
+
+  int bar(int*p, int q, int *r) __attribute__((nonnull(1,3)));
+
+  int foo(int *p, int *q) {
+     return !p ? bar(q, 2, p)
+               : bar(p, 2, q);
+  }
+
+Running ``scan-build`` over this source produces the following output:
+
+.. image:: ../images/example_attribute_nonnull.png
+
+.. _custom_assertion_handlers:
+
+Custom Assertion Handlers
+#########################
+
+The analyzer exploits code assertions by pruning off paths where the
+assertion condition is false. The idea is to capture any program invariants
+specified in the assertion that the developer may know but that are not
+immediately apparent in the code itself. In this way assertions make implicit
+assumptions explicit in the code, which not only makes the analyzer more
+accurate when finding bugs, but can also help others better understand your
+code. It can also help remove certain kinds of analyzer false positives by
+pruning off false paths.
+
+In order to exploit assertions, however, the analyzer must understand when it
+encounters an "assertion handler". Typically assertions are
+implemented with a macro, with the macro performing a check for the assertion
+condition and, when the check fails, calling an assertion handler. For
+example, consider the following code fragment:
+
+.. code-block:: c
+
+  void foo(int *p) {
+    assert(p != NULL);
+  }
+
+When this code is preprocessed on Mac OS X it expands to the following:
+
+.. code-block:: c
+
+  void foo(int *p) {
+    (__builtin_expect(!(p != NULL), 0) ? __assert_rtn(__func__, "t.c", 4, "p != NULL") : (void)0);
+  }
+
+In this example, the assertion handler is ``__assert_rtn``. When called,
+most assertion handlers typically print an error and terminate the program.
+The analyzer can exploit such semantics by ending the analysis of a path once
+it hits a call to an assertion handler.
+
+The trick, however, is that the analyzer needs to know that a called function
+is an assertion handler; otherwise the analyzer might assume the function
+call returns and it will continue analyzing the path where the assertion
+condition failed. This can lead to false positives, as the assertion
+condition usually implies a safety condition (e.g., a pointer is not null)
+prior to performing some action that depends on that condition (e.g.,
+dereferencing a pointer).
+
+The analyzer knows about several well-known assertion handlers, but can
+automatically infer if a function should be treated as an assertion handler
+if it is annotated with the 'noreturn' attribute or the (Clang-specific)
+'analyzer_noreturn' attribute. Note that, currently, clang does not support
+these attributes on Objective-C methods and C++ methods.
+
+Attribute 'noreturn'
+--------------------
+
+The 'noreturn' attribute is a GCC attribute that can be placed on the
+declarations of functions. It means exactly what its name implies: a function
+with a 'noreturn' attribute should never return.
+
+Specific details of the syntax of using the 'noreturn' attribute can be found
+in `GCC's documentation `__.
+
+Not only does the analyzer exploit this information when pruning false paths,
+but the compiler also takes it seriously and will generate different (and
+possibly better optimized) code under the assumption that the function does
+not return.
+
+**Example**
+
+On Mac OS X, the function prototype for ``__assert_rtn`` (declared in
+``assert.h``) is specifically annotated with the 'noreturn' attribute:
+
+.. code-block:: c
+
+  void __assert_rtn(const char *, const char *, int, const char *) __attribute__((__noreturn__));
+
+Attribute 'analyzer_noreturn' (Clang-specific)
+----------------------------------------------
+
+The Clang-specific 'analyzer_noreturn' attribute is almost identical to
+'noreturn' except that it is ignored by the compiler for the purposes of code
+generation.
+
+This attribute is useful for annotating assertion handlers that actually
+*can* return, but for the purpose of using the analyzer we want to
+pretend that such functions do not return.
+
+Because this attribute is Clang-specific, its use should be conditioned with
+the use of preprocessor macros.
+
+**Example**
+
+.. code-block:: c
+
+  #ifndef CLANG_ANALYZER_NORETURN
+  #if __has_feature(attribute_analyzer_noreturn)
+  #define CLANG_ANALYZER_NORETURN __attribute__((analyzer_noreturn))
+  #else
+  #define CLANG_ANALYZER_NORETURN
+  #endif
+  #endif
+
+  void my_assert_rtn(const char *, const char *, int, const char *) CLANG_ANALYZER_NORETURN;
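To make the path pruning concrete, here is a small usage sketch built on the declaration above (the macro and the caller are illustrative, not part of the original docs):

  /* Assuming my_assert_rtn is declared CLANG_ANALYZER_NORETURN as above. */
  #define MY_ASSERT(cond) \
    ((cond) ? (void)0 : my_assert_rtn(__func__, __FILE__, __LINE__, #cond))

  void use(int *p) {
    MY_ASSERT(p != NULL);
    *p = 1; /* Not flagged: the analyzer pruned the path where p is NULL. */
  }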
+
+Mac OS X API Annotations
+________________________
+
+.. _cocoa_mem:
+
+Cocoa & Core Foundation Memory Management Annotations
+#####################################################
+
+The analyzer supports the proper management of retain counts for
+both Cocoa and Core Foundation objects. This checking is largely based on
+enforcing Cocoa and Core Foundation naming conventions for Objective-C
+methods (Cocoa) and C functions (Core Foundation). Not strictly following
+these conventions can cause the analyzer to miss bugs or flag false
+positives.
+
+One can educate the analyzer (and others who read your code) about methods
+or functions that deviate from the Cocoa and Core Foundation conventions
+using the attributes described here. However, you should consider using
+proper naming conventions or the `objc_method_family `_
+attribute, if applicable.
+
+.. _ns_returns_retained:
+
+Attribute 'ns_returns_retained' (Clang-specific)
+------------------------------------------------
+
+The GCC-style (Clang-specific) attribute 'ns_returns_retained' allows one to
+annotate an Objective-C method or C function as returning a retained Cocoa
+object that the caller is responsible for releasing (via sending a
+``release`` message to the object). The Foundation framework defines a
+macro ``NS_RETURNS_RETAINED`` that is functionally equivalent to the
+one shown below.
+
+**Placing on Objective-C methods**: For Objective-C methods, this
+annotation essentially tells the analyzer to treat the method as if its name
+begins with "alloc" or "new" or contains the word
+"copy".
+
+**Placing on C functions**: For C functions returning Cocoa objects, the
+analyzer typically does not make any assumptions about whether or not the
+object is returned retained. Explicitly adding the 'ns_returns_retained'
+attribute to C functions allows the analyzer to perform extra checking.
+
+**Example**
+
+.. code-block:: objc
+
+  #import <Foundation/Foundation.h>
+
+  #ifndef __has_feature      // Optional.
+  #define __has_feature(x) 0 // Compatibility with non-clang compilers.
+  #endif
+
+  #ifndef NS_RETURNS_RETAINED
+  #if __has_feature(attribute_ns_returns_retained)
+  #define NS_RETURNS_RETAINED __attribute__((ns_returns_retained))
+  #else
+  #define NS_RETURNS_RETAINED
+  #endif
+  #endif
+
+  @interface MyClass : NSObject {}
+  - (NSString*) returnsRetained NS_RETURNS_RETAINED;
+  - (NSString*) alsoReturnsRetained;
+  @end
+
+  @implementation MyClass
+  - (NSString*) returnsRetained {
+    return [[NSString alloc] initWithCString:"no leak here"];
+  }
+  - (NSString*) alsoReturnsRetained {
+    return [[NSString alloc] initWithCString:"flag a leak"];
+  }
+  @end
+
+Running ``scan-build`` on this source file produces the following output:
+
+.. image:: ../images/example_ns_returns_retained.png
+
+.. _ns_returns_not_retained:
+
+Attribute 'ns_returns_not_retained' (Clang-specific)
+----------------------------------------------------
+
+The 'ns_returns_not_retained' attribute is the complement of
+'`ns_returns_retained`_'. Where a function or method may appear to obey the
+Cocoa conventions and return a retained Cocoa object, this attribute can be
+used to indicate that the object reference returned should not be considered
+an "owning" reference being returned to the caller. The Foundation
+framework defines a macro ``NS_RETURNS_NOT_RETAINED`` that is functionally
+equivalent to the one shown below.
+
+Usage is identical to `ns_returns_retained`_. When using the
+attribute, be sure to declare it within the proper macro that checks for
+its availability, as it is not available in earlier versions of the analyzer:
+
+.. code-block:: objc
+
+  #ifndef __has_feature      // Optional.
+  #define __has_feature(x) 0 // Compatibility with non-clang compilers.
+  #endif
+
+  #ifndef NS_RETURNS_NOT_RETAINED
+  #if __has_feature(attribute_ns_returns_not_retained)
+  #define NS_RETURNS_NOT_RETAINED __attribute__((ns_returns_not_retained))
+  #else
+  #define NS_RETURNS_NOT_RETAINED
+  #endif
+  #endif
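Since the section above only defines the macro, a brief usage sketch may help; the class and method names below are illustrative, not from the original documentation:

  @interface MyCache : NSObject
  // The name suggests a +1 "copy", but the method hands back a cached
  // string at +0; the attribute corrects the analyzer's assumption.
  - (NSString *)copyCachedString NS_RETURNS_NOT_RETAINED;
  @end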
+
+.. _cf_returns_retained:
+
+Attribute 'cf_returns_retained' (Clang-specific)
+------------------------------------------------
+
+The GCC-style (Clang-specific) attribute 'cf_returns_retained' allows one to
+annotate an Objective-C method or C function as returning a retained Core
+Foundation object that the caller is responsible for releasing. The
+CoreFoundation framework defines a macro ``CF_RETURNS_RETAINED`` that is
+functionally equivalent to the one shown below.
+
+**Placing on Objective-C methods**: With respect to Objective-C methods,
+this attribute is identical in its behavior and usage to
+'ns_returns_retained' except for the distinction of returning a Core
+Foundation object instead of a Cocoa object.
+
+This distinction is important for the following reason: as Core Foundation is
+a C API, the analyzer cannot always tell that a pointer return value refers
+to a Core Foundation object. In contrast, it is trivial for the analyzer to
+recognize if a pointer refers to a Cocoa object (given the Objective-C type
+system).
+
+**Placing on C functions**: When placing the attribute
+'cf_returns_retained' on the declarations of C functions, the analyzer
+interprets the function as:
+
+1. Returning a Core Foundation object.
+2. Treating the function as if its name contained the keywords
+   "create" or "copy". This means the returned object has a
+   +1 retain count that must be released by the caller, either by sending a
+   ``release`` message (via toll-free bridging to an Objective-C object
+   pointer), or calling ``CFRelease`` or a similar function.
+
+**Example**
+
+.. code-block:: objc
+
+  #import <Cocoa/Cocoa.h>
+
+  #ifndef __has_feature      // Optional.
+  #define __has_feature(x) 0 // Compatibility with non-clang compilers.
+  #endif
+
+  #ifndef CF_RETURNS_RETAINED
+  #if __has_feature(attribute_cf_returns_retained)
+  #define CF_RETURNS_RETAINED __attribute__((cf_returns_retained))
+  #else
+  #define CF_RETURNS_RETAINED
+  #endif
+  #endif
+
+  @interface MyClass : NSObject {}
+  - (NSDate*) returnsCFRetained CF_RETURNS_RETAINED;
+  - (NSDate*) alsoReturnsRetained;
+  - (NSDate*) returnsNSRetained NS_RETURNS_RETAINED;
+  @end
+
+  CF_RETURNS_RETAINED
+  CFDateRef returnsRetainedCFDate()  {
+    return CFDateCreate(0, CFAbsoluteTimeGetCurrent());
+  }
+
+  @implementation MyClass
+  - (NSDate*) returnsCFRetained {
+    return (NSDate*) returnsRetainedCFDate(); // No leak.
+  }
+
+  - (NSDate*) alsoReturnsRetained {
+    return (NSDate*) returnsRetainedCFDate(); // Always report a leak.
+  }
+
+  - (NSDate*) returnsNSRetained {
+    return (NSDate*) returnsRetainedCFDate(); // Report a leak when using GC.
+  }
+  @end
+
+Running ``scan-build`` on this example produces the following output:
+
+.. image:: ../images/example_cf_returns_retained.png
+
+Attribute 'cf_returns_not_retained' (Clang-specific)
+----------------------------------------------------
+
+The 'cf_returns_not_retained' attribute is the complement of
+'`cf_returns_retained`_'. Where a function or method may appear to obey the
+Core Foundation or Cocoa conventions and return a retained Core Foundation
+object, this attribute can be used to indicate that the object reference
+returned should not be considered an "owning" reference being
+returned to the caller. The CoreFoundation framework defines a macro
+``CF_RETURNS_NOT_RETAINED`` that is functionally equivalent to the one
+shown below.
+
+Usage is identical to `cf_returns_retained`_. When using the attribute, be
+sure to declare it within the proper macro that checks for its availability,
+as it is not available in earlier versions of the analyzer:
+
+.. code-block:: objc
+
+  #ifndef __has_feature      // Optional.
+  #define __has_feature(x) 0 // Compatibility with non-clang compilers.
+  #endif
+
+  #ifndef CF_RETURNS_NOT_RETAINED
+  #if __has_feature(attribute_cf_returns_not_retained)
+  #define CF_RETURNS_NOT_RETAINED __attribute__((cf_returns_not_retained))
+  #else
+  #define CF_RETURNS_NOT_RETAINED
+  #endif
+  #endif
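Likewise, a hypothetical usage sketch for the Core Foundation variant (the function name is invented):

  // A lookup function whose name could be read as returning an owned
  // object, annotated to say the returned reference is unowned.
  CFStringRef MyLookupCachedName(int key) CF_RETURNS_NOT_RETAINED;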
+
+.. _ns_consumed:
+
+Attribute 'ns_consumed' (Clang-specific)
+----------------------------------------
+
+The 'ns_consumed' attribute can be placed on a specific parameter in either
+the declaration of a function or an Objective-C method. It indicates to the
+static analyzer that a ``release`` message is implicitly sent to the
+parameter upon completion of the call to the given function or method. The
+Foundation framework defines a macro ``NS_RELEASES_ARGUMENT`` that
+is functionally equivalent to the ``NS_CONSUMED`` macro shown below.
+
+**Example**
+
+.. code-block:: objc
+
+  #ifndef __has_feature      // Optional.
+  #define __has_feature(x) 0 // Compatibility with non-clang compilers.
+  #endif
+
+  #ifndef NS_CONSUMED
+  #if __has_feature(attribute_ns_consumed)
+  #define NS_CONSUMED __attribute__((ns_consumed))
+  #else
+  #define NS_CONSUMED
+  #endif
+  #endif
+
+  void consume_ns(id NS_CONSUMED x);
+
+  void test() {
+    id x = [[NSObject alloc] init];
+    consume_ns(x); // No leak!
+  }
+
+  @interface Foo : NSObject
+  + (void) releaseArg:(id) NS_CONSUMED x;
+  + (void) releaseSecondArg:(id)x second:(id) NS_CONSUMED y;
+  @end
+
+  void test_method() {
+    id x = [[NSObject alloc] init];
+    [Foo releaseArg:x]; // No leak!
+  }
+
+  void test_method2() {
+    id a = [[NSObject alloc] init];
+    id b = [[NSObject alloc] init];
+    [Foo releaseSecondArg:a second:b]; // 'a' is leaked, but 'b' is released.
+  }
+
+Attribute 'cf_consumed' (Clang-specific)
+----------------------------------------
+
+The 'cf_consumed' attribute is practically identical to `ns_consumed`_. The
+attribute can be placed on a specific parameter in either the declaration of
+a function or an Objective-C method. It indicates to the static analyzer that
+the object reference is implicitly passed to a call to ``CFRelease`` upon
+completion of the call to the given function or method. The CoreFoundation
+framework defines a macro ``CF_RELEASES_ARGUMENT`` that is functionally
+equivalent to the ``CF_CONSUMED`` macro shown below.
+
+Operationally this attribute is nearly identical to 'ns_consumed'.
+
+**Example**
+
+.. code-block:: objc
+
+  #ifndef __has_feature      // Optional.
+  #define __has_feature(x) 0 // Compatibility with non-clang compilers.
+  #endif
+
+  #ifndef CF_CONSUMED
+  #if __has_feature(attribute_cf_consumed)
+  #define CF_CONSUMED __attribute__((cf_consumed))
+  #else
+  #define CF_CONSUMED
+  #endif
+  #endif
+
+  void consume_cf(id CF_CONSUMED x);
+  void consume_CFDate(CFDateRef CF_CONSUMED x);
+
+  void test() {
+    id x = [[NSObject alloc] init];
+    consume_cf(x); // No leak!
+  }
+
+  void test2() {
+    CFDateRef date = CFDateCreate(0, CFAbsoluteTimeGetCurrent());
+    consume_CFDate(date); // No leak, including under GC!
+  }
+
+  @interface Foo : NSObject
+  + (void) releaseArg:(CFDateRef) CF_CONSUMED x;
+  @end
+
+  void test_method() {
+    CFDateRef date = CFDateCreate(0, CFAbsoluteTimeGetCurrent());
+    [Foo releaseArg:date]; // No leak!
+  }
+
+.. _ns_consumes_self:
+
+Attribute 'ns_consumes_self' (Clang-specific)
+---------------------------------------------
+
+The 'ns_consumes_self' attribute can be placed only on an Objective-C method
+declaration. It indicates that the receiver of the message is
+"consumed" (a single reference count decremented) after the message
+is sent. This matches the semantics of all "init" methods.
+
+One use of this attribute is to declare your own init-like methods that do
+not follow the standard Cocoa naming conventions.
+
+**Example**
+
+.. code-block:: objc
+
+  #ifndef __has_feature
+  #define __has_feature(x) 0 // Compatibility with non-clang compilers.
+  #endif
+
+  #ifndef NS_CONSUMES_SELF
+  #if __has_feature((attribute_ns_consumes_self))
+  #define NS_CONSUMES_SELF __attribute__((ns_consumes_self))
+  #else
+  #define NS_CONSUMES_SELF
+  #endif
+  #endif
+
+  @interface MyClass : NSObject
+  - initWith:(MyClass *)x;
+  - nonstandardInitWith:(MyClass *)x NS_CONSUMES_SELF NS_RETURNS_RETAINED;
+  @end
+
+In this example, ``-nonstandardInitWith:`` has the same ownership
+semantics as the init method ``-initWith:``. The static analyzer will
+observe that the method consumes the receiver, and then returns an object
+with a +1 retain count.
+
+The Foundation framework defines a macro ``NS_REPLACES_RECEIVER`` which is
+functionally equivalent to the combination of ``NS_CONSUMES_SELF`` and
+``NS_RETURNS_RETAINED`` shown above.
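As a sketch of that equivalence (reusing the method from the example above; the category name is invented):

  @interface MyClass (Equivalent)
  // Same ownership semantics as the NS_CONSUMES_SELF + NS_RETURNS_RETAINED
  // pair on -nonstandardInitWith: above.
  - nonstandardInitWith:(MyClass *)x NS_REPLACES_RECEIVER;
  @end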
+
+Libkern Memory Management Annotations
+#####################################
+
+`Libkern `_
+requires developers to inherit all heap allocated objects from ``OSObject``
+and to perform manual reference counting. The reference counting model is
+very similar to MRR (manual retain-release) mode in
+`Objective-C `_
+or to CoreFoundation reference counting.
+Freshly-allocated objects start with a reference count of 1, and calls to
+``retain`` increment it, while calls to ``release`` decrement it. The object
+is deallocated whenever its reference count reaches zero.
+
+Manually incrementing and decrementing reference counts is error-prone:
+over-retains lead to leaks, and over-releases lead to uses-after-free.
+The analyzer can help the programmer to check for unbalanced
+retain/release calls.
+
+The reference count checking is based on the principle of *locality*: it
+should be possible to establish correctness (lack of leaks/uses after free)
+by looking at each function body, and the declarations (not the definitions)
+of all the functions it interacts with.
+
+In order to support such reasoning, it should be possible to *summarize* the
+behavior of each function, with respect to the reference count of its
+returned values and attributes.
+
+By default, the following summaries are assumed (a short illustration
+follows this list):
+
+- All functions starting with ``get`` or ``Get``, unless they are returning
+  subclasses of ``OSIterator``, are assumed to be returning at +0. That is,
+  the caller has no reference count *obligations* with respect to the
+  reference count of the returned object and should leave it untouched.
+
+- All other functions are assumed to return at +1. That is, the caller has
+  an *obligation* to release such objects.
+
+- Functions are assumed not to change the reference count of their
+  parameters, including the implicit ``this`` parameter.
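A minimal sketch of how these defaults read in practice; the function names are invented, and only ``OSObject`` is assumed from Libkern:

  class OSObject; // Libkern's base class for heap-allocated objects.

  OSObject *getCurrentEntry(); // Starts with "get": returned at +0; the
                               // caller must not release it.
  OSObject *makeEntry();       // Any other prefix: returned at +1; the
                               // caller must release it.
  void inspect(OSObject *o);   // Assumed not to change o's reference count.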
+
+These summaries can be overridden with the following
+`attributes `_:
+
+Attribute 'os_returns_retained'
+-------------------------------
+
+The ``os_returns_retained`` attribute (accessed through the macro
+``LIBKERN_RETURNS_RETAINED``) plays a role identical to
+`ns_returns_retained`_ for functions returning ``OSObject`` subclasses. The
+attribute indicates that it is the caller's responsibility to release the
+returned object.
+
+Attribute 'os_returns_not_retained'
+-----------------------------------
+
+The ``os_returns_not_retained`` attribute (accessed through the macro
+``LIBKERN_RETURNS_NOT_RETAINED``) plays a role identical to
+`ns_returns_not_retained`_ for functions returning ``OSObject`` subclasses.
+The attribute indicates that the caller should not change the retain count
+of the returned object.
+
+**Example**
+
+.. code-block:: cpp
+
+  class MyClass {
+    OSObject *f;
+    LIBKERN_RETURNS_NOT_RETAINED OSObject *myFieldGetter();
+  };
+
+  // Note that the annotation only has to be applied to the function declaration.
+  OSObject * MyClass::myFieldGetter() {
+    return f;
+  }
+
+Attribute 'os_consumed'
+-----------------------
+
+Similarly to the `ns_consumed`_ attribute, the ``os_consumed`` attribute
+(accessed through ``LIBKERN_CONSUMED``), applied to a parameter, indicates
+that the call to the function *consumes* the parameter: the callee should
+either release it or store it and release it in the destructor, while the
+caller should assume one is subtracted from the reference count after the
+call.
+
+.. code-block:: cpp
+
+  IOReturn addToList(LIBKERN_CONSUMED IOPMinformee *newInformee);
+
+Attribute 'os_consumes_this'
+----------------------------
+
+Similarly to `ns_consumes_self`_, the ``os_consumes_this`` attribute
+indicates that the method call *consumes* the implicit ``this`` argument:
+the caller should assume one was subtracted from the reference count of the
+object after the call, and the callee has an obligation to either release
+the argument, or store it and eventually release it in the destructor.
+
+.. code-block:: cpp
+
+  void addThisToList(OSArray *givenList) LIBKERN_CONSUMES_THIS;
+
+Out Parameters
+--------------
+
+A function can also return an object to a caller by means of an out
+parameter (a pointer-to-OSObject-pointer is passed, and a callee writes a
+pointer to an object into an argument). Currently the analyzer does not
+track unannotated out parameters by default, but with annotations we
+distinguish four separate cases:
+
+**1. Non-retained out parameters**, identified using
+``LIBKERN_RETURNS_NOT_RETAINED`` applied to parameters, e.g.:
+
+.. code-block:: cpp
+
+  void getterViaOutParam(LIBKERN_RETURNS_NOT_RETAINED OSObject **obj)
+
+Such functions write a non-retained object into an out parameter, and the
+caller has no further obligations.
+
+**2. Retained out parameters**, identified using
+``LIBKERN_RETURNS_RETAINED``:
+
+.. code-block:: cpp
+
+  void getterViaOutParam(LIBKERN_RETURNS_RETAINED OSObject **obj)
+
+In such cases a retained object is written into an out parameter, which the
+caller then has to release in order to avoid a leak.
+
+These two cases are simple - but in practice a function returning an
+out-parameter usually also returns a return code, and then an out parameter
+may or may not be written, which conditionally depends on the exit code,
+e.g.:
+
+.. code-block:: cpp
+
+  bool maybeCreateObject(LIBKERN_RETURNS_RETAINED OSObject **obj);
+
+For such functions, the usual semantics is that an object is written into on
+"success", and not written into on "failure".
+
+For ``LIBKERN_RETURNS_RETAINED`` we assume the following definition of
+success:
+
+- For functions returning ``OSReturn`` or ``IOReturn`` (any typedef to
+  ``kern_return_t``) success is defined as having an output of zero
+  (``kIOReturnSuccess`` is zero).
+
+- For all others, success is non-zero (e.g. non-nullptr for pointers).
+
+**3. Retained out parameters on zero return** The annotation
+``LIBKERN_RETURNS_RETAINED_ON_ZERO`` states that a retained object is
+written into if and only if the function returns a zero value:
+
+.. code-block:: cpp
+
+  bool OSUnserializeXML(void *data, LIBKERN_RETURNS_RETAINED_ON_ZERO OSString **errString);
+
+Then the caller has to release an object if the function has returned zero.
+
+**4. Retained out parameters on non-zero return** Similarly,
+``LIBKERN_RETURNS_RETAINED_ON_NONZERO`` specifies that a retained object is
+written into the parameter if and only if the function has returned a
+non-zero value.
+
+Note that for non-retained out parameters conditionals do not matter, as the
+caller has no obligations regardless of whether an object is written into or
+not.
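For completeness, a sketch of the fourth case by analogy with the third; the function name here is hypothetical:

  // A retained object is written into *obj only when the call returns true
  // (non-zero); only then must the caller release it.
  bool copyEntryIfPresent(LIBKERN_RETURNS_RETAINED_ON_NONZERO OSObject **obj);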
diff --git a/clang/docs/analyzer/user-docs/FAQ.rst b/clang/docs/analyzer/user-docs/FAQ.rst
index af52e99c91d68..e1147916a767c 100644
--- a/clang/docs/analyzer/user-docs/FAQ.rst
+++ b/clang/docs/analyzer/user-docs/FAQ.rst
@@ -9,7 +9,7 @@ Custom Assertions

 Q: How do I tell the analyzer that I do not want the bug being reported here since my custom error handler will safely end the execution before the bug is reached?

-You can tell the analyzer that this path is unreachable by teaching it about your `custom assertion handlers `_. For example, you can modify the code segment as follows:
+You can tell the analyzer that this path is unreachable by teaching it about your `custom assertion handlers `__. For example, you can modify the code segment as follows:

 .. code-block:: c

@@ -162,7 +162,7 @@ Suppressing Specific Warnings

 Q: How can I suppress a specific analyzer warning?

-When you encounter an analyzer bug/false positive, check if it's one of the issues discussed above or if the analyzer `annotations `_ can resolve the issue by helping the static analyzer understand the code better. Second, please `report it `_ to help us improve user experience.
+When you encounter an analyzer bug/false positive, check if it's one of the issues discussed above or if the analyzer `annotations `__ can resolve the issue by helping the static analyzer understand the code better. Second, please `report it `_ to help us improve user experience.

 Sometimes there's really no "good" way to eliminate the issue. In such cases you can "silence" it directly by annotating the problematic line of code with the help of Clang attribute 'suppress':

@@ -192,6 +192,8 @@ Sometimes there's really no "good" way to eliminate the issue. In such cases you
     return *result; // as well as this leak path
   }

+.. _exclude_code:
+
 Excluding Code from Analysis
 ----------------------------

diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 953ff9a700e51..e10f24e239ece 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -1461,7 +1461,7 @@ Mind that many more checkers are affected by dynamic memory modeling changes to
 some extent.

 Further reading for other annotations:
-`Source Annotations in the Clang Static Analyzer `_.
+`Source Annotations in the Clang Static Analyzer `_.

 }];
 }
diff --git a/clang/www/analyzer/annotations.html b/clang/www/analyzer/annotations.html
index bf0076e514278..b19d47bce2662 100644
--- a/clang/www/analyzer/annotations.html
+++ b/clang/www/analyzer/annotations.html
@@ -3,6 +3,8 @@
   Source Annotations
+
+
@@ -15,765 +17,11 @@

   Source Annotations

+  This page is deprecated and will be removed in release 21.0
+
+  Its content was migrated to the regular LLVM documentation.
+
The Clang frontend supports several source-level annotations in the form of -GCC-style -attributes and pragmas that can help make using the Clang Static Analyzer -more useful. These annotations can both help suppress false positives as well as -enhance the analyzer's ability to find bugs.

- -

This page gives a practical overview of such annotations. For more technical -specifics regarding Clang-specific annotations please see the Clang's list of language -extensions. Details of "standard" GCC attributes (that Clang also -supports) can be found in the GCC -manual, with the majority of the relevant attributes being in the section on -function -attributes.

- -

Note that attributes that are labeled Clang-specific are not -recognized by GCC. Their use can be conditioned using preprocessor macros -(examples included on this page).

- -

Specific Topics

- - - - -

Annotations to Enhance Generic Checks

- - -

Null Pointer Checking

- -

Attribute 'nonnull'

- -

The analyzer recognizes the GCC attribute 'nonnull', which indicates that a -function expects that a given function parameter is not a null pointer. Specific -details of the syntax of using the 'nonnull' attribute can be found in GCC's -documentation.

- -

Both the Clang compiler and GCC will flag warnings for simple cases where a -null pointer is directly being passed to a function with a 'nonnull' parameter -(e.g., as a constant). The analyzer extends this checking by using its deeper -symbolic analysis to track what pointer values are potentially null and then -flag warnings when they are passed in a function call via a 'nonnull' -parameter.

- -

Example

- -
-$ cat test.m
-int bar(int*p, int q, int *r) __attribute__((nonnull(1,3)));
-
-int foo(int *p, int *q) {
-   return !p ? bar(q, 2, p)
-             : bar(p, 2, q);
-}
-
- -

Running scan-build over this source produces the following -output:

- -example attribute nonnull - - -

Mac OS X API Annotations

- - -

Cocoa & Core Foundation Memory Management -Annotations

- - -

The analyzer supports the proper management of retain counts for -both Cocoa and Core Foundation objects. This checking is largely based on -enforcing Cocoa and Core Foundation naming conventions for Objective-C methods -(Cocoa) and C functions (Core Foundation). Not strictly following these -conventions can cause the analyzer to miss bugs or flag false positives.

- -

One can educate the analyzer (and others who read your code) about methods or -functions that deviate from the Cocoa and Core Foundation conventions using the -attributes described here. However, you should consider using proper naming -conventions or the objc_method_family -attribute, if applicable.

- -

Attribute 'ns_returns_retained' -(Clang-specific)

- -

The GCC-style (Clang-specific) attribute 'ns_returns_retained' allows one to -annotate an Objective-C method or C function as returning a retained Cocoa -object that the caller is responsible for releasing (via sending a -release message to the object). The Foundation framework defines a -macro NS_RETURNS_RETAINED that is functionally equivalent to the -one shown below.

- -

Placing on Objective-C methods: For Objective-C methods, this -annotation essentially tells the analyzer to treat the method as if its name -begins with "alloc" or "new" or contains the word -"copy".

- -

Placing on C functions: For C functions returning Cocoa objects, the -analyzer typically does not make any assumptions about whether or not the object -is returned retained. Explicitly adding the 'ns_returns_retained' attribute to C -functions allows the analyzer to perform extra checking.

- -

Example

- -
-$ cat test.m
-#import <Foundation/Foundation.h>
-
-#ifndef __has_feature      // Optional.
-#define __has_feature(x) 0 // Compatibility with non-clang compilers.
-#endif
-
-#ifndef NS_RETURNS_RETAINED
-#if __has_feature(attribute_ns_returns_retained)
-#define NS_RETURNS_RETAINED __attribute__((ns_returns_retained))
-#else
-#define NS_RETURNS_RETAINED
-#endif
-#endif
-
-@interface MyClass : NSObject {}
-- (NSString*) returnsRetained NS_RETURNS_RETAINED;
-- (NSString*) alsoReturnsRetained;
-@end
-
-@implementation MyClass
-- (NSString*) returnsRetained {
-  return [[NSString alloc] initWithCString:"no leak here"];
-}
-- (NSString*) alsoReturnsRetained {
-  return [[NSString alloc] initWithCString:"flag a leak"];
-}
-@end
-
- -

Running scan-build on this source file produces the following output:

- -example returns retained - -

Attribute 'ns_returns_not_retained' -(Clang-specific)

- -

The 'ns_returns_not_retained' attribute is the complement of 'ns_returns_retained'. Where a function or -method may appear to obey the Cocoa conventions and return a retained Cocoa -object, this attribute can be used to indicate that the object reference -returned should not be considered as an "owning" reference being -returned to the caller. The Foundation framework defines a -macro NS_RETURNS_NOT_RETAINED that is functionally equivalent to -the one shown below.

- -

Usage is identical to ns_returns_retained. When using the -attribute, be sure to declare it within the proper macro that checks for -its availability, as it is not available in earlier versions of the analyzer:

- -
-$ cat test.m
-#ifndef __has_feature      // Optional.
-#define __has_feature(x) 0 // Compatibility with non-clang compilers.
-#endif
-
-#ifndef NS_RETURNS_NOT_RETAINED
-#if __has_feature(attribute_ns_returns_not_retained)
-#define NS_RETURNS_NOT_RETAINED __attribute__((ns_returns_not_retained))
-#else
-#define NS_RETURNS_NOT_RETAINED
-#endif
-#endif
-
- -

Attribute 'cf_returns_retained' -(Clang-specific)

- -

The GCC-style (Clang-specific) attribute 'cf_returns_retained' allows one to -annotate an Objective-C method or C function as returning a retained Core -Foundation object that the caller is responsible for releasing. The -CoreFoundation framework defines a macro CF_RETURNS_RETAINED -that is functionally equivalent to the one shown below.

- -

Placing on Objective-C methods: With respect to Objective-C methods., -this attribute is identical in its behavior and usage to 'ns_returns_retained' -except for the distinction of returning a Core Foundation object instead of a -Cocoa object. - -This distinction is important for the following reason: -as Core Foundation is a C API, -the analyzer cannot always tell that a pointer return value refers to a -Core Foundation object. -In contrast, it is -trivial for the analyzer to recognize if a pointer refers to a Cocoa object -(given the Objective-C type system). - -

Placing on C functions: When placing the attribute -'cf_returns_retained' on the declarations of C functions, the analyzer -interprets the function as:

- -
    -
  1. Returning a Core Foundation Object
  2. -
  3. Treating the function as if it its name -contained the keywords "create" or "copy". This means the -returned object as a +1 retain count that must be released by the caller, either -by sending a release message (via toll-free bridging to an Objective-C -object pointer), or calling CFRelease or a similar function.
  4. -
- -

Example

- -
-$ cat test.m
-$ cat test.m
-#import <Cocoa/Cocoa.h>
-
-#ifndef __has_feature      // Optional.
-#define __has_feature(x) 0 // Compatibility with non-clang compilers.
-#endif
-
-#ifndef CF_RETURNS_RETAINED
-#if __has_feature(attribute_cf_returns_retained)
-#define CF_RETURNS_RETAINED __attribute__((cf_returns_retained))
-#else
-#define CF_RETURNS_RETAINED
-#endif
-#endif
-
-@interface MyClass : NSObject {}
-- (NSDate*) returnsCFRetained CF_RETURNS_RETAINED;
-- (NSDate*) alsoReturnsRetained;
-- (NSDate*) returnsNSRetained NS_RETURNS_RETAINED;
-@end
-
-CF_RETURNS_RETAINED
-CFDateRef returnsRetainedCFDate()  {
-  return CFDateCreate(0, CFAbsoluteTimeGetCurrent());
-}
-
-@implementation MyClass
-- (NSDate*) returnsCFRetained {
-  return (NSDate*) returnsRetainedCFDate(); // No leak.
-}
-
-- (NSDate*) alsoReturnsRetained {
-  return (NSDate*) returnsRetainedCFDate(); // Always report a leak.
-}
-
-- (NSDate*) returnsNSRetained {
-  return (NSDate*) returnsRetainedCFDate(); // Report a leak when using GC.
-}
-@end
-
- -

Running scan-build on this example produces the following output:

- -example returns retained - -

Attribute 'cf_returns_not_retained' -(Clang-specific)

- -

The 'cf_returns_not_retained' attribute is the complement of 'cf_returns_retained'. Where a function or -method may appear to obey the Core Foundation or Cocoa conventions and return -a retained Core Foundation object, this attribute can be used to indicate that -the object reference returned should not be considered as an -"owning" reference being returned to the caller. The -CoreFoundation framework defines a macro CF_RETURNS_NOT_RETAINED -that is functionally equivalent to the one shown below.

- -

Usage is identical to cf_returns_retained. When using the -attribute, be sure to declare it within the proper macro that checks for -its availability, as it is not available in earlier versions of the analyzer:

- -
-$ cat test.m
-#ifndef __has_feature      // Optional.
-#define __has_feature(x) 0 // Compatibility with non-clang compilers.
-#endif
-
-#ifndef CF_RETURNS_NOT_RETAINED
-#if __has_feature(attribute_cf_returns_not_retained)
-#define CF_RETURNS_NOT_RETAINED __attribute__((cf_returns_not_retained))
-#else
-#define CF_RETURNS_NOT_RETAINED
-#endif
-#endif
-
- -

Attribute 'ns_consumed' -(Clang-specific)

- -

The 'ns_consumed' attribute can be placed on a specific parameter in either -the declaration of a function or an Objective-C method. It indicates to the -static analyzer that a release message is implicitly sent to the -parameter upon completion of the call to the given function or method. The -Foundation framework defines a macro NS_RELEASES_ARGUMENT that -is functionally equivalent to the NS_CONSUMED macro shown below.

- -

Example

- -
-$ cat test.m
-#ifndef __has_feature      // Optional.
-#define __has_feature(x) 0 // Compatibility with non-clang compilers.
-#endif
-
-#ifndef NS_CONSUMED
-#if __has_feature(attribute_ns_consumed)
-#define NS_CONSUMED __attribute__((ns_consumed))
-#else
-#define NS_CONSUMED
-#endif
-#endif
-
-void consume_ns(id NS_CONSUMED x);
-
-void test() {
-  id x = [[NSObject alloc] init];
-  consume_ns(x); // No leak!
-}
-
-@interface Foo : NSObject
-+ (void) releaseArg:(id) NS_CONSUMED x;
-+ (void) releaseSecondArg:(id)x second:(id) NS_CONSUMED y;
-@end
-
-void test_method() {
-  id x = [[NSObject alloc] init];
-  [Foo releaseArg:x]; // No leak!
-}
-
-void test_method2() {
-  id a = [[NSObject alloc] init];
-  id b = [[NSObject alloc] init];
-  [Foo releaseSecondArg:a second:b]; // 'a' is leaked, but 'b' is released.
-}
-
- -

Attribute 'cf_consumed' -(Clang-specific)

- -

The 'cf_consumed' attribute is practically identical to ns_consumed. The attribute can be placed on a -specific parameter in either the declaration of a function or an Objective-C -method. It indicates to the static analyzer that the object reference is -implicitly passed to a call to CFRelease upon completion of the call -to the given function or method. The CoreFoundation framework defines a macro -CF_RELEASES_ARGUMENT that is functionally equivalent to the -CF_CONSUMED macro shown below.

- -

Operationally this attribute is nearly identical to 'ns_consumed'.

- -

Example

- -
-$ cat test.m
-#ifndef __has_feature      // Optional.
-#define __has_feature(x) 0 // Compatibility with non-clang compilers.
-#endif
-
-#ifndef CF_CONSUMED
-#if __has_feature(attribute_cf_consumed)
-#define CF_CONSUMED __attribute__((cf_consumed))
-#else
-#define CF_CONSUMED
-#endif
-#endif
-
-void consume_cf(id CF_CONSUMED x);
-void consume_CFDate(CFDateRef CF_CONSUMED x);
-
-void test() {
-  id x = [[NSObject alloc] init];
-  consume_cf(x); // No leak!
-}
-
-void test2() {
-  CFDateRef date = CFDateCreate(0, CFAbsoluteTimeGetCurrent());
-  consume_CFDate(date); // No leak, including under GC!
-
-}
-
-@interface Foo : NSObject
-+ (void) releaseArg:(CFDateRef) CF_CONSUMED x;
-@end
-
-void test_method() {
-  CFDateRef date = CFDateCreate(0, CFAbsoluteTimeGetCurrent());
-  [Foo releaseArg:date]; // No leak!
-}
-
- -

Attribute 'ns_consumes_self' -(Clang-specific)

- -

The 'ns_consumes_self' attribute can be placed only on an Objective-C method -declaration. It indicates that the receiver of the message is -"consumed" (a single reference count decremented) after the message -is sent. This matches the semantics of all "init" methods.

- -

One use of this attribute is declare your own init-like methods that do not -follow the standard Cocoa naming conventions.

- -

Example

- -
-#ifndef __has_feature
-#define __has_feature(x) 0 // Compatibility with non-clang compilers.
-#endif
-
-#ifndef NS_CONSUMES_SELF
-#if __has_feature((attribute_ns_consumes_self))
-#define NS_CONSUMES_SELF __attribute__((ns_consumes_self))
-#else
-#define NS_CONSUMES_SELF
-#endif
-#endif
-
-@interface MyClass : NSObject
-- initWith:(MyClass *)x;
-- nonstandardInitWith:(MyClass *)x NS_CONSUMES_SELF NS_RETURNS_RETAINED;
-@end
-
- -

In this example, -nonstandardInitWith: has the same ownership -semantics as the init method -initWith:. The static analyzer will -observe that the method consumes the receiver, and then returns an object with -a +1 retain count.

- -

The Foundation framework defines a macro NS_REPLACES_RECEIVER -which is functionally equivalent to the combination of NS_CONSUMES_SELF -and NS_RETURNS_RETAINED shown above.

- -

Libkern Memory Management Annotations

- -

Libkern -requires developers to inherit all heap allocated objects from OSObject -and to perform manual reference counting. -The reference counting model is very similar to MRR (manual retain-release) mode in -Objective-C -or to CoreFoundation reference counting. -Freshly-allocated objects start with a reference count of 1, -and calls to retain increment it, -while calls to release decrement it. -The object is deallocated whenever its reference count reaches zero.

- -

Manually incrementing and decrementing reference counts is error-prone: -over-retains lead to leaks, and over-releases lead to uses-after-free. -The analyzer can help the programmer to check for unbalanced -retain/release calls.

- -

The reference count checking is based on the principle of -locality: it should be possible to establish correctness -(lack of leaks/uses after free) by looking at each function body, -and the declarations (not the definitions) of all the functions it interacts -with.

- -

In order to support such reasoning, it should be possible to summarize -the behavior of each function, with respect to reference count -of its returned values and attributes.

- -

By default, the following summaries are assumed:

-
    -
  • All functions starting with get or Get, - unless they are returning subclasses of OSIterator, - are assumed to be returning at +0. - That is, the caller has no reference - count obligations with respect to the reference count of the returned object - and should leave it untouched. -
  • - -
  • - All other functions are assumed to return at +1. - That is, the caller has an obligation to release such objects. -
  • - -
  • - Functions are assumed not to change the reference count of their parameters, - including the implicit this parameter. -
  • -
- -

These summaries can be overriden with the following -attributes:

- -

Attribute 'os_returns_retained'

- -

The os_returns_retained attribute (accessed through the macro -LIBKERN_RETURNS_RETAINED) plays a role identical to ns_returns_retained for functions -returning OSObject subclasses. -The attribute indicates that it is a callers responsibility to release the -returned object. -

- - -

Attribute 'os_returns_not_retained'

- -

The os_returns_not_retained attribute (accessed through the macro -LIBKERN_RETURNS_NOT_RETAINED) plays a role identical to ns_returns_not_retained for functions -returning OSObject subclasses. -The attribute indicates that the caller should not change the retain -count of the returned object. -

- -
Example
- -
-class MyClass {
-  OSObject *f;
-  LIBKERN_RETURNS_NOT_RETAINED OSObject *myFieldGetter();
-}
-
-
-// Note that the annotation only has to be applied to the function declaration.
-OSObject * MyClass::myFieldGetter() {
-  return f;
-}
-
- -

Attribute 'os_consumed'

- -

Similarly to ns_consumed attribute, -os_consumed (accessed through LIBKERN_CONSUMED) attribute, -applied to a parameter, -indicates that the call to the function consumes the parameter: -the callee should either release it or store it and release it in the destructor, -while the caller should assume one is subtracted from the reference count -after the call.

- -
-IOReturn addToList(LIBKERN_CONSUMED IOPMinformee *newInformee);
-
- -

Attribute 'os_consumes_this'

- -

Similarly to ns_consumes_self, -the os_consumes_self attribute indicates that the method call -consumes the implicit this argument: the caller -should assume one was subtracted from the reference count of the object -after the call, and the callee has on obligation to either -release the argument, or store it and eventually release it in the -destructor.

- -
-void addThisToList(OSArray *givenList) LIBKERN_CONSUMES_THIS;
-
- -

Out Parameters

- -A function can also return an object to a caller by a means of an out parameter -(a pointer-to-OSObject-pointer is passed, and a callee writes a pointer to an -object into an argument). -Currently the analyzer does not track unannotated out -parameters by default, but with annotations we distinguish four separate cases: - -

1. Non-retained out parameters, identified using - LIBKERN_RETURNS_NOT_RETAINED applied to parameters, e.g.:

- -
-void getterViaOutParam(LIBKERN_RETURNS_NOT_RETAINED OSObject **obj)
-
- -

Such functions write a non-retained object into an out parameter, and the -caller has no further obligations.

- -

2. Retained out parameters, -identified using LIBKERN_RETURNS_RETAINED:

-
-void getterViaOutParam(LIBKERN_RETURNS_NOT_RETAINED OSObject **obj)
-
-

-In such cases a retained object is written into an out parameter, which the caller has then to release in order to avoid a leak. -

- -

These two cases are simple - but in practice a functions returning an out-parameter usually also return a return code, and then an out parameter may or may not be written, which conditionally depends on the exit code, e.g.:

- -
-bool maybeCreateObject(LIBKERN_RETURNS_RETAINED OSObject **obj);
-
- -

For such functions, the usual semantics is that an object is written into on "success", and not written into on "failure".

- -

For LIBKERN_RETURNS_RETAINED we assume the following definition of -success:

- -

For functions returning OSReturn or IOReturn -(any typedef to kern_return_t) success is defined as having an output of zero (kIOReturnSuccess is zero). -For all others, success is non-zero (e.g. non-nullptr for pointers)

- -

3. Retained out parameters on zero return -The annotation LIBKERN_RETURNS_RETAINED_ON_ZERO states -that a retained object is written into if and only if the function returns a zero value:

- -
-bool OSUnserializeXML(void *data, LIBKERN_RETURNS_RETAINED_ON_ZERO OSString **errString);
-
- -

Then the caller has to release an object if the function has returned zero.

- -

4. Retained out parameters on non-zero return -Similarly, LIBKERN_RETURNS_RETAINED_ON_NONZERO specifies that a -retained object is written into the parameter if and only if the function has -returned a non-zero value.

- -

Note that for non-retained out parameters conditionals do not matter, as the -caller has no obligations regardless of whether an object is written into or -not.

- - -

Custom Assertion Handlers

- - -

The analyzer exploits code assertions by pruning off paths where the -assertion condition is false. The idea is capture any program invariants -specified in the assertion that the developer may know but is not immediately -apparent in the code itself. In this way assertions make implicit assumptions -explicit in the code, which not only makes the analyzer more accurate when -finding bugs, but can help others better able to understand your code as well. -It can also help remove certain kinds of analyzer false positives by pruning off -false paths.

- -

In order to exploit assertions, however, the analyzer must understand when it -encounters an "assertion handler." Typically assertions are -implemented with a macro, with the macro performing a check for the assertion -condition and, when the check fails, calling an assertion handler. For example, consider the following code -fragment:

- -
-void foo(int *p) {
-  assert(p != NULL);
-}
-
- -

When this code is preprocessed on Mac OS X it expands to the following:

- -
-void foo(int *p) {
-  (__builtin_expect(!(p != NULL), 0) ? __assert_rtn(__func__, "t.c", 4, "p != NULL") : (void)0);
-}
-
- -

In this example, the assertion handler is __assert_rtn. When called, -most assertion handlers typically print an error and terminate the program. The -analyzer can exploit such semantics by ending the analysis of a path once it -hits a call to an assertion handler.

- -

The trick, however, is that the analyzer needs to know that a called function -is an assertion handler; otherwise the analyzer might assume the function call -returns and it will continue analyzing the path where the assertion condition -failed. This can lead to false positives, as the assertion condition usually -implies a safety condition (e.g., a pointer is not null) prior to performing -some action that depends on that condition (e.g., dereferencing a pointer).


-The analyzer knows about several well-known assertion handlers, but it can also automatically infer that a function should be treated as an assertion handler if it is annotated with the 'noreturn' attribute or the (Clang-specific) 'analyzer_noreturn' attribute. Note that, currently, clang does not support these attributes on Objective-C and C++ methods.


-Attribute 'noreturn'


-The 'noreturn' attribute is a GCC attribute that can be placed on the declarations of functions. It means exactly what its name implies: a function with a 'noreturn' attribute should never return.


-Specific details of the syntax for using the 'noreturn' attribute can be found in GCC's documentation.


-Not only does the analyzer exploit this information when pruning false paths, but the compiler also takes it seriously and will generate different (and possibly better optimized) code under the assumption that the function does not return.


-Example


-On Mac OS X, the function prototype for __assert_rtn (declared in assert.h) is specifically annotated with the 'noreturn' attribute:

-void __assert_rtn(const char *, const char *, int, const char *) __attribute__((__noreturn__));
-

-Attribute 'analyzer_noreturn' (Clang-specific)


-The Clang-specific 'analyzer_noreturn' attribute is almost identical to 'noreturn' except that it is ignored by the compiler for the purposes of code generation.


-This attribute is useful for annotating assertion handlers that can actually return, but that, for the purposes of analysis, we want to treat as if they do not return.


-Because this attribute is Clang-specific, its use should be guarded by preprocessor macros.


-Example

-#ifndef CLANG_ANALYZER_NORETURN
-#if __has_feature(attribute_analyzer_noreturn)
-#define CLANG_ANALYZER_NORETURN __attribute__((analyzer_noreturn))
-#else
-#define CLANG_ANALYZER_NORETURN
-#endif
-#endif
-
-void my_assert_rtn(const char *, const char *, int, const char *) CLANG_ANALYZER_NORETURN;
-
- + + From 8b4561467e828c3755ad2942715fac00de2be4a7 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Mon, 13 Jan 2025 13:07:19 +0000 Subject: [PATCH 266/408] LAA: add missed swap when inverting src, sink (#122254) When inverting source and sink on a negative induction step, the types of the source and sink should also be swapped. This fixes a bug in the code that follows, that computes properties based on these types. With 234cc40 ([LAA] Limit no-overlap check to at least one loop-invariant accesses.), that code is guarded by a loop-invariant condition: however, the commit did not add any new tests exercising the guarded code, and hence the bugfix in this patch requires additional tests to exercise that guarded codepath. --- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 1 + .../LoopAccessAnalysis/depend_diff_types.ll | 76 +++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 38e9145826c08..2a68979add666 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -1921,6 +1921,7 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize( if (StrideAPtr && *StrideAPtr < 0) { std::swap(Src, Sink); std::swap(AInst, BInst); + std::swap(ATy, BTy); std::swap(StrideAPtr, StrideBPtr); } diff --git a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll index 0bdcc35790148..e855578e794fa 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll @@ -194,3 +194,79 @@ loop: exit: ret void } + +; In the following test, the sink is loop-invariant. + +define void @type_size_equivalence_sink_loopinv(ptr nocapture %vec, i64 %n) { +; CHECK-LABEL: 'type_size_equivalence_sink_loopinv' +; CHECK-NEXT: loop: +; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.n = getelementptr inbounds i64, ptr %vec, i64 %n + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + + %gep.iv = getelementptr i64, ptr %vec, i64 %iv + %ld.i64 = load i64, ptr %gep.iv, align 8 + + %ld.i64.i32 = trunc i64 %ld.i64 to i32 + store i32 %ld.i64.i32, ptr %gep.n, align 8 + + %iv.next = add nuw nsw i64 %iv, 1 + %cond = icmp eq i64 %iv.next, %n + br i1 %cond, label %exit, label %loop + +exit: + ret void +} + +; Variant of the above, with a negative induction step and a gep exposing +; type-mismtach. + +define void @type_size_equivalence_sink_loopinv_negind(ptr nocapture %vec, i64 %n) { +; CHECK-LABEL: 'type_size_equivalence_sink_loopinv_negind' +; CHECK-NEXT: loop: +; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %minus.n = sub nsw i64 0, %n + %gep.minus.n = getelementptr inbounds i64, ptr %vec, i64 %minus.n + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + + %minus.iv = sub nsw i64 0, %iv + %gep.minus.iv = getelementptr i64, ptr %vec, i64 %minus.iv + %gep.minus.iv.4 = getelementptr i8, ptr %gep.minus.iv, i64 -4 + %ld.i64 = load i64, ptr %gep.minus.iv.4, align 8 + + %ld.i64.i32 = trunc i64 %ld.i64 to i32 + store i32 %ld.i64.i32, ptr %gep.minus.n, align 8 + + %iv.next = add nuw nsw i64 %iv, 1 + %cond = icmp eq i64 %iv.next, %n + br i1 %cond, label %exit, label %loop + +exit: + ret void +} From 418f5cd6c2b99c8cd45113aada8e83446ebdc350 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Mon, 13 Jan 2025 21:15:03 +0800 Subject: [PATCH 267/408] [LV][EVL] Pre-commit test case for fixed-order recurrence with EVL tail folding. (NFC) (#122456) This test case is from SingleSource/UnitTests/Vectorizer/recurrences.test. Pre-commit for #122458 --- ...ce-tail-with-evl-fixed-order-recurrence.ll | 549 ++++++++++++++++++ 1 file changed, 549 insertions(+) create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll new file mode 100644 index 0000000000000..9f8cf169c0593 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll @@ -0,0 +1,549 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=loop-vectorize \ +; RUN: -prefer-inloop-reductions \ +; RUN: -force-tail-folding-style=data-with-evl \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s| FileCheck %s --check-prefix=IF-EVL + +; RUN: opt -passes=loop-vectorize \ +; RUN: -prefer-inloop-reductions \ +; RUN: -force-tail-folding-style=none \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \ +; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s| FileCheck %s --check-prefix=NO-VP + +define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { +; IF-EVL-LABEL: define void @first_order_recurrence( +; IF-EVL-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0:[0-9]+]] { +; IF-EVL-NEXT: [[ENTRY:.*]]: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[TC]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IF-EVL: [[VECTOR_PH]]: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[TC]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4 +; IF-EVL-NEXT: [[TMP11:%.*]] = sub i32 [[TMP10]], 1 +; 
IF-EVL-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 33, i32 [[TMP11]] +; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] +; IF-EVL: [[VECTOR_BODY]]: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[TC]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP13]] +; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP14]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], splat (i1 true), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR]], [[VP_OP_LOAD]], i32 -1) +; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.add.nxv4i32( [[TMP16]], [[VP_OP_LOAD]], splat (i1 true), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP13]] +; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP17]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_OP]], ptr align 4 [[TMP18]], splat (i1 true), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL: [[MIDDLE_BLOCK]]: +; IF-EVL-NEXT: [[TMP21:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], 4 +; IF-EVL-NEXT: [[TMP23:%.*]] = sub i32 [[TMP22]], 1 +; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[VP_OP_LOAD]], i32 [[TMP23]] +; IF-EVL-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; IF-EVL: [[SCALAR_PH]]: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] +; IF-EVL-NEXT: br label %[[FOR_BODY:.*]] +; IF-EVL: [[FOR_BODY]]: +; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP24:%.*]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]] +; IF-EVL-NEXT: [[TMP24]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR1]], [[TMP24]] +; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS]] +; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 +; IF-EVL-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[INDVARS]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[TC]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; IF-EVL: [[FOR_END]]: +; IF-EVL-NEXT: ret void +; +; NO-VP-LABEL: define void 
@first_order_recurrence( +; NO-VP-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0:[0-9]+]] { +; NO-VP-NEXT: [[ENTRY:.*]]: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TC]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; NO-VP: [[VECTOR_PH]]: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TC]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[TC]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 4 +; NO-VP-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 33, i32 [[TMP8]] +; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]] +; NO-VP: [[VECTOR_BODY]]: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP9]] +; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP10]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD]] = load , ptr [[TMP11]], align 4 +; NO-VP-NEXT: [[TMP12:%.*]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR]], [[WIDE_LOAD]], i32 -1) +; NO-VP-NEXT: [[TMP13:%.*]] = add nsw [[TMP12]], [[WIDE_LOAD]] +; NO-VP-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP9]] +; NO-VP-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP14]], i32 0 +; NO-VP-NEXT: store [[TMP13]], ptr [[TMP15]], align 4 +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; NO-VP: [[MIDDLE_BLOCK]]: +; NO-VP-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], 4 +; NO-VP-NEXT: [[TMP19:%.*]] = sub i32 [[TMP18]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[WIDE_LOAD]], i32 [[TMP19]] +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TC]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; NO-VP: [[SCALAR_PH]]: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; NO-VP-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] +; NO-VP-NEXT: br label %[[FOR_BODY:.*]] +; NO-VP: [[FOR_BODY]]: +; NO-VP-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] +; NO-VP-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP20:%.*]], %[[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]] +; NO-VP-NEXT: [[TMP20]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR1]], [[TMP20]] +; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS]] +; NO-VP-NEXT: store i32 [[ADD]], 
ptr [[ARRAYIDX2]], align 4 +; NO-VP-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[INDVARS]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[TC]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; NO-VP: [[FOR_END]]: +; NO-VP-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %indvars = phi i64 [ 0, %entry ], [ %indvars.next, %for.body ] + %for1 = phi i32 [ 33, %entry ], [ %0, %for.body ] + %arrayidx = getelementptr inbounds nuw i32, ptr %A, i64 %indvars + %0 = load i32, ptr %arrayidx, align 4 + %add = add nsw i32 %for1, %0 + %arrayidx2 = getelementptr inbounds nuw i32, ptr %B, i64 %indvars + store i32 %add, ptr %arrayidx2, align 4 + %indvars.next = add nuw nsw i64 %indvars, 1 + %exitcond.not = icmp eq i64 %indvars.next, %TC + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret void +} + +define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { +; IF-EVL-LABEL: define void @second_order_recurrence( +; IF-EVL-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0]] { +; IF-EVL-NEXT: [[ENTRY:.*]]: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[TC]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IF-EVL: [[VECTOR_PH]]: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[TC]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4 +; IF-EVL-NEXT: [[TMP11:%.*]] = sub i32 [[TMP10]], 1 +; IF-EVL-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 33, i32 [[TMP11]] +; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP13:%.*]] = mul i32 [[TMP12]], 4 +; IF-EVL-NEXT: [[TMP14:%.*]] = sub i32 [[TMP13]], 1 +; IF-EVL-NEXT: [[VECTOR_RECUR_INIT1:%.*]] = insertelement poison, i32 22, i32 [[TMP14]] +; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] +; IF-EVL: [[VECTOR_BODY]]: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VECTOR_RECUR2:%.*]] = phi [ [[VECTOR_RECUR_INIT1]], %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[TC]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP15:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP16:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP16]] +; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP17]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], splat (i1 true), i32 [[TMP15]]) 
+; IF-EVL-NEXT: [[TMP19]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR]], [[VP_OP_LOAD]], i32 -1) +; IF-EVL-NEXT: [[TMP20:%.*]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR2]], [[TMP19]], i32 -1) +; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.add.nxv4i32( [[TMP19]], [[TMP20]], splat (i1 true), i32 [[TMP15]]) +; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP16]] +; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP21]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_OP]], ptr align 4 [[TMP22]], splat (i1 true), i32 [[TMP15]]) +; IF-EVL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP15]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; IF-EVL: [[MIDDLE_BLOCK]]: +; IF-EVL-NEXT: [[TMP25:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP26:%.*]] = mul i32 [[TMP25]], 4 +; IF-EVL-NEXT: [[TMP27:%.*]] = sub i32 [[TMP26]], 1 +; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[VP_OP_LOAD]], i32 [[TMP27]] +; IF-EVL-NEXT: [[TMP28:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], 4 +; IF-EVL-NEXT: [[TMP30:%.*]] = sub i32 [[TMP29]], 1 +; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement [[TMP19]], i32 [[TMP30]] +; IF-EVL-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; IF-EVL: [[SCALAR_PH]]: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT4:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] +; IF-EVL-NEXT: br label %[[FOR_BODY:.*]] +; IF-EVL: [[FOR_BODY]]: +; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP31:%.*]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT4]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]] +; IF-EVL-NEXT: [[TMP31]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR1]], [[FOR2]] +; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS]] +; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 +; IF-EVL-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[INDVARS]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[TC]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; IF-EVL: [[FOR_END]]: +; IF-EVL-NEXT: ret void +; +; NO-VP-LABEL: define void @second_order_recurrence( +; NO-VP-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0]] { +; NO-VP-NEXT: [[ENTRY:.*]]: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TC]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; NO-VP: [[VECTOR_PH]]: +; NO-VP-NEXT: 
[[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TC]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[TC]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 4 +; NO-VP-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 33, i32 [[TMP8]] +; NO-VP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4 +; NO-VP-NEXT: [[TMP11:%.*]] = sub i32 [[TMP10]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_INIT1:%.*]] = insertelement poison, i32 22, i32 [[TMP11]] +; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]] +; NO-VP: [[VECTOR_BODY]]: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VECTOR_RECUR2:%.*]] = phi [ [[VECTOR_RECUR_INIT1]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP12]] +; NO-VP-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP13]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD]] = load , ptr [[TMP14]], align 4 +; NO-VP-NEXT: [[TMP15]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR]], [[WIDE_LOAD]], i32 -1) +; NO-VP-NEXT: [[TMP16:%.*]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR2]], [[TMP15]], i32 -1) +; NO-VP-NEXT: [[TMP17:%.*]] = add nsw [[TMP15]], [[TMP16]] +; NO-VP-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP12]] +; NO-VP-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP18]], i32 0 +; NO-VP-NEXT: store [[TMP17]], ptr [[TMP19]], align 4 +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; NO-VP: [[MIDDLE_BLOCK]]: +; NO-VP-NEXT: [[TMP21:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], 4 +; NO-VP-NEXT: [[TMP23:%.*]] = sub i32 [[TMP22]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[WIDE_LOAD]], i32 [[TMP23]] +; NO-VP-NEXT: [[TMP24:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP25:%.*]] = mul i32 [[TMP24]], 4 +; NO-VP-NEXT: [[TMP26:%.*]] = sub i32 [[TMP25]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement [[TMP15]], i32 [[TMP26]] +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TC]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; NO-VP: [[SCALAR_PH]]: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; NO-VP-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] +; NO-VP-NEXT: [[SCALAR_RECUR_INIT4:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] +; NO-VP-NEXT: br label %[[FOR_BODY:.*]] +; NO-VP: [[FOR_BODY]]: +; NO-VP-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] +; NO-VP-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] 
], [ [[TMP27:%.*]], %[[FOR_BODY]] ] +; NO-VP-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT4]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]] +; NO-VP-NEXT: [[TMP27]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR1]], [[FOR2]] +; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS]] +; NO-VP-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 +; NO-VP-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[INDVARS]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[TC]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; NO-VP: [[FOR_END]]: +; NO-VP-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %indvars = phi i64 [ 0, %entry ], [ %indvars.next, %for.body ] + %for1 = phi i32 [ 33, %entry ], [ %0, %for.body ] + %for2 = phi i32 [ 22, %entry ], [ %for1, %for.body ] + %arrayidx = getelementptr inbounds nuw i32, ptr %A, i64 %indvars + %0 = load i32, ptr %arrayidx, align 4 + %add = add nsw i32 %for1, %for2 + %arrayidx2 = getelementptr inbounds nuw i32, ptr %B, i64 %indvars + store i32 %add, ptr %arrayidx2, align 4 + %indvars.next = add nuw nsw i64 %indvars, 1 + %exitcond.not = icmp eq i64 %indvars.next, %TC + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret void +} + +define void @third_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { +; IF-EVL-LABEL: define void @third_order_recurrence( +; IF-EVL-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0]] { +; IF-EVL-NEXT: [[ENTRY:.*]]: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[TC]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IF-EVL: [[VECTOR_PH]]: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[TC]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4 +; IF-EVL-NEXT: [[TMP11:%.*]] = sub i32 [[TMP10]], 1 +; IF-EVL-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 33, i32 [[TMP11]] +; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP13:%.*]] = mul i32 [[TMP12]], 4 +; IF-EVL-NEXT: [[TMP14:%.*]] = sub i32 [[TMP13]], 1 +; IF-EVL-NEXT: [[VECTOR_RECUR_INIT1:%.*]] = insertelement poison, i32 22, i32 [[TMP14]] +; IF-EVL-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], 4 +; IF-EVL-NEXT: [[TMP17:%.*]] = sub i32 [[TMP16]], 1 +; IF-EVL-NEXT: [[VECTOR_RECUR_INIT3:%.*]] = insertelement poison, i32 11, i32 [[TMP17]] +; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] +; IF-EVL: [[VECTOR_BODY]]: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ 
[[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VECTOR_RECUR2:%.*]] = phi [ [[VECTOR_RECUR_INIT1]], %[[VECTOR_PH]] ], [ [[TMP22:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VECTOR_RECUR4:%.*]] = phi [ [[VECTOR_RECUR_INIT3]], %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[TC]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP18:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP19:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP19]] +; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP20]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP21]], splat (i1 true), i32 [[TMP18]]) +; IF-EVL-NEXT: [[TMP22]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR]], [[VP_OP_LOAD]], i32 -1) +; IF-EVL-NEXT: [[TMP23]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR2]], [[TMP22]], i32 -1) +; IF-EVL-NEXT: [[TMP24:%.*]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR4]], [[TMP23]], i32 -1) +; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.add.nxv4i32( [[TMP23]], [[TMP24]], splat (i1 true), i32 [[TMP18]]) +; IF-EVL-NEXT: [[VP_OP5:%.*]] = call @llvm.vp.add.nxv4i32( [[VP_OP]], [[TMP22]], splat (i1 true), i32 [[TMP18]]) +; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP19]] +; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP25]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_OP5]], ptr align 4 [[TMP26]], splat (i1 true), i32 [[TMP18]]) +; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP18]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP27]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IF-EVL: [[MIDDLE_BLOCK]]: +; IF-EVL-NEXT: [[TMP29:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP30:%.*]] = mul i32 [[TMP29]], 4 +; IF-EVL-NEXT: [[TMP31:%.*]] = sub i32 [[TMP30]], 1 +; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[VP_OP_LOAD]], i32 [[TMP31]] +; IF-EVL-NEXT: [[TMP32:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP33:%.*]] = mul i32 [[TMP32]], 4 +; IF-EVL-NEXT: [[TMP34:%.*]] = sub i32 [[TMP33]], 1 +; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT6:%.*]] = extractelement [[TMP22]], i32 [[TMP34]] +; IF-EVL-NEXT: [[TMP35:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP36:%.*]] = mul i32 [[TMP35]], 4 +; IF-EVL-NEXT: [[TMP37:%.*]] = sub i32 [[TMP36]], 1 +; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement [[TMP23]], i32 [[TMP37]] +; IF-EVL-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; IF-EVL: [[SCALAR_PH]]: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT8:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT9:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT7]], %[[MIDDLE_BLOCK]] ], [ 11, %[[ENTRY]] ] +; IF-EVL-NEXT: br label %[[FOR_BODY:.*]] +; 
IF-EVL: [[FOR_BODY]]: +; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP38:%.*]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT8]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR3:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT9]], %[[SCALAR_PH]] ], [ [[FOR2]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]] +; IF-EVL-NEXT: [[TMP38]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR2]], [[FOR3]] +; IF-EVL-NEXT: [[ADD1:%.*]] = add i32 [[ADD]], [[FOR1]] +; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS]] +; IF-EVL-NEXT: store i32 [[ADD1]], ptr [[ARRAYIDX2]], align 4 +; IF-EVL-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[INDVARS]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[TC]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; IF-EVL: [[FOR_END]]: +; IF-EVL-NEXT: ret void +; +; NO-VP-LABEL: define void @third_order_recurrence( +; NO-VP-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0]] { +; NO-VP-NEXT: [[ENTRY:.*]]: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TC]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; NO-VP: [[VECTOR_PH]]: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TC]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[TC]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 4 +; NO-VP-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 33, i32 [[TMP8]] +; NO-VP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4 +; NO-VP-NEXT: [[TMP11:%.*]] = sub i32 [[TMP10]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_INIT1:%.*]] = insertelement poison, i32 22, i32 [[TMP11]] +; NO-VP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP13:%.*]] = mul i32 [[TMP12]], 4 +; NO-VP-NEXT: [[TMP14:%.*]] = sub i32 [[TMP13]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_INIT3:%.*]] = insertelement poison, i32 11, i32 [[TMP14]] +; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]] +; NO-VP: [[VECTOR_BODY]]: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VECTOR_RECUR2:%.*]] = phi [ [[VECTOR_RECUR_INIT1]], %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VECTOR_RECUR4:%.*]] = phi [ [[VECTOR_RECUR_INIT3]], %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP15]] +; NO-VP-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP16]], i32 0 +; NO-VP-NEXT: 
[[WIDE_LOAD]] = load , ptr [[TMP17]], align 4 +; NO-VP-NEXT: [[TMP18]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR]], [[WIDE_LOAD]], i32 -1) +; NO-VP-NEXT: [[TMP19]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR2]], [[TMP18]], i32 -1) +; NO-VP-NEXT: [[TMP20:%.*]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR4]], [[TMP19]], i32 -1) +; NO-VP-NEXT: [[TMP21:%.*]] = add nsw [[TMP19]], [[TMP20]] +; NO-VP-NEXT: [[TMP22:%.*]] = add [[TMP21]], [[TMP18]] +; NO-VP-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP15]] +; NO-VP-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP23]], i32 0 +; NO-VP-NEXT: store [[TMP22]], ptr [[TMP24]], align 4 +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; NO-VP: [[MIDDLE_BLOCK]]: +; NO-VP-NEXT: [[TMP26:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP27:%.*]] = mul i32 [[TMP26]], 4 +; NO-VP-NEXT: [[TMP28:%.*]] = sub i32 [[TMP27]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[WIDE_LOAD]], i32 [[TMP28]] +; NO-VP-NEXT: [[TMP29:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP30:%.*]] = mul i32 [[TMP29]], 4 +; NO-VP-NEXT: [[TMP31:%.*]] = sub i32 [[TMP30]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_EXTRACT5:%.*]] = extractelement [[TMP18]], i32 [[TMP31]] +; NO-VP-NEXT: [[TMP32:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP33:%.*]] = mul i32 [[TMP32]], 4 +; NO-VP-NEXT: [[TMP34:%.*]] = sub i32 [[TMP33]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_EXTRACT6:%.*]] = extractelement [[TMP19]], i32 [[TMP34]] +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TC]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; NO-VP: [[SCALAR_PH]]: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; NO-VP-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] +; NO-VP-NEXT: [[SCALAR_RECUR_INIT7:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT5]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] +; NO-VP-NEXT: [[SCALAR_RECUR_INIT8:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 11, %[[ENTRY]] ] +; NO-VP-NEXT: br label %[[FOR_BODY:.*]] +; NO-VP: [[FOR_BODY]]: +; NO-VP-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] +; NO-VP-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP35:%.*]], %[[FOR_BODY]] ] +; NO-VP-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT7]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] +; NO-VP-NEXT: [[FOR3:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT8]], %[[SCALAR_PH]] ], [ [[FOR2]], %[[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]] +; NO-VP-NEXT: [[TMP35]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR2]], [[FOR3]] +; NO-VP-NEXT: [[ADD1:%.*]] = add i32 [[ADD]], [[FOR1]] +; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS]] +; NO-VP-NEXT: store i32 [[ADD1]], ptr [[ARRAYIDX2]], align 4 +; NO-VP-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[INDVARS]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[TC]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; NO-VP: 
[[FOR_END]]: +; NO-VP-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %indvars = phi i64 [ 0, %entry ], [ %indvars.next, %for.body ] + %for1 = phi i32 [ 33, %entry ], [ %0, %for.body ] + %for2 = phi i32 [ 22, %entry ], [ %for1, %for.body ] + %for3 = phi i32 [ 11, %entry ], [ %for2, %for.body ] + %arrayidx = getelementptr inbounds nuw i32, ptr %A, i64 %indvars + %0 = load i32, ptr %arrayidx, align 4 + %add = add nsw i32 %for2, %for3 + %add1 = add i32 %add, %for1 + %arrayidx2 = getelementptr inbounds nuw i32, ptr %B, i64 %indvars + store i32 %add1, ptr %arrayidx2, align 4 + %indvars.next = add nuw nsw i64 %indvars, 1 + %exitcond.not = icmp eq i64 %indvars.next, %TC + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret void +} + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.vectorize.enable", i1 true} +;. +; IF-EVL: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; IF-EVL: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; IF-EVL: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; IF-EVL: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; IF-EVL: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; IF-EVL: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +;. +; NO-VP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; NO-VP: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; NO-VP: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; NO-VP: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; NO-VP: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; NO-VP: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; NO-VP: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; NO-VP: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +;. From ed80d0c2ae41888d8d0be90e73026bc126d82161 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Don=C3=A1t=20Nagy?= Date: Mon, 13 Jan 2025 14:31:21 +0100 Subject: [PATCH 268/408] [NFC][analyzer][docs] Restore/remove orphaned images (#122481) When commit 61a76f58ebf1 converted the static analyzer FAQ from HTML to RST, it accidentally left out three images (`example_*.png`) that were previously present in that document. This commit re-adds those three images to the FAQ (and moves them to the directory for the image assets of the RST documentation). Moreover commit 093aaca2b0ad _copied_ the file `scan_build_cmd.png` to the RST documentation directory instead of just moving it; so this commit removes its "old" copy which is no longer used (because the old HTML-based documentation file was replaced by a stub that redirects to the RST docs). 
---
 .../analyzer/images/example_custom_assert.png | Bin
 .../analyzer/images/example_null_pointer.png  | Bin
 .../analyzer/images/example_use_assert.png    | Bin
 clang/docs/analyzer/user-docs/FAQ.rst         |   6 ++++++
 clang/www/analyzer/images/scan_build_cmd.png  | Bin 29669 -> 0 bytes
 5 files changed, 6 insertions(+)
 rename clang/{www => docs}/analyzer/images/example_custom_assert.png (100%)
 rename clang/{www => docs}/analyzer/images/example_null_pointer.png (100%)
 rename clang/{www => docs}/analyzer/images/example_use_assert.png (100%)
 delete mode 100644 clang/www/analyzer/images/scan_build_cmd.png

diff --git a/clang/www/analyzer/images/example_custom_assert.png b/clang/docs/analyzer/images/example_custom_assert.png
similarity index 100%
rename from clang/www/analyzer/images/example_custom_assert.png
rename to clang/docs/analyzer/images/example_custom_assert.png
diff --git a/clang/www/analyzer/images/example_null_pointer.png b/clang/docs/analyzer/images/example_null_pointer.png
similarity index 100%
rename from clang/www/analyzer/images/example_null_pointer.png
rename to clang/docs/analyzer/images/example_null_pointer.png
diff --git a/clang/www/analyzer/images/example_use_assert.png b/clang/docs/analyzer/images/example_use_assert.png
similarity index 100%
rename from clang/www/analyzer/images/example_use_assert.png
rename to clang/docs/analyzer/images/example_use_assert.png
diff --git a/clang/docs/analyzer/user-docs/FAQ.rst b/clang/docs/analyzer/user-docs/FAQ.rst
index e1147916a767c..58eac783efccd 100644
--- a/clang/docs/analyzer/user-docs/FAQ.rst
+++ b/clang/docs/analyzer/user-docs/FAQ.rst
@@ -9,6 +9,8 @@ Custom Assertions

 Q: How do I tell the analyzer that I do not want the bug being reported here since my custom error handler will safely end the execution before the bug is reached?

+.. image:: ../images/example_custom_assert.png
+
 You can tell the analyzer that this path is unreachable by teaching it about your `custom assertion handlers `__. For example, you can modify the code segment as follows:

 .. code-block:: c
@@ -25,6 +27,8 @@ Null Pointer Dereference

 Q: The analyzer reports a null dereference, but I know that the pointer is never null. How can I tell the analyzer that a pointer can never be null?

+.. image:: ../images/example_null_pointer.png
+
 The reason the analyzer often thinks that a pointer can be null is that the preceding code compared it against null. If you are absolutely sure that it cannot be null, remove the preceding check and, preferably, add an assertion as well. For example:

 .. code-block:: c
@@ -143,6 +147,8 @@ Ensuring Loop Body Execution

 Q: The analyzer assumes that a loop body is never entered. How can I tell it that the loop body will be entered at least once?

+.. image:: ../images/example_use_assert.png
+
 In cases where you know that a loop will always be entered at least once, you can use assertions to inform the analyzer. For example:

 .. code-block:: c

diff --git a/clang/www/analyzer/images/scan_build_cmd.png b/clang/www/analyzer/images/scan_build_cmd.png
deleted file mode 100644
index 464fd4e129a2088c150577a8fd52cc302bb90ee1..0000000000000000000000000000000000000000
GIT binary patch
[base85-encoded binary data for the deleted scan_build_cmd.png (29669 bytes) omitted]
zU~3emKuzm>Y}Y^1v;D2?n0C`SY~@sRnZi+9@@^+;|E%~==2Vm$Mi$CLd(ZwwZvFtj{O(l1vWbgqdMd6>=3} zdqo?*pb^EZC<;;40ITH&S2NjOsUz`vS^xus0cho>5$SV{I&PsOuoTp{A z7&ys1Qn35i(4c%5%t$7^k!b88BIN+864rN!$KQ#)Z^}w`J=2H~a3}vt{y(VE7@ab>n-w!4 z_@md1h=}CEynI$i{S-h855j6eI5$K?fDbFUfgClZKhp2Hf-VOBWtL=8eGGmJ5H9(? zaqC-{$yHQZj`EQL<7Fzf&^PFaD^AKT6es=SM|MSYnRGm-(IeXY{t|f+YJUhW0}Lw6 zh>aMs0DjtOAa!({Cz9RmDm+$Ti*WKMs*t17hz#_%S<1n3Pr0a++rp+@rIjO|&NV!m zqeI+2l|-uB4SinI5)~jgii%wLot8Q5*6ZbH+1M(c@P^OSxhR2 zBiu9!WmtjoV*@rQs}YvS+xI?QDgbm&rTm}eDvPq=AV<&F8gSeK3t{8PeW}lj_^H}g zC2P;4Awf(V>9o+d+aqfp!69ghm(oNL#g2FR$MlidTibYvvXoe*RY`5yXg%qGm}-Ru za`tM*OsoG{Lp=iev^oK7)tia52-6u@-9(wcw{#L!wSJQ24UK0-; zp~O&%ie`0ajW6wHL#A zGBou0Gaw6~pm7(q@~Si%3Y7ts=0!vY_CgW%+dKC*B) z#ptQi*YO2ibr4eI;NQim%-vNq{T-H+zphVAVfFCv zIHao9T$3lsmk6^y?V$numaS;OG@X$UoueOgq*l&biLBfi2Oc$WgohqPCX2U9fX8#& z6TrHc?8idOd@7%e7hZu!<9hgz)q2<;e~d~_;2{fCiTK3kS{;AKDFvy{Z2fF1h?U%brXnxa6$yU}IHU|0?TagpPhoNB0%DE$FdgdIykk8bz!} zEP>eModrIh5=Zmufyf$TVnGbiHME&1D4v z7a}UkXWtW%j;7@_E|gXysqYj`L=cgJD7ajj4|IZ89Z`ZDjaD$oj=ZkN&hPM7DD(?WOwQ)6T1K!v#sF|=qAsB2B12(=8t*ZuL&ja!E# z#fCOUv!)XfzcgR!z-CPO+!csDdQ$Tv8{jTMESD8hbyKILLc?#)8Ar$k1dCx{Q;kwx z&?-ZEY?)Et#t>lY^AZz>gx2TS{6l=^juQr%sXO?VHp2RNxo13}mnLbJU38nNS5;d6Bo~Hh}~}$zj5@TpHHQ zG_efz?~yf+eu($sQ@77kiUJtmpOEc(Z z?m+I`@u&Xpyi$BWE~KfIPhz&G8G5ba!5r>2g($xIDNu=A8&e?|mCULFg2}2bGIlYt zCu6$;RW(Ov7oCD5wI13W`r+RAxHE{XRnMqMCh{G0m151MbTr6m*1Q=3#EY+b(a@A_ zq+XUyV?0hpRuN5pL`x-eBpq)bh>M6ofjnWx04-}(jlMvHSl$}MWP`57VY>gwTEI@6bHwM+nojw8O7kF ziw6&kRNOrSa!r=B4{bRHX=LnWx0^uS1c5yoDz5^f-Xj29u!3vGEG=JN7y@(>+5+7s z$}8;PSw3{%2+<^RX0|K416lD*Q^9Znxd!EJoR;YbVB$^hKp0C~#e1hAo;CH4VQ6GT zS(Q6)KMiv^t24#eI|V`DiS9{Tp|dJoLS?A`a=hZ4^4^o%Xwn; zCUv>9k{fS&qB;A4i8G>?8%yQbiN-b_LdMa{9u3+r>qrKS$1=*j=}Qz`{VeZ+Aj9+) zzwWBKtA{?y|3fA-m~nfYJgaN0y39sZ=+)A;;`ztSx1+Ebd(+tO$fAUVIpZdszxt(= zd?yg%sGgG1KjT#O(secaAD>+;AM2cr81wVLMjLp>)StqaR&dMJ7eCS%=Nk&L!4g73 zUw6#)15=$C;Z9!kx>pg5hPTK%+e*DZ=sc>wJu+_LB&3%utgO|HaiLhe#5sk@wqJS39kv8U0p}j2-FN&+wPiSRLPYzw$yci zXF>9H%MEJ1T7ofaKe zVCnZxh6T?4dop%iF{(no?j7zvL*u+hZ3eQ}~hY3?5TFz9d5igBhVU~>gr)-W>S zIUx*P%vnr=hc-3LSF9PgR4u*X38I*v@6F5E^9#QEcRrpL!QI57O=do7ARykTA0t8T zw<31N@zM4n+t7ziXvA&^{=a;z@x$x)E0G9fK$T&Vq8T75l#T3?G9dnoW+MZ+gcHX=*00-R)!rI9kCf#aeqyj?$xj4@aL)_ZhzHg#w77kA1C18@{MX(g!=iBG=& E2aqv@kpKVy From d6f7f2a5fa0e305253f936cdc8364eecfd568121 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Mon, 13 Jan 2025 14:33:26 +0100 Subject: [PATCH 269/408] Revert "[MachineCP] Correctly handle register masks and sub-registers (#122472)" This reverts commit e2a071ece58790f8dd4886e998033cab82e906fb. This causes a large compile-time regression. --- llvm/lib/CodeGen/MachineCopyPropagation.cpp | 136 ++++++++---------- .../CodeGen/AArch64/machine-cp-sub-reg.mir | 32 +---- 2 files changed, 57 insertions(+), 111 deletions(-) diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index d2579e2d1b44c..49ce4b660c3ae 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -164,91 +164,67 @@ class CopyTracker { Copies.erase(Unit); } - /// Clobber a single register unit, removing it from the tracker's copy maps. - void clobberRegUnit(MCRegUnit Unit, const TargetRegisterInfo &TRI, - const TargetInstrInfo &TII, bool UseCopyInstr) { - auto I = Copies.find(Unit); - if (I != Copies.end()) { - // When we clobber the source of a copy, we need to clobber everything - // it defined. 
- markRegsUnavailable(I->second.DefRegs, TRI); - // When we clobber the destination of a copy, we need to clobber the - // whole register it defined. - if (MachineInstr *MI = I->second.MI) { - std::optional CopyOperands = - isCopyInstr(*MI, TII, UseCopyInstr); - - MCRegister Def = CopyOperands->Destination->getReg().asMCReg(); - MCRegister Src = CopyOperands->Source->getReg().asMCReg(); - - markRegsUnavailable(Def, TRI); - - // Since we clobber the destination of a copy, the semantic of Src's - // "DefRegs" to contain Def is no longer effectual. We will also need - // to remove the record from the copy maps that indicates Src defined - // Def. Failing to do so might cause the target to miss some - // opportunities to further eliminate redundant copy instructions. - // Consider the following sequence during the - // ForwardCopyPropagateBlock procedure: - // L1: r0 = COPY r9 <- TrackMI - // L2: r0 = COPY r8 <- TrackMI (Remove r9 defined r0 from tracker) - // L3: use r0 <- Remove L2 from MaybeDeadCopies - // L4: early-clobber r9 <- Clobber r9 (L2 is still valid in tracker) - // L5: r0 = COPY r8 <- Remove NopCopy - for (MCRegUnit SrcUnit : TRI.regunits(Src)) { - auto SrcCopy = Copies.find(SrcUnit); - if (SrcCopy != Copies.end() && SrcCopy->second.LastSeenUseInCopy) { - // If SrcCopy defines multiple values, we only need - // to erase the record for Def in DefRegs. - for (auto itr = SrcCopy->second.DefRegs.begin(); - itr != SrcCopy->second.DefRegs.end(); itr++) { - if (*itr == Def) { - SrcCopy->second.DefRegs.erase(itr); - // If DefReg becomes empty after removal, we can remove the - // SrcCopy from the tracker's copy maps. We only remove those - // entries solely record the Def is defined by Src. If an - // entry also contains the definition record of other Def' - // registers, it cannot be cleared. - if (SrcCopy->second.DefRegs.empty() && !SrcCopy->second.MI) { - Copies.erase(SrcCopy); + /// Clobber a single register, removing it from the tracker's copy maps. + void clobberRegister(MCRegister Reg, const TargetRegisterInfo &TRI, + const TargetInstrInfo &TII, bool UseCopyInstr) { + for (MCRegUnit Unit : TRI.regunits(Reg)) { + auto I = Copies.find(Unit); + if (I != Copies.end()) { + // When we clobber the source of a copy, we need to clobber everything + // it defined. + markRegsUnavailable(I->second.DefRegs, TRI); + // When we clobber the destination of a copy, we need to clobber the + // whole register it defined. + if (MachineInstr *MI = I->second.MI) { + std::optional CopyOperands = + isCopyInstr(*MI, TII, UseCopyInstr); + + MCRegister Def = CopyOperands->Destination->getReg().asMCReg(); + MCRegister Src = CopyOperands->Source->getReg().asMCReg(); + + markRegsUnavailable(Def, TRI); + + // Since we clobber the destination of a copy, the semantic of Src's + // "DefRegs" to contain Def is no longer effectual. We will also need + // to remove the record from the copy maps that indicates Src defined + // Def. Failing to do so might cause the target to miss some + // opportunities to further eliminate redundant copy instructions. 
+ // Consider the following sequence during the + // ForwardCopyPropagateBlock procedure: + // L1: r0 = COPY r9 <- TrackMI + // L2: r0 = COPY r8 <- TrackMI (Remove r9 defined r0 from tracker) + // L3: use r0 <- Remove L2 from MaybeDeadCopies + // L4: early-clobber r9 <- Clobber r9 (L2 is still valid in tracker) + // L5: r0 = COPY r8 <- Remove NopCopy + for (MCRegUnit SrcUnit : TRI.regunits(Src)) { + auto SrcCopy = Copies.find(SrcUnit); + if (SrcCopy != Copies.end() && SrcCopy->second.LastSeenUseInCopy) { + // If SrcCopy defines multiple values, we only need + // to erase the record for Def in DefRegs. + for (auto itr = SrcCopy->second.DefRegs.begin(); + itr != SrcCopy->second.DefRegs.end(); itr++) { + if (*itr == Def) { + SrcCopy->second.DefRegs.erase(itr); + // If DefReg becomes empty after removal, we can remove the + // SrcCopy from the tracker's copy maps. We only remove those + // entries solely record the Def is defined by Src. If an + // entry also contains the definition record of other Def' + // registers, it cannot be cleared. + if (SrcCopy->second.DefRegs.empty() && !SrcCopy->second.MI) { + Copies.erase(SrcCopy); + } + break; } - break; } } } } + // Now we can erase the copy. + Copies.erase(I); } - // Now we can erase the copy. - Copies.erase(I); } } - /// Clobber a single register, removing it from the tracker's copy maps. - void clobberRegister(MCRegister Reg, const TargetRegisterInfo &TRI, - const TargetInstrInfo &TII, bool UseCopyInstr) { - for (MCRegUnit Unit : TRI.regunits(Reg)) { - clobberRegUnit(Unit, TRI, TII, UseCopyInstr); - } - } - - /// Clobber all registers which are not preserved by RegMask, removing them - /// from the tracker's copy maps. - void clobberRegistersExceptMask(const MachineOperand *RegMask, - const TargetRegisterInfo &TRI, - const TargetInstrInfo &TII, - bool UseCopyInstr) { - BitVector SafeRegUnits(TRI.getNumRegUnits()); - - for (unsigned SafeReg = 0, E = TRI.getNumRegs(); SafeReg < E; ++SafeReg) - if (!RegMask->clobbersPhysReg(SafeReg)) - for (auto SafeUnit : TRI.regunits(SafeReg)) - SafeRegUnits.set(SafeUnit); - - for (unsigned Unit = 0, E = TRI.getNumRegUnits(); Unit < E; ++Unit) - if (!SafeRegUnits.test(Unit)) - clobberRegUnit(Unit, TRI, TII, UseCopyInstr); - } - /// Track copy's src users, and return false if that can't be done. /// We can only track if we have a COPY instruction which source is /// the same as the Reg. @@ -984,10 +960,6 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) { // a large set of registers. Treat clobbered registers the same way as // defined registers. if (RegMask) { - // Invalidate all entries in the copy map which are not preserved by this - // register mask. - Tracker.clobberRegistersExceptMask(RegMask, *TRI, *TII, UseCopyInstr); - // Erase any MaybeDeadCopies whose destination register is clobbered. for (SmallSetVector::iterator DI = MaybeDeadCopies.begin(); @@ -1006,6 +978,10 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) { LLVM_DEBUG(dbgs() << "MCP: Removing copy due to regmask clobbering: "; MaybeDead->dump()); + // Make sure we invalidate any entries in the copy maps before erasing + // the instruction. + Tracker.clobberRegister(Reg, *TRI, *TII, UseCopyInstr); + // erase() will return the next valid iterator pointing to the next // element after the erased one. 
DI = MaybeDeadCopies.erase(DI); diff --git a/llvm/test/CodeGen/AArch64/machine-cp-sub-reg.mir b/llvm/test/CodeGen/AArch64/machine-cp-sub-reg.mir index e7865569c75bd..5b379c2bd5629 100644 --- a/llvm/test/CodeGen/AArch64/machine-cp-sub-reg.mir +++ b/llvm/test/CodeGen/AArch64/machine-cp-sub-reg.mir @@ -1,16 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 -# RUN: llc -o - %s --run-pass=machine-cp -mcp-use-is-copy-instr -mtriple=arm64-apple-macos | FileCheck %s - ---- | - declare void @foo() - - define void @test() { - unreachable - } - define void @test2() { - unreachable - } -... +# RUN: llc -o - %s --run-pass=machine-cp -mcp-use-is-copy-instr -mtriple=arm64-apple-macos --verify-machineinstrs | FileCheck %s --- name: test @@ -41,22 +30,3 @@ body: | RET undef $lr, implicit $x0 ... ---- -name: test2 -tracksRegLiveness: true -body: | - bb.0: - liveins: $q14, $d29, $x0, $x1 - ; CHECK-LABEL: name: test2 - ; CHECK: liveins: $q14, $d29, $x0, $x1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $d8 = COPY killed renamable $d29 - ; CHECK-NEXT: BL @foo, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp - ; CHECK-NEXT: renamable $b0 = SMAXVv8i8v killed renamable $d8, implicit-def $q0 - ; CHECK-NEXT: RET_ReallyLR implicit $b0 - renamable $q8 = COPY renamable $q14 - renamable $d8 = COPY killed renamable $d29 - BL @foo, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp - renamable $b0 = SMAXVv8i8v killed renamable $d8, implicit-def $q0 - RET_ReallyLR implicit $b0 -... From 22e9024c9f374c0c740647829050c289673dbb11 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Mon, 13 Jan 2025 14:40:25 +0100 Subject: [PATCH 270/408] [IR] Introduce captures attribute (#116990) This introduces the `captures` attribute as described in: https://discourse.llvm.org/t/rfc-improvements-to-capture-tracking/81420 This initial patch only introduces the IR/bitcode support for the attribute and its in-memory representation as `CaptureInfo`. This will be followed by a patch to upgrade and remove the `nocapture` attribute, and then by actual inference/analysis support. Based on the RFC feedback, I've used a syntax similar to the `memory` attribute, though the only "location" that can be specified is `ret`. I've added some pretty extensive documentation to LangRef on the semantics. One non-obvious bit here is that using ptrtoint will not result in a "return-only" capture, even if the ptrtoint result is only used in the return value. Without this requirement we wouldn't be able to continue ordinary capture analysis on the return value. 
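As a quick, illustrative sketch of the accepted syntax (these two
functions are examples written for this description, not part of the
patch):

  ; %p may be captured only through the return value.
  define ptr @offset(ptr captures(ret: address, provenance) %p) {
    %q = getelementptr i8, ptr %p, i64 4
    ret ptr %q
  }

  ; Only whether %p is null can escape.
  define i1 @is_null(ptr captures(address_is_null) %p) {
    %c = icmp eq ptr %p, null
    ret i1 %c
  }

The first function is a return-only capture; the second captures only
the address_is_null component, consistent with the rule that pointer
comparisons capture the address but not the provenance.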
--- llvm/docs/LangRef.rst | 136 ++++++++++++++++++-- llvm/include/llvm/AsmParser/LLParser.h | 1 + llvm/include/llvm/AsmParser/LLToken.h | 6 + llvm/include/llvm/Bitcode/LLVMBitCodes.h | 1 + llvm/include/llvm/IR/Attributes.h | 7 + llvm/include/llvm/IR/Attributes.td | 3 + llvm/include/llvm/Support/ModRef.h | 101 +++++++++++++++ llvm/lib/AsmParser/LLLexer.cpp | 4 + llvm/lib/AsmParser/LLParser.cpp | 61 +++++++++ llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 4 + llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 2 + llvm/lib/IR/AttributeImpl.h | 1 + llvm/lib/IR/Attributes.cpp | 34 ++++- llvm/lib/Support/ModRef.cpp | 34 +++++ llvm/lib/Transforms/Utils/CodeExtractor.cpp | 1 + llvm/test/Assembler/captures-errors.ll | 73 +++++++++++ llvm/test/Assembler/captures.ll | 103 +++++++++++++++ llvm/test/Bitcode/attributes.ll | 5 + llvm/unittests/IR/AttributesTest.cpp | 13 ++ 19 files changed, 580 insertions(+), 10 deletions(-) create mode 100644 llvm/test/Assembler/captures-errors.ll create mode 100644 llvm/test/Assembler/captures.ll diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 33acb5e73d5ff..8cc9036d1b67f 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -1397,6 +1397,42 @@ Currently, only the following parameter attributes are defined: function, returning a pointer to allocated storage disjoint from the storage for any other object accessible to the caller. +``captures(...)`` + This attributes restrict the ways in which the callee may capture the + pointer. This is not a valid attribute for return values. This attribute + applies only to the particular copy of the pointer passed in this argument. + + The arguments of ``captures`` is a list of captured pointer components, + which may be ``none``, or a combination of: + + - ``address``: The integral address of the pointer. + - ``address_is_null`` (subset of ``address``): Whether the address is null. + - ``provenance``: The ability to access the pointer for both read and write + after the function returns. + - ``read_provenance`` (subset of ``provenance``): The ability to access the + pointer only for reads after the function returns. + + Additionally, it is possible to specify that some components are only + captured in certain locations. Currently only the return value (``ret``) + and other (default) locations are supported. + + The `pointer capture section ` discusses these semantics + in more detail. + + Some examples of how to use the attribute: + + - ``captures(none)``: Pointer not captured. + - ``captures(address, provenance)``: Equivalent to omitting the attribute. + - ``captures(address)``: Address may be captured, but not provenance. + - ``captures(address_is_null)``: Only captures whether the address is null. + - ``captures(address, read_provenance)``: Both address and provenance + captured, but only for read-only access. + - ``captures(ret: address, provenance)``: Pointer captured through return + value only. + - ``captures(address_is_null, ret: address, provenance)``: The whole pointer + is captured through the return value, and additionally whether the pointer + is null is captured in some other way. + .. _nocapture: ``nocapture`` @@ -3339,10 +3375,92 @@ Pointer Capture --------------- Given a function call and a pointer that is passed as an argument or stored in -the memory before the call, a pointer is *captured* by the call if it makes a -copy of any part of the pointer that outlives the call. 
-To be precise, a pointer is captured if one or more of the following conditions -hold: +memory before the call, the call may capture two components of the pointer: + + * The address of the pointer, which is its integral value. This also includes + parts of the address or any information about the address, including the + fact that it does not equal one specific value. We further distinguish + whether only the fact that the address is/isn't null is captured. + * The provenance of the pointer, which is the ability to perform memory + accesses through the pointer, in the sense of the :ref:`pointer aliasing + rules `. We further distinguish whether only read acceses + are allowed, or both reads and writes. + +For example, the following function captures the address of ``%a``, because +it is compared to a pointer, leaking information about the identitiy of the +pointer: + +.. code-block:: llvm + + @glb = global i8 0 + + define i1 @f(ptr %a) { + %c = icmp eq ptr %a, @glb + ret i1 %c + } + +The function does not capture the provenance of the pointer, because the +``icmp`` instruction only operates on the pointer address. The following +function captures both the address and provenance of the pointer, as both +may be read from ``@glb`` after the function returns: + +.. code-block:: llvm + + @glb = global ptr null + + define void @f(ptr %a) { + store ptr %a, ptr @glb + ret void + } + +The following function captures *neither* the address nor the provenance of +the pointer: + +.. code-block:: llvm + + define i32 @f(ptr %a) { + %v = load i32, ptr %a + ret i32 + } + +While address capture includes uses of the address within the body of the +function, provenance capture refers exclusively to the ability to perform +accesses *after* the function returns. Memory accesses within the function +itself are not considered pointer captures. + +We can further say that the capture only occurs through a specific location. +In the following example, the pointer (both address and provenance) is captured +through the return value only: + +.. code-block:: llvm + + define ptr @f(ptr %a) { + %gep = getelementptr i8, ptr %a, i64 4 + ret ptr %gep + } + +However, we always consider direct inspection of the pointer address +(e.g. using ``ptrtoint``) to be location-independent. The following example +is *not* considered a return-only capture, even though the ``ptrtoint`` +ultimately only contribues to the return value: + +.. code-block:: llvm + + @lookup = constant [4 x i8] [i8 0, i8 1, i8 2, i8 3] + + define ptr @f(ptr %a) { + %a.addr = ptrtoint ptr %a to i64 + %mask = and i64 %a.addr, 3 + %gep = getelementptr i8, ptr @lookup, i64 %mask + ret ptr %gep + } + +This definition is chosen to allow capture analysis to continue with the return +value in the usual fashion. + +The following describes possible ways to capture a pointer in more detail, +where unqualified uses of the word "capture" refer to capturing both address +and provenance. 1. The call stores any bit of the pointer carrying information into a place, and the stored bits can be read from the place by the caller after this call @@ -3381,13 +3499,14 @@ hold: @lock = global i1 true define void @f(ptr %a) { - store ptr %a, ptr* @glb + store ptr %a, ptr @glb store atomic i1 false, ptr @lock release ; %a is captured because another thread can safely read @glb store ptr null, ptr @glb ret void } -3. The call's behavior depends on any bit of the pointer carrying information. +3. 
The call's behavior depends on any bit of the pointer carrying information + (address capture only). .. code-block:: llvm @@ -3395,7 +3514,7 @@ hold: define void @f(ptr %a) { %c = icmp eq ptr %a, @glb - br i1 %c, label %BB_EXIT, label %BB_CONTINUE ; escapes %a + br i1 %c, label %BB_EXIT, label %BB_CONTINUE ; captures address of %a only BB_EXIT: call void @exit() unreachable @@ -3403,8 +3522,7 @@ hold: ret void } -4. The pointer is used in a volatile access as its address. - +4. The pointer is used as the pointer operand of a volatile access. .. _volatile: diff --git a/llvm/include/llvm/AsmParser/LLParser.h b/llvm/include/llvm/AsmParser/LLParser.h index 8b195b028783f..c01de4a289a69 100644 --- a/llvm/include/llvm/AsmParser/LLParser.h +++ b/llvm/include/llvm/AsmParser/LLParser.h @@ -379,6 +379,7 @@ namespace llvm { bool inAttrGrp, LocTy &BuiltinLoc); bool parseRangeAttr(AttrBuilder &B); bool parseInitializesAttr(AttrBuilder &B); + bool parseCapturesAttr(AttrBuilder &B); bool parseRequiredTypeAttr(AttrBuilder &B, lltok::Kind AttrToken, Attribute::AttrKind AttrKind); diff --git a/llvm/include/llvm/AsmParser/LLToken.h b/llvm/include/llvm/AsmParser/LLToken.h index 178c911120b4c..7b47bc88ddb25 100644 --- a/llvm/include/llvm/AsmParser/LLToken.h +++ b/llvm/include/llvm/AsmParser/LLToken.h @@ -207,6 +207,12 @@ enum Kind { kw_inaccessiblememonly, kw_inaccessiblemem_or_argmemonly, + // Captures attribute: + kw_address, + kw_address_is_null, + kw_provenance, + kw_read_provenance, + // nofpclass attribute: kw_all, kw_nan, diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h index 21fd27d9838db..9eb38c3e44829 100644 --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -788,6 +788,7 @@ enum AttributeKindCodes { ATTR_KIND_NO_EXT = 99, ATTR_KIND_NO_DIVERGENCE_SOURCE = 100, ATTR_KIND_SANITIZE_TYPE = 101, + ATTR_KIND_CAPTURES = 102, }; enum ComdatSelectionKindCodes { diff --git a/llvm/include/llvm/IR/Attributes.h b/llvm/include/llvm/IR/Attributes.h index 2755ced404ddd..7612e553fe32e 100644 --- a/llvm/include/llvm/IR/Attributes.h +++ b/llvm/include/llvm/IR/Attributes.h @@ -284,6 +284,9 @@ class Attribute { /// Returns memory effects. MemoryEffects getMemoryEffects() const; + /// Returns information from captures attribute. + CaptureInfo getCaptureInfo() const; + /// Return the FPClassTest for nofpclass FPClassTest getNoFPClass() const; @@ -436,6 +439,7 @@ class AttributeSet { UWTableKind getUWTableKind() const; AllocFnKind getAllocKind() const; MemoryEffects getMemoryEffects() const; + CaptureInfo getCaptureInfo() const; FPClassTest getNoFPClass() const; std::string getAsString(bool InAttrGrp = false) const; @@ -1260,6 +1264,9 @@ class AttrBuilder { /// Add memory effect attribute. AttrBuilder &addMemoryAttr(MemoryEffects ME); + /// Add captures attribute. + AttrBuilder &addCapturesAttr(CaptureInfo CI); + // Add nofpclass attribute AttrBuilder &addNoFPClassAttr(FPClassTest NoFPClassMask); diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td index 61955cf883c3f..4396ec4d04c41 100644 --- a/llvm/include/llvm/IR/Attributes.td +++ b/llvm/include/llvm/IR/Attributes.td @@ -183,6 +183,9 @@ def NoCallback : EnumAttr<"nocallback", IntersectAnd, [FnAttr]>; /// Function creates no aliases of pointer. def NoCapture : EnumAttr<"nocapture", IntersectAnd, [ParamAttr]>; +/// Specify how the pointer may be captured. 
+def Captures : IntAttr<"captures", IntersectCustom, [ParamAttr]>; + /// Function is not a source of divergence. def NoDivergenceSource : EnumAttr<"nodivergencesource", IntersectAnd, [FnAttr]>; diff --git a/llvm/include/llvm/Support/ModRef.h b/llvm/include/llvm/Support/ModRef.h index 5a9d80c87ae27..9ecdab71ec8ca 100644 --- a/llvm/include/llvm/Support/ModRef.h +++ b/llvm/include/llvm/Support/ModRef.h @@ -273,6 +273,107 @@ raw_ostream &operator<<(raw_ostream &OS, MemoryEffects RMRB); // Legacy alias. using FunctionModRefBehavior = MemoryEffects; +/// Components of the pointer that may be captured. +enum class CaptureComponents : uint8_t { + None = 0, + AddressIsNull = (1 << 0), + Address = (1 << 1) | AddressIsNull, + ReadProvenance = (1 << 2), + Provenance = (1 << 3) | ReadProvenance, + All = Address | Provenance, + LLVM_MARK_AS_BITMASK_ENUM(Provenance), +}; + +inline bool capturesNothing(CaptureComponents CC) { + return CC == CaptureComponents::None; +} + +inline bool capturesAnything(CaptureComponents CC) { + return CC != CaptureComponents::None; +} + +inline bool capturesAddressIsNullOnly(CaptureComponents CC) { + return (CC & CaptureComponents::Address) == CaptureComponents::AddressIsNull; +} + +inline bool capturesAddress(CaptureComponents CC) { + return (CC & CaptureComponents::Address) != CaptureComponents::None; +} + +inline bool capturesReadProvenanceOnly(CaptureComponents CC) { + return (CC & CaptureComponents::Provenance) == + CaptureComponents::ReadProvenance; +} + +inline bool capturesFullProvenance(CaptureComponents CC) { + return (CC & CaptureComponents::Provenance) == CaptureComponents::Provenance; +} + +raw_ostream &operator<<(raw_ostream &OS, CaptureComponents CC); + +/// Represents which components of the pointer may be captured in which +/// location. This represents the captures(...) attribute in IR. +/// +/// For more information on the precise semantics see LangRef. +class CaptureInfo { + CaptureComponents OtherComponents; + CaptureComponents RetComponents; + +public: + CaptureInfo(CaptureComponents OtherComponents, + CaptureComponents RetComponents) + : OtherComponents(OtherComponents), RetComponents(RetComponents) {} + + CaptureInfo(CaptureComponents Components) + : OtherComponents(Components), RetComponents(Components) {} + + /// Create CaptureInfo that may capture all components of the pointer. + static CaptureInfo all() { return CaptureInfo(CaptureComponents::All); } + + /// Get components potentially captured by the return value. + CaptureComponents getRetComponents() const { return RetComponents; } + + /// Get components potentially captured through locations other than the + /// return value. + CaptureComponents getOtherComponents() const { return OtherComponents; } + + /// Get the potentially captured components of the pointer (regardless of + /// location). + operator CaptureComponents() const { return OtherComponents | RetComponents; } + + bool operator==(CaptureInfo Other) const { + return OtherComponents == Other.OtherComponents && + RetComponents == Other.RetComponents; + } + + bool operator!=(CaptureInfo Other) const { return !(*this == Other); } + + /// Compute union of CaptureInfos. + CaptureInfo operator|(CaptureInfo Other) const { + return CaptureInfo(OtherComponents | Other.OtherComponents, + RetComponents | Other.RetComponents); + } + + /// Compute intersection of CaptureInfos. 
+ CaptureInfo operator&(CaptureInfo Other) const { + return CaptureInfo(OtherComponents & Other.OtherComponents, + RetComponents & Other.RetComponents); + } + + static CaptureInfo createFromIntValue(uint32_t Data) { + return CaptureInfo(CaptureComponents(Data >> 4), + CaptureComponents(Data & 0xf)); + } + + /// Convert CaptureInfo into an encoded integer value (used by captures + /// attribute). + uint32_t toIntValue() const { + return (uint32_t(OtherComponents) << 4) | uint32_t(RetComponents); + } +}; + +raw_ostream &operator<<(raw_ostream &OS, CaptureInfo Info); + } // namespace llvm #endif diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp index 1b8e033134f51..5ea507c009bdc 100644 --- a/llvm/lib/AsmParser/LLLexer.cpp +++ b/llvm/lib/AsmParser/LLLexer.cpp @@ -704,6 +704,10 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(argmemonly); KEYWORD(inaccessiblememonly); KEYWORD(inaccessiblemem_or_argmemonly); + KEYWORD(address_is_null); + KEYWORD(address); + KEYWORD(provenance); + KEYWORD(read_provenance); // nofpclass attribute KEYWORD(all); diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 52d48a69f0eb5..81d048b32e139 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -1644,6 +1644,8 @@ bool LLParser::parseEnumAttribute(Attribute::AttrKind Attr, AttrBuilder &B, return parseRangeAttr(B); case Attribute::Initializes: return parseInitializesAttr(B); + case Attribute::Captures: + return parseCapturesAttr(B); default: B.addAttribute(Attr); Lex.Lex(); @@ -3165,6 +3167,65 @@ bool LLParser::parseInitializesAttr(AttrBuilder &B) { return false; } +bool LLParser::parseCapturesAttr(AttrBuilder &B) { + CaptureComponents Other = CaptureComponents::None; + std::optional Ret; + + // We use syntax like captures(ret: address, provenance), so the colon + // should not be interpreted as a label terminator. 
+ Lex.setIgnoreColonInIdentifiers(true); + auto _ = make_scope_exit([&] { Lex.setIgnoreColonInIdentifiers(false); }); + + Lex.Lex(); + if (parseToken(lltok::lparen, "expected '('")) + return true; + + CaptureComponents *Current = &Other; + bool SeenComponent = false; + while (true) { + if (EatIfPresent(lltok::kw_ret)) { + if (parseToken(lltok::colon, "expected ':'")) + return true; + if (Ret) + return tokError("duplicate 'ret' location"); + Ret = CaptureComponents::None; + Current = &*Ret; + SeenComponent = false; + } + + if (EatIfPresent(lltok::kw_none)) { + if (SeenComponent) + return tokError("cannot use 'none' with other component"); + *Current = CaptureComponents::None; + } else { + if (SeenComponent && capturesNothing(*Current)) + return tokError("cannot use 'none' with other component"); + + if (EatIfPresent(lltok::kw_address_is_null)) + *Current |= CaptureComponents::AddressIsNull; + else if (EatIfPresent(lltok::kw_address)) + *Current |= CaptureComponents::Address; + else if (EatIfPresent(lltok::kw_provenance)) + *Current |= CaptureComponents::Provenance; + else if (EatIfPresent(lltok::kw_read_provenance)) + *Current |= CaptureComponents::ReadProvenance; + else + return tokError("expected one of 'none', 'address', 'address_is_null', " + "'provenance' or 'read_provenance'"); + } + + SeenComponent = true; + if (EatIfPresent(lltok::rparen)) + break; + + if (parseToken(lltok::comma, "expected ',' or ')'")) + return true; + } + + B.addCapturesAttr(CaptureInfo(Other, Ret.value_or(Other))); + return false; +} + /// parseOptionalOperandBundles /// ::= /*empty*/ /// ::= '[' OperandBundle [, OperandBundle ]* ']' diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index a01ecf0d56642..56f5ff4b20e5d 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -2250,6 +2250,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) { return Attribute::CoroElideSafe; case bitc::ATTR_KIND_NO_EXT: return Attribute::NoExt; + case bitc::ATTR_KIND_CAPTURES: + return Attribute::Captures; } } @@ -2389,6 +2391,8 @@ Error BitcodeReader::parseAttributeGroupBlock() { B.addAllocKindAttr(static_cast(Record[++i])); else if (Kind == Attribute::Memory) B.addMemoryAttr(MemoryEffects::createFromIntValue(Record[++i])); + else if (Kind == Attribute::Captures) + B.addCapturesAttr(CaptureInfo::createFromIntValue(Record[++i])); else if (Kind == Attribute::NoFPClass) B.addNoFPClassAttr( static_cast(Record[++i] & fcAllFlags)); diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index b4efd3928a2e6..94d3afa6c1e33 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -907,6 +907,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { return bitc::ATTR_KIND_INITIALIZES; case Attribute::NoExt: return bitc::ATTR_KIND_NO_EXT; + case Attribute::Captures: + return bitc::ATTR_KIND_CAPTURES; case Attribute::EndAttrKinds: llvm_unreachable("Can not encode end-attribute kinds marker."); case Attribute::None: diff --git a/llvm/lib/IR/AttributeImpl.h b/llvm/lib/IR/AttributeImpl.h index 82c501dcafcb7..59cc489ade40d 100644 --- a/llvm/lib/IR/AttributeImpl.h +++ b/llvm/lib/IR/AttributeImpl.h @@ -346,6 +346,7 @@ class AttributeSetNode final UWTableKind getUWTableKind() const; AllocFnKind getAllocKind() const; MemoryEffects getMemoryEffects() const; + CaptureInfo getCaptureInfo() const; FPClassTest getNoFPClass() const; std::string 
getAsString(bool InAttrGrp) const; Type *getAttributeType(Attribute::AttrKind Kind) const; diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp index e9daa01b899e8..ceb31856283c9 100644 --- a/llvm/lib/IR/Attributes.cpp +++ b/llvm/lib/IR/Attributes.cpp @@ -487,6 +487,12 @@ MemoryEffects Attribute::getMemoryEffects() const { return MemoryEffects::createFromIntValue(pImpl->getValueAsInt()); } +CaptureInfo Attribute::getCaptureInfo() const { + assert(hasAttribute(Attribute::Captures) && + "Can only call getCaptureInfo() on captures attribute"); + return CaptureInfo::createFromIntValue(pImpl->getValueAsInt()); +} + FPClassTest Attribute::getNoFPClass() const { assert(hasAttribute(Attribute::NoFPClass) && "Can only call getNoFPClass() on nofpclass attribute"); @@ -647,6 +653,13 @@ std::string Attribute::getAsString(bool InAttrGrp) const { return Result; } + if (hasAttribute(Attribute::Captures)) { + std::string Result; + raw_string_ostream OS(Result); + OS << getCaptureInfo(); + return Result; + } + if (hasAttribute(Attribute::NoFPClass)) { std::string Result = "nofpclass"; raw_string_ostream OS(Result); @@ -1050,6 +1063,10 @@ AttributeSet::intersectWith(LLVMContext &C, AttributeSet Other) const { Intersected.addMemoryAttr(Attr0.getMemoryEffects() | Attr1.getMemoryEffects()); break; + case Attribute::Captures: + Intersected.addCapturesAttr(Attr0.getCaptureInfo() | + Attr1.getCaptureInfo()); + break; case Attribute::NoFPClass: Intersected.addNoFPClassAttr(Attr0.getNoFPClass() & Attr1.getNoFPClass()); @@ -1170,6 +1187,10 @@ MemoryEffects AttributeSet::getMemoryEffects() const { return SetNode ? SetNode->getMemoryEffects() : MemoryEffects::unknown(); } +CaptureInfo AttributeSet::getCaptureInfo() const { + return SetNode ? SetNode->getCaptureInfo() : CaptureInfo::all(); +} + FPClassTest AttributeSet::getNoFPClass() const { return SetNode ? 
SetNode->getNoFPClass() : fcNone; } @@ -1358,6 +1379,12 @@ MemoryEffects AttributeSetNode::getMemoryEffects() const { return MemoryEffects::unknown(); } +CaptureInfo AttributeSetNode::getCaptureInfo() const { + if (auto A = findEnumAttribute(Attribute::Captures)) + return A->getCaptureInfo(); + return CaptureInfo::all(); +} + FPClassTest AttributeSetNode::getNoFPClass() const { if (auto A = findEnumAttribute(Attribute::NoFPClass)) return A->getNoFPClass(); @@ -2190,6 +2217,10 @@ AttrBuilder &AttrBuilder::addMemoryAttr(MemoryEffects ME) { return addRawIntAttr(Attribute::Memory, ME.toIntValue()); } +AttrBuilder &AttrBuilder::addCapturesAttr(CaptureInfo CI) { + return addRawIntAttr(Attribute::Captures, CI.toIntValue()); +} + AttrBuilder &AttrBuilder::addNoFPClassAttr(FPClassTest Mask) { if (Mask == fcNone) return *this; @@ -2350,7 +2381,8 @@ AttributeMask AttributeFuncs::typeIncompatible(Type *Ty, AttributeSet AS, .addAttribute(Attribute::DereferenceableOrNull) .addAttribute(Attribute::Writable) .addAttribute(Attribute::DeadOnUnwind) - .addAttribute(Attribute::Initializes); + .addAttribute(Attribute::Initializes) + .addAttribute(Attribute::Captures); if (ASK & ASK_UNSAFE_TO_DROP) Incompatible.addAttribute(Attribute::Nest) .addAttribute(Attribute::SwiftError) diff --git a/llvm/lib/Support/ModRef.cpp b/llvm/lib/Support/ModRef.cpp index a4eb70edd38d1..d3b3dd11171f1 100644 --- a/llvm/lib/Support/ModRef.cpp +++ b/llvm/lib/Support/ModRef.cpp @@ -12,6 +12,7 @@ #include "llvm/Support/ModRef.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" using namespace llvm; @@ -50,3 +51,36 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, MemoryEffects ME) { }); return OS; } + +raw_ostream &llvm::operator<<(raw_ostream &OS, CaptureComponents CC) { + if (capturesNothing(CC)) { + OS << "none"; + return OS; + } + + ListSeparator LS; + if (capturesAddressIsNullOnly(CC)) + OS << LS << "address_is_null"; + else if (capturesAddress(CC)) + OS << LS << "address"; + if (capturesReadProvenanceOnly(CC)) + OS << LS << "read_provenance"; + if (capturesFullProvenance(CC)) + OS << LS << "provenance"; + + return OS; +} + +raw_ostream &llvm::operator<<(raw_ostream &OS, CaptureInfo CI) { + ListSeparator LS; + CaptureComponents Other = CI.getOtherComponents(); + CaptureComponents Ret = CI.getRetComponents(); + + OS << "captures("; + if (!capturesNothing(Other) || Other == Ret) + OS << LS << Other; + if (Other != Ret) + OS << LS << "ret: " << Ret; + OS << ")"; + return OS; +} diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 7ddb9e22c8344..af9813775f242 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -975,6 +975,7 @@ Function *CodeExtractor::constructFunctionDeclaration( case Attribute::AllocatedPointer: case Attribute::AllocAlign: case Attribute::ByVal: + case Attribute::Captures: case Attribute::Dereferenceable: case Attribute::DereferenceableOrNull: case Attribute::ElementType: diff --git a/llvm/test/Assembler/captures-errors.ll b/llvm/test/Assembler/captures-errors.ll new file mode 100644 index 0000000000000..44788c79a2453 --- /dev/null +++ b/llvm/test/Assembler/captures-errors.ll @@ -0,0 +1,73 @@ +; RUN: split-file --leading-lines %s %t +; RUN: not llvm-as < %t/missing-lparen.ll 2>&1 | FileCheck %s --check-prefix=CHECK-MISSING-LPAREN +; RUN: not llvm-as < %t/missing-rparen.ll 2>&1 | FileCheck %s --check-prefix=CHECK-MISSING-RPAREN +; RUN: not llvm-as < %t/missing-rparen-none.ll 2>&1 | 
FileCheck %s --check-prefix=CHECK-MISSING-RPAREN-NONE +; RUN: not llvm-as < %t/missing-colon.ll 2>&1 | FileCheck %s --check-prefix=CHECK-MISSING-COLON +; RUN: not llvm-as < %t/invalid-component.ll 2>&1 | FileCheck %s --check-prefix=CHECK-INVALID-COMPONENT +; RUN: not llvm-as < %t/duplicate-ret.ll 2>&1 | FileCheck %s --check-prefix=CHECK-DUPLICATE-RET +; RUN: not llvm-as < %t/none-after.ll 2>&1 | FileCheck %s --check-prefix=CHECK-NONE-AFTER +; RUN: not llvm-as < %t/none-before.ll 2>&1 | FileCheck %s --check-prefix=CHECK-NONE-BEFORE +; RUN: not opt -disable-output < %t/non-pointer-type.ll 2>&1 | FileCheck %s --check-prefix=CHECK-NON-POINTER-TYPE + +;--- missing-lparen.ll + +; CHECK-MISSING-LPAREN: :[[@LINE+1]]:32: error: expected '(' +define void @test(ptr captures %p) { + ret void +} + +;--- missing-rparen.ll + +; CHECK-MISSING-RPAREN: :[[@LINE+1]]:40: error: expected ',' or ')' +define void @test(ptr captures(address %p) { + ret void +} + +;--- missing-rparen-none.ll + +; CHECK-MISSING-RPAREN-NONE: :[[@LINE+1]]:37: error: expected ',' or ')' +define void @test(ptr captures(none %p) { + ret void +} + +;--- missing-colon.ll + +; CHECK-MISSING-COLON: :[[@LINE+1]]:36: error: expected ':' +define void @test(ptr captures(ret address) %p) { + ret void +} + +;--- invalid-component.ll + +; CHECK-INVALID-COMPONENT: :[[@LINE+1]]:32: error: expected one of 'none', 'address', 'address_is_null', 'provenance' or 'read_provenance' +define void @test(ptr captures(foo) %p) { + ret void +} + +;--- duplicate-ret.ll + +; CHECK-DUPLICATE-RET: :[[@LINE+1]]:51: error: duplicate 'ret' location +define void @test(ptr captures(ret: address, ret: provenance) %p) { + ret void +} + +;--- none-after.ll + +; CHECK-NONE-AFTER: :[[@LINE+1]]:45: error: cannot use 'none' with other component +define void @test(ptr captures(address, none) %p) { + ret void +} + +;--- none-before.ll + +; CHECK-NONE-BEFORE: :[[@LINE+1]]:38: error: cannot use 'none' with other component +define void @test(ptr captures(none, address) %p) { + ret void +} + +;--- non-pointer-type.ll + +; CHECK-NON-POINTER-TYPE: Attribute 'captures(none)' applied to incompatible type! 
+define void @test(i32 captures(none) %p) { + ret void +} diff --git a/llvm/test/Assembler/captures.ll b/llvm/test/Assembler/captures.ll new file mode 100644 index 0000000000000..1521a9df0cb42 --- /dev/null +++ b/llvm/test/Assembler/captures.ll @@ -0,0 +1,103 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S < %s | FileCheck %s +; RUN: llvm-as < %s | llvm-dis | FileCheck %s + +define void @test_none(ptr captures(none) %p) { +; CHECK-LABEL: define void @test_none( +; CHECK-SAME: ptr captures(none) [[P:%.*]]) { +; CHECK-NEXT: ret void +; + ret void +} + +define void @test_address(ptr captures(address) %p) { +; CHECK-LABEL: define void @test_address( +; CHECK-SAME: ptr captures(address) [[P:%.*]]) { +; CHECK-NEXT: ret void +; + ret void +} + +define void @test_address_is_null(ptr captures(address_is_null) %p) { +; CHECK-LABEL: define void @test_address_is_null( +; CHECK-SAME: ptr captures(address_is_null) [[P:%.*]]) { +; CHECK-NEXT: ret void +; + ret void +} + +define void @test_address_provenance(ptr captures(address, provenance) %p) { +; CHECK-LABEL: define void @test_address_provenance( +; CHECK-SAME: ptr captures(address, provenance) [[P:%.*]]) { +; CHECK-NEXT: ret void +; + ret void +} + +define void @test_address_read_provenance(ptr captures(address, read_provenance) %p) { +; CHECK-LABEL: define void @test_address_read_provenance( +; CHECK-SAME: ptr captures(address, read_provenance) [[P:%.*]]) { +; CHECK-NEXT: ret void +; + ret void +} + +define void @test_ret(ptr captures(ret: address, provenance) %p) { +; CHECK-LABEL: define void @test_ret( +; CHECK-SAME: ptr captures(ret: address, provenance) [[P:%.*]]) { +; CHECK-NEXT: ret void +; + ret void +} + +define void @test_address_is_null_and_ret(ptr captures(address_is_null, ret: address, provenance) %p) { +; CHECK-LABEL: define void @test_address_is_null_and_ret( +; CHECK-SAME: ptr captures(address_is_null, ret: address, provenance) [[P:%.*]]) { +; CHECK-NEXT: ret void +; + ret void +} + +define void @test_address_and_ret_none(ptr captures(address, ret: none) %p) { +; CHECK-LABEL: define void @test_address_and_ret_none( +; CHECK-SAME: ptr captures(address, ret: none) [[P:%.*]]) { +; CHECK-NEXT: ret void +; + ret void +} + +; Duplicates callpse into one. +define void @test_duplicate(ptr captures(address, address) %p) { +; CHECK-LABEL: define void @test_duplicate( +; CHECK-SAME: ptr captures(address) [[P:%.*]]) { +; CHECK-NEXT: ret void +; + ret void +} + +; read_provenance is a subset of provenance. +define void @test_duplicate_read_provenance(ptr captures(read_provenance, provenance) %p) { +; CHECK-LABEL: define void @test_duplicate_read_provenance( +; CHECK-SAME: ptr captures(provenance) [[P:%.*]]) { +; CHECK-NEXT: ret void +; + ret void +} + +; address_is_null is a subset of address. +define void @test_duplicate_address_is_null(ptr captures(address_is_null, address) %p) { +; CHECK-LABEL: define void @test_duplicate_address_is_null( +; CHECK-SAME: ptr captures(address) [[P:%.*]]) { +; CHECK-NEXT: ret void +; + ret void +} + +; Return-only none is same as plain none. 
+define void @test_ret_none(ptr captures(ret: none) %p) { +; CHECK-LABEL: define void @test_ret_none( +; CHECK-SAME: ptr captures(none) [[P:%.*]]) { +; CHECK-NEXT: ret void +; + ret void +} diff --git a/llvm/test/Bitcode/attributes.ll b/llvm/test/Bitcode/attributes.ll index 492de663884df..1da9291c71996 100644 --- a/llvm/test/Bitcode/attributes.ll +++ b/llvm/test/Bitcode/attributes.ll @@ -562,6 +562,11 @@ define void @initializes(ptr initializes((-4, 0), (4, 8)) %a) { ret void } +; CHECK: define void @captures(ptr captures(address) %p) +define void @captures(ptr captures(address) %p) { + ret void +} + ; CHECK: attributes #0 = { noreturn } ; CHECK: attributes #1 = { nounwind } ; CHECK: attributes #2 = { memory(none) } diff --git a/llvm/unittests/IR/AttributesTest.cpp b/llvm/unittests/IR/AttributesTest.cpp index f73f2b20e9fea..f0e34aa273369 100644 --- a/llvm/unittests/IR/AttributesTest.cpp +++ b/llvm/unittests/IR/AttributesTest.cpp @@ -437,6 +437,14 @@ TEST(Attributes, SetIntersect) { break; case Attribute::Range: break; + case Attribute::Captures: + V0 = CaptureInfo(CaptureComponents::AddressIsNull, + CaptureComponents::None) + .toIntValue(); + V1 = CaptureInfo(CaptureComponents::None, + CaptureComponents::ReadProvenance) + .toIntValue(); + break; default: ASSERT_FALSE(true); } @@ -516,6 +524,11 @@ TEST(Attributes, SetIntersect) { ASSERT_EQ(Res->getAttribute(Kind).getRange(), ConstantRange(APInt(32, 0), APInt(32, 20))); break; + case Attribute::Captures: + ASSERT_EQ(Res->getCaptureInfo(), + CaptureInfo(CaptureComponents::AddressIsNull, + CaptureComponents::ReadProvenance)); + break; default: ASSERT_FALSE(true); } From 7ed451a3f3f777966b05c51af920aa23fa1cd73e Mon Sep 17 00:00:00 2001 From: Lukacma Date: Mon, 13 Jan 2025 13:44:15 +0000 Subject: [PATCH 271/408] [AArch64] Change feature dependencies of fp8 features (#122280) This patch simplifies feature dependencies of FP8 features and also adds new tests to check these. 
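For example, with the relaxed dependencies, requesting a single FP8
dot-product feature now implies only +neon and +fp8 rather than the
whole FP8 multiply-add chain. An IR-level sketch (illustrative, not
taken from the new tests):

  define void @uses_fp8dot2() "target-features"="+fp8dot2" {
    ret void
  }

Before this patch, +fp8dot2 depended on +fp8dot4, which in turn pulled
in +fp8fma.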
--- .../fp8-intrinsics/acle_sve2_fp8_fdot.c | 12 ++++---- .../sme2-intrinsics/acle_sme2_fp8_fdot.c | 10 +++---- .../sme2-intrinsics/acle_sme2_fp8_fvdot.c | 10 +++---- .../acle_sme2_fp8_imm.c | 2 +- llvm/lib/Target/AArch64/AArch64Features.td | 10 +++---- llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll | 4 +-- .../TargetParser/TargetParserTest.cpp | 29 +++++++++++++++++-- 7 files changed, 51 insertions(+), 26 deletions(-) diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c index 950a19115811e..2f3994df03784 100644 --- a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c +++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c @@ -1,12 +1,12 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +fp8 -target-feature +ssve-fp8dot2 -target-feature +ssve-fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8dot2 -target-feature +fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +ssve-fp8dot2 -target-feature +ssve-fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CHECK-CXX -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -x c++ -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +fp8 -target-feature +ssve-fp8dot2 -target-feature +ssve-fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CHECK-CXX +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8dot2 -target-feature +fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -x c++ -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +ssve-fp8dot2 -target-feature +ssve-fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -target-feature +ssve-fp8dot2 -target-feature 
+ssve-fp8dot4 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8dot2 -target-feature +fp8dot4 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +ssve-fp8dot2 -target-feature +ssve-fp8dot4 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c index a151d162e0108..2da4ab541869e 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git 
a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fvdot.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fvdot.c index fc95cf541172a..8353b3aebc9fc 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fvdot.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fvdot.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 -// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target diff --git a/clang/test/Sema/aarch64-fp8-intrinsics/acle_sme2_fp8_imm.c b/clang/test/Sema/aarch64-fp8-intrinsics/acle_sme2_fp8_imm.c index bea0b29bcc70a..fd5374d928ea9 100644 --- a/clang/test/Sema/aarch64-fp8-intrinsics/acle_sme2_fp8_imm.c +++ b/clang/test/Sema/aarch64-fp8-intrinsics/acle_sme2_fp8_imm.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature 
+sme-f8f16 -target-feature +sme-f8f32 -fsyntax-only -verify %s // REQUIRES: aarch64-registered-target diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td index 41eb9a73bd013..5a233e2d870b3 100644 --- a/llvm/lib/Target/AArch64/AArch64Features.td +++ b/llvm/lib/Target/AArch64/AArch64Features.td @@ -486,16 +486,16 @@ def FeatureSSVE_FP8FMA : ExtensionWithMArch<"ssve-fp8fma", "SSVE_FP8FMA", "FEAT_ "Enable SVE2 FP8 multiply-add instructions", [FeatureSME2, FeatureFP8]>; def FeatureFP8DOT4: ExtensionWithMArch<"fp8dot4", "FP8DOT4", "FEAT_FP8DOT4", - "Enable FP8 4-way dot instructions", [FeatureFP8FMA]>; + "Enable FP8 4-way dot instructions", [FeatureNEON, FeatureFP8]>; def FeatureFP8DOT2: ExtensionWithMArch<"fp8dot2", "FP8DOT2", "FEAT_FP8DOT2", - "Enable FP8 2-way dot instructions", [FeatureFP8DOT4]>; + "Enable FP8 2-way dot instructions", [FeatureNEON, FeatureFP8]>; def FeatureSSVE_FP8DOT4 : ExtensionWithMArch<"ssve-fp8dot4", "SSVE_FP8DOT4", "FEAT_SSVE_FP8DOT4", - "Enable SVE2 FP8 4-way dot product instructions", [FeatureSSVE_FP8FMA]>; + "Enable SVE2 FP8 4-way dot product instructions", [FeatureSME2, FeatureFP8]>; def FeatureSSVE_FP8DOT2 : ExtensionWithMArch<"ssve-fp8dot2", "SSVE_FP8DOT2", "FEAT_SSVE_FP8DOT2", - "Enable SVE2 FP8 2-way dot product instructions", [FeatureSSVE_FP8DOT4]>; + "Enable SVE2 FP8 2-way dot product instructions", [FeatureSME2, FeatureFP8]>; def FeatureSME_LUTv2 : ExtensionWithMArch<"sme-lutv2", "SME_LUTv2", "FEAT_SME_LUTv2", "Enable Scalable Matrix Extension (SME) LUTv2 instructions", [FeatureSME2]>; @@ -504,7 +504,7 @@ def FeatureSMEF8F32 : ExtensionWithMArch<"sme-f8f32", "SMEF8F32", "FEAT_SME_F8F3 "Enable Scalable Matrix Extension (SME) F8F32 instructions", [FeatureSME2, FeatureFP8]>; def FeatureSMEF8F16 : ExtensionWithMArch<"sme-f8f16", "SMEF8F16", "FEAT_SME_F8F16", - "Enable Scalable Matrix Extension (SME) F8F16 instructions", [FeatureSMEF8F32]>; + "Enable Scalable Matrix Extension (SME) F8F16 instructions", [FeatureSME2, FeatureFP8]>; def FeatureCPA : ExtensionWithMArch<"cpa", "CPA", "FEAT_CPA", "Enable Armv9.5-A Checked Pointer Arithmetic">; diff --git a/llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll b/llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll index 0cead19a74bfd..478404dcd50aa 100644 --- a/llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll +++ b/llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mattr=+sve2,+fp8,+fp8dot2,+fp8dot4 < %s | FileCheck %s -; RUN: llc -mattr=+sme,+fp8,+ssve-fp8dot2,+ssve-fp8dot4 --force-streaming < %s | FileCheck %s +; RUN: llc -mattr=+sve2,+fp8dot2,+fp8dot4 < %s | FileCheck %s +; RUN: llc -mattr=+sme,+ssve-fp8dot2,+ssve-fp8dot4 --force-streaming < %s | FileCheck %s target triple = "aarch64-linux" diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp index 1f69190e4bec5..c03d3e8575d81 100644 --- a/llvm/unittests/TargetParser/TargetParserTest.cpp +++ b/llvm/unittests/TargetParser/TargetParserTest.cpp @@ -1803,7 +1803,7 @@ AArch64ExtensionDependenciesBaseArchTestParams {AArch64::ARMV9_6A, {"nofp", "fprcvt"}, {"fp-armv8", "fprcvt"}, {}}, {AArch64::ARMV9_6A, {"fprcvt", "nofp"}, {}, {"fp-armv8", "fprcvt"}}, - // simd -> {aes, sha2, sha3, sm4, f8f16mm, f8f32mm} + // simd -> {aes, sha2, sha3, sm4, f8f16mm, f8f32mm, fp8dot4, fp8dot2} {AArch64::ARMV8A, {"nosimd", "aes"}, {"neon", "aes"}, {}}, {AArch64::ARMV8A, {"aes", "nosimd"}, {}, 
{"neon", "aes"}}, {AArch64::ARMV8A, {"nosimd", "sha2"}, {"neon", "sha2"}, {}}, @@ -1816,6 +1816,10 @@ AArch64ExtensionDependenciesBaseArchTestParams {AArch64::ARMV9_6A, {"f8f16mm", "nosimd"}, {}, {"neon", "f8f16mm"}}, {AArch64::ARMV9_6A, {"nosimd", "f8f32mm"}, {"neon", "f8f32mm"}, {}}, {AArch64::ARMV9_6A, {"f8f32mm", "nosimd"}, {}, {"neon", "f8f32mm"}}, + {AArch64::ARMV9_6A, {"nosimd", "fp8dot4"}, {"neon", "fp8dot4"}, {}}, + {AArch64::ARMV9_6A, {"fp8dot4", "nosimd"}, {}, {"neon", "fp8dot4"}}, + {AArch64::ARMV9_6A, {"nosimd", "fp8dot2"}, {"neon", "fp8dot2"}, {}}, + {AArch64::ARMV9_6A, {"fp8dot2", "nosimd"}, {}, {"neon", "fp8dot2"}}, // simd -> {rdm, dotprod, fcma} {AArch64::ARMV8A, {"nosimd", "rdm"}, {"neon", "rdm"}, {}}, @@ -1940,7 +1944,8 @@ AArch64ExtensionDependenciesBaseArchTestParams {AArch64::ARMV9_6A, {"nosme2p1", "sme2p2"}, {"sme2p2", "sme2p1"}, {}}, {AArch64::ARMV9_6A, {"sme2p2", "nosme2p1"}, {}, {"sme2p1", "sme2p2"}}, - // fp8 -> {sme-f8f16, sme-f8f32, f8f16mm, f8f32mm} + // fp8 -> {sme-f8f16, sme-f8f32, f8f16mm, f8f32mm, fp8dot4, fp8dot2, + // ssve-fp8dot4, ssve-fp8dot2} {AArch64::ARMV8A, {"nofp8", "sme-f8f16"}, {"fp8", "sme-f8f16"}, {}}, {AArch64::ARMV8A, {"sme-f8f16", "nofp8"}, {}, {"fp8", "sme-f8f16"}}, {AArch64::ARMV8A, {"nofp8", "sme-f8f32"}, {"fp8", "sme-f8f32"}, {}}, @@ -1949,6 +1954,26 @@ AArch64ExtensionDependenciesBaseArchTestParams {AArch64::ARMV9_6A, {"f8f16mm", "nofp8"}, {}, {"fp8", "f8f16mm"}}, {AArch64::ARMV9_6A, {"nofp8", "f8f32mm"}, {"fp8", "f8f32mm"}, {}}, {AArch64::ARMV9_6A, {"f8f32mm", "nofp8"}, {}, {"fp8", "f8f32mm"}}, + {AArch64::ARMV9_6A, {"nofp8", "fp8dot4"}, {"fp8", "fp8dot4"}, {}}, + {AArch64::ARMV9_6A, {"fp8dot4", "nofp8"}, {}, {"fp8", "fp8dot4"}}, + {AArch64::ARMV9_6A, {"nofp8", "fp8dot2"}, {"fp8", "fp8dot2"}, {}}, + {AArch64::ARMV9_6A, {"fp8dot2", "nofp8"}, {}, {"fp8", "fp8dot2"}}, + {AArch64::ARMV9_6A, + {"nofp8", "ssve-fp8dot4"}, + {"fp8", "ssve-fp8dot4"}, + {}}, + {AArch64::ARMV9_6A, + {"ssve-fp8dot4", "nofp8"}, + {}, + {"fp8", "ssve-fp8dot4"}}, + {AArch64::ARMV9_6A, + {"nofp8", "ssve-fp8dot2"}, + {"fp8", "ssve-fp8dot2"}, + {}}, + {AArch64::ARMV9_6A, + {"ssve-fp8dot2", "nofp8"}, + {}, + {"fp8", "ssve-fp8dot2"}}, // lse -> lse128 {AArch64::ARMV8A, {"nolse", "lse128"}, {"lse", "lse128"}, {}}, From ad38e24eb74e97148faec97c4f843b87768b6e9b Mon Sep 17 00:00:00 2001 From: David Pagan Date: Mon, 13 Jan 2025 05:44:48 -0800 Subject: [PATCH 272/408] [clang][OpenMP] Add 'align' modifier for 'allocate' clause (#121814) The 'align' modifier is now accepted in the 'allocate' clause. Added LIT tests covering codegen, PCH, template handling, and serialization for 'align' modifier. Added support for align-modifier to release notes. Testing - New allocate modifier LIT tests. - OpenMP LIT tests. 
- check-all --- clang/docs/OpenMPSupport.rst | 2 + clang/docs/ReleaseNotes.rst | 1 + clang/include/clang/AST/OpenMPClause.h | 92 +++- .../clang/Basic/DiagnosticParseKinds.td | 2 + clang/include/clang/Basic/OpenMPKinds.def | 1 + clang/include/clang/Basic/OpenMPKinds.h | 4 + clang/include/clang/Sema/SemaOpenMP.h | 20 +- clang/lib/AST/OpenMPClause.cpp | 58 ++- clang/lib/Parse/ParseOpenMP.cpp | 96 +++- clang/lib/Sema/SemaOpenMP.cpp | 102 +++-- clang/lib/Sema/TreeTransform.h | 30 +- clang/lib/Serialization/ASTReader.cpp | 4 +- clang/lib/Serialization/ASTWriter.cpp | 4 +- .../allocate_allocator_modifier_codegen.cpp | 255 ----------- .../allocate_allocator_modifier_messages.cpp | 97 ----- ...t.cpp => allocate_modifiers_ast_print.cpp} | 77 +++- .../OpenMP/allocate_modifiers_codegen.cpp | 409 ++++++++++++++++++ .../OpenMP/allocate_modifiers_messages.cpp | 159 +++++++ 18 files changed, 962 insertions(+), 451 deletions(-) delete mode 100644 clang/test/OpenMP/allocate_allocator_modifier_codegen.cpp delete mode 100644 clang/test/OpenMP/allocate_allocator_modifier_messages.cpp rename clang/test/OpenMP/{allocate_allocator_modifier_ast_print.cpp => allocate_modifiers_ast_print.cpp} (51%) create mode 100644 clang/test/OpenMP/allocate_modifiers_codegen.cpp create mode 100644 clang/test/OpenMP/allocate_modifiers_messages.cpp diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst index a1cb7fe359ebf..673c34bf08a4a 100644 --- a/clang/docs/OpenMPSupport.rst +++ b/clang/docs/OpenMPSupport.rst @@ -286,6 +286,8 @@ implementation. +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | memory management | 'allocator' modifier for allocate clause | :good:`done` | https://github.com/llvm/llvm-project/pull/114883 | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| memory management | 'align' modifier for allocate clause | :good:`done` | https://github.com/llvm/llvm-project/pull/121814 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | memory management | new memory management routines | :none:`unclaimed` | | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | memory management | changes to omp_alloctrait_key enum | :none:`unclaimed` | | diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 8f4adbcd70518..9eeb872aa57d7 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -1370,6 +1370,7 @@ OpenMP Support always build support for AMDGPU and NVPTX targets. - Added support for combined masked constructs 'omp parallel masked taskloop', 'omp parallel masked taskloop simd','omp masked taskloop' and 'omp masked taskloop simd' directive. +- Added support for align-modifier in 'allocate' clause. 
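For illustration, a minimal sketch of the syntax this release note refers to (the directive, variable name, and alignment value are arbitrary examples; the omp_allocator_handle_t enumerators are assumed to be declared as in the patch's LIT tests, and the file compiled with -fopenmp -fopenmp-version=52; the two modifiers may appear in either order):

    int a;
    // Request 16-byte alignment from the constant-memory allocator for the
    // privatized copy of 'a'.
    #pragma omp scope private(a) allocate(align(16), allocator(omp_const_mem_alloc): a)
    a++;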
Improvements ^^^^^^^^^^^^ diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h index d2f5267e4da5e..b9088eff3bb52 100644 --- a/clang/include/clang/AST/OpenMPClause.h +++ b/clang/include/clang/AST/OpenMPClause.h @@ -498,6 +498,9 @@ class OMPAllocateClause final /// Allocator specified in the clause, or 'nullptr' if the default one is /// used. Expr *Allocator = nullptr; + /// Alignment specified in the clause, or 'nullptr' if the default one is + /// used. + Expr *Alignment = nullptr; /// Position of the ':' delimiter in the clause; SourceLocation ColonLoc; /// Modifier of 'allocate' clause. @@ -505,6 +508,41 @@ class OMPAllocateClause final /// Location of allocator modifier if any. SourceLocation AllocatorModifierLoc; + // ---------------------------------------------------------------------------- + + /// Modifiers for 'allocate' clause. + enum { FIRST, SECOND, NUM_MODIFIERS }; + OpenMPAllocateClauseModifier Modifiers[NUM_MODIFIERS]; + + /// Locations of modifiers. + SourceLocation ModifiersLoc[NUM_MODIFIERS]; + + /// Set the first allocate modifier. + /// + /// \param M Allocate modifier. + void setFirstAllocateModifier(OpenMPAllocateClauseModifier M) { + Modifiers[FIRST] = M; + } + + /// Set the second allocate modifier. + /// + /// \param M Allocate modifier. + void setSecondAllocateModifier(OpenMPAllocateClauseModifier M) { + Modifiers[SECOND] = M; + } + + /// Set location of the first allocate modifier. + void setFirstAllocateModifierLoc(SourceLocation Loc) { + ModifiersLoc[FIRST] = Loc; + } + + /// Set location of the second allocate modifier. + void setSecondAllocateModifierLoc(SourceLocation Loc) { + ModifiersLoc[SECOND] = Loc; + } + + // ---------------------------------------------------------------------------- + /// Build clause with number of variables \a N. /// /// \param StartLoc Starting location of the clause. @@ -514,15 +552,20 @@ class OMPAllocateClause final /// \param EndLoc Ending location of the clause. /// \param N Number of the variables in the clause. OMPAllocateClause(SourceLocation StartLoc, SourceLocation LParenLoc, - Expr *Allocator, SourceLocation ColonLoc, - OpenMPAllocateClauseModifier AllocatorModifier, - SourceLocation AllocatorModifierLoc, SourceLocation EndLoc, + Expr *Allocator, Expr *Alignment, SourceLocation ColonLoc, + OpenMPAllocateClauseModifier Modifier1, + SourceLocation Modifier1Loc, + OpenMPAllocateClauseModifier Modifier2, + SourceLocation Modifier2Loc, SourceLocation EndLoc, unsigned N) : OMPVarListClause(llvm::omp::OMPC_allocate, StartLoc, LParenLoc, EndLoc, N), - Allocator(Allocator), ColonLoc(ColonLoc), - AllocatorModifier(AllocatorModifier), - AllocatorModifierLoc(AllocatorModifierLoc) {} + Allocator(Allocator), Alignment(Alignment), ColonLoc(ColonLoc) { + Modifiers[FIRST] = Modifier1; + Modifiers[SECOND] = Modifier2; + ModifiersLoc[FIRST] = Modifier1Loc; + ModifiersLoc[SECOND] = Modifier2Loc; + } /// Build an empty clause. /// @@ -530,7 +573,10 @@ class OMPAllocateClause final explicit OMPAllocateClause(unsigned N) : OMPVarListClause(llvm::omp::OMPC_allocate, SourceLocation(), SourceLocation(), - SourceLocation(), N) {} + SourceLocation(), N) { + Modifiers[FIRST] = OMPC_ALLOCATE_unknown; + Modifiers[SECOND] = OMPC_ALLOCATE_unknown; + } /// Sets location of ':' symbol in clause. 
void setColonLoc(SourceLocation CL) { ColonLoc = CL; } @@ -539,6 +585,7 @@ class OMPAllocateClause final void setAllocatorModifier(OpenMPAllocateClauseModifier AM) { AllocatorModifier = AM; } + void setAlignment(Expr *A) { Alignment = A; } public: /// Creates clause with a list of variables \a VL. @@ -554,19 +601,42 @@ class OMPAllocateClause final /// \param VL List of references to the variables. static OMPAllocateClause * Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc, - Expr *Allocator, SourceLocation ColonLoc, - OpenMPAllocateClauseModifier AllocatorModifier, - SourceLocation AllocatorModifierLoc, SourceLocation EndLoc, - ArrayRef VL); + Expr *Allocator, Expr *Alignment, SourceLocation ColonLoc, + OpenMPAllocateClauseModifier Modifier1, SourceLocation Modifier1Loc, + OpenMPAllocateClauseModifier Modifier2, SourceLocation Modifier2Loc, + SourceLocation EndLoc, ArrayRef VL); /// Returns the allocator expression or nullptr, if no allocator is specified. Expr *getAllocator() const { return Allocator; } + /// Returns the alignment expression or nullptr, if no alignment specified. + Expr *getAlignment() const { return Alignment; } + /// Return 'allocate' modifier. OpenMPAllocateClauseModifier getAllocatorModifier() const { return AllocatorModifier; } + /// Get the first modifier of the clause. + OpenMPAllocateClauseModifier getFirstAllocateModifier() const { + return Modifiers[FIRST]; + } + + /// Get location of first modifier of the clause. + SourceLocation getFirstAllocateModifierLoc() const { + return ModifiersLoc[FIRST]; + } + + /// Get the second modifier of the clause. + OpenMPAllocateClauseModifier getSecondAllocateModifier() const { + return Modifiers[SECOND]; + } + + /// Get location of second modifier of the clause. + SourceLocation getSecondAllocateModifierLoc() const { + return ModifiersLoc[SECOND]; + } + /// Returns the location of the ':' delimiter. SourceLocation getColonLoc() const { return ColonLoc; } /// Return the location of the modifier. diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index 86fcae209c40d..3309f59a981fc 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -1658,6 +1658,8 @@ def warn_omp_depend_in_ordered_deprecated : Warning<"'depend' clause for" def warn_omp_invalid_attribute_for_ompx_attributes : Warning<"'ompx_attribute' clause only allows " "'amdgpu_flat_work_group_size', 'amdgpu_waves_per_eu', and 'launch_bounds'; " "%0 is ignored">, InGroup; +def err_omp_duplicate_modifier : Error<"duplicate modifier '%0' in '%1' clause">; +def err_omp_expected_modifier : Error<"expected modifier in '%0' clause">; // Pragma loop support. def err_pragma_loop_missing_argument : Error< diff --git a/clang/include/clang/Basic/OpenMPKinds.def b/clang/include/clang/Basic/OpenMPKinds.def index 3f25e7aafe23b..76a861f416fd5 100644 --- a/clang/include/clang/Basic/OpenMPKinds.def +++ b/clang/include/clang/Basic/OpenMPKinds.def @@ -219,6 +219,7 @@ OPENMP_NUMTASKS_MODIFIER(strict) // Modifiers for 'allocate' clause. OPENMP_ALLOCATE_MODIFIER(allocator) +OPENMP_ALLOCATE_MODIFIER(align) // Modifiers for the 'doacross' clause. 
OPENMP_DOACROSS_MODIFIER(source) diff --git a/clang/include/clang/Basic/OpenMPKinds.h b/clang/include/clang/Basic/OpenMPKinds.h index 900ad6ca6d66f..3e5da2a6abc01 100644 --- a/clang/include/clang/Basic/OpenMPKinds.h +++ b/clang/include/clang/Basic/OpenMPKinds.h @@ -230,6 +230,10 @@ enum OpenMPAllocateClauseModifier { OMPC_ALLOCATE_unknown }; +/// Number of allowed allocate-modifiers. +static constexpr unsigned NumberOfOMPAllocateClauseModifiers = + OMPC_ALLOCATE_unknown; + /// Contains 'interop' data for 'append_args' and 'init' clauses. class Expr; struct OMPInteropInfo final { diff --git a/clang/include/clang/Sema/SemaOpenMP.h b/clang/include/clang/Sema/SemaOpenMP.h index 3d1cc4fab1c10..a056a96f50233 100644 --- a/clang/include/clang/Sema/SemaOpenMP.h +++ b/clang/include/clang/Sema/SemaOpenMP.h @@ -1148,7 +1148,12 @@ class SemaOpenMP : public SemaBase { SourceLocation OmpAllMemoryLoc; SourceLocation StepModifierLoc; /// 'step' modifier location for linear clause - OpenMPAllocateClauseModifier AllocClauseModifier = OMPC_ALLOCATE_unknown; + SmallVector<OpenMPAllocateClauseModifier, NumberOfOMPAllocateClauseModifiers> + AllocClauseModifiers; + SmallVector<SourceLocation, NumberOfOMPAllocateClauseModifiers> + AllocClauseModifiersLoc; + Expr *AllocateAlignment = nullptr; }; OMPClause *ActOnOpenMPVarListClause(OpenMPClauseKind Kind, @@ -1166,10 +1171,15 @@ class SemaOpenMP : public SemaBase { SourceLocation LParenLoc, SourceLocation EndLoc); /// Called on well-formed 'allocate' clause. - OMPClause *ActOnOpenMPAllocateClause( - Expr *Allocator, OpenMPAllocateClauseModifier ACModifier, - ArrayRef<Expr *> VarList, SourceLocation StartLoc, - SourceLocation ColonLoc, SourceLocation LParenLoc, SourceLocation EndLoc); + OMPClause * + ActOnOpenMPAllocateClause(Expr *Allocator, Expr *Alignment, + OpenMPAllocateClauseModifier FirstModifier, + SourceLocation FirstModifierLoc, + OpenMPAllocateClauseModifier SecondModifier, + SourceLocation SecondModifierLoc, + ArrayRef<Expr *> VarList, SourceLocation StartLoc, + SourceLocation ColonLoc, SourceLocation LParenLoc, + SourceLocation EndLoc); /// Called on well-formed 'private' clause. OMPClause *ActOnOpenMPPrivateClause(ArrayRef<Expr *> VarList, SourceLocation StartLoc, diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp index 4246ba95d827f..532933d6183ce 100644 --- a/clang/lib/AST/OpenMPClause.cpp +++ b/clang/lib/AST/OpenMPClause.cpp @@ -1019,19 +1019,18 @@ OMPPartialClause *OMPPartialClause::CreateEmpty(const ASTContext &C) { return new (C) OMPPartialClause(); } -OMPAllocateClause * -OMPAllocateClause::Create(const ASTContext &C, SourceLocation StartLoc, - SourceLocation LParenLoc, Expr *Allocator, - SourceLocation ColonLoc, - OpenMPAllocateClauseModifier AllocatorModifier, - SourceLocation AllocatorModifierLoc, - SourceLocation EndLoc, ArrayRef<Expr *> VL) { +OMPAllocateClause *OMPAllocateClause::Create( + const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc, + Expr *Allocator, Expr *Alignment, SourceLocation ColonLoc, + OpenMPAllocateClauseModifier Modifier1, SourceLocation Modifier1Loc, + OpenMPAllocateClauseModifier Modifier2, SourceLocation Modifier2Loc, + SourceLocation EndLoc, ArrayRef<Expr *> VL) { // Allocate space for private variables and initializer expressions. 
void *Mem = C.Allocate(totalSizeToAlloc<Expr *>(VL.size())); auto *Clause = new (Mem) OMPAllocateClause( - StartLoc, LParenLoc, Allocator, ColonLoc, AllocatorModifier, - AllocatorModifierLoc, EndLoc, VL.size()); + StartLoc, LParenLoc, Allocator, Alignment, ColonLoc, Modifier1, + Modifier1Loc, Modifier2, Modifier2Loc, EndLoc, VL.size()); Clause->setVarRefs(VL); return Clause; @@ -2245,21 +2244,48 @@ void OMPClausePrinter::VisitOMPClauseList(T *Node, char StartSym) { void OMPClausePrinter::VisitOMPAllocateClause(OMPAllocateClause *Node) { if (Node->varlist_empty()) return; + + Expr *FirstModifier = nullptr; + Expr *SecondModifier = nullptr; + auto FirstAllocMod = Node->getFirstAllocateModifier(); + auto SecondAllocMod = Node->getSecondAllocateModifier(); + bool FirstUnknown = FirstAllocMod == OMPC_ALLOCATE_unknown; + bool SecondUnknown = SecondAllocMod == OMPC_ALLOCATE_unknown; + if (FirstAllocMod == OMPC_ALLOCATE_allocator || + (FirstAllocMod == OMPC_ALLOCATE_unknown && Node->getAllocator())) { + FirstModifier = Node->getAllocator(); + SecondModifier = Node->getAlignment(); + } else { + FirstModifier = Node->getAlignment(); + SecondModifier = Node->getAllocator(); + } + OS << "allocate"; - OpenMPAllocateClauseModifier Modifier = Node->getAllocatorModifier(); - if (Expr *Allocator = Node->getAllocator()) { + // If we have any explicit modifiers. + if (FirstModifier) { OS << "("; - if (Modifier == OMPC_ALLOCATE_allocator) { - OS << getOpenMPSimpleClauseTypeName(Node->getClauseKind(), Modifier); + if (!FirstUnknown) { + OS << getOpenMPSimpleClauseTypeName(Node->getClauseKind(), FirstAllocMod); OS << "("; - Allocator->printPretty(OS, nullptr, Policy, 0); + } + FirstModifier->printPretty(OS, nullptr, Policy, 0); + if (!FirstUnknown) OS << ")"; - } else { - Allocator->printPretty(OS, nullptr, Policy, 0); + if (SecondModifier) { + OS << ", "; + if (!SecondUnknown) { + OS << getOpenMPSimpleClauseTypeName(Node->getClauseKind(), + SecondAllocMod); + OS << "("; + } + SecondModifier->printPretty(OS, nullptr, Policy, 0); + if (!SecondUnknown) + OS << ")"; } OS << ":"; VisitOMPClauseList(Node, ' '); } else { + // No modifiers. Just print the variable list. VisitOMPClauseList(Node, '('); } OS << ")"; diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp index b4e973bc84a7b..89b83938f352d 100644 --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -4530,32 +4530,88 @@ static bool parseStepSize(Parser &P, SemaOpenMP::OpenMPVarListDataTy &Data, } /// Parse 'allocate' clause modifiers. -/// If allocator-modifier exists, return an expression for it and set -/// Data field noting modifier was specified. -/// +/// If an allocator-modifier exists, return an expression for it. For both +/// allocator and align modifiers, set Data fields as appropriate. 
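+/// For illustration, the shapes this parser is expected to accept (a sketch
+/// derived from the cases handled below, not normative spec wording):
+///   allocate(allocator(allocator-expr) : list)
+///   allocate(align(alignment-expr) : list)
+///   allocate(align(alignment-expr), allocator(allocator-expr) : list)
+/// with the two complex modifiers accepted in either order, each at most once.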
static ExprResult parseOpenMPAllocateClauseModifiers(Parser &P, OpenMPClauseKind Kind, SemaOpenMP::OpenMPVarListDataTy &Data) { const Token &Tok = P.getCurToken(); Preprocessor &PP = P.getPreprocessor(); ExprResult Tail; - auto Modifier = static_cast<OpenMPAllocateClauseModifier>( + ExprResult Val; + SourceLocation RLoc; + bool AllocatorSeen = false; + bool AlignSeen = false; + SourceLocation CurrentModifierLoc = Tok.getLocation(); + auto CurrentModifier = static_cast<OpenMPAllocateClauseModifier>( getOpenMPSimpleClauseType(Kind, PP.getSpelling(Tok), P.getLangOpts())); - if (Modifier == OMPC_ALLOCATE_allocator) { - Data.AllocClauseModifier = Modifier; + + // Modifiers did not exist before 5.1 + if (P.getLangOpts().OpenMP < 51) + return P.ParseAssignmentExpression(); + + // An allocator-simple-modifier is exclusive and must appear alone. See + // OpenMP 6.0 spec, pg. 313, L1 on Modifiers, as well as Table 5.1, pg. 50, + // description of "exclusive" property. If we don't recognize an explicit + // simple-/complex- modifier, assume we're looking at an expression + // representing the allocator and consider ourselves done. + if (CurrentModifier == OMPC_ALLOCATE_unknown) + return P.ParseAssignmentExpression(); + + do { P.ConsumeToken(); - BalancedDelimiterTracker AllocateT(P, tok::l_paren, - tok::annot_pragma_openmp_end); if (Tok.is(tok::l_paren)) { - AllocateT.consumeOpen(); - Tail = P.ParseAssignmentExpression(); - AllocateT.consumeClose(); + switch (CurrentModifier) { + case OMPC_ALLOCATE_allocator: { + if (AllocatorSeen) { + P.Diag(Tok, diag::err_omp_duplicate_modifier) + << getOpenMPSimpleClauseTypeName(OMPC_allocate, CurrentModifier) + << getOpenMPClauseName(Kind); + } else { + Data.AllocClauseModifiers.push_back(CurrentModifier); + Data.AllocClauseModifiersLoc.push_back(CurrentModifierLoc); + } + BalancedDelimiterTracker AllocateT(P, tok::l_paren, + tok::annot_pragma_openmp_end); + AllocateT.consumeOpen(); + Tail = P.ParseAssignmentExpression(); + AllocateT.consumeClose(); + AllocatorSeen = true; + break; + } + case OMPC_ALLOCATE_align: { + if (AlignSeen) { + P.Diag(Tok, diag::err_omp_duplicate_modifier) + << getOpenMPSimpleClauseTypeName(OMPC_allocate, CurrentModifier) + << getOpenMPClauseName(Kind); + } else { + Data.AllocClauseModifiers.push_back(CurrentModifier); + Data.AllocClauseModifiersLoc.push_back(CurrentModifierLoc); + } + Val = P.ParseOpenMPParensExpr(getOpenMPClauseName(Kind), RLoc); + if (Val.isUsable()) + Data.AllocateAlignment = Val.get(); + AlignSeen = true; + break; + } + default: + llvm_unreachable("Unexpected allocate modifier"); + } } else { P.Diag(Tok, diag::err_expected) << tok::l_paren; } - } else { - Tail = P.ParseAssignmentExpression(); - } + if (Tok.isNot(tok::comma)) + break; + P.ConsumeToken(); + CurrentModifierLoc = Tok.getLocation(); + CurrentModifier = static_cast<OpenMPAllocateClauseModifier>( + getOpenMPSimpleClauseType(Kind, PP.getSpelling(Tok), P.getLangOpts())); + // A modifier followed by a comma implies another modifier. + if (CurrentModifier == OMPC_ALLOCATE_unknown) { + P.Diag(Tok, diag::err_omp_expected_modifier) << getOpenMPClauseName(Kind); + break; + } + } while (!AllocatorSeen || !AlignSeen); return Tail; } @@ -4832,7 +4888,8 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind, } else if (Kind == OMPC_allocate || (Kind == OMPC_affinity && Tok.is(tok::identifier) && PP.getSpelling(Tok) == "iterator")) { - // Handle optional allocator expression followed by colon delimiter. + // Handle optional allocator and align modifiers followed by colon + // delimiter. 
ColonProtectionRAIIObject ColonRAII(*this); TentativeParsingAction TPA(*this); // OpenMP 5.0, 2.10.1, task Construct. @@ -4849,19 +4906,18 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind, Tail = Actions.CorrectDelayedTyposInExpr(Tail); Tail = Actions.ActOnFinishFullExpr(Tail.get(), T.getOpenLocation(), /*DiscardedValue=*/false); - if (Tail.isUsable()) { + if (Tail.isUsable() || Data.AllocateAlignment) { if (Tok.is(tok::colon)) { - Data.DepModOrTailExpr = Tail.get(); + Data.DepModOrTailExpr = Tail.isUsable() ? Tail.get() : nullptr; Data.ColonLoc = ConsumeToken(); TPA.Commit(); } else { // Colon not found, parse only list of variables. TPA.Revert(); - if (Kind == OMPC_allocate && - Data.AllocClauseModifier == OMPC_ALLOCATE_allocator) { + if (Kind == OMPC_allocate && Data.AllocClauseModifiers.size()) { SkipUntil(tok::r_paren, tok::annot_pragma_openmp_end, StopBeforeMatch); - Diag(Tok, diag::err_modifier_expected_colon) << "allocator"; + Diag(Tok, diag::err_modifier_expected_colon) << "allocate clause"; } } } else { diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 66ff92f554fc4..b83b2b12f4a23 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -5320,6 +5320,8 @@ static void checkAllocateClauses(Sema &S, DSAStackTy *Stack, Expr *SimpleRefExpr = E; auto Res = getPrivateItem(S, SimpleRefExpr, ELoc, ERange); ValueDecl *VD = Res.first; + if (!VD) + continue; DSAStackTy::DSAVarData Data = Stack->getTopDSA(VD, /*FromParent=*/false); if (!isOpenMPPrivate(Data.CKind)) { S.Diag(E->getExprLoc(), @@ -5330,10 +5332,8 @@ static void checkAllocateClauses(Sema &S, DSAStackTy *Stack, if (checkPreviousOMPAllocateAttribute(S, Stack, E, PrivateVD, AllocatorKind, AC->getAllocator())) continue; - // Placeholder until allocate clause supports align modifier. 
- Expr *Alignment = nullptr; applyOMPAllocateAttribute(S, PrivateVD, AllocatorKind, AC->getAllocator(), - Alignment, E->getSourceRange()); + AC->getAlignment(), E->getSourceRange()); } } } @@ -15617,7 +15617,9 @@ ExprResult SemaOpenMP::VerifyPositiveIntegerConstantInClause( << E->getSourceRange(); return ExprError(); } - if ((CKind == OMPC_aligned || CKind == OMPC_align) && !Result.isPowerOf2()) { + if ((CKind == OMPC_aligned || CKind == OMPC_align || + CKind == OMPC_allocate) && + !Result.isPowerOf2()) { Diag(E->getExprLoc(), diag::warn_omp_alignment_not_power_of_two) << E->getSourceRange(); return ExprError(); @@ -17153,11 +17155,26 @@ OMPClause *SemaOpenMP::ActOnOpenMPVarListClause(OpenMPClauseKind Kind, case OMPC_has_device_addr: Res = ActOnOpenMPHasDeviceAddrClause(VarList, Locs); break; - case OMPC_allocate: - Res = ActOnOpenMPAllocateClause(Data.DepModOrTailExpr, - Data.AllocClauseModifier, VarList, StartLoc, - LParenLoc, ColonLoc, EndLoc); + case OMPC_allocate: { + OpenMPAllocateClauseModifier Modifier1 = OMPC_ALLOCATE_unknown; + OpenMPAllocateClauseModifier Modifier2 = OMPC_ALLOCATE_unknown; + SourceLocation Modifier1Loc, Modifier2Loc; + if (!Data.AllocClauseModifiers.empty()) { + assert(Data.AllocClauseModifiers.size() <= 2 && + "More allocate modifiers than expected"); + Modifier1 = Data.AllocClauseModifiers[0]; + Modifier1Loc = Data.AllocClauseModifiersLoc[0]; + if (Data.AllocClauseModifiers.size() == 2) { + Modifier2 = Data.AllocClauseModifiers[1]; + Modifier2Loc = Data.AllocClauseModifiersLoc[1]; + } + } + Res = ActOnOpenMPAllocateClause( + Data.DepModOrTailExpr, Data.AllocateAlignment, Modifier1, Modifier1Loc, + Modifier2, Modifier2Loc, VarList, StartLoc, LParenLoc, ColonLoc, + EndLoc); break; + } case OMPC_nontemporal: Res = ActOnOpenMPNontemporalClause(VarList, StartLoc, LParenLoc, EndLoc); break; @@ -23163,32 +23180,37 @@ SemaOpenMP::ActOnOpenMPHasDeviceAddrClause(ArrayRef VarList, } OMPClause *SemaOpenMP::ActOnOpenMPAllocateClause( - Expr *Allocator, OpenMPAllocateClauseModifier AllocClauseModifier, - ArrayRef VarList, SourceLocation StartLoc, SourceLocation LParenLoc, - SourceLocation ColonLoc, SourceLocation EndLoc) { - + Expr *Allocator, Expr *Alignment, + OpenMPAllocateClauseModifier FirstAllocateModifier, + SourceLocation FirstAllocateModifierLoc, + OpenMPAllocateClauseModifier SecondAllocateModifier, + SourceLocation SecondAllocateModifierLoc, ArrayRef VarList, + SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation ColonLoc, + SourceLocation EndLoc) { if (Allocator) { // Allocator expression is dependent - skip it for now and build the // allocator when instantiated. - if (Allocator->isTypeDependent() || Allocator->isValueDependent() || - Allocator->isInstantiationDependent() || - Allocator->containsUnexpandedParameterPack()) - return nullptr; - // OpenMP [2.11.4 allocate Clause, Description] - // allocator is an expression of omp_allocator_handle_t type. - if (!findOMPAllocatorHandleT(SemaRef, Allocator->getExprLoc(), DSAStack)) - return nullptr; + bool AllocDependent = + (Allocator->isTypeDependent() || Allocator->isValueDependent() || + Allocator->isInstantiationDependent() || + Allocator->containsUnexpandedParameterPack()); + if (!AllocDependent) { + // OpenMP [2.11.4 allocate Clause, Description] + // allocator is an expression of omp_allocator_handle_t type. 
+ if (!findOMPAllocatorHandleT(SemaRef, Allocator->getExprLoc(), DSAStack)) + return nullptr; - ExprResult AllocatorRes = SemaRef.DefaultLvalueConversion(Allocator); - if (AllocatorRes.isInvalid()) - return nullptr; - AllocatorRes = SemaRef.PerformImplicitConversion( - AllocatorRes.get(), DSAStack->getOMPAllocatorHandleT(), - AssignmentAction::Initializing, - /*AllowExplicit=*/true); - if (AllocatorRes.isInvalid()) - return nullptr; - Allocator = AllocatorRes.get(); + ExprResult AllocatorRes = SemaRef.DefaultLvalueConversion(Allocator); + if (AllocatorRes.isInvalid()) + return nullptr; + AllocatorRes = SemaRef.PerformImplicitConversion( + AllocatorRes.get(), DSAStack->getOMPAllocatorHandleT(), + AssignmentAction::Initializing, + /*AllowExplicit=*/true); + if (AllocatorRes.isInvalid()) + return nullptr; + Allocator = AllocatorRes.isUsable() ? AllocatorRes.get() : nullptr; + } } else { // OpenMP 5.0, 2.11.4 allocate Clause, Restrictions. // allocate clauses that appear on a target construct or on constructs in a @@ -23199,6 +23221,17 @@ OMPClause *SemaOpenMP::ActOnOpenMPAllocateClause( !DSAStack->hasRequiresDeclWithClause()) SemaRef.targetDiag(StartLoc, diag::err_expected_allocator_expression); } + if (Alignment) { + bool AlignmentDependent = Alignment->isTypeDependent() || + Alignment->isValueDependent() || + Alignment->isInstantiationDependent() || + Alignment->containsUnexpandedParameterPack(); + if (!AlignmentDependent) { + ExprResult AlignResult = + VerifyPositiveIntegerConstantInClause(Alignment, OMPC_allocate); + Alignment = AlignResult.isUsable() ? AlignResult.get() : nullptr; + } + } // Analyze and build list of variables. SmallVector Vars; for (Expr *RefExpr : VarList) { @@ -23230,11 +23263,10 @@ OMPClause *SemaOpenMP::ActOnOpenMPAllocateClause( if (Allocator) DSAStack->addInnerAllocatorExpr(Allocator); - OpenMPAllocateClauseModifier AllocatorModifier = AllocClauseModifier; - SourceLocation AllocatorModifierLoc; - return OMPAllocateClause::Create(getASTContext(), StartLoc, LParenLoc, - Allocator, ColonLoc, AllocatorModifier, - AllocatorModifierLoc, EndLoc, Vars); + return OMPAllocateClause::Create( + getASTContext(), StartLoc, LParenLoc, Allocator, Alignment, ColonLoc, + FirstAllocateModifier, FirstAllocateModifierLoc, SecondAllocateModifier, + SecondAllocateModifierLoc, EndLoc, Vars); } OMPClause *SemaOpenMP::ActOnOpenMPNontemporalClause(ArrayRef VarList, diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 4a3c739ecbeab..4fae2ccb5f6d0 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -2075,15 +2075,18 @@ class TreeTransform { /// /// By default, performs semantic analysis to build the new OpenMP clause. /// Subclasses may override this routine to provide different behavior. 
- OMPClause *RebuildOMPAllocateClause(Expr *Allocate, - OpenMPAllocateClauseModifier ACModifier, - ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation ColonLoc, - SourceLocation EndLoc) { + OMPClause * + RebuildOMPAllocateClause(Expr *Allocate, Expr *Alignment, + OpenMPAllocateClauseModifier FirstModifier, + SourceLocation FirstModifierLoc, + OpenMPAllocateClauseModifier SecondModifier, + SourceLocation SecondModifierLoc, + ArrayRef VarList, SourceLocation StartLoc, + SourceLocation LParenLoc, SourceLocation ColonLoc, + SourceLocation EndLoc) { return getSema().OpenMP().ActOnOpenMPAllocateClause( - Allocate, ACModifier, VarList, StartLoc, LParenLoc, ColonLoc, EndLoc); + Allocate, Alignment, FirstModifier, FirstModifierLoc, SecondModifier, + SecondModifierLoc, VarList, StartLoc, LParenLoc, ColonLoc, EndLoc); } /// Build a new OpenMP 'num_teams' clause. @@ -11224,6 +11227,13 @@ TreeTransform::TransformOMPAllocateClause(OMPAllocateClause *C) { return nullptr; Allocator = AllocatorRes.get(); } + Expr *Alignment = C->getAlignment(); + if (Alignment) { + ExprResult AlignmentRes = getDerived().TransformExpr(Alignment); + if (AlignmentRes.isInvalid()) + return nullptr; + Alignment = AlignmentRes.get(); + } llvm::SmallVector Vars; Vars.reserve(C->varlist_size()); for (auto *VE : C->varlist()) { @@ -11233,7 +11243,9 @@ TreeTransform::TransformOMPAllocateClause(OMPAllocateClause *C) { Vars.push_back(EVar.get()); } return getDerived().RebuildOMPAllocateClause( - Allocator, C->getAllocatorModifier(), Vars, C->getBeginLoc(), + Allocator, Alignment, C->getFirstAllocateModifier(), + C->getFirstAllocateModifierLoc(), C->getSecondAllocateModifier(), + C->getSecondAllocateModifierLoc(), Vars, C->getBeginLoc(), C->getLParenLoc(), C->getColonLoc(), C->getEndLoc()); } diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index b53f99732cacc..7361cace49dd7 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -11824,10 +11824,12 @@ void OMPClauseReader::VisitOMPMapClause(OMPMapClause *C) { } void OMPClauseReader::VisitOMPAllocateClause(OMPAllocateClause *C) { - C->setAllocatorModifier(Record.readEnum()); + C->setFirstAllocateModifier(Record.readEnum()); + C->setSecondAllocateModifier(Record.readEnum()); C->setLParenLoc(Record.readSourceLocation()); C->setColonLoc(Record.readSourceLocation()); C->setAllocator(Record.readSubExpr()); + C->setAlignment(Record.readSubExpr()); unsigned NumVars = C->varlist_size(); SmallVector Vars; Vars.reserve(NumVars); diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 39004fd4d4c37..345d496a93312 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -7924,10 +7924,12 @@ void OMPClauseWriter::VisitOMPMapClause(OMPMapClause *C) { void OMPClauseWriter::VisitOMPAllocateClause(OMPAllocateClause *C) { Record.push_back(C->varlist_size()); - Record.writeEnum(C->getAllocatorModifier()); + Record.writeEnum(C->getFirstAllocateModifier()); + Record.writeEnum(C->getSecondAllocateModifier()); Record.AddSourceLocation(C->getLParenLoc()); Record.AddSourceLocation(C->getColonLoc()); Record.AddStmt(C->getAllocator()); + Record.AddStmt(C->getAlignment()); for (auto *VE : C->varlist()) Record.AddStmt(VE); } diff --git a/clang/test/OpenMP/allocate_allocator_modifier_codegen.cpp b/clang/test/OpenMP/allocate_allocator_modifier_codegen.cpp deleted file mode 100644 index 1bf927ebb2eb7..0000000000000 
--- a/clang/test/OpenMP/allocate_allocator_modifier_codegen.cpp +++ /dev/null @@ -1,255 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --version 5 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix CHECK-TLS %s - -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// expected-no-diagnostics - -#ifndef HEADER -#define HEADER - -enum omp_allocator_handle_t { - omp_null_allocator = 0, - omp_default_mem_alloc = 1, - omp_large_cap_mem_alloc = 2, - omp_const_mem_alloc = 3, - omp_high_bw_mem_alloc = 4, - omp_low_lat_mem_alloc = 5, - omp_cgroup_mem_alloc = 6, - omp_pteam_mem_alloc = 7, - omp_thread_mem_alloc = 8, - KMP_ALLOCATOR_MAX_HANDLE = __UINTPTR_MAX__ -}; - -template -struct ST { - static T m; -}; - -template T foo() { - T v; - #pragma omp scope private(v) allocate(allocator(TY):v) - v = ST::m; - return v; -} - -namespace ns { -int a; -} - -int main() { - static int a; - static int temp; - #pragma omp scope private(ns::a) allocate(allocator(omp_pteam_mem_alloc):ns::a) - ns::a++; - - #pragma omp scope private(a) allocate(allocator(omp_thread_mem_alloc):a) - a = 2; - double b = 3; - #pragma omp scope private(temp) allocate(temp) - temp += foo(); - return temp+ns::a; -} - -extern template int ST::m; - -int b; - -void bar(int a, float &z) { - #pragma omp scope private(a,z) allocate(allocator(omp_default_mem_alloc):a,z) - a += b; -} -#endif -// CHECK-LABEL: define dso_local noundef i32 @main( -// CHECK-SAME: ) #[[ATTR0:[0-9]+]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[B:%.*]] = alloca double, align 8 -// CHECK-NEXT: [[TMP0:%.*]] = call i32 
@__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]]) -// CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4 -// CHECK-NEXT: [[DOTA__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 7 to ptr)) -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTA__VOID_ADDR]], align 4 -// CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 -// CHECK-NEXT: store i32 [[INC]], ptr [[DOTA__VOID_ADDR]], align 4 -// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR]], ptr inttoptr (i64 7 to ptr)) -// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2:[0-9]+]], i32 [[TMP0]]) -// CHECK-NEXT: [[DOTA__VOID_ADDR1:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 8 to ptr)) -// CHECK-NEXT: store i32 2, ptr [[DOTA__VOID_ADDR1]], align 4 -// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR1]], ptr inttoptr (i64 8 to ptr)) -// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) -// CHECK-NEXT: store double 3.000000e+00, ptr [[B]], align 8 -// CHECK-NEXT: [[DOTTEMP__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr null) -// CHECK-NEXT: [[CALL:%.*]] = call noundef i32 @_Z3fooIiL22omp_allocator_handle_t6EET_v() -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTTEMP__VOID_ADDR]], align 4 -// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[CALL]] -// CHECK-NEXT: store i32 [[ADD]], ptr [[DOTTEMP__VOID_ADDR]], align 4 -// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTTEMP__VOID_ADDR]], ptr null) -// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr @_ZZ4mainE4temp, align 4 -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr @_ZN2ns1aE, align 4 -// CHECK-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP3]], [[TMP4]] -// CHECK-NEXT: ret i32 [[ADD2]] -// -// -// CHECK-LABEL: define linkonce_odr noundef i32 @_Z3fooIiL22omp_allocator_handle_t6EET_v( -// CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[V:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[V1:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @_ZN2STIiE1mE, align 4 -// CHECK-NEXT: store i32 [[TMP1]], ptr [[V1]], align 4 -// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[V]], align 4 -// CHECK-NEXT: ret i32 [[TMP2]] -// -// -// CHECK-LABEL: define dso_local void @_Z3bariRf( -// CHECK-SAME: i32 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[Z:%.*]]) #[[ATTR3]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[Z_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: [[TMP:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) -// CHECK-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 -// CHECK-NEXT: store ptr [[Z]], ptr [[Z_ADDR]], align 8 -// CHECK-NEXT: [[DOTA__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 1 to ptr)) -// CHECK-NEXT: [[DOTZ__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 1 to ptr)) -// CHECK-NEXT: store ptr [[DOTZ__VOID_ADDR]], ptr [[TMP]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @b, align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTA__VOID_ADDR]], align 4 -// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP1]] -// CHECK-NEXT: store i32 [[ADD]], ptr [[DOTA__VOID_ADDR]], 
align 4 -// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTZ__VOID_ADDR]], ptr inttoptr (i64 1 to ptr)) -// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR]], ptr inttoptr (i64 1 to ptr)) -// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) -// CHECK-NEXT: ret void -// -// -// CHECK-TLS-LABEL: define dso_local noundef i32 @main( -// CHECK-TLS-SAME: ) #[[ATTR0:[0-9]+]] { -// CHECK-TLS-NEXT: [[ENTRY:.*:]] -// CHECK-TLS-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 -// CHECK-TLS-NEXT: [[B:%.*]] = alloca double, align 8 -// CHECK-TLS-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]]) -// CHECK-TLS-NEXT: store i32 0, ptr [[RETVAL]], align 4 -// CHECK-TLS-NEXT: [[DOTA__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 7 to ptr)) -// CHECK-TLS-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTA__VOID_ADDR]], align 4 -// CHECK-TLS-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 -// CHECK-TLS-NEXT: store i32 [[INC]], ptr [[DOTA__VOID_ADDR]], align 4 -// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR]], ptr inttoptr (i64 7 to ptr)) -// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2:[0-9]+]], i32 [[TMP0]]) -// CHECK-TLS-NEXT: [[DOTA__VOID_ADDR1:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 8 to ptr)) -// CHECK-TLS-NEXT: store i32 2, ptr [[DOTA__VOID_ADDR1]], align 4 -// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR1]], ptr inttoptr (i64 8 to ptr)) -// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) -// CHECK-TLS-NEXT: store double 3.000000e+00, ptr [[B]], align 8 -// CHECK-TLS-NEXT: [[DOTTEMP__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr null) -// CHECK-TLS-NEXT: [[CALL:%.*]] = call noundef i32 @_Z3fooIiL22omp_allocator_handle_t6EET_v() -// CHECK-TLS-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTTEMP__VOID_ADDR]], align 4 -// CHECK-TLS-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[CALL]] -// CHECK-TLS-NEXT: store i32 [[ADD]], ptr [[DOTTEMP__VOID_ADDR]], align 4 -// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTTEMP__VOID_ADDR]], ptr null) -// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) -// CHECK-TLS-NEXT: [[TMP3:%.*]] = load i32, ptr @_ZZ4mainE4temp, align 4 -// CHECK-TLS-NEXT: [[TMP4:%.*]] = load i32, ptr @_ZN2ns1aE, align 4 -// CHECK-TLS-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP3]], [[TMP4]] -// CHECK-TLS-NEXT: ret i32 [[ADD2]] -// -// -// CHECK-TLS-LABEL: define linkonce_odr noundef i32 @_Z3fooIiL22omp_allocator_handle_t6EET_v( -// CHECK-TLS-SAME: ) #[[ATTR3:[0-9]+]] comdat { -// CHECK-TLS-NEXT: [[ENTRY:.*:]] -// CHECK-TLS-NEXT: [[V:%.*]] = alloca i32, align 4 -// CHECK-TLS-NEXT: [[V1:%.*]] = alloca i32, align 4 -// CHECK-TLS-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) -// CHECK-TLS-NEXT: [[TMP1:%.*]] = load i32, ptr @_ZN2STIiE1mE, align 4 -// CHECK-TLS-NEXT: store i32 [[TMP1]], ptr [[V1]], align 4 -// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) -// CHECK-TLS-NEXT: [[TMP2:%.*]] = load i32, ptr [[V]], align 4 -// CHECK-TLS-NEXT: ret i32 [[TMP2]] -// -// -// CHECK-TLS-LABEL: define dso_local void @_Z3bariRf( -// CHECK-TLS-SAME: i32 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[Z:%.*]]) #[[ATTR3]] { -// CHECK-TLS-NEXT: [[ENTRY:.*:]] -// CHECK-TLS-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 -// CHECK-TLS-NEXT: [[Z_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-TLS-NEXT: 
[[TMP:%.*]] = alloca ptr, align 8 -// CHECK-TLS-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) -// CHECK-TLS-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 -// CHECK-TLS-NEXT: store ptr [[Z]], ptr [[Z_ADDR]], align 8 -// CHECK-TLS-NEXT: [[DOTA__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 1 to ptr)) -// CHECK-TLS-NEXT: [[DOTZ__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 1 to ptr)) -// CHECK-TLS-NEXT: store ptr [[DOTZ__VOID_ADDR]], ptr [[TMP]], align 8 -// CHECK-TLS-NEXT: [[TMP1:%.*]] = load i32, ptr @b, align 4 -// CHECK-TLS-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTA__VOID_ADDR]], align 4 -// CHECK-TLS-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP1]] -// CHECK-TLS-NEXT: store i32 [[ADD]], ptr [[DOTA__VOID_ADDR]], align 4 -// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTZ__VOID_ADDR]], ptr inttoptr (i64 1 to ptr)) -// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR]], ptr inttoptr (i64 1 to ptr)) -// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) -// CHECK-TLS-NEXT: ret void -// -// -// SIMD-ONLY0-LABEL: define dso_local noundef i32 @main( -// SIMD-ONLY0-SAME: ) #[[ATTR0:[0-9]+]] { -// SIMD-ONLY0-NEXT: [[ENTRY:.*:]] -// SIMD-ONLY0-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 -// SIMD-ONLY0-NEXT: [[A:%.*]] = alloca i32, align 4 -// SIMD-ONLY0-NEXT: [[A1:%.*]] = alloca i32, align 4 -// SIMD-ONLY0-NEXT: [[B:%.*]] = alloca double, align 8 -// SIMD-ONLY0-NEXT: [[TEMP:%.*]] = alloca i32, align 4 -// SIMD-ONLY0-NEXT: store i32 0, ptr [[RETVAL]], align 4 -// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 -// SIMD-ONLY0-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -// SIMD-ONLY0-NEXT: store i32 [[INC]], ptr [[A]], align 4 -// SIMD-ONLY0-NEXT: store i32 2, ptr [[A1]], align 4 -// SIMD-ONLY0-NEXT: store double 3.000000e+00, ptr [[B]], align 8 -// SIMD-ONLY0-NEXT: [[CALL:%.*]] = call noundef i32 @_Z3fooIiL22omp_allocator_handle_t6EET_v() -// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load i32, ptr [[TEMP]], align 4 -// SIMD-ONLY0-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CALL]] -// SIMD-ONLY0-NEXT: store i32 [[ADD]], ptr [[TEMP]], align 4 -// SIMD-ONLY0-NEXT: [[TMP2:%.*]] = load i32, ptr @_ZZ4mainE4temp, align 4 -// SIMD-ONLY0-NEXT: [[TMP3:%.*]] = load i32, ptr @_ZN2ns1aE, align 4 -// SIMD-ONLY0-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP2]], [[TMP3]] -// SIMD-ONLY0-NEXT: ret i32 [[ADD2]] -// -// -// SIMD-ONLY0-LABEL: define linkonce_odr noundef i32 @_Z3fooIiL22omp_allocator_handle_t6EET_v( -// SIMD-ONLY0-SAME: ) #[[ATTR1:[0-9]+]] comdat { -// SIMD-ONLY0-NEXT: [[ENTRY:.*:]] -// SIMD-ONLY0-NEXT: [[V:%.*]] = alloca i32, align 4 -// SIMD-ONLY0-NEXT: [[V1:%.*]] = alloca i32, align 4 -// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load i32, ptr @_ZN2STIiE1mE, align 4 -// SIMD-ONLY0-NEXT: store i32 [[TMP0]], ptr [[V1]], align 4 -// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load i32, ptr [[V]], align 4 -// SIMD-ONLY0-NEXT: ret i32 [[TMP1]] -// -// -// SIMD-ONLY0-LABEL: define dso_local void @_Z3bariRf( -// SIMD-ONLY0-SAME: i32 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[Z:%.*]]) #[[ATTR1]] { -// SIMD-ONLY0-NEXT: [[ENTRY:.*:]] -// SIMD-ONLY0-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 -// SIMD-ONLY0-NEXT: [[Z_ADDR:%.*]] = alloca ptr, align 8 -// SIMD-ONLY0-NEXT: [[A1:%.*]] = alloca i32, align 4 -// SIMD-ONLY0-NEXT: [[Z2:%.*]] = alloca float, align 4 -// SIMD-ONLY0-NEXT: [[TMP:%.*]] = alloca ptr, align 8 -// SIMD-ONLY0-NEXT: store i32 [[A]], ptr 
[[A_ADDR]], align 4 -// SIMD-ONLY0-NEXT: store ptr [[Z]], ptr [[Z_ADDR]], align 8 -// SIMD-ONLY0-NEXT: store ptr [[Z2]], ptr [[TMP]], align 8 -// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load i32, ptr @b, align 4 -// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load i32, ptr [[A1]], align 4 -// SIMD-ONLY0-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]] -// SIMD-ONLY0-NEXT: store i32 [[ADD]], ptr [[A1]], align 4 -// SIMD-ONLY0-NEXT: ret void -// diff --git a/clang/test/OpenMP/allocate_allocator_modifier_messages.cpp b/clang/test/OpenMP/allocate_allocator_modifier_messages.cpp deleted file mode 100644 index 160c4996c1219..0000000000000 --- a/clang/test/OpenMP/allocate_allocator_modifier_messages.cpp +++ /dev/null @@ -1,97 +0,0 @@ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 %s - -typedef enum omp_allocator_handle_t { - omp_null_allocator = 0, - omp_default_mem_alloc = 1, - omp_large_cap_mem_alloc = 2, - omp_const_mem_alloc = 3, - omp_high_bw_mem_alloc = 4, - omp_low_lat_mem_alloc = 5, - omp_cgroup_mem_alloc = 6, - omp_pteam_mem_alloc = 7, - omp_thread_mem_alloc = 8, -} omp_allocator_handle_t; - -int myAlloc() { - return 100; -} - -int main() { - int a, b, c; - // expected-error@+4 {{expected '('}} - // expected-error@+3 {{expected expression}} - // expected-error@+2 {{expected ')'}} - // expected-note@+1 {{to match this '('}} - #pragma omp scope private(c) allocate(allocator - // expected-error@+6 {{expected expression}} - // expected-error@+5 {{expected ')'}} - // expected-note@+4 {{to match this '('}} - // expected-error@+3 {{expected expression}} - // expected-error@+2 {{expected ')'}} - // expected-note@+1 {{to match this '('}} - #pragma omp scope private(c) allocate(allocator( - // expected-error@+4 {{expected expression}} - // expected-error@+3 {{expected expression}} - // expected-error@+2 {{expected ')'}} - // expected-note@+1 {{to match this '('}} - #pragma omp scope private(c) allocate(allocator() - // expected-error@+2 {{expected expression}} - // expected-error@+1 {{expected expression}} - #pragma omp scope private(c) allocate(allocator()) - // expected-error@+6 {{expected ')'}} - // expected-note@+5 {{to match this '('}} - // expected-error@+4 {{missing ':' after allocator modifier}} - // expected-error@+3 {{expected expression}} - // expected-error@+2 {{expected ')'}} - // expected-note@+1 {{to match this '('}} - #pragma omp scope private(c) allocate(allocator(omp_default_mem_alloc - // expected-error@+6 {{missing ':' after allocator modifier}} - // expected-error@+5 {{expected expression}} - // expected-error@+4 {{expected ')'}} - // expected-note@+3 {{to match this '('}} - // expected-error@+2 {{expected ')'}} - // expected-note@+1 {{to match this '('}} - #pragma omp scope private(c) allocate(allocator(omp_large_cap_mem_alloc: - // expected-error@+4 {{missing ':' after allocator modifier}} - // expected-error@+3 {{expected expression}} - // expected-error@+2 {{expected ')'}} - // expected-note@+1 {{to match this '('}} - #pragma omp scope private(c) allocate(allocator(omp_const_mem_alloc) - // expected-error@+2 {{missing ':' after allocator modifier}} - // expected-error@+1 {{expected expression}} - #pragma omp scope private(c) allocate(allocator(omp_high_bw_mem_alloc)) - // expected-error@+1 {{expected expression}} - #pragma omp scope private(c) allocate(allocator(omp_low_lat_mem_alloc):) - // expected-error@+6 {{expected ')'}} - // expected-note@+5 {{to match this '('}} - // expected-error@+4 {{missing ':' after allocator modifier}} - // expected-error@+3 {{expected expression}} - 
// expected-error@+2 {{expected ')'}} - // expected-note@+1 {{to match this '('}} - #pragma omp scope private(c) allocate(allocator(omp_cgroup_mem_alloc:) - // expected-error@+4 {{expected ')'}} - // expected-note@+3 {{to match this '('}} - // expected-error@+2 {{missing ':' after allocator modifier}} - // expected-error@+1 {{expected expression}} - #pragma omp scope private(c) allocate(allocator(omp_pteam_mem_alloc:)) - // expected-error@+4 {{expected ')'}} - // expected-note@+3 {{to match this '('}} - // expected-error@+2 {{missing ':' after allocator modifier}} - // expected-error@+1 {{expected expression}} - #pragma omp scope private(c) allocate(allocator(omp_thread_mem_alloc:c)) - // expected-error@+1 {{expected variable name}} - #pragma omp scope private(c) allocate(allocator(omp_const_mem_alloc):1) - // expected-error@+1 {{expected variable name}} - #pragma omp scope private(c) allocate(allocator(omp_const_mem_alloc):-10) - // expected-error@+4 {{expected ',' or ')' in 'allocate' clause}} - // expected-error@+3 {{expected ')'}} - // expected-warning@+2 {{extra tokens at the end of '#pragma omp scope' are ignored}} - // expected-note@+1 {{to match this '('}} - #pragma omp scope private(a,b,c) allocate(allocator(omp_const_mem_alloc):c:b;a) - // expected-error@+1 {{initializing 'const omp_allocator_handle_t' with an expression of incompatible type 'int'}} - #pragma omp scope private(c,a,b) allocate(allocator(myAlloc()):a,b,c) - // expected-error@+2 {{missing ':' after allocator modifier}} - // expected-error@+1 {{expected expression}} - #pragma omp scope private(c) allocate(allocator(omp_default_mem_alloc);c) - ++a; -} diff --git a/clang/test/OpenMP/allocate_allocator_modifier_ast_print.cpp b/clang/test/OpenMP/allocate_modifiers_ast_print.cpp similarity index 51% rename from clang/test/OpenMP/allocate_allocator_modifier_ast_print.cpp rename to clang/test/OpenMP/allocate_modifiers_ast_print.cpp index 15f3f1dd9bbb9..436647be75da3 100644 --- a/clang/test/OpenMP/allocate_allocator_modifier_ast_print.cpp +++ b/clang/test/OpenMP/allocate_modifiers_ast_print.cpp @@ -41,6 +41,11 @@ int main() { #pragma omp scope private(c,a,b) allocate(allocator(myAlloc()):a,b,c) c++; #pragma omp scope private(c,a,b,d) allocate(myAlloc():a,b,c,d) + a++; + #pragma omp scope private(a,b) allocate(align(2), allocator(omp_const_mem_alloc):a,b) + b++; + #pragma omp scope private(c,a,b) allocate(allocator(myAlloc()), align(8) :a,b,c) + c++; // DUMP: FunctionDecl {{.*}} // DUMP: DeclRefExpr {{.*}}'omp_allocator_handle_t' EnumConstant {{.*}}'omp_large_cap_mem_alloc' 'omp_allocator_handle_t' // DUMP: FunctionDecl {{.*}} @@ -76,11 +81,81 @@ int main() { // DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'b' 'int' // DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'c' 'int' // DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'d' 'int' +// DUMP: OMPScopeDirective {{.*}} +// DUMP: OMPPrivateClause {{.*}} +// DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'a' 'int' +// DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'b' 'int' +// DUMP: OMPAllocateClause {{.*}} +// DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'a' 'int' +// DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'b' 'int' +// DUMP: OMPScopeDirective {{.*}} +// DUMP: OMPPrivateClause {{.*}} +// DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'c' 'int' +// DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'a' 'int' +// DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'b' 'int' +// DUMP: OMPAllocateClause {{.*}} +// DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'a' 'int' +// 
DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'b' 'int' +// DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'c' 'int' // PRINT: #pragma omp scope private(a) allocate(omp_const_mem_alloc: a) // PRINT: #pragma omp scope private(a,b) allocate(allocator(omp_const_mem_alloc): a,b) // PRINT: #pragma omp scope private(c,a,b) allocate(allocator(myAlloc()): a,b,c) // PRINT: #pragma omp scope private(c,a,b,d) allocate(myAlloc(): a,b,c,d) - d++; +// PRINT: #pragma omp scope private(a,b) allocate(align(2), allocator(omp_const_mem_alloc): a,b) +// PRINT: #pragma omp scope private(c,a,b) allocate(allocator(myAlloc()), align(8): a,b,c) return a+b+c+d; } + +template +void templated_func(T n) { + int a, b; + T mem = n; + #pragma omp scope private(mem,a,b) allocate(allocator(n),align(al):mem,a,b) + a += b; + #pragma omp scope allocate(allocator(n),align(al):mem,a,b) private(mem,a,b) + a += b; +} + +void template_inst(int n) { + templated_func(omp_const_mem_alloc); + return; +} +// DUMP: FunctionTemplateDecl{{.*}}templated_func +// DUMP: FunctionDecl{{.*}}templated_func 'void (T)' +// DUMP: OMPScopeDirective +// DUMP: OMPPrivateClause +// DUMP: OMPAllocateClause +// DUMP: DeclRefExpr{{.*}}'T' lvalue Var{{.*}}'mem' 'T' +// DUMP: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'a' 'int' +// DUMP: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'b' 'int' +// DUMP: OMPScopeDirective +// DUMP: OMPAllocateClause +// DUMP: DeclRefExpr{{.*}}'T' lvalue Var{{.*}}'mem' 'T' +// DUMP: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'a' 'int' +// DUMP: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'b' 'int' +// DUMP: OMPPrivateClause + +// DUMP: FunctionDecl{{.*}}used templated_func 'void (omp_allocator_handle_t)' implicit_instantiation +// DUMP: TemplateArgument type 'omp_allocator_handle_t' +// DUMP: EnumType{{.*}}'omp_allocator_handle_t' +// DUMP: Enum{{.*}}'omp_allocator_handle_t' +// DUMP: TemplateArgument integral '4U' + +// DUMP: OMPScopeDirective +// DUMP: OMPPrivateClause +// DUMP: OMPAllocateClause +// DUMP: DeclRefExpr{{.*}}'omp_allocator_handle_t' lvalue Var{{.*}}'mem' 'omp_allocator_handle_t' +// DUMP: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'a' 'int' +// DUMP: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'b' 'int' +// DUMP: OMPScopeDirective +// DUMP: OMPAllocateClause +// DUMP: DeclRefExpr{{.*}}'omp_allocator_handle_t' lvalue Var{{.*}}'mem' 'omp_allocator_handle_t' +// DUMP: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'a' 'int' +// DUMP: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'b' 'int' +// DUMP: OMPPrivateClause +// PRINT: #pragma omp scope private(mem,a,b) allocate(allocator(n), align(al): mem,a,b) +// PRINT: #pragma omp scope allocate(allocator(n), align(al): mem,a,b) private(mem,a,b) +// PRINT: #pragma omp scope private(mem,a,b) allocate(allocator(n), align(4U): mem,a,b) +// PRINT: #pragma omp scope allocate(allocator(n), align(4U): mem,a,b) private(mem,a,b) + #endif diff --git a/clang/test/OpenMP/allocate_modifiers_codegen.cpp b/clang/test/OpenMP/allocate_modifiers_codegen.cpp new file mode 100644 index 0000000000000..d798e9b3435f0 --- /dev/null +++ b/clang/test/OpenMP/allocate_modifiers_codegen.cpp @@ -0,0 +1,409 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --version 5 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 
-triple x86_64-unknown-linux-gnu -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix CHECK-TLS %s + +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +enum omp_allocator_handle_t { + omp_null_allocator = 0, + omp_default_mem_alloc = 1, + omp_large_cap_mem_alloc = 2, + omp_const_mem_alloc = 3, + omp_high_bw_mem_alloc = 4, + omp_low_lat_mem_alloc = 5, + omp_cgroup_mem_alloc = 6, + omp_pteam_mem_alloc = 7, + omp_thread_mem_alloc = 8, + KMP_ALLOCATOR_MAX_HANDLE = __UINTPTR_MAX__ +}; + +template +struct ST { + static T m; +}; + +template T foo() { + T v; + #pragma omp scope private(v) allocate(allocator(TY):v) + v = ST::m; + #pragma omp scope private(v) allocate(align(al), allocator(TY):v) + ++v; + return v; +} + +namespace ns { +int a; +} + +omp_allocator_handle_t foo(); + +int main() { + static int a; + static int temp; + int v; + #pragma omp scope private(ns::a) allocate(allocator(omp_pteam_mem_alloc):ns::a) + ns::a++; + #pragma omp scope private(a) allocate(align(8),allocator(omp_thread_mem_alloc):a) + a = 2; + #pragma omp scope private(v) allocate(align(1) : v) + ++v; + #pragma omp scope private(v) allocate(allocator(omp_default_mem_alloc) : v) + ++v; + #pragma omp scope private(v) allocate(allocator(omp_large_cap_mem_alloc), align(8) : v) + ++v; + #pragma omp scope private(v) allocate(align(4) : v) + ++v; + #pragma omp scope private(v) allocate(align(2), allocator(omp_default_mem_alloc) : v) + ++v; + #pragma omp scope private(v) allocate(align(8), allocator(foo()) : v) + ++v; + + double b = 3; + #pragma omp scope private(temp) allocate(temp) + temp += foo(); + return temp+ns::a; +} + +extern template int ST::m; + +const int b = 8; + +void bar(int a, float &z) { + #pragma omp scope private(a,z) allocate(align(b), allocator(omp_default_mem_alloc) : a,z) + a += b + z; +} +#endif +// CHECK-LABEL: define dso_local noundef i32 @main( +// CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// CHECK-NEXT: 
[[V:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B:%.*]] = alloca double, align 8 +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]]) +// CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4 +// CHECK-NEXT: [[DOTA__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 7 to ptr)) +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTA__VOID_ADDR]], align 4 +// CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CHECK-NEXT: store i32 [[INC]], ptr [[DOTA__VOID_ADDR]], align 4 +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR]], ptr inttoptr (i64 7 to ptr)) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2:[0-9]+]], i32 [[TMP0]]) +// CHECK-NEXT: [[DOTA__VOID_ADDR1:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr inttoptr (i64 8 to ptr)) +// CHECK-NEXT: store i32 2, ptr [[DOTA__VOID_ADDR1]], align 4 +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR1]], ptr inttoptr (i64 8 to ptr)) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: [[DOTV__VOID_ADDR:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 4, i64 4, ptr null) +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTV__VOID_ADDR]], align 4 +// CHECK-NEXT: [[INC2:%.*]] = add nsw i32 [[TMP2]], 1 +// CHECK-NEXT: store i32 [[INC2]], ptr [[DOTV__VOID_ADDR]], align 4 +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR]], ptr null) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: [[DOTV__VOID_ADDR3:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 1 to ptr)) +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTV__VOID_ADDR3]], align 4 +// CHECK-NEXT: [[INC4:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK-NEXT: store i32 [[INC4]], ptr [[DOTV__VOID_ADDR3]], align 4 +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR3]], ptr inttoptr (i64 1 to ptr)) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: [[DOTV__VOID_ADDR5:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr inttoptr (i64 2 to ptr)) +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTV__VOID_ADDR5]], align 4 +// CHECK-NEXT: [[INC6:%.*]] = add nsw i32 [[TMP4]], 1 +// CHECK-NEXT: store i32 [[INC6]], ptr [[DOTV__VOID_ADDR5]], align 4 +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR5]], ptr inttoptr (i64 2 to ptr)) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: [[DOTV__VOID_ADDR7:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 4, i64 4, ptr null) +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTV__VOID_ADDR7]], align 4 +// CHECK-NEXT: [[INC8:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK-NEXT: store i32 [[INC8]], ptr [[DOTV__VOID_ADDR7]], align 4 +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR7]], ptr null) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: [[DOTV__VOID_ADDR9:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 4, i64 4, ptr inttoptr (i64 1 to ptr)) +// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTV__VOID_ADDR9]], align 4 +// CHECK-NEXT: [[INC10:%.*]] = add nsw i32 [[TMP6]], 1 +// CHECK-NEXT: store i32 [[INC10]], ptr [[DOTV__VOID_ADDR9]], align 4 +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR9]], ptr inttoptr (i64 1 to ptr)) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 
[[TMP0]]) +// CHECK-NEXT: [[CALL:%.*]] = call noundef i64 @_Z3foov() +// CHECK-NEXT: [[CONV:%.*]] = inttoptr i64 [[CALL]] to ptr +// CHECK-NEXT: [[DOTV__VOID_ADDR11:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr [[CONV]]) +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTV__VOID_ADDR11]], align 4 +// CHECK-NEXT: [[INC12:%.*]] = add nsw i32 [[TMP7]], 1 +// CHECK-NEXT: store i32 [[INC12]], ptr [[DOTV__VOID_ADDR11]], align 4 +// CHECK-NEXT: [[CALL13:%.*]] = call noundef i64 @_Z3foov() +// CHECK-NEXT: [[CONV14:%.*]] = inttoptr i64 [[CALL13]] to ptr +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR11]], ptr [[CONV14]]) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: store double 3.000000e+00, ptr [[B]], align 8 +// CHECK-NEXT: [[DOTTEMP__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr null) +// CHECK-NEXT: [[CALL15:%.*]] = call noundef i32 @_Z3fooIiL22omp_allocator_handle_t6ELj8EET_v() +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTTEMP__VOID_ADDR]], align 4 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP8]], [[CALL15]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[DOTTEMP__VOID_ADDR]], align 4 +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTTEMP__VOID_ADDR]], ptr null) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr @_ZZ4mainE4temp, align 4 +// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr @_ZN2ns1aE, align 4 +// CHECK-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP9]], [[TMP10]] +// CHECK-NEXT: ret i32 [[ADD16]] +// +// +// CHECK-LABEL: define linkonce_odr noundef i32 @_Z3fooIiL22omp_allocator_handle_t6ELj8EET_v( +// CHECK-SAME: ) #[[ATTR4:[0-9]+]] comdat { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[V:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) +// CHECK-NEXT: [[DOTV__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 6 to ptr)) +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @_ZN2STIiE1mE, align 4 +// CHECK-NEXT: store i32 [[TMP1]], ptr [[DOTV__VOID_ADDR]], align 4 +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR]], ptr inttoptr (i64 6 to ptr)) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: [[DOTV__VOID_ADDR1:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr inttoptr (i64 6 to ptr)) +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTV__VOID_ADDR1]], align 4 +// CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// CHECK-NEXT: store i32 [[INC]], ptr [[DOTV__VOID_ADDR1]], align 4 +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR1]], ptr inttoptr (i64 6 to ptr)) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[V]], align 4 +// CHECK-NEXT: ret i32 [[TMP3]] +// +// +// CHECK-LABEL: define dso_local void @_Z3bariRf( +// CHECK-SAME: i32 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[Z:%.*]]) #[[ATTR4]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[Z_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[TMP:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) +// CHECK-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store ptr [[Z]], ptr [[Z_ADDR]], align 8 +// CHECK-NEXT: [[DOTA__VOID_ADDR:%.*]] = call ptr 
@__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr inttoptr (i64 1 to ptr)) +// CHECK-NEXT: [[DOTZ__VOID_ADDR:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr inttoptr (i64 1 to ptr)) +// CHECK-NEXT: store ptr [[DOTZ__VOID_ADDR]], ptr [[TMP]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[TMP1]], align 4 +// CHECK-NEXT: [[ADD:%.*]] = fadd float 8.000000e+00, [[TMP2]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTA__VOID_ADDR]], align 4 +// CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP3]] to float +// CHECK-NEXT: [[ADD1:%.*]] = fadd float [[CONV]], [[ADD]] +// CHECK-NEXT: [[CONV2:%.*]] = fptosi float [[ADD1]] to i32 +// CHECK-NEXT: store i32 [[CONV2]], ptr [[DOTA__VOID_ADDR]], align 4 +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTZ__VOID_ADDR]], ptr inttoptr (i64 1 to ptr)) +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR]], ptr inttoptr (i64 1 to ptr)) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// +// CHECK-TLS-LABEL: define dso_local noundef i32 @main( +// CHECK-TLS-SAME: ) #[[ATTR0:[0-9]+]] { +// CHECK-TLS-NEXT: [[ENTRY:.*:]] +// CHECK-TLS-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// CHECK-TLS-NEXT: [[V:%.*]] = alloca i32, align 4 +// CHECK-TLS-NEXT: [[B:%.*]] = alloca double, align 8 +// CHECK-TLS-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]]) +// CHECK-TLS-NEXT: store i32 0, ptr [[RETVAL]], align 4 +// CHECK-TLS-NEXT: [[DOTA__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 7 to ptr)) +// CHECK-TLS-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTA__VOID_ADDR]], align 4 +// CHECK-TLS-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CHECK-TLS-NEXT: store i32 [[INC]], ptr [[DOTA__VOID_ADDR]], align 4 +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR]], ptr inttoptr (i64 7 to ptr)) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2:[0-9]+]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: [[DOTA__VOID_ADDR1:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr inttoptr (i64 8 to ptr)) +// CHECK-TLS-NEXT: store i32 2, ptr [[DOTA__VOID_ADDR1]], align 4 +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR1]], ptr inttoptr (i64 8 to ptr)) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: [[DOTV__VOID_ADDR:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 4, i64 4, ptr null) +// CHECK-TLS-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTV__VOID_ADDR]], align 4 +// CHECK-TLS-NEXT: [[INC2:%.*]] = add nsw i32 [[TMP2]], 1 +// CHECK-TLS-NEXT: store i32 [[INC2]], ptr [[DOTV__VOID_ADDR]], align 4 +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR]], ptr null) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: [[DOTV__VOID_ADDR3:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 1 to ptr)) +// CHECK-TLS-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTV__VOID_ADDR3]], align 4 +// CHECK-TLS-NEXT: [[INC4:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK-TLS-NEXT: store i32 [[INC4]], ptr [[DOTV__VOID_ADDR3]], align 4 +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR3]], ptr inttoptr (i64 1 to ptr)) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: [[DOTV__VOID_ADDR5:%.*]] = call ptr 
@__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr inttoptr (i64 2 to ptr)) +// CHECK-TLS-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTV__VOID_ADDR5]], align 4 +// CHECK-TLS-NEXT: [[INC6:%.*]] = add nsw i32 [[TMP4]], 1 +// CHECK-TLS-NEXT: store i32 [[INC6]], ptr [[DOTV__VOID_ADDR5]], align 4 +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR5]], ptr inttoptr (i64 2 to ptr)) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: [[DOTV__VOID_ADDR7:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 4, i64 4, ptr null) +// CHECK-TLS-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTV__VOID_ADDR7]], align 4 +// CHECK-TLS-NEXT: [[INC8:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK-TLS-NEXT: store i32 [[INC8]], ptr [[DOTV__VOID_ADDR7]], align 4 +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR7]], ptr null) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: [[DOTV__VOID_ADDR9:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 4, i64 4, ptr inttoptr (i64 1 to ptr)) +// CHECK-TLS-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTV__VOID_ADDR9]], align 4 +// CHECK-TLS-NEXT: [[INC10:%.*]] = add nsw i32 [[TMP6]], 1 +// CHECK-TLS-NEXT: store i32 [[INC10]], ptr [[DOTV__VOID_ADDR9]], align 4 +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR9]], ptr inttoptr (i64 1 to ptr)) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: [[CALL:%.*]] = call noundef i64 @_Z3foov() +// CHECK-TLS-NEXT: [[CONV:%.*]] = inttoptr i64 [[CALL]] to ptr +// CHECK-TLS-NEXT: [[DOTV__VOID_ADDR11:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr [[CONV]]) +// CHECK-TLS-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTV__VOID_ADDR11]], align 4 +// CHECK-TLS-NEXT: [[INC12:%.*]] = add nsw i32 [[TMP7]], 1 +// CHECK-TLS-NEXT: store i32 [[INC12]], ptr [[DOTV__VOID_ADDR11]], align 4 +// CHECK-TLS-NEXT: [[CALL13:%.*]] = call noundef i64 @_Z3foov() +// CHECK-TLS-NEXT: [[CONV14:%.*]] = inttoptr i64 [[CALL13]] to ptr +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR11]], ptr [[CONV14]]) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: store double 3.000000e+00, ptr [[B]], align 8 +// CHECK-TLS-NEXT: [[DOTTEMP__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr null) +// CHECK-TLS-NEXT: [[CALL15:%.*]] = call noundef i32 @_Z3fooIiL22omp_allocator_handle_t6ELj8EET_v() +// CHECK-TLS-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTTEMP__VOID_ADDR]], align 4 +// CHECK-TLS-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP8]], [[CALL15]] +// CHECK-TLS-NEXT: store i32 [[ADD]], ptr [[DOTTEMP__VOID_ADDR]], align 4 +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTTEMP__VOID_ADDR]], ptr null) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: [[TMP9:%.*]] = load i32, ptr @_ZZ4mainE4temp, align 4 +// CHECK-TLS-NEXT: [[TMP10:%.*]] = load i32, ptr @_ZN2ns1aE, align 4 +// CHECK-TLS-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP9]], [[TMP10]] +// CHECK-TLS-NEXT: ret i32 [[ADD16]] +// +// +// CHECK-TLS-LABEL: define linkonce_odr noundef i32 @_Z3fooIiL22omp_allocator_handle_t6ELj8EET_v( +// CHECK-TLS-SAME: ) #[[ATTR4:[0-9]+]] comdat { +// CHECK-TLS-NEXT: [[ENTRY:.*:]] +// CHECK-TLS-NEXT: [[V:%.*]] = alloca i32, align 4 +// CHECK-TLS-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) +// CHECK-TLS-NEXT: 
[[DOTV__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 6 to ptr)) +// CHECK-TLS-NEXT: [[TMP1:%.*]] = load i32, ptr @_ZN2STIiE1mE, align 4 +// CHECK-TLS-NEXT: store i32 [[TMP1]], ptr [[DOTV__VOID_ADDR]], align 4 +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR]], ptr inttoptr (i64 6 to ptr)) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: [[DOTV__VOID_ADDR1:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr inttoptr (i64 6 to ptr)) +// CHECK-TLS-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTV__VOID_ADDR1]], align 4 +// CHECK-TLS-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// CHECK-TLS-NEXT: store i32 [[INC]], ptr [[DOTV__VOID_ADDR1]], align 4 +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR1]], ptr inttoptr (i64 6 to ptr)) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: [[TMP3:%.*]] = load i32, ptr [[V]], align 4 +// CHECK-TLS-NEXT: ret i32 [[TMP3]] +// +// +// CHECK-TLS-LABEL: define dso_local void @_Z3bariRf( +// CHECK-TLS-SAME: i32 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[Z:%.*]]) #[[ATTR4]] { +// CHECK-TLS-NEXT: [[ENTRY:.*:]] +// CHECK-TLS-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-TLS-NEXT: [[Z_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-TLS-NEXT: [[TMP:%.*]] = alloca ptr, align 8 +// CHECK-TLS-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) +// CHECK-TLS-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// CHECK-TLS-NEXT: store ptr [[Z]], ptr [[Z_ADDR]], align 8 +// CHECK-TLS-NEXT: [[DOTA__VOID_ADDR:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr inttoptr (i64 1 to ptr)) +// CHECK-TLS-NEXT: [[DOTZ__VOID_ADDR:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr inttoptr (i64 1 to ptr)) +// CHECK-TLS-NEXT: store ptr [[DOTZ__VOID_ADDR]], ptr [[TMP]], align 8 +// CHECK-TLS-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK-TLS-NEXT: [[TMP2:%.*]] = load float, ptr [[TMP1]], align 4 +// CHECK-TLS-NEXT: [[ADD:%.*]] = fadd float 8.000000e+00, [[TMP2]] +// CHECK-TLS-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTA__VOID_ADDR]], align 4 +// CHECK-TLS-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP3]] to float +// CHECK-TLS-NEXT: [[ADD1:%.*]] = fadd float [[CONV]], [[ADD]] +// CHECK-TLS-NEXT: [[CONV2:%.*]] = fptosi float [[ADD1]] to i32 +// CHECK-TLS-NEXT: store i32 [[CONV2]], ptr [[DOTA__VOID_ADDR]], align 4 +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTZ__VOID_ADDR]], ptr inttoptr (i64 1 to ptr)) +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR]], ptr inttoptr (i64 1 to ptr)) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: ret void +// +// +// SIMD-ONLY0-LABEL: define dso_local noundef i32 @main( +// SIMD-ONLY0-SAME: ) #[[ATTR0:[0-9]+]] { +// SIMD-ONLY0-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY0-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[V:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[A:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[A1:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[V2:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[V4:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[V6:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[V8:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[V10:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[V12:%.*]] = alloca i32, align 4 
+// SIMD-ONLY0-NEXT: [[B:%.*]] = alloca double, align 8 +// SIMD-ONLY0-NEXT: [[TEMP:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: store i32 0, ptr [[RETVAL]], align 4 +// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 +// SIMD-ONLY0-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY0-NEXT: store i32 [[INC]], ptr [[A]], align 4 +// SIMD-ONLY0-NEXT: store i32 2, ptr [[A1]], align 4 +// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load i32, ptr [[V2]], align 4 +// SIMD-ONLY0-NEXT: [[INC3:%.*]] = add nsw i32 [[TMP1]], 1 +// SIMD-ONLY0-NEXT: store i32 [[INC3]], ptr [[V2]], align 4 +// SIMD-ONLY0-NEXT: [[TMP2:%.*]] = load i32, ptr [[V4]], align 4 +// SIMD-ONLY0-NEXT: [[INC5:%.*]] = add nsw i32 [[TMP2]], 1 +// SIMD-ONLY0-NEXT: store i32 [[INC5]], ptr [[V4]], align 4 +// SIMD-ONLY0-NEXT: [[TMP3:%.*]] = load i32, ptr [[V6]], align 4 +// SIMD-ONLY0-NEXT: [[INC7:%.*]] = add nsw i32 [[TMP3]], 1 +// SIMD-ONLY0-NEXT: store i32 [[INC7]], ptr [[V6]], align 4 +// SIMD-ONLY0-NEXT: [[TMP4:%.*]] = load i32, ptr [[V8]], align 4 +// SIMD-ONLY0-NEXT: [[INC9:%.*]] = add nsw i32 [[TMP4]], 1 +// SIMD-ONLY0-NEXT: store i32 [[INC9]], ptr [[V8]], align 4 +// SIMD-ONLY0-NEXT: [[TMP5:%.*]] = load i32, ptr [[V10]], align 4 +// SIMD-ONLY0-NEXT: [[INC11:%.*]] = add nsw i32 [[TMP5]], 1 +// SIMD-ONLY0-NEXT: store i32 [[INC11]], ptr [[V10]], align 4 +// SIMD-ONLY0-NEXT: [[TMP6:%.*]] = load i32, ptr [[V12]], align 4 +// SIMD-ONLY0-NEXT: [[INC13:%.*]] = add nsw i32 [[TMP6]], 1 +// SIMD-ONLY0-NEXT: store i32 [[INC13]], ptr [[V12]], align 4 +// SIMD-ONLY0-NEXT: store double 3.000000e+00, ptr [[B]], align 8 +// SIMD-ONLY0-NEXT: [[CALL:%.*]] = call noundef i32 @_Z3fooIiL22omp_allocator_handle_t6ELj8EET_v() +// SIMD-ONLY0-NEXT: [[TMP7:%.*]] = load i32, ptr [[TEMP]], align 4 +// SIMD-ONLY0-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP7]], [[CALL]] +// SIMD-ONLY0-NEXT: store i32 [[ADD]], ptr [[TEMP]], align 4 +// SIMD-ONLY0-NEXT: [[TMP8:%.*]] = load i32, ptr @_ZZ4mainE4temp, align 4 +// SIMD-ONLY0-NEXT: [[TMP9:%.*]] = load i32, ptr @_ZN2ns1aE, align 4 +// SIMD-ONLY0-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP8]], [[TMP9]] +// SIMD-ONLY0-NEXT: ret i32 [[ADD14]] +// +// +// SIMD-ONLY0-LABEL: define linkonce_odr noundef i32 @_Z3fooIiL22omp_allocator_handle_t6ELj8EET_v( +// SIMD-ONLY0-SAME: ) #[[ATTR1:[0-9]+]] comdat { +// SIMD-ONLY0-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY0-NEXT: [[V:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[V1:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[V2:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load i32, ptr @_ZN2STIiE1mE, align 4 +// SIMD-ONLY0-NEXT: store i32 [[TMP0]], ptr [[V1]], align 4 +// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load i32, ptr [[V2]], align 4 +// SIMD-ONLY0-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// SIMD-ONLY0-NEXT: store i32 [[INC]], ptr [[V2]], align 4 +// SIMD-ONLY0-NEXT: [[TMP2:%.*]] = load i32, ptr [[V]], align 4 +// SIMD-ONLY0-NEXT: ret i32 [[TMP2]] +// +// +// SIMD-ONLY0-LABEL: define dso_local void @_Z3bariRf( +// SIMD-ONLY0-SAME: i32 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[Z:%.*]]) #[[ATTR1]] { +// SIMD-ONLY0-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY0-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[Z_ADDR:%.*]] = alloca ptr, align 8 +// SIMD-ONLY0-NEXT: [[A1:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[Z2:%.*]] = alloca float, align 4 +// SIMD-ONLY0-NEXT: [[TMP:%.*]] = alloca ptr, align 8 +// SIMD-ONLY0-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// SIMD-ONLY0-NEXT: store ptr [[Z]], ptr [[Z_ADDR]], align 8 +// 
SIMD-ONLY0-NEXT: store ptr [[Z2]], ptr [[TMP]], align 8 +// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 +// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load float, ptr [[TMP0]], align 4 +// SIMD-ONLY0-NEXT: [[ADD:%.*]] = fadd float 8.000000e+00, [[TMP1]] +// SIMD-ONLY0-NEXT: [[TMP2:%.*]] = load i32, ptr [[A1]], align 4 +// SIMD-ONLY0-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP2]] to float +// SIMD-ONLY0-NEXT: [[ADD3:%.*]] = fadd float [[CONV]], [[ADD]] +// SIMD-ONLY0-NEXT: [[CONV4:%.*]] = fptosi float [[ADD3]] to i32 +// SIMD-ONLY0-NEXT: store i32 [[CONV4]], ptr [[A1]], align 4 +// SIMD-ONLY0-NEXT: ret void +// diff --git a/clang/test/OpenMP/allocate_modifiers_messages.cpp b/clang/test/OpenMP/allocate_modifiers_messages.cpp new file mode 100644 index 0000000000000..6867e78a89ee9 --- /dev/null +++ b/clang/test/OpenMP/allocate_modifiers_messages.cpp @@ -0,0 +1,159 @@ +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 %s + +typedef enum omp_allocator_handle_t { + omp_null_allocator = 0, + omp_default_mem_alloc = 1, + omp_large_cap_mem_alloc = 2, + omp_const_mem_alloc = 3, + omp_high_bw_mem_alloc = 4, + omp_low_lat_mem_alloc = 5, + omp_cgroup_mem_alloc = 6, + omp_pteam_mem_alloc = 7, + omp_thread_mem_alloc = 8, +} omp_allocator_handle_t; + +int myAlloc() { + return 100; +} + +int main() { + int a, b, c; + // expected-error@+4 {{expected '('}} + // expected-error@+3 {{expected expression}} + // expected-error@+2 {{expected ')'}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(c) allocate(allocator + // expected-error@+6 {{expected expression}} + // expected-error@+5 {{expected ')'}} + // expected-note@+4 {{to match this '('}} + // expected-error@+3 {{expected expression}} + // expected-error@+2 {{expected ')'}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(c) allocate(allocator( + // expected-error@+4 {{expected expression}} + // expected-error@+3 {{expected expression}} + // expected-error@+2 {{expected ')'}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(c) allocate(allocator() + // expected-error@+2 {{expected expression}} + // expected-error@+1 {{expected expression}} + #pragma omp scope private(c) allocate(allocator()) + // expected-error@+6 {{expected ')'}} + // expected-note@+5 {{to match this '('}} + // expected-error@+4 {{missing ':' after allocate clause modifier}} + // expected-error@+3 {{expected expression}} + // expected-error@+2 {{expected ')'}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(c) allocate(allocator(omp_default_mem_alloc + // expected-error@+6 {{missing ':' after allocate clause modifier}} + // expected-error@+5 {{expected expression}} + // expected-error@+4 {{expected ')'}} + // expected-note@+3 {{to match this '('}} + // expected-error@+2 {{expected ')'}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(c) allocate(allocator(omp_large_cap_mem_alloc: + // expected-error@+4 {{missing ':' after allocate clause modifier}} + // expected-error@+3 {{expected expression}} + // expected-error@+2 {{expected ')'}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(c) allocate(allocator(omp_const_mem_alloc) + // expected-error@+2 {{missing ':' after allocate clause modifier}} + // expected-error@+1 {{expected expression}} + #pragma omp scope private(c) allocate(allocator(omp_high_bw_mem_alloc)) + // expected-error@+1 {{expected expression}} + #pragma omp scope private(c) allocate(allocator(omp_low_lat_mem_alloc):) + // 
expected-error@+6 {{expected ')'}} + // expected-note@+5 {{to match this '('}} + // expected-error@+4 {{missing ':' after allocate clause modifier}} + // expected-error@+3 {{expected expression}} + // expected-error@+2 {{expected ')'}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(c) allocate(allocator(omp_cgroup_mem_alloc:) + // expected-error@+4 {{expected ')'}} + // expected-note@+3 {{to match this '('}} + // expected-error@+2 {{missing ':' after allocate clause modifier}} + // expected-error@+1 {{expected expression}} + #pragma omp scope private(c) allocate(allocator(omp_pteam_mem_alloc:)) + // expected-error@+4 {{expected ')'}} + // expected-note@+3 {{to match this '('}} + // expected-error@+2 {{missing ':' after allocate clause modifier}} + // expected-error@+1 {{expected expression}} + #pragma omp scope private(c) allocate(allocator(omp_thread_mem_alloc:c)) + // expected-error@+1 {{expected variable name}} + #pragma omp scope private(c) allocate(allocator(omp_const_mem_alloc):1) + // expected-error@+1 {{expected variable name}} + #pragma omp scope private(c) allocate(allocator(omp_const_mem_alloc):-10) + // expected-error@+4 {{expected ',' or ')' in 'allocate' clause}} + // expected-error@+3 {{expected ')'}} + // expected-warning@+2 {{extra tokens at the end of '#pragma omp scope' are ignored}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(a,b,c) allocate(allocator(omp_const_mem_alloc):c:b;a) + // expected-error@+1 {{initializing 'const omp_allocator_handle_t' with an expression of incompatible type 'int'}} + #pragma omp scope private(c,a,b) allocate(allocator(myAlloc()):a,b,c) + // expected-error@+2 {{missing ':' after allocate clause modifier}} + // expected-error@+1 {{expected expression}} + #pragma omp scope private(c) allocate(allocator(omp_default_mem_alloc);c) + // expected-error@+2 {{duplicate modifier 'allocator' in 'allocate' clause}} + // expected-warning@+1 {{aligned clause will be ignored because the requested alignment is not a power of 2}} + #pragma omp scope private(a) allocate(allocator(omp_default_mem_alloc), allocator(omp_default_mem_alloc), align(3) : a) + // expected-error@+4 {{expected '('}} + // expected-error@+3 {{expected expression}} + // expected-error@+2 {{expected ')'}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(a) allocate(allocator + // expected-error@+4 {{expected '('}} + // expected-error@+3 {{expected expression}} + // expected-error@+2 {{expected ')'}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(b) allocate(align + // expected-error@+1 {{duplicate modifier 'align' in 'allocate' clause}} + #pragma omp scope private(a) allocate(align(8), align(4) : a) + // expected-error@+5 {{use of undeclared identifier 'align'}} + // expected-error@+4 {{expected ',' or ')' in 'allocate' clause}} + // expected-error@+3 {{expected ')'}} + // expected-note@+2 {{to match this '('}} + // expected-error@+1 {{expected variable name}} + #pragma omp scope private(a) allocate(omp_default_mem_alloc, align(8) : a) + // expected-error@+3 {{expected modifier in 'allocate' clause}} + // expected-error@+2 {{missing ':' after allocate clause modifier}} + // expected-error@+1 {{expected expression}} + #pragma omp scope private(a) allocate(align(8), omp_default_mem_alloc : a) + // expected-error@+5 {{expected ',' or ')' in 'allocate' clause}} + // expected-error@+4 {{expected ')'}} + // expected-note@+3 {{to match this '('}} + // expected-error@+2 {{expected variable name}} + 
// expected-error@+1 {{expected variable name}}
+  #pragma omp scope private(a) allocate(omp_default_mem_alloc, omp_default_mem_alloc : a)
+  // expected-error@+2 {{use of undeclared identifier 'undefinedVar'}}
+  // expected-error@+1 {{expected expression}}
+  #pragma omp scope private(a) allocate(undefinedVar : a)
+  // expected-error@+1 {{expected expression}}
+  #pragma omp scope private(a) allocate(align(8), allocator(omp_default_mem_alloc) : )
+  // expected-error@+2 {{missing ':' after allocate clause modifier}}
+  // expected-error@+1 {{expected expression}}
+  #pragma omp scope private(a) allocate(align(8), allocator(omp_default_mem_alloc) )
+  // expected-error@+3 {{expected expression}}
+  // expected-error@+2 {{expected ')'}}
+  // expected-note@+1 {{to match this '('}}
+  #pragma omp scope private(a) allocate(align(8), allocator(omp_default_mem_alloc) :
+
+  // expected-error@+4 {{missing ':' after allocate clause modifier}}
+  // expected-error@+3 {{expected expression}}
+  // expected-error@+2 {{expected ')'}}
+  // expected-note@+1 {{to match this '('}}
+  #pragma omp scope private(a) allocate(align(8), allocator(omp_default_mem_alloc)
+  // expected-error@+4 {{expected '('}}
+  // expected-error@+3 {{expected '('}}
+  // expected-error@+2 {{expected expression}}
+  // expected-error@+1 {{use of undeclared identifier 'allocator'}}
+  #pragma omp scope private(a) allocate(align, allocator : )
+  // expected-error@+7 {{expected expression}}
+  // expected-error@+6 {{expected expression}}
+  // expected-error@+5 {{expected expression}}
+  // expected-error@+4 {{use of undeclared identifier 'allocator'}}
+  // expected-error@+3 {{expected ',' or ')' in 'allocate' clause}}
+  // expected-error@+2 {{expected ')'}}
+  // expected-note@+1 {{to match this '('}}
+  #pragma omp scope private(a) allocate(align(), allocator() : )
+  ++a;
+}

From d98ced1a9d641539d5bbb287bd16378ba3f5dba9 Mon Sep 17 00:00:00 2001
From: Victor Campos
Date: Mon, 13 Jan 2025 13:51:52 +0000
Subject: [PATCH 273/408] [Multilib] Custom flags YAML parsing (#110657)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch is the first step to extend the current multilib system to
support the selection of library variants which do not correspond to
existing command-line options.

Proposal can be found in
https://discourse.llvm.org/t/rfc-multilib-custom-flags/81058

The multilib mechanism supports libraries that target code generation
or language options such as `--target`, `-mcpu`, `-mfpu`,
`-mbranch-protection`. However, some library variants are particular
to features that do not correspond to any command-line options.
Examples include variants for multithreading and semihosting.

This work introduces a way to instruct the multilib system to consider
these features in library selection.

This particular patch comprises a new section in `multilib.yaml` to
declare flags for which no option exists. Henceforth this sort of flag
will be called `custom flag` for clarity.

The `multilib.yaml` file will have a new section called Flags which
contains the declarations of the target’s custom flags:

```yaml
Flags:
- Name: multithreaded
  Values:
  - Name: no-multithreaded
    MacroDefines: [__SINGLE_THREAD__]
  - Name: multithreaded
  Default: no-multithreaded
- Name: io
  Values:
  - Name: io-none
  - Name: io-semihosting
    MacroDefines: [SEMIHOSTING]
  - Name: io-linux-syscalls
    MacroDefines: [LINUX_SYSCALLS, HOSTED=1]
  Default: io-none
```

- Name: the name to categorize a flag.
- Values: a list of possible values.
- Default: it specifies which value this flag should take if not specified in the command-line invocation. It must be one value from the Values field. Each flag Value follows this description: - Name (required): the name of the custom flag value (string). This is the string to be used in `-fmultilib-flag=`. - MacroDefines (optional): a list of strings to be used as macro definitions. Each string is fed into the driver as ``-D``. A Default value is useful to save users from specifying custom flags that have a most commonly used value. The namespace of flag values is common across all flags. This means that flag values must be unique. --- clang/include/clang/Driver/Multilib.h | 28 +++- clang/lib/Driver/Multilib.cpp | 73 ++++++++-- ...remetal-multilib-custom-flags-parsing.yaml | 133 ++++++++++++++++++ 3 files changed, 223 insertions(+), 11 deletions(-) create mode 100644 clang/test/Driver/baremetal-multilib-custom-flags-parsing.yaml diff --git a/clang/include/clang/Driver/Multilib.h b/clang/include/clang/Driver/Multilib.h index dbed70f4f9008..1dab45c062aee 100644 --- a/clang/include/clang/Driver/Multilib.h +++ b/clang/include/clang/Driver/Multilib.h @@ -101,6 +101,25 @@ class Multilib { raw_ostream &operator<<(raw_ostream &OS, const Multilib &M); +namespace custom_flag { +struct Declaration; +using DeclarationPtr = std::shared_ptr; + +struct ValueDetail { + std::string Name; + std::optional> MacroDefines; + DeclarationPtr Decl; +}; + +struct Declaration { + std::string Name; + SmallVector ValueList; + std::optional DefaultValueIdx; +}; + +static constexpr StringRef Prefix = "-fmultilib-flag="; +} // namespace custom_flag + /// See also MultilibSetBuilder for combining multilibs into a set. class MultilibSet { public: @@ -120,15 +139,18 @@ class MultilibSet { private: multilib_list Multilibs; - std::vector FlagMatchers; + SmallVector FlagMatchers; + SmallVector CustomFlagDecls; IncludeDirsFunc IncludeCallback; IncludeDirsFunc FilePathsCallback; public: MultilibSet() = default; MultilibSet(multilib_list &&Multilibs, - std::vector &&FlagMatchers = {}) - : Multilibs(Multilibs), FlagMatchers(FlagMatchers) {} + SmallVector &&FlagMatchers = {}, + SmallVector &&CustomFlagDecls = {}) + : Multilibs(std::move(Multilibs)), FlagMatchers(std::move(FlagMatchers)), + CustomFlagDecls(std::move(CustomFlagDecls)) {} const multilib_list &getMultilibs() { return Multilibs; } diff --git a/clang/lib/Driver/Multilib.cpp b/clang/lib/Driver/Multilib.cpp index 0207e0f2eb2de..b4b5dbd1bdb5e 100644 --- a/clang/lib/Driver/Multilib.cpp +++ b/clang/lib/Driver/Multilib.cpp @@ -10,6 +10,7 @@ #include "clang/Basic/LLVM.h" #include "clang/Driver/Driver.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" @@ -201,13 +202,20 @@ struct MultilibGroupSerialization { struct MultilibSetSerialization { llvm::VersionTuple MultilibVersion; - std::vector Groups; - std::vector Multilibs; - std::vector FlagMatchers; + SmallVector Groups; + SmallVector Multilibs; + SmallVector FlagMatchers; + SmallVector CustomFlagDeclarations; }; } // end anonymous namespace +LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibSerialization) +LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibGroupSerialization) +LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibSet::FlagMatcher) +LLVM_YAML_IS_SEQUENCE_VECTOR(custom_flag::ValueDetail) +LLVM_YAML_IS_SEQUENCE_VECTOR(custom_flag::DeclarationPtr) + template <> struct llvm::yaml::MappingTraits { static void mapping(llvm::yaml::IO &io, 
MultilibSerialization &V) { io.mapOptional("Dir", V.Dir); @@ -255,11 +263,63 @@ template <> struct llvm::yaml::MappingTraits { } }; +template <> +struct llvm::yaml::MappingContextTraits> { + static void mapping(llvm::yaml::IO &io, custom_flag::ValueDetail &V, + llvm::SmallSet &) { + io.mapRequired("Name", V.Name); + io.mapOptional("MacroDefines", V.MacroDefines); + } + static std::string validate(IO &io, custom_flag::ValueDetail &V, + llvm::SmallSet &NameSet) { + if (V.Name.empty()) + return "custom flag value requires a name"; + if (!NameSet.insert(V.Name).second) + return "duplicate custom flag value name: \"" + V.Name + "\""; + return {}; + } +}; + +template <> +struct llvm::yaml::MappingContextTraits> { + static void mapping(llvm::yaml::IO &io, custom_flag::DeclarationPtr &V, + llvm::SmallSet &NameSet) { + assert(!V); + V = std::make_shared(); + io.mapRequired("Name", V->Name); + io.mapRequired("Values", V->ValueList, NameSet); + std::string DefaultValueName; + io.mapRequired("Default", DefaultValueName); + + for (auto [Idx, Value] : llvm::enumerate(V->ValueList)) { + Value.Decl = V; + if (Value.Name == DefaultValueName) { + assert(!V->DefaultValueIdx); + V->DefaultValueIdx = Idx; + } + } + } + static std::string validate(IO &io, custom_flag::DeclarationPtr &V, + llvm::SmallSet &) { + if (V->Name.empty()) + return "custom flag requires a name"; + if (V->ValueList.empty()) + return "custom flag must have at least one value"; + if (!V->DefaultValueIdx) + return "custom flag must have a default value"; + return {}; + } +}; + template <> struct llvm::yaml::MappingTraits { static void mapping(llvm::yaml::IO &io, MultilibSetSerialization &M) { io.mapRequired("MultilibVersion", M.MultilibVersion); io.mapRequired("Variants", M.Multilibs); io.mapOptional("Groups", M.Groups); + llvm::SmallSet NameSet; + io.mapOptionalWithContext("Flags", M.CustomFlagDeclarations, NameSet); io.mapOptional("Mappings", M.FlagMatchers); } static std::string validate(IO &io, MultilibSetSerialization &M) { @@ -288,10 +348,6 @@ template <> struct llvm::yaml::MappingTraits { } }; -LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibSerialization) -LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibGroupSerialization) -LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibSet::FlagMatcher) - llvm::ErrorOr MultilibSet::parseYaml(llvm::MemoryBufferRef Input, llvm::SourceMgr::DiagHandlerTy DiagHandler, @@ -319,7 +375,8 @@ MultilibSet::parseYaml(llvm::MemoryBufferRef Input, } } - return MultilibSet(std::move(Multilibs), std::move(MS.FlagMatchers)); + return MultilibSet(std::move(Multilibs), std::move(MS.FlagMatchers), + std::move(MS.CustomFlagDeclarations)); } LLVM_DUMP_METHOD void MultilibSet::dump() const { diff --git a/clang/test/Driver/baremetal-multilib-custom-flags-parsing.yaml b/clang/test/Driver/baremetal-multilib-custom-flags-parsing.yaml new file mode 100644 index 0000000000000..fe6a9a8d7f1ee --- /dev/null +++ b/clang/test/Driver/baremetal-multilib-custom-flags-parsing.yaml @@ -0,0 +1,133 @@ +# RUN: split-file %s %t + +# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/multilib-without-macro-defines.yaml %s -### -o /dev/null 2>&1 \ +# RUN: | FileCheck %s +# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/multilib-with-macro-defines.yaml %s -### -o /dev/null 2>&1 \ +# RUN: | FileCheck %s +# CHECK-NOT: error: + +# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/missing-flag-name.yaml %s -### -o /dev/null 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-MISSING-FLAG-NAME +# CHECK-MISSING-FLAG-NAME: error: custom flag requires a name + 
+# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/missing-flag-values.yaml %s -### -o /dev/null 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-MISSING-FLAG-VALUES +# CHECK-MISSING-FLAG-VALUES: error: custom flag must have at least one value + +# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/missing-flag-value-default.yaml %s -### -o /dev/null 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-MISSING-FLAG-VALUE-DEFAULT +# CHECK-MISSING-FLAG-VALUE-DEFAULT: error: custom flag must have a default value + +# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/missing-flag-value-name.yaml %s -### -o /dev/null 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-MISSING-FLAG-VALUE-NAME +# CHECK-MISSING-FLAG-VALUE-NAME: error: custom flag value requires a name + +# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/duplicate-flag-value-name.yaml %s -### -o /dev/null 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-DUPLICATE-FLAG-VALUE-NAME +# CHECK-DUPLICATE-FLAG-VALUE-NAME: error: duplicate custom flag value name: "value-name" +# CHECK-DUPLICATE-FLAG-VALUE-NAME-NEXT: - Name: value-name + +#--- multilib-without-macro-defines.yaml +--- +MultilibVersion: 1.0 + +Variants: +- Dir: libc + Flags: [-fmultilib-flag=a] + +Flags: + - Name: flag + Values: + - Name: a + - Name: b + Default: a + +#--- multilib-with-macro-defines.yaml +--- +MultilibVersion: 1.0 + +Variants: +- Dir: libc + Flags: [-fmultilib-flag=a] + +Flags: + - Name: flag + Values: + - Name: a + MacroDefines: [FEATURE_A] + - Name: b + MacroDefines: [FEATURE_B] + Default: a + +#--- missing-flag-name.yaml +--- +MultilibVersion: 1.0 + +Variants: +- Dir: libc + Flags: [-fmultilib-flag=a] + +Flags: + - Values: + - Name: a + Default: a + +#--- missing-flag-values.yaml +--- +MultilibVersion: 1.0 + +Variants: +- Dir: libc + Flags: [-fmultilib-flag=a] + +Flags: + - Name: flag + Values: + Default: a + +#--- missing-flag-value-default.yaml +--- +MultilibVersion: 1.0 + +Variants: +- Dir: libc + Flags: [-fmultilib-flag=a] + +Flags: + - Name: flag + Values: + - Name: a + Default: + +#--- missing-flag-value-name.yaml +--- +MultilibVersion: 1.0 + +Variants: +- Dir: libc + Flags: [-fmultilib-flag=a] + +Flags: + - Name: flag + Values: + - Name: + Default: a + +#--- duplicate-flag-value-name.yaml +--- +MultilibVersion: 1.0 + +Variants: +- Dir: libc + Flags: [-fmultilib-flag=value-name] + +Flags: + - Name: a + Values: + - Name: value-name + - Name: value-a + Default: value-name + - Name: b + Values: + - Name: value-name + Default: value-name From 2a551ab3002897ba52a27961b766f3741695c816 Mon Sep 17 00:00:00 2001 From: Victor Campos Date: Mon, 13 Jan 2025 13:53:53 +0000 Subject: [PATCH 274/408] [Multilib] Add -fmultilib-flag command-line option (#110658) This patch is the second step to extend the current multilib system to support the selection of library variants which do not correspond to existing command-line options. Proposal can be found in https://discourse.llvm.org/t/rfc-multilib-custom-flags/81058 The multilib mechanism supports libraries that target code generation or language options such as --target, -mcpu, -mfpu, -mbranch-protection. However, some library variants are particular to features that do not correspond to any command-line options. Examples include variants for multithreading and semihosting. This work introduces a way to instruct the multilib system to consider these features in library selection. The driver must be informed about the multilib custom flags with a new command-line option. 
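For instance, a single build might select the multithreaded, semihosting
library variant like this (a hypothetical invocation sketched for
illustration: the flag values `multithreaded` and `io-semihosting` come
from the grammar below and from the `multilib.yaml` example in the
previous patch, while the target triple and source file are
placeholders):

```
# illustrative only: flag values must match those declared in multilib.yaml
clang --target=arm-none-eabi -fmultilib-flag=multithreaded -fmultilib-flag=io-semihosting -c hello.c
```

The option takes the following general form: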
``` -fmultilib-flag=C ``` Where the grammar for C is: ``` C -> option option -> multithreaded | no-multithreaded | io-none | io-semihosting | io-linux-syscalls | ... ``` There must be one option instance for each flag specified: ``` -fmultilib-flag=multithreaded -fmultilib-flag=io-semihosting ``` Contradictory options are untied by *last one wins*. These options are to be used exclusively by the multilib mechanism in the Clang driver. Hence they are not forwarded to the compiler frontend. --- clang/include/clang/Driver/Options.td | 2 ++ clang/lib/Driver/ToolChain.cpp | 12 ++++++++++++ clang/test/Driver/print-multi-selection-flags.c | 7 +++++++ 3 files changed, 21 insertions(+) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 80360216c9503..bbf5c0e7e7fd1 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -5756,6 +5756,8 @@ def print_multi_directory : Flag<["-", "--"], "print-multi-directory">; def print_multi_lib : Flag<["-", "--"], "print-multi-lib">; def print_multi_flags : Flag<["-", "--"], "print-multi-flags-experimental">, HelpText<"Print the flags used for selecting multilibs (experimental)">; +def fmultilib_flag : Joined<["-", "--"], "fmultilib-flag=">, + Visibility<[ClangOption]>; def print_multi_os_directory : Flag<["-", "--"], "print-multi-os-directory">, Flags<[Unsupported]>; def print_target_triple : Flag<["-", "--"], "print-target-triple">, diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 2b4df64f2789d..acf9d264d631b 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -196,6 +196,15 @@ bool ToolChain::defaultToIEEELongDouble() const { return PPC_LINUX_DEFAULT_IEEELONGDOUBLE && getTriple().isOSLinux(); } +static void processMultilibCustomFlags(Multilib::flags_list &List, + const llvm::opt::ArgList &Args) { + for (const Arg *MultilibFlagArg : + Args.filtered(options::OPT_fmultilib_flag)) { + List.push_back(MultilibFlagArg->getAsString(Args)); + MultilibFlagArg->claim(); + } +} + static void getAArch64MultilibFlags(const Driver &D, const llvm::Triple &Triple, const llvm::opt::ArgList &Args, @@ -246,6 +255,8 @@ static void getAArch64MultilibFlags(const Driver &D, if (ABIArg) { Result.push_back(ABIArg->getAsString(Args)); } + + processMultilibCustomFlags(Result, Args); } static void getARMMultilibFlags(const Driver &D, @@ -313,6 +324,7 @@ static void getARMMultilibFlags(const Driver &D, if (Endian->getOption().matches(options::OPT_mbig_endian)) Result.push_back(Endian->getAsString(Args)); } + processMultilibCustomFlags(Result, Args); } static void getRISCVMultilibFlags(const Driver &D, const llvm::Triple &Triple, diff --git a/clang/test/Driver/print-multi-selection-flags.c b/clang/test/Driver/print-multi-selection-flags.c index 5bf6dca5096a7..cf9522aa06852 100644 --- a/clang/test/Driver/print-multi-selection-flags.c +++ b/clang/test/Driver/print-multi-selection-flags.c @@ -90,3 +90,10 @@ // CHECK-RV32E-ORDER: --target=riscv32-unknown-none-elf // CHECK-RV32E-ORDER: -mabi=ilp32e // CHECK-RV32E-ORDER: -march=rv32e{{[0-9]+p[0-9]+}}_c{{[0-9]+p[0-9]+}}_zicsr{{[0-9]+p[0-9]+}} + +// RUN: %clang -print-multi-flags-experimental --target=armv8m.main-none-eabi -fmultilib-flag=foo -fmultilib-flag=bar | FileCheck --check-prefixes=CHECK-MULTILIB-CUSTOM-FLAG,CHECK-ARM-MULTILIB-CUSTOM-FLAG %s +// RUN: %clang -print-multi-flags-experimental --target=aarch64-none-eabi -fmultilib-flag=foo -fmultilib-flag=bar | FileCheck 
--check-prefixes=CHECK-MULTILIB-CUSTOM-FLAG,CHECK-AARCH64-MULTILIB-CUSTOM-FLAG %s +// CHECK-ARM-MULTILIB-CUSTOM-FLAG: --target=thumbv8m.main-unknown-none-eabi +// CHECK-AARCH64-MULTILIB-CUSTOM-FLAG: --target=aarch64-unknown-none-eabi +// CHECK-MULTILIB-CUSTOM-FLAG-DAG: -fmultilib-flag=foo +// CHECK-MULTILIB-CUSTOM-FLAG-DAG: -fmultilib-flag=bar From 162397f98d04415eebe115cdcb01558932e5c802 Mon Sep 17 00:00:00 2001 From: Hui Date: Mon, 13 Jan 2025 14:09:29 +0000 Subject: [PATCH 275/408] [libc++] Replace stable_sort with sort in flat_map (#121431) Fixes #120788 --- libcxx/include/__flat_map/flat_map.h | 8 +-- libcxx/include/module.modulemap | 2 + .../flat.map/container_stability.pass.cpp | 68 ------------------- .../iter_iter_stability.pass.cpp | 66 ------------------ .../insert_range_stability.pass.cpp | 63 ----------------- 5 files changed, 5 insertions(+), 202 deletions(-) delete mode 100644 libcxx/test/libcxx/containers/containers.adaptors/flat.map/container_stability.pass.cpp delete mode 100644 libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/iter_iter_stability.pass.cpp delete mode 100644 libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_range_stability.pass.cpp diff --git a/libcxx/include/__flat_map/flat_map.h b/libcxx/include/__flat_map/flat_map.h index 9fe84250b1204..ab53b7a285ca4 100644 --- a/libcxx/include/__flat_map/flat_map.h +++ b/libcxx/include/__flat_map/flat_map.h @@ -17,7 +17,7 @@ #include <__algorithm/ranges_inplace_merge.h> #include <__algorithm/ranges_lower_bound.h> #include <__algorithm/ranges_partition_point.h> -#include <__algorithm/ranges_stable_sort.h> +#include <__algorithm/ranges_sort.h> #include <__algorithm/ranges_unique.h> #include <__algorithm/ranges_upper_bound.h> #include <__algorithm/remove_if.h> @@ -853,9 +853,7 @@ class flat_map { // is no invariant state to preserve _LIBCPP_HIDE_FROM_ABI void __sort_and_unique() { auto __zv = ranges::views::zip(__containers_.keys, __containers_.values); - // To be consistent with std::map's behaviour, we use stable_sort instead of sort. - // As a result, if there are duplicated keys, the first value in the original order will be taken. 
- ranges::stable_sort(__zv, __compare_, [](const auto& __p) -> decltype(auto) { return std::get<0>(__p); }); + ranges::sort(__zv, __compare_, [](const auto& __p) -> decltype(auto) { return std::get<0>(__p); }); auto __dup_start = ranges::unique(__zv, __key_equiv(__compare_)).begin(); auto __dist = ranges::distance(__zv.begin(), __dup_start); __containers_.keys.erase(__containers_.keys.begin() + __dist, __containers_.keys.end()); @@ -886,7 +884,7 @@ class flat_map { return __compare_(std::get<0>(__p1), std::get<0>(__p2)); }; if constexpr (!_WasSorted) { - ranges::stable_sort(__zv.begin() + __append_start_offset, __end, __compare_key); + ranges::sort(__zv.begin() + __append_start_offset, __end, __compare_key); } else { _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT( __is_sorted_and_unique(__containers_.keys | ranges::views::drop(__append_start_offset)), diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index 07ab5649ae45c..6800f8b562650 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -760,6 +760,8 @@ module std [system] { module ranges_sort { header "__algorithm/ranges_sort.h" export std.functional.ranges_operations + export std.algorithm.sort + export std.algorithm.make_projected } module ranges_stable_partition { header "__algorithm/ranges_stable_partition.h" diff --git a/libcxx/test/libcxx/containers/containers.adaptors/flat.map/container_stability.pass.cpp b/libcxx/test/libcxx/containers/containers.adaptors/flat.map/container_stability.pass.cpp deleted file mode 100644 index 0d90c3250061f..0000000000000 --- a/libcxx/test/libcxx/containers/containers.adaptors/flat.map/container_stability.pass.cpp +++ /dev/null @@ -1,68 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 - -// - -// flat_map(key_container_type key_cont, mapped_container_type mapped_cont); -// -// libc++ uses stable_sort to ensure that flat_map's behavior matches map's, -// in terms of which duplicate items are kept. -// This tests a conforming extension. 
- -#include -#include -#include -#include -#include -#include -#include - -#include "test_macros.h" - -struct Mod256 { - bool operator()(int x, int y) const { return (x % 256) < (y % 256); } -}; - -int main(int, char**) { - std::mt19937 randomness; - std::vector values; - std::vector> pairs; - for (int i = 0; i < 200; ++i) { - uint16_t r = randomness(); - values.push_back(r); - pairs.emplace_back(r, r); - } - - { - std::map m(pairs.begin(), pairs.end()); - std::flat_map fm(values, values); - assert(fm.size() == m.size()); - LIBCPP_ASSERT(std::ranges::equal(fm, m)); - } - { - std::map m(pairs.begin(), pairs.end()); - std::flat_map fm(values, values, Mod256()); - assert(fm.size() == m.size()); - LIBCPP_ASSERT(std::ranges::equal(fm, m)); - } - { - std::map m(pairs.begin(), pairs.end()); - std::flat_map fm(values, values, std::allocator()); - assert(fm.size() == m.size()); - LIBCPP_ASSERT(std::ranges::equal(fm, m)); - } - { - std::map m(pairs.begin(), pairs.end()); - std::flat_map fm(values, values, Mod256(), std::allocator()); - assert(fm.size() == m.size()); - LIBCPP_ASSERT(std::ranges::equal(fm, m)); - } - return 0; -} diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/iter_iter_stability.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/iter_iter_stability.pass.cpp deleted file mode 100644 index 14189840ce660..0000000000000 --- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/iter_iter_stability.pass.cpp +++ /dev/null @@ -1,66 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 - -// - -// template -// flat_map(InputIterator first, InputIterator last, const key_compare& comp = key_compare()) -// -// libc++ uses stable_sort to ensure that flat_map's behavior matches map's, -// in terms of which duplicate items are kept. -// This tests a conforming extension. 
- -#include -#include -#include -#include -#include -#include -#include - -#include "test_macros.h" - -struct Mod256 { - bool operator()(int x, int y) const { return (x % 256) < (y % 256); } -}; - -int main(int, char**) { - std::mt19937 randomness; - std::pair pairs[200]; - for (auto& pair : pairs) { - pair = {uint16_t(randomness()), uint16_t(randomness())}; - } - - { - std::map m(pairs, pairs + 200); - std::flat_map fm(pairs, pairs + 200); - assert(fm.size() == m.size()); - LIBCPP_ASSERT(std::ranges::equal(fm, m)); - } - { - std::map m(pairs, pairs + 200, std::allocator()); - std::flat_map fm(pairs, pairs + 200, std::allocator()); - assert(fm.size() == m.size()); - LIBCPP_ASSERT(std::ranges::equal(fm, m)); - } - { - std::map m(pairs, pairs + 200, Mod256()); - std::flat_map fm(pairs, pairs + 200, Mod256()); - assert(fm.size() == m.size()); - LIBCPP_ASSERT(std::ranges::equal(fm, m)); - } - { - std::map m(pairs, pairs + 200, Mod256(), std::allocator()); - std::flat_map fm(pairs, pairs + 200, Mod256(), std::allocator()); - assert(fm.size() == m.size()); - LIBCPP_ASSERT(std::ranges::equal(fm, m)); - } - return 0; -} diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_range_stability.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_range_stability.pass.cpp deleted file mode 100644 index fabcb1d216a78..0000000000000 --- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_range_stability.pass.cpp +++ /dev/null @@ -1,63 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 - -// - -// template R> -// void insert_range(R&& rg); -// -// libc++ uses stable_sort to ensure that flat_map's behavior matches map's, -// in terms of which duplicate items are kept. -// This tests a conforming extension. - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "test_macros.h" - -struct Mod256 { - bool operator()(int x, int y) const { return (x % 256) < (y % 256); } -}; - -int main(int, char**) { - { - std::mt19937 randomness; - std::pair pairs[400]; - for (int i = 0; i < 400; ++i) { - uint16_t r = randomness(); - pairs[i] = {r, r}; - } - - std::map m(pairs, pairs + 200); - std::flat_map fm(std::sorted_unique, m.begin(), m.end()); - assert(std::ranges::equal(fm, m)); - - fm.insert_range(std::views::counted(pairs + 200, 200)); - m.insert(pairs + 200, pairs + 400); - assert(fm.size() == m.size()); - LIBCPP_ASSERT(std::ranges::equal(fm, m)); - } - - { - std::vector> v{{1, 2}, {1, 3}}; - std::flat_map m; - m.insert_range(v); - assert(m.size() == 1); - LIBCPP_ASSERT(m[1] == 2); - } - return 0; -} From cedb44af53f195135e8e8de98f161048d19f8857 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Mon, 13 Jan 2025 09:10:36 -0500 Subject: [PATCH 276/408] [libc++] Pass type information down to __libcpp_allocate (#118837) Currently, places where we call __libcpp_allocate must drop type information on the ground even when they actually have such information available. 
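To make "dropping type information on the ground" concrete, here is a minimal, self-contained sketch in the spirit of this change. The names `element_count`, `typed_allocate` and `typed_deallocate` are illustrative stand-ins, not the real `std::__element_count` / `std::__libcpp_allocate` declarations (those are in the diffs below):
```
#include <cstddef>
#include <new>

// Strongly-typed element count, mirroring the __element_count idea: the
// integer counts elements, not bytes.
enum class element_count : std::size_t {};

// Element-typed allocation: T is still visible at the point where memory is
// obtained, so a toolchain that understands typed allocation could act on it.
// The old interface took only (size-in-bytes, alignment) and had erased T
// before this point.
template <class T>
T* typed_allocate(element_count n) {
  return static_cast<T*>(::operator new(static_cast<std::size_t>(n) * sizeof(T)));
}

template <class T>
void typed_deallocate(T* p, element_count n) {
  // Sized deallocation (C++14) keeps the size information as well.
  ::operator delete(p, static_cast<std::size_t>(n) * sizeof(T));
}

int main() {
  int* p = typed_allocate<int>(element_count(4));
  typed_deallocate(p, element_count(4));
}
```
In the byte-oriented interface, the element type never reaches the allocator at all.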
That is unfortunate since some toolchains and system allocators are able to provide improved security when they know what type is being allocated. This is the purpose of http://wg21.link/p2719, where we introduce a new variant of `operator new` which takes a type in its interface. A different but related issue is that `std::allocator` does not honor any in-class `T::operator new` since it is specified to call the global `::operator new` instead. This patch closes the gap to make it trivial for implementations that provide typed memory allocators to actually benefit from that information in more contexts, and also makes libc++ forward-compatible with future proposals that would fix the existing defects in `std::allocator`. It also makes the internal allocation API higher level by operating on objects instead of operating on bytes of memory. Since this is a widely-used function and making this a template could have an impact on debug info sizes, I tried minimizing the number of templated layers by removing `__do_deallocate_handle_size`, which was easy to replace with a macro (and IMO this leads to cleaner code). --- libcxx/include/CMakeLists.txt | 2 +- libcxx/include/__functional/function.h | 19 ++++-- libcxx/include/__memory/allocator.h | 4 +- .../include/__memory/builtin_new_allocator.h | 67 ------------------- .../__memory/unique_temporary_buffer.h | 2 +- libcxx/include/__new/allocate.h | 44 +++++++----- .../include/__string/constexpr_c_functions.h | 5 +- libcxx/include/__utility/element_count.h | 27 ++++++++ libcxx/include/__utility/small_buffer.h | 4 +- libcxx/include/module.modulemap | 19 ++++-- libcxx/src/memory_resource.cpp | 10 +-- .../support.dynamic/libcpp_deallocate.sh.cpp | 37 +++++----- 12 files changed, 112 insertions(+), 128 deletions(-) delete mode 100644 libcxx/include/__memory/builtin_new_allocator.h create mode 100644 libcxx/include/__utility/element_count.h diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index f7721b1047b81..e152383a329fe 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -547,7 +547,6 @@ set(files __memory/array_cookie.h __memory/assume_aligned.h __memory/auto_ptr.h - __memory/builtin_new_allocator.h __memory/compressed_pair.h __memory/concepts.h __memory/construct_at.h @@ -880,6 +879,7 @@ set(files __utility/cmp.h __utility/convert_to_integral.h __utility/declval.h + __utility/element_count.h __utility/empty.h __utility/exception_guard.h __utility/exchange.h diff --git a/libcxx/include/__functional/function.h b/libcxx/include/__functional/function.h index 2924f6cad6578..08cb731be9725 100644 --- a/libcxx/include/__functional/function.h +++ b/libcxx/include/__functional/function.h @@ -22,7 +22,6 @@ #include <__memory/allocator.h> #include <__memory/allocator_destructor.h> #include <__memory/allocator_traits.h> -#include <__memory/builtin_new_allocator.h> #include <__memory/compressed_pair.h> #include <__memory/unique_ptr.h> #include <__type_traits/aligned_storage.h> @@ -193,6 +192,13 @@ class __alloc_func<_Fp, _Ap, _Rp(_ArgTypes...)> { } }; +template +struct __deallocating_deleter { + _LIBCPP_HIDE_FROM_ABI void operator()(void* __p) const { + std::__libcpp_deallocate<_Tp>(static_cast<_Tp*>(__p), __element_count(1)); + } +}; + template class __default_alloc_func<_Fp, _Rp(_ArgTypes...)> { _Fp __f_; @@ -212,8 +218,9 @@ class __default_alloc_func<_Fp, _Rp(_ArgTypes...)> { } _LIBCPP_HIDE_FROM_ABI __default_alloc_func* __clone() const { - __builtin_new_allocator::__holder_t __hold = 
__builtin_new_allocator::__allocate_type<__default_alloc_func>(1); - __default_alloc_func* __res = ::new ((void*)__hold.get()) __default_alloc_func(__f_); + using _Self = __default_alloc_func; + unique_ptr<_Self, __deallocating_deleter<_Self>> __hold(std::__libcpp_allocate<_Self>(__element_count(1))); + _Self* __res = ::new ((void*)__hold.get()) _Self(__f_); (void)__hold.release(); return __res; } @@ -222,7 +229,7 @@ class __default_alloc_func<_Fp, _Rp(_ArgTypes...)> { _LIBCPP_HIDE_FROM_ABI static void __destroy_and_delete(__default_alloc_func* __f) { __f->destroy(); - __builtin_new_allocator::__deallocate_type<__default_alloc_func>(__f, 1); + std::__libcpp_deallocate<__default_alloc_func>(__f, __element_count(1)); } }; @@ -668,8 +675,8 @@ class __policy_func<_Rp(_ArgTypes...)> { if (__use_small_storage<_Fun>()) { ::new ((void*)&__buf_.__small) _Fun(std::move(__f)); } else { - __builtin_new_allocator::__holder_t __hold = __builtin_new_allocator::__allocate_type<_Fun>(1); - __buf_.__large = ::new ((void*)__hold.get()) _Fun(std::move(__f)); + unique_ptr<_Fun, __deallocating_deleter<_Fun>> __hold(std::__libcpp_allocate<_Fun>(__element_count(1))); + __buf_.__large = ::new ((void*)__hold.get()) _Fun(std::move(__f)); (void)__hold.release(); } } diff --git a/libcxx/include/__memory/allocator.h b/libcxx/include/__memory/allocator.h index a7066885a978a..191a59e6614a0 100644 --- a/libcxx/include/__memory/allocator.h +++ b/libcxx/include/__memory/allocator.h @@ -102,7 +102,7 @@ class _LIBCPP_TEMPLATE_VIS allocator : private __non_trivial_if::v if (__libcpp_is_constant_evaluated()) { return static_cast<_Tp*>(::operator new(__n * sizeof(_Tp))); } else { - return static_cast<_Tp*>(std::__libcpp_allocate(__n * sizeof(_Tp), _LIBCPP_ALIGNOF(_Tp))); + return std::__libcpp_allocate<_Tp>(__element_count(__n)); } } @@ -117,7 +117,7 @@ class _LIBCPP_TEMPLATE_VIS allocator : private __non_trivial_if::v if (__libcpp_is_constant_evaluated()) { ::operator delete(__p); } else { - std::__libcpp_deallocate((void*)__p, __n * sizeof(_Tp), _LIBCPP_ALIGNOF(_Tp)); + std::__libcpp_deallocate<_Tp>(__p, __element_count(__n)); } } diff --git a/libcxx/include/__memory/builtin_new_allocator.h b/libcxx/include/__memory/builtin_new_allocator.h deleted file mode 100644 index cde1a6025a9a7..0000000000000 --- a/libcxx/include/__memory/builtin_new_allocator.h +++ /dev/null @@ -1,67 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCPP___MEMORY_BUILTIN_NEW_ALLOCATOR_H -#define _LIBCPP___MEMORY_BUILTIN_NEW_ALLOCATOR_H - -#include <__config> -#include <__cstddef/size_t.h> -#include <__memory/unique_ptr.h> -#include <__new/allocate.h> - -#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -#endif - -_LIBCPP_BEGIN_NAMESPACE_STD - -// __builtin_new_allocator -- A non-templated helper for allocating and -// deallocating memory using __builtin_operator_new and -// __builtin_operator_delete. It should be used in preference to -// `std::allocator` to avoid additional instantiations. 
-struct __builtin_new_allocator { - struct __builtin_new_deleter { - typedef void* pointer_type; - - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR explicit __builtin_new_deleter(size_t __size, size_t __align) - : __size_(__size), __align_(__align) {} - - _LIBCPP_HIDE_FROM_ABI void operator()(void* __p) const _NOEXCEPT { - std::__libcpp_deallocate(__p, __size_, __align_); - } - - private: - size_t __size_; - size_t __align_; - }; - - typedef unique_ptr __holder_t; - - _LIBCPP_HIDE_FROM_ABI static __holder_t __allocate_bytes(size_t __s, size_t __align) { - return __holder_t(std::__libcpp_allocate(__s, __align), __builtin_new_deleter(__s, __align)); - } - - _LIBCPP_HIDE_FROM_ABI static void __deallocate_bytes(void* __p, size_t __s, size_t __align) _NOEXCEPT { - std::__libcpp_deallocate(__p, __s, __align); - } - - template - _LIBCPP_NODEBUG _LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI static __holder_t __allocate_type(size_t __n) { - return __allocate_bytes(__n * sizeof(_Tp), _LIBCPP_ALIGNOF(_Tp)); - } - - template - _LIBCPP_NODEBUG _LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI static void - __deallocate_type(void* __p, size_t __n) _NOEXCEPT { - __deallocate_bytes(__p, __n * sizeof(_Tp), _LIBCPP_ALIGNOF(_Tp)); - } -}; - -_LIBCPP_END_NAMESPACE_STD - -#endif // _LIBCPP___MEMORY_BUILTIN_NEW_ALLOCATOR_H diff --git a/libcxx/include/__memory/unique_temporary_buffer.h b/libcxx/include/__memory/unique_temporary_buffer.h index dea7fa8e18728..32a3f0f081c00 100644 --- a/libcxx/include/__memory/unique_temporary_buffer.h +++ b/libcxx/include/__memory/unique_temporary_buffer.h @@ -40,7 +40,7 @@ struct __temporary_buffer_deleter { return; } - std::__libcpp_deallocate_unsized((void*)__ptr, _LIBCPP_ALIGNOF(_Tp)); + std::__libcpp_deallocate_unsized<_Tp>(__ptr); } }; diff --git a/libcxx/include/__new/allocate.h b/libcxx/include/__new/allocate.h index 71dffc1776eff..a64663c09fa35 100644 --- a/libcxx/include/__new/allocate.h +++ b/libcxx/include/__new/allocate.h @@ -14,6 +14,8 @@ #include <__cstddef/size_t.h> #include <__new/align_val_t.h> #include <__new/global_new_delete.h> // for _LIBCPP_HAS_SIZED_DEALLOCATION +#include <__type_traits/type_identity.h> +#include <__utility/element_count.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -47,52 +49,58 @@ _LIBCPP_HIDE_FROM_ABI void __libcpp_operator_delete(_Args... __args) _NOEXCEPT { #endif } -inline _LIBCPP_HIDE_FROM_ABI void* __libcpp_allocate(size_t __size, size_t __align) { +template +inline _LIBCPP_HIDE_FROM_ABI _Tp* __libcpp_allocate(__element_count __n, size_t __align = _LIBCPP_ALIGNOF(_Tp)) { + size_t __size = static_cast(__n) * sizeof(_Tp); #if _LIBCPP_HAS_ALIGNED_ALLOCATION if (__is_overaligned_for_new(__align)) { const align_val_t __align_val = static_cast(__align); - return __libcpp_operator_new(__size, __align_val); + return static_cast<_Tp*>(std::__libcpp_operator_new(__size, __align_val)); } #endif (void)__align; - return __libcpp_operator_new(__size); + return static_cast<_Tp*>(std::__libcpp_operator_new(__size)); } -template -_LIBCPP_HIDE_FROM_ABI void __do_deallocate_handle_size(void* __ptr, size_t __size, _Args... __args) _NOEXCEPT { -#if !_LIBCPP_HAS_SIZED_DEALLOCATION - (void)__size; - return std::__libcpp_operator_delete(__ptr, __args...); +#if _LIBCPP_HAS_SIZED_DEALLOCATION +# define _LIBCPP_ONLY_IF_SIZED_DEALLOCATION(...) __VA_ARGS__ #else - return std::__libcpp_operator_delete(__ptr, __size, __args...); +# define _LIBCPP_ONLY_IF_SIZED_DEALLOCATION(...) 
/* nothing */ #endif -} -inline _LIBCPP_HIDE_FROM_ABI void __libcpp_deallocate(void* __ptr, size_t __size, size_t __align) _NOEXCEPT { +template +inline _LIBCPP_HIDE_FROM_ABI void __libcpp_deallocate( + __type_identity_t<_Tp>* __ptr, __element_count __n, size_t __align = _LIBCPP_ALIGNOF(_Tp)) _NOEXCEPT { + size_t __size = static_cast(__n) * sizeof(_Tp); + (void)__size; #if !_LIBCPP_HAS_ALIGNED_ALLOCATION (void)__align; - return __do_deallocate_handle_size(__ptr, __size); + return std::__libcpp_operator_delete(__ptr _LIBCPP_ONLY_IF_SIZED_DEALLOCATION(, __size)); #else if (__is_overaligned_for_new(__align)) { const align_val_t __align_val = static_cast(__align); - return __do_deallocate_handle_size(__ptr, __size, __align_val); + return std::__libcpp_operator_delete(__ptr _LIBCPP_ONLY_IF_SIZED_DEALLOCATION(, __size), __align_val); } else { - return __do_deallocate_handle_size(__ptr, __size); + return std::__libcpp_operator_delete(__ptr _LIBCPP_ONLY_IF_SIZED_DEALLOCATION(, __size)); } #endif } -inline _LIBCPP_HIDE_FROM_ABI void __libcpp_deallocate_unsized(void* __ptr, size_t __align) _NOEXCEPT { +#undef _LIBCPP_ONLY_IF_SIZED_DEALLOCATION + +template +inline _LIBCPP_HIDE_FROM_ABI void +__libcpp_deallocate_unsized(__type_identity_t<_Tp>* __ptr, size_t __align = _LIBCPP_ALIGNOF(_Tp)) _NOEXCEPT { #if !_LIBCPP_HAS_ALIGNED_ALLOCATION (void)__align; - return __libcpp_operator_delete(__ptr); + return std::__libcpp_operator_delete(__ptr); #else if (__is_overaligned_for_new(__align)) { const align_val_t __align_val = static_cast(__align); - return __libcpp_operator_delete(__ptr, __align_val); + return std::__libcpp_operator_delete(__ptr, __align_val); } else { - return __libcpp_operator_delete(__ptr); + return std::__libcpp_operator_delete(__ptr); } #endif } diff --git a/libcxx/include/__string/constexpr_c_functions.h b/libcxx/include/__string/constexpr_c_functions.h index f50eac34a1c05..0bc128b68b579 100644 --- a/libcxx/include/__string/constexpr_c_functions.h +++ b/libcxx/include/__string/constexpr_c_functions.h @@ -25,6 +25,7 @@ #include <__type_traits/is_trivially_copyable.h> #include <__type_traits/is_trivially_lexicographically_comparable.h> #include <__type_traits/remove_cv.h> +#include <__utility/element_count.h> #include <__utility/is_pointer_in_range.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -33,10 +34,6 @@ _LIBCPP_BEGIN_NAMESPACE_STD -// Type used to encode that a function takes an integer that represents a number -// of elements as opposed to a number of bytes. -enum class __element_count : size_t {}; - template inline const bool __is_char_type = false; diff --git a/libcxx/include/__utility/element_count.h b/libcxx/include/__utility/element_count.h new file mode 100644 index 0000000000000..82b05a7bde483 --- /dev/null +++ b/libcxx/include/__utility/element_count.h @@ -0,0 +1,27 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___UTILITY_ELEMENT_COUNT_H +#define _LIBCPP___UTILITY_ELEMENT_COUNT_H + +#include <__config> +#include <__cstddef/size_t.h> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_BEGIN_NAMESPACE_STD + +// Type used to encode that a function takes an integer that represents a number +// of elements as opposed to a number of bytes. +enum class __element_count : size_t {}; + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP___UTILITY_ELEMENT_COUNT_H diff --git a/libcxx/include/__utility/small_buffer.h b/libcxx/include/__utility/small_buffer.h index ff6e7e76f14f5..132a57f0fefab 100644 --- a/libcxx/include/__utility/small_buffer.h +++ b/libcxx/include/__utility/small_buffer.h @@ -68,7 +68,7 @@ class __small_buffer { if constexpr (__fits_in_buffer<_Stored>) { return std::launder(reinterpret_cast<_Stored*>(__buffer_)); } else { - byte* __allocation = static_cast(std::__libcpp_allocate(sizeof(_Stored), alignof(_Stored))); + byte* __allocation = reinterpret_cast(std::__libcpp_allocate<_Stored>(__element_count(1))); std::construct_at(reinterpret_cast(__buffer_), __allocation); return std::launder(reinterpret_cast<_Stored*>(__allocation)); } @@ -77,7 +77,7 @@ class __small_buffer { template _LIBCPP_HIDE_FROM_ABI void __dealloc() noexcept { if constexpr (!__fits_in_buffer<_Stored>) - std::__libcpp_deallocate(*reinterpret_cast(__buffer_), sizeof(_Stored), alignof(_Stored)); + std::__libcpp_deallocate<_Stored>(__get<_Stored>(), __element_count(1)); } template diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index 6800f8b562650..e3204820b5c25 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -1527,14 +1527,16 @@ module std [system] { module aligned_alloc { header "__memory/aligned_alloc.h" } module allocate_at_least { header "__memory/allocate_at_least.h" } module allocation_guard { header "__memory/allocation_guard.h" } - module allocator { header "__memory/allocator.h" } + module allocator { + header "__memory/allocator.h" + export * // TODO: Workaround for https://github.com/llvm/llvm-project/issues/120108 + } module allocator_arg_t { header "__memory/allocator_arg_t.h" } module allocator_destructor { header "__memory/allocator_destructor.h" } module allocator_traits { header "__memory/allocator_traits.h" } module array_cookie { header "__memory/array_cookie.h" } module assume_aligned { header "__memory/assume_aligned.h" } module auto_ptr { header "__memory/auto_ptr.h" } - module builtin_new_allocator { header "__memory/builtin_new_allocator.h" } module compressed_pair { header "__memory/compressed_pair.h" } module concepts { header "__memory/concepts.h" } module construct_at { header "__memory/construct_at.h" } @@ -1569,6 +1571,7 @@ module std [system] { header "__memory/unique_temporary_buffer.h" export std.memory.unique_ptr export std_core.type_traits.is_constant_evaluated + export * // TODO: Workaround for https://github.com/llvm/llvm-project/issues/120108 } module uses_allocator { header "__memory/uses_allocator.h" } module uses_allocator_construction { header "__memory/uses_allocator_construction.h" } @@ -1604,7 +1607,11 @@ module std [system] { module new { header "new" module align_val_t { header "__new/align_val_t.h" } - module allocate { header "__new/allocate.h" } + module allocate { + header "__new/allocate.h" + export 
std.utility.element_count // used as part of the API
+      export * // TODO: Workaround for https://github.com/llvm/llvm-project/issues/120108
+    }
     module destroying_delete_t { header "__new/destroying_delete_t.h" }
     module exceptions { header "__new/exceptions.h" }
     module global_new_delete {
@@ -1911,7 +1918,10 @@ module std [system] {
 
   module string {
     module char_traits { header "__string/char_traits.h" }
-    module constexpr_c_functions { header "__string/constexpr_c_functions.h" }
+    module constexpr_c_functions {
+      header "__string/constexpr_c_functions.h"
+      export std.utility.element_count // used as part of the constexpr C function's API
+    }
     module extern_template_lists { header "__string/extern_template_lists.h" }
     module fwd { header "__fwd/string.h" }
@@ -2021,6 +2031,7 @@ module std [system] {
     }
     module cmp { header "__utility/cmp.h" }
     module convert_to_integral { header "__utility/convert_to_integral.h" }
+    module element_count { header "__utility/element_count.h" }
     module exception_guard { header "__utility/exception_guard.h" }
     module exchange { header "__utility/exchange.h" }
     module forward_like { header "__utility/forward_like.h" }
diff --git a/libcxx/src/memory_resource.cpp b/libcxx/src/memory_resource.cpp
index e182e5aa66ef9..e1a9e1a8fac49 100644
--- a/libcxx/src/memory_resource.cpp
+++ b/libcxx/src/memory_resource.cpp
@@ -41,20 +41,22 @@ static bool is_aligned_to(void* ptr, size_t align) {
 class _LIBCPP_EXPORTED_FROM_ABI __new_delete_memory_resource_imp : public memory_resource {
   void* do_allocate(size_t bytes, size_t align) override {
 #if _LIBCPP_HAS_ALIGNED_ALLOCATION
-    return std::__libcpp_allocate(bytes, align);
+    return std::__libcpp_allocate<std::byte>(__element_count(bytes), align);
 #else
     if (bytes == 0)
      bytes = 1;
-    void* result = std::__libcpp_allocate(bytes, align);
+    std::byte* result = std::__libcpp_allocate<std::byte>(__element_count(bytes), align);
     if (!is_aligned_to(result, align)) {
-      std::__libcpp_deallocate(result, bytes, align);
+      std::__libcpp_deallocate<std::byte>(result, __element_count(bytes), align);
       __throw_bad_alloc();
     }
     return result;
 #endif
   }
 
-  void do_deallocate(void* p, size_t bytes, size_t align) override { std::__libcpp_deallocate(p, bytes, align); }
+  void do_deallocate(void* p, size_t bytes, size_t align) override {
+    std::__libcpp_deallocate<std::byte>(static_cast<std::byte*>(p), __element_count(bytes), align);
+  }
 
   bool do_is_equal(const memory_resource& other) const noexcept override { return &other == this; }
 };
diff --git a/libcxx/test/libcxx/language.support/support.dynamic/libcpp_deallocate.sh.cpp b/libcxx/test/libcxx/language.support/support.dynamic/libcpp_deallocate.sh.cpp
index b283c8aa06f0c..7ead65caf9fda 100644
--- a/libcxx/test/libcxx/language.support/support.dynamic/libcpp_deallocate.sh.cpp
+++ b/libcxx/test/libcxx/language.support/support.dynamic/libcpp_deallocate.sh.cpp
@@ -96,34 +96,34 @@ struct alloc_stats {
 };
 alloc_stats stats;
 
-void operator delete(void* p)TEST_NOEXCEPT {
+void operator delete(void* p) TEST_NOEXCEPT {
   ::free(p);
   stats.plain_called++;
   stats.last_size = stats.last_align = -1;
 }
 
 #ifndef NO_SIZE
-void operator delete(void* p, std::size_t n)TEST_NOEXCEPT {
+void operator delete(void* p, std::size_t n) TEST_NOEXCEPT {
   ::free(p);
   stats.sized_called++;
-  stats.last_size = n;
+  stats.last_size  = n;
   stats.last_align = -1;
 }
 #endif
 
 #ifndef NO_ALIGN
-void operator delete(void* p, std::align_val_t a)TEST_NOEXCEPT {
+void operator delete(void* p, std::align_val_t a) TEST_NOEXCEPT {
   std::__libcpp_aligned_free(p);
   stats.aligned_called++;
   stats.last_align = static_cast<int>(a);
-  stats.last_size = -1;
+  stats.last_size  = -1;
 }
 
-void operator delete(void* p, std::size_t n, std::align_val_t a)TEST_NOEXCEPT {
+void operator delete(void* p, std::size_t n, std::align_val_t a) TEST_NOEXCEPT {
   std::__libcpp_aligned_free(p);
   stats.aligned_sized_called++;
   stats.last_align = static_cast<int>(a);
-  stats.last_size = n;
+  stats.last_size  = n;
 }
 #endif
@@ -135,45 +135,45 @@ void test_libcpp_dealloc() {
   std::size_t over_align_val = TEST_ALIGNOF(std::max_align_t) * 2;
 #endif
   std::size_t under_align_val = TEST_ALIGNOF(int);
-  std::size_t with_size_val = 2;
+  std::size_t with_size_val  = 2;
 
   {
-    std::__libcpp_deallocate_unsized(p, under_align_val);
+    std::__libcpp_deallocate_unsized<char>(static_cast<char*>(p), under_align_val);
     assert(stats.expect_plain());
   }
   stats.reset();
 
 #if defined(NO_SIZE) && defined(NO_ALIGN)
   {
-    std::__libcpp_deallocate(p, with_size_val, over_align_val);
+    std::__libcpp_deallocate<char>(static_cast<char*>(p), std::__element_count(with_size_val), over_align_val);
     assert(stats.expect_plain());
   }
   stats.reset();
 #elif defined(NO_SIZE)
   {
-    std::__libcpp_deallocate(p, with_size_val, over_align_val);
+    std::__libcpp_deallocate<char>(static_cast<char*>(p), std::__element_count(with_size_val), over_align_val);
     assert(stats.expect_align(over_align_val));
   }
   stats.reset();
 #elif defined(NO_ALIGN)
   {
-    std::__libcpp_deallocate(p, with_size_val, over_align_val);
+    std::__libcpp_deallocate<char>(static_cast<char*>(p), std::__element_count(with_size_val), over_align_val);
     assert(stats.expect_size(with_size_val));
   }
   stats.reset();
 #else
   {
-    std::__libcpp_deallocate(p, with_size_val, over_align_val);
+    std::__libcpp_deallocate<char>(static_cast<char*>(p), std::__element_count(with_size_val), over_align_val);
     assert(stats.expect_size_align(with_size_val, over_align_val));
   }
   stats.reset();
   {
-    std::__libcpp_deallocate_unsized(p, over_align_val);
+    std::__libcpp_deallocate_unsized<char>(static_cast<char*>(p), over_align_val);
     assert(stats.expect_align(over_align_val));
   }
   stats.reset();
   {
-    std::__libcpp_deallocate(p, with_size_val, under_align_val);
+    std::__libcpp_deallocate<char>(static_cast<char*>(p), std::__element_count(with_size_val), under_align_val);
     assert(stats.expect_size(with_size_val));
   }
   stats.reset();
@@ -202,13 +202,13 @@ void test_allocator_and_new_match() {
   stats.reset();
 #elif defined(NO_SIZE)
   stats.reset();
-#if TEST_STD_VER >= 11
+#  if TEST_STD_VER >= 11
   {
     int* x = DoNotOptimize(new int(42));
     delete x;
     assert(stats.expect_plain());
   }
-#endif
+#  endif
   stats.reset();
   {
     AlignedType* a = DoNotOptimize(new AlignedType());
@@ -241,8 +241,7 @@ void test_allocator_and_new_match() {
   {
     AlignedType* a = DoNotOptimize(new AlignedType());
     delete a;
-    assert(stats.expect_size_align(sizeof(AlignedType),
-                                   TEST_ALIGNOF(AlignedType)));
+    assert(stats.expect_size_align(sizeof(AlignedType), TEST_ALIGNOF(AlignedType)));
   }
   stats.reset();
 #endif

From d1a622db1b1a93ac61f7f281605c4606b391f24a Mon Sep 17 00:00:00 2001
From: Louis Dionne
Date: Mon, 13 Jan 2025 09:10:52 -0500
Subject: [PATCH 277/408] [libc++][NFC] Use uint32_t instead of __uint32_t on
 Apple (#122356)

We had a 15-year-old occurrence of __uint32_t, likely from a time when
uint32_t was not available everywhere.
---
 libcxx/include/__locale | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/include/__locale b/libcxx/include/__locale
index 01c3a2e3456ba..e10eb62fb844b 100644
--- a/libcxx/include/__locale
+++ b/libcxx/include/__locale
@@ -348,7 +348,7 @@ public:
 #  define _LIBCPP_CTYPE_MASK_IS_COMPOSITE_ALPHA
 #elif defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__)
 #  ifdef __APPLE__
-  typedef __uint32_t mask;
+  typedef uint32_t mask;
 #  elif defined(__FreeBSD__)
   typedef unsigned long mask;
 #  elif defined(__NetBSD__)

From 019a902ac644c59dfa8e6091f96846b516444ca5 Mon Sep 17 00:00:00 2001
From: "A. Jiang"
Date: Mon, 13 Jan 2025 22:12:25 +0800
Subject: [PATCH 278/408] [libc++] Deprecate extension
 `packaged_task::result_type` (#122600)

This extension is questionable and non-conforming. This patch deprecates it
now, with removal planned for LLVM 21 (see the release notes below).

Towards #112856.
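For context, a minimal sketch of the extension being deprecated. The `Task` alias and the lambda are illustrative, not taken from the patch; on libc++ the marked line now triggers the warning that the new verify test below checks for:
```
#include <future>

using Task = std::packaged_task<int(char)>;

// Non-conforming libc++ extension, now deprecated: the standard does not
// give packaged_task a result_type member.
using R = Task::result_type; // warning: 'result_type' is deprecated

int main() {
  Task t([](char c) { return c + 1; });
  std::future<int> f = t.get_future(); // portable: the result type is just
                                       // the R in packaged_task<R(Args...)>
  t('a');
  return f.get() == 'b' ? 0 : 1;
}
```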
---
 libcxx/docs/ReleaseNotes/20.rst               |  4 +++
 libcxx/include/future                         | 20 ++++++------
 .../futures/futures.task/type.depr.verify.cpp | 28 +++++++++++++++++++
 .../futures/futures.task/types.pass.cpp       | 10 +++----
 4 files changed, 47 insertions(+), 15 deletions(-)
 create mode 100644 libcxx/test/libcxx/thread/futures/futures.task/type.depr.verify.cpp

diff --git a/libcxx/docs/ReleaseNotes/20.rst b/libcxx/docs/ReleaseNotes/20.rst
index 228c3f3432c29..15940948655d7 100644
--- a/libcxx/docs/ReleaseNotes/20.rst
+++ b/libcxx/docs/ReleaseNotes/20.rst
@@ -146,6 +146,8 @@ Deprecations and Removals
   ``__undeclare_reachable`` have been removed from the library. These functions
   were never implemented in a non-trivial way, making it very unlikely that any binary depends on them.
 
+- Non-conforming extension ``packaged_task::result_type`` is deprecated. It will be removed in LLVM 21.
+
 Upcoming Deprecations and Removals
 ----------------------------------
 
@@ -164,6 +166,8 @@ LLVM 21
 - The ``_LIBCPP_VERBOSE_ABORT_NOT_NOEXCEPT`` macro will be removed in LLVM 21, making
   ``std::__libcpp_verbose_abort`` unconditionally ``noexcept``.
 
+- Non-conforming extension ``packaged_task::result_type`` will be removed in LLVM 21.
+
 ABI Affecting Changes
 ---------------------
 
diff --git a/libcxx/include/future b/libcxx/include/future
index d777ed8d6016f..72f3ed5ca5d27 100644
--- a/libcxx/include/future
+++ b/libcxx/include/future
@@ -1612,11 +1612,11 @@ inline _Rp __packaged_task_function<_Rp(_ArgTypes...)>::operator()(_ArgTypes...
 template <class _Rp, class... _ArgTypes>
 class _LIBCPP_TEMPLATE_VIS packaged_task<_Rp(_ArgTypes...)> {
 public:
-  typedef _Rp result_type; // extension
+  using result_type _LIBCPP_DEPRECATED = _Rp; // extension
 
 private:
-  __packaged_task_function<result_type(_ArgTypes...)> __f_;
-  promise<result_type> __p_;
+  __packaged_task_function<_Rp(_ArgTypes...)> __f_;
+  promise<_Rp> __p_;
 
 public:
   // construction and destruction
@@ -1653,7 +1653,7 @@ public:
   _LIBCPP_HIDE_FROM_ABI bool valid() const _NOEXCEPT { return __p_.__state_ != nullptr; }
 
   // result retrieval
-  _LIBCPP_HIDE_FROM_ABI future<result_type> get_future() { return __p_.get_future(); }
+  _LIBCPP_HIDE_FROM_ABI future<_Rp> get_future() { return __p_.get_future(); }
 
   // execution
   _LIBCPP_HIDE_FROM_ABI void operator()(_ArgTypes... __args);
@@ -1700,17 +1700,17 @@ template <class _Rp, class... _ArgTypes>
 void packaged_task<_Rp(_ArgTypes...)>::reset() {
   if (!valid())
     __throw_future_error(future_errc::no_state);
-  __p_ = promise<result_type>();
+  __p_ = promise<_Rp>();
 }
 
 template <class... _ArgTypes>
 class _LIBCPP_TEMPLATE_VIS packaged_task<void(_ArgTypes...)> {
 public:
-  typedef void result_type; // extension
+  using result_type _LIBCPP_DEPRECATED = void; // extension
 
 private:
-  __packaged_task_function<result_type(_ArgTypes...)> __f_;
-  promise<result_type> __p_;
+  __packaged_task_function<void(_ArgTypes...)> __f_;
+  promise<void> __p_;
 
 public:
   // construction and destruction
@@ -1745,7 +1745,7 @@ public:
   _LIBCPP_HIDE_FROM_ABI bool valid() const _NOEXCEPT { return __p_.__state_ != nullptr; }
 
   // result retrieval
-  _LIBCPP_HIDE_FROM_ABI future<result_type> get_future() { return __p_.get_future(); }
+  _LIBCPP_HIDE_FROM_ABI future<void> get_future() { return __p_.get_future(); }
 
   // execution
   _LIBCPP_HIDE_FROM_ABI void operator()(_ArgTypes... __args);
@@ -1804,7 +1804,7 @@ template <class... _ArgTypes>
 void packaged_task<void(_ArgTypes...)>::reset() {
   if (!valid())
     __throw_future_error(future_errc::no_state);
-  __p_ = promise<result_type>();
+  __p_ = promise<void>();
 }
 
 template <class _Rp, class... _ArgTypes>
diff --git a/libcxx/test/libcxx/thread/futures/futures.task/type.depr.verify.cpp b/libcxx/test/libcxx/thread/futures/futures.task/type.depr.verify.cpp
new file mode 100644
index 0000000000000..4065637e9eb2a
--- /dev/null
+++ b/libcxx/test/libcxx/thread/futures/futures.task/type.depr.verify.cpp
@@ -0,0 +1,28 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: c++03
+
+// <future>
+
+// template <class R, class... ArgTypes>
+// class packaged_task<R(ArgTypes...)>
+// {
+// public:
+//     typedef R result_type; // extension
+
+// This libc++ extension is deprecated. See https://github.com/llvm/llvm-project/issues/112856.
+
+#include <future>
+#include <type_traits>
+
+struct A {};
+
+using RA = std::packaged_task<A(int, char)>::result_type;    // expected-warning {{'result_type' is deprecated}}
+using RV = std::packaged_task<void(int, char)>::result_type; // expected-warning {{'result_type' is deprecated}}
diff --git a/libcxx/test/libcxx/thread/futures/futures.task/types.pass.cpp b/libcxx/test/libcxx/thread/futures/futures.task/types.pass.cpp
index 1f17d74513471..659232caa46ec 100644
--- a/libcxx/test/libcxx/thread/futures/futures.task/types.pass.cpp
+++ b/libcxx/test/libcxx/thread/futures/futures.task/types.pass.cpp
@@ -19,16 +19,16 @@
 
 // This is a libc++ extension.
 
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+
 #include <future>
 #include <type_traits>
 
-#include "test_macros.h"
-
 struct A {};
 
-int main(int, char**)
-{
-    static_assert((std::is_same<std::packaged_task<A(int, char)>::result_type, A>::value), "");
+int main(int, char**) {
+  static_assert((std::is_same<std::packaged_task<A(int, char)>::result_type, A>::value), "");
+  static_assert((std::is_same<std::packaged_task<void(int, char)>::result_type, void>::value), "");
 
   return 0;
 }

From 7457f51f6cf61b960e3e6e45e63378debd5c1d5c Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Mon, 13 Jan 2025 14:13:55 +0000
Subject: [PATCH 279/408] [X86] Fold VPERMV3(X,M,Y) ->
 VPERMV(CONCAT(X,Y),WIDEN(M)) iff the CONCAT is free (#122485)

This extends the existing fold which concatenates X and Y if they are
sequential subvectors extracted from the same source.

By using combineConcatVectorOps we can recognise other patterns where X and Y
can be concatenated for free (e.g.
sequential loads, concatenating repeated instructions etc.), which allows the VPERMV3 fold to be a lot more aggressive. This required combineConcatVectorOps to be extended to fold the additional case of "concat(extract_subvector(x,lo), extract_subvector(x,hi)) -> extract_subvector(x)", similar to the original VPERMV3 fold where "x" was larger than the concat result type. This also exposes more cases where we have repeated vector/subvector loads if they have multiple uses - e.g. where we're loading a ymm and the lo/hi xmm pairs independently - in the past we've always considered this to be relatively benign, but I'm not certain if we should now do more to keep these from splitting? --- llvm/lib/Target/X86/X86ISelLowering.cpp | 51 +- ...d_vector_inreg_of_broadcast_from_memory.ll | 28 +- .../X86/avx512-shuffles/partial_permute.ll | 443 ++++++----- llvm/test/CodeGen/X86/pr97968.ll | 4 +- .../X86/shuffle-strided-with-offset-512.ll | 7 +- llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll | 10 +- .../vector-interleaved-load-i16-stride-3.ll | 156 ++-- .../vector-interleaved-load-i16-stride-5.ll | 120 ++- .../vector-interleaved-load-i16-stride-6.ll | 550 +++++++------- .../vector-interleaved-load-i16-stride-7.ll | 222 +++--- .../vector-interleaved-load-i16-stride-8.ll | 188 +++-- .../vector-interleaved-load-i32-stride-2.ll | 28 +- .../vector-interleaved-load-i32-stride-3.ll | 152 ++-- .../vector-interleaved-load-i32-stride-4.ll | 228 +++--- .../vector-interleaved-load-i32-stride-5.ll | 192 +++-- .../vector-interleaved-load-i32-stride-6.ll | 716 +++++++++--------- .../vector-interleaved-load-i32-stride-7.ll | 284 +++---- .../vector-interleaved-load-i32-stride-8.ll | 116 +-- .../vector-interleaved-load-i64-stride-2.ll | 52 +- .../vector-interleaved-load-i64-stride-6.ll | 164 ++-- .../vector-interleaved-load-i64-stride-7.ll | 152 ++-- ...d_vector_inreg_of_broadcast_from_memory.ll | 28 +- 22 files changed, 1919 insertions(+), 1972 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 596139d084570..add51fac4b9e6 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -41701,6 +41701,11 @@ static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V, return SDValue(); } +static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, + ArrayRef Ops, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget); + /// Try to combine x86 target specific shuffles. static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, SelectionDAG &DAG, @@ -42401,25 +42406,17 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, return SDValue(); } case X86ISD::VPERMV3: { - SDValue V1 = peekThroughBitcasts(N.getOperand(0)); - SDValue V2 = peekThroughBitcasts(N.getOperand(2)); - MVT SVT = V1.getSimpleValueType(); - // Combine VPERMV3 to widened VPERMV if the two source operands are split - // from the same vector. 
- if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR && - V1.getConstantOperandVal(1) == 0 && - V2.getOpcode() == ISD::EXTRACT_SUBVECTOR && - V2.getConstantOperandVal(1) == SVT.getVectorNumElements() && - V1.getOperand(0) == V2.getOperand(0)) { - EVT NVT = V1.getOperand(0).getValueType(); - if (NVT.is256BitVector() || - (NVT.is512BitVector() && Subtarget.hasEVEX512())) { - MVT WideVT = MVT::getVectorVT( - VT.getScalarType(), NVT.getSizeInBits() / VT.getScalarSizeInBits()); + // Combine VPERMV3 to widened VPERMV if the two source operands can be + // freely concatenated. + if (VT.is128BitVector() || + (VT.is256BitVector() && Subtarget.useAVX512Regs())) { + SDValue Ops[] = {N.getOperand(0), N.getOperand(2)}; + MVT WideVT = VT.getDoubleNumVectorElementsVT(); + if (SDValue ConcatSrc = + combineConcatVectorOps(DL, WideVT, Ops, DAG, DCI, Subtarget)) { SDValue Mask = widenSubVector(N.getOperand(1), false, Subtarget, DAG, DL, WideVT.getSizeInBits()); - SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask, - DAG.getBitcast(WideVT, V1.getOperand(0))); + SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask, ConcatSrc); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm, DAG.getIntPtrConstant(0, DL)); } @@ -42427,6 +42424,9 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, SmallVector Ops; SmallVector Mask; if (getTargetShuffleMask(N, /*AllowSentinelZero=*/false, Ops, Mask)) { + assert(Mask.size() == NumElts && "Unexpected shuffle mask size"); + SDValue V1 = peekThroughBitcasts(N.getOperand(0)); + SDValue V2 = peekThroughBitcasts(N.getOperand(2)); MVT MaskVT = N.getOperand(1).getSimpleValueType(); // Canonicalize to VPERMV if both sources are the same. if (V1 == V2) { @@ -57369,10 +57369,8 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, Op0.getOperand(1)); } - // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128. - // Only concat of subvector high halves which vperm2x128 is best at. // TODO: This should go in combineX86ShufflesRecursively eventually. - if (VT.is256BitVector() && NumOps == 2) { + if (NumOps == 2) { SDValue Src0 = peekThroughBitcasts(Ops[0]); SDValue Src1 = peekThroughBitcasts(Ops[1]); if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR && @@ -57381,7 +57379,10 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, EVT SrcVT1 = Src1.getOperand(0).getValueType(); unsigned NumSrcElts0 = SrcVT0.getVectorNumElements(); unsigned NumSrcElts1 = SrcVT1.getVectorNumElements(); - if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() && + // concat(extract_subvector(v0), extract_subvector(v1)) -> vperm2x128. + // Only concat of subvector high halves which vperm2x128 is best at. + if (VT.is256BitVector() && SrcVT0.is256BitVector() && + SrcVT1.is256BitVector() && Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) && Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) { return DAG.getNode(X86ISD::VPERM2X128, DL, VT, @@ -57389,6 +57390,14 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, DAG.getBitcast(VT, Src1.getOperand(0)), DAG.getTargetConstant(0x31, DL, MVT::i8)); } + // concat(extract_subvector(x,lo), extract_subvector(x,hi)) -> x. 
+ if (Src0.getOperand(0) == Src1.getOperand(0) && + Src0.getConstantOperandAPInt(1) == 0 && + Src1.getConstantOperandAPInt(1) == + Src0.getValueType().getVectorNumElements()) { + return DAG.getBitcast(VT, extractSubVector(Src0.getOperand(0), 0, DAG, + DL, VT.getSizeInBits())); + } } } diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll index 1305559bc04e0..3d72319f59ca9 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll @@ -1337,10 +1337,9 @@ define void @vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8(ptr %in. ; ; AVX512BW-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] -; AVX512BW-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] +; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1789,10 +1788,9 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512F-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7] -; AVX512F-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,5,0,7] +; AVX512F-FAST-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq @@ -1808,10 +1806,9 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512DQ-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7] -; AVX512DQ-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,5,0,7] +; AVX512DQ-FAST-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq @@ -1827,10 +1824,9 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7] -; AVX512BW-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 -; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,5,0,7] +; AVX512BW-FAST-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll index 
5d901a8a380a9..aac5847061cbe 100644 --- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll @@ -149,9 +149,10 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %xmm1 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,7,13,3,5,13,3,9] -; CHECK-NEXT: vpermi2w 16(%rdi), %xmm1, %xmm0 +; CHECK-NEXT: vpermw (%rdi), %ymm0, %ymm0 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i16>, ptr %vp %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> @@ -160,11 +161,12 @@ define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask0(ptr %vp) { define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %xmm2 -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [0,7,13,3,5,13,3,9] -; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm3 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,7,13,3,5,13,3,9] ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} +; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i16>, ptr %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> @@ -176,11 +178,11 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %xmm2 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,7,13,3,5,13,3,9] ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 -; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i16>, ptr %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> @@ -192,11 +194,12 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %xmm2 -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [3,15,12,7,1,5,8,14] -; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm3 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [3,15,12,7,1,5,8,14] ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} +; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i16>, ptr %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> @@ -208,11 +211,11 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %mask) { ; CHECK-LABEL: 
test_masked_z_16xi16_to_8xi16_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %xmm2
 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [3,15,12,7,1,5,8,14]
 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x i16>, ptr %vp
 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32>
@@ -256,9 +259,10 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16
 define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask3(ptr %vp) {
 ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %xmm1
 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm0 = [9,7,9,6,9,4,3,2]
-; CHECK-NEXT: vpermi2w 16(%rdi), %xmm1, %xmm0
+; CHECK-NEXT: vpermw (%rdi), %ymm0, %ymm0
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x i16>, ptr %vp
 %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32>
@@ -267,11 +271,12 @@ define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask3(ptr %vp) {
 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %xmm2
-; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [9,7,9,6,9,4,3,2]
-; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm3
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [9,7,9,6,9,4,3,2]
 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
+; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x i16>, ptr %vp
 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32>
@@ -283,11 +288,11 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16>
 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %xmm2
 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [9,7,9,6,9,4,3,2]
 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x i16>, ptr %vp
 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32>
@@ -579,9 +584,9 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x
 define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm1
 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm0 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
-; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
+; CHECK-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, ptr %vp
 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32>
@@ -590,11 +595,11 @@ define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask0(ptr %vp) {
 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask0(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm2
-; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
-; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
-; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1}
+; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, ptr %vp
 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32>
@@ -606,11 +611,10 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask0(ptr %vp, <16 x i1
 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask0(ptr %vp, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm2
 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
 ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
-; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, ptr %vp
 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32>
@@ -622,11 +626,11 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask0(ptr %vp, <16 x
 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask1(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm2
-; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
-; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
-; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1}
+; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, ptr %vp
 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32>
@@ -638,11 +642,10 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask1(ptr %vp, <16 x i1
 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask1(ptr %vp, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm2
 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
 ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
-; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, ptr %vp
 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32>
@@ -686,9 +689,9 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask2(ptr %vp, <16 x
 define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask3(ptr %vp) {
 ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm1
 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm0 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
-; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
+; CHECK-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, ptr %vp
 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32>
@@ -697,11 +700,11 @@ define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask3(ptr %vp) {
 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask3(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm2
-; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
-; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
-; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1}
+; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, ptr %vp
 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32>
@@ -713,11 +716,10 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask3(ptr %vp, <16 x i1
 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask3(ptr %vp, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm2
 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
 ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
-; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, ptr %vp
 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32>
@@ -810,11 +812,11 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16
 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask2:
 ; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [6,18,0,4,10,25,22,10]
-; CHECK-NEXT: vmovdqa (%rdi), %ymm3
-; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
+; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, ptr %vp
@@ -827,11 +829,10 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16>
 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [6,18,0,4,10,25,22,10]
-; CHECK-NEXT: vmovdqa (%rdi), %ymm1
+; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [6,18,0,4,10,25,22,10]
 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, ptr %vp
@@ -844,10 +845,9 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16
 define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask3(ptr %vp) {
 ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [19,1,5,31,9,12,17,9]
-; CHECK-NEXT: vmovdqa (%rdi), %ymm0
-; CHECK-NEXT: vpermt2w 32(%rdi), %ymm1, %ymm0
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm0 = [19,1,5,31,9,12,17,9]
+; CHECK-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, ptr %vp
@@ -857,11 +857,11 @@ define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask3(ptr %vp) {
 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask3:
 ; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [19,1,5,31,9,12,17,9]
-; CHECK-NEXT: vmovdqa (%rdi), %ymm3
-; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
+; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, ptr %vp
@@ -874,11 +874,10 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16>
 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [19,1,5,31,9,12,17,9]
-; CHECK-NEXT: vmovdqa (%rdi), %ymm1
+; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [19,1,5,31,9,12,17,9]
 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, ptr %vp
@@ -1082,11 +1081,12 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32>
 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %xmm2
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [5,0,0,3]
-; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm3
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [5,0,0,3]
 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
+; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <8 x i32>, ptr %vp
 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32>
@@ -1098,11 +1098,11 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %
 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %xmm2
 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [5,0,0,3]
 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm1 {%k1} {z}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <8 x i32>, ptr %vp
 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32>
@@ -1567,9 +1567,9 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask2(ptr %vp, <8 x i32
 define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask3(ptr %vp) {
 ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm1
 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [8,4,1,13,15,4,6,12]
-; CHECK-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm0
+; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <16 x i32>, ptr %vp
 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32>
@@ -1578,11 +1578,11 @@ define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask3(ptr %vp) {
 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask3(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm2
-; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,4,1,13,15,4,6,12]
-; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm3
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,4,1,13,15,4,6,12]
 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
-; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1}
+; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <16 x i32>, ptr %vp
 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32>
@@ -1594,11 +1594,10 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask3(ptr %vp, <8 x i32>
 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask3(ptr %vp, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm2
 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,4,1,13,15,4,6,12]
 ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
-; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <16 x i32>, ptr %vp
 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32>
@@ -1610,10 +1609,9 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask3(ptr %vp, <8 x i32
 define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [13,0,0,6]
-; CHECK-NEXT: vmovdqa (%rdi), %ymm0
-; CHECK-NEXT: vpermt2d 32(%rdi), %ymm1, %ymm0
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [13,0,0,6]
+; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x i32>, ptr %vp
@@ -1623,11 +1621,11 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask0(ptr %vp) {
 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask0:
 ; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,0,0,6]
-; CHECK-NEXT: vmovdqa (%rdi), %ymm3
-; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
+; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x i32>, ptr %vp
@@ -1640,11 +1638,10 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32>
 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,0,0,6]
-; CHECK-NEXT: vmovdqa (%rdi), %ymm1
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [13,0,0,6]
 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x i32>, ptr %vp
@@ -1691,11 +1688,11 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32
 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask2:
 ; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,15,6,9]
-; CHECK-NEXT: vmovdqa (%rdi), %ymm3
-; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
+; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x i32>, ptr %vp
@@ -1708,11 +1705,10 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32>
 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,15,6,9]
-; CHECK-NEXT: vmovdqa (%rdi), %ymm1
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [2,15,6,9]
 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x i32>, ptr %vp
@@ -2474,9 +2470,9 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64>
 define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask3(ptr %vp) {
 ; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mem_mask3:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm1
 ; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [7,0,0,2]
-; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm1, %ymm0
+; CHECK-FAST-NEXT: vpermpd (%rdi), %zmm0, %zmm0
+; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-FAST-NEXT: retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_4xi64_perm_mem_mask3:
@@ -2492,11 +2488,11 @@ define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask3(ptr %vp) {
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [7,0,0,2]
-; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3
+; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [7,0,0,2]
 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
+; CHECK-FAST-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-FAST-NEXT: retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3:
@@ -2516,11 +2512,10 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> %
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2
 ; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,0,0,2]
 ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1
-; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-FAST-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-FAST-NEXT: retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3:
@@ -2572,11 +2567,11 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64>
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,2,7,1]
-; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3
+; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,2,7,1]
 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
+; CHECK-FAST-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-FAST-NEXT: retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5:
@@ -2596,11 +2591,10 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64> %
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2
 ; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,7,1]
 ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1
-; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-FAST-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-FAST-NEXT: retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5:
@@ -2620,9 +2614,9 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64>
 define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask6(ptr %vp) {
 ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask6:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm1
 ; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [7,2,3,2]
-; CHECK-NEXT: vpermi2q 32(%rdi), %ymm1, %ymm0
+; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <8 x i64>, ptr %vp
 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
@@ -2631,11 +2625,11 @@ define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask6(ptr %vp) {
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask6:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm2
-; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [7,2,3,2]
-; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [7,2,3,2]
 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
+; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <8 x i64>, ptr %vp
 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
@@ -2647,11 +2641,10 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64> %
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask6:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm2
 ; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,2,3,2]
 ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
-; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <8 x i64>, ptr %vp
 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
@@ -3032,12 +3025,13 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4
 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %xmm2
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [3,1,3,7]
-; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3
-; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,1,3,7]
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <8 x float>, ptr %vp
 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32>
@@ -3049,12 +3043,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x
 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %xmm2
 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,1,3,7]
-; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
-; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z}
-; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1
+; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <8 x float>, ptr %vp
 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32>
@@ -3066,9 +3060,10 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4
 define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) {
 ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %xmm1
 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,3,5,3]
-; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm1, %xmm0
+; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <8 x float>, ptr %vp
 %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32>
@@ -3077,12 +3072,13 @@ define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) {
 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %xmm2
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,3,5,3]
-; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3
-; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,3,5,3]
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <8 x float>, ptr %vp
 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32>
@@ -3094,12 +3090,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x
 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %xmm2
 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,3,5,3]
-; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
-; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z}
-; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1
+; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <8 x float>, ptr %vp
 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32>
@@ -3424,9 +3420,9 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask3(<16 x float> %v
 define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %ymm1
 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [7,6,7,11,5,10,0,4]
-; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm1, %ymm0
+; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, ptr %vp
 %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32>
@@ -3435,12 +3431,12 @@ define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp) {
 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %ymm2
-; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [7,6,7,11,5,10,0,4]
-; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm3
-; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [7,6,7,11,5,10,0,4]
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, ptr %vp
 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32>
@@ -3452,12 +3448,11 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp, <8 x
 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %ymm2
 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [7,6,7,11,5,10,0,4]
-; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1
-; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, ptr %vp
 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32>
@@ -3469,12 +3464,12 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp, <8
 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask1(ptr %vp, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %ymm2
-; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [11,0,9,0,7,14,0,8]
-; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm3
-; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [11,0,9,0,7,14,0,8]
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, ptr %vp
 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32>
@@ -3486,12 +3481,11 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask1(ptr %vp, <8 x
 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1(ptr %vp, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %ymm2
 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [11,0,9,0,7,14,0,8]
-; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1
-; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, ptr %vp
 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32>
@@ -3724,10 +3718,9 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4
 define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) {
 ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,3,15,9]
-; CHECK-NEXT: vmovaps (%rdi), %ymm0
-; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm1, %ymm0
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [3,3,15,9]
+; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, ptr %vp
@@ -3737,12 +3730,12 @@ define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) {
 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,3,15,9]
-; CHECK-NEXT: vmovaps (%rdi), %ymm3
-; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm2, %ymm3
-; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, ptr %vp
@@ -3755,12 +3748,11 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x
 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,3,15,9]
-; CHECK-NEXT: vmovaps (%rdi), %ymm1
-; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
-; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,3,15,9]
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1
+; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, ptr %vp
@@ -4346,9 +4338,9 @@ define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask1(<8 x double>
 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %ymm1
 ; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,6,7,2]
-; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm1, %ymm0
+; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, ptr %vp
 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
@@ -4357,12 +4349,12 @@ define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp) {
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %ymm2
-; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,6,7,2]
-; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
-; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,6,7,2]
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, ptr %vp
 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
@@ -4374,12 +4366,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp, <4
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %ymm2
 ; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,6,7,2]
-; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
-; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, ptr %vp
 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
@@ -4441,12 +4432,12 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp,
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,2,3,4]
-; CHECK-FAST-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
-; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-FAST-NEXT: vmovapd %ymm3, %ymm0 {%k1}
+; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,2,3,4]
+; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-FAST-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-FAST-NEXT: retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2:
@@ -4467,12 +4458,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp, <4
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp, <4 x double> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2
 ; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,2,3,4]
-; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
-; CHECK-FAST-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-FAST-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
+; CHECK-FAST-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-FAST-NEXT: retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2:
@@ -4493,9 +4483,9 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp,
 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp) {
 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %ymm1
 ; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [4,2,1,0]
-; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm1, %ymm0
+; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, ptr %vp
 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
@@ -4504,12 +4494,12 @@ define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp) {
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %ymm2
-; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [4,2,1,0]
-; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
-; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [4,2,1,0]
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, ptr %vp
 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
@@ -4521,12 +4511,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %ymm2
 ; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,2,1,0]
-; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
-; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, ptr %vp
 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
diff --git a/llvm/test/CodeGen/X86/pr97968.ll b/llvm/test/CodeGen/X86/pr97968.ll
index ca5c63cdc1c2e..a539a33e9a281 100644
--- a/llvm/test/CodeGen/X86/pr97968.ll
+++ b/llvm/test/CodeGen/X86/pr97968.ll
@@ -5,8 +5,8 @@ define <2 x i32> @PR97968(<16 x i32> %a0) {
 ; CHECK-LABEL: PR97968:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [2,7,2,7]
-; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %sub0 = shufflevector <16 x i32> %a0, <16 x i32> poison, <4 x i32>
diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll
index 45842d4148a8b..82c460fc55938 100644
--- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll
+++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll
@@ -65,10 +65,9 @@ define void @shuffle_v16i32_to_v8i32_1(ptr %L, ptr %S) nounwind {
 ;
 ; AVX512BWVL-FAST-ALL-LABEL: shuffle_v16i32_to_v8i32_1:
 ; AVX512BWVL-FAST-ALL: # %bb.0:
-; AVX512BWVL-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BWVL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15]
-; AVX512BWVL-FAST-ALL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1
-; AVX512BWVL-FAST-ALL-NEXT: vmovdqa %ymm1, (%rsi)
+; AVX512BWVL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15]
+; AVX512BWVL-FAST-ALL-NEXT: vpermps (%rdi), %zmm0, %zmm0
+; AVX512BWVL-FAST-ALL-NEXT: vmovaps %ymm0, (%rsi)
 ; AVX512BWVL-FAST-ALL-NEXT: vzeroupper
 ; AVX512BWVL-FAST-ALL-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index e7557134b1486..1d82d57e5552f 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -453,9 +453,8 @@ define <4 x double> @PR34175(ptr %p) {
 ; AVX512BWVL-LABEL: PR34175:
 ; AVX512BWVL: # %bb.0:
 ; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
-; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm1
-; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
-; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX512BWVL-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX512BWVL-NEXT: vcvtdq2pd %xmm0, %ymm0
 ; AVX512BWVL-NEXT: retq
 ;
@@ -472,9 +471,8 @@ define <4 x double> @PR34175(ptr %p) {
 ; AVX512VBMIVL-LABEL: PR34175:
 ; AVX512VBMIVL: # %bb.0:
 ; AVX512VBMIVL-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
-; AVX512VBMIVL-NEXT: vmovdqu (%rdi), %ymm1
-; AVX512VBMIVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
-; AVX512VBMIVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX512VBMIVL-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; AVX512VBMIVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX512VBMIVL-NEXT: vcvtdq2pd %xmm0, %ymm0
 ; AVX512VBMIVL-NEXT: retq
 %v = load <32 x i16>, ptr %p, align 2
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
index 0cefc1c32d71b..a39bc6b668669 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
@@ -345,66 +345,66 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512BW-LABEL: load_i16_stride3_vf4:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,3,6,9,0,0,0,0]
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX512BW-NEXT: vpermi2w %xmm2, %xmm1, %xmm0
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [1,4,7,10,0,0,0,0]
-; AVX512BW-NEXT: vpermi2w %xmm2, %xmm1, %xmm3
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7]
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BW-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7]
+; AVX512BW-NEXT: vpermw %ymm1, %ymm2, %ymm1
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = mem[0,3,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = mem[2,1,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: vmovq %xmm3, (%rdx)
-; AVX512BW-NEXT: vmovq %xmm1, (%rcx)
+; AVX512BW-NEXT: vmovq %xmm1, (%rdx)
+; AVX512BW-NEXT: vmovq %xmm2, (%rcx)
+; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512BW-FCP-LABEL: load_i16_stride3_vf4:
 ; AVX512BW-FCP: # %bb.0:
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,3,6,9,0,0,0,0]
-; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX512BW-FCP-NEXT: vpermi2w %xmm2, %xmm1, %xmm0
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,4,7,10,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermi2w %xmm2, %xmm1, %xmm3
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [2,5,8,11,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermi2w %xmm2, %xmm1, %xmm4
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7]
+; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7]
+; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm2
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,2,3,10,11]
+; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm3, %ymm1
 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx)
-; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rcx)
+; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx)
+; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rcx)
+; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
 ;
 ; AVX512DQ-BW-LABEL: load_i16_stride3_vf4:
 ; AVX512DQ-BW: # %bb.0:
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,3,6,9,0,0,0,0]
-; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX512DQ-BW-NEXT: vpermi2w %xmm2, %xmm1, %xmm0
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [1,4,7,10,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermi2w %xmm2, %xmm1, %xmm3
-; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
-; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
-; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
-; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7]
+; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7]
+; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm2, %ymm1
+; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm2 = mem[0,3,2,3,4,5,6,7]
+; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = mem[2,1,2,3]
+; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
+; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx)
-; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rcx)
+; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx)
+; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rcx)
+; AVX512DQ-BW-NEXT: vzeroupper
 ; AVX512DQ-BW-NEXT: retq
 ;
 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride3_vf4:
 ; AVX512DQ-BW-FCP: # %bb.0:
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,3,6,9,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpermi2w %xmm2, %xmm1, %xmm0
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,4,7,10,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2w %xmm2, %xmm1, %xmm3
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [2,5,8,11,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2w %xmm2, %xmm1, %xmm4
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,2,3,10,11]
+; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm3, %ymm1
 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
 %wide.vec = load <12 x i16>, ptr %in.vec, align 64
 %strided.vec0 = shufflevector <12 x i16> %wide.vec, <12 x i16> poison, <4 x i32>
@@ -629,64 +629,60 @@ define void @load_i16_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-LABEL: load_i16_stride3_vf8:
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,12,15,18,21]
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,4,7,10,13,16,19,22]
-; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,5,8,11,14,17,20,23]
-; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,13,16,19,22]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,14,17,20,23]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm1
 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx)
-; AVX512BW-NEXT: vmovdqa %xmm4, (%rcx)
+; AVX512BW-NEXT: vmovdqa %xmm2, (%rdx)
+; AVX512BW-NEXT: vmovdqa %xmm1, (%rcx)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512BW-FCP-LABEL: load_i16_stride3_vf8:
 ; AVX512BW-FCP: # %bb.0:
 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,12,15,18,21]
-; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,4,7,10,13,16,19,22]
-; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,5,8,11,14,17,20,23]
-; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm4
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,13,16,19,22]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,14,17,20,23]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm1
 ; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rdx)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm1, (%rcx)
 ; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
 ;
 ; AVX512DQ-BW-LABEL: load_i16_stride3_vf8:
 ; AVX512DQ-BW: # %bb.0:
 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,12,15,18,21]
-; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,4,7,10,13,16,19,22]
-; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,5,8,11,14,17,20,23]
-; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4
+; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,13,16,19,22]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
+; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,14,17,20,23]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm1
 ; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx)
-; AVX512DQ-BW-NEXT: vmovdqa %xmm4, (%rcx)
+; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rdx)
+; AVX512DQ-BW-NEXT: vmovdqa %xmm1, (%rcx)
 ; AVX512DQ-BW-NEXT: vzeroupper
 ; AVX512DQ-BW-NEXT: retq
 ;
 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride3_vf8:
 ; AVX512DQ-BW-FCP: # %bb.0:
 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,12,15,18,21]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,4,7,10,13,16,19,22]
-; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,5,8,11,14,17,20,23]
-; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,13,16,19,22]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,14,17,20,23]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm1
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, (%rcx)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
 %wide.vec = load <24 x i16>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
index 68e92d7cf773f..739e6e2369e36 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
@@ -596,24 +596,22 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
 ; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm1
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX512BW-NEXT: vpextrw $7, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
+; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm3
+; AVX512BW-NEXT: vpextrw $7, %xmm3, %eax
+; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
 ; AVX512BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [2,7,12,17,0,0,0,0]
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm4
-; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm2
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [3,8,13,18,0,0,0,0]
-; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm5
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [4,9,14,19,0,0,0,0]
-; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm6
-; AVX512BW-NEXT: vmovq %xmm1, (%rsi)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm1
+; AVX512BW-NEXT: vmovq %xmm2, (%rsi)
 ; AVX512BW-NEXT: vmovq %xmm0, (%rdx)
-; AVX512BW-NEXT: vmovq %xmm2, (%rcx)
-; AVX512BW-NEXT: vmovq %xmm5, (%r8)
-; AVX512BW-NEXT: vmovq %xmm6, (%r9)
+; AVX512BW-NEXT: vmovq %xmm3, (%rcx)
+; AVX512BW-NEXT: vmovq %xmm4, (%r8)
+; AVX512BW-NEXT: vmovq %xmm1, (%r9)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 ;
@@ -623,24 +621,22 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
 ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX512BW-FCP-NEXT: vpextrw $7, %xmm2, %eax
-; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
+; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
+; AVX512BW-FCP-NEXT: vpextrw $7, %xmm3, %eax
+; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
 ; AVX512BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [2,7,12,17,0,0,0,0]
-; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm4
-; AVX512BW-FCP-NEXT: vpermi2w %ymm3, %ymm4, %ymm2
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [3,8,13,18,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermi2w %ymm3, %ymm4, %ymm5
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [4,9,14,19,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermi2w %ymm3, %ymm4, %ymm6
-; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rsi)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm1
+; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi)
 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx)
-; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rcx)
-; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r8)
-; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r9)
+; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx)
+; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8)
+; AVX512BW-FCP-NEXT: vmovq %xmm1, (%r9)
 ; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
 ;
@@ -650,24 +646,22 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1
 ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm1
-; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX512DQ-BW-NEXT: vpextrw $7, %xmm2, %eax
-; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
+; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm3
+; AVX512DQ-BW-NEXT: vpextrw $7, %xmm3, %eax
+; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
 ; AVX512DQ-BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [2,7,12,17,0,0,0,0]
-; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm4
-; AVX512DQ-BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm2
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [3,8,13,18,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm5
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [4,9,14,19,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm6
-; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rsi)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm1
+; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi)
 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx)
-; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rcx)
-; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r8)
-; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r9)
+; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx)
+; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8)
+; AVX512DQ-BW-NEXT: vmovq %xmm1, (%r9)
 ; AVX512DQ-BW-NEXT: vzeroupper
 ; AVX512DQ-BW-NEXT: retq
 ;
@@ -677,24 +671,22 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpextrw $7, %xmm2, %eax
-; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
+; AVX512DQ-BW-FCP-NEXT: vpextrw $7, %xmm3, %eax
+; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
 ; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [2,7,12,17,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm4
-; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm3, %ymm4, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [3,8,13,18,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm3, %ymm4, %ymm5
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [4,9,14,19,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm3, %ymm4, %ymm6
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi)
 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%r9)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
 %wide.vec = load <20 x i16>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
index 751412c77a59a..c3b53211978ae 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
@@ -293,8 +293,8 @@ define void @load_i16_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
 ; AVX512BW-FCP-NEXT: vpbroadcastw 4(%rdi), %xmm4
 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; AVX512BW-FCP-NEXT: vmovd {{.*#+}} xmm5 = [3,9,0,0,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermi2w %xmm1, %xmm0, %xmm5
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,9,1,9,2,10,3,11]
+; AVX512BW-FCP-NEXT: vpermw (%rdi), %ymm5, %ymm5
 ; AVX512BW-FCP-NEXT: vpbroadcastw 20(%rdi), %xmm6
 ; AVX512BW-FCP-NEXT: vpbroadcastw 8(%rdi), %xmm7
 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
@@ -307,6 +307,7 @@ define void @load_i16_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT: vmovd %xmm5, (%r8)
 ; AVX512BW-FCP-NEXT: vmovd %xmm6, (%r9)
 ; AVX512BW-FCP-NEXT: vmovd %xmm0, (%rax)
+; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
 ;
 ; AVX512DQ-BW-LABEL: load_i16_stride6_vf2:
@@ -346,8 +347,8 @@ define void @load_i16_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastw 4(%rdi), %xmm4
 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; AVX512DQ-BW-FCP-NEXT: vmovd {{.*#+}} xmm5 = [3,9,0,0,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2w %xmm1, %xmm0, %xmm5
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,9,1,9,2,10,3,11]
+; AVX512DQ-BW-FCP-NEXT: vpermw (%rdi), %ymm5, %ymm5
 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastw 20(%rdi), %xmm6
 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastw 8(%rdi), %xmm7
 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
@@ -360,6 +361,7 @@ define void @load_i16_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm5, (%r8)
 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm6, (%r9)
 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm0, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
 %wide.vec = load <12 x i16>, ptr %in.vec, align 64
 %strided.vec0 = shufflevector <12 x i16> %wide.vec, <12 x i16> poison, <2 x i32>
@@ -580,21 +582,20 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7]
-; AVX512-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512-NEXT: vmovdqa (%rdi), %ymm4
-; AVX512-NEXT: vpermi2d %ymm2, %ymm4, %ymm1
-; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpermd (%rdi), %zmm1, %zmm1
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [0,13,10,3]
-; AVX512-NEXT: vpermi2d %ymm4, %ymm2, %ymm6
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3]
+; AVX512-NEXT: vmovdqa 32(%rdi), %ymm5
+; AVX512-NEXT: vpermt2d (%rdi), %ymm4, %ymm5
+; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
 ; AVX512-NEXT: vmovq %xmm3, (%rsi)
 ; AVX512-NEXT: vmovq %xmm0, (%rdx)
-; AVX512-NEXT: vmovq %xmm5, (%rcx)
+; AVX512-NEXT: vmovq %xmm2, (%rcx)
 ; AVX512-NEXT: vmovq %xmm1, (%r8)
-; AVX512-NEXT: vmovq %xmm2, (%r9)
-; AVX512-NEXT: vmovq %xmm4, (%rax)
+; AVX512-NEXT: vmovq %xmm4, (%r9)
+; AVX512-NEXT: vmovq %xmm5, (%rax)
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
 ;
@@ -612,21 +613,20 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7]
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm4
-; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm4, %ymm1
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpermd (%rdi), %zmm1, %zmm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [0,13,10,3]
-; AVX512-FCP-NEXT: vpermi2d %ymm4, %ymm2, %ymm6
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3]
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5
+; AVX512-FCP-NEXT: vpermt2d (%rdi), %ymm4, %ymm5
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
 ; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi)
 ; AVX512-FCP-NEXT: vmovq %xmm0, (%rdx)
-; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx)
+; AVX512-FCP-NEXT: vmovq %xmm2, (%rcx)
 ; AVX512-FCP-NEXT: vmovq %xmm1, (%r8)
-; AVX512-FCP-NEXT: vmovq %xmm2, (%r9)
-; AVX512-FCP-NEXT: vmovq %xmm4, (%rax)
+; AVX512-FCP-NEXT: vmovq %xmm4, (%r9)
+; AVX512-FCP-NEXT: vmovq %xmm5, (%rax)
 ; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
 ;
@@ -645,21 +645,20 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7]
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm4
-; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm4, %ymm1
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpermd (%rdi), %zmm1, %zmm1
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [0,13,10,3]
-; AVX512DQ-NEXT: vpermi2d %ymm4, %ymm2, %ymm6
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3]
+; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm5
+; AVX512DQ-NEXT: vpermt2d (%rdi), %ymm4, %ymm5
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
 ; AVX512DQ-NEXT: vmovq %xmm3, (%rsi)
 ; AVX512DQ-NEXT: vmovq %xmm0, (%rdx)
-; AVX512DQ-NEXT: vmovq %xmm5, (%rcx)
+; AVX512DQ-NEXT: vmovq %xmm2, (%rcx)
 ; AVX512DQ-NEXT: vmovq %xmm1, (%r8)
-; AVX512DQ-NEXT: vmovq %xmm2, (%r9)
-; AVX512DQ-NEXT: vmovq %xmm4, (%rax)
+; AVX512DQ-NEXT: vmovq %xmm4, (%r9)
+; AVX512DQ-NEXT: vmovq %xmm5, (%rax)
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
@@ -677,21 +676,20 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7]
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm4
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm4, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpermd (%rdi), %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [0,13,10,3]
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm4, %ymm2, %ymm6
-;
AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512DQ-FCP-NEXT: vpermt2d (%rdi), %ymm4, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx) +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rcx) ; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r8) -; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%r9) -; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rax) +; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%r9) +; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -699,25 +697,24 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0] -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [1,7,13,19,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [2,8,14,20,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [3,9,15,21,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [4,10,16,22,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [5,11,17,23,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm5 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm6, %zmm1 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vmovq %xmm3, (%rdx) -; AVX512BW-NEXT: vmovq %xmm4, (%rcx) -; AVX512BW-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-NEXT: vmovq %xmm6, (%r9) -; AVX512BW-NEXT: vmovq %xmm7, (%rax) +; AVX512BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-NEXT: vmovq %xmm3, (%rcx) +; AVX512BW-NEXT: vmovq %xmm4, (%r8) +; AVX512BW-NEXT: vmovq %xmm5, (%r9) +; AVX512BW-NEXT: vmovq %xmm1, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -725,25 +722,24 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0] -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,7,13,19,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [2,8,14,20,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [3,9,15,21,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; 
AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [4,10,16,22,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [5,11,17,23,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm1 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512BW-FCP-NEXT: vmovq %xmm7, (%rax) +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8) +; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9) +; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -751,25 +747,24 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0] -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [1,7,13,19,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [2,8,14,20,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [3,9,15,21,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [4,10,16,22,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm7 = [5,11,17,23,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm5 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm6, %zmm1 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-BW-NEXT: vmovq %xmm7, (%rax) +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8) +; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r9) +; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -777,25 +772,24 @@ define void 
@load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,7,13,19,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [2,8,14,20,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [3,9,15,21,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [4,10,16,22,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [5,11,17,23,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <24 x i16>, ptr %in.vec, align 64 @@ -2865,224 +2859,228 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-LABEL: load_i16_stride6_vf16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] ; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm4 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm5 -; AVX512BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512BW-NEXT: vpermw %zmm5, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm1 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm1 -; 
AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %ymm5, %ymm4, %ymm6 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm7 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] +; AVX512BW-NEXT: vpermw %zmm5, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm2 +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] +; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm7 +; AVX512BW-NEXT: vpermi2w %ymm6, %ymm7, %ymm2 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm8 +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] +; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2w %ymm6, %ymm7, %ymm8 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] ; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %ymm5, %ymm4, %ymm7 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm8 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] +; AVX512BW-NEXT: vpermw %zmm5, %zmm7, %zmm7 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] ; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm8 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = 
[0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm9 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm4 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm9[5,6,7] +; AVX512BW-NEXT: vpermw %zmm5, %zmm8, %zmm5 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm5[5,6,7] ; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512BW-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512BW-NEXT: vmovdqa %ymm6, (%rcx) -; AVX512BW-NEXT: vmovdqa %ymm7, (%r8) -; AVX512BW-NEXT: vmovdqa %ymm8, (%r9) -; AVX512BW-NEXT: vmovdqa %ymm2, (%rax) +; AVX512BW-NEXT: vmovdqa %ymm2, (%rcx) +; AVX512BW-NEXT: vmovdqa %ymm6, (%r8) +; AVX512BW-NEXT: vmovdqa %ymm7, (%r9) +; AVX512BW-NEXT: vmovdqa %ymm3, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i16_stride6_vf16: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] ; AVX512BW-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm4 -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm5 -; AVX512BW-FCP-NEXT: vpermi2w %ymm4, %ymm5, %ymm0 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512BW-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpermi2w %ymm4, %ymm5, %ymm1 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512BW-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpermi2w %ymm5, %ymm4, %ymm6 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm7 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] +; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm1, %zmm1 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm2 +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] +; AVX512BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm7 +; AVX512BW-FCP-NEXT: vpermi2w %ymm6, %ymm7, %ymm2 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm8 +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] +; AVX512BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpermi2w %ymm6, %ymm7, %ymm8 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] ; AVX512BW-FCP-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpermi2w %ymm5, %ymm4, %ymm7 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm8 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] +; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm7, %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] ; AVX512BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpermi2w %ymm4, %ymm5, %ymm8 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512BW-FCP-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpermi2w %ymm4, %ymm5, %ymm9 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm9[5,6,7] +; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm8, %zmm5 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm5[5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%r9) -; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rax) ; 
AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i16_stride6_vf16: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] ; AVX512DQ-BW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm4 -; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm5 -; AVX512DQ-BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm0 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512DQ-BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm1 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512DQ-BW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpermi2w %ymm5, %ymm4, %ymm6 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm7 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] +; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm1, %zmm1 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm2 +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] +; AVX512DQ-BW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm7 +; AVX512DQ-BW-NEXT: vpermi2w %ymm6, %ymm7, %ymm2 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm8 +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] +; AVX512DQ-BW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpermi2w %ymm6, %ymm7, %ymm8 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm7 = 
ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] ; AVX512DQ-BW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpermi2w %ymm5, %ymm4, %ymm7 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm8 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] +; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm7, %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] ; AVX512DQ-BW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm8 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512DQ-BW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm9 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm4 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm9[5,6,7] +; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm8, %zmm5 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm5[5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa %ymm6, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa %ymm7, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa %ymm8, (%r9) -; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa %ymm6, (%r8) +; AVX512DQ-BW-NEXT: vmovdqa %ymm7, (%r9) +; AVX512DQ-BW-NEXT: vmovdqa %ymm3, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i16_stride6_vf16: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] ; AVX512DQ-BW-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm4, %ymm5, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512DQ-BW-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm4, %ymm5, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512DQ-BW-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm5, %ymm4, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm1, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] +; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm6, %ymm7, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] +; AVX512DQ-BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm6, %ymm7, %ymm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] ; AVX512DQ-BW-FCP-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm5, %ymm4, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm7, %zmm7 +; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] ; AVX512DQ-BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm4, %ymm5, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm4, %ymm5, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm9[5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm8, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm5[5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <96 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll index 713bd757a7b99..95b5ffde48564 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll @@ -321,22 +321,23 @@ define void @load_i16_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpsrld $16, %xmm0, %xmm3 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[8,9,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpbroadcastw 8(%rdi), %xmm7 -; AVX512BW-FCP-NEXT: vpsrlq $48, %xmm1, %xmm8 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512BW-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512BW-FCP-NEXT: vmovd {{.*#+}} xmm8 = [6,13,0,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %xmm1, %xmm0, %xmm8 +; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[8,9,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 
= xmm0[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpbroadcastw 8(%rdi), %xmm6 +; AVX512BW-FCP-NEXT: vpsrlq $48, %xmm1, %xmm7 +; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX512BW-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [6,13,5,13,6,14,7,15] +; AVX512BW-FCP-NEXT: vpermw (%rdi), %ymm3, %ymm3 ; AVX512BW-FCP-NEXT: vmovd %xmm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovd %xmm4, (%rdx) -; AVX512BW-FCP-NEXT: vmovd %xmm6, (%rcx) -; AVX512BW-FCP-NEXT: vmovd %xmm5, (%r8) -; AVX512BW-FCP-NEXT: vmovd %xmm7, (%r9) -; AVX512BW-FCP-NEXT: vmovd %xmm3, (%r10) -; AVX512BW-FCP-NEXT: vmovd %xmm8, (%rax) +; AVX512BW-FCP-NEXT: vmovd %xmm5, (%rcx) +; AVX512BW-FCP-NEXT: vmovd %xmm0, (%r8) +; AVX512BW-FCP-NEXT: vmovd %xmm6, (%r9) +; AVX512BW-FCP-NEXT: vmovd %xmm1, (%r10) +; AVX512BW-FCP-NEXT: vmovd %xmm3, (%rax) +; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i16_stride7_vf2: @@ -378,22 +379,23 @@ define void @load_i16_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpsrld $16, %xmm0, %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[8,9,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw 8(%rdi), %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %xmm1, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512DQ-BW-FCP-NEXT: vmovd {{.*#+}} xmm8 = [6,13,0,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %xmm1, %xmm0, %xmm8 +; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[8,9,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastw 8(%rdi), %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %xmm1, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [6,13,5,13,6,14,7,15] +; AVX512DQ-BW-FCP-NEXT: vpermw (%rdi), %ymm3, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm4, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovd %xmm6, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovd %xmm5, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovd %xmm7, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovd %xmm3, (%r10) -; AVX512DQ-BW-FCP-NEXT: vmovd %xmm8, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovd %xmm5, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovd %xmm0, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovd %xmm6, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovd %xmm1, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovd %xmm3, (%rax) +; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq 
%wide.vec = load <14 x i16>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <14 x i16> %wide.vec, <14 x i16> poison, <2 x i32> @@ -906,28 +908,27 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0] -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [1,8,15,22,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [2,9,16,23,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [3,10,17,24,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [4,11,18,25,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [5,12,19,26,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm8 = [6,13,20,27,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm8 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,22,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,9,16,23,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,10,17,24,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,11,18,25,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm5 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [5,12,19,26,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm6, %zmm6 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [6,13,20,27,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vmovq %xmm3, (%rdx) -; AVX512BW-NEXT: vmovq %xmm4, (%rcx) -; AVX512BW-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-NEXT: vmovq %xmm6, (%r9) -; AVX512BW-NEXT: vmovq %xmm7, (%r10) -; AVX512BW-NEXT: vmovq %xmm8, (%rax) +; AVX512BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-NEXT: vmovq %xmm3, (%rcx) +; AVX512BW-NEXT: vmovq %xmm4, (%r8) +; AVX512BW-NEXT: vmovq %xmm5, (%r9) +; AVX512BW-NEXT: vmovq %xmm6, (%r10) +; AVX512BW-NEXT: vmovq %xmm1, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -936,28 +937,27 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0] -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,8,15,22,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [2,9,16,23,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [3,10,17,24,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [4,11,18,25,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [5,12,19,26,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm8 = [6,13,20,27,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm8 +; AVX512BW-FCP-NEXT: vmovdqa64 
(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,22,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,9,16,23,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,10,17,24,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,11,18,25,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,12,19,26,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [6,13,20,27,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm1 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512BW-FCP-NEXT: vmovq %xmm7, (%r10) -; AVX512BW-FCP-NEXT: vmovq %xmm8, (%rax) +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8) +; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9) +; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r10) +; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -966,28 +966,27 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0] -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [1,8,15,22,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [2,9,16,23,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [3,10,17,24,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [4,11,18,25,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm7 = [5,12,19,26,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm8 = [6,13,20,27,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm8 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,22,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,9,16,23,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,10,17,24,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,11,18,25,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm5 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [5,12,19,26,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm6, %zmm6 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm7 = [6,13,20,27,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm7, %zmm1 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-BW-NEXT: vmovq %xmm7, (%r10) -; AVX512DQ-BW-NEXT: vmovq %xmm8, (%rax) +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-NEXT: vmovq %xmm3, 
(%rcx) +; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8) +; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r9) +; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r10) +; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -996,28 +995,27 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,8,15,22,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [2,9,16,23,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [3,10,17,24,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [4,11,18,25,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [5,12,19,26,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm8 = [6,13,20,27,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,22,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,9,16,23,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,10,17,24,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,11,18,25,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,12,19,26,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [6,13,20,27,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%r10) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm8, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <28 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll index 051b4e300b827..fff21f9aad1bb 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll @@ -623,31 +623,30 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0] -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; 
AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [1,9,17,25,0,0,0,0]
-; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [2,10,18,26,0,0,0,0]
-; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [3,11,19,27,0,0,0,0]
-; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [4,12,20,28,0,0,0,0]
-; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [5,13,21,29,0,0,0,0]
-; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm8 = [6,14,22,30,0,0,0,0]
-; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm8
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm9 = [7,15,23,31,0,0,0,0]
-; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm9
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm5
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm6, %zmm6
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm7, %zmm7
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm8, %zmm1
 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: vmovq %xmm3, (%rdx)
-; AVX512BW-NEXT: vmovq %xmm4, (%rcx)
-; AVX512BW-NEXT: vmovq %xmm5, (%r8)
-; AVX512BW-NEXT: vmovq %xmm6, (%r9)
-; AVX512BW-NEXT: vmovq %xmm7, (%r11)
-; AVX512BW-NEXT: vmovq %xmm8, (%r10)
-; AVX512BW-NEXT: vmovq %xmm9, (%rax)
+; AVX512BW-NEXT: vmovq %xmm2, (%rdx)
+; AVX512BW-NEXT: vmovq %xmm3, (%rcx)
+; AVX512BW-NEXT: vmovq %xmm4, (%r8)
+; AVX512BW-NEXT: vmovq %xmm5, (%r9)
+; AVX512BW-NEXT: vmovq %xmm6, (%r11)
+; AVX512BW-NEXT: vmovq %xmm7, (%r10)
+; AVX512BW-NEXT: vmovq %xmm1, (%rax)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 ;
@@ -657,31 +656,30 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
-; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,9,17,25,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [2,10,18,26,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm4
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [3,11,19,27,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm5
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [4,12,20,28,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm6
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [5,13,21,29,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm7
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm8 = [6,14,22,30,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm8
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm9 = [7,15,23,31,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm9
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm7
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm8, %zmm1
 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx)
-; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rcx)
-; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r8)
-; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r9)
-; AVX512BW-FCP-NEXT: vmovq %xmm7, (%r11)
-; AVX512BW-FCP-NEXT: vmovq %xmm8, (%r10)
-; AVX512BW-FCP-NEXT: vmovq %xmm9, (%rax)
+; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx)
+; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx)
+; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8)
+; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9)
+; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r11)
+; AVX512BW-FCP-NEXT: vmovq %xmm7, (%r10)
+; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rax)
 ; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
 ;
@@ -691,31 +689,30 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
-; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [1,9,17,25,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [2,10,18,26,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [3,11,19,27,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [4,12,20,28,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm7 = [5,13,21,29,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm8 = [6,14,22,30,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm8
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm9 = [7,15,23,31,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm9
+; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm5
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm6, %zmm6
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm7, %zmm7
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm8, %zmm1
 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx)
-; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx)
-; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r8)
-; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r9)
-; AVX512DQ-BW-NEXT: vmovq %xmm7, (%r11)
-; AVX512DQ-BW-NEXT: vmovq %xmm8, (%r10)
-; AVX512DQ-BW-NEXT: vmovq %xmm9, (%rax)
+; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx)
+; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx)
+; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8)
+; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r9)
+; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r11)
+; AVX512DQ-BW-NEXT: vmovq %xmm7, (%r10)
+; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rax)
 ; AVX512DQ-BW-NEXT: vzeroupper
 ; AVX512DQ-BW-NEXT: retq
 ;
@@ -725,31 +722,30 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,9,17,25,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [2,10,18,26,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm4
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [3,11,19,27,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm5
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [4,12,20,28,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm6
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [5,13,21,29,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm7
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm8 = [6,14,22,30,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm8
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm9 = [7,15,23,31,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm9
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm8, %zmm1
 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%r11)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm8, (%r10)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm9, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r11)
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%r10)
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rax)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
   %wide.vec = load <32 x i16>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll
index 7cb46b79f7f36..f2c5a91d2cca3 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll
@@ -363,11 +363,10 @@ define void @load_i32_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
 ; AVX512-FCP-LABEL: load_i32_stride2_vf8:
 ; AVX512-FCP: # %bb.0:
 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15]
-; AVX512-FCP-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15]
+; AVX512-FCP-NEXT: vpermps (%rdi), %zmm1, %zmm1
 ; AVX512-FCP-NEXT: vpmovqd %zmm0, (%rsi)
-; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rdx)
+; AVX512-FCP-NEXT: vmovaps %ymm1, (%rdx)
 ; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
 ;
@@ -385,11 +384,10 @@ define void @load_i32_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
 ; AVX512DQ-FCP-LABEL: load_i32_stride2_vf8:
 ; AVX512DQ-FCP: # %bb.0:
 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15]
-; AVX512DQ-FCP-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15]
+; AVX512DQ-FCP-NEXT: vpermps (%rdi), %zmm1, %zmm1
 ; AVX512DQ-FCP-NEXT: vpmovqd %zmm0, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovaps %ymm1, (%rdx)
 ; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq
 ;
@@ -407,11 +405,10 @@ define void @load_i32_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
 ; AVX512BW-FCP-LABEL: load_i32_stride2_vf8:
 ; AVX512BW-FCP: # %bb.0:
 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15]
-; AVX512BW-FCP-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15]
+; AVX512BW-FCP-NEXT: vpermps (%rdi), %zmm1, %zmm1
 ; AVX512BW-FCP-NEXT: vpmovqd %zmm0, (%rsi)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rdx)
+; AVX512BW-FCP-NEXT: vmovaps %ymm1, (%rdx)
 ; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
 ;
@@ -429,11 +426,10 @@ define void @load_i32_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
 ; AVX512DQ-BW-FCP-LABEL: load_i32_stride2_vf8:
 ; AVX512DQ-BW-FCP: # %bb.0:
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15]
+; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %zmm1, %zmm1
 ; AVX512DQ-BW-FCP-NEXT: vpmovqd %zmm0, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm1, (%rdx)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
   %wide.vec = load <16 x i32>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
index 213c5febfca23..d9383f524f1d1 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
@@ -310,128 +310,120 @@ define void @load_i32_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-LABEL: load_i32_stride3_vf4:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9]
-; AVX512-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm0
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10]
-; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm3
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11]
-; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm4
-; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512-NEXT: vmovdqa %xmm3, (%rdx)
-; AVX512-NEXT: vmovdqa %xmm4, (%rcx)
+; AVX512-NEXT: vmovaps (%rdi), %zmm1
+; AVX512-NEXT: vpermps %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10]
+; AVX512-NEXT: vpermps %zmm1, %zmm2, %zmm2
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11]
+; AVX512-NEXT: vpermps %zmm1, %zmm3, %zmm1
+; AVX512-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512-NEXT: vmovaps %xmm2, (%rdx)
+; AVX512-NEXT: vmovaps %xmm1, (%rcx)
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
 ;
 ; AVX512-FCP-LABEL: load_i32_stride3_vf4:
 ; AVX512-FCP: # %bb.0:
 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9]
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10]
-; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11]
-; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4
-; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm4, (%rcx)
+; AVX512-FCP-NEXT: vmovaps (%rdi), %zmm1
+; AVX512-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10]
+; AVX512-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11]
+; AVX512-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm1
+; AVX512-FCP-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512-FCP-NEXT: vmovaps %xmm2, (%rdx)
+; AVX512-FCP-NEXT: vmovaps %xmm1, (%rcx)
 ; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
 ;
 ; AVX512DQ-LABEL: load_i32_stride3_vf4:
 ; AVX512DQ: # %bb.0:
 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9]
-; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm1, %ymm0
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10]
-; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm1, %ymm3
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11]
-; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm1, %ymm4
-; AVX512DQ-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512DQ-NEXT: vmovdqa %xmm3, (%rdx)
-; AVX512DQ-NEXT: vmovdqa %xmm4, (%rcx)
+; AVX512DQ-NEXT: vmovaps (%rdi), %zmm1
+; AVX512DQ-NEXT: vpermps %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10]
+; AVX512DQ-NEXT: vpermps %zmm1, %zmm2, %zmm2
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11]
+; AVX512DQ-NEXT: vpermps %zmm1, %zmm3, %zmm1
+; AVX512DQ-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512DQ-NEXT: vmovaps %xmm2, (%rdx)
+; AVX512DQ-NEXT: vmovaps %xmm1, (%rcx)
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
 ; AVX512DQ-FCP-LABEL: load_i32_stride3_vf4:
 ; AVX512DQ-FCP: # %bb.0:
 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9]
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10]
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11]
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %zmm1
+; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10]
+; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11]
+; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm1
+; AVX512DQ-FCP-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovaps %xmm2, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovaps %xmm1, (%rcx)
 ; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq
 ;
 ; AVX512BW-LABEL: load_i32_stride3_vf4:
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9]
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm0
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10]
-; AVX512BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm3
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11]
-; AVX512BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm4
-; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx)
-; AVX512BW-NEXT: vmovdqa %xmm4, (%rcx)
+; AVX512BW-NEXT: vmovaps (%rdi), %zmm1
+; AVX512BW-NEXT: vpermps %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10]
+; AVX512BW-NEXT: vpermps %zmm1, %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11]
+; AVX512BW-NEXT: vpermps %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512BW-NEXT: vmovaps %xmm2, (%rdx)
+; AVX512BW-NEXT: vmovaps %xmm1, (%rcx)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512BW-FCP-LABEL: load_i32_stride3_vf4:
 ; AVX512BW-FCP: # %bb.0:
 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9]
-; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10]
-; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11]
-; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4
-; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rcx)
+; AVX512BW-FCP-NEXT: vmovaps (%rdi), %zmm1
+; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10]
+; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11]
+; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm1
+; AVX512BW-FCP-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512BW-FCP-NEXT: vmovaps %xmm2, (%rdx)
+; AVX512BW-FCP-NEXT: vmovaps %xmm1, (%rcx)
 ; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
 ;
 ; AVX512DQ-BW-LABEL: load_i32_stride3_vf4:
 ; AVX512DQ-BW: # %bb.0:
 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9]
-; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm0
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10]
-; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm3
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11]
-; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm4
-; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx)
-; AVX512DQ-BW-NEXT: vmovdqa %xmm4, (%rcx)
+; AVX512DQ-BW-NEXT: vmovaps (%rdi), %zmm1
+; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10]
+; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm2, %zmm2
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11]
+; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm3, %zmm1
+; AVX512DQ-BW-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512DQ-BW-NEXT: vmovaps %xmm2, (%rdx)
+; AVX512DQ-BW-NEXT: vmovaps %xmm1, (%rcx)
 ; AVX512DQ-BW-NEXT: vzeroupper
 ; AVX512DQ-BW-NEXT: retq
 ;
 ; AVX512DQ-BW-FCP-LABEL: load_i32_stride3_vf4:
 ; AVX512DQ-BW-FCP: # %bb.0:
 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10]
+; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11]
+; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm2, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm1, (%rcx)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
   %wide.vec = load <12 x i32>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
index 61f91b2bb0c0c..0bf1260738439 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
@@ -106,13 +106,14 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0]
-; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1]
+; AVX512-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3
 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX512-FCP-NEXT: vmovlps %xmm3, (%rdx)
 ; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
+; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
 ;
 ; AVX512DQ-LABEL: load_i32_stride4_vf2:
@@ -134,13 +135,14 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0]
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1]
+; AVX512DQ-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3
 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovlps %xmm3, (%rdx)
 ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512DQ-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
+; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq
 ;
 ; AVX512BW-LABEL: load_i32_stride4_vf2:
@@ -162,13 +164,14 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0]
-; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1]
+; AVX512BW-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3
 ; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX512BW-FCP-NEXT: vmovlps %xmm3, (%rdx)
 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
+; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
 ;
 ; AVX512DQ-BW-LABEL: load_i32_stride4_vf2:
@@ -190,13 +193,14 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1]
+; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3
 ; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm3, (%rdx)
 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512DQ-BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
   %wide.vec = load <8 x i32>, ptr %in.vec, align 64
   %strided.vec0 = shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <2 x i32>
@@ -361,152 +365,144 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-LABEL: load_i32_stride4_vf4:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12]
-; AVX512-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm0
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,9,13]
-; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm3
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,6,10,14]
-; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm4
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,7,11,15]
-; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm5
-; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512-NEXT: vmovdqa %xmm3, (%rdx)
-; AVX512-NEXT: vmovdqa %xmm4, (%rcx)
-; AVX512-NEXT: vmovdqa %xmm5, (%r8)
+; AVX512-NEXT: vmovaps (%rdi), %zmm1
+; AVX512-NEXT: vpermps %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13]
+; AVX512-NEXT: vpermps %zmm1, %zmm2, %zmm2
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14]
+; AVX512-NEXT: vpermps %zmm1, %zmm3, %zmm3
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15]
+; AVX512-NEXT: vpermps %zmm1, %zmm4, %zmm1
+; AVX512-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512-NEXT: vmovaps %xmm2, (%rdx)
+; AVX512-NEXT: vmovaps %xmm3, (%rcx)
+; AVX512-NEXT: vmovaps %xmm1, (%r8)
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
 ;
 ; AVX512-FCP-LABEL: load_i32_stride4_vf4:
 ; AVX512-FCP: # %bb.0:
 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12]
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,9,13]
-; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,6,10,14]
-; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,7,11,15]
-; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm5
-; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm4, (%rcx)
-; AVX512-FCP-NEXT: vmovdqa %xmm5, (%r8)
+; AVX512-FCP-NEXT: vmovaps (%rdi), %zmm1
+; AVX512-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13]
+; AVX512-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14]
+; AVX512-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm3
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15]
+; AVX512-FCP-NEXT: vpermps %zmm1, %zmm4, %zmm1
+; AVX512-FCP-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512-FCP-NEXT: vmovaps %xmm2, (%rdx)
+; AVX512-FCP-NEXT: vmovaps %xmm3, (%rcx)
+; AVX512-FCP-NEXT: vmovaps %xmm1, (%r8)
 ; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
 ;
 ; AVX512DQ-LABEL: load_i32_stride4_vf4:
 ; AVX512DQ: # %bb.0:
 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12]
-; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm1, %ymm0
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,9,13]
-; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm1, %ymm3
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,6,10,14]
-; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm1, %ymm4
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,7,11,15]
-; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm1, %ymm5
-; AVX512DQ-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512DQ-NEXT: vmovdqa %xmm3, (%rdx)
-; AVX512DQ-NEXT: vmovdqa %xmm4, (%rcx)
-; AVX512DQ-NEXT: vmovdqa %xmm5, (%r8)
+; AVX512DQ-NEXT: vmovaps (%rdi), %zmm1
+; AVX512DQ-NEXT: vpermps %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13]
+; AVX512DQ-NEXT: vpermps %zmm1, %zmm2, %zmm2
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14]
+; AVX512DQ-NEXT: vpermps %zmm1, %zmm3, %zmm3
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15]
+; AVX512DQ-NEXT: vpermps %zmm1, %zmm4, %zmm1
+; AVX512DQ-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512DQ-NEXT: vmovaps %xmm2, (%rdx)
+; AVX512DQ-NEXT: vmovaps %xmm3, (%rcx)
+; AVX512DQ-NEXT: vmovaps %xmm1, (%r8)
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
 ; AVX512DQ-FCP-LABEL: load_i32_stride4_vf4:
 ; AVX512DQ-FCP: # %bb.0:
 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12]
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,9,13]
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,6,10,14]
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,7,11,15]
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, (%r8)
+; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %zmm1
+; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13]
+; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15]
+; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm4, %zmm1
+; AVX512DQ-FCP-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovaps %xmm2, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovaps %xmm3, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovaps %xmm1, (%r8)
 ; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq
 ;
 ; AVX512BW-LABEL: load_i32_stride4_vf4:
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12]
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm0
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,9,13]
-; AVX512BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm3
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,6,10,14]
-; AVX512BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm4
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,7,11,15]
-; AVX512BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm5
-; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx)
-; AVX512BW-NEXT: vmovdqa %xmm4, (%rcx)
-; AVX512BW-NEXT: vmovdqa %xmm5, (%r8)
+; AVX512BW-NEXT: vmovaps (%rdi), %zmm1
+; AVX512BW-NEXT: vpermps %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13]
+; AVX512BW-NEXT: vpermps %zmm1, %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14]
+; AVX512BW-NEXT: vpermps %zmm1, %zmm3, %zmm3
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15]
+; AVX512BW-NEXT: vpermps %zmm1, %zmm4, %zmm1
+; AVX512BW-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512BW-NEXT: vmovaps %xmm2, (%rdx)
+; AVX512BW-NEXT: vmovaps %xmm3, (%rcx)
+; AVX512BW-NEXT: vmovaps %xmm1, (%r8)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512BW-FCP-LABEL: load_i32_stride4_vf4:
 ; AVX512BW-FCP: # %bb.0:
 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12]
-; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,9,13]
-; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,6,10,14]
-; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,7,11,15]
-; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm5
-; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm5, (%r8)
+; AVX512BW-FCP-NEXT: vmovaps (%rdi), %zmm1
+; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13]
+; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14]
+; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm3
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15]
+; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm4, %zmm1
+; AVX512BW-FCP-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512BW-FCP-NEXT: vmovaps %xmm2, (%rdx)
+; AVX512BW-FCP-NEXT: vmovaps %xmm3, (%rcx)
+; AVX512BW-FCP-NEXT: vmovaps %xmm1, (%r8)
 ; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
 ;
 ; AVX512DQ-BW-LABEL: load_i32_stride4_vf4:
 ; AVX512DQ-BW: # %bb.0:
 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12]
-; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm0
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,9,13]
-; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm3
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,6,10,14]
-; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm4
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,7,11,15]
-; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm5
-; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx)
-; AVX512DQ-BW-NEXT: vmovdqa %xmm4, (%rcx)
-; AVX512DQ-BW-NEXT: vmovdqa %xmm5, (%r8)
+; AVX512DQ-BW-NEXT: vmovaps (%rdi), %zmm1
+; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13]
+; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm2, %zmm2
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14]
+; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm3, %zmm3
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15]
+; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm4, %zmm1
+; AVX512DQ-BW-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512DQ-BW-NEXT: vmovaps %xmm2, (%rdx)
+; AVX512DQ-BW-NEXT: vmovaps %xmm3, (%rcx)
+; AVX512DQ-BW-NEXT: vmovaps %xmm1, (%r8)
 ; AVX512DQ-BW-NEXT: vzeroupper
 ; AVX512DQ-BW-NEXT: retq
 ;
 ; AVX512DQ-BW-FCP-LABEL: load_i32_stride4_vf4:
 ; AVX512DQ-BW-FCP: # %bb.0:
 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm5
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm5, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15]
+; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm4, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm2, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm3, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm1, (%r8)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
   %wide.vec = load <16 x i32>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
index d8d48b0b8c73d..c08442f9d9d01 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
@@ -144,19 +144,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-LABEL: load_i32_stride5_vf2:
 ; AVX512-FCP: # %bb.0:
 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,6,1,6]
-; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm4
-; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm5 = [2,7,2,7]
-; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm5
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX512-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm1
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
-; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi)
-; AVX512-FCP-NEXT: vmovq %xmm4, (%rdx)
-; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],mem[1],xmm0[2,3]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0]
+; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm4
+; AVX512-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,0,0]
+; AVX512-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4
+; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
+; AVX512-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3]
+; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
+; AVX512-FCP-NEXT: vmovlps %xmm3, (%rdx)
+; AVX512-FCP-NEXT: vmovlps %xmm4, (%rcx)
 ; AVX512-FCP-NEXT: vmovq %xmm0, (%r8)
 ; AVX512-FCP-NEXT: vmovq %xmm1, (%r9)
 ; AVX512-FCP-NEXT: vzeroupper
@@ -188,19 +188,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-LABEL: load_i32_stride5_vf2:
 ; AVX512DQ-FCP: # %bb.0:
 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,6,1,6]
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm4
-; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm5 = [2,7,2,7]
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm5
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX512DQ-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
-; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],mem[1],xmm0[2,3]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0]
+; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm4
+; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,0,0]
+; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4
+; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
+; AVX512DQ-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3]
+; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovlps %xmm3, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovlps %xmm4, (%rcx)
 ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r8)
 ; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r9)
 ; AVX512DQ-FCP-NEXT: vzeroupper
@@ -232,19 +232,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-LABEL: load_i32_stride5_vf2:
 ; AVX512BW-FCP: # %bb.0:
 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,6,1,6]
-; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm4
-; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm5 = [2,7,2,7]
-; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm5
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX512BW-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm1
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
-; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rsi)
-; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rdx)
-; AVX512BW-FCP-NEXT: vmovq %xmm5, (%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],mem[1],xmm0[2,3]
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0]
+; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm4
+; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,0,0]
+; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4
+; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
+; AVX512BW-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3]
+; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi)
+; AVX512BW-FCP-NEXT: vmovlps %xmm3, (%rdx)
+; AVX512BW-FCP-NEXT: vmovlps %xmm4, (%rcx)
 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r8)
 ; AVX512BW-FCP-NEXT: vmovq %xmm1, (%r9)
 ; AVX512BW-FCP-NEXT: vzeroupper
@@ -276,19 +276,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-LABEL: load_i32_stride5_vf2:
 ; AVX512DQ-BW-FCP: # %bb.0:
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,6,1,6]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm4
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm5 = [2,7,2,7]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm5
-; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],mem[1],xmm0[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0]
+; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm4
+; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4
+; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm3, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm4, (%rcx)
 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r8)
 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%r9)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
@@ -491,18 +491,17 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1
 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15]
-; AVX512-NEXT: vmovdqa (%rdi), %ymm3
-; AVX512-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,11,16]
-; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512-NEXT: vpermd %zmm0, %zmm2, %zmm2
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,11,16]
+; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17]
 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18]
 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19]
 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
-; AVX512-NEXT: vmovdqa %xmm3, (%rsi)
-; AVX512-NEXT: vmovdqa %xmm2, (%rdx)
+; AVX512-NEXT: vmovdqa %xmm2, (%rsi)
+; AVX512-NEXT: vmovdqa %xmm3, (%rdx)
 ; AVX512-NEXT: vmovdqa %xmm4, (%rcx)
 ; AVX512-NEXT: vmovdqa %xmm5, (%r8)
 ; AVX512-NEXT: vmovdqa %xmm6, (%r9)
@@ -514,18 +513,17 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15]
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm3
-; AVX512-FCP-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,11,16]
-; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm2
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,11,16]
+; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17]
 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18]
 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19]
 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
-; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rsi)
-; AVX512-FCP-NEXT: vmovdqa %xmm2, (%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm2, (%rsi)
+; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rdx)
 ; AVX512-FCP-NEXT: vmovdqa %xmm4, (%rcx)
 ; AVX512-FCP-NEXT: vmovdqa %xmm5, (%r8)
 ; AVX512-FCP-NEXT: vmovdqa %xmm6, (%r9)
@@ -537,18 +535,17 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1
 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15]
-; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3
-; AVX512DQ-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,11,16]
-; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm2
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,11,16]
+; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17]
 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18]
 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19]
 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
-; AVX512DQ-NEXT: vmovdqa %xmm3, (%rsi)
-; AVX512DQ-NEXT: vmovdqa %xmm2, (%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm2, (%rsi)
+; AVX512DQ-NEXT: vmovdqa %xmm3, (%rdx)
 ; AVX512DQ-NEXT: vmovdqa %xmm4, (%rcx)
 ; AVX512DQ-NEXT: vmovdqa %xmm5, (%r8)
 ; AVX512DQ-NEXT: vmovdqa %xmm6, (%r9)
@@ -560,18 +557,17 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15]
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm3
-; AVX512DQ-FCP-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,11,16]
-; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm2
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,11,16]
+; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17]
 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18]
 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19]
 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rdx)
 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%rcx)
 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, (%r8)
 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%r9)
@@ -583,18 +579,17 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15]
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm3
-; AVX512BW-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,11,16]
-; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,11,16]
+; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17]
 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18]
 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19]
 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
-; AVX512BW-NEXT: vmovdqa %xmm3, (%rsi)
-; AVX512BW-NEXT: vmovdqa %xmm2, (%rdx)
+; AVX512BW-NEXT: vmovdqa %xmm2, (%rsi)
+; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx)
 ; AVX512BW-NEXT: vmovdqa %xmm4, (%rcx)
 ; AVX512BW-NEXT: vmovdqa %xmm5, (%r8)
 ; AVX512BW-NEXT: vmovdqa %xmm6, (%r9)
@@ -606,18 +601,17 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15]
-; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm3
-; AVX512BW-FCP-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,11,16]
-; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm2
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,11,16]
+; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17]
 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18]
 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19]
 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
-; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rsi)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rdx)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rsi)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx)
 ; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rcx)
 ; AVX512BW-FCP-NEXT: vmovdqa %xmm5, (%r8)
 ; AVX512BW-FCP-NEXT: vmovdqa %xmm6, (%r9)
@@ -629,18 +623,17 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1
 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15]
-; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm3
-; AVX512DQ-BW-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,11,16]
-; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512DQ-BW-NEXT: vpermd %zmm0, %zmm2, %zmm2
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,11,16]
+; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17]
 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18]
 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19]
 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
-; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rsi)
-; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rdx)
+; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rsi)
+; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx)
 ; AVX512DQ-BW-NEXT: vmovdqa %xmm4, (%rcx)
 ; AVX512DQ-BW-NEXT: vmovdqa %xmm5, (%r8)
 ; AVX512DQ-BW-NEXT: vmovdqa %xmm6, (%r9)
@@ -652,18 +645,17 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm3
-; AVX512DQ-BW-FCP-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,11,16]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,11,16]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17]
 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18]
 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19]
 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx)
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rcx)
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm5, (%r8)
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm6, (%r9)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
index 3ba41ad07ce83..ae3e5445bf266 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
@@ -192,29 +192,28 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-LABEL: load_i32_stride6_vf2:
 ; AVX512-FCP: # %bb.0:
 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,6,0,6]
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
-; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm0
-; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,7,1,7]
-; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4
-; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,4,2,4]
-; AVX512-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm2
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,5,0,0]
-; AVX512-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm5
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,2,0,0]
-; AVX512-FCP-NEXT: vmovaps 32(%rdi), %ymm3
-; AVX512-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7]
-; AVX512-FCP-NEXT: vpermps %ymm3, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0]
-; AVX512-FCP-NEXT: vpermps %ymm3, %ymm6, %ymm3
-; AVX512-FCP-NEXT: vmovq %xmm0, (%rsi)
-; AVX512-FCP-NEXT: vmovq %xmm4, (%rdx)
-; AVX512-FCP-NEXT: vmovq %xmm2, (%rcx)
-; AVX512-FCP-NEXT: vmovq %xmm5, (%r8)
-; AVX512-FCP-NEXT: vmovlps %xmm1, (%r9)
-; AVX512-FCP-NEXT: vmovlps %xmm3, (%rax)
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0]
+; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm1
+; AVX512-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0]
+; AVX512-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4]
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm5
+; AVX512-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,5,0,0]
+; AVX512-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm6
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0]
+; AVX512-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0]
+; AVX512-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1
+; AVX512-FCP-NEXT: vmovlps %xmm0, (%rsi)
+; AVX512-FCP-NEXT: vmovlps %xmm2, (%rdx)
+; AVX512-FCP-NEXT: vmovq %xmm3, (%rcx)
+; AVX512-FCP-NEXT: vmovq %xmm6, (%r8)
+; AVX512-FCP-NEXT: vmovlps %xmm4, (%r9)
+; AVX512-FCP-NEXT: vmovlps %xmm1, (%rax)
 ; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
 ;
@@ -252,29 +251,28 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-LABEL: load_i32_stride6_vf2:
 ; AVX512DQ-FCP: # %bb.0:
 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,6,0,6]
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm0
-; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,7,1,7]
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4
-; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,4,2,4]
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm2
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,5,0,0]
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm5
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,2,0,0]
-; AVX512DQ-FCP-NEXT: vmovaps 32(%rdi), %ymm3
-; AVX512DQ-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpermps %ymm3, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0]
-; AVX512DQ-FCP-NEXT: vpermps %ymm3, %ymm6, %ymm3
-; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8)
-; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%r9)
-; AVX512DQ-FCP-NEXT: vmovlps %xmm3, (%rax)
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0]
+; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm1
+; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0]
+; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4]
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm5
+; AVX512DQ-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,5,0,0]
+; AVX512DQ-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm6
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0]
+; AVX512DQ-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0]
+; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1
+; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovlps %xmm2, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8)
+; AVX512DQ-FCP-NEXT: vmovlps %xmm4, (%r9)
+; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%rax)
 ; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq
 ;
@@ -312,29 +310,28 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-LABEL: load_i32_stride6_vf2:
 ; AVX512BW-FCP: # %bb.0:
 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,6,0,6]
-; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
-; AVX512BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm0
-; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,7,1,7]
-; AVX512BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4
-; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,4,2,4]
-; AVX512BW-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm2
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,5,0,0]
-; AVX512BW-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm5
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,2,0,0]
-; AVX512BW-FCP-NEXT: vmovaps 32(%rdi), %ymm3
-; AVX512BW-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-FCP-NEXT: vpermps %ymm3, %ymm1, %ymm1
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0]
-; AVX512BW-FCP-NEXT: vpermps %ymm3, %ymm6, %ymm3
-; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rdx)
-; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rcx)
-; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r8)
-; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%r9)
-; AVX512BW-FCP-NEXT: vmovlps %xmm3, (%rax)
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0]
+; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm1
+; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0]
+; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2
+; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4]
+; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm4
+; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm5
+; AVX512BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,5,0,0]
+; AVX512BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm6
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0]
+; AVX512BW-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0]
+; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1
+; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rsi)
+; AVX512BW-FCP-NEXT: vmovlps %xmm2, (%rdx)
+; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx)
+; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r8)
+; AVX512BW-FCP-NEXT: vmovlps %xmm4, (%r9)
+; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%rax)
 ; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
 ;
@@ -372,29 +369,28 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-LABEL: load_i32_stride6_vf2:
 ; AVX512DQ-BW-FCP: # %bb.0:
 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,6,0,6]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm0
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,7,1,7]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,4,2,4]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,5,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm5
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,2,0,0]
-; AVX512DQ-BW-FCP-NEXT: vmovaps 32(%rdi), %ymm3
-; AVX512DQ-BW-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpermps %ymm3, %ymm1, %ymm1
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermps %ymm3, %ymm6, %ymm3
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm3, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0]
+; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm1
+; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm5
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,5,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm6
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0]
+; AVX512DQ-BW-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm2, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm4, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%rax)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
   %wide.vec = load <12 x i32>, ptr %in.vec, align 64
@@ -1291,352 +1287,360 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-LABEL: load_i32_stride6_vf8:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2
-; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3
 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm0
 ; AVX512-NEXT: vmovdqa 160(%rdi), %ymm1
-; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,6,12,18,24,30,0,0]
-; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10]
-; AVX512-NEXT: vpermi2d %ymm4, %ymm5, %ymm6
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,7,13,19,25,31,0,0]
-; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11]
-; AVX512-NEXT: vpermi2d %ymm4, %ymm5, %ymm7
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,6,12]
-; AVX512-NEXT: vpermi2d %ymm1, %ymm0, %ymm4
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [2,8,14,20,26,0,0,0]
-; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5
-; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,1,7,13]
-; AVX512-NEXT: vpermi2d %ymm1, %ymm0, %ymm5
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0]
-; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm8
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm4
+; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm5
+; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm6
+; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10]
+; AVX512-NEXT: vpermi2d %ymm2, %ymm3, %ymm7
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0]
+; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11]
+; AVX512-NEXT: vpermi2d %ymm2, %ymm3, %ymm8
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12]
+; AVX512-NEXT: vpermd %zmm6, %zmm2, %zmm2
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0]
+; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13]
+; AVX512-NEXT: vpermd %zmm6, %zmm3, %zmm3
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0]
+; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm6
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7]
 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0]
-; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm1
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14]
-; AVX512-NEXT: vpermi2d %ymm0, %ymm1, %ymm8
+; AVX512-NEXT: vpermi2d %zmm4, %zmm5, %zmm1
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14]
+; AVX512-NEXT: vpermi2d %ymm0, %ymm1, %ymm6
 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0]
-; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm1
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15]
-; AVX512-NEXT: vpermi2d %ymm0, %ymm1, %ymm2
-; AVX512-NEXT: vmovdqa %ymm6, (%rsi)
-; AVX512-NEXT: vmovdqa %ymm7, (%rdx)
-; AVX512-NEXT: vmovdqa %ymm4, (%rcx)
-; AVX512-NEXT: vmovdqa %ymm5, (%r8)
-; AVX512-NEXT: vmovdqa %ymm8, (%r9)
-; AVX512-NEXT: vmovdqa %ymm2, (%rax)
+; AVX512-NEXT: vpermi2d %zmm4, %zmm5, %zmm1
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15]
+; AVX512-NEXT: vpermi2d %ymm0, %ymm1, %ymm4
+; AVX512-NEXT: vmovdqa %ymm7, (%rsi)
+; AVX512-NEXT: vmovdqa %ymm8, (%rdx)
+; AVX512-NEXT: vmovdqa %ymm2, (%rcx)
+; AVX512-NEXT: vmovdqa %ymm3, (%r8)
+; AVX512-NEXT: vmovdqa %ymm6, (%r9)
+; AVX512-NEXT: vmovdqa %ymm4, (%rax)
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
 ;
 ; AVX512-FCP-LABEL: load_i32_stride6_vf8:
 ; AVX512-FCP: # %bb.0:
 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
-; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3
 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm0
 ; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm1
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,6,12,18,24,30,0,0]
-; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10]
-; AVX512-FCP-NEXT: vpermi2d %ymm4, %ymm5, %ymm6
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,7,13,19,25,31,0,0]
-; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11]
-; AVX512-FCP-NEXT: vpermi2d %ymm4, %ymm5, %ymm7
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,6,12]
-; AVX512-FCP-NEXT: vpermi2d
%ymm1, %ymm0, %ymm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [2,8,14,20,26,0,0,0] -; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,1,7,13] -; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] -; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] +; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm7 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0] +; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] +; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm8 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] +; AVX512-FCP-NEXT: vpermd %zmm6, %zmm2, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0] +; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] +; AVX512-FCP-NEXT: vpermd %zmm6, %zmm3, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0] +; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] -; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14] -; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm8 +; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] +; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] -; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] -; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 -; AVX512-FCP-NEXT: vmovdqa %ymm6, (%rsi) -; AVX512-FCP-NEXT: vmovdqa %ymm7, (%rdx) -; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rcx) -; AVX512-FCP-NEXT: vmovdqa %ymm5, (%r8) -; AVX512-FCP-NEXT: vmovdqa %ymm8, (%r9) -; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rax) +; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] +; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm4 +; AVX512-FCP-NEXT: vmovdqa %ymm7, (%rsi) +; AVX512-FCP-NEXT: vmovdqa %ymm8, (%rdx) +; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rcx) +; AVX512-FCP-NEXT: vmovdqa %ymm3, (%r8) +; AVX512-FCP-NEXT: vmovdqa %ymm6, (%r9) +; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i32_stride6_vf8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,6,12,18,24,30,0,0] -; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10] -; AVX512DQ-NEXT: vpermi2d %ymm4, %ymm5, %ymm6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,7,13,19,25,31,0,0] -; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11] -; AVX512DQ-NEXT: vpermi2d %ymm4, %ymm5, %ymm7 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,6,12] -; AVX512DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [2,8,14,20,26,0,0,0] -; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,1,7,13] -; AVX512DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] -; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] +; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm3, %ymm7 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0] +; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] +; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm3, %ymm8 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] +; AVX512DQ-NEXT: vpermd %zmm6, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0] +; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] +; AVX512DQ-NEXT: vpermd %zmm6, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0] +; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] -; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14] -; AVX512DQ-NEXT: vpermi2d %ymm0, %ymm1, %ymm8 +; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] +; AVX512DQ-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] -; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] -; AVX512DQ-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 -; AVX512DQ-NEXT: vmovdqa %ymm6, (%rsi) -; AVX512DQ-NEXT: vmovdqa %ymm7, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm4, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm5, (%r8) -; AVX512DQ-NEXT: vmovdqa %ymm8, (%r9) -; AVX512DQ-NEXT: vmovdqa %ymm2, (%rax) +; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] +; AVX512DQ-NEXT: vpermi2d %ymm0, %ymm1, %ymm4 +; AVX512DQ-NEXT: vmovdqa %ymm7, (%rsi) +; AVX512DQ-NEXT: vmovdqa %ymm8, (%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm2, (%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm3, (%r8) +; AVX512DQ-NEXT: vmovdqa 
%ymm6, (%r9) +; AVX512DQ-NEXT: vmovdqa %ymm4, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i32_stride6_vf8: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,6,12,18,24,30,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm4, %ymm5, %ymm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,7,13,19,25,31,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm4, %ymm5, %ymm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,6,12] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [2,8,14,20,26,0,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,1,7,13] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] +; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm7 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] +; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm8 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] +; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm2, %zmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] +; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm3, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm8 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] +; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd 
{{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, (%r9) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rax) +; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] +; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i32_stride6_vf8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,6,12,18,24,30,0,0] -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10] -; AVX512BW-NEXT: vpermi2d %ymm4, %ymm5, %ymm6 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,7,13,19,25,31,0,0] -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11] -; AVX512BW-NEXT: vpermi2d %ymm4, %ymm5, %ymm7 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,6,12] -; AVX512BW-NEXT: vpermi2d %ymm1, %ymm0, %ymm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [2,8,14,20,26,0,0,0] -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,1,7,13] -; AVX512BW-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] +; AVX512BW-NEXT: vpermi2d %ymm2, %ymm3, %ymm7 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0] +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] +; AVX512BW-NEXT: vpermi2d %ymm2, %ymm3, %ymm8 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] +; AVX512BW-NEXT: vpermd %zmm6, %zmm2, %zmm2 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0] +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] +; AVX512BW-NEXT: vpermd %zmm6, %zmm3, %zmm3 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0] +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: 
vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14] -; AVX512BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm8 +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] +; AVX512BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] -; AVX512BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 -; AVX512BW-NEXT: vmovdqa %ymm6, (%rsi) -; AVX512BW-NEXT: vmovdqa %ymm7, (%rdx) -; AVX512BW-NEXT: vmovdqa %ymm4, (%rcx) -; AVX512BW-NEXT: vmovdqa %ymm5, (%r8) -; AVX512BW-NEXT: vmovdqa %ymm8, (%r9) -; AVX512BW-NEXT: vmovdqa %ymm2, (%rax) +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] +; AVX512BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm4 +; AVX512BW-NEXT: vmovdqa %ymm7, (%rsi) +; AVX512BW-NEXT: vmovdqa %ymm8, (%rdx) +; AVX512BW-NEXT: vmovdqa %ymm2, (%rcx) +; AVX512BW-NEXT: vmovdqa %ymm3, (%r8) +; AVX512BW-NEXT: vmovdqa %ymm6, (%r9) +; AVX512BW-NEXT: vmovdqa %ymm4, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i32_stride6_vf8: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,6,12,18,24,30,0,0] -; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10] -; AVX512BW-FCP-NEXT: vpermi2d %ymm4, %ymm5, %ymm6 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,7,13,19,25,31,0,0] -; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11] -; AVX512BW-FCP-NEXT: vpermi2d %ymm4, %ymm5, %ymm7 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,6,12] -; AVX512BW-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [2,8,14,20,26,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,1,7,13] -; AVX512BW-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] +; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm7 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0] +; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] +; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm8 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] +; AVX512BW-FCP-NEXT: vpermd 
%zmm6, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] +; AVX512BW-FCP-NEXT: vpermd %zmm6, %zmm3, %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14] -; AVX512BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm8 +; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] +; AVX512BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] -; AVX512BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 -; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%r9) -; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rax) +; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] +; AVX512BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm4 +; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i32_stride6_vf8: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,6,12,18,24,30,0,0] -; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10] -; AVX512DQ-BW-NEXT: vpermi2d %ymm4, %ymm5, %ymm6 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,7,13,19,25,31,0,0] -; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11] -; AVX512DQ-BW-NEXT: vpermi2d %ymm4, %ymm5, %ymm7 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,6,12] -; AVX512DQ-BW-NEXT: vpermi2d %ymm1, %ymm0, %ymm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [2,8,14,20,26,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,1,7,13] -; AVX512DQ-BW-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; 
AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] +; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm3, %ymm7 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0] +; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] +; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm3, %ymm8 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] +; AVX512DQ-BW-NEXT: vpermd %zmm6, %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] +; AVX512DQ-BW-NEXT: vpermd %zmm6, %zmm3, %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14] -; AVX512DQ-BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm8 +; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] +; AVX512DQ-BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] -; AVX512DQ-BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 -; AVX512DQ-BW-NEXT: vmovdqa %ymm6, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa %ymm7, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa %ymm4, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa %ymm5, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa %ymm8, (%r9) -; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%rax) +; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] +; AVX512DQ-BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm4 +; AVX512DQ-BW-NEXT: vmovdqa %ymm7, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqa %ymm8, (%rdx) +; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa %ymm3, (%r8) +; AVX512DQ-BW-NEXT: vmovdqa %ymm6, (%r9) +; AVX512DQ-BW-NEXT: vmovdqa %ymm4, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i32_stride6_vf8: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,6,12,18,24,30,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm4, %ymm5, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,7,13,19,25,31,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11] -; AVX512DQ-BW-FCP-NEXT: 
vpermi2d %ymm4, %ymm5, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,6,12] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [2,8,14,20,26,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,1,7,13] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] +; AVX512DQ-BW-FCP-NEXT: vpermd %zmm6, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] +; AVX512DQ-BW-FCP-NEXT: vpermd %zmm6, %zmm3, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm8 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rax) +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%r9) +; AVX512DQ-BW-FCP-NEXT: 
vmovdqa %ymm4, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <48 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll index d806253ef23a0..694f2bc53c515 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll @@ -204,22 +204,22 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512-NEXT: vpermi2d %ymm5, %ymm6, %ymm1 -; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,0,2,3] -; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512-NEXT: vpermps (%rdi), %zmm1, %zmm1 +; AVX512-NEXT: vmovaps (%rdi), %ymm5 +; AVX512-NEXT: vmovaps 32(%rdi), %ymm6 +; AVX512-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX512-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX512-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3] +; AVX512-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] +; AVX512-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] +; AVX512-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX512-NEXT: vmovq %xmm2, (%rsi) ; AVX512-NEXT: vmovq %xmm3, (%rdx) ; AVX512-NEXT: vmovq %xmm4, (%rcx) ; AVX512-NEXT: vmovq %xmm0, (%r8) -; AVX512-NEXT: vmovq %xmm1, (%r9) -; AVX512-NEXT: vmovq %xmm7, (%r10) -; AVX512-NEXT: vmovq %xmm5, (%rax) +; AVX512-NEXT: vmovlps %xmm1, (%r9) +; AVX512-NEXT: vmovlps %xmm7, (%r10) +; AVX512-NEXT: vmovlps %xmm5, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -227,30 +227,31 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4] -; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 -; AVX512-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [7,2,0,0] -; AVX512-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,11,0,0] -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [13,4,6,7] -; AVX512-FCP-NEXT: vpermi2d %ymm6, %ymm1, %ymm7 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,6,7] -; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm8 -; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512-FCP-NEXT: vmovq %xmm0, (%r9) +; AVX512-FCP-NEXT: vmovaps (%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512-FCP-NEXT: vmovdqa 
32(%rdi), %xmm2 +; AVX512-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3 +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4] +; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 +; AVX512-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0] +; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] +; AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7] +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512-FCP-NEXT: vpermt2d (%rdi), %ymm2, %ymm7 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,13,6,7] +; AVX512-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm0 +; AVX512-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512-FCP-NEXT: vmovq %xmm4, (%rdx) +; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx) +; AVX512-FCP-NEXT: vmovq %xmm6, (%r8) +; AVX512-FCP-NEXT: vmovlps %xmm1, (%r9) ; AVX512-FCP-NEXT: vmovq %xmm7, (%r10) -; AVX512-FCP-NEXT: vmovq %xmm8, (%rax) +; AVX512-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -269,22 +270,22 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512DQ-NEXT: vpermi2d %ymm5, %ymm6, %ymm1 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,0,2,3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512DQ-NEXT: vpermps (%rdi), %zmm1, %zmm1 +; AVX512DQ-NEXT: vmovaps (%rdi), %ymm5 +; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm6 +; AVX512DQ-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX512DQ-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX512DQ-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3] +; AVX512DQ-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] +; AVX512DQ-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] +; AVX512DQ-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) ; AVX512DQ-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-NEXT: vmovq %xmm4, (%rcx) ; AVX512DQ-NEXT: vmovq %xmm0, (%r8) -; AVX512DQ-NEXT: vmovq %xmm1, (%r9) -; AVX512DQ-NEXT: vmovq %xmm7, (%r10) -; AVX512DQ-NEXT: vmovq %xmm5, (%rax) +; AVX512DQ-NEXT: vmovlps %xmm1, (%r9) +; AVX512DQ-NEXT: vmovlps %xmm7, (%r10) +; AVX512DQ-NEXT: vmovlps %xmm5, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -292,30 +293,31 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4] -; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 -; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = 
xmm4[0],xmm1[1],xmm4[2,3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [7,2,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,11,0,0] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [13,4,6,7] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm6, %ymm1, %ymm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,6,7] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm8 -; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r9) +; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512DQ-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3 +; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4] +; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 +; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] +; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512DQ-FCP-NEXT: vpermt2d (%rdi), %ymm2, %ymm7 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,13,6,7] +; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rdx) +; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx) +; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8) +; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%r9) ; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%r10) -; AVX512DQ-FCP-NEXT: vmovq %xmm8, (%rax) +; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -334,22 +336,22 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512BW-NEXT: vpermi2d %ymm5, %ymm6, %ymm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,0,2,3] -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512BW-NEXT: vpermps (%rdi), %zmm1, %zmm1 +; AVX512BW-NEXT: vmovaps (%rdi), %ymm5 +; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm6 +; AVX512BW-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX512BW-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX512BW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3] +; AVX512BW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] +; AVX512BW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] +; AVX512BW-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX512BW-NEXT: vmovq %xmm2, (%rsi) ; AVX512BW-NEXT: vmovq %xmm3, (%rdx) ; AVX512BW-NEXT: vmovq %xmm4, 
(%rcx) ; AVX512BW-NEXT: vmovq %xmm0, (%r8) -; AVX512BW-NEXT: vmovq %xmm1, (%r9) -; AVX512BW-NEXT: vmovq %xmm7, (%r10) -; AVX512BW-NEXT: vmovq %xmm5, (%rax) +; AVX512BW-NEXT: vmovlps %xmm1, (%r9) +; AVX512BW-NEXT: vmovlps %xmm7, (%r10) +; AVX512BW-NEXT: vmovlps %xmm5, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -357,30 +359,31 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512BW-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2 -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4] -; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 -; AVX512BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [7,2,0,0] -; AVX512BW-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,11,0,0] -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512BW-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [13,4,6,7] -; AVX512BW-FCP-NEXT: vpermi2d %ymm6, %ymm1, %ymm7 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,6,7] -; AVX512BW-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm8 -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9) +; AVX512BW-FCP-NEXT: vmovaps (%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512BW-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3 +; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4] +; AVX512BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 +; AVX512BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0] +; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] +; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7] +; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512BW-FCP-NEXT: vpermt2d (%rdi), %ymm2, %ymm7 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,13,6,7] +; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rdx) +; AVX512BW-FCP-NEXT: vmovq %xmm5, (%rcx) +; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r8) +; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%r9) ; AVX512BW-FCP-NEXT: vmovq %xmm7, (%r10) -; AVX512BW-FCP-NEXT: vmovq %xmm8, (%rax) +; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -399,22 +402,22 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512DQ-BW-NEXT: vpermi2d %ymm5, %ymm6, %ymm1 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; 
AVX512DQ-BW-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,0,2,3] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512DQ-BW-NEXT: vpermps (%rdi), %zmm1, %zmm1 +; AVX512DQ-BW-NEXT: vmovaps (%rdi), %ymm5 +; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm6 +; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3] +; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] +; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] +; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx) ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm1, (%r9) -; AVX512DQ-BW-NEXT: vmovq %xmm7, (%r10) -; AVX512DQ-BW-NEXT: vmovq %xmm5, (%rax) +; AVX512DQ-BW-NEXT: vmovlps %xmm1, (%r9) +; AVX512DQ-BW-NEXT: vmovlps %xmm7, (%r10) +; AVX512DQ-BW-NEXT: vmovlps %xmm5, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -422,30 +425,31 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [7,2,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,11,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [13,4,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm6, %ymm1, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 +; 
AVX512DQ-BW-FCP-NEXT: vpermt2d (%rdi), %ymm2, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,13,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%r9) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%r10) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm8, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <14 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll index f0c95f4fa9ef8..8d7f8d1db8522 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll @@ -222,24 +222,25 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] ; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,13,5,5] -; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm4, %ymm6 -; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-FCP-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm4 +; AVX512-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] +; AVX512-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,5,13,5,5] +; AVX512-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6 +; AVX512-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX512-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] +; AVX512-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX512-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX512-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) ; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512-FCP-NEXT: vpextrq $1, %xmm0, (%r8) -; AVX512-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512-FCP-NEXT: vmovq %xmm6, (%r11) -; AVX512-FCP-NEXT: vmovq %xmm4, (%r10) -; AVX512-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX512-FCP-NEXT: vmovlps %xmm5, (%r9) +; AVX512-FCP-NEXT: vmovlps %xmm6, (%r11) +; AVX512-FCP-NEXT: vmovlps %xmm4, (%r10) +; AVX512-FCP-NEXT: vmovlps %xmm1, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -287,24 +288,25 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm4 -; 
AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,13,5,5] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm4, %ymm6 -; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-FCP-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm4 +; AVX512DQ-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] +; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,5,13,5,5] +; AVX512DQ-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6 +; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX512DQ-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] +; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) ; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-FCP-NEXT: vpextrq $1, %xmm0, (%r8) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r11) -; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%r10) -; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX512DQ-FCP-NEXT: vmovlps %xmm5, (%r9) +; AVX512DQ-FCP-NEXT: vmovlps %xmm6, (%r11) +; AVX512DQ-FCP-NEXT: vmovlps %xmm4, (%r10) +; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -352,24 +354,25 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,13,5,5] -; AVX512BW-FCP-NEXT: vpermi2d %ymm1, %ymm4, %ymm6 -; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512BW-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-FCP-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm4 +; AVX512BW-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] +; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,5,13,5,5] +; AVX512BW-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6 +; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX512BW-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] +; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX512BW-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovq 
%xmm3, (%rdx) ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r11) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r10) -; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX512BW-FCP-NEXT: vmovlps %xmm5, (%r9) +; AVX512BW-FCP-NEXT: vmovlps %xmm6, (%r11) +; AVX512BW-FCP-NEXT: vmovlps %xmm4, (%r10) +; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -417,24 +420,25 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,13,5,5] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm1, %ymm4, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] +; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,5,13,5,5] +; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] +; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r11) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r10) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm5, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm6, (%r11) +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm4, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <16 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll index 2381df6d73289..aa7d8ceb14950 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll @@ -245,13 +245,12 @@ define void @load_i64_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512-FCP-LABEL: load_i64_stride2_vf4: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,2,4,6] -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512-FCP-NEXT: vpermi2q %ymm2, %ymm1, 
%ymm0 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,5,7] -; AVX512-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 -; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rsi) -; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rdx) +; AVX512-FCP-NEXT: vmovaps (%rdi), %zmm1 +; AVX512-FCP-NEXT: vpermpd %zmm1, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7] +; AVX512-FCP-NEXT: vpermpd %zmm1, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovaps %ymm0, (%rsi) +; AVX512-FCP-NEXT: vmovaps %ymm1, (%rdx) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -271,13 +270,12 @@ define void @load_i64_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ-FCP-LABEL: load_i64_stride2_vf4: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,2,4,6] -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,5,7] -; AVX512DQ-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rdx) +; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vpermpd %zmm1, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7] +; AVX512DQ-FCP-NEXT: vpermpd %zmm1, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovaps %ymm0, (%rsi) +; AVX512DQ-FCP-NEXT: vmovaps %ymm1, (%rdx) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -297,13 +295,12 @@ define void @load_i64_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512BW-FCP-LABEL: load_i64_stride2_vf4: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,2,4,6] -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,5,7] -; AVX512BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 -; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rdx) +; AVX512BW-FCP-NEXT: vmovaps (%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vpermpd %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7] +; AVX512BW-FCP-NEXT: vpermpd %zmm1, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vmovaps %ymm0, (%rsi) +; AVX512BW-FCP-NEXT: vmovaps %ymm1, (%rdx) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -323,13 +320,12 @@ define void @load_i64_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ-BW-FCP-LABEL: load_i64_stride2_vf4: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,2,4,6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,5,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermpd %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7] +; AVX512DQ-BW-FCP-NEXT: vpermpd %zmm1, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm0, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm1, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <8 x i64>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll index 
f82bcd1ce3e1e..7d3209397c3df 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll @@ -611,32 +611,31 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [10,0,6,0] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,1,2,4] -; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm7 -; AVX512-FCP-NEXT: vpermi2q %ymm7, %ymm5, %ymm6 -; AVX512-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm5 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [11,1,7,0] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,0,0,6] -; AVX512-FCP-NEXT: vpermi2q %ymm7, %ymm4, %ymm8 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [4,10] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,1,2,4] +; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm6 +; AVX512-FCP-NEXT: vpermi2q %ymm6, %ymm4, %ymm5 +; AVX512-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm4 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,4,0,6] +; AVX512-FCP-NEXT: vpermq 128(%rdi), %zmm7, %zmm7 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512-FCP-NEXT: vpbroadcastq 136(%rdi), %ymm8 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512-FCP-NEXT: vmovdqa %ymm6, (%rcx) -; AVX512-FCP-NEXT: vmovdqa %ymm5, (%r8) -; AVX512-FCP-NEXT: vmovdqa %ymm4, (%r9) +; AVX512-FCP-NEXT: vmovdqa %ymm5, (%rcx) +; AVX512-FCP-NEXT: vmovdqa %ymm4, (%r8) +; AVX512-FCP-NEXT: vmovdqa %ymm7, (%r9) ; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq @@ -694,32 +693,31 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [10,0,6,0] -; 
AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,1,2,4] -; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm7 -; AVX512DQ-FCP-NEXT: vpermi2q %ymm7, %ymm5, %ymm6 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm5 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [11,1,7,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,0,0,6] -; AVX512DQ-FCP-NEXT: vpermi2q %ymm7, %ymm4, %ymm8 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [4,10] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,1,2,4] +; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm6 +; AVX512DQ-FCP-NEXT: vpermi2q %ymm6, %ymm4, %ymm5 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm4 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,4,0,6] +; AVX512DQ-FCP-NEXT: vpermq 128(%rdi), %zmm7, %zmm7 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpbroadcastq 136(%rdi), %ymm8 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%r9) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -777,32 +775,31 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [10,0,6,0] -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,1,2,4] -; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm7 -; AVX512BW-FCP-NEXT: vpermi2q %ymm7, %ymm5, %ymm6 -; AVX512BW-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm5 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [11,1,7,0] -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,0,0,6] -; AVX512BW-FCP-NEXT: vpermi2q %ymm7, 
%ymm4, %ymm8 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [4,10] -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,1,2,4] +; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm6 +; AVX512BW-FCP-NEXT: vpermi2q %ymm6, %ymm4, %ymm5 +; AVX512BW-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm4 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,4,0,6] +; AVX512BW-FCP-NEXT: vpermq 128(%rdi), %zmm7, %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512BW-FCP-NEXT: vpbroadcastq 136(%rdi), %ymm8 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%r9) ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -860,32 +857,31 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [10,0,6,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,1,2,4] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm7, %ymm5, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [11,1,7,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,0,0,6] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm7, %ymm4, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [4,10] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,1,2,4] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), 
%ymm6 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm6, %ymm4, %ymm5 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,4,0,6] +; AVX512DQ-BW-FCP-NEXT: vpermq 128(%rdi), %zmm7, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 136(%rdi), %ymm8 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll index 4e5501b1041d3..cc3e5f3d1d82e 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll @@ -709,28 +709,28 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vpbroadcastq 176(%rdi), %ymm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,0,7] -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512-FCP-NEXT: vpermi2q 160(%rdi), %ymm6, %ymm2 -; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm7 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm7 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %xmm8 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm9[2,3] +; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,0,7] +; AVX512-FCP-NEXT: vpermq %zmm3, %zmm6, %zmm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %xmm7 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm8 = 
mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm6[2,3],ymm8[2,3] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11] ; AVX512-FCP-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] @@ -739,9 +739,9 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rcx) -; AVX512-FCP-NEXT: vmovdqa %ymm7, (%r8) -; AVX512-FCP-NEXT: vmovdqa %ymm8, (%r9) -; AVX512-FCP-NEXT: vmovdqa %ymm6, (%r10) +; AVX512-FCP-NEXT: vmovdqa %ymm6, (%r8) +; AVX512-FCP-NEXT: vmovdqa %ymm7, (%r9) +; AVX512-FCP-NEXT: vmovdqa %ymm8, (%r10) ; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq @@ -814,28 +814,28 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vpbroadcastq 176(%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,0,7] -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512DQ-FCP-NEXT: vpermi2q 160(%rdi), %ymm6, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm7 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm7 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %xmm8 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm9[2,3] +; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,0,7] +; AVX512DQ-FCP-NEXT: vpermq %zmm3, %zmm6, %zmm6 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm6 = 
mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %xmm7 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm6[2,3],ymm8[2,3] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11] ; AVX512DQ-FCP-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] @@ -844,9 +844,9 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, (%r9) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%r10) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, (%r10) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -919,28 +919,28 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vpbroadcastq 176(%rdi), %ymm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,0,7] -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512BW-FCP-NEXT: vpermi2q 160(%rdi), %ymm6, %ymm2 -; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm7 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm7 -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm8 -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm9[2,3] +; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,0,7] +; AVX512BW-FCP-NEXT: vpermq 
%zmm3, %zmm6, %zmm6 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm7 +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm6[2,3],ymm8[2,3] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11] ; AVX512BW-FCP-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] @@ -949,9 +949,9 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%r9) -; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%r10) +; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%r10) ; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -1024,28 +1024,28 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 176(%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,0,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpermi2q 160(%rdi), %ymm6, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vperm2i128 
{{.*#+}} ymm7 = ymm7[2,3],ymm9[2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,0,7] +; AVX512DQ-BW-FCP-NEXT: vpermq %zmm3, %zmm6, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm6[2,3],ymm8[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] @@ -1054,9 +1054,9 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%r10) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll index 181f5651784d8..acedcf4263906 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll @@ -1337,10 +1337,9 @@ define void @vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8(ptr %in. 
; ; AVX512BW-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] -; AVX512BW-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] +; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1789,10 +1788,9 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512F-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7] -; AVX512F-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,5,0,7] +; AVX512F-FAST-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq @@ -1808,10 +1806,9 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512DQ-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7] -; AVX512DQ-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,5,0,7] +; AVX512DQ-FAST-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq @@ -1827,10 +1824,9 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7] -; AVX512BW-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 -; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,5,0,7] +; AVX512BW-FAST-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq From 65566281edac2b5f75a99302e7200c957db90143 Mon Sep 17 00:00:00 2001 From: Ryan Mansfield Date: Mon, 13 Jan 2025 09:15:24 -0500 Subject: [PATCH 280/408] [llvm-objdump] Remove leading whitespace for PT_GNU_PROPERTY. (#121591) This fixes the misaligned display of addresses for this p_type. 
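For context, a hedged note (not part of the patch): the diff below shows that the string literal printed for `PROPERTY` carried a leading space, making it one character wider than the other padded segment names, which pushed that row's `off`/`vaddr` fields one column to the right. A minimal standalone sketch of the fixed-width row printing involved, with hypothetical names (this is illustrative, not the llvm-objdump source):

```
#include <iomanip>
#include <iostream>

// Pad each segment-type name to the same column width so the "off" fields
// of consecutive rows start at the same character position.
static void printRow(const char *Type, unsigned long long Off) {
  std::cout << std::left << std::setw(9) << Type << "off    0x" << std::right
            << std::hex << std::setfill('0') << std::setw(16) << Off << '\n';
  std::cout << std::setfill(' ') << std::dec; // restore stream defaults
}

int main() {
  printRow("STACK", 0x0);      // short names get trailing padding
  printRow("PROPERTY", 0x358); // widest name: a leading space would misalign it
}
```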
Previous: ``` STACK off 0x0000000000000000 vaddr 0x0000000000000000 paddr 0x0000000000000000 align 2**64 filesz 0x0000000000000000 memsz 0x0000000000000000 flags rw- PROPERTY off 0x0000000000000358 vaddr 0x0000000000000358 paddr 0x0000000000000358 align 2**3 filesz 0x0000000000000020 memsz 0x0000000000000020 flags r-- NOTE off 0x0000000000000334 vaddr 0x0000000000000334 paddr 0x0000000000000334 align 2**2 filesz 0x0000000000000020 memsz 0x0000000000000020 flags r-- ``` After: ``` STACK off 0x0000000000000000 vaddr 0x0000000000000000 paddr 0x0000000000000000 align 2**64 filesz 0x0000000000000000 memsz 0x0000000000000000 flags rw- PROPERTY off 0x0000000000000358 vaddr 0x0000000000000358 paddr 0x0000000000000358 align 2**3 filesz 0x0000000000000020 memsz 0x0000000000000020 flags r-- NOTE off 0x0000000000000334 vaddr 0x0000000000000334 paddr 0x0000000000000334 align 2**2 filesz 0x0000000000000020 memsz 0x0000000000000020 flags r-- ``` --- llvm/test/tools/llvm-objdump/ELF/pt-gnu-property.test | 2 +- llvm/tools/llvm-objdump/ELFDump.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/tools/llvm-objdump/ELF/pt-gnu-property.test b/llvm/test/tools/llvm-objdump/ELF/pt-gnu-property.test index 246337866a777..27f1252b3d5d1 100644 --- a/llvm/test/tools/llvm-objdump/ELF/pt-gnu-property.test +++ b/llvm/test/tools/llvm-objdump/ELF/pt-gnu-property.test @@ -2,7 +2,7 @@ # RUN: llvm-objdump -p %t | FileCheck %s # CHECK: Program Header: -# CHECK-NEXT: {{ }}PROPERTY{{ }} +# CHECK-NEXT: {{^}}PROPERTY{{ }} --- !ELF FileHeader: diff --git a/llvm/tools/llvm-objdump/ELFDump.cpp b/llvm/tools/llvm-objdump/ELFDump.cpp index d78cf485587e1..e9e5b059f1786 100644 --- a/llvm/tools/llvm-objdump/ELFDump.cpp +++ b/llvm/tools/llvm-objdump/ELFDump.cpp @@ -269,7 +269,7 @@ template void ELFDumper::printProgramHeaders() { outs() << " RELRO "; break; case ELF::PT_GNU_PROPERTY: - outs() << " PROPERTY "; + outs() << "PROPERTY "; break; case ELF::PT_GNU_STACK: outs() << " STACK "; From b5ba4f06db2e159885fc4b6e7709274a3910b8b5 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Mon, 13 Jan 2025 09:16:23 -0500 Subject: [PATCH 281/408] [libc++] Redefine Fuchsia locale base support on top of the new API (#122489) This follows the same path we've been doing for all platforms so far, moving away from the old definition of the locale base API. 
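As background (an editorial sketch, not part of the patch): under the new API each platform supplies a support header defining a `__locale_guard` RAII type plus thin wrappers in namespace `__locale`, as the Fuchsia header added below does. A minimal sketch of that guard pattern on POSIX systems, using hypothetical names:

```
#include <cctype>
#include <locale.h> // uselocale() and locale_t (POSIX)

// RAII guard: make `loc` the calling thread's active locale, restoring the
// previous locale when the guard is destroyed.
struct LocaleGuard {
  explicit LocaleGuard(locale_t loc) : old_(uselocale(loc)) {}
  ~LocaleGuard() {
    if (old_)
      uselocale(old_);
  }
  LocaleGuard(const LocaleGuard &) = delete;
  LocaleGuard &operator=(const LocaleGuard &) = delete;
  locale_t old_;
};

// Wrapper in the style of the new API: run a plain C function while `loc`
// is temporarily installed for the calling thread.
int islower_in(int ch, locale_t loc) {
  LocaleGuard guard(loc);
  return std::islower(ch);
}
```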
Co-authored-by: Daniel Thornburgh --- libcxx/include/CMakeLists.txt | 4 +- libcxx/include/__locale_dir/locale_base_api.h | 4 +- .../__locale_dir/locale_base_api/fuchsia.h | 18 --- libcxx/include/__locale_dir/support/fuchsia.h | 143 ++++++++++++++++++ .../support/no_locale/characters.h | 98 ++++++++++++ .../__locale_dir/support/no_locale/strtonum.h | 49 ++++++ libcxx/include/module.modulemap | 4 +- 7 files changed, 298 insertions(+), 22 deletions(-) delete mode 100644 libcxx/include/__locale_dir/locale_base_api/fuchsia.h create mode 100644 libcxx/include/__locale_dir/support/fuchsia.h create mode 100644 libcxx/include/__locale_dir/support/no_locale/characters.h create mode 100644 libcxx/include/__locale_dir/support/no_locale/strtonum.h diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index e152383a329fe..f3313bf53460a 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -499,7 +499,6 @@ set(files __locale_dir/locale_base_api.h __locale_dir/locale_base_api/android.h __locale_dir/locale_base_api/bsd_locale_fallbacks.h - __locale_dir/locale_base_api/fuchsia.h __locale_dir/locale_base_api/ibm.h __locale_dir/locale_base_api/musl.h __locale_dir/locale_base_api/openbsd.h @@ -507,6 +506,9 @@ set(files __locale_dir/support/apple.h __locale_dir/support/bsd_like.h __locale_dir/support/freebsd.h + __locale_dir/support/fuchsia.h + __locale_dir/support/no_locale/characters.h + __locale_dir/support/no_locale/strtonum.h __locale_dir/support/windows.h __math/abs.h __math/copysign.h diff --git a/libcxx/include/__locale_dir/locale_base_api.h b/libcxx/include/__locale_dir/locale_base_api.h index bb0da889f4c84..b112a4aef7765 100644 --- a/libcxx/include/__locale_dir/locale_base_api.h +++ b/libcxx/include/__locale_dir/locale_base_api.h @@ -99,6 +99,8 @@ # include <__locale_dir/support/freebsd.h> #elif defined(_LIBCPP_MSVCRT_LIKE) # include <__locale_dir/support/windows.h> +#elif defined(__Fuchsia__) +# include <__locale_dir/support/fuchsia.h> #else // TODO: This is a temporary definition to bridge between the old way we defined the locale base API @@ -111,8 +113,6 @@ # include <__locale_dir/locale_base_api/android.h> # elif defined(__OpenBSD__) # include <__locale_dir/locale_base_api/openbsd.h> -# elif defined(__Fuchsia__) -# include <__locale_dir/locale_base_api/fuchsia.h> # elif defined(__wasi__) || _LIBCPP_HAS_MUSL_LIBC # include <__locale_dir/locale_base_api/musl.h> # endif diff --git a/libcxx/include/__locale_dir/locale_base_api/fuchsia.h b/libcxx/include/__locale_dir/locale_base_api/fuchsia.h deleted file mode 100644 index f6ef454ba7ada..0000000000000 --- a/libcxx/include/__locale_dir/locale_base_api/fuchsia.h +++ /dev/null @@ -1,18 +0,0 @@ -// -*- C++ -*- -//===-----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_FUCHSIA_H -#define _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_FUCHSIA_H - -#include <__support/xlocale/__posix_l_fallback.h> -#include <__support/xlocale/__strtonum_fallback.h> -#include -#include - -#endif // _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_FUCHSIA_H diff --git a/libcxx/include/__locale_dir/support/fuchsia.h b/libcxx/include/__locale_dir/support/fuchsia.h new file mode 100644 index 0000000000000..4a54896c8e268 --- /dev/null +++ b/libcxx/include/__locale_dir/support/fuchsia.h @@ -0,0 +1,143 @@ +//===-----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___LOCALE_DIR_SUPPORT_FUCHSIA_H +#define _LIBCPP___LOCALE_DIR_SUPPORT_FUCHSIA_H + +#include <__config> +#include <__utility/forward.h> +#include // uselocale & friends +#include +#include +#include + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_BEGIN_NAMESPACE_STD +namespace __locale { + +struct __locale_guard { + _LIBCPP_HIDE_FROM_ABI __locale_guard(locale_t& __loc) : __old_loc_(::uselocale(__loc)) {} + + _LIBCPP_HIDE_FROM_ABI ~__locale_guard() { + if (__old_loc_) + ::uselocale(__old_loc_); + } + + locale_t __old_loc_; + + __locale_guard(__locale_guard const&) = delete; + __locale_guard& operator=(__locale_guard const&) = delete; +}; + +// +// Locale management +// +using __locale_t = locale_t; + +inline _LIBCPP_HIDE_FROM_ABI __locale_t __newlocale(int __category_mask, const char* __name, __locale_t __loc) { + return ::newlocale(__category_mask, __name, __loc); +} + +inline _LIBCPP_HIDE_FROM_ABI void __freelocale(__locale_t __loc) { ::freelocale(__loc); } + +inline _LIBCPP_HIDE_FROM_ABI lconv* __localeconv(__locale_t& __loc) { + __locale_guard __current(__loc); + return std::localeconv(); +} + +// +// Other functions +// +inline _LIBCPP_HIDE_FROM_ABI decltype(MB_CUR_MAX) __mb_len_max(__locale_t __loc) { + __locale_guard __current(__loc); + return MB_CUR_MAX; +} +#if _LIBCPP_HAS_WIDE_CHARACTERS +inline _LIBCPP_HIDE_FROM_ABI wint_t __btowc(int __ch, __locale_t __loc) { + __locale_guard __current(__loc); + return std::btowc(__ch); +} +inline _LIBCPP_HIDE_FROM_ABI int __wctob(wint_t __ch, __locale_t __loc) { + __locale_guard __current(__loc); + return std::wctob(__ch); +} +inline _LIBCPP_HIDE_FROM_ABI size_t +__wcsnrtombs(char* __dest, const wchar_t** __src, size_t __nwc, size_t __len, mbstate_t* __ps, __locale_t __loc) { + __locale_guard __current(__loc); + return ::wcsnrtombs(__dest, __src, __nwc, __len, __ps); // non-standard +} +inline _LIBCPP_HIDE_FROM_ABI size_t __wcrtomb(char* __s, wchar_t __ch, mbstate_t* __ps, __locale_t __loc) { + __locale_guard __current(__loc); + return std::wcrtomb(__s, __ch, __ps); +} +inline _LIBCPP_HIDE_FROM_ABI size_t +__mbsnrtowcs(wchar_t* __dest, const char** __src, size_t __nms, size_t __len, mbstate_t* __ps, __locale_t __loc) { + __locale_guard __current(__loc); + return ::mbsnrtowcs(__dest, __src, __nms, __len, __ps); // non-standard +} +inline _LIBCPP_HIDE_FROM_ABI size_t +__mbrtowc(wchar_t* __pwc, const char* __s, size_t __n, 
mbstate_t* __ps, __locale_t __loc) { + __locale_guard __current(__loc); + return std::mbrtowc(__pwc, __s, __n, __ps); +} +inline _LIBCPP_HIDE_FROM_ABI int __mbtowc(wchar_t* __pwc, const char* __pmb, size_t __max, __locale_t __loc) { + __locale_guard __current(__loc); + return std::mbtowc(__pwc, __pmb, __max); +} +inline _LIBCPP_HIDE_FROM_ABI size_t __mbrlen(const char* __s, size_t __n, mbstate_t* __ps, __locale_t __loc) { + __locale_guard __current(__loc); + return std::mbrlen(__s, __n, __ps); +} +inline _LIBCPP_HIDE_FROM_ABI size_t +__mbsrtowcs(wchar_t* __dest, const char** __src, size_t __len, mbstate_t* __ps, __locale_t __loc) { + __locale_guard __current(__loc); + return ::mbsrtowcs(__dest, __src, __len, __ps); +} +#endif + +_LIBCPP_DIAGNOSTIC_PUSH +_LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wgcc-compat") +_LIBCPP_GCC_DIAGNOSTIC_IGNORED("-Wformat-nonliteral") // GCC doesn't support [[gnu::format]] on variadic templates +#ifdef _LIBCPP_COMPILER_CLANG_BASED +# define _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(...) _LIBCPP_ATTRIBUTE_FORMAT(__VA_ARGS__) +#else +# define _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(...) /* nothing */ +#endif + +template +_LIBCPP_HIDE_FROM_ABI _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(__printf__, 4, 5) int __snprintf( + char* __s, size_t __n, __locale_t __loc, const char* __format, _Args&&... __args) { + __locale_guard __current(__loc); + return std::snprintf(__s, __n, __format, std::forward<_Args>(__args)...); +} +template +_LIBCPP_HIDE_FROM_ABI _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(__printf__, 3, 4) int __asprintf( + char** __s, __locale_t __loc, const char* __format, _Args&&... __args) { + __locale_guard __current(__loc); + return ::asprintf(__s, __format, std::forward<_Args>(__args)...); // non-standard +} +template +_LIBCPP_HIDE_FROM_ABI _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(__scanf__, 3, 4) int __sscanf( + const char* __s, __locale_t __loc, const char* __format, _Args&&... __args) { + __locale_guard __current(__loc); + return std::sscanf(__s, __format, std::forward<_Args>(__args)...); +} + +_LIBCPP_DIAGNOSTIC_POP +#undef _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT + +} // namespace __locale +_LIBCPP_END_NAMESPACE_STD + +#include <__locale_dir/support/no_locale/characters.h> +#include <__locale_dir/support/no_locale/strtonum.h> + +#endif // _LIBCPP___LOCALE_DIR_SUPPORT_FUCHSIA_H diff --git a/libcxx/include/__locale_dir/support/no_locale/characters.h b/libcxx/include/__locale_dir/support/no_locale/characters.h new file mode 100644 index 0000000000000..20e45fc350e2e --- /dev/null +++ b/libcxx/include/__locale_dir/support/no_locale/characters.h @@ -0,0 +1,98 @@ +//===-----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___LOCALE_DIR_SUPPORT_NO_LOCALE_CHARACTERS_H +#define _LIBCPP___LOCALE_DIR_SUPPORT_NO_LOCALE_CHARACTERS_H + +#include <__config> +#include <__cstddef/size_t.h> +#include +#include +#include +#include +#if _LIBCPP_HAS_WIDE_CHARACTERS +# include +#endif + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_BEGIN_NAMESPACE_STD +namespace __locale { + +// +// Character manipulation functions +// +inline _LIBCPP_HIDE_FROM_ABI int __islower(int __c, __locale_t) { return std::islower(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __isupper(int __c, __locale_t) { return std::isupper(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __isdigit(int __c, __locale_t) { return std::isdigit(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __isxdigit(int __c, __locale_t) { return std::isxdigit(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __toupper(int __c, __locale_t) { return std::toupper(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __tolower(int __c, __locale_t) { return std::tolower(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __strcoll(const char* __s1, const char* __s2, __locale_t) { + return std::strcoll(__s1, __s2); +} + +inline _LIBCPP_HIDE_FROM_ABI size_t __strxfrm(char* __dest, const char* __src, size_t __n, __locale_t) { + return std::strxfrm(__dest, __src, __n); +} + +#if _LIBCPP_HAS_WIDE_CHARACTERS +inline _LIBCPP_HIDE_FROM_ABI int __iswctype(wint_t __c, wctype_t __type, __locale_t) { + return std::iswctype(__c, __type); +} + +inline _LIBCPP_HIDE_FROM_ABI int __iswspace(wint_t __c, __locale_t) { return std::iswspace(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __iswprint(wint_t __c, __locale_t) { return std::iswprint(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __iswcntrl(wint_t __c, __locale_t) { return std::iswcntrl(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __iswupper(wint_t __c, __locale_t) { return std::iswupper(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __iswlower(wint_t __c, __locale_t) { return std::iswlower(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __iswalpha(wint_t __c, __locale_t) { return std::iswalpha(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __iswblank(wint_t __c, __locale_t) { return std::iswblank(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __iswdigit(wint_t __c, __locale_t) { return std::iswdigit(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __iswpunct(wint_t __c, __locale_t) { return std::iswpunct(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __iswxdigit(wint_t __c, __locale_t) { return std::iswxdigit(__c); } + +inline _LIBCPP_HIDE_FROM_ABI wint_t __towupper(wint_t __c, __locale_t) { return std::towupper(__c); } + +inline _LIBCPP_HIDE_FROM_ABI wint_t __towlower(wint_t __c, __locale_t) { return std::towlower(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __wcscoll(const wchar_t* __ws1, const wchar_t* __ws2, __locale_t) { + return std::wcscoll(__ws1, __ws2); +} + +inline _LIBCPP_HIDE_FROM_ABI size_t __wcsxfrm(wchar_t* __dest, const wchar_t* __src, size_t __n, __locale_t) { + return std::wcsxfrm(__dest, __src, __n); +} +#endif // _LIBCPP_HAS_WIDE_CHARACTERS + +inline _LIBCPP_HIDE_FROM_ABI size_t +__strftime(char* __s, size_t __max, const char* __format, const struct tm* __tm, __locale_t) { + return std::strftime(__s, __max, __format, __tm); +} + +} // namespace __locale +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP___LOCALE_DIR_SUPPORT_NO_LOCALE_CHARACTERS_H diff --git 
a/libcxx/include/__locale_dir/support/no_locale/strtonum.h b/libcxx/include/__locale_dir/support/no_locale/strtonum.h new file mode 100644 index 0000000000000..0e7a32993e736 --- /dev/null +++ b/libcxx/include/__locale_dir/support/no_locale/strtonum.h @@ -0,0 +1,49 @@ +//===-----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___LOCALE_DIR_SUPPORT_NO_LOCALE_STRTONUM_H +#define _LIBCPP___LOCALE_DIR_SUPPORT_NO_LOCALE_STRTONUM_H + +#include <__config> +#include <cstdlib> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_BEGIN_NAMESPACE_STD +namespace __locale { + +// +// Strtonum functions +// +inline _LIBCPP_HIDE_FROM_ABI float __strtof(const char* __nptr, char** __endptr, __locale_t) { + return std::strtof(__nptr, __endptr); +} + +inline _LIBCPP_HIDE_FROM_ABI double __strtod(const char* __nptr, char** __endptr, __locale_t) { + return std::strtod(__nptr, __endptr); +} + +inline _LIBCPP_HIDE_FROM_ABI long double __strtold(const char* __nptr, char** __endptr, __locale_t) { + return std::strtold(__nptr, __endptr); +} + +inline _LIBCPP_HIDE_FROM_ABI long long __strtoll(const char* __nptr, char** __endptr, int __base, __locale_t) { + return std::strtoll(__nptr, __endptr, __base); +} + +inline _LIBCPP_HIDE_FROM_ABI unsigned long long +__strtoull(const char* __nptr, char** __endptr, int __base, __locale_t) { + return std::strtoull(__nptr, __endptr, __base); +} + +} // namespace __locale +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP___LOCALE_DIR_SUPPORT_NO_LOCALE_STRTONUM_H diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index e3204820b5c25..69f1b7d094ada 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -1480,13 +1480,15 @@ module std [system] { textual header "__locale_dir/support/apple.h" textual header "__locale_dir/support/bsd_like.h" textual header "__locale_dir/support/freebsd.h" + textual header "__locale_dir/support/fuchsia.h" + textual header "__locale_dir/support/no_locale/characters.h" + textual header "__locale_dir/support/no_locale/strtonum.h" textual header "__locale_dir/support/windows.h" } module locale_base_api { textual header "__locale_dir/locale_base_api/android.h" textual header "__locale_dir/locale_base_api/bsd_locale_fallbacks.h" - textual header "__locale_dir/locale_base_api/fuchsia.h" textual header "__locale_dir/locale_base_api/ibm.h" textual header "__locale_dir/locale_base_api/musl.h" textual header "__locale_dir/locale_base_api/openbsd.h" From ce8c64fc8e067608be0dd44ca5399f85bab9e20d Mon Sep 17 00:00:00 2001 From: vfdev Date: Mon, 13 Jan 2025 15:19:23 +0100 Subject: [PATCH 282/408] Remove StandaloneExtensionPybind11.cpp FT update as it does not work with python 3.8 and old pybind11 (#122697) Description: - Remove StandaloneExtensionPybind11.cpp FT update as it does not work with python 3.8 and old pybind11 This should also fix the failing toy.test: https://github.com/llvm/llvm-project/pull/122684#issuecomment-2586802692 cc @jpienaar --- .../standalone/python/StandaloneExtensionPybind11.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mlir/examples/standalone/python/StandaloneExtensionPybind11.cpp 
b/mlir/examples/standalone/python/StandaloneExtensionPybind11.cpp index dd3c4c2945cca..397db4c20e743 100644 --- a/mlir/examples/standalone/python/StandaloneExtensionPybind11.cpp +++ b/mlir/examples/standalone/python/StandaloneExtensionPybind11.cpp @@ -12,11 +12,9 @@ #include "Standalone-c/Dialects.h" #include "mlir/Bindings/Python/PybindAdaptors.h" -namespace py = pybind11; - using namespace mlir::python::adaptors; -PYBIND11_MODULE(_standaloneDialectsPybind11, m, py::mod_gil_not_used()) { +PYBIND11_MODULE(_standaloneDialectsPybind11, m) { //===--------------------------------------------------------------------===// // standalone dialect //===--------------------------------------------------------------------===// From 99612a3a18e0c40aac9c52b68e67b106f97ed4fa Mon Sep 17 00:00:00 2001 From: bernhardu Date: Mon, 13 Jan 2025 15:21:03 +0100 Subject: [PATCH 283/408] [win/asan] GetInstructionSize: Support some more 2 byte instructions. (#120235) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds several instructions seen when trying to run an executable built with ASan with llvm-mingw. (x86 and x86_64, using the git tip in llvm-project). Also includes instructions collected by Roman PiΕ‘l and Eric Pouech in the Wine bug reports below. ``` Related: https://github.com/llvm/llvm-project/issues/96270 Co-authored-by: Roman PiΕ‘l https://bugs.winehq.org/show_bug.cgi?id=50993 https://bugs.winehq.org/attachment.cgi?id=70233 Co-authored-by: Eric Pouech https://bugs.winehq.org/show_bug.cgi?id=52386 https://bugs.winehq.org/attachment.cgi?id=71626 ``` CC: @zmodem --- compiler-rt/lib/interception/interception_win.cpp | 5 +++++ compiler-rt/lib/interception/tests/interception_win_test.cpp | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/compiler-rt/lib/interception/interception_win.cpp b/compiler-rt/lib/interception/interception_win.cpp index bd85c50a083a6..7a1a47a78dbc6 100644 --- a/compiler-rt/lib/interception/interception_win.cpp +++ b/compiler-rt/lib/interception/interception_win.cpp @@ -636,12 +636,17 @@ static size_t GetInstructionSize(uptr address, size_t* rel_offset = nullptr) { case 0xFF8B: // 8B FF : mov edi, edi case 0xEC8B: // 8B EC : mov ebp, esp case 0xc889: // 89 C8 : mov eax, ecx + case 0xD189: // 89 D1 : mov ecx, edx case 0xE589: // 89 E5 : mov ebp, esp case 0xC18B: // 8B C1 : mov eax, ecx + case 0xC031: // 31 C0 : xor eax, eax + case 0xC931: // 31 C9 : xor ecx, ecx + case 0xD231: // 31 D2 : xor edx, edx case 0xC033: // 33 C0 : xor eax, eax case 0xC933: // 33 C9 : xor ecx, ecx case 0xD233: // 33 D2 : xor edx, edx case 0xDB84: // 84 DB : test bl,bl + case 0xC084: // 84 C0 : test al,al case 0xC984: // 84 C9 : test cl,cl case 0xD284: // 84 D2 : test dl,dl return 2; diff --git a/compiler-rt/lib/interception/tests/interception_win_test.cpp b/compiler-rt/lib/interception/tests/interception_win_test.cpp index 3a2d8b271113d..e0258a3d0bd51 100644 --- a/compiler-rt/lib/interception/tests/interception_win_test.cpp +++ b/compiler-rt/lib/interception/tests/interception_win_test.cpp @@ -839,14 +839,19 @@ const struct InstructionSizeData { { 1, {0x90}, 0, "90 : nop"}, { 1, {0xC3}, 0, "C3 : ret (for small/empty function interception"}, { 1, {0xCC}, 0, "CC : int 3 i.e. 
registering weak functions)"},
+    { 2, {0x31, 0xC0}, 0, "31 C0 : xor eax, eax"},
+    { 2, {0x31, 0xC9}, 0, "31 C9 : xor ecx, ecx"},
+    { 2, {0x31, 0xD2}, 0, "31 D2 : xor edx, edx"},
     { 2, {0x33, 0xC0}, 0, "33 C0 : xor eax, eax"},
     { 2, {0x33, 0xC9}, 0, "33 C9 : xor ecx, ecx"},
     { 2, {0x33, 0xD2}, 0, "33 D2 : xor edx, edx"},
     { 2, {0x6A, 0x71}, 0, "6A XX : push XX"},
+    { 2, {0x84, 0xC0}, 0, "84 C0 : test al,al"},
     { 2, {0x84, 0xC9}, 0, "84 C9 : test cl,cl"},
     { 2, {0x84, 0xD2}, 0, "84 D2 : test dl,dl"},
     { 2, {0x84, 0xDB}, 0, "84 DB : test bl,bl"},
     { 2, {0x89, 0xc8}, 0, "89 C8 : mov eax, ecx"},
+    { 2, {0x89, 0xD1}, 0, "89 D1 : mov ecx, edx"},
     { 2, {0x89, 0xE5}, 0, "89 E5 : mov ebp, esp"},
     { 2, {0x8A, 0x01}, 0, "8A 01 : mov al, byte ptr [ecx]"},
     { 2, {0x8B, 0xC1}, 0, "8B C1 : mov eax, ecx"},
From 5e7d7fb1f0cefbb6f00bcd71867014ce8b71a11e Mon Sep 17 00:00:00 2001
From: Lukacma
Date: Mon, 13 Jan 2025 15:05:47 +0000
Subject: [PATCH 284/408] [AArch64] Fix aarch64-fujitsu-monaka.c test (#122716)

---
 .../Driver/print-enabled-extensions/aarch64-fujitsu-monaka.c  | 2 --
 1 file changed, 2 deletions(-)

diff --git a/clang/test/Driver/print-enabled-extensions/aarch64-fujitsu-monaka.c b/clang/test/Driver/print-enabled-extensions/aarch64-fujitsu-monaka.c
index 3c74e3620df03..01a97a00de542 100644
--- a/clang/test/Driver/print-enabled-extensions/aarch64-fujitsu-monaka.c
+++ b/clang/test/Driver/print-enabled-extensions/aarch64-fujitsu-monaka.c
@@ -28,8 +28,6 @@
 // CHECK-NEXT:     FEAT_FP16        Enable half-precision floating-point data processing
 // CHECK-NEXT:     FEAT_FP8         Enable FP8 instructions
 // CHECK-NEXT:     FEAT_FP8DOT2     Enable FP8 2-way dot instructions
-// CHECK-NEXT:     FEAT_FP8DOT4     Enable FP8 4-way dot instructions
-// CHECK-NEXT:     FEAT_FP8FMA      Enable Armv9.5-A FP8 multiply-add instructions
 // CHECK-NEXT:     FEAT_FPAC        Enable Armv8.3-A Pointer Authentication Faulting enhancement
 // CHECK-NEXT:     FEAT_FRINTTS     Enable FRInt[32|64][Z|X] instructions that round a floating-point number to an integer (in FP format) forcing it to fit into a 32- or 64-bit int
 // CHECK-NEXT:     FEAT_FlagM       Enable Armv8.4-A Flag Manipulation instructions
From 7378afd167860083eaaf0d8e411c91a28a6605d6 Mon Sep 17 00:00:00 2001
From: Victor Campos
Date: Mon, 13 Jan 2025 15:11:40 +0000
Subject: [PATCH 285/408] Revert "[Multilib] Custom flags YAML parsing"
 (#122722)

Reverts llvm/llvm-project#110657

It seems that this patch is causing the sanitizer bot to fail. Reverting
while I investigate.
---
 clang/include/clang/Driver/Multilib.h         |  28 +---
 clang/lib/Driver/Multilib.cpp                 |  73 ++--------
 ...remetal-multilib-custom-flags-parsing.yaml | 133 ------------------
 3 files changed, 11 insertions(+), 223 deletions(-)
 delete mode 100644 clang/test/Driver/baremetal-multilib-custom-flags-parsing.yaml

diff --git a/clang/include/clang/Driver/Multilib.h b/clang/include/clang/Driver/Multilib.h
index 1dab45c062aee..dbed70f4f9008 100644
--- a/clang/include/clang/Driver/Multilib.h
+++ b/clang/include/clang/Driver/Multilib.h
@@ -101,25 +101,6 @@ class Multilib {

 raw_ostream &operator<<(raw_ostream &OS, const Multilib &M);

-namespace custom_flag {
-struct Declaration;
-using DeclarationPtr = std::shared_ptr<Declaration>;
-
-struct ValueDetail {
-  std::string Name;
-  std::optional<SmallVector<std::string>> MacroDefines;
-  DeclarationPtr Decl;
-};
-
-struct Declaration {
-  std::string Name;
-  SmallVector<ValueDetail> ValueList;
-  std::optional<size_t> DefaultValueIdx;
-};
-
-static constexpr StringRef Prefix = "-fmultilib-flag=";
-} // namespace custom_flag
-
 /// See also MultilibSetBuilder for combining multilibs into a set. 
class MultilibSet {
public:
@@ -139,18 +120,15 @@ class MultilibSet {

 private:
   multilib_list Multilibs;
-  SmallVector<FlagMatcher> FlagMatchers;
-  SmallVector<custom_flag::DeclarationPtr> CustomFlagDecls;
+  std::vector<FlagMatcher> FlagMatchers;
   IncludeDirsFunc IncludeCallback;
   IncludeDirsFunc FilePathsCallback;

 public:
   MultilibSet() = default;
   MultilibSet(multilib_list &&Multilibs,
-              SmallVector<FlagMatcher> &&FlagMatchers = {},
-              SmallVector<custom_flag::DeclarationPtr> &&CustomFlagDecls = {})
-      : Multilibs(std::move(Multilibs)), FlagMatchers(std::move(FlagMatchers)),
-        CustomFlagDecls(std::move(CustomFlagDecls)) {}
+              std::vector<FlagMatcher> &&FlagMatchers = {})
+      : Multilibs(Multilibs), FlagMatchers(FlagMatchers) {}

   const multilib_list &getMultilibs() { return Multilibs; }

diff --git a/clang/lib/Driver/Multilib.cpp b/clang/lib/Driver/Multilib.cpp
index b4b5dbd1bdb5e..0207e0f2eb2de 100644
--- a/clang/lib/Driver/Multilib.cpp
+++ b/clang/lib/Driver/Multilib.cpp
@@ -10,7 +10,6 @@
 #include "clang/Basic/LLVM.h"
 #include "clang/Driver/Driver.h"
 #include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -202,20 +201,13 @@ struct MultilibGroupSerialization {

 struct MultilibSetSerialization {
   llvm::VersionTuple MultilibVersion;
-  SmallVector<MultilibGroupSerialization> Groups;
-  SmallVector<MultilibSerialization> Multilibs;
-  SmallVector<MultilibSet::FlagMatcher> FlagMatchers;
-  SmallVector<custom_flag::DeclarationPtr> CustomFlagDeclarations;
+  std::vector<MultilibGroupSerialization> Groups;
+  std::vector<MultilibSerialization> Multilibs;
+  std::vector<MultilibSet::FlagMatcher> FlagMatchers;
 };

 } // end anonymous namespace

-LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibSerialization)
-LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibGroupSerialization)
-LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibSet::FlagMatcher)
-LLVM_YAML_IS_SEQUENCE_VECTOR(custom_flag::ValueDetail)
-LLVM_YAML_IS_SEQUENCE_VECTOR(custom_flag::DeclarationPtr)
-
 template <> struct llvm::yaml::MappingTraits<MultilibSerialization> {
   static void mapping(llvm::yaml::IO &io, MultilibSerialization &V) {
     io.mapOptional("Dir", V.Dir);
@@ -263,63 +255,11 @@ template <> struct llvm::yaml::MappingTraits<MultilibGroupSerialization> {
   }
 };

-template <>
-struct llvm::yaml::MappingContextTraits<custom_flag::ValueDetail,
-                                        llvm::SmallSet<std::string, 32>> {
-  static void mapping(llvm::yaml::IO &io, custom_flag::ValueDetail &V,
-                      llvm::SmallSet<std::string, 32> &) {
-    io.mapRequired("Name", V.Name);
-    io.mapOptional("MacroDefines", V.MacroDefines);
-  }
-  static std::string validate(IO &io, custom_flag::ValueDetail &V,
-                              llvm::SmallSet<std::string, 32> &NameSet) {
-    if (V.Name.empty())
-      return "custom flag value requires a name";
-    if (!NameSet.insert(V.Name).second)
-      return "duplicate custom flag value name: \"" + V.Name + "\"";
-    return {};
-  }
-};
-
-template <>
-struct llvm::yaml::MappingContextTraits<custom_flag::DeclarationPtr,
-                                        llvm::SmallSet<std::string, 32>> {
-  static void mapping(llvm::yaml::IO &io, custom_flag::DeclarationPtr &V,
-                      llvm::SmallSet<std::string, 32> &NameSet) {
-    assert(!V);
-    V = std::make_shared<custom_flag::Declaration>();
-    io.mapRequired("Name", V->Name);
-    io.mapRequired("Values", V->ValueList, NameSet);
-    std::string DefaultValueName;
-    io.mapRequired("Default", DefaultValueName);
-
-    for (auto [Idx, Value] : llvm::enumerate(V->ValueList)) {
-      Value.Decl = V;
-      if (Value.Name == DefaultValueName) {
-        assert(!V->DefaultValueIdx);
-        V->DefaultValueIdx = Idx;
-      }
-    }
-  }
-  static std::string validate(IO &io, custom_flag::DeclarationPtr &V,
-                              llvm::SmallSet<std::string, 32> &) {
-    if (V->Name.empty())
-      return "custom flag requires a name";
-    if (V->ValueList.empty())
-      return "custom flag must have at least one value";
-    if (!V->DefaultValueIdx)
-      return "custom flag must have a default value";
-    return {};
-  }
-};
-
 template <> struct llvm::yaml::MappingTraits<MultilibSetSerialization> {
   static void mapping(llvm::yaml::IO &io, MultilibSetSerialization &M) {
     io.mapRequired("MultilibVersion", 
M.MultilibVersion); io.mapRequired("Variants", M.Multilibs); io.mapOptional("Groups", M.Groups); - llvm::SmallSet NameSet; - io.mapOptionalWithContext("Flags", M.CustomFlagDeclarations, NameSet); io.mapOptional("Mappings", M.FlagMatchers); } static std::string validate(IO &io, MultilibSetSerialization &M) { @@ -348,6 +288,10 @@ template <> struct llvm::yaml::MappingTraits { } }; +LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibSerialization) +LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibGroupSerialization) +LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibSet::FlagMatcher) + llvm::ErrorOr MultilibSet::parseYaml(llvm::MemoryBufferRef Input, llvm::SourceMgr::DiagHandlerTy DiagHandler, @@ -375,8 +319,7 @@ MultilibSet::parseYaml(llvm::MemoryBufferRef Input, } } - return MultilibSet(std::move(Multilibs), std::move(MS.FlagMatchers), - std::move(MS.CustomFlagDeclarations)); + return MultilibSet(std::move(Multilibs), std::move(MS.FlagMatchers)); } LLVM_DUMP_METHOD void MultilibSet::dump() const { diff --git a/clang/test/Driver/baremetal-multilib-custom-flags-parsing.yaml b/clang/test/Driver/baremetal-multilib-custom-flags-parsing.yaml deleted file mode 100644 index fe6a9a8d7f1ee..0000000000000 --- a/clang/test/Driver/baremetal-multilib-custom-flags-parsing.yaml +++ /dev/null @@ -1,133 +0,0 @@ -# RUN: split-file %s %t - -# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/multilib-without-macro-defines.yaml %s -### -o /dev/null 2>&1 \ -# RUN: | FileCheck %s -# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/multilib-with-macro-defines.yaml %s -### -o /dev/null 2>&1 \ -# RUN: | FileCheck %s -# CHECK-NOT: error: - -# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/missing-flag-name.yaml %s -### -o /dev/null 2>&1 \ -# RUN: | FileCheck %s --check-prefix=CHECK-MISSING-FLAG-NAME -# CHECK-MISSING-FLAG-NAME: error: custom flag requires a name - -# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/missing-flag-values.yaml %s -### -o /dev/null 2>&1 \ -# RUN: | FileCheck %s --check-prefix=CHECK-MISSING-FLAG-VALUES -# CHECK-MISSING-FLAG-VALUES: error: custom flag must have at least one value - -# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/missing-flag-value-default.yaml %s -### -o /dev/null 2>&1 \ -# RUN: | FileCheck %s --check-prefix=CHECK-MISSING-FLAG-VALUE-DEFAULT -# CHECK-MISSING-FLAG-VALUE-DEFAULT: error: custom flag must have a default value - -# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/missing-flag-value-name.yaml %s -### -o /dev/null 2>&1 \ -# RUN: | FileCheck %s --check-prefix=CHECK-MISSING-FLAG-VALUE-NAME -# CHECK-MISSING-FLAG-VALUE-NAME: error: custom flag value requires a name - -# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/duplicate-flag-value-name.yaml %s -### -o /dev/null 2>&1 \ -# RUN: | FileCheck %s --check-prefix=CHECK-DUPLICATE-FLAG-VALUE-NAME -# CHECK-DUPLICATE-FLAG-VALUE-NAME: error: duplicate custom flag value name: "value-name" -# CHECK-DUPLICATE-FLAG-VALUE-NAME-NEXT: - Name: value-name - -#--- multilib-without-macro-defines.yaml ---- -MultilibVersion: 1.0 - -Variants: -- Dir: libc - Flags: [-fmultilib-flag=a] - -Flags: - - Name: flag - Values: - - Name: a - - Name: b - Default: a - -#--- multilib-with-macro-defines.yaml ---- -MultilibVersion: 1.0 - -Variants: -- Dir: libc - Flags: [-fmultilib-flag=a] - -Flags: - - Name: flag - Values: - - Name: a - MacroDefines: [FEATURE_A] - - Name: b - MacroDefines: [FEATURE_B] - Default: a - -#--- missing-flag-name.yaml ---- -MultilibVersion: 1.0 - -Variants: -- Dir: libc - Flags: [-fmultilib-flag=a] - 
-Flags:
-  - Values:
-      - Name: a
-    Default: a
-
-#--- missing-flag-values.yaml
----
-MultilibVersion: 1.0
-
-Variants:
-- Dir: libc
-  Flags: [-fmultilib-flag=a]
-
-Flags:
-  - Name: flag
-    Values:
-    Default: a
-
-#--- missing-flag-value-default.yaml
----
-MultilibVersion: 1.0
-
-Variants:
-- Dir: libc
-  Flags: [-fmultilib-flag=a]
-
-Flags:
-  - Name: flag
-    Values:
-      - Name: a
-    Default:
-
-#--- missing-flag-value-name.yaml
----
-MultilibVersion: 1.0
-
-Variants:
-- Dir: libc
-  Flags: [-fmultilib-flag=a]
-
-Flags:
-  - Name: flag
-    Values:
-      - Name:
-    Default: a
-
-#--- duplicate-flag-value-name.yaml
----
-MultilibVersion: 1.0
-
-Variants:
-- Dir: libc
-  Flags: [-fmultilib-flag=value-name]
-
-Flags:
-  - Name: a
-    Values:
-      - Name: value-name
-      - Name: value-a
-    Default: value-name
-  - Name: b
-    Values:
-      - Name: value-name
-    Default: value-name
From ab6c89c220192159a66c1a91ad3dd892bad1c3b2 Mon Sep 17 00:00:00 2001
From: Brotcrunsher
Date: Mon, 13 Jan 2025 16:16:23 +0100
Subject: [PATCH 286/408] [libcxx] Don't hold the lock when calling notify_* on
 gates in std::shared_mutex (#107876)

Holding the associated lock while calling notify_* on a condition_variable
is generally considered a pessimization, as the notified thread might
"instantly" wake up, notice that it can't acquire the lock, and then go
back to sleep.
---
 libcxx/src/shared_mutex.cpp | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/libcxx/src/shared_mutex.cpp b/libcxx/src/shared_mutex.cpp
index 1a346dda027f8..6180833736956 100644
--- a/libcxx/src/shared_mutex.cpp
+++ b/libcxx/src/shared_mutex.cpp
@@ -38,8 +38,10 @@ bool __shared_mutex_base::try_lock() {
 }

 void __shared_mutex_base::unlock() {
-  lock_guard<mutex> _(__mut_);
-  __state_ = 0;
+  {
+    lock_guard<mutex> _(__mut_);
+    __state_ = 0;
+  }
   __gate1_.notify_all();
 }

@@ -67,16 +69,20 @@ bool __shared_mutex_base::try_lock_shared() {
 }

 void __shared_mutex_base::unlock_shared() {
-  lock_guard<mutex> _(__mut_);
+  unique_lock<mutex> lk(__mut_);
   unsigned num_readers = (__state_ & __n_readers_) - 1;
   __state_ &= ~__n_readers_;
   __state_ |= num_readers;
   if (__state_ & __write_entered_) {
-    if (num_readers == 0)
+    if (num_readers == 0) {
+      lk.unlock();
       __gate2_.notify_one();
+    }
   } else {
-    if (num_readers == __n_readers_ - 1)
+    if (num_readers == __n_readers_ - 1) {
+      lk.unlock();
       __gate1_.notify_one();
+    }
   }
 }
From 34ba84fe90b3e369c03e695a395ec632ef60e23d Mon Sep 17 00:00:00 2001
From: Steven Perron
Date: Mon, 13 Jan 2025 10:23:15 -0500
Subject: [PATCH 287/408] [SPIRV] Return success when selecting reads and
 writes. (#122162)

The functions `selectImageWriteIntrinsic` and `selectReadImageIntrinsic`
are void functions. They should return true if they succeed, and false
otherwise. This commit updates the code to do this. 
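
A minimal sketch of the resulting pattern, assuming the helpers already
used by `SPIRVInstructionSelector` in the diff below (`BuildMI`, `TII`,
`TRI`, `RBI`, `GR`); `selectExampleRead` is a placeholder name, not a
function in this patch:

```
// Sketch only: the selector reports success by returning the result of
// constrainAllUses() instead of discarding it.
bool SPIRVInstructionSelector::selectExampleRead(Register ResVReg,
                                                 const SPIRVType *ResType,
                                                 MachineInstr &I) const {
  return BuildMI(*I.getParent(), I, I.getDebugLoc(),
                 TII.get(SPIRV::OpImageRead))
      .addDef(ResVReg)
      .addUse(GR.getSPIRVTypeID(ResType))
      .addUse(I.getOperand(3).getReg())
      .constrainAllUses(TII, TRI, RBI); // false propagates selection failure
}
```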
--- .../Target/SPIRV/SPIRVInstructionSelector.cpp | 72 +++++++++++-------- 1 file changed, 41 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 28c9b81db51f5..b7b32dd0d626c 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -274,10 +274,10 @@ class SPIRVInstructionSelector : public InstructionSelector { bool selectHandleFromBinding(Register &ResVReg, const SPIRVType *ResType, MachineInstr &I) const; - void selectReadImageIntrinsic(Register &ResVReg, const SPIRVType *ResType, + bool selectReadImageIntrinsic(Register &ResVReg, const SPIRVType *ResType, MachineInstr &I) const; - void selectImageWriteIntrinsic(MachineInstr &I) const; + bool selectImageWriteIntrinsic(MachineInstr &I) const; // Utilities std::pair @@ -305,7 +305,7 @@ class SPIRVInstructionSelector : public InstructionSelector { Register IndexReg, bool IsNonUniform, MachineIRBuilder MIRBuilder) const; SPIRVType *widenTypeToVec4(const SPIRVType *Type, MachineInstr &I) const; - void extractSubvector(Register &ResVReg, const SPIRVType *ResType, + bool extractSubvector(Register &ResVReg, const SPIRVType *ResType, Register &ReadReg, MachineInstr &InsertionPoint) const; bool BuildCOPY(Register DestReg, Register SrcReg, MachineInstr &I) const; bool loadVec3BuiltinInputID(SPIRV::BuiltIn::BuiltIn BuiltInValue, @@ -3002,12 +3002,10 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, return selectHandleFromBinding(ResVReg, ResType, I); } case Intrinsic::spv_resource_store_typedbuffer: { - selectImageWriteIntrinsic(I); - return true; + return selectImageWriteIntrinsic(I); } case Intrinsic::spv_resource_load_typedbuffer: { - selectReadImageIntrinsic(ResVReg, ResType, I); - return true; + return selectReadImageIntrinsic(ResVReg, ResType, I); } case Intrinsic::spv_discard: { return selectDiscard(ResVReg, ResType, I); @@ -3049,7 +3047,7 @@ bool SPIRVInstructionSelector::selectHandleFromBinding(Register &ResVReg, .constrainAllUses(TII, TRI, RBI); } -void SPIRVInstructionSelector::selectReadImageIntrinsic( +bool SPIRVInstructionSelector::selectReadImageIntrinsic( Register &ResVReg, const SPIRVType *ResType, MachineInstr &I) const { // If the load of the image is in a different basic block, then @@ -3064,35 +3062,40 @@ void SPIRVInstructionSelector::selectReadImageIntrinsic( uint64_t ResultSize = GR.getScalarOrVectorComponentCount(ResType); if (ResultSize == 4) { - BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpImageRead)) + return BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(SPIRV::OpImageRead)) .addDef(ResVReg) .addUse(GR.getSPIRVTypeID(ResType)) .addUse(ImageReg) - .addUse(I.getOperand(3).getReg()); - return; + .addUse(I.getOperand(3).getReg()) + .constrainAllUses(TII, TRI, RBI); } SPIRVType *ReadType = widenTypeToVec4(ResType, I); Register ReadReg = MRI->createVirtualRegister(GR.getRegClass(ReadType)); - BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpImageRead)) - .addDef(ReadReg) - .addUse(GR.getSPIRVTypeID(ReadType)) - .addUse(ImageReg) - .addUse(I.getOperand(3).getReg()); + bool Succeed = + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpImageRead)) + .addDef(ReadReg) + .addUse(GR.getSPIRVTypeID(ReadType)) + .addUse(ImageReg) + .addUse(I.getOperand(3).getReg()) + .constrainAllUses(TII, TRI, RBI); + if (!Succeed) + return false; if (ResultSize == 1) { - BuildMI(*I.getParent(), I, I.getDebugLoc(), - 
TII.get(SPIRV::OpCompositeExtract))
+    return BuildMI(*I.getParent(), I, I.getDebugLoc(),
+                   TII.get(SPIRV::OpCompositeExtract))
         .addDef(ResVReg)
         .addUse(GR.getSPIRVTypeID(ResType))
         .addUse(ReadReg)
-        .addImm(0);
-    return;
+        .addImm(0)
+        .constrainAllUses(TII, TRI, RBI);
   }
-  extractSubvector(ResVReg, ResType, ReadReg, I);
+  return extractSubvector(ResVReg, ResType, ReadReg, I);
 }

-void SPIRVInstructionSelector::extractSubvector(
+bool SPIRVInstructionSelector::extractSubvector(
     Register &ResVReg, const SPIRVType *ResType, Register &ReadReg,
     MachineInstr &InsertionPoint) const {
   SPIRVType *InputType = GR.getResultType(ReadReg);
@@ -3108,12 +3111,16 @@ void SPIRVInstructionSelector::extractSubvector(
   const TargetRegisterClass *ScalarRegClass = GR.getRegClass(ScalarType);
   for (uint64_t I = 0; I < ResultSize; I++) {
     Register ComponentReg = MRI->createVirtualRegister(ScalarRegClass);
-    BuildMI(*InsertionPoint.getParent(), InsertionPoint,
-            InsertionPoint.getDebugLoc(), TII.get(SPIRV::OpCompositeExtract))
-        .addDef(ComponentReg)
-        .addUse(ScalarType->getOperand(0).getReg())
-        .addUse(ReadReg)
-        .addImm(I);
+    bool Succeed = BuildMI(*InsertionPoint.getParent(), InsertionPoint,
+                           InsertionPoint.getDebugLoc(),
+                           TII.get(SPIRV::OpCompositeExtract))
+                       .addDef(ComponentReg)
+                       .addUse(ScalarType->getOperand(0).getReg())
+                       .addUse(ReadReg)
+                       .addImm(I)
+                       .constrainAllUses(TII, TRI, RBI);
+    if (!Succeed)
+      return false;
     ComponentRegisters.emplace_back(ComponentReg);
   }

@@ -3125,9 +3132,10 @@ void SPIRVInstructionSelector::extractSubvector(
   for (Register ComponentReg : ComponentRegisters)
     MIB.addUse(ComponentReg);
+  return MIB.constrainAllUses(TII, TRI, RBI);
 }

-void SPIRVInstructionSelector::selectImageWriteIntrinsic(
+bool SPIRVInstructionSelector::selectImageWriteIntrinsic(
     MachineInstr &I) const {
   // If the load of the image is in a different basic block, then
   // this will generate invalid code. A proper solution is to move
@@ -3142,10 +3150,12 @@ void SPIRVInstructionSelector::selectImageWriteIntrinsic(
   Register DataReg = I.getOperand(3).getReg();
   assert(GR.getResultType(DataReg)->getOpcode() == SPIRV::OpTypeVector);
   assert(GR.getScalarOrVectorComponentCount(GR.getResultType(DataReg)) == 4);
-  BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpImageWrite))
+  return BuildMI(*I.getParent(), I, I.getDebugLoc(),
+                 TII.get(SPIRV::OpImageWrite))
      .addUse(ImageReg)
      .addUse(CoordinateReg)
-     .addUse(DataReg);
+     .addUse(DataReg)
+     .constrainAllUses(TII, TRI, RBI);
 }

 Register SPIRVInstructionSelector::buildPointerToResource(
From af524de1fa94e4b4cee8b745d1b68f4ea0090759 Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Mon, 13 Jan 2025 06:48:56 -0800
Subject: [PATCH 288/408] [SLP]Do not include subvectors for fully matched
 buildvectors

If the buildvector node fully matches another node, the subvectors need
to be excluded when building the final shuffle: just a shuffle of the
original node must be emitted. 
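
A short sketch of the intended logic, using the names from the diff below
(`FrontTE` is the tree entry that fully matches the buildvector):

```
// When the buildvector fully matches FrontTE, emit just a shuffle of that
// node; passing empty subvector lists to finalize() avoids re-inserting
// subvectors the matched node already covers.
ShuffleBuilder.add(*FrontTE, Mask);
Res = ShuffleBuilder.finalize(E->getCommonMask(), /*SubVectors=*/{},
                              /*SubVectorsMask=*/{});
```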
Fixes #122584 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 4 +- .../vectorizable-selects-uniform-cmps.ll | 4 +- .../X86/full-matched-bv-with-subvectors.ll | 99 +++++++++++++++++++ 3 files changed, 102 insertions(+), 5 deletions(-) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index e3487b5015342..df46c69ff3ab4 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -14935,8 +14935,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, } } ShuffleBuilder.add(*FrontTE, Mask); - Res = ShuffleBuilder.finalize(E->getCommonMask(), SubVectors, - SubVectorsMask); + // Full matched entry found, no need to insert subvectors. + Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {}); return Res; } if (!Resized) { diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll index f9e415a3cefc1..27f3155b50dbb 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll @@ -259,11 +259,9 @@ define void @select_uniform_ugt_16xi8(ptr %ptr, i8 %x) { ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> [[TMP6]], <8 x i8> [[TMP0]], i64 0) ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP7]], <4 x i8> [[TMP3]], i64 12) ; CHECK-NEXT: [[TMP9:%.*]] = icmp ugt <16 x i8> [[TMP8]], splat (i8 -1) -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> [[TMP8]], <8 x i8> [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP10]], <4 x i8> [[TMP3]], i64 12) ; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i8> poison, i8 [[X]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP9]], <16 x i8> [[TMP11]], <16 x i8> [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP9]], <16 x i8> [[TMP8]], <16 x i8> [[TMP13]] ; CHECK-NEXT: store <16 x i8> [[TMP14]], ptr [[PTR]], align 2 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll b/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll new file mode 100644 index 0000000000000..7576eb7a8f55e --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll @@ -0,0 +1,99 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -slp-threshold=-9999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define i32 @test(i64 %l.549) { +; CHECK-LABEL: define i32 @test( +; CHECK-SAME: i64 [[L_549:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CONV3:%.*]] = sext i32 0 to i64 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[CONV3]], i32 3 +; CHECK-NEXT: br label %[[IF_THEN19:.*]] +; CHECK: [[P:.*]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i64> [ zeroinitializer, %[[IF_END29:.*]] ], [ [[TMP13:%.*]], %[[IF_END25:.*]] ] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: 
br i1 false, label %[[S:.*]], label %[[Q:.*]] +; CHECK: [[Q]]: +; CHECK-NEXT: [[XOR39:%.*]] = phi i64 [ 0, %[[P]] ], [ 0, %[[LAND_LHS_TRUE:.*]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i64> [ zeroinitializer, %[[P]] ], [ zeroinitializer, %[[LAND_LHS_TRUE]] ] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[XOR39]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP4]], <2 x i64> [[TMP3]], i64 0) +; CHECK-NEXT: br i1 false, label %[[LOR_LHS_FALSE:.*]], label %[[R:.*]] +; CHECK: [[LOR_LHS_FALSE]]: +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> +; CHECK-NEXT: br i1 false, label %[[LAND_LHS_TRUE]], label %[[S]] +; CHECK: [[R]]: +; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x i64> [ [[TMP5]], %[[Q]] ], [ [[TMP16:%.*]], %[[IF_THEN19]] ] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> +; CHECK-NEXT: br i1 false, label %[[S]], label %[[LAND_LHS_TRUE]] +; CHECK: [[LAND_LHS_TRUE]]: +; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x i64> [ [[TMP8]], %[[R]] ], [ zeroinitializer, %[[LOR_LHS_FALSE]] ] +; CHECK-NEXT: br i1 false, label %[[Q]], label %[[S]] +; CHECK: [[S]]: +; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x i64> [ [[TMP9]], %[[LAND_LHS_TRUE]] ], [ [[TMP8]], %[[R]] ], [ [[TMP6]], %[[LOR_LHS_FALSE]] ], [ [[TMP2]], %[[P]] ] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP10]], <4 x i64> poison, <2 x i32> +; CHECK-NEXT: br label %[[IF_THEN19]] +; CHECK: [[IF_THEN19]]: +; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP11]], %[[S]] ] +; CHECK-NEXT: [[TMP13]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> poison, <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[L_549]], i32 1 +; CHECK-NEXT: [[TMP16]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP15]], <2 x i64> zeroinitializer, i64 2) +; CHECK-NEXT: br i1 false, label %[[R]], label %[[IF_END25]] +; CHECK: [[IF_END25]]: +; CHECK-NEXT: br i1 false, label %[[IF_END29]], label %[[P]] +; CHECK: [[IF_END29]]: +; CHECK-NEXT: br label %[[P]] +; +entry: + %conv3 = sext i32 0 to i64 + br label %if.then19 + +p: + %l.0 = phi i64 [ %xor, %if.end29 ], [ %l.5493, %if.end25 ] + %m.0 = phi i64 [ %not21, %if.end29 ], [ %m.550, %if.end25 ] + br i1 false, label %s, label %q + +q: + %xor39 = phi i64 [ 0, %p ], [ 0, %land.lhs.true ] + %l.1 = phi i64 [ 0, %p ], [ 0, %land.lhs.true ] + %m.1 = phi i64 [ 0, %p ], [ 0, %land.lhs.true ] + br i1 false, label %lor.lhs.false, label %r + +lor.lhs.false: + br i1 false, label %land.lhs.true, label %s + +r: + %xor38 = phi i64 [ %xor39, %q ], [ %xor, %if.then19 ] + %j.0 = phi i64 [ %conv3, %q ], [ %not21, %if.then19 ] + %l.2 = phi i64 [ %l.1, %q ], [ %l.549, %if.then19 ] + %m.2 = phi i64 [ %m.1, %q ], [ %m.550, %if.then19 ] + br i1 false, label %s, label %land.lhs.true + +land.lhs.true: + %xor37 = phi i64 [ %xor38, %r ], [ 0, %lor.lhs.false ] + %j.1 = phi i64 [ %j.0, %r ], [ 0, %lor.lhs.false ] + %l.3 = phi i64 [ %l.2, %r ], [ 0, %lor.lhs.false ] + %m.3 = phi i64 [ %m.2, %r ], [ 0, %lor.lhs.false ] + br i1 false, label %q, label %s + +s: + %xor36 = phi i64 [ %xor37, %land.lhs.true ], [ %xor38, %r ], [ %xor39, %lor.lhs.false ], [ %l.0, %p ] + %j.2 = phi i64 [ %j.1, %land.lhs.true ], [ %j.0, %r ], [ %conv3, %lor.lhs.false ], [ %m.0, %p ] + %l.4 = phi i64 [ %l.3, %land.lhs.true ], [ %l.2, %r ], [ %l.1, %lor.lhs.false ], [ 
%l.0, %p ] + %m.4 = phi i64 [ %m.3, %land.lhs.true ], [ %m.2, %r ], [ %m.1, %lor.lhs.false ], [ %m.0, %p ] + br label %if.then19 + +if.then19: + %m.550 = phi i64 [ 0, %entry ], [ %m.4, %s ] + %l.5493 = phi i64 [ 0, %entry ], [ %l.4, %s ] + %xor = xor i64 0, 0 + %not21 = xor i64 0, 0 + br i1 false, label %r, label %if.end25 + +if.end25: + br i1 false, label %if.end29, label %p + +if.end29: + br label %p +} + From 09a8b7cbc29d8704c343197d4b33b6972366c682 Mon Sep 17 00:00:00 2001 From: gbMattN <146744444+gbMattN@users.noreply.github.com> Date: Mon, 13 Jan 2025 15:28:37 +0000 Subject: [PATCH 289/408] [TySan] Fix struct access with different bases (#120412) Original pull request [here](https://github.com/llvm/llvm-project/pull/108385) Fixes issue https://github.com/llvm/llvm-project/issues/105960 If a member in a struct is also a struct, accessing a member partway through this inner struct currently causes a false positive. This is because when checking aliasing, the access offset is seen as greater than the starting offset of the inner struct, so the loop continues one iteration, and believes we are accessing the member after the inner struct. The next member's offset is greater than the offset we are looking for, so when we subtract the next member's offset from what we are looking for, the offset underflows. To fix this, we check if the member we think we are accessing has a greater offset than the offset we are looking for. If so, we take a step back. We cannot do this in the loop, since the loop does not check the final member. This means the penultimate member would still cause false positives. --- compiler-rt/lib/tysan/tysan.cpp | 11 +++++ .../tysan/struct-offset-different-base.cpp | 49 +++++++++++++++++++ 2 files changed, 60 insertions(+) create mode 100644 compiler-rt/test/tysan/struct-offset-different-base.cpp diff --git a/compiler-rt/lib/tysan/tysan.cpp b/compiler-rt/lib/tysan/tysan.cpp index 9c87b4782671a..f0230df9260e3 100644 --- a/compiler-rt/lib/tysan/tysan.cpp +++ b/compiler-rt/lib/tysan/tysan.cpp @@ -131,6 +131,17 @@ static bool isAliasingLegalUp(tysan_type_descriptor *TDA, break; } + // This offset can't be negative. Therefore we must be accessing something + // before the current type (not legal) or partially inside the last type. + // In the latter case, we adjust Idx. + if (TDA->Struct.Members[Idx].Offset > OffsetA) { + // Trying to access something before the current type. 
+ if (!Idx) + return false; + + Idx -= 1; + } + OffsetA -= TDA->Struct.Members[Idx].Offset; TDA = TDA->Struct.Members[Idx].Type; } else { diff --git a/compiler-rt/test/tysan/struct-offset-different-base.cpp b/compiler-rt/test/tysan/struct-offset-different-base.cpp new file mode 100644 index 0000000000000..862595de8dc81 --- /dev/null +++ b/compiler-rt/test/tysan/struct-offset-different-base.cpp @@ -0,0 +1,49 @@ +// RUN: %clangxx_tysan -O0 %s -o %t && %run %t >%t.out 2>&1 +// RUN: FileCheck %s --implicit-check-not ERROR < %t.out + +// Modified reproducer from https://github.com/llvm/llvm-project/issues/105960 + +#include + +struct inner1 { + char buffer; + int i; +}; + +struct inner2 { + char buffer; + int i; + float endBuffer; +}; + +void init_inner1(inner1 *iPtr) { iPtr->i = 200; } +void init_inner2(inner2 *iPtr) { + iPtr->i = 400; + iPtr->endBuffer = 413.0f; +} + +struct outer { + inner1 foo; + inner2 bar; + char buffer; +}; + +int main(void) { + outer *l = new outer(); + + init_inner1(&l->foo); + init_inner2(&l->bar); + + int access = l->foo.i; + printf("Accessed value 1 is %d\n", access); + access = l->bar.i; + printf("Accessed value 2 is %d\n", access); + float fAccess = l->bar.endBuffer; + printf("Accessed value 3 is %f\n", fAccess); + + return 0; +} + +// CHECK: Accessed value 1 is 200 +// CHECK: Accessed value 2 is 400 +// CHECK: Accessed value 3 is 413.0 From 3318a7248ae464af0abd0bea5515fa58c962b890 Mon Sep 17 00:00:00 2001 From: goldsteinn <35538541+goldsteinn@users.noreply.github.com> Date: Mon, 13 Jan 2025 09:38:09 -0600 Subject: [PATCH 290/408] [InstCombine] Fold `(ct{t,l}z Pow2)` -> `Log2(Pow2)` (#122620) - **[InstCombine] Add tests for folding `(ct{t,l}z Pow2)`; NFC** - **[InstCombine] Fold `(ct{t,l}z Pow2)` -> `Log2(Pow2)`** Do so we can find `Log2(Pow2)` for "free" with `takeLog2` https://alive2.llvm.org/ce/z/CL77fo --- .../InstCombine/InstCombineCalls.cpp | 13 +++ llvm/test/Transforms/InstCombine/cttz.ll | 93 +++++++++++++++++++ 2 files changed, 106 insertions(+) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 7454382412369..dd5a4ba5a4724 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -588,6 +588,19 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) { } } + // cttz(Pow2) -> Log2(Pow2) + // ctlz(Pow2) -> BitWidth - 1 - Log2(Pow2) + if (auto *R = IC.tryGetLog2(Op0, match(Op1, m_One()))) { + if (IsTZ) + return IC.replaceInstUsesWith(II, R); + BinaryOperator *BO = BinaryOperator::CreateSub( + ConstantInt::get(R->getType(), R->getType()->getScalarSizeInBits() - 1), + R); + BO->setHasNoSignedWrap(); + BO->setHasNoUnsignedWrap(); + return BO; + } + KnownBits Known = IC.computeKnownBits(Op0, 0, &II); // Create a mask for bits above (ctlz) or below (cttz) the first known one. 
diff --git a/llvm/test/Transforms/InstCombine/cttz.ll b/llvm/test/Transforms/InstCombine/cttz.ll index cb0bc59ae7995..829213b24e93e 100644 --- a/llvm/test/Transforms/InstCombine/cttz.ll +++ b/llvm/test/Transforms/InstCombine/cttz.ll @@ -297,3 +297,96 @@ define i16 @cttz_assume(i16 %x) { %cttz = call i16 @llvm.cttz.i16(i16 %x, i1 false) ret i16 %cttz } + + +declare void @use.i8(i8) +define i8 @fold_ctz_log2(i8 %x) { +; CHECK-LABEL: @fold_ctz_log2( +; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.umin.i8(i8 [[X:%.*]], i8 5) +; CHECK-NEXT: ret i8 [[R]] +; + %p2 = shl i8 1, %x + %v = call i8 @llvm.umin(i8 %p2, i8 32) + %r = call i8 @llvm.cttz(i8 %v, i1 false) + ret i8 %r +} + +define i9 @fold_ctz_log2_i9_okay(i9 %x) { +; CHECK-LABEL: @fold_ctz_log2_i9_okay( +; CHECK-NEXT: [[R:%.*]] = call i9 @llvm.umin.i9(i9 [[X:%.*]], i9 5) +; CHECK-NEXT: ret i9 [[R]] +; + %p2 = shl i9 1, %x + %v = call i9 @llvm.umin(i9 %p2, i9 32) + %r = call i9 @llvm.cttz(i9 %v, i1 false) + ret i9 %r +} + +define i8 @fold_ctz_log2_maybe_z(i8 %x, i8 %y, i1 %c) { +; CHECK-LABEL: @fold_ctz_log2_maybe_z( +; CHECK-NEXT: [[V:%.*]] = shl i8 2, [[V_V:%.*]] +; CHECK-NEXT: [[P2_2:%.*]] = shl i8 4, [[Y:%.*]] +; CHECK-NEXT: [[V1:%.*]] = select i1 [[C:%.*]], i8 [[V]], i8 [[P2_2]] +; CHECK-NEXT: [[R:%.*]] = call range(i8 1, 9) i8 @llvm.cttz.i8(i8 [[V1]], i1 false) +; CHECK-NEXT: ret i8 [[R]] +; + %p2 = shl i8 2, %x + %p2_2 = shl i8 4, %y + %v = select i1 %c, i8 %p2, i8 %p2_2 + %r = call i8 @llvm.cttz(i8 %v, i1 false) + ret i8 %r +} + +define i8 @fold_ctz_log2_maybe_z_okay(i8 %x, i8 %y, i1 %c) { +; CHECK-LABEL: @fold_ctz_log2_maybe_z_okay( +; CHECK-NEXT: [[X:%.*]] = add i8 [[X1:%.*]], 1 +; CHECK-NEXT: [[Y:%.*]] = add i8 [[Y1:%.*]], 2 +; CHECK-NEXT: [[V_V:%.*]] = select i1 [[C:%.*]], i8 [[X]], i8 [[Y]] +; CHECK-NEXT: ret i8 [[V_V]] +; + %p2 = shl i8 2, %x + %p2_2 = shl i8 4, %y + %v = select i1 %c, i8 %p2, i8 %p2_2 + %r = call i8 @llvm.cttz(i8 %v, i1 true) + ret i8 %r +} + +define i8 @fold_clz_log2(i8 %x) { +; CHECK-LABEL: @fold_clz_log2( +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.umin.i8(i8 [[X:%.*]], i8 5) +; CHECK-NEXT: [[R:%.*]] = xor i8 [[TMP1]], 7 +; CHECK-NEXT: ret i8 [[R]] +; + %p2 = shl i8 1, %x + %v = call i8 @llvm.umin(i8 %p2, i8 32) + %r = call i8 @llvm.ctlz(i8 %v, i1 false) + ret i8 %r +} + +define i8 @fold_clz_log2_multiuse_fail(i8 %x) { +; CHECK-LABEL: @fold_clz_log2_multiuse_fail( +; CHECK-NEXT: [[P2:%.*]] = shl nuw i8 2, [[X:%.*]] +; CHECK-NEXT: [[V:%.*]] = call i8 @llvm.umin.i8(i8 [[P2]], i8 32) +; CHECK-NEXT: call void @use.i8(i8 [[V]]) +; CHECK-NEXT: [[R:%.*]] = call range(i8 2, 9) i8 @llvm.ctlz.i8(i8 [[V]], i1 true) +; CHECK-NEXT: ret i8 [[R]] +; + %p2 = shl nuw i8 2, %x + %v = call i8 @llvm.umin(i8 %p2, i8 32) + call void @use.i8(i8 %v) + %r = call i8 @llvm.ctlz(i8 %v, i1 true) + ret i8 %r +} + + +define i9 @fold_clz_log2_i9(i9 %x) { +; CHECK-LABEL: @fold_clz_log2_i9( +; CHECK-NEXT: [[TMP1:%.*]] = call i9 @llvm.umin.i9(i9 [[X:%.*]], i9 5) +; CHECK-NEXT: [[R:%.*]] = sub nuw nsw i9 8, [[TMP1]] +; CHECK-NEXT: ret i9 [[R]] +; + %p2 = shl i9 1, %x + %v = call i9 @llvm.umin(i9 %p2, i9 32) + %r = call i9 @llvm.ctlz(i9 %v, i1 true) + ret i9 %r +} From e2c49a45da31522d91e2e7b12bbc0901b0519384 Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Mon, 13 Jan 2025 10:49:25 -0500 Subject: [PATCH 291/408] [mlir python] Add locking around PyMlirContext::liveOperations. 
(#122720) In JAX, I observed a race between two PyOperation destructors from different threads updating the same `liveOperations` map, despite not intentionally sharing the context between different threads. Since I don't think we can be completely sure when GC happens and on which thread, it seems safest simply to add locking here. We may also want to explicitly support sharing a context between threads in the future, which would require this change or something similar. --- mlir/lib/Bindings/Python/IRCore.cpp | 43 +++++++++++++++++++++-------- mlir/lib/Bindings/Python/IRModule.h | 3 ++ 2 files changed, 34 insertions(+), 12 deletions(-) diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp index 463ebdebb3f3f..53806ca9f04a4 100644 --- a/mlir/lib/Bindings/Python/IRCore.cpp +++ b/mlir/lib/Bindings/Python/IRCore.cpp @@ -677,29 +677,44 @@ size_t PyMlirContext::getLiveCount() { return getLiveContexts().size(); } -size_t PyMlirContext::getLiveOperationCount() { return liveOperations.size(); } +size_t PyMlirContext::getLiveOperationCount() { + nb::ft_lock_guard lock(liveOperationsMutex); + return liveOperations.size(); +} std::vector PyMlirContext::getLiveOperationObjects() { std::vector liveObjects; + nb::ft_lock_guard lock(liveOperationsMutex); for (auto &entry : liveOperations) liveObjects.push_back(entry.second.second); return liveObjects; } size_t PyMlirContext::clearLiveOperations() { - for (auto &op : liveOperations) + + LiveOperationMap operations; + { + nb::ft_lock_guard lock(liveOperationsMutex); + std::swap(operations, liveOperations); + } + for (auto &op : operations) op.second.second->setInvalid(); - size_t numInvalidated = liveOperations.size(); - liveOperations.clear(); + size_t numInvalidated = operations.size(); return numInvalidated; } void PyMlirContext::clearOperation(MlirOperation op) { - auto it = liveOperations.find(op.ptr); - if (it != liveOperations.end()) { - it->second.second->setInvalid(); + PyOperation *py_op; + { + nb::ft_lock_guard lock(liveOperationsMutex); + auto it = liveOperations.find(op.ptr); + if (it == liveOperations.end()) { + return; + } + py_op = it->second.second; liveOperations.erase(it); } + py_op->setInvalid(); } void PyMlirContext::clearOperationsInside(PyOperationBase &op) { @@ -1183,7 +1198,6 @@ PyOperation::~PyOperation() { PyOperationRef PyOperation::createInstance(PyMlirContextRef contextRef, MlirOperation operation, nb::object parentKeepAlive) { - auto &liveOperations = contextRef->liveOperations; // Create. PyOperation *unownedOperation = new PyOperation(std::move(contextRef), operation); @@ -1195,19 +1209,22 @@ PyOperationRef PyOperation::createInstance(PyMlirContextRef contextRef, if (parentKeepAlive) { unownedOperation->parentKeepAlive = std::move(parentKeepAlive); } - liveOperations[operation.ptr] = std::make_pair(pyRef, unownedOperation); return PyOperationRef(unownedOperation, std::move(pyRef)); } PyOperationRef PyOperation::forOperation(PyMlirContextRef contextRef, MlirOperation operation, nb::object parentKeepAlive) { + nb::ft_lock_guard lock(contextRef->liveOperationsMutex); auto &liveOperations = contextRef->liveOperations; auto it = liveOperations.find(operation.ptr); if (it == liveOperations.end()) { // Create. 
-    return createInstance(std::move(contextRef), operation,
-                          std::move(parentKeepAlive));
+    PyOperationRef result = createInstance(std::move(contextRef), operation,
+                                           std::move(parentKeepAlive));
+    liveOperations[operation.ptr] =
+        std::make_pair(result.getObject(), result.get());
+    return result;
   }
   // Use existing.
   PyOperation *existing = it->second.second;
@@ -1218,13 +1235,15 @@ PyOperationRef PyOperation::forOperation(PyMlirContextRef contextRef,
 PyOperationRef PyOperation::createDetached(PyMlirContextRef contextRef,
                                            MlirOperation operation,
                                            nb::object parentKeepAlive) {
+  nb::ft_lock_guard lock(contextRef->liveOperationsMutex);
   auto &liveOperations = contextRef->liveOperations;
   assert(liveOperations.count(operation.ptr) == 0 &&
          "cannot create detached operation that already exists");
   (void)liveOperations;
-
   PyOperationRef created = createInstance(std::move(contextRef), operation,
                                           std::move(parentKeepAlive));
+  liveOperations[operation.ptr] =
+      std::make_pair(created.getObject(), created.get());
   created->attached = false;
   return created;
 }
diff --git a/mlir/lib/Bindings/Python/IRModule.h b/mlir/lib/Bindings/Python/IRModule.h
index f5fbb6c61b57e..d1fb4308dbb77 100644
--- a/mlir/lib/Bindings/Python/IRModule.h
+++ b/mlir/lib/Bindings/Python/IRModule.h
@@ -277,6 +277,9 @@ class PyMlirContext {
   // attempt to access it will raise an error.
   using LiveOperationMap =
       llvm::DenseMap<void *, std::pair<nanobind::handle, PyOperation *>>;
+  nanobind::ft_mutex liveOperationsMutex;
+
+  // Guarded by liveOperationsMutex in free-threading mode.
   LiveOperationMap liveOperations;

   bool emitErrorDiagnostics = false;
From 79e788d02eefdacb08af365389b9055518f3fad6 Mon Sep 17 00:00:00 2001
From: Kelvin Li
Date: Mon, 13 Jan 2025 10:52:09 -0500
Subject: [PATCH 292/408] [flang][AIX] BIND(C) derived type alignment for AIX
 (#121505)

This patch handles the alignment requirement for a `bind(c)` derived type
component that is of real type and larger than 4 bytes. The alignment of
such a component is 4 bytes.
---
 .../flang/Optimizer/CodeGen/TypeConverter.h   |   2 +-
 .../flang/Optimizer/Dialect/FIRTypes.td       |   6 +
 flang/lib/Lower/ConvertType.cpp               |  44 ++++++++
 .../lib/Optimizer/CodeGen/BoxedProcedure.cpp  |   1 +
 flang/lib/Optimizer/CodeGen/TypeConverter.cpp |  10 +-
 flang/lib/Optimizer/Dialect/FIRType.cpp       |  31 +++++-
 flang/lib/Semantics/compute-offsets.cpp       |  88 ++++++++++++++-
 flang/test/Lower/CUDA/cuda-devptr.cuf         |   4 +-
 .../test/Lower/HLFIR/bindc-value-derived.f90  |  18 +--
 flang/test/Lower/OpenMP/copyin.f90            |   2 +-
 flang/test/Lower/derived-types-bindc.f90      |  44 ++++++++
 flang/test/Lower/intentout-deallocate.f90     |  20 ++--
 flang/test/Semantics/offsets04.f90            | 105 ++++++++++++++++++
 13 files changed, 340 insertions(+), 35 deletions(-)
 create mode 100644 flang/test/Lower/derived-types-bindc.f90
 create mode 100644 flang/test/Semantics/offsets04.f90

diff --git a/flang/include/flang/Optimizer/CodeGen/TypeConverter.h b/flang/include/flang/Optimizer/CodeGen/TypeConverter.h
index 7c317ddeea1fa..20270d41b1e9a 100644
--- a/flang/include/flang/Optimizer/CodeGen/TypeConverter.h
+++ b/flang/include/flang/Optimizer/CodeGen/TypeConverter.h
@@ -62,7 +62,7 @@ class LLVMTypeConverter : public mlir::LLVMTypeConverter {

   // fir.type --> llvm<"%name = { ty... }">
   std::optional<mlir::LogicalResult>
   convertRecordType(fir::RecordType derived,
-                    llvm::SmallVectorImpl<mlir::Type> &results);
+                    llvm::SmallVectorImpl<mlir::Type> &results, bool isPacked);

   // Is an extended descriptor needed given the element type of a fir.box type ?
   // Extended descriptors are required for derived types. 
diff --git a/flang/include/flang/Optimizer/Dialect/FIRTypes.td b/flang/include/flang/Optimizer/Dialect/FIRTypes.td index 3919c9191c212..6ae74f16a72d3 100644 --- a/flang/include/flang/Optimizer/Dialect/FIRTypes.td +++ b/flang/include/flang/Optimizer/Dialect/FIRTypes.td @@ -346,6 +346,12 @@ def fir_RecordType : FIR_Type<"Record", "type"> { void finalize(llvm::ArrayRef lenPList, llvm::ArrayRef typeList); + // fir.type is unpacked by default. If the flag is set, the packed fir.type + // is generated and the alignment is enforced by explicit padding by i8 + // array fields. + bool isPacked() const; + void pack(bool); + detail::RecordTypeStorage const *uniqueKey() const; }]; } diff --git a/flang/lib/Lower/ConvertType.cpp b/flang/lib/Lower/ConvertType.cpp index 452ddda426fa1..31b85ef2b5476 100644 --- a/flang/lib/Lower/ConvertType.cpp +++ b/flang/lib/Lower/ConvertType.cpp @@ -20,6 +20,8 @@ #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinTypes.h" #include "llvm/Support/Debug.h" +#include "llvm/TargetParser/Host.h" +#include "llvm/TargetParser/Triple.h" #define DEBUG_TYPE "flang-lower-type" @@ -385,9 +387,20 @@ struct TypeBuilderImpl { // with dozens of components/parents (modern Fortran). derivedTypeInConstruction.try_emplace(&derivedScope, rec); + auto targetTriple{llvm::Triple( + llvm::Triple::normalize(llvm::sys::getDefaultTargetTriple()))}; + // Always generate packed FIR struct type for bind(c) derived type for AIX + if (targetTriple.getOS() == llvm::Triple::OSType::AIX && + tySpec.typeSymbol().attrs().test(Fortran::semantics::Attr::BIND_C) && + !IsIsoCType(&tySpec)) { + rec.pack(true); + } + // Gather the record type fields. // (1) The data components. if (converter.getLoweringOptions().getLowerToHighLevelFIR()) { + size_t prev_offset{0}; + unsigned padCounter{0}; // In HLFIR the parent component is the first fir.type component. for (const auto &componentName : typeSymbol.get() @@ -397,7 +410,38 @@ struct TypeBuilderImpl { "failed to find derived type component symbol"); const Fortran::semantics::Symbol &component = scopeIter->second.get(); mlir::Type ty = genSymbolType(component); + if (rec.isPacked()) { + auto compSize{component.size()}; + auto compOffset{component.offset()}; + + if (prev_offset < compOffset) { + size_t pad{compOffset - prev_offset}; + mlir::Type i8Ty{mlir::IntegerType::get(context, 8)}; + fir::SequenceType::Shape shape{static_cast(pad)}; + mlir::Type padTy{fir::SequenceType::get(shape, i8Ty)}; + prev_offset += pad; + cs.emplace_back("__padding" + std::to_string(padCounter++), padTy); + } + prev_offset += compSize; + } cs.emplace_back(converter.getRecordTypeFieldName(component), ty); + if (rec.isPacked()) { + // For the last component, determine if any padding is needed. 
+ if (componentName == + typeSymbol.get() + .componentNames() + .back()) { + auto compEnd{component.offset() + component.size()}; + if (compEnd < derivedScope.size()) { + size_t pad{derivedScope.size() - compEnd}; + mlir::Type i8Ty{mlir::IntegerType::get(context, 8)}; + fir::SequenceType::Shape shape{static_cast(pad)}; + mlir::Type padTy{fir::SequenceType::get(shape, i8Ty)}; + cs.emplace_back("__padding" + std::to_string(padCounter++), + padTy); + } + } + } } } else { for (const auto &component : diff --git a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp index 104ae7408b80c..ad7272eaa9d3f 100644 --- a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp +++ b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp @@ -167,6 +167,7 @@ class BoxprocTypeRewriter : public mlir::TypeConverter { cs.emplace_back(t.first, t.second); } rec.finalize(ps, cs); + rec.pack(ty.isPacked()); return rec; }); addConversion([&](TypeDescType ty) { diff --git a/flang/lib/Optimizer/CodeGen/TypeConverter.cpp b/flang/lib/Optimizer/CodeGen/TypeConverter.cpp index c23203efcd3df..0eace903720f0 100644 --- a/flang/lib/Optimizer/CodeGen/TypeConverter.cpp +++ b/flang/lib/Optimizer/CodeGen/TypeConverter.cpp @@ -82,7 +82,7 @@ LLVMTypeConverter::LLVMTypeConverter(mlir::ModuleOp module, bool applyTBAA, [&](fir::PointerType pointer) { return convertPointerLike(pointer); }); addConversion( [&](fir::RecordType derived, llvm::SmallVectorImpl &results) { - return convertRecordType(derived, results); + return convertRecordType(derived, results, derived.isPacked()); }); addConversion( [&](fir::ReferenceType ref) { return convertPointerLike(ref); }); @@ -133,8 +133,10 @@ mlir::Type LLVMTypeConverter::indexType() const { } // fir.type --> llvm<"%name = { ty... }"> -std::optional LLVMTypeConverter::convertRecordType( - fir::RecordType derived, llvm::SmallVectorImpl &results) { +std::optional +LLVMTypeConverter::convertRecordType(fir::RecordType derived, + llvm::SmallVectorImpl &results, + bool isPacked) { auto name = fir::NameUniquer::dropTypeConversionMarkers(derived.getName()); auto st = mlir::LLVM::LLVMStructType::getIdentified(&getContext(), name); @@ -156,7 +158,7 @@ std::optional LLVMTypeConverter::convertRecordType( else members.push_back(mlir::cast(convertType(mem.second))); } - if (mlir::failed(st.setBody(members, /*isPacked=*/false))) + if (mlir::failed(st.setBody(members, isPacked))) return mlir::failure(); results.push_back(st); return mlir::success(); diff --git a/flang/lib/Optimizer/Dialect/FIRType.cpp b/flang/lib/Optimizer/Dialect/FIRType.cpp index cba7fa6412850..d25e5651f1142 100644 --- a/flang/lib/Optimizer/Dialect/FIRType.cpp +++ b/flang/lib/Optimizer/Dialect/FIRType.cpp @@ -165,16 +165,20 @@ struct RecordTypeStorage : public mlir::TypeStorage { setTypeList(typeList); } + bool isPacked() const { return packed; } + void pack(bool p) { packed = p; } + protected: std::string name; bool finalized; + bool packed; std::vector lens; std::vector types; private: RecordTypeStorage() = delete; explicit RecordTypeStorage(llvm::StringRef name) - : name{name}, finalized{false} {} + : name{name}, finalized{false}, packed{false} {} }; } // namespace detail @@ -872,9 +876,14 @@ llvm::LogicalResult fir::PointerType::verify( //===----------------------------------------------------------------------===// // Fortran derived type +// unpacked: // `type` `<` name // (`(` id `:` type (`,` id `:` type)* `)`)? // (`{` id `:` type (`,` id `:` type)* `}`)? 
'>' +// packed: +// `type` `<` name +// (`(` id `:` type (`,` id `:` type)* `)`)? +// (`<{` id `:` type (`,` id `:` type)* `}>`)? '>' mlir::Type fir::RecordType::parse(mlir::AsmParser &parser) { llvm::StringRef name; if (parser.parseLess() || parser.parseKeyword(&name)) @@ -900,6 +909,10 @@ mlir::Type fir::RecordType::parse(mlir::AsmParser &parser) { } RecordType::TypeList typeList; + if (!parser.parseOptionalLess()) { + result.pack(true); + } + if (!parser.parseOptionalLBrace()) { while (true) { llvm::StringRef field; @@ -913,8 +926,10 @@ mlir::Type fir::RecordType::parse(mlir::AsmParser &parser) { if (parser.parseOptionalComma()) break; } - if (parser.parseRBrace()) - return {}; + if (parser.parseOptionalGreater()) { + if (parser.parseRBrace()) + return {}; + } } if (parser.parseGreater()) @@ -941,6 +956,9 @@ void fir::RecordType::print(mlir::AsmPrinter &printer) const { printer << ')'; } if (getTypeList().size()) { + if (isPacked()) { + printer << '<'; + } char ch = '{'; for (auto p : getTypeList()) { printer << ch << p.first << ':'; @@ -948,6 +966,9 @@ void fir::RecordType::print(mlir::AsmPrinter &printer) const { ch = ','; } printer << '}'; + if (isPacked()) { + printer << '>'; + } } recordTypeVisited.erase(uniqueKey()); } @@ -973,6 +994,10 @@ RecordType::TypeList fir::RecordType::getLenParamList() const { bool fir::RecordType::isFinalized() const { return getImpl()->isFinalized(); } +void fir::RecordType::pack(bool p) { getImpl()->pack(p); } + +bool fir::RecordType::isPacked() const { return getImpl()->isPacked(); } + detail::RecordTypeStorage const *fir::RecordType::uniqueKey() const { return getImpl(); } diff --git a/flang/lib/Semantics/compute-offsets.cpp b/flang/lib/Semantics/compute-offsets.cpp index 94640fa30baa5..6d4fce2f00a6d 100644 --- a/flang/lib/Semantics/compute-offsets.cpp +++ b/flang/lib/Semantics/compute-offsets.cpp @@ -17,6 +17,8 @@ #include "flang/Semantics/symbol.h" #include "flang/Semantics/tools.h" #include "flang/Semantics/type.h" +#include "llvm/TargetParser/Host.h" +#include "llvm/TargetParser/Triple.h" #include #include @@ -51,9 +53,12 @@ class ComputeOffsetsHelper { SymbolAndOffset Resolve(const SymbolAndOffset &); std::size_t ComputeOffset(const EquivalenceObject &); // Returns amount of padding that was needed for alignment - std::size_t DoSymbol(Symbol &); + std::size_t DoSymbol( + Symbol &, std::optional newAlign = std::nullopt); SizeAndAlignment GetSizeAndAlignment(const Symbol &, bool entire); std::size_t Align(std::size_t, std::size_t); + std::optional CompAlignment(const Symbol &); + std::optional HasSpecialAlign(const Symbol &, Scope &); SemanticsContext &context_; std::size_t offset_{0}; @@ -65,6 +70,69 @@ class ComputeOffsetsHelper { equivalenceBlock_; }; +// This function is only called if the target platform is AIX. +static bool isReal8OrLarger(const Fortran::semantics::DeclTypeSpec *type) { + return ((type->IsNumeric(common::TypeCategory::Real) || + type->IsNumeric(common::TypeCategory::Complex)) && + evaluate::ToInt64(type->numericTypeSpec().kind()) > 4); +} + +// This function is only called if the target platform is AIX. +// It determines the alignment of a component. If the component is a derived +// type, the alignment is computed accordingly. 
+std::optional ComputeOffsetsHelper::CompAlignment(const Symbol &sym) { + size_t max_align{0}; + constexpr size_t fourByteAlign{4}; + bool contain_double{false}; + auto derivedTypeSpec{sym.GetType()->AsDerived()}; + DirectComponentIterator directs{*derivedTypeSpec}; + for (auto it{directs.begin()}; it != directs.end(); ++it) { + auto type{it->GetType()}; + auto s{GetSizeAndAlignment(*it, true)}; + if (isReal8OrLarger(type)) { + max_align = std::max(max_align, fourByteAlign); + contain_double = true; + } else if (type->AsDerived()) { + if (const auto newAlgin{CompAlignment(*it)}) { + max_align = std::max(max_align, s.alignment); + } else { + return std::nullopt; + } + } else { + max_align = std::max(max_align, s.alignment); + } + } + + if (contain_double) { + return max_align; + } else { + return std::nullopt; + } +} + +// This function is only called if the target platform is AIX. +// Special alignment is needed only if it is a bind(c) derived type +// and contain real type components that have larger than 4 bytes. +std::optional ComputeOffsetsHelper::HasSpecialAlign( + const Symbol &sym, Scope &scope) { + // On AIX, if the component that is not the first component and is + // a float of 8 bytes or larger, it has the 4-byte alignment. + // Only set the special alignment for bind(c) derived type on that platform. + if (const auto type{sym.GetType()}) { + auto &symOwner{sym.owner()}; + if (symOwner.symbol() && symOwner.IsDerivedType() && + symOwner.symbol()->attrs().HasAny({semantics::Attr::BIND_C}) && + &sym != &(*scope.GetSymbols().front())) { + if (isReal8OrLarger(type)) { + return 4UL; + } else if (type->AsDerived()) { + return CompAlignment(sym); + } + } + } + return std::nullopt; +} + void ComputeOffsetsHelper::Compute(Scope &scope) { for (Scope &child : scope.children()) { ComputeOffsets(context_, child); @@ -113,7 +181,15 @@ void ComputeOffsetsHelper::Compute(Scope &scope) { if (!FindCommonBlockContaining(*symbol) && dependents_.find(symbol) == dependents_.end() && equivalenceBlock_.find(symbol) == equivalenceBlock_.end()) { - DoSymbol(*symbol); + + std::optional newAlign{std::nullopt}; + // Handle special alignment requirement for AIX + auto triple{llvm::Triple( + llvm::Triple::normalize(llvm::sys::getDefaultTargetTriple()))}; + if (triple.getOS() == llvm::Triple::OSType::AIX) { + newAlign = HasSpecialAlign(*symbol, scope); + } + DoSymbol(*symbol, newAlign); if (auto *generic{symbol->detailsIf()}) { if (Symbol * specific{generic->specific()}; specific && !FindCommonBlockContaining(*specific)) { @@ -313,7 +389,8 @@ std::size_t ComputeOffsetsHelper::ComputeOffset( return result; } -std::size_t ComputeOffsetsHelper::DoSymbol(Symbol &symbol) { +std::size_t ComputeOffsetsHelper::DoSymbol( + Symbol &symbol, std::optional newAlign) { if (!symbol.has() && !symbol.has()) { return 0; } @@ -322,12 +399,13 @@ std::size_t ComputeOffsetsHelper::DoSymbol(Symbol &symbol) { return 0; } std::size_t previousOffset{offset_}; - offset_ = Align(offset_, s.alignment); + size_t alignVal{newAlign.value_or(s.alignment)}; + offset_ = Align(offset_, alignVal); std::size_t padding{offset_ - previousOffset}; symbol.set_size(s.size); symbol.set_offset(offset_); offset_ += s.size; - alignment_ = std::max(alignment_, s.alignment); + alignment_ = std::max(alignment_, alignVal); return padding; } diff --git a/flang/test/Lower/CUDA/cuda-devptr.cuf b/flang/test/Lower/CUDA/cuda-devptr.cuf index 2eac890970d52..561d92ecd3e2e 100644 --- a/flang/test/Lower/CUDA/cuda-devptr.cuf +++ b/flang/test/Lower/CUDA/cuda-devptr.cuf @@ 
-38,8 +38,8 @@ end ! CHECK-LABEL: func.func @_QPsub2() ! CHECK: %[[X:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub2Ex"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) -! CHECK: %[[CPTR:.*]] = fir.field_index cptr, !fir.type<_QM__fortran_builtinsT__builtin_c_devptr{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}> -! CHECK: %[[CPTR_COORD:.*]] = fir.coordinate_of %{{.*}}#1, %[[CPTR]] : (!fir.ref}>>, !fir.field) -> !fir.ref> +! CHECK: %[[CPTR:.*]] = fir.field_index cptr, !fir.type<_QM__fortran_builtinsT__builtin_c_devptr{{[<]?}}{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}{{[>]?}}> +! CHECK: %[[CPTR_COORD:.*]] = fir.coordinate_of %{{.*}}#1, %[[CPTR]] : (!fir.ref}{{[>]?}}>>, !fir.field) -> !fir.ref> ! CHECK: %[[ADDRESS:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}> ! CHECK: %[[ADDRESS_COORD:.*]] = fir.coordinate_of %[[CPTR_COORD]], %[[ADDRESS]] : (!fir.ref>, !fir.field) -> !fir.ref ! CHECK: %[[ADDRESS_LOADED:.*]] = fir.load %[[ADDRESS_COORD]] : !fir.ref diff --git a/flang/test/Lower/HLFIR/bindc-value-derived.f90 b/flang/test/Lower/HLFIR/bindc-value-derived.f90 index 7a2196dfc8bf1..5af9f8edc804c 100644 --- a/flang/test/Lower/HLFIR/bindc-value-derived.f90 +++ b/flang/test/Lower/HLFIR/bindc-value-derived.f90 @@ -14,11 +14,11 @@ subroutine test(x) bind(c) call use_it(x%i) end subroutine ! CHECK-LABEL: func.func @test( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.type<_QMbindc_byvalTt{i:i32}> -! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.type<_QMbindc_byvalTt{i:i32}> -! CHECK: fir.store %[[VAL_0]] to %[[VAL_1]] : !fir.ref> -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs, uniq_name = "_QMbindc_byvalFtestEx"} : (!fir.ref>, !fir.dscope) -> (!fir.ref>, !fir.ref>) -! CHECK: %[[VAL_3:.*]] = hlfir.designate %[[VAL_2]]#0{"i"} : (!fir.ref>) -> !fir.ref +! CHECK-SAME: %[[VAL_0:.*]]: !fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}> +! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}> +! CHECK: fir.store %[[VAL_0]] to %[[VAL_1]] : !fir.ref]?}}>> +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]}} {fortran_attrs = #fir.var_attrs, uniq_name = "_QMbindc_byvalFtestEx"} : (!fir.ref]?}}>>, !fir.dscope) -> (!fir.ref]?}}>>, !fir.ref]?}}>>) +! CHECK: %[[VAL_3:.*]] = hlfir.designate %[[VAL_2]]#0{"i"} : (!fir.ref]?}}>>) -> !fir.ref ! CHECK: fir.call @_QPuse_it(%[[VAL_3]]) fastmath : (!fir.ref) -> () ! CHECK: return ! CHECK: } @@ -28,10 +28,10 @@ subroutine call_it(x) call test(x) end subroutine ! CHECK-LABEL: func.func @_QMbindc_byvalPcall_it( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref> -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMbindc_byvalFcall_itEx"} : (!fir.ref>, !fir.dscope) -> (!fir.ref>, !fir.ref>) -! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#1 : !fir.ref> -! CHECK: fir.call @test(%[[VAL_2]]) proc_attrs fastmath : (!fir.type<_QMbindc_byvalTt{i:i32}>) -> () +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref]?}}>> +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]}} {uniq_name = "_QMbindc_byvalFcall_itEx"} : (!fir.ref]?}}>>, !fir.dscope) -> (!fir.ref]?}}>>, !fir.ref]?}}>>) +! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#1 : !fir.ref]?}}>> +! CHECK: fir.call @test(%[[VAL_2]]) proc_attrs fastmath : (!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>) -> () ! CHECK: return ! 
CHECK: } end module diff --git a/flang/test/Lower/OpenMP/copyin.f90 b/flang/test/Lower/OpenMP/copyin.f90 index f3d147c10668f..9e9ccf8e3d914 100644 --- a/flang/test/Lower/OpenMP/copyin.f90 +++ b/flang/test/Lower/OpenMP/copyin.f90 @@ -86,7 +86,7 @@ subroutine copyin_char_chararray() end ! CHECK-LABEL: func.func @_QPcopyin_derived_type() { -! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFcopyin_derived_typeE.b.my_type.t_arr) : !fir.ref,value:i64}>>> +! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFcopyin_derived_typeE.b.my_type.t_arr) : !fir.ref,value:i64}{{[>]?}}>>> ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_2:.*]] = arith.constant 2 : index ! CHECK: %[[VAL_3:.*]] = arith.constant 0 : index diff --git a/flang/test/Lower/derived-types-bindc.f90 b/flang/test/Lower/derived-types-bindc.f90 new file mode 100644 index 0000000000000..309b2b7f5f492 --- /dev/null +++ b/flang/test/Lower/derived-types-bindc.f90 @@ -0,0 +1,44 @@ +! Test padding for BIND(C) derived types lowering for AIX target +! RUN: %flang_fc1 -emit-llvm %s -o - | FileCheck %s + +! REQUIRES: target={{.+}}-aix{{.*}} + +subroutine s1() + use, intrinsic :: iso_c_binding + type, bind(c) :: t0 + character(c_char) :: x1 + real(c_double) :: x2 + end type + type(t0) :: xt0 +! CHECK-DAG: %_QFs1Tt0 = type <{ [1 x i8], [3 x i8], double }> + + type, bind(c) :: t1 + integer(c_short) :: x1 + real(c_double) :: x2 + end type + type(t1) :: xt1 +! CHECK-DAG: %_QFs1Tt1 = type <{ i16, [2 x i8], double }> + + type, bind(c) :: t2 + integer(c_short) :: x1 + real(c_double) :: x2 + character(c_char) :: x3 + end type + type(t2) :: xt2 +! CHECK-DAG: %_QFs1Tt2 = type <{ i16, [2 x i8], double, [1 x i8], [3 x i8] }> + + type, bind(c) :: t3 + character(c_char) :: x1 + complex(c_double_complex) :: x2 + end type + type(t3) :: xt3 +! CHECK-DAG: %_QFs1Tt3 = type <{ [1 x i8], [3 x i8], { double, double } }> + + type, bind(c) :: t4 + integer(c_short) :: x1 + complex(c_double_complex) :: x2 + character(c_char) :: x3 + end type + type(t4) :: xt4 +! CHECK-DAG: %_QFs1Tt4 = type <{ i16, [2 x i8], { double, double }, [1 x i8], [3 x i8] }> +end subroutine s1 diff --git a/flang/test/Lower/intentout-deallocate.f90 b/flang/test/Lower/intentout-deallocate.f90 index 8e7ccbcc9fdb9..931cf7d48885f 100644 --- a/flang/test/Lower/intentout-deallocate.f90 +++ b/flang/test/Lower/intentout-deallocate.f90 @@ -123,24 +123,24 @@ subroutine sub5(t) ! on the caller side. ! CHECK-LABEL: func.func @_QMmod1Psub4() -! FIR: %[[BOX:.*]] = fir.alloca !fir.box>> {bindc_name = "t", uniq_name = "_QMmod1Fsub4Et"} +! FIR: %[[BOX:.*]] = fir.alloca !fir.box]?}}>>> {bindc_name = "t", uniq_name = "_QMmod1Fsub4Et"} ! HLFIR: %[[BOX:.*]]:2 = hlfir.declare {{.*}}"_QMmod1Fsub4Et" ! CHECK-NOT: fir.call @_FortranAAllocatableDeallocate -! CHECK: fir.call @_QMmod1Psub5(%[[BOX]]{{[#0]*}}) {{.*}}: (!fir.ref>>>) -> () +! CHECK: fir.call @_QMmod1Psub5(%[[BOX]]{{[#0]*}}) {{.*}}: (!fir.ref]?}}>>>>) -> () ! Check deallocation of allocatble intent(out) on the callee side. Deallocation ! is done with a runtime call. ! CHECK-LABEL: func.func @_QMmod1Psub5( -! FIR-SAME: %[[ARG0:.*]]: !fir.ref>>> {fir.bindc_name = "t"}) +! FIR-SAME: %[[ARG0:.*]]: !fir.ref]?}}>>>> {fir.bindc_name = "t"}) ! HLFIR: %[[ARG0:.*]]:2 = hlfir.declare {{.*}}"_QMmod1Fsub5Et" -! CHECK: %[[BOX:.*]] = fir.load %[[ARG0]]{{[#1]*}} : !fir.ref>>> -! CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[BOX]] : (!fir.box>>) -> !fir.heap> -! CHECK: %[[BOX_ADDR_PTR:.*]] = fir.convert %[[BOX_ADDR]] : (!fir.heap>) -> i64 +! 
CHECK: %[[BOX:.*]] = fir.load %[[ARG0]]{{[#1]*}} : !fir.ref]?}}>>>> +! CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[BOX]] : (!fir.box]?}}>>>) -> !fir.heap]?}}>> +! CHECK: %[[BOX_ADDR_PTR:.*]] = fir.convert %[[BOX_ADDR]] : (!fir.heap]?}}>>) -> i64 ! CHECK: %[[C0:.*]] = arith.constant 0 : i64 ! CHECK: %[[IS_ALLOCATED:.*]] = arith.cmpi ne, %[[BOX_ADDR_PTR]], %[[C0]] : i64 ! CHECK: fir.if %[[IS_ALLOCATED]] { -! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[ARG0]]{{[#1]*}} : (!fir.ref>>>) -> !fir.ref> +! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[ARG0]]{{[#1]*}} : (!fir.ref]?}}>>>>) -> !fir.ref> ! CHECK: %{{.*}} = fir.call @_FortranAAllocatableDeallocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 subroutine sub6() @@ -152,11 +152,11 @@ subroutine sub6() ! Deallocation is done with a runtime call. ! CHECK-LABEL: func.func @_QMmod1Psub6() -! FIR: %[[BOX:.*]] = fir.alloca !fir.box>> {bindc_name = "t", uniq_name = "_QMmod1Fsub6Et"} +! FIR: %[[BOX:.*]] = fir.alloca !fir.box]?}}>>> {bindc_name = "t", uniq_name = "_QMmod1Fsub6Et"} ! HLFIR: %[[BOX:.*]]:2 = hlfir.declare {{.*}}"_QMmod1Fsub6Et" -! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[BOX]]{{[#1]*}} : (!fir.ref>>>) -> !fir.ref> +! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[BOX]]{{[#1]*}} : (!fir.ref]?}}>>>>) -> !fir.ref> ! CHECK: %{{.*}} = fir.call @_FortranAAllocatableDeallocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 -! CHECK: fir.call @sub7(%[[BOX]]{{[#0]*}}) {{.*}}: (!fir.ref>>>) -> () +! CHECK: fir.call @sub7(%[[BOX]]{{[#0]*}}) {{.*}}: (!fir.ref]?}}>>>>) -> () subroutine sub8() integer, allocatable :: a(:) diff --git a/flang/test/Semantics/offsets04.f90 b/flang/test/Semantics/offsets04.f90 new file mode 100644 index 0000000000000..d0d871a981c17 --- /dev/null +++ b/flang/test/Semantics/offsets04.f90 @@ -0,0 +1,105 @@ +!RUN: %flang_fc1 -fdebug-dump-symbols %s | FileCheck %s + +!REQUIRES: target={{.+}}-aix{{.*}} + +! 
Size and alignment of bind(c) derived types +subroutine s1() + use, intrinsic :: iso_c_binding + type, bind(c) :: dt1 + character(c_char) :: x1 !CHECK: x1 size=1 offset=0: + real(c_double) :: x2 !CHECK: x2 size=8 offset=4: + end type + type, bind(c) :: dt2 + character(c_char) :: x1(9) !CHECK: x1 size=9 offset=0: + real(c_double) :: x2 !CHECK: x2 size=8 offset=12: + end type + type, bind(c) :: dt3 + integer(c_short) :: x1 !CHECK: x1 size=2 offset=0: + real(c_double) :: x2 !CHECK: x2 size=8 offset=4: + end type + type, bind(c) :: dt4 + integer(c_int) :: x1 !CHECK: x1 size=4 offset=0: + real(c_double) :: x2 !CHECK: x2 size=8 offset=4: + end type + type, bind(c) :: dt5 + real(c_double) :: x1 !CHECK: x1 size=8 offset=0: + real(c_double) :: x2 !CHECK: x2 size=8 offset=8: + end type + type, bind(c) :: dt6 + integer(c_long) :: x1 !CHECK: x1 size=8 offset=0: + character(c_char) :: x2 !CHECK: x2 size=1 offset=8: + real(c_double) :: x3 !CHECK: x3 size=8 offset=12: + end type + type, bind(c) :: dt7 + integer(c_long) :: x1 !CHECK: x1 size=8 offset=0: + integer(c_long) :: x2 !CHECK: x2 size=8 offset=8: + character(c_char) :: x3 !CHECK: x3 size=1 offset=16: + real(c_double) :: x4 !CHECK: x4 size=8 offset=20: + end type + type, bind(c) :: dt8 + character(c_char) :: x1 !CHECK: x1 size=1 offset=0: + complex(c_double_complex) :: x2 !CHECK: x2 size=16 offset=4: + end type +end subroutine + +subroutine s2() + use, intrinsic :: iso_c_binding + type, bind(c) :: dt10 + character(c_char) :: x1 + real(c_double) :: x2 + end type + type, bind(c) :: dt11 + type(dt10) :: y1 !CHECK: y1 size=12 offset=0: + real(c_double) :: y2 !CHECK: y2 size=8 offset=12: + end type + type, bind(c) :: dt12 + character(c_char) :: y1 !CHECK: y1 size=1 offset=0: + type(dt10) :: y2 !CHECK: y2 size=12 offset=4: + character(c_char) :: y3 !CHECK: y3 size=1 offset=16: + end type + type, bind(c) :: dt13 + integer(c_short) :: y1 !CHECK: y1 size=2 offset=0: + type(dt10) :: y2 !CHECK: y2 size=12 offset=4: + character(c_char) :: y3 !CHECK: y3 size=1 offset=16: + end type + + type, bind(c) :: dt20 + character(c_char) :: x1 + integer(c_short) :: x2 + end type + type, bind(c) :: dt21 + real(c_double) :: y1 !CHECK: y1 size=8 offset=0: + type(dt20) :: y2 !CHECK: y2 size=4 offset=8: + real(c_double) :: y3 !CHECK: y3 size=8 offset=12: + end type + + type, bind(c) :: dt30 + character(c_char) :: x1 + character(c_char) :: x2 + end type + type, bind(c) :: dt31 + integer(c_long) :: y1 !CHECK: y1 size=8 offset=0: + type(dt30) :: y2 !CHECK: y2 size=2 offset=8: + real(c_double) :: y3 !CHECK: y3 size=8 offset=12: + end type + + type, bind(c) :: dt40 + integer(c_short) :: x1 + real(c_double) :: x2 + end type + type, bind(c) :: dt41 + real(c_double) :: y1 !CHECK: y1 size=8 offset=0: + type(dt40) :: y2 !CHECK: y2 size=12 offset=8: + real(c_double) :: y3 !CHECK: y3 size=8 offset=20: + end type + + type, bind(c) :: dt50 + integer(c_short) :: x1 + complex(c_double_complex) :: x2 + end type + type, bind(c) :: dt51 + real(c_double) :: y1 !CHECK: y1 size=8 offset=0: + type(dt50) :: y2 !CHECK: y2 size=20 offset=8: + complex(c_double_complex) :: y3 !CHECK: y3 size=16 offset=28: + end type +end subroutine From 051cd36f82ca9c9db599fc0fa782e22645d824c9 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Mon, 13 Jan 2025 15:56:32 +0000 Subject: [PATCH 293/408] [gn build] Port b5ba4f06db2e --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn 
b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
index 83c0811b6814a..74e81f2e98084 100644
--- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
@@ -1462,7 +1462,6 @@ if (current_toolchain == default_toolchain) {
     "__locale_dir/locale_base_api.h",
     "__locale_dir/locale_base_api/android.h",
     "__locale_dir/locale_base_api/bsd_locale_fallbacks.h",
-    "__locale_dir/locale_base_api/fuchsia.h",
     "__locale_dir/locale_base_api/ibm.h",
     "__locale_dir/locale_base_api/musl.h",
     "__locale_dir/locale_base_api/openbsd.h",
@@ -1470,6 +1469,9 @@ if (current_toolchain == default_toolchain) {
     "__locale_dir/support/apple.h",
     "__locale_dir/support/bsd_like.h",
     "__locale_dir/support/freebsd.h",
+    "__locale_dir/support/fuchsia.h",
+    "__locale_dir/support/no_locale/characters.h",
+    "__locale_dir/support/no_locale/strtonum.h",
     "__locale_dir/support/windows.h",
     "__math/abs.h",
     "__math/copysign.h",

From 7059178bd38e770bfadebeadf0811d0d7cb9a9e2 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot
Date: Mon, 13 Jan 2025 15:56:33 +0000
Subject: [PATCH 294/408] [gn build] Port cedb44af53f1

---
 llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
index 74e81f2e98084..639095b698c6f 100644
--- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
@@ -1512,7 +1512,6 @@ if (current_toolchain == default_toolchain) {
     "__memory/array_cookie.h",
     "__memory/assume_aligned.h",
     "__memory/auto_ptr.h",
-    "__memory/builtin_new_allocator.h",
     "__memory/compressed_pair.h",
     "__memory/concepts.h",
     "__memory/construct_at.h",
@@ -1845,6 +1844,7 @@ if (current_toolchain == default_toolchain) {
     "__utility/cmp.h",
     "__utility/convert_to_integral.h",
     "__utility/declval.h",
+    "__utility/element_count.h",
     "__utility/empty.h",
     "__utility/exception_guard.h",
     "__utility/exchange.h",

From 7aebacbee965a83c5cc69f6a723605436651672c Mon Sep 17 00:00:00 2001
From: Philipp Schilk
Date: Mon, 13 Jan 2025 17:05:26 +0100
Subject: [PATCH 295/408] [MLIR][TableGen] Use arg index in InferredResultType
 constructor. (#122717)

Trying to constrain two results to be of the same type using
`AllTypesMatch` would cause `mlir-tblgen` to crash on this assertion[1].

Example:

```tblgen
def OpL5 : NS_Op<"op_with_same_but_unconstraint_results",
    [AllTypesMatch<["result_a", "result_b"]>]> {
  let results = (outs AnyType:$result_a, AnyType:$result_b);
}
```

This is because there was a small bug when constructing the `inferences`
graph from these constraints: the sources should be specified by the
combined arg/result index (in other words, with results negative), not
by the result index.
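A note on the convention involved: the helper used in the fix is
`InferredResultType::unmapResultIndex`, and the sketch below is an assumption
about its shape (the exact formula lives in mlir/TableGen/Operator.h), kept
only to make the "results negative" convention concrete:

```cpp
// Sketch of the combined arg/result index space assumed by the inference
// graph: operation arguments keep their non-negative indices, while result
// i is assumed to be folded to the negative value -1 - i. The same map
// converts in both directions, and the sign distinguishes the two kinds.
struct CombinedIndexSketch {
  static int mapResultIndex(int i) { return -1 - i; }   // result -> combined
  static int unmapResultIndex(int i) { return -1 - i; } // combined -> result
  static bool isArgIndex(int i) { return i >= 0; }
  static bool isResultIndex(int i) { return i < 0; }
};
```

Under this convention, passing a raw result index where a combined index is
expected silently aliases an *argument* index, which is what the assertion
caught.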
[1] https://github.com/llvm/llvm-project/blob/99612a3a18e0c40aac9c52b68e67b106f97ed4fa/mlir/lib/TableGen/Operator.cpp#L526
---
 mlir/lib/TableGen/Operator.cpp     |  4 ++--
 mlir/test/mlir-tblgen/op-result.td | 21 +++++++++++++++++++++
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/TableGen/Operator.cpp b/mlir/lib/TableGen/Operator.cpp
index c360c61afd27b..20a43ef15d09e 100644
--- a/mlir/lib/TableGen/Operator.cpp
+++ b/mlir/lib/TableGen/Operator.cpp
@@ -503,8 +503,8 @@ void Operator::populateTypeInferenceInfo(
         for (int otherResultIndex : resultIndices) {
           if (resultIndex == otherResultIndex)
             continue;
-          inference[resultIndex].sources.emplace_back(otherResultIndex,
-                                                      "$_self");
+          inference[resultIndex].sources.emplace_back(
+              InferredResultType::unmapResultIndex(otherResultIndex), "$_self");
         }
       }
     }
diff --git a/mlir/test/mlir-tblgen/op-result.td b/mlir/test/mlir-tblgen/op-result.td
index 51f8b0671a328..f668d9a5a6644 100644
--- a/mlir/test/mlir-tblgen/op-result.td
+++ b/mlir/test/mlir-tblgen/op-result.td
@@ -180,6 +180,27 @@ def OpL4 : NS_Op<"two_inference_edges", [
 // CHECK: inferredReturnTypes[1] = odsInferredType1
 // CHECK: inferredReturnTypes[2] = odsInferredType2
 
+def OpL5 : NS_Op<"op_with_same_but_unconstraint_results",
+    [AllTypesMatch<["result_a", "result_b"]>]> {
+  let results = (outs AnyType:$result_a, AnyType:$result_b);
+}
+
+// CHECK-NOT: LogicalResult OpL5::inferReturnTypes
+
+def OpL6 : NS_Op<"op_with_same_and_constraint_results",
+    [AllTypesMatch<["result_a", "result_b", "result_c"]>]> {
+  let results = (outs AnyType:$result_a, AnyType:$result_b, I32:$result_c);
+}
+
+// CHECK-LABEL: LogicalResult OpL6::inferReturnTypes
+// CHECK-NOT: }
+// CHECK: odsInferredType0 = odsBuilder.getIntegerType(32);
+// CHECK: odsInferredType1 = odsBuilder.getIntegerType(32);
+// CHECK: odsInferredType2 = odsBuilder.getIntegerType(32);
+// CHECK: inferredReturnTypes[0] = odsInferredType0;
+// CHECK: inferredReturnTypes[1] = odsInferredType1;
+// CHECK: inferredReturnTypes[2] = odsInferredType2;
+
 def OpM : NS_Op<"mix_diff_size_variadic_and_normal_results_op", [AttrSizedResultSegments]> {
   let results = (outs Variadic:$output1, AnyTensor:$output2, Optional:$output3);
 }

From 658ec8593b25f2bd05874deab4582b6759e92e40 Mon Sep 17 00:00:00 2001
From: Vlad Serebrennikov
Date: Mon, 13 Jan 2025 20:10:31 +0400
Subject: [PATCH 296/408] [clang] Add test for CWG170 "Pointer-to-member
 conversions" (#121667)

This patch adds a test for
[CWG170](https://cplusplus.github.io/CWG/issues/170.html). The resolution adds
explicit undefined behavior, so I think the best we can do is to put the test
into the constexpr evaluator. The change to [expr.static.cast] is not tested,
because it was a drive-by fix that removed an impossible case (I confirmed
this using the minutes). The minutes mention several times a comprehensive
paper in this design space which no one seems to remember. I believe it's
[P0149R0](https://wg21.link/p0149r0) "Generalised member pointers".
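For concreteness, a minimal standalone version of the pattern the new test
exercises (same class names as the test; the `D`/`h` case is elided and the
diagnostics are paraphrased):

```cpp
// CWG170: a pointer to member of derived class B may be cast up to a pointer
// to member of its base A, but implicitly converting that pointer down into
// an unrelated branch of the hierarchy is undefined behavior, so the
// constexpr evaluator must reject it.
struct A {};
struct B : A { int i; };
struct C : A {}; // unrelated to B; shares only the base A

constexpr int f(int A::*) { return 0; }
constexpr int g(int C::*) { return 0; }

constexpr auto p = static_cast<int A::*>(&B::i); // OK
constexpr auto q = f(p);                         // OK: used through A
constexpr auto r = g(p);                         // error: the implicit
                                                 // A::* -> C::* conversion is
                                                 // UB, since B::i is not a
                                                 // member of C's subobjects
```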
---
 clang/test/CXX/drs/cwg1xx.cpp | 20 ++++++++++++++++++++
 clang/www/cxx_dr_status.html  |  2 +-
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/clang/test/CXX/drs/cwg1xx.cpp b/clang/test/CXX/drs/cwg1xx.cpp
index 98eb86c929009..15bcc20b7fa2a 100644
--- a/clang/test/CXX/drs/cwg1xx.cpp
+++ b/clang/test/CXX/drs/cwg1xx.cpp
@@ -1076,6 +1076,26 @@ namespace cwg169 { // cwg169: 3.4
   };
 } // namespace cwg169
 
+namespace cwg170 { // cwg170: 3.1
+#if __cplusplus >= 201103L
+struct A {};
+struct B : A { int i; };
+struct C : A {};
+struct D : C {};
+
+constexpr int f(int A::*) { return 0; }
+constexpr int g(int C::*) { return 0; }
+constexpr int h(int D::*) { return 0; }
+
+constexpr auto p = static_cast<int A::*>(&B::i);
+constexpr auto q = f(p);
+constexpr auto r = g(p);
+// since-cxx11-error@-1 {{constexpr variable 'r' must be initialized by a constant expression}}
+constexpr auto s = h(p);
+// since-cxx11-error@-1 {{constexpr variable 's' must be initialized by a constant expression}}
+#endif
+} // namespace cwg170
+
 namespace { // cwg171: 3.4
   int cwg171a;
 }
diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html
index f2716f1e4c653..564502c1f3e92 100755
--- a/clang/www/cxx_dr_status.html
+++ b/clang/www/cxx_dr_status.html
@@ -1065,7 +1065,7 @@

C++ defect report implementation status

 170
 DRWP
 Pointer-to-member conversions
-Unknown
+Clang 3.1
 171

From f1632d25db47629221b8a25d79b7993b397f6886 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra
Date: Mon, 13 Jan 2025 16:20:00 +0000
Subject: [PATCH 297/408] IR: introduce ICmpInst::isImpliedByMatchingCmp
 (#122597)

Create an abstraction over isImplied{True,False}ByMatchingCmp to
faithfully communicate the result of both functions, cleaning up code in
call sites. While at it, fix a bug in the implied-false version of the
function, which was inadvertently dropping samesign information.

---
 llvm/include/llvm/IR/Instructions.h       | 13 ++----
 llvm/include/llvm/SandboxIR/Instruction.h | 10 ++---
 llvm/lib/Analysis/ValueTracking.cpp       | 19 ++-------
 llvm/lib/IR/Instructions.cpp              | 41 +++++++++++--------
 llvm/lib/Transforms/Scalar/NewGVN.cpp     | 16 ++------
 .../implied-condition-samesign.ll          | 37 +++++++++++++++++
 6 files changed, 76 insertions(+), 60 deletions(-)

diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h
index 59eb504098837..9a41971b63373 100644
--- a/llvm/include/llvm/IR/Instructions.h
+++ b/llvm/include/llvm/IR/Instructions.h
@@ -1266,15 +1266,10 @@ class ICmpInst: public CmpInst {
     return getFlippedSignednessPredicate(getPredicate());
   }
 
-  /// Determine if Pred1 implies Pred2 is true when two compares have matching
-  /// operands.
-  static bool isImpliedTrueByMatchingCmp(CmpPredicate Pred1,
-                                         CmpPredicate Pred2);
-
-  /// Determine if Pred1 implies Pred2 is false when two compares have matching
-  /// operands.
-  static bool isImpliedFalseByMatchingCmp(CmpPredicate Pred1,
-                                          CmpPredicate Pred2);
+  /// Determine if Pred1 implies Pred2 is true, false, or if nothing can be
+  /// inferred about the implication, when two compares have matching operands.
+  static std::optional<bool> isImpliedByMatchingCmp(CmpPredicate Pred1,
+                                                    CmpPredicate Pred2);
 
   void setSameSign(bool B = true) {
     SubclassOptionalData = (SubclassOptionalData & ~SameSign) | (B * SameSign);
diff --git a/llvm/include/llvm/SandboxIR/Instruction.h b/llvm/include/llvm/SandboxIR/Instruction.h
index d7c1eda81c006..34a7feb63bec4 100644
--- a/llvm/include/llvm/SandboxIR/Instruction.h
+++ b/llvm/include/llvm/SandboxIR/Instruction.h
@@ -2547,13 +2547,9 @@ class ICmpInst : public CmpInst {
   WRAP_STATIC_PREDICATE(isGE);
   WRAP_STATIC_PREDICATE(isLE);
 
-  static bool isImpliedTrueByMatchingCmp(CmpPredicate Pred1,
-                                         CmpPredicate Pred2) {
-    return llvm::ICmpInst::isImpliedTrueByMatchingCmp(Pred1, Pred2);
-  }
-  static bool isImpliedFalseByMatchingCmp(CmpPredicate Pred1,
-                                          CmpPredicate Pred2) {
-    return llvm::ICmpInst::isImpliedFalseByMatchingCmp(Pred1, Pred2);
+  static std::optional<bool> isImpliedByMatchingCmp(CmpPredicate Pred1,
+                                                    CmpPredicate Pred2) {
+    return llvm::ICmpInst::isImpliedByMatchingCmp(Pred1, Pred2);
   }
 
   static auto predicates() { return llvm::ICmpInst::predicates(); }
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 0e50fc60ce792..d03e6f5a5754d 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -9384,19 +9384,6 @@ isImpliedCondOperands(CmpInst::Predicate Pred, const Value *ALHS,
   }
 }
 
-/// Return true if "icmp1 LPred X, Y" implies "icmp2 RPred X, Y" is true.
-/// Return false if "icmp1 LPred X, Y" implies "icmp2 RPred X, Y" is false.
-/// Otherwise, return std::nullopt if we can't infer anything.
-static std::optional isImpliedCondMatchingOperands(CmpPredicate LPred, - CmpPredicate RPred) { - if (ICmpInst::isImpliedTrueByMatchingCmp(LPred, RPred)) - return true; - if (ICmpInst::isImpliedFalseByMatchingCmp(LPred, RPred)) - return false; - - return std::nullopt; -} - /// Return true if "icmp LPred X, LCR" implies "icmp RPred X, RCR" is true. /// Return false if "icmp LPred X, LCR" implies "icmp RPred X, RCR" is false. /// Otherwise, return std::nullopt if we can't infer anything. @@ -9489,7 +9476,7 @@ isImpliedCondICmps(const ICmpInst *LHS, CmpPredicate RPred, const Value *R0, // Can we infer anything when the two compares have matching operands? if (L0 == R0 && L1 == R1) - return isImpliedCondMatchingOperands(LPred, RPred); + return ICmpInst::isImpliedByMatchingCmp(LPred, RPred); // It only really makes sense in the context of signed comparison for "X - Y // must be positive if X >= Y and no overflow". @@ -9499,7 +9486,7 @@ isImpliedCondICmps(const ICmpInst *LHS, CmpPredicate RPred, const Value *R0, CmpPredicate::getMatching(LPred, ICmpInst::ICMP_SGE)) && match(R0, m_NSWSub(m_Specific(L0), m_Specific(L1)))) { if (match(R1, m_NonPositive()) && - isImpliedCondMatchingOperands(LPred, RPred) == false) + ICmpInst::isImpliedByMatchingCmp(LPred, RPred) == false) return false; } @@ -9509,7 +9496,7 @@ isImpliedCondICmps(const ICmpInst *LHS, CmpPredicate RPred, const Value *R0, CmpPredicate::getMatching(LPred, ICmpInst::ICMP_SLE)) && match(R0, m_NSWSub(m_Specific(L0), m_Specific(L1)))) { if (match(R1, m_NonNegative()) && - isImpliedCondMatchingOperands(LPred, RPred) == true) + ICmpInst::isImpliedByMatchingCmp(LPred, RPred) == true) return true; } diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 49c148bb68a4d..b8b2c1d7f9a85 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -3886,8 +3886,7 @@ bool CmpInst::isFalseWhenEqual(Predicate predicate) { } } -bool ICmpInst::isImpliedTrueByMatchingCmp(CmpPredicate Pred1, - CmpPredicate Pred2) { +static bool isImpliedTrueByMatchingCmp(CmpPredicate Pred1, CmpPredicate Pred2) { // If the predicates match, then we know the first condition implies the // second is true. if (CmpPredicate::getMatching(Pred1, Pred2)) @@ -3901,25 +3900,35 @@ bool ICmpInst::isImpliedTrueByMatchingCmp(CmpPredicate Pred1, switch (Pred1) { default: break; - case ICMP_EQ: + case CmpInst::ICMP_EQ: // A == B implies A >=u B, A <=u B, A >=s B, and A <=s B are true. - return Pred2 == ICMP_UGE || Pred2 == ICMP_ULE || Pred2 == ICMP_SGE || - Pred2 == ICMP_SLE; - case ICMP_UGT: // A >u B implies A != B and A >=u B are true. - return Pred2 == ICMP_NE || Pred2 == ICMP_UGE; - case ICMP_ULT: // A s B implies A != B and A >=s B are true. - return Pred2 == ICMP_NE || Pred2 == ICMP_SGE; - case ICMP_SLT: // A u B implies A != B and A >=u B are true. + return Pred2 == CmpInst::ICMP_NE || Pred2 == CmpInst::ICMP_UGE; + case CmpInst::ICMP_ULT: // A s B implies A != B and A >=s B are true. 
+ return Pred2 == CmpInst::ICMP_NE || Pred2 == CmpInst::ICMP_SGE; + case CmpInst::ICMP_SLT: // A ICmpInst::isImpliedByMatchingCmp(CmpPredicate Pred1, + CmpPredicate Pred2) { + if (isImpliedTrueByMatchingCmp(Pred1, Pred2)) + return true; + if (isImpliedFalseByMatchingCmp(Pred1, Pred2)) + return false; + return std::nullopt; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp index 3812e99508f73..b5ce860d73523 100644 --- a/llvm/lib/Transforms/Scalar/NewGVN.cpp +++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp @@ -1964,18 +1964,10 @@ NewGVN::ExprResult NewGVN::performSymbolicCmpEvaluation(Instruction *I) const { if (PBranch->TrueEdge) { // If we know the previous predicate is true and we are in the true // edge then we may be implied true or false. - if (ICmpInst::isImpliedTrueByMatchingCmp(BranchPredicate, - OurPredicate)) { - return ExprResult::some( - createConstantExpression(ConstantInt::getTrue(CI->getType())), - PI); - } - - if (ICmpInst::isImpliedFalseByMatchingCmp(BranchPredicate, - OurPredicate)) { - return ExprResult::some( - createConstantExpression(ConstantInt::getFalse(CI->getType())), - PI); + if (auto R = ICmpInst::isImpliedByMatchingCmp(BranchPredicate, + OurPredicate)) { + auto *C = ConstantInt::getBool(CI->getType(), *R); + return ExprResult::some(createConstantExpression(C), PI); } } else { // Just handle the ne and eq cases, where if we have the same diff --git a/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll b/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll index 35cfadaa2965a..0e6db403512ae 100644 --- a/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll +++ b/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll @@ -126,6 +126,19 @@ define i1 @sgt_implies_ge_via_assume(i32 %i, i32 %j) { ret i1 %i.ge.j } +define i1 @sgt_implies_false_le_via_assume(i32 %i, i32 %j) { +; CHECK-LABEL: define i1 @sgt_implies_false_le_via_assume( +; CHECK-SAME: i32 [[I:%.*]], i32 [[J:%.*]]) { +; CHECK-NEXT: [[I_SGT_J:%.*]] = icmp sgt i32 [[I]], [[J]] +; CHECK-NEXT: call void @llvm.assume(i1 [[I_SGT_J]]) +; CHECK-NEXT: ret i1 false +; + %i.sgt.j = icmp sgt i32 %i, %j + call void @llvm.assume(i1 %i.sgt.j) + %i.le.j = icmp samesign ule i32 %i, %j + ret i1 %i.le.j +} + define i32 @gt_implies_sge_dominating(i32 %a, i32 %len) { ; CHECK-LABEL: define i32 @gt_implies_sge_dominating( ; CHECK-SAME: i32 [[A:%.*]], i32 [[LEN:%.*]]) { @@ -150,6 +163,30 @@ end: ret i32 -1 } +define i32 @gt_implies_false_sle_dominating(i32 %a, i32 %len) { +; CHECK-LABEL: define i32 @gt_implies_false_sle_dominating( +; CHECK-SAME: i32 [[A:%.*]], i32 [[LEN:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[A_GT_LEN:%.*]] = icmp samesign ugt i32 [[A]], [[LEN]] +; CHECK-NEXT: br i1 [[A_GT_LEN]], label %[[TAKEN:.*]], label %[[END:.*]] +; CHECK: [[TAKEN]]: +; CHECK-NEXT: ret i32 0 +; CHECK: [[END]]: +; CHECK-NEXT: ret i32 -1 +; +entry: + %a.gt.len = icmp samesign ugt i32 %a, %len + br i1 %a.gt.len, label %taken, label %end + +taken: + %a.sle.len = icmp sle i32 %a, %len + %res = select i1 %a.sle.len, i32 30, i32 0 + ret i32 %res + +end: + ret i32 -1 +} + define i32 @gt_implies_sge_dominating_cr(i32 %a, i32 %len) { ; CHECK-LABEL: define i32 @gt_implies_sge_dominating_cr( ; CHECK-SAME: i32 [[A:%.*]], i32 [[LEN:%.*]]) { From 3d507a890540082f5c7fc15a0f9b1dd85be02174 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Mon, 13 Jan 2025 16:31:01 +0000 Subject: [PATCH 
298/408] [llvm][Docs] Add new LLDB Python guidance to release notes (#122719)

As decided in
https://discourse.llvm.org/t/rfc-lets-document-and-enforce-a-minimum-python-version-for-lldb/82731
and implemented by https://github.com/llvm/llvm-project/pull/114807.

---
 llvm/docs/ReleaseNotes.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index d1032138a9db0..a9d9e5fc7ace4 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -379,6 +379,10 @@ Changes to the LLVM tools
 
 Changes to LLDB
 ---------------------------------
 
+* It is now recommended that LLDB be built with Python >= 3.8, but no changes
+  have been made to the supported Python versions. The next release, LLDB 21,
+  will require Python >= 3.8.
+
 * LLDB now supports inline diagnostics for the expression evaluator and command
   line parser.
 
 Old:

From 9256485043fe5cc3a24dba649deef8ae69e6d702 Mon Sep 17 00:00:00 2001
From: CarolineConcatto
Date: Mon, 13 Jan 2025 16:34:33 +0000
Subject: [PATCH 299/408] [Clang][LLVM][AArch64] Add new feature SSVE-BitPerm
 (#121947)

The 2024-12 ISA update release adds a new feature, FEAT_SSVE_BitPerm, which
allows the sve-bitperm instructions to run in streaming mode. It also removes
the requirement of FEAT_SVE2 for FEAT_SVE_BitPerm. The sve2-bitperm feature is
now an alias for sve-bitperm and sve2.

A new feature flag, sve-bitperm, is added to reflect the change: the
instructions under FEAT_SVE_BitPerm are supported either in non-streaming mode
with FEAT_SVE2 and FEAT_SVE_BitPerm, or in streaming mode with FEAT_SME and
FEAT_SSVE_BitPerm.

---
 clang/include/clang/Basic/arm_sve.td          |  2 +-
 clang/lib/Basic/Targets/AArch64.cpp           | 10 +-
 clang/lib/Basic/Targets/AArch64.h             |  2 +-
 clang/test/CodeGen/AArch64/fmv-dependencies.c |  2 +-
 .../AArch64/sve2-intrinsics/acle_sve2_bdep.c  |  8 +-
 .../AArch64/sve2-intrinsics/acle_sve2_bext.c  |  8 +-
 .../AArch64/sve2-intrinsics/acle_sve2_bgrp.c  |  8 +-
 clang/test/CodeGen/AArch64/targetattr.c       |  2 +-
 .../Driver/aarch64-implied-sme-features.c     |  5 +-
 .../Driver/aarch64-implied-sve-features.c     | 14 ++-
 .../print-supported-extensions-aarch64.c      |  4 +-
 .../Preprocessor/aarch64-target-features.c    |  7 +-
 .../acle_sve2_aes_bitperm_sha3_sm4.cpp        | 96 +++++++++----------
 llvm/lib/Target/AArch64/AArch64.td            |  4 +-
 llvm/lib/Target/AArch64/AArch64Features.td    | 11 ++-
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  8 +-
 llvm/lib/Target/AArch64/AArch64Processors.td  | 36 +++----
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td |  4 +-
 .../AArch64/AsmParser/AArch64AsmParser.cpp    |  6 +-
 llvm/lib/TargetParser/AArch64TargetParser.cpp |  5 +
 llvm/test/MC/AArch64/SVE2/bdep-diagnostics.s  |  2 +-
 llvm/test/MC/AArch64/SVE2/bdep.s              | 18 ++--
 llvm/test/MC/AArch64/SVE2/bext.s              | 18 ++--
 llvm/test/MC/AArch64/SVE2/bgrp.s              | 18 ++--
 .../MC/AArch64/SVE2/directive-arch-negative.s | 12 ++-
 llvm/test/MC/AArch64/SVE2/directive-arch.s    |  6 +-
 .../SVE2/directive-arch_extension-negative.s  |  8 +-
 .../AArch64/SVE2/directive-arch_extension.s   |  2 +-
 .../MC/AArch64/SVE2/directive-cpu-negative.s  | 12 ++-
 llvm/test/MC/AArch64/SVE2/directive-cpu.s     |  6 +-
 .../TargetParser/TargetParserTest.cpp         | 40 +++++---
 31 files changed, 232 insertions(+), 152 deletions(-)

diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 1c6bdb8cad2d1..47f1754aeb629 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1988,7 +1988,7 @@ def SVSM4E : SInst<"svsm4e[_{d}]", "ddd", "Ui", MergeNone, "aarch64_sve_sm
: SInst<"svsm4ekey[_{d}]", "ddd", "Ui", MergeNone, "aarch64_sve_sm4ekey", [IsOverloadNone]>; } -let SVETargetGuard = "sve2-bitperm", SMETargetGuard = InvalidMode in { +let SVETargetGuard = "sve2,sve-bitperm", SMETargetGuard = InvalidMode in { def SVBDEP : SInst<"svbdep[_{d}]", "ddd", "UcUsUiUl", MergeNone, "aarch64_sve_bdep_x">; def SVBDEP_N : SInst<"svbdep[_n_{d}]", "dda", "UcUsUiUl", MergeNone, "aarch64_sve_bdep_x">; def SVBEXT : SInst<"svbext[_{d}]", "ddd", "UcUsUiUl", MergeNone, "aarch64_sve_bext_x">; diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index 1bf58661d0efc..4e211deb9faba 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -485,7 +485,7 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts, if (HasSVE2 && HasSVEAES) Builder.defineMacro("__ARM_FEATURE_SVE2_AES", "1"); - if (HasSVE2 && HasSVE2BitPerm) + if (HasSVE2 && HasSVEBitPerm) Builder.defineMacro("__ARM_FEATURE_SVE2_BITPERM", "1"); if (HasSVE2 && HasSVE2SHA3) @@ -769,7 +769,7 @@ bool AArch64TargetInfo::hasFeature(StringRef Feature) const { .Case("f64mm", FPU & SveMode && HasMatmulFP64) .Case("sve2", FPU & SveMode && HasSVE2) .Case("sve-aes", HasSVEAES) - .Case("sve2-bitperm", FPU & SveMode && HasSVE2BitPerm) + .Case("sve-bitperm", FPU & HasSVEBitPerm) .Case("sve2-sha3", FPU & SveMode && HasSVE2SHA3) .Case("sve2-sm4", FPU & SveMode && HasSVE2SM4) .Case("sve2p1", FPU & SveMode && HasSVE2p1) @@ -881,12 +881,10 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector &Features, } if (Feature == "+sve-b16b16") HasSVEB16B16 = true; - if (Feature == "+sve2-bitperm") { + if (Feature == "+sve-bitperm") { FPU |= NeonMode; - FPU |= SveMode; HasFullFP16 = true; - HasSVE2 = true; - HasSVE2BitPerm = true; + HasSVEBitPerm = true; } if (Feature == "+f32mm") { FPU |= NeonMode; diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h index cedf3286806ac..ecf80b23a508c 100644 --- a/clang/lib/Basic/Targets/AArch64.h +++ b/clang/lib/Basic/Targets/AArch64.h @@ -82,7 +82,7 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo { bool HasSVE2SHA3 = false; bool HasSVE2SM4 = false; bool HasSVEB16B16 = false; - bool HasSVE2BitPerm = false; + bool HasSVEBitPerm = false; bool HasMatmulFP64 = false; bool HasMatmulFP32 = false; bool HasLSE = false; diff --git a/clang/test/CodeGen/AArch64/fmv-dependencies.c b/clang/test/CodeGen/AArch64/fmv-dependencies.c index 097b85e989d86..8dda3b647fcd0 100644 --- a/clang/test/CodeGen/AArch64/fmv-dependencies.c +++ b/clang/test/CodeGen/AArch64/fmv-dependencies.c @@ -192,7 +192,7 @@ int caller() { // CHECK: attributes #[[sve]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a" // CHECK: attributes #[[sve2]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+v8a" // CHECK: attributes #[[sve2_aes]] = { {{.*}} "target-features"="+aes,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve-aes,+sve2,+sve2-aes,+v8a" -// CHECK: attributes #[[sve2_bitperm]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+sve2-bitperm,+v8a" +// CHECK: attributes #[[sve2_bitperm]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve-bitperm,+sve2,+sve2-bitperm,+v8a" // CHECK: attributes #[[sve2_sha3]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sha2,+sha3,+sve,+sve2,+sve2-sha3,+v8a" // CHECK: attributes #[[sve2_sm4]] = { {{.*}} 
"target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sm4,+sve,+sve2,+sve2-sm4,+v8a" // CHECK: attributes #[[wfxt]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+v8a,+wfxt" diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bdep.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bdep.c index d7c070d412a8f..d4681394a0508 100644 --- a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bdep.c +++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bdep.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK #include diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bext.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bext.c index 30b798e21f7a1..6d654b9353e7a 100644 --- a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bext.c +++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bext.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 
-target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK #include diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bgrp.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bgrp.c index 58445c6b810c7..a98d8e8a2b37c 100644 --- a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bgrp.c +++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bgrp.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK #include diff --git a/clang/test/CodeGen/AArch64/targetattr.c b/clang/test/CodeGen/AArch64/targetattr.c index ee7a07244ef9a..f8d5f9912c0d7 100644 --- a/clang/test/CodeGen/AArch64/targetattr.c +++ b/clang/test/CodeGen/AArch64/targetattr.c @@ -204,7 +204,7 @@ void applem4() {} // CHECK: attributes #[[ATTR1]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-features"="+crc,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rdm,+sve,+v8.1a,+v8.2a,+v8a" } // CHECK: attributes #[[ATTR2]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rdm,+sve,+sve2,+v8.1a,+v8.2a,+v8a" } // CHECK: attributes #[[ATTR3]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+predres,+ras,+rcpc,+rdm,+sb,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" } -// CHECK: attributes #[[ATTR4]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a710" "target-features"="+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+ete,+flagm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+ssbs,+sve,+sve2,+sve2-bitperm,+trbe,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a" } +// CHECK: attributes #[[ATTR4]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a710" "target-features"="+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+ete,+flagm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+ssbs,+sve,+sve-bitperm,+sve2,+trbe,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a" } // CHECK: attributes #[[ATTR5]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "tune-cpu"="cortex-a710" } // CHECK: attributes #[[ATTR6]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+ete,+fp-armv8,+neon,+trbe,+v8a" } // CHECK: attributes #[[ATTR7]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "tune-cpu"="generic" } diff --git a/clang/test/Driver/aarch64-implied-sme-features.c b/clang/test/Driver/aarch64-implied-sme-features.c index 4d507c0e99dd9..23ec27ff1aaff 100644 --- a/clang/test/Driver/aarch64-implied-sme-features.c +++ b/clang/test/Driver/aarch64-implied-sme-features.c @@ -51,4 +51,7 @@ // SME-SUBFEATURE-CONFLICT-REV: "-target-feature" "+bf16"{{.*}} "-target-feature" "+sme" "-target-feature" "+sme-i16i64" // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+ssve-aes %s -### 2>&1 | FileCheck %s --check-prefix=SVE-AES -// SVE-AES: "-target-feature" "+sme" "-target-feature" "+sme2" "-target-feature" "+ssve-aes" "-target-feature" "+sve-aes" \ No newline at end of file +// SVE-AES: "-target-feature" "+sme" "-target-feature" "+sme2" "-target-feature" "+ssve-aes" "-target-feature" "+sve-aes" + ++// RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+ssve-bitperm %s -### 2>&1 | FileCheck %s --check-prefix=SVE-BITPERM ++// SVE-BITPERM: "-target-feature" "+sme" "-target-feature" "+sme2" "-target-feature" "+ssve-bitperm" "-target-feature" "+sve-bitperm" diff --git a/clang/test/Driver/aarch64-implied-sve-features.c b/clang/test/Driver/aarch64-implied-sve-features.c index e5f1e55345414..ecc1e9500b667 100644 --- a/clang/test/Driver/aarch64-implied-sve-features.c +++ b/clang/test/Driver/aarch64-implied-sve-features.c @@ -23,17 +23,24 @@ // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve+sve2 %s -### 2>&1 | FileCheck %s --check-prefix=SVE-SVE2 // SVE-SVE2: "-target-feature" "+sve" "-target-feature" "+sve2" +// RUN: %clang 
--target=aarch64-linux-gnu -march=armv8-a+sve-bitperm %s -### 2>&1 | FileCheck %s --check-prefix=SVE-BITPERM +// SVE-BITPERM: "-target-feature" "+sve-bitperm" + // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-bitperm %s -### 2>&1 | FileCheck %s --check-prefix=SVE2-BITPERM -// SVE2-BITPERM: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "+sve2-bitperm" +// SVE2-BITPERM: "-target-feature" "+sve" "-target-feature" "+sve-bitperm" "-target-feature" "+sve2" "-target-feature" "+sve2-bitperm" // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+nosve2-bitperm %s -### 2>&1 | FileCheck %s --check-prefix=NOSVE2-BITPERM +// NOSVE2-BITPERM-NOT: "-target-feature" "+sve-bitperm" // NOSVE2-BITPERM-NOT: "-target-feature" "+sve2-bitperm" // NOSVE2-BITPERM-NOT: "-target-feature" "+sve2" // NOSVE2-BITPERM-NOT: "-target-feature" "+sve" // NOSVE2-BITPERM-NOT: sve2-bitperm" +// RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve-bitperm+nosve-bitperm %s -### 2>&1 | FileCheck %s --check-prefix=SVE-BITPERM-REVERT +// SVE-BITPERM-REVERT: "-target-feature" "-sve-bitperm" + // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-bitperm+nosve2-bitperm %s -### 2>&1 | FileCheck %s --check-prefix=SVE2-BITPERM-REVERT -// SVE2-BITPERM-REVERT: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "-sve2-bitperm" +// SVE2-BITPERM-REVERT: "-target-feature" "+sve" "-target-feature" "-sve-bitperm" "-target-feature" "-sve2" "-target-feature" "-sve2-bitperm" // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-aes+nosve2-aes %s -### 2>&1 | FileCheck %s --check-prefix=SVE2-AES-REVERT // SVE2-AES-REVERT: "-target-feature" "+sve" "-target-feature" "-sve-aes" "-target-feature" "+sve2" "-target-feature" "-sve2-aes" @@ -57,7 +64,7 @@ // SVE2-SM4: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "+sve2-sm4" // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-bitperm+nosve2-aes %s -### 2>&1 | FileCheck %s --check-prefix=SVE2-SUBFEATURE-MIX -// SVE2-SUBFEATURE-MIX: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "+sve2-bitperm" +// SVE2-SUBFEATURE-MIX: "-target-feature" "+sve" "-target-feature" "+sve-bitperm" "-target-feature" "+sve2" "-target-feature" "+sve2-bitperm" // SVE2-SUBFEATURE-NOT: sve2-aes // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-sm4+nosve2 %s -### 2>&1 | FileCheck %s --check-prefix=SVE2-SUBFEATURE-CONFLICT @@ -72,6 +79,7 @@ // SVE-SUBFEATURE-CONFLICT-REV: "-target-feature" "+sve" "-target-feature" "+sve-aes" "-target-feature" "+sve2" "-target-feature" "+sve2-aes" // RUN: %clang --target=aarch64-linux-gnu -mcpu=neoverse-n2+nosve2 %s -### 2>&1 | FileCheck %s --check-prefix=SVE-MCPU-FEATURES +// SVE-MCPU-FEATURES-NOT: "-target-feature" "+sve-bitperm" // SVE-MCPU-FEATURES-NOT: "-target-feature" "+sve2-bitperm" // SVE-MCPU-FEATURES-NOT: "-target-feature" "+sve2" // SVE-MCPU-FEATURES: "-target-feature" "+sve" diff --git a/clang/test/Driver/print-supported-extensions-aarch64.c b/clang/test/Driver/print-supported-extensions-aarch64.c index 09d499548aa56..75aa1a3aeecdd 100644 --- a/clang/test/Driver/print-supported-extensions-aarch64.c +++ b/clang/test/Driver/print-supported-extensions-aarch64.c @@ -78,6 +78,7 @@ // CHECK-NEXT: predres2 FEAT_SPECRES2 Enable Speculation Restriction Instruction // CHECK-NEXT: ssbs FEAT_SSBS, FEAT_SSBS2 Enable Speculative Store Bypass Safe bit // CHECK-NEXT: ssve-aes FEAT_SSVE_AES Enable Armv9.6-A SVE AES support in streaming SVE mode +// CHECK-NEXT: 
ssve-bitperm FEAT_SSVE_BitPerm Enable Armv9.6-A SVE BitPerm support in streaming SVE mode // CHECK-NEXT: ssve-fp8dot2 FEAT_SSVE_FP8DOT2 Enable SVE2 FP8 2-way dot product instructions // CHECK-NEXT: ssve-fp8dot4 FEAT_SSVE_FP8DOT4 Enable SVE2 FP8 4-way dot product instructions // CHECK-NEXT: ssve-fp8fma FEAT_SSVE_FP8FMA Enable SVE2 FP8 multiply-add instructions @@ -86,10 +87,11 @@ // CHECK-NEXT: sve-aes2 FEAT_SVE_AES2 Enable Armv9.6-A SVE multi-vector AES and multi-vector quadword polynomial multiply instructions // CHECK-NEXT: sve-b16b16 FEAT_SVE_B16B16 Enable SVE2 non-widening and SME2 Z-targeting non-widening BFloat16 instructions // CHECK-NEXT: sve-bfscale FEAT_SVE_BFSCALE Enable Armv9.6-A SVE BFloat16 scaling instructions +// CHECK-NEXT: sve-bitperm FEAT_SVE_BitPerm Enable bit permutation SVE2 instructions // CHECK-NEXT: sve-f16f32mm FEAT_SVE_F16F32MM Enable Armv9.6-A FP16 to FP32 Matrix Multiply // CHECK-NEXT: sve2 FEAT_SVE2 Enable Scalable Vector Extension 2 (SVE2) instructions // CHECK-NEXT: sve2-aes Shorthand for +sve2+sve-aes -// CHECK-NEXT: sve2-bitperm FEAT_SVE_BitPerm Enable bit permutation SVE2 instructions +// CHECK-NEXT: sve2-bitperm Shorthand for +sve2+sve-bitperm // CHECK-NEXT: sve2-sha3 FEAT_SVE_SHA3 Enable SHA3 SVE2 instructions // CHECK-NEXT: sve2-sm4 FEAT_SVE_SM4 Enable SM4 SVE2 instructions // CHECK-NEXT: sve2p1 FEAT_SVE2p1 Enable Scalable Vector Extension 2.1 instructions diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c index 86265f630296c..b10c55447d9af 100644 --- a/clang/test/Preprocessor/aarch64-target-features.c +++ b/clang/test/Preprocessor/aarch64-target-features.c @@ -246,7 +246,12 @@ // CHECK-SVE2SHA3: __ARM_FEATURE_SVE2_SHA3 1 // RUN: %clang -target aarch64-none-linux-gnu -march=armv9-a+sve2-sm4 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2SM4 %s // CHECK-SVE2SM4: __ARM_FEATURE_SVE2_SM4 1 -// RUN: %clang -target aarch64-none-linux-gnu -march=armv9-a+sve2-bitperm -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2BITPERM %s +// RUN: %clang -target aarch64-none-linux-gnu -march=armv9-a+sve-bitperm -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVEBITPERM %s +// CHECK-SVEBITPERM: __ARM_FEATURE_SVE2_BITPERM 1 + +// RUN: %clang -target aarch64-none-linux-gnu -march=armv8-a+sve2-bitperm -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2BITPERM %s +// RUN: %clang -target aarch64-none-linux-gnu -march=armv8-a+sve-bitperm+sve2 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2BITPERM %s +// CHECK-SVE2BITPERM: __ARM_FEATURE_SVE2 1 // CHECK-SVE2BITPERM: __ARM_FEATURE_SVE2_BITPERM 1 // RUN: %clang -target aarch64-none-linux-gnu -march=armv9-a+sve2p1 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2p1 %s diff --git a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_aes_bitperm_sha3_sm4.cpp b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_aes_bitperm_sha3_sm4.cpp index 93d4b00701693..985ea15ac2a4e 100644 --- a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_aes_bitperm_sha3_sm4.cpp +++ b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_aes_bitperm_sha3_sm4.cpp @@ -26,61 +26,61 @@ void test(uint8_t u8, uint16_t u16, uint32_t u32, uint64_t u64) // expected-error@+2 {{'svaesmc_u8' needs target feature sve,sve2,sve-aes}} // overload-error@+1 {{'svaesmc' needs target feature sve,sve2,sve-aes}} SVE_ACLE_FUNC(svaesmc,_u8,,)(svundef_u8()); - // expected-error@+2 {{'svbdep_u8' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbdep' needs target 
feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbdep_u8' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbdep' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbdep,_u8,,)(svundef_u8(), svundef_u8()); - // expected-error@+2 {{'svbdep_n_u8' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbdep' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbdep_n_u8' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbdep' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbdep,_n_u8,,)(svundef_u8(), u8); - // expected-error@+2 {{'svbext_u8' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbext' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbext_u8' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbext' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbext,_u8,,)(svundef_u8(), svundef_u8()); - // expected-error@+2 {{'svbext_n_u8' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbext' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbext_n_u8' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbext' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbext,_n_u8,,)(svundef_u8(), u8); - // expected-error@+2 {{'svbgrp_u8' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbgrp' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbgrp_u8' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbgrp' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbgrp,_u8,,)(svundef_u8(), svundef_u8()); - // expected-error@+2 {{'svbgrp_n_u8' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbgrp' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbgrp_n_u8' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbgrp' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbgrp,_n_u8,,)(svundef_u8(), u8); - // expected-error@+2 {{'svbdep_u16' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbdep' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbdep_u16' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbdep' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbdep,_u16,,)(svundef_u16(), svundef_u16()); - // expected-error@+2 {{'svbdep_n_u16' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbdep' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbdep_n_u16' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbdep' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbdep,_n_u16,,)(svundef_u16(), u16); - // expected-error@+2 {{'svbext_u16' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbext' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbext_u16' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbext' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbext,_u16,,)(svundef_u16(), svundef_u16()); - // expected-error@+2 {{'svbext_n_u16' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbext' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbext_n_u16' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbext' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbext,_n_u16,,)(svundef_u16(), u16); - // 
expected-error@+2 {{'svbgrp_u16' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbgrp' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbgrp_u16' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbgrp' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbgrp,_u16,,)(svundef_u16(), svundef_u16()); - // expected-error@+2 {{'svbgrp_n_u16' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbgrp' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbgrp_n_u16' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbgrp' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbgrp,_n_u16,,)(svundef_u16(), u16); - // expected-error@+2 {{'svbdep_u32' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbdep' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbdep_u32' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbdep' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbdep,_u32,,)(svundef_u32(), svundef_u32()); - // expected-error@+2 {{'svbdep_n_u32' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbdep' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbdep_n_u32' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbdep' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbdep,_n_u32,,)(svundef_u32(), u32); - // expected-error@+2 {{'svbext_u32' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbext' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbext_u32' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbext' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbext,_u32,,)(svundef_u32(), svundef_u32()); - // expected-error@+2 {{'svbext_n_u32' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbext' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbext_n_u32' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbext' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbext,_n_u32,,)(svundef_u32(), u32); - // expected-error@+2 {{'svbgrp_u32' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbgrp' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbgrp_u32' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbgrp' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbgrp,_u32,,)(svundef_u32(), svundef_u32()); - // expected-error@+2 {{'svbgrp_n_u32' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbgrp' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbgrp_n_u32' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbgrp' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbgrp,_n_u32,,)(svundef_u32(), u32); // expected-error@+2 {{'svsm4e_u32' needs target feature sve,sve2-sm4}} // overload-error@+1 {{'svsm4e' needs target feature sve,sve2-sm4}} @@ -89,23 +89,23 @@ void test(uint8_t u8, uint16_t u16, uint32_t u32, uint64_t u64) // overload-error@+1 {{'svsm4ekey' needs target feature sve,sve2-sm4}} SVE_ACLE_FUNC(svsm4ekey,_u32,,)(svundef_u32(), svundef_u32()); - // expected-error@+2 {{'svbdep_u64' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbdep' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbdep_u64' needs target feature sve,sve2,sve-bitperm}} + // 
overload-error@+1 {{'svbdep' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbdep,_u64,,)(svundef_u64(), svundef_u64()); - // expected-error@+2 {{'svbdep_n_u64' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbdep' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbdep_n_u64' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbdep' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbdep,_n_u64,,)(svundef_u64(), u64); - // expected-error@+2 {{'svbext_u64' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbext' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbext_u64' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbext' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbext,_u64,,)(svundef_u64(), svundef_u64()); - // expected-error@+2 {{'svbext_n_u64' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbext' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbext_n_u64' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbext' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbext,_n_u64,,)(svundef_u64(), u64); - // expected-error@+2 {{'svbgrp_u64' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbgrp' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbgrp_u64' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbgrp' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbgrp,_u64,,)(svundef_u64(), svundef_u64()); - // expected-error@+2 {{'svbgrp_n_u64' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbgrp' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbgrp_n_u64' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbgrp' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbgrp,_n_u64,,)(svundef_u64(), u64); // expected-error@+2 {{'svpmullb_pair_u64' needs target feature sve,sve2,sve-aes}} // overload-error@+1 {{'svpmullb_pair' needs target feature sve,sve2,sve-aes}} diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index e3dd334e7b098..20e77b3be2a27 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -63,7 +63,7 @@ def SVE2p1Unsupported : AArch64Unsupported; def SVE2Unsupported : AArch64Unsupported { let F = !listconcat([HasSVE2, HasSVE2orSME, HasSVE2orSME2, HasSSVE_FP8FMA, HasSMEF8F16, - HasSMEF8F32, HasSVEAES, HasSVE2SHA3, HasSVE2SM4, HasSVE2BitPerm, + HasSMEF8F32, HasSVEAES, HasSVE2SHA3, HasSVE2SM4, HasSVEBitPerm, HasSVEB16B16], SVE2p1Unsupported.F); } @@ -74,7 +74,7 @@ def SVEUnsupported : AArch64Unsupported { } let F = [HasSME2p2, HasSVE2p2orSME2p2, HasNonStreamingSVEorSME2p2, - HasNonStreamingSVE2p2orSME2p2] in + HasNonStreamingSVE2p2orSME2p2, HasNonStreamingSVE2orSSVE_BitPerm] in def SME2p2Unsupported : AArch64Unsupported; def SME2p1Unsupported : AArch64Unsupported { diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td index 5a233e2d870b3..76405750db640 100644 --- a/llvm/lib/Target/AArch64/AArch64Features.td +++ b/llvm/lib/Target/AArch64/AArch64Features.td @@ -376,9 +376,11 @@ def FeatureSVE2SM4 : ExtensionWithMArch<"sve2-sm4", "SVE2SM4", "FEAT_SVE_SM4", def FeatureSVE2SHA3 : ExtensionWithMArch<"sve2-sha3", "SVE2SHA3", "FEAT_SVE_SHA3", "Enable SHA3 SVE2 instructions", [FeatureSVE2, FeatureSHA3]>; -def FeatureSVE2BitPerm : 
ExtensionWithMArch<"sve2-bitperm", "SVE2BitPerm", - "FEAT_SVE_BitPerm", - "Enable bit permutation SVE2 instructions", [FeatureSVE2]>; +def FeatureSVEBitPerm : ExtensionWithMArch<"sve-bitperm", "SVEBitPerm", + "FEAT_SVE_BitPerm", "Enable bit permutation SVE2 instructions">; + +def FeatureAliasSVE2BitPerm : ExtensionWithMArch<"sve2-bitperm", "SVE2BitPerm", + "", "Shorthand for +sve2+sve-bitperm", [FeatureSVE2, FeatureSVEBitPerm]>; def FeatureTRBE : Extension<"trbe", "TRBE", "FEAT_TRBE", "Enable Trace Buffer Extension">; @@ -565,6 +567,9 @@ def FeaturePCDPHINT: ExtensionWithMArch<"pcdphint", "PCDPHINT", "FEAT_PCDPHINT", def FeaturePoPS: ExtensionWithMArch<"pops", "PoPS", "FEAT_PoPS", "Enable Armv9.6-A Point Of Physical Storage (PoPS) DC instructions">; +def FeatureSSVE_BitPerm : ExtensionWithMArch<"ssve-bitperm", "SSVE_BitPerm", "FEAT_SSVE_BitPerm", + "Enable Armv9.6-A SVE BitPerm support in streaming SVE mode", [FeatureSME2, FeatureSVEBitPerm]>; + //===----------------------------------------------------------------------===// // Other Features //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index c6f5cdcd1d5fe..948701f897855 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -155,8 +155,8 @@ def HasSVE2SM4 : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasS AssemblerPredicateWithAll<(all_of FeatureSVE2SM4), "sve2-sm4">; def HasSVE2SHA3 : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2SHA3()">, AssemblerPredicateWithAll<(all_of FeatureSVE2SHA3), "sve2-sha3">; -def HasSVE2BitPerm : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2BitPerm()">, - AssemblerPredicateWithAll<(all_of FeatureSVE2BitPerm), "sve2-bitperm">; +def HasSVEBitPerm : Predicate<"Subtarget->hasSVEBitPerm()">, + AssemblerPredicateWithAll<(all_of FeatureSVEBitPerm), "sve-bitperm">; def HasSMEandIsNonStreamingSafe : Predicate<"Subtarget->hasSME()">, AssemblerPredicateWithAll<(all_of FeatureSME), "sme">; @@ -286,6 +286,10 @@ def HasNonStreamingSVE2p2orSME2p2 "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSME2p2())">, AssemblerPredicateWithAll<(any_of FeatureSVE2p2, FeatureSME2p2), "sme2p2 or sve2p2">; +def HasNonStreamingSVE2orSSVE_BitPerm + : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE2()) ||" + "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSSVE_BitPerm())">, + AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSSVE_BitPerm), "sve2 or ssve-bitperm">; // A subset of NEON instructions are legal in Streaming SVE execution mode, // so don't need the additional check for 'isNeonAvailable'. 
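The feature split above means the bit-permute instructions are now gated on two independent conditions: the `sve-bitperm` feature itself, plus either non-streaming SVE2 or `ssve-bitperm` for streaming mode. As a rough sketch of how the decomposed features are driven in practice (assuming an llvm-mc built with this change, and mirroring the RUN lines this patch adds to the MC tests below), each of these invocations should assemble the bit-permute instructions:

// Hedged sketch; the -mattr strings below are taken from this patch's own tests.
// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2,+sve-bitperm < %s
// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+ssve-bitperm < %s
// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2-bitperm < %s
bdep z0.b, z1.b, z31.b
bext z0.h, z1.h, z31.h
bgrp z0.s, z1.s, z31.s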
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index 2da67126a1753..364ab0d82bf88 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -694,7 +694,7 @@ def ProcessorFeatures { FeatureLSE, FeatureRAS, FeatureRDM]; list A510 = [HasV9_0aOps, FeatureNEON, FeaturePerfMon, FeatureMatMulInt8, FeatureBF16, FeatureAM, - FeatureMTE, FeatureETE, FeatureSVE2BitPerm, + FeatureMTE, FeatureETE, FeatureSVEBitPerm, FeatureFP16FML, FeatureCCIDX, FeatureSB, FeaturePAuth, FeatureSSBS, FeatureSVE, FeatureSVE2, @@ -702,7 +702,7 @@ def ProcessorFeatures { FeatureFPARMv8,FeatureFullFP16, FeatureJS, FeatureLSE, FeatureRAS, FeatureRCPC, FeatureRDM]; list A520 = [HasV9_2aOps, FeaturePerfMon, FeatureAM, - FeatureMTE, FeatureETE, FeatureSVE2BitPerm, + FeatureMTE, FeatureETE, FeatureSVEBitPerm, FeatureFP16FML, FeatureCCIDX, FeatureSB, FeatureSSBS, FeaturePAuth, FeatureFlagM, FeaturePredRes, @@ -711,7 +711,7 @@ def ProcessorFeatures { FeatureNEON, FeatureLSE, FeatureRAS, FeatureRCPC, FeatureRDM, FeatureDotProd]; list A520AE = [HasV9_2aOps, FeaturePerfMon, FeatureAM, - FeatureMTE, FeatureETE, FeatureSVE2BitPerm, + FeatureMTE, FeatureETE, FeatureSVEBitPerm, FeatureFP16FML, FeatureCCIDX, FeatureSB, FeatureSSBS, FeaturePAuth, FeatureFlagM, FeaturePredRes, @@ -747,14 +747,14 @@ def ProcessorFeatures { list A710 = [HasV9_0aOps, FeatureNEON, FeaturePerfMon, FeatureCCIDX, FeatureSSBS, FeatureETE, FeatureMTE, FeatureFP16FML, - FeatureSVE2BitPerm, FeatureBF16, FeatureMatMulInt8, + FeatureSVEBitPerm, FeatureBF16, FeatureMatMulInt8, FeaturePAuth, FeatureFlagM, FeatureSB, FeatureSVE, FeatureSVE2, FeatureComplxNum, FeatureCRC, FeatureDotProd, FeatureFPARMv8, FeatureFullFP16, FeatureJS, FeatureLSE, FeatureRAS, FeatureRCPC, FeatureRDM]; list A715 = [HasV9_0aOps, FeatureNEON, FeatureMTE, FeatureCCIDX, FeatureFP16FML, FeatureSVE, FeatureTRBE, - FeatureSVE2BitPerm, FeatureBF16, FeatureETE, + FeatureSVEBitPerm, FeatureBF16, FeatureETE, FeaturePerfMon, FeatureMatMulInt8, FeatureSPE, FeatureSB, FeatureSSBS, FeatureFullFP16, FeaturePAuth, FeaturePredRes, FeatureFlagM, FeatureSVE2, FeatureComplxNum, FeatureCRC, @@ -763,7 +763,7 @@ def ProcessorFeatures { FeatureRCPC, FeatureRDM]; list A720 = [HasV9_2aOps, FeatureMTE, FeatureFP16FML, FeatureCCIDX, - FeatureTRBE, FeatureSVE2BitPerm, FeatureETE, + FeatureTRBE, FeatureSVEBitPerm, FeatureETE, FeaturePerfMon, FeatureSPE, FeatureSPE_EEF, FeatureSB, FeatureSSBS, FeaturePAuth, FeatureFlagM, FeaturePredRes, FeatureSVE, FeatureSVE2, FeatureBF16, FeatureComplxNum, FeatureCRC, @@ -772,7 +772,7 @@ def ProcessorFeatures { FeatureRCPC, FeatureRDM]; list A720AE = [HasV9_2aOps, FeatureMTE, FeatureFP16FML, FeatureCCIDX, - FeatureTRBE, FeatureSVE2BitPerm, FeatureETE, + FeatureTRBE, FeatureSVEBitPerm, FeatureETE, FeaturePerfMon, FeatureSPE, FeatureSPE_EEF, FeatureSB, FeatureSSBS, FeaturePAuth, FeatureFlagM, FeaturePredRes, FeatureSVE, FeatureSVE2, FeatureBF16, FeatureComplxNum, FeatureCRC, @@ -782,7 +782,7 @@ def ProcessorFeatures { list A725 = [HasV9_2aOps, FeatureMTE, FeatureFP16FML, FeatureCCIDX, FeatureETE, FeaturePerfMon, FeatureSPE, - FeatureSVE2BitPerm, FeatureSPE_EEF, FeatureTRBE, + FeatureSVEBitPerm, FeatureSPE_EEF, FeatureTRBE, FeatureFlagM, FeaturePredRes, FeatureSB, FeatureSSBS, FeatureSVE, FeatureSVE2, FeatureBF16, FeatureComplxNum, FeatureCRC, FeatureDotProd, FeatureFPARMv8, FeatureFullFP16, FeatureMatMulInt8, @@ -814,7 +814,7 @@ def ProcessorFeatures { FeatureRCPC, FeatureCRC, 
FeatureLSE, FeatureRAS, FeatureRDM]; list X2 = [HasV9_0aOps, FeatureNEON, FeaturePerfMon, FeatureMatMulInt8, FeatureBF16, FeatureAM, - FeatureMTE, FeatureETE, FeatureSVE2BitPerm, + FeatureMTE, FeatureETE, FeatureSVEBitPerm, FeatureFP16FML, FeatureCCIDX, FeaturePAuth, FeatureSSBS, FeatureSB, FeatureSVE, FeatureSVE2, FeatureFlagM, @@ -823,7 +823,7 @@ def ProcessorFeatures { list X3 = [HasV9_0aOps, FeatureSVE, FeatureNEON, FeaturePerfMon, FeatureETE, FeatureTRBE, FeatureSPE, FeatureBF16, FeatureMatMulInt8, - FeatureMTE, FeatureSVE2BitPerm, FeatureFullFP16, + FeatureMTE, FeatureSVEBitPerm, FeatureFullFP16, FeatureFP16FML, FeatureCCIDX, FeatureSB, FeaturePAuth, FeaturePredRes, FeatureFlagM, FeatureSSBS, @@ -831,7 +831,7 @@ def ProcessorFeatures { FeatureLSE, FeatureRAS, FeatureRCPC, FeatureRDM, FeatureDotProd]; list X4 = [HasV9_2aOps, FeaturePerfMon, FeatureETE, FeatureTRBE, - FeatureSPE, FeatureMTE, FeatureSVE2BitPerm, + FeatureSPE, FeatureMTE, FeatureSVEBitPerm, FeatureFP16FML, FeatureSPE_EEF, FeatureCCIDX, FeatureSB, FeatureSSBS, FeaturePAuth, FeatureFlagM, FeaturePredRes, @@ -841,7 +841,7 @@ def ProcessorFeatures { list X925 = [HasV9_2aOps, FeatureMTE, FeatureFP16FML, FeatureCCIDX, FeatureETE, FeaturePerfMon, FeatureSPE, - FeatureSVE2BitPerm, FeatureSPE_EEF, FeatureTRBE, + FeatureSVEBitPerm, FeatureSPE_EEF, FeatureTRBE, FeatureFlagM, FeaturePredRes, FeatureSB, FeatureSSBS, FeatureSVE, FeatureSVE2, FeatureBF16, FeatureComplxNum, FeatureCRC, FeatureDotProd, FeatureFPARMv8, FeatureFullFP16, FeatureMatMulInt8, @@ -855,7 +855,7 @@ def ProcessorFeatures { FeatureFPAC, FeatureFP16FML, FeatureRandGen, FeatureSSBS, FeatureLS64, FeatureCLRBHB, FeatureSPECRES2, FeatureSVEAES, FeatureSVE2SM4, - FeatureSVE2SHA3, FeatureSVE2BitPerm, FeatureETE, + FeatureSVE2SHA3, FeatureSVE2, FeatureSVEBitPerm, FeatureETE, FeatureMEC, FeatureFP8DOT2]; list Carmel = [HasV8_2aOps, FeatureNEON, FeatureSHA2, FeatureAES, FeatureFullFP16, FeatureCRC, FeatureLSE, FeatureRAS, FeatureRDM, @@ -942,7 +942,7 @@ def ProcessorFeatures { FeaturePerfMon, FeatureCRC, FeatureLSE, FeatureRAS, FeatureRDM]; list NeoverseN2 = [HasV9_0aOps, FeatureBF16, FeatureETE, FeatureFP16FML, FeatureMatMulInt8, FeatureMTE, FeatureSVE2, - FeatureSVE2BitPerm, FeatureTRBE, + FeatureSVEBitPerm, FeatureTRBE, FeaturePerfMon, FeatureCCIDX, FeatureDotProd, FeatureFullFP16, FeatureSB, FeatureSSBS, FeatureSVE, @@ -951,7 +951,7 @@ def ProcessorFeatures { list NeoverseN3 = [HasV9_2aOps, FeatureETE, FeatureFP16FML, FeatureFullFP16, FeatureMTE, FeaturePerfMon, FeatureRandGen, FeatureSPE, FeatureSPE_EEF, - FeatureSVE2BitPerm, + FeatureSVEBitPerm, FeatureCCIDX, FeatureSSBS, FeatureSB, FeaturePredRes, FeaturePAuth, FeatureFlagM, FeatureSVE, FeatureSVE2, FeatureBF16, FeatureComplxNum, @@ -978,7 +978,7 @@ def ProcessorFeatures { FeatureRCPC, FeatureRDM]; list NeoverseV2 = [HasV9_0aOps, FeatureBF16, FeatureSPE, FeaturePerfMon, FeatureETE, FeatureMatMulInt8, - FeatureNEON, FeatureSVE2BitPerm, FeatureFP16FML, + FeatureNEON, FeatureSVEBitPerm, FeatureFP16FML, FeatureMTE, FeatureRandGen, FeatureCCIDX, FeatureSVE, FeatureSVE2, FeatureSSBS, FeatureFullFP16, FeatureDotProd, @@ -988,7 +988,7 @@ def ProcessorFeatures { FeatureFullFP16, FeatureLS64, FeatureMTE, FeaturePerfMon, FeatureRandGen, FeatureSPE, FeatureCCIDX, - FeatureSPE_EEF, FeatureSVE2BitPerm, FeatureBRBE, + FeatureSPE_EEF, FeatureSVEBitPerm, FeatureBRBE, FeatureSSBS, FeatureSB, FeaturePredRes, FeaturePAuth, FeatureFlagM, FeatureSVE, FeatureSVE2, FeatureBF16, FeatureComplxNum, FeatureCRC, FeatureDotProd, FeatureFPARMv8, 
FeatureMatMulInt8, FeatureJS, FeatureLSE, @@ -996,7 +996,7 @@ def ProcessorFeatures { list NeoverseV3AE = [HasV9_2aOps, FeatureETE, FeatureFP16FML, FeatureFullFP16, FeatureLS64, FeatureMTE, FeaturePerfMon, FeatureRandGen, FeatureSPE, - FeatureSPE_EEF, FeatureSVE2BitPerm, FeatureBRBE, + FeatureSPE_EEF, FeatureSVEBitPerm, FeatureBRBE, FeatureSSBS, FeatureSB, FeaturePredRes, FeaturePAuth, FeatureFlagM, FeatureCCIDX, FeatureSVE, FeatureSVE2, FeatureBF16, FeatureComplxNum, FeatureCRC, diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 7dd6d49bf2022..22715c61126d1 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -3946,12 +3946,12 @@ let Predicates = [HasSVE2SHA3] in { defm RAX1_ZZZ_D : sve2_crypto_cons_bin_op<0b1, "rax1", ZPR64, int_aarch64_sve_rax1, nxv2i64>; } // End HasSVE2SHA3 -let Predicates = [HasSVE2BitPerm] in { +let Predicates = [HasSVEBitPerm, HasNonStreamingSVE2orSSVE_BitPerm] in { // SVE2 bitwise permute defm BEXT_ZZZ : sve2_misc_bitwise<0b1100, "bext", int_aarch64_sve_bext_x>; defm BDEP_ZZZ : sve2_misc_bitwise<0b1101, "bdep", int_aarch64_sve_bdep_x>; defm BGRP_ZZZ : sve2_misc_bitwise<0b1110, "bgrp", int_aarch64_sve_bgrp_x>; -} // End HasSVE2BitPerm +} let Predicates = [HasSVEAES2, HasNonStreamingSVE2p1orSSVE_AES] in { // SVE_AES2 multi-vector instructions (x2) diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index f44afd804c2bd..c37c57590f906 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -3755,7 +3755,10 @@ static const struct Extension { {"sve2-aes", {AArch64::FeatureAliasSVE2AES, AArch64::FeatureSVEAES}}, {"sve2-sm4", {AArch64::FeatureSVE2SM4}}, {"sve2-sha3", {AArch64::FeatureSVE2SHA3}}, - {"sve2-bitperm", {AArch64::FeatureSVE2BitPerm}}, + {"sve-bitperm", {AArch64::FeatureSVEBitPerm}}, + {"sve2-bitperm", + {AArch64::FeatureAliasSVE2BitPerm, AArch64::FeatureSVEBitPerm, + AArch64::FeatureSVE2}}, {"sve2p1", {AArch64::FeatureSVE2p1}}, {"ls64", {AArch64::FeatureLS64}}, {"xs", {AArch64::FeatureXS}}, @@ -3827,6 +3830,7 @@ static const struct Extension { {"lsui", {AArch64::FeatureLSUI}}, {"occmo", {AArch64::FeatureOCCMO}}, {"pcdphint", {AArch64::FeaturePCDPHINT}}, + {"ssve-bitperm", {AArch64::FeatureSSVE_BitPerm}}, }; static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) { diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp index 7d0b8c333f72f..34ca03a47e0a4 100644 --- a/llvm/lib/TargetParser/AArch64TargetParser.cpp +++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp @@ -276,6 +276,11 @@ void AArch64::ExtensionSet::disable(ArchExtKind E) { if (E == AEK_SVE2AES) disable(AEK_SVEAES); + if (E == AEK_SVE2BITPERM){ + disable(AEK_SVEBITPERM); + disable(AEK_SVE2); + } + if (!Enabled.test(E)) return; diff --git a/llvm/test/MC/AArch64/SVE2/bdep-diagnostics.s b/llvm/test/MC/AArch64/SVE2/bdep-diagnostics.s index 08a589e1f963f..9e40830882c87 100644 --- a/llvm/test/MC/AArch64/SVE2/bdep-diagnostics.s +++ b/llvm/test/MC/AArch64/SVE2/bdep-diagnostics.s @@ -1,4 +1,4 @@ -// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2-bitperm 2>&1 < %s| FileCheck %s +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2,+sve-bitperm 2>&1 < %s| FileCheck %s // ------------------------------------------------------------------------- // 
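Note the doubled requirement in the CHECK-ERROR lines that follow: after this change an instruction can be missing both the `sve-bitperm` feature and one of the execution-mode predicates (non-streaming `sve2`, or `ssve-bitperm`), and the diagnostic lists each unmet group. A rough sketch of the resulting `.arch_extension` behaviour, assuming an assembler built with this change (it mirrors the directive tests later in this patch):

.arch_extension sve2-bitperm    // legacy alias: enables sve2 and sve-bitperm
.arch_extension nosve2-bitperm  // clears sve2 and sve-bitperm again
bgrp z21.s, z10.s, z21.s        // rejected: both requirement groups unmet

.arch_extension sve2-bitperm
.arch_extension nosve2          // sve-bitperm survives, non-streaming path lost
bgrp z21.s, z10.s, z21.s        // rejected: needs sve2 or ssve-bitperm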
diff --git a/llvm/test/MC/AArch64/SVE2/bdep.s b/llvm/test/MC/AArch64/SVE2/bdep.s index a6ef95d9f2619..44c848d0b3b59 100644 --- a/llvm/test/MC/AArch64/SVE2/bdep.s +++ b/llvm/test/MC/AArch64/SVE2/bdep.s @@ -1,34 +1,36 @@ -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2-bitperm < %s \ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2,+sve-bitperm < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+ssve-bitperm < %s \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR // RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-bitperm < %s \ -// RUN: | llvm-objdump -d --mattr=+sve2-bitperm - | FileCheck %s --check-prefix=CHECK-INST -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-bitperm < %s \ +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-bitperm < %s \ +// RUN: | llvm-objdump -d --mattr=+sve2,+sve-bitperm - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-bitperm < %s \ // RUN: | llvm-objdump -d --mattr=-sve2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN bdep z0.b, z1.b, z31.b // CHECK-INST: bdep z0.b, z1.b, z31.b // CHECK-ENCODING: [0x20,0xb4,0x1f,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 451fb420 bdep z0.h, z1.h, z31.h // CHECK-INST: bdep z0.h, z1.h, z31.h // CHECK-ENCODING: [0x20,0xb4,0x5f,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 455fb420 bdep z0.s, z1.s, z31.s // CHECK-INST: bdep z0.s, z1.s, z31.s // CHECK-ENCODING: [0x20,0xb4,0x9f,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 459fb420 bdep z0.d, z1.d, z31.d // CHECK-INST: bdep z0.d, z1.d, z31.d // CHECK-ENCODING: [0x20,0xb4,0xdf,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 45dfb420 diff --git a/llvm/test/MC/AArch64/SVE2/bext.s b/llvm/test/MC/AArch64/SVE2/bext.s index 43272205ab897..ea519c22cceb5 100644 --- a/llvm/test/MC/AArch64/SVE2/bext.s +++ b/llvm/test/MC/AArch64/SVE2/bext.s @@ -1,34 +1,36 @@ -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2-bitperm < %s \ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2,+sve-bitperm < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+ssve-bitperm < %s \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR // RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-bitperm < %s \ -// RUN: | llvm-objdump -d --mattr=+sve2-bitperm - | FileCheck %s --check-prefix=CHECK-INST -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-bitperm < %s \ +// RUN: llvm-mc -triple=aarch64 -filetype=obj 
-mattr=+sve2,+sve-bitperm < %s \ +// RUN: | llvm-objdump -d --mattr=+sve2,+sve-bitperm - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-bitperm < %s \ // RUN: | llvm-objdump -d --mattr=-sve2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN bext z0.b, z1.b, z31.b // CHECK-INST: bext z0.b, z1.b, z31.b // CHECK-ENCODING: [0x20,0xb0,0x1f,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 451fb020 bext z0.h, z1.h, z31.h // CHECK-INST: bext z0.h, z1.h, z31.h // CHECK-ENCODING: [0x20,0xb0,0x5f,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 455fb020 bext z0.s, z1.s, z31.s // CHECK-INST: bext z0.s, z1.s, z31.s // CHECK-ENCODING: [0x20,0xb0,0x9f,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 459fb020 bext z0.d, z1.d, z31.d // CHECK-INST: bext z0.d, z1.d, z31.d // CHECK-ENCODING: [0x20,0xb0,0xdf,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 45dfb020 diff --git a/llvm/test/MC/AArch64/SVE2/bgrp.s b/llvm/test/MC/AArch64/SVE2/bgrp.s index fb96946dc3c53..eb58d13511583 100644 --- a/llvm/test/MC/AArch64/SVE2/bgrp.s +++ b/llvm/test/MC/AArch64/SVE2/bgrp.s @@ -1,34 +1,36 @@ -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2-bitperm < %s \ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2,+sve-bitperm < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+ssve-bitperm < %s \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR // RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-bitperm < %s \ -// RUN: | llvm-objdump -d --mattr=+sve2-bitperm - | FileCheck %s --check-prefix=CHECK-INST -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-bitperm < %s \ +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-bitperm < %s \ +// RUN: | llvm-objdump -d --mattr=+sve2,+sve-bitperm - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-bitperm < %s \ // RUN: | llvm-objdump -d --mattr=-sve2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN bgrp z0.b, z1.b, z31.b // CHECK-INST: bgrp z0.b, z1.b, z31.b // CHECK-ENCODING: [0x20,0xb8,0x1f,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 451fb820 bgrp z0.h, z1.h, z31.h // CHECK-INST: bgrp z0.h, z1.h, z31.h // CHECK-ENCODING: [0x20,0xb8,0x5f,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 455fb820 bgrp z0.s, z1.s, z31.s // CHECK-INST: bgrp z0.s, z1.s, z31.s // CHECK-ENCODING: [0x20,0xb8,0x9f,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 459fb820 bgrp z0.d, z1.d, z31.d // CHECK-INST: bgrp z0.d, z1.d, z31.d // CHECK-ENCODING: 
[0x20,0xb8,0xdf,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 45dfb820 diff --git a/llvm/test/MC/AArch64/SVE2/directive-arch-negative.s b/llvm/test/MC/AArch64/SVE2/directive-arch-negative.s index 090d8af85825a..2cfce3b232ffc 100644 --- a/llvm/test/MC/AArch64/SVE2/directive-arch-negative.s +++ b/llvm/test/MC/AArch64/SVE2/directive-arch-negative.s @@ -29,10 +29,16 @@ rax1 z0.d, z0.d, z0.d // CHECK: error: instruction requires: sve2-sha3 // CHECK-NEXT: rax1 z0.d, z0.d, z0.d -.arch armv9-a+sve2-bitperm -.arch armv9-a+nosve2-bitperm +.arch armv9-a+ssve-bitperm +.arch armv9-a+nossve-bitperm bgrp z21.s, z10.s, z21.s -// CHECK: error: instruction requires: sve2-bitperm +// CHECK: error: instruction requires: sve-bitperm +// CHECK-NEXT: bgrp z21.s, z10.s, z21.s + +.arch armv9-a+sve2+sve-bitperm +.arch armv9-a+sve2+nosve-bitperm +bgrp z21.s, z10.s, z21.s +// CHECK: error: instruction requires: sve-bitperm // CHECK-NEXT: bgrp z21.s, z10.s, z21.s .arch armv9-a+f8f16mm diff --git a/llvm/test/MC/AArch64/SVE2/directive-arch.s b/llvm/test/MC/AArch64/SVE2/directive-arch.s index 1319a8a186971..203541a09ad37 100644 --- a/llvm/test/MC/AArch64/SVE2/directive-arch.s +++ b/llvm/test/MC/AArch64/SVE2/directive-arch.s @@ -20,7 +20,11 @@ sm4e z0.s, z0.s, z0.s rax1 z0.d, z0.d, z0.d // CHECK: rax1 z0.d, z0.d, z0.d -.arch armv9-a+sve2-bitperm +.arch armv9-a+sve2+sve-bitperm +bgrp z21.s, z10.s, z21.s +// CHECK: bgrp z21.s, z10.s, z21.s + +.arch armv9-a+ssve-bitperm bgrp z21.s, z10.s, z21.s // CHECK: bgrp z21.s, z10.s, z21.s diff --git a/llvm/test/MC/AArch64/SVE2/directive-arch_extension-negative.s b/llvm/test/MC/AArch64/SVE2/directive-arch_extension-negative.s index 2eb22ebf7428c..2fab61597576f 100644 --- a/llvm/test/MC/AArch64/SVE2/directive-arch_extension-negative.s +++ b/llvm/test/MC/AArch64/SVE2/directive-arch_extension-negative.s @@ -34,7 +34,13 @@ rax1 z0.d, z0.d, z0.d .arch_extension sve2-bitperm .arch_extension nosve2-bitperm bgrp z21.s, z10.s, z21.s -// CHECK: error: instruction requires: sve2-bitperm +// CHECK: error: instruction requires: sve2 or ssve-bitperm sve-bitperm +// CHECK-NEXT: bgrp z21.s, z10.s, z21.s + +.arch_extension sve2-bitperm +.arch_extension nosve2 +bgrp z21.s, z10.s, z21.s +// CHECK: error: instruction requires: sve2 or ssve-bitperm // CHECK-NEXT: bgrp z21.s, z10.s, z21.s .arch_extension f8f16mm diff --git a/llvm/test/MC/AArch64/SVE2/directive-arch_extension.s b/llvm/test/MC/AArch64/SVE2/directive-arch_extension.s index ce56127ca93b1..e45e1f9881422 100644 --- a/llvm/test/MC/AArch64/SVE2/directive-arch_extension.s +++ b/llvm/test/MC/AArch64/SVE2/directive-arch_extension.s @@ -20,7 +20,7 @@ sm4e z0.s, z0.s, z0.s rax1 z0.d, z0.d, z0.d // CHECK: rax1 z0.d, z0.d, z0.d -.arch_extension sve2-bitperm +.arch_extension ssve-bitperm bgrp z21.s, z10.s, z21.s // CHECK: bgrp z21.s, z10.s, z21.s diff --git a/llvm/test/MC/AArch64/SVE2/directive-cpu-negative.s b/llvm/test/MC/AArch64/SVE2/directive-cpu-negative.s index 461b9298df621..a50b990949424 100644 --- a/llvm/test/MC/AArch64/SVE2/directive-cpu-negative.s +++ b/llvm/test/MC/AArch64/SVE2/directive-cpu-negative.s @@ -29,10 +29,16 @@ rax1 z0.d, z0.d, z0.d // CHECK: error: instruction requires: sve2-sha3 // CHECK-NEXT: rax1 z0.d, z0.d, z0.d -.cpu generic+sve2-bitperm -.cpu generic+nosve2-bitperm +.cpu generic+sve2+sve-bitperm +.cpu generic+sve2+nosve-bitperm bgrp z21.s, z10.s, z21.s -// CHECK: error: instruction requires: sve2-bitperm +// CHECK: error: 
instruction requires: sve-bitperm +// CHECK-NEXT: bgrp z21.s, z10.s, z21.s + +.cpu generic+ssve-bitperm +.cpu generic+nossve-bitperm +bgrp z21.s, z10.s, z21.s +// CHECK: error: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-NEXT: bgrp z21.s, z10.s, z21.s .cpu generic+sve2+f8f16mm diff --git a/llvm/test/MC/AArch64/SVE2/directive-cpu.s b/llvm/test/MC/AArch64/SVE2/directive-cpu.s index c54a3a9f272c3..0d873dd9b53f1 100644 --- a/llvm/test/MC/AArch64/SVE2/directive-cpu.s +++ b/llvm/test/MC/AArch64/SVE2/directive-cpu.s @@ -20,7 +20,11 @@ sm4e z0.s, z0.s, z0.s rax1 z0.d, z0.d, z0.d // CHECK: rax1 z0.d, z0.d, z0.d -.cpu generic+sve2-bitperm +.cpu generic+sve2+sve-bitperm +bgrp z21.s, z10.s, z21.s +// CHECK: bgrp z21.s, z10.s, z21.s + +.cpu generic+ssve-bitperm bgrp z21.s, z10.s, z21.s // CHECK: bgrp z21.s, z10.s, z21.s diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp index c03d3e8575d81..84d9af0ec48f2 100644 --- a/llvm/unittests/TargetParser/TargetParserTest.cpp +++ b/llvm/unittests/TargetParser/TargetParserTest.cpp @@ -1343,7 +1343,9 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) { AArch64::AEK_FPRCVT, AArch64::AEK_CMPBR, AArch64::AEK_LSUI, AArch64::AEK_OCCMO, AArch64::AEK_PCDPHINT, AArch64::AEK_POPS, - AArch64::AEK_SVEAES}; + AArch64::AEK_SVEAES, AArch64::AEK_SVEBITPERM, + AArch64::AEK_SSVE_BITPERM, + }; std::vector Features; @@ -1382,7 +1384,9 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) { EXPECT_TRUE(llvm::is_contained(Features, "+sve2-aes")); EXPECT_TRUE(llvm::is_contained(Features, "+sve2-sm4")); EXPECT_TRUE(llvm::is_contained(Features, "+sve2-sha3")); + EXPECT_TRUE(llvm::is_contained(Features, "+sve-bitperm")); EXPECT_TRUE(llvm::is_contained(Features, "+sve2-bitperm")); + EXPECT_TRUE(llvm::is_contained(Features, "+ssve-bitperm")); EXPECT_TRUE(llvm::is_contained(Features, "+sve-aes2")); EXPECT_TRUE(llvm::is_contained(Features, "+ssve-aes")); EXPECT_TRUE(llvm::is_contained(Features, "+sve2p1")); @@ -1554,6 +1558,8 @@ TEST(TargetParserTest, AArch64ArchExtFeature) { {"sve2-sha3", "nosve2-sha3", "+sve2-sha3", "-sve2-sha3"}, {"sve2p1", "nosve2p1", "+sve2p1", "-sve2p1"}, {"sve2p2", "nosve2p2", "+sve2p2", "-sve2p2"}, + {"sve-bitperm", "nosve-bitperm", "+sve-bitperm", "-sve-bitperm"}, + {"ssve-bitperm", "nossve-bitperm", "+ssve-bitperm", "-ssve-bitperm"}, {"sve2-bitperm", "nosve2-bitperm", "+sve2-bitperm", "-sve2-bitperm"}, {"sve-aes2", "nosve-aes2", "+sve-aes2", "-sve-aes2"}, {"ssve-aes", "nossve-aes", "+ssve-aes", "-ssve-aes"}, @@ -1754,13 +1760,13 @@ AArch64ExtensionDependenciesBaseArchTestParams // Long dependency chains: sve2-bitperm -> sve2 -> sve -> fp16 -> fp {AArch64::ARMV8A, - {"nofp", "sve2-bitperm"}, - {"fp-armv8", "fullfp16", "sve", "sve2", "sve2-bitperm"}, + {"nofp", "sve2", "sve-bitperm"}, + {"fp-armv8", "fullfp16", "sve", "sve2", "sve-bitperm"}, {}}, {AArch64::ARMV8A, - {"sve2-bitperm", "nofp16"}, + {"sve2", "sve-bitperm", "nofp16"}, {"fp-armv8"}, - {"full-fp16", "sve", "sve2", "sve2-bitperm"}}, + {"full-fp16", "sve", "sve2", "sve-bitperm"}}, // Meaning of +crypto varies with base architecture. 
{AArch64::ARMV8A, {"crypto"}, {"aes", "sha2"}, {}}, @@ -1864,12 +1870,20 @@ AArch64ExtensionDependenciesBaseArchTestParams {AArch64::ARMV8A, {"nosve2", "sve2-bitperm"}, - {"sve2", "sve2-bitperm"}, + {"sve2", "sve-bitperm"}, {}}, {AArch64::ARMV8A, {"sve2-bitperm", "nosve2"}, - {}, - {"sve2", "sve2-bitperm"}}, + {"sve"}, + {"sve-bitperm", "sve2", "sve2-bitperm"}}, + {AArch64::ARMV8A, + {"ssve-bitperm", "nosve-bitperm"}, + {"sme"}, + {"ssve-bitperm", "sve-bitperm"}}, + {AArch64::ARMV8A, + {"nosve-bitperm", "ssve-bitperm"}, + {"sve-bitperm", "sve-bitperm"}, + {""}}, {AArch64::ARMV8A, {"nosve2", "sve2-sha3"}, {"sve2", "sve2-sha3"}, {}}, {AArch64::ARMV8A, {"sve2-sha3", "nosve2"}, {}, {"sve2", "sve2-sha3"}}, {AArch64::ARMV8A, {"nosve2", "sve2-sm4"}, {"sve2", "sve2-sm4"}, {}}, @@ -2040,10 +2054,10 @@ AArch64ExtensionDependenciesBaseCPUTestParams {}}, {"cortex-a520", {}, - {"v9.2a", "bf16", "crc", "dotprod", "flagm", "fp-armv8", - "fullfp16", "fp16fml", "i8mm", "lse", "mte", "pauth", - "perfmon", "predres", "ras", "rcpc", "rdm", "sb", - "neon", "ssbs", "sve", "sve2-bitperm", "sve2"}, + {"v9.2a", "bf16", "crc", "dotprod", "flagm", "fp-armv8", + "fullfp16", "fp16fml", "i8mm", "lse", "mte", "pauth", + "perfmon", "predres", "ras", "rcpc", "rdm", "sb", + "neon", "ssbs", "sve", "sve-bitperm", "sve2"}, {}}, // Negative modifiers @@ -2058,4 +2072,4 @@ INSTANTIATE_TEST_SUITE_P( AArch64ExtensionDependenciesBaseCPUTestFixture, ::testing::ValuesIn(AArch64ExtensionDependenciesCPUData)); -} // namespace \ No newline at end of file +} // namespace From 8d306ccdef6b70881f5d79b09c03df720351e5f8 Mon Sep 17 00:00:00 2001 From: William Moses Date: Mon, 13 Jan 2025 11:00:04 -0600 Subject: [PATCH 300/408] [MLIR][NVVM] Enable inlining of func's calling nvvm intrinsics (#122650) --- .../LLVMIR/Transforms/InlinerInterfaceImpl.h | 7 +++++++ mlir/include/mlir/InitAllDialects.h | 1 + .../LLVMIR/Transforms/InlinerInterfaceImpl.cpp | 7 +++++++ mlir/test/Dialect/LLVMIR/inlining-nvvm.mlir | 16 ++++++++++++++++ 4 files changed, 31 insertions(+) create mode 100644 mlir/test/Dialect/LLVMIR/inlining-nvvm.mlir diff --git a/mlir/include/mlir/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.h b/mlir/include/mlir/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.h index e99b0476a6b10..69cc2e32285b6 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.h +++ b/mlir/include/mlir/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.h @@ -23,6 +23,13 @@ namespace LLVM { void registerInlinerInterface(DialectRegistry &registry); } // namespace LLVM + +namespace NVVM { +/// Register the `NVVMInlinerInterface` implementation of +/// `DialectInlinerInterface` with the NVVM dialect.
+void registerInlinerInterface(DialectRegistry &registry); +} // namespace NVVM + } // namespace mlir #endif // MLIR_DIALECT_LLVMIR_TRANSFORMS_INLINERINTERFACEIMPL_H diff --git a/mlir/include/mlir/InitAllDialects.h b/mlir/include/mlir/InitAllDialects.h index c102f811cce4b..0da82825c8287 100644 --- a/mlir/include/mlir/InitAllDialects.h +++ b/mlir/include/mlir/InitAllDialects.h @@ -167,6 +167,7 @@ inline void registerAllDialects(DialectRegistry &registry) { gpu::registerBufferDeallocationOpInterfaceExternalModels(registry); gpu::registerValueBoundsOpInterfaceExternalModels(registry); LLVM::registerInlinerInterface(registry); + NVVM::registerInlinerInterface(registry); linalg::registerAllDialectInterfaceImplementations(registry); linalg::registerRuntimeVerifiableOpInterfaceExternalModels(registry); memref::registerAllocationOpInterfaceExternalModels(registry); diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp b/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp index b3bed5ab5f412..233cadebeec02 100644 --- a/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp +++ b/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp @@ -14,6 +14,7 @@ #include "mlir/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.h" #include "mlir/Analysis/SliceWalk.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/LLVMIR/NVVMDialect.h" #include "mlir/IR/Matchers.h" #include "mlir/Interfaces/DataLayoutInterfaces.h" #include "mlir/Interfaces/ViewLikeInterface.h" @@ -815,3 +816,9 @@ void mlir::LLVM::registerInlinerInterface(DialectRegistry &registry) { dialect->addInterfaces(); }); } + +void mlir::NVVM::registerInlinerInterface(DialectRegistry &registry) { + registry.addExtension(+[](MLIRContext *ctx, NVVM::NVVMDialect *dialect) { + dialect->addInterfaces(); + }); +} diff --git a/mlir/test/Dialect/LLVMIR/inlining-nvvm.mlir b/mlir/test/Dialect/LLVMIR/inlining-nvvm.mlir new file mode 100644 index 0000000000000..6dc8ebb431508 --- /dev/null +++ b/mlir/test/Dialect/LLVMIR/inlining-nvvm.mlir @@ -0,0 +1,16 @@ +// RUN: mlir-opt %s -inline -split-input-file | FileCheck %s + +// UNSUPPORTED: system-windows + +llvm.func @threadidx() -> i32 { + %tid = nvvm.read.ptx.sreg.tid.x : i32 + llvm.return %tid : i32 +} + +// CHECK-LABEL: func @caller +llvm.func @caller() -> i32 { + // CHECK-NOT: llvm.call @threadidx + // CHECK: nvvm.read.ptx.sreg.tid.x + %z = llvm.call @threadidx() : () -> (i32) + llvm.return %z : i32 +} From 305b25c2f4264971515f71d192b75a9d27037c2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20G=C3=B3rny?= Date: Mon, 13 Jan 2025 18:18:12 +0100 Subject: [PATCH 301/408] [flang] Support discovering LLVM/Clang/MLIR without explicit *_DIR (#122639) Support discovering LLVM, Clang and MLIR via the standard CMake logic in addition to explicitly specified `LLVM_DIR`, etc. To prevent breaking anyone's workflow the way #120914 did, this change explicitly introduces two possible code paths based on variables provided: 1. If `LLVM_DIR`, etc. are defined, the current logic is used as-is. 2.
If they are not defined, `find_package()` is called normally to discover the packages using the standard CMake logic, and the discovered paths are added to `CMAKE_MODULE_PATH`. --------- Co-authored-by: Slava Zakharin --- flang/CMakeLists.txt | 48 ++++++++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt index 68947eaa9c9bd..b619553ef8302 100644 --- a/flang/CMakeLists.txt +++ b/flang/CMakeLists.txt @@ -91,28 +91,37 @@ if (FLANG_STANDALONE_BUILD) # If the user specifies a relative path to LLVM_DIR, the calls to include # LLVM modules fail. Append the absolute path to LLVM_DIR instead. - get_filename_component(LLVM_DIR_ABSOLUTE ${LLVM_DIR} - REALPATH BASE_DIR ${CMAKE_CURRENT_BINARY_DIR}) - list(APPEND CMAKE_MODULE_PATH ${LLVM_DIR_ABSOLUTE}) + if (LLVM_DIR) + get_filename_component(LLVM_DIR_ABSOLUTE ${LLVM_DIR} + REALPATH BASE_DIR ${CMAKE_CURRENT_BINARY_DIR}) + list(APPEND CMAKE_MODULE_PATH ${LLVM_DIR_ABSOLUTE}) + endif() # We need a pre-built/installed version of LLVM. find_package(LLVM REQUIRED HINTS "${LLVM_DIR_ABSOLUTE}") + if (NOT LLVM_DIR_ABSOLUTE) + # If the user did not specify a LLVM_DIR (and therefore LLVM_DIR_ABSOLUTE + # was not set), append the discovered path to CMAKE_MODULE_PATH. + list(APPEND CMAKE_MODULE_PATH ${LLVM_DIR}) + endif() # Users might specify a path to CLANG_DIR that's: # * a full path, or # * a path relative to the path of this script. # Append the absolute path to CLANG_DIR so that find_package works in both # cases. - get_filename_component( - CLANG_DIR_ABSOLUTE - ${CLANG_DIR} - REALPATH - BASE_DIR ${CMAKE_CURRENT_BINARY_DIR}) - list(APPEND CMAKE_MODULE_PATH ${CLANG_DIR_ABSOLUTE}) - - # TODO: Remove when libclangDriver is lifted out of Clang - find_package(Clang REQUIRED PATHS "${CLANG_DIR_ABSOLUTE}" NO_DEFAULT_PATH) - if (NOT Clang_FOUND) - message(FATAL_ERROR "Failed to find Clang") + if (CLANG_DIR) + get_filename_component( + CLANG_DIR_ABSOLUTE + ${CLANG_DIR} + REALPATH + BASE_DIR ${CMAKE_CURRENT_BINARY_DIR}) + list(APPEND CMAKE_MODULE_PATH ${CLANG_DIR_ABSOLUTE}) + + # TODO: Remove when libclangDriver is lifted out of Clang + find_package(Clang REQUIRED PATHS "${CLANG_DIR_ABSOLUTE}" NO_DEFAULT_PATH) + else() + find_package(Clang REQUIRED) + list(APPEND CMAKE_MODULE_PATH ${Clang_DIR}) endif() # If LLVM links to zlib we need the imported targets so we can too. @@ -134,10 +143,15 @@ if (FLANG_STANDALONE_BUILD) include(TableGen) # If the user specifies a relative path to MLIR_DIR, the calls to include # MLIR modules fail. Append the absolute path to MLIR_DIR instead.
- get_filename_component(MLIR_DIR_ABSOLUTE ${MLIR_DIR} - REALPATH BASE_DIR ${CMAKE_CURRENT_BINARY_DIR}) - list(APPEND CMAKE_MODULE_PATH ${MLIR_DIR_ABSOLUTE}) + if (MLIR_DIR) + get_filename_component(MLIR_DIR_ABSOLUTE ${MLIR_DIR} + REALPATH BASE_DIR ${CMAKE_CURRENT_BINARY_DIR}) + list(APPEND CMAKE_MODULE_PATH ${MLIR_DIR_ABSOLUTE}) + endif() find_package(MLIR REQUIRED CONFIG HINTS ${MLIR_DIR_ABSOLUTE}) + if (NOT MLIR_DIR_ABSOLUTE) + list(APPEND CMAKE_MODULE_PATH ${MLIR_DIR}) + endif() # Use SYSTEM for the same reasons as for LLVM includes include_directories(SYSTEM ${MLIR_INCLUDE_DIRS}) include(AddMLIR) From 26e13091ea5ac3a53d11b50265a506f88129d6ff Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Mon, 13 Jan 2025 12:26:36 -0500 Subject: [PATCH 302/408] [AMDGPU][True16][CodeGen] true16 codegen pattern for v_pack_b32_f16 (#121988) true16 codegen pattern for v_pack_b32_f16 --- llvm/lib/Target/AMDGPU/SIInstructions.td | 3 + llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll | 9 +- llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll | 9 +- llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll | 59 ++- llvm/test/CodeGen/AMDGPU/sitofp.f16.ll | 23 +- llvm/test/CodeGen/AMDGPU/uitofp.f16.ll | 23 +- llvm/test/CodeGen/AMDGPU/v_pack.ll | 408 +++++++++++++++++++++ 7 files changed, 450 insertions(+), 84 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 4325ab448e581..cdc1132579d8d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3390,6 +3390,9 @@ let SubtargetPredicate = isGFX9Plus in { let True16Predicate = NotHasTrue16BitInsts in def : PackB32Pat; +let True16Predicate = UseRealTrue16Insts in + def : PackB32Pat; + let True16Predicate = UseFakeTrue16Insts in def : PackB32Pat; } // End SubtargetPredicate = isGFX9Plus diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll index 84a3a3e88d238..32d8aa18d9713 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll @@ -160,14 +160,9 @@ define amdgpu_kernel void @ceil_v2f16( ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_ceil_f16_e32 v0.l, v0.l -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_ceil_f16_e32 v0.h, v1.l -; GFX11-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll index 9909cfd32b11f..f6a9fadb33865 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll @@ -161,14 +161,9 @@ define amdgpu_kernel void @floor_v2f16( ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_floor_f16_e32 v0.l, v0.l -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_floor_f16_e32 v0.h, v1.l -; GFX11-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll index 53c26cadbf75a..ff1c3da1d5fe5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll @@ -480,9 +480,8 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) { ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v3.l, v2.l ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v2f16_v2i32: @@ -610,9 +609,7 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) { ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v3.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v2f16_v2i16: @@ -737,15 +734,13 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) { ; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v3, v3, s0, 0x7fff ; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v2, v2, s0, 0x7fff -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v4, v4, s0, 0x7fff +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v5.l, v3.l ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v2, v4, s0, 0x7fff ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v3, v0, 0x5040100 +; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v4.l +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v3f16_v3i32: @@ -891,12 +886,9 @@ define <3 x half> @test_ldexp_v3f16_v3i16(<3 x half> %a, <3 x i16> %b) { ; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v5.l, v4.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v3f16_v3i16: @@ -1036,24 +1028,21 @@ define <4 x half> @test_ldexp_v4f16_v4i32(<4 x half> %a, <4 x i32> %b) { ; GFX11-SDAG-TRUE16: ; %bb.0: ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s0, 0x8000 -; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v5, v5, s0, 0x7fff ; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v3, v3, s0, 0x7fff -; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v2, v2, s0, 0x7fff ; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v4, v4, s0, 0x7fff +; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v6.l, v5.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v6.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v7.l, v5.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v7.l, v3.l ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v4.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v1, v1.l, v1.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v4f16_v4i32: @@ -1238,20 +1227,14 @@ define <4 x half> @test_ldexp_v4f16_v4i16(<4 x half> %a, <4 x i16> %b) { ; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v1.l, v3.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v6.l, v5.l +; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v6.l, v5.l ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v7.l, v4.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h -; 
GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 -; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v1, v1, v3, 0x5040100 +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v1, v1.l, v1.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v4f16_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll index 245df6684384c..94b22b79f6632 100644 --- a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll @@ -237,14 +237,9 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.h, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; @@ -338,17 +333,13 @@ define amdgpu_kernel void @sitofp_v2i32_to_v2f16( ; GFX11-TRUE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v1 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll index bc1b102d33de1..2a2fd93bc2d0b 100644 --- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll @@ -237,14 +237,9 @@ define amdgpu_kernel void @uitofp_v2i16_to_v2f16( ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.h, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; @@ -338,17 +333,13 @@ define amdgpu_kernel void @uitofp_v2i32_to_v2f16( ; GFX11-TRUE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v2, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v1 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll index 2eba67b06bae1..072151dd6f5a0 100644 --- a/llvm/test/CodeGen/AMDGPU/v_pack.ll +++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll @@ -1,6 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GISEL %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GCN-FAKE16 %s +; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-FAKE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GCN-REAL16 %s +; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-REAL16 %s declare i32 @llvm.amdgcn.workitem.id.x() #1 @@ -38,6 +42,89 @@ define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace ; GISEL-NEXT: ; use v0 ; GISEL-NEXT: 
;;#ASMEND ; GISEL-NEXT: s_endpgm +; +; GFX11-GCN-FAKE16-LABEL: v_pack_b32_v2f16: +; GFX11-GCN-FAKE16: ; %bb.0: +; GFX11-GCN-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] glc dlc +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: v_add_f16_e32 v1, 2.0, v1 +; GFX11-GCN-FAKE16-NEXT: v_add_f16_e32 v0, 2.0, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-GCN-FAKE16-NEXT: ;;#ASMSTART +; GFX11-GCN-FAKE16-NEXT: ; use v0 +; GFX11-GCN-FAKE16-NEXT: ;;#ASMEND +; GFX11-GCN-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: v_pack_b32_v2f16: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] glc dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v1, 2.0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v0, 2.0, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-GISEL-FAKE16-NEXT: ;;#ASMSTART +; GFX11-GISEL-FAKE16-NEXT: ; use v0 +; GFX11-GISEL-FAKE16-NEXT: ;;#ASMEND +; GFX11-GISEL-FAKE16-NEXT: s_endpgm +; +; GFX11-GCN-REAL16-LABEL: v_pack_b32_v2f16: +; GFX11-GCN-REAL16: ; %bb.0: +; GFX11-GCN-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l +; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-GCN-REAL16-NEXT: ;;#ASMSTART +; GFX11-GCN-REAL16-NEXT: ; use v0 +; GFX11-GCN-REAL16-NEXT: ;;#ASMEND +; GFX11-GCN-REAL16-NEXT: s_endpgm +; +; GFX11-GISEL-REAL16-LABEL: v_pack_b32_v2f16: +; GFX11-GISEL-REAL16: ; %bb.0: +; GFX11-GISEL-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-GISEL-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v1.l +; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v2.l +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-REAL16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-GISEL-REAL16-NEXT: ;;#ASMSTART +; GFX11-GISEL-REAL16-NEXT: ; use v0 +; GFX11-GISEL-REAL16-NEXT: ;;#ASMEND +; GFX11-GISEL-REAL16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in0.gep = getelementptr inbounds half, ptr addrspace(1) %in0, i64 %tid.ext @@ -87,6 +174,89 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs ; GISEL-NEXT: ; use v0 ; GISEL-NEXT: ;;#ASMEND ; GISEL-NEXT: s_endpgm +; +; GFX11-GCN-FAKE16-LABEL: v_pack_b32_v2f16_sub: +; GFX11-GCN-FAKE16: ; %bb.0: +; GFX11-GCN-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] glc dlc +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: v_subrev_f16_e32 v1, 2.0, v1 +; GFX11-GCN-FAKE16-NEXT: v_add_f16_e32 v0, 2.0, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-GCN-FAKE16-NEXT: ;;#ASMSTART +; GFX11-GCN-FAKE16-NEXT: ; use v0 +; GFX11-GCN-FAKE16-NEXT: ;;#ASMEND +; GFX11-GCN-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: v_pack_b32_v2f16_sub: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] glc dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_subrev_f16_e32 v1, 2.0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v0, 2.0, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-GISEL-FAKE16-NEXT: ;;#ASMSTART +; GFX11-GISEL-FAKE16-NEXT: ; use v0 +; GFX11-GISEL-FAKE16-NEXT: ;;#ASMEND +; GFX11-GISEL-FAKE16-NEXT: s_endpgm +; +; GFX11-GCN-REAL16-LABEL: v_pack_b32_v2f16_sub: +; GFX11-GCN-REAL16: ; %bb.0: +; GFX11-GCN-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GCN-REAL16-NEXT: v_subrev_f16_e32 v0.l, 
2.0, v0.l +; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-GCN-REAL16-NEXT: ;;#ASMSTART +; GFX11-GCN-REAL16-NEXT: ; use v0 +; GFX11-GCN-REAL16-NEXT: ;;#ASMEND +; GFX11-GCN-REAL16-NEXT: s_endpgm +; +; GFX11-GISEL-REAL16-LABEL: v_pack_b32_v2f16_sub: +; GFX11-GISEL-REAL16: ; %bb.0: +; GFX11-GISEL-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-REAL16-NEXT: v_subrev_f16_e32 v0.l, 2.0, v1.l +; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v2.l +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-REAL16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-GISEL-REAL16-NEXT: ;;#ASMSTART +; GFX11-GISEL-REAL16-NEXT: ; use v0 +; GFX11-GISEL-REAL16-NEXT: ;;#ASMEND +; GFX11-GISEL-REAL16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in0.gep = getelementptr inbounds half, ptr addrspace(1) %in0, i64 %tid.ext @@ -136,6 +306,78 @@ define amdgpu_kernel void @fptrunc( ; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 ; GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GISEL-NEXT: s_endpgm +; +; GFX11-GCN-FAKE16-LABEL: fptrunc: +; GFX11-GCN-FAKE16: ; %bb.0: +; GFX11-GCN-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-GCN-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-GCN-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-GCN-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-GCN-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-GCN-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-GCN-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-GCN-FAKE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 +; GFX11-GCN-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-GCN-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-GCN-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-GCN-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: fptrunc: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, s2 +; GFX11-GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, s3 +; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-GISEL-FAKE16-NEXT: s_endpgm +; +; GFX11-GCN-REAL16-LABEL: fptrunc: +; GFX11-GCN-REAL16: ; %bb.0: +; GFX11-GCN-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-REAL16-NEXT: s_mov_b32 s6, -1 +; GFX11-GCN-REAL16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-GCN-REAL16-NEXT: s_mov_b32 
s10, s6 +; GFX11-GCN-REAL16-NEXT: s_mov_b32 s11, s7 +; GFX11-GCN-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-REAL16-NEXT: s_mov_b32 s8, s2 +; GFX11-GCN-REAL16-NEXT: s_mov_b32 s9, s3 +; GFX11-GCN-REAL16-NEXT: s_mov_b32 s4, s0 +; GFX11-GCN-REAL16-NEXT: buffer_load_b64 v[1:2], off, s[8:11], 0 +; GFX11-GCN-REAL16-NEXT: s_mov_b32 s5, s1 +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: v_cvt_f16_f32_e32 v0.l, v2 +; GFX11-GCN-REAL16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l +; GFX11-GCN-REAL16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-GCN-REAL16-NEXT: s_endpgm +; +; GFX11-GISEL-REAL16-LABEL: fptrunc: +; GFX11-GISEL-REAL16: ; %bb.0: +; GFX11-GISEL-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-REAL16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-GISEL-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-REAL16-NEXT: v_cvt_f16_f32_e32 v0.l, s2 +; GFX11-GISEL-REAL16-NEXT: v_cvt_f16_f32_e32 v0.h, s3 +; GFX11-GISEL-REAL16-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-REAL16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-REAL16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-GISEL-REAL16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-GISEL-REAL16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { %a.val = load <2 x float>, ptr addrspace(1) %a @@ -178,6 +420,89 @@ define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace( ; GISEL-NEXT: ; use v0 ; GISEL-NEXT: ;;#ASMEND ; GISEL-NEXT: s_endpgm +; +; GFX11-GCN-FAKE16-LABEL: v_pack_b32.fabs: +; GFX11-GCN-FAKE16: ; %bb.0: +; GFX11-GCN-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] glc dlc +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: v_add_f16_e32 v1, 2.0, v1 +; GFX11-GCN-FAKE16-NEXT: v_add_f16_e32 v0, 2.0, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: v_pack_b32_f16 v0, |v1|, |v0| +; GFX11-GCN-FAKE16-NEXT: ;;#ASMSTART +; GFX11-GCN-FAKE16-NEXT: ; use v0 +; GFX11-GCN-FAKE16-NEXT: ;;#ASMEND +; GFX11-GCN-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: v_pack_b32.fabs: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] glc dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v1, 2.0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v0, 2.0, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_pack_b32_f16 v0, |v1|, |v0| +; GFX11-GISEL-FAKE16-NEXT: ;;#ASMSTART +; GFX11-GISEL-FAKE16-NEXT: ; use v0 +; GFX11-GISEL-FAKE16-NEXT: ;;#ASMEND +; 
GFX11-GISEL-FAKE16-NEXT: s_endpgm +; +; GFX11-GCN-REAL16-LABEL: v_pack_b32.fabs: +; GFX11-GCN-REAL16: ; %bb.0: +; GFX11-GCN-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l +; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: v_pack_b32_f16 v0, |v0.l|, |v0.h| +; GFX11-GCN-REAL16-NEXT: ;;#ASMSTART +; GFX11-GCN-REAL16-NEXT: ; use v0 +; GFX11-GCN-REAL16-NEXT: ;;#ASMEND +; GFX11-GCN-REAL16-NEXT: s_endpgm +; +; GFX11-GISEL-REAL16-LABEL: v_pack_b32.fabs: +; GFX11-GISEL-REAL16: ; %bb.0: +; GFX11-GISEL-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v1.l +; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v2.l +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-REAL16-NEXT: v_pack_b32_f16 v0, |v0.l|, |v0.h| +; GFX11-GISEL-REAL16-NEXT: ;;#ASMSTART +; GFX11-GISEL-REAL16-NEXT: ; use v0 +; GFX11-GISEL-REAL16-NEXT: ;;#ASMEND +; GFX11-GISEL-REAL16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in0.gep = getelementptr inbounds half, ptr addrspace(1) %in0, i64 %tid.ext @@ -229,6 +554,89 @@ define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace( ; GISEL-NEXT: ; use v0 ; GISEL-NEXT: ;;#ASMEND ; GISEL-NEXT: s_endpgm +; +; GFX11-GCN-FAKE16-LABEL: v_pack_b32.fneg: +; GFX11-GCN-FAKE16: ; %bb.0: +; GFX11-GCN-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] glc dlc +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: v_add_f16_e32 v1, 2.0, v1 +; GFX11-GCN-FAKE16-NEXT: v_add_f16_e32 v0, 2.0, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: v_pack_b32_f16 v0, -v1, -v0 +; GFX11-GCN-FAKE16-NEXT: ;;#ASMSTART +; GFX11-GCN-FAKE16-NEXT: ; use v0 +; GFX11-GCN-FAKE16-NEXT: ;;#ASMEND +; GFX11-GCN-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: v_pack_b32.fneg: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 
+; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] glc dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v1, 2.0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v0, 2.0, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_pack_b32_f16 v0, -v1, -v0 +; GFX11-GISEL-FAKE16-NEXT: ;;#ASMSTART +; GFX11-GISEL-FAKE16-NEXT: ; use v0 +; GFX11-GISEL-FAKE16-NEXT: ;;#ASMEND +; GFX11-GISEL-FAKE16-NEXT: s_endpgm +; +; GFX11-GCN-REAL16-LABEL: v_pack_b32.fneg: +; GFX11-GCN-REAL16: ; %bb.0: +; GFX11-GCN-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l +; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: v_pack_b32_f16 v0, -v0.l, -v0.h +; GFX11-GCN-REAL16-NEXT: ;;#ASMSTART +; GFX11-GCN-REAL16-NEXT: ; use v0 +; GFX11-GCN-REAL16-NEXT: ;;#ASMEND +; GFX11-GCN-REAL16-NEXT: s_endpgm +; +; GFX11-GISEL-REAL16-LABEL: v_pack_b32.fneg: +; GFX11-GISEL-REAL16: ; %bb.0: +; GFX11-GISEL-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v1.l +; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v2.l +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-REAL16-NEXT: v_pack_b32_f16 v0, -v0.l, -v0.h +; GFX11-GISEL-REAL16-NEXT: ;;#ASMSTART +; GFX11-GISEL-REAL16-NEXT: ; use v0 +; GFX11-GISEL-REAL16-NEXT: ;;#ASMEND +; GFX11-GISEL-REAL16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in0.gep = getelementptr inbounds half, ptr addrspace(1) %in0, i64 %tid.ext From 092d6283838dea79670750b9415955c5f0cb5178 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Mon, 13 Jan 2025 09:02:56 -0800 Subject: [PATCH 303/408] [SLP]Check for div/rem instructions before extending with poisons Need to check if the instructions can be safely extended with poison before actually doing this to avoid incorrect transformations. 
Fixes #122691 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 9 ++- .../X86/div-possibly-extended-with-poisons.ll | 71 +++++++++++++++++++ 2 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/div-possibly-extended-with-poisons.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index df46c69ff3ab4..4b0ed5b30179b 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -8091,6 +8091,13 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, NonUniqueValueVL.append( PWSz - UniqueValues.size(), PoisonValue::get(UniqueValues.front()->getType())); + // Check that extended with poisons operations are still valid for + // vectorization (div/rem are not allowed). + if (!getSameOpcode(NonUniqueValueVL, *TLI).valid()) { + LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); + return false; + } VL = NonUniqueValueVL; } return true; @@ -17818,7 +17825,7 @@ bool BoUpSLP::collectValuesToDemote( }; if (E.isGather() || !Visited.insert(&E).second || any_of(E.Scalars, [&](Value *V) { - return all_of(V->users(), [&](User *U) { + return !isa(V) && all_of(V->users(), [&](User *U) { return isa(U) && !getTreeEntry(U); }); })) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/div-possibly-extended-with-poisons.ll b/llvm/test/Transforms/SLPVectorizer/X86/div-possibly-extended-with-poisons.ll new file mode 100644 index 0000000000000..07ee8f840721f --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/div-possibly-extended-with-poisons.ll @@ -0,0 +1,71 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-100 < %s | FileCheck %s + +define i8 @test(ptr %g_127, i32 %0, i16 %1) { +; CHECK-LABEL: define i8 @test( +; CHECK-SAME: ptr [[G_127:%.*]], i32 [[TMP0:%.*]], i16 [[TMP1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_INC434_I:.*]] +; CHECK: [[FOR_COND166_PREHEADER_I:.*]]: +; CHECK-NEXT: br label %[[FOR_INC434_I]] +; CHECK: [[FOR_INC434_I]]: +; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 60, %[[FOR_COND166_PREHEADER_I]] ] +; CHECK-NEXT: [[CONV8_I_I:%.*]] = zext nneg i32 [[TMP0]] to i64 +; CHECK-NEXT: [[DIV_I_I_1:%.*]] = udiv i64 [[CONV8_I_I]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[DIV_I_I_1]] to i16 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i16> poison, i16 [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i64> poison, i64 [[CONV8_I_I]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> poison, i64 [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = udiv <4 x i64> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i16> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i16> [[TMP10]], <4 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP11]], <8 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i16> [[TMP12]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> [[TMP13]]) +; CHECK-NEXT: [[TMP15:%.*]] 
= call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> [[TMP14]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = and i16 [[TMP15]], [[TMP1]] +; CHECK-NEXT: [[AND14_I_2_I_5:%.*]] = zext i16 [[OP_RDX]] to i32 +; CHECK-NEXT: store i32 [[AND14_I_2_I_5]], ptr [[G_127]], align 4 +; CHECK-NEXT: ret i8 0 +; +entry: + br label %for.inc434.i + +for.cond166.preheader.i: + br label %for.inc434.i + +for.inc434.i: + %2 = phi i64 [ 0, %entry ], [ 60, %for.cond166.preheader.i ] + %conv8.i.i = zext nneg i32 %0 to i64 + %div.i.i.1 = udiv i64 %conv8.i.i, %2 + %3 = trunc i64 %div.i.i.1 to i16 + %call12.i.2.i.1 = tail call i16 @llvm.bswap.i16(i16 %3) + %and14.i.2.i.118 = and i16 %1, %call12.i.2.i.1 + %div.i.i.2 = udiv i64 %conv8.i.i, %2 + %4 = trunc i64 %div.i.i.2 to i16 + %call12.i.i.2 = tail call i16 @llvm.bswap.i16(i16 %4) + %and14.i.i.219 = and i16 %and14.i.2.i.118, %call12.i.i.2 + %call12.i.2.i.2 = tail call i16 @llvm.bswap.i16(i16 %4) + %and14.i.2.i.220 = and i16 %and14.i.i.219, %call12.i.2.i.2 + %div.i.i.3 = udiv i64 %conv8.i.i, %2 + %5 = trunc i64 %div.i.i.3 to i16 + %call12.i.2.i.3 = tail call i16 @llvm.bswap.i16(i16 %5) + %and14.i.2.i.322 = and i16 %and14.i.2.i.220, %call12.i.2.i.3 + %div.i.i.4 = udiv i64 %conv8.i.i, %2 + %6 = trunc i64 %div.i.i.4 to i16 + %call12.i.i.4 = tail call i16 @llvm.bswap.i16(i16 %6) + %and14.i.i.423 = and i16 %and14.i.2.i.322, %call12.i.i.4 + %call12.i.2.i.4 = tail call i16 @llvm.bswap.i16(i16 %6) + %and14.i.2.i.424 = and i16 %and14.i.i.423, %call12.i.2.i.4 + %div.i.i.5 = udiv i64 %conv8.i.i, %2 + %7 = trunc i64 %div.i.i.5 to i16 + %call12.i.i.5 = tail call i16 @llvm.bswap.i16(i16 %7) + %and14.i.i.525 = and i16 %and14.i.2.i.424, %call12.i.i.5 + %call12.i.2.i.5 = tail call i16 @llvm.bswap.i16(i16 %7) + %and14.i.2.i.51 = and i16 %and14.i.i.525, %call12.i.2.i.5 + %and14.i.2.i.5 = zext i16 %and14.i.2.i.51 to i32 + store i32 %and14.i.2.i.5, ptr %g_127, align 4 + ret i8 0 +} From 1a7d46fac8aa5ff4a96db01937cdb3b106253ac6 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Mon, 13 Jan 2025 09:36:41 -0800 Subject: [PATCH 304/408] [BoundsChecking] Add guard= pass parameter (#122575) And use that as an argument for allow_ubsan_check when needed. Other ubsan checks use SanitizerKind, but those are known to the clang only. So make it a parameter in LLVM. --- .../Instrumentation/BoundsChecking.h | 1 + llvm/lib/Passes/PassBuilder.cpp | 16 ++++-- .../Instrumentation/BoundsChecking.cpp | 11 +++- .../BoundsChecking/runtimes.ll | 51 +++++++++++++++++++ 4 files changed, 74 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/Transforms/Instrumentation/BoundsChecking.h b/llvm/include/llvm/Transforms/Instrumentation/BoundsChecking.h index 836fc907375d3..ab2dcee06551e 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/BoundsChecking.h +++ b/llvm/include/llvm/Transforms/Instrumentation/BoundsChecking.h @@ -29,6 +29,7 @@ class BoundsCheckingPass : public PassInfoMixin { }; std::optional Rt; // Trap if empty. bool Merge = false; + std::optional GuardKind; // `allow_ubsan_check` argument. 
}; BoundsCheckingPass(Options Opts) : Opts(Opts) {} diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index aac4407740055..f923d5aabe0a0 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -1315,10 +1315,18 @@ parseBoundsCheckingOptions(StringRef Params) { } else if (ParamName == "merge") { Options.Merge = true; } else { - return make_error( - formatv("invalid BoundsChecking pass parameter '{0}' ", ParamName) - .str(), - inconvertibleErrorCode()); + StringRef ParamEQ; + StringRef Val; + std::tie(ParamEQ, Val) = ParamName.split('='); + int8_t Id = 0; + if (ParamEQ == "guard" && !Val.getAsInteger(0, Id)) { + Options.GuardKind = Id; + } else { + return make_error( + formatv("invalid BoundsChecking pass parameter '{0}' ", ParamName) + .str(), + inconvertibleErrorCode()); + } } } return Options; diff --git a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp index 8004552250b47..609678f9979c6 100644 --- a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp +++ b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp @@ -214,8 +214,15 @@ static bool addBoundsChecking(Function &F, TargetLibraryInfo &TLI, Or = getBoundsCheckCond(AI->getPointerOperand(), AI->getValOperand(), DL, TLI, ObjSizeEval, IRB, SE); } - if (Or) + if (Or) { + if (Opts.GuardKind) { + llvm::Value *Allow = IRB.CreateIntrinsic( + IRB.getInt1Ty(), Intrinsic::allow_ubsan_check, + {llvm::ConstantInt::getSigned(IRB.getInt8Ty(), *Opts.GuardKind)}); + Or = IRB.CreateAnd(Or, Allow); + } TrapInfo.push_back(std::make_pair(&I, Or)); + } } std::string Name; @@ -299,5 +306,7 @@ void BoundsCheckingPass::printPipeline( } if (Opts.Merge) OS << ";merge"; + if (Opts.GuardKind) + OS << ";guard=" << static_cast(*Opts.GuardKind); OS << ">"; } diff --git a/llvm/test/Instrumentation/BoundsChecking/runtimes.ll b/llvm/test/Instrumentation/BoundsChecking/runtimes.ll index ccc7e93615fed..7cf78a5d54e71 100644 --- a/llvm/test/Instrumentation/BoundsChecking/runtimes.ll +++ b/llvm/test/Instrumentation/BoundsChecking/runtimes.ll @@ -9,6 +9,8 @@ ; RUN: opt < %s -passes='bounds-checking' -S | FileCheck %s --check-prefixes=MINRT-NOMERGE ; RUN: opt < %s -passes='bounds-checking' -S | FileCheck %s --check-prefixes=MINRTABORT-NOMERGE ; +; RUN: opt < %s -passes='bounds-checking' -S | FileCheck %s --check-prefixes=TR-GUARD +; RUN: opt < %s -passes='bounds-checking' -S | FileCheck %s --check-prefixes=RT-GUARD target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" define void @f1(i64 %x) nounwind { @@ -123,6 +125,42 @@ define void @f1(i64 %x) nounwind { ; MINRTABORT-NOMERGE: [[TRAP]]: ; MINRTABORT-NOMERGE-NEXT: call void @__ubsan_handle_local_out_of_bounds_minimal_abort() #[[ATTR2:[0-9]+]], !nosanitize [[META0]] ; MINRTABORT-NOMERGE-NEXT: unreachable, !nosanitize [[META0]] +; +; TR-GUARD-LABEL: define void @f1( +; TR-GUARD-SAME: i64 [[X:%.*]]) #[[ATTR0:[0-9]+]] { +; TR-GUARD-NEXT: [[TMP1:%.*]] = mul i64 16, [[X]] +; TR-GUARD-NEXT: [[TMP2:%.*]] = alloca i128, i64 [[X]], align 8 +; TR-GUARD-NEXT: [[TMP3:%.*]] = sub i64 [[TMP1]], 0, !nosanitize [[META0:![0-9]+]] +; TR-GUARD-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 16, !nosanitize [[META0]] +; TR-GUARD-NEXT: [[TMP5:%.*]] = or i1 false, [[TMP4]], !nosanitize [[META0]] +; TR-GUARD-NEXT: [[TMP6:%.*]] = or i1 false, [[TMP5]], !nosanitize [[META0]] +; TR-GUARD-NEXT: [[TMP7:%.*]] = call i1 
@llvm.allow.ubsan.check(i8 3), !nosanitize [[META0]] +; TR-GUARD-NEXT: [[TMP8:%.*]] = and i1 [[TMP6]], [[TMP7]], !nosanitize [[META0]] +; TR-GUARD-NEXT: br i1 [[TMP8]], label %[[TRAP:.*]], label %[[BB9:.*]] +; TR-GUARD: [[BB9]]: +; TR-GUARD-NEXT: [[TMP10:%.*]] = load i128, ptr [[TMP2]], align 4 +; TR-GUARD-NEXT: ret void +; TR-GUARD: [[TRAP]]: +; TR-GUARD-NEXT: call void @llvm.ubsantrap(i8 3) #[[ATTR3:[0-9]+]], !nosanitize [[META0]] +; TR-GUARD-NEXT: unreachable, !nosanitize [[META0]] +; +; RT-GUARD-LABEL: define void @f1( +; RT-GUARD-SAME: i64 [[X:%.*]]) #[[ATTR0:[0-9]+]] { +; RT-GUARD-NEXT: [[TMP1:%.*]] = mul i64 16, [[X]] +; RT-GUARD-NEXT: [[TMP2:%.*]] = alloca i128, i64 [[X]], align 8 +; RT-GUARD-NEXT: [[TMP3:%.*]] = sub i64 [[TMP1]], 0, !nosanitize [[META0:![0-9]+]] +; RT-GUARD-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 16, !nosanitize [[META0]] +; RT-GUARD-NEXT: [[TMP5:%.*]] = or i1 false, [[TMP4]], !nosanitize [[META0]] +; RT-GUARD-NEXT: [[TMP6:%.*]] = or i1 false, [[TMP5]], !nosanitize [[META0]] +; RT-GUARD-NEXT: [[TMP7:%.*]] = call i1 @llvm.allow.ubsan.check(i8 -5), !nosanitize [[META0]] +; RT-GUARD-NEXT: [[TMP8:%.*]] = and i1 [[TMP6]], [[TMP7]], !nosanitize [[META0]] +; RT-GUARD-NEXT: br i1 [[TMP8]], label %[[TRAP:.*]], label %[[BB9:.*]] +; RT-GUARD: [[BB9]]: +; RT-GUARD-NEXT: [[TMP10:%.*]] = load i128, ptr [[TMP2]], align 4 +; RT-GUARD-NEXT: ret void +; RT-GUARD: [[TRAP]]: +; RT-GUARD-NEXT: call void @__ubsan_handle_local_out_of_bounds() #[[ATTR2:[0-9]+]], !nosanitize [[META0]] +; RT-GUARD-NEXT: br label %[[BB9]], !nosanitize [[META0]] ; %1 = alloca i128, i64 %x %3 = load i128, ptr %1, align 4 @@ -154,6 +192,15 @@ define void @f1(i64 %x) nounwind { ; MINRTABORT-NOMERGE: attributes #[[ATTR1:[0-9]+]] = { noreturn nounwind } ; MINRTABORT-NOMERGE: attributes #[[ATTR2]] = { nomerge noreturn nounwind } ;. +; TR-GUARD: attributes #[[ATTR0]] = { nounwind } +; TR-GUARD: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } +; TR-GUARD: attributes #[[ATTR2:[0-9]+]] = { cold noreturn nounwind } +; TR-GUARD: attributes #[[ATTR3]] = { nomerge noreturn nounwind } +;. +; RT-GUARD: attributes #[[ATTR0]] = { nounwind } +; RT-GUARD: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } +; RT-GUARD: attributes #[[ATTR2]] = { nomerge nounwind } +;. ; TR: [[META0]] = !{} ;. ; RT: [[META0]] = !{} @@ -168,3 +215,7 @@ define void @f1(i64 %x) nounwind { ;. ; MINRTABORT-NOMERGE: [[META0]] = !{} ;. +; TR-GUARD: [[META0]] = !{} +;. +; RT-GUARD: [[META0]] = !{} +;. 
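To make the guard= plumbing in the patch above concrete: the parsed value lands in Options.GuardKind, and when it is set the pass calls @llvm.allow.ubsan.check with that value as a signed i8 immediate, then ANDs the result into the existing out-of-bounds condition, so the report only fires when the check is allowed. The hand-written LLVM IR sketch below shows the expected shape of the instrumented code; the function body and value names are illustrative, and the pipeline string bounds-checking<trap;guard=3> is inferred from the parser change and the TR-GUARD test lines above rather than quoted from them.

; Sketch of the pattern `opt -passes='bounds-checking<trap;guard=3>'` should
; produce for a dynamic alloca (assumed invocation; compare the TR-GUARD
; CHECK lines above).
define void @sketch(i64 %x) {
  %vla = alloca i128, i64 %x, align 8
  %size = mul i64 16, %x
  %oob = icmp ult i64 %size, 16                   ; pre-existing bounds condition
  %allow = call i1 @llvm.allow.ubsan.check(i8 3)  ; guard= value as signed i8 immediate
  %cond = and i1 %oob, %allow                     ; guard gates the report
  br i1 %cond, label %trap, label %cont
cont:
  %v = load i128, ptr %vla, align 4
  ret void
trap:
  call void @llvm.ubsantrap(i8 3)
  unreachable
}

declare i1 @llvm.allow.ubsan.check(i8 immarg)
declare void @llvm.ubsantrap(i8 immarg)

With a runtime handler instead of a trap (the RT-GUARD prefix above), the same AND-ed condition branches to a call to __ubsan_handle_local_out_of_bounds and then falls through, rather than ending in unreachable.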
From 2d2fc4eb6820dfaebb5225c9f5c7275d0f1d0e86 Mon Sep 17 00:00:00 2001 From: Andreas Jonson Date: Mon, 13 Jan 2025 18:48:10 +0100 Subject: [PATCH 305/408] [InstCombine] Test for trunc in align assume (NFC) --- llvm/test/Transforms/InstCombine/assume.ll | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/assume.ll b/llvm/test/Transforms/InstCombine/assume.ll index 52f0adf02a396..2d7bc49b6dcae 100644 --- a/llvm/test/Transforms/InstCombine/assume.ll +++ b/llvm/test/Transforms/InstCombine/assume.ll @@ -34,6 +34,23 @@ define i32 @foo1(ptr %a) #0 { ret i32 %t0 } +define i32 @align_assume_trunc_cond(ptr %a) #0 { +; CHECK-LABEL: @align_assume_trunc_cond( +; CHECK-NEXT: [[T0:%.*]] = load i32, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[PTRINT]] to i1 +; CHECK-NEXT: [[MASKCOND:%.*]] = xor i1 [[TRUNC]], true +; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) +; CHECK-NEXT: ret i32 [[T0]] +; + %t0 = load i32, ptr %a, align 4 + %ptrint = ptrtoint ptr %a to i64 + %trunc = trunc i64 %ptrint to i1 + %maskcond = xor i1 %trunc, true + tail call void @llvm.assume(i1 %maskcond) + ret i32 %t0 +} + ; Same check as in @foo1, but make sure it works if the assume is first too. define i32 @foo2(ptr %a) #0 { From 409ca49feb6659e279358f77b393f7955fb1bbda Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Mon, 13 Jan 2025 09:55:44 -0800 Subject: [PATCH 306/408] [ubsan] Pass fsanitize-skip-hot-cutoff into -fsanitize=bounds (#122576) --- clang/lib/CodeGen/BackendUtil.cpp | 12 ++++++++ clang/lib/CodeGen/CGExpr.cpp | 4 ++- clang/test/CodeGen/allow-ubsan-check.c | 38 +++++++++++++++----------- 3 files changed, 37 insertions(+), 17 deletions(-) diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index bcf6db1467ffc..79e6bf3d24dff 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -85,6 +85,7 @@ #include "llvm/Transforms/Scalar/JumpThreading.h" #include "llvm/Transforms/Utils/Debugify.h" #include "llvm/Transforms/Utils/ModuleUtils.h" +#include #include #include using namespace clang; @@ -119,6 +120,9 @@ static cl::opt ClPGOColdFuncAttr( extern cl::opt ProfileCorrelate; } // namespace llvm +namespace clang { +extern llvm::cl::opt ClSanitizeGuardChecks; +} namespace { @@ -1023,6 +1027,14 @@ void EmitAssemblyHelper::RunOptimizationPipeline( PB.registerScalarOptimizerLateEPCallback([this](FunctionPassManager &FPM, OptimizationLevel Level) { BoundsCheckingPass::Options Options; + if (CodeGenOpts.SanitizeSkipHotCutoffs[SanitizerKind::SO_LocalBounds] || + ClSanitizeGuardChecks) { + static_assert(SanitizerKind::SO_LocalBounds <= + std::numeric_limits< + decltype(Options.GuardKind)::value_type>::max(), + "Update type of llvm.allow.ubsan.check."); + Options.GuardKind = SanitizerKind::SO_LocalBounds; + } Options.Merge = CodeGenOpts.SanitizeMergeHandlers.has(SanitizerKind::LocalBounds); if (!CodeGenOpts.SanitizeTrap.has(SanitizerKind::LocalBounds)) { diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index 060d02b7f1487..6e5a21c8f01e7 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -52,11 +52,13 @@ using namespace clang; using namespace CodeGen; +namespace clang { // TODO: Introduce frontend options to enabled per sanitizers, similar to // `fsanitize-trap`. 
-static llvm::cl::opt ClSanitizeGuardChecks( +llvm::cl::opt ClSanitizeGuardChecks( "ubsan-guard-checks", llvm::cl::Optional, llvm::cl::desc("Guard UBSAN checks with `llvm.allow.ubsan.check()`.")); +} // namespace clang //===--------------------------------------------------------------------===// // Defines for metadata diff --git a/clang/test/CodeGen/allow-ubsan-check.c b/clang/test/CodeGen/allow-ubsan-check.c index fb264ce32ab99..38b4848c1edc1 100644 --- a/clang/test/CodeGen/allow-ubsan-check.c +++ b/clang/test/CodeGen/allow-ubsan-check.c @@ -174,12 +174,14 @@ void use(double*); // CHECK-NEXT: [[VLA:%.*]] = alloca double, i64 [[TMP0]], align 16 // CHECK-NEXT: call void @use(ptr noundef nonnull [[VLA]]) #[[ATTR7:[0-9]+]] // CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64 -// CHECK-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[TMP0]], [[IDXPROM]] -// CHECK-NEXT: br i1 [[DOTNOT]], label %[[BB1:.*]], label %[[TRAP:.*]] -// CHECK: [[BB1]]: +// CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[TMP0]], [[IDXPROM]] +// CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.allow.ubsan.check(i8 71), !nosanitize [[META2]] +// CHECK-NEXT: [[TMP3:%.*]] = and i1 [[TMP1]], [[TMP2]], !nosanitize [[META2]] +// CHECK-NEXT: br i1 [[TMP3]], label %[[TRAP:.*]], label %[[BB4:.*]] +// CHECK: [[BB4]]: // CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[VLA]], i64 [[IDXPROM]] -// CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA8:![0-9]+]] -// CHECK-NEXT: ret double [[TMP2]] +// CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA8:![0-9]+]] +// CHECK-NEXT: ret double [[TMP5]] // CHECK: [[TRAP]]: // CHECK-NEXT: call void @__ubsan_handle_local_out_of_bounds_abort() #[[ATTR6]], !nosanitize [[META2]] // CHECK-NEXT: unreachable, !nosanitize [[META2]] @@ -191,12 +193,14 @@ void use(double*); // TR-NEXT: [[VLA:%.*]] = alloca double, i64 [[TMP0]], align 16 // TR-NEXT: call void @use(ptr noundef nonnull [[VLA]]) #[[ATTR6:[0-9]+]] // TR-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64 -// TR-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[TMP0]], [[IDXPROM]] -// TR-NEXT: br i1 [[DOTNOT]], label %[[BB1:.*]], label %[[TRAP:.*]] -// TR: [[BB1]]: +// TR-NEXT: [[TMP1:%.*]] = icmp ule i64 [[TMP0]], [[IDXPROM]] +// TR-NEXT: [[TMP2:%.*]] = call i1 @llvm.allow.ubsan.check(i8 71), !nosanitize [[META2]] +// TR-NEXT: [[TMP3:%.*]] = and i1 [[TMP1]], [[TMP2]], !nosanitize [[META2]] +// TR-NEXT: br i1 [[TMP3]], label %[[TRAP:.*]], label %[[BB4:.*]] +// TR: [[BB4]]: // TR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[VLA]], i64 [[IDXPROM]] -// TR-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA7:![0-9]+]] -// TR-NEXT: ret double [[TMP2]] +// TR-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA7:![0-9]+]] +// TR-NEXT: ret double [[TMP5]] // TR: [[TRAP]]: // TR-NEXT: call void @llvm.ubsantrap(i8 3) #[[ATTR5]], !nosanitize [[META2]] // TR-NEXT: unreachable, !nosanitize [[META2]] @@ -208,15 +212,17 @@ void use(double*); // REC-NEXT: [[VLA:%.*]] = alloca double, i64 [[TMP0]], align 16 // REC-NEXT: call void @use(ptr noundef nonnull [[VLA]]) #[[ATTR5:[0-9]+]] // REC-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64 -// REC-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[TMP0]], [[IDXPROM]] -// REC-NEXT: br i1 [[DOTNOT]], label %[[BB1:.*]], label %[[TRAP:.*]] -// REC: [[BB1]]: +// REC-NEXT: [[TMP1:%.*]] = icmp ule i64 [[TMP0]], [[IDXPROM]] +// REC-NEXT: [[TMP2:%.*]] = call i1 @llvm.allow.ubsan.check(i8 71), !nosanitize [[META2]] +// REC-NEXT: [[TMP3:%.*]] 
= and i1 [[TMP1]], [[TMP2]], !nosanitize [[META2]] +// REC-NEXT: br i1 [[TMP3]], label %[[TRAP:.*]], label %[[BB4:.*]] +// REC: [[BB4]]: // REC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[VLA]], i64 [[IDXPROM]] -// REC-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA8:![0-9]+]] -// REC-NEXT: ret double [[TMP2]] +// REC-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA8:![0-9]+]] +// REC-NEXT: ret double [[TMP5]] // REC: [[TRAP]]: // REC-NEXT: call void @__ubsan_handle_local_out_of_bounds() #[[ATTR6]], !nosanitize [[META2]] -// REC-NEXT: br label %[[BB1]], !nosanitize [[META2]] +// REC-NEXT: br label %[[BB4]], !nosanitize [[META2]] // double lbounds(int b, int i) { double a[b]; From 7c165f7fccfd40ae3bc2823d0ccd50257c21ab3e Mon Sep 17 00:00:00 2001 From: jimingham Date: Mon, 13 Jan 2025 10:08:50 -0800 Subject: [PATCH 307/408] The _code field in an NSError is signed, not unsigned. (#119764) The NSError summary provider was fetching and printing the `_code` field as an unsigned integer, but it's defined to be an NSInteger, which is signed. --- lldb/source/Plugins/Language/ObjC/NSError.cpp | 10 +++++----- .../TestDataFormatterObjCNSError.py | 6 ++++-- .../data-formatter/data-formatter-objc/main.m | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/lldb/source/Plugins/Language/ObjC/NSError.cpp b/lldb/source/Plugins/Language/ObjC/NSError.cpp index 2356bc4ef4bab..bb54044ae1d61 100644 --- a/lldb/source/Plugins/Language/ObjC/NSError.cpp +++ b/lldb/source/Plugins/Language/ObjC/NSError.cpp @@ -66,8 +66,8 @@ bool lldb_private::formatters::NSError_SummaryProvider( lldb::addr_t domain_location = ptr_value + 3 * ptr_size; Status error; - uint64_t code = process_sp->ReadUnsignedIntegerFromMemory(code_location, - ptr_size, 0, error); + int64_t code = process_sp->ReadSignedIntegerFromMemory(code_location, + ptr_size, 0, error); if (error.Fail()) return false; @@ -77,7 +77,7 @@ bool lldb_private::formatters::NSError_SummaryProvider( return false; if (!domain_str_value) { - stream.Printf("domain: nil - code: %" PRIu64, code); + stream.Printf("domain: nil - code: %" PRIi64, code); return true; } @@ -98,11 +98,11 @@ bool lldb_private::formatters::NSError_SummaryProvider( StreamString domain_str_summary; if (NSStringSummaryProvider(*domain_str_sp, domain_str_summary, options) && !domain_str_summary.Empty()) { - stream.Printf("domain: %s - code: %" PRIu64, domain_str_summary.GetData(), + stream.Printf("domain: %s - code: %" PRIi64, domain_str_summary.GetData(), code); return true; } else { - stream.Printf("domain: nil - code: %" PRIu64, code); + stream.Printf("domain: nil - code: %" PRIi64, code); return true; } } diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/TestDataFormatterObjCNSError.py b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/TestDataFormatterObjCNSError.py index 8a052cf84ef0e..de15e5915750b 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/TestDataFormatterObjCNSError.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/TestDataFormatterObjCNSError.py @@ -23,10 +23,12 @@ def test_nserror_with_run_command_no_const(self): self.appkit_tester_impl(self.nserror_data_formatter_commands, False) def nserror_data_formatter_commands(self): - self.expect("frame variable nserror", substrs=['domain: @"Foobar" - code: 12']) + self.expect( + "frame variable nserror", substrs=['domain: @"Foobar" - code: -1234'] + ) self.expect( - 
"frame variable nserrorptr", substrs=['domain: @"Foobar" - code: 12'] + "frame variable nserrorptr", substrs=['domain: @"Foobar" - code: -1234'] ) self.expect("frame variable nserror->_userInfo", substrs=["2 key/value pairs"]) diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/main.m b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/main.m index 0ca5cf98bd3a5..314bada49303d 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/main.m +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/main.m @@ -618,7 +618,7 @@ int main(int argc, const char *argv[]) { NSDictionary *error_userInfo = @{@"a" : @1, @"b" : @2}; NSError *nserror = [[NSError alloc] initWithDomain:@"Foobar" - code:12 + code:-1234 userInfo:error_userInfo]; NSError **nserrorptr = &nserror; From 380bb51b70b6d9f3da07a87f56fc3fe44bc78691 Mon Sep 17 00:00:00 2001 From: joaosaffran <126493771+joaosaffran@users.noreply.github.com> Date: Mon, 13 Jan 2025 10:31:25 -0800 Subject: [PATCH 308/408] [HLSL] Adding Flatten and Branch if attributes with test fixes (#122157) - Adding the changes from PRs: - #116331 - #121852 - Fixes test `tools/dxil-dis/debug-info.ll` - Address some missed comments in the previous PR --------- Co-authored-by: joaosaffran --- clang/include/clang/Basic/Attr.td | 10 ++ clang/lib/CodeGen/CGStmt.cpp | 6 ++ clang/lib/CodeGen/CodeGenFunction.cpp | 26 ++++- clang/lib/CodeGen/CodeGenFunction.h | 4 + clang/lib/Sema/SemaStmtAttr.cpp | 8 ++ clang/test/AST/HLSL/HLSLControlFlowHint.hlsl | 43 ++++++++ .../test/CodeGenHLSL/HLSLControlFlowHint.hlsl | 48 +++++++++ llvm/include/llvm/IR/IntrinsicsSPIRV.td | 2 +- .../Target/DirectX/DXILTranslateMetadata.cpp | 36 +++++++ .../Target/SPIRV/SPIRVInstructionSelector.cpp | 29 ++++-- llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp | 44 ++++++--- .../CodeGen/DirectX/HLSLControlFlowHint.ll | 98 +++++++++++++++++++ .../HLSLControlFlowHint-pass-check.ll | 90 +++++++++++++++++ .../SPIRV/structurizer/HLSLControlFlowHint.ll | 91 +++++++++++++++++ 14 files changed, 516 insertions(+), 19 deletions(-) create mode 100644 clang/test/AST/HLSL/HLSLControlFlowHint.hlsl create mode 100644 clang/test/CodeGenHLSL/HLSLControlFlowHint.hlsl create mode 100644 llvm/test/CodeGen/DirectX/HLSLControlFlowHint.ll create mode 100644 llvm/test/CodeGen/SPIRV/structurizer/HLSLControlFlowHint-pass-check.ll create mode 100644 llvm/test/CodeGen/SPIRV/structurizer/HLSLControlFlowHint.ll diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index c0632aaa51625..a752d94b06fad 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -4353,6 +4353,16 @@ def HLSLLoopHint: StmtAttr { let Documentation = [HLSLLoopHintDocs, HLSLUnrollHintDocs]; } +def HLSLControlFlowHint: StmtAttr { + /// [branch] + /// [flatten] + let Spellings = [Microsoft<"branch">, Microsoft<"flatten">]; + let Subjects = SubjectList<[IfStmt], + ErrorDiag, "'if' statements">; + let LangOpts = [HLSL]; + let Documentation = [InternalOnly]; +} + def CapturedRecord : InheritableAttr { // This attribute has no spellings as it is only ever created implicitly. 
let Spellings = []; diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp index f9258a396b7d0..4ba8ee1ca17d4 100644 --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -760,6 +760,8 @@ void CodeGenFunction::EmitAttributedStmt(const AttributedStmt &S) { bool noinline = false; bool alwaysinline = false; bool noconvergent = false; + HLSLControlFlowHintAttr::Spelling flattenOrBranch = + HLSLControlFlowHintAttr::SpellingNotCalculated; const CallExpr *musttail = nullptr; for (const auto *A : S.getAttrs()) { @@ -791,6 +793,9 @@ void CodeGenFunction::EmitAttributedStmt(const AttributedStmt &S) { Builder.CreateAssumption(AssumptionVal); } } break; + case attr::HLSLControlFlowHint: { + flattenOrBranch = cast(A)->getSemanticSpelling(); + } break; } } SaveAndRestore save_nomerge(InNoMergeAttributedStmt, nomerge); @@ -798,6 +803,7 @@ void CodeGenFunction::EmitAttributedStmt(const AttributedStmt &S) { SaveAndRestore save_alwaysinline(InAlwaysInlineAttributedStmt, alwaysinline); SaveAndRestore save_noconvergent(InNoConvergentAttributedStmt, noconvergent); SaveAndRestore save_musttail(MustTailCall, musttail); + SaveAndRestore save_flattenOrBranch(HLSLControlFlowAttr, flattenOrBranch); EmitStmt(S.getSubStmt(), S.getAttrs()); } diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index d6f3716afabdf..11fdddba1144b 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -40,6 +40,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/FPEnv.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/MDBuilder.h" @@ -2086,7 +2087,30 @@ void CodeGenFunction::EmitBranchOnBoolExpr( Weights = createProfileWeights(TrueCount, CurrentCount - TrueCount); } - Builder.CreateCondBr(CondV, TrueBlock, FalseBlock, Weights, Unpredictable); + llvm::Instruction *BrInst = Builder.CreateCondBr(CondV, TrueBlock, FalseBlock, + Weights, Unpredictable); + switch (HLSLControlFlowAttr) { + case HLSLControlFlowHintAttr::Microsoft_branch: + case HLSLControlFlowHintAttr::Microsoft_flatten: { + llvm::MDBuilder MDHelper(CGM.getLLVMContext()); + + llvm::ConstantInt *BranchHintConstant = + HLSLControlFlowAttr == + HLSLControlFlowHintAttr::Spelling::Microsoft_branch + ? llvm::ConstantInt::get(CGM.Int32Ty, 1) + : llvm::ConstantInt::get(CGM.Int32Ty, 2); + + SmallVector Vals( + {MDHelper.createString("hlsl.controlflow.hint"), + MDHelper.createConstant(BranchHintConstant)}); + BrInst->setMetadata("hlsl.controlflow.hint", + llvm::MDNode::get(CGM.getLLVMContext(), Vals)); + break; + } + // This is required to avoid warnings during compilation + case HLSLControlFlowHintAttr::SpellingNotCalculated: + break; + } } /// ErrorUnsupported - Print out an error that codegen doesn't support the diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 311f2ae94d046..b115c15bf01a9 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -615,6 +615,10 @@ class CodeGenFunction : public CodeGenTypeCache { /// True if the current statement has noconvergent attribute. bool InNoConvergentAttributedStmt = false; + /// HLSL Branch attribute. + HLSLControlFlowHintAttr::Spelling HLSLControlFlowAttr = + HLSLControlFlowHintAttr::SpellingNotCalculated; + // The CallExpr within the current statement that the musttail attribute // applies to. 
nullptr if there is no 'musttail' on the current statement. const CallExpr *MustTailCall = nullptr; diff --git a/clang/lib/Sema/SemaStmtAttr.cpp b/clang/lib/Sema/SemaStmtAttr.cpp index 106e2430de901..422d8abc1028a 100644 --- a/clang/lib/Sema/SemaStmtAttr.cpp +++ b/clang/lib/Sema/SemaStmtAttr.cpp @@ -619,6 +619,12 @@ static Attr *handleHLSLLoopHintAttr(Sema &S, Stmt *St, const ParsedAttr &A, return ::new (S.Context) HLSLLoopHintAttr(S.Context, A, UnrollFactor); } +static Attr *handleHLSLControlFlowHint(Sema &S, Stmt *St, const ParsedAttr &A, + SourceRange Range) { + + return ::new (S.Context) HLSLControlFlowHintAttr(S.Context, A); +} + static Attr *ProcessStmtAttribute(Sema &S, Stmt *St, const ParsedAttr &A, SourceRange Range) { if (A.isInvalid() || A.getKind() == ParsedAttr::IgnoredAttribute) @@ -655,6 +661,8 @@ static Attr *ProcessStmtAttribute(Sema &S, Stmt *St, const ParsedAttr &A, return handleLoopHintAttr(S, St, A, Range); case ParsedAttr::AT_HLSLLoopHint: return handleHLSLLoopHintAttr(S, St, A, Range); + case ParsedAttr::AT_HLSLControlFlowHint: + return handleHLSLControlFlowHint(S, St, A, Range); case ParsedAttr::AT_OpenCLUnrollHint: return handleOpenCLUnrollHint(S, St, A, Range); case ParsedAttr::AT_Suppress: diff --git a/clang/test/AST/HLSL/HLSLControlFlowHint.hlsl b/clang/test/AST/HLSL/HLSLControlFlowHint.hlsl new file mode 100644 index 0000000000000..a36779c05fbc9 --- /dev/null +++ b/clang/test/AST/HLSL/HLSLControlFlowHint.hlsl @@ -0,0 +1,43 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-compute -ast-dump %s | FileCheck %s + +// CHECK: FunctionDecl 0x{{[0-9A-Fa-f]+}} <{{.*}}> {{.*}} used branch 'int (int)' +// CHECK: AttributedStmt 0x{{[0-9A-Fa-f]+}} < +// CHECK-NEXT: -HLSLControlFlowHintAttr 0x{{[0-9A-Fa-f]+}} <{{.*}}> branch +export int branch(int X){ + int resp; + [branch] if (X > 0) { + resp = -X; + } else { + resp = X * 2; + } + + return resp; +} + +// CHECK: FunctionDecl 0x{{[0-9A-Fa-f]+}} <{{.*}}> {{.*}} used flatten 'int (int)' +// CHECK: AttributedStmt 0x{{[0-9A-Fa-f]+}} < +// CHECK-NEXT: -HLSLControlFlowHintAttr 0x{{[0-9A-Fa-f]+}} <{{.*}}> flatten +export int flatten(int X){ + int resp; + [flatten] if (X > 0) { + resp = -X; + } else { + resp = X * 2; + } + + return resp; +} + +// CHECK: FunctionDecl 0x{{[0-9A-Fa-f]+}} <{{.*}}> {{.*}} used no_attr 'int (int)' +// CHECK-NOT: AttributedStmt 0x{{[0-9A-Fa-f]+}} < +// CHECK-NOT: -HLSLControlFlowHintAttr +export int no_attr(int X){ + int resp; + if (X > 0) { + resp = -X; + } else { + resp = X * 2; + } + + return resp; +} diff --git a/clang/test/CodeGenHLSL/HLSLControlFlowHint.hlsl b/clang/test/CodeGenHLSL/HLSLControlFlowHint.hlsl new file mode 100644 index 0000000000000..aa13b27581850 --- /dev/null +++ b/clang/test/CodeGenHLSL/HLSLControlFlowHint.hlsl @@ -0,0 +1,48 @@ +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple spirv-vulkan-library %s -fnative-half-type -emit-llvm -o - | FileCheck %s + +// CHECK: define {{.*}} i32 {{.*}}test_branch{{.*}}(i32 {{.*}} [[VALD:%.*]]) +// CHECK: [[PARAM:%.*]] = load i32, ptr [[VALD]].addr, align 4 +// CHECK: [[CMP:%.*]] = icmp sgt i32 [[PARAM]], 0 +// CHECK: br i1 [[CMP]], label %if.then, label %if.else, !hlsl.controlflow.hint [[HINT_BRANCH:![0-9]+]] +export int test_branch(int X){ + int resp; + [branch] if (X > 0) { + resp = -X; + } else { + resp = X * 2; + } + + return resp; +} + +// CHECK: define {{.*}} i32 
{{.*}}test_flatten{{.*}}(i32 {{.*}} [[VALD:%.*]]) +// CHECK: [[PARAM:%.*]] = load i32, ptr [[VALD]].addr, align 4 +// CHECK: [[CMP:%.*]] = icmp sgt i32 [[PARAM]], 0 +// CHECK: br i1 [[CMP]], label %if.then, label %if.else, !hlsl.controlflow.hint [[HINT_FLATTEN:![0-9]+]] +export int test_flatten(int X){ + int resp; + [flatten] if (X > 0) { + resp = -X; + } else { + resp = X * 2; + } + + return resp; +} + +// CHECK: define {{.*}} i32 {{.*}}test_no_attr{{.*}}(i32 {{.*}} [[VALD:%.*]]) +// CHECK-NOT: !hlsl.controlflow.hint +export int test_no_attr(int X){ + int resp; + if (X > 0) { + resp = -X; + } else { + resp = X * 2; + } + + return resp; +} + +//CHECK: [[HINT_BRANCH]] = !{!"hlsl.controlflow.hint", i32 1} +//CHECK: [[HINT_FLATTEN]] = !{!"hlsl.controlflow.hint", i32 2} diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td index b4d2dce66a6f0..37057271b6c28 100644 --- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td +++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td @@ -33,7 +33,7 @@ let TargetPrefix = "spv" in { def int_spv_ptrcast : Intrinsic<[llvm_any_ty], [llvm_any_ty, llvm_metadata_ty, llvm_i32_ty], [ImmArg>]>; def int_spv_switch : Intrinsic<[], [llvm_any_ty, llvm_vararg_ty]>; def int_spv_loop_merge : Intrinsic<[], [llvm_vararg_ty]>; - def int_spv_selection_merge : Intrinsic<[], [llvm_vararg_ty]>; + def int_spv_selection_merge : Intrinsic<[], [llvm_any_ty, llvm_i32_ty], [ImmArg>]>; def int_spv_cmpxchg : Intrinsic<[llvm_i32_ty], [llvm_any_ty, llvm_vararg_ty]>; def int_spv_unreachable : Intrinsic<[], []>; def int_spv_alloca : Intrinsic<[llvm_any_ty], [llvm_i8_ty], [ImmArg>]>; diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp index 5afe6b2d2883d..5fd5c226eef89 100644 --- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp +++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp @@ -15,12 +15,14 @@ #include "llvm/ADT/Twine.h" #include "llvm/Analysis/DXILMetadataAnalysis.h" #include "llvm/Analysis/DXILResource.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/DiagnosticPrinter.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" @@ -300,6 +302,38 @@ static MDTuple *emitTopLevelLibraryNode(Module &M, MDNode *RMD, return constructEntryMetadata(nullptr, nullptr, RMD, Properties, Ctx); } +// TODO: We might need to refactor this to be more generic, +// in case we need more metadata to be replaced. 
+static void translateBranchMetadata(Module &M) { + for (Function &F : M) { + for (BasicBlock &BB : F) { + Instruction *BBTerminatorInst = BB.getTerminator(); + + MDNode *HlslControlFlowMD = + BBTerminatorInst->getMetadata("hlsl.controlflow.hint"); + + if (!HlslControlFlowMD) + continue; + + assert(HlslControlFlowMD->getNumOperands() == 2 && + "invalid operands for hlsl.controlflow.hint"); + + MDBuilder MDHelper(M.getContext()); + ConstantInt *Op1 = + mdconst::extract<ConstantInt>(HlslControlFlowMD->getOperand(1)); + + SmallVector<llvm::Metadata *, 2> Vals( + ArrayRef<Metadata *>{MDHelper.createString("dx.controlflow.hints"), + MDHelper.createConstant(Op1)}); + + MDNode *MDNode = llvm::MDNode::get(M.getContext(), Vals); + + BBTerminatorInst->setMetadata("dx.controlflow.hints", MDNode); + BBTerminatorInst->setMetadata("hlsl.controlflow.hint", nullptr); + } + } +} + static void translateMetadata(Module &M, DXILBindingMap &DBM, DXILResourceTypeMap &DRTM, const Resources &MDResources, @@ -372,6 +406,7 @@ PreservedAnalyses DXILTranslateMetadata::run(Module &M, const dxil::ModuleMetadataInfo MMDI = MAM.getResult<DXILMetadataAnalysis>(M); translateMetadata(M, DBM, DRTM, MDResources, ShaderFlags, MMDI); + translateBranchMetadata(M); return PreservedAnalyses::all(); } @@ -409,6 +444,7 @@ class DXILTranslateMetadataLegacy : public ModulePass { getAnalysis<DXILMetadataAnalysisWrapperPass>().getModuleMetadata(); translateMetadata(M, DBM, DRTM, MDResources, ShaderFlags, MMDI); + translateBranchMetadata(M); return true; } }; diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index b7b32dd0d626c..1d6be7619ecf4 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -33,6 +33,7 @@ #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/IR/IntrinsicsSPIRV.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #define DEBUG_TYPE "spirv-isel" @@ -45,6 +46,17 @@ using ExtInstList = std::vector<std::pair<SPIRV::InstructionSet::InstructionSet, uint32_t>>; namespace { +llvm::SPIRV::SelectionControl::SelectionControl +getSelectionOperandForImm(int Imm) { + if (Imm == 2) + return SPIRV::SelectionControl::Flatten; + if (Imm == 1) + return SPIRV::SelectionControl::DontFlatten; + if (Imm == 0) + return SPIRV::SelectionControl::None; + llvm_unreachable("Invalid immediate"); +} + #define GET_GLOBALISEL_PREDICATE_BITSET #include "SPIRVGenGlobalISel.inc" #undef GET_GLOBALISEL_PREDICATE_BITSET @@ -2818,12 +2830,8 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, } return MIB.constrainAllUses(TII, TRI, RBI); } - case Intrinsic::spv_loop_merge: - case Intrinsic::spv_selection_merge: { - const auto Opcode = IID == Intrinsic::spv_selection_merge - ? 
SPIRV::OpSelectionMerge - : SPIRV::OpLoopMerge; - auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(Opcode)); + case Intrinsic::spv_loop_merge: { + auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpLoopMerge)); for (unsigned i = 1; i < I.getNumExplicitOperands(); ++i) { assert(I.getOperand(i).isMBB()); MIB.addMBB(I.getOperand(i).getMBB()); @@ -2831,6 +2839,15 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, MIB.addImm(SPIRV::SelectionControl::None); return MIB.constrainAllUses(TII, TRI, RBI); } + case Intrinsic::spv_selection_merge: { + auto MIB = + BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpSelectionMerge)); + assert(I.getOperand(1).isMBB() && + "operand 1 to spv_selection_merge must be a basic block"); + MIB.addMBB(I.getOperand(1).getMBB()); + MIB.addImm(getSelectionOperandForImm(I.getOperand(2).getImm())); + return MIB.constrainAllUses(TII, TRI, RBI); + } case Intrinsic::spv_cmpxchg: return selectAtomicCmpXchg(ResVReg, ResType, I); case Intrinsic::spv_unreachable: diff --git a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp index 336cde4e78224..2e4343c7922f1 100644 --- a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp @@ -18,14 +18,16 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/CodeGen/IntrinsicLowering.h" -#include "llvm/IR/Analysis.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsSPIRV.h" +#include "llvm/IR/LegacyPassManager.h" #include "llvm/InitializePasses.h" +#include "llvm/PassRegistry.h" +#include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LowerMemIntrinsics.h" @@ -646,8 +648,7 @@ class SPIRVStructurizer : public FunctionPass { Builder.SetInsertPoint(Header->getTerminator()); auto MergeAddress = BlockAddress::get(BB.getParent(), &BB); - SmallVector<Value *, 1> Args = {MergeAddress}; - Builder.CreateIntrinsic(Intrinsic::spv_selection_merge, {}, {Args}); + createOpSelectMerge(&Builder, MergeAddress); Modified = true; } @@ -769,10 +770,9 @@ class SPIRVStructurizer : public FunctionPass { BasicBlock *Merge = Candidates[0]; auto MergeAddress = BlockAddress::get(Merge->getParent(), Merge); - SmallVector<Value *, 1> Args = {MergeAddress}; IRBuilder<> Builder(&BB); Builder.SetInsertPoint(BB.getTerminator()); - Builder.CreateIntrinsic(Intrinsic::spv_selection_merge, {}, {Args}); + createOpSelectMerge(&Builder, MergeAddress); } return Modified; @@ -1105,8 +1105,7 @@ class SPIRVStructurizer : public FunctionPass { Builder.SetInsertPoint(Header->getTerminator()); auto MergeAddress = BlockAddress::get(Merge->getParent(), Merge); - SmallVector<Value *, 1> Args = {MergeAddress}; - Builder.CreateIntrinsic(Intrinsic::spv_selection_merge, {}, {Args}); + createOpSelectMerge(&Builder, MergeAddress); continue; } @@ -1120,8 +1119,7 @@ class SPIRVStructurizer : public FunctionPass { Builder.SetInsertPoint(Header->getTerminator()); auto MergeAddress = BlockAddress::get(NewMerge->getParent(), NewMerge); - SmallVector<Value *, 1> Args = {MergeAddress}; - Builder.CreateIntrinsic(Intrinsic::spv_selection_merge, {}, {Args}); + createOpSelectMerge(&Builder, MergeAddress); } return Modified; @@ -1208,6 +1206,27 @@ class SPIRVStructurizer : public FunctionPass { AU.addPreserved(); FunctionPass::getAnalysisUsage(AU); } + + void 
createOpSelectMerge(IRBuilder<> *Builder, BlockAddress *MergeAddress) { + Instruction *BBTerminatorInst = Builder->GetInsertBlock()->getTerminator(); + + MDNode *MDNode = BBTerminatorInst->getMetadata("hlsl.controlflow.hint"); + + ConstantInt *BranchHint = llvm::ConstantInt::get(Builder->getInt32Ty(), 0); + + if (MDNode) { + assert(MDNode->getNumOperands() == 2 && + "invalid metadata hlsl.controlflow.hint"); + BranchHint = mdconst::extract<ConstantInt>(MDNode->getOperand(1)); + + assert(BranchHint && "invalid metadata value for hlsl.controlflow.hint"); + } + + llvm::SmallVector<llvm::Value *, 2> Args = {MergeAddress, BranchHint}; + + Builder->CreateIntrinsic(Intrinsic::spv_selection_merge, + {MergeAddress->getType()}, {Args}); + } }; } // namespace llvm @@ -1229,8 +1248,11 @@ FunctionPass *llvm::createSPIRVStructurizerPass() { PreservedAnalyses SPIRVStructurizerWrapper::run(Function &F, FunctionAnalysisManager &AF) { - FunctionPass *StructurizerPass = createSPIRVStructurizerPass(); - if (!StructurizerPass->runOnFunction(F)) + + auto FPM = legacy::FunctionPassManager(F.getParent()); + FPM.add(createSPIRVStructurizerPass()); + + if (!FPM.run(F)) return PreservedAnalyses::all(); PreservedAnalyses PA; PA.preserveSet<CFGAnalyses>(); diff --git a/llvm/test/CodeGen/DirectX/HLSLControlFlowHint.ll b/llvm/test/CodeGen/DirectX/HLSLControlFlowHint.ll new file mode 100644 index 0000000000000..6a5274429930e --- /dev/null +++ b/llvm/test/CodeGen/DirectX/HLSLControlFlowHint.ll @@ -0,0 +1,98 @@ +; RUN: opt -S -dxil-op-lower -dxil-translate-metadata -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s + +; This test makes sure LLVM metadata is being translated into DXIL. + + +; CHECK: define i32 @test_branch(i32 %X) +; CHECK-NOT: hlsl.controlflow.hint +; CHECK: br i1 %cmp, label %if.then, label %if.else, !dx.controlflow.hints [[HINT_BRANCH:![0-9]+]] +define i32 @test_branch(i32 %X) { +entry: + %X.addr = alloca i32, align 4 + 
%resp = alloca i32, align 4 + store i32 %X, ptr %X.addr, align 4 + %0 = load i32, ptr %X.addr, align 4 + %cmp = icmp sgt i32 %0, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %1 = load i32, ptr %X.addr, align 4 + %sub = sub nsw i32 0, %1 + store i32 %sub, ptr %resp, align 4 + br label %if.end + +if.else: ; preds = %entry + %2 = load i32, ptr %X.addr, align 4 + %mul = mul nsw i32 %2, 2 + store i32 %mul, ptr %resp, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %3 = load i32, ptr %resp, align 4 + ret i32 %3 +} +; CHECK-NOT: hlsl.controlflow.hint +; CHECK: [[HINT_BRANCH]] = !{!"dx.controlflow.hints", i32 1} +; CHECK: [[HINT_FLATTEN]] = !{!"dx.controlflow.hints", i32 2} +!0 = !{!"hlsl.controlflow.hint", i32 1} +!1 = !{!"hlsl.controlflow.hint", i32 2} diff --git a/llvm/test/CodeGen/SPIRV/structurizer/HLSLControlFlowHint-pass-check.ll b/llvm/test/CodeGen/SPIRV/structurizer/HLSLControlFlowHint-pass-check.ll new file mode 100644 index 0000000000000..9911b3119ce52 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/structurizer/HLSLControlFlowHint-pass-check.ll @@ -0,0 +1,90 @@ +; RUN: opt -passes='spirv-structurizer' -S -mtriple=spirv-unknown-unknown %s | FileCheck %s + +; CHECK-LABEL: define spir_func noundef i32 @test_branch +; CHECK: call void @llvm.spv.selection.merge.p0(ptr blockaddress(@test_branch, %if.end), i32 1) +; CHECK-NEXT: br i1 %cmp, label %if.then, label %if.else, !hlsl.controlflow.hint !{{[0-9]+}} +define spir_func noundef i32 @test_branch(i32 noundef %X) { +entry: + %X.addr = alloca i32, align 4 + %resp = alloca i32, align 4 + store i32 %X, ptr %X.addr, align 4 + %0 = load i32, ptr %X.addr, align 4 + %cmp = icmp sgt i32 %0, 0 + br i1 %cmp, label %if.then, label %if.else, !hlsl.controlflow.hint !0 + +if.then: ; preds = %entry + %1 = load i32, ptr %X.addr, align 4 + %sub = sub nsw i32 0, %1 + store i32 %sub, ptr %resp, align 4 + br label %if.end + +if.else: ; preds = %entry + %2 = load i32, ptr %X.addr, align 4 + %mul = mul nsw i32 %2, 2 + store i32 %mul, ptr %resp, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %3 = load i32, ptr %resp, align 4 + ret i32 %3 +} + +; CHECK-LABEL: define spir_func noundef i32 @test_flatten +; CHECK: call void @llvm.spv.selection.merge.p0(ptr blockaddress(@test_flatten, %if.end), i32 2) +; CHECK-NEXT: br i1 %cmp, label %if.then, label %if.else, !hlsl.controlflow.hint !{{[0-9]+}} +define spir_func noundef i32 @test_flatten(i32 noundef %X) { +entry: + %X.addr = alloca i32, align 4 + %resp = alloca i32, align 4 + store i32 %X, ptr %X.addr, align 4 + %0 = load i32, ptr %X.addr, align 4 + %cmp = icmp sgt i32 %0, 0 + br i1 %cmp, label %if.then, label %if.else, !hlsl.controlflow.hint !1 + +if.then: ; preds = %entry + %1 = load i32, ptr %X.addr, align 4 + %sub = sub nsw i32 0, %1 + store i32 %sub, ptr %resp, align 4 + br label %if.end + +if.else: ; preds = %entry + %2 = load i32, ptr %X.addr, align 4 + %mul = mul nsw i32 %2, 2 + store i32 %mul, ptr %resp, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %3 = load i32, ptr %resp, align 4 + ret i32 %3 +} +; CHECK-LABEL: define spir_func noundef i32 @test_no_attr +; CHECK: call void @llvm.spv.selection.merge.p0(ptr blockaddress(@test_no_attr, %if.end), i32 0) +; CHECK-NEXT: br i1 %cmp, label %if.then, label %if.else +define spir_func noundef i32 @test_no_attr(i32 noundef %X) { +entry: + %X.addr = alloca i32, align 4 + %resp = alloca i32, align 4 + store i32 %X, ptr %X.addr, align 4 + %0 = load i32, ptr %X.addr, align 4 + %cmp 
= icmp sgt i32 %0, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %1 = load i32, ptr %X.addr, align 4 + %sub = sub nsw i32 0, %1 + store i32 %sub, ptr %resp, align 4 + br label %if.end + +if.else: ; preds = %entry + %2 = load i32, ptr %X.addr, align 4 + %mul = mul nsw i32 %2, 2 + store i32 %mul, ptr %resp, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %3 = load i32, ptr %resp, align 4 + ret i32 %3 +} + +!0 = !{!"hlsl.controlflow.hint", i32 1} +!1 = !{!"hlsl.controlflow.hint", i32 2} diff --git a/llvm/test/CodeGen/SPIRV/structurizer/HLSLControlFlowHint.ll b/llvm/test/CodeGen/SPIRV/structurizer/HLSLControlFlowHint.ll new file mode 100644 index 0000000000000..848eaf70f5a19 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/structurizer/HLSLControlFlowHint.ll @@ -0,0 +1,91 @@ +; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} + + +define spir_func noundef i32 @test_branch(i32 noundef %X) { +entry: +; CHECK-LABEL: ; -- Begin function test_branch +; OpSelectionMerge %[[#]] DontFlatten + %X.addr = alloca i32, align 4 + %resp = alloca i32, align 4 + store i32 %X, ptr %X.addr, align 4 + %0 = load i32, ptr %X.addr, align 4 + %cmp = icmp sgt i32 %0, 0 + br i1 %cmp, label %if.then, label %if.else, !hlsl.controlflow.hint !0 + +if.then: ; preds = %entry + %1 = load i32, ptr %X.addr, align 4 + %sub = sub nsw i32 0, %1 + store i32 %sub, ptr %resp, align 4 + br label %if.end + +if.else: ; preds = %entry + %2 = load i32, ptr %X.addr, align 4 + %mul = mul nsw i32 %2, 2 + store i32 %mul, ptr %resp, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %3 = load i32, ptr %resp, align 4 + ret i32 %3 +} + + +define spir_func noundef i32 @test_flatten(i32 noundef %X) { +entry: +; CHECK-LABEL: ; -- Begin function test_flatten +; OpSelectionMerge %[[#]] Flatten + %X.addr = alloca i32, align 4 + %resp = alloca i32, align 4 + store i32 %X, ptr %X.addr, align 4 + %0 = load i32, ptr %X.addr, align 4 + %cmp = icmp sgt i32 %0, 0 + br i1 %cmp, label %if.then, label %if.else, !hlsl.controlflow.hint !1 + +if.then: ; preds = %entry + %1 = load i32, ptr %X.addr, align 4 + %sub = sub nsw i32 0, %1 + store i32 %sub, ptr %resp, align 4 + br label %if.end + +if.else: ; preds = %entry + %2 = load i32, ptr %X.addr, align 4 + %mul = mul nsw i32 %2, 2 + store i32 %mul, ptr %resp, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %3 = load i32, ptr %resp, align 4 + ret i32 %3 +} + +define spir_func noundef i32 @test_no_attr(i32 noundef %X) { +entry: +; CHECK-LABEL: ; -- Begin function test_no_attr +; OpSelectionMerge %[[#]] None + %X.addr = alloca i32, align 4 + %resp = alloca i32, align 4 + store i32 %X, ptr %X.addr, align 4 + %0 = load i32, ptr %X.addr, align 4 + %cmp = icmp sgt i32 %0, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %1 = load i32, ptr %X.addr, align 4 + %sub = sub nsw i32 0, %1 + store i32 %sub, ptr %resp, align 4 + br label %if.end + +if.else: ; preds = %entry + %2 = load i32, ptr %X.addr, align 4 + %mul = mul nsw i32 %2, 2 + store i32 %mul, ptr %resp, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %3 = load i32, ptr %resp, align 4 + ret i32 %3 +} + +!0 = !{!"hlsl.controlflow.hint", i32 1} +!1 = !{!"hlsl.controlflow.hint", i32 2} From ae546175235b67d2047196543738f9d981568f0f Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Mon, 13 Jan 2025 10:26:15 -0800 
Subject: [PATCH 309/408] [SLP][NFC]Add a test with incorrect extractelement parameter after extending with poison --- .../X86/extractelemets-extended-by-poison.ll | 86 +++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll new file mode 100644 index 0000000000000..6af59aee54e55 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll @@ -0,0 +1,86 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define i32 @test() { +; CHECK-LABEL: define i32 @test() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr null, align 16 +; CHECK-NEXT: [[TMP1:%.*]] = or i64 poison, 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> , <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> [[TMP4]], <4 x i64> [[TMP0]], i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = trunc <8 x i64> [[TMP5]] to <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = add <16 x i32> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0 +; CHECK-NEXT: [[INC_3_3_I_1:%.*]] = or i64 [[TMP9]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[TMP8]]) +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> poison) +; CHECK-NEXT: [[OP_RDX:%.*]] = or i32 [[TMP10]], [[TMP11]] +; CHECK-NEXT: ret i32 [[OP_RDX]] +; +entry: + %.pre.i = load i64, ptr getelementptr inbounds nuw (i8, ptr null, i64 24), align 8 + %.pre50.i = load i64, ptr getelementptr inbounds nuw (i8, ptr null, i64 16), align 16 + %.pre51.i = load i64, ptr getelementptr inbounds nuw (i8, ptr null, i64 8), align 8 + %.pre52.i = load i64, ptr null, align 16 + %0 = or i64 %.pre51.i, 0 + %1 = trunc i64 %.pre.i to i32 + %2 = add i32 %1, 0 + %3 = trunc i64 %.pre50.i to i32 + %4 = add i32 %3, 0 + %5 = trunc i64 %.pre51.i to i32 + %6 = add i32 %5, 0 + %7 = trunc i64 0 to i32 + %8 = add i32 %5, 0 + %9 = add i32 %7, 0 + %10 = add i32 %1, 0 + %11 = add i32 %3, 0 + %12 = add i32 %5, 0 + %13 = add i32 %7, 0 + %14 = trunc i64 %.pre.i to i32 + %15 = add i32 %14, 0 + %16 = trunc i64 %.pre50.i to i32 + %17 = add i32 %16, 0 + %18 = trunc i64 %.pre51.i to i32 + %19 = add i32 %18, 0 + %20 = trunc i64 %.pre52.i to i32 + %conv14.1.i = or i32 %9, %13 + %21 = or i32 %conv14.1.i, %6 + %22 = or i32 %21, %8 + %23 = or i32 %22, %12 + %24 = or i32 %23, %4 + %25 = or i32 %24, %11 + %26 = or i32 %25, %2 + %27 = or i32 %26, %10 + %28 = or i32 %27, %15 + %29 = or i32 %28, %17 + %30 = or i32 %29, %19 + %31 = add i32 %14, 0 + %32 = add i32 %16, 0 + %33 = add i32 %18, 0 + %34 = add i32 %20, 0 + %35 = add i32 %14, 0 + %36 = add i32 %16, 0 + %37 = add i32 %18, 0 + %38 = add i32 %20, 0 + %39 = add i32 %14, 0 + %40 = add i32 %16, 0 + %41 = add i32 %18, 0 + %42 = add i32 %20, 0 + %inc.3.3.i.1 = or i64 %.pre52.i, 0 + %conv14.i.1 = or i32 %38, %34 
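+ ; The ors below form the second reduction chain in this function; per the patch subject, the CHECK lines above pin down the extractelement (captured as [[TMP9]], index i32 0) whose parameter is incorrect once the extracts are extended by poison.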
+ %conv14.1.i.1 = or i32 %conv14.i.1, %42 + %conv14.3.i.1 = or i32 %conv14.1.i.1, %33 + %conv14.145.i.1 = or i32 %conv14.3.i.1, %37 + %conv14.1.1.i.1 = or i32 %conv14.145.i.1, %41 + %conv14.3.1.i.1 = or i32 %conv14.1.1.i.1, %32 + %conv14.247.i.1 = or i32 %conv14.3.1.i.1, %36 + %conv14.1.2.i.1 = or i32 %conv14.247.i.1, %40 + %conv14.3.2.i.1 = or i32 %conv14.1.2.i.1, %31 + %conv14.349.i.1 = or i32 %conv14.3.2.i.1, %35 + %conv14.1.3.i.1 = or i32 %conv14.349.i.1, %39 + %conv14.3.3.i.1 = or i32 %conv14.1.3.i.1, %30 + ret i32 %conv14.3.3.i.1 +} From 9844badfca51e0eba72964552fd624224cbaacb0 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 13 Jan 2025 10:36:40 -0800 Subject: [PATCH 310/408] [X86] Use loaded/stored element size when parsing/printing gather/scatter pointer size in Intel syntax. (#122530) This matches binutils. --- llvm/lib/Target/X86/AsmParser/X86Operand.h | 46 +- llvm/lib/Target/X86/X86InstrAVX512.td | 80 +- llvm/lib/Target/X86/X86InstrOperands.td | 39 +- llvm/lib/Target/X86/X86InstrSSE.td | 16 +- .../MC/Disassembler/X86/apx/evex-format.txt | 2 +- .../test/MC/Disassembler/X86/intel-syntax.txt | 4 +- llvm/test/MC/X86/avx-64-intel.s | 64 +- llvm/test/MC/X86/avx512-intel.s | 448 ++++----- llvm/test/MC/X86/avx512f_vl-intel.s | 896 +++++++++--------- llvm/test/MC/X86/intel-syntax.s | 2 +- llvm/utils/TableGen/X86RecognizableInstr.cpp | 38 +- 11 files changed, 808 insertions(+), 827 deletions(-) diff --git a/llvm/lib/Target/X86/AsmParser/X86Operand.h b/llvm/lib/Target/X86/AsmParser/X86Operand.h index 07a00af881afe..d715fd1903802 100644 --- a/llvm/lib/Target/X86/AsmParser/X86Operand.h +++ b/llvm/lib/Target/X86/AsmParser/X86Operand.h @@ -340,46 +340,38 @@ struct X86Operand final : public MCParsedAsmOperand { return Mem.IndexReg >= LowR && Mem.IndexReg <= HighR; } + bool isMem32_RC128() const { + return isMem32() && isMemIndexReg(X86::XMM0, X86::XMM15); + } bool isMem64_RC128() const { return isMem64() && isMemIndexReg(X86::XMM0, X86::XMM15); } - bool isMem128_RC128() const { - return isMem128() && isMemIndexReg(X86::XMM0, X86::XMM15); - } - bool isMem128_RC256() const { - return isMem128() && isMemIndexReg(X86::YMM0, X86::YMM15); + bool isMem32_RC256() const { + return isMem32() && isMemIndexReg(X86::YMM0, X86::YMM15); } - bool isMem256_RC128() const { - return isMem256() && isMemIndexReg(X86::XMM0, X86::XMM15); - } - bool isMem256_RC256() const { - return isMem256() && isMemIndexReg(X86::YMM0, X86::YMM15); + bool isMem64_RC256() const { + return isMem64() && isMemIndexReg(X86::YMM0, X86::YMM15); } + bool isMem32_RC128X() const { + return isMem32() && X86II::isXMMReg(Mem.IndexReg); + } bool isMem64_RC128X() const { return isMem64() && X86II::isXMMReg(Mem.IndexReg); } - bool isMem128_RC128X() const { - return isMem128() && X86II::isXMMReg(Mem.IndexReg); + bool isMem32_RC256X() const { + return isMem32() && X86II::isYMMReg(Mem.IndexReg); } - bool isMem128_RC256X() const { - return isMem128() && X86II::isYMMReg(Mem.IndexReg); + bool isMem64_RC256X() const { + return isMem64() && X86II::isYMMReg(Mem.IndexReg); } - bool isMem256_RC128X() const { - return isMem256() && X86II::isXMMReg(Mem.IndexReg); + bool isMem32_RC512() const { + return isMem32() && X86II::isZMMReg(Mem.IndexReg); } - bool isMem256_RC256X() const { - return isMem256() && X86II::isYMMReg(Mem.IndexReg); - } - bool isMem256_RC512() const { - return isMem256() && X86II::isZMMReg(Mem.IndexReg); - } - bool isMem512_RC256X() const { - return isMem512() && X86II::isYMMReg(Mem.IndexReg); - } - bool isMem512_RC512() const { - return 
isMem512() && X86II::isZMMReg(Mem.IndexReg); + bool isMem64_RC512() const { + return isMem64() && X86II::isZMMReg(Mem.IndexReg); } + bool isMem512_GR16() const { if (!isMem512()) return false; diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index abf016000fc8e..9d8c123185a7c 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -10279,36 +10279,36 @@ multiclass avx512_gather opc, string OpcodeStr, X86VectorVTInfo _, multiclass avx512_gather_q_pd dopc, bits<8> qopc, AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { defm NAME#D#SUFF#Z: avx512_gather, EVEX_V512, REX_W; + vy64xmem>, EVEX_V512, REX_W; defm NAME#Q#SUFF#Z: avx512_gather, EVEX_V512, REX_W; + vz64mem>, EVEX_V512, REX_W; let Predicates = [HasVLX] in { defm NAME#D#SUFF#Z256: avx512_gather, EVEX_V256, REX_W; + vx64xmem>, EVEX_V256, REX_W; defm NAME#Q#SUFF#Z256: avx512_gather, EVEX_V256, REX_W; + vy64xmem>, EVEX_V256, REX_W; defm NAME#D#SUFF#Z128: avx512_gather, EVEX_V128, REX_W; + vx64xmem>, EVEX_V128, REX_W; defm NAME#Q#SUFF#Z128: avx512_gather, EVEX_V128, REX_W; + vx64xmem>, EVEX_V128, REX_W; } } multiclass avx512_gather_d_ps dopc, bits<8> qopc, AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { - defm NAME#D#SUFF#Z: avx512_gather, + defm NAME#D#SUFF#Z: avx512_gather, EVEX_V512; - defm NAME#Q#SUFF#Z: avx512_gather, + defm NAME#Q#SUFF#Z: avx512_gather, EVEX_V512; let Predicates = [HasVLX] in { defm NAME#D#SUFF#Z256: avx512_gather, EVEX_V256; + vy32xmem>, EVEX_V256; defm NAME#Q#SUFF#Z256: avx512_gather, EVEX_V256; + vy32xmem>, EVEX_V256; defm NAME#D#SUFF#Z128: avx512_gather, EVEX_V128; + vx32xmem>, EVEX_V128; defm NAME#Q#SUFF#Z128: avx512_gather, EVEX_V128; + vx32xmem, VK2WM>, EVEX_V128; } } @@ -10336,36 +10336,36 @@ let mayStore = 1, Constraints = "$mask = $mask_wb", ExeDomain = _.ExeDomain, multiclass avx512_scatter_q_pd dopc, bits<8> qopc, AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { defm NAME#D#SUFF#Z: avx512_scatter, EVEX_V512, REX_W; + vy64xmem>, EVEX_V512, REX_W; defm NAME#Q#SUFF#Z: avx512_scatter, EVEX_V512, REX_W; + vz64mem>, EVEX_V512, REX_W; let Predicates = [HasVLX] in { defm NAME#D#SUFF#Z256: avx512_scatter, EVEX_V256, REX_W; + vx64xmem>, EVEX_V256, REX_W; defm NAME#Q#SUFF#Z256: avx512_scatter, EVEX_V256, REX_W; + vy64xmem>, EVEX_V256, REX_W; defm NAME#D#SUFF#Z128: avx512_scatter, EVEX_V128, REX_W; + vx64xmem>, EVEX_V128, REX_W; defm NAME#Q#SUFF#Z128: avx512_scatter, EVEX_V128, REX_W; + vx64xmem>, EVEX_V128, REX_W; } } multiclass avx512_scatter_d_ps dopc, bits<8> qopc, AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { - defm NAME#D#SUFF#Z: avx512_scatter, + defm NAME#D#SUFF#Z: avx512_scatter, EVEX_V512; - defm NAME#Q#SUFF#Z: avx512_scatter, + defm NAME#Q#SUFF#Z: avx512_scatter, EVEX_V512; let Predicates = [HasVLX] in { defm NAME#D#SUFF#Z256: avx512_scatter, EVEX_V256; + vy32xmem>, EVEX_V256; defm NAME#Q#SUFF#Z256: avx512_scatter, EVEX_V256; + vy32xmem>, EVEX_V256; defm NAME#D#SUFF#Z128: avx512_scatter, EVEX_V128; + vx32xmem>, EVEX_V128; defm NAME#Q#SUFF#Z128: avx512_scatter, EVEX_V128; + vx32xmem, VK2WM>, EVEX_V128; } } @@ -10385,52 +10385,52 @@ multiclass avx512_gather_scatter_prefetch opc, Format F, string OpcodeSt } defm VGATHERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dps", - VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm VGATHERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qps", - VK8WM, vz256mem>, 
EVEX_V512, EVEX_CD8<32, CD8VT1>; + VK8WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm VGATHERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dpd", - VK8WM, vy512xmem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; + VK8WM, vy64xmem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; defm VGATHERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qpd", - VK8WM, vz512mem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; + VK8WM, vz64mem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; defm VGATHERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dps", - VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm VGATHERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qps", - VK8WM, vz256mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + VK8WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm VGATHERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dpd", - VK8WM, vy512xmem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; + VK8WM, vy64xmem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; defm VGATHERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qpd", - VK8WM, vz512mem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; + VK8WM, vz64mem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; defm VSCATTERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dps", - VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm VSCATTERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qps", - VK8WM, vz256mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + VK8WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm VSCATTERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dpd", - VK8WM, vy512xmem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; + VK8WM, vy64xmem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; defm VSCATTERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qpd", - VK8WM, vz512mem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; + VK8WM, vz64mem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; defm VSCATTERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dps", - VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm VSCATTERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qps", - VK8WM, vz256mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + VK8WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm VSCATTERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dpd", - VK8WM, vy512xmem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; + VK8WM, vy64xmem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd", - VK8WM, vz512mem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; + VK8WM, vz64mem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; multiclass cvt_by_vec_width opc, X86VectorVTInfo Vec, string OpcodeStr, SchedWrite Sched> { def rk : AVX512XS8I; def i512mem_GR64 : X86MemOperand<"printzmmwordmem", X86Mem512_GR64Operand, 512>; // Gather mem operands +def vx32mem : X86VMemOperand; def vx64mem : X86VMemOperand; -def vx128mem : X86VMemOperand; -def vx256mem : X86VMemOperand; -def vy128mem : X86VMemOperand; -def vy256mem : X86VMemOperand; +def vy32mem : X86VMemOperand; +def vy64mem : X86VMemOperand; +def vx32xmem : X86VMemOperand; def vx64xmem : X86VMemOperand; -def vx128xmem : X86VMemOperand; -def vx256xmem : X86VMemOperand; -def vy128xmem : X86VMemOperand; -def vy256xmem : X86VMemOperand; -def vy512xmem : X86VMemOperand; -def vz256mem : 
X86VMemOperand; -def vz512mem : X86VMemOperand; +def vy32xmem : X86VMemOperand; +def vy64xmem : X86VMemOperand; +def vz32mem : X86VMemOperand; +def vz64mem : X86VMemOperand; def shmem : X86MemOperand<"printwordmem", X86Mem16AsmOperand>; def ssmem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>; diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 036d7d92f3f89..6aadb788c851e 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -8078,26 +8078,26 @@ let Predicates = [HasAVX2] in { = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb" in { defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", - VR256, vx128mem, vx256mem>, REX_W; + VR256, vx64mem, vx64mem>, REX_W; defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", - VR256, vx128mem, vy256mem>, REX_W; + VR256, vx64mem, vy64mem>, REX_W; defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", - VR256, vx128mem, vy256mem>; + VR256, vx32mem, vy32mem>; defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", - VR128, vx64mem, vy128mem>; + VR128, vx32mem, vy32mem>; let ExeDomain = SSEPackedDouble in { defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", - VR256, vx128mem, vx256mem>, REX_W; + VR256, vx64mem, vx64mem>, REX_W; defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", - VR256, vx128mem, vy256mem>, REX_W; + VR256, vx64mem, vy64mem>, REX_W; } let ExeDomain = SSEPackedSingle in { defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", - VR256, vx128mem, vy256mem>; + VR256, vx32mem, vy32mem>; defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", - VR128, vx64mem, vy128mem>; + VR128, vx32mem, vy32mem>; } } } diff --git a/llvm/test/MC/Disassembler/X86/apx/evex-format.txt b/llvm/test/MC/Disassembler/X86/apx/evex-format.txt index e9a9f1327a17e..53ae3b8b73ab4 100644 --- a/llvm/test/MC/Disassembler/X86/apx/evex-format.txt +++ b/llvm/test/MC/Disassembler/X86/apx/evex-format.txt @@ -90,7 +90,7 @@ ## MRM5m # ATT: vscatterpf0dps (%r16,%zmm0) {%k1} -# INTEL: vscatterpf0dps {k1}, zmmword ptr [r16 + zmm0] +# INTEL: vscatterpf0dps {k1}, dword ptr [r16 + zmm0] 0x62,0xfa,0x7d,0x49,0xc6,0x2c,0x00 # ATT: subq $127, 123(%r16), %r17 diff --git a/llvm/test/MC/Disassembler/X86/intel-syntax.txt b/llvm/test/MC/Disassembler/X86/intel-syntax.txt index c7c0fce268cd2..f9284ab388441 100644 --- a/llvm/test/MC/Disassembler/X86/intel-syntax.txt +++ b/llvm/test/MC/Disassembler/X86/intel-syntax.txt @@ -108,10 +108,10 @@ # CHECK: vshufpd xmm0, xmm1, xmm2, 1 0xc5 0xf1 0xc6 0xc2 0x01 -# CHECK: vpgatherqq ymm2, ymmword ptr [rdi + 2*ymm1], ymm0 +# CHECK: vpgatherqq ymm2, qword ptr [rdi + 2*ymm1], ymm0 0xc4 0xe2 0xfd 0x91 0x14 0x4f -# CHECK: vpgatherdd xmm10, xmmword ptr [r15 + 2*xmm9], xmm8 +# CHECK: vpgatherdd xmm10, dword ptr [r15 + 2*xmm9], xmm8 0xc4 0x02 0x39 0x90 0x14 0x4f # CHECK: xsave64 [rax] diff --git a/llvm/test/MC/X86/avx-64-intel.s b/llvm/test/MC/X86/avx-64-intel.s index c1f20d204a8c4..392f6e9928427 100644 --- a/llvm/test/MC/X86/avx-64-intel.s +++ b/llvm/test/MC/X86/avx-64-intel.s @@ -1,68 +1,68 @@ // RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s -// CHECK: vgatherdpd xmm2, xmmword ptr [rdi + 2*xmm1], xmm0 +// CHECK: vgatherdpd xmm2, qword ptr [rdi + 2*xmm1], xmm0 // CHECK: encoding: [0xc4,0xe2,0xf9,0x92,0x14,0x4f] - vgatherdpd xmm2, xmmword ptr [rdi + 2*xmm1], xmm0 + vgatherdpd xmm2, qword ptr [rdi + 2*xmm1], xmm0 -// CHECK: vgatherqpd xmm2, xmmword ptr [rdi + 2*xmm1], xmm0 +// CHECK: vgatherqpd xmm2, qword ptr [rdi + 
2*xmm1], xmm0 // CHECK: encoding: [0xc4,0xe2,0xf9,0x93,0x14,0x4f] - vgatherqpd xmm2, xmmword ptr [rdi + 2*xmm1], xmm0 + vgatherqpd xmm2, qword ptr [rdi + 2*xmm1], xmm0 -// CHECK: vgatherdpd ymm2, ymmword ptr [rdi + 2*xmm1], ymm0 +// CHECK: vgatherdpd ymm2, qword ptr [rdi + 2*xmm1], ymm0 // CHECK: encoding: [0xc4,0xe2,0xfd,0x92,0x14,0x4f] - vgatherdpd ymm2, ymmword ptr [rdi + 2*xmm1], ymm0 + vgatherdpd ymm2, qword ptr [rdi + 2*xmm1], ymm0 -// CHECK: vgatherqpd ymm2, ymmword ptr [rdi + 2*ymm1], ymm0 +// CHECK: vgatherqpd ymm2, qword ptr [rdi + 2*ymm1], ymm0 // CHECK: encoding: [0xc4,0xe2,0xfd,0x93,0x14,0x4f] - vgatherqpd ymm2, ymmword ptr [rdi + 2*ymm1], ymm0 + vgatherqpd ymm2, qword ptr [rdi + 2*ymm1], ymm0 -// CHECK: vgatherdps xmm10, xmmword ptr [r15 + 2*xmm9], xmm8 +// CHECK: vgatherdps xmm10, dword ptr [r15 + 2*xmm9], xmm8 // CHECK: encoding: [0xc4,0x02,0x39,0x92,0x14,0x4f] - vgatherdps xmm10, xmmword ptr [r15 + 2*xmm9], xmm8 + vgatherdps xmm10, dword ptr [r15 + 2*xmm9], xmm8 -// CHECK: vgatherqps xmm10, qword ptr [r15 + 2*xmm9], xmm8 +// CHECK: vgatherqps xmm10, dword ptr [r15 + 2*xmm9], xmm8 // CHECK: encoding: [0xc4,0x02,0x39,0x93,0x14,0x4f] - vgatherqps xmm10, qword ptr [r15 + 2*xmm9], xmm8 + vgatherqps xmm10, dword ptr [r15 + 2*xmm9], xmm8 -// CHECK: vgatherdps ymm10, ymmword ptr [r15 + 2*ymm9], ymm8 +// CHECK: vgatherdps ymm10, dword ptr [r15 + 2*ymm9], ymm8 // CHECK: encoding: [0xc4,0x02,0x3d,0x92,0x14,0x4f] - vgatherdps ymm10, ymmword ptr [r15 + 2*ymm9], ymm8 + vgatherdps ymm10, dword ptr [r15 + 2*ymm9], ymm8 -// CHECK: vgatherqps xmm10, xmmword ptr [r15 + 2*ymm9], xmm8 +// CHECK: vgatherqps xmm10, dword ptr [r15 + 2*ymm9], xmm8 // CHECK: encoding: [0xc4,0x02,0x3d,0x93,0x14,0x4f] - vgatherqps xmm10, xmmword ptr [r15 + 2*ymm9], xmm8 + vgatherqps xmm10, dword ptr [r15 + 2*ymm9], xmm8 -// CHECK: vpgatherdq xmm2, xmmword ptr [rdi + 2*xmm1], xmm0 +// CHECK: vpgatherdq xmm2, qword ptr [rdi + 2*xmm1], xmm0 // CHECK: encoding: [0xc4,0xe2,0xf9,0x90,0x14,0x4f] - vpgatherdq xmm2, xmmword ptr [rdi + 2*xmm1], xmm0 + vpgatherdq xmm2, qword ptr [rdi + 2*xmm1], xmm0 -// CHECK: vpgatherqq xmm2, xmmword ptr [rdi + 2*xmm1], xmm0 +// CHECK: vpgatherqq xmm2, qword ptr [rdi + 2*xmm1], xmm0 // CHECK: encoding: [0xc4,0xe2,0xf9,0x91,0x14,0x4f] - vpgatherqq xmm2, xmmword ptr [rdi + 2*xmm1], xmm0 + vpgatherqq xmm2, qword ptr [rdi + 2*xmm1], xmm0 -// CHECK: vpgatherdq ymm2, ymmword ptr [rdi + 2*xmm1], ymm0 +// CHECK: vpgatherdq ymm2, qword ptr [rdi + 2*xmm1], ymm0 // CHECK: encoding: [0xc4,0xe2,0xfd,0x90,0x14,0x4f] - vpgatherdq ymm2, ymmword ptr [rdi + 2*xmm1], ymm0 + vpgatherdq ymm2, qword ptr [rdi + 2*xmm1], ymm0 -// CHECK: vpgatherqq ymm2, ymmword ptr [rdi + 2*ymm1], ymm0 +// CHECK: vpgatherqq ymm2, qword ptr [rdi + 2*ymm1], ymm0 // CHECK: encoding: [0xc4,0xe2,0xfd,0x91,0x14,0x4f] - vpgatherqq ymm2, ymmword ptr [rdi + 2*ymm1], ymm0 + vpgatherqq ymm2, qword ptr [rdi + 2*ymm1], ymm0 -// CHECK: vpgatherdd xmm10, xmmword ptr [r15 + 2*xmm9], xmm8 +// CHECK: vpgatherdd xmm10, dword ptr [r15 + 2*xmm9], xmm8 // CHECK: encoding: [0xc4,0x02,0x39,0x90,0x14,0x4f] - vpgatherdd xmm10, xmmword ptr [r15 + 2*xmm9], xmm8 + vpgatherdd xmm10, dword ptr [r15 + 2*xmm9], xmm8 -// CHECK: vpgatherqd xmm10, qword ptr [r15 + 2*xmm9], xmm8 +// CHECK: vpgatherqd xmm10, dword ptr [r15 + 2*xmm9], xmm8 // CHECK: encoding: [0xc4,0x02,0x39,0x91,0x14,0x4f] - vpgatherqd xmm10, qword ptr [r15 + 2*xmm9], xmm8 + vpgatherqd xmm10, dword ptr [r15 + 2*xmm9], xmm8 -// CHECK: vpgatherdd ymm10, ymmword ptr [r15 + 2*ymm9], ymm8 +// CHECK: 
vpgatherdd ymm10, dword ptr [r15 + 2*ymm9], ymm8 // CHECK: encoding: [0xc4,0x02,0x3d,0x90,0x14,0x4f] - vpgatherdd ymm10, ymmword ptr [r15 + 2*ymm9], ymm8 + vpgatherdd ymm10, dword ptr [r15 + 2*ymm9], ymm8 -// CHECK: vpgatherqd xmm10, xmmword ptr [r15 + 2*ymm9], xmm8 +// CHECK: vpgatherqd xmm10, dword ptr [r15 + 2*ymm9], xmm8 // CHECK: encoding: [0xc4,0x02,0x3d,0x91,0x14,0x4f] - vpgatherqd xmm10, xmmword ptr [r15 + 2*ymm9], xmm8 + vpgatherqd xmm10, dword ptr [r15 + 2*ymm9], xmm8 // CHECK: vcvtpd2ps xmm0, xmm15 // CHECK: encoding: [0xc4,0xc1,0x79,0x5a,0xc7] diff --git a/llvm/test/MC/X86/avx512-intel.s b/llvm/test/MC/X86/avx512-intel.s index d8ad3c4426176..1cbf21c7eb1b0 100644 --- a/llvm/test/MC/X86/avx512-intel.s +++ b/llvm/test/MC/X86/avx512-intel.s @@ -37900,450 +37900,450 @@ vaddpd zmm1, zmm1, zmm2, {rz-sae} // CHECK: encoding: [0x62,0xe2,0xa5,0x50,0x77,0xaa,0xf8,0xfb,0xff,0xff] vpermi2pd zmm21, zmm27, qword ptr [rdx - 1032]{1to8} -// CHECK: vgatherdpd zmm6 {k1}, zmmword ptr [r14 + 8*ymm16 + 123] +// CHECK: vgatherdpd zmm6 {k1}, qword ptr [r14 + 8*ymm16 + 123] // CHECK: encoding: [0x62,0xd2,0xfd,0x41,0x92,0xb4,0xc6,0x7b,0x00,0x00,0x00] - vgatherdpd zmm6 {k1},ZMMWORD PTR [r14+ymm16*8+0x7b] + vgatherdpd zmm6 {k1},QWORD PTR [r14+ymm16*8+0x7b] -// CHECK: vgatherdpd zmm6 {k1}, zmmword ptr [r9 + ymm16 + 256] +// CHECK: vgatherdpd zmm6 {k1}, qword ptr [r9 + ymm16 + 256] // CHECK: encoding: [0x62,0xd2,0xfd,0x41,0x92,0x74,0x01,0x20] - vgatherdpd zmm6{k1},ZMMWORD PTR [r9+ymm16*1+0x100] + vgatherdpd zmm6{k1},QWORD PTR [r9+ymm16*1+0x100] -// CHECK: vgatherdpd zmm6 {k1}, zmmword ptr [rcx + 4*ymm16 + 1024] +// CHECK: vgatherdpd zmm6 {k1}, qword ptr [rcx + 4*ymm16 + 1024] // CHECK: encoding: [0x62,0xf2,0xfd,0x41,0x92,0xb4,0x81,0x00,0x04,0x00,0x00] - vgatherdpd zmm6{k1},ZMMWORD PTR [rcx+ymm16*4+0x400] + vgatherdpd zmm6{k1},QWORD PTR [rcx+ymm16*4+0x400] -// CHECK: vgatherdps zmm9 {k1}, zmmword ptr [r14 + 8*zmm19 + 123] +// CHECK: vgatherdps zmm9 {k1}, dword ptr [r14 + 8*zmm19 + 123] // CHECK: encoding: [0x62,0x52,0x7d,0x41,0x92,0x8c,0xde,0x7b,0x00,0x00,0x00] - vgatherdps zmm9{k1},ZMMWORD PTR [r14+zmm19*8+0x7b] + vgatherdps zmm9{k1},DWORD PTR [r14+zmm19*8+0x7b] -// CHECK: vgatherdps zmm9 {k1}, zmmword ptr [r9 + zmm19 + 256] +// CHECK: vgatherdps zmm9 {k1}, dword ptr [r9 + zmm19 + 256] // CHECK: encoding: [0x62,0x52,0x7d,0x41,0x92,0x4c,0x19,0x40] - vgatherdps zmm9{k1},ZMMWORD PTR [r9+zmm19*1+0x100] + vgatherdps zmm9{k1},DWORD PTR [r9+zmm19*1+0x100] -// CHECK: vgatherdps zmm9 {k1}, zmmword ptr [rcx + 4*zmm19 + 1024] +// CHECK: vgatherdps zmm9 {k1}, dword ptr [rcx + 4*zmm19 + 1024] // CHECK: encoding: [0x62,0x72,0x7d,0x41,0x92,0x8c,0x99,0x00,0x04,0x00,0x00] - vgatherdps zmm9{k1},ZMMWORD PTR [rcx+zmm19*4+0x400] + vgatherdps zmm9{k1},DWORD PTR [rcx+zmm19*4+0x400] -// CHECK: vgatherqpd zmm29 {k1}, zmmword ptr [r14 + 8*zmm2 + 123] +// CHECK: vgatherqpd zmm29 {k1}, qword ptr [r14 + 8*zmm2 + 123] // CHECK: encoding: [0x62,0x42,0xfd,0x49,0x93,0xac,0xd6,0x7b,0x00,0x00,0x00] - vgatherqpd zmm29{k1},ZMMWORD PTR [r14+zmm2*8+0x7b] + vgatherqpd zmm29{k1},QWORD PTR [r14+zmm2*8+0x7b] -// CHECK: vgatherqpd zmm29 {k1}, zmmword ptr [r9 + zmm2 + 256] +// CHECK: vgatherqpd zmm29 {k1}, qword ptr [r9 + zmm2 + 256] // CHECK: encoding: [0x62,0x42,0xfd,0x49,0x93,0x6c,0x11,0x20] - vgatherqpd zmm29{k1},ZMMWORD PTR [r9+zmm2*1+0x100] + vgatherqpd zmm29{k1},QWORD PTR [r9+zmm2*1+0x100] -// CHECK: vgatherqpd zmm29 {k1}, zmmword ptr [rcx + 4*zmm2 + 1024] +// CHECK: vgatherqpd zmm29 {k1}, qword ptr [rcx + 4*zmm2 + 1024] // CHECK: encoding: 
[0x62,0x62,0xfd,0x49,0x93,0xac,0x91,0x00,0x04,0x00,0x00] - vgatherqpd zmm29{k1},ZMMWORD PTR [rcx+zmm2*4+0x400] + vgatherqpd zmm29{k1},QWORD PTR [rcx+zmm2*4+0x400] -// CHECK: vgatherqps ymm18 {k1}, ymmword ptr [r14 + 8*zmm4 + 123] +// CHECK: vgatherqps ymm18 {k1}, dword ptr [r14 + 8*zmm4 + 123] // CHECK: encoding: [0x62,0xc2,0x7d,0x49,0x93,0x94,0xe6,0x7b,0x00,0x00,0x00] - vgatherqps ymm18{k1},YMMWORD PTR [r14+zmm4*8+0x7b] + vgatherqps ymm18{k1},DWORD PTR [r14+zmm4*8+0x7b] -// CHECK: vgatherqps ymm18 {k1}, ymmword ptr [r9 + zmm4 + 256] +// CHECK: vgatherqps ymm18 {k1}, dword ptr [r9 + zmm4 + 256] // CHECK: encoding: [0x62,0xc2,0x7d,0x49,0x93,0x54,0x21,0x40] - vgatherqps ymm18{k1},YMMWORD PTR [r9+zmm4*1+0x100] + vgatherqps ymm18{k1},DWORD PTR [r9+zmm4*1+0x100] -// CHECK: vgatherqps ymm18 {k1}, ymmword ptr [rcx + 4*zmm4 + 1024] +// CHECK: vgatherqps ymm18 {k1}, dword ptr [rcx + 4*zmm4 + 1024] // CHECK: encoding: [0x62,0xe2,0x7d,0x49,0x93,0x94,0xa1,0x00,0x04,0x00,0x00] - vgatherqps ymm18{k1},YMMWORD PTR [rcx+zmm4*4+0x400] + vgatherqps ymm18{k1},DWORD PTR [rcx+zmm4*4+0x400] -// CHECK: vpgatherdd zmm17 {k1}, zmmword ptr [r14 + 8*zmm11 + 123] +// CHECK: vpgatherdd zmm17 {k1}, dword ptr [r14 + 8*zmm11 + 123] // CHECK: encoding: [0x62,0x82,0x7d,0x49,0x90,0x8c,0xde,0x7b,0x00,0x00,0x00] - vpgatherdd zmm17{k1},ZMMWORD PTR [r14+zmm11*8+0x7b] + vpgatherdd zmm17{k1},DWORD PTR [r14+zmm11*8+0x7b] -// CHECK: vpgatherdd zmm17 {k1}, zmmword ptr [r9 + zmm11 + 256] +// CHECK: vpgatherdd zmm17 {k1}, dword ptr [r9 + zmm11 + 256] // CHECK: encoding: [0x62,0x82,0x7d,0x49,0x90,0x4c,0x19,0x40] - vpgatherdd zmm17{k1},ZMMWORD PTR [r9+zmm11*1+0x100] + vpgatherdd zmm17{k1},DWORD PTR [r9+zmm11*1+0x100] -// CHECK: vpgatherdd zmm17 {k1}, zmmword ptr [rcx + 4*zmm11 + 1024] +// CHECK: vpgatherdd zmm17 {k1}, dword ptr [rcx + 4*zmm11 + 1024] // CHECK: encoding: [0x62,0xa2,0x7d,0x49,0x90,0x8c,0x99,0x00,0x04,0x00,0x00] - vpgatherdd zmm17{k1},ZMMWORD PTR [rcx+zmm11*4+0x400] + vpgatherdd zmm17{k1},DWORD PTR [rcx+zmm11*4+0x400] -// CHECK: vpgatherdq zmm8 {k1}, zmmword ptr [r14 + 8*ymm14 + 123] +// CHECK: vpgatherdq zmm8 {k1}, qword ptr [r14 + 8*ymm14 + 123] // CHECK: encoding: [0x62,0x12,0xfd,0x49,0x90,0x84,0xf6,0x7b,0x00,0x00,0x00] - vpgatherdq zmm8{k1},ZMMWORD PTR [r14+ymm14*8+0x7b] + vpgatherdq zmm8{k1},QWORD PTR [r14+ymm14*8+0x7b] -// CHECK: vpgatherdq zmm8 {k1}, zmmword ptr [r9 + ymm14 + 256] +// CHECK: vpgatherdq zmm8 {k1}, qword ptr [r9 + ymm14 + 256] // CHECK: encoding: [0x62,0x12,0xfd,0x49,0x90,0x44,0x31,0x20] - vpgatherdq zmm8{k1},ZMMWORD PTR [r9+ymm14*1+0x100] + vpgatherdq zmm8{k1},QWORD PTR [r9+ymm14*1+0x100] -// CHECK: vpgatherdq zmm8 {k1}, zmmword ptr [rcx + 4*ymm14 + 1024] +// CHECK: vpgatherdq zmm8 {k1}, qword ptr [rcx + 4*ymm14 + 1024] // CHECK: encoding: [0x62,0x32,0xfd,0x49,0x90,0x84,0xb1,0x00,0x04,0x00,0x00] - vpgatherdq zmm8{k1},ZMMWORD PTR [rcx+ymm14*4+0x400] + vpgatherdq zmm8{k1},QWORD PTR [rcx+ymm14*4+0x400] -// CHECK: vpgatherqd ymm3 {k1}, ymmword ptr [r14 + 8*zmm17 + 123] +// CHECK: vpgatherqd ymm3 {k1}, dword ptr [r14 + 8*zmm17 + 123] // CHECK: encoding: [0x62,0xd2,0x7d,0x41,0x91,0x9c,0xce,0x7b,0x00,0x00,0x00] - vpgatherqd ymm3{k1},YMMWORD PTR [r14+zmm17*8+0x7b] + vpgatherqd ymm3{k1},DWORD PTR [r14+zmm17*8+0x7b] -// CHECK: vpgatherqd ymm3 {k1}, ymmword ptr [r9 + zmm17 + 256] +// CHECK: vpgatherqd ymm3 {k1}, dword ptr [r9 + zmm17 + 256] // CHECK: encoding: [0x62,0xd2,0x7d,0x41,0x91,0x5c,0x09,0x40] - vpgatherqd ymm3{k1},YMMWORD PTR [r9+zmm17*1+0x100] + vpgatherqd ymm3{k1},DWORD PTR [r9+zmm17*1+0x100] -// 
CHECK: vpgatherqd ymm3 {k1}, ymmword ptr [rcx + 4*zmm17 + 1024] +// CHECK: vpgatherqd ymm3 {k1}, dword ptr [rcx + 4*zmm17 + 1024] // CHECK: encoding: [0x62,0xf2,0x7d,0x41,0x91,0x9c,0x89,0x00,0x04,0x00,0x00] - vpgatherqd ymm3{k1},YMMWORD PTR [rcx+zmm17*4+0x400] + vpgatherqd ymm3{k1},DWORD PTR [rcx+zmm17*4+0x400] -// CHECK: vpgatherqq zmm17 {k1}, zmmword ptr [r14 + 8*zmm21 + 123] +// CHECK: vpgatherqq zmm17 {k1}, qword ptr [r14 + 8*zmm21 + 123] // CHECK: encoding: [0x62,0xc2,0xfd,0x41,0x91,0x8c,0xee,0x7b,0x00,0x00,0x00] - vpgatherqq zmm17{k1},ZMMWORD PTR [r14+zmm21*8+0x7b] + vpgatherqq zmm17{k1},QWORD PTR [r14+zmm21*8+0x7b] -// CHECK: vpgatherqq zmm17 {k1}, zmmword ptr [r9 + zmm21 + 256] +// CHECK: vpgatherqq zmm17 {k1}, qword ptr [r9 + zmm21 + 256] // CHECK: encoding: [0x62,0xc2,0xfd,0x41,0x91,0x4c,0x29,0x20] - vpgatherqq zmm17{k1},ZMMWORD PTR [r9+zmm21*1+0x100] + vpgatherqq zmm17{k1},QWORD PTR [r9+zmm21*1+0x100] -// CHECK: vpgatherqq zmm17 {k1}, zmmword ptr [rcx + 4*zmm21 + 1024] +// CHECK: vpgatherqq zmm17 {k1}, qword ptr [rcx + 4*zmm21 + 1024] // CHECK: encoding: [0x62,0xe2,0xfd,0x41,0x91,0x8c,0xa9,0x00,0x04,0x00,0x00] - vpgatherqq zmm17{k1},ZMMWORD PTR [rcx+zmm21*4+0x400] + vpgatherqq zmm17{k1},QWORD PTR [rcx+zmm21*4+0x400] -// CHECK: vpscatterdd zmmword ptr [r14 + 8*zmm16 + 123] {k1}, zmm19 +// CHECK: vpscatterdd dword ptr [r14 + 8*zmm16 + 123] {k1}, zmm19 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0xa0,0x9c,0xc6,0x7b,0x00,0x00,0x00] - vpscatterdd ZMMWORD PTR [r14+zmm16*8+0x7b]{k1},zmm19 + vpscatterdd DWORD PTR [r14+zmm16*8+0x7b]{k1},zmm19 -// CHECK: vpscatterdd zmmword ptr [r14 + 8*zmm16 + 123] {k1}, zmm19 +// CHECK: vpscatterdd dword ptr [r14 + 8*zmm16 + 123] {k1}, zmm19 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0xa0,0x9c,0xc6,0x7b,0x00,0x00,0x00] - vpscatterdd ZMMWORD PTR [r14+zmm16*8+0x7b]{k1},zmm19 + vpscatterdd DWORD PTR [r14+zmm16*8+0x7b]{k1},zmm19 -// CHECK: vpscatterdd zmmword ptr [r9 + zmm16 + 256] {k1}, zmm19 +// CHECK: vpscatterdd dword ptr [r9 + zmm16 + 256] {k1}, zmm19 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0xa0,0x5c,0x01,0x40] - vpscatterdd ZMMWORD PTR [r9+zmm16*1+0x100]{k1},zmm19 + vpscatterdd DWORD PTR [r9+zmm16*1+0x100]{k1},zmm19 -// CHECK: vpscatterdd zmmword ptr [rcx + 4*zmm16 + 1024] {k1}, zmm19 +// CHECK: vpscatterdd dword ptr [rcx + 4*zmm16 + 1024] {k1}, zmm19 // CHECK: encoding: [0x62,0xe2,0x7d,0x41,0xa0,0x9c,0x81,0x00,0x04,0x00,0x00] - vpscatterdd ZMMWORD PTR [rcx+zmm16*4+0x400]{k1},zmm19 + vpscatterdd DWORD PTR [rcx+zmm16*4+0x400]{k1},zmm19 -// CHECK: vpscatterdq zmmword ptr [r14 + 8*ymm6 + 123] {k1}, zmm5 +// CHECK: vpscatterdq qword ptr [r14 + 8*ymm6 + 123] {k1}, zmm5 // CHECK: encoding: [0x62,0xd2,0xfd,0x49,0xa0,0xac,0xf6,0x7b,0x00,0x00,0x00] - vpscatterdq ZMMWORD PTR [r14+ymm6*8+0x7b]{k1},zmm5 + vpscatterdq QWORD PTR [r14+ymm6*8+0x7b]{k1},zmm5 -// CHECK: vpscatterdq zmmword ptr [r14 + 8*ymm6 + 123] {k1}, zmm5 +// CHECK: vpscatterdq qword ptr [r14 + 8*ymm6 + 123] {k1}, zmm5 // CHECK: encoding: [0x62,0xd2,0xfd,0x49,0xa0,0xac,0xf6,0x7b,0x00,0x00,0x00] - vpscatterdq ZMMWORD PTR [r14+ymm6*8+0x7b]{k1},zmm5 + vpscatterdq QWORD PTR [r14+ymm6*8+0x7b]{k1},zmm5 -// CHECK: vpscatterdq zmmword ptr [r9 + ymm6 + 256] {k1}, zmm5 +// CHECK: vpscatterdq qword ptr [r9 + ymm6 + 256] {k1}, zmm5 // CHECK: encoding: [0x62,0xd2,0xfd,0x49,0xa0,0x6c,0x31,0x20] - vpscatterdq ZMMWORD PTR [r9+ymm6*1+0x100]{k1},zmm5 + vpscatterdq QWORD PTR [r9+ymm6*1+0x100]{k1},zmm5 -// CHECK: vpscatterdq zmmword ptr [rcx + 4*ymm6 + 1024] {k1}, zmm5 +// CHECK: vpscatterdq qword ptr [rcx + 4*ymm6 + 1024] 
{k1}, zmm5 // CHECK: encoding: [0x62,0xf2,0xfd,0x49,0xa0,0xac,0xb1,0x00,0x04,0x00,0x00] - vpscatterdq ZMMWORD PTR [rcx+ymm6*4+0x400]{k1},zmm5 + vpscatterdq QWORD PTR [rcx+ymm6*4+0x400]{k1},zmm5 -// CHECK: vpscatterqd ymmword ptr [r14 + 8*zmm2 + 123] {k1}, ymm20 +// CHECK: vpscatterqd dword ptr [r14 + 8*zmm2 + 123] {k1}, ymm20 // CHECK: encoding: [0x62,0xc2,0x7d,0x49,0xa1,0xa4,0xd6,0x7b,0x00,0x00,0x00] - vpscatterqd YMMWORD PTR [r14+zmm2*8+0x7b]{k1},ymm20 + vpscatterqd DWORD PTR [r14+zmm2*8+0x7b]{k1},ymm20 -// CHECK: vpscatterqd ymmword ptr [r14 + 8*zmm2 + 123] {k1}, ymm20 +// CHECK: vpscatterqd dword ptr [r14 + 8*zmm2 + 123] {k1}, ymm20 // CHECK: encoding: [0x62,0xc2,0x7d,0x49,0xa1,0xa4,0xd6,0x7b,0x00,0x00,0x00] - vpscatterqd YMMWORD PTR [r14+zmm2*8+0x7b]{k1},ymm20 + vpscatterqd DWORD PTR [r14+zmm2*8+0x7b]{k1},ymm20 -// CHECK: vpscatterqd ymmword ptr [r9 + zmm2 + 256] {k1}, ymm20 +// CHECK: vpscatterqd dword ptr [r9 + zmm2 + 256] {k1}, ymm20 // CHECK: encoding: [0x62,0xc2,0x7d,0x49,0xa1,0x64,0x11,0x40] - vpscatterqd YMMWORD PTR [r9+zmm2*1+0x100]{k1},ymm20 + vpscatterqd DWORD PTR [r9+zmm2*1+0x100]{k1},ymm20 -// CHECK: vpscatterqd ymmword ptr [rcx + 4*zmm2 + 1024] {k1}, ymm20 +// CHECK: vpscatterqd dword ptr [rcx + 4*zmm2 + 1024] {k1}, ymm20 // CHECK: encoding: [0x62,0xe2,0x7d,0x49,0xa1,0xa4,0x91,0x00,0x04,0x00,0x00] - vpscatterqd YMMWORD PTR [rcx+zmm2*4+0x400]{k1},ymm20 + vpscatterqd DWORD PTR [rcx+zmm2*4+0x400]{k1},ymm20 -// CHECK: vpscatterqq zmmword ptr [r14 + 8*zmm20 + 123] {k1}, zmm14 +// CHECK: vpscatterqq qword ptr [r14 + 8*zmm20 + 123] {k1}, zmm14 // CHECK: encoding: [0x62,0x52,0xfd,0x41,0xa1,0xb4,0xe6,0x7b,0x00,0x00,0x00] - vpscatterqq ZMMWORD PTR [r14+zmm20*8+0x7b]{k1},zmm14 + vpscatterqq QWORD PTR [r14+zmm20*8+0x7b]{k1},zmm14 -// CHECK: vpscatterqq zmmword ptr [r14 + 8*zmm20 + 123] {k1}, zmm14 +// CHECK: vpscatterqq qword ptr [r14 + 8*zmm20 + 123] {k1}, zmm14 // CHECK: encoding: [0x62,0x52,0xfd,0x41,0xa1,0xb4,0xe6,0x7b,0x00,0x00,0x00] - vpscatterqq ZMMWORD PTR [r14+zmm20*8+0x7b]{k1},zmm14 + vpscatterqq QWORD PTR [r14+zmm20*8+0x7b]{k1},zmm14 -// CHECK: vpscatterqq zmmword ptr [r9 + zmm20 + 256] {k1}, zmm14 +// CHECK: vpscatterqq qword ptr [r9 + zmm20 + 256] {k1}, zmm14 // CHECK: encoding: [0x62,0x52,0xfd,0x41,0xa1,0x74,0x21,0x20] - vpscatterqq ZMMWORD PTR [r9+zmm20*1+0x100]{k1},zmm14 + vpscatterqq QWORD PTR [r9+zmm20*1+0x100]{k1},zmm14 -// CHECK: vpscatterqq zmmword ptr [rcx + 4*zmm20 + 1024] {k1}, zmm14 +// CHECK: vpscatterqq qword ptr [rcx + 4*zmm20 + 1024] {k1}, zmm14 // CHECK: encoding: [0x62,0x72,0xfd,0x41,0xa1,0xb4,0xa1,0x00,0x04,0x00,0x00] - vpscatterqq ZMMWORD PTR [rcx+zmm20*4+0x400]{k1},zmm14 + vpscatterqq QWORD PTR [rcx+zmm20*4+0x400]{k1},zmm14 -// CHECK: vscatterdpd zmmword ptr [r14 + 8*ymm24 + 123] {k1}, zmm18 +// CHECK: vscatterdpd qword ptr [r14 + 8*ymm24 + 123] {k1}, zmm18 // CHECK: encoding: [0x62,0x82,0xfd,0x41,0xa2,0x94,0xc6,0x7b,0x00,0x00,0x00] - vscatterdpd ZMMWORD PTR [r14+ymm24*8+0x7b]{k1},zmm18 + vscatterdpd QWORD PTR [r14+ymm24*8+0x7b]{k1},zmm18 -// CHECK: vscatterdpd zmmword ptr [r14 + 8*ymm24 + 123] {k1}, zmm18 +// CHECK: vscatterdpd qword ptr [r14 + 8*ymm24 + 123] {k1}, zmm18 // CHECK: encoding: [0x62,0x82,0xfd,0x41,0xa2,0x94,0xc6,0x7b,0x00,0x00,0x00] - vscatterdpd ZMMWORD PTR [r14+ymm24*8+0x7b]{k1},zmm18 + vscatterdpd QWORD PTR [r14+ymm24*8+0x7b]{k1},zmm18 -// CHECK: vscatterdpd zmmword ptr [r9 + ymm24 + 256] {k1}, zmm18 +// CHECK: vscatterdpd qword ptr [r9 + ymm24 + 256] {k1}, zmm18 // CHECK: encoding: [0x62,0x82,0xfd,0x41,0xa2,0x54,0x01,0x20] - 
vscatterdpd ZMMWORD PTR [r9+ymm24*1+0x100]{k1},zmm18 + vscatterdpd QWORD PTR [r9+ymm24*1+0x100]{k1},zmm18 -// CHECK: vscatterdpd zmmword ptr [rcx + 4*ymm24 + 1024] {k1}, zmm18 +// CHECK: vscatterdpd qword ptr [rcx + 4*ymm24 + 1024] {k1}, zmm18 // CHECK: encoding: [0x62,0xa2,0xfd,0x41,0xa2,0x94,0x81,0x00,0x04,0x00,0x00] - vscatterdpd ZMMWORD PTR [rcx+ymm24*4+0x400]{k1},zmm18 + vscatterdpd QWORD PTR [rcx+ymm24*4+0x400]{k1},zmm18 -// CHECK: vscatterdps zmmword ptr [r14 + 8*zmm19 + 123] {k1}, zmm17 +// CHECK: vscatterdps dword ptr [r14 + 8*zmm19 + 123] {k1}, zmm17 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0xa2,0x8c,0xde,0x7b,0x00,0x00,0x00] - vscatterdps ZMMWORD PTR [r14+zmm19*8+0x7b]{k1},zmm17 + vscatterdps DWORD PTR [r14+zmm19*8+0x7b]{k1},zmm17 -// CHECK: vscatterdps zmmword ptr [r14 + 8*zmm19 + 123] {k1}, zmm17 +// CHECK: vscatterdps dword ptr [r14 + 8*zmm19 + 123] {k1}, zmm17 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0xa2,0x8c,0xde,0x7b,0x00,0x00,0x00] - vscatterdps ZMMWORD PTR [r14+zmm19*8+0x7b]{k1},zmm17 + vscatterdps DWORD PTR [r14+zmm19*8+0x7b]{k1},zmm17 -// CHECK: vscatterdps zmmword ptr [r9 + zmm19 + 256] {k1}, zmm17 +// CHECK: vscatterdps dword ptr [r9 + zmm19 + 256] {k1}, zmm17 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0xa2,0x4c,0x19,0x40] - vscatterdps ZMMWORD PTR [r9+zmm19*1+0x100]{k1},zmm17 + vscatterdps DWORD PTR [r9+zmm19*1+0x100]{k1},zmm17 -// CHECK: vscatterdps zmmword ptr [rcx + 4*zmm19 + 1024] {k1}, zmm17 +// CHECK: vscatterdps dword ptr [rcx + 4*zmm19 + 1024] {k1}, zmm17 // CHECK: encoding: [0x62,0xe2,0x7d,0x41,0xa2,0x8c,0x99,0x00,0x04,0x00,0x00] - vscatterdps ZMMWORD PTR [rcx+zmm19*4+0x400]{k1},zmm17 + vscatterdps DWORD PTR [rcx+zmm19*4+0x400]{k1},zmm17 -// CHECK: vscatterqpd zmmword ptr [r14 + 8*zmm28 + 123] {k1}, zmm22 +// CHECK: vscatterqpd qword ptr [r14 + 8*zmm28 + 123] {k1}, zmm22 // CHECK: encoding: [0x62,0x82,0xfd,0x41,0xa3,0xb4,0xe6,0x7b,0x00,0x00,0x00] - vscatterqpd ZMMWORD PTR [r14+zmm28*8+0x7b]{k1},zmm22 + vscatterqpd QWORD PTR [r14+zmm28*8+0x7b]{k1},zmm22 -// CHECK: vscatterqpd zmmword ptr [r14 + 8*zmm28 + 123] {k1}, zmm22 +// CHECK: vscatterqpd qword ptr [r14 + 8*zmm28 + 123] {k1}, zmm22 // CHECK: encoding: [0x62,0x82,0xfd,0x41,0xa3,0xb4,0xe6,0x7b,0x00,0x00,0x00] - vscatterqpd ZMMWORD PTR [r14+zmm28*8+0x7b]{k1},zmm22 + vscatterqpd QWORD PTR [r14+zmm28*8+0x7b]{k1},zmm22 -// CHECK: vscatterqpd zmmword ptr [r9 + zmm28 + 256] {k1}, zmm22 +// CHECK: vscatterqpd qword ptr [r9 + zmm28 + 256] {k1}, zmm22 // CHECK: encoding: [0x62,0x82,0xfd,0x41,0xa3,0x74,0x21,0x20] - vscatterqpd ZMMWORD PTR [r9+zmm28*1+0x100]{k1},zmm22 + vscatterqpd QWORD PTR [r9+zmm28*1+0x100]{k1},zmm22 -// CHECK: vscatterqpd zmmword ptr [rcx + 4*zmm28 + 1024] {k1}, zmm22 +// CHECK: vscatterqpd qword ptr [rcx + 4*zmm28 + 1024] {k1}, zmm22 // CHECK: encoding: [0x62,0xa2,0xfd,0x41,0xa3,0xb4,0xa1,0x00,0x04,0x00,0x00] - vscatterqpd ZMMWORD PTR [rcx+zmm28*4+0x400]{k1},zmm22 + vscatterqpd QWORD PTR [rcx+zmm28*4+0x400]{k1},zmm22 -// CHECK: vscatterqps ymmword ptr [r14 + 8*zmm27 + 123] {k1}, ymm6 +// CHECK: vscatterqps dword ptr [r14 + 8*zmm27 + 123] {k1}, ymm6 // CHECK: encoding: [0x62,0x92,0x7d,0x41,0xa3,0xb4,0xde,0x7b,0x00,0x00,0x00] - vscatterqps YMMWORD PTR [r14+zmm27*8+0x7b]{k1},ymm6 + vscatterqps DWORD PTR [r14+zmm27*8+0x7b]{k1},ymm6 -// CHECK: vscatterqps ymmword ptr [r14 + 8*zmm27 + 123] {k1}, ymm6 +// CHECK: vscatterqps dword ptr [r14 + 8*zmm27 + 123] {k1}, ymm6 // CHECK: encoding: [0x62,0x92,0x7d,0x41,0xa3,0xb4,0xde,0x7b,0x00,0x00,0x00] - vscatterqps YMMWORD PTR [r14+zmm27*8+0x7b]{k1},ymm6 + 
vscatterqps DWORD PTR [r14+zmm27*8+0x7b]{k1},ymm6
-// CHECK: vscatterqps ymmword ptr [r9 + zmm27 + 256] {k1}, ymm6
+// CHECK: vscatterqps dword ptr [r9 + zmm27 + 256] {k1}, ymm6
 // CHECK: encoding: [0x62,0x92,0x7d,0x41,0xa3,0x74,0x19,0x40]
- vscatterqps YMMWORD PTR [r9+zmm27*1+0x100]{k1},ymm6
+ vscatterqps DWORD PTR [r9+zmm27*1+0x100]{k1},ymm6
-// CHECK: vscatterqps ymmword ptr [rcx + 4*zmm27 + 1024] {k1}, ymm6
+// CHECK: vscatterqps dword ptr [rcx + 4*zmm27 + 1024] {k1}, ymm6
 // CHECK: encoding: [0x62,0xb2,0x7d,0x41,0xa3,0xb4,0x99,0x00,0x04,0x00,0x00]
- vscatterqps YMMWORD PTR [rcx+zmm27*4+0x400]{k1},ymm6
+ vscatterqps DWORD PTR [rcx+zmm27*4+0x400]{k1},ymm6
-// CHECK: vscatterdpd zmmword ptr [r14 + 8*ymm27 - 123] {k1}, zmm18
+// CHECK: vscatterdpd qword ptr [r14 + 8*ymm27 - 123] {k1}, zmm18
 // CHECK: encoding: [0x62,0x82,0xfd,0x41,0xa2,0x94,0xde,0x85,0xff,0xff,0xff]
- vscatterdpd ZMMWORD PTR [r14+ymm27*8-0x7b]{k1},zmm18
+ vscatterdpd QWORD PTR [r14+ymm27*8-0x7b]{k1},zmm18
-// CHECK: vscatterdpd zmmword ptr [r14 + 8*ymm27 - 123] {k1}, zmm18
+// CHECK: vscatterdpd qword ptr [r14 + 8*ymm27 - 123] {k1}, zmm18
 // CHECK: encoding: [0x62,0x82,0xfd,0x41,0xa2,0x94,0xde,0x85,0xff,0xff,0xff]
- vscatterdpd ZMMWORD PTR [r14+ymm27*8-0x7b]{k1},zmm18
+ vscatterdpd QWORD PTR [r14+ymm27*8-0x7b]{k1},zmm18
-// CHECK: vscatterdpd zmmword ptr [r9 + ymm27 + 256] {k1}, zmm18
+// CHECK: vscatterdpd qword ptr [r9 + ymm27 + 256] {k1}, zmm18
 // CHECK: encoding: [0x62,0x82,0xfd,0x41,0xa2,0x54,0x19,0x20]
- vscatterdpd ZMMWORD PTR [r9+ymm27*1+0x100]{k1},zmm18
+ vscatterdpd QWORD PTR [r9+ymm27*1+0x100]{k1},zmm18
-// CHECK: vscatterdpd zmmword ptr [rcx + 4*ymm27 + 1024] {k1}, zmm18
+// CHECK: vscatterdpd qword ptr [rcx + 4*ymm27 + 1024] {k1}, zmm18
 // CHECK: encoding: [0x62,0xa2,0xfd,0x41,0xa2,0x94,0x99,0x00,0x04,0x00,0x00]
- vscatterdpd ZMMWORD PTR [rcx+ymm27*4+0x400]{k1},zmm18
+ vscatterdpd QWORD PTR [rcx+ymm27*4+0x400]{k1},zmm18
-// CHECK: vscatterdps zmmword ptr [r14 + 8*zmm17 - 123] {k1}, zmm1
+// CHECK: vscatterdps dword ptr [r14 + 8*zmm17 - 123] {k1}, zmm1
 // CHECK: encoding: [0x62,0xd2,0x7d,0x41,0xa2,0x8c,0xce,0x85,0xff,0xff,0xff]
- vscatterdps ZMMWORD PTR [r14+zmm17*8-0x7b]{k1},zmm1
+ vscatterdps DWORD PTR [r14+zmm17*8-0x7b]{k1},zmm1
-// CHECK: vscatterdps zmmword ptr [r14 + 8*zmm17 - 123] {k1}, zmm1
+// CHECK: vscatterdps dword ptr [r14 + 8*zmm17 - 123] {k1}, zmm1
 // CHECK: encoding: [0x62,0xd2,0x7d,0x41,0xa2,0x8c,0xce,0x85,0xff,0xff,0xff]
- vscatterdps ZMMWORD PTR [r14+zmm17*8-0x7b]{k1},zmm1
+ vscatterdps DWORD PTR [r14+zmm17*8-0x7b]{k1},zmm1
-// CHECK: vscatterdps zmmword ptr [r9 + zmm17 + 256] {k1}, zmm1
+// CHECK: vscatterdps dword ptr [r9 + zmm17 + 256] {k1}, zmm1
 // CHECK: encoding: [0x62,0xd2,0x7d,0x41,0xa2,0x4c,0x09,0x40]
- vscatterdps ZMMWORD PTR [r9+zmm17*1+0x100]{k1},zmm1
+ vscatterdps DWORD PTR [r9+zmm17*1+0x100]{k1},zmm1
-// CHECK: vscatterdps zmmword ptr [rcx + 4*zmm17 + 1024] {k1}, zmm1
+// CHECK: vscatterdps dword ptr [rcx + 4*zmm17 + 1024] {k1}, zmm1
 // CHECK: encoding: [0x62,0xf2,0x7d,0x41,0xa2,0x8c,0x89,0x00,0x04,0x00,0x00]
- vscatterdps ZMMWORD PTR [rcx+zmm17*4+0x400]{k1},zmm1
+ vscatterdps DWORD PTR [rcx+zmm17*4+0x400]{k1},zmm1
-// CHECK: vscatterqpd zmmword ptr [r14 + 8*zmm25 - 123] {k1}, zmm8
+// CHECK: vscatterqpd qword ptr [r14 + 8*zmm25 - 123] {k1}, zmm8
 // CHECK: encoding: [0x62,0x12,0xfd,0x41,0xa3,0x84,0xce,0x85,0xff,0xff,0xff]
- vscatterqpd ZMMWORD PTR [r14+zmm25*8-0x7b]{k1},zmm8
+ vscatterqpd QWORD PTR [r14+zmm25*8-0x7b]{k1},zmm8
-// CHECK: vscatterqpd zmmword ptr [r14 + 8*zmm25 - 123] {k1}, zmm8
+// CHECK: vscatterqpd qword ptr [r14 + 8*zmm25 - 123] {k1}, zmm8
 // CHECK: encoding: [0x62,0x12,0xfd,0x41,0xa3,0x84,0xce,0x85,0xff,0xff,0xff]
- vscatterqpd ZMMWORD PTR [r14+zmm25*8-0x7b]{k1},zmm8
+ vscatterqpd QWORD PTR [r14+zmm25*8-0x7b]{k1},zmm8
-// CHECK: vscatterqpd zmmword ptr [r9 + zmm25 + 256] {k1}, zmm8
+// CHECK: vscatterqpd qword ptr [r9 + zmm25 + 256] {k1}, zmm8
 // CHECK: encoding: [0x62,0x12,0xfd,0x41,0xa3,0x44,0x09,0x20]
- vscatterqpd ZMMWORD PTR [r9+zmm25*1+0x100]{k1},zmm8
+ vscatterqpd QWORD PTR [r9+zmm25*1+0x100]{k1},zmm8
-// CHECK: vscatterqpd zmmword ptr [rcx + 4*zmm25 + 1024] {k1}, zmm8
+// CHECK: vscatterqpd qword ptr [rcx + 4*zmm25 + 1024] {k1}, zmm8
 // CHECK: encoding: [0x62,0x32,0xfd,0x41,0xa3,0x84,0x89,0x00,0x04,0x00,0x00]
- vscatterqpd ZMMWORD PTR [rcx+zmm25*4+0x400]{k1},zmm8
+ vscatterqpd QWORD PTR [rcx+zmm25*4+0x400]{k1},zmm8
-// CHECK: vscatterqps ymmword ptr [r14 + 8*zmm10 - 123] {k1}, ymm13
+// CHECK: vscatterqps dword ptr [r14 + 8*zmm10 - 123] {k1}, ymm13
 // CHECK: encoding: [0x62,0x12,0x7d,0x49,0xa3,0xac,0xd6,0x85,0xff,0xff,0xff]
- vscatterqps YMMWORD PTR [r14+zmm10*8-0x7b]{k1},ymm13
+ vscatterqps DWORD PTR [r14+zmm10*8-0x7b]{k1},ymm13
-// CHECK: vscatterqps ymmword ptr [r14 + 8*zmm10 - 123] {k1}, ymm13
+// CHECK: vscatterqps dword ptr [r14 + 8*zmm10 - 123] {k1}, ymm13
 // CHECK: encoding: [0x62,0x12,0x7d,0x49,0xa3,0xac,0xd6,0x85,0xff,0xff,0xff]
- vscatterqps YMMWORD PTR [r14+zmm10*8-0x7b]{k1},ymm13
+ vscatterqps DWORD PTR [r14+zmm10*8-0x7b]{k1},ymm13
-// CHECK: vscatterqps ymmword ptr [r9 + zmm10 + 256] {k1}, ymm13
+// CHECK: vscatterqps dword ptr [r9 + zmm10 + 256] {k1}, ymm13
 // CHECK: encoding: [0x62,0x12,0x7d,0x49,0xa3,0x6c,0x11,0x40]
- vscatterqps YMMWORD PTR [r9+zmm10*1+0x100]{k1},ymm13
+ vscatterqps DWORD PTR [r9+zmm10*1+0x100]{k1},ymm13
-// CHECK: vscatterqps ymmword ptr [rcx + 4*zmm10 + 1024] {k1}, ymm13
+// CHECK: vscatterqps dword ptr [rcx + 4*zmm10 + 1024] {k1}, ymm13
 // CHECK: encoding: [0x62,0x32,0x7d,0x49,0xa3,0xac,0x91,0x00,0x04,0x00,0x00]
- vscatterqps YMMWORD PTR [rcx+zmm10*4+0x400]{k1},ymm13
+ vscatterqps DWORD PTR [rcx+zmm10*4+0x400]{k1},ymm13
-// CHECK: vgatherdpd zmm30 {k1}, zmmword ptr [r14 + 8*ymm5 - 123]
+// CHECK: vgatherdpd zmm30 {k1}, qword ptr [r14 + 8*ymm5 - 123]
 // CHECK: encoding: [0x62,0x42,0xfd,0x49,0x92,0xb4,0xee,0x85,0xff,0xff,0xff]
- vgatherdpd zmm30{k1},ZMMWORD PTR [r14+ymm5*8-0x7b]
+ vgatherdpd zmm30{k1},QWORD PTR [r14+ymm5*8-0x7b]
-// CHECK: vgatherdpd zmm30 {k1}, zmmword ptr [r9 + ymm5 + 256]
+// CHECK: vgatherdpd zmm30 {k1}, qword ptr [r9 + ymm5 + 256]
 // CHECK: encoding: [0x62,0x42,0xfd,0x49,0x92,0x74,0x29,0x20]
- vgatherdpd zmm30{k1},ZMMWORD PTR [r9+ymm5*1+0x100]
+ vgatherdpd zmm30{k1},QWORD PTR [r9+ymm5*1+0x100]
-// CHECK: vgatherdpd zmm30 {k1}, zmmword ptr [rcx + 4*ymm5 + 1024]
+// CHECK: vgatherdpd zmm30 {k1}, qword ptr [rcx + 4*ymm5 + 1024]
 // CHECK: encoding: [0x62,0x62,0xfd,0x49,0x92,0xb4,0xa9,0x00,0x04,0x00,0x00]
- vgatherdpd zmm30{k1},ZMMWORD PTR [rcx+ymm5*4+0x400]
+ vgatherdpd zmm30{k1},QWORD PTR [rcx+ymm5*4+0x400]
-// CHECK: vgatherdps zmm8 {k1}, zmmword ptr [r14 + 8*zmm26 - 123]
+// CHECK: vgatherdps zmm8 {k1}, dword ptr [r14 + 8*zmm26 - 123]
 // CHECK: encoding: [0x62,0x12,0x7d,0x41,0x92,0x84,0xd6,0x85,0xff,0xff,0xff]
- vgatherdps zmm8{k1},ZMMWORD PTR [r14+zmm26*8-0x7b]
+ vgatherdps zmm8{k1},DWORD PTR [r14+zmm26*8-0x7b]
-// CHECK: vgatherdps zmm8 {k1}, zmmword ptr [r9 + zmm26 + 256]
+// CHECK: vgatherdps zmm8 {k1}, dword ptr [r9 + zmm26 + 256]
 // CHECK: encoding: [0x62,0x12,0x7d,0x41,0x92,0x44,0x11,0x40]
- vgatherdps zmm8{k1},ZMMWORD PTR [r9+zmm26*1+0x100]
+ vgatherdps zmm8{k1},DWORD PTR [r9+zmm26*1+0x100]
-// CHECK: vgatherdps zmm8 {k1}, zmmword ptr [rcx + 4*zmm26 + 1024]
+// CHECK: vgatherdps zmm8 {k1}, dword ptr [rcx + 4*zmm26 + 1024]
 // CHECK: encoding: [0x62,0x32,0x7d,0x41,0x92,0x84,0x91,0x00,0x04,0x00,0x00]
- vgatherdps zmm8{k1},ZMMWORD PTR [rcx+zmm26*4+0x400]
+ vgatherdps zmm8{k1},DWORD PTR [rcx+zmm26*4+0x400]
-// CHECK: vgatherqpd zmm27 {k1}, zmmword ptr [r14 + 8*zmm13 - 123]
+// CHECK: vgatherqpd zmm27 {k1}, qword ptr [r14 + 8*zmm13 - 123]
 // CHECK: encoding: [0x62,0x02,0xfd,0x49,0x93,0x9c,0xee,0x85,0xff,0xff,0xff]
- vgatherqpd zmm27{k1},ZMMWORD PTR [r14+zmm13*8-0x7b]
+ vgatherqpd zmm27{k1},QWORD PTR [r14+zmm13*8-0x7b]
-// CHECK: vgatherqpd zmm27 {k1}, zmmword ptr [r9 + zmm13 + 256]
+// CHECK: vgatherqpd zmm27 {k1}, qword ptr [r9 + zmm13 + 256]
 // CHECK: encoding: [0x62,0x02,0xfd,0x49,0x93,0x5c,0x29,0x20]
- vgatherqpd zmm27{k1},ZMMWORD PTR [r9+zmm13*1+0x100]
+ vgatherqpd zmm27{k1},QWORD PTR [r9+zmm13*1+0x100]
-// CHECK: vgatherqpd zmm27 {k1}, zmmword ptr [rcx + 4*zmm13 + 1024]
+// CHECK: vgatherqpd zmm27 {k1}, qword ptr [rcx + 4*zmm13 + 1024]
 // CHECK: encoding: [0x62,0x22,0xfd,0x49,0x93,0x9c,0xa9,0x00,0x04,0x00,0x00]
- vgatherqpd zmm27{k1},ZMMWORD PTR [rcx+zmm13*4+0x400]
+ vgatherqpd zmm27{k1},QWORD PTR [rcx+zmm13*4+0x400]
-// CHECK: vgatherqps ymm27 {k1}, ymmword ptr [r14 + 8*zmm14 - 123]
+// CHECK: vgatherqps ymm27 {k1}, dword ptr [r14 + 8*zmm14 - 123]
 // CHECK: encoding: [0x62,0x02,0x7d,0x49,0x93,0x9c,0xf6,0x85,0xff,0xff,0xff]
- vgatherqps ymm27{k1},YMMWORD PTR [r14+zmm14*8-0x7b]
+ vgatherqps ymm27{k1},DWORD PTR [r14+zmm14*8-0x7b]
-// CHECK: vgatherqps ymm27 {k1}, ymmword ptr [r9 + zmm14 + 256]
+// CHECK: vgatherqps ymm27 {k1}, dword ptr [r9 + zmm14 + 256]
 // CHECK: encoding: [0x62,0x02,0x7d,0x49,0x93,0x5c,0x31,0x40]
- vgatherqps ymm27{k1},YMMWORD PTR [r9+zmm14*1+0x100]
+ vgatherqps ymm27{k1},DWORD PTR [r9+zmm14*1+0x100]
-// CHECK: vgatherqps ymm27 {k1}, ymmword ptr [rcx + 4*zmm14 + 1024]
+// CHECK: vgatherqps ymm27 {k1}, dword ptr [rcx + 4*zmm14 + 1024]
 // CHECK: encoding: [0x62,0x22,0x7d,0x49,0x93,0x9c,0xb1,0x00,0x04,0x00,0x00]
- vgatherqps ymm27{k1},YMMWORD PTR [rcx+zmm14*4+0x400]
+ vgatherqps ymm27{k1},DWORD PTR [rcx+zmm14*4+0x400]
-// CHECK: vpgatherdd zmm7 {k1}, zmmword ptr [r14 + 8*zmm16 - 123]
+// CHECK: vpgatherdd zmm7 {k1}, dword ptr [r14 + 8*zmm16 - 123]
 // CHECK: encoding: [0x62,0xd2,0x7d,0x41,0x90,0xbc,0xc6,0x85,0xff,0xff,0xff]
- vpgatherdd zmm7{k1},ZMMWORD PTR [r14+zmm16*8-0x7b]
+ vpgatherdd zmm7{k1},DWORD PTR [r14+zmm16*8-0x7b]
-// CHECK: vpgatherdd zmm7 {k1}, zmmword ptr [r9 + zmm16 + 256]
+// CHECK: vpgatherdd zmm7 {k1}, dword ptr [r9 + zmm16 + 256]
 // CHECK: encoding: [0x62,0xd2,0x7d,0x41,0x90,0x7c,0x01,0x40]
- vpgatherdd zmm7{k1},ZMMWORD PTR [r9+zmm16*1+0x100]
+ vpgatherdd zmm7{k1},DWORD PTR [r9+zmm16*1+0x100]
-// CHECK: vpgatherdd zmm7 {k1}, zmmword ptr [rcx + 4*zmm16 + 1024]
+// CHECK: vpgatherdd zmm7 {k1}, dword ptr [rcx + 4*zmm16 + 1024]
 // CHECK: encoding: [0x62,0xf2,0x7d,0x41,0x90,0xbc,0x81,0x00,0x04,0x00,0x00]
- vpgatherdd zmm7{k1},ZMMWORD PTR [rcx+zmm16*4+0x400]
+ vpgatherdd zmm7{k1},DWORD PTR [rcx+zmm16*4+0x400]
-// CHECK: vpgatherdq zmm25 {k1}, zmmword ptr [r14 + 8*ymm7 - 123]
+// CHECK: vpgatherdq zmm25 {k1}, qword ptr [r14 + 8*ymm7 - 123]
 // CHECK: encoding: [0x62,0x42,0xfd,0x49,0x90,0x8c,0xfe,0x85,0xff,0xff,0xff]
- vpgatherdq zmm25{k1},ZMMWORD PTR [r14+ymm7*8-0x7b]
+ vpgatherdq zmm25{k1},QWORD PTR [r14+ymm7*8-0x7b]
-// CHECK: vpgatherdq zmm25 {k1}, zmmword ptr [r9 + ymm7 + 256]
+// CHECK: vpgatherdq zmm25 {k1}, qword ptr [r9 + ymm7 + 256]
 // CHECK: encoding: [0x62,0x42,0xfd,0x49,0x90,0x4c,0x39,0x20]
- vpgatherdq zmm25{k1},ZMMWORD PTR [r9+ymm7*1+0x100]
+ vpgatherdq zmm25{k1},QWORD PTR [r9+ymm7*1+0x100]
-// CHECK: vpgatherdq zmm25 {k1}, zmmword ptr [rcx + 4*ymm7 + 1024]
+// CHECK: vpgatherdq zmm25 {k1}, qword ptr [rcx + 4*ymm7 + 1024]
 // CHECK: encoding: [0x62,0x62,0xfd,0x49,0x90,0x8c,0xb9,0x00,0x04,0x00,0x00]
- vpgatherdq zmm25{k1},ZMMWORD PTR [rcx+ymm7*4+0x400]
+ vpgatherdq zmm25{k1},QWORD PTR [rcx+ymm7*4+0x400]
-// CHECK: vpgatherqd ymm19 {k1}, ymmword ptr [r14 + 8*zmm17 - 123]
+// CHECK: vpgatherqd ymm19 {k1}, dword ptr [r14 + 8*zmm17 - 123]
 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0x91,0x9c,0xce,0x85,0xff,0xff,0xff]
- vpgatherqd ymm19{k1},YMMWORD PTR [r14+zmm17*8-0x7b]
+ vpgatherqd ymm19{k1},DWORD PTR [r14+zmm17*8-0x7b]
-// CHECK: vpgatherqd ymm19 {k1}, ymmword ptr [r9 + zmm17 + 256]
+// CHECK: vpgatherqd ymm19 {k1}, dword ptr [r9 + zmm17 + 256]
 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0x91,0x5c,0x09,0x40]
- vpgatherqd ymm19{k1},YMMWORD PTR [r9+zmm17*1+0x100]
+ vpgatherqd ymm19{k1},DWORD PTR [r9+zmm17*1+0x100]
-// CHECK: vpgatherqd ymm19 {k1}, ymmword ptr [rcx + 4*zmm17 + 1024]
+// CHECK: vpgatherqd ymm19 {k1}, dword ptr [rcx + 4*zmm17 + 1024]
 // CHECK: encoding: [0x62,0xe2,0x7d,0x41,0x91,0x9c,0x89,0x00,0x04,0x00,0x00]
- vpgatherqd ymm19{k1},YMMWORD PTR [rcx+zmm17*4+0x400]
+ vpgatherqd ymm19{k1},DWORD PTR [rcx+zmm17*4+0x400]
-// CHECK: vpgatherqq zmm10 {k1}, zmmword ptr [r14 + 8*zmm13 - 123]
+// CHECK: vpgatherqq zmm10 {k1}, qword ptr [r14 + 8*zmm13 - 123]
 // CHECK: encoding: [0x62,0x12,0xfd,0x49,0x91,0x94,0xee,0x85,0xff,0xff,0xff]
- vpgatherqq zmm10{k1},ZMMWORD PTR [r14+zmm13*8-0x7b]
+ vpgatherqq zmm10{k1},QWORD PTR [r14+zmm13*8-0x7b]
-// CHECK: vpgatherqq zmm10 {k1}, zmmword ptr [r9 + zmm13 + 256]
+// CHECK: vpgatherqq zmm10 {k1}, qword ptr [r9 + zmm13 + 256]
 // CHECK: encoding: [0x62,0x12,0xfd,0x49,0x91,0x54,0x29,0x20]
- vpgatherqq zmm10{k1},ZMMWORD PTR [r9+zmm13*1+0x100]
+ vpgatherqq zmm10{k1},QWORD PTR [r9+zmm13*1+0x100]
-// CHECK: vpgatherqq zmm10 {k1}, zmmword ptr [rcx + 4*zmm13 + 1024]
+// CHECK: vpgatherqq zmm10 {k1}, qword ptr [rcx + 4*zmm13 + 1024]
 // CHECK: encoding: [0x62,0x32,0xfd,0x49,0x91,0x94,0xa9,0x00,0x04,0x00,0x00]
- vpgatherqq zmm10{k1},ZMMWORD PTR [rcx+zmm13*4+0x400]
+ vpgatherqq zmm10{k1},QWORD PTR [rcx+zmm13*4+0x400]
-// CHECK: vpscatterdd zmmword ptr [r14 + 8*zmm4 - 123] {k1}, zmm23
+// CHECK: vpscatterdd dword ptr [r14 + 8*zmm4 - 123] {k1}, zmm23
 // CHECK: encoding: [0x62,0xc2,0x7d,0x49,0xa0,0xbc,0xe6,0x85,0xff,0xff,0xff]
- vpscatterdd ZMMWORD PTR [r14+zmm4*8-0x7b]{k1},zmm23
+ vpscatterdd DWORD PTR [r14+zmm4*8-0x7b]{k1},zmm23
-// CHECK: vpscatterdd zmmword ptr [r14 + 8*zmm4 - 123] {k1}, zmm23
+// CHECK: vpscatterdd dword ptr [r14 + 8*zmm4 - 123] {k1}, zmm23
 // CHECK: encoding: [0x62,0xc2,0x7d,0x49,0xa0,0xbc,0xe6,0x85,0xff,0xff,0xff]
- vpscatterdd ZMMWORD PTR [r14+zmm4*8-0x7b]{k1},zmm23
+ vpscatterdd DWORD PTR [r14+zmm4*8-0x7b]{k1},zmm23
-// CHECK: vpscatterdd zmmword ptr [r9 + zmm4 + 256] {k1}, zmm23
+// CHECK: vpscatterdd dword ptr [r9 + zmm4 + 256] {k1}, zmm23
 // CHECK: encoding: [0x62,0xc2,0x7d,0x49,0xa0,0x7c,0x21,0x40]
- vpscatterdd ZMMWORD PTR [r9+zmm4*1+0x100]{k1},zmm23
+ vpscatterdd DWORD PTR [r9+zmm4*1+0x100]{k1},zmm23
-// CHECK: vpscatterdd zmmword ptr [rcx + 4*zmm4 + 1024] {k1}, zmm23
+// CHECK: vpscatterdd dword ptr [rcx + 4*zmm4 + 1024] {k1}, zmm23
 // CHECK: encoding: [0x62,0xe2,0x7d,0x49,0xa0,0xbc,0xa1,0x00,0x04,0x00,0x00]
- vpscatterdd ZMMWORD PTR [rcx+zmm4*4+0x400]{k1},zmm23
+ vpscatterdd DWORD PTR [rcx+zmm4*4+0x400]{k1},zmm23
-// CHECK: vpscatterdq zmmword ptr [r14 + 8*ymm25 - 123] {k1}, zmm1
+// CHECK: vpscatterdq qword ptr [r14 + 8*ymm25 - 123] {k1}, zmm1
 // CHECK: encoding: [0x62,0x92,0xfd,0x41,0xa0,0x8c,0xce,0x85,0xff,0xff,0xff]
- vpscatterdq ZMMWORD PTR [r14+ymm25*8-0x7b]{k1},zmm1
+ vpscatterdq QWORD PTR [r14+ymm25*8-0x7b]{k1},zmm1
-// CHECK: vpscatterdq zmmword ptr [r14 + 8*ymm25 - 123] {k1}, zmm1
+// CHECK: vpscatterdq qword ptr [r14 + 8*ymm25 - 123] {k1}, zmm1
 // CHECK: encoding: [0x62,0x92,0xfd,0x41,0xa0,0x8c,0xce,0x85,0xff,0xff,0xff]
- vpscatterdq ZMMWORD PTR [r14+ymm25*8-0x7b]{k1},zmm1
+ vpscatterdq QWORD PTR [r14+ymm25*8-0x7b]{k1},zmm1
-// CHECK: vpscatterdq zmmword ptr [r9 + ymm25 + 256] {k1}, zmm1
+// CHECK: vpscatterdq qword ptr [r9 + ymm25 + 256] {k1}, zmm1
 // CHECK: encoding: [0x62,0x92,0xfd,0x41,0xa0,0x4c,0x09,0x20]
- vpscatterdq ZMMWORD PTR [r9+ymm25*1+0x100]{k1},zmm1
+ vpscatterdq QWORD PTR [r9+ymm25*1+0x100]{k1},zmm1
-// CHECK: vpscatterdq zmmword ptr [rcx + 4*ymm25 + 1024] {k1}, zmm1
+// CHECK: vpscatterdq qword ptr [rcx + 4*ymm25 + 1024] {k1}, zmm1
 // CHECK: encoding: [0x62,0xb2,0xfd,0x41,0xa0,0x8c,0x89,0x00,0x04,0x00,0x00]
- vpscatterdq ZMMWORD PTR [rcx+ymm25*4+0x400]{k1},zmm1
+ vpscatterdq QWORD PTR [rcx+ymm25*4+0x400]{k1},zmm1
-// CHECK: vpscatterqd ymmword ptr [r14 + 8*zmm22 - 123] {k1}, ymm23
+// CHECK: vpscatterqd dword ptr [r14 + 8*zmm22 - 123] {k1}, ymm23
 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0xa1,0xbc,0xf6,0x85,0xff,0xff,0xff]
- vpscatterqd YMMWORD PTR [r14+zmm22*8-0x7b]{k1},ymm23
+ vpscatterqd DWORD PTR [r14+zmm22*8-0x7b]{k1},ymm23
-// CHECK: vpscatterqd ymmword ptr [r14 + 8*zmm22 - 123] {k1}, ymm23
+// CHECK: vpscatterqd dword ptr [r14 + 8*zmm22 - 123] {k1}, ymm23
 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0xa1,0xbc,0xf6,0x85,0xff,0xff,0xff]
- vpscatterqd YMMWORD PTR [r14+zmm22*8-0x7b]{k1},ymm23
+ vpscatterqd DWORD PTR [r14+zmm22*8-0x7b]{k1},ymm23
-// CHECK: vpscatterqd ymmword ptr [r9 + zmm22 + 256] {k1}, ymm23
+// CHECK: vpscatterqd dword ptr [r9 + zmm22 + 256] {k1}, ymm23
 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0xa1,0x7c,0x31,0x40]
- vpscatterqd YMMWORD PTR [r9+zmm22*1+0x100]{k1},ymm23
+ vpscatterqd DWORD PTR [r9+zmm22*1+0x100]{k1},ymm23
-// CHECK: vpscatterqd ymmword ptr [rcx + 4*zmm22 + 1024] {k1}, ymm23
+// CHECK: vpscatterqd dword ptr [rcx + 4*zmm22 + 1024] {k1}, ymm23
 // CHECK: encoding: [0x62,0xe2,0x7d,0x41,0xa1,0xbc,0xb1,0x00,0x04,0x00,0x00]
- vpscatterqd YMMWORD PTR [rcx+zmm22*4+0x400]{k1},ymm23
+ vpscatterqd DWORD PTR [rcx+zmm22*4+0x400]{k1},ymm23
-// CHECK: vpscatterqq zmmword ptr [r14 + 8*zmm8 - 123] {k1}, zmm2
+// CHECK: vpscatterqq qword ptr [r14 + 8*zmm8 - 123] {k1}, zmm2
 // CHECK: encoding: [0x62,0x92,0xfd,0x49,0xa1,0x94,0xc6,0x85,0xff,0xff,0xff]
- vpscatterqq ZMMWORD PTR [r14+zmm8*8-0x7b]{k1},zmm2
+ vpscatterqq QWORD PTR [r14+zmm8*8-0x7b]{k1},zmm2
-// CHECK: vpscatterqq zmmword ptr [r14 + 8*zmm8 - 123] {k1}, zmm2
+// CHECK: vpscatterqq qword ptr [r14 + 8*zmm8 - 123] {k1}, zmm2
 // CHECK: encoding: [0x62,0x92,0xfd,0x49,0xa1,0x94,0xc6,0x85,0xff,0xff,0xff]
- vpscatterqq ZMMWORD PTR [r14+zmm8*8-0x7b]{k1},zmm2
+ vpscatterqq QWORD PTR [r14+zmm8*8-0x7b]{k1},zmm2
-// CHECK: vpscatterqq zmmword ptr [r9 + zmm8 + 256] {k1}, zmm2
+// CHECK: vpscatterqq qword ptr [r9 + zmm8 + 256] {k1}, zmm2
 // CHECK: encoding: [0x62,0x92,0xfd,0x49,0xa1,0x54,0x01,0x20]
- vpscatterqq ZMMWORD PTR [r9+zmm8*1+0x100]{k1},zmm2
+ vpscatterqq QWORD PTR [r9+zmm8*1+0x100]{k1},zmm2
-// CHECK: vpscatterqq zmmword ptr [rcx + 4*zmm8 + 1024] {k1}, zmm2
+// CHECK: vpscatterqq qword ptr [rcx + 4*zmm8 + 1024] {k1}, zmm2
 // CHECK: encoding: [0x62,0xb2,0xfd,0x49,0xa1,0x94,0x81,0x00,0x04,0x00,0x00]
- vpscatterqq ZMMWORD PTR [rcx+zmm8*4+0x400]{k1},zmm2
+ vpscatterqq QWORD PTR [rcx+zmm8*4+0x400]{k1},zmm2
diff --git a/llvm/test/MC/X86/avx512f_vl-intel.s b/llvm/test/MC/X86/avx512f_vl-intel.s
index 31c43afe50171..ed3292b83f4d7 100644
--- a/llvm/test/MC/X86/avx512f_vl-intel.s
+++ b/llvm/test/MC/X86/avx512f_vl-intel.s
@@ -224,901 +224,901 @@
 // CHECK: encoding: [0x62,0xf1,0x64,0x30,0xc2,0xa2,0xfc,0xfd,0xff,0xff,0x7b]
 vcmpps k4,ymm19,DWORD PTR [rdx-0x204]{1to8},0x7b
-// CHECK: vgatherdpd xmm17 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+// CHECK: vgatherdpd xmm17 {k1}, qword ptr [r14 + 8*xmm31 + 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x92,0x8c,0xfe,0x7b,0x00,0x00,0x00]
- vgatherdpd xmm17 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+ vgatherdpd xmm17 {k1}, qword ptr [r14 + 8*xmm31 + 123]
-// CHECK: vgatherdpd xmm17 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vgatherdpd xmm17 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x92,0x4c,0x39,0x20]
- vgatherdpd xmm17 {k1}, xmmword ptr [r9 + xmm31 + 256]
+ vgatherdpd xmm17 {k1}, qword ptr [r9 + xmm31 + 256]
-// CHECK: vgatherdpd xmm17 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vgatherdpd xmm17 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0x92,0x8c,0xb9,0x00,0x04,0x00,0x00]
- vgatherdpd xmm17 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+ vgatherdpd xmm17 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
-// CHECK: vgatherdpd ymm23 {k1}, ymmword ptr [r14 + 8*xmm31 + 123]
+// CHECK: vgatherdpd ymm23 {k1}, qword ptr [r14 + 8*xmm31 + 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0x92,0xbc,0xfe,0x7b,0x00,0x00,0x00]
- vgatherdpd ymm23 {k1}, ymmword ptr [r14 + 8*xmm31 + 123]
+ vgatherdpd ymm23 {k1}, qword ptr [r14 + 8*xmm31 + 123]
-// CHECK: vgatherdpd ymm23 {k1}, ymmword ptr [r9 + xmm31 + 256]
+// CHECK: vgatherdpd ymm23 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0x92,0x7c,0x39,0x20]
- vgatherdpd ymm23 {k1}, ymmword ptr [r9 + xmm31 + 256]
+ vgatherdpd ymm23 {k1}, qword ptr [r9 + xmm31 + 256]
-// CHECK: vgatherdpd ymm23 {k1}, ymmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vgatherdpd ymm23 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x21,0x92,0xbc,0xb9,0x00,0x04,0x00,0x00]
- vgatherdpd ymm23 {k1}, ymmword ptr [rcx + 4*xmm31 + 1024]
+ vgatherdpd ymm23 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
-// CHECK: vgatherdpd xmm23 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+// CHECK: vgatherdpd xmm23 {k1}, qword ptr [r14 + 8*xmm31 - 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x92,0xbc,0xfe,0x85,0xff,0xff,0xff]
- vgatherdpd xmm23 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+ vgatherdpd xmm23 {k1}, qword ptr [r14 + 8*xmm31 - 123]
-// CHECK: vgatherdpd xmm23 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vgatherdpd xmm23 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x92,0x7c,0x39,0x20]
- vgatherdpd xmm23 {k1}, xmmword ptr [r9 + xmm31 + 256]
+ vgatherdpd xmm23 {k1}, qword ptr [r9 + xmm31 + 256]
-// CHECK: vgatherdpd xmm23 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vgatherdpd xmm23 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0x92,0xbc,0xb9,0x00,0x04,0x00,0x00]
- vgatherdpd xmm23 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+ vgatherdpd xmm23 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
-// CHECK: vgatherdpd ymm18 {k1}, ymmword ptr [r14 + 8*xmm31 - 123]
+// CHECK: vgatherdpd ymm18 {k1}, qword ptr [r14 + 8*xmm31 - 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0x92,0x94,0xfe,0x85,0xff,0xff,0xff]
- vgatherdpd ymm18 {k1}, ymmword ptr [r14 + 8*xmm31 - 123]
+ vgatherdpd ymm18 {k1}, qword ptr [r14 + 8*xmm31 - 123]
-// CHECK: vgatherdpd ymm18 {k1}, ymmword ptr [r9 + xmm31 + 256]
+// CHECK: vgatherdpd ymm18 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0x92,0x54,0x39,0x20]
- vgatherdpd ymm18 {k1}, ymmword ptr [r9 + xmm31 + 256]
+ vgatherdpd ymm18 {k1}, qword ptr [r9 + xmm31 + 256]
-// CHECK: vgatherdpd ymm18 {k1}, ymmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vgatherdpd ymm18 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x21,0x92,0x94,0xb9,0x00,0x04,0x00,0x00]
- vgatherdpd ymm18 {k1}, ymmword ptr [rcx + 4*xmm31 + 1024]
+ vgatherdpd ymm18 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
-// CHECK: vgatherdps xmm18 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+// CHECK: vgatherdps xmm18 {k1}, dword ptr [r14 + 8*xmm31 + 123]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x92,0x94,0xfe,0x7b,0x00,0x00,0x00]
- vgatherdps xmm18 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+ vgatherdps xmm18 {k1}, dword ptr [r14 + 8*xmm31 + 123]
-// CHECK: vgatherdps xmm18 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vgatherdps xmm18 {k1}, dword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x92,0x54,0x39,0x40]
- vgatherdps xmm18 {k1}, xmmword ptr [r9 + xmm31 + 256]
+ vgatherdps xmm18 {k1}, dword ptr [r9 + xmm31 + 256]
-// CHECK: vgatherdps xmm18 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vgatherdps xmm18 {k1}, dword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0x7d,0x01,0x92,0x94,0xb9,0x00,0x04,0x00,0x00]
- vgatherdps xmm18 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+ vgatherdps xmm18 {k1}, dword ptr [rcx + 4*xmm31 + 1024]
-// CHECK: vgatherdps ymm27 {k1}, ymmword ptr [r14 + 8*ymm31 + 123]
+// CHECK: vgatherdps ymm27 {k1}, dword ptr [r14 + 8*ymm31 + 123]
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0x92,0x9c,0xfe,0x7b,0x00,0x00,0x00]
- vgatherdps ymm27 {k1}, ymmword ptr [r14 + 8*ymm31 + 123]
+ vgatherdps ymm27 {k1}, dword ptr [r14 + 8*ymm31 + 123]
-// CHECK: vgatherdps ymm27 {k1}, ymmword ptr [r9 + ymm31 + 256]
+// CHECK: vgatherdps ymm27 {k1}, dword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0x92,0x5c,0x39,0x40]
- vgatherdps ymm27 {k1}, ymmword ptr [r9 + ymm31 + 256]
+ vgatherdps ymm27 {k1}, dword ptr [r9 + ymm31 + 256]
-// CHECK: vgatherdps ymm27 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vgatherdps ymm27 {k1}, dword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0x22,0x7d,0x21,0x92,0x9c,0xb9,0x00,0x04,0x00,0x00]
- vgatherdps ymm27 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+ vgatherdps ymm27 {k1}, dword ptr [rcx + 4*ymm31 + 1024]
-// CHECK: vgatherdps xmm29 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+// CHECK: vgatherdps xmm29 {k1}, dword ptr [r14 + 8*xmm31 - 123]
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0x92,0xac,0xfe,0x85,0xff,0xff,0xff]
- vgatherdps xmm29 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+ vgatherdps xmm29 {k1}, dword ptr [r14 + 8*xmm31 - 123]
-// CHECK: vgatherdps xmm29 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vgatherdps xmm29 {k1}, dword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0x92,0x6c,0x39,0x40]
- vgatherdps xmm29 {k1}, xmmword ptr [r9 + xmm31 + 256]
+ vgatherdps xmm29 {k1}, dword ptr [r9 + xmm31 + 256]
-// CHECK: vgatherdps xmm29 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vgatherdps xmm29 {k1}, dword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0x22,0x7d,0x01,0x92,0xac,0xb9,0x00,0x04,0x00,0x00]
- vgatherdps xmm29 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+ vgatherdps xmm29 {k1}, dword ptr [rcx + 4*xmm31 + 1024]
-// CHECK: vgatherdps ymm21 {k1}, ymmword ptr [r14 + 8*ymm31 - 123]
+// CHECK: vgatherdps ymm21 {k1}, dword ptr [r14 + 8*ymm31 - 123]
 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0x92,0xac,0xfe,0x85,0xff,0xff,0xff]
- vgatherdps ymm21 {k1}, ymmword ptr [r14 + 8*ymm31 - 123]
+ vgatherdps ymm21 {k1}, dword ptr [r14 + 8*ymm31 - 123]
-// CHECK: vgatherdps ymm21 {k1}, ymmword ptr [r9 + ymm31 + 256]
+// CHECK: vgatherdps ymm21 {k1}, dword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0x92,0x6c,0x39,0x40]
- vgatherdps ymm21 {k1}, ymmword ptr [r9 + ymm31 + 256]
+ vgatherdps ymm21 {k1}, dword ptr [r9 + ymm31 + 256]
-// CHECK: vgatherdps ymm21 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vgatherdps ymm21 {k1}, dword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0x7d,0x21,0x92,0xac,0xb9,0x00,0x04,0x00,0x00]
- vgatherdps ymm21 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+ vgatherdps ymm21 {k1}, dword ptr [rcx + 4*ymm31 + 1024]
-// CHECK: vgatherqpd xmm17 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+// CHECK: vgatherqpd xmm17 {k1}, qword ptr [r14 + 8*xmm31 + 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x93,0x8c,0xfe,0x7b,0x00,0x00,0x00]
- vgatherqpd xmm17 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+ vgatherqpd xmm17 {k1}, qword ptr [r14 + 8*xmm31 + 123]
-// CHECK: vgatherqpd xmm17 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vgatherqpd xmm17 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x93,0x4c,0x39,0x20]
- vgatherqpd xmm17 {k1}, xmmword ptr [r9 + xmm31 + 256]
+ vgatherqpd xmm17 {k1}, qword ptr [r9 + xmm31 + 256]
-// CHECK: vgatherqpd xmm17 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vgatherqpd xmm17 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0x93,0x8c,0xb9,0x00,0x04,0x00,0x00]
- vgatherqpd xmm17 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+ vgatherqpd xmm17 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
-// CHECK: vgatherqpd ymm29 {k1}, ymmword ptr [r14 + 8*ymm31 + 123]
+// CHECK: vgatherqpd ymm29 {k1}, qword ptr [r14 + 8*ymm31 + 123]
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0x93,0xac,0xfe,0x7b,0x00,0x00,0x00]
- vgatherqpd ymm29 {k1}, ymmword ptr [r14 + 8*ymm31 + 123]
+ vgatherqpd ymm29 {k1}, qword ptr [r14 + 8*ymm31 + 123]
-// CHECK: vgatherqpd ymm29 {k1}, ymmword ptr [r9 + ymm31 + 256]
+// CHECK: vgatherqpd ymm29 {k1}, qword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0x93,0x6c,0x39,0x20]
- vgatherqpd ymm29 {k1}, ymmword ptr [r9 + ymm31 + 256]
+ vgatherqpd ymm29 {k1}, qword ptr [r9 + ymm31 + 256]
-// CHECK: vgatherqpd ymm29 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vgatherqpd ymm29 {k1}, qword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0x22,0xfd,0x21,0x93,0xac,0xb9,0x00,0x04,0x00,0x00]
- vgatherqpd ymm29 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+ vgatherqpd ymm29 {k1}, qword ptr [rcx + 4*ymm31 + 1024]
-// CHECK: vgatherqpd xmm18 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+// CHECK: vgatherqpd xmm18 {k1}, qword ptr [r14 + 8*xmm31 - 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x93,0x94,0xfe,0x85,0xff,0xff,0xff]
- vgatherqpd xmm18 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+ vgatherqpd xmm18 {k1}, qword ptr [r14 + 8*xmm31 - 123]
-// CHECK: vgatherqpd xmm18 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vgatherqpd xmm18 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x93,0x54,0x39,0x20]
- vgatherqpd xmm18 {k1}, xmmword ptr [r9 + xmm31 + 256]
+ vgatherqpd xmm18 {k1}, qword ptr [r9 + xmm31 + 256]
-// CHECK: vgatherqpd xmm18 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vgatherqpd xmm18 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0x93,0x94,0xb9,0x00,0x04,0x00,0x00]
- vgatherqpd xmm18 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+ vgatherqpd xmm18 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
-// CHECK: vgatherqpd ymm21 {k1}, ymmword ptr [r14 + 8*ymm31 - 123]
+// CHECK: vgatherqpd ymm21 {k1}, qword ptr [r14 + 8*ymm31 - 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0x93,0xac,0xfe,0x85,0xff,0xff,0xff]
- vgatherqpd ymm21 {k1}, ymmword ptr [r14 + 8*ymm31 - 123]
+ vgatherqpd ymm21 {k1}, qword ptr [r14 + 8*ymm31 - 123]
-// CHECK: vgatherqpd ymm21 {k1}, ymmword ptr [r9 + ymm31 + 256]
+// CHECK: vgatherqpd ymm21 {k1}, qword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0x93,0x6c,0x39,0x20]
- vgatherqpd ymm21 {k1}, ymmword ptr [r9 + ymm31 + 256]
+ vgatherqpd ymm21 {k1}, qword ptr [r9 + ymm31 + 256]
-// CHECK: vgatherqpd ymm21 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vgatherqpd ymm21 {k1}, qword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x21,0x93,0xac,0xb9,0x00,0x04,0x00,0x00]
- vgatherqpd ymm21 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+ vgatherqpd ymm21 {k1}, qword ptr [rcx + 4*ymm31 + 1024]
-// CHECK: vgatherqps xmm21 {k1}, qword ptr [r14 + 8*xmm31 + 123]
+// CHECK: vgatherqps xmm21 {k1}, dword ptr [r14 + 8*xmm31 + 123]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x93,0xac,0xfe,0x7b,0x00,0x00,0x00]
- vgatherqps xmm21 {k1}, qword ptr [r14 + 8*xmm31 + 123]
+ vgatherqps xmm21 {k1}, dword ptr [r14 + 8*xmm31 + 123]
-// CHECK: vgatherqps xmm21 {k1}, qword ptr [r9 + xmm31 + 256]
+// CHECK: vgatherqps xmm21 {k1}, dword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x93,0x6c,0x39,0x40]
- vgatherqps xmm21 {k1}, qword ptr [r9 + xmm31 + 256]
+ vgatherqps xmm21 {k1}, dword ptr [r9 + xmm31 + 256]
-// CHECK: vgatherqps xmm21 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vgatherqps xmm21 {k1}, dword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0x7d,0x01,0x93,0xac,0xb9,0x00,0x04,0x00,0x00]
- vgatherqps xmm21 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
+ vgatherqps xmm21 {k1}, dword ptr [rcx + 4*xmm31 + 1024]
-// CHECK: vgatherqps xmm19 {k1}, xmmword ptr [r14 + 8*ymm31 + 123]
+// CHECK: vgatherqps xmm19 {k1}, dword ptr [r14 + 8*ymm31 + 123]
 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0x93,0x9c,0xfe,0x7b,0x00,0x00,0x00]
- vgatherqps xmm19 {k1}, xmmword ptr [r14 + 8*ymm31 + 123]
+ vgatherqps xmm19 {k1}, dword ptr [r14 + 8*ymm31 + 123]
-// CHECK: vgatherqps xmm19 {k1}, xmmword ptr [r9 + ymm31 + 256]
+// CHECK: vgatherqps xmm19 {k1}, dword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0x93,0x5c,0x39,0x40]
- vgatherqps xmm19 {k1}, xmmword ptr [r9 + ymm31 + 256]
+ vgatherqps xmm19 {k1}, dword ptr [r9 + ymm31 + 256]
-// CHECK: vgatherqps xmm19 {k1}, xmmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vgatherqps xmm19 {k1}, dword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0x7d,0x21,0x93,0x9c,0xb9,0x00,0x04,0x00,0x00]
- vgatherqps xmm19 {k1}, xmmword ptr [rcx + 4*ymm31 + 1024]
+ vgatherqps xmm19 {k1}, dword ptr [rcx + 4*ymm31 + 1024]
-// CHECK: vgatherqps xmm22 {k1}, qword ptr [r14 + 8*xmm31 - 123]
+// CHECK: vgatherqps xmm22 {k1}, dword ptr [r14 + 8*xmm31 - 123]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x93,0xb4,0xfe,0x85,0xff,0xff,0xff]
- vgatherqps xmm22 {k1}, qword ptr [r14 + 8*xmm31 - 123]
+ vgatherqps xmm22 {k1}, dword ptr [r14 + 8*xmm31 - 123]
-// CHECK: vgatherqps xmm22 {k1}, qword ptr [r9 + xmm31 + 256]
+// CHECK: vgatherqps xmm22 {k1}, dword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x93,0x74,0x39,0x40]
- vgatherqps xmm22 {k1}, qword ptr [r9 + xmm31 + 256]
+ vgatherqps xmm22 {k1}, dword ptr [r9 + xmm31 + 256]
-// CHECK: vgatherqps xmm22 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vgatherqps xmm22 {k1}, dword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0x7d,0x01,0x93,0xb4,0xb9,0x00,0x04,0x00,0x00]
- vgatherqps xmm22 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
+ vgatherqps xmm22 {k1}, dword ptr [rcx + 4*xmm31 + 1024]
-// CHECK: vgatherqps xmm30 {k1}, xmmword ptr [r14 + 8*ymm31 - 123]
+// CHECK: vgatherqps xmm30 {k1}, dword ptr [r14 + 8*ymm31 - 123]
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0x93,0xb4,0xfe,0x85,0xff,0xff,0xff]
- vgatherqps xmm30 {k1}, xmmword ptr [r14 + 8*ymm31 - 123]
+ vgatherqps xmm30 {k1}, dword ptr [r14 + 8*ymm31 - 123]
-// CHECK: vgatherqps xmm30 {k1}, xmmword ptr [r9 + ymm31 + 256]
+// CHECK: vgatherqps xmm30 {k1}, dword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0x93,0x74,0x39,0x40]
- vgatherqps xmm30 {k1}, xmmword ptr [r9 + ymm31 + 256]
+ vgatherqps xmm30 {k1}, dword ptr [r9 + ymm31 + 256]
-// CHECK: vgatherqps xmm30 {k1}, xmmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vgatherqps xmm30 {k1}, dword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0x22,0x7d,0x21,0x93,0xb4,0xb9,0x00,0x04,0x00,0x00]
- vgatherqps xmm30 {k1}, xmmword ptr [rcx + 4*ymm31 + 1024]
+ vgatherqps xmm30 {k1}, dword ptr [rcx + 4*ymm31 + 1024]
-// CHECK: vpgatherdd xmm17 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+// CHECK: vpgatherdd xmm17 {k1}, dword ptr [r14 + 8*xmm31 + 123]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x90,0x8c,0xfe,0x7b,0x00,0x00,0x00]
- vpgatherdd xmm17 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+ vpgatherdd xmm17 {k1}, dword ptr [r14 + 8*xmm31 + 123]
-// CHECK: vpgatherdd xmm17 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vpgatherdd xmm17 {k1}, dword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x90,0x4c,0x39,0x40]
- vpgatherdd xmm17 {k1}, xmmword ptr [r9 + xmm31 + 256]
+ vpgatherdd xmm17 {k1}, dword ptr [r9 + xmm31 + 256]
-// CHECK: vpgatherdd xmm17 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vpgatherdd xmm17 {k1}, dword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0x7d,0x01,0x90,0x8c,0xb9,0x00,0x04,0x00,0x00]
- vpgatherdd xmm17 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+ vpgatherdd xmm17 {k1}, dword ptr [rcx + 4*xmm31 + 1024]
-// CHECK: vpgatherdd ymm19 {k1}, ymmword ptr [r14 + 8*ymm31 + 123]
+// CHECK: vpgatherdd ymm19 {k1}, dword ptr [r14 + 8*ymm31 + 123]
 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0x90,0x9c,0xfe,0x7b,0x00,0x00,0x00]
- vpgatherdd ymm19 {k1}, ymmword ptr [r14 + 8*ymm31 + 123]
+ vpgatherdd ymm19 {k1}, dword ptr [r14 + 8*ymm31 + 123]
-// CHECK: vpgatherdd ymm19 {k1}, ymmword ptr [r9 + ymm31 + 256]
+// CHECK: vpgatherdd ymm19 {k1}, dword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0x90,0x5c,0x39,0x40]
- vpgatherdd ymm19 {k1}, ymmword ptr [r9 + ymm31 + 256]
+ vpgatherdd ymm19 {k1}, dword ptr [r9 + ymm31 + 256]
-// CHECK: vpgatherdd ymm19 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vpgatherdd ymm19 {k1}, dword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0x7d,0x21,0x90,0x9c,0xb9,0x00,0x04,0x00,0x00]
- vpgatherdd ymm19 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+ vpgatherdd ymm19 {k1}, dword ptr [rcx + 4*ymm31 + 1024]
-// CHECK: vpgatherdd xmm22 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+// CHECK: vpgatherdd xmm22 {k1}, dword ptr [r14 + 8*xmm31 - 123]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x90,0xb4,0xfe,0x85,0xff,0xff,0xff]
- vpgatherdd xmm22 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+ vpgatherdd xmm22 {k1}, dword ptr [r14 + 8*xmm31 - 123]
-// CHECK: vpgatherdd xmm22 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vpgatherdd xmm22 {k1}, dword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x90,0x74,0x39,0x40]
- vpgatherdd xmm22 {k1}, xmmword ptr [r9 + xmm31 + 256]
+ vpgatherdd xmm22 {k1}, dword ptr [r9 + xmm31 + 256]
-// CHECK: vpgatherdd xmm22 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vpgatherdd xmm22 {k1}, dword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0x7d,0x01,0x90,0xb4,0xb9,0x00,0x04,0x00,0x00]
- vpgatherdd xmm22 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+ vpgatherdd xmm22 {k1}, dword ptr [rcx + 4*xmm31 + 1024]
-// CHECK: vpgatherdd ymm29 {k1}, ymmword ptr [r14 + 8*ymm31 - 123]
+// CHECK: vpgatherdd ymm29 {k1}, dword ptr [r14 + 8*ymm31 - 123]
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0x90,0xac,0xfe,0x85,0xff,0xff,0xff]
- vpgatherdd ymm29 {k1}, ymmword ptr [r14 + 8*ymm31 - 123]
+ vpgatherdd ymm29 {k1}, dword ptr [r14 + 8*ymm31 - 123]
-// CHECK: vpgatherdd ymm29 {k1}, ymmword ptr [r9 + ymm31 + 256]
+// CHECK: vpgatherdd ymm29 {k1}, dword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0x90,0x6c,0x39,0x40]
- vpgatherdd ymm29 {k1}, ymmword ptr [r9 + ymm31 + 256]
+ vpgatherdd ymm29 {k1}, dword ptr [r9 + ymm31 + 256]
-// CHECK: vpgatherdd ymm29 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vpgatherdd ymm29 {k1}, dword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0x22,0x7d,0x21,0x90,0xac,0xb9,0x00,0x04,0x00,0x00]
- vpgatherdd ymm29 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+ vpgatherdd ymm29 {k1}, dword ptr [rcx + 4*ymm31 + 1024]
-// CHECK: vpgatherdq xmm17 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+// CHECK: vpgatherdq xmm17 {k1}, qword ptr [r14 + 8*xmm31 + 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x90,0x8c,0xfe,0x7b,0x00,0x00,0x00]
- vpgatherdq xmm17 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+ vpgatherdq xmm17 {k1}, qword ptr [r14 + 8*xmm31 + 123]
-// CHECK: vpgatherdq xmm17 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vpgatherdq xmm17 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x90,0x4c,0x39,0x20]
- vpgatherdq xmm17 {k1}, xmmword ptr [r9 + xmm31 + 256]
+ vpgatherdq xmm17 {k1}, qword ptr [r9 + xmm31 + 256]
-// CHECK: vpgatherdq xmm17 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vpgatherdq xmm17 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0x90,0x8c,0xb9,0x00,0x04,0x00,0x00]
- vpgatherdq xmm17 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+ vpgatherdq xmm17 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
-// CHECK: vpgatherdq ymm26 {k1}, ymmword ptr [r14 + 8*xmm31 + 123]
+// CHECK: vpgatherdq ymm26 {k1}, qword ptr [r14 + 8*xmm31 + 123]
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0x90,0x94,0xfe,0x7b,0x00,0x00,0x00]
- vpgatherdq ymm26 {k1}, ymmword ptr [r14 + 8*xmm31 + 123]
+ vpgatherdq ymm26 {k1}, qword ptr [r14 + 8*xmm31 + 123]
-// CHECK: vpgatherdq ymm26 {k1}, ymmword ptr [r9 + xmm31 + 256]
+// CHECK: vpgatherdq ymm26 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0x90,0x54,0x39,0x20]
- vpgatherdq ymm26 {k1}, ymmword ptr [r9 + xmm31 + 256]
+ vpgatherdq ymm26 {k1}, qword ptr [r9 + xmm31 + 256]
-// CHECK: vpgatherdq ymm26 {k1}, ymmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vpgatherdq ymm26 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0x22,0xfd,0x21,0x90,0x94,0xb9,0x00,0x04,0x00,0x00]
- vpgatherdq ymm26 {k1}, ymmword ptr [rcx + 4*xmm31 + 1024]
+ vpgatherdq ymm26 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
-// CHECK: vpgatherdq xmm25 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+// CHECK: vpgatherdq xmm25 {k1}, qword ptr [r14 + 8*xmm31 - 123]
 // CHECK: encoding: [0x62,0x02,0xfd,0x01,0x90,0x8c,0xfe,0x85,0xff,0xff,0xff]
- vpgatherdq xmm25 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+ vpgatherdq xmm25 {k1}, qword ptr [r14 + 8*xmm31 - 123]
-// CHECK: vpgatherdq xmm25 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vpgatherdq xmm25 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x02,0xfd,0x01,0x90,0x4c,0x39,0x20]
- vpgatherdq xmm25 {k1}, xmmword ptr [r9 + xmm31 + 256]
+ vpgatherdq xmm25 {k1}, qword ptr [r9 + xmm31 + 256]
-// CHECK: vpgatherdq xmm25 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vpgatherdq xmm25 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0x22,0xfd,0x01,0x90,0x8c,0xb9,0x00,0x04,0x00,0x00]
- vpgatherdq xmm25 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+ vpgatherdq xmm25 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
-// CHECK: vpgatherdq ymm22 {k1}, ymmword ptr [r14 + 8*xmm31 - 123]
+// CHECK: vpgatherdq ymm22 {k1}, qword ptr [r14 + 8*xmm31 - 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0x90,0xb4,0xfe,0x85,0xff,0xff,0xff]
- vpgatherdq ymm22 {k1}, ymmword ptr [r14 + 8*xmm31 - 123]
+ vpgatherdq ymm22 {k1}, qword ptr [r14 + 8*xmm31 - 123]
-// CHECK: vpgatherdq ymm22 {k1}, ymmword ptr [r9 + xmm31 + 256]
+// CHECK: vpgatherdq ymm22 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0x90,0x74,0x39,0x20]
- vpgatherdq ymm22 {k1}, ymmword ptr [r9 + xmm31 + 256]
+ vpgatherdq ymm22 {k1}, qword ptr [r9 + xmm31 + 256]
-// CHECK: vpgatherdq ymm22 {k1}, ymmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vpgatherdq ymm22 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x21,0x90,0xb4,0xb9,0x00,0x04,0x00,0x00]
- vpgatherdq ymm22 {k1}, ymmword ptr [rcx + 4*xmm31 + 1024]
+ vpgatherdq ymm22 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
-// CHECK: vpgatherqd xmm21 {k1}, qword ptr [r14 + 8*xmm31 + 123]
+// CHECK: vpgatherqd xmm21 {k1}, dword ptr [r14 + 8*xmm31 + 123]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x91,0xac,0xfe,0x7b,0x00,0x00,0x00]
- vpgatherqd xmm21 {k1}, qword ptr [r14 + 8*xmm31 + 123]
+ vpgatherqd xmm21 {k1}, dword ptr [r14 + 8*xmm31 + 123]
-// CHECK: vpgatherqd xmm21 {k1}, qword ptr [r9 + xmm31 + 256]
+// CHECK: vpgatherqd xmm21 {k1}, dword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x91,0x6c,0x39,0x40]
- vpgatherqd xmm21 {k1}, qword ptr [r9 + xmm31 + 256]
+ vpgatherqd xmm21 {k1}, dword ptr [r9 + xmm31 + 256]
-// CHECK: vpgatherqd xmm21 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vpgatherqd xmm21 {k1}, dword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0x7d,0x01,0x91,0xac,0xb9,0x00,0x04,0x00,0x00]
- vpgatherqd xmm21 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
+ vpgatherqd xmm21 {k1}, dword ptr [rcx + 4*xmm31 + 1024]
-// CHECK: vpgatherqd xmm25 {k1}, xmmword ptr [r14 + 8*ymm31 + 123]
+// CHECK: vpgatherqd xmm25 {k1}, dword ptr [r14 + 8*ymm31 + 123]
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0x91,0x8c,0xfe,0x7b,0x00,0x00,0x00]
- vpgatherqd xmm25 {k1}, xmmword ptr [r14 + 8*ymm31 + 123]
+ vpgatherqd xmm25 {k1}, dword ptr [r14 + 8*ymm31 + 123]
-// CHECK: vpgatherqd xmm25 {k1}, xmmword ptr [r9 + ymm31 + 256]
+// CHECK: vpgatherqd xmm25 {k1}, dword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0x91,0x4c,0x39,0x40]
- vpgatherqd xmm25 {k1}, xmmword ptr [r9 + ymm31 + 256]
+ vpgatherqd xmm25 {k1}, dword ptr [r9 + ymm31 + 256]
-// CHECK: vpgatherqd xmm25 {k1}, xmmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vpgatherqd xmm25 {k1}, dword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0x22,0x7d,0x21,0x91,0x8c,0xb9,0x00,0x04,0x00,0x00]
- vpgatherqd xmm25 {k1}, xmmword ptr [rcx + 4*ymm31 + 1024]
+ vpgatherqd xmm25 {k1}, dword ptr [rcx + 4*ymm31 + 1024]
-// CHECK: vpgatherqd xmm30 {k1}, qword ptr [r14 + 8*xmm31 - 123]
+// CHECK: vpgatherqd xmm30 {k1}, dword ptr [r14 + 8*xmm31 - 123]
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0x91,0xb4,0xfe,0x85,0xff,0xff,0xff]
- vpgatherqd xmm30 {k1}, qword ptr [r14 + 8*xmm31 - 123]
+ vpgatherqd xmm30 {k1}, dword ptr [r14 + 8*xmm31 - 123]
-// CHECK: vpgatherqd xmm30 {k1}, qword ptr [r9 + xmm31 + 256]
+// CHECK: vpgatherqd xmm30 {k1}, dword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0x91,0x74,0x39,0x40]
- vpgatherqd xmm30 {k1}, qword ptr [r9 + xmm31 + 256]
+ vpgatherqd xmm30 {k1}, dword ptr [r9 + xmm31 + 256]
-// CHECK: vpgatherqd xmm30 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vpgatherqd xmm30 {k1}, dword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0x22,0x7d,0x01,0x91,0xb4,0xb9,0x00,0x04,0x00,0x00]
- vpgatherqd xmm30 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
+ vpgatherqd xmm30 {k1}, dword ptr [rcx + 4*xmm31 + 1024]
-// CHECK: vpgatherqd xmm28 {k1}, xmmword ptr [r14 + 8*ymm31 - 123]
+// CHECK: vpgatherqd xmm28 {k1}, dword ptr [r14 + 8*ymm31 - 123]
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0x91,0xa4,0xfe,0x85,0xff,0xff,0xff]
- vpgatherqd xmm28 {k1}, xmmword ptr [r14 + 8*ymm31 - 123]
+ vpgatherqd xmm28 {k1}, dword ptr [r14 + 8*ymm31 - 123]
-// CHECK: vpgatherqd xmm28 {k1}, xmmword ptr [r9 + ymm31 + 256]
+// CHECK: vpgatherqd xmm28 {k1}, dword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0x91,0x64,0x39,0x40]
- vpgatherqd xmm28 {k1}, xmmword ptr [r9 + ymm31 + 256]
+ vpgatherqd xmm28 {k1}, dword ptr [r9 + ymm31 + 256]
-// CHECK: vpgatherqd xmm28 {k1}, xmmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vpgatherqd xmm28 {k1}, dword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0x22,0x7d,0x21,0x91,0xa4,0xb9,0x00,0x04,0x00,0x00]
- vpgatherqd xmm28 {k1}, xmmword ptr [rcx + 4*ymm31 + 1024]
+ vpgatherqd xmm28 {k1}, dword ptr [rcx + 4*ymm31 + 1024]
-// CHECK: vpgatherqq xmm18 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+// CHECK: vpgatherqq xmm18 {k1}, qword ptr [r14 + 8*xmm31 + 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x91,0x94,0xfe,0x7b,0x00,0x00,0x00]
- vpgatherqq xmm18 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+ vpgatherqq xmm18 {k1}, qword ptr [r14 + 8*xmm31 + 123]
-// CHECK: vpgatherqq xmm18 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vpgatherqq xmm18 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x91,0x54,0x39,0x20]
- vpgatherqq xmm18 {k1}, xmmword ptr [r9 + xmm31 + 256]
+ vpgatherqq xmm18 {k1}, qword ptr [r9 + xmm31 + 256]
-// CHECK: vpgatherqq xmm18 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vpgatherqq xmm18 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0x91,0x94,0xb9,0x00,0x04,0x00,0x00]
- vpgatherqq xmm18 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+ vpgatherqq xmm18 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
-// CHECK: vpgatherqq ymm19 {k1}, ymmword ptr [r14 + 8*ymm31 + 123]
+// CHECK: vpgatherqq ymm19 {k1}, qword ptr [r14 + 8*ymm31 + 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0x91,0x9c,0xfe,0x7b,0x00,0x00,0x00]
- vpgatherqq ymm19 {k1}, ymmword ptr [r14 + 8*ymm31 + 123]
+ vpgatherqq ymm19 {k1}, qword ptr [r14 + 8*ymm31 + 123]
-// CHECK: vpgatherqq ymm19 {k1}, ymmword ptr [r9 + ymm31 + 256]
+// CHECK: vpgatherqq ymm19 {k1}, qword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0x91,0x5c,0x39,0x20]
- vpgatherqq ymm19 {k1}, ymmword ptr [r9 + ymm31 + 256]
+ vpgatherqq ymm19 {k1}, qword ptr [r9 + ymm31 + 256]
-// CHECK: vpgatherqq ymm19 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vpgatherqq ymm19 {k1}, qword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x21,0x91,0x9c,0xb9,0x00,0x04,0x00,0x00]
- vpgatherqq ymm19 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+ vpgatherqq ymm19 {k1}, qword ptr [rcx + 4*ymm31 + 1024]
-// CHECK: vpgatherqq xmm23 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+// CHECK: vpgatherqq xmm23 {k1}, qword ptr [r14 + 8*xmm31 - 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x91,0xbc,0xfe,0x85,0xff,0xff,0xff]
- vpgatherqq xmm23 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+ vpgatherqq xmm23 {k1}, qword ptr [r14 + 8*xmm31 - 123]
-// CHECK: vpgatherqq xmm23 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vpgatherqq xmm23 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x91,0x7c,0x39,0x20]
- vpgatherqq xmm23 {k1}, xmmword ptr [r9 + xmm31 + 256]
+ vpgatherqq xmm23 {k1}, qword ptr [r9 + xmm31 + 256]
-// CHECK: vpgatherqq xmm23 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vpgatherqq xmm23 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0x91,0xbc,0xb9,0x00,0x04,0x00,0x00]
- vpgatherqq xmm23 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+ vpgatherqq xmm23 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
-// CHECK: vpgatherqq ymm26 {k1}, ymmword ptr [r14 + 8*ymm31 - 123]
+// CHECK: vpgatherqq ymm26 {k1}, qword ptr [r14 + 8*ymm31 - 123]
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0x91,0x94,0xfe,0x85,0xff,0xff,0xff]
- vpgatherqq ymm26 {k1}, ymmword ptr [r14 + 8*ymm31 - 123]
+ vpgatherqq ymm26 {k1}, qword ptr [r14 + 8*ymm31 - 123]
-// CHECK: vpgatherqq ymm26 {k1}, ymmword ptr [r9 + ymm31 + 256]
+// CHECK: vpgatherqq ymm26 {k1}, qword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0x91,0x54,0x39,0x20]
- vpgatherqq ymm26 {k1}, ymmword ptr [r9 + ymm31 + 256]
+ vpgatherqq ymm26 {k1}, qword ptr [r9 + ymm31 + 256]
-// CHECK: vpgatherqq ymm26 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vpgatherqq ymm26 {k1}, qword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0x22,0xfd,0x21,0x91,0x94,0xb9,0x00,0x04,0x00,0x00]
- vpgatherqq ymm26 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+ vpgatherqq ymm26 {k1}, qword ptr [rcx + 4*ymm31 + 1024]
-// CHECK: vpscatterdd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm20
+// CHECK: vpscatterdd dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm20
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa0,0xa4,0xfe,0x7b,0x00,0x00,0x00]
- vpscatterdd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm20
+ vpscatterdd dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm20
-// CHECK: vpscatterdd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm20
+// CHECK: vpscatterdd dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm20
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa0,0xa4,0xfe,0x7b,0x00,0x00,0x00]
- vpscatterdd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm20
+ vpscatterdd dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm20
-// CHECK: vpscatterdd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm20
+// CHECK: vpscatterdd dword ptr [r9 + xmm31 + 256] {k1}, xmm20
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa0,0x64,0x39,0x40]
- vpscatterdd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm20
+ vpscatterdd dword ptr [r9 + xmm31 + 256] {k1}, xmm20
-// CHECK: vpscatterdd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm20
+// CHECK: vpscatterdd dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm20
 // CHECK: encoding: [0x62,0xa2,0x7d,0x01,0xa0,0xa4,0xb9,0x00,0x04,0x00,0x00]
- vpscatterdd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm20
+ vpscatterdd dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm20
-// CHECK: vpscatterdd ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm28
+// CHECK: vpscatterdd dword ptr [r14 + 8*ymm31 + 123] {k1}, ymm28
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa0,0xa4,0xfe,0x7b,0x00,0x00,0x00]
- vpscatterdd ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm28
+ vpscatterdd dword ptr [r14 + 8*ymm31 + 123] {k1}, ymm28
-// CHECK: vpscatterdd ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm28
+// CHECK: vpscatterdd dword ptr [r14 + 8*ymm31 + 123] {k1}, ymm28
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa0,0xa4,0xfe,0x7b,0x00,0x00,0x00]
- vpscatterdd ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm28
+ vpscatterdd dword ptr [r14 + 8*ymm31 + 123] {k1}, ymm28
-// CHECK: vpscatterdd ymmword ptr [r9 + ymm31 + 256] {k1}, ymm28
+// CHECK: vpscatterdd dword ptr [r9 + ymm31 + 256] {k1}, ymm28
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa0,0x64,0x39,0x40]
- vpscatterdd ymmword ptr [r9 + ymm31 + 256] {k1}, ymm28
+ vpscatterdd dword ptr [r9 + ymm31 + 256] {k1}, ymm28
-// CHECK: vpscatterdd ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm28
+// CHECK: vpscatterdd dword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm28
 // CHECK: encoding: [0x62,0x22,0x7d,0x21,0xa0,0xa4,0xb9,0x00,0x04,0x00,0x00]
- vpscatterdd ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm28
+ vpscatterdd dword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm28
-// CHECK: vpscatterdd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm17
+// CHECK: vpscatterdd dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm17
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa0,0x8c,0xfe,0x85,0xff,0xff,0xff]
- vpscatterdd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm17
+ vpscatterdd dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm17
-// CHECK: vpscatterdd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm17
+// CHECK: vpscatterdd dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm17
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa0,0x8c,0xfe,0x85,0xff,0xff,0xff]
- vpscatterdd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm17
+ vpscatterdd dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm17
-// CHECK: vpscatterdd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm17
+// CHECK: vpscatterdd dword ptr [r9 + xmm31 + 256] {k1}, xmm17
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa0,0x4c,0x39,0x40]
- vpscatterdd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm17
+ vpscatterdd dword ptr [r9 + xmm31 + 256] {k1}, xmm17
-// CHECK: vpscatterdd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm17
+// CHECK: vpscatterdd dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm17
 // CHECK: encoding: [0x62,0xa2,0x7d,0x01,0xa0,0x8c,0xb9,0x00,0x04,0x00,0x00]
- vpscatterdd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm17
+ vpscatterdd dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm17
-// CHECK: vpscatterdd ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm24
+// CHECK: vpscatterdd dword ptr [r14 + 8*ymm31 - 123] {k1}, ymm24
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa0,0x84,0xfe,0x85,0xff,0xff,0xff]
- vpscatterdd ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm24
+ vpscatterdd dword ptr [r14 + 8*ymm31 - 123] {k1}, ymm24
-// CHECK: vpscatterdd ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm24
+// CHECK: vpscatterdd dword ptr [r14 + 8*ymm31 - 123] {k1}, ymm24
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa0,0x84,0xfe,0x85,0xff,0xff,0xff]
- vpscatterdd ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm24
+ vpscatterdd dword ptr [r14 + 8*ymm31 - 123] {k1}, ymm24
-// CHECK: vpscatterdd ymmword ptr [r9 + ymm31 + 256] {k1}, ymm24
+// CHECK: vpscatterdd dword ptr [r9 + ymm31 + 256] {k1}, ymm24
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa0,0x44,0x39,0x40]
- vpscatterdd ymmword ptr [r9 + ymm31 + 256] {k1}, ymm24
+ vpscatterdd dword ptr [r9 + ymm31 + 256] {k1}, ymm24
-// CHECK: vpscatterdd ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm24
+// CHECK: vpscatterdd dword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm24
 // CHECK: encoding: [0x62,0x22,0x7d,0x21,0xa0,0x84,0xb9,0x00,0x04,0x00,0x00]
- vpscatterdd ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm24
+ vpscatterdd dword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm24
-// CHECK: vpscatterdq xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
+// CHECK: vpscatterdq qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa0,0xac,0xfe,0x7b,0x00,0x00,0x00]
- vpscatterdq xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
+ vpscatterdq qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
-// CHECK: vpscatterdq xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
+// CHECK: vpscatterdq qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa0,0xac,0xfe,0x7b,0x00,0x00,0x00]
- vpscatterdq xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
+ vpscatterdq qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
-// CHECK: vpscatterdq xmmword ptr [r9 + xmm31 + 256] {k1}, xmm21
+// CHECK: vpscatterdq qword ptr [r9 + xmm31 + 256] {k1}, xmm21
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa0,0x6c,0x39,0x20]
- vpscatterdq xmmword ptr [r9 + xmm31 + 256] {k1}, xmm21
+ vpscatterdq qword ptr [r9 + xmm31 + 256] {k1}, xmm21
-// CHECK: vpscatterdq xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm21
+// CHECK: vpscatterdq qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm21
 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0xa0,0xac,0xb9,0x00,0x04,0x00,0x00]
- vpscatterdq xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm21
+ vpscatterdq qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm21
-// CHECK: vpscatterdq ymmword ptr [r14 + 8*xmm31 + 123] {k1}, ymm28
+// CHECK: vpscatterdq qword ptr [r14 + 8*xmm31 + 123] {k1}, ymm28
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa0,0xa4,0xfe,0x7b,0x00,0x00,0x00]
- vpscatterdq ymmword ptr [r14 + 8*xmm31 + 123] {k1}, ymm28
+ vpscatterdq qword ptr [r14 + 8*xmm31 + 123] {k1}, ymm28
-// CHECK: vpscatterdq ymmword ptr [r14 + 8*xmm31 + 123] {k1}, ymm28
+// CHECK: vpscatterdq qword ptr [r14 + 8*xmm31 + 123] {k1}, ymm28
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa0,0xa4,0xfe,0x7b,0x00,0x00,0x00]
- vpscatterdq ymmword ptr [r14 + 8*xmm31 + 123] {k1}, ymm28
+ vpscatterdq qword ptr [r14 + 8*xmm31 + 123] {k1}, ymm28
-// CHECK: vpscatterdq ymmword ptr [r9 + xmm31 + 256] {k1}, ymm28
+// CHECK: vpscatterdq qword ptr [r9 + xmm31 + 256] {k1}, ymm28
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa0,0x64,0x39,0x20]
- vpscatterdq ymmword ptr [r9 + xmm31 + 256] {k1}, ymm28
+ vpscatterdq qword ptr [r9 + xmm31 + 256] {k1}, ymm28
-// CHECK: vpscatterdq ymmword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm28
+// CHECK: vpscatterdq qword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm28
 // CHECK: encoding: [0x62,0x22,0xfd,0x21,0xa0,0xa4,0xb9,0x00,0x04,0x00,0x00]
- vpscatterdq ymmword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm28
+ vpscatterdq qword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm28
-// CHECK: vpscatterdq xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
+// CHECK: vpscatterdq qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
 // CHECK: encoding: [0x62,0x02,0xfd,0x01,0xa0,0xa4,0xfe,0x85,0xff,0xff,0xff]
- vpscatterdq xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
+ vpscatterdq qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
-// CHECK: vpscatterdq xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
+// CHECK: vpscatterdq qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
 // CHECK: encoding: [0x62,0x02,0xfd,0x01,0xa0,0xa4,0xfe,0x85,0xff,0xff,0xff]
- vpscatterdq xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
+ vpscatterdq qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
-// CHECK: vpscatterdq xmmword ptr [r9 + xmm31 + 256] {k1}, xmm28
+// CHECK: vpscatterdq qword ptr [r9 + xmm31 + 256] {k1}, xmm28
 // CHECK: encoding: [0x62,0x02,0xfd,0x01,0xa0,0x64,0x39,0x20]
- vpscatterdq xmmword ptr [r9 + xmm31 + 256] {k1}, xmm28
+ vpscatterdq qword ptr [r9 + xmm31 + 256] {k1}, xmm28
-// CHECK: vpscatterdq xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28
+// CHECK: vpscatterdq qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28
 // CHECK: encoding: [0x62,0x22,0xfd,0x01,0xa0,0xa4,0xb9,0x00,0x04,0x00,0x00]
- vpscatterdq xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28
+ vpscatterdq qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28
-// CHECK: vpscatterdq ymmword ptr [r14 + 8*xmm31 - 123] {k1}, ymm20
+// CHECK: vpscatterdq qword ptr [r14 + 8*xmm31 - 123] {k1}, ymm20
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa0,0xa4,0xfe,0x85,0xff,0xff,0xff]
- vpscatterdq ymmword ptr [r14 + 8*xmm31 - 123] {k1}, ymm20
+ vpscatterdq qword ptr [r14 + 8*xmm31 - 123] {k1}, ymm20
-// CHECK: vpscatterdq ymmword ptr [r14 + 8*xmm31 - 123] {k1}, ymm20
+// CHECK: vpscatterdq qword ptr [r14 + 8*xmm31 - 123] {k1}, ymm20
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa0,0xa4,0xfe,0x85,0xff,0xff,0xff]
- vpscatterdq ymmword ptr [r14 + 8*xmm31 - 123] {k1}, ymm20
+ vpscatterdq qword ptr [r14 + 8*xmm31 - 123] {k1}, ymm20
-// CHECK: vpscatterdq ymmword ptr [r9 + xmm31 + 256] {k1}, ymm20
+// CHECK: vpscatterdq qword ptr [r9 + xmm31 + 256] {k1}, ymm20
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa0,0x64,0x39,0x20]
- vpscatterdq ymmword ptr [r9 + xmm31 + 256] {k1}, ymm20
+ vpscatterdq qword ptr [r9 + xmm31 + 256] {k1}, ymm20
-// CHECK: vpscatterdq ymmword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm20
+// CHECK: vpscatterdq qword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm20
 // CHECK: encoding: [0x62,0xa2,0xfd,0x21,0xa0,0xa4,0xb9,0x00,0x04,0x00,0x00]
- vpscatterdq ymmword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm20
+ vpscatterdq qword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm20
-// CHECK: vpscatterqd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm22
+// CHECK: vpscatterqd dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm22
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa1,0xb4,0xfe,0x7b,0x00,0x00,0x00]
- vpscatterqd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm22
+ vpscatterqd dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm22
-// CHECK: vpscatterqd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm22
+// CHECK: vpscatterqd dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm22
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa1,0xb4,0xfe,0x7b,0x00,0x00,0x00]
- vpscatterqd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm22
+ vpscatterqd dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm22
-// CHECK: vpscatterqd qword ptr [r9 + xmm31 + 256] {k1}, xmm22
+// CHECK: vpscatterqd dword ptr [r9 + xmm31 + 256] {k1}, xmm22
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa1,0x74,0x39,0x40]
- vpscatterqd qword ptr [r9 + xmm31 + 256] {k1}, xmm22
+ vpscatterqd dword ptr [r9 + xmm31 + 256] {k1}, xmm22
-// CHECK: vpscatterqd qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm22
+// CHECK: vpscatterqd dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm22
 // CHECK: encoding: [0x62,0xa2,0x7d,0x01,0xa1,0xb4,0xb9,0x00,0x04,0x00,0x00]
- vpscatterqd qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm22
+ vpscatterqd dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm22
-// CHECK: vpscatterqd xmmword ptr [r14 + 8*ymm31 + 123] {k1}, xmm24
+// CHECK: vpscatterqd dword ptr [r14 + 8*ymm31 + 123] {k1}, xmm24
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa1,0x84,0xfe,0x7b,0x00,0x00,0x00]
- vpscatterqd xmmword ptr [r14 + 8*ymm31 + 123] {k1}, xmm24
+ vpscatterqd dword ptr [r14 + 8*ymm31 + 123] {k1}, xmm24
-// CHECK: vpscatterqd xmmword ptr [r14 + 8*ymm31 + 123] {k1}, xmm24
+// CHECK: vpscatterqd dword ptr [r14 + 8*ymm31 + 123] {k1}, xmm24
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa1,0x84,0xfe,0x7b,0x00,0x00,0x00]
- vpscatterqd xmmword ptr [r14 + 8*ymm31 + 123] {k1}, xmm24
+ vpscatterqd dword ptr [r14 + 8*ymm31 + 123] {k1}, xmm24
-// CHECK: vpscatterqd xmmword ptr [r9 + ymm31 + 256] {k1}, xmm24
+// CHECK: vpscatterqd dword ptr [r9 + ymm31 + 256] {k1}, xmm24
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa1,0x44,0x39,0x40]
- vpscatterqd xmmword ptr [r9 + ymm31 + 256] {k1}, xmm24
+ vpscatterqd dword ptr [r9 + ymm31 + 256] {k1}, xmm24
-// CHECK: vpscatterqd xmmword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm24
+// CHECK: vpscatterqd dword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm24
 // CHECK: encoding: [0x62,0x22,0x7d,0x21,0xa1,0x84,0xb9,0x00,0x04,0x00,0x00]
- vpscatterqd xmmword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm24
+ vpscatterqd dword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm24
-// CHECK: vpscatterqd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm22
+// CHECK: vpscatterqd dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm22
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa1,0xb4,0xfe,0x85,0xff,0xff,0xff]
- vpscatterqd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm22
+ vpscatterqd dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm22
-// CHECK: vpscatterqd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm22
+// CHECK: vpscatterqd dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm22
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa1,0xb4,0xfe,0x85,0xff,0xff,0xff]
- vpscatterqd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm22
+ vpscatterqd dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm22
-// CHECK: vpscatterqd qword ptr [r9 + xmm31 + 256] {k1}, xmm22
+// CHECK: vpscatterqd dword ptr [r9 + xmm31 + 256] {k1}, xmm22
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa1,0x74,0x39,0x40]
- vpscatterqd qword ptr [r9 + xmm31 + 256] {k1}, xmm22
+ vpscatterqd dword ptr [r9 + xmm31 + 256] {k1}, xmm22
-// CHECK: vpscatterqd qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm22
+// CHECK: vpscatterqd dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm22
 // CHECK: encoding: [0x62,0xa2,0x7d,0x01,0xa1,0xb4,0xb9,0x00,0x04,0x00,0x00]
- vpscatterqd qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm22
+ vpscatterqd dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm22
-// CHECK: vpscatterqd xmmword ptr [r14 + 8*ymm31 - 123] {k1}, xmm29
+// CHECK: vpscatterqd dword ptr [r14 + 8*ymm31 - 123] {k1}, xmm29
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa1,0xac,0xfe,0x85,0xff,0xff,0xff]
- vpscatterqd xmmword ptr [r14 + 8*ymm31 - 123] {k1}, xmm29
+ vpscatterqd dword ptr [r14 + 8*ymm31 - 123] {k1}, xmm29
-// CHECK: vpscatterqd xmmword ptr [r14 + 8*ymm31 - 123] {k1}, xmm29
+// CHECK: vpscatterqd dword ptr [r14 + 8*ymm31 - 123] {k1}, xmm29
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa1,0xac,0xfe,0x85,0xff,0xff,0xff]
- vpscatterqd xmmword ptr [r14 + 8*ymm31 - 123] {k1}, xmm29
+ vpscatterqd dword ptr [r14 + 8*ymm31 - 123] {k1}, xmm29
-// CHECK: vpscatterqd xmmword ptr [r9 + ymm31 + 256] {k1}, xmm29
+// CHECK: vpscatterqd dword ptr [r9 + ymm31 + 256] {k1}, xmm29
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa1,0x6c,0x39,0x40]
- vpscatterqd xmmword ptr [r9 + ymm31 + 256] {k1}, xmm29
+ vpscatterqd dword ptr [r9 + ymm31 + 256] {k1}, xmm29
-// CHECK: vpscatterqd xmmword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm29
+// CHECK: vpscatterqd dword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm29
 // CHECK: encoding: [0x62,0x22,0x7d,0x21,0xa1,0xac,0xb9,0x00,0x04,0x00,0x00]
- vpscatterqd xmmword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm29
+ vpscatterqd dword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm29
-// CHECK: vpscatterqq xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28
+// CHECK: vpscatterqq qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28
 // CHECK: encoding: [0x62,0x02,0xfd,0x01,0xa1,0xa4,0xfe,0x7b,0x00,0x00,0x00]
- vpscatterqq xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28
+ vpscatterqq qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28
-// CHECK: vpscatterqq xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28
+// CHECK: vpscatterqq qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28
 // CHECK: encoding: [0x62,0x02,0xfd,0x01,0xa1,0xa4,0xfe,0x7b,0x00,0x00,0x00]
- vpscatterqq xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28
+ vpscatterqq qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28
-// CHECK: vpscatterqq xmmword ptr [r9 + xmm31 + 256] {k1}, xmm28
+// CHECK: vpscatterqq qword ptr [r9 + xmm31 + 256] {k1}, xmm28
 // CHECK: encoding: [0x62,0x02,0xfd,0x01,0xa1,0x64,0x39,0x20]
- vpscatterqq xmmword ptr [r9 + xmm31 + 256] {k1}, xmm28
+ vpscatterqq qword ptr [r9 + xmm31 + 256] {k1}, xmm28
-// CHECK: vpscatterqq xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28
+// CHECK: vpscatterqq qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28
 // CHECK: encoding: [0x62,0x22,0xfd,0x01,0xa1,0xa4,0xb9,0x00,0x04,0x00,0x00]
- vpscatterqq xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28
+ vpscatterqq qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28
-// CHECK: vpscatterqq ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm19
+// CHECK: vpscatterqq qword ptr [r14 + 8*ymm31 + 123] {k1}, ymm19
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa1,0x9c,0xfe,0x7b,0x00,0x00,0x00]
- vpscatterqq ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm19
+ vpscatterqq qword ptr [r14 + 8*ymm31 + 123] {k1}, ymm19
-// CHECK: vpscatterqq ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm19
+// CHECK: vpscatterqq qword ptr [r14 + 8*ymm31 + 123] {k1}, ymm19
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa1,0x9c,0xfe,0x7b,0x00,0x00,0x00]
- vpscatterqq ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm19
+ vpscatterqq qword ptr [r14 + 8*ymm31 + 123] {k1}, ymm19
-// CHECK: vpscatterqq ymmword ptr [r9 + ymm31 + 256] {k1}, ymm19
+// CHECK: vpscatterqq qword ptr [r9 + ymm31 + 256] {k1}, ymm19
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa1,0x5c,0x39,0x20]
- vpscatterqq ymmword ptr [r9 + ymm31 + 256] {k1}, ymm19
+ vpscatterqq qword ptr [r9 + ymm31 + 256] {k1}, ymm19
-// CHECK: vpscatterqq ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm19
+// CHECK: vpscatterqq qword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm19
 // CHECK: encoding: [0x62,0xa2,0xfd,0x21,0xa1,0x9c,0xb9,0x00,0x04,0x00,0x00]
- vpscatterqq ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm19
+ vpscatterqq qword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm19
-// CHECK: vpscatterqq xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm24
+// CHECK: vpscatterqq
qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm24 // CHECK: encoding: [0x62,0x02,0xfd,0x01,0xa1,0x84,0xfe,0x85,0xff,0xff,0xff] - vpscatterqq xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm24 + vpscatterqq qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm24 -// CHECK: vpscatterqq xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm24 +// CHECK: vpscatterqq qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm24 // CHECK: encoding: [0x62,0x02,0xfd,0x01,0xa1,0x84,0xfe,0x85,0xff,0xff,0xff] - vpscatterqq xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm24 + vpscatterqq qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm24 -// CHECK: vpscatterqq xmmword ptr [r9 + xmm31 + 256] {k1}, xmm24 +// CHECK: vpscatterqq qword ptr [r9 + xmm31 + 256] {k1}, xmm24 // CHECK: encoding: [0x62,0x02,0xfd,0x01,0xa1,0x44,0x39,0x20] - vpscatterqq xmmword ptr [r9 + xmm31 + 256] {k1}, xmm24 + vpscatterqq qword ptr [r9 + xmm31 + 256] {k1}, xmm24 -// CHECK: vpscatterqq xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm24 +// CHECK: vpscatterqq qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm24 // CHECK: encoding: [0x62,0x22,0xfd,0x01,0xa1,0x84,0xb9,0x00,0x04,0x00,0x00] - vpscatterqq xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm24 + vpscatterqq qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm24 -// CHECK: vpscatterqq ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm17 +// CHECK: vpscatterqq qword ptr [r14 + 8*ymm31 - 123] {k1}, ymm17 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa1,0x8c,0xfe,0x85,0xff,0xff,0xff] - vpscatterqq ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm17 + vpscatterqq qword ptr [r14 + 8*ymm31 - 123] {k1}, ymm17 -// CHECK: vpscatterqq ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm17 +// CHECK: vpscatterqq qword ptr [r14 + 8*ymm31 - 123] {k1}, ymm17 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa1,0x8c,0xfe,0x85,0xff,0xff,0xff] - vpscatterqq ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm17 + vpscatterqq qword ptr [r14 + 8*ymm31 - 123] {k1}, ymm17 -// CHECK: vpscatterqq ymmword ptr [r9 + ymm31 + 256] {k1}, ymm17 +// CHECK: vpscatterqq qword ptr [r9 + ymm31 + 256] {k1}, ymm17 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa1,0x4c,0x39,0x20] - vpscatterqq ymmword ptr [r9 + ymm31 + 256] {k1}, ymm17 + vpscatterqq qword ptr [r9 + ymm31 + 256] {k1}, ymm17 -// CHECK: vpscatterqq ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm17 +// CHECK: vpscatterqq qword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm17 // CHECK: encoding: [0x62,0xa2,0xfd,0x21,0xa1,0x8c,0xb9,0x00,0x04,0x00,0x00] - vpscatterqq ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm17 + vpscatterqq qword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm17 -// CHECK: vscatterdpd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm18 +// CHECK: vscatterdpd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm18 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa2,0x94,0xfe,0x7b,0x00,0x00,0x00] - vscatterdpd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm18 + vscatterdpd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm18 -// CHECK: vscatterdpd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm18 +// CHECK: vscatterdpd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm18 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa2,0x94,0xfe,0x7b,0x00,0x00,0x00] - vscatterdpd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm18 + vscatterdpd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm18 -// CHECK: vscatterdpd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm18 +// CHECK: vscatterdpd qword ptr [r9 + xmm31 + 256] {k1}, xmm18 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa2,0x54,0x39,0x20] - vscatterdpd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm18 + vscatterdpd qword ptr [r9 + xmm31 + 256] {k1}, xmm18 -// CHECK: vscatterdpd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm18 +// 
CHECK: vscatterdpd qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm18 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0xa2,0x94,0xb9,0x00,0x04,0x00,0x00] - vscatterdpd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm18 + vscatterdpd qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm18 -// CHECK: vscatterdpd ymmword ptr [r14 + 8*xmm31 + 123] {k1}, ymm30 +// CHECK: vscatterdpd qword ptr [r14 + 8*xmm31 + 123] {k1}, ymm30 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa2,0xb4,0xfe,0x7b,0x00,0x00,0x00] - vscatterdpd ymmword ptr [r14 + 8*xmm31 + 123] {k1}, ymm30 + vscatterdpd qword ptr [r14 + 8*xmm31 + 123] {k1}, ymm30 -// CHECK: vscatterdpd ymmword ptr [r14 + 8*xmm31 + 123] {k1}, ymm30 +// CHECK: vscatterdpd qword ptr [r14 + 8*xmm31 + 123] {k1}, ymm30 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa2,0xb4,0xfe,0x7b,0x00,0x00,0x00] - vscatterdpd ymmword ptr [r14 + 8*xmm31 + 123] {k1}, ymm30 + vscatterdpd qword ptr [r14 + 8*xmm31 + 123] {k1}, ymm30 -// CHECK: vscatterdpd ymmword ptr [r9 + xmm31 + 256] {k1}, ymm30 +// CHECK: vscatterdpd qword ptr [r9 + xmm31 + 256] {k1}, ymm30 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa2,0x74,0x39,0x20] - vscatterdpd ymmword ptr [r9 + xmm31 + 256] {k1}, ymm30 + vscatterdpd qword ptr [r9 + xmm31 + 256] {k1}, ymm30 -// CHECK: vscatterdpd ymmword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm30 +// CHECK: vscatterdpd qword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm30 // CHECK: encoding: [0x62,0x22,0xfd,0x21,0xa2,0xb4,0xb9,0x00,0x04,0x00,0x00] - vscatterdpd ymmword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm30 + vscatterdpd qword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm30 -// CHECK: vscatterdpd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19 +// CHECK: vscatterdpd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa2,0x9c,0xfe,0x85,0xff,0xff,0xff] - vscatterdpd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19 + vscatterdpd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19 -// CHECK: vscatterdpd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19 +// CHECK: vscatterdpd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa2,0x9c,0xfe,0x85,0xff,0xff,0xff] - vscatterdpd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19 + vscatterdpd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19 -// CHECK: vscatterdpd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm19 +// CHECK: vscatterdpd qword ptr [r9 + xmm31 + 256] {k1}, xmm19 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa2,0x5c,0x39,0x20] - vscatterdpd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm19 + vscatterdpd qword ptr [r9 + xmm31 + 256] {k1}, xmm19 -// CHECK: vscatterdpd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm19 +// CHECK: vscatterdpd qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm19 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0xa2,0x9c,0xb9,0x00,0x04,0x00,0x00] - vscatterdpd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm19 + vscatterdpd qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm19 -// CHECK: vscatterdpd ymmword ptr [r14 + 8*xmm31 - 123] {k1}, ymm26 +// CHECK: vscatterdpd qword ptr [r14 + 8*xmm31 - 123] {k1}, ymm26 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa2,0x94,0xfe,0x85,0xff,0xff,0xff] - vscatterdpd ymmword ptr [r14 + 8*xmm31 - 123] {k1}, ymm26 + vscatterdpd qword ptr [r14 + 8*xmm31 - 123] {k1}, ymm26 -// CHECK: vscatterdpd ymmword ptr [r14 + 8*xmm31 - 123] {k1}, ymm26 +// CHECK: vscatterdpd qword ptr [r14 + 8*xmm31 - 123] {k1}, ymm26 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa2,0x94,0xfe,0x85,0xff,0xff,0xff] - vscatterdpd ymmword ptr [r14 + 8*xmm31 - 123] {k1}, ymm26 + vscatterdpd qword ptr [r14 + 8*xmm31 - 123] {k1}, ymm26 -// CHECK: vscatterdpd 
ymmword ptr [r9 + xmm31 + 256] {k1}, ymm26 +// CHECK: vscatterdpd qword ptr [r9 + xmm31 + 256] {k1}, ymm26 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa2,0x54,0x39,0x20] - vscatterdpd ymmword ptr [r9 + xmm31 + 256] {k1}, ymm26 + vscatterdpd qword ptr [r9 + xmm31 + 256] {k1}, ymm26 -// CHECK: vscatterdpd ymmword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm26 +// CHECK: vscatterdpd qword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm26 // CHECK: encoding: [0x62,0x22,0xfd,0x21,0xa2,0x94,0xb9,0x00,0x04,0x00,0x00] - vscatterdpd ymmword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm26 + vscatterdpd qword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm26 -// CHECK: vscatterdps xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm24 +// CHECK: vscatterdps dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm24 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa2,0x84,0xfe,0x7b,0x00,0x00,0x00] - vscatterdps xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm24 + vscatterdps dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm24 -// CHECK: vscatterdps xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm24 +// CHECK: vscatterdps dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm24 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa2,0x84,0xfe,0x7b,0x00,0x00,0x00] - vscatterdps xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm24 + vscatterdps dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm24 -// CHECK: vscatterdps xmmword ptr [r9 + xmm31 + 256] {k1}, xmm24 +// CHECK: vscatterdps dword ptr [r9 + xmm31 + 256] {k1}, xmm24 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa2,0x44,0x39,0x40] - vscatterdps xmmword ptr [r9 + xmm31 + 256] {k1}, xmm24 + vscatterdps dword ptr [r9 + xmm31 + 256] {k1}, xmm24 -// CHECK: vscatterdps xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm24 +// CHECK: vscatterdps dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm24 // CHECK: encoding: [0x62,0x22,0x7d,0x01,0xa2,0x84,0xb9,0x00,0x04,0x00,0x00] - vscatterdps xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm24 + vscatterdps dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm24 -// CHECK: vscatterdps ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23 +// CHECK: vscatterdps dword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0xa2,0xbc,0xfe,0x7b,0x00,0x00,0x00] - vscatterdps ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23 + vscatterdps dword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23 -// CHECK: vscatterdps ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23 +// CHECK: vscatterdps dword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0xa2,0xbc,0xfe,0x7b,0x00,0x00,0x00] - vscatterdps ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23 + vscatterdps dword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23 -// CHECK: vscatterdps ymmword ptr [r9 + ymm31 + 256] {k1}, ymm23 +// CHECK: vscatterdps dword ptr [r9 + ymm31 + 256] {k1}, ymm23 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0xa2,0x7c,0x39,0x40] - vscatterdps ymmword ptr [r9 + ymm31 + 256] {k1}, ymm23 + vscatterdps dword ptr [r9 + ymm31 + 256] {k1}, ymm23 -// CHECK: vscatterdps ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm23 +// CHECK: vscatterdps dword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm23 // CHECK: encoding: [0x62,0xa2,0x7d,0x21,0xa2,0xbc,0xb9,0x00,0x04,0x00,0x00] - vscatterdps ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm23 + vscatterdps dword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm23 -// CHECK: vscatterdps xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28 +// CHECK: vscatterdps dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa2,0xa4,0xfe,0x85,0xff,0xff,0xff] - vscatterdps xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28 + vscatterdps dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28 -// 
CHECK: vscatterdps xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28 +// CHECK: vscatterdps dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa2,0xa4,0xfe,0x85,0xff,0xff,0xff] - vscatterdps xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28 + vscatterdps dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28 -// CHECK: vscatterdps xmmword ptr [r9 + xmm31 + 256] {k1}, xmm28 +// CHECK: vscatterdps dword ptr [r9 + xmm31 + 256] {k1}, xmm28 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa2,0x64,0x39,0x40] - vscatterdps xmmword ptr [r9 + xmm31 + 256] {k1}, xmm28 + vscatterdps dword ptr [r9 + xmm31 + 256] {k1}, xmm28 -// CHECK: vscatterdps xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28 +// CHECK: vscatterdps dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28 // CHECK: encoding: [0x62,0x22,0x7d,0x01,0xa2,0xa4,0xb9,0x00,0x04,0x00,0x00] - vscatterdps xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28 + vscatterdps dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28 -// CHECK: vscatterdps ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm25 +// CHECK: vscatterdps dword ptr [r14 + 8*ymm31 - 123] {k1}, ymm25 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa2,0x8c,0xfe,0x85,0xff,0xff,0xff] - vscatterdps ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm25 + vscatterdps dword ptr [r14 + 8*ymm31 - 123] {k1}, ymm25 -// CHECK: vscatterdps ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm25 +// CHECK: vscatterdps dword ptr [r14 + 8*ymm31 - 123] {k1}, ymm25 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa2,0x8c,0xfe,0x85,0xff,0xff,0xff] - vscatterdps ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm25 + vscatterdps dword ptr [r14 + 8*ymm31 - 123] {k1}, ymm25 -// CHECK: vscatterdps ymmword ptr [r9 + ymm31 + 256] {k1}, ymm25 +// CHECK: vscatterdps dword ptr [r9 + ymm31 + 256] {k1}, ymm25 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa2,0x4c,0x39,0x40] - vscatterdps ymmword ptr [r9 + ymm31 + 256] {k1}, ymm25 + vscatterdps dword ptr [r9 + ymm31 + 256] {k1}, ymm25 -// CHECK: vscatterdps ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm25 +// CHECK: vscatterdps dword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm25 // CHECK: encoding: [0x62,0x22,0x7d,0x21,0xa2,0x8c,0xb9,0x00,0x04,0x00,0x00] - vscatterdps ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm25 + vscatterdps dword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm25 -// CHECK: vscatterqpd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21 +// CHECK: vscatterqpd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa3,0xac,0xfe,0x7b,0x00,0x00,0x00] - vscatterqpd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21 + vscatterqpd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21 -// CHECK: vscatterqpd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21 +// CHECK: vscatterqpd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa3,0xac,0xfe,0x7b,0x00,0x00,0x00] - vscatterqpd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21 + vscatterqpd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21 -// CHECK: vscatterqpd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm21 +// CHECK: vscatterqpd qword ptr [r9 + xmm31 + 256] {k1}, xmm21 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa3,0x6c,0x39,0x20] - vscatterqpd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm21 + vscatterqpd qword ptr [r9 + xmm31 + 256] {k1}, xmm21 -// CHECK: vscatterqpd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm21 +// CHECK: vscatterqpd qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm21 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0xa3,0xac,0xb9,0x00,0x04,0x00,0x00] - vscatterqpd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm21 + vscatterqpd qword ptr [rcx + 4*xmm31 + 
1024] {k1}, xmm21 -// CHECK: vscatterqpd ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23 +// CHECK: vscatterqpd qword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa3,0xbc,0xfe,0x7b,0x00,0x00,0x00] - vscatterqpd ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23 + vscatterqpd qword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23 -// CHECK: vscatterqpd ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23 +// CHECK: vscatterqpd qword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa3,0xbc,0xfe,0x7b,0x00,0x00,0x00] - vscatterqpd ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23 + vscatterqpd qword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23 -// CHECK: vscatterqpd ymmword ptr [r9 + ymm31 + 256] {k1}, ymm23 +// CHECK: vscatterqpd qword ptr [r9 + ymm31 + 256] {k1}, ymm23 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa3,0x7c,0x39,0x20] - vscatterqpd ymmword ptr [r9 + ymm31 + 256] {k1}, ymm23 + vscatterqpd qword ptr [r9 + ymm31 + 256] {k1}, ymm23 -// CHECK: vscatterqpd ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm23 +// CHECK: vscatterqpd qword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm23 // CHECK: encoding: [0x62,0xa2,0xfd,0x21,0xa3,0xbc,0xb9,0x00,0x04,0x00,0x00] - vscatterqpd ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm23 + vscatterqpd qword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm23 -// CHECK: vscatterqpd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19 +// CHECK: vscatterqpd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa3,0x9c,0xfe,0x85,0xff,0xff,0xff] - vscatterqpd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19 + vscatterqpd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19 -// CHECK: vscatterqpd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19 +// CHECK: vscatterqpd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa3,0x9c,0xfe,0x85,0xff,0xff,0xff] - vscatterqpd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19 + vscatterqpd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19 -// CHECK: vscatterqpd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm19 +// CHECK: vscatterqpd qword ptr [r9 + xmm31 + 256] {k1}, xmm19 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa3,0x5c,0x39,0x20] - vscatterqpd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm19 + vscatterqpd qword ptr [r9 + xmm31 + 256] {k1}, xmm19 -// CHECK: vscatterqpd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm19 +// CHECK: vscatterqpd qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm19 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0xa3,0x9c,0xb9,0x00,0x04,0x00,0x00] - vscatterqpd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm19 + vscatterqpd qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm19 -// CHECK: vscatterqpd ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm29 +// CHECK: vscatterqpd qword ptr [r14 + 8*ymm31 - 123] {k1}, ymm29 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa3,0xac,0xfe,0x85,0xff,0xff,0xff] - vscatterqpd ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm29 + vscatterqpd qword ptr [r14 + 8*ymm31 - 123] {k1}, ymm29 -// CHECK: vscatterqpd ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm29 +// CHECK: vscatterqpd qword ptr [r14 + 8*ymm31 - 123] {k1}, ymm29 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa3,0xac,0xfe,0x85,0xff,0xff,0xff] - vscatterqpd ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm29 + vscatterqpd qword ptr [r14 + 8*ymm31 - 123] {k1}, ymm29 -// CHECK: vscatterqpd ymmword ptr [r9 + ymm31 + 256] {k1}, ymm29 +// CHECK: vscatterqpd qword ptr [r9 + ymm31 + 256] {k1}, ymm29 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa3,0x6c,0x39,0x20] - vscatterqpd ymmword ptr [r9 + ymm31 + 256] {k1}, ymm29 + vscatterqpd qword ptr 
[r9 + ymm31 + 256] {k1}, ymm29 -// CHECK: vscatterqpd ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm29 +// CHECK: vscatterqpd qword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm29 // CHECK: encoding: [0x62,0x22,0xfd,0x21,0xa3,0xac,0xb9,0x00,0x04,0x00,0x00] - vscatterqpd ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm29 + vscatterqpd qword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm29 -// CHECK: vscatterqps qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28 +// CHECK: vscatterqps dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa3,0xa4,0xfe,0x7b,0x00,0x00,0x00] - vscatterqps qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28 + vscatterqps dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28 -// CHECK: vscatterqps qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28 +// CHECK: vscatterqps dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa3,0xa4,0xfe,0x7b,0x00,0x00,0x00] - vscatterqps qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28 + vscatterqps dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28 -// CHECK: vscatterqps qword ptr [r9 + xmm31 + 256] {k1}, xmm28 +// CHECK: vscatterqps dword ptr [r9 + xmm31 + 256] {k1}, xmm28 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa3,0x64,0x39,0x40] - vscatterqps qword ptr [r9 + xmm31 + 256] {k1}, xmm28 + vscatterqps dword ptr [r9 + xmm31 + 256] {k1}, xmm28 -// CHECK: vscatterqps qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28 +// CHECK: vscatterqps dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28 // CHECK: encoding: [0x62,0x22,0x7d,0x01,0xa3,0xa4,0xb9,0x00,0x04,0x00,0x00] - vscatterqps qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28 + vscatterqps dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28 -// CHECK: vscatterqps xmmword ptr [r14 + 8*ymm31 + 123] {k1}, xmm25 +// CHECK: vscatterqps dword ptr [r14 + 8*ymm31 + 123] {k1}, xmm25 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa3,0x8c,0xfe,0x7b,0x00,0x00,0x00] - vscatterqps xmmword ptr [r14 + 8*ymm31 + 123] {k1}, xmm25 + vscatterqps dword ptr [r14 + 8*ymm31 + 123] {k1}, xmm25 -// CHECK: vscatterqps xmmword ptr [r14 + 8*ymm31 + 123] {k1}, xmm25 +// CHECK: vscatterqps dword ptr [r14 + 8*ymm31 + 123] {k1}, xmm25 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa3,0x8c,0xfe,0x7b,0x00,0x00,0x00] - vscatterqps xmmword ptr [r14 + 8*ymm31 + 123] {k1}, xmm25 + vscatterqps dword ptr [r14 + 8*ymm31 + 123] {k1}, xmm25 -// CHECK: vscatterqps xmmword ptr [r9 + ymm31 + 256] {k1}, xmm25 +// CHECK: vscatterqps dword ptr [r9 + ymm31 + 256] {k1}, xmm25 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa3,0x4c,0x39,0x40] - vscatterqps xmmword ptr [r9 + ymm31 + 256] {k1}, xmm25 + vscatterqps dword ptr [r9 + ymm31 + 256] {k1}, xmm25 -// CHECK: vscatterqps xmmword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm25 +// CHECK: vscatterqps dword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm25 // CHECK: encoding: [0x62,0x22,0x7d,0x21,0xa3,0x8c,0xb9,0x00,0x04,0x00,0x00] - vscatterqps xmmword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm25 + vscatterqps dword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm25 -// CHECK: vscatterqps qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm27 +// CHECK: vscatterqps dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm27 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa3,0x9c,0xfe,0x85,0xff,0xff,0xff] - vscatterqps qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm27 + vscatterqps dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm27 -// CHECK: vscatterqps qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm27 +// CHECK: vscatterqps dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm27 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa3,0x9c,0xfe,0x85,0xff,0xff,0xff] - vscatterqps qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm27 + 
vscatterqps dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm27 -// CHECK: vscatterqps qword ptr [r9 + xmm31 + 256] {k1}, xmm27 +// CHECK: vscatterqps dword ptr [r9 + xmm31 + 256] {k1}, xmm27 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa3,0x5c,0x39,0x40] - vscatterqps qword ptr [r9 + xmm31 + 256] {k1}, xmm27 + vscatterqps dword ptr [r9 + xmm31 + 256] {k1}, xmm27 -// CHECK: vscatterqps qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm27 +// CHECK: vscatterqps dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm27 // CHECK: encoding: [0x62,0x22,0x7d,0x01,0xa3,0x9c,0xb9,0x00,0x04,0x00,0x00] - vscatterqps qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm27 + vscatterqps dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm27 -// CHECK: vscatterqps xmmword ptr [r14 + 8*ymm31 - 123] {k1}, xmm23 +// CHECK: vscatterqps dword ptr [r14 + 8*ymm31 - 123] {k1}, xmm23 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0xa3,0xbc,0xfe,0x85,0xff,0xff,0xff] - vscatterqps xmmword ptr [r14 + 8*ymm31 - 123] {k1}, xmm23 + vscatterqps dword ptr [r14 + 8*ymm31 - 123] {k1}, xmm23 -// CHECK: vscatterqps xmmword ptr [r14 + 8*ymm31 - 123] {k1}, xmm23 +// CHECK: vscatterqps dword ptr [r14 + 8*ymm31 - 123] {k1}, xmm23 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0xa3,0xbc,0xfe,0x85,0xff,0xff,0xff] - vscatterqps xmmword ptr [r14 + 8*ymm31 - 123] {k1}, xmm23 + vscatterqps dword ptr [r14 + 8*ymm31 - 123] {k1}, xmm23 -// CHECK: vscatterqps xmmword ptr [r9 + ymm31 + 256] {k1}, xmm23 +// CHECK: vscatterqps dword ptr [r9 + ymm31 + 256] {k1}, xmm23 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0xa3,0x7c,0x39,0x40] - vscatterqps xmmword ptr [r9 + ymm31 + 256] {k1}, xmm23 + vscatterqps dword ptr [r9 + ymm31 + 256] {k1}, xmm23 -// CHECK: vscatterqps xmmword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm23 +// CHECK: vscatterqps dword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm23 // CHECK: encoding: [0x62,0xa2,0x7d,0x21,0xa3,0xbc,0xb9,0x00,0x04,0x00,0x00] - vscatterqps xmmword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm23 + vscatterqps dword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm23 // CHECK: vcvtpd2ps xmm0, xmm23 // CHECK: encoding: [0x62,0xb1,0xfd,0x08,0x5a,0xc7] diff --git a/llvm/test/MC/X86/intel-syntax.s b/llvm/test/MC/X86/intel-syntax.s index 2b365699eec7b..c622832d24bea 100644 --- a/llvm/test/MC/X86/intel-syntax.s +++ b/llvm/test/MC/X86/intel-syntax.s @@ -144,7 +144,7 @@ main: // CHECK: vshufpd $1, %xmm2, %xmm1, %xmm0 vshufpd XMM0, XMM1, XMM2, 1 // CHECK: vpgatherdd %xmm8, (%r15,%xmm9,2), %xmm1 - vpgatherdd XMM10, XMMWORD PTR [R15 + 2*XMM9], XMM8 + vpgatherdd XMM10, DWORD PTR [R15 + 2*XMM9], XMM8 // CHECK: movsd -8, %xmm5 movsd XMM5, QWORD PTR [-8] // CHECK: movsl (%rsi), %es:(%rdi) diff --git a/llvm/utils/TableGen/X86RecognizableInstr.cpp b/llvm/utils/TableGen/X86RecognizableInstr.cpp index c6cd3da13646a..607a6bd27c21f 100644 --- a/llvm/utils/TableGen/X86RecognizableInstr.cpp +++ b/llvm/utils/TableGen/X86RecognizableInstr.cpp @@ -1147,19 +1147,16 @@ OperandType RecognizableInstr::typeFromString(const std::string &s, TYPE("VK4Pair", TYPE_VK_PAIR) TYPE("VK8Pair", TYPE_VK_PAIR) TYPE("VK16Pair", TYPE_VK_PAIR) + TYPE("vx32mem", TYPE_MVSIBX) TYPE("vx64mem", TYPE_MVSIBX) - TYPE("vx128mem", TYPE_MVSIBX) - TYPE("vx256mem", TYPE_MVSIBX) - TYPE("vy128mem", TYPE_MVSIBY) - TYPE("vy256mem", TYPE_MVSIBY) + TYPE("vy32mem", TYPE_MVSIBY) + TYPE("vy64mem", TYPE_MVSIBY) + TYPE("vx32xmem", TYPE_MVSIBX) TYPE("vx64xmem", TYPE_MVSIBX) - TYPE("vx128xmem", TYPE_MVSIBX) - TYPE("vx256xmem", TYPE_MVSIBX) - TYPE("vy128xmem", TYPE_MVSIBY) - TYPE("vy256xmem", TYPE_MVSIBY) - TYPE("vy512xmem", TYPE_MVSIBY) - TYPE("vz256mem", TYPE_MVSIBZ) - 
TYPE("vz512mem", TYPE_MVSIBZ) + TYPE("vy32xmem", TYPE_MVSIBY) + TYPE("vy64xmem", TYPE_MVSIBY) + TYPE("vz32mem", TYPE_MVSIBZ) + TYPE("vz64mem", TYPE_MVSIBZ) TYPE("BNDR", TYPE_BNDR) TYPE("TILE", TYPE_TMM) TYPE("TILEPair", TYPE_TMM_PAIR) @@ -1372,19 +1369,16 @@ RecognizableInstr::memoryEncodingFromString(const std::string &s, ENCODING("anymem", ENCODING_RM) ENCODING("opaquemem", ENCODING_RM) ENCODING("sibmem", ENCODING_SIB) + ENCODING("vx32mem", ENCODING_VSIB) ENCODING("vx64mem", ENCODING_VSIB) - ENCODING("vx128mem", ENCODING_VSIB) - ENCODING("vx256mem", ENCODING_VSIB) - ENCODING("vy128mem", ENCODING_VSIB) - ENCODING("vy256mem", ENCODING_VSIB) + ENCODING("vy32mem", ENCODING_VSIB) + ENCODING("vy64mem", ENCODING_VSIB) + ENCODING("vx32xmem", ENCODING_VSIB) ENCODING("vx64xmem", ENCODING_VSIB) - ENCODING("vx128xmem", ENCODING_VSIB) - ENCODING("vx256xmem", ENCODING_VSIB) - ENCODING("vy128xmem", ENCODING_VSIB) - ENCODING("vy256xmem", ENCODING_VSIB) - ENCODING("vy512xmem", ENCODING_VSIB) - ENCODING("vz256mem", ENCODING_VSIB) - ENCODING("vz512mem", ENCODING_VSIB) + ENCODING("vy32xmem", ENCODING_VSIB) + ENCODING("vy64xmem", ENCODING_VSIB) + ENCODING("vz32mem", ENCODING_VSIB) + ENCODING("vz64mem", ENCODING_VSIB) errs() << "Unhandled memory encoding " << s << "\n"; llvm_unreachable("Unhandled memory encoding"); } From b39c5cb6977f35ad727d86b2dd6232099734ffd3 Mon Sep 17 00:00:00 2001 From: William Moses Date: Mon, 13 Jan 2025 12:37:16 -0600 Subject: [PATCH 311/408] [MLIR][LLVM] Fix inlining of a single block ending with unreachable (#122646) alternate option to https://github.com/llvm/llvm-project/pull/122615 --- mlir/include/mlir/Transforms/InliningUtils.h | 10 ++++++++++ .../LLVMIR/Transforms/InlinerInterfaceImpl.cpp | 8 ++++++++ mlir/lib/Transforms/Utils/InliningUtils.cpp | 16 +++++++++++++++- mlir/test/Dialect/LLVMIR/inlining.mlir | 16 ++++++++++++++++ 4 files changed, 49 insertions(+), 1 deletion(-) diff --git a/mlir/include/mlir/Transforms/InliningUtils.h b/mlir/include/mlir/Transforms/InliningUtils.h index 88fc033a6ab7b..becfe9b047ef4 100644 --- a/mlir/include/mlir/Transforms/InliningUtils.h +++ b/mlir/include/mlir/Transforms/InliningUtils.h @@ -176,6 +176,13 @@ class DialectInlinerInterface /// is invoked before inlined terminator operations have been processed. virtual void processInlinedCallBlocks( Operation *call, iterator_range inlinedBlocks) const {} + + /// Returns true if the inliner can assume a fast path of not creating a new + /// block, if there is only one block. + virtual bool allowSingleBlockOptimization( + iterator_range inlinedBlocks) const { + return true; + } }; /// This interface provides the hooks into the inlining interface. 
---
 mlir/include/mlir/Transforms/InliningUtils.h | 10 ++++++++++
 .../LLVMIR/Transforms/InlinerInterfaceImpl.cpp | 8 ++++++++
 mlir/lib/Transforms/Utils/InliningUtils.cpp | 16 +++++++++++++++-
 mlir/test/Dialect/LLVMIR/inlining.mlir | 16 ++++++++++++++++
 4 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Transforms/InliningUtils.h b/mlir/include/mlir/Transforms/InliningUtils.h
index 88fc033a6ab7b..becfe9b047ef4 100644
--- a/mlir/include/mlir/Transforms/InliningUtils.h
+++ b/mlir/include/mlir/Transforms/InliningUtils.h
@@ -176,6 +176,13 @@ class DialectInlinerInterface
   /// is invoked before inlined terminator operations have been processed.
   virtual void processInlinedCallBlocks(
       Operation *call, iterator_range<Region::iterator> inlinedBlocks) const {}
+
+  /// Returns true if the inliner can assume a fast path of not creating a new
+  /// block, if there is only one block.
+  virtual bool allowSingleBlockOptimization(
+      iterator_range<Region::iterator> inlinedBlocks) const {
+    return true;
+  }
 };
 
 /// This interface provides the hooks into the inlining interface.
@@ -223,6 +230,9 @@ class InlinerInterface
   virtual void processInlinedCallBlocks(
       Operation *call, iterator_range<Region::iterator> inlinedBlocks) const;
+
+  virtual bool allowSingleBlockOptimization(
+      iterator_range<Region::iterator> inlinedBlocks) const;
 };
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp b/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp
index 233cadebeec02..79dd3e3069648 100644
--- a/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp
@@ -744,6 +744,14 @@ struct LLVMInlinerInterface : public DialectInlinerInterface {
     op->erase();
   }
 
+  bool allowSingleBlockOptimization(
+      iterator_range<Region::iterator> inlinedBlocks) const final {
+    if (!inlinedBlocks.empty() &&
+        isa<LLVM::UnreachableOp>(inlinedBlocks.begin()->getTerminator()))
+      return false;
+    return true;
+  }
+
   /// Handle the given inlined return by replacing the uses of the call with the
   /// operands of the return. This overload is called when the inlined region
   /// only contains one block.
diff --git a/mlir/lib/Transforms/Utils/InliningUtils.cpp b/mlir/lib/Transforms/Utils/InliningUtils.cpp
index 0db097d14cd3c..0cae63c58ca7b 100644
--- a/mlir/lib/Transforms/Utils/InliningUtils.cpp
+++ b/mlir/lib/Transforms/Utils/InliningUtils.cpp
@@ -118,6 +118,18 @@ void InlinerInterface::handleTerminator(Operation *op,
   handler->handleTerminator(op, valuesToRepl);
 }
 
+/// Returns true if the inliner can assume a fast path of not creating a
+/// new block, if there is only one block.
+bool InlinerInterface::allowSingleBlockOptimization(
+    iterator_range<Region::iterator> inlinedBlocks) const {
+  if (inlinedBlocks.empty()) {
+    return true;
+  }
+  auto *handler = getInterfaceFor(inlinedBlocks.begin()->getParentOp());
+  assert(handler && "expected valid dialect handler");
+  return handler->allowSingleBlockOptimization(inlinedBlocks);
+}
+
 Value InlinerInterface::handleArgument(OpBuilder &builder, Operation *call,
                                        Operation *callable, Value argument,
                                        DictionaryAttr argumentAttrs) const {
@@ -294,8 +306,10 @@ inlineRegionImpl(InlinerInterface &interface, Region *src, Block *inlineBlock,
   interface.processInlinedCallBlocks(call, newBlocks);
   interface.processInlinedBlocks(newBlocks);
 
+  bool singleBlockFastPath = interface.allowSingleBlockOptimization(newBlocks);
+
   // Handle the case where only a single block was inlined.
-  if (std::next(newBlocks.begin()) == newBlocks.end()) {
+  if (singleBlockFastPath && std::next(newBlocks.begin()) == newBlocks.end()) {
     // Run the result attribute handler on the terminator operands.
     Operation *firstBlockTerminator = firstNewBlock->getTerminator();
     builder.setInsertionPoint(firstBlockTerminator);
diff --git a/mlir/test/Dialect/LLVMIR/inlining.mlir b/mlir/test/Dialect/LLVMIR/inlining.mlir
index edaac4da0b044..eb249a4771753 100644
--- a/mlir/test/Dialect/LLVMIR/inlining.mlir
+++ b/mlir/test/Dialect/LLVMIR/inlining.mlir
@@ -676,3 +676,19 @@ llvm.func @caller(%x : i32) -> i32 {
   %z = llvm.call @private_func(%x) : (i32) -> (i32)
   llvm.return %z : i32
 }
+
+// -----
+
+llvm.func @unreachable_func(%a : i32) -> i32 {
+  "llvm.intr.trap"() : () -> ()
+  llvm.unreachable
+}
+
+// CHECK-LABEL: func @caller
+llvm.func @caller(%x : i32) -> i32 {
+  // CHECK-NOT: llvm.call @unreachable_func
+  // CHECK: llvm.intr.trap
+  // CHECK: llvm.unreachable
+  %z = llvm.call @unreachable_func(%x) : (i32) -> (i32)
+  llvm.return %z : i32
+}

From 16e45b8fac797c6d4ba161228b54665492204a9d Mon Sep 17 00:00:00 2001
From: Momchil Velikov
Date: Mon, 13 Jan 2025 18:53:07 +0000
Subject: [PATCH 312/408] [AArch64] Implement FP8 SVE/SME reinterpret
 intrinsics (#121063)
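As a rough usage sketch (intrinsic spellings taken from the test added
below; the overloaded forms drop the type suffix): a reinterpret between
same-sized element types is a pure no-op in the generated IR, and one
between differently sized element types lowers to a single bitcast.

  #include <arm_sve.h>

  // Illustrative helpers, not part of this patch: round-trip an FP8
  // vector through unsigned bytes; both directions are bit-for-bit.
  svuint8_t fp8_to_bytes(svmfloat8_t v) { return svreinterpret_u8_mf8(v); }
  svmfloat8_t bytes_to_fp8(svuint8_t v) { return svreinterpret_mf8_u8(v); }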
---
 .../acle_sve2_fp8_reinterpret.c | 3182 +++++++++++++++++
 clang/utils/TableGen/SveEmitter.cpp | 5 +-
 2 files changed, 3185 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_reinterpret.c

diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_reinterpret.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_reinterpret.c
new file mode 100644
index 0000000000000..7c70bcf6b4d66
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_reinterpret.c
@@ -0,0 +1,3182 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s
+// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s -check-prefix=CHECK-CXX
+
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s
+// RUN: %clang_cc1 -x c++ -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s -check-prefix=CHECK-CXX
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +fp8 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +fp8 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
+
+#ifdef __ARM_FEATURE_SME
+#include <arm_sme.h>
+#else
+#include <arm_sve.h>
+#endif
+
+#ifdef SVE_OVERLOADED_FORMS
+#define SVE_ACLE_FUNC(A1, A2_UNUSED) A1
+#else
+#define SVE_ACLE_FUNC(A1, A2) A1##A2
+#endif
+
+#ifdef __ARM_FEATURE_SME
+#define STREAMING __arm_streaming
+#else
+#define STREAMING
+#endif
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svreinterpret_s8_mf8(
+// CHECK-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[OP]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 16 x i8> @_Z25test_svreinterpret_s8_mf8u13__SVMfloat8_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[OP]]
+//
+svint8_t test_svreinterpret_s8_mf8(svmfloat8_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_s8, _mf8)(op);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svreinterpret_u8_mf8(
+// CHECK-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[OP]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 16 x i8> @_Z25test_svreinterpret_u8_mf8u13__SVMfloat8_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[OP]]
+//
+svuint8_t test_svreinterpret_u8_mf8(svmfloat8_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_u8, _mf8)(op);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svreinterpret_mf8_s8(
+// CHECK-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[OP]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 16 x i8> @_Z25test_svreinterpret_mf8_s8u10__SVInt8_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[OP]]
+//
+svmfloat8_t test_svreinterpret_mf8_s8(svint8_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _s8)(op);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svreinterpret_mf8_u8(
+// CHECK-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[OP]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 16 x i8> @_Z25test_svreinterpret_mf8_u8u11__SVUint8_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[OP]]
+//
+svmfloat8_t test_svreinterpret_mf8_u8(svuint8_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _u8)(op);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svreinterpret_mf8_mf8(
+// CHECK-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[OP]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 16 x i8> @_Z26test_svreinterpret_mf8_mf8u13__SVMfloat8_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[OP]]
+//
+svmfloat8_t test_svreinterpret_mf8_mf8(svmfloat8_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _mf8)(op);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svreinterpret_mf8_s16(
+// CHECK-SAME: <vscale x 8 x i16> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP]] to <vscale x 16 x i8>
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 16 x i8> @_Z26test_svreinterpret_mf8_s16u11__SVInt16_t(
+// CHECK-CXX-SAME: <vscale x 8 x i16> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+svmfloat8_t test_svreinterpret_mf8_s16(svint16_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _s16)(op);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svreinterpret_mf8_u16(
+// CHECK-SAME: <vscale x 8 x i16> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP]] to <vscale x 16 x i8>
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 16 x i8> @_Z26test_svreinterpret_mf8_u16u12__SVUint16_t(
+// CHECK-CXX-SAME: <vscale x 8 x i16> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+svmfloat8_t test_svreinterpret_mf8_u16(svuint16_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _u16)(op);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svreinterpret_mf8_s32(
+// CHECK-SAME: <vscale x 4 x i32> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP]] to <vscale x 16 x i8>
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 16 x i8>
@_Z26test_svreinterpret_mf8_s32u11__SVInt32_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svmfloat8_t test_svreinterpret_mf8_s32(svint32_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _s32)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_mf8_u32( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-NEXT: ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z26test_svreinterpret_mf8_u32u12__SVUint32_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svmfloat8_t test_svreinterpret_mf8_u32(svuint32_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _u32)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_mf8_s64( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-NEXT: ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z26test_svreinterpret_mf8_s64u11__SVInt64_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svmfloat8_t test_svreinterpret_mf8_s64(svint64_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _s64)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_mf8_u64( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-NEXT: ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z26test_svreinterpret_mf8_u64u12__SVUint64_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svmfloat8_t test_svreinterpret_mf8_u64(svuint64_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _u64)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_mf8_f16( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-NEXT: ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z26test_svreinterpret_mf8_f16u13__SVFloat16_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svmfloat8_t test_svreinterpret_mf8_f16(svfloat16_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _f16)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_mf8_bf16( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-NEXT: ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z27test_svreinterpret_mf8_bf16u14__SVBfloat16_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svmfloat8_t test_svreinterpret_mf8_bf16(svbfloat16_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _bf16)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_mf8_f32( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-NEXT: ret [[TMP0]] +// +// 
CHECK-CXX-LABEL: define dso_local @_Z26test_svreinterpret_mf8_f32u13__SVFloat32_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svmfloat8_t test_svreinterpret_mf8_f32(svfloat32_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _f32)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_mf8_f64( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-NEXT: ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z26test_svreinterpret_mf8_f64u13__SVFloat64_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svmfloat8_t test_svreinterpret_mf8_f64(svfloat64_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _f64)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_s16_mf8( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-NEXT: ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z26test_svreinterpret_s16_mf8u13__SVMfloat8_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svint16_t test_svreinterpret_s16_mf8(svmfloat8_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_s16, _mf8)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_u16_mf8( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-NEXT: ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z26test_svreinterpret_u16_mf8u13__SVMfloat8_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svuint16_t test_svreinterpret_u16_mf8(svmfloat8_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_u16, _mf8)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_s32_mf8( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-NEXT: ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z26test_svreinterpret_s32_mf8u13__SVMfloat8_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svint32_t test_svreinterpret_s32_mf8(svmfloat8_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_s32, _mf8)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_u32_mf8( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-NEXT: ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z26test_svreinterpret_u32_mf8u13__SVMfloat8_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svuint32_t test_svreinterpret_u32_mf8(svmfloat8_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_u32, _mf8)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_s64_mf8( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// 
CHECK-NEXT: ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z26test_svreinterpret_s64_mf8u13__SVMfloat8_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svint64_t test_svreinterpret_s64_mf8(svmfloat8_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_s64, _mf8)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_u64_mf8( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-NEXT: ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z26test_svreinterpret_u64_mf8u13__SVMfloat8_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svuint64_t test_svreinterpret_u64_mf8(svmfloat8_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_u64, _mf8)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_f16_mf8( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-NEXT: ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z26test_svreinterpret_f16_mf8u13__SVMfloat8_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svfloat16_t test_svreinterpret_f16_mf8(svmfloat8_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_f16, _mf8)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_bf16_mf8( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-NEXT: ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z27test_svreinterpret_bf16_mf8u13__SVMfloat8_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svreinterpret_bf16_mf8(svmfloat8_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_bf16, _mf8)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_f32_mf8( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-NEXT: ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z26test_svreinterpret_f32_mf8u13__SVMfloat8_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svfloat32_t test_svreinterpret_f32_mf8(svmfloat8_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_f32, _mf8)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_f64_mf8( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-NEXT: ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z26test_svreinterpret_f64_mf8u13__SVMfloat8_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svfloat64_t test_svreinterpret_f64_mf8(svmfloat8_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_f64, _mf8)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_s8_mf8_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// 
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , } [[TMP3]], [[TMP4]], 1 +// CHECK-NEXT: ret { , } [[TMP5]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z28test_svreinterpret_s8_mf8_x213svmfloat8x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , } [[TMP3]], [[TMP4]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP5]] +// +svint8x2_t test_svreinterpret_s8_mf8_x2(svmfloat8x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_s8, _mf8_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_u8_mf8_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , } [[TMP3]], [[TMP4]], 1 +// CHECK-NEXT: ret { , } [[TMP5]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z28test_svreinterpret_u8_mf8_x213svmfloat8x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , } [[TMP3]], [[TMP4]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP5]] +// +svuint8x2_t test_svreinterpret_u8_mf8_x2(svmfloat8x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_u8, _mf8_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_mf8_s8_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , } [[TMP3]], [[TMP4]], 1 +// CHECK-NEXT: ret { , } [[TMP5]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z28test_svreinterpret_mf8_s8_x210svint8x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: 
[[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , } [[TMP3]], [[TMP4]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP5]] +// +svmfloat8x2_t test_svreinterpret_mf8_s8_x2(svint8x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _s8_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_mf8_u8_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , } [[TMP3]], [[TMP4]], 1 +// CHECK-NEXT: ret { , } [[TMP5]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z28test_svreinterpret_mf8_u8_x211svuint8x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , } [[TMP3]], [[TMP4]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP5]] +// +svmfloat8x2_t test_svreinterpret_mf8_u8_x2(svuint8x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _u8_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_mf8_mf8_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , } [[TMP3]], [[TMP4]], 1 +// CHECK-NEXT: ret { , } [[TMP5]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z29test_svreinterpret_mf8_mf8_x213svmfloat8x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , } [[TMP3]], [[TMP4]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP5]] +// +svmfloat8x2_t test_svreinterpret_mf8_mf8_x2(svmfloat8x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _mf8_x2)(op); +} + +// +// CHECK-LABEL: define dso_local { 
+// CHECK-SAME: <vscale x 8 x i16> [[OP_COERCE0:%.*]], <vscale x 8 x i16> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } poison, <vscale x 8 x i16> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], <vscale x 8 x i16> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <vscale x 8 x i16> [[TMP2]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <vscale x 8 x i16> [[TMP5]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP6]], 1
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP7]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_s16_x211svint16x2_t(
+// CHECK-CXX-SAME: <vscale x 8 x i16> [[OP_COERCE0:%.*]], <vscale x 8 x i16> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } poison, <vscale x 8 x i16> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], <vscale x 8 x i16> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast <vscale x 8 x i16> [[TMP2]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast <vscale x 8 x i16> [[TMP5]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP6]], 1
+// CHECK-CXX-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP7]]
+//
+svmfloat8x2_t test_svreinterpret_mf8_s16_x2(svint16x2_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _s16_x2)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_u16_x2(
+// CHECK-SAME: <vscale x 8 x i16> [[OP_COERCE0:%.*]], <vscale x 8 x i16> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } poison, <vscale x 8 x i16> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], <vscale x 8 x i16> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <vscale x 8 x i16> [[TMP2]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <vscale x 8 x i16> [[TMP5]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP6]], 1
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP7]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_u16_x212svuint16x2_t(
+// CHECK-CXX-SAME: <vscale x 8 x i16> [[OP_COERCE0:%.*]], <vscale x 8 x i16> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } poison, <vscale x 8 x i16> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], <vscale x 8 x i16> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast <vscale x 8 x i16> [[TMP2]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast <vscale x 8 x i16> [[TMP5]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP6]], 1
+// CHECK-CXX-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP7]]
+//
+svmfloat8x2_t test_svreinterpret_mf8_u16_x2(svuint16x2_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _u16_x2)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_s32_x2(
+// CHECK-SAME: <vscale x 4 x i32> [[OP_COERCE0:%.*]], <vscale x 4 x i32> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } poison, <vscale x 4 x i32> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], <vscale x 4 x i32> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <vscale x 4 x i32> [[TMP2]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <vscale x 4 x i32> [[TMP5]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP6]], 1
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP7]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_s32_x211svint32x2_t(
+// CHECK-CXX-SAME: <vscale x 4 x i32> [[OP_COERCE0:%.*]], <vscale x 4 x i32> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } poison, <vscale x 4 x i32> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], <vscale x 4 x i32> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast <vscale x 4 x i32> [[TMP2]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast <vscale x 4 x i32> [[TMP5]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP6]], 1
+// CHECK-CXX-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP7]]
+//
+svmfloat8x2_t test_svreinterpret_mf8_s32_x2(svint32x2_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _s32_x2)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_u32_x2(
+// CHECK-SAME: <vscale x 4 x i32> [[OP_COERCE0:%.*]], <vscale x 4 x i32> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } poison, <vscale x 4 x i32> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], <vscale x 4 x i32> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <vscale x 4 x i32> [[TMP2]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <vscale x 4 x i32> [[TMP5]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP6]], 1
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP7]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_u32_x212svuint32x2_t(
+// CHECK-CXX-SAME: <vscale x 4 x i32> [[OP_COERCE0:%.*]], <vscale x 4 x i32> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } poison, <vscale x 4 x i32> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], <vscale x 4 x i32> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast <vscale x 4 x i32> [[TMP2]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast <vscale x 4 x i32> [[TMP5]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP6]], 1
+// CHECK-CXX-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP7]]
+//
+svmfloat8x2_t test_svreinterpret_mf8_u32_x2(svuint32x2_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _u32_x2)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_s64_x2(
+// CHECK-SAME: <vscale x 2 x i64> [[OP_COERCE0:%.*]], <vscale x 2 x i64> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } poison, <vscale x 2 x i64> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], <vscale x 2 x i64> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <vscale x 2 x i64> [[TMP2]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <vscale x 2 x i64> [[TMP5]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP6]], 1
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP7]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_s64_x211svint64x2_t(
+// CHECK-CXX-SAME: <vscale x 2 x i64> [[OP_COERCE0:%.*]], <vscale x 2 x i64> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } poison, <vscale x 2 x i64> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], <vscale x 2 x i64> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast <vscale x 2 x i64> [[TMP2]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast <vscale x 2 x i64> [[TMP5]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP6]], 1
+// CHECK-CXX-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP7]]
+//
+svmfloat8x2_t test_svreinterpret_mf8_s64_x2(svint64x2_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _s64_x2)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_u64_x2(
+// CHECK-SAME: <vscale x 2 x i64> [[OP_COERCE0:%.*]], <vscale x 2 x i64> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } poison, <vscale x 2 x i64> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], <vscale x 2 x i64> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <vscale x 2 x i64> [[TMP2]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <vscale x 2 x i64> [[TMP5]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP6]], 1
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP7]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_u64_x212svuint64x2_t(
+// CHECK-CXX-SAME: <vscale x 2 x i64> [[OP_COERCE0:%.*]], <vscale x 2 x i64> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } poison, <vscale x 2 x i64> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], <vscale x 2 x i64> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast <vscale x 2 x i64> [[TMP2]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast <vscale x 2 x i64> [[TMP5]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP6]], 1
+// CHECK-CXX-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP7]]
+//
+svmfloat8x2_t test_svreinterpret_mf8_u64_x2(svuint64x2_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _u64_x2)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_f16_x2(
+// CHECK-SAME: <vscale x 8 x half> [[OP_COERCE0:%.*]], <vscale x 8 x half> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half> } poison, <vscale x 8 x half> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], <vscale x 8 x half> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <vscale x 8 x half> [[TMP2]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <vscale x 8 x half> [[TMP5]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP6]], 1
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP7]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_f16_x213svfloat16x2_t(
+// CHECK-CXX-SAME: <vscale x 8 x half> [[OP_COERCE0:%.*]], <vscale x 8 x half> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half> } poison, <vscale x 8 x half> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], <vscale x 8 x half> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 0
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast <vscale x 8 x half> [[TMP2]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 1
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast <vscale x 8 x half> [[TMP5]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP6]], 1
+// CHECK-CXX-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP7]]
+//
+svmfloat8x2_t test_svreinterpret_mf8_f16_x2(svfloat16x2_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _f16_x2)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_bf16_x2(
+// CHECK-SAME: <vscale x 8 x bfloat> [[OP_COERCE0:%.*]], <vscale x 8 x bfloat> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } poison, <vscale x 8 x bfloat> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], <vscale x 8 x bfloat> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <vscale x 8 x bfloat> [[TMP2]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <vscale x 8 x bfloat> [[TMP5]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP6]], 1
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP7]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z30test_svreinterpret_mf8_bf16_x214svbfloat16x2_t(
+// CHECK-CXX-SAME: <vscale x 8 x bfloat> [[OP_COERCE0:%.*]], <vscale x 8 x bfloat> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } poison, <vscale x 8 x bfloat> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], <vscale x 8 x bfloat> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP1]], 0
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast <vscale x 8 x bfloat> [[TMP2]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP1]], 1
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast <vscale x 8 x bfloat> [[TMP5]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP6]], 1
+// CHECK-CXX-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP7]]
+//
+svmfloat8x2_t test_svreinterpret_mf8_bf16_x2(svbfloat16x2_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _bf16_x2)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_f32_x2(
+// CHECK-SAME: <vscale x 4 x float> [[OP_COERCE0:%.*]], <vscale x 4 x float> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 4 x float>, <vscale x 4 x float> } poison, <vscale x 4 x float> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], <vscale x 4 x float> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <vscale x 4 x float> [[TMP2]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <vscale x 4 x float> [[TMP5]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP6]], 1
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP7]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_f32_x213svfloat32x2_t(
+// CHECK-CXX-SAME: <vscale x 4 x float> [[OP_COERCE0:%.*]], <vscale x 4 x float> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 4 x float>, <vscale x 4 x float> } poison, <vscale x 4 x float> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], <vscale x 4 x float> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 0
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast <vscale x 4 x float> [[TMP2]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 1
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast <vscale x 4 x float> [[TMP5]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP6]], 1
+// CHECK-CXX-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP7]]
+//
+svmfloat8x2_t test_svreinterpret_mf8_f32_x2(svfloat32x2_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _f32_x2)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_f64_x2(
+// CHECK-SAME: <vscale x 2 x double> [[OP_COERCE0:%.*]], <vscale x 2 x double> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 2 x double>, <vscale x 2 x double> } poison, <vscale x 2 x double> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], <vscale x 2 x double> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <vscale x 2 x double> [[TMP2]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <vscale x 2 x double> [[TMP5]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP6]], 1
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP7]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_f64_x213svfloat64x2_t(
+// CHECK-CXX-SAME: <vscale x 2 x double> [[OP_COERCE0:%.*]], <vscale x 2 x double> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 2 x double>, <vscale x 2 x double> } poison, <vscale x 2 x double> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], <vscale x 2 x double> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 0
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast <vscale x 2 x double> [[TMP2]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 1
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast <vscale x 2 x double> [[TMP5]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP6]], 1
+// CHECK-CXX-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP7]]
+//
+svmfloat8x2_t test_svreinterpret_mf8_f64_x2(svfloat64x2_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _f64_x2)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16> } @test_svreinterpret_s16_mf8_x2(
+// CHECK-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <vscale x 16 x i8> [[TMP2]] to <vscale x 8 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } poison, <vscale x 8 x i16> [[TMP3]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <vscale x 16 x i8> [[TMP5]] to <vscale x 8 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], <vscale x 8 x i16> [[TMP6]], 1
+// CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP7]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16> } @_Z29test_svreinterpret_s16_mf8_x213svmfloat8x2_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast <vscale x 16 x i8> [[TMP2]] to <vscale x 8 x i16>
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } poison, <vscale x 8 x i16> [[TMP3]], 0
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast <vscale x 16 x i8> [[TMP5]] to <vscale x 8 x i16>
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], <vscale x 8 x i16> [[TMP6]], 1
+// CHECK-CXX-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP7]]
+//
+svint16x2_t test_svreinterpret_s16_mf8_x2(svmfloat8x2_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_s16, _mf8_x2)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16> } @test_svreinterpret_u16_mf8_x2(
+// CHECK-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <vscale x 16 x i8> [[TMP2]] to <vscale x 8 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } poison, <vscale x 8 x i16> [[TMP3]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <vscale x 16 x i8> [[TMP5]] to <vscale x 8 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], <vscale x 8 x i16> [[TMP6]], 1
+// CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP7]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16> } @_Z29test_svreinterpret_u16_mf8_x213svmfloat8x2_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast <vscale x 16 x i8> [[TMP2]] to <vscale x 8 x i16>
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } poison, <vscale x 8 x i16> [[TMP3]], 0
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast <vscale x 16 x i8> [[TMP5]] to <vscale x 8 x i16>
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], <vscale x 8 x i16> [[TMP6]], 1
+// CHECK-CXX-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP7]]
+//
+svuint16x2_t test_svreinterpret_u16_mf8_x2(svmfloat8x2_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_u16, _mf8_x2)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 4 x i32>, <vscale x 4 x i32> } @test_svreinterpret_s32_mf8_x2(
+// CHECK-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <vscale x 16 x i8> [[TMP2]] to <vscale x 4 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } poison, <vscale x 4 x i32> [[TMP3]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <vscale x 16 x i8> [[TMP5]] to <vscale x 4 x i32>
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], <vscale x 4 x i32> [[TMP6]], 1
+// CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP7]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 4 x i32>, <vscale x 4 x i32> } @_Z29test_svreinterpret_s32_mf8_x213svmfloat8x2_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast <vscale x 16 x i8> [[TMP2]] to <vscale x 4 x i32>
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } poison, <vscale x 4 x i32> [[TMP3]], 0
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast <vscale x 16 x i8> [[TMP5]] to <vscale x 4 x i32>
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], <vscale x 4 x i32> [[TMP6]], 1
+// CHECK-CXX-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP7]]
+//
+svint32x2_t test_svreinterpret_s32_mf8_x2(svmfloat8x2_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_s32, _mf8_x2)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 4 x i32>, <vscale x 4 x i32> } @test_svreinterpret_u32_mf8_x2(
+// CHECK-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <vscale x 16 x i8> [[TMP2]] to <vscale x 4 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } poison, <vscale x 4 x i32> [[TMP3]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <vscale x 16 x i8> [[TMP5]] to <vscale x 4 x i32>
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], <vscale x 4 x i32> [[TMP6]], 1
+// CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP7]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 4 x i32>, <vscale x 4 x i32> } @_Z29test_svreinterpret_u32_mf8_x213svmfloat8x2_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast <vscale x 16 x i8> [[TMP2]] to <vscale x 4 x i32>
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } poison, <vscale x 4 x i32> [[TMP3]], 0
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast <vscale x 16 x i8> [[TMP5]] to <vscale x 4 x i32>
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], <vscale x 4 x i32> [[TMP6]], 1
+// CHECK-CXX-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP7]]
+//
+svuint32x2_t test_svreinterpret_u32_mf8_x2(svmfloat8x2_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_u32, _mf8_x2)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 2 x i64>, <vscale x 2 x i64> } @test_svreinterpret_s64_mf8_x2(
+// CHECK-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <vscale x 16 x i8> [[TMP2]] to <vscale x 2 x i64>
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } poison, <vscale x 2 x i64> [[TMP3]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <vscale x 16 x i8> [[TMP5]] to <vscale x 2 x i64>
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], <vscale x 2 x i64> [[TMP6]], 1
+// CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP7]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 2 x i64>, <vscale x 2 x i64> } @_Z29test_svreinterpret_s64_mf8_x213svmfloat8x2_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast <vscale x 16 x i8> [[TMP2]] to <vscale x 2 x i64>
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } poison, <vscale x 2 x i64> [[TMP3]], 0
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast <vscale x 16 x i8> [[TMP5]] to <vscale x 2 x i64>
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], <vscale x 2 x i64> [[TMP6]], 1
+// CHECK-CXX-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP7]]
+//
+svint64x2_t test_svreinterpret_s64_mf8_x2(svmfloat8x2_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_s64, _mf8_x2)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 2 x i64>, <vscale x 2 x i64> } @test_svreinterpret_u64_mf8_x2(
+// CHECK-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <vscale x 16 x i8> [[TMP2]] to <vscale x 2 x i64>
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } poison, <vscale x 2 x i64> [[TMP3]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <vscale x 16 x i8> [[TMP5]] to <vscale x 2 x i64>
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], <vscale x 2 x i64> [[TMP6]], 1
+// CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP7]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 2 x i64>, <vscale x 2 x i64> } @_Z29test_svreinterpret_u64_mf8_x213svmfloat8x2_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast <vscale x 16 x i8> [[TMP2]] to <vscale x 2 x i64>
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } poison, <vscale x 2 x i64> [[TMP3]], 0
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast <vscale x 16 x i8> [[TMP5]] to <vscale x 2 x i64>
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], <vscale x 2 x i64> [[TMP6]], 1
+// CHECK-CXX-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP7]]
+//
+svuint64x2_t test_svreinterpret_u64_mf8_x2(svmfloat8x2_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_u64, _mf8_x2)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 8 x half>, <vscale x 8 x half> } @test_svreinterpret_f16_mf8_x2(
+// CHECK-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <vscale x 16 x i8> [[TMP2]] to <vscale x 8 x half>
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half> } poison, <vscale x 8 x half> [[TMP3]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <vscale x 16 x i8> [[TMP5]] to <vscale x 8 x half>
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP4]], <vscale x 8 x half> [[TMP6]], 1
+// CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP7]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 8 x half>, <vscale x 8 x half> } @_Z29test_svreinterpret_f16_mf8_x213svmfloat8x2_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast <vscale x 16 x i8> [[TMP2]] to <vscale x 8 x half>
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half> } poison, <vscale x 8 x half> [[TMP3]], 0
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast <vscale x 16 x i8> [[TMP5]] to <vscale x 8 x half>
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP4]], <vscale x 8 x half> [[TMP6]], 1
+// CHECK-CXX-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP7]]
+//
+svfloat16x2_t test_svreinterpret_f16_mf8_x2(svmfloat8x2_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_f16, _mf8_x2)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_svreinterpret_bf16_mf8_x2(
+// CHECK-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <vscale x 16 x i8> [[TMP2]] to <vscale x 8 x bfloat>
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } poison, <vscale x 8 x bfloat> [[TMP3]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <vscale x 16 x i8> [[TMP5]] to <vscale x 8 x bfloat>
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], <vscale x 8 x bfloat> [[TMP6]], 1
+// CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP7]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @_Z30test_svreinterpret_bf16_mf8_x213svmfloat8x2_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast <vscale x 16 x i8> [[TMP2]] to <vscale x 8 x bfloat>
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } poison, <vscale x 8 x bfloat> [[TMP3]], 0
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast <vscale x 16 x i8> [[TMP5]] to <vscale x 8 x bfloat>
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], <vscale x 8 x bfloat> [[TMP6]], 1
+// CHECK-CXX-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP7]]
+//
+svbfloat16x2_t test_svreinterpret_bf16_mf8_x2(svmfloat8x2_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_bf16, _mf8_x2)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 4 x float>, <vscale x 4 x float> } @test_svreinterpret_f32_mf8_x2(
+// CHECK-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <vscale x 16 x i8> [[TMP2]] to <vscale x 4 x float>
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 4 x float>, <vscale x 4 x float> } poison, <vscale x 4 x float> [[TMP3]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <vscale x 16 x i8> [[TMP5]] to <vscale x 4 x float>
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], <vscale x 4 x float> [[TMP6]], 1
+// CHECK-NEXT: ret { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP7]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 4 x float>, <vscale x 4 x float> } @_Z29test_svreinterpret_f32_mf8_x213svmfloat8x2_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast <vscale x 16 x i8> [[TMP2]] to <vscale x 4 x float>
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 4 x float>, <vscale x 4 x float> } poison, <vscale x 4 x float> [[TMP3]], 0
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast <vscale x 16 x i8> [[TMP5]] to <vscale x 4 x float>
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], <vscale x 4 x float> [[TMP6]], 1
+// CHECK-CXX-NEXT: ret { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP7]]
+//
+svfloat32x2_t test_svreinterpret_f32_mf8_x2(svmfloat8x2_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_f32, _mf8_x2)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 2 x double>, <vscale x 2 x double> } @test_svreinterpret_f64_mf8_x2(
+// CHECK-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <vscale x 16 x i8> [[TMP2]] to <vscale x 2 x double>
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 2 x double>, <vscale x 2 x double> } poison, <vscale x 2 x double> [[TMP3]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <vscale x 16 x i8> [[TMP5]] to <vscale x 2 x double>
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP4]], <vscale x 2 x double> [[TMP6]], 1
+// CHECK-NEXT: ret { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP7]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 2 x double>, <vscale x 2 x double> } @_Z29test_svreinterpret_f64_mf8_x213svmfloat8x2_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast <vscale x 16 x i8> [[TMP2]] to <vscale x 2 x double>
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 2 x double>, <vscale x 2 x double> } poison, <vscale x 2 x double> [[TMP3]], 0
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast <vscale x 16 x i8> [[TMP5]] to <vscale x 2 x double>
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP4]], <vscale x 2 x double> [[TMP6]], 1
+// CHECK-CXX-NEXT: ret { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP7]]
+//
+svfloat64x2_t test_svreinterpret_f64_mf8_x2(svmfloat8x2_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_f64, _mf8_x2)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_s8_mf8_x3(
+// CHECK-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]], <vscale x 16 x i8> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], <vscale x 16 x i8> [[OP_COERCE2]], 2
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP5]], 1
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 2
+// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP6]], <vscale x 16 x i8> [[TMP7]], 2
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP8]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z28test_svreinterpret_s8_mf8_x313svmfloat8x3_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]], <vscale x 16 x i8> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], <vscale x 16 x i8> [[OP_COERCE2]], 2
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP5]], 1
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 2
+// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP6]], <vscale x 16 x i8> [[TMP7]], 2
+// CHECK-CXX-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP8]]
+//
+svint8x3_t test_svreinterpret_s8_mf8_x3(svmfloat8x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_s8, _mf8_x3)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_u8_mf8_x3(
+// CHECK-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]], <vscale x 16 x i8> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], <vscale x 16 x i8> [[OP_COERCE2]], 2
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP5]], 1
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 2
+// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP6]], <vscale x 16 x i8> [[TMP7]], 2
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP8]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z28test_svreinterpret_u8_mf8_x313svmfloat8x3_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]], <vscale x 16 x i8> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], <vscale x 16 x i8> [[OP_COERCE2]], 2
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP5]], 1
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 2
+// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP6]], <vscale x 16 x i8> [[TMP7]], 2
+// CHECK-CXX-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP8]]
+//
+svuint8x3_t test_svreinterpret_u8_mf8_x3(svmfloat8x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_u8, _mf8_x3)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_s8_x3(
+// CHECK-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]], <vscale x 16 x i8> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], <vscale x 16 x i8> [[OP_COERCE2]], 2
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP5]], 1
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 2
+// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP6]], <vscale x 16 x i8> [[TMP7]], 2
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP8]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z28test_svreinterpret_mf8_s8_x310svint8x3_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]], <vscale x 16 x i8> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], <vscale x 16 x i8> [[OP_COERCE2]], 2
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP5]], 1
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 2
+// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP6]], <vscale x 16 x i8> [[TMP7]], 2
+// CHECK-CXX-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP8]]
+//
+svmfloat8x3_t test_svreinterpret_mf8_s8_x3(svint8x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _s8_x3)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_u8_x3(
+// CHECK-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]], <vscale x 16 x i8> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], <vscale x 16 x i8> [[OP_COERCE2]], 2
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP5]], 1
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 2
+// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP6]], <vscale x 16 x i8> [[TMP7]], 2
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP8]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z28test_svreinterpret_mf8_u8_x311svuint8x3_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]], <vscale x 16 x i8> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], <vscale x 16 x i8> [[OP_COERCE2]], 2
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP5]], 1
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 2
+// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP6]], <vscale x 16 x i8> [[TMP7]], 2
+// CHECK-CXX-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP8]]
+//
+svmfloat8x3_t test_svreinterpret_mf8_u8_x3(svuint8x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _u8_x3)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_mf8_x3(
+// CHECK-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]], <vscale x 16 x i8> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], <vscale x 16 x i8> [[OP_COERCE2]], 2
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP5]], 1
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 2
+// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP6]], <vscale x 16 x i8> [[TMP7]], 2
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP8]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_mf8_x313svmfloat8x3_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]], <vscale x 16 x i8> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], <vscale x 16 x i8> [[OP_COERCE2]], 2
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP5]], 1
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 2
+// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP6]], <vscale x 16 x i8> [[TMP7]], 2
+// CHECK-CXX-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP8]]
+//
+svmfloat8x3_t test_svreinterpret_mf8_mf8_x3(svmfloat8x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _mf8_x3)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_s16_x3(
+// CHECK-SAME: <vscale x 8 x i16> [[OP_COERCE0:%.*]], <vscale x 8 x i16> [[OP_COERCE1:%.*]], <vscale x 8 x i16> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } poison, <vscale x 8 x i16> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], <vscale x 8 x i16> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], <vscale x 8 x i16> [[OP_COERCE2]], 2
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <vscale x 8 x i16> [[TMP3]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP4]], 0
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <vscale x 8 x i16> [[TMP6]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]], <vscale x 16 x i8> [[TMP7]], 1
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 2
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <vscale x 8 x i16> [[TMP9]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP8]], <vscale x 16 x i8> [[TMP10]], 2
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP11]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_s16_x311svint16x3_t(
+// CHECK-CXX-SAME: <vscale x 8 x i16> [[OP_COERCE0:%.*]], <vscale x 8 x i16> [[OP_COERCE1:%.*]], <vscale x 8 x i16> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } poison, <vscale x 8 x i16> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], <vscale x 8 x i16> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], <vscale x 8 x i16> [[OP_COERCE2]], 2
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 0
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast <vscale x 8 x i16> [[TMP3]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP4]], 0
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 1
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast <vscale x 8 x i16> [[TMP6]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]], <vscale x 16 x i8> [[TMP7]], 1
+// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 2
+// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast <vscale x 8 x i16> [[TMP9]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP8]], <vscale x 16 x i8> [[TMP10]], 2
+// CHECK-CXX-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP11]]
+//
+svmfloat8x3_t test_svreinterpret_mf8_s16_x3(svint16x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _s16_x3)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_u16_x3(
+// CHECK-SAME: <vscale x 8 x i16> [[OP_COERCE0:%.*]], <vscale x 8 x i16> [[OP_COERCE1:%.*]], <vscale x 8 x i16> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } poison, <vscale x 8 x i16> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], <vscale x 8 x i16> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], <vscale x 8 x i16> [[OP_COERCE2]], 2
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <vscale x 8 x i16> [[TMP3]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP4]], 0
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <vscale x 8 x i16> [[TMP6]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]], <vscale x 16 x i8> [[TMP7]], 1
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 2
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <vscale x 8 x i16> [[TMP9]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP8]], <vscale x 16 x i8> [[TMP10]], 2
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP11]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_u16_x312svuint16x3_t(
+// CHECK-CXX-SAME: <vscale x 8 x i16> [[OP_COERCE0:%.*]], <vscale x 8 x i16> [[OP_COERCE1:%.*]], <vscale x 8 x i16> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } poison, <vscale x 8 x i16> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], <vscale x 8 x i16> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], <vscale x 8 x i16> [[OP_COERCE2]], 2
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 0
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast <vscale x 8 x i16> [[TMP3]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP4]], 0
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 1
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast <vscale x 8 x i16> [[TMP6]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]], <vscale x 16 x i8> [[TMP7]], 1
+// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 2
+// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast <vscale x 8 x i16> [[TMP9]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP8]], <vscale x 16 x i8> [[TMP10]], 2
+// CHECK-CXX-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP11]]
+//
+svmfloat8x3_t test_svreinterpret_mf8_u16_x3(svuint16x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _u16_x3)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_s32_x3(
+// CHECK-SAME: <vscale x 4 x i32> [[OP_COERCE0:%.*]], <vscale x 4 x i32> [[OP_COERCE1:%.*]], <vscale x 4 x i32> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } poison, <vscale x 4 x i32> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], <vscale x 4 x i32> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], <vscale x 4 x i32> [[OP_COERCE2]], 2
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <vscale x 4 x i32> [[TMP3]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP4]], 0
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <vscale x 4 x i32> [[TMP6]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]], <vscale x 16 x i8> [[TMP7]], 1
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 2
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <vscale x 4 x i32> [[TMP9]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP8]], <vscale x 16 x i8> [[TMP10]], 2
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP11]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_s32_x311svint32x3_t(
+// CHECK-CXX-SAME: <vscale x 4 x i32> [[OP_COERCE0:%.*]], <vscale x 4 x i32> [[OP_COERCE1:%.*]], <vscale x 4 x i32> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } poison, <vscale x 4 x i32> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], <vscale x 4 x i32> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], <vscale x 4 x i32> [[OP_COERCE2]], 2
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast <vscale x 4 x i32> [[TMP3]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP4]], 0
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast <vscale x 4 x i32> [[TMP6]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]], <vscale x 16 x i8> [[TMP7]], 1
+// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 2
+// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast <vscale x 4 x i32> [[TMP9]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP8]], <vscale x 16 x i8> [[TMP10]], 2
+// CHECK-CXX-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP11]]
+//
+svmfloat8x3_t test_svreinterpret_mf8_s32_x3(svint32x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _s32_x3)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_u32_x3(
+// CHECK-SAME: <vscale x 4 x i32> [[OP_COERCE0:%.*]], <vscale x 4 x i32> [[OP_COERCE1:%.*]], <vscale x 4 x i32> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } poison, <vscale x 4 x i32> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], <vscale x 4 x i32> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], <vscale x 4 x i32> [[OP_COERCE2]], 2
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <vscale x 4 x i32> [[TMP3]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP4]], 0
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <vscale x 4 x i32> [[TMP6]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]], <vscale x 16 x i8> [[TMP7]], 1
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 2
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <vscale x 4 x i32> [[TMP9]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP8]], <vscale x 16 x i8> [[TMP10]], 2
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP11]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_u32_x312svuint32x3_t(
+// CHECK-CXX-SAME: <vscale x 4 x i32> [[OP_COERCE0:%.*]], <vscale x 4 x i32> [[OP_COERCE1:%.*]], <vscale x 4 x i32> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } poison, <vscale x 4 x i32> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], <vscale x 4 x i32> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], <vscale x 4 x i32> [[OP_COERCE2]], 2
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast <vscale x 4 x i32> [[TMP3]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP4]], 0
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast <vscale x 4 x i32> [[TMP6]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]], <vscale x 16 x i8> [[TMP7]], 1
+// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 2
+// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast <vscale x 4 x i32> [[TMP9]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP8]], <vscale x 16 x i8> [[TMP10]], 2
+// CHECK-CXX-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP11]]
+//
+svmfloat8x3_t test_svreinterpret_mf8_u32_x3(svuint32x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _u32_x3)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_s64_x3(
+// CHECK-SAME: <vscale x 2 x i64> [[OP_COERCE0:%.*]], <vscale x 2 x i64> [[OP_COERCE1:%.*]], <vscale x 2 x i64> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } poison, <vscale x 2 x i64> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], <vscale x 2 x i64> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], <vscale x 2 x i64> [[OP_COERCE2]], 2
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <vscale x 2 x i64> [[TMP3]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP4]], 0
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <vscale x 2 x i64> [[TMP6]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]], <vscale x 16 x i8> [[TMP7]], 1
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 2
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <vscale x 2 x i64> [[TMP9]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP8]], <vscale x 16 x i8> [[TMP10]], 2
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP11]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_s64_x311svint64x3_t(
+// CHECK-CXX-SAME: <vscale x 2 x i64> [[OP_COERCE0:%.*]], <vscale x 2 x i64> [[OP_COERCE1:%.*]], <vscale x 2 x i64> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } poison, <vscale x 2 x i64> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], <vscale x 2 x i64> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], <vscale x 2 x i64> [[OP_COERCE2]], 2
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast <vscale x 2 x i64> [[TMP3]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP4]], 0
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast <vscale x 2 x i64> [[TMP6]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]], <vscale x 16 x i8> [[TMP7]], 1
+// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 2
+// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast <vscale x 2 x i64> [[TMP9]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP8]], <vscale x 16 x i8> [[TMP10]], 2
+// CHECK-CXX-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP11]]
+//
+svmfloat8x3_t test_svreinterpret_mf8_s64_x3(svint64x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _s64_x3)(op);
+}
+
+//
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_u64_x3(
+// CHECK-SAME: <vscale x 2 x i64> [[OP_COERCE0:%.*]], <vscale x 2 x i64> [[OP_COERCE1:%.*]], <vscale x 2 x i64> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } poison, <vscale x 2 x i64> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], <vscale x 2 x i64> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], <vscale x 2 x i64> [[OP_COERCE2]], 2
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <vscale x 2 x i64> [[TMP3]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP4]], 0
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <vscale x 2 x i64> [[TMP6]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]], <vscale x 16 x i8> [[TMP7]], 1
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 2
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <vscale x 2 x i64> [[TMP9]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP8]], <vscale x 16 x i8> [[TMP10]], 2
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP11]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_u64_x312svuint64x3_t(
+// CHECK-CXX-SAME: <vscale x 2 x i64> [[OP_COERCE0:%.*]], <vscale x 2 x i64> [[OP_COERCE1:%.*]], <vscale x 2 x i64> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } poison, <vscale x 2 x i64> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], <vscale x 2 x i64> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], <vscale x 2 x i64> [[OP_COERCE2]], 2
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast <vscale x 2 x i64> [[TMP3]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP4]], 0
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast <vscale x 2 x i64> [[TMP6]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]], <vscale x 16 x i8> [[TMP7]], 1
+// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 2
+// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast <vscale x 2 x i64> [[TMP9]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP8]], <vscale x 16 x i8> [[TMP10]], 2
+// CHECK-CXX-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP11]]
+//
+svmfloat8x3_t test_svreinterpret_mf8_u64_x3(svuint64x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _u64_x3)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_f16_x3(
+// CHECK-SAME: <vscale x 8 x half> [[OP_COERCE0:%.*]], <vscale x 8 x half> [[OP_COERCE1:%.*]], <vscale x 8 x half> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } poison, <vscale x 8 x half> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], <vscale x 8 x half> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], <vscale x 8 x half> [[OP_COERCE2]], 2
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <vscale x 8 x half> [[TMP3]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP4]], 0
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <vscale x 8 x half> [[TMP6]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]], <vscale x 16 x i8> [[TMP7]], 1
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 2
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <vscale x 8 x half> [[TMP9]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP8]], <vscale x 16 x i8> [[TMP10]], 2
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP11]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_f16_x313svfloat16x3_t(
+// CHECK-CXX-SAME: <vscale x 8 x half> [[OP_COERCE0:%.*]], <vscale x 8 x half> [[OP_COERCE1:%.*]], <vscale x 8 x half> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } poison, <vscale x 8 x half> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], <vscale x 8 x half> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], <vscale x 8 x half> [[OP_COERCE2]], 2
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 0
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast <vscale x 8 x half> [[TMP3]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP4]], 0
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 1
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast <vscale x 8 x half> [[TMP6]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]], <vscale x 16 x i8> [[TMP7]], 1
+// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 2
+// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast <vscale x 8 x half> [[TMP9]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP8]], <vscale x 16 x i8> [[TMP10]], 2
+// CHECK-CXX-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP11]]
+//
+svmfloat8x3_t test_svreinterpret_mf8_f16_x3(svfloat16x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _f16_x3)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_bf16_x3(
+// CHECK-SAME: <vscale x 8 x bfloat> [[OP_COERCE0:%.*]], <vscale x 8 x bfloat> [[OP_COERCE1:%.*]], <vscale x 8 x bfloat> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } poison, <vscale x 8 x bfloat> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], <vscale x 8 x bfloat> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP1]], <vscale x 8 x bfloat> [[OP_COERCE2]], 2
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <vscale x 8 x bfloat> [[TMP3]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP4]], 0
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <vscale x 8 x bfloat> [[TMP6]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]], <vscale x 16 x i8> [[TMP7]], 1
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 2
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <vscale x 8 x bfloat> [[TMP9]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP8]], <vscale x 16 x i8> [[TMP10]], 2
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP11]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z30test_svreinterpret_mf8_bf16_x314svbfloat16x3_t(
+// CHECK-CXX-SAME: <vscale x 8 x bfloat> [[OP_COERCE0:%.*]], <vscale x 8 x bfloat> [[OP_COERCE1:%.*]], <vscale x 8 x bfloat> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } poison, <vscale x 8 x bfloat> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], <vscale x 8 x bfloat> [[OP_COERCE1]], 1
CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP11]] +// +svmfloat8x3_t test_svreinterpret_mf8_bf16_x3(svbfloat16x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _bf16_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_mf8_f32_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-NEXT: ret { , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , } @_Z29test_svreinterpret_mf8_f32_x313svfloat32x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP11]] +// +svmfloat8x3_t test_svreinterpret_mf8_f32_x3(svfloat32x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _f32_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_mf8_f64_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } 
[[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-NEXT: ret { , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , } @_Z29test_svreinterpret_mf8_f64_x313svfloat64x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP11]] +// +svmfloat8x3_t test_svreinterpret_mf8_f64_x3(svfloat64x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _f64_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_s16_mf8_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-NEXT: ret { , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , } @_Z29test_svreinterpret_s16_mf8_x313svmfloat8x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-CXX-NEXT: [[TMP5:%.*]] = 
insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP11]] +// +svint16x3_t test_svreinterpret_s16_mf8_x3(svmfloat8x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_s16, _mf8_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_u16_mf8_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-NEXT: ret { , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , } @_Z29test_svreinterpret_u16_mf8_x313svmfloat8x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP11]] +// +svuint16x3_t test_svreinterpret_u16_mf8_x3(svmfloat8x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_u16, _mf8_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_s32_mf8_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] 
= extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-NEXT: ret { , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , } @_Z29test_svreinterpret_s32_mf8_x313svmfloat8x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP11]] +// +svint32x3_t test_svreinterpret_s32_mf8_x3(svmfloat8x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_s32, _mf8_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_u32_mf8_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-NEXT: ret { , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , } @_Z29test_svreinterpret_u32_mf8_x313svmfloat8x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// 
CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP11]] +// +svuint32x3_t test_svreinterpret_u32_mf8_x3(svmfloat8x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_u32, _mf8_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_s64_mf8_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-NEXT: ret { , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , } @_Z29test_svreinterpret_s64_mf8_x313svmfloat8x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP11]] +// +svint64x3_t test_svreinterpret_s64_mf8_x3(svmfloat8x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_s64, _mf8_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_u64_mf8_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: 
[[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-NEXT: ret { , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , } @_Z29test_svreinterpret_u64_mf8_x313svmfloat8x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP11]] +// +svuint64x3_t test_svreinterpret_u64_mf8_x3(svmfloat8x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_u64, _mf8_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_f16_mf8_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-NEXT: ret { , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , } @_Z29test_svreinterpret_f16_mf8_x313svmfloat8x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP11]] 
+// +svfloat16x3_t test_svreinterpret_f16_mf8_x3(svmfloat8x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_f16, _mf8_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_bf16_mf8_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-NEXT: ret { , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , } @_Z30test_svreinterpret_bf16_mf8_x313svmfloat8x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP11]] +// +svbfloat16x3_t test_svreinterpret_bf16_mf8_x3(svmfloat8x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_bf16, _mf8_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_f32_mf8_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-NEXT: ret { , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , } 
@_Z29test_svreinterpret_f32_mf8_x313svmfloat8x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP11]] +// +svfloat32x3_t test_svreinterpret_f32_mf8_x3(svmfloat8x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_f32, _mf8_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_f64_mf8_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-NEXT: ret { , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , } @_Z29test_svreinterpret_f64_mf8_x313svmfloat8x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP11]] +// +svfloat64x3_t test_svreinterpret_f64_mf8_x3(svmfloat8x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_f64, _mf8_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , , } 
@test_svreinterpret_s8_mf8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , , } poison, [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP5]], [[TMP6]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP7]], [[TMP8]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP10]], 3 +// CHECK-NEXT: ret { , , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z28test_svreinterpret_s8_mf8_x413svmfloat8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP5]], [[TMP6]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP7]], [[TMP8]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP10]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP11]] +// +svint8x4_t test_svreinterpret_s8_mf8_x4(svmfloat8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_s8, _mf8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_u8_mf8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , , } poison, [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP5]], [[TMP6]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP7]], [[TMP8]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , , } [[TMP9]], 
[[TMP10]], 3 +// CHECK-NEXT: ret { , , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z28test_svreinterpret_u8_mf8_x413svmfloat8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP5]], [[TMP6]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP7]], [[TMP8]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP10]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP11]] +// +svuint8x4_t test_svreinterpret_u8_mf8_x4(svmfloat8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_u8, _mf8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_mf8_s8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , , } poison, [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP5]], [[TMP6]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP7]], [[TMP8]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP10]], 3 +// CHECK-NEXT: ret { , , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z28test_svreinterpret_mf8_s8_x410svint8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP5]], [[TMP6]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue 
{ , , , } [[TMP7]], [[TMP8]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP10]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP11]] +// +svmfloat8x4_t test_svreinterpret_mf8_s8_x4(svint8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _s8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_mf8_u8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , , } poison, [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP5]], [[TMP6]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP7]], [[TMP8]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP10]], 3 +// CHECK-NEXT: ret { , , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z28test_svreinterpret_mf8_u8_x411svuint8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP5]], [[TMP6]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP7]], [[TMP8]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP10]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP11]] +// +svmfloat8x4_t test_svreinterpret_mf8_u8_x4(svuint8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _u8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_mf8_mf8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , , } poison, [[TMP4]], 0 +// 
CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP5]], [[TMP6]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP7]], [[TMP8]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP10]], 3 +// CHECK-NEXT: ret { , , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_mf8_mf8_x413svmfloat8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP5]], [[TMP6]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP7]], [[TMP8]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP10]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP11]] +// +svmfloat8x4_t test_svreinterpret_mf8_mf8_x4(svmfloat8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _mf8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_mf8_s16_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_mf8_s16_x411svint16x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = 
insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svmfloat8x4_t test_svreinterpret_mf8_s16_x4(svint16x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _s16_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_mf8_u16_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_mf8_u16_x412svuint16x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } 
[[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svmfloat8x4_t test_svreinterpret_mf8_u16_x4(svuint16x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _u16_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_mf8_s32_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_mf8_s32_x411svint32x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svmfloat8x4_t test_svreinterpret_mf8_s32_x4(svint32x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, 
_s32_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_mf8_u32_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_mf8_u32_x412svuint32x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svmfloat8x4_t test_svreinterpret_mf8_u32_x4(svuint32x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _u32_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_mf8_s64_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , 
, } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_mf8_s64_x411svint64x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svmfloat8x4_t test_svreinterpret_mf8_s64_x4(svint64x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _s64_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_mf8_u64_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// 
CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_mf8_u64_x412svuint64x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svmfloat8x4_t test_svreinterpret_mf8_u64_x4(svuint64x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _u64_x4)(op); +} + +// +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_mf8_f16_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_mf8_f16_x413svfloat16x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// 
CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svmfloat8x4_t test_svreinterpret_mf8_f16_x4(svfloat16x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _f16_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_mf8_bf16_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z30test_svreinterpret_mf8_bf16_x414svbfloat16x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: 
[[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svmfloat8x4_t test_svreinterpret_mf8_bf16_x4(svbfloat16x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _bf16_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_mf8_f32_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_mf8_f32_x413svfloat32x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svmfloat8x4_t test_svreinterpret_mf8_f32_x4(svfloat32x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _f32_x4)(op); +} + +// CHECK-LABEL: 
define dso_local { , , , } @test_svreinterpret_mf8_f64_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_mf8_f64_x413svfloat64x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svmfloat8x4_t test_svreinterpret_mf8_f64_x4(svfloat64x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _f64_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_s16_mf8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: 
[[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_s16_mf8_x413svmfloat8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svint16x4_t test_svreinterpret_s16_mf8_x4(svmfloat8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_s16, _mf8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_u16_mf8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast 
[[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_u16_mf8_x413svmfloat8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svuint16x4_t test_svreinterpret_u16_mf8_x4(svmfloat8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_u16, _mf8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_s32_mf8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_s32_mf8_x413svmfloat8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , 
} [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svint32x4_t test_svreinterpret_s32_mf8_x4(svmfloat8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_s32, _mf8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_u32_mf8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_u32_mf8_x413svmfloat8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// 
CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svuint32x4_t test_svreinterpret_u32_mf8_x4(svmfloat8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_u32, _mf8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_s64_mf8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_s64_mf8_x413svmfloat8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svint64x4_t test_svreinterpret_s64_mf8_x4(svmfloat8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_s64, _mf8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_u64_mf8_x4( 
+// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_u64_mf8_x413svmfloat8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svuint64x4_t test_svreinterpret_u64_mf8_x4(svmfloat8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_u64, _mf8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_f16_mf8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: 
[[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_f16_mf8_x413svmfloat8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svfloat16x4_t test_svreinterpret_f16_mf8_x4(svmfloat8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_f16, _mf8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_bf16_mf8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = 
insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z30test_svreinterpret_bf16_mf8_x413svmfloat8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svbfloat16x4_t test_svreinterpret_bf16_mf8_x4(svmfloat8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_bf16, _mf8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_f32_mf8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_f32_mf8_x413svmfloat8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// 
CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svfloat32x4_t test_svreinterpret_f32_mf8_x4(svmfloat8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_f32, _mf8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_f64_mf8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_f64_mf8_x413svmfloat8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast 
[[TMP10]] to
+// CHECK-CXX-NEXT:    [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2
+// CHECK-CXX-NEXT:    [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3
+// CHECK-CXX-NEXT:    [[TMP14:%.*]] = bitcast [[TMP13]] to
+// CHECK-CXX-NEXT:    [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3
+// CHECK-CXX-NEXT:    ret { , , , } [[TMP15]]
+//
+svfloat64x4_t test_svreinterpret_f64_mf8_x4(svmfloat8x4_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_f64, _mf8_x4)(op);
+}
diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp
index 97b768db3a313..35477cfc3cf45 100644
--- a/clang/utils/TableGen/SveEmitter.cpp
+++ b/clang/utils/TableGen/SveEmitter.cpp
@@ -295,7 +295,7 @@ class SVEEmitter {
     const char *Suffix;
   };

-  static const std::array<ReinterpretTypeInfo, 12> Reinterprets;
+  static const std::array<ReinterpretTypeInfo, 13> Reinterprets;

   const RecordKeeper &Records;
   StringMap EltTypes;
@@ -418,9 +418,10 @@ class SVEEmitter {
                        SmallVectorImpl<std::unique_ptr<Intrinsic>> &Out);
 };

-const std::array<SVEEmitter::ReinterpretTypeInfo, 12> SVEEmitter::Reinterprets =
+const std::array<SVEEmitter::ReinterpretTypeInfo, 13> SVEEmitter::Reinterprets =
     {{{SVEType("c", 'd'), "s8"},
       {SVEType("Uc", 'd'), "u8"},
+      {SVEType("m", 'd'), "mf8"},
       {SVEType("s", 'd'), "s16"},
       {SVEType("Us", 'd'), "u16"},
       {SVEType("i", 'd'), "s32"},

From 066b88879ab5c195e7e14609e546cc238c2f3bf3 Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Mon, 13 Jan 2025 10:51:26 -0800
Subject: [PATCH 313/408] [SLP]Correctly set vector operand for extracts with
 poisons

When extracts are vectorized and some lanes hold poison values instead of
instructions, we need to set the vectorized operand not to poison but to the
vector operand of the main extract instruction.

Fixes #122583
---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp  | 11 +++++++++++
 .../X86/extractelemets-extended-by-poison.ll     |  8 ++++++--
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 4b0ed5b30179b..2742c3777c1ed 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2441,6 +2441,17 @@ class BoUpSLP {
         // operations or alternating sequences (e.g., +, -), we can safely
         // tell the inverse operations by checking commutativity.
        if (isa<PoisonValue>(VL[Lane])) {
+          if (auto *EI = dyn_cast<ExtractElementInst>(VL0)) {
+            if (OpIdx == 0) {
+              OpsVec[OpIdx][Lane] = {EI->getVectorOperand(), true, false};
+              continue;
+            }
+          } else if (auto *EV = dyn_cast<ExtractValueInst>(VL0)) {
+            if (OpIdx == 0) {
+              OpsVec[OpIdx][Lane] = {EV->getAggregateOperand(), true, false};
+              continue;
+            }
+          }
          OpsVec[OpIdx][Lane] = {
              PoisonValue::get(VL0->getOperand(OpIdx)->getType()), true,
              false};
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll
index 6af59aee54e55..71390b643f43d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll
@@ -5,18 +5,22 @@ define i32 @test() {
 ; CHECK-LABEL: define i32 @test() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr null, align 16
-; CHECK-NEXT:    [[TMP1:%.*]] = or i64 poison, 0
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32>
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP13:%.*]] = or i64 [[TMP12]], 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32>
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> , <8 x i32>
 ; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> [[TMP4]], <4 x i64> [[TMP0]], i64 0)
 ; CHECK-NEXT:    [[TMP6:%.*]] = trunc <8 x i64> [[TMP5]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = add <8 x i32> [[TMP14]], zeroinitializer
 ; CHECK-NEXT:    [[TMP8:%.*]] = add <16 x i32> [[TMP7]], zeroinitializer
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
 ; CHECK-NEXT:    [[INC_3_3_I_1:%.*]] = or i64 [[TMP9]], 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[TMP8]])
-; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> poison)
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TMP15]])
 ; CHECK-NEXT:    [[OP_RDX:%.*]] = or i32 [[TMP10]], [[TMP11]]
 ; CHECK-NEXT:    ret i32 [[OP_RDX]]
 ;

From b3ad84b534bf17311b92fbfe142c0542c6e95332 Mon Sep 17 00:00:00 2001
From: Sean Perry
Date: Mon, 13 Jan 2025 14:02:20 -0500
Subject: [PATCH 314/408] [libc++][z/OS] __cxx03 subdir was added by mistake
 (#122763)

The header is a system header. It's not part of the headers in __cxx03.
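An aside on PATCH 313 above, before this patch's diff: the case it fixes is easiest to see on a small, hypothetical IR sketch (invented for illustration; it is not the patch's own test). When SLP gathers the two `or` instructions below into one bundle, its operand bundle at index 0 is {%e0, poison}; the main (lane-0) scalar operand is an extractelement, so with the fix the poison lane's vector operand is taken from %vec rather than being modeled as a standalone poison value, which lets the extracts fold into the vectorized code:

  define i64 @sketch(<2 x i64> %vec) {
    %e0 = extractelement <2 x i64> %vec, i32 0   ; lane 0: a real extract
    %r0 = or i64 %e0, 1
    %r1 = or i64 poison, 1                       ; lane 1: the extract is poison
    %sum = add i64 %r0, %r1
    ret i64 %sum
  }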
---
 libcxx/include/__cxx03/__config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/include/__cxx03/__config b/libcxx/include/__cxx03/__config
index 3e8f181664c97..880d14a50a052 100644
--- a/libcxx/include/__cxx03/__config
+++ b/libcxx/include/__cxx03/__config
@@ -230,7 +230,7 @@ _LIBCPP_HARDENING_MODE_DEBUG
 #  endif

 #  if defined(__MVS__)
-#    include <__cxx03/features.h> // for __NATIVE_ASCII_F
+#    include <features.h> // for __NATIVE_ASCII_F
 #  endif

 #  if defined(_WIN32)

From 2f7ade4b5e399962e18f5f9a0ab0b7335deece51 Mon Sep 17 00:00:00 2001
From: Kirill Stoimenov
Date: Mon, 13 Jan 2025 19:03:40 +0000
Subject: [PATCH 315/408] Revert "[aarch64][win] Add support for import call
 optimization (equivalent to MSVC /d2ImportCallOptimization) (#121516)"

Breaks sanitizer build:
https://lab.llvm.org/buildbot/#/builders/52/builds/5179

This reverts commits:
5ee0a71df919a328c714e25f0935c21e586cc18b
d997a722c194feec5f3a94dec5acdce59ac5e55b
---
 llvm/include/llvm/CodeGen/MIRYamlMapping.h    |  45 ++-----
 llvm/include/llvm/CodeGen/MachineFunction.h   |  25 ----
 llvm/include/llvm/CodeGen/SelectionDAG.h      |  14 ---
 llvm/include/llvm/MC/MCObjectFileInfo.h       |   5 -
 llvm/include/llvm/MC/MCStreamer.h             |   8 --
 llvm/include/llvm/MC/MCWinCOFFObjectWriter.h  |   1 -
 llvm/include/llvm/MC/MCWinCOFFStreamer.h      |   2 -
 llvm/lib/CodeGen/MIRParser/MIRParser.cpp      |  74 ++----------
 llvm/lib/CodeGen/MIRPrinter.cpp               |  33 +----
 .../SelectionDAG/ScheduleDAGSDNodes.cpp       |   4 -
 llvm/lib/MC/MCAsmStreamer.cpp                 |  14 ---
 llvm/lib/MC/MCObjectFileInfo.cpp              |   5 -
 llvm/lib/MC/MCParser/COFFAsmParser.cpp        |  34 ------
 llvm/lib/MC/MCStreamer.cpp                    |   4 -
 llvm/lib/MC/MCWinCOFFStreamer.cpp             | 114 ------------------
 llvm/lib/MC/WinCOFFObjectWriter.cpp           |  27 ++---
 llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp |  72 -----------
 .../Target/AArch64/AArch64ISelLowering.cpp    |  14 +--
 .../win-import-call-optimization-nocalls.ll   |  18 ---
 .../AArch64/win-import-call-optimization.ll   |  48 --------
 .../CodeGen/MIR/AArch64/called-globals.mir    |  61 ----------
 .../CodeGen/MIR/X86/call-site-info-error1.mir |   2 +-
 .../CodeGen/MIR/X86/call-site-info-error2.mir |   2 +-
 .../MC/AArch64/win-import-call-optimization.s |  72 -----------
 llvm/test/MC/COFF/bad-parse.s                 |  13 --
 25 files changed, 38 insertions(+), 673 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AArch64/win-import-call-optimization-nocalls.ll
 delete mode 100644 llvm/test/CodeGen/AArch64/win-import-call-optimization.ll
 delete mode 100644 llvm/test/CodeGen/MIR/AArch64/called-globals.mir
 delete mode 100644 llvm/test/MC/AArch64/win-import-call-optimization.s
 delete mode 100644 llvm/test/MC/COFF/bad-parse.s

diff --git a/llvm/include/llvm/CodeGen/MIRYamlMapping.h b/llvm/include/llvm/CodeGen/MIRYamlMapping.h
index dbad3469d047d..09a6ca936fe1f 100644
--- a/llvm/include/llvm/CodeGen/MIRYamlMapping.h
+++ b/llvm/include/llvm/CodeGen/MIRYamlMapping.h
@@ -457,16 +457,6 @@ template <> struct ScalarTraits {
   static QuotingType mustQuote(StringRef S) { return needsQuotes(S); }
 };

-/// Identifies call instruction location in machine function.
-struct MachineInstrLoc {
-  unsigned BlockNum;
-  unsigned Offset;
-
-  bool operator==(const MachineInstrLoc &Other) const {
-    return BlockNum == Other.BlockNum && Offset == Other.Offset;
-  }
-};
-
 /// Serializable representation of CallSiteInfo.
 struct CallSiteInfo {
   // Representation of call argument and register which is used to
@@ -480,6 +470,16 @@ struct CallSiteInfo {
     }
   };

+  /// Identifies call instruction location in machine function.
+  struct MachineInstrLoc {
+    unsigned BlockNum;
+    unsigned Offset;
+
+    bool operator==(const MachineInstrLoc &Other) const {
+      return BlockNum == Other.BlockNum && Offset == Other.Offset;
+    }
+  };
+
   MachineInstrLoc CallLocation;
   std::vector<ArgRegPair> ArgForwardingRegs;
@@ -595,26 +595,6 @@ template <> struct MappingTraits {
   }
 };

-struct CalledGlobal {
-  MachineInstrLoc CallSite;
-  StringValue Callee;
-  unsigned Flags;
-
-  bool operator==(const CalledGlobal &Other) const {
-    return CallSite == Other.CallSite && Callee == Other.Callee &&
-           Flags == Other.Flags;
-  }
-};
-
-template <> struct MappingTraits<CalledGlobal> {
-  static void mapping(IO &YamlIO, CalledGlobal &CG) {
-    YamlIO.mapRequired("bb", CG.CallSite.BlockNum);
-    YamlIO.mapRequired("offset", CG.CallSite.Offset);
-    YamlIO.mapRequired("callee", CG.Callee);
-    YamlIO.mapRequired("flags", CG.Flags);
-  }
-};
-
 } // end namespace yaml
 } // end namespace llvm
@@ -626,7 +606,6 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::FixedMachineStackObject)
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::CallSiteInfo)
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::MachineConstantPoolValue)
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::MachineJumpTable::Entry)
-LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::CalledGlobal)

 namespace llvm {
 namespace yaml {
@@ -785,7 +764,6 @@ struct MachineFunction {
   std::vector<DebugValueSubstitution> DebugValueSubstitutions;
   MachineJumpTable JumpTableInfo;
   std::vector<StringValue> MachineMetadataNodes;
-  std::vector<CalledGlobal> CalledGlobals;
   BlockStringValue Body;
 };
@@ -844,9 +822,6 @@ template <> struct MappingTraits {
     if (!YamlIO.outputting() || !MF.MachineMetadataNodes.empty())
       YamlIO.mapOptional("machineMetadataNodes", MF.MachineMetadataNodes,
                          std::vector<StringValue>());
-    if (!YamlIO.outputting() || !MF.CalledGlobals.empty())
-      YamlIO.mapOptional("calledGlobals", MF.CalledGlobals,
-                         std::vector<CalledGlobal>());
     YamlIO.mapOptional("body", MF.Body, BlockStringValue());
   }
 };
diff --git a/llvm/include/llvm/CodeGen/MachineFunction.h b/llvm/include/llvm/CodeGen/MachineFunction.h
index 282aee2a69c4d..d696add8a1af5 100644
--- a/llvm/include/llvm/CodeGen/MachineFunction.h
+++ b/llvm/include/llvm/CodeGen/MachineFunction.h
@@ -354,11 +354,6 @@ class LLVM_ABI MachineFunction {
   /// a table of valid targets for Windows EHCont Guard.
   std::vector<MCSymbol *> CatchretTargets;

-  /// Mapping of call instruction to the global value and target flags that it
-  /// calls, if applicable.
-  DenseMap<const MachineInstr *, std::pair<const GlobalValue *, unsigned>>
-      CalledGlobalsMap;
-
   /// \name Exception Handling
   /// \{
@@ -1187,26 +1182,6 @@ class LLVM_ABI MachineFunction {
     CatchretTargets.push_back(Target);
   }

-  /// Tries to get the global and target flags for a call site, if the
-  /// instruction is a call to a global.
-  std::pair<const GlobalValue *, unsigned>
-  tryGetCalledGlobal(const MachineInstr *MI) const {
-    return CalledGlobalsMap.lookup(MI);
-  }
-
-  /// Notes the global and target flags for a call site.
-  void addCalledGlobal(const MachineInstr *MI,
-                       std::pair<const GlobalValue *, unsigned> Details) {
-    assert(MI && "MI must not be null");
-    assert(Details.first && "Global must not be null");
-    CalledGlobalsMap.insert({MI, Details});
-  }
-
-  /// Iterates over the full set of call sites and their associated globals.
-  auto getCalledGlobals() const {
-    return llvm::make_range(CalledGlobalsMap.begin(), CalledGlobalsMap.end());
-  }
-
   /// \name Exception Handling
   /// \{
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index b31ad11c3ee0e..ff7caec41855f 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -293,7 +293,6 @@ class SelectionDAG {
     MDNode *HeapAllocSite = nullptr;
     MDNode *PCSections = nullptr;
     MDNode *MMRA = nullptr;
-    std::pair<const GlobalValue *, unsigned> CalledGlobal{};
     bool NoMerge = false;
   };
   /// Out-of-line extra information for SDNodes.
@@ -2374,19 +2373,6 @@ class SelectionDAG {
     auto It = SDEI.find(Node);
     return It != SDEI.end() ? It->second.MMRA : nullptr;
   }
-  /// Set CalledGlobal to be associated with Node.
-  void addCalledGlobal(const SDNode *Node, const GlobalValue *GV,
-                       unsigned OpFlags) {
-    SDEI[Node].CalledGlobal = {GV, OpFlags};
-  }
-  /// Return CalledGlobal associated with Node, or a nullopt if none exists.
-  std::optional<std::pair<const GlobalValue *, unsigned>>
-  getCalledGlobal(const SDNode *Node) {
-    auto I = SDEI.find(Node);
-    return I != SDEI.end()
-               ? std::make_optional(std::move(I->second).CalledGlobal)
-               : std::nullopt;
-  }
   /// Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
   void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge) {
     if (NoMerge)
diff --git a/llvm/include/llvm/MC/MCObjectFileInfo.h b/llvm/include/llvm/MC/MCObjectFileInfo.h
index fb575fe721015..e2a2c84e47910 100644
--- a/llvm/include/llvm/MC/MCObjectFileInfo.h
+++ b/llvm/include/llvm/MC/MCObjectFileInfo.h
@@ -73,10 +73,6 @@ class MCObjectFileInfo {
   /// to emit them into.
   MCSection *CompactUnwindSection = nullptr;

-  /// If import call optimization is supported by the target, this is the
-  /// section to emit import call data to.
-  MCSection *ImportCallSection = nullptr;
-
   // Dwarf sections for debug info. If a target supports debug info, these must
   // be set.
   MCSection *DwarfAbbrevSection = nullptr;
@@ -273,7 +269,6 @@ class MCObjectFileInfo {
   MCSection *getBSSSection() const { return BSSSection; }
   MCSection *getReadOnlySection() const { return ReadOnlySection; }
   MCSection *getLSDASection() const { return LSDASection; }
-  MCSection *getImportCallSection() const { return ImportCallSection; }
   MCSection *getCompactUnwindSection() const { return CompactUnwindSection; }
   MCSection *getDwarfAbbrevSection() const { return DwarfAbbrevSection; }
   MCSection *getDwarfInfoSection() const { return DwarfInfoSection; }
diff --git a/llvm/include/llvm/MC/MCStreamer.h b/llvm/include/llvm/MC/MCStreamer.h
index 558b14cebfd3d..21da4dac4872b 100644
--- a/llvm/include/llvm/MC/MCStreamer.h
+++ b/llvm/include/llvm/MC/MCStreamer.h
@@ -569,14 +569,6 @@ class MCStreamer {
   /// \param Symbol - Symbol the image relative relocation should point to.
   virtual void emitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset);

-  /// Emits the physical number of the section containing the given symbol as
-  /// assigned during object writing (i.e., this is not a runtime relocation).
-  virtual void emitCOFFSecNumber(MCSymbol const *Symbol);
-
-  /// Emits the offset of the symbol from the beginning of the section during
-  /// object writing (i.e., this is not a runtime relocation).
-  virtual void emitCOFFSecOffset(MCSymbol const *Symbol);
-
   /// Emits an lcomm directive with XCOFF csect information.
   ///
   /// \param LabelSym - Label on the block of storage.
diff --git a/llvm/include/llvm/MC/MCWinCOFFObjectWriter.h b/llvm/include/llvm/MC/MCWinCOFFObjectWriter.h index 13d8c7d060c9e..a4ede61e45099 100644 --- a/llvm/include/llvm/MC/MCWinCOFFObjectWriter.h +++ b/llvm/include/llvm/MC/MCWinCOFFObjectWriter.h @@ -72,7 +72,6 @@ class WinCOFFObjectWriter final : public MCObjectWriter { const MCFixup &Fixup, MCValue Target, uint64_t &FixedValue) override; uint64_t writeObject(MCAssembler &Asm) override; - int getSectionNumber(const MCSection &Section) const; }; /// Construct a new Win COFF writer instance. diff --git a/llvm/include/llvm/MC/MCWinCOFFStreamer.h b/llvm/include/llvm/MC/MCWinCOFFStreamer.h index 2425abe51e6dd..5c39d80538944 100644 --- a/llvm/include/llvm/MC/MCWinCOFFStreamer.h +++ b/llvm/include/llvm/MC/MCWinCOFFStreamer.h @@ -58,8 +58,6 @@ class MCWinCOFFStreamer : public MCObjectStreamer { void emitCOFFSectionIndex(MCSymbol const *Symbol) override; void emitCOFFSecRel32(MCSymbol const *Symbol, uint64_t Offset) override; void emitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) override; - void emitCOFFSecNumber(MCSymbol const *Symbol) override; - void emitCOFFSecOffset(MCSymbol const *Symbol) override; void emitCommonSymbol(MCSymbol *Symbol, uint64_t Size, Align ByteAlignment) override; void emitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp index de2fe925c2d5c..e2543f883f91c 100644 --- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp @@ -158,9 +158,6 @@ class MIRParserImpl { MachineFunction &MF, const yaml::MachineFunction &YMF); - bool parseCalledGlobals(PerFunctionMIParsingState &PFS, MachineFunction &MF, - const yaml::MachineFunction &YMF); - private: bool parseMDNode(PerFunctionMIParsingState &PFS, MDNode *&Node, const yaml::StringValue &Source); @@ -186,9 +183,6 @@ class MIRParserImpl { void setupDebugValueTracking(MachineFunction &MF, PerFunctionMIParsingState &PFS, const yaml::MachineFunction &YamlMF); - - bool parseMachineInst(MachineFunction &MF, yaml::MachineInstrLoc MILoc, - MachineInstr const *&MI); }; } // end namespace llvm @@ -463,34 +457,24 @@ bool MIRParserImpl::computeFunctionProperties( return false; } -bool MIRParserImpl::parseMachineInst(MachineFunction &MF, - yaml::MachineInstrLoc MILoc, - MachineInstr const *&MI) { - if (MILoc.BlockNum >= MF.size()) { - return error(Twine(MF.getName()) + - Twine(" instruction block out of range.") + - " Unable to reference bb:" + Twine(MILoc.BlockNum)); - } - auto BB = std::next(MF.begin(), MILoc.BlockNum); - if (MILoc.Offset >= BB->size()) - return error( - Twine(MF.getName()) + Twine(" instruction offset out of range.") + - " Unable to reference instruction at bb: " + Twine(MILoc.BlockNum) + - " at offset:" + Twine(MILoc.Offset)); - MI = &*std::next(BB->instr_begin(), MILoc.Offset); - return false; -} - bool MIRParserImpl::initializeCallSiteInfo( PerFunctionMIParsingState &PFS, const yaml::MachineFunction &YamlMF) { MachineFunction &MF = PFS.MF; SMDiagnostic Error; const TargetMachine &TM = MF.getTarget(); for (auto &YamlCSInfo : YamlMF.CallSitesInfo) { - yaml::MachineInstrLoc MILoc = YamlCSInfo.CallLocation; - const MachineInstr *CallI; - if (parseMachineInst(MF, MILoc, CallI)) - return true; + yaml::CallSiteInfo::MachineInstrLoc MILoc = YamlCSInfo.CallLocation; + if (MILoc.BlockNum >= MF.size()) + return error(Twine(MF.getName()) + + Twine(" call instruction block out of range.") + + " Unable to reference bb:" + Twine(MILoc.BlockNum)); 
+ auto CallB = std::next(MF.begin(), MILoc.BlockNum); + if (MILoc.Offset >= CallB->size()) + return error(Twine(MF.getName()) + + Twine(" call instruction offset out of range.") + + " Unable to reference instruction at bb: " + + Twine(MILoc.BlockNum) + " at offset:" + Twine(MILoc.Offset)); + auto CallI = std::next(CallB->instr_begin(), MILoc.Offset); if (!CallI->isCall(MachineInstr::IgnoreBundle)) return error(Twine(MF.getName()) + Twine(" call site info should reference call " @@ -657,9 +641,6 @@ MIRParserImpl::initializeMachineFunction(const yaml::MachineFunction &YamlMF, if (initializeCallSiteInfo(PFS, YamlMF)) return true; - if (parseCalledGlobals(PFS, MF, YamlMF)) - return true; - setupDebugValueTracking(MF, PFS, YamlMF); MF.getSubtarget().mirFileLoaded(MF); @@ -1130,37 +1111,6 @@ bool MIRParserImpl::parseMachineMetadataNodes( return false; } -bool MIRParserImpl::parseCalledGlobals(PerFunctionMIParsingState &PFS, - MachineFunction &MF, - const yaml::MachineFunction &YMF) { - Function &F = MF.getFunction(); - for (const auto &YamlCG : YMF.CalledGlobals) { - yaml::MachineInstrLoc MILoc = YamlCG.CallSite; - const MachineInstr *CallI; - if (parseMachineInst(MF, MILoc, CallI)) - return true; - if (!CallI->isCall(MachineInstr::IgnoreBundle)) - return error(Twine(MF.getName()) + - Twine(" called global should reference call " - "instruction. Instruction at bb:") + - Twine(MILoc.BlockNum) + " at offset:" + Twine(MILoc.Offset) + - " is not a call instruction"); - - auto Callee = - F.getParent()->getValueSymbolTable().lookup(YamlCG.Callee.Value); - if (!Callee) - return error(YamlCG.Callee.SourceRange.Start, - "use of undefined global '" + YamlCG.Callee.Value + "'"); - if (!isa(Callee)) - return error(YamlCG.Callee.SourceRange.Start, - "use of non-global value '" + YamlCG.Callee.Value + "'"); - - MF.addCalledGlobal(CallI, {cast(Callee), YamlCG.Flags}); - } - - return false; -} - SMDiagnostic MIRParserImpl::diagFromMIStringDiag(const SMDiagnostic &Error, SMRange SourceRange) { assert(SourceRange.isValid() && "Invalid source range"); diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index b6da495590fe1..c8f6341c1224d 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -133,9 +133,6 @@ class MIRPrinter { void convertMachineMetadataNodes(yaml::MachineFunction &YMF, const MachineFunction &MF, MachineModuleSlotTracker &MST); - void convertCalledGlobals(yaml::MachineFunction &YMF, - const MachineFunction &MF, - MachineModuleSlotTracker &MST); private: void initRegisterMaskIds(const MachineFunction &MF); @@ -272,8 +269,6 @@ void MIRPrinter::print(const MachineFunction &MF) { // function. convertMachineMetadataNodes(YamlMF, MF, MST); - convertCalledGlobals(YamlMF, MF, MST); - yaml::Output Out(OS); if (!SimplifyMIR) Out.setWriteDefaultValues(true); @@ -560,7 +555,7 @@ void MIRPrinter::convertCallSiteObjects(yaml::MachineFunction &YMF, const auto *TRI = MF.getSubtarget().getRegisterInfo(); for (auto CSInfo : MF.getCallSitesInfo()) { yaml::CallSiteInfo YmlCS; - yaml::MachineInstrLoc CallLocation; + yaml::CallSiteInfo::MachineInstrLoc CallLocation; // Prepare instruction position. 
MachineBasicBlock::const_instr_iterator CallI = CSInfo.first->getIterator(); @@ -601,32 +596,6 @@ void MIRPrinter::convertMachineMetadataNodes(yaml::MachineFunction &YMF, } } -void MIRPrinter::convertCalledGlobals(yaml::MachineFunction &YMF, - const MachineFunction &MF, - MachineModuleSlotTracker &MST) { - for (const auto &[CallInst, CG] : MF.getCalledGlobals()) { - // If the call instruction was dropped, then we don't need to print it. - auto BB = CallInst->getParent(); - if (BB) { - yaml::MachineInstrLoc CallSite; - CallSite.BlockNum = CallInst->getParent()->getNumber(); - CallSite.Offset = std::distance(CallInst->getParent()->instr_begin(), - CallInst->getIterator()); - - yaml::CalledGlobal YamlCG{CallSite, CG.first->getName().str(), CG.second}; - YMF.CalledGlobals.push_back(YamlCG); - } - } - - // Sort by position of call instructions. - llvm::sort(YMF.CalledGlobals.begin(), YMF.CalledGlobals.end(), - [](yaml::CalledGlobal A, yaml::CalledGlobal B) { - if (A.CallSite.BlockNum == B.CallSite.BlockNum) - return A.CallSite.Offset < B.CallSite.Offset; - return A.CallSite.BlockNum < B.CallSite.BlockNum; - }); -} - void MIRPrinter::convert(yaml::MachineFunction &MF, const MachineConstantPool &ConstantPool) { unsigned ID = 0; diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index bafe26ff7d6b7..dff7243b0a99c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -908,10 +908,6 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) { It->setMMRAMetadata(MF, MMRA); } - if (auto CalledGlobal = DAG->getCalledGlobal(Node)) - if (CalledGlobal->first) - MF.addCalledGlobal(MI, *CalledGlobal); - return MI; }; diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp index dd8058c6d5cd8..01fe11ed20501 100644 --- a/llvm/lib/MC/MCAsmStreamer.cpp +++ b/llvm/lib/MC/MCAsmStreamer.cpp @@ -209,8 +209,6 @@ class MCAsmStreamer final : public MCStreamer { void emitCOFFSectionIndex(MCSymbol const *Symbol) override; void emitCOFFSecRel32(MCSymbol const *Symbol, uint64_t Offset) override; void emitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) override; - void emitCOFFSecNumber(MCSymbol const *Symbol) override; - void emitCOFFSecOffset(MCSymbol const *Symbol) override; void emitXCOFFLocalCommonSymbol(MCSymbol *LabelSym, uint64_t Size, MCSymbol *CsectSym, Align Alignment) override; void emitXCOFFSymbolLinkageWithVisibility(MCSymbol *Symbol, @@ -895,18 +893,6 @@ void MCAsmStreamer::emitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) { EmitEOL(); } -void MCAsmStreamer::emitCOFFSecNumber(MCSymbol const *Symbol) { - OS << "\t.secnum\t"; - Symbol->print(OS, MAI); - EmitEOL(); -} - -void MCAsmStreamer::emitCOFFSecOffset(MCSymbol const *Symbol) { - OS << "\t.secoffset\t"; - Symbol->print(OS, MAI); - EmitEOL(); -} - // We need an XCOFF-specific version of this directive as the AIX syntax // requires a QualName argument identifying the csect name and storage mapping // class to appear before the alignment if we are specifying it. 
diff --git a/llvm/lib/MC/MCObjectFileInfo.cpp b/llvm/lib/MC/MCObjectFileInfo.cpp index 150e38a94db6a..f37e138edc36b 100644 --- a/llvm/lib/MC/MCObjectFileInfo.cpp +++ b/llvm/lib/MC/MCObjectFileInfo.cpp @@ -596,11 +596,6 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) { COFF::IMAGE_SCN_MEM_READ); } - if (T.getArch() == Triple::aarch64) { - ImportCallSection = - Ctx->getCOFFSection(".impcall", COFF::IMAGE_SCN_LNK_INFO); - } - // Debug info. COFFDebugSymbolsSection = Ctx->getCOFFSection(".debug$S", (COFF::IMAGE_SCN_MEM_DISCARDABLE | diff --git a/llvm/lib/MC/MCParser/COFFAsmParser.cpp b/llvm/lib/MC/MCParser/COFFAsmParser.cpp index dd5ce9964a194..4d95a72085283 100644 --- a/llvm/lib/MC/MCParser/COFFAsmParser.cpp +++ b/llvm/lib/MC/MCParser/COFFAsmParser.cpp @@ -70,8 +70,6 @@ class COFFAsmParser : public MCAsmParserExtension { addDirectiveHandler<&COFFAsmParser::parseDirectiveSymbolAttribute>( ".weak_anti_dep"); addDirectiveHandler<&COFFAsmParser::parseDirectiveCGProfile>(".cg_profile"); - addDirectiveHandler<&COFFAsmParser::parseDirectiveSecNum>(".secnum"); - addDirectiveHandler<&COFFAsmParser::parseDirectiveSecOffset>(".secoffset"); // Win64 EH directives. addDirectiveHandler<&COFFAsmParser::parseSEHDirectiveStartProc>( @@ -128,8 +126,6 @@ class COFFAsmParser : public MCAsmParserExtension { bool parseDirectiveLinkOnce(StringRef, SMLoc); bool parseDirectiveRVA(StringRef, SMLoc); bool parseDirectiveCGProfile(StringRef, SMLoc); - bool parseDirectiveSecNum(StringRef, SMLoc); - bool parseDirectiveSecOffset(StringRef, SMLoc); // Win64 EH directives. bool parseSEHDirectiveStartProc(StringRef, SMLoc); @@ -581,36 +577,6 @@ bool COFFAsmParser::parseDirectiveSymIdx(StringRef, SMLoc) { return false; } -bool COFFAsmParser::parseDirectiveSecNum(StringRef, SMLoc) { - StringRef SymbolID; - if (getParser().parseIdentifier(SymbolID)) - return TokError("expected identifier in directive"); - - if (getLexer().isNot(AsmToken::EndOfStatement)) - return TokError("unexpected token in directive"); - - MCSymbol *Symbol = getContext().getOrCreateSymbol(SymbolID); - - Lex(); - getStreamer().emitCOFFSecNumber(Symbol); - return false; -} - -bool COFFAsmParser::parseDirectiveSecOffset(StringRef, SMLoc) { - StringRef SymbolID; - if (getParser().parseIdentifier(SymbolID)) - return TokError("expected identifier in directive"); - - if (getLexer().isNot(AsmToken::EndOfStatement)) - return TokError("unexpected token in directive"); - - MCSymbol *Symbol = getContext().getOrCreateSymbol(SymbolID); - - Lex(); - getStreamer().emitCOFFSecOffset(Symbol); - return false; -} - /// ::= [ identifier ] bool COFFAsmParser::parseCOMDATType(COFF::COMDATType &Type) { StringRef TypeId = getTok().getIdentifier(); diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp index e690723c0e502..ccf65df150e78 100644 --- a/llvm/lib/MC/MCStreamer.cpp +++ b/llvm/lib/MC/MCStreamer.cpp @@ -1023,10 +1023,6 @@ void MCStreamer::emitCOFFSecRel32(MCSymbol const *Symbol, uint64_t Offset) {} void MCStreamer::emitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) {} -void MCStreamer::emitCOFFSecNumber(MCSymbol const *Symbol) {} - -void MCStreamer::emitCOFFSecOffset(MCSymbol const *Symbol) {} - /// EmitRawText - If this file is backed by an assembly streamer, this dumps /// the specified string in the output .s file. This capability is /// indicated by the hasRawTextSupport() predicate. 
diff --git a/llvm/lib/MC/MCWinCOFFStreamer.cpp b/llvm/lib/MC/MCWinCOFFStreamer.cpp index 8fd46bc8b0255..395d4db3103d7 100644 --- a/llvm/lib/MC/MCWinCOFFStreamer.cpp +++ b/llvm/lib/MC/MCWinCOFFStreamer.cpp @@ -29,7 +29,6 @@ #include "llvm/MC/MCSectionCOFF.h" #include "llvm/MC/MCSymbolCOFF.h" #include "llvm/MC/MCTargetOptions.h" -#include "llvm/MC/MCValue.h" #include "llvm/MC/MCWinCOFFObjectWriter.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" @@ -44,91 +43,6 @@ using namespace llvm; #define DEBUG_TYPE "WinCOFFStreamer" -/// MCExpr that represents the physical number for the sections that contains -/// a symbol. -class MCCOFFSectionNumberTargetExpr final : public MCTargetExpr { - const MCSymbol &SectionSymbol; - const WinCOFFObjectWriter &Writer; - - MCCOFFSectionNumberTargetExpr(const MCSymbol &SectionSymbol_, - const WinCOFFObjectWriter &Writer_) - : SectionSymbol(SectionSymbol_), Writer(Writer_) {} - -public: - static MCCOFFSectionNumberTargetExpr * - create(const MCSymbol &SectionSymbol, const WinCOFFObjectWriter &Writer, - MCContext &Ctx) { - return new (Ctx) MCCOFFSectionNumberTargetExpr(SectionSymbol, Writer); - } - - void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override { - OS << ":secnum:"; - SectionSymbol.print(OS, MAI); - } - - bool evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, - const MCFixup *Fixup) const override { - auto sectionNumber = Writer.getSectionNumber(SectionSymbol.getSection()); - assert(sectionNumber != 0 && - "Containing section was not assigned a number"); - Res = MCValue::get(sectionNumber); - return true; - } - - void visitUsedExpr(MCStreamer &Streamer) const override { - // Contains no sub-expressions. - } - - MCFragment *findAssociatedFragment() const override { - return SectionSymbol.getFragment(); - } - - void fixELFSymbolsInTLSFixups(MCAssembler &) const override { - llvm_unreachable("Not supported for ELF"); - } -}; - -/// MCExpr that represents the offset to a symbol from the beginning of its -/// section. -class MCCOFFSectionOffsetTargetExpr final : public MCTargetExpr { - const MCSymbol &Symbol; - - MCCOFFSectionOffsetTargetExpr(const MCSymbol &Symbol_) : Symbol(Symbol_) {} - -public: - static MCCOFFSectionOffsetTargetExpr *create(const MCSymbol &Symbol, - MCContext &Ctx) { - return new (Ctx) MCCOFFSectionOffsetTargetExpr(Symbol); - } - - void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override { - OS << ":secoffset:"; - Symbol.print(OS, MAI); - } - - bool evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, - const MCFixup *Fixup) const override { - uint64_t CallsiteOffset = 0; - if (!Asm->getSymbolOffset(Symbol, CallsiteOffset)) { - return true; - } - Res = MCValue::get(CallsiteOffset); - return true; - } - - void visitUsedExpr(MCStreamer &Streamer) const override { - // Contains no sub-expressions. - } - - MCFragment *findAssociatedFragment() const override { - return Symbol.getFragment(); - } - - void fixELFSymbolsInTLSFixups(MCAssembler &) const override { - llvm_unreachable("Not supported for ELF"); - } -}; - MCWinCOFFStreamer::MCWinCOFFStreamer(MCContext &Context, std::unique_ptr MAB, std::unique_ptr CE, @@ -366,34 +280,6 @@ void MCWinCOFFStreamer::emitCOFFImgRel32(const MCSymbol *Symbol, DF->appendContents(4, 0); } -void MCWinCOFFStreamer::emitCOFFSecNumber(MCSymbol const *Symbol) { - visitUsedSymbol(*Symbol); - MCDataFragment *DF = getOrCreateDataFragment(); - // Create Symbol for section number. 
- const MCExpr *MCE = MCCOFFSectionNumberTargetExpr::create( - *Symbol, this->getWriter(), getContext()); - // Build the relocation. - MCFixup Fixup = MCFixup::create(DF->getContents().size(), MCE, FK_Data_4); - // Record the relocation. - DF->getFixups().push_back(Fixup); - // Emit 4 bytes (zeros) to the object file. - DF->appendContents(4, 0); -} - -void MCWinCOFFStreamer::emitCOFFSecOffset(MCSymbol const *Symbol) { - visitUsedSymbol(*Symbol); - MCDataFragment *DF = getOrCreateDataFragment(); - // Create Symbol for section offset. - const MCExpr *MCE = - MCCOFFSectionOffsetTargetExpr::create(*Symbol, getContext()); - // Build the relocation. - MCFixup Fixup = MCFixup::create(DF->getContents().size(), MCE, FK_Data_4); - // Record the relocation. - DF->getFixups().push_back(Fixup); - // Emit 4 bytes (zeros) to the object file. - DF->appendContents(4, 0); -} - void MCWinCOFFStreamer::emitCommonSymbol(MCSymbol *S, uint64_t Size, Align ByteAlignment) { auto *Symbol = cast(S); diff --git a/llvm/lib/MC/WinCOFFObjectWriter.cpp b/llvm/lib/MC/WinCOFFObjectWriter.cpp index 39e02d0522bcf..09d2b08e43050 100644 --- a/llvm/lib/MC/WinCOFFObjectWriter.cpp +++ b/llvm/lib/MC/WinCOFFObjectWriter.cpp @@ -163,7 +163,6 @@ class llvm::WinCOFFWriter { const MCFixup &Fixup, MCValue Target, uint64_t &FixedValue); uint64_t writeObject(MCAssembler &Asm); - int getSectionNumber(const MCSection &Section) const; private: COFFSymbol *createSymbol(StringRef Name); @@ -819,15 +818,6 @@ void WinCOFFWriter::executePostLayoutBinding(MCAssembler &Asm) { if (!Symbol.isTemporary() || cast(Symbol).getClass() == COFF::IMAGE_SYM_CLASS_STATIC) defineSymbol(Asm, Symbol); - - UseBigObj = Sections.size() > COFF::MaxNumberOfSections16; - Header.NumberOfSections = Sections.size(); - Header.NumberOfSymbols = 0; - if (Sections.size() > INT32_MAX) - report_fatal_error( - "PE COFF object files can't have more than 2147483647 sections"); - - assignSectionNumbers(); } void WinCOFFWriter::recordRelocation(MCAssembler &Asm, @@ -990,7 +980,16 @@ static std::time_t getTime() { uint64_t WinCOFFWriter::writeObject(MCAssembler &Asm) { uint64_t StartOffset = W.OS.tell(); + if (Sections.size() > INT32_MAX) + report_fatal_error( + "PE COFF object files can't have more than 2147483647 sections"); + + UseBigObj = Sections.size() > COFF::MaxNumberOfSections16; + Header.NumberOfSections = Sections.size(); + Header.NumberOfSymbols = 0; + setWeakDefaultNames(); + assignSectionNumbers(); if (Mode != DwoOnly) createFileSymbols(Asm); @@ -1144,10 +1143,6 @@ uint64_t WinCOFFWriter::writeObject(MCAssembler &Asm) { return W.OS.tell() - StartOffset; } -int WinCOFFWriter::getSectionNumber(const MCSection &Section) const { - return SectionMap.at(&Section)->Number; -} - //------------------------------------------------------------------------------ // WinCOFFObjectWriter class implementation @@ -1199,10 +1194,6 @@ uint64_t WinCOFFObjectWriter::writeObject(MCAssembler &Asm) { return TotalSize; } -int WinCOFFObjectWriter::getSectionNumber(const MCSection &Section) const { - return ObjWriter->getSectionNumber(Section); -} - MCWinCOFFObjectTargetWriter::MCWinCOFFObjectTargetWriter(unsigned Machine_) : Machine(Machine_) {} diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index 27e65d60122fd..9d9d9889b3858 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -24,7 +24,6 @@ #include "MCTargetDesc/AArch64TargetStreamer.h" #include 
"TargetInfo/AArch64TargetInfo.h" #include "Utils/AArch64BaseInfo.h" -#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" @@ -78,11 +77,6 @@ static cl::opt PtrauthAuthChecks( cl::desc("Check pointer authentication auth/resign failures"), cl::init(Default)); -static cl::opt EnableImportCallOptimization( - "aarch64-win-import-call-optimization", cl::Hidden, - cl::desc("Enable import call optimization for AArch64 Windows"), - cl::init(false)); - #define DEBUG_TYPE "asm-printer" namespace { @@ -95,8 +89,6 @@ class AArch64AsmPrinter : public AsmPrinter { #ifndef NDEBUG unsigned InstsEmitted; #endif - DenseMap>> - SectionToImportedFunctionCalls; public: AArch64AsmPrinter(TargetMachine &TM, std::unique_ptr Streamer) @@ -301,11 +293,6 @@ class AArch64AsmPrinter : public AsmPrinter { MCSymbol *LazyPointer) override; void emitMachOIFuncStubHelperBody(Module &M, const GlobalIFunc &GI, MCSymbol *LazyPointer) override; - - /// Checks if this instruction is part of a sequence that is eligle for import - /// call optimization and, if so, records it to be emitted in the import call - /// section. - void recordIfImportCall(const MachineInstr *BranchInst); }; } // end anonymous namespace @@ -943,38 +930,6 @@ void AArch64AsmPrinter::emitEndOfAsmFile(Module &M) { // Emit stack and fault map information. FM.serializeToFaultMapSection(); - // If import call optimization is enabled, emit the appropriate section. - // We do this whether or not we recorded any import calls. - if (EnableImportCallOptimization && TT.isOSBinFormatCOFF()) { - OutStreamer->switchSection(getObjFileLowering().getImportCallSection()); - - // Section always starts with some magic. - constexpr char ImpCallMagic[12] = "Imp_Call_V1"; - OutStreamer->emitBytes(StringRef{ImpCallMagic, sizeof(ImpCallMagic)}); - - // Layout of this section is: - // Per section that contains calls to imported functions: - // uint32_t SectionSize: Size in bytes for information in this section. - // uint32_t Section Number - // Per call to imported function in section: - // uint32_t Kind: the kind of imported function. - // uint32_t BranchOffset: the offset of the branch instruction in its - // parent section. - // uint32_t TargetSymbolId: the symbol id of the called function. - for (auto &[Section, CallsToImportedFuncs] : - SectionToImportedFunctionCalls) { - unsigned SectionSize = - sizeof(uint32_t) * (2 + 3 * CallsToImportedFuncs.size()); - OutStreamer->emitInt32(SectionSize); - OutStreamer->emitCOFFSecNumber(Section->getBeginSymbol()); - for (auto &[CallsiteSymbol, CalledSymbol] : CallsToImportedFuncs) { - // Kind is always IMAGE_REL_ARM64_DYNAMIC_IMPORT_CALL (0x13). 
- OutStreamer->emitInt32(0x13); - OutStreamer->emitCOFFSecOffset(CallsiteSymbol); - OutStreamer->emitCOFFSymbolIndex(CalledSymbol); - } - } - } } void AArch64AsmPrinter::emitLOHs() { @@ -2748,7 +2703,6 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { case AArch64::TCRETURNriALL: { emitPtrauthTailCallHardening(MI); - recordIfImportCall(MI); MCInst TmpInst; TmpInst.setOpcode(AArch64::BR); TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg())); @@ -2760,7 +2714,6 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { MCOperand Dest; MCInstLowering.lowerOperand(MI->getOperand(0), Dest); - recordIfImportCall(MI); MCInst TmpInst; TmpInst.setOpcode(AArch64::B); TmpInst.addOperand(Dest); @@ -3091,14 +3044,6 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { TS->emitARM64WinCFISaveAnyRegQPX(MI->getOperand(0).getImm(), -MI->getOperand(2).getImm()); return; - - case AArch64::BLR: - case AArch64::BR: - recordIfImportCall(MI); - MCInst TmpInst; - MCInstLowering.Lower(MI, TmpInst); - EmitToStreamer(*OutStreamer, TmpInst); - return; } // Finally, do the automated lowerings for everything else. @@ -3107,23 +3052,6 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { EmitToStreamer(*OutStreamer, TmpInst); } -void AArch64AsmPrinter::recordIfImportCall( - const llvm::MachineInstr *BranchInst) { - if (!EnableImportCallOptimization || - !TM.getTargetTriple().isOSBinFormatCOFF()) - return; - - auto [GV, OpFlags] = BranchInst->getMF()->tryGetCalledGlobal(BranchInst); - if (GV && GV->hasDLLImportStorageClass()) { - auto *CallSiteSymbol = MMI->getContext().createNamedTempSymbol("impcall"); - OutStreamer->emitLabel(CallSiteSymbol); - - auto *CalledSymbol = MCInstLowering.GetGlobalValueSymbol(GV, OpFlags); - SectionToImportedFunctionCalls[OutStreamer->getCurrentSectionOnly()] - .push_back({CallSiteSymbol, CalledSymbol}); - } -} - void AArch64AsmPrinter::emitMachOIFuncStubBody(Module &M, const GlobalIFunc &GI, MCSymbol *LazyPointer) { // _ifunc: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 278dd95cd969d..7e82a433a85ad 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -9450,14 +9450,12 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol // node so that legalize doesn't hack it. 
- const GlobalValue *CalledGlobal = nullptr; - unsigned OpFlags = 0; if (auto *G = dyn_cast(Callee)) { - CalledGlobal = G->getGlobal(); - OpFlags = Subtarget->classifyGlobalFunctionReference(CalledGlobal, - getTargetMachine()); + auto GV = G->getGlobal(); + unsigned OpFlags = + Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()); if (OpFlags & AArch64II::MO_GOT) { - Callee = DAG.getTargetGlobalAddress(CalledGlobal, DL, PtrVT, 0, OpFlags); + Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags); Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); } else { const GlobalValue *GV = G->getGlobal(); @@ -9577,8 +9575,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge); DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); - if (CalledGlobal) - DAG.addCalledGlobal(Ret.getNode(), CalledGlobal, OpFlags); return Ret; } @@ -9590,8 +9586,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); InGlue = Chain.getValue(1); DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); - if (CalledGlobal) - DAG.addCalledGlobal(Chain.getNode(), CalledGlobal, OpFlags); uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0; diff --git a/llvm/test/CodeGen/AArch64/win-import-call-optimization-nocalls.ll b/llvm/test/CodeGen/AArch64/win-import-call-optimization-nocalls.ll deleted file mode 100644 index 81d6d6369dcbf..0000000000000 --- a/llvm/test/CodeGen/AArch64/win-import-call-optimization-nocalls.ll +++ /dev/null @@ -1,18 +0,0 @@ -; RUN: llc -mtriple=aarch64-pc-windows-msvc -aarch64-win-import-call-optimization < %s | FileCheck %s - -define dso_local void @normal_call() local_unnamed_addr { -entry: - call void @a() - ret void -} -; CHECK-LABEL: normal_call: -; CHECK: bl a - -declare void @a() local_unnamed_addr - -; Even if there are no calls to imported functions, we still need to emit the -; .impcall section. 
- -; CHECK-LABEL .section .impcall,"yi" -; CHECK-NEXT .asciz "Imp_Call_V1" -; CHECK-NOT .secnum diff --git a/llvm/test/CodeGen/AArch64/win-import-call-optimization.ll b/llvm/test/CodeGen/AArch64/win-import-call-optimization.ll deleted file mode 100644 index 6bb118ba1e159..0000000000000 --- a/llvm/test/CodeGen/AArch64/win-import-call-optimization.ll +++ /dev/null @@ -1,48 +0,0 @@ -; RUN: llc -mtriple=aarch64-pc-windows-msvc -aarch64-win-import-call-optimization < %s | FileCheck %s --check-prefix=CHECK-ENABLED -; RUN: llc -mtriple=aarch64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK-DISABLED - -; CHECK-DISABLED-NOT: .section .impcall - -define dso_local void @normal_call() local_unnamed_addr section "nc_sect" { -entry: - call void @a() - call void @a() - ret void -} -; CHECK-ENABLED-LABEL: normal_call: -; CHECK-ENABLED: adrp [[ADRPREG:x[0-9]+]], __imp_a -; CHECK-ENABLED-NEXT: ldr [[LDRREG:x[0-9]+]], [[[ADRPREG]], :lo12:__imp_a] -; CHECK-ENABLED-NEXT: .Limpcall0: -; CHECK-ENABLED-NEXT: blr [[LDRREG]] -; CHECK-ENABLED-NEXT: .Limpcall1: -; CHECK-ENABLED-NEXT: blr [[LDRREG]] - -define dso_local void @tail_call() local_unnamed_addr section "tc_sect" { -entry: - tail call void @b() - ret void -} -; CHECK-ENABLED-LABEL: tail_call: -; CHECK-ENABLED: adrp [[ADRPREG:x[0-9]+]], __imp_b -; CHECK-ENABLED-NEXT: ldr [[LDRREG:x[0-9]+]], [[[ADRPREG]], :lo12:__imp_b] -; CHECK-ENABLED-NEXT: .Limpcall2: -; CHECK-ENABLED-NEXT: br [[LDRREG]] - -declare dllimport void @a() local_unnamed_addr -declare dllimport void @b() local_unnamed_addr - -; CHECK-ENABLED-LABEL .section .impcall,"yi" -; CHECK-ENABLED-NEXT .asciz "Imp_Call_V1" -; CHECK-ENABLED-NEXT .word 32 -; CHECK-ENABLED-NEXT .secnum nc_sect -; CHECK-ENABLED-NEXT .word 19 -; CHECK-ENABLED-NEXT .secoffset .Limpcall0 -; CHECK-ENABLED-NEXT .symidx __imp_a -; CHECK-ENABLED-NEXT .word 19 -; CHECK-ENABLED-NEXT .secoffset .Limpcall1 -; CHECK-ENABLED-NEXT .symidx __imp_a -; CHECK-ENABLED-NEXT .word 20 -; CHECK-ENABLED-NEXT .secnum tc_sect -; CHECK-ENABLED-NEXT .word 19 -; CHECK-ENABLED-NEXT .secoffset .Limpcall2 -; CHECK-ENABLED-NEXT .symidx __imp_b diff --git a/llvm/test/CodeGen/MIR/AArch64/called-globals.mir b/llvm/test/CodeGen/MIR/AArch64/called-globals.mir deleted file mode 100644 index cf0f0a23e2d91..0000000000000 --- a/llvm/test/CodeGen/MIR/AArch64/called-globals.mir +++ /dev/null @@ -1,61 +0,0 @@ -# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass none -o - %s | FileCheck %s - ---- | - declare dllimport void @callee_func() local_unnamed_addr - - define dso_local void @caller() local_unnamed_addr { - entry: - call void @callee_func() - call void @callee_func() - ret void - } -... 
---- -name: caller -stack: - - { id: 0, name: '', type: spill-slot, offset: -8, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '$lr', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 1, name: '', type: spill-slot, offset: -16, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '$x19', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } -calledGlobals: - - bb: 0 - offset: 7 - callee: callee_func - flags: 144 - - bb: 0 - offset: 8 - callee: callee_func - flags: 144 -body: | - bb.0.entry: - liveins: $x19, $lr - - early-clobber $sp = frame-setup STRXpre killed $x19, $sp, -16 :: (store (s64) into %stack.1) - frame-setup SEH_SaveReg_X 19, -16 - frame-setup STRXui killed $lr, $sp, 1 :: (store (s64) into %stack.0) - frame-setup SEH_SaveReg 30, 8 - frame-setup SEH_PrologEnd - $x19 = ADRP target-flags(aarch64-page, aarch64-got, aarch64-dllimport) @callee_func - renamable $x19 = LDRXui killed $x19, target-flags(aarch64-pageoff, aarch64-got, aarch64-nc, aarch64-dllimport) @callee_func - BLR renamable $x19, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp - BLR killed renamable $x19, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp - frame-destroy SEH_EpilogStart - $lr = frame-destroy LDRXui $sp, 1 :: (load (s64) from %stack.0) - frame-destroy SEH_SaveReg 30, 8 - early-clobber $sp, $x19 = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.1) - frame-destroy SEH_SaveReg_X 19, -16 - frame-destroy SEH_EpilogEnd - RET undef $lr -... - -# CHECK-LABEL: calledGlobals: -# CHECK-NEXT: - bb: 0 -# CHECK-NEXT: offset: 7 -# CHECK-NEXT: callee: callee_func -# CHECK-NEXT: flags: 144 -# CHECK-NEXT: - bb: 0 -# CHECK-NEXT: offset: 8 -# CHECK-NEXT: callee: callee_func -# CHECK-NEXT: flags: 144 diff --git a/llvm/test/CodeGen/MIR/X86/call-site-info-error1.mir b/llvm/test/CodeGen/MIR/X86/call-site-info-error1.mir index e4dab779216a8..096a80f77dbb6 100644 --- a/llvm/test/CodeGen/MIR/X86/call-site-info-error1.mir +++ b/llvm/test/CodeGen/MIR/X86/call-site-info-error1.mir @@ -1,5 +1,5 @@ # RUN: not llc -mtriple=x86_64-- -run-pass none -debug-entry-values %s -o - 2>&1 | FileCheck %s -# CHECK: baa instruction block out of range. Unable to reference bb:1 +# CHECK: baa call instruction block out of range. Unable to reference bb:1 --- | define dso_local i32 @baa(i32 %a) local_unnamed_addr { entry: diff --git a/llvm/test/CodeGen/MIR/X86/call-site-info-error2.mir b/llvm/test/CodeGen/MIR/X86/call-site-info-error2.mir index 183610b326eeb..bd5b2451a8d76 100644 --- a/llvm/test/CodeGen/MIR/X86/call-site-info-error2.mir +++ b/llvm/test/CodeGen/MIR/X86/call-site-info-error2.mir @@ -1,5 +1,5 @@ # RUN: not llc -mtriple=x86_64-- -run-pass none -debug-entry-values %s -o - 2>&1 | FileCheck %s -# CHECK: baa instruction offset out of range. Unable to reference instruction at bb: 0 at offset:1 +# CHECK: baa call instruction offset out of range. 
Unable to reference instruction at bb: 0 at offset:1 --- | define dso_local i32 @baa(i32 %a) local_unnamed_addr { entry: diff --git a/llvm/test/MC/AArch64/win-import-call-optimization.s b/llvm/test/MC/AArch64/win-import-call-optimization.s deleted file mode 100644 index f26e17b9b62cc..0000000000000 --- a/llvm/test/MC/AArch64/win-import-call-optimization.s +++ /dev/null @@ -1,72 +0,0 @@ -// RUN: llvm-mc -triple aarch64-windows-msvc -filetype obj -o %t.obj %s -// RUN: llvm-readobj --sections --sd --relocs %t.obj | FileCheck %s - -.section nc_sect,"xr" -normal_call: - str x30, [sp, #-16]! // 8-byte Folded Spill - adrp x8, __imp_a - ldr x8, [x8, :lo12:__imp_a] -.Limpcall0: - blr x8 - ldr x30, [sp], #16 // 8-byte Folded Reload - ret - -.section tc_sect,"xr" -tail_call: - adrp x8, __imp_b - ldr x8, [x8, :lo12:__imp_b] -.Limpcall1: - br x8 - -.section .impcall,"yi" -.asciz "Imp_Call_V1" -.word 20 -.secnum nc_sect -.word 19 -.secoffset .Limpcall0 -.symidx __imp_a -.word 20 -.secnum tc_sect -.word 19 -.secoffset .Limpcall1 -.symidx __imp_b - -// CHECK-LABEL: Name: .impcall (2E 69 6D 70 63 61 6C 6C) -// CHECK-NEXT: VirtualSize: 0x0 -// CHECK-NEXT: VirtualAddress: 0x0 -// CHECK-NEXT: RawDataSize: 52 -// CHECK-NEXT: PointerToRawData: 0x150 -// CHECK-NEXT: PointerToRelocations: 0x0 -// CHECK-NEXT: PointerToLineNumbers: 0x0 -// CHECK-NEXT: RelocationCount: 0 -// CHECK-NEXT: LineNumberCount: 0 -// CHECK-NEXT: Characteristics [ -// CHECK-NEXT: IMAGE_SCN_ALIGN_4BYTES -// CHECK-NEXT: IMAGE_SCN_LNK_INFO -// CHECK-NEXT: ] -// CHECK-NEXT: SectionData ( -// CHECK-NEXT: 0000: 496D705F 43616C6C 5F563100 14000000 |Imp_Call_V1.....| -// CHECK-NEXT: 0010: -// CHECK-SAME: [[#%.2X,NCSECT:]]000000 -// CHECK-SAME: 13000000 -// CHECK-SAME: [[#%.2X,NCOFFSET:]]000000 -// CHECK-SAME: [[#%.2X,NCSYM:]]000000 -// CHECK-NEXT: 0020: -// CHECK-SAME: 14000000 -// CHECK-SAME: [[#%.2X,TCSECT:]]000000 -// CHECK-SAME: 13000000 -// CHECK-SAME: [[#%.2X,TCOFFSET:]]000000 -// CHECK-NEXT: 0030: -// CHECK-SAME: [[#%.2X,TCSYM:]]000000 -// CHECK-NEXT: ) - -// CHECK-LABEL: Relocations [ -// CHECK-NEXT: Section ([[#%u,NCSECT]]) nc_sect { -// CHECK-NEXT: 0x[[#%x,NCOFFSET - 8]] IMAGE_REL_ARM64_PAGEBASE_REL21 __imp_a ([[#%u,NCSYM]]) -// CHECK-NEXT: 0x[[#%x,NCOFFSET - 4]] IMAGE_REL_ARM64_PAGEOFFSET_12L __imp_a ([[#%u,NCSYM]]) -// CHECK-NEXT: } -// CHECK-NEXT: Section ([[#%u,TCSECT]]) tc_sect { -// CHECK-NEXT: 0x[[#%x,TCOFFSET - 8]] IMAGE_REL_ARM64_PAGEBASE_REL21 __imp_b ([[#%u,TCSYM]]) -// CHECK-NEXT: 0x[[#%x,TCOFFSET - 4]] IMAGE_REL_ARM64_PAGEOFFSET_12L __imp_b ([[#%u,TCSYM]]) -// CHECK-NEXT: } -// CHECK-NEXT: ] diff --git a/llvm/test/MC/COFF/bad-parse.s b/llvm/test/MC/COFF/bad-parse.s deleted file mode 100644 index 2491f41abeb4e..0000000000000 --- a/llvm/test/MC/COFF/bad-parse.s +++ /dev/null @@ -1,13 +0,0 @@ -// RUN: not llvm-mc -filetype=obj -triple i386-pc-win32 %s 2>&1 | FileCheck %s - - .data - -// CHECK: [[@LINE+1]]:{{[0-9]+}}: error: expected identifier in directive - .secnum -// CHECK: [[@LINE+1]]:{{[0-9]+}}: error: unexpected token in directive - .secnum section extra - -// CHECK: [[@LINE+1]]:{{[0-9]+}}: error: expected identifier in directive - .secoffset -// CHECK: [[@LINE+1]]:{{[0-9]+}}: error: unexpected token in directive - .secoffset section extra From de252e7777c1c6b45626a58aa34ea99dee58cc40 Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Mon, 13 Jan 2025 14:37:06 -0500 Subject: [PATCH 316/408] [lldb] Add amd64 ArchSpec (#122533) amd64 is used on OpenBSD. 
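As a concrete illustration, a minimal sketch of how the new core resolves
(this mirrors the unit test added below; header path and namespace are
assumed from the lldb tree):

    #include "lldb/Utility/ArchSpec.h"

    lldb_private::ArchSpec AS;
    AS.SetTriple("amd64-unknown-openbsd"); // recognized after this patch
    // AS.GetArchitectureName() == "amd64"
    // AS.GetCore()             == ArchSpec::eCore_x86_64_amd64
    // AS.GetTriple().getArch() == llvm::Triple::x86_64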
--- lldb/include/lldb/Utility/ArchSpec.h | 2 ++ lldb/source/Utility/ArchSpec.cpp | 4 ++++ lldb/unittests/Utility/ArchSpecTest.cpp | 6 ++++++ 3 files changed, 12 insertions(+) diff --git a/lldb/include/lldb/Utility/ArchSpec.h b/lldb/include/lldb/Utility/ArchSpec.h index 2a74058673bae..7e9bc23a75acb 100644 --- a/lldb/include/lldb/Utility/ArchSpec.h +++ b/lldb/include/lldb/Utility/ArchSpec.h @@ -215,6 +215,8 @@ class ArchSpec { eCore_x86_64_x86_64, eCore_x86_64_x86_64h, // Haswell enabled x86_64 + eCore_x86_64_amd64, + eCore_hexagon_generic, eCore_hexagon_hexagonv4, eCore_hexagon_hexagonv5, diff --git a/lldb/source/Utility/ArchSpec.cpp b/lldb/source/Utility/ArchSpec.cpp index 85bb85044ec15..b13e8ff1ec373 100644 --- a/lldb/source/Utility/ArchSpec.cpp +++ b/lldb/source/Utility/ArchSpec.cpp @@ -218,6 +218,9 @@ static const CoreDefinition g_core_definitions[] = { ArchSpec::eCore_x86_64_x86_64, "x86_64"}, {eByteOrderLittle, 8, 1, 15, llvm::Triple::x86_64, ArchSpec::eCore_x86_64_x86_64h, "x86_64h"}, + {eByteOrderLittle, 8, 1, 15, llvm::Triple::x86_64, + ArchSpec::eCore_x86_64_amd64, "amd64"}, + {eByteOrderLittle, 4, 4, 4, llvm::Triple::hexagon, ArchSpec::eCore_hexagon_generic, "hexagon"}, {eByteOrderLittle, 4, 4, 4, llvm::Triple::hexagon, @@ -1227,6 +1230,7 @@ static bool cores_match(const ArchSpec::Core core1, const ArchSpec::Core core2, break; case ArchSpec::eCore_x86_64_x86_64h: + case ArchSpec::eCore_x86_64_amd64: if (!enforce_exact_match) { try_inverse = false; if (core2 == ArchSpec::eCore_x86_64_x86_64) diff --git a/lldb/unittests/Utility/ArchSpecTest.cpp b/lldb/unittests/Utility/ArchSpecTest.cpp index de3590b73bbaa..74a4b48456b01 100644 --- a/lldb/unittests/Utility/ArchSpecTest.cpp +++ b/lldb/unittests/Utility/ArchSpecTest.cpp @@ -129,6 +129,12 @@ TEST(ArchSpecTest, TestSetTriple) { EXPECT_STREQ("msp430", AS.GetArchitectureName()); EXPECT_EQ(ArchSpec::eCore_msp430, AS.GetCore()); + AS = ArchSpec(); + EXPECT_TRUE(AS.SetTriple("amd64-unknown-openbsd")); + EXPECT_EQ(llvm::Triple::x86_64, AS.GetTriple().getArch()); + EXPECT_STREQ("amd64", AS.GetArchitectureName()); + EXPECT_EQ(ArchSpec::eCore_x86_64_amd64, AS.GetCore()); + // Various flavors of invalid triples. AS = ArchSpec(); EXPECT_FALSE(AS.SetTriple("unknown-unknown-unknown")); From abba01adad5dfc54f781357d924c8021c9306615 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 13 Jan 2025 11:37:37 -0800 Subject: [PATCH 317/408] [ADT] Deprecate PointerUnion::{is,get} (NFC) (#122623) PointerUnion::{is,get} have been soft deprecated in PointerUnion.h: // FIXME: Replace the uses of is(), get() and dyn_cast() with // isa, cast and the llvm::dyn_cast This patch actually deprecates them with [[deprecated]]. I'm not touching PointerUnion::dyn_cast for now because we have not migrated away from it yet. --- llvm/include/llvm/ADT/PointerUnion.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/ADT/PointerUnion.h b/llvm/include/llvm/ADT/PointerUnion.h index 7d4ed02b62262..cdbd76d7f505b 100644 --- a/llvm/include/llvm/ADT/PointerUnion.h +++ b/llvm/include/llvm/ADT/PointerUnion.h @@ -147,12 +147,18 @@ class PointerUnion // isa, cast and the llvm::dyn_cast /// Test if the Union currently holds the type matching T. - template inline bool is() const { return isa(*this); } + template + [[deprecated("Use isa instead")]] + inline bool is() const { + return isa(*this); + } /// Returns the value of the specified pointer type. /// /// If the specified pointer type is incorrect, assert. 
- template inline T get() const { + template + [[deprecated("Use cast instead")]] + inline T get() const { assert(isa(*this) && "Invalid accessor called"); return cast(*this); } From 8ed99689038bcc89dca92a4a3a13a4ede166bd7e Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 13 Jan 2025 11:38:00 -0800 Subject: [PATCH 318/408] [AST] Migrate away from PointerUnion::dyn_cast (NFC) (#122651) Note that PointerUnion::dyn_cast has been soft deprecated in PointerUnion.h: // FIXME: Replace the uses of is(), get() and dyn_cast() with // isa, cast and the llvm::dyn_cast Literal migration would result in dyn_cast_if_present (see the definition of PointerUnion::dyn_cast), but this patch uses dyn_cast because we expect Ptr to be nonnull. --- clang/include/clang/AST/DeclBase.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h index 82932e098c86f..77abd8b657a61 100644 --- a/clang/include/clang/AST/DeclBase.h +++ b/clang/include/clang/AST/DeclBase.h @@ -1334,7 +1334,7 @@ class DeclListNode { reference operator*() const { assert(Ptr && "dereferencing end() iterator"); - if (DeclListNode *CurNode = Ptr.dyn_cast()) + if (DeclListNode *CurNode = dyn_cast(Ptr)) return CurNode->D; return cast(Ptr); } @@ -1344,7 +1344,7 @@ class DeclListNode { inline iterator &operator++() { // ++It assert(!Ptr.isNull() && "Advancing empty iterator"); - if (DeclListNode *CurNode = Ptr.dyn_cast()) + if (DeclListNode *CurNode = dyn_cast(Ptr)) Ptr = CurNode->Rest; else Ptr = nullptr; From 19c0a6b5eb3a0c0619fccc140740500737fdcd47 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 13 Jan 2025 11:38:20 -0800 Subject: [PATCH 319/408] [Analysis] Migrate away from PointerUnion::dyn_cast (NFC) (#122652) Note that PointerUnion::dyn_cast has been soft deprecated in PointerUnion.h: // FIXME: Replace the uses of is(), get() and dyn_cast() with // isa, cast and the llvm::dyn_cast Literal migration would result in dyn_cast_if_present (see the definition of PointerUnion::dyn_cast), but this patch uses dyn_cast because we expect Ctx->FunArgs to be nonnull. 
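To make the distinction concrete, a minimal sketch of the two free-function
spellings on a hypothetical union (toy element types, not code from this
patch; semantics as described above):

    #include "llvm/ADT/PointerUnion.h"

    void demo(llvm::PointerUnion<int *, float *> PU) {
      // Spelling used by this migration; asserts that PU is nonnull.
      if (int *IP = llvm::dyn_cast<int *>(PU)) {
        // PU holds an int *.
        (void)IP;
      }
      // Literal replacement for the member dyn_cast(); additionally
      // tolerates a null PU and returns null for it.
      if (int *IP = llvm::dyn_cast_if_present<int *>(PU)) {
        (void)IP;
      }
    }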
--- clang/lib/Analysis/ThreadSafetyCommon.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Analysis/ThreadSafetyCommon.cpp b/clang/lib/Analysis/ThreadSafetyCommon.cpp index 050daee1168d4..13cd7e26dc16f 100644 --- a/clang/lib/Analysis/ThreadSafetyCommon.cpp +++ b/clang/lib/Analysis/ThreadSafetyCommon.cpp @@ -336,7 +336,7 @@ til::SExpr *SExprBuilder::translateDeclRefExpr(const DeclRefExpr *DRE, : (cast(D)->getCanonicalDecl() == Canonical)) { // Substitute call arguments for references to function parameters if (const Expr *const *FunArgs = - Ctx->FunArgs.dyn_cast()) { + dyn_cast(Ctx->FunArgs)) { assert(I < Ctx->NumArgs); return translate(FunArgs[I], Ctx->Prev); } From 01ee66ea62b2db4d20a9596a450466751aa82624 Mon Sep 17 00:00:00 2001 From: Kelvin Li Date: Mon, 13 Jan 2025 14:44:29 -0500 Subject: [PATCH 320/408] [flang][OMP] change malloc.h to stdlib.h in collapse_test.inc (NFC) (#122711) --- openmp/runtime/test/worksharing/for/collapse_test.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openmp/runtime/test/worksharing/for/collapse_test.inc b/openmp/runtime/test/worksharing/for/collapse_test.inc index 3075bd04e958f..52f919dfbcc7d 100644 --- a/openmp/runtime/test/worksharing/for/collapse_test.inc +++ b/openmp/runtime/test/worksharing/for/collapse_test.inc @@ -1,5 +1,5 @@ #include -#include +#include #include #include From e3cd88a7be1dfd912bb6e7c7e888e7b442ffb5de Mon Sep 17 00:00:00 2001 From: Slava Zakharin Date: Mon, 13 Jan 2025 11:56:11 -0800 Subject: [PATCH 321/408] [flang] Fixed StackArrays assertion after #121919. (#122550) `findAllocaLoopInsertionPoint()` hit assertion not being able to find the `fir.freemem` because of the `fir.convert`. I think it is better to look for `fir.freemem` same way with the look-through walk. --- .../lib/Optimizer/Transforms/StackArrays.cpp | 54 +++++++++++-------- flang/test/Transforms/stack-arrays.fir | 42 +++++++++++++++ 2 files changed, 73 insertions(+), 23 deletions(-) diff --git a/flang/lib/Optimizer/Transforms/StackArrays.cpp b/flang/lib/Optimizer/Transforms/StackArrays.cpp index 2a9d3397e87b0..9a6566bef50f1 100644 --- a/flang/lib/Optimizer/Transforms/StackArrays.cpp +++ b/flang/lib/Optimizer/Transforms/StackArrays.cpp @@ -198,7 +198,9 @@ class AllocMemConversion : public mlir::OpRewritePattern { /// Determine where to insert the alloca operation. The returned value should /// be checked to see if it is inside a loop - static InsertionPoint findAllocaInsertionPoint(fir::AllocMemOp &oldAlloc); + static InsertionPoint + findAllocaInsertionPoint(fir::AllocMemOp &oldAlloc, + const llvm::SmallVector &freeOps); private: /// Handle to the DFA (already run) @@ -206,7 +208,9 @@ class AllocMemConversion : public mlir::OpRewritePattern { /// If we failed to find an insertion point not inside a loop, see if it would /// be safe to use an llvm.stacksave/llvm.stackrestore inside the loop - static InsertionPoint findAllocaLoopInsertionPoint(fir::AllocMemOp &oldAlloc); + static InsertionPoint findAllocaLoopInsertionPoint( + fir::AllocMemOp &oldAlloc, + const llvm::SmallVector &freeOps); /// Returns the alloca if it was successfully inserted, otherwise {} std::optional @@ -484,6 +488,22 @@ StackArraysAnalysisWrapper::analyseFunction(mlir::Operation *func) { llvm::DenseSet freedValues; point.appendFreedValues(freedValues); + // Find all fir.freemem operations corresponding to fir.allocmem + // in freedValues. 
It is best to find the association going back + // from fir.freemem to fir.allocmem through the def-use chains, + // so that we can use lookThroughDeclaresAndConverts same way + // the AllocationAnalysis is handling them. + llvm::DenseMap> + allocToFreeMemMap; + func->walk([&](fir::FreeMemOp freeOp) { + mlir::Value memref = lookThroughDeclaresAndConverts(freeOp.getHeapref()); + if (!freedValues.count(memref)) + return; + + auto allocMem = memref.getDefiningOp(); + allocToFreeMemMap[allocMem].push_back(freeOp); + }); + // We only replace allocations which are definately freed on all routes // through the function because otherwise the allocation may have an intende // lifetime longer than the current stack frame (e.g. a heap allocation which @@ -491,7 +511,8 @@ StackArraysAnalysisWrapper::analyseFunction(mlir::Operation *func) { for (mlir::Value freedValue : freedValues) { fir::AllocMemOp allocmem = freedValue.getDefiningOp(); InsertionPoint insertionPoint = - AllocMemConversion::findAllocaInsertionPoint(allocmem); + AllocMemConversion::findAllocaInsertionPoint( + allocmem, allocToFreeMemMap[allocmem]); if (insertionPoint) candidateOps.insert({allocmem, insertionPoint}); } @@ -578,8 +599,9 @@ static bool isInLoop(mlir::Operation *op) { op->getParentOfType(); } -InsertionPoint -AllocMemConversion::findAllocaInsertionPoint(fir::AllocMemOp &oldAlloc) { +InsertionPoint AllocMemConversion::findAllocaInsertionPoint( + fir::AllocMemOp &oldAlloc, + const llvm::SmallVector &freeOps) { // Ideally the alloca should be inserted at the end of the function entry // block so that we do not allocate stack space in a loop. However, // the operands to the alloca may not be available that early, so insert it @@ -596,7 +618,7 @@ AllocMemConversion::findAllocaInsertionPoint(fir::AllocMemOp &oldAlloc) { if (isInLoop(oldAllocOp)) { // where we want to put it is in a loop, and even the old location is in // a loop. Give up. - return findAllocaLoopInsertionPoint(oldAlloc); + return findAllocaLoopInsertionPoint(oldAlloc, freeOps); } return {oldAllocOp}; } @@ -657,28 +679,14 @@ AllocMemConversion::findAllocaInsertionPoint(fir::AllocMemOp &oldAlloc) { return checkReturn(&entryBlock); } -InsertionPoint -AllocMemConversion::findAllocaLoopInsertionPoint(fir::AllocMemOp &oldAlloc) { +InsertionPoint AllocMemConversion::findAllocaLoopInsertionPoint( + fir::AllocMemOp &oldAlloc, + const llvm::SmallVector &freeOps) { mlir::Operation *oldAllocOp = oldAlloc; // This is only called as a last resort. 
We should try to insert at the // location of the old allocation, which is inside of a loop, using // llvm.stacksave/llvm.stackrestore - // find freemem ops - llvm::SmallVector freeOps; - - for (mlir::Operation *user : oldAllocOp->getUsers()) { - if (auto declareOp = mlir::dyn_cast_if_present(user)) { - for (mlir::Operation *user : declareOp->getUsers()) { - if (mlir::isa(user)) - freeOps.push_back(user); - } - } - - if (mlir::isa(user)) - freeOps.push_back(user); - } - assert(freeOps.size() && "DFA should only return freed memory"); // Don't attempt to reason about a stacksave/stackrestore between different diff --git a/flang/test/Transforms/stack-arrays.fir b/flang/test/Transforms/stack-arrays.fir index 444136d53e034..a784cea9bc3a4 100644 --- a/flang/test/Transforms/stack-arrays.fir +++ b/flang/test/Transforms/stack-arrays.fir @@ -418,3 +418,45 @@ func.func @lookthrough() { // CHECK: func.func @lookthrough() { // CHECK: fir.alloca !fir.array<42xi32> // CHECK-NOT: fir.freemem + +// StackArrays is better to find fir.freemem ops corresponding to fir.allocmem +// using the same look through mechanism as during the allocation analysis, +// looking through fir.convert and fir.declare. +func.func @finding_freemem_in_block() { + %c0 = arith.constant 0 : index + %c10_i32 = arith.constant 10 : i32 + %c1_i32 = arith.constant 1 : i32 + %0 = fir.alloca i32 {bindc_name = "k", uniq_name = "k"} + %1 = fir.declare %0 {uniq_name = "k"} : (!fir.ref) -> !fir.ref + fir.store %c1_i32 to %1 : !fir.ref + cf.br ^bb1 +^bb1: // 2 preds: ^bb0, ^bb2 + %2 = fir.load %1 : !fir.ref + %3 = arith.cmpi sle, %2, %c10_i32 : i32 + cf.cond_br %3, ^bb2, ^bb3 +^bb2: // pred: ^bb1 + %4 = fir.declare %1 {fortran_attrs = #fir.var_attrs, uniq_name = "x"} : (!fir.ref) -> !fir.ref + %5 = fir.load %4 : !fir.ref + %6 = fir.convert %5 : (i32) -> index + %7 = arith.cmpi sgt, %6, %c0 : index + %8 = arith.select %7, %6, %c0 : index + %9 = fir.shape %8 : (index) -> !fir.shape<1> + %10 = fir.allocmem !fir.array, %8 {bindc_name = ".tmp.expr_result", uniq_name = ""} + %11 = fir.convert %10 : (!fir.heap>) -> !fir.ref> + %12 = fir.declare %11(%9) {uniq_name = ".tmp.expr_result"} : (!fir.ref>, !fir.shape<1>) -> !fir.ref> + %13 = fir.embox %12(%9) : (!fir.ref>, !fir.shape<1>) -> !fir.box> + %14 = fir.call @_QPfunc(%1) fastmath : (!fir.ref) -> !fir.array + fir.save_result %14 to %12(%9) : !fir.array, !fir.ref>, !fir.shape<1> + fir.call @_QPsub(%13) fastmath : (!fir.box>) -> () + %15 = fir.convert %12 : (!fir.ref>) -> !fir.heap> + fir.freemem %15 : !fir.heap> + %16 = fir.load %1 : !fir.ref + %17 = arith.addi %16, %c1_i32 : i32 + fir.store %17 to %1 : !fir.ref + cf.br ^bb1 +^bb3: // pred: ^bb1 + return +} +// CHECK: func.func @finding_freemem_in_block() { +// CHECK: fir.alloca !fir.array +// CHECK-NOT: fir.freemem From 22a280d3924abf2cd2587dcff247bba884303c57 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Mon, 13 Jan 2025 20:03:58 +0000 Subject: [PATCH 322/408] LICM: teach hoistMinMax about samesign (#122730) Follow up on 4a0d53a (PatternMatch: migrate to CmpPredicate) to get rid of one of the FIXMEs it introduced by replacing a predicate comparison with CmpPredicate::getMatching. 
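For context, a short sketch of the helper's behavior as exercised by the
tests added below (constructor and conversion shown as I understand
llvm/IR/CmpPredicate.h at the time of this patch):

    #include "llvm/IR/CmpPredicate.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // `samesign ugt` is compatible with `sgt`; the signed form is the
    // common predicate the two comparisons agree on.
    CmpPredicate P1(CmpInst::ICMP_UGT, /*HasSameSign=*/true);
    CmpPredicate P2 = CmpInst::ICMP_SGT;
    std::optional<CmpPredicate> M = CmpPredicate::getMatching(P1, P2);
    // M && *M == CmpInst::ICMP_SGT, so the smin/smax hoist can proceed;
    // for incompatible predicates getMatching returns std::nullopt.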
--- llvm/lib/Transforms/Scalar/LICM.cpp | 20 +++++----- llvm/test/Transforms/LICM/min_max.ll | 58 ++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index a5d5eecb1ebf8..0bab01904406e 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -2453,16 +2453,17 @@ static bool hoistMinMax(Instruction &I, Loop &L, ICFLoopSafetyInfo &SafetyInfo, if (!MatchICmpAgainstInvariant(Cond1, P1, LHS1, RHS1) || !MatchICmpAgainstInvariant(Cond2, P2, LHS2, RHS2)) return false; - // FIXME: Use CmpPredicate::getMatching here. - if (P1 != static_cast(P2) || LHS1 != LHS2) + auto MatchingPred = CmpPredicate::getMatching(P1, P2); + if (!MatchingPred || LHS1 != LHS2) return false; // Everything is fine, we can do the transform. - bool UseMin = ICmpInst::isLT(P1) || ICmpInst::isLE(P1); + bool UseMin = ICmpInst::isLT(*MatchingPred) || ICmpInst::isLE(*MatchingPred); assert( - (UseMin || ICmpInst::isGT(P1) || ICmpInst::isGE(P1)) && + (UseMin || ICmpInst::isGT(*MatchingPred) || + ICmpInst::isGE(*MatchingPred)) && "Relational predicate is either less (or equal) or greater (or equal)!"); - Intrinsic::ID id = ICmpInst::isSigned(P1) + Intrinsic::ID id = ICmpInst::isSigned(*MatchingPred) ? (UseMin ? Intrinsic::smin : Intrinsic::smax) : (UseMin ? Intrinsic::umin : Intrinsic::umax); auto *Preheader = L.getLoopPreheader(); @@ -2475,11 +2476,12 @@ static bool hoistMinMax(Instruction &I, Loop &L, ICFLoopSafetyInfo &SafetyInfo, if (isa(I)) RHS2 = Builder.CreateFreeze(RHS2, RHS2->getName() + ".fr"); Value *NewRHS = Builder.CreateBinaryIntrinsic( - id, RHS1, RHS2, nullptr, StringRef("invariant.") + - (ICmpInst::isSigned(P1) ? "s" : "u") + - (UseMin ? "min" : "max")); + id, RHS1, RHS2, nullptr, + StringRef("invariant.") + + (ICmpInst::isSigned(*MatchingPred) ? "s" : "u") + + (UseMin ? "min" : "max")); Builder.SetInsertPoint(&I); - ICmpInst::Predicate P = P1; + ICmpInst::Predicate P = *MatchingPred; if (Inverse) P = ICmpInst::getInversePredicate(P); Value *NewCond = Builder.CreateICmp(P, LHS1, NewRHS); diff --git a/llvm/test/Transforms/LICM/min_max.ll b/llvm/test/Transforms/LICM/min_max.ll index c2bf0a7f20cc1..04f309b785ebc 100644 --- a/llvm/test/Transforms/LICM/min_max.ll +++ b/llvm/test/Transforms/LICM/min_max.ll @@ -242,6 +242,35 @@ exit: ret i32 %iv } +define i32 @test_sgt_samesign(i32 %start, i32 %inv_1, i32 %inv_2) { +; CHECK-LABEL: @test_sgt_samesign( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[INVARIANT_SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[INV_1:%.*]], i32 [[INV_2:%.*]]) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp sgt i32 [[IV]], [[INVARIANT_SMAX]] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi i32 [ [[IV]], [[LOOP]] ] +; CHECK-NEXT: ret i32 [[IV_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i32 [%start, %entry], [%iv.next, %loop] + %cmp_1 = icmp samesign ugt i32 %iv, %inv_1 + %cmp_2 = icmp sgt i32 %iv, %inv_2 + %loop_cond = and i1 %cmp_1, %cmp_2 + %iv.next = add i32 %iv, 1 + br i1 %loop_cond, label %loop, label %exit + +exit: + ret i32 %iv +} + ; turn to %iv >=s smax(inv_1, inv_2) and hoist it out of loop. 
define i32 @test_sge(i32 %start, i32 %inv_1, i32 %inv_2) { ; CHECK-LABEL: @test_sge( @@ -272,6 +301,35 @@ exit: ret i32 %iv } +define i32 @test_sge_samesign(i32 %start, i32 %inv_1, i32 %inv_2) { +; CHECK-LABEL: @test_sge_samesign( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[INVARIANT_SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[INV_1:%.*]], i32 [[INV_2:%.*]]) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp sge i32 [[IV]], [[INVARIANT_SMAX]] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi i32 [ [[IV]], [[LOOP]] ] +; CHECK-NEXT: ret i32 [[IV_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i32 [%start, %entry], [%iv.next, %loop] + %cmp_1 = icmp sge i32 %iv, %inv_1 + %cmp_2 = icmp samesign uge i32 %iv, %inv_2 + %loop_cond = and i1 %cmp_1, %cmp_2 + %iv.next = add i32 %iv, 1 + br i1 %loop_cond, label %loop, label %exit + +exit: + ret i32 %iv +} + ; Turn OR to AND and handle accordingly. define i32 @test_ult_inv(i32 %start, i32 %inv_1, i32 %inv_2) { ; CHECK-LABEL: @test_ult_inv( From c6c864da3fdfbe98f6302f209056fe3069d071ae Mon Sep 17 00:00:00 2001 From: Alex MacLean Date: Mon, 13 Jan 2025 12:10:26 -0800 Subject: [PATCH 323/408] [FunctionAttrs] Treat byval calls as only reading ptrs (#122618) Since byval arguments are passed via a hidden copy of the pointee, they do not have the same semantics as normal pointer arguments. The callee cannot capture or write to the pointer and the copy is a read of the pointer. --- llvm/include/llvm/IR/InstrTypes.h | 5 +++++ llvm/lib/Analysis/AliasAnalysis.cpp | 4 +--- llvm/lib/Transforms/IPO/FunctionAttrs.cpp | 7 ++++-- .../Transforms/FunctionAttrs/readattrs.ll | 22 +++++++++++++++++++ .../memcpy-byval-forwarding-clobbers.ll | 13 +++-------- 5 files changed, 36 insertions(+), 15 deletions(-) diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h index 7ad34e4f22339..b8d9cc10292f4 100644 --- a/llvm/include/llvm/IR/InstrTypes.h +++ b/llvm/include/llvm/IR/InstrTypes.h @@ -1667,6 +1667,11 @@ class CallBase : public Instruction { // FIXME: Once this API is no longer duplicated in `CallSite`, rename this to // better indicate that this may return a conservative answer. bool doesNotCapture(unsigned OpNo) const { + // If the argument is passed byval, the callee does not have access to the + // original pointer and thus cannot capture it. + if (OpNo < arg_size() && isByValArgument(OpNo)) + return true; + return dataOperandHasImpliedAttr(OpNo, Attribute::NoCapture); } diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp index 62cdce56fe388..061a7e8e5c349 100644 --- a/llvm/lib/Analysis/AliasAnalysis.cpp +++ b/llvm/lib/Analysis/AliasAnalysis.cpp @@ -636,9 +636,7 @@ ModRefInfo AAResults::callCapturesBefore(const Instruction *I, // Only look at the no-capture or byval pointer arguments. If this // pointer were passed to arguments that were neither of these, then it // couldn't be no-capture. 
- if (!(*CI)->getType()->isPointerTy() || - (!Call->doesNotCapture(ArgNo) && ArgNo < Call->arg_size() && - !Call->isByValArgument(ArgNo))) + if (!(*CI)->getType()->isPointerTy() || !Call->doesNotCapture(ArgNo)) continue; AliasResult AR = diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index fe9cca01a8f31..06b5d791abe95 100644 --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -852,7 +852,7 @@ determinePointerAccessAttrs(Argument *A, continue; } - // Given we've explictily handled the callee operand above, what's left + // Given we've explicitly handled the callee operand above, what's left // must be a data operand (e.g. argument or operand bundle) const unsigned UseIndex = CB.getDataOperandNo(U); @@ -890,11 +890,14 @@ determinePointerAccessAttrs(Argument *A, // can participate in the speculation. break; + const bool IsByVal = + CB.isArgOperand(U) && CB.isByValArgument(CB.getArgOperandNo(U)); + // The accessors used on call site here do the right thing for calls and // invokes with operand bundles. if (CB.doesNotAccessMemory(UseIndex)) { /* nop */ - } else if (!isModSet(ArgMR) || CB.onlyReadsMemory(UseIndex)) { + } else if (!isModSet(ArgMR) || CB.onlyReadsMemory(UseIndex) || IsByVal) { IsRead = true; } else if (!isRefSet(ArgMR) || CB.dataOperandHasImpliedAttr(UseIndex, Attribute::WriteOnly)) { diff --git a/llvm/test/Transforms/FunctionAttrs/readattrs.ll b/llvm/test/Transforms/FunctionAttrs/readattrs.ll index 004c0485d764a..e60954c9cd29a 100644 --- a/llvm/test/Transforms/FunctionAttrs/readattrs.ll +++ b/llvm/test/Transforms/FunctionAttrs/readattrs.ll @@ -762,5 +762,27 @@ define void @writable_readnone(ptr writable dereferenceable(4) %p) { ret void } +declare void @byval_param(ptr byval(i32) %p) + +define void @call_byval_param(ptr %p) { +; FNATTRS-LABEL: define {{[^@]+}}@call_byval_param +; FNATTRS-SAME: (ptr nocapture readonly [[P:%.*]]) { +; FNATTRS-NEXT: call void @byval_param(ptr byval(i32) [[P]]) +; FNATTRS-NEXT: ret void +; +; ATTRIBUTOR-LABEL: define {{[^@]+}}@call_byval_param +; ATTRIBUTOR-SAME: (ptr nocapture readonly [[P:%.*]]) { +; ATTRIBUTOR-NEXT: call void @byval_param(ptr nocapture readonly byval(i32) [[P]]) +; ATTRIBUTOR-NEXT: ret void +; +; ATTRIBUTOR-CGSCC-LABEL: define {{[^@]+}}@call_byval_param +; ATTRIBUTOR-CGSCC-SAME: (ptr nocapture readonly [[P:%.*]]) { +; ATTRIBUTOR-CGSCC-NEXT: call void @byval_param(ptr nocapture readonly byval(i32) [[P]]) +; ATTRIBUTOR-CGSCC-NEXT: ret void +; + call void @byval_param(ptr byval(i32) %p) + ret void +} + ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; COMMON: {{.*}} diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-byval-forwarding-clobbers.ll b/llvm/test/Transforms/MemCpyOpt/memcpy-byval-forwarding-clobbers.ll index c28754ebe422f..383040c6c89e2 100644 --- a/llvm/test/Transforms/MemCpyOpt/memcpy-byval-forwarding-clobbers.ll +++ b/llvm/test/Transforms/MemCpyOpt/memcpy-byval-forwarding-clobbers.ll @@ -11,19 +11,15 @@ declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) -; %a.2's lifetime ends before the call to @check. Cannot replace -; %a.1 with %a.2 in the call to @check. +; %a.2's lifetime ends before the call to @check. 
We must remove the call to +; @llvm.lifetime.end in order to replace %a.1 with %a.2 in the call to @check. define i1 @alloca_forwarding_lifetime_end_clobber() { ; CHECK-LABEL: @alloca_forwarding_lifetime_end_clobber( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[A_1:%.*]] = alloca i64, align 8 ; CHECK-NEXT: [[A_2:%.*]] = alloca i64, align 8 -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[A_2]]) ; CHECK-NEXT: call void @init(ptr sret(i64) align 8 [[A_2]]) ; CHECK-NEXT: store i8 0, ptr [[A_2]], align 1 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[A_1]], ptr [[A_2]], i64 8, i1 false) -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[A_2]]) -; CHECK-NEXT: [[CALL:%.*]] = call i1 @check(ptr byval(i64) align 8 [[A_1]]) +; CHECK-NEXT: [[CALL:%.*]] = call i1 @check(ptr byval(i64) align 8 [[A_2]]) ; CHECK-NEXT: ret i1 [[CALL]] ; entry: @@ -94,13 +90,10 @@ entry: define i1 @alloca_forwarding_unrelated_call_noclobber() { ; CHECK-LABEL: @alloca_forwarding_unrelated_call_noclobber( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[A_1:%.*]] = alloca i64, align 8 ; CHECK-NEXT: [[A_2:%.*]] = alloca i64, align 8 ; CHECK-NEXT: [[A_3:%.*]] = alloca i64, align 8 -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[A_2]]) ; CHECK-NEXT: call void @init(ptr sret(i64) align 8 [[A_2]]) ; CHECK-NEXT: store i8 0, ptr [[A_2]], align 1 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[A_1]], ptr [[A_2]], i64 8, i1 false) ; CHECK-NEXT: call void @clobber(ptr [[A_3]]) ; CHECK-NEXT: [[CALL:%.*]] = call i1 @check(ptr byval(i64) align 8 [[A_2]]) ; CHECK-NEXT: ret i1 [[CALL]] From 83be69cf9ade7e1f78df297518b1490d54794edc Mon Sep 17 00:00:00 2001 From: offsake Date: Mon, 13 Jan 2025 12:29:51 -0800 Subject: [PATCH 324/408] [VPlan][Coverity] Fix coverity CID1579964. (#121805) Fix for the Coverity hit with CID1579964 in VPlan.cpp. Coverity message with some context follows. [Cov] var_compare_op: Comparing TermBr to null implies that TermBr might be null. 434 } else if (TermBr && !TermBr->isConditional()) { 435 TermBr->setSuccessor(0, NewBB); 436 } else { 437 // Set each forward successor here when it is created, excluding 438 // backedges. A backward successor is set when the branch is created. 439 unsigned idx = PredVPSuccessors.front() == this ? 0 : 1; [Cov] CID 1579964: (#1 of 1): Dereference after null check (FORWARD_NULL) [Cov] var_deref_model: Passing null pointer TermBr to getSuccessor, which dereferences it. --- llvm/lib/Transforms/Vectorize/VPlan.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index e804f81c36dba..aa41c41e90c4c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -438,10 +438,10 @@ void VPBasicBlock::connectToPredecessors(VPTransformState::CFGState &CFG) { // Set each forward successor here when it is created, excluding // backedges. A backward successor is set when the branch is created. unsigned idx = PredVPSuccessors.front() == this ? 
0 : 1; - assert( - (!TermBr->getSuccessor(idx) || - (isa(this) && TermBr->getSuccessor(idx) == NewBB)) && - "Trying to reset an existing successor block."); + assert((TermBr && (!TermBr->getSuccessor(idx) || + (isa(this) && + TermBr->getSuccessor(idx) == NewBB))) && + "Trying to reset an existing successor block."); TermBr->setSuccessor(idx, NewBB); } CFG.DTU.applyUpdates({{DominatorTree::Insert, PredBB, NewBB}}); From 550d32f205202b73f21903b29df04fe2e89ae648 Mon Sep 17 00:00:00 2001 From: Peng Liu Date: Mon, 13 Jan 2025 15:31:05 -0500 Subject: [PATCH 325/408] [libc++][test] Add exception tests for vector capacity operations (#118141) As a follow-up to #117662, this PR provides a comprehensive set of exception tests for the following capacity-related functions in `std::vector`. Specifically, it includes tests for the following functions: - `reserve(size_type)` - `resize(size_type)` and `resize(size_type, const_reference)` - `shrink_to_fit()` Previously, the exception safety tests for these functions were either missing or inadequate. We need a thorough coverage of exception tests to validate that these operations provide strong exception guarantees under various exceptional scenarios. --- .../std/containers/sequences/vector/common.h | 135 ++++++ .../vector/vector.capacity/reserve.pass.cpp | 180 +++----- .../reserve_exceptions.pass.cpp | 306 ++++++++++++++ .../resize_size_exceptions.pass.cpp | 394 ++++++++++++++++++ .../resize_size_value_exceptions.pass.cpp | 231 ++++++++++ .../shrink_to_fit_exceptions.pass.cpp | 257 ++++++++++++ 6 files changed, 1388 insertions(+), 115 deletions(-) create mode 100644 libcxx/test/std/containers/sequences/vector/vector.capacity/reserve_exceptions.pass.cpp create mode 100644 libcxx/test/std/containers/sequences/vector/vector.capacity/resize_size_exceptions.pass.cpp create mode 100644 libcxx/test/std/containers/sequences/vector/vector.capacity/resize_size_value_exceptions.pass.cpp create mode 100644 libcxx/test/std/containers/sequences/vector/vector.capacity/shrink_to_fit_exceptions.pass.cpp diff --git a/libcxx/test/std/containers/sequences/vector/common.h b/libcxx/test/std/containers/sequences/vector/common.h index ff8147ef6b838..4af6559a06e73 100644 --- a/libcxx/test/std/containers/sequences/vector/common.h +++ b/libcxx/test/std/containers/sequences/vector/common.h @@ -9,12 +9,18 @@ #ifndef TEST_STD_CONTAINERS_SEQUENCES_VECTOR_COMMON_H #define TEST_STD_CONTAINERS_SEQUENCES_VECTOR_COMMON_H +#include #include #include +#include #include +#include #include +#include +#include #include "count_new.h" +#include "test_macros.h" struct throwing_t { int* throw_after_n_ = nullptr; @@ -48,6 +54,95 @@ struct throwing_t { } }; +#if TEST_STD_VER >= 11 + +template +struct move_only_throwing_t { + T data_; + int* throw_after_n_ = nullptr; + bool moved_from_ = false; + + move_only_throwing_t() = default; + + explicit move_only_throwing_t(const T& data, int& throw_after_n) : data_(data), throw_after_n_(&throw_after_n) { + if (throw_after_n == 0) + throw 1; + --throw_after_n; + } + + explicit move_only_throwing_t(T&& data, int& throw_after_n) : data_(std::move(data)), throw_after_n_(&throw_after_n) { + if (throw_after_n == 0) + throw 1; + --throw_after_n; + } + + move_only_throwing_t(const move_only_throwing_t&) = delete; + move_only_throwing_t& operator=(const move_only_throwing_t&) = delete; + + move_only_throwing_t(move_only_throwing_t&& rhs) : data_(std::move(rhs.data_)), throw_after_n_(rhs.throw_after_n_) { + rhs.throw_after_n_ = nullptr; + rhs.moved_from_ = true; + if 
(throw_after_n_ == nullptr || *throw_after_n_ == 0) + throw 1; + --*throw_after_n_; + } + + move_only_throwing_t& operator=(move_only_throwing_t&& rhs) { + if (this == &rhs) + return *this; + data_ = std::move(rhs.data_); + throw_after_n_ = rhs.throw_after_n_; + rhs.moved_from_ = true; + rhs.throw_after_n_ = nullptr; + if (throw_after_n_ == nullptr || *throw_after_n_ == 0) + throw 1; + --*throw_after_n_; + return *this; + } + + friend bool operator==(const move_only_throwing_t& lhs, const move_only_throwing_t& rhs) { + return lhs.data_ == rhs.data_; + } + friend bool operator!=(const move_only_throwing_t& lhs, const move_only_throwing_t& rhs) { + return lhs.data_ != rhs.data_; + } +}; + +#endif + +template +struct throwing_data { + T data_; + int* throw_after_n_ = nullptr; + throwing_data() { throw 0; } + + throwing_data(const T& data, int& throw_after_n) : data_(data), throw_after_n_(&throw_after_n) { + if (throw_after_n == 0) + throw 0; + --throw_after_n; + } + + throwing_data(const throwing_data& rhs) : data_(rhs.data_), throw_after_n_(rhs.throw_after_n_) { + if (throw_after_n_ == nullptr || *throw_after_n_ == 0) + throw 1; + --*throw_after_n_; + } + + throwing_data& operator=(const throwing_data& rhs) { + data_ = rhs.data_; + throw_after_n_ = rhs.throw_after_n_; + if (throw_after_n_ == nullptr || *throw_after_n_ == 0) + throw 1; + --*throw_after_n_; + return *this; + } + + friend bool operator==(const throwing_data& lhs, const throwing_data& rhs) { + return lhs.data_ == rhs.data_ && lhs.throw_after_n_ == rhs.throw_after_n_; + } + friend bool operator!=(const throwing_data& lhs, const throwing_data& rhs) { return !(lhs == rhs); } +}; + template struct throwing_allocator { using value_type = T; @@ -125,4 +220,44 @@ inline void check_new_delete_called() { assert(globalMemCounter.aligned_new_array_called == globalMemCounter.aligned_delete_array_called); } +template +void use_unspecified_but_valid_state_vector(std::vector const& v) { + assert(v.size() >= 0); // make sure it can be called + assert(v.capacity() >= 0); + assert(v.empty() || !v.empty()); + for (auto it = v.begin(); it != v.end(); ++it) { + auto& element = *it; + (void)element; + } +} + +static const std::array letters = { + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', + 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', + 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'}; + +inline std::string getString(std::size_t n, std::size_t len) { + std::string s; + s.reserve(len); + for (std::size_t i = 0; i < len; ++i) + s += letters[(i * i + n) % letters.size()]; + return s; +} + +inline std::vector getIntegerInputs(std::size_t n) { + std::vector v; + v.reserve(n); + for (std::size_t i = 0; i < n; ++i) + v.push_back(static_cast(i * i + n)); + return v; +} + +inline std::vector getStringInputsWithLength(std::size_t n, std::size_t len) { + std::vector v; + v.reserve(n); + for (std::size_t i = 0; i < n; ++i) + v.push_back(getString(i, len)); + return v; +} + #endif // TEST_STD_CONTAINERS_SEQUENCES_VECTOR_COMMON_H diff --git a/libcxx/test/std/containers/sequences/vector/vector.capacity/reserve.pass.cpp b/libcxx/test/std/containers/sequences/vector/vector.capacity/reserve.pass.cpp index b8548ad72d437..38e969335e0ad 100644 --- a/libcxx/test/std/containers/sequences/vector/vector.capacity/reserve.pass.cpp +++ 
b/libcxx/test/std/containers/sequences/vector/vector.capacity/reserve.pass.cpp @@ -19,126 +19,76 @@ #include "asan_testing.h" TEST_CONSTEXPR_CXX20 bool tests() { - { - std::vector v; - v.reserve(10); - assert(v.capacity() >= 10); - assert(is_contiguous_container_asan_correct(v)); - } - { - std::vector v(100); - assert(v.capacity() == 100); - v.reserve(50); - assert(v.size() == 100); - assert(v.capacity() == 100); - v.reserve(150); - assert(v.size() == 100); - assert(v.capacity() == 150); - assert(is_contiguous_container_asan_correct(v)); - } - { - // Add 1 for implementations that dynamically allocate a container proxy. - std::vector > v(100); - assert(v.capacity() == 100); - v.reserve(50); - assert(v.size() == 100); - assert(v.capacity() == 100); - v.reserve(150); - assert(v.size() == 100); - assert(v.capacity() == 150); - assert(is_contiguous_container_asan_correct(v)); - } -#ifndef TEST_HAS_NO_EXCEPTIONS - if (!TEST_IS_CONSTANT_EVALUATED) { - std::vector v; - std::size_t sz = v.max_size() + 1; - - try { - v.reserve(sz); - assert(false); - } catch (const std::length_error&) { - assert(v.size() == 0); - assert(v.capacity() == 0); - } - } - if (!TEST_IS_CONSTANT_EVALUATED) { - std::vector v(10, 42); - int* previous_data = v.data(); - std::size_t previous_capacity = v.capacity(); - std::size_t sz = v.max_size() + 1; - - try { - v.reserve(sz); - assert(false); - } catch (std::length_error&) { - assert(v.size() == 10); - assert(v.capacity() == previous_capacity); - assert(v.data() == previous_data); - - for (int i = 0; i < 10; ++i) { - assert(v[i] == 42); - } - } - } -#endif + { + std::vector v; + v.reserve(10); + assert(v.capacity() >= 10); + assert(is_contiguous_container_asan_correct(v)); + } + { + std::vector v(100); + assert(v.capacity() == 100); + v.reserve(50); + assert(v.size() == 100); + assert(v.capacity() == 100); + v.reserve(150); + assert(v.size() == 100); + assert(v.capacity() == 150); + assert(is_contiguous_container_asan_correct(v)); + } + { + // Add 1 for implementations that dynamically allocate a container proxy. 
+ std::vector > v(100); + assert(v.capacity() == 100); + v.reserve(50); + assert(v.size() == 100); + assert(v.capacity() == 100); + v.reserve(150); + assert(v.size() == 100); + assert(v.capacity() == 150); + assert(is_contiguous_container_asan_correct(v)); + } #if TEST_STD_VER >= 11 - { - std::vector> v; - v.reserve(10); - assert(v.capacity() >= 10); - assert(is_contiguous_container_asan_correct(v)); - } - { - std::vector> v(100); - assert(v.capacity() == 100); - v.reserve(50); - assert(v.size() == 100); - assert(v.capacity() == 100); - v.reserve(150); - assert(v.size() == 100); - assert(v.capacity() == 150); - assert(is_contiguous_container_asan_correct(v)); - } - { - std::vector> v; - v.reserve(10); - assert(v.capacity() >= 10); - assert(is_contiguous_container_asan_correct(v)); - } - { - std::vector> v(100); - assert(v.capacity() == 100); - v.reserve(50); - assert(v.size() == 100); - assert(v.capacity() == 100); - v.reserve(150); - assert(v.size() == 100); - assert(v.capacity() == 150); - assert(is_contiguous_container_asan_correct(v)); - } -#endif -#ifndef TEST_HAS_NO_EXCEPTIONS - if (!TEST_IS_CONSTANT_EVALUATED) { - std::vector > v; - v.reserve(50); - assert(v.capacity() == 50); - assert(is_contiguous_container_asan_correct(v)); - try { - v.reserve(101); - assert(false); - } catch (const std::length_error&) { - // no-op - } - assert(v.capacity() == 50); - assert(is_contiguous_container_asan_correct(v)); - } + { + std::vector> v; + v.reserve(10); + assert(v.capacity() >= 10); + assert(is_contiguous_container_asan_correct(v)); + } + { + std::vector> v(100); + assert(v.capacity() == 100); + v.reserve(50); + assert(v.size() == 100); + assert(v.capacity() == 100); + v.reserve(150); + assert(v.size() == 100); + assert(v.capacity() == 150); + assert(is_contiguous_container_asan_correct(v)); + } + { + std::vector> v; + v.reserve(10); + assert(v.capacity() >= 10); + assert(is_contiguous_container_asan_correct(v)); + } + { + std::vector> v(100); + assert(v.capacity() == 100); + v.reserve(50); + assert(v.size() == 100); + assert(v.capacity() == 100); + v.reserve(150); + assert(v.size() == 100); + assert(v.capacity() == 150); + assert(is_contiguous_container_asan_correct(v)); + } #endif - return true; + return true; } -int main(int, char**) -{ +int main(int, char**) { tests(); #if TEST_STD_VER > 17 diff --git a/libcxx/test/std/containers/sequences/vector/vector.capacity/reserve_exceptions.pass.cpp b/libcxx/test/std/containers/sequences/vector/vector.capacity/reserve_exceptions.pass.cpp new file mode 100644 index 0000000000000..c381e23a04d02 --- /dev/null +++ b/libcxx/test/std/containers/sequences/vector/vector.capacity/reserve_exceptions.pass.cpp @@ -0,0 +1,306 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: no-exceptions + +// This test file validates that std::vector::reserve provides the strong exception guarantee if T is +// Cpp17MoveInsertable and no exception is thrown by the move constructor of T during the reserve call. +// It also checks that if T's move constructor is not noexcept, reserve provides only the basic exception +// guarantee. 
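+//
+// Concretely, the strong guarantee verified below means: if v.reserve(n)
+// exits via an exception, then v.data(), v.size(), v.capacity() and every
+// element of v are exactly what they were before the call, which is what the
+// helper functions assert after catching the exception.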
+ +#include +#include +#include +#include + +#include "../common.h" +#include "MoveOnly.h" +#include "count_new.h" +#include "increasing_allocator.h" +#include "min_allocator.h" +#include "test_allocator.h" +#include "test_iterators.h" +#include "test_macros.h" + +template +void test_allocation_exception_for_strong_guarantee( + std::vector& v, const std::vector& values, std::size_t new_cap) { + assert(v.size() == values.size()); + T* old_data = v.data(); + std::size_t old_size = v.size(); + std::size_t old_cap = v.capacity(); + + try { + v.reserve(new_cap); + } catch (...) { // std::length_error, std::bad_alloc + assert(v.data() == old_data); + assert(v.size() == old_size); + assert(v.capacity() == old_cap); + for (std::size_t i = 0; i < v.size(); ++i) + assert(v[i] == values[i]); + } +} + +template +void test_copy_ctor_exception_for_strong_guarantee(std::vector, Alloc>& v, + const std::vector& values) { + assert(v.empty() && !values.empty()); + int throw_after = values.size() + values.size() / 2; // Trigger an exception halfway through reallocation + v.reserve(values.size()); + for (std::size_t i = 0; i < values.size(); ++i) + v.emplace_back(values[i], throw_after); + + throwing_data* old_data = v.data(); + std::size_t old_size = v.size(); + std::size_t old_cap = v.capacity(); + std::size_t new_cap = 2 * old_cap; + + try { + v.reserve(new_cap); + } catch (...) { + assert(v.data() == old_data); + assert(v.size() == old_size); + assert(v.capacity() == old_cap); + for (std::size_t i = 0; i < v.size(); ++i) + assert(v[i].data_ == values[i]); + } +} + +#if TEST_STD_VER >= 11 + +template +void test_move_ctor_exception_for_basic_guarantee(std::vector, Alloc>& v, + const std::vector& values) { + assert(v.empty() && !values.empty()); + int throw_after = values.size() + values.size() / 2; // Trigger an exception halfway through reallocation + v.reserve(values.size()); + for (std::size_t i = 0; i < values.size(); ++i) + v.emplace_back(values[i], throw_after); + + try { + v.reserve(2 * v.capacity()); + } catch (...) 
{ + use_unspecified_but_valid_state_vector(v); + } +} + +#endif + +// Check the strong exception guarantee during reallocation failures +void test_allocation_exceptions() { + // + // Tests for std::length_error during reallocation failures + // + { + std::vector v; + test_allocation_exception_for_strong_guarantee(v, std::vector(), v.max_size() + 1); + } + check_new_delete_called(); + + { + int a[] = {1, 2, 3, 4, 5}; + std::vector in(a, a + sizeof(a) / sizeof(a[0])); + std::vector v(in.begin(), in.end()); + test_allocation_exception_for_strong_guarantee(v, in, v.max_size() + 1); + } + check_new_delete_called(); + + { + int a[] = {1, 2, 3, 4, 5}; + std::vector in(a, a + sizeof(a) / sizeof(a[0])); + std::vector > v(in.begin(), in.end()); + test_allocation_exception_for_strong_guarantee(v, in, v.max_size() + 1); + } + check_new_delete_called(); + + { + int a[] = {1, 2, 3, 4, 5}; + std::vector in(a, a + sizeof(a) / sizeof(a[0])); + std::vector > v(in.begin(), in.end()); + test_allocation_exception_for_strong_guarantee(v, in, v.max_size() + 1); + } + check_new_delete_called(); + + { + int a[] = {1, 2, 3, 4, 5}; + std::vector in(a, a + sizeof(a) / sizeof(a[0])); + std::vector > v(in.begin(), in.end()); + test_allocation_exception_for_strong_guarantee(v, in, v.max_size() + 1); + } + check_new_delete_called(); + + { + std::vector in(10, 42); + std::vector > v(in.begin(), in.end()); + test_allocation_exception_for_strong_guarantee(v, in, v.max_size() + 1); + } + check_new_delete_called(); + +#if TEST_STD_VER >= 23 + { + std::vector in{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + std::vector> v(in.begin(), in.end()); + test_allocation_exception_for_strong_guarantee(v, in, v.max_size() + 1); + } + check_new_delete_called(); +#endif + + // + // Tests for std::bad_alloc during reallocation failures + // + { + std::vector in(10, 42); + std::vector > v(in.begin(), in.end()); + test_allocation_exception_for_strong_guarantee(v, in, 91); + } + check_new_delete_called(); + + { + std::vector in(10, 42); + std::vector > v(in.begin(), in.end()); + v.reserve(30); + test_allocation_exception_for_strong_guarantee(v, in, 61); + } + check_new_delete_called(); + +#if TEST_STD_VER >= 11 + { + std::vector in(10); + std::vector > v(10); + test_allocation_exception_for_strong_guarantee(v, in, 91); + } + check_new_delete_called(); + + { + std::vector in(10); + in.insert(in.cbegin() + 5, MoveOnly(42)); + std::vector > v(10); + v.reserve(30); + v.insert(v.cbegin() + 5, MoveOnly(42)); + test_allocation_exception_for_strong_guarantee(v, in, 61); + } + check_new_delete_called(); +#endif + + { // Practical example: Testing with 100 integers. + auto in = getIntegerInputs(100); + std::vector > v(in.begin(), in.end()); + test_allocation_exception_for_strong_guarantee(v, in, 200); + } + check_new_delete_called(); + + { // Practical example: Testing with 100 strings, each 256 characters long. 
+ std::vector in = getStringInputsWithLength(100, 256); + std::vector > v(in.begin(), in.end()); + test_allocation_exception_for_strong_guarantee(v, in, 200); + } + check_new_delete_called(); +} + +// Check the strong exception guarantee during copy-constructor failures +void test_copy_ctor_exceptions() { + { + int a[] = {1, 2, 3, 4, 5}; + std::vector in(a, a + sizeof(a) / sizeof(a[0])); + std::vector > v; + test_copy_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); + + { + int a[] = {1, 2, 3, 4, 5}; + std::vector in(a, a + sizeof(a) / sizeof(a[0])); + std::vector, min_allocator > > v; + test_copy_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); + + { + std::vector in(10, 42); + std::vector, safe_allocator > > v; + test_copy_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); + + { + std::vector in(10, 42); + std::vector, test_allocator > > v; + test_copy_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); + + { + std::vector in(10, 42); + std::vector, limited_allocator, 100> > v; + test_copy_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); + +#if TEST_STD_VER >= 23 + { + std::vector in{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + std::vector, increasing_allocator>> v; + test_copy_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); +#endif + + { // Practical example: Testing with 100 integers. + auto in = getIntegerInputs(100); + std::vector > v; + test_copy_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); + + { // Practical example: Testing with 100 strings, each 256 characters long. + std::vector in = getStringInputsWithLength(100, 256); + std::vector > v; + test_copy_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); +} + +#if TEST_STD_VER >= 11 + +// Check that if T is Cpp17MoveInsertible && !Cpp17CopyInsertible, and T's move-ctor is not noexcept, then +// std::vector::reserve only provides basic guarantee. +void test_move_ctor_exceptions() { + { + std::vector in{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + std::vector> v; + test_move_ctor_exception_for_basic_guarantee(v, in); + } + check_new_delete_called(); + +# if TEST_STD_VER >= 23 + { + std::vector in{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + std::vector, increasing_allocator>> v; + test_move_ctor_exception_for_basic_guarantee(v, in); + } + check_new_delete_called(); +# endif + + { + // Practical example: Testing with 100 strings, each 256 characters long. + std::vector in = getStringInputsWithLength(100, 256); + std::vector > v; + test_move_ctor_exception_for_basic_guarantee(v, in); + } + check_new_delete_called(); +} + +#endif + +int main(int, char**) { + test_allocation_exceptions(); + test_copy_ctor_exceptions(); +#if TEST_STD_VER >= 11 + test_move_ctor_exceptions(); +#endif +} diff --git a/libcxx/test/std/containers/sequences/vector/vector.capacity/resize_size_exceptions.pass.cpp b/libcxx/test/std/containers/sequences/vector/vector.capacity/resize_size_exceptions.pass.cpp new file mode 100644 index 0000000000000..b5e12ea8e5d0a --- /dev/null +++ b/libcxx/test/std/containers/sequences/vector/vector.capacity/resize_size_exceptions.pass.cpp @@ -0,0 +1,394 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: no-exceptions + +// This test file validates that std::vector::resize(size_type) provides the strong exception guarantee +// if no exception is thrown by the move constructor of T during the resize call. It also checks that if +// T's move constructor is not noexcept, resize provides only the basic exception guarantee. + +#include +#include +#include +#include + +#include "../common.h" +#include "MoveOnly.h" +#include "count_new.h" +#include "increasing_allocator.h" +#include "min_allocator.h" +#include "test_allocator.h" +#include "test_iterators.h" +#include "test_macros.h" + +template +void test_allocation_exception_for_strong_guarantee( + std::vector& v, const std::vector& values, std::size_t new_size) { + assert(v.size() == values.size()); + T* old_data = v.data(); + std::size_t old_size = v.size(); + std::size_t old_cap = v.capacity(); + + try { + v.resize(new_size); + } catch (...) { // std::length_error, std::bad_alloc + assert(v.data() == old_data); + assert(v.size() == old_size); + assert(v.capacity() == old_cap); + for (std::size_t i = 0; i < v.size(); ++i) + assert(v[i] == values[i]); + } +} + +template +void test_default_ctor_exception_for_strong_guarantee( + std::vector, Alloc>& v, const std::vector& values) { + assert(v.empty() && !values.empty()); + int throw_after = values.size() + 10; + v.reserve(values.size()); + for (std::size_t i = 0; i < values.size(); ++i) + v.emplace_back(values[i], throw_after); + + throwing_data* old_data = v.data(); + std::size_t old_size = v.size(); + std::size_t old_cap = v.capacity(); + std::size_t new_size = old_size + 1; + + try { + v.resize(new_size); + } catch (...) { + assert(v.data() == old_data); + assert(v.size() == old_size); + assert(v.capacity() == old_cap); + for (std::size_t i = 0; i < v.size(); ++i) + assert(v[i].data_ == values[i]); + } +} + +template +void test_copy_ctor_exception_for_strong_guarantee(std::vector, Alloc>& v, + const std::vector& values) { + assert(v.empty() && !values.empty()); + int throw_after = values.size() + values.size() / 2; // Trigger an exception halfway through reallocation + v.reserve(values.size()); + for (std::size_t i = 0; i < values.size(); ++i) + v.emplace_back(values[i], throw_after); + + throwing_data* old_data = v.data(); + std::size_t old_size = v.size(); + std::size_t old_cap = v.capacity(); + std::size_t new_size = 2 * old_cap; + + try { + v.resize(new_size); + } catch (...) { + assert(v.data() == old_data); + assert(v.size() == old_size); + assert(v.capacity() == old_cap); + for (std::size_t i = 0; i < v.size(); ++i) + assert(v[i].data_ == values[i]); + } +} + +#if TEST_STD_VER >= 11 + +template +void test_move_ctor_exception_for_basic_guarantee(std::vector, Alloc>& v, + const std::vector& values) { + assert(v.empty() && !values.empty()); + int throw_after = values.size() + values.size() / 2; // Trigger an exception halfway through reallocation + v.reserve(values.size()); + for (std::size_t i = 0; i < values.size(); ++i) + v.emplace_back(values[i], throw_after); + + try { + v.resize(2 * v.capacity()); + } catch (...) 
{ + use_unspecified_but_valid_state_vector(v); + } +} + +#endif + +// Check the strong exception guarantee during reallocation failures +void test_allocation_exceptions() { + // + // Tests for std::length_error during reallocation failures + // + { + std::vector v; + test_allocation_exception_for_strong_guarantee(v, std::vector(), v.max_size() + 1); + } + check_new_delete_called(); + + { + int a[] = {1, 2, 3, 4, 5}; + std::vector in(a, a + sizeof(a) / sizeof(a[0])); + std::vector v(in.begin(), in.end()); + test_allocation_exception_for_strong_guarantee(v, in, v.max_size() + 1); + } + check_new_delete_called(); + + { + int a[] = {1, 2, 3, 4, 5}; + std::vector in(a, a + sizeof(a) / sizeof(a[0])); + std::vector > v(in.begin(), in.end()); + test_allocation_exception_for_strong_guarantee(v, in, v.max_size() + 1); + } + check_new_delete_called(); + + { + int a[] = {1, 2, 3, 4, 5}; + std::vector in(a, a + sizeof(a) / sizeof(a[0])); + std::vector > v(in.begin(), in.end()); + test_allocation_exception_for_strong_guarantee(v, in, v.max_size() + 1); + } + check_new_delete_called(); + + { + int a[] = {1, 2, 3, 4, 5}; + std::vector in(a, a + sizeof(a) / sizeof(a[0])); + std::vector > v(in.begin(), in.end()); + test_allocation_exception_for_strong_guarantee(v, in, v.max_size() + 1); + } + check_new_delete_called(); + + { + std::vector in(10, 42); + std::vector > v(in.begin(), in.end()); + test_allocation_exception_for_strong_guarantee(v, in, v.max_size() + 1); + } + check_new_delete_called(); + +#if TEST_STD_VER >= 23 + { + std::vector in{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + std::vector> v(in.begin(), in.end()); + test_allocation_exception_for_strong_guarantee(v, in, v.max_size() + 1); + } + check_new_delete_called(); +#endif + + // + // Tests for std::bad_alloc during reallocation failures + // + { + std::vector in(10, 42); + std::vector > v(in.begin(), in.end()); + test_allocation_exception_for_strong_guarantee(v, in, 91); + } + check_new_delete_called(); + + { + std::vector in(10, 42); + std::vector > v(in.begin(), in.end()); + v.reserve(30); + test_allocation_exception_for_strong_guarantee(v, in, 61); + } + check_new_delete_called(); + +#if TEST_STD_VER >= 11 + { + std::vector in(10); + std::vector > v(10); + test_allocation_exception_for_strong_guarantee(v, in, 91); + } + check_new_delete_called(); + + { + std::vector in(10); + in.insert(in.cbegin() + 5, MoveOnly(42)); + std::vector > v(10); + v.reserve(30); + v.insert(v.cbegin() + 5, MoveOnly(42)); + test_allocation_exception_for_strong_guarantee(v, in, 61); + } + check_new_delete_called(); +#endif + + { // Practical example: Testing with 100 integers. + auto in = getIntegerInputs(100); + std::vector > v(in.begin(), in.end()); + test_allocation_exception_for_strong_guarantee(v, in, 200); + } + check_new_delete_called(); + + { // Practical example: Testing with 100 strings, each 256 characters long. 
+ std::vector in = getStringInputsWithLength(100, 256); + std::vector > v(in.begin(), in.end()); + test_allocation_exception_for_strong_guarantee(v, in, 200); + } + check_new_delete_called(); +} + +// Check the strong exception guarantee during default-constructor failures +void test_default_ctor_exceptions() { + { + int a[] = {1, 2, 3, 4, 5}; + std::vector in(a, a + sizeof(a) / sizeof(a[0])); + std::vector > v; + test_default_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); + + { + int a[] = {1, 2, 3, 4, 5}; + std::vector in(a, a + sizeof(a) / sizeof(a[0])); + std::vector, min_allocator > > v; + test_default_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); + + { + std::vector in(10, 42); + std::vector, safe_allocator > > v; + test_default_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); + + { + std::vector in(10, 42); + std::vector, test_allocator > > v; + test_default_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); + + { + std::vector in(10, 42); + std::vector, limited_allocator, 100> > v; + test_default_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); + +#if TEST_STD_VER >= 23 + { + std::vector in{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + std::vector, increasing_allocator>> v; + test_default_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); +#endif + + { // Practical example: Testing with 100 integers. + auto in = getIntegerInputs(100); + std::vector > v; + test_default_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); + + { // Practical example: Testing with 100 strings, each 256 characters long. + std::vector in = getStringInputsWithLength(100, 256); + std::vector > v; + test_default_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); +} + +// Check the strong exception guarantee during copy-constructor failures +void test_copy_ctor_exceptions() { + { + int a[] = {1, 2, 3, 4, 5}; + std::vector in(a, a + sizeof(a) / sizeof(a[0])); + std::vector > v; + test_copy_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); + + { + int a[] = {1, 2, 3, 4, 5}; + std::vector in(a, a + sizeof(a) / sizeof(a[0])); + std::vector, min_allocator > > v; + test_copy_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); + + { + std::vector in(10, 42); + std::vector, safe_allocator > > v; + test_copy_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); + + { + std::vector in(10, 42); + std::vector, test_allocator > > v; + test_copy_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); + + { + std::vector in(10, 42); + std::vector, limited_allocator, 100> > v; + test_copy_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); + +#if TEST_STD_VER >= 23 + { + std::vector in{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + std::vector, increasing_allocator>> v; + test_copy_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); +#endif + + { // Practical example: Testing with 100 integers. + auto in = getIntegerInputs(100); + std::vector > v; + test_copy_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); + + { // Practical example: Testing with 100 strings, each 256 characters long. 
+    std::vector in = getStringInputsWithLength(100, 256);
+    std::vector > v;
+    test_copy_ctor_exception_for_strong_guarantee(v, in);
+  }
+  check_new_delete_called();
+}
+
+#if TEST_STD_VER >= 11
+
+// Check that if T is Cpp17MoveInsertable && !Cpp17CopyInsertable, and T's move-ctor is not noexcept, then
+// std::vector::resize only provides basic guarantee.
+void test_move_ctor_exceptions() {
+  {
+    std::vector in{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+    std::vector> v;
+    test_move_ctor_exception_for_basic_guarantee(v, in);
+  }
+  check_new_delete_called();
+
+# if TEST_STD_VER >= 23
+  {
+    std::vector in{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+    std::vector, increasing_allocator>> v;
+    test_move_ctor_exception_for_basic_guarantee(v, in);
+  }
+  check_new_delete_called();
+# endif
+
+  {
+    // Practical example: Testing with 100 strings, each 256 characters long.
+    std::vector in = getStringInputsWithLength(100, 256);
+    std::vector > v;
+    test_move_ctor_exception_for_basic_guarantee(v, in);
+  }
+  check_new_delete_called();
+}
+
+#endif
+
+int main(int, char**) {
+  test_allocation_exceptions();
+  test_default_ctor_exceptions();
+  test_copy_ctor_exceptions();
+#if TEST_STD_VER >= 11
+  test_move_ctor_exceptions();
+#endif
+}
diff --git a/libcxx/test/std/containers/sequences/vector/vector.capacity/resize_size_value_exceptions.pass.cpp b/libcxx/test/std/containers/sequences/vector/vector.capacity/resize_size_value_exceptions.pass.cpp
new file mode 100644
index 0000000000000..7217b47d69e4d
--- /dev/null
+++ b/libcxx/test/std/containers/sequences/vector/vector.capacity/resize_size_value_exceptions.pass.cpp
@@ -0,0 +1,231 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-exceptions
+
+// Check that std::vector::resize(size_type sz, const value_type& x) provides the strong exception guarantee
+// if T is Cpp17CopyInsertable.
+
+#include
+#include
+#include
+#include
+
+#include "../common.h"
+#include "count_new.h"
+#include "increasing_allocator.h"
+#include "min_allocator.h"
+#include "test_allocator.h"
+#include "test_iterators.h"
+#include "test_macros.h"
+
+template
+void test_allocation_exception_for_strong_guarantee(
+    std::vector& v, const std::vector& values, std::size_t new_size) {
+  assert(v.size() == values.size());
+  T* old_data = v.data();
+  std::size_t old_size = v.size();
+  std::size_t old_cap = v.capacity();
+
+  try {
+    v.resize(new_size, values.empty() ? T() : values[0]);
+  } catch (...)
{ // std::length_error, std::bad_alloc + assert(v.data() == old_data); + assert(v.size() == old_size); + assert(v.capacity() == old_cap); + for (std::size_t i = 0; i < v.size(); ++i) + assert(v[i] == values[i]); + } +} + +template +void test_copy_ctor_exception_for_strong_guarantee(std::vector, Alloc>& v, + const std::vector& values) { + assert(v.empty() && !values.empty()); + int throw_after = values.size() + values.size() / 2; // Trigger an exception halfway through reallocation + v.reserve(values.size()); + for (std::size_t i = 0; i < values.size(); ++i) + v.emplace_back(values[i], throw_after); + + throwing_data* old_data = v.data(); + std::size_t old_size = v.size(); + std::size_t old_cap = v.capacity(); + std::size_t new_size = 2 * old_cap; + + try { + int n = new_size - old_size + 1; + throwing_data t(T(), n); + v.resize(new_size, t); + } catch (...) { + assert(v.data() == old_data); + assert(v.size() == old_size); + assert(v.capacity() == old_cap); + for (std::size_t i = 0; i < v.size(); ++i) + assert(v[i].data_ == values[i]); + } +} + +// Check the strong exception guarantee during reallocation failures +void test_allocation_exceptions() { + // + // Tests for std::length_error during reallocation failures + // + { + std::vector v; + test_allocation_exception_for_strong_guarantee(v, std::vector(), v.max_size() + 1); + } + check_new_delete_called(); + + { + int a[] = {1, 2, 3, 4, 5}; + std::vector in(a, a + sizeof(a) / sizeof(a[0])); + std::vector v(in.begin(), in.end()); + test_allocation_exception_for_strong_guarantee(v, in, v.max_size() + 1); + } + check_new_delete_called(); + + { + int a[] = {1, 2, 3, 4, 5}; + std::vector in(a, a + sizeof(a) / sizeof(a[0])); + std::vector > v(in.begin(), in.end()); + test_allocation_exception_for_strong_guarantee(v, in, v.max_size() + 1); + } + check_new_delete_called(); + + { + int a[] = {1, 2, 3, 4, 5}; + std::vector in(a, a + sizeof(a) / sizeof(a[0])); + std::vector > v(in.begin(), in.end()); + test_allocation_exception_for_strong_guarantee(v, in, v.max_size() + 1); + } + check_new_delete_called(); + + { + int a[] = {1, 2, 3, 4, 5}; + std::vector in(a, a + sizeof(a) / sizeof(a[0])); + std::vector > v(in.begin(), in.end()); + test_allocation_exception_for_strong_guarantee(v, in, v.max_size() + 1); + } + check_new_delete_called(); + + { + std::vector in(10, 42); + std::vector > v(in.begin(), in.end()); + test_allocation_exception_for_strong_guarantee(v, in, v.max_size() + 1); + } + check_new_delete_called(); + +#if TEST_STD_VER >= 23 + { + std::vector in{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + std::vector> v(in.begin(), in.end()); + test_allocation_exception_for_strong_guarantee(v, in, v.max_size() + 1); + } + check_new_delete_called(); +#endif + + // + // Tests for std::bad_alloc during reallocation failures + // + { + std::vector in(10, 42); + std::vector > v(in.begin(), in.end()); + test_allocation_exception_for_strong_guarantee(v, in, 91); + } + check_new_delete_called(); + + { + std::vector in(10, 42); + std::vector > v(in.begin(), in.end()); + v.reserve(30); + test_allocation_exception_for_strong_guarantee(v, in, 61); + } + check_new_delete_called(); + + { // Practical example: Testing with 100 integers. + auto in = getIntegerInputs(100); + std::vector > v(in.begin(), in.end()); + test_allocation_exception_for_strong_guarantee(v, in, 200); + } + check_new_delete_called(); + + { // Practical example: Testing with 100 strings, each 256 characters long. 
+ std::vector in = getStringInputsWithLength(100, 256); + std::vector > v(in.begin(), in.end()); + test_allocation_exception_for_strong_guarantee(v, in, 200); + } + check_new_delete_called(); +} + +// Check the strong exception guarantee during copy-constructor failures +void test_copy_ctor_exceptions() { + { + int a[] = {1, 2, 3, 4, 5}; + std::vector in(a, a + sizeof(a) / sizeof(a[0])); + std::vector > v; + test_copy_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); + + { + int a[] = {1, 2, 3, 4, 5}; + std::vector in(a, a + sizeof(a) / sizeof(a[0])); + std::vector, min_allocator > > v; + test_copy_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); + + { + std::vector in(10, 42); + std::vector, safe_allocator > > v; + test_copy_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); + + { + std::vector in(10, 42); + std::vector, test_allocator > > v; + test_copy_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); + + { + std::vector in(10, 42); + std::vector, limited_allocator, 100> > v; + test_copy_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); + +#if TEST_STD_VER >= 23 + { + std::vector in{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + std::vector, increasing_allocator>> v; + test_copy_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); +#endif + + { // Practical example: Testing with 100 integers. + auto in = getIntegerInputs(100); + std::vector > v; + test_copy_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); + + { // Practical example: Testing with 100 strings, each 256 characters long. + std::vector in = getStringInputsWithLength(100, 256); + std::vector > v; + test_copy_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); +} + +int main(int, char**) { + test_allocation_exceptions(); + test_copy_ctor_exceptions(); +} diff --git a/libcxx/test/std/containers/sequences/vector/vector.capacity/shrink_to_fit_exceptions.pass.cpp b/libcxx/test/std/containers/sequences/vector/vector.capacity/shrink_to_fit_exceptions.pass.cpp new file mode 100644 index 0000000000000..521a25fdeda0f --- /dev/null +++ b/libcxx/test/std/containers/sequences/vector/vector.capacity/shrink_to_fit_exceptions.pass.cpp @@ -0,0 +1,257 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: no-exceptions + +// This test file validates that std::vector::shrink_to_fit provides the strong exception guarantee when +// T is Cpp17MoveInsertable and its move constructor does not throw exceptions during the shrink_to_fit +// call. Additionally, it checks that for move-only types where T's move constructor is not noexcept, only +// the basic exception guarantee is ensured. 
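+//
+// Concretely, the checks below assert that a shrink_to_fit call that fails to
+// reallocate leaves v.data(), v.size(), v.capacity() and all elements
+// untouched, while the throwing-move variants only require that v remain in
+// some valid (unspecified) state afterwards.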
+
+#include
+#include
+#include
+#include
+
+#include "../common.h"
+#include "MoveOnly.h"
+#include "count_new.h"
+#include "increasing_allocator.h"
+#include "min_allocator.h"
+#include "test_allocator.h"
+#include "test_iterators.h"
+#include "test_macros.h"
+
+template
+void test_allocation_exception_for_strong_guarantee(std::vector& v, const std::vector& values) {
+  assert(v.size() == values.size());
+  T* old_data = v.data();
+  std::size_t old_size = v.size();
+  std::size_t old_cap = v.capacity();
+
+  try {
+    v.shrink_to_fit();
+  } catch (...) {
+  }
+
+  // As shrink_to_fit may swallow any exceptions, we place the checks outside the catch block.
+  assert(v.data() == old_data);
+  assert(v.size() == old_size);
+  assert(v.capacity() == old_cap);
+  for (std::size_t i = 0; i < v.size(); ++i)
+    assert(v[i] == values[i]);
+}
+
+template
+void test_copy_ctor_exception_for_strong_guarantee(std::vector, Alloc>& v,
+                                                   const std::vector& values) {
+  assert(v.empty() && !values.empty());
+  v.reserve(values.size() * 2);
+  int throw_after = values.size() + values.size() / 2; // Trigger an exception halfway through reallocation
+  for (std::size_t i = 0; i < values.size(); ++i)
+    v.emplace_back(values[i], throw_after);
+
+  throwing_data* old_data = v.data();
+  std::size_t old_size = v.size();
+  std::size_t old_cap = v.capacity();
+
+  try {
+    v.shrink_to_fit();
+  } catch (...) {
+  }
+
+  assert(v.data() == old_data);
+  assert(v.size() == old_size);
+  assert(v.capacity() == old_cap);
+  for (std::size_t i = 0; i < v.size(); ++i)
+    assert(v[i].data_ == values[i]);
+}
+
+#if TEST_STD_VER >= 11
+
+template
+void test_move_ctor_exception_for_basic_guarantee(std::vector, Alloc>& v,
+                                                  const std::vector& values) {
+  assert(v.empty() && !values.empty());
+  v.reserve(values.size() * 2);
+  int throw_after = values.size() + values.size() / 2; // Trigger an exception halfway through reallocation
+  for (std::size_t i = 0; i < values.size(); ++i)
+    v.emplace_back(values[i], throw_after);
+
+  try {
+    v.shrink_to_fit();
+  } catch (...) {
+  }
+  use_unspecified_but_valid_state_vector(v);
+}
+
+#endif
+
+// Check the strong exception guarantee during reallocation failures
+void test_allocation_exceptions() {
+  {
+    int a[] = {1, 2, 3, 4, 5};
+    std::vector in(a, a + sizeof(a) / sizeof(a[0]));
+    std::vector > v;
+    v.reserve(100);
+    for (std::size_t i = 0; i < in.size(); ++i)
+      v.push_back(in[i]);
+    test_allocation_exception_for_strong_guarantee(v, in);
+  }
+  check_new_delete_called();
+
+  {
+    std::vector in(50, 42);
+    std::vector > v;
+    v.reserve(100);
+    for (std::size_t i = 0; i < in.size(); ++i)
+      v.push_back(in[i]);
+    test_allocation_exception_for_strong_guarantee(v, in);
+  }
+  check_new_delete_called();
+
+  {
+    std::vector in(10, 42);
+    std::vector > v(in.begin(), in.end());
+    v.reserve(90);
+    test_allocation_exception_for_strong_guarantee(v, in);
+  }
+  check_new_delete_called();
+
+#if TEST_STD_VER >= 11
+  {
+    std::vector in(10);
+    std::vector > v(10);
+    v.reserve(90);
+    test_allocation_exception_for_strong_guarantee(v, in);
+  }
+  check_new_delete_called();
+
+  {
+    std::vector in(10);
+    std::vector > v(10);
+    v.reserve(90);
+    in.insert(in.cbegin() + 5, MoveOnly(42));
+    v.insert(v.cbegin() + 5, MoveOnly(42));
+    test_allocation_exception_for_strong_guarantee(v, in);
+  }
+  check_new_delete_called();
+#endif
+
+  { // Practical example: Testing with 100 integers.
+ auto in = getIntegerInputs(100); + std::vector > v(in.begin(), in.end()); + in.erase(in.end() - 10, in.end()); + v.erase(v.end() - 10, v.end()); + test_allocation_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); + + { // Practical example: Testing with 100 strings, each 256 characters long. + std::vector in = getStringInputsWithLength(100, 256); + std::vector > v(in.begin(), in.end()); + v.reserve(200); + test_allocation_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); +} + +// Check the strong exception guarantee during copy-constructor failures +void test_copy_ctor_exceptions() { + { + int a[] = {1, 2, 3, 4, 5}; + std::vector in(a, a + sizeof(a) / sizeof(a[0])); + std::vector > v; + test_copy_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); + + { + int a[] = {1, 2, 3, 4, 5}; + std::vector in(a, a + sizeof(a) / sizeof(a[0])); + std::vector, min_allocator > > v; + test_copy_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); + + { + std::vector in(10, 42); + std::vector, safe_allocator > > v; + test_copy_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); + + { + std::vector in(10, 42); + std::vector, test_allocator > > v; + test_copy_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); + + { + std::vector in(10, 42); + std::vector, limited_allocator, 100> > v; + test_copy_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); + + { // Practical example: Testing with 100 integers. + auto in = getIntegerInputs(100); + std::vector > v; + test_copy_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); + + { // Practical example: Testing with 100 strings, each 256 characters long. + std::vector in = getStringInputsWithLength(100, 256); + std::vector > v; + test_copy_ctor_exception_for_strong_guarantee(v, in); + } + check_new_delete_called(); +} + +#if TEST_STD_VER >= 11 + +// Check that if T is Cpp17MoveInsertible && !Cpp17CopyInsertible, and T's move-ctor is not noexcept, then +// std::vector::shrink_to_fit only provides basic guarantee. +void test_move_ctor_exceptions() { + { + std::vector in{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + std::vector> v; + test_move_ctor_exception_for_basic_guarantee(v, in); + } + check_new_delete_called(); + +# if TEST_STD_VER >= 23 + { + std::vector in{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + std::vector, increasing_allocator>> v; + test_move_ctor_exception_for_basic_guarantee(v, in); + } + check_new_delete_called(); +# endif + + { + // Practical example: Testing with 100 strings, each 256 characters long. + std::vector in = getStringInputsWithLength(100, 256); + std::vector > v; + test_move_ctor_exception_for_basic_guarantee(v, in); + } + check_new_delete_called(); +} + +#endif + +int main(int, char**) { + test_allocation_exceptions(); + test_copy_ctor_exceptions(); +#if TEST_STD_VER >= 11 + test_move_ctor_exceptions(); +#endif + return 0; +} From e44f03dd4ea84d3c12c916fdf02d63503c2872e2 Mon Sep 17 00:00:00 2001 From: Michael Maitland Date: Mon, 13 Jan 2025 15:38:03 -0500 Subject: [PATCH 326/408] [RISCV][VLOPT] Add floating point widening and narrowing bf16 convert support (#122353) We already have getOperandInfo tests that cover this instruction. 
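
As a sketch of the effect (illustrative only; the registers, LMUL and VL
choices below are taken from the updated tests rather than mandated by the
patch), a widening bf16 convert that previously had to run at VLMAX:

  vsetvli a1, zero, e16, mf4, ta, ma
  vfwcvtbf16.f.f.v v10, v8
  vsetvli zero, a0, e32, mf2, ta, ma
  vfadd.vv v8, v10, v10

can now have its VL reduced to the VL its user demands:

  vsetvli zero, a0, e16, mf4, ta, ma
  vfwcvtbf16.f.f.v v10, v8
  vsetvli zero, zero, e32, mf2, ta, ma
  vfadd.vv v8, v10, v10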
--- llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp | 6 +- .../CodeGen/RISCV/rvv/fixed-vectors-fp.ll | 44 +- llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll | 294 +++++------ llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll | 294 +++++------ llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll | 458 +++++++++--------- llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll | 215 ++++---- llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll | 207 ++++---- llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll | 74 +-- llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll | 74 +-- .../test/CodeGen/RISCV/rvv/vfptosi-vp-mask.ll | 8 +- llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll | 31 +- .../test/CodeGen/RISCV/rvv/vfptoui-vp-mask.ll | 8 +- llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll | 31 +- llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll | 66 +-- llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll | 207 ++++---- llvm/test/CodeGen/RISCV/rvv/vl-opt.mir | 12 +- 16 files changed, 1014 insertions(+), 1015 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index 69ee210071286..8156eaff8a04c 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -551,6 +551,7 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) { case RISCV::VFWCVT_F_XU_V: case RISCV::VFWCVT_F_X_V: case RISCV::VFWCVT_F_F_V: + case RISCV::VFWCVTBF16_F_F_V: return IsMODef ? MILog2SEW + 1 : MILog2SEW; // Def and Op1 uses EEW=2*SEW. Op2 uses EEW=SEW. @@ -607,7 +608,8 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) { case RISCV::VFNCVT_F_XU_W: case RISCV::VFNCVT_F_X_W: case RISCV::VFNCVT_F_F_W: - case RISCV::VFNCVT_ROD_F_F_W: { + case RISCV::VFNCVT_ROD_F_F_W: + case RISCV::VFNCVTBF16_F_F_W: { bool IsOp1 = HasPassthru ? MO.getOperandNo() == 2 : MO.getOperandNo() == 1; bool TwoTimes = IsOp1; return TwoTimes ? 
MILog2SEW + 1 : MILog2SEW; @@ -1045,6 +1047,7 @@ static bool isSupportedInstr(const MachineInstr &MI) { case RISCV::VFWCVT_F_XU_V: case RISCV::VFWCVT_F_X_V: case RISCV::VFWCVT_F_F_V: + case RISCV::VFWCVTBF16_F_F_V: // Narrowing Floating-Point/Integer Type-Convert Instructions case RISCV::VFNCVT_XU_F_W: case RISCV::VFNCVT_X_F_W: @@ -1054,6 +1057,7 @@ static bool isSupportedInstr(const MachineInstr &MI) { case RISCV::VFNCVT_F_X_W: case RISCV::VFNCVT_F_F_W: case RISCV::VFNCVT_ROD_F_F_W: + case RISCV::VFNCVTBF16_F_F_W: return true; } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll index b8710a518287a..49db94e1a02df 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll @@ -32,12 +32,11 @@ define void @fadd_v6bf16(ptr %x, ptr %y) { ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a1) ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfadd.vv v8, v12, v10 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 ; CHECK-NEXT: vse16.v v10, (a0) ; CHECK-NEXT: ret @@ -167,12 +166,11 @@ define void @fsub_v6bf16(ptr %x, ptr %y) { ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a1) ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfsub.vv v8, v12, v10 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 ; CHECK-NEXT: vse16.v v10, (a0) ; CHECK-NEXT: ret @@ -302,12 +300,11 @@ define void @fmul_v6bf16(ptr %x, ptr %y) { ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a1) ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfmul.vv v8, v12, v10 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 ; CHECK-NEXT: vse16.v v10, (a0) ; CHECK-NEXT: ret @@ -437,12 +434,11 @@ define void @fdiv_v6bf16(ptr %x, ptr %y) { ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a1) ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfdiv.vv v8, v12, v10 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 ; CHECK-NEXT: vse16.v v10, (a0) ; CHECK-NEXT: ret @@ -1196,9 +1192,7 @@ define void @copysign_neg_trunc_v3bf16_v3f32(ptr %x, ptr %y) { ; CHECK-NEXT: lui a1, 8 ; CHECK-NEXT: addi a2, a1, -1 ; CHECK-NEXT: vand.vx v8, v8, a2 -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v10, v9 -; CHECK-NEXT: vsetivli zero, 3, e16, mf2, ta, ma ; CHECK-NEXT: vxor.vx v9, v10, a1 ; CHECK-NEXT: vand.vx v9, v9, a1 ; CHECK-NEXT: vor.vv v8, v8, v9 @@ -2263,13 +2257,12 @@ define void 
@fadd_vf_v6bf16(ptr %x, bfloat %y) { ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: fmv.x.w a1, fa0 -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfadd.vv v8, v10, v12 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 ; CHECK-NEXT: vse16.v v10, (a0) ; CHECK-NEXT: ret @@ -2404,13 +2397,12 @@ define void @fadd_fv_v6bf16(ptr %x, bfloat %y) { ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: fmv.x.w a1, fa0 -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfadd.vv v8, v12, v10 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 ; CHECK-NEXT: vse16.v v10, (a0) ; CHECK-NEXT: ret @@ -2545,13 +2537,12 @@ define void @fsub_vf_v6bf16(ptr %x, bfloat %y) { ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: fmv.x.w a1, fa0 -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfsub.vv v8, v10, v12 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 ; CHECK-NEXT: vse16.v v10, (a0) ; CHECK-NEXT: ret @@ -2686,13 +2677,12 @@ define void @fsub_fv_v6bf16(ptr %x, bfloat %y) { ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: fmv.x.w a1, fa0 -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfsub.vv v8, v12, v10 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 ; CHECK-NEXT: vse16.v v10, (a0) ; CHECK-NEXT: ret @@ -2827,13 +2817,12 @@ define void @fmul_vf_v6bf16(ptr %x, bfloat %y) { ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: fmv.x.w a1, fa0 -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfmul.vv v8, v10, v12 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 ; CHECK-NEXT: vse16.v v10, (a0) ; CHECK-NEXT: ret @@ -2968,13 +2957,12 @@ define void @fmul_fv_v6bf16(ptr %x, bfloat %y) { ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: fmv.x.w a1, fa0 -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfmul.vv v8, v12, v10 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; 
CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 ; CHECK-NEXT: vse16.v v10, (a0) ; CHECK-NEXT: ret @@ -3109,13 +3097,12 @@ define void @fdiv_vf_v6bf16(ptr %x, bfloat %y) { ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: fmv.x.w a1, fa0 -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfdiv.vv v8, v10, v12 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 ; CHECK-NEXT: vse16.v v10, (a0) ; CHECK-NEXT: ret @@ -3250,13 +3237,12 @@ define void @fdiv_fv_v6bf16(ptr %x, bfloat %y) { ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: fmv.x.w a1, fa0 -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfdiv.vv v8, v12, v10 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 ; CHECK-NEXT: vse16.v v10, (a0) ; CHECK-NEXT: ret @@ -4908,7 +4894,6 @@ define void @fmuladd_v6bf16(ptr %x, ptr %y, ptr %z) { ; CHECK-NEXT: vle16.v v8, (a1) ; CHECK-NEXT: vle16.v v9, (a0) ; CHECK-NEXT: vle16.v v10, (a2) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -4919,7 +4904,7 @@ define void @fmuladd_v6bf16(ptr %x, ptr %y, ptr %z) { ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfadd.vv v8, v8, v12 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 ; CHECK-NEXT: vse16.v v10, (a0) ; CHECK-NEXT: ret @@ -5082,7 +5067,6 @@ define void @fmsub_fmuladd_v6bf16(ptr %x, ptr %y, ptr %z) { ; CHECK-NEXT: vle16.v v8, (a1) ; CHECK-NEXT: vle16.v v9, (a0) ; CHECK-NEXT: vle16.v v10, (a2) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -5093,7 +5077,7 @@ define void @fmsub_fmuladd_v6bf16(ptr %x, ptr %y, ptr %z) { ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfsub.vv v8, v8, v12 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 ; CHECK-NEXT: vse16.v v10, (a0) ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll index 33fe73a097e32..7e0c3f45de463 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll @@ -19,14 +19,14 @@ declare @llvm.vp.maximum.nxv1bf16(, < define @vfmax_vv_nxv1bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmax_vv_nxv1bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v11, v11, v0.t -; CHECK-NEXT: vsetvli a1, 
zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmerge.vvm v9, v11, v8, v0 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmfeq.vv v0, v8, v8, v0.t @@ -43,13 +43,13 @@ define @vfmax_vv_nxv1bf16( %va, @vfmax_vv_nxv1bf16_unmasked( %va, %vb, i32 zeroext %evl) { ; CHECK-LABEL: vfmax_vv_nxv1bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmerge.vvm v9, v10, v8, v0 ; CHECK-NEXT: vmfeq.vv v0, v8, v8 ; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 @@ -66,14 +66,14 @@ declare @llvm.vp.maximum.nxv2bf16(, < define @vfmax_vv_nxv2bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmax_vv_nxv2bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v11, v11, v0.t -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vmerge.vvm v9, v11, v8, v0 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmfeq.vv v0, v8, v8, v0.t @@ -90,13 +90,13 @@ define @vfmax_vv_nxv2bf16( %va, @vfmax_vv_nxv2bf16_unmasked( %va, %vb, i32 zeroext %evl) { ; CHECK-LABEL: vfmax_vv_nxv2bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vmerge.vvm v9, v10, v8, v0 ; CHECK-NEXT: vmfeq.vv v0, v8, v8 ; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 @@ -113,15 +113,15 @@ declare @llvm.vp.maximum.nxv4bf16(, < define @vfmax_vv_nxv4bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmax_vv_nxv4bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vmfeq.vv v8, v12, v12, v0.t -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 ; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vmerge.vvm v16, v12, 
v14, v0 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmfeq.vv v8, v14, v14, v0.t @@ -139,13 +139,13 @@ define @vfmax_vv_nxv4bf16( %va, @vfmax_vv_nxv4bf16_unmasked( %va, %vb, i32 zeroext %evl) { ; CHECK-LABEL: vfmax_vv_nxv4bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 ; CHECK-NEXT: vmfeq.vv v0, v12, v12 ; CHECK-NEXT: vmerge.vvm v10, v12, v10, v0 @@ -162,15 +162,15 @@ declare @llvm.vp.maximum.nxv8bf16(, < define @vfmax_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmax_vv_nxv8bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmfeq.vv v8, v16, v16, v0.t -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v10 ; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmerge.vvm v24, v16, v20, v0 ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vmfeq.vv v8, v20, v20, v0.t @@ -188,13 +188,13 @@ define @vfmax_vv_nxv8bf16( %va, @vfmax_vv_nxv8bf16_unmasked( %va, %vb, i32 zeroext %evl) { ; CHECK-LABEL: vfmax_vv_nxv8bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 ; CHECK-NEXT: vmfeq.vv v0, v16, v16 ; CHECK-NEXT: vmerge.vvm v12, v16, v12, v0 @@ -217,15 +217,15 @@ define @vfmax_vv_nxv16bf16( %va, @vfmax_vv_nxv16bf16_unmasked( @vfmax_vv_nxv32bf16( %va, @vfmax_vv_nxv32bf16( %va, @vfmax_vv_nxv32bf16( %va, @vfmax_vv_nxv32bf16_unmasked( @vfmax_vv_nxv32bf16_unmasked( @vfmax_vv_nxv32bf16_unmasked( @llvm.vp.minimum.nxv1bf16(, < define @vfmin_vv_nxv1bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmin_vv_nxv1bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v11, v11, v0.t -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmerge.vvm v9, v11, v8, v0 ; 
CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmfeq.vv v0, v8, v8, v0.t @@ -43,13 +43,13 @@ define @vfmin_vv_nxv1bf16( %va, @vfmin_vv_nxv1bf16_unmasked( %va, %vb, i32 zeroext %evl) { ; CHECK-LABEL: vfmin_vv_nxv1bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmerge.vvm v9, v10, v8, v0 ; CHECK-NEXT: vmfeq.vv v0, v8, v8 ; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 @@ -66,14 +66,14 @@ declare @llvm.vp.minimum.nxv2bf16(, < define @vfmin_vv_nxv2bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmin_vv_nxv2bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v11, v11, v0.t -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vmerge.vvm v9, v11, v8, v0 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmfeq.vv v0, v8, v8, v0.t @@ -90,13 +90,13 @@ define @vfmin_vv_nxv2bf16( %va, @vfmin_vv_nxv2bf16_unmasked( %va, %vb, i32 zeroext %evl) { ; CHECK-LABEL: vfmin_vv_nxv2bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vmerge.vvm v9, v10, v8, v0 ; CHECK-NEXT: vmfeq.vv v0, v8, v8 ; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 @@ -113,15 +113,15 @@ declare @llvm.vp.minimum.nxv4bf16(, < define @vfmin_vv_nxv4bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmin_vv_nxv4bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vmfeq.vv v8, v12, v12, v0.t -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 ; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vmerge.vvm v16, v12, v14, v0 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmfeq.vv v8, v14, v14, v0.t @@ -139,13 +139,13 @@ define @vfmin_vv_nxv4bf16( %va, @vfmin_vv_nxv4bf16_unmasked( %va, %vb, i32 zeroext %evl) { ; CHECK-LABEL: vfmin_vv_nxv4bf16_unmasked: ; CHECK: # %bb.0: -; 
CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 ; CHECK-NEXT: vmfeq.vv v0, v12, v12 ; CHECK-NEXT: vmerge.vvm v10, v12, v10, v0 @@ -162,15 +162,15 @@ declare @llvm.vp.minimum.nxv8bf16(, < define @vfmin_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmin_vv_nxv8bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmfeq.vv v8, v16, v16, v0.t -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v10 ; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmerge.vvm v24, v16, v20, v0 ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vmfeq.vv v8, v20, v20, v0.t @@ -188,13 +188,13 @@ define @vfmin_vv_nxv8bf16( %va, @vfmin_vv_nxv8bf16_unmasked( %va, %vb, i32 zeroext %evl) { ; CHECK-LABEL: vfmin_vv_nxv8bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 ; CHECK-NEXT: vmfeq.vv v0, v16, v16 ; CHECK-NEXT: vmerge.vvm v12, v16, v12, v0 @@ -217,15 +217,15 @@ define @vfmin_vv_nxv16bf16( %va, @vfmin_vv_nxv16bf16_unmasked( @vfmin_vv_nxv32bf16( %va, @vfmin_vv_nxv32bf16( %va, @vfmin_vv_nxv32bf16( %va, @vfmin_vv_nxv32bf16_unmasked( @vfmin_vv_nxv32bf16_unmasked( @vfmin_vv_nxv32bf16_unmasked( @llvm.vp.fcmp.nxv1bf16(, @fcmp_oeq_vv_nxv1bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oeq_vv_nxv1bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v9, v10, v0.t ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"oeq", %m, i32 %evl) @@ -31,11 +31,11 @@ define @fcmp_oeq_vf_nxv1bf16( %va, bfloat ; CHECK-LABEL: fcmp_oeq_vf_nxv1bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, 
ta, ma ; CHECK-NEXT: vmfeq.vv v0, v10, v8, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 @@ -48,11 +48,11 @@ define @fcmp_oeq_vf_swap_nxv1bf16( %va, b ; CHECK-LABEL: fcmp_oeq_vf_swap_nxv1bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v10, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 @@ -64,10 +64,10 @@ define @fcmp_oeq_vf_swap_nxv1bf16( %va, b define @fcmp_ogt_vv_nxv1bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ogt_vv_nxv1bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmflt.vv v0, v8, v10, v0.t ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"ogt", %m, i32 %evl) @@ -78,11 +78,11 @@ define @fcmp_ogt_vf_nxv1bf16( %va, bfloat ; CHECK-LABEL: fcmp_ogt_vf_nxv1bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmflt.vv v0, v8, v10, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 @@ -95,11 +95,11 @@ define @fcmp_ogt_vf_swap_nxv1bf16( %va, b ; CHECK-LABEL: fcmp_ogt_vf_swap_nxv1bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmflt.vv v0, v10, v8, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 @@ -111,10 +111,10 @@ define @fcmp_ogt_vf_swap_nxv1bf16( %va, b define @fcmp_oge_vv_nxv1bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oge_vv_nxv1bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmfle.vv v0, v8, v10, v0.t ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"oge", %m, i32 %evl) @@ -125,11 +125,11 @@ define @fcmp_oge_vf_nxv1bf16( %va, bfloat ; CHECK-LABEL: fcmp_oge_vf_nxv1bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmfle.vv v0, v8, v10, v0.t ; CHECK-NEXT: ret %elt.head = insertelement 
poison, bfloat %b, i32 0 @@ -142,11 +142,11 @@ define @fcmp_oge_vf_swap_nxv1bf16( %va, b ; CHECK-LABEL: fcmp_oge_vf_swap_nxv1bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmfle.vv v0, v10, v8, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 @@ -158,10 +158,10 @@ define @fcmp_oge_vf_swap_nxv1bf16( %va, b define @fcmp_olt_vv_nxv1bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_olt_vv_nxv1bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmflt.vv v0, v9, v10, v0.t ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"olt", %m, i32 %evl) @@ -172,11 +172,11 @@ define @fcmp_olt_vf_nxv1bf16( %va, bfloat ; CHECK-LABEL: fcmp_olt_vf_nxv1bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmflt.vv v0, v10, v8, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 @@ -189,11 +189,11 @@ define @fcmp_olt_vf_swap_nxv1bf16( %va, b ; CHECK-LABEL: fcmp_olt_vf_swap_nxv1bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmflt.vv v0, v8, v10, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 @@ -205,10 +205,10 @@ define @fcmp_olt_vf_swap_nxv1bf16( %va, b define @fcmp_ole_vv_nxv1bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ole_vv_nxv1bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmfle.vv v0, v9, v10, v0.t ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"ole", %m, i32 %evl) @@ -219,11 +219,11 @@ define @fcmp_ole_vf_nxv1bf16( %va, bfloat ; CHECK-LABEL: fcmp_ole_vf_nxv1bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmfle.vv v0, v10, v8, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 @@ -236,11 +236,11 @@ define @fcmp_ole_vf_swap_nxv1bf16( 
%va, b ; CHECK-LABEL: fcmp_ole_vf_swap_nxv1bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmfle.vv v0, v8, v10, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 @@ -252,10 +252,10 @@ define @fcmp_ole_vf_swap_nxv1bf16( %va, b define @fcmp_one_vv_nxv1bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_one_vv_nxv1bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmflt.vv v8, v9, v10, v0.t ; CHECK-NEXT: vmflt.vv v9, v10, v9, v0.t ; CHECK-NEXT: vmor.mm v0, v9, v8 @@ -268,11 +268,11 @@ define @fcmp_one_vf_nxv1bf16( %va, bfloat ; CHECK-LABEL: fcmp_one_vf_nxv1bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmflt.vv v8, v9, v10, v0.t ; CHECK-NEXT: vmflt.vv v9, v10, v9, v0.t ; CHECK-NEXT: vmor.mm v0, v9, v8 @@ -287,11 +287,11 @@ define @fcmp_one_vf_swap_nxv1bf16( %va, b ; CHECK-LABEL: fcmp_one_vf_swap_nxv1bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmflt.vv v8, v10, v9, v0.t ; CHECK-NEXT: vmflt.vv v9, v9, v10, v0.t ; CHECK-NEXT: vmor.mm v0, v9, v8 @@ -305,10 +305,10 @@ define @fcmp_one_vf_swap_nxv1bf16( %va, b define @fcmp_ord_vv_nxv1bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ord_vv_nxv1bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmfeq.vv v8, v10, v10, v0.t ; CHECK-NEXT: vmfeq.vv v9, v9, v9, v0.t ; CHECK-NEXT: vmand.mm v0, v9, v8 @@ -321,14 +321,14 @@ define @fcmp_ord_vf_nxv1bf16( %va, bfloat ; CHECK-LABEL: fcmp_ord_vf_nxv1bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vmv.v.x v8, a1 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmfeq.vv v9, v9, v9, v0.t -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmfeq.vv v8, v10, 
v10, v0.t ; CHECK-NEXT: vmand.mm v0, v9, v8 ; CHECK-NEXT: ret @@ -342,14 +342,14 @@ define @fcmp_ord_vf_swap_nxv1bf16( %va, b ; CHECK-LABEL: fcmp_ord_vf_swap_nxv1bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vmv.v.x v8, a1 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmfeq.vv v9, v9, v9, v0.t -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmfeq.vv v8, v10, v10, v0.t ; CHECK-NEXT: vmand.mm v0, v8, v9 ; CHECK-NEXT: ret @@ -362,10 +362,10 @@ define @fcmp_ord_vf_swap_nxv1bf16( %va, b define @fcmp_ueq_vv_nxv1bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ueq_vv_nxv1bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmflt.vv v8, v9, v10, v0.t ; CHECK-NEXT: vmflt.vv v9, v10, v9, v0.t ; CHECK-NEXT: vmnor.mm v0, v9, v8 @@ -378,11 +378,11 @@ define @fcmp_ueq_vf_nxv1bf16( %va, bfloat ; CHECK-LABEL: fcmp_ueq_vf_nxv1bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmflt.vv v8, v9, v10, v0.t ; CHECK-NEXT: vmflt.vv v9, v10, v9, v0.t ; CHECK-NEXT: vmnor.mm v0, v9, v8 @@ -397,11 +397,11 @@ define @fcmp_ueq_vf_swap_nxv1bf16( %va, b ; CHECK-LABEL: fcmp_ueq_vf_swap_nxv1bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmflt.vv v8, v10, v9, v0.t ; CHECK-NEXT: vmflt.vv v9, v9, v10, v0.t ; CHECK-NEXT: vmnor.mm v0, v9, v8 @@ -415,10 +415,10 @@ define @fcmp_ueq_vf_swap_nxv1bf16( %va, b define @fcmp_ugt_vv_nxv1bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ugt_vv_nxv1bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmfle.vv v8, v9, v10, v0.t ; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret @@ -430,11 +430,11 @@ define @fcmp_ugt_vf_nxv1bf16( %va, bfloat ; CHECK-LABEL: fcmp_ugt_vf_nxv1bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; 
CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmfle.vv v8, v10, v8, v0.t ; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret @@ -448,11 +448,11 @@ define @fcmp_ugt_vf_swap_nxv1bf16( %va, b ; CHECK-LABEL: fcmp_ugt_vf_swap_nxv1bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmfle.vv v8, v8, v10, v0.t ; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret @@ -465,10 +465,10 @@ define @fcmp_ugt_vf_swap_nxv1bf16( %va, b define @fcmp_uge_vv_nxv1bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_uge_vv_nxv1bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmflt.vv v8, v9, v10, v0.t ; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret @@ -480,11 +480,11 @@ define @fcmp_uge_vf_nxv1bf16( %va, bfloat ; CHECK-LABEL: fcmp_uge_vf_nxv1bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmflt.vv v8, v10, v8, v0.t ; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret @@ -498,11 +498,11 @@ define @fcmp_uge_vf_swap_nxv1bf16( %va, b ; CHECK-LABEL: fcmp_uge_vf_swap_nxv1bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmflt.vv v8, v8, v10, v0.t ; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret @@ -515,10 +515,10 @@ define @fcmp_uge_vf_swap_nxv1bf16( %va, b define @fcmp_ult_vv_nxv1bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ult_vv_nxv1bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmfle.vv v8, v8, v10, v0.t ; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret @@ -530,11 +530,11 @@ define @fcmp_ult_vf_nxv1bf16( %va, bfloat ; CHECK-LABEL: fcmp_ult_vf_nxv1bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmfle.vv v8, v8, v10, v0.t ; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret @@ -548,11 +548,11 @@ define @fcmp_ult_vf_swap_nxv1bf16( %va, b ; CHECK-LABEL: fcmp_ult_vf_swap_nxv1bf16: ; CHECK: # %bb.0: ; 
CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmfle.vv v8, v10, v8, v0.t ; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret @@ -565,10 +565,10 @@ define @fcmp_ult_vf_swap_nxv1bf16( %va, b define @fcmp_ule_vv_nxv1bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ule_vv_nxv1bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmflt.vv v8, v8, v10, v0.t ; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret @@ -580,11 +580,11 @@ define @fcmp_ule_vf_nxv1bf16( %va, bfloat ; CHECK-LABEL: fcmp_ule_vf_nxv1bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmflt.vv v8, v8, v10, v0.t ; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret @@ -598,11 +598,11 @@ define @fcmp_ule_vf_swap_nxv1bf16( %va, b ; CHECK-LABEL: fcmp_ule_vf_swap_nxv1bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmflt.vv v8, v10, v8, v0.t ; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret @@ -615,10 +615,10 @@ define @fcmp_ule_vf_swap_nxv1bf16( %va, b define @fcmp_une_vv_nxv1bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_une_vv_nxv1bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmfne.vv v0, v9, v10, v0.t ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"une", %m, i32 %evl) @@ -629,11 +629,11 @@ define @fcmp_une_vf_nxv1bf16( %va, bfloat ; CHECK-LABEL: fcmp_une_vf_nxv1bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmfne.vv v0, v10, v8, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 @@ -646,11 +646,11 @@ define @fcmp_une_vf_swap_nxv1bf16( %va, b ; CHECK-LABEL: fcmp_une_vf_swap_nxv1bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 
; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmfne.vv v0, v8, v10, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 @@ -662,10 +662,10 @@ define @fcmp_une_vf_swap_nxv1bf16( %va, b define @fcmp_uno_vv_nxv1bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_uno_vv_nxv1bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmfne.vv v8, v10, v10, v0.t ; CHECK-NEXT: vmfne.vv v9, v9, v9, v0.t ; CHECK-NEXT: vmor.mm v0, v9, v8 @@ -678,14 +678,14 @@ define @fcmp_uno_vf_nxv1bf16( %va, bfloat ; CHECK-LABEL: fcmp_uno_vf_nxv1bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vmv.v.x v8, a1 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmfne.vv v9, v9, v9, v0.t -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmfne.vv v8, v10, v10, v0.t ; CHECK-NEXT: vmor.mm v0, v9, v8 ; CHECK-NEXT: ret @@ -699,14 +699,14 @@ define @fcmp_uno_vf_swap_nxv1bf16( %va, b ; CHECK-LABEL: fcmp_uno_vf_swap_nxv1bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vmv.v.x v8, a1 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmfne.vv v9, v9, v9, v0.t -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmfne.vv v8, v10, v10, v0.t ; CHECK-NEXT: vmor.mm v0, v8, v9 ; CHECK-NEXT: ret @@ -721,10 +721,10 @@ declare @llvm.vp.fcmp.nxv3bf16(, @fcmp_oeq_vv_nxv3bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oeq_vv_nxv3bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vmfeq.vv v8, v12, v10, v0.t ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: ret @@ -737,10 +737,10 @@ declare @llvm.vp.fcmp.nxv8bf16(, @fcmp_oeq_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oeq_vv_nxv8bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmfeq.vv v8, v16, v12, v0.t ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: ret @@ -752,11 +752,11 @@ define @fcmp_oeq_vf_nxv8bf16( %va, bfloat ; 
CHECK-LABEL: fcmp_oeq_vf_nxv8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmfeq.vv v8, v12, v16, v0.t ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: ret @@ -770,11 +770,11 @@ define @fcmp_oeq_vf_swap_nxv8bf16( %va, b ; CHECK-LABEL: fcmp_oeq_vf_swap_nxv8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmfeq.vv v8, v16, v12, v0.t ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: ret @@ -787,10 +787,10 @@ define @fcmp_oeq_vf_swap_nxv8bf16( %va, b define @fcmp_ogt_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ogt_vv_nxv8bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: ret @@ -802,11 +802,11 @@ define @fcmp_ogt_vf_nxv8bf16( %va, bfloat ; CHECK-LABEL: fcmp_ogt_vf_nxv8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: ret @@ -820,11 +820,11 @@ define @fcmp_ogt_vf_swap_nxv8bf16( %va, b ; CHECK-LABEL: fcmp_ogt_vf_swap_nxv8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmflt.vv v8, v12, v16, v0.t ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: ret @@ -837,10 +837,10 @@ define @fcmp_ogt_vf_swap_nxv8bf16( %va, b define @fcmp_oge_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oge_vv_nxv8bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmfle.vv v8, v16, v12, v0.t ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: ret @@ -852,11 +852,11 @@ define @fcmp_oge_vf_nxv8bf16( %va, bfloat ; CHECK-LABEL: fcmp_oge_vf_nxv8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: 
vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmfle.vv v8, v16, v12, v0.t ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: ret @@ -870,11 +870,11 @@ define @fcmp_oge_vf_swap_nxv8bf16( %va, b ; CHECK-LABEL: fcmp_oge_vf_swap_nxv8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmfle.vv v8, v12, v16, v0.t ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: ret @@ -887,10 +887,10 @@ define @fcmp_oge_vf_swap_nxv8bf16( %va, b define @fcmp_olt_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_olt_vv_nxv8bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: ret @@ -902,11 +902,11 @@ define @fcmp_olt_vf_nxv8bf16( %va, bfloat ; CHECK-LABEL: fcmp_olt_vf_nxv8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmflt.vv v8, v12, v16, v0.t ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: ret @@ -920,11 +920,11 @@ define @fcmp_olt_vf_swap_nxv8bf16( %va, b ; CHECK-LABEL: fcmp_olt_vf_swap_nxv8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: ret @@ -937,10 +937,10 @@ define @fcmp_olt_vf_swap_nxv8bf16( %va, b define @fcmp_ole_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ole_vv_nxv8bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmfle.vv v8, v16, v12, v0.t ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: ret @@ -952,11 +952,11 @@ define @fcmp_ole_vf_nxv8bf16( %va, bfloat ; CHECK-LABEL: fcmp_ole_vf_nxv8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmfle.vv v8, v12, v16, v0.t ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: ret @@ -970,11 +970,11 @@ define 
@fcmp_ole_vf_swap_nxv8bf16( %va, b ; CHECK-LABEL: fcmp_ole_vf_swap_nxv8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmfle.vv v8, v16, v12, v0.t ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: ret @@ -987,10 +987,10 @@ define @fcmp_ole_vf_swap_nxv8bf16( %va, b define @fcmp_one_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_one_vv_nxv8bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t ; CHECK-NEXT: vmflt.vv v9, v12, v16, v0.t ; CHECK-NEXT: vmor.mm v0, v9, v8 @@ -1003,11 +1003,11 @@ define @fcmp_one_vf_nxv8bf16( %va, bfloat ; CHECK-LABEL: fcmp_one_vf_nxv8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmflt.vv v8, v12, v16, v0.t ; CHECK-NEXT: vmflt.vv v9, v16, v12, v0.t ; CHECK-NEXT: vmor.mm v0, v9, v8 @@ -1022,11 +1022,11 @@ define @fcmp_one_vf_swap_nxv8bf16( %va, b ; CHECK-LABEL: fcmp_one_vf_swap_nxv8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t ; CHECK-NEXT: vmflt.vv v9, v12, v16, v0.t ; CHECK-NEXT: vmor.mm v0, v9, v8 @@ -1040,13 +1040,13 @@ define @fcmp_one_vf_swap_nxv8bf16( %va, b define @fcmp_ord_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ord_vv_nxv8bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmfeq.vv v10, v12, v12, v0.t -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmfeq.vv v8, v12, v12, v0.t ; CHECK-NEXT: vmand.mm v0, v8, v10 ; CHECK-NEXT: ret @@ -1058,14 +1058,14 @@ define @fcmp_ord_vf_nxv8bf16( %va, bfloat ; CHECK-LABEL: fcmp_ord_vf_nxv8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vmv.v.x v8, a1 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmfeq.vv v10, v12, v12, v0.t -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, 
ma +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmfeq.vv v8, v12, v12, v0.t ; CHECK-NEXT: vmand.mm v0, v10, v8 ; CHECK-NEXT: ret @@ -1079,14 +1079,14 @@ define @fcmp_ord_vf_swap_nxv8bf16( %va, b ; CHECK-LABEL: fcmp_ord_vf_swap_nxv8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vmv.v.x v8, a1 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmfeq.vv v10, v12, v12, v0.t -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmfeq.vv v8, v12, v12, v0.t ; CHECK-NEXT: vmand.mm v0, v8, v10 ; CHECK-NEXT: ret @@ -1099,10 +1099,10 @@ define @fcmp_ord_vf_swap_nxv8bf16( %va, b define @fcmp_ueq_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ueq_vv_nxv8bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t ; CHECK-NEXT: vmflt.vv v9, v12, v16, v0.t ; CHECK-NEXT: vmnor.mm v0, v9, v8 @@ -1115,11 +1115,11 @@ define @fcmp_ueq_vf_nxv8bf16( %va, bfloat ; CHECK-LABEL: fcmp_ueq_vf_nxv8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmflt.vv v8, v12, v16, v0.t ; CHECK-NEXT: vmflt.vv v9, v16, v12, v0.t ; CHECK-NEXT: vmnor.mm v0, v9, v8 @@ -1134,11 +1134,11 @@ define @fcmp_ueq_vf_swap_nxv8bf16( %va, b ; CHECK-LABEL: fcmp_ueq_vf_swap_nxv8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t ; CHECK-NEXT: vmflt.vv v9, v12, v16, v0.t ; CHECK-NEXT: vmnor.mm v0, v9, v8 @@ -1152,10 +1152,10 @@ define @fcmp_ueq_vf_swap_nxv8bf16( %va, b define @fcmp_ugt_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ugt_vv_nxv8bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmfle.vv v8, v16, v12, v0.t ; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret @@ -1167,11 +1167,11 @@ define @fcmp_ugt_vf_nxv8bf16( %va, bfloat ; CHECK-LABEL: fcmp_ugt_vf_nxv8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, 
e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmfle.vv v8, v12, v16, v0.t ; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret @@ -1185,11 +1185,11 @@ define @fcmp_ugt_vf_swap_nxv8bf16( %va, b ; CHECK-LABEL: fcmp_ugt_vf_swap_nxv8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmfle.vv v8, v16, v12, v0.t ; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret @@ -1202,10 +1202,10 @@ define @fcmp_ugt_vf_swap_nxv8bf16( %va, b define @fcmp_uge_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_uge_vv_nxv8bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t ; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret @@ -1217,11 +1217,11 @@ define @fcmp_uge_vf_nxv8bf16( %va, bfloat ; CHECK-LABEL: fcmp_uge_vf_nxv8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmflt.vv v8, v12, v16, v0.t ; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret @@ -1235,11 +1235,11 @@ define @fcmp_uge_vf_swap_nxv8bf16( %va, b ; CHECK-LABEL: fcmp_uge_vf_swap_nxv8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t ; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret @@ -1252,10 +1252,10 @@ define @fcmp_uge_vf_swap_nxv8bf16( %va, b define @fcmp_ult_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ult_vv_nxv8bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmfle.vv v8, v16, v12, v0.t ; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret @@ -1267,11 +1267,11 @@ define @fcmp_ult_vf_nxv8bf16( %va, bfloat ; CHECK-LABEL: fcmp_ult_vf_nxv8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, 
zero, e32, m4, ta, ma ; CHECK-NEXT: vmfle.vv v8, v16, v12, v0.t ; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret @@ -1285,11 +1285,11 @@ define @fcmp_ult_vf_swap_nxv8bf16( %va, b ; CHECK-LABEL: fcmp_ult_vf_swap_nxv8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmfle.vv v8, v12, v16, v0.t ; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret @@ -1302,10 +1302,10 @@ define @fcmp_ult_vf_swap_nxv8bf16( %va, b define @fcmp_ule_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ule_vv_nxv8bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t ; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret @@ -1317,11 +1317,11 @@ define @fcmp_ule_vf_nxv8bf16( %va, bfloat ; CHECK-LABEL: fcmp_ule_vf_nxv8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t ; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret @@ -1335,11 +1335,11 @@ define @fcmp_ule_vf_swap_nxv8bf16( %va, b ; CHECK-LABEL: fcmp_ule_vf_swap_nxv8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmflt.vv v8, v12, v16, v0.t ; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret @@ -1352,10 +1352,10 @@ define @fcmp_ule_vf_swap_nxv8bf16( %va, b define @fcmp_une_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_une_vv_nxv8bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmfne.vv v8, v16, v12, v0.t ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: ret @@ -1367,11 +1367,11 @@ define @fcmp_une_vf_nxv8bf16( %va, bfloat ; CHECK-LABEL: fcmp_une_vf_nxv8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmfne.vv v8, v12, v16, v0.t ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: ret @@ -1385,11 +1385,11 @@ define @fcmp_une_vf_swap_nxv8bf16( %va, b ; CHECK-LABEL: fcmp_une_vf_swap_nxv8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: 
fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmfne.vv v8, v16, v12, v0.t ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: ret @@ -1402,13 +1402,13 @@ define @fcmp_une_vf_swap_nxv8bf16( %va, b define @fcmp_uno_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_uno_vv_nxv8bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmfne.vv v10, v12, v12, v0.t -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmfne.vv v8, v12, v12, v0.t ; CHECK-NEXT: vmor.mm v0, v8, v10 ; CHECK-NEXT: ret @@ -1420,14 +1420,14 @@ define @fcmp_uno_vf_nxv8bf16( %va, bfloat ; CHECK-LABEL: fcmp_uno_vf_nxv8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vmv.v.x v8, a1 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmfne.vv v10, v12, v12, v0.t -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmfne.vv v8, v12, v12, v0.t ; CHECK-NEXT: vmor.mm v0, v10, v8 ; CHECK-NEXT: ret @@ -1441,14 +1441,14 @@ define @fcmp_uno_vf_swap_nxv8bf16( %va, b ; CHECK-LABEL: fcmp_uno_vf_swap_nxv8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vmv.v.x v8, a1 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmfne.vv v10, v12, v12, v0.t -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmfne.vv v8, v12, v12, v0.t ; CHECK-NEXT: vmor.mm v0, v8, v10 ; CHECK-NEXT: ret @@ -1521,14 +1521,14 @@ define @fcmp_oeq_vv_nxv64bf16( %va, @fcmp_oeq_vv_nxv64bf16( %va, @fcmp_oeq_vv_nxv64bf16( %va, @fcmp_oeq_vv_nxv64bf16( %va, @fcmp_oeq_vv_nxv64bf16( %va, @fcmp_oeq_vv_nxv64bf16( %va, @llvm.vp.fadd.nxv1bf16(, @vfadd_vv_nxv1bf16( %va, %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfadd_vv_nxv1bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfadd.vv v9, v9, v10, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -32,10 
+32,10 @@ define @vfadd_vv_nxv1bf16( %va, @vfadd_vv_nxv1bf16_unmasked( %va, %b, i32 zeroext %evl) { ; CHECK-LABEL: vfadd_vv_nxv1bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfadd.vv v9, v9, v10 ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -48,11 +48,11 @@ define @vfadd_vf_nxv1bf16( %va, bfloa ; CHECK-LABEL: vfadd_vf_nxv1bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfadd.vv v9, v10, v8, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -67,11 +67,11 @@ define @vfadd_vf_nxv1bf16_commute( %v ; CHECK-LABEL: vfadd_vf_nxv1bf16_commute: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfadd.vv v9, v8, v10, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -86,11 +86,11 @@ define @vfadd_vf_nxv1bf16_unmasked( % ; CHECK-LABEL: vfadd_vf_nxv1bf16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfadd.vv v9, v10, v8 ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -105,11 +105,11 @@ define @vfadd_vf_nxv1bf16_unmasked_commute( @llvm.vp.fadd.nxv2bf16(, @vfadd_vv_nxv2bf16( %va, %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfadd_vv_nxv2bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfadd.vv v9, v9, v10, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -140,10 +140,10 @@ define @vfadd_vv_nxv2bf16( %va, @vfadd_vv_nxv2bf16_unmasked( %va, %b, i32 zeroext %evl) { ; CHECK-LABEL: vfadd_vv_nxv2bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfadd.vv v9, v9, v10 ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -156,11 +156,11 @@ define @vfadd_vf_nxv2bf16( %va, bfloa ; CHECK-LABEL: 
vfadd_vf_nxv2bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfadd.vv v9, v10, v8, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -175,11 +175,11 @@ define @vfadd_vf_nxv2bf16_unmasked( % ; CHECK-LABEL: vfadd_vf_nxv2bf16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfadd.vv v9, v10, v8 ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -195,10 +195,10 @@ declare @llvm.vp.fadd.nxv4bf16(, @vfadd_vv_nxv4bf16( %va, %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfadd_vv_nxv4bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfadd.vv v10, v12, v10, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 @@ -210,10 +210,10 @@ define @vfadd_vv_nxv4bf16( %va, @vfadd_vv_nxv4bf16_unmasked( %va, %b, i32 zeroext %evl) { ; CHECK-LABEL: vfadd_vv_nxv4bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfadd.vv v10, v12, v10 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 @@ -226,11 +226,11 @@ define @vfadd_vf_nxv4bf16( %va, bfloa ; CHECK-LABEL: vfadd_vf_nxv4bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfadd.vv v10, v10, v12, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 @@ -245,11 +245,11 @@ define @vfadd_vf_nxv4bf16_unmasked( % ; CHECK-LABEL: vfadd_vf_nxv4bf16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfadd.vv v10, v10, v12 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 @@ -265,10 +265,10 @@ declare @llvm.vp.fadd.nxv8bf16(, @vfadd_vv_nxv8bf16( %va, %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfadd_vv_nxv8bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: 
vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfadd.vv v12, v16, v12, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 @@ -280,10 +280,10 @@ define @vfadd_vv_nxv8bf16( %va, @vfadd_vv_nxv8bf16_unmasked( %va, %b, i32 zeroext %evl) { ; CHECK-LABEL: vfadd_vv_nxv8bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfadd.vv v12, v16, v12 ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 @@ -296,11 +296,11 @@ define @vfadd_vf_nxv8bf16( %va, bfloa ; CHECK-LABEL: vfadd_vf_nxv8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfadd.vv v12, v12, v16, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 @@ -315,11 +315,11 @@ define @vfadd_vf_nxv8bf16_unmasked( % ; CHECK-LABEL: vfadd_vf_nxv8bf16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfadd.vv v12, v12, v16 ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 @@ -335,10 +335,10 @@ declare @llvm.vp.fadd.nxv16bf16(, < define @vfadd_vv_nxv16bf16( %va, %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfadd_vv_nxv16bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfadd.vv v16, v24, v16, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 @@ -350,10 +350,10 @@ define @vfadd_vv_nxv16bf16( %va, @vfadd_vv_nxv16bf16_unmasked( %va, %b, i32 zeroext %evl) { ; CHECK-LABEL: vfadd_vv_nxv16bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfadd.vv v16, v24, v16 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 @@ -366,11 +366,11 @@ define @vfadd_vf_nxv16bf16( %va, bf ; CHECK-LABEL: vfadd_vf_nxv16bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vmv.v.x v12, a1 ; 
CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfadd.vv v16, v16, v24, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 @@ -385,11 +385,11 @@ define @vfadd_vf_nxv16bf16_unmasked( @vfadd_vv_nxv32bf16( %va, @vfadd_vv_nxv32bf16( %va, @vfadd_vv_nxv32bf16_unmasked( @vfadd_vv_nxv32bf16_unmasked( @vfadd_vf_nxv32bf16( %va, bf ; CHECK-NEXT: add a1, a2, a1 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; CHECK-NEXT: vmv8r.v v16, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; CHECK-NEXT: vmv8r.v v24, v8 ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: addi a3, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20 -; CHECK-NEXT: vsetvli a3, zero, e16, m8, ta, ma ; CHECK-NEXT: vmv.v.x v16, a1 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a3, a1, 3 @@ -546,15 +542,18 @@ define @vfadd_vf_nxv32bf16( %va, bf ; CHECK-NEXT: sltu a2, a0, a3 ; CHECK-NEXT: addi a2, a2, -1 ; CHECK-NEXT: and a2, a2, a3 -; CHECK-NEXT: csrr a3, vlenb -; CHECK-NEXT: slli a4, a3, 3 -; CHECK-NEXT: add a3, a4, a3 -; CHECK-NEXT: add a3, sp, a3 -; CHECK-NEXT: addi a3, a3, 16 -; CHECK-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli a3, zero, e16, m4, ta, ma +; CHECK-NEXT: addi a3, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a3, a2, 3 +; CHECK-NEXT: add a2, a3, a2 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28 -; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfadd.vv v16, v8, v16, v0.t ; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 @@ -564,20 +563,21 @@ define @vfadd_vf_nxv32bf16( %va, bf ; CHECK-NEXT: .LBB24_2: ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a2, a1, 3 -; CHECK-NEXT: add a1, a2, a1 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a1, a0, 3 +; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl1r.v v0, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfadd.vv v16, v16, v24, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: 
vfncvtbf16.f.f.w v8, v16 @@ -604,16 +604,10 @@ define @vfadd_vf_nxv32bf16_unmasked( @vfadd_vf_nxv32bf16_unmasked( @vfadd_vf_nxv32bf16_unmasked( @llvm.vp.fdiv.nxv1bf16(, @vfdiv_vv_nxv1bf16( %va, %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfdiv_vv_nxv1bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfdiv.vv v9, v9, v10, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -32,10 +32,10 @@ define @vfdiv_vv_nxv1bf16( %va, @vfdiv_vv_nxv1bf16_unmasked( %va, %b, i32 zeroext %evl) { ; CHECK-LABEL: vfdiv_vv_nxv1bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfdiv.vv v9, v9, v10 ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -48,11 +48,11 @@ define @vfdiv_vf_nxv1bf16( %va, bfloa ; CHECK-LABEL: vfdiv_vf_nxv1bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfdiv.vv v9, v10, v8, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -67,11 +67,11 @@ define @vfdiv_vf_nxv1bf16_unmasked( % ; CHECK-LABEL: vfdiv_vf_nxv1bf16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfdiv.vv v9, v10, v8 ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -87,10 +87,10 @@ declare @llvm.vp.fdiv.nxv2bf16(, @vfdiv_vv_nxv2bf16( %va, %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfdiv_vv_nxv2bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfdiv.vv v9, v9, v10, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -102,10 +102,10 @@ define @vfdiv_vv_nxv2bf16( %va, @vfdiv_vv_nxv2bf16_unmasked( %va, %b, i32 zeroext %evl) { ; CHECK-LABEL: vfdiv_vv_nxv2bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfdiv.vv v9, v9, v10 ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -118,11 
+118,11 @@ define @vfdiv_vf_nxv2bf16( %va, bfloa ; CHECK-LABEL: vfdiv_vf_nxv2bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfdiv.vv v9, v10, v8, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -137,11 +137,11 @@ define @vfdiv_vf_nxv2bf16_unmasked( % ; CHECK-LABEL: vfdiv_vf_nxv2bf16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfdiv.vv v9, v10, v8 ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -157,10 +157,10 @@ declare @llvm.vp.fdiv.nxv4bf16(, @vfdiv_vv_nxv4bf16( %va, %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfdiv_vv_nxv4bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfdiv.vv v10, v12, v10, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 @@ -172,10 +172,10 @@ define @vfdiv_vv_nxv4bf16( %va, @vfdiv_vv_nxv4bf16_unmasked( %va, %b, i32 zeroext %evl) { ; CHECK-LABEL: vfdiv_vv_nxv4bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfdiv.vv v10, v12, v10 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 @@ -188,11 +188,11 @@ define @vfdiv_vf_nxv4bf16( %va, bfloa ; CHECK-LABEL: vfdiv_vf_nxv4bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfdiv.vv v10, v10, v12, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 @@ -207,11 +207,11 @@ define @vfdiv_vf_nxv4bf16_unmasked( % ; CHECK-LABEL: vfdiv_vf_nxv4bf16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfdiv.vv v10, v10, v12 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 @@ -227,10 +227,10 @@ declare @llvm.vp.fdiv.nxv8bf16(, @vfdiv_vv_nxv8bf16( %va, %b, %m, i32 zeroext %evl) { ; 
CHECK-LABEL: vfdiv_vv_nxv8bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfdiv.vv v12, v16, v12, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 @@ -242,10 +242,10 @@ define @vfdiv_vv_nxv8bf16( %va, @vfdiv_vv_nxv8bf16_unmasked( %va, %b, i32 zeroext %evl) { ; CHECK-LABEL: vfdiv_vv_nxv8bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfdiv.vv v12, v16, v12 ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 @@ -258,11 +258,11 @@ define @vfdiv_vf_nxv8bf16( %va, bfloa ; CHECK-LABEL: vfdiv_vf_nxv8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfdiv.vv v12, v12, v16, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 @@ -277,11 +277,11 @@ define @vfdiv_vf_nxv8bf16_unmasked( % ; CHECK-LABEL: vfdiv_vf_nxv8bf16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfdiv.vv v12, v12, v16 ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 @@ -297,10 +297,10 @@ declare @llvm.vp.fdiv.nxv16bf16(, < define @vfdiv_vv_nxv16bf16( %va, %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfdiv_vv_nxv16bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfdiv.vv v16, v24, v16, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 @@ -312,10 +312,10 @@ define @vfdiv_vv_nxv16bf16( %va, @vfdiv_vv_nxv16bf16_unmasked( %va, %b, i32 zeroext %evl) { ; CHECK-LABEL: vfdiv_vv_nxv16bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfdiv.vv v16, v24, v16 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 @@ -328,11 +328,11 @@ define @vfdiv_vf_nxv16bf16( %va, bf ; CHECK-LABEL: vfdiv_vf_nxv16bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: 
vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vmv.v.x v12, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfdiv.vv v16, v16, v24, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 @@ -347,11 +347,11 @@ define @vfdiv_vf_nxv16bf16_unmasked( @vfdiv_vv_nxv32bf16( %va, @vfdiv_vv_nxv32bf16( %va, @vfdiv_vv_nxv32bf16_unmasked( @vfdiv_vv_nxv32bf16_unmasked( @vfdiv_vf_nxv32bf16( %va, bf ; CHECK-NEXT: add a1, a2, a1 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; CHECK-NEXT: vmv8r.v v16, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; CHECK-NEXT: vmv8r.v v24, v8 ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: addi a3, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20 -; CHECK-NEXT: vsetvli a3, zero, e16, m8, ta, ma ; CHECK-NEXT: vmv.v.x v16, a1 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a3, a1, 3 @@ -508,15 +504,18 @@ define @vfdiv_vf_nxv32bf16( %va, bf ; CHECK-NEXT: sltu a2, a0, a3 ; CHECK-NEXT: addi a2, a2, -1 ; CHECK-NEXT: and a2, a2, a3 -; CHECK-NEXT: csrr a3, vlenb -; CHECK-NEXT: slli a4, a3, 3 -; CHECK-NEXT: add a3, a4, a3 -; CHECK-NEXT: add a3, sp, a3 -; CHECK-NEXT: addi a3, a3, 16 -; CHECK-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli a3, zero, e16, m4, ta, ma +; CHECK-NEXT: addi a3, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a3, a2, 3 +; CHECK-NEXT: add a2, a3, a2 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28 -; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfdiv.vv v16, v8, v16, v0.t ; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 @@ -526,20 +525,21 @@ define @vfdiv_vf_nxv32bf16( %va, bf ; CHECK-NEXT: .LBB22_2: ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a2, a1, 3 -; CHECK-NEXT: add a1, a2, a1 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a1, a0, 3 +; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl1r.v v0, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfdiv.vv v16, v16, v24, v0.t ; 
CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 @@ -566,16 +566,10 @@ define @vfdiv_vf_nxv32bf16_unmasked( @vfdiv_vf_nxv32bf16_unmasked( @vfdiv_vf_nxv32bf16_unmasked( @llvm.vp.maxnum.nxv1bf16(, @vfmax_vv_nxv1bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmax_vv_nxv1bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfmax.vv v9, v9, v10, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -32,10 +32,10 @@ define @vfmax_vv_nxv1bf16( %va, @vfmax_vv_nxv1bf16_unmasked( %va, %vb, i32 zeroext %evl) { ; CHECK-LABEL: vfmax_vv_nxv1bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfmax.vv v9, v9, v10 ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -49,10 +49,10 @@ declare @llvm.vp.maxnum.nxv2bf16(, @vfmax_vv_nxv2bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmax_vv_nxv2bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfmax.vv v9, v9, v10, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -64,10 +64,10 @@ define @vfmax_vv_nxv2bf16( %va, @vfmax_vv_nxv2bf16_unmasked( %va, %vb, i32 zeroext %evl) { ; CHECK-LABEL: vfmax_vv_nxv2bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfmax.vv v9, v9, v10 ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -81,10 +81,10 @@ declare @llvm.vp.maxnum.nxv4bf16(, @vfmax_vv_nxv4bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmax_vv_nxv4bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfmax.vv v10, v12, v10, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 @@ -96,10 +96,10 @@ define @vfmax_vv_nxv4bf16( %va, @vfmax_vv_nxv4bf16_unmasked( %va, %vb, i32 zeroext %evl) { ; CHECK-LABEL: vfmax_vv_nxv4bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfmax.vv v10, v12, v10 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma 
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 @@ -113,10 +113,10 @@ declare @llvm.vp.maxnum.nxv8bf16(, @vfmax_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmax_vv_nxv8bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfmax.vv v12, v16, v12, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 @@ -128,10 +128,10 @@ define @vfmax_vv_nxv8bf16( %va, @vfmax_vv_nxv8bf16_unmasked( %va, %vb, i32 zeroext %evl) { ; CHECK-LABEL: vfmax_vv_nxv8bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfmax.vv v12, v16, v12 ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 @@ -145,10 +145,10 @@ declare @llvm.vp.maxnum.nxv16bf16(, define @vfmax_vv_nxv16bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmax_vv_nxv16bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfmax.vv v16, v24, v16, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 @@ -160,10 +160,10 @@ define @vfmax_vv_nxv16bf16( %va, @vfmax_vv_nxv16bf16_unmasked( %va, %vb, i32 zeroext %evl) { ; CHECK-LABEL: vfmax_vv_nxv16bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfmax.vv v16, v24, v16 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 @@ -183,23 +183,22 @@ define @vfmax_vv_nxv32bf16( %va, @vfmax_vv_nxv32bf16( %va, @vfmax_vv_nxv32bf16_unmasked( @vfmax_vv_nxv32bf16_unmasked( @llvm.vp.minnum.nxv1bf16(, @vfmin_vv_nxv1bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmin_vv_nxv1bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfmin.vv v9, v9, v10, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -32,10 +32,10 @@ define @vfmin_vv_nxv1bf16( %va, @vfmin_vv_nxv1bf16_unmasked( %va, %vb, i32 zeroext %evl) { ; CHECK-LABEL: vfmin_vv_nxv1bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfmin.vv v9, v9, v10 ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, 
ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -49,10 +49,10 @@ declare @llvm.vp.minnum.nxv2bf16(, @vfmin_vv_nxv2bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmin_vv_nxv2bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfmin.vv v9, v9, v10, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -64,10 +64,10 @@ define @vfmin_vv_nxv2bf16( %va, @vfmin_vv_nxv2bf16_unmasked( %va, %vb, i32 zeroext %evl) { ; CHECK-LABEL: vfmin_vv_nxv2bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfmin.vv v9, v9, v10 ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -81,10 +81,10 @@ declare @llvm.vp.minnum.nxv4bf16(, @vfmin_vv_nxv4bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmin_vv_nxv4bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfmin.vv v10, v12, v10, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 @@ -96,10 +96,10 @@ define @vfmin_vv_nxv4bf16( %va, @vfmin_vv_nxv4bf16_unmasked( %va, %vb, i32 zeroext %evl) { ; CHECK-LABEL: vfmin_vv_nxv4bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfmin.vv v10, v12, v10 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 @@ -113,10 +113,10 @@ declare @llvm.vp.minnum.nxv8bf16(, @vfmin_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmin_vv_nxv8bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfmin.vv v12, v16, v12, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 @@ -128,10 +128,10 @@ define @vfmin_vv_nxv8bf16( %va, @vfmin_vv_nxv8bf16_unmasked( %va, %vb, i32 zeroext %evl) { ; CHECK-LABEL: vfmin_vv_nxv8bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfmin.vv v12, v16, v12 ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 @@ -145,10 +145,10 @@ declare @llvm.vp.minnum.nxv16bf16(, define @vfmin_vv_nxv16bf16( %va, 
%vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmin_vv_nxv16bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfmin.vv v16, v24, v16, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 @@ -160,10 +160,10 @@ define @vfmin_vv_nxv16bf16( %va, @vfmin_vv_nxv16bf16_unmasked( %va, %vb, i32 zeroext %evl) { ; CHECK-LABEL: vfmin_vv_nxv16bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfmin.vv v16, v24, v16 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 @@ -183,23 +183,22 @@ define @vfmin_vv_nxv32bf16( %va, @vfmin_vv_nxv32bf16( %va, @vfmin_vv_nxv32bf16_unmasked( @vfmin_vv_nxv32bf16_unmasked( @vfptosi_nxv2i1_nxv2bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfptosi_nxv2i1_nxv2bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v9, v0.t ; CHECK-NEXT: vmsne.vi v0, v8, 0, v0.t ; CHECK-NEXT: ret @@ -20,9 +20,9 @@ define @vfptosi_nxv2i1_nxv2bf16( %va, @vfptosi_nxv2i1_nxv2bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vfptosi_nxv2i1_nxv2bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v9 ; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll index e33ab98c0f85d..ed0023e21497e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll @@ -7,9 +7,8 @@ define @vfptosi_v4i7_v4bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfptosi_v4i7_v4bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vfncvt.rtz.x.f.w v8, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; CHECK-NEXT: vnsrl.wi v8, v8, 0, v0.t @@ -21,9 +20,8 @@ define @vfptosi_v4i7_v4bf16( %va, @vfptosi_nxv2i8_nxv2bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfptosi_nxv2i8_nxv2bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vfncvt.rtz.x.f.w v8, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; CHECK-NEXT: vnsrl.wi v8, v8, 0, v0.t @@ -35,9 +33,8 @@ define @vfptosi_nxv2i8_nxv2bf16( %va, @vfptosi_nxv2i8_nxv2bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vfptosi_nxv2i8_nxv2bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; 
CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vfncvt.rtz.x.f.w v8, v9 ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; CHECK-NEXT: vnsrl.wi v8, v8, 0 @@ -49,9 +46,8 @@ define @vfptosi_nxv2i8_nxv2bf16_unmasked( define @vfptosi_nxv2i16_nxv2bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfptosi_nxv2i16_nxv2bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vfncvt.rtz.x.f.w v8, v9, v0.t ; CHECK-NEXT: ret %v = call @llvm.vp.fptosi.nxv2i16.nxv2bf16( %va, %m, i32 %evl) @@ -61,9 +57,8 @@ define @vfptosi_nxv2i16_nxv2bf16( %va, < define @vfptosi_nxv2i16_nxv2bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vfptosi_nxv2i16_nxv2bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vfncvt.rtz.x.f.w v8, v9 ; CHECK-NEXT: ret %v = call @llvm.vp.fptosi.nxv2i16.nxv2bf16( %va, splat (i1 true), i32 %evl) @@ -73,9 +68,9 @@ define @vfptosi_nxv2i16_nxv2bf16_unmasked( @vfptosi_nxv2i32_nxv2bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfptosi_nxv2i32_nxv2bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v9, v0.t ; CHECK-NEXT: ret %v = call @llvm.vp.fptosi.nxv2i32.nxv2bf16( %va, %m, i32 %evl) @@ -85,9 +80,9 @@ define @vfptosi_nxv2i32_nxv2bf16( %va, < define @vfptosi_nxv2i32_nxv2bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vfptosi_nxv2i32_nxv2bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v9 ; CHECK-NEXT: ret %v = call @llvm.vp.fptosi.nxv2i32.nxv2bf16( %va, splat (i1 true), i32 %evl) @@ -97,9 +92,9 @@ define @vfptosi_nxv2i32_nxv2bf16_unmasked( @vfptosi_nxv2i64_nxv2bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfptosi_nxv2i64_nxv2bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfwcvt.rtz.x.f.v v8, v10, v0.t ; CHECK-NEXT: ret %v = call @llvm.vp.fptosi.nxv2i64.nxv2bf16( %va, %m, i32 %evl) @@ -109,9 +104,9 @@ define @vfptosi_nxv2i64_nxv2bf16( %va, < define @vfptosi_nxv2i64_nxv2bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vfptosi_nxv2i64_nxv2bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfwcvt.rtz.x.f.v v8, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.fptosi.nxv2i64.nxv2bf16( %va, splat (i1 true), i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp-mask.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp-mask.ll 
index e1d0ad4758586..59c6791c12f79 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp-mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp-mask.ll @@ -7,9 +7,9 @@ define @vfptoui_nxv2i1_nxv2bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfptoui_nxv2i1_nxv2bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.rtz.xu.f.v v8, v9, v0.t ; CHECK-NEXT: vmsne.vi v0, v8, 0, v0.t ; CHECK-NEXT: ret @@ -20,9 +20,9 @@ define @vfptoui_nxv2i1_nxv2bf16( %va, @vfptoui_nxv2i1_nxv2bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vfptoui_nxv2i1_nxv2bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.rtz.xu.f.v v8, v9 ; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll index 86222ecfadfea..a11139fea9e5b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll @@ -7,9 +7,8 @@ define @vfptoui_v4i7_v4bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfptoui_v4i7_v4bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vfncvt.rtz.x.f.w v8, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; CHECK-NEXT: vnsrl.wi v8, v8, 0, v0.t @@ -21,9 +20,8 @@ define @vfptoui_v4i7_v4bf16( %va, @vfptoui_nxv2i8_nxv2bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfptoui_nxv2i8_nxv2bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vfncvt.rtz.xu.f.w v8, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; CHECK-NEXT: vnsrl.wi v8, v8, 0, v0.t @@ -35,9 +33,8 @@ define @vfptoui_nxv2i8_nxv2bf16( %va, @vfptoui_nxv2i8_nxv2bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vfptoui_nxv2i8_nxv2bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vfncvt.rtz.xu.f.w v8, v9 ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; CHECK-NEXT: vnsrl.wi v8, v8, 0 @@ -49,9 +46,8 @@ define @vfptoui_nxv2i8_nxv2bf16_unmasked( define @vfptoui_nxv2i16_nxv2bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfptoui_nxv2i16_nxv2bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vfncvt.rtz.xu.f.w v8, v9, v0.t ; CHECK-NEXT: ret %v = call @llvm.vp.fptoui.nxv2i16.nxv2bf16( %va, %m, i32 %evl) @@ -61,9 +57,8 @@ define @vfptoui_nxv2i16_nxv2bf16( %va, < define @vfptoui_nxv2i16_nxv2bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vfptoui_nxv2i16_nxv2bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: 
vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vfncvt.rtz.xu.f.w v8, v9 ; CHECK-NEXT: ret %v = call @llvm.vp.fptoui.nxv2i16.nxv2bf16( %va, splat (i1 true), i32 %evl) @@ -73,9 +68,9 @@ define @vfptoui_nxv2i16_nxv2bf16_unmasked( @vfptoui_nxv2i32_nxv2bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfptoui_nxv2i32_nxv2bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.rtz.xu.f.v v8, v9, v0.t ; CHECK-NEXT: ret %v = call @llvm.vp.fptoui.nxv2i32.nxv2bf16( %va, %m, i32 %evl) @@ -85,9 +80,9 @@ define @vfptoui_nxv2i32_nxv2bf16( %va, < define @vfptoui_nxv2i32_nxv2bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vfptoui_nxv2i32_nxv2bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.rtz.xu.f.v v8, v9 ; CHECK-NEXT: ret %v = call @llvm.vp.fptoui.nxv2i32.nxv2bf16( %va, splat (i1 true), i32 %evl) @@ -97,9 +92,9 @@ define @vfptoui_nxv2i32_nxv2bf16_unmasked( @vfptoui_nxv2i64_nxv2bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfptoui_nxv2i64_nxv2bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfwcvt.rtz.xu.f.v v8, v10, v0.t ; CHECK-NEXT: ret %v = call @llvm.vp.fptoui.nxv2i64.nxv2bf16( %va, %m, i32 %evl) @@ -109,9 +104,9 @@ define @vfptoui_nxv2i64_nxv2bf16( %va, < define @vfptoui_nxv2i64_nxv2bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vfptoui_nxv2i64_nxv2bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfwcvt.rtz.xu.f.v v8, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.fptoui.nxv2i64.nxv2bf16( %va, splat (i1 true), i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll index e94d0a60bbfc7..6193fdd38a642 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll @@ -17,9 +17,9 @@ declare @llvm.vp.sqrt.nxv1bf16(, @vfsqrt_vv_nxv1bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfsqrt_vv_nxv1bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfsqrt.v v9, v9, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -31,9 +31,9 @@ define @vfsqrt_vv_nxv1bf16( %va, @vfsqrt_vv_nxv1bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vfsqrt_vv_nxv1bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli 
zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfsqrt.v v9, v9 ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -47,9 +47,9 @@ declare @llvm.vp.sqrt.nxv2bf16(, @vfsqrt_vv_nxv2bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfsqrt_vv_nxv2bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfsqrt.v v9, v9, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -61,9 +61,9 @@ define @vfsqrt_vv_nxv2bf16( %va, @vfsqrt_vv_nxv2bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vfsqrt_vv_nxv2bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfsqrt.v v9, v9 ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -77,9 +77,9 @@ declare @llvm.vp.sqrt.nxv4bf16(, @vfsqrt_vv_nxv4bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfsqrt_vv_nxv4bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfsqrt.v v10, v10, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 @@ -91,9 +91,9 @@ define @vfsqrt_vv_nxv4bf16( %va, @vfsqrt_vv_nxv4bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vfsqrt_vv_nxv4bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfsqrt.v v10, v10 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 @@ -107,9 +107,9 @@ declare @llvm.vp.sqrt.nxv8bf16(, @vfsqrt_vv_nxv8bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfsqrt_vv_nxv8bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfsqrt.v v12, v12, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 @@ -121,9 +121,9 @@ define @vfsqrt_vv_nxv8bf16( %va, @vfsqrt_vv_nxv8bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vfsqrt_vv_nxv8bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfsqrt.v v12, v12 ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 @@ -137,9 +137,9 @@ declare @llvm.vp.sqrt.nxv16bf16(, < define @vfsqrt_vv_nxv16bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfsqrt_vv_nxv16bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: 
vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfsqrt.v v16, v16, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 @@ -151,9 +151,9 @@ define @vfsqrt_vv_nxv16bf16( %va, < define @vfsqrt_vv_nxv16bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vfsqrt_vv_nxv16bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfsqrt.v v16, v16 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 @@ -173,13 +173,13 @@ define @vfsqrt_vv_nxv32bf16( %va, < ; CHECK-NEXT: slli a1, a2, 1 ; CHECK-NEXT: srli a2, a2, 2 ; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 ; CHECK-NEXT: vslidedown.vx v0, v0, a2 -; CHECK-NEXT: sltu a2, a0, a3 -; CHECK-NEXT: addi a2, a2, -1 -; CHECK-NEXT: and a2, a2, a3 -; CHECK-NEXT: vsetvli a3, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 -; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfsqrt.v v24, v24, v0.t ; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 @@ -187,9 +187,10 @@ define @vfsqrt_vv_nxv32bf16( %va, < ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB10_2: +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 ; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfsqrt.v v16, v24, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 @@ -207,14 +208,14 @@ define @vfsqrt_vv_nxv32bf16_unmasked( @vfsqrt_vv_nxv32bf16_unmasked( @llvm.vp.fsub.nxv1bf16(, @vfsub_vv_nxv1bf16( %va, %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfsub_vv_nxv1bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfsub.vv v9, v9, v10, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -32,10 +32,10 @@ define @vfsub_vv_nxv1bf16( %va, @vfsub_vv_nxv1bf16_unmasked( %va, %b, i32 zeroext %evl) { ; CHECK-LABEL: vfsub_vv_nxv1bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfsub.vv v9, v9, v10 ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -48,11 +48,11 @@ define @vfsub_vf_nxv1bf16( %va, bfloa ; CHECK-LABEL: vfsub_vf_nxv1bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: 
vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfsub.vv v9, v10, v8, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -67,11 +67,11 @@ define @vfsub_vf_nxv1bf16_unmasked( % ; CHECK-LABEL: vfsub_vf_nxv1bf16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfsub.vv v9, v10, v8 ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -87,10 +87,10 @@ declare @llvm.vp.fsub.nxv2bf16(, @vfsub_vv_nxv2bf16( %va, %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfsub_vv_nxv2bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfsub.vv v9, v9, v10, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -102,10 +102,10 @@ define @vfsub_vv_nxv2bf16( %va, @vfsub_vv_nxv2bf16_unmasked( %va, %b, i32 zeroext %evl) { ; CHECK-LABEL: vfsub_vv_nxv2bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfsub.vv v9, v9, v10 ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -118,11 +118,11 @@ define @vfsub_vf_nxv2bf16( %va, bfloa ; CHECK-LABEL: vfsub_vf_nxv2bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfsub.vv v9, v10, v8, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -137,11 +137,11 @@ define @vfsub_vf_nxv2bf16_unmasked( % ; CHECK-LABEL: vfsub_vf_nxv2bf16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfsub.vv v9, v10, v8 ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 @@ -157,10 +157,10 @@ declare @llvm.vp.fsub.nxv4bf16(, @vfsub_vv_nxv4bf16( %va, %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfsub_vv_nxv4bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfsub.vv v10, v12, 
v10, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 @@ -172,10 +172,10 @@ define @vfsub_vv_nxv4bf16( %va, @vfsub_vv_nxv4bf16_unmasked( %va, %b, i32 zeroext %evl) { ; CHECK-LABEL: vfsub_vv_nxv4bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfsub.vv v10, v12, v10 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 @@ -188,11 +188,11 @@ define @vfsub_vf_nxv4bf16( %va, bfloa ; CHECK-LABEL: vfsub_vf_nxv4bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfsub.vv v10, v10, v12, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 @@ -207,11 +207,11 @@ define @vfsub_vf_nxv4bf16_unmasked( % ; CHECK-LABEL: vfsub_vf_nxv4bf16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfsub.vv v10, v10, v12 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 @@ -227,10 +227,10 @@ declare @llvm.vp.fsub.nxv8bf16(, @vfsub_vv_nxv8bf16( %va, %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfsub_vv_nxv8bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfsub.vv v12, v16, v12, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 @@ -242,10 +242,10 @@ define @vfsub_vv_nxv8bf16( %va, @vfsub_vv_nxv8bf16_unmasked( %va, %b, i32 zeroext %evl) { ; CHECK-LABEL: vfsub_vv_nxv8bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfsub.vv v12, v16, v12 ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 @@ -258,11 +258,11 @@ define @vfsub_vf_nxv8bf16( %va, bfloa ; CHECK-LABEL: vfsub_vf_nxv8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfsub.vv v12, v12, v16, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 @@ 
-277,11 +277,11 @@ define @vfsub_vf_nxv8bf16_unmasked( % ; CHECK-LABEL: vfsub_vf_nxv8bf16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfsub.vv v12, v12, v16 ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 @@ -297,10 +297,10 @@ declare @llvm.vp.fsub.nxv16bf16(, < define @vfsub_vv_nxv16bf16( %va, %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfsub_vv_nxv16bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfsub.vv v16, v24, v16, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 @@ -312,10 +312,10 @@ define @vfsub_vv_nxv16bf16( %va, @vfsub_vv_nxv16bf16_unmasked( %va, %b, i32 zeroext %evl) { ; CHECK-LABEL: vfsub_vv_nxv16bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfsub.vv v16, v24, v16 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 @@ -328,11 +328,11 @@ define @vfsub_vf_nxv16bf16( %va, bf ; CHECK-LABEL: vfsub_vf_nxv16bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vmv.v.x v12, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfsub.vv v16, v16, v24, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 @@ -347,11 +347,11 @@ define @vfsub_vf_nxv16bf16_unmasked( @vfsub_vv_nxv32bf16( %va, @vfsub_vv_nxv32bf16( %va, @vfsub_vv_nxv32bf16_unmasked( @vfsub_vv_nxv32bf16_unmasked( @vfsub_vf_nxv32bf16( %va, bf ; CHECK-NEXT: add a1, a2, a1 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; CHECK-NEXT: vmv8r.v v16, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; CHECK-NEXT: vmv8r.v v24, v8 ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: addi a3, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20 -; CHECK-NEXT: vsetvli a3, zero, e16, m8, ta, ma ; CHECK-NEXT: vmv.v.x v16, a1 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a3, a1, 3 @@ -508,15 +504,18 @@ define @vfsub_vf_nxv32bf16( %va, bf ; CHECK-NEXT: sltu a2, a0, a3 ; CHECK-NEXT: addi a2, a2, -1 ; CHECK-NEXT: and a2, a2, a3 -; CHECK-NEXT: csrr a3, vlenb -; CHECK-NEXT: slli a4, a3, 3 -; CHECK-NEXT: add a3, a4, a3 -; CHECK-NEXT: add a3, sp, a3 -; CHECK-NEXT: addi a3, a3, 16 -; CHECK-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload 
-; CHECK-NEXT: vsetvli a3, zero, e16, m4, ta, ma +; CHECK-NEXT: addi a3, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a3, a2, 3 +; CHECK-NEXT: add a2, a3, a2 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28 -; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfsub.vv v16, v8, v16, v0.t ; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 @@ -526,20 +525,21 @@ define @vfsub_vf_nxv32bf16( %va, bf ; CHECK-NEXT: .LBB22_2: ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a2, a1, 3 -; CHECK-NEXT: add a1, a2, a1 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a1, a0, 3 +; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl1r.v v0, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfsub.vv v16, v16, v24, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 @@ -566,16 +566,10 @@ define @vfsub_vf_nxv32bf16_unmasked( @vfsub_vf_nxv32bf16_unmasked( @vfsub_vf_nxv32bf16_unmasked( Date: Mon, 13 Jan 2025 20:40:16 +0000 Subject: [PATCH 327/408] [AArch64] Generate BSP instead of TBL for select shuffles. (#121474) In using BIF/BIT/BSL the constant mask has a larger chance of being regular, being able to be materialized with a movi. On some cpus the BIF/BIT/BSL is slightly quicker too. --- .../Target/AArch64/AArch64ISelLowering.cpp | 17 +++++++++ .../test/CodeGen/AArch64/arm64-neon-3vdiff.ll | 11 +++--- llvm/test/CodeGen/AArch64/shuffle-select.ll | 37 +++++-------------- 3 files changed, 32 insertions(+), 33 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 7e82a433a85ad..3b8e0f1958f86 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -14004,6 +14004,23 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, dl); } + // Check for a "select shuffle", generating a BSL to pick between lanes in + // V1/V2. + if (ShuffleVectorInst::isSelectMask(ShuffleMask, NumElts)) { + assert(VT.getScalarSizeInBits() <= 32 && + "Expected larger vector element sizes to be handled already"); + SmallVector MaskElts; + for (int M : ShuffleMask) + MaskElts.push_back(DAG.getConstant( + M >= static_cast(NumElts) ? 
0 : 0xffffffff, dl, MVT::i32)); + EVT IVT = VT.changeVectorElementTypeToInteger(); + SDValue MaskConst = DAG.getBuildVector(IVT, dl, MaskElts); + return DAG.getBitcast(VT, DAG.getNode(AArch64ISD::BSP, dl, IVT, MaskConst, + DAG.getBitcast(IVT, V1), + DAG.getBitcast(IVT, V2))); + } + + // Fall back to generating a TBL return GenerateTBL(Op, ShuffleMask, DAG); } diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll index 79645e32074c8..9fb8e4c8fe031 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll @@ -2537,14 +2537,13 @@ entry: define <8 x i16> @cmplx_mul_combined_re_im(<8 x i16> noundef %a, i64 %scale.coerce) { ; CHECK-LABEL: cmplx_mul_combined_re_im: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: lsr x9, x0, #16 -; CHECK-NEXT: adrp x8, .LCPI196_0 +; CHECK-NEXT: lsr x8, x0, #16 +; CHECK-NEXT: movi v1.2d, #0xffff0000ffff0000 ; CHECK-NEXT: fmov d5, x0 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI196_0] ; CHECK-NEXT: rev32 v4.8h, v0.8h -; CHECK-NEXT: dup v1.8h, w9 -; CHECK-NEXT: sqneg v2.8h, v1.8h -; CHECK-NEXT: tbl v1.16b, { v1.16b, v2.16b }, v3.16b +; CHECK-NEXT: dup v2.8h, w8 +; CHECK-NEXT: sqneg v3.8h, v2.8h +; CHECK-NEXT: bsl v1.16b, v2.16b, v3.16b ; CHECK-NEXT: sqdmull v2.4s, v0.4h, v5.h[0] ; CHECK-NEXT: sqdmull2 v0.4s, v0.8h, v5.h[0] ; CHECK-NEXT: sqdmlal v2.4s, v4.4h, v1.4h diff --git a/llvm/test/CodeGen/AArch64/shuffle-select.ll b/llvm/test/CodeGen/AArch64/shuffle-select.ll index eeccaa170397d..f4e7b314d2001 100644 --- a/llvm/test/CodeGen/AArch64/shuffle-select.ll +++ b/llvm/test/CodeGen/AArch64/shuffle-select.ll @@ -4,12 +4,8 @@ define <8 x i8> @sel_v8i8(<8 x i8> %v0, <8 x i8> %v1) { ; CHECK-LABEL: sel_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: adrp x8, .LCPI0_0 -; CHECK-NEXT: mov v0.d[1], v1.d[0] -; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI0_0] -; CHECK-NEXT: tbl v0.8b, { v0.16b }, v1.8b +; CHECK-NEXT: movi d2, #0xff00ff00ff00ff +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> ret <8 x i8> %tmp0 @@ -18,11 +14,8 @@ define <8 x i8> @sel_v8i8(<8 x i8> %v0, <8 x i8> %v1) { define <16 x i8> @sel_v16i8(<16 x i8> %v0, <16 x i8> %v1) { ; CHECK-LABEL: sel_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI1_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI1_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-NEXT: movi v2.2d, #0xff00ff00ff00ff +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> ret <16 x i8> %tmp0 @@ -32,10 +25,8 @@ define <16 x i8> @sel_v16i8_poison(<16 x i8> %v0, <16 x i8> %v1) { ; CHECK-LABEL: sel_v16i8_poison: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI2_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI2_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> ret <16 x i8> %tmp0 @@ -45,10 +36,8 @@ define <16 x i8> @sel_v16i8_unregular(<16 x i8> %v0, <16 x i8> %v1) { ; CHECK-LABEL: sel_v16i8_unregular: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, 
.LCPI3_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> ret <16 x i8> %tmp0 @@ -67,11 +56,8 @@ define <4 x i16> @sel_v4i16(<4 x i16> %v0, <4 x i16> %v1) { define <8 x i16> @sel_v8i16(<8 x i16> %v0, <8 x i16> %v1) { ; CHECK-LABEL: sel_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI5_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> ret <8 x i16> %tmp0 @@ -121,11 +107,8 @@ define <4 x half> @sel_v4f16(<4 x half> %v0, <4 x half> %v1) { define <8 x half> @sel_v8f16(<8 x half> %v0, <8 x half> %v1) { ; CHECK-LABEL: sel_v8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI10_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI10_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> ret <8 x half> %tmp0 From 137974002d0f395e9e6e7cb54403c2ed760433a6 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Tue, 14 Jan 2025 00:49:34 +0400 Subject: [PATCH 328/408] [NVPTX] Fix segfault with i128 types in arrays (#120562) - Process i128 array with custom ComputePTXValueVTs. The i128 elements should be handled and split into i64 types in the recursion. - Add corresponding tests --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 9 +++ llvm/test/CodeGen/NVPTX/i128-array.ll | 68 +++++++++++++++++++++ 2 files changed, 77 insertions(+) create mode 100644 llvm/test/CodeGen/NVPTX/i128-array.ll diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 5c1f717694a4c..208d724f7ae28 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -261,6 +261,15 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, return; } + // Given an array type, recursively traverse the elements with custom ComputePTXValueVTs. 
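+  // For example, [2 x i128] at offset 0 yields four i64 pieces at byte
+  // offsets 0, 8, 16 and 24: the recursion visits each element, and the
+  // i128 handling above splits each element into two i64 halves.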
+ if (ArrayType *ATy = dyn_cast(Ty)) { + Type *EltTy = ATy->getElementType(); + uint64_t EltSize = DL.getTypeAllocSize(EltTy); + for (int I : llvm::seq(ATy->getNumElements())) + ComputePTXValueVTs(TLI, DL, EltTy, ValueVTs, Offsets, StartingOffset + I * EltSize); + return; + } + ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset); for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) { EVT VT = TempVTs[i]; diff --git a/llvm/test/CodeGen/NVPTX/i128-array.ll b/llvm/test/CodeGen/NVPTX/i128-array.ll new file mode 100644 index 0000000000000..348df8dcc7373 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/i128-array.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -O0 -march=nvptx64 -mcpu=sm_20 | FileCheck %s + +define [2 x i128] @foo(i64 %a, i32 %b) { +; CHECK-LABEL: foo( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [foo_param_1]; +; CHECK-NEXT: ld.param.u64 %rd1, [foo_param_0]; +; CHECK-NEXT: shr.s64 %rd2, %rd1, 63; +; CHECK-NEXT: cvt.s64.s32 %rd3, %r1; +; CHECK-NEXT: shr.s64 %rd4, %rd3, 63; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd3, %rd4}; +; CHECK-NEXT: ret; + %1 = sext i64 %a to i128 + %2 = sext i32 %b to i128 + %3 = insertvalue [2 x i128] undef, i128 %1, 0 + %4 = insertvalue [2 x i128] %3, i128 %2, 1 + + ret [2 x i128] %4 +} + +define [2 x i128] @foo2(ptr byval([2 x i128]) %a) { +; CHECK-LABEL: foo2( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: mov.b64 %rd1, foo2_param_0; +; CHECK-NEXT: ld.param.u64 %rd2, [foo2_param_0+8]; +; CHECK-NEXT: ld.param.u64 %rd3, [foo2_param_0]; +; CHECK-NEXT: ld.param.u64 %rd4, [foo2_param_0+24]; +; CHECK-NEXT: ld.param.u64 %rd5, [foo2_param_0+16]; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd5, %rd4}; +; CHECK-NEXT: ret; + %ptr0 = getelementptr [2 x i128], ptr %a, i64 0, i32 0 + %1 = load i128, i128* %ptr0 + %ptr1 = getelementptr [2 x i128], ptr %a, i64 0, i32 1 + %2 = load i128, i128* %ptr1 + %3 = insertvalue [2 x i128] undef, i128 %1, 0 + %4 = insertvalue [2 x i128] %3, i128 %2, 1 + + ret [2 x i128] %4 +} + +define [2 x i128] @foo3([2 x i128] %a) { +; CHECK-LABEL: foo3( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [foo3_param_0+16]; +; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [foo3_param_0]; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd3, %rd4}; +; CHECK-NEXT: ret; + %1 = extractvalue [2 x i128] %a, 0 + %2 = extractvalue [2 x i128] %a, 1 + %3 = insertvalue [2 x i128] undef, i128 %1, 0 + %4 = insertvalue [2 x i128] %3, i128 %2, 1 + + ret [2 x i128] %4 +} From 98e2328451193885e532ae930491f59ac742e938 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Mon, 13 Jan 2025 12:49:39 -0800 Subject: [PATCH 329/408] [SLP][NFC]Add a test with non-power-of-2 gathered consecutive loads, NFC --- .../X86/gather-loads-non-power-of-2.ll | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/gather-loads-non-power-of-2.ll diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-loads-non-power-of-2.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/gather-loads-non-power-of-2.ll new file mode 100644 index 0000000000000..be0ed2c34a365 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-loads-non-power-of-2.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mtriple=x86_64-unknown-linux --passes=slp-vectorizer < %s | FileCheck %s + +define <6 x double> @test(ptr %a) { +; CHECK-LABEL: define <6 x double> @test( +; CHECK-SAME: ptr [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[A]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[A]], i16 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <6 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <6 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <6 x double> [[TMP3]], <6 x double> [[TMP4]], <6 x i32> +; CHECK-NEXT: ret <6 x double> [[TMP5]] +; +entry: + %1 = load double, ptr %a, align 8 + %2 = getelementptr double, ptr %a, i16 1 + %3 = load double, ptr %2, align 8 + %4 = getelementptr double, ptr %a, i16 2 + %5 = load double, ptr %4, align 8 + %6 = getelementptr double, ptr %a, i16 3 + %7 = load double, ptr %6, align 8 + %8 = getelementptr double, ptr %a, i16 4 + %9 = load double, ptr %8, align 8 + %10 = getelementptr double, ptr %a, i16 5 + %11 = load double, ptr %10, align 8 + %12 = insertelement <6 x double> poison, double %1, i32 0 + %13 = insertelement <6 x double> %12, double %3, i32 1 + %14 = insertelement <6 x double> %13, double %5, i32 2 + %15 = insertelement <6 x double> %14, double %7, i32 3 + %16 = insertelement <6 x double> %15, double %9, i32 4 + %17 = insertelement <6 x double> %16, double %11, i32 5 + ret <6 x double> %17 +} From ec3525f7844878767b70b78753affbe44acfa9ed Mon Sep 17 00:00:00 2001 From: Kevin McAfee Date: Mon, 13 Jan 2025 15:58:37 -0500 Subject: [PATCH 330/408] [NVPTX] Attempt to load params using symbol addition node directly (#119935) During instruction selection on load instructions, transform loads of [register+offset] into [symbol+offset] if the register value is the result of an ADD instruction(s) of a symbol and constant(s). This enables the removal of any ADD(s) of the symbol that are not combined with the load to create a ld.param. This is normally not an issue when DAG combines are enabled as any extra ADDs would be folded. However, when DAG combines are disabled, there may be cases where an ADD of a symbol is consumed by multiple other nodes and is retained in generated code as a PTX `add` instruction that uses the symbol as an operand - this is illegal PTX. 
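
To make the recursion concrete: the selector walks down a chain of ADD
nodes, accumulating the constant operands, and only folds the access into
a single [symbol+offset] form once the base bottoms out at a direct
(symbol) address. The sketch below models that walk on a simplified node
type; `Node`, `Kind`, and `foldSymbolOffset` are illustrative stand-ins
for this note, not LLVM's actual SelectionDAG API, and it assumes every
offset operand is a plain constant.

#include <cstdint>
#include <optional>

struct Node {
  enum Kind { Symbol, Constant, Add } K;
  const Node *Base = nullptr; // for Add: the (possibly nested) base address
  const Node *Off = nullptr;  // for Add: the offset operand
  uint64_t Value = 0;         // for Constant: the offset value
};

// Walk Add(Add(Sym, C1), C2), ... down to Sym, summing C1 + C2 + ...
// Returns the total offset if the chain bottoms out at a symbol,
// std::nullopt otherwise.
std::optional<uint64_t> foldSymbolOffset(const Node *Addr, uint64_t Acc = 0) {
  if (Addr->K != Node::Add || Addr->Off->K != Node::Constant)
    return std::nullopt;
  Acc += Addr->Off->Value;
  if (Addr->Base->K == Node::Symbol)
    return Acc; // reached the root symbol: fold the whole chain
  return foldSymbolOffset(Addr->Base, Acc);
}

For an address like Add(Add(param, 4), 8) the walk yields 12, so the load
can be selected as a single ld.param at offset 12 rather than going
through the intermediate adds.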
--- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 24 ++++++--- llvm/test/CodeGen/NVPTX/param-add.ll | 54 +++++++++++++++++++++ 2 files changed, 71 insertions(+), 7 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/param-add.ll diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 39f5571692058..2e66b67dfdcc7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -2482,15 +2482,25 @@ bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) { bool NVPTXDAGToDAGISel::SelectADDRsi_imp(SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT VT) { - if (isAddLike(Addr)) { - if (ConstantSDNode *CN = dyn_cast(Addr.getOperand(1))) { - SDValue base = Addr.getOperand(0); - if (SelectDirectAddr(base, Base)) { - Offset = - CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode), VT); - return true; + std::function(SDValue, uint64_t)> + FindRootAddressAndTotalOffset = + [&](SDValue Addr, + uint64_t AccumulatedOffset) -> std::optional { + if (isAddLike(Addr)) { + if (ConstantSDNode *CN = dyn_cast(Addr.getOperand(1))) { + SDValue PossibleBaseAddr = Addr.getOperand(0); + AccumulatedOffset += CN->getZExtValue(); + if (SelectDirectAddr(PossibleBaseAddr, Base)) + return AccumulatedOffset; + return FindRootAddressAndTotalOffset(PossibleBaseAddr, + AccumulatedOffset); } } + return std::nullopt; + }; + if (auto AccumulatedOffset = FindRootAddressAndTotalOffset(Addr, 0)) { + Offset = CurDAG->getTargetConstant(*AccumulatedOffset, SDLoc(OpNode), VT); + return true; } return false; } diff --git a/llvm/test/CodeGen/NVPTX/param-add.ll b/llvm/test/CodeGen/NVPTX/param-add.ll new file mode 100644 index 0000000000000..afabc113541c2 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/param-add.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx64 --debug-counter=dagcombine=0 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 --debug-counter=dagcombine=0 | %ptxas-verify %} + +; REQUIRES: asserts +; asserts are required for --debug-counter=dagcombine=0 to have the intended +; effect of disabling DAG combines, which exposes the bug. When combines are +; enabled the bug does not occur. 
+
+%struct.1float = type <{ [1 x float] }>
+
+declare i32 @callee(%struct.1float %a)
+
+define i32 @test(%struct.1float alignstack(32) %data) {
+; CHECK-LABEL: test(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<18>;
+; CHECK-NEXT: .reg .f32 %f<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u8 %r1, [test_param_0+1];
+; CHECK-NEXT: shl.b32 %r2, %r1, 8;
+; CHECK-NEXT: ld.param.u8 %r3, [test_param_0];
+; CHECK-NEXT: or.b32 %r4, %r2, %r3;
+; CHECK-NEXT: ld.param.u8 %r5, [test_param_0+3];
+; CHECK-NEXT: shl.b32 %r6, %r5, 8;
+; CHECK-NEXT: ld.param.u8 %r7, [test_param_0+2];
+; CHECK-NEXT: or.b32 %r8, %r6, %r7;
+; CHECK-NEXT: shl.b32 %r9, %r8, 16;
+; CHECK-NEXT: or.b32 %r17, %r9, %r4;
+; CHECK-NEXT: mov.b32 %f1, %r17;
+; CHECK-NEXT: shr.u32 %r12, %r17, 8;
+; CHECK-NEXT: shr.u32 %r13, %r17, 16;
+; CHECK-NEXT: shr.u32 %r14, %r17, 24;
+; CHECK-NEXT: { // callseq 0, 0
+; CHECK-NEXT: .param .align 1 .b8 param0[4];
+; CHECK-NEXT: st.param.b8 [param0], %r17;
+; CHECK-NEXT: st.param.b8 [param0+1], %r12;
+; CHECK-NEXT: st.param.b8 [param0+2], %r13;
+; CHECK-NEXT: st.param.b8 [param0+3], %r14;
+; CHECK-NEXT: .param .b32 retval0;
+; CHECK-NEXT: call.uni (retval0),
+; CHECK-NEXT: callee,
+; CHECK-NEXT: (
+; CHECK-NEXT: param0
+; CHECK-NEXT: );
+; CHECK-NEXT: ld.param.b32 %r15, [retval0];
+; CHECK-NEXT: } // callseq 0
+; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-NEXT: ret;
+
+ %1 = call i32 @callee(%struct.1float %data)
+ ret i32 %1
+}

From 0c1c49f0ff8003aee22c3f26fca03c2f5385f355 Mon Sep 17 00:00:00 2001
From: Fabian Mora
Date: Mon, 13 Jan 2025 16:11:33 -0500
Subject: [PATCH 331/408] [mlir][AMDGPU] Fix raw buffer ptr ops lowering (#122293)

This patch fixes several bugs in the lowering of AMDGPU raw buffer
operations. These bugs include:
- Incorrectly handling the offset of the memref, causing errors when
  using subviews.
- Using the MaximumOp (a float-specific op) to calculate the number of
  records.
- Incorrectly computing the number of records in the static-shape case.
- Incorrect lowering when the index bitwidth is i64.

Furthermore, this patch switches to using MLIR's data layout to get the
type size.

---------

Co-authored-by: Jakub Kuderski
---
 .../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 123 ++++++++++--------
 .../AMDGPUToROCDL/amdgpu-to-rocdl.mlir | 46 ++++---
 2 files changed, 103 insertions(+), 66 deletions(-)

diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 4100b086fad8b..1564e417a7a48 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -30,10 +30,23 @@ namespace mlir {
 using namespace mlir;
 using namespace mlir::amdgpu;

+/// Convert an unsigned number `val` to i32.
+static Value convertUnsignedToI32(ConversionPatternRewriter &rewriter,
+ Location loc, Value val) {
+ IntegerType i32 = rewriter.getI32Type();
+ // Force check that `val` is of int type.
+ auto valTy = cast<IntegerType>(val.getType());
+ if (i32 == valTy)
+ return val;
+ return valTy.getWidth() > 32
+ ? 
Value(rewriter.create(loc, i32, val)) + : Value(rewriter.create(loc, i32, val)); +} + static Value createI32Constant(ConversionPatternRewriter &rewriter, Location loc, int32_t value) { - Type llvmI32 = rewriter.getI32Type(); - return rewriter.create(loc, llvmI32, value); + Type i32 = rewriter.getI32Type(); + return rewriter.create(loc, i32, value); } static Value createI1Constant(ConversionPatternRewriter &rewriter, Location loc, @@ -42,6 +55,27 @@ static Value createI1Constant(ConversionPatternRewriter &rewriter, Location loc, return rewriter.create(loc, llvmI1, value); } +/// Returns the linear index used to access an element in the memref. +static Value getLinearIndexI32(ConversionPatternRewriter &rewriter, + Location loc, MemRefDescriptor &memRefDescriptor, + ValueRange indices, ArrayRef strides) { + IntegerType i32 = rewriter.getI32Type(); + Value index; + for (auto [i, increment, stride] : llvm::enumerate(indices, strides)) { + if (stride != 1) { // Skip if stride is 1. + Value strideValue = + ShapedType::isDynamic(stride) + ? convertUnsignedToI32(rewriter, loc, + memRefDescriptor.stride(rewriter, loc, i)) + : rewriter.create(loc, i32, stride); + increment = rewriter.create(loc, increment, strideValue); + } + index = + index ? rewriter.create(loc, index, increment) : increment; + } + return index ? index : createI32Constant(rewriter, loc, 0); +} + namespace { // Define commonly used chipsets versions for convenience. constexpr Chipset kGfx908 = Chipset(9, 0, 8); @@ -88,17 +122,12 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern { Type llvmWantedDataType = this->typeConverter->convertType(wantedDataType); Type i32 = rewriter.getI32Type(); - Type llvmI32 = this->typeConverter->convertType(i32); - Type llvmI16 = this->typeConverter->convertType(rewriter.getI16Type()); + Type i16 = rewriter.getI16Type(); - auto toI32 = [&](Value val) -> Value { - if (val.getType() == llvmI32) - return val; - - return rewriter.create(loc, llvmI32, val); - }; - - int64_t elementByteWidth = memrefType.getElementTypeBitWidth() / 8; + // Get the type size in bytes. + DataLayout dataLayout = DataLayout::closest(gpuOp); + int64_t elementByteWidth = + dataLayout.getTypeSizeInBits(memrefType.getElementType()) / 8; Value byteWidthConst = createI32Constant(rewriter, loc, elementByteWidth); // If we want to load a vector with total size <= 32 @@ -114,7 +143,8 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern { } if (auto dataVector = dyn_cast(wantedDataType)) { uint32_t vecLen = dataVector.getNumElements(); - uint32_t elemBits = dataVector.getElementTypeBitWidth(); + uint32_t elemBits = + dataLayout.getTypeSizeInBits(dataVector.getElementType()); uint32_t totalBits = elemBits * vecLen; bool usePackedFp16 = isa_and_present(*gpuOp) && vecLen == 2; @@ -167,28 +197,36 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern { MemRefDescriptor memrefDescriptor(memref); - Value ptr = memrefDescriptor.alignedPtr(rewriter, loc); + Value ptr = memrefDescriptor.bufferPtr( + rewriter, loc, *this->getTypeConverter(), memrefType); // The stride value is always 0 for raw buffers. This also disables // swizling. Value stride = rewriter.create( - loc, llvmI16, rewriter.getI16IntegerAttr(0)); + loc, i16, rewriter.getI16IntegerAttr(0)); + // Get the number of elements. 
Value numRecords; - if (memrefType.hasStaticShape() && memrefType.getLayout().isIdentity()) { - numRecords = createI32Constant( - rewriter, loc, - static_cast(memrefType.getNumElements() * elementByteWidth)); + if (memrefType.hasStaticShape() && + !llvm::any_of(strides, ShapedType::isDynamic)) { + int64_t size = memrefType.getRank() == 0 ? 1 : 0; + ArrayRef shape = memrefType.getShape(); + for (uint32_t i = 0, e = memrefType.getRank(); i < e; ++i) + size = std::max(shape[i] * strides[i], size); + size = size * elementByteWidth; + assert(size < std::numeric_limits::max() && + "the memref buffer is too large"); + numRecords = createI32Constant(rewriter, loc, static_cast(size)); } else { Value maxIndex; for (uint32_t i = 0, e = memrefType.getRank(); i < e; ++i) { - Value size = toI32(memrefDescriptor.size(rewriter, loc, i)); - Value stride = toI32(memrefDescriptor.stride(rewriter, loc, i)); - stride = rewriter.create(loc, stride, byteWidthConst); + Value size = memrefDescriptor.size(rewriter, loc, i); + Value stride = memrefDescriptor.stride(rewriter, loc, i); Value maxThisDim = rewriter.create(loc, size, stride); - maxIndex = maxIndex ? rewriter.create(loc, maxIndex, - maxThisDim) - : maxThisDim; + maxIndex = + maxIndex ? rewriter.create(loc, maxIndex, maxThisDim) + : maxThisDim; } - numRecords = maxIndex; + numRecords = rewriter.create( + loc, convertUnsignedToI32(rewriter, loc, maxIndex), byteWidthConst); } // Flag word: @@ -218,40 +256,23 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern { args.push_back(resource); // Indexing (voffset) - Value voffset = createI32Constant(rewriter, loc, 0); - for (auto pair : llvm::enumerate(adaptor.getIndices())) { - size_t i = pair.index(); - Value index = pair.value(); - Value strideOp; - if (ShapedType::isDynamic(strides[i])) { - strideOp = rewriter.create( - loc, toI32(memrefDescriptor.stride(rewriter, loc, i)), - byteWidthConst); - } else { - strideOp = - createI32Constant(rewriter, loc, strides[i] * elementByteWidth); - } - index = rewriter.create(loc, index, strideOp); - voffset = rewriter.create(loc, voffset, index); - } - if (adaptor.getIndexOffset()) { - int32_t indexOffset = *gpuOp.getIndexOffset() * elementByteWidth; - Value extraOffsetConst = createI32Constant(rewriter, loc, indexOffset); + Value voffset = getLinearIndexI32(rewriter, loc, memrefDescriptor, + adaptor.getIndices(), strides); + if (std::optional indexOffset = adaptor.getIndexOffset(); + indexOffset && *indexOffset > 0) { + Value extraOffsetConst = createI32Constant(rewriter, loc, *indexOffset); voffset = voffset ? rewriter.create(loc, voffset, extraOffsetConst) : extraOffsetConst; } + voffset = rewriter.create(loc, voffset, byteWidthConst); args.push_back(voffset); + // SGPR offset. 
Value sgprOffset = adaptor.getSgprOffset(); if (!sgprOffset) sgprOffset = createI32Constant(rewriter, loc, 0); - if (ShapedType::isDynamic(offset)) - sgprOffset = rewriter.create( - loc, toI32(memrefDescriptor.offset(rewriter, loc)), sgprOffset); - else if (offset > 0) - sgprOffset = rewriter.create( - loc, sgprOffset, createI32Constant(rewriter, loc, offset)); + sgprOffset = rewriter.create(loc, sgprOffset, byteWidthConst); args.push_back(sgprOffset); // bit 0: GLC = 0 (atomics drop value, less coherency) diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir index 4c7515dc81051..af6331646f0a5 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir @@ -31,21 +31,37 @@ func.func @gpu_gcn_raw_buffer_load_i32(%buf: memref<64xi32>, %idx: i32) -> i32 { } // CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i32_strided -func.func @gpu_gcn_raw_buffer_load_i32_strided(%buf: memref<64xi32, strided<[?], offset: ?>>, %idx: i32) -> i32 { - // CHECK-DAG: %[[rstride:.*]] = llvm.mlir.constant(0 : i16) - // CHECK-DAG: %[[elem_size:.*]] = llvm.mlir.constant(4 : i32) - // CHECK: %[[size:.*]] = llvm.extractvalue %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // CHECK: %[[size32:.*]] = llvm.trunc %[[size]] : i64 to i32 - // CHECK: %[[stride:.*]] = llvm.extractvalue %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // CHECK: %[[stride32:.*]] = llvm.trunc %[[stride]] : i64 to i32 - // CHECK: %[[tmp:.*]] = llvm.mul %[[stride32]], %[[elem_size]] : i32 - // CHECK: %[[numRecords:.*]] = llvm.mul %[[size32]], %[[tmp]] : i32 - // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32) - // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32) - // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %[[rstride]], %[[numRecords]], %[[flags]] : !llvm.ptr to <8> - // CHECK: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32 - // CHECK: return %[[ret]] - %0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[%idx] : memref<64xi32, strided<[?], offset: ?>>, i32 -> i32 +func.func @gpu_gcn_raw_buffer_load_i32_strided(%buf: memref<16x16xi32, strided<[?, ?], offset: ?>>, %i: i32, %j: i32) -> i32 { + // CHECK: %[[descriptor:.*]] = builtin.unrealized_conversion_cast %{{.*}} : memref<16x16xi32, strided<[?, ?], offset: ?>> to !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // CHECK: %[[elem_size:.*]] = llvm.mlir.constant(4 : i32) : i32 + // CHECK: %[[algn_ptr:.*]] = llvm.extractvalue %[[descriptor]][1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // CHECK: %[[offset:.*]] = llvm.extractvalue %[[descriptor]][2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // CHECK: %[[ptr:.*]] = llvm.getelementptr %[[algn_ptr]][%[[offset]]] : (!llvm.ptr, i64) -> !llvm.ptr, i32 + // CHECK: %[[stride:.*]] = llvm.mlir.constant(0 : i16) : i16 + // CHECK: %[[sz_i:.*]] = llvm.extractvalue %[[descriptor]][3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // CHECK: %[[stride_i:.*]] = llvm.extractvalue %[[descriptor]][4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // CHECK: %[[ext_i:.*]] = llvm.mul %[[sz_i]], %[[stride_i]] : i64 + // CHECK: %[[sz_j:.*]] = llvm.extractvalue %[[descriptor]][3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // CHECK: %[[stride_j:.*]] = llvm.extractvalue 
%[[descriptor]][4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // CHECK: %[[ext_j:.*]] = llvm.mul %[[sz_j]], %[[stride_j]] : i64 + // CHECK: %[[num_records:.*]] = llvm.intr.umax(%[[ext_i]], %[[ext_j]]) : (i64, i64) -> i64 + // CHECK: %[[num_rec_i32:.*]] = llvm.trunc %[[num_records]] : i64 to i32 + // CHECK: %[[num_rec_bytes_i32:.*]] = llvm.mul %[[num_rec_i32]], %[[elem_size]] : i32 + // CHECK: %[[rsrc:.*]] = rocdl.make.buffer.rsrc %[[ptr]], %[[stride]], %[[num_rec_bytes_i32]], %{{.*}} : !llvm.ptr to <8> + // CHECK: %[[stride_i_1:.*]] = llvm.extractvalue %[[descriptor]][4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // CHECK: %[[stride_i_i32:.*]] = llvm.trunc %[[stride_i_1]] : i64 to i32 + // CHECK: %[[t_0:.*]] = llvm.mul %{{.*}}, %[[stride_i_i32]] : i32 + // CHECK: %[[stride_j_1:.*]] = llvm.extractvalue %[[descriptor]][4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // CHECK: %[[stride_j_i32:.*]] = llvm.trunc %[[stride_j_1]] : i64 to i32 + // CHECK: %[[t_1:.*]] = llvm.mul %{{.*}}, %[[stride_j_i32]] : i32 + // CHECK: %[[index:.*]] = llvm.add %[[t_0]], %[[t_1]] : i32 + // CHECK: %[[vgpr_off:.*]] = llvm.mul %[[index]], %[[elem_size]] : i32 + // CHECK: %[[zero_0:.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK: %[[sgpr_off:.*]] = llvm.mul %[[zero_0]], %[[elem_size]] : i32 + // CHECK: %[[zero_1:.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK: %[[v:.*]] = rocdl.raw.ptr.buffer.load %[[rsrc]], %[[vgpr_off]], %[[sgpr_off]], %[[zero_1]] : i32 + // CHECK: return %[[v]] : i32 + %0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[%i, %j] : memref<16x16xi32, strided<[?, ?], offset: ?>>, i32, i32 -> i32 func.return %0 : i32 } From a100fd8cbd3dad3846a6212d97279ca23db85c75 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Mon, 13 Jan 2025 21:15:40 +0000 Subject: [PATCH 332/408] [libc++abi][ItaniumDemangle] Demangle DF16b as std::bfloat16_t (#120109) This mangling is official in the Itanium C++ ABI specification and is already supported by clang. --- libcxxabi/src/demangle/ItaniumDemangle.h | 5 ++++- libcxxabi/test/test_demangle.pass.cpp | 4 +++- llvm/include/llvm/Demangle/ItaniumDemangle.h | 5 ++++- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/libcxxabi/src/demangle/ItaniumDemangle.h b/libcxxabi/src/demangle/ItaniumDemangle.h index e4752bed6da8b..3df41b5f4d7d0 100644 --- a/libcxxabi/src/demangle/ItaniumDemangle.h +++ b/libcxxabi/src/demangle/ItaniumDemangle.h @@ -4330,9 +4330,12 @@ Node *AbstractManglingParser::parseType() { case 'h': First += 2; return make("half"); - // ::= DF _ # ISO/IEC TS 18661 binary floating point (N bits) + // ::= DF16b # C++23 std::bfloat16_t + // ::= DF _ # ISO/IEC TS 18661 binary floating point (N bits) case 'F': { First += 2; + if (consumeIf("16b")) + return make("std::bfloat16_t"); Node *DimensionNumber = make(parseNumber()); if (!DimensionNumber) return nullptr; diff --git a/libcxxabi/test/test_demangle.pass.cpp b/libcxxabi/test/test_demangle.pass.cpp index 67b9df212ff3b..e9c74f70a094b 100644 --- a/libcxxabi/test/test_demangle.pass.cpp +++ b/libcxxabi/test/test_demangle.pass.cpp @@ -33,7 +33,7 @@ // Is long double fp128? 
#define LDBL_FP128 (__LDBL_MANT_DIG__ == 113) -const char *cases[][2] = { +const char* cases[][2] = { // clang-format off {"_Z1A", "A"}, {"_Z1Av", "A()"}, @@ -30245,6 +30245,8 @@ const char *cases[][2] = { {"_Z1fDSDRj", "f(_Sat unsigned _Fract)"}, {"_Z1fDSDRl", "f(_Sat long _Fract)"}, {"_Z1fDSDRm", "f(_Sat unsigned long _Fract)"}, + + {"_Z11bfloat16addDF16bDF16b", "bfloat16add(std::bfloat16_t, std::bfloat16_t)"}, // clang-format on }; diff --git a/llvm/include/llvm/Demangle/ItaniumDemangle.h b/llvm/include/llvm/Demangle/ItaniumDemangle.h index 7fba3fdc1abc9..b0363c1a7a786 100644 --- a/llvm/include/llvm/Demangle/ItaniumDemangle.h +++ b/llvm/include/llvm/Demangle/ItaniumDemangle.h @@ -4330,9 +4330,12 @@ Node *AbstractManglingParser::parseType() { case 'h': First += 2; return make("half"); - // ::= DF _ # ISO/IEC TS 18661 binary floating point (N bits) + // ::= DF16b # C++23 std::bfloat16_t + // ::= DF _ # ISO/IEC TS 18661 binary floating point (N bits) case 'F': { First += 2; + if (consumeIf("16b")) + return make("std::bfloat16_t"); Node *DimensionNumber = make(parseNumber()); if (!DimensionNumber) return nullptr; From 1907a29dedfb9625772a332bb6d6c31d89fb36d3 Mon Sep 17 00:00:00 2001 From: Tom Honermann Date: Mon, 13 Jan 2025 16:24:46 -0500 Subject: [PATCH 333/408] [Clang][NFC] Indentation fixes and unneeded semicolon removal. (#122794) --- clang/lib/Frontend/CompilerInvocation.cpp | 2 +- clang/lib/Serialization/ASTReaderDecl.cpp | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 39bed84536c6a..58658dedbaf1e 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -3280,7 +3280,7 @@ static void GenerateHeaderSearchArgs(const HeaderSearchOptions &Opts, }(); GenerateArg(Consumer, Opt, It->Path); - }; + } // Note: some paths that came from "[-iprefix=xx] -iwithprefixbefore=yy" may // have already been generated as "-I[xx]yy". If that's the case, their diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index dee5169ae5723..ee72dd844b59a 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -451,9 +451,8 @@ class ASTDeclReader : public DeclVisitor { void VisitOMPDeclareMapperDecl(OMPDeclareMapperDecl *D); void VisitOMPRequiresDecl(OMPRequiresDecl *D); void VisitOMPCapturedExprDecl(OMPCapturedExprDecl *D); - }; - - } // namespace clang +}; +} // namespace clang namespace { From 7e191038957cf5e22da55ba577ce6e033236b05f Mon Sep 17 00:00:00 2001 From: Pedro Lobo Date: Mon, 13 Jan 2025 21:38:40 +0000 Subject: [PATCH 334/408] [DebugInfo] Map VAM args to `poison` instead of `undef` [NFC] (#122756) If an argument cannot be mapped in `Mapper::mapValue`, it can be mapped to `poison` instead of `undef`. --- llvm/lib/Transforms/Utils/ValueMapper.cpp | 4 ++-- llvm/unittests/Transforms/Utils/ValueMapperTest.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Utils/ValueMapper.cpp b/llvm/lib/Transforms/Utils/ValueMapper.cpp index 3faea48466ba9..0b57c3bc538c6 100644 --- a/llvm/lib/Transforms/Utils/ValueMapper.cpp +++ b/llvm/lib/Transforms/Utils/ValueMapper.cpp @@ -410,9 +410,9 @@ Value *Mapper::mapValue(const Value *V) { } else if ((Flags & RF_IgnoreMissingLocals) && isa(VAM)) { MappedArgs.push_back(VAM); } else { - // If we cannot map the value, set the argument as undef. 
+ // If we cannot map the value, set the argument as poison.
 MappedArgs.push_back(ValueAsMetadata::get(
- UndefValue::get(VAM->getValue()->getType())));
+ PoisonValue::get(VAM->getValue()->getType())));
 }
 }
 return MetadataAsValue::get(V->getContext(),
diff --git a/llvm/unittests/Transforms/Utils/ValueMapperTest.cpp b/llvm/unittests/Transforms/Utils/ValueMapperTest.cpp
index d9e63565b809e..fb1b4edf25328 100644
--- a/llvm/unittests/Transforms/Utils/ValueMapperTest.cpp
+++ b/llvm/unittests/Transforms/Utils/ValueMapperTest.cpp
@@ -347,8 +347,8 @@ TEST(ValueMapperTest, mapValueLocalInArgList) {
 // such as "metadata i32 %x" don't currently successfully maintain that
 // property. To keep RemapInstruction from crashing we need a non-null
 // return here, but we also shouldn't reference the unmapped local. Use
- // undef for uses in a DIArgList.
- auto *N0 = UndefValue::get(Type::getInt8Ty(C));
+ // poison for uses in a DIArgList.
+ auto *N0 = PoisonValue::get(Type::getInt8Ty(C));
 auto *N0AM = ValueAsMetadata::get(N0);
 std::vector N0Elts;
 N0Elts.push_back(N0AM);

From a10ce71ac4ef55cc9a80c0aece501a09bd39cc9a Mon Sep 17 00:00:00 2001
From: David Green
Date: Mon, 13 Jan 2025 21:40:10 +0000
Subject: [PATCH 335/408] [ARM] Add mayStore to more store instructions

As in #121565, we need to mark all stores as mayStore; hasSideEffects
alone is not enough to prevent loads from being moved past these
instructions. Marking the instructions as mayStore is also a sensible
thing to do in its own right.

---
 llvm/lib/Target/ARM/ARMInstrFormats.td | 2 ++
 llvm/lib/Target/ARM/ARMInstrInfo.td | 11 ++++++++---
 llvm/lib/Target/ARM/ARMInstrNEON.td | 13 +++++++++++++
 llvm/lib/Target/ARM/ARMInstrThumb2.td | 9 ++++++++-
 .../ARM/cortex-a57-memory-instructions.s | 16 ++++++++--------
 llvm/test/tools/llvm-mca/ARM/cortex-a57-thumb.s | 16 ++++++++--------
 llvm/test/tools/llvm-mca/ARM/m4-int.s | 6 +++---
 llvm/test/tools/llvm-mca/ARM/m55-int.s | 6 +++---
 llvm/test/tools/llvm-mca/ARM/m7-int.s | 6 +++---
 llvm/test/tools/llvm-mca/ARM/m85-int.s | 6 +++---
 10 files changed, 59 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMInstrFormats.td b/llvm/lib/Target/ARM/ARMInstrFormats.td
index 041601748b1f7..9eb9114069143 100644
--- a/llvm/lib/Target/ARM/ARMInstrFormats.td
+++ b/llvm/lib/Target/ARM/ARMInstrFormats.td
@@ -686,6 +686,8 @@ class AIstr_ex_or_rel<bits<2> opcod, bits<2> opcod2, dag oops, dag iops, InstrIt
 let Inst{9-8} = opcod2;
 let Inst{7-4} = 0b1001;
 let Inst{3-0} = Rt;
+
+ let mayStore = 1;
 }
 // Atomic load/store instructions
 class AIldrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td
index 009b60c8400f0..2a3a4e91eee4c 100644
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -3388,6 +3388,8 @@ def STRD_POST: AI3ldstidx<0b1111, 0, 0, (outs GPR:$Rn_wb),

 // STRT, STRBT, and STRHT

+let mayStore = 1, hasSideEffects = 0 in {
+
 def STRBT_POST_REG : AI2ldstidx<0, 1, 0, (outs GPR:$Rn_wb),
 (ins GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset),
 IndexModePost, StFrm, IIC_iStore_bh_ru,
@@ -3428,7 +3430,6 @@ def STRBT_POST
 : ARMAsmPseudo<"strbt${q} $Rt, $addr",
 (ins GPR:$Rt, addr_offset_none:$addr, pred:$q)>;

-let mayStore = 1, hasSideEffects = 0 in {
 def STRT_POST_REG : AI2ldstidx<0, 0, 0, (outs GPR:$Rn_wb),
 (ins GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset),
 IndexModePost, StFrm, IIC_iStore_ru,
@@ -3464,7 +3465,6 @@ def STRT_POST_IMM
 let Inst{11-0} = offset{11-0};
 let DecoderMethod = "DecodeAddrMode2IdxInstruction";
 }
-} def STRT_POST : ARMAsmPseudo<"strt${q} $Rt, $addr", @@ -3493,7 +3493,6 @@ multiclass AI3strT op, string opc> { } } - defm STRHT : AI3strT<0b1011, "strht">; def STL : AIstrrel<0b00, (outs), (ins GPR:$Rt, addr_offset_none:$addr), @@ -3503,6 +3502,8 @@ def STLB : AIstrrel<0b10, (outs), (ins GPR:$Rt, addr_offset_none:$addr), def STLH : AIstrrel<0b11, (outs), (ins GPR:$Rt, addr_offset_none:$addr), NoItinerary, "stlh", "\t$Rt, $addr", []>; +} // mayStore = 1, hasSideEffects = 0 + //===----------------------------------------------------------------------===// // Load / store multiple Instructions. // @@ -5633,15 +5634,19 @@ multiclass LdSt2Cop pattern> { } } +let mayLoad = 1 in { defm LDC : LdStCop <1, 0, "ldc", [(int_arm_ldc timm:$cop, timm:$CRd, addrmode5:$addr)]>; defm LDCL : LdStCop <1, 1, "ldcl", [(int_arm_ldcl timm:$cop, timm:$CRd, addrmode5:$addr)]>; defm LDC2 : LdSt2Cop<1, 0, "ldc2", [(int_arm_ldc2 timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>; defm LDC2L : LdSt2Cop<1, 1, "ldc2l", [(int_arm_ldc2l timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>; +} +let mayStore = 1 in { defm STC : LdStCop <0, 0, "stc", [(int_arm_stc timm:$cop, timm:$CRd, addrmode5:$addr)]>; defm STCL : LdStCop <0, 1, "stcl", [(int_arm_stcl timm:$cop, timm:$CRd, addrmode5:$addr)]>; defm STC2 : LdSt2Cop<0, 0, "stc2", [(int_arm_stc2 timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>; defm STC2L : LdSt2Cop<0, 1, "stc2l", [(int_arm_stc2l timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>; +} } // DecoderNamespace = "CoProc" diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td index 20c52206fd3cd..3335f52f15555 100644 --- a/llvm/lib/Target/ARM/ARMInstrNEON.td +++ b/llvm/lib/Target/ARM/ARMInstrNEON.td @@ -8186,6 +8186,7 @@ def VLD1LNdWB_register_Asm_32 : (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr, rGPR:$Rm, pred:$p)>; +let mayStore = 1 in { // VST1 single-lane pseudo-instructions. These need special handling for // the lane index that an InstAlias can't handle, so we use these instead. @@ -8224,6 +8225,8 @@ def VST1LNdWB_register_Asm_32 : (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr, rGPR:$Rm, pred:$p)>; +} + // VLD2 single-lane pseudo-instructions. These need special handling for // the lane index that an InstAlias can't handle, so we use these instead. def VLD2LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".8", "$list, $addr", @@ -8282,6 +8285,7 @@ def VLD2LNqWB_register_Asm_32 : (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr, rGPR:$Rm, pred:$p)>; +let mayStore = 1 in { // VST2 single-lane pseudo-instructions. These need special handling for // the lane index that an InstAlias can't handle, so we use these instead. @@ -8342,6 +8346,8 @@ def VST2LNqWB_register_Asm_32 : (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr, rGPR:$Rm, pred:$p)>; +} + // VLD3 all-lanes pseudo-instructions. These need special handling for // the lane index that an InstAlias can't handle, so we use these instead. def VLD3DUPdAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr", @@ -8531,6 +8537,8 @@ def VLD3qWB_register_Asm_32 : (ins VecListThreeQ:$list, addrmode6align64:$addr, rGPR:$Rm, pred:$p)>; +let mayStore = 1 in { + // VST3 single-lane pseudo-instructions. These need special handling for // the lane index that an InstAlias can't handle, so we use these instead. 
def VST3LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr", @@ -8650,6 +8658,8 @@ def VST3qWB_register_Asm_32 : (ins VecListThreeQ:$list, addrmode6align64:$addr, rGPR:$Rm, pred:$p)>; +} + // VLD4 all-lanes pseudo-instructions. These need special handling for // the lane index that an InstAlias can't handle, so we use these instead. def VLD4DUPdAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr", @@ -8853,6 +8863,8 @@ def VLD4qWB_register_Asm_32 : (ins VecListFourQ:$list, addrmode6align64or128or256:$addr, rGPR:$Rm, pred:$p)>; +let mayStore = 1 in { + // VST4 single-lane pseudo-instructions. These need special handling for // the lane index that an InstAlias can't handle, so we use these instead. def VST4LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr", @@ -8983,6 +8995,7 @@ def VST4qWB_register_Asm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr, $Rm", (ins VecListFourQ:$list, addrmode6align64or128or256:$addr, rGPR:$Rm, pred:$p)>; +} // VMOV/VMVN takes an optional datatype suffix defm : NEONDTAnyInstAlias<"vmov${p}", "$Vd, $Vm", diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td index 5f01cfcb53f9c..033df9c2fd204 100644 --- a/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -1795,6 +1795,8 @@ def t2STRH_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb), Sched<[WriteST]>; } +let mayStore = 1, hasSideEffects = 0 in { + // F5.1.229 STR (immediate) T4 // .w suffixes; Constraints can't be used on t2InstAlias to describe // "$Rn = $Rn_wb,@earlyclobber $Rn_wb" on POST or @@ -1850,6 +1852,8 @@ def t2STRT : T2IstT<0b10, "strt", IIC_iStore_i>; def t2STRBT : T2IstT<0b00, "strbt", IIC_iStore_bh_i>; def t2STRHT : T2IstT<0b01, "strht", IIC_iStore_bh_i>; +} // mayStore = 1, hasSideEffects = 0 + // ldrd / strd pre / post variants let mayLoad = 1, hasSideEffects = 0 in @@ -4482,16 +4486,19 @@ multiclass t2LdStCop op31_28, bit load, bit Dbit, string asm, list } let DecoderNamespace = "Thumb2CoProc" in { +let mayLoad = 1 in { defm t2LDC : t2LdStCop<0b1110, 1, 0, "ldc", [(int_arm_ldc timm:$cop, timm:$CRd, addrmode5:$addr)]>; defm t2LDCL : t2LdStCop<0b1110, 1, 1, "ldcl", [(int_arm_ldcl timm:$cop, timm:$CRd, addrmode5:$addr)]>; defm t2LDC2 : t2LdStCop<0b1111, 1, 0, "ldc2", [(int_arm_ldc2 timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>; defm t2LDC2L : t2LdStCop<0b1111, 1, 1, "ldc2l", [(int_arm_ldc2l timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>; - +} +let mayStore = 1 in { defm t2STC : t2LdStCop<0b1110, 0, 0, "stc", [(int_arm_stc timm:$cop, timm:$CRd, addrmode5:$addr)]>; defm t2STCL : t2LdStCop<0b1110, 0, 1, "stcl", [(int_arm_stcl timm:$cop, timm:$CRd, addrmode5:$addr)]>; defm t2STC2 : t2LdStCop<0b1111, 0, 0, "stc2", [(int_arm_stc2 timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>; defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l", [(int_arm_stc2l timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>; } +} //===----------------------------------------------------------------------===// diff --git a/llvm/test/tools/llvm-mca/ARM/cortex-a57-memory-instructions.s b/llvm/test/tools/llvm-mca/ARM/cortex-a57-memory-instructions.s index 36a2f04f4ace0..8aff9168932fc 100644 --- a/llvm/test/tools/llvm-mca/ARM/cortex-a57-memory-instructions.s +++ b/llvm/test/tools/llvm-mca/ARM/cortex-a57-memory-instructions.s @@ -306,10 +306,10 @@ # CHECK-NEXT: 2 2 1.00 * strb r6, [r2], -r4 # CHECK-NEXT: 2 3 1.00 * strb r7, [r12, 
-r3, lsl #5] # CHECK-NEXT: 2 2 1.00 * strb sp, [r7], r2, asr #12 -# CHECK-NEXT: 2 1 1.00 U strbt r6, [r2], #12 -# CHECK-NEXT: 2 1 1.00 U strbt r5, [r6], #-13 -# CHECK-NEXT: 2 2 1.00 U strbt r4, [r9], r5 -# CHECK-NEXT: 2 2 1.00 U strbt r3, [r8], -r2, lsl #3 +# CHECK-NEXT: 2 1 1.00 * strbt r6, [r2], #12 +# CHECK-NEXT: 2 1 1.00 * strbt r5, [r6], #-13 +# CHECK-NEXT: 2 2 1.00 * strbt r4, [r9], r5 +# CHECK-NEXT: 2 2 1.00 * strbt r3, [r8], -r2, lsl #3 # CHECK-NEXT: 1 1 1.00 * strd r0, r1, [r4] # CHECK-NEXT: 1 1 1.00 * strd r2, r3, [r6, #1] # CHECK-NEXT: 1 1 1.00 * strd r2, r3, [r6, r2] @@ -332,10 +332,10 @@ # CHECK-NEXT: 2 1 1.00 * strh r1, [r2, -r1]! # CHECK-NEXT: 2 1 1.00 * strh r9, [r7], r2 # CHECK-NEXT: 2 1 1.00 * strh r4, [r3], -r2 -# CHECK-NEXT: 2 1 1.00 U strht r2, [r5], #76 -# CHECK-NEXT: 2 1 1.00 U strht r8, [r1], #-25 -# CHECK-NEXT: 2 1 1.00 U strht r5, [r3], r4 -# CHECK-NEXT: 2 1 1.00 U strht r6, [r8], -r0 +# CHECK-NEXT: 2 1 1.00 * strht r2, [r5], #76 +# CHECK-NEXT: 2 1 1.00 * strht r8, [r1], #-25 +# CHECK-NEXT: 2 1 1.00 * strht r5, [r3], r4 +# CHECK-NEXT: 2 1 1.00 * strht r6, [r8], -r0 # CHECK: Resources: # CHECK-NEXT: [0] - A57UnitB diff --git a/llvm/test/tools/llvm-mca/ARM/cortex-a57-thumb.s b/llvm/test/tools/llvm-mca/ARM/cortex-a57-thumb.s index 6c56e1dbf0244..44bf028dc1223 100644 --- a/llvm/test/tools/llvm-mca/ARM/cortex-a57-thumb.s +++ b/llvm/test/tools/llvm-mca/ARM/cortex-a57-thumb.s @@ -1594,14 +1594,14 @@ # CHECK-NEXT: 1 1 1.00 * strh.w r8, [r8, r2, lsl #2] # CHECK-NEXT: 1 1 1.00 * strh.w r7, [sp, r2, lsl #1] # CHECK-NEXT: 1 1 1.00 * strh.w r7, [sp, r2] -# CHECK-NEXT: 2 1 1.00 U strht r1, [r2] -# CHECK-NEXT: 2 1 1.00 U strht r1, [r8] -# CHECK-NEXT: 2 1 1.00 U strht r1, [r8, #3] -# CHECK-NEXT: 2 1 1.00 U strht r1, [r8, #255] -# CHECK-NEXT: 1 1 1.00 U strt r1, [r2] -# CHECK-NEXT: 1 1 1.00 U strt r1, [r8] -# CHECK-NEXT: 1 1 1.00 U strt r1, [r8, #3] -# CHECK-NEXT: 1 1 1.00 U strt r1, [r8, #255] +# CHECK-NEXT: 2 1 1.00 * strht r1, [r2] +# CHECK-NEXT: 2 1 1.00 * strht r1, [r8] +# CHECK-NEXT: 2 1 1.00 * strht r1, [r8, #3] +# CHECK-NEXT: 2 1 1.00 * strht r1, [r8, #255] +# CHECK-NEXT: 1 1 1.00 * strt r1, [r2] +# CHECK-NEXT: 1 1 1.00 * strt r1, [r8] +# CHECK-NEXT: 1 1 1.00 * strt r1, [r8, #3] +# CHECK-NEXT: 1 1 1.00 * strt r1, [r8, #255] # CHECK-NEXT: 0 0 0.00 U itet eq # CHECK-NEXT: 1 1 0.50 subeq r1, r2, #4 # CHECK-NEXT: 1 1 0.50 subwne r5, r3, #1023 diff --git a/llvm/test/tools/llvm-mca/ARM/m4-int.s b/llvm/test/tools/llvm-mca/ARM/m4-int.s index c2468efea2051..1ede2c8ce53fa 100644 --- a/llvm/test/tools/llvm-mca/ARM/m4-int.s +++ b/llvm/test/tools/llvm-mca/ARM/m4-int.s @@ -776,7 +776,7 @@ yield # CHECK-NEXT: 1 1 1.00 * strb r0, [r1, r2] # CHECK-NEXT: 1 1 1.00 * strb.w r0, [r1, r2] # CHECK-NEXT: 1 1 1.00 * strb.w r0, [r1, r2, lsl #1] -# CHECK-NEXT: 1 1 1.00 U strbt r0, [r1, #1] +# CHECK-NEXT: 1 1 1.00 * strbt r0, [r1, #1] # CHECK-NEXT: 1 1 1.00 * strd r0, r1, [r2, #4] # CHECK-NEXT: 1 1 1.00 * strd r0, r1, [r2], #4 # CHECK-NEXT: 1 1 1.00 * strd r0, r1, [r2, #4]! 
@@ -793,8 +793,8 @@ yield # CHECK-NEXT: 1 1 1.00 * strh r0, [r1, r2] # CHECK-NEXT: 1 1 1.00 * strh.w r0, [r1, r2] # CHECK-NEXT: 1 1 1.00 * strh.w r0, [r1, r2, lsl #1] -# CHECK-NEXT: 1 1 1.00 U strht r0, [r1, #1] -# CHECK-NEXT: 1 1 1.00 U strt r0, [r1, #1] +# CHECK-NEXT: 1 1 1.00 * strht r0, [r1, #1] +# CHECK-NEXT: 1 1 1.00 * strt r0, [r1, #1] # CHECK-NEXT: 1 1 1.00 U sub sp, #4 # CHECK-NEXT: 1 1 1.00 sub.w r0, sp, #1 # CHECK-NEXT: 1 1 1.00 subs.w r0, sp, #1 diff --git a/llvm/test/tools/llvm-mca/ARM/m55-int.s b/llvm/test/tools/llvm-mca/ARM/m55-int.s index 9347aa1a09a20..0810bf9aa3209 100644 --- a/llvm/test/tools/llvm-mca/ARM/m55-int.s +++ b/llvm/test/tools/llvm-mca/ARM/m55-int.s @@ -836,7 +836,7 @@ yield # CHECK-NEXT: 1 1 1.00 * strb r0, [r1, r2] # CHECK-NEXT: 1 1 1.00 * strb.w r0, [r1, r2] # CHECK-NEXT: 1 1 1.00 * strb.w r0, [r1, r2, lsl #1] -# CHECK-NEXT: 1 1 1.00 U strbt r0, [r1, #1] +# CHECK-NEXT: 1 1 1.00 * strbt r0, [r1, #1] # CHECK-NEXT: 1 2 1.00 * strd r0, r1, [r2, #4] # CHECK-NEXT: 1 2 1.00 * strd r0, r1, [r2], #4 # CHECK-NEXT: 1 2 1.00 * strd r0, r1, [r2, #4]! @@ -853,8 +853,8 @@ yield # CHECK-NEXT: 1 1 1.00 * strh r0, [r1, r2] # CHECK-NEXT: 1 1 1.00 * strh.w r0, [r1, r2] # CHECK-NEXT: 1 1 1.00 * strh.w r0, [r1, r2, lsl #1] -# CHECK-NEXT: 1 1 1.00 U strht r0, [r1, #1] -# CHECK-NEXT: 1 1 1.00 U strt r0, [r1, #1] +# CHECK-NEXT: 1 1 1.00 * strht r0, [r1, #1] +# CHECK-NEXT: 1 1 1.00 * strt r0, [r1, #1] # CHECK-NEXT: 1 1 1.00 U sub sp, #4 # CHECK-NEXT: 1 1 0.50 sub.w r0, sp, #1 # CHECK-NEXT: 1 1 0.50 subs.w r0, sp, #1 diff --git a/llvm/test/tools/llvm-mca/ARM/m7-int.s b/llvm/test/tools/llvm-mca/ARM/m7-int.s index 4ec6ed56c924e..9140b619a0928 100644 --- a/llvm/test/tools/llvm-mca/ARM/m7-int.s +++ b/llvm/test/tools/llvm-mca/ARM/m7-int.s @@ -752,7 +752,7 @@ yield # CHECK-NEXT: 1 3 1.00 * strb r0, [r1, r2] # CHECK-NEXT: 1 3 1.00 * strb.w r0, [r1, r2] # CHECK-NEXT: 1 3 1.00 * strb.w r0, [r1, r2, lsl #1] -# CHECK-NEXT: 1 3 1.00 U strbt r0, [r1, #1] +# CHECK-NEXT: 1 3 1.00 * strbt r0, [r1, #1] # CHECK-NEXT: 1 3 1.00 * strd r0, r1, [r2, #4] # CHECK-NEXT: 1 3 1.00 * strd r0, r1, [r2], #4 # CHECK-NEXT: 1 3 1.00 * strd r0, r1, [r2, #4]! @@ -769,8 +769,8 @@ yield # CHECK-NEXT: 1 3 1.00 * strh r0, [r1, r2] # CHECK-NEXT: 1 3 1.00 * strh.w r0, [r1, r2] # CHECK-NEXT: 1 3 1.00 * strh.w r0, [r1, r2, lsl #1] -# CHECK-NEXT: 1 3 1.00 U strht r0, [r1, #1] -# CHECK-NEXT: 1 3 1.00 U strt r0, [r1, #1] +# CHECK-NEXT: 1 3 1.00 * strht r0, [r1, #1] +# CHECK-NEXT: 1 3 1.00 * strt r0, [r1, #1] # CHECK-NEXT: 1 1 0.50 subs r0, r1, #1 # CHECK-NEXT: 1 1 0.50 subs r0, #1 # CHECK-NEXT: 1 1 0.50 sub.w r0, r1, #1 diff --git a/llvm/test/tools/llvm-mca/ARM/m85-int.s b/llvm/test/tools/llvm-mca/ARM/m85-int.s index ae6570e104863..39a3e07f94097 100644 --- a/llvm/test/tools/llvm-mca/ARM/m85-int.s +++ b/llvm/test/tools/llvm-mca/ARM/m85-int.s @@ -865,7 +865,7 @@ yield.w # CHECK-NEXT: 1 3 0.50 * strb r0, [r1, r2] # CHECK-NEXT: 1 3 0.50 * strb.w r0, [r1, r2] # CHECK-NEXT: 1 3 0.50 * strb.w r0, [r1, r2, lsl #1] -# CHECK-NEXT: 1 3 0.50 U strbt r0, [r1, #1] +# CHECK-NEXT: 1 3 0.50 * strbt r0, [r1, #1] # CHECK-NEXT: 1 3 1.00 * strd r0, r1, [r2, #4] # CHECK-NEXT: 1 3 1.00 * strd r0, r1, [r2], #4 # CHECK-NEXT: 1 3 1.00 * strd r0, r1, [r2, #4]! 
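A note on the `U` -> `*` flips in these llvm-mca checks: they fall directly out of the flag change. In the instruction info table, `*` marks an instruction that may load or store, while `U` marks unmodeled side effects. The following is a simplified standalone model of that marker choice, not llvm-mca's actual printing code; the flag names loosely mirror the MCInstrDesc queries involved:

```cpp
#include <iostream>
#include <string>

// Simplified stand-ins for the instruction description flags.
struct InstFlags {
  bool MayLoad = false;
  bool MayStore = false;
  bool HasUnmodeledSideEffects = false;
};

// '*' is shown for instructions that may touch memory; 'U' for ones whose
// effects the scheduling model cannot see.
std::string schedInfoMarker(const InstFlags &F) {
  if (F.MayLoad || F.MayStore)
    return "*";
  if (F.HasUnmodeledSideEffects)
    return "U";
  return " ";
}

int main() {
  InstFlags StrbtBefore{false, false, true}; // hasSideEffects only
  InstFlags StrbtAfter{false, true, false};  // mayStore = 1, hasSideEffects = 0
  std::cout << schedInfoMarker(StrbtBefore) << '\n'; // U
  std::cout << schedInfoMarker(StrbtAfter) << '\n';  // *
}
```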
@@ -882,8 +882,8 @@ yield.w
 # CHECK-NEXT: 1 3 0.50 * strh r0, [r1, r2]
 # CHECK-NEXT: 1 3 0.50 * strh.w r0, [r1, r2]
 # CHECK-NEXT: 1 3 0.50 * strh.w r0, [r1, r2, lsl #1]
-# CHECK-NEXT: 1 3 0.50 U strht r0, [r1, #1]
-# CHECK-NEXT: 1 3 0.50 U strt r0, [r1, #1]
+# CHECK-NEXT: 1 3 0.50 * strht r0, [r1, #1]
+# CHECK-NEXT: 1 3 0.50 * strt r0, [r1, #1]
 # CHECK-NEXT: 1 2 0.50 U sub sp, #4
 # CHECK-NEXT: 1 1 0.50 sub.w r0, sp, #1
 # CHECK-NEXT: 1 1 0.50 subs.w r0, sp, #1

From a39aaf35d3858a5542f532e399482c2bb0259dac Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu
Date: Mon, 13 Jan 2025 13:42:59 -0800
Subject: [PATCH 336/408] Reland: "[Exegesis] Add the ability to dry-run the
 measurement phase (#121991)" (#122775)

This relands f8f8598fd886cddfd374fa43eb6d7d37d301b576

Follow-up on #122371:

The problem here is a little subtle: when we dry-run the measurement phase,
we create an LLJIT instance without actually executing the snippets. The key
is that LLJIT has its own TargetMachine, which uses the triple designated by
LLVM_TARGET_ARCH (which defaults to the host). On a machine that does not
support Exegesis, the LLJIT would fail to create its TargetMachine, because
llvm-exegesis doesn't even register the host's target!

Putting this test into any of the target-specific folders won't help, because
it's about the host. And personally I don't really want to use
`exegesis-can-execute-<arch>` for generic tests like this -- it's too strict,
as we don't actually need to execute the snippet.

My solution here is creating another test feature which is added only when
LLVM_TARGET_ARCH is supported by llvm-exegesis. This feature sits somewhere
between `<arch>-registered-target` and `exegesis-can-execute-<arch>`.
---
 llvm/docs/CommandGuide/llvm-exegesis.rst        |  1 +
 .../llvm-exegesis/dry-run-measurement.test      | 11 +++++++
 llvm/test/tools/llvm-exegesis/lit.local.cfg     |  6 ++++
 .../tools/llvm-exegesis/lib/BenchmarkResult.h   |  1 +
 .../llvm-exegesis/lib/BenchmarkRunner.cpp       | 33 ++++++++++++++-----
 llvm/tools/llvm-exegesis/lib/Target.cpp         |  4 +--
 llvm/tools/llvm-exegesis/llvm-exegesis.cpp      |  9 +++--
 7 files changed, 52 insertions(+), 13 deletions(-)
 create mode 100644 llvm/test/tools/llvm-exegesis/dry-run-measurement.test

diff --git a/llvm/docs/CommandGuide/llvm-exegesis.rst b/llvm/docs/CommandGuide/llvm-exegesis.rst
index 8266d891a5e6b..d357c2ceea418 100644
--- a/llvm/docs/CommandGuide/llvm-exegesis.rst
+++ b/llvm/docs/CommandGuide/llvm-exegesis.rst
@@ -301,6 +301,7 @@ OPTIONS
  * ``prepare-and-assemble-snippet``: Same as ``prepare-snippet``, but also dumps an excerpt of the sequence (hex encoded).
  * ``assemble-measured-code``: Same as ``prepare-and-assemble-snippet``. but also creates the full sequence that can be dumped to a file using ``--dump-object-to-disk``.
  * ``measure``: Same as ``assemble-measured-code``, but also runs the measurement.
+ * ``dry-run-measurement``: Same as measure, but does not actually execute the snippet.
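To make the new phase concrete, here is a minimal sketch of the ordering trick the rest of this patch relies on: `dry-run-measurement` is deliberately ordered after `measure`, so existing checks can become `>=` comparisons, while the executor skips only the snippet call itself. This is a standalone model, not the llvm-exegesis sources; `startCounters`/`stopCounters` are hypothetical stand-ins for the perf-counter calls.

```cpp
#include <cassert>
#include <functional>

enum class BenchmarkPhase {
  PrepareSnippet,
  PrepareAndAssembleSnippet,
  AssembleMeasuredCode,
  Measure,
  DryRunMeasure, // everything 'measure' does, except running the snippet
};

void runMeasurement(BenchmarkPhase Phase, const std::function<void()> &Snippet) {
  // Checks written as `Phase >= Measure` now also cover the dry run.
  assert(Phase >= BenchmarkPhase::Measure);
  // startCounters();  // hypothetical perf-counter start
  if (Phase != BenchmarkPhase::DryRunMeasure)
    Snippet(); // only a real measurement executes the generated code
  // stopCounters();   // hypothetical perf-counter stop
}

int main() {
  int Executions = 0;
  auto Snippet = [&Executions] { ++Executions; };
  runMeasurement(BenchmarkPhase::Measure, Snippet);       // runs the snippet
  runMeasurement(BenchmarkPhase::DryRunMeasure, Snippet); // counters only
  assert(Executions == 1);
}
```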
.. option:: --x86-lbr-sample-period=

diff --git a/llvm/test/tools/llvm-exegesis/dry-run-measurement.test b/llvm/test/tools/llvm-exegesis/dry-run-measurement.test
new file mode 100644
index 0000000000000..02e1ec521cf27
--- /dev/null
+++ b/llvm/test/tools/llvm-exegesis/dry-run-measurement.test
@@ -0,0 +1,11 @@
+# RUN: llvm-exegesis --mtriple=riscv64 --mcpu=sifive-p470 --mode=latency --opcode-name=ADD --use-dummy-perf-counters --benchmark-phase=dry-run-measurement | FileCheck %s
+# REQUIRES: riscv-registered-target && native-registered-exegesis-target
+
+# This test makes sure that llvm-exegesis doesn't execute "cross-compiled" snippets in the presence of
+# --dry-run-measurement. RISC-V was chosen simply because most of the time we run tests on X86 machines.
+
+# Should not contain misleading results.
+# CHECK: measurements: []
+
+# Should not contain error messages like "snippet crashed while running: Segmentation fault".
+# CHECK: error: ''
diff --git a/llvm/test/tools/llvm-exegesis/lit.local.cfg b/llvm/test/tools/llvm-exegesis/lit.local.cfg
index a51a2d73442fa..343f34c58673e 100644
--- a/llvm/test/tools/llvm-exegesis/lit.local.cfg
+++ b/llvm/test/tools/llvm-exegesis/lit.local.cfg
@@ -30,6 +30,12 @@ def can_use_perf_counters(mode, extra_options=[]):
         print("could not exec llvm-exegesis")
         return False

+# LLJIT builds its own TargetMachine using the arch designated by
+# LLVM_TARGET_ARCH, which defaults to the host. We don't want tests that use
+# LLJIT (but don't necessarily execute the snippets) to run on machines that
+# are not even supported by exegesis.
+if config.root.native_target in ["AArch64", "Mips", "PowerPC", "RISCV", "X86"]:
+    config.available_features.add("native-registered-exegesis-target")

 for arch in ["aarch64", "mips", "powerpc", "x86_64"]:
     if can_execute_generated_snippets(arch):
diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h
index 3c09a8380146e..5480d85616878 100644
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h
@@ -38,6 +38,7 @@ enum class BenchmarkPhaseSelectorE {
   PrepareAndAssembleSnippet,
   AssembleMeasuredCode,
   Measure,
+  DryRunMeasure,
 };

 enum class BenchmarkFilter { All, RegOnly, WithMem };
diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
index a7771b99e97b1..cc46f7feb6cf7 100644
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
@@ -99,7 +99,7 @@ class InProcessFunctionExecutorImpl : public BenchmarkRunner::FunctionExecutor {
   static Expected<std::unique_ptr<InProcessFunctionExecutorImpl>>
   create(const LLVMState &State, object::OwningBinary<object::ObjectFile> Obj,
          BenchmarkRunner::ScratchSpace *Scratch,
-         std::optional<int> BenchmarkProcessCPU) {
+         std::optional<int> BenchmarkProcessCPU, bool DryRun) {
     Expected<ExecutableFunction> EF =
         ExecutableFunction::create(State.createTargetMachine(), std::move(Obj));

@@ -107,14 +107,17 @@ class InProcessFunctionExecutorImpl : public BenchmarkRunner::FunctionExecutor {
       return EF.takeError();

     return std::unique_ptr<InProcessFunctionExecutorImpl>(
-        new InProcessFunctionExecutorImpl(State, std::move(*EF), Scratch));
+        new InProcessFunctionExecutorImpl(State, std::move(*EF), Scratch,
+                                          DryRun));
   }

 private:
   InProcessFunctionExecutorImpl(const LLVMState &State,
                                 ExecutableFunction Function,
-                                BenchmarkRunner::ScratchSpace *Scratch)
-      : State(State), Function(std::move(Function)), Scratch(Scratch) {}
+                                BenchmarkRunner::ScratchSpace *Scratch,
+                                bool DryRun)
+      : State(State), Function(std::move(Function)), Scratch(Scratch),
DryRun(DryRun) {} static void accumulateCounterValues(const SmallVector &NewValues, SmallVector *Result) { @@ -143,9 +146,14 @@ class InProcessFunctionExecutorImpl : public BenchmarkRunner::FunctionExecutor { CrashRecoveryContext CRC; CrashRecoveryContext::Enable(); const bool Crashed = !CRC.RunSafely([this, Counter, ScratchPtr]() { - Counter->start(); - this->Function(ScratchPtr); - Counter->stop(); + if (DryRun) { + Counter->start(); + Counter->stop(); + } else { + Counter->start(); + this->Function(ScratchPtr); + Counter->stop(); + } }); CrashRecoveryContext::Disable(); PS.reset(); @@ -177,6 +185,7 @@ class InProcessFunctionExecutorImpl : public BenchmarkRunner::FunctionExecutor { const LLVMState &State; const ExecutableFunction Function; BenchmarkRunner::ScratchSpace *const Scratch; + bool DryRun = false; }; #ifdef __linux__ @@ -664,6 +673,9 @@ Expected> BenchmarkRunner::createFunctionExecutor( object::OwningBinary ObjectFile, const BenchmarkKey &Key, std::optional BenchmarkProcessCPU) const { + bool DryRun = + BenchmarkPhaseSelector == BenchmarkPhaseSelectorE::DryRunMeasure; + switch (ExecutionMode) { case ExecutionModeE::InProcess: { if (BenchmarkProcessCPU.has_value()) @@ -671,7 +683,8 @@ BenchmarkRunner::createFunctionExecutor( "support benchmark core pinning."); auto InProcessExecutorOrErr = InProcessFunctionExecutorImpl::create( - State, std::move(ObjectFile), Scratch.get(), BenchmarkProcessCPU); + State, std::move(ObjectFile), Scratch.get(), BenchmarkProcessCPU, + DryRun); if (!InProcessExecutorOrErr) return InProcessExecutorOrErr.takeError(); @@ -679,6 +692,10 @@ BenchmarkRunner::createFunctionExecutor( } case ExecutionModeE::SubProcess: { #ifdef __linux__ + if (DryRun) + return make_error("The subprocess execution mode cannot " + "dry-run measurement at this moment."); + auto SubProcessExecutorOrErr = SubProcessFunctionExecutorImpl::create( State, std::move(ObjectFile), Key, BenchmarkProcessCPU); if (!SubProcessExecutorOrErr) diff --git a/llvm/tools/llvm-exegesis/lib/Target.cpp b/llvm/tools/llvm-exegesis/lib/Target.cpp index 29e58692f0e92..e2251ff978888 100644 --- a/llvm/tools/llvm-exegesis/lib/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/Target.cpp @@ -98,7 +98,7 @@ ExegesisTarget::createBenchmarkRunner( return nullptr; case Benchmark::Latency: case Benchmark::InverseThroughput: - if (BenchmarkPhaseSelector == BenchmarkPhaseSelectorE::Measure && + if (BenchmarkPhaseSelector >= BenchmarkPhaseSelectorE::Measure && !PfmCounters.CycleCounter) { const char *ModeName = Mode == Benchmark::Latency ? 
"latency" @@ -116,7 +116,7 @@ ExegesisTarget::createBenchmarkRunner( State, Mode, BenchmarkPhaseSelector, ResultAggMode, ExecutionMode, ValidationCounters, BenchmarkRepeatCount); case Benchmark::Uops: - if (BenchmarkPhaseSelector == BenchmarkPhaseSelectorE::Measure && + if (BenchmarkPhaseSelector >= BenchmarkPhaseSelectorE::Measure && !PfmCounters.UopsCounter && !PfmCounters.IssueCounters) return make_error( "can't run 'uops' mode, sched model does not define uops or issue " diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp index fa37e05956be8..07bd44ee64f1f 100644 --- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp +++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp @@ -132,7 +132,10 @@ static cl::opt BenchmarkPhaseSelector( clEnumValN( BenchmarkPhaseSelectorE::Measure, "measure", "Same as prepare-measured-code, but also runs the measurement " - "(default)")), + "(default)"), + clEnumValN( + BenchmarkPhaseSelectorE::DryRunMeasure, "dry-run-measurement", + "Same as measure, but does not actually execute the snippet")), cl::init(BenchmarkPhaseSelectorE::Measure)); static cl::opt @@ -476,7 +479,7 @@ static void runBenchmarkConfigurations( } void benchmarkMain() { - if (BenchmarkPhaseSelector == BenchmarkPhaseSelectorE::Measure && + if (BenchmarkPhaseSelector >= BenchmarkPhaseSelectorE::Measure && !UseDummyPerfCounters) { #ifndef HAVE_LIBPFM ExitWithError( @@ -501,7 +504,7 @@ void benchmarkMain() { // Preliminary check to ensure features needed for requested // benchmark mode are present on target CPU and/or OS. - if (BenchmarkPhaseSelector == BenchmarkPhaseSelectorE::Measure) + if (BenchmarkPhaseSelector >= BenchmarkPhaseSelectorE::Measure) ExitOnErr(State.getExegesisTarget().checkFeatureSupport()); if (ExecutionMode == BenchmarkRunner::ExecutionModeE::SubProcess && From 0f3aeca16fc1de8d172fd14c908ebbd0fe61eeb4 Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Mon, 13 Jan 2025 16:48:00 -0500 Subject: [PATCH 337/408] [AMDGPU][True16][CodeGen] Update and/or/xor codegen pattern for i16 (#121835) In true16 flow, remove and/or/xor 32bit patterns for i16 --- llvm/lib/Target/AMDGPU/VOP2Instructions.td | 24 ++++++++++++++++++---- llvm/test/CodeGen/AMDGPU/uaddsat.ll | 8 +++----- llvm/test/CodeGen/AMDGPU/usubsat.ll | 5 ++--- 3 files changed, 25 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index ca4a0fa706c30..6bbf19179b7f6 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -1261,23 +1261,39 @@ class ZExt_i16_i1_Pat : GCNPat < $src) >; -foreach vt = [i16, v2i16] in { +foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in +let True16Predicate = p in { def : GCNPat < - (and vt:$src0, vt:$src1), + (and i16:$src0, i16:$src1), (V_AND_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1) >; def : GCNPat < - (or vt:$src0, vt:$src1), + (or i16:$src0, i16:$src1), (V_OR_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1) >; def : GCNPat < - (xor vt:$src0, vt:$src1), + (xor i16:$src0, i16:$src1), (V_XOR_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1) >; } +def : GCNPat < + (and v2i16:$src0, v2i16:$src1), + (V_AND_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1) +>; + +def : GCNPat < + (or v2i16:$src0, v2i16:$src1), + (V_OR_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1) +>; + +def : GCNPat < + (xor v2i16:$src0, v2i16:$src1), + (V_XOR_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1) +>; + let Predicates = [Has16BitInsts, isGFX8GFX9] in { // Undo sub x, c -> add x, -c canonicalization 
diff --git a/llvm/test/CodeGen/AMDGPU/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
index 2775de29368fb..572793e1c5d71 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
@@ -42,12 +42,10 @@ define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX11-TRUE16-LABEL: v_uaddsat_i8:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v0.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_min_u16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll
index 775602ab80cde..75866e33da23a 100644
--- a/llvm/test/CodeGen/AMDGPU/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll
@@ -39,9 +39,8 @@ define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX11-TRUE16-LABEL: v_usubsat_i8:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_sub_nc_u16 v0.l, v0.l, v0.h clamp
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;

From 283dca56f8dddbf2f144730a01675c94b04f57cb Mon Sep 17 00:00:00 2001
From: Daniel Paoliello
Date: Mon, 13 Jan 2025 14:00:14 -0800
Subject: [PATCH 338/408] Reapply "[aarch64][win] Add support for import call
 optimization (equivalent to MSVC /d2ImportCallOptimization) (#121516)"
 (#122777)

This reverts commit 2f7ade4b5e399962e18f5f9a0ab0b7335deece51.
Fix is available in #122762 --- llvm/include/llvm/CodeGen/MIRYamlMapping.h | 45 +++++-- llvm/include/llvm/CodeGen/MachineFunction.h | 25 ++++ llvm/include/llvm/CodeGen/SelectionDAG.h | 14 +++ llvm/include/llvm/MC/MCObjectFileInfo.h | 5 + llvm/include/llvm/MC/MCStreamer.h | 8 ++ llvm/include/llvm/MC/MCWinCOFFObjectWriter.h | 1 + llvm/include/llvm/MC/MCWinCOFFStreamer.h | 2 + llvm/lib/CodeGen/MIRParser/MIRParser.cpp | 74 ++++++++++-- llvm/lib/CodeGen/MIRPrinter.cpp | 33 ++++- .../SelectionDAG/ScheduleDAGSDNodes.cpp | 4 + llvm/lib/MC/MCAsmStreamer.cpp | 14 +++ llvm/lib/MC/MCObjectFileInfo.cpp | 5 + llvm/lib/MC/MCParser/COFFAsmParser.cpp | 34 ++++++ llvm/lib/MC/MCStreamer.cpp | 4 + llvm/lib/MC/MCWinCOFFStreamer.cpp | 114 ++++++++++++++++++ llvm/lib/MC/WinCOFFObjectWriter.cpp | 27 +++-- llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp | 72 +++++++++++ .../Target/AArch64/AArch64ISelLowering.cpp | 14 ++- .../win-import-call-optimization-nocalls.ll | 18 +++ .../AArch64/win-import-call-optimization.ll | 48 ++++++++ .../CodeGen/MIR/AArch64/called-globals.mir | 61 ++++++++++ .../CodeGen/MIR/X86/call-site-info-error1.mir | 2 +- .../CodeGen/MIR/X86/call-site-info-error2.mir | 2 +- .../MC/AArch64/win-import-call-optimization.s | 72 +++++++++++ llvm/test/MC/COFF/bad-parse.s | 13 ++ 25 files changed, 673 insertions(+), 38 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/win-import-call-optimization-nocalls.ll create mode 100644 llvm/test/CodeGen/AArch64/win-import-call-optimization.ll create mode 100644 llvm/test/CodeGen/MIR/AArch64/called-globals.mir create mode 100644 llvm/test/MC/AArch64/win-import-call-optimization.s create mode 100644 llvm/test/MC/COFF/bad-parse.s diff --git a/llvm/include/llvm/CodeGen/MIRYamlMapping.h b/llvm/include/llvm/CodeGen/MIRYamlMapping.h index 09a6ca936fe1f..dbad3469d047d 100644 --- a/llvm/include/llvm/CodeGen/MIRYamlMapping.h +++ b/llvm/include/llvm/CodeGen/MIRYamlMapping.h @@ -457,6 +457,16 @@ template <> struct ScalarTraits { static QuotingType mustQuote(StringRef S) { return needsQuotes(S); } }; +/// Identifies call instruction location in machine function. +struct MachineInstrLoc { + unsigned BlockNum; + unsigned Offset; + + bool operator==(const MachineInstrLoc &Other) const { + return BlockNum == Other.BlockNum && Offset == Other.Offset; + } +}; + /// Serializable representation of CallSiteInfo. struct CallSiteInfo { // Representation of call argument and register which is used to @@ -470,16 +480,6 @@ struct CallSiteInfo { } }; - /// Identifies call instruction location in machine function. 
- struct MachineInstrLoc { - unsigned BlockNum; - unsigned Offset; - - bool operator==(const MachineInstrLoc &Other) const { - return BlockNum == Other.BlockNum && Offset == Other.Offset; - } - }; - MachineInstrLoc CallLocation; std::vector ArgForwardingRegs; @@ -595,6 +595,26 @@ template <> struct MappingTraits { } }; +struct CalledGlobal { + MachineInstrLoc CallSite; + StringValue Callee; + unsigned Flags; + + bool operator==(const CalledGlobal &Other) const { + return CallSite == Other.CallSite && Callee == Other.Callee && + Flags == Other.Flags; + } +}; + +template <> struct MappingTraits { + static void mapping(IO &YamlIO, CalledGlobal &CG) { + YamlIO.mapRequired("bb", CG.CallSite.BlockNum); + YamlIO.mapRequired("offset", CG.CallSite.Offset); + YamlIO.mapRequired("callee", CG.Callee); + YamlIO.mapRequired("flags", CG.Flags); + } +}; + } // end namespace yaml } // end namespace llvm @@ -606,6 +626,7 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::FixedMachineStackObject) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::CallSiteInfo) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::MachineConstantPoolValue) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::MachineJumpTable::Entry) +LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::CalledGlobal) namespace llvm { namespace yaml { @@ -764,6 +785,7 @@ struct MachineFunction { std::vector DebugValueSubstitutions; MachineJumpTable JumpTableInfo; std::vector MachineMetadataNodes; + std::vector CalledGlobals; BlockStringValue Body; }; @@ -822,6 +844,9 @@ template <> struct MappingTraits { if (!YamlIO.outputting() || !MF.MachineMetadataNodes.empty()) YamlIO.mapOptional("machineMetadataNodes", MF.MachineMetadataNodes, std::vector()); + if (!YamlIO.outputting() || !MF.CalledGlobals.empty()) + YamlIO.mapOptional("calledGlobals", MF.CalledGlobals, + std::vector()); YamlIO.mapOptional("body", MF.Body, BlockStringValue()); } }; diff --git a/llvm/include/llvm/CodeGen/MachineFunction.h b/llvm/include/llvm/CodeGen/MachineFunction.h index d696add8a1af5..282aee2a69c4d 100644 --- a/llvm/include/llvm/CodeGen/MachineFunction.h +++ b/llvm/include/llvm/CodeGen/MachineFunction.h @@ -354,6 +354,11 @@ class LLVM_ABI MachineFunction { /// a table of valid targets for Windows EHCont Guard. std::vector CatchretTargets; + /// Mapping of call instruction to the global value and target flags that it + /// calls, if applicable. + DenseMap> + CalledGlobalsMap; + /// \name Exception Handling /// \{ @@ -1182,6 +1187,26 @@ class LLVM_ABI MachineFunction { CatchretTargets.push_back(Target); } + /// Tries to get the global and target flags for a call site, if the + /// instruction is a call to a global. + std::pair + tryGetCalledGlobal(const MachineInstr *MI) const { + return CalledGlobalsMap.lookup(MI); + } + + /// Notes the global and target flags for a call site. + void addCalledGlobal(const MachineInstr *MI, + std::pair Details) { + assert(MI && "MI must not be null"); + assert(Details.first && "Global must not be null"); + CalledGlobalsMap.insert({MI, Details}); + } + + /// Iterates over the full set of call sites and their associated globals. 
+ auto getCalledGlobals() const { + return llvm::make_range(CalledGlobalsMap.begin(), CalledGlobalsMap.end()); + } + /// \name Exception Handling /// \{ diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index ff7caec41855f..b31ad11c3ee0e 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -293,6 +293,7 @@ class SelectionDAG { MDNode *HeapAllocSite = nullptr; MDNode *PCSections = nullptr; MDNode *MMRA = nullptr; + std::pair CalledGlobal{}; bool NoMerge = false; }; /// Out-of-line extra information for SDNodes. @@ -2373,6 +2374,19 @@ class SelectionDAG { auto It = SDEI.find(Node); return It != SDEI.end() ? It->second.MMRA : nullptr; } + /// Set CalledGlobal to be associated with Node. + void addCalledGlobal(const SDNode *Node, const GlobalValue *GV, + unsigned OpFlags) { + SDEI[Node].CalledGlobal = {GV, OpFlags}; + } + /// Return CalledGlobal associated with Node, or a nullopt if none exists. + std::optional> + getCalledGlobal(const SDNode *Node) { + auto I = SDEI.find(Node); + return I != SDEI.end() + ? std::make_optional(std::move(I->second).CalledGlobal) + : std::nullopt; + } /// Set NoMergeSiteInfo to be associated with Node if NoMerge is true. void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge) { if (NoMerge) diff --git a/llvm/include/llvm/MC/MCObjectFileInfo.h b/llvm/include/llvm/MC/MCObjectFileInfo.h index e2a2c84e47910..fb575fe721015 100644 --- a/llvm/include/llvm/MC/MCObjectFileInfo.h +++ b/llvm/include/llvm/MC/MCObjectFileInfo.h @@ -73,6 +73,10 @@ class MCObjectFileInfo { /// to emit them into. MCSection *CompactUnwindSection = nullptr; + /// If import call optimization is supported by the target, this is the + /// section to emit import call data to. + MCSection *ImportCallSection = nullptr; + // Dwarf sections for debug info. If a target supports debug info, these must // be set. MCSection *DwarfAbbrevSection = nullptr; @@ -269,6 +273,7 @@ class MCObjectFileInfo { MCSection *getBSSSection() const { return BSSSection; } MCSection *getReadOnlySection() const { return ReadOnlySection; } MCSection *getLSDASection() const { return LSDASection; } + MCSection *getImportCallSection() const { return ImportCallSection; } MCSection *getCompactUnwindSection() const { return CompactUnwindSection; } MCSection *getDwarfAbbrevSection() const { return DwarfAbbrevSection; } MCSection *getDwarfInfoSection() const { return DwarfInfoSection; } diff --git a/llvm/include/llvm/MC/MCStreamer.h b/llvm/include/llvm/MC/MCStreamer.h index 21da4dac4872b..558b14cebfd3d 100644 --- a/llvm/include/llvm/MC/MCStreamer.h +++ b/llvm/include/llvm/MC/MCStreamer.h @@ -569,6 +569,14 @@ class MCStreamer { /// \param Symbol - Symbol the image relative relocation should point to. virtual void emitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset); + /// Emits the physical number of the section containing the given symbol as + /// assigned during object writing (i.e., this is not a runtime relocation). + virtual void emitCOFFSecNumber(MCSymbol const *Symbol); + + /// Emits the offset of the symbol from the beginning of the section during + /// object writing (i.e., this is not a runtime relocation). + virtual void emitCOFFSecOffset(MCSymbol const *Symbol); + /// Emits an lcomm directive with XCOFF csect information. /// /// \param LabelSym - Label on the block of storage. 
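A note on the semantics of the two new streamer callbacks introduced here: both resolve to plain constants at object-write time, not to relocations the loader processes. `emitCOFFSecNumber` wants the 1-based ordinal a section receives in the final section table, and `emitCOFFSecOffset` the symbol's distance from its section start. A rough standalone model of what gets baked in (not the MC implementation):

```cpp
#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

// COFF section numbers are 1-based and only exist once the writer has fixed
// the section order, which is why evaluation must happen at write time.
uint32_t sectionNumber(const std::vector<std::string> &SectionOrder,
                       const std::string &Name) {
  for (uint32_t I = 0; I < SectionOrder.size(); ++I)
    if (SectionOrder[I] == Name)
      return I + 1;
  return 0; // 0 means "not assigned"; the real writer asserts against this
}

int main() {
  std::vector<std::string> Order = {".text", ".data", ".impcall"};
  // emitCOFFSecNumber would bake 3 into the object for a symbol in .impcall;
  // emitCOFFSecOffset would bake in the symbol's offset within that section.
  assert(sectionNumber(Order, ".impcall") == 3);
}
```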
diff --git a/llvm/include/llvm/MC/MCWinCOFFObjectWriter.h b/llvm/include/llvm/MC/MCWinCOFFObjectWriter.h index a4ede61e45099..13d8c7d060c9e 100644 --- a/llvm/include/llvm/MC/MCWinCOFFObjectWriter.h +++ b/llvm/include/llvm/MC/MCWinCOFFObjectWriter.h @@ -72,6 +72,7 @@ class WinCOFFObjectWriter final : public MCObjectWriter { const MCFixup &Fixup, MCValue Target, uint64_t &FixedValue) override; uint64_t writeObject(MCAssembler &Asm) override; + int getSectionNumber(const MCSection &Section) const; }; /// Construct a new Win COFF writer instance. diff --git a/llvm/include/llvm/MC/MCWinCOFFStreamer.h b/llvm/include/llvm/MC/MCWinCOFFStreamer.h index 5c39d80538944..2425abe51e6dd 100644 --- a/llvm/include/llvm/MC/MCWinCOFFStreamer.h +++ b/llvm/include/llvm/MC/MCWinCOFFStreamer.h @@ -58,6 +58,8 @@ class MCWinCOFFStreamer : public MCObjectStreamer { void emitCOFFSectionIndex(MCSymbol const *Symbol) override; void emitCOFFSecRel32(MCSymbol const *Symbol, uint64_t Offset) override; void emitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) override; + void emitCOFFSecNumber(MCSymbol const *Symbol) override; + void emitCOFFSecOffset(MCSymbol const *Symbol) override; void emitCommonSymbol(MCSymbol *Symbol, uint64_t Size, Align ByteAlignment) override; void emitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp index e2543f883f91c..de2fe925c2d5c 100644 --- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp @@ -158,6 +158,9 @@ class MIRParserImpl { MachineFunction &MF, const yaml::MachineFunction &YMF); + bool parseCalledGlobals(PerFunctionMIParsingState &PFS, MachineFunction &MF, + const yaml::MachineFunction &YMF); + private: bool parseMDNode(PerFunctionMIParsingState &PFS, MDNode *&Node, const yaml::StringValue &Source); @@ -183,6 +186,9 @@ class MIRParserImpl { void setupDebugValueTracking(MachineFunction &MF, PerFunctionMIParsingState &PFS, const yaml::MachineFunction &YamlMF); + + bool parseMachineInst(MachineFunction &MF, yaml::MachineInstrLoc MILoc, + MachineInstr const *&MI); }; } // end namespace llvm @@ -457,24 +463,34 @@ bool MIRParserImpl::computeFunctionProperties( return false; } +bool MIRParserImpl::parseMachineInst(MachineFunction &MF, + yaml::MachineInstrLoc MILoc, + MachineInstr const *&MI) { + if (MILoc.BlockNum >= MF.size()) { + return error(Twine(MF.getName()) + + Twine(" instruction block out of range.") + + " Unable to reference bb:" + Twine(MILoc.BlockNum)); + } + auto BB = std::next(MF.begin(), MILoc.BlockNum); + if (MILoc.Offset >= BB->size()) + return error( + Twine(MF.getName()) + Twine(" instruction offset out of range.") + + " Unable to reference instruction at bb: " + Twine(MILoc.BlockNum) + + " at offset:" + Twine(MILoc.Offset)); + MI = &*std::next(BB->instr_begin(), MILoc.Offset); + return false; +} + bool MIRParserImpl::initializeCallSiteInfo( PerFunctionMIParsingState &PFS, const yaml::MachineFunction &YamlMF) { MachineFunction &MF = PFS.MF; SMDiagnostic Error; const TargetMachine &TM = MF.getTarget(); for (auto &YamlCSInfo : YamlMF.CallSitesInfo) { - yaml::CallSiteInfo::MachineInstrLoc MILoc = YamlCSInfo.CallLocation; - if (MILoc.BlockNum >= MF.size()) - return error(Twine(MF.getName()) + - Twine(" call instruction block out of range.") + - " Unable to reference bb:" + Twine(MILoc.BlockNum)); - auto CallB = std::next(MF.begin(), MILoc.BlockNum); - if (MILoc.Offset >= CallB->size()) - return error(Twine(MF.getName()) + - Twine(" call 
instruction offset out of range.") + - " Unable to reference instruction at bb: " + - Twine(MILoc.BlockNum) + " at offset:" + Twine(MILoc.Offset)); - auto CallI = std::next(CallB->instr_begin(), MILoc.Offset); + yaml::MachineInstrLoc MILoc = YamlCSInfo.CallLocation; + const MachineInstr *CallI; + if (parseMachineInst(MF, MILoc, CallI)) + return true; if (!CallI->isCall(MachineInstr::IgnoreBundle)) return error(Twine(MF.getName()) + Twine(" call site info should reference call " @@ -641,6 +657,9 @@ MIRParserImpl::initializeMachineFunction(const yaml::MachineFunction &YamlMF, if (initializeCallSiteInfo(PFS, YamlMF)) return true; + if (parseCalledGlobals(PFS, MF, YamlMF)) + return true; + setupDebugValueTracking(MF, PFS, YamlMF); MF.getSubtarget().mirFileLoaded(MF); @@ -1111,6 +1130,37 @@ bool MIRParserImpl::parseMachineMetadataNodes( return false; } +bool MIRParserImpl::parseCalledGlobals(PerFunctionMIParsingState &PFS, + MachineFunction &MF, + const yaml::MachineFunction &YMF) { + Function &F = MF.getFunction(); + for (const auto &YamlCG : YMF.CalledGlobals) { + yaml::MachineInstrLoc MILoc = YamlCG.CallSite; + const MachineInstr *CallI; + if (parseMachineInst(MF, MILoc, CallI)) + return true; + if (!CallI->isCall(MachineInstr::IgnoreBundle)) + return error(Twine(MF.getName()) + + Twine(" called global should reference call " + "instruction. Instruction at bb:") + + Twine(MILoc.BlockNum) + " at offset:" + Twine(MILoc.Offset) + + " is not a call instruction"); + + auto Callee = + F.getParent()->getValueSymbolTable().lookup(YamlCG.Callee.Value); + if (!Callee) + return error(YamlCG.Callee.SourceRange.Start, + "use of undefined global '" + YamlCG.Callee.Value + "'"); + if (!isa(Callee)) + return error(YamlCG.Callee.SourceRange.Start, + "use of non-global value '" + YamlCG.Callee.Value + "'"); + + MF.addCalledGlobal(CallI, {cast(Callee), YamlCG.Flags}); + } + + return false; +} + SMDiagnostic MIRParserImpl::diagFromMIStringDiag(const SMDiagnostic &Error, SMRange SourceRange) { assert(SourceRange.isValid() && "Invalid source range"); diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index c8f6341c1224d..b6da495590fe1 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -133,6 +133,9 @@ class MIRPrinter { void convertMachineMetadataNodes(yaml::MachineFunction &YMF, const MachineFunction &MF, MachineModuleSlotTracker &MST); + void convertCalledGlobals(yaml::MachineFunction &YMF, + const MachineFunction &MF, + MachineModuleSlotTracker &MST); private: void initRegisterMaskIds(const MachineFunction &MF); @@ -269,6 +272,8 @@ void MIRPrinter::print(const MachineFunction &MF) { // function. convertMachineMetadataNodes(YamlMF, MF, MST); + convertCalledGlobals(YamlMF, MF, MST); + yaml::Output Out(OS); if (!SimplifyMIR) Out.setWriteDefaultValues(true); @@ -555,7 +560,7 @@ void MIRPrinter::convertCallSiteObjects(yaml::MachineFunction &YMF, const auto *TRI = MF.getSubtarget().getRegisterInfo(); for (auto CSInfo : MF.getCallSitesInfo()) { yaml::CallSiteInfo YmlCS; - yaml::CallSiteInfo::MachineInstrLoc CallLocation; + yaml::MachineInstrLoc CallLocation; // Prepare instruction position. 
MachineBasicBlock::const_instr_iterator CallI = CSInfo.first->getIterator(); @@ -596,6 +601,32 @@ void MIRPrinter::convertMachineMetadataNodes(yaml::MachineFunction &YMF, } } +void MIRPrinter::convertCalledGlobals(yaml::MachineFunction &YMF, + const MachineFunction &MF, + MachineModuleSlotTracker &MST) { + for (const auto &[CallInst, CG] : MF.getCalledGlobals()) { + // If the call instruction was dropped, then we don't need to print it. + auto BB = CallInst->getParent(); + if (BB) { + yaml::MachineInstrLoc CallSite; + CallSite.BlockNum = CallInst->getParent()->getNumber(); + CallSite.Offset = std::distance(CallInst->getParent()->instr_begin(), + CallInst->getIterator()); + + yaml::CalledGlobal YamlCG{CallSite, CG.first->getName().str(), CG.second}; + YMF.CalledGlobals.push_back(YamlCG); + } + } + + // Sort by position of call instructions. + llvm::sort(YMF.CalledGlobals.begin(), YMF.CalledGlobals.end(), + [](yaml::CalledGlobal A, yaml::CalledGlobal B) { + if (A.CallSite.BlockNum == B.CallSite.BlockNum) + return A.CallSite.Offset < B.CallSite.Offset; + return A.CallSite.BlockNum < B.CallSite.BlockNum; + }); +} + void MIRPrinter::convert(yaml::MachineFunction &MF, const MachineConstantPool &ConstantPool) { unsigned ID = 0; diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index dff7243b0a99c..bafe26ff7d6b7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -908,6 +908,10 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) { It->setMMRAMetadata(MF, MMRA); } + if (auto CalledGlobal = DAG->getCalledGlobal(Node)) + if (CalledGlobal->first) + MF.addCalledGlobal(MI, *CalledGlobal); + return MI; }; diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp index 01fe11ed20501..dd8058c6d5cd8 100644 --- a/llvm/lib/MC/MCAsmStreamer.cpp +++ b/llvm/lib/MC/MCAsmStreamer.cpp @@ -209,6 +209,8 @@ class MCAsmStreamer final : public MCStreamer { void emitCOFFSectionIndex(MCSymbol const *Symbol) override; void emitCOFFSecRel32(MCSymbol const *Symbol, uint64_t Offset) override; void emitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) override; + void emitCOFFSecNumber(MCSymbol const *Symbol) override; + void emitCOFFSecOffset(MCSymbol const *Symbol) override; void emitXCOFFLocalCommonSymbol(MCSymbol *LabelSym, uint64_t Size, MCSymbol *CsectSym, Align Alignment) override; void emitXCOFFSymbolLinkageWithVisibility(MCSymbol *Symbol, @@ -893,6 +895,18 @@ void MCAsmStreamer::emitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) { EmitEOL(); } +void MCAsmStreamer::emitCOFFSecNumber(MCSymbol const *Symbol) { + OS << "\t.secnum\t"; + Symbol->print(OS, MAI); + EmitEOL(); +} + +void MCAsmStreamer::emitCOFFSecOffset(MCSymbol const *Symbol) { + OS << "\t.secoffset\t"; + Symbol->print(OS, MAI); + EmitEOL(); +} + // We need an XCOFF-specific version of this directive as the AIX syntax // requires a QualName argument identifying the csect name and storage mapping // class to appear before the alignment if we are specifying it. 
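One detail worth calling out in the printer change above: `getCalledGlobals()` walks a `DenseMap`, whose iteration order is unstable, so the sort is what makes the YAML output deterministic. The comparator is equivalent to lexicographic ordering on (block number, offset); a standalone sketch using the idiomatic `std::tie` form:

```cpp
#include <algorithm>
#include <tuple>
#include <vector>

struct CallSite {
  unsigned BlockNum;
  unsigned Offset;
};

void sortCallSites(std::vector<CallSite> &Sites) {
  std::sort(Sites.begin(), Sites.end(),
            [](const CallSite &A, const CallSite &B) {
              // Same ordering as the hand-written comparison in
              // convertCalledGlobals: block first, then offset within block.
              return std::tie(A.BlockNum, A.Offset) <
                     std::tie(B.BlockNum, B.Offset);
            });
}

int main() {
  std::vector<CallSite> S = {{2, 5}, {1, 9}, {1, 2}};
  sortCallSites(S); // -> (1,2), (1,9), (2,5)
}
```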
diff --git a/llvm/lib/MC/MCObjectFileInfo.cpp b/llvm/lib/MC/MCObjectFileInfo.cpp index f37e138edc36b..150e38a94db6a 100644 --- a/llvm/lib/MC/MCObjectFileInfo.cpp +++ b/llvm/lib/MC/MCObjectFileInfo.cpp @@ -596,6 +596,11 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) { COFF::IMAGE_SCN_MEM_READ); } + if (T.getArch() == Triple::aarch64) { + ImportCallSection = + Ctx->getCOFFSection(".impcall", COFF::IMAGE_SCN_LNK_INFO); + } + // Debug info. COFFDebugSymbolsSection = Ctx->getCOFFSection(".debug$S", (COFF::IMAGE_SCN_MEM_DISCARDABLE | diff --git a/llvm/lib/MC/MCParser/COFFAsmParser.cpp b/llvm/lib/MC/MCParser/COFFAsmParser.cpp index 4d95a72085283..dd5ce9964a194 100644 --- a/llvm/lib/MC/MCParser/COFFAsmParser.cpp +++ b/llvm/lib/MC/MCParser/COFFAsmParser.cpp @@ -70,6 +70,8 @@ class COFFAsmParser : public MCAsmParserExtension { addDirectiveHandler<&COFFAsmParser::parseDirectiveSymbolAttribute>( ".weak_anti_dep"); addDirectiveHandler<&COFFAsmParser::parseDirectiveCGProfile>(".cg_profile"); + addDirectiveHandler<&COFFAsmParser::parseDirectiveSecNum>(".secnum"); + addDirectiveHandler<&COFFAsmParser::parseDirectiveSecOffset>(".secoffset"); // Win64 EH directives. addDirectiveHandler<&COFFAsmParser::parseSEHDirectiveStartProc>( @@ -126,6 +128,8 @@ class COFFAsmParser : public MCAsmParserExtension { bool parseDirectiveLinkOnce(StringRef, SMLoc); bool parseDirectiveRVA(StringRef, SMLoc); bool parseDirectiveCGProfile(StringRef, SMLoc); + bool parseDirectiveSecNum(StringRef, SMLoc); + bool parseDirectiveSecOffset(StringRef, SMLoc); // Win64 EH directives. bool parseSEHDirectiveStartProc(StringRef, SMLoc); @@ -577,6 +581,36 @@ bool COFFAsmParser::parseDirectiveSymIdx(StringRef, SMLoc) { return false; } +bool COFFAsmParser::parseDirectiveSecNum(StringRef, SMLoc) { + StringRef SymbolID; + if (getParser().parseIdentifier(SymbolID)) + return TokError("expected identifier in directive"); + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in directive"); + + MCSymbol *Symbol = getContext().getOrCreateSymbol(SymbolID); + + Lex(); + getStreamer().emitCOFFSecNumber(Symbol); + return false; +} + +bool COFFAsmParser::parseDirectiveSecOffset(StringRef, SMLoc) { + StringRef SymbolID; + if (getParser().parseIdentifier(SymbolID)) + return TokError("expected identifier in directive"); + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in directive"); + + MCSymbol *Symbol = getContext().getOrCreateSymbol(SymbolID); + + Lex(); + getStreamer().emitCOFFSecOffset(Symbol); + return false; +} + /// ::= [ identifier ] bool COFFAsmParser::parseCOMDATType(COFF::COMDATType &Type) { StringRef TypeId = getTok().getIdentifier(); diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp index ccf65df150e78..e690723c0e502 100644 --- a/llvm/lib/MC/MCStreamer.cpp +++ b/llvm/lib/MC/MCStreamer.cpp @@ -1023,6 +1023,10 @@ void MCStreamer::emitCOFFSecRel32(MCSymbol const *Symbol, uint64_t Offset) {} void MCStreamer::emitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) {} +void MCStreamer::emitCOFFSecNumber(MCSymbol const *Symbol) {} + +void MCStreamer::emitCOFFSecOffset(MCSymbol const *Symbol) {} + /// EmitRawText - If this file is backed by an assembly streamer, this dumps /// the specified string in the output .s file. This capability is /// indicated by the hasRawTextSupport() predicate. 
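Mechanically, `.secnum` and `.secoffset` lower the same way in the object streamer that follows: emit four zero bytes now and record a fixup whose expression is evaluated once the COFF writer knows final section numbers and offsets. A toy version of that two-phase scheme (assumptions: 32-bit little-endian values, and a plain function pointer standing in for the MCExpr evaluation):

```cpp
#include <cstdint>
#include <vector>

struct Fixup {
  size_t OffsetInFragment; // where the 4 placeholder bytes live
  uint32_t (*Evaluate)();  // resolved late, once layout is known
};

struct Fragment {
  std::vector<uint8_t> Contents;
  std::vector<Fixup> Fixups;

  // Mirrors the "append 4 zero bytes, then push a fixup" pattern.
  void emitPlaceholder32(uint32_t (*Evaluate)()) {
    Fixups.push_back({Contents.size(), Evaluate});
    Contents.insert(Contents.end(), 4, 0);
  }

  void applyFixups() {
    for (const Fixup &F : Fixups) {
      uint32_t V = F.Evaluate();
      for (int I = 0; I < 4; ++I) // patch in little-endian order
        Contents[F.OffsetInFragment + I] = uint8_t(V >> (8 * I));
    }
  }
};

int main() {
  Fragment F;
  F.emitPlaceholder32([]() -> uint32_t { return 0x13; }); // e.g. a section number
  F.applyFixups(); // F.Contents is now {0x13, 0, 0, 0}
}
```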
diff --git a/llvm/lib/MC/MCWinCOFFStreamer.cpp b/llvm/lib/MC/MCWinCOFFStreamer.cpp index 395d4db3103d7..8fd46bc8b0255 100644 --- a/llvm/lib/MC/MCWinCOFFStreamer.cpp +++ b/llvm/lib/MC/MCWinCOFFStreamer.cpp @@ -29,6 +29,7 @@ #include "llvm/MC/MCSectionCOFF.h" #include "llvm/MC/MCSymbolCOFF.h" #include "llvm/MC/MCTargetOptions.h" +#include "llvm/MC/MCValue.h" #include "llvm/MC/MCWinCOFFObjectWriter.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" @@ -43,6 +44,91 @@ using namespace llvm; #define DEBUG_TYPE "WinCOFFStreamer" +/// MCExpr that represents the physical number for the sections that contains +/// a symbol. +class MCCOFFSectionNumberTargetExpr final : public MCTargetExpr { + const MCSymbol &SectionSymbol; + const WinCOFFObjectWriter &Writer; + + MCCOFFSectionNumberTargetExpr(const MCSymbol &SectionSymbol_, + const WinCOFFObjectWriter &Writer_) + : SectionSymbol(SectionSymbol_), Writer(Writer_) {} + +public: + static MCCOFFSectionNumberTargetExpr * + create(const MCSymbol &SectionSymbol, const WinCOFFObjectWriter &Writer, + MCContext &Ctx) { + return new (Ctx) MCCOFFSectionNumberTargetExpr(SectionSymbol, Writer); + } + + void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override { + OS << ":secnum:"; + SectionSymbol.print(OS, MAI); + } + + bool evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, + const MCFixup *Fixup) const override { + auto sectionNumber = Writer.getSectionNumber(SectionSymbol.getSection()); + assert(sectionNumber != 0 && + "Containing section was not assigned a number"); + Res = MCValue::get(sectionNumber); + return true; + } + + void visitUsedExpr(MCStreamer &Streamer) const override { + // Contains no sub-expressions. + } + + MCFragment *findAssociatedFragment() const override { + return SectionSymbol.getFragment(); + } + + void fixELFSymbolsInTLSFixups(MCAssembler &) const override { + llvm_unreachable("Not supported for ELF"); + } +}; + +/// MCExpr that represents the offset to a symbol from the beginning of its +/// section. +class MCCOFFSectionOffsetTargetExpr final : public MCTargetExpr { + const MCSymbol &Symbol; + + MCCOFFSectionOffsetTargetExpr(const MCSymbol &Symbol_) : Symbol(Symbol_) {} + +public: + static MCCOFFSectionOffsetTargetExpr *create(const MCSymbol &Symbol, + MCContext &Ctx) { + return new (Ctx) MCCOFFSectionOffsetTargetExpr(Symbol); + } + + void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override { + OS << ":secoffset:"; + Symbol.print(OS, MAI); + } + + bool evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, + const MCFixup *Fixup) const override { + uint64_t CallsiteOffset = 0; + if (!Asm->getSymbolOffset(Symbol, CallsiteOffset)) { + return true; + } + Res = MCValue::get(CallsiteOffset); + return true; + } + + void visitUsedExpr(MCStreamer &Streamer) const override { + // Contains no sub-expressions. + } + + MCFragment *findAssociatedFragment() const override { + return Symbol.getFragment(); + } + + void fixELFSymbolsInTLSFixups(MCAssembler &) const override { + llvm_unreachable("Not supported for ELF"); + } +}; + MCWinCOFFStreamer::MCWinCOFFStreamer(MCContext &Context, std::unique_ptr MAB, std::unique_ptr CE, @@ -280,6 +366,34 @@ void MCWinCOFFStreamer::emitCOFFImgRel32(const MCSymbol *Symbol, DF->appendContents(4, 0); } +void MCWinCOFFStreamer::emitCOFFSecNumber(MCSymbol const *Symbol) { + visitUsedSymbol(*Symbol); + MCDataFragment *DF = getOrCreateDataFragment(); + // Create Symbol for section number. 
+ const MCExpr *MCE = MCCOFFSectionNumberTargetExpr::create( + *Symbol, this->getWriter(), getContext()); + // Build the relocation. + MCFixup Fixup = MCFixup::create(DF->getContents().size(), MCE, FK_Data_4); + // Record the relocation. + DF->getFixups().push_back(Fixup); + // Emit 4 bytes (zeros) to the object file. + DF->appendContents(4, 0); +} + +void MCWinCOFFStreamer::emitCOFFSecOffset(MCSymbol const *Symbol) { + visitUsedSymbol(*Symbol); + MCDataFragment *DF = getOrCreateDataFragment(); + // Create Symbol for section offset. + const MCExpr *MCE = + MCCOFFSectionOffsetTargetExpr::create(*Symbol, getContext()); + // Build the relocation. + MCFixup Fixup = MCFixup::create(DF->getContents().size(), MCE, FK_Data_4); + // Record the relocation. + DF->getFixups().push_back(Fixup); + // Emit 4 bytes (zeros) to the object file. + DF->appendContents(4, 0); +} + void MCWinCOFFStreamer::emitCommonSymbol(MCSymbol *S, uint64_t Size, Align ByteAlignment) { auto *Symbol = cast(S); diff --git a/llvm/lib/MC/WinCOFFObjectWriter.cpp b/llvm/lib/MC/WinCOFFObjectWriter.cpp index 09d2b08e43050..39e02d0522bcf 100644 --- a/llvm/lib/MC/WinCOFFObjectWriter.cpp +++ b/llvm/lib/MC/WinCOFFObjectWriter.cpp @@ -163,6 +163,7 @@ class llvm::WinCOFFWriter { const MCFixup &Fixup, MCValue Target, uint64_t &FixedValue); uint64_t writeObject(MCAssembler &Asm); + int getSectionNumber(const MCSection &Section) const; private: COFFSymbol *createSymbol(StringRef Name); @@ -818,6 +819,15 @@ void WinCOFFWriter::executePostLayoutBinding(MCAssembler &Asm) { if (!Symbol.isTemporary() || cast(Symbol).getClass() == COFF::IMAGE_SYM_CLASS_STATIC) defineSymbol(Asm, Symbol); + + UseBigObj = Sections.size() > COFF::MaxNumberOfSections16; + Header.NumberOfSections = Sections.size(); + Header.NumberOfSymbols = 0; + if (Sections.size() > INT32_MAX) + report_fatal_error( + "PE COFF object files can't have more than 2147483647 sections"); + + assignSectionNumbers(); } void WinCOFFWriter::recordRelocation(MCAssembler &Asm, @@ -980,16 +990,7 @@ static std::time_t getTime() { uint64_t WinCOFFWriter::writeObject(MCAssembler &Asm) { uint64_t StartOffset = W.OS.tell(); - if (Sections.size() > INT32_MAX) - report_fatal_error( - "PE COFF object files can't have more than 2147483647 sections"); - - UseBigObj = Sections.size() > COFF::MaxNumberOfSections16; - Header.NumberOfSections = Sections.size(); - Header.NumberOfSymbols = 0; - setWeakDefaultNames(); - assignSectionNumbers(); if (Mode != DwoOnly) createFileSymbols(Asm); @@ -1143,6 +1144,10 @@ uint64_t WinCOFFWriter::writeObject(MCAssembler &Asm) { return W.OS.tell() - StartOffset; } +int WinCOFFWriter::getSectionNumber(const MCSection &Section) const { + return SectionMap.at(&Section)->Number; +} + //------------------------------------------------------------------------------ // WinCOFFObjectWriter class implementation @@ -1194,6 +1199,10 @@ uint64_t WinCOFFObjectWriter::writeObject(MCAssembler &Asm) { return TotalSize; } +int WinCOFFObjectWriter::getSectionNumber(const MCSection &Section) const { + return ObjWriter->getSectionNumber(Section); +} + MCWinCOFFObjectTargetWriter::MCWinCOFFObjectTargetWriter(unsigned Machine_) : Machine(Machine_) {} diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index 9d9d9889b3858..27e65d60122fd 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -24,6 +24,7 @@ #include "MCTargetDesc/AArch64TargetStreamer.h" #include 
"TargetInfo/AArch64TargetInfo.h" #include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" @@ -77,6 +78,11 @@ static cl::opt PtrauthAuthChecks( cl::desc("Check pointer authentication auth/resign failures"), cl::init(Default)); +static cl::opt EnableImportCallOptimization( + "aarch64-win-import-call-optimization", cl::Hidden, + cl::desc("Enable import call optimization for AArch64 Windows"), + cl::init(false)); + #define DEBUG_TYPE "asm-printer" namespace { @@ -89,6 +95,8 @@ class AArch64AsmPrinter : public AsmPrinter { #ifndef NDEBUG unsigned InstsEmitted; #endif + DenseMap>> + SectionToImportedFunctionCalls; public: AArch64AsmPrinter(TargetMachine &TM, std::unique_ptr Streamer) @@ -293,6 +301,11 @@ class AArch64AsmPrinter : public AsmPrinter { MCSymbol *LazyPointer) override; void emitMachOIFuncStubHelperBody(Module &M, const GlobalIFunc &GI, MCSymbol *LazyPointer) override; + + /// Checks if this instruction is part of a sequence that is eligle for import + /// call optimization and, if so, records it to be emitted in the import call + /// section. + void recordIfImportCall(const MachineInstr *BranchInst); }; } // end anonymous namespace @@ -930,6 +943,38 @@ void AArch64AsmPrinter::emitEndOfAsmFile(Module &M) { // Emit stack and fault map information. FM.serializeToFaultMapSection(); + // If import call optimization is enabled, emit the appropriate section. + // We do this whether or not we recorded any import calls. + if (EnableImportCallOptimization && TT.isOSBinFormatCOFF()) { + OutStreamer->switchSection(getObjFileLowering().getImportCallSection()); + + // Section always starts with some magic. + constexpr char ImpCallMagic[12] = "Imp_Call_V1"; + OutStreamer->emitBytes(StringRef{ImpCallMagic, sizeof(ImpCallMagic)}); + + // Layout of this section is: + // Per section that contains calls to imported functions: + // uint32_t SectionSize: Size in bytes for information in this section. + // uint32_t Section Number + // Per call to imported function in section: + // uint32_t Kind: the kind of imported function. + // uint32_t BranchOffset: the offset of the branch instruction in its + // parent section. + // uint32_t TargetSymbolId: the symbol id of the called function. + for (auto &[Section, CallsToImportedFuncs] : + SectionToImportedFunctionCalls) { + unsigned SectionSize = + sizeof(uint32_t) * (2 + 3 * CallsToImportedFuncs.size()); + OutStreamer->emitInt32(SectionSize); + OutStreamer->emitCOFFSecNumber(Section->getBeginSymbol()); + for (auto &[CallsiteSymbol, CalledSymbol] : CallsToImportedFuncs) { + // Kind is always IMAGE_REL_ARM64_DYNAMIC_IMPORT_CALL (0x13). 
+ OutStreamer->emitInt32(0x13); + OutStreamer->emitCOFFSecOffset(CallsiteSymbol); + OutStreamer->emitCOFFSymbolIndex(CalledSymbol); + } + } + } } void AArch64AsmPrinter::emitLOHs() { @@ -2703,6 +2748,7 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { case AArch64::TCRETURNriALL: { emitPtrauthTailCallHardening(MI); + recordIfImportCall(MI); MCInst TmpInst; TmpInst.setOpcode(AArch64::BR); TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg())); @@ -2714,6 +2760,7 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { MCOperand Dest; MCInstLowering.lowerOperand(MI->getOperand(0), Dest); + recordIfImportCall(MI); MCInst TmpInst; TmpInst.setOpcode(AArch64::B); TmpInst.addOperand(Dest); @@ -3044,6 +3091,14 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { TS->emitARM64WinCFISaveAnyRegQPX(MI->getOperand(0).getImm(), -MI->getOperand(2).getImm()); return; + + case AArch64::BLR: + case AArch64::BR: + recordIfImportCall(MI); + MCInst TmpInst; + MCInstLowering.Lower(MI, TmpInst); + EmitToStreamer(*OutStreamer, TmpInst); + return; } // Finally, do the automated lowerings for everything else. @@ -3052,6 +3107,23 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { EmitToStreamer(*OutStreamer, TmpInst); } +void AArch64AsmPrinter::recordIfImportCall( + const llvm::MachineInstr *BranchInst) { + if (!EnableImportCallOptimization || + !TM.getTargetTriple().isOSBinFormatCOFF()) + return; + + auto [GV, OpFlags] = BranchInst->getMF()->tryGetCalledGlobal(BranchInst); + if (GV && GV->hasDLLImportStorageClass()) { + auto *CallSiteSymbol = MMI->getContext().createNamedTempSymbol("impcall"); + OutStreamer->emitLabel(CallSiteSymbol); + + auto *CalledSymbol = MCInstLowering.GetGlobalValueSymbol(GV, OpFlags); + SectionToImportedFunctionCalls[OutStreamer->getCurrentSectionOnly()] + .push_back({CallSiteSymbol, CalledSymbol}); + } +} + void AArch64AsmPrinter::emitMachOIFuncStubBody(Module &M, const GlobalIFunc &GI, MCSymbol *LazyPointer) { // _ifunc: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 3b8e0f1958f86..d4a114c275fb7 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -9450,12 +9450,14 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol // node so that legalize doesn't hack it. 
+  const GlobalValue *CalledGlobal = nullptr;
+  unsigned OpFlags = 0;
   if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
-    auto GV = G->getGlobal();
-    unsigned OpFlags =
-        Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine());
+    CalledGlobal = G->getGlobal();
+    OpFlags = Subtarget->classifyGlobalFunctionReference(CalledGlobal,
+                                                         getTargetMachine());
     if (OpFlags & AArch64II::MO_GOT) {
-      Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
+      Callee = DAG.getTargetGlobalAddress(CalledGlobal, DL, PtrVT, 0, OpFlags);
       Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
     } else {
       const GlobalValue *GV = G->getGlobal();
@@ -9575,6 +9577,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
     DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
     DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
+    if (CalledGlobal)
+      DAG.addCalledGlobal(Ret.getNode(), CalledGlobal, OpFlags);
     return Ret;
   }
@@ -9586,6 +9590,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
   DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
   InGlue = Chain.getValue(1);
   DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
+  if (CalledGlobal)
+    DAG.addCalledGlobal(Chain.getNode(), CalledGlobal, OpFlags);

   uint64_t CalleePopBytes =
       DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
diff --git a/llvm/test/CodeGen/AArch64/win-import-call-optimization-nocalls.ll b/llvm/test/CodeGen/AArch64/win-import-call-optimization-nocalls.ll
new file mode 100644
index 0000000000000..81d6d6369dcbf
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/win-import-call-optimization-nocalls.ll
@@ -0,0 +1,18 @@
+; RUN: llc -mtriple=aarch64-pc-windows-msvc -aarch64-win-import-call-optimization < %s | FileCheck %s
+
+define dso_local void @normal_call() local_unnamed_addr {
+entry:
+  call void @a()
+  ret void
+}
+; CHECK-LABEL: normal_call:
+; CHECK:       bl a
+
+declare void @a() local_unnamed_addr
+
+; Even if there are no calls to imported functions, we still need to emit the
+; .impcall section.
+
+; CHECK-LABEL: .section .impcall,"yi"
+; CHECK-NEXT: .asciz "Imp_Call_V1"
+; CHECK-NOT: .secnum
diff --git a/llvm/test/CodeGen/AArch64/win-import-call-optimization.ll b/llvm/test/CodeGen/AArch64/win-import-call-optimization.ll
new file mode 100644
index 0000000000000..6bb118ba1e159
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/win-import-call-optimization.ll
@@ -0,0 +1,48 @@
+; RUN: llc -mtriple=aarch64-pc-windows-msvc -aarch64-win-import-call-optimization < %s | FileCheck %s --check-prefix=CHECK-ENABLED
+; RUN: llc -mtriple=aarch64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK-DISABLED
+
+; CHECK-DISABLED-NOT: .section .impcall
+
+define dso_local void @normal_call() local_unnamed_addr section "nc_sect" {
+entry:
+  call void @a()
+  call void @a()
+  ret void
+}
+; CHECK-ENABLED-LABEL: normal_call:
+; CHECK-ENABLED:       adrp [[ADRPREG:x[0-9]+]], __imp_a
+; CHECK-ENABLED-NEXT:  ldr [[LDRREG:x[0-9]+]], [[[ADRPREG]], :lo12:__imp_a]
+; CHECK-ENABLED-NEXT: .Limpcall0:
+; CHECK-ENABLED-NEXT:  blr [[LDRREG]]
+; CHECK-ENABLED-NEXT: .Limpcall1:
+; CHECK-ENABLED-NEXT:  blr [[LDRREG]]
+
+define dso_local void @tail_call() local_unnamed_addr section "tc_sect" {
+entry:
+  tail call void @b()
+  ret void
+}
+; CHECK-ENABLED-LABEL: tail_call:
+; CHECK-ENABLED:       adrp [[ADRPREG:x[0-9]+]], __imp_b
+; CHECK-ENABLED-NEXT:  ldr [[LDRREG:x[0-9]+]], [[[ADRPREG]], :lo12:__imp_b]
+; CHECK-ENABLED-NEXT: .Limpcall2:
+; CHECK-ENABLED-NEXT:  br [[LDRREG]]
+
+declare dllimport void @a() local_unnamed_addr
+declare dllimport void @b() local_unnamed_addr
+
+; CHECK-ENABLED-LABEL: .section .impcall,"yi"
+; CHECK-ENABLED-NEXT: .asciz "Imp_Call_V1"
+; CHECK-ENABLED-NEXT: .word 32
+; CHECK-ENABLED-NEXT: .secnum nc_sect
+; CHECK-ENABLED-NEXT: .word 19
+; CHECK-ENABLED-NEXT: .secoffset .Limpcall0
+; CHECK-ENABLED-NEXT: .symidx __imp_a
+; CHECK-ENABLED-NEXT: .word 19
+; CHECK-ENABLED-NEXT: .secoffset .Limpcall1
+; CHECK-ENABLED-NEXT: .symidx __imp_a
+; CHECK-ENABLED-NEXT: .word 20
+; CHECK-ENABLED-NEXT: .secnum tc_sect
+; CHECK-ENABLED-NEXT: .word 19
+; CHECK-ENABLED-NEXT: .secoffset .Limpcall2
+; CHECK-ENABLED-NEXT: .symidx __imp_b
diff --git a/llvm/test/CodeGen/MIR/AArch64/called-globals.mir b/llvm/test/CodeGen/MIR/AArch64/called-globals.mir
new file mode 100644
index 0000000000000..cf0f0a23e2d91
--- /dev/null
+++ b/llvm/test/CodeGen/MIR/AArch64/called-globals.mir
@@ -0,0 +1,61 @@
+# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass none -o - %s | FileCheck %s
+
+--- |
+  declare dllimport void @callee_func() local_unnamed_addr
+
+  define dso_local void @caller() local_unnamed_addr {
+  entry:
+    call void @callee_func()
+    call void @callee_func()
+    ret void
+  }
+...
+--- +name: caller +stack: + - { id: 0, name: '', type: spill-slot, offset: -8, size: 8, alignment: 8, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -16, size: 8, alignment: 8, + stack-id: default, callee-saved-register: '$x19', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +calledGlobals: + - bb: 0 + offset: 7 + callee: callee_func + flags: 144 + - bb: 0 + offset: 8 + callee: callee_func + flags: 144 +body: | + bb.0.entry: + liveins: $x19, $lr + + early-clobber $sp = frame-setup STRXpre killed $x19, $sp, -16 :: (store (s64) into %stack.1) + frame-setup SEH_SaveReg_X 19, -16 + frame-setup STRXui killed $lr, $sp, 1 :: (store (s64) into %stack.0) + frame-setup SEH_SaveReg 30, 8 + frame-setup SEH_PrologEnd + $x19 = ADRP target-flags(aarch64-page, aarch64-got, aarch64-dllimport) @callee_func + renamable $x19 = LDRXui killed $x19, target-flags(aarch64-pageoff, aarch64-got, aarch64-nc, aarch64-dllimport) @callee_func + BLR renamable $x19, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + BLR killed renamable $x19, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + frame-destroy SEH_EpilogStart + $lr = frame-destroy LDRXui $sp, 1 :: (load (s64) from %stack.0) + frame-destroy SEH_SaveReg 30, 8 + early-clobber $sp, $x19 = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.1) + frame-destroy SEH_SaveReg_X 19, -16 + frame-destroy SEH_EpilogEnd + RET undef $lr +... + +# CHECK-LABEL: calledGlobals: +# CHECK-NEXT: - bb: 0 +# CHECK-NEXT: offset: 7 +# CHECK-NEXT: callee: callee_func +# CHECK-NEXT: flags: 144 +# CHECK-NEXT: - bb: 0 +# CHECK-NEXT: offset: 8 +# CHECK-NEXT: callee: callee_func +# CHECK-NEXT: flags: 144 diff --git a/llvm/test/CodeGen/MIR/X86/call-site-info-error1.mir b/llvm/test/CodeGen/MIR/X86/call-site-info-error1.mir index 096a80f77dbb6..e4dab779216a8 100644 --- a/llvm/test/CodeGen/MIR/X86/call-site-info-error1.mir +++ b/llvm/test/CodeGen/MIR/X86/call-site-info-error1.mir @@ -1,5 +1,5 @@ # RUN: not llc -mtriple=x86_64-- -run-pass none -debug-entry-values %s -o - 2>&1 | FileCheck %s -# CHECK: baa call instruction block out of range. Unable to reference bb:1 +# CHECK: baa instruction block out of range. Unable to reference bb:1 --- | define dso_local i32 @baa(i32 %a) local_unnamed_addr { entry: diff --git a/llvm/test/CodeGen/MIR/X86/call-site-info-error2.mir b/llvm/test/CodeGen/MIR/X86/call-site-info-error2.mir index bd5b2451a8d76..183610b326eeb 100644 --- a/llvm/test/CodeGen/MIR/X86/call-site-info-error2.mir +++ b/llvm/test/CodeGen/MIR/X86/call-site-info-error2.mir @@ -1,5 +1,5 @@ # RUN: not llc -mtriple=x86_64-- -run-pass none -debug-entry-values %s -o - 2>&1 | FileCheck %s -# CHECK: baa call instruction offset out of range. Unable to reference instruction at bb: 0 at offset:1 +# CHECK: baa instruction offset out of range. 
Unable to reference instruction at bb: 0 at offset:1 --- | define dso_local i32 @baa(i32 %a) local_unnamed_addr { entry: diff --git a/llvm/test/MC/AArch64/win-import-call-optimization.s b/llvm/test/MC/AArch64/win-import-call-optimization.s new file mode 100644 index 0000000000000..f26e17b9b62cc --- /dev/null +++ b/llvm/test/MC/AArch64/win-import-call-optimization.s @@ -0,0 +1,72 @@ +// RUN: llvm-mc -triple aarch64-windows-msvc -filetype obj -o %t.obj %s +// RUN: llvm-readobj --sections --sd --relocs %t.obj | FileCheck %s + +.section nc_sect,"xr" +normal_call: + str x30, [sp, #-16]! // 8-byte Folded Spill + adrp x8, __imp_a + ldr x8, [x8, :lo12:__imp_a] +.Limpcall0: + blr x8 + ldr x30, [sp], #16 // 8-byte Folded Reload + ret + +.section tc_sect,"xr" +tail_call: + adrp x8, __imp_b + ldr x8, [x8, :lo12:__imp_b] +.Limpcall1: + br x8 + +.section .impcall,"yi" +.asciz "Imp_Call_V1" +.word 20 +.secnum nc_sect +.word 19 +.secoffset .Limpcall0 +.symidx __imp_a +.word 20 +.secnum tc_sect +.word 19 +.secoffset .Limpcall1 +.symidx __imp_b + +// CHECK-LABEL: Name: .impcall (2E 69 6D 70 63 61 6C 6C) +// CHECK-NEXT: VirtualSize: 0x0 +// CHECK-NEXT: VirtualAddress: 0x0 +// CHECK-NEXT: RawDataSize: 52 +// CHECK-NEXT: PointerToRawData: 0x150 +// CHECK-NEXT: PointerToRelocations: 0x0 +// CHECK-NEXT: PointerToLineNumbers: 0x0 +// CHECK-NEXT: RelocationCount: 0 +// CHECK-NEXT: LineNumberCount: 0 +// CHECK-NEXT: Characteristics [ +// CHECK-NEXT: IMAGE_SCN_ALIGN_4BYTES +// CHECK-NEXT: IMAGE_SCN_LNK_INFO +// CHECK-NEXT: ] +// CHECK-NEXT: SectionData ( +// CHECK-NEXT: 0000: 496D705F 43616C6C 5F563100 14000000 |Imp_Call_V1.....| +// CHECK-NEXT: 0010: +// CHECK-SAME: [[#%.2X,NCSECT:]]000000 +// CHECK-SAME: 13000000 +// CHECK-SAME: [[#%.2X,NCOFFSET:]]000000 +// CHECK-SAME: [[#%.2X,NCSYM:]]000000 +// CHECK-NEXT: 0020: +// CHECK-SAME: 14000000 +// CHECK-SAME: [[#%.2X,TCSECT:]]000000 +// CHECK-SAME: 13000000 +// CHECK-SAME: [[#%.2X,TCOFFSET:]]000000 +// CHECK-NEXT: 0030: +// CHECK-SAME: [[#%.2X,TCSYM:]]000000 +// CHECK-NEXT: ) + +// CHECK-LABEL: Relocations [ +// CHECK-NEXT: Section ([[#%u,NCSECT]]) nc_sect { +// CHECK-NEXT: 0x[[#%x,NCOFFSET - 8]] IMAGE_REL_ARM64_PAGEBASE_REL21 __imp_a ([[#%u,NCSYM]]) +// CHECK-NEXT: 0x[[#%x,NCOFFSET - 4]] IMAGE_REL_ARM64_PAGEOFFSET_12L __imp_a ([[#%u,NCSYM]]) +// CHECK-NEXT: } +// CHECK-NEXT: Section ([[#%u,TCSECT]]) tc_sect { +// CHECK-NEXT: 0x[[#%x,TCOFFSET - 8]] IMAGE_REL_ARM64_PAGEBASE_REL21 __imp_b ([[#%u,TCSYM]]) +// CHECK-NEXT: 0x[[#%x,TCOFFSET - 4]] IMAGE_REL_ARM64_PAGEOFFSET_12L __imp_b ([[#%u,TCSYM]]) +// CHECK-NEXT: } +// CHECK-NEXT: ] diff --git a/llvm/test/MC/COFF/bad-parse.s b/llvm/test/MC/COFF/bad-parse.s new file mode 100644 index 0000000000000..2491f41abeb4e --- /dev/null +++ b/llvm/test/MC/COFF/bad-parse.s @@ -0,0 +1,13 @@ +// RUN: not llvm-mc -filetype=obj -triple i386-pc-win32 %s 2>&1 | FileCheck %s + + .data + +// CHECK: [[@LINE+1]]:{{[0-9]+}}: error: expected identifier in directive + .secnum +// CHECK: [[@LINE+1]]:{{[0-9]+}}: error: unexpected token in directive + .secnum section extra + +// CHECK: [[@LINE+1]]:{{[0-9]+}}: error: expected identifier in directive + .secoffset +// CHECK: [[@LINE+1]]:{{[0-9]+}}: error: unexpected token in directive + .secoffset section extra From 19032bfe87fa0f4a3a7b3e68daafc93331b71e0d Mon Sep 17 00:00:00 2001 From: Daniel Paoliello Date: Mon, 13 Jan 2025 14:00:31 -0800 Subject: [PATCH 339/408] [aarch64][win] Update Called Globals info when updating Call Site info (#122762) Fixes the "use after poison" issue introduced by #121516 
(see ). The root cause of this issue is that #121516 introduced "Called
Global" information for call instructions, modeling how "Call Site" info is
stored in the machine function, HOWEVER it didn't replicate the copy/move/erase
operations that maintain call site information.

The fix is to rename and update the existing copy/move/erase functions so they
also take care of Called Global info.
---
 llvm/include/llvm/CodeGen/MachineFunction.h   | 39 ++++----
 llvm/include/llvm/CodeGen/MachineInstr.h      | 13 +--
 llvm/include/llvm/CodeGen/SelectionDAG.h      |  6 +-
 llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp    |  4 +-
 llvm/lib/CodeGen/BranchFolding.cpp            |  6 +-
 llvm/lib/CodeGen/IfConversion.cpp             | 18 ++--
 llvm/lib/CodeGen/InlineSpiller.cpp            |  6 +-
 llvm/lib/CodeGen/LiveRangeEdit.cpp            |  6 +-
 llvm/lib/CodeGen/MIRPrinter.cpp               | 19 ++--
 llvm/lib/CodeGen/MachineFunction.cpp          | 90 +++++++++++--------
 llvm/lib/CodeGen/MachineInstr.cpp             |  8 +-
 llvm/lib/CodeGen/MachineLICM.cpp              |  6 +-
 llvm/lib/CodeGen/MachineOutliner.cpp          |  4 +-
 llvm/lib/CodeGen/PeepholeOptimizer.cpp        |  6 +-
 .../SelectionDAG/ScheduleDAGSDNodes.cpp       | 14 +--
 llvm/lib/CodeGen/TailDuplicator.cpp           |  6 +-
 llvm/lib/CodeGen/TargetInstrInfo.cpp          |  6 +-
 llvm/lib/CodeGen/UnreachableBlockElim.cpp     |  6 +-
 llvm/lib/CodeGen/XRayInstrumentation.cpp      |  4 +-
 .../AArch64CleanupLocalDynamicTLSPass.cpp     |  6 +-
 .../AArch64/AArch64ExpandPseudoInsts.cpp      |  8 +-
 .../Target/AArch64/AArch64SLSHardening.cpp    |  2 +-
 llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp  | 21 +++--
 llvm/lib/Target/ARM/ARMSLSHardening.cpp       |  2 +-
 llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp  |  7 +-
 llvm/lib/Target/X86/X86ExpandPseudo.cpp       | 10 +--
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  4 +-
 .../X86/X86SpeculativeLoadHardening.cpp       |  6 +-
 28 files changed, 176 insertions(+), 157 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineFunction.h b/llvm/include/llvm/CodeGen/MachineFunction.h
index 282aee2a69c4d..d517b5e664729 100644
--- a/llvm/include/llvm/CodeGen/MachineFunction.h
+++ b/llvm/include/llvm/CodeGen/MachineFunction.h
@@ -354,11 +354,6 @@ class LLVM_ABI MachineFunction {
   /// a table of valid targets for Windows EHCont Guard.
   std::vector<MCSymbol *> CatchretTargets;

-  /// Mapping of call instruction to the global value and target flags that it
-  /// calls, if applicable.
-  DenseMap<const MachineInstr *, std::pair<const GlobalValue *, unsigned>>
-      CalledGlobalsMap;
-
   /// \name Exception Handling
   /// \{

@@ -494,6 +489,11 @@ class LLVM_ABI MachineFunction {
     SmallVector<ArgRegPair, 1> ArgRegPairs;
   };

+  struct CalledGlobalInfo {
+    const GlobalValue *Callee;
+    unsigned TargetFlags;
+  };
+
 private:
   Delegate *TheDelegate = nullptr;
   GISelChangeObserver *Observer = nullptr;
@@ -506,6 +506,11 @@ class LLVM_ABI MachineFunction {
   /// instruction if debug entry value support is enabled.
   CallSiteInfoMap::iterator getCallSiteInfo(const MachineInstr *MI);

+  using CalledGlobalsMap = DenseMap<const MachineInstr *, CalledGlobalInfo>;
+  /// Mapping of call instruction to the global value and target flags that it
+  /// calls, if applicable.
+  CalledGlobalsMap CalledGlobalsInfo;
+
   // Callbacks for insertion and removal.
   void handleInsertion(MachineInstr &MI);
   void handleRemoval(MachineInstr &MI);
@@ -1189,22 +1194,20 @@ class LLVM_ABI MachineFunction {

   /// Tries to get the global and target flags for a call site, if the
   /// instruction is a call to a global.
-  std::pair<const GlobalValue *, unsigned>
-  tryGetCalledGlobal(const MachineInstr *MI) const {
-    return CalledGlobalsMap.lookup(MI);
+  CalledGlobalInfo tryGetCalledGlobal(const MachineInstr *MI) const {
+    return CalledGlobalsInfo.lookup(MI);
   }

   /// Notes the global and target flags for a call site.
-  void addCalledGlobal(const MachineInstr *MI,
-                       std::pair<const GlobalValue *, unsigned> Details) {
+  void addCalledGlobal(const MachineInstr *MI, CalledGlobalInfo Details) {
     assert(MI && "MI must not be null");
-    assert(Details.first && "Global must not be null");
-    CalledGlobalsMap.insert({MI, Details});
+    assert(Details.Callee && "Global must not be null");
+    CalledGlobalsInfo.insert({MI, Details});
   }

   /// Iterates over the full set of call sites and their associated globals.
   auto getCalledGlobals() const {
-    return llvm::make_range(CalledGlobalsMap.begin(), CalledGlobalsMap.end());
+    return llvm::make_range(CalledGlobalsInfo.begin(), CalledGlobalsInfo.end());
   }

   /// \name Exception Handling
@@ -1383,7 +1386,7 @@ class LLVM_ABI MachineFunction {

   /// Start tracking the arguments passed to the call \p CallI.
   void addCallSiteInfo(const MachineInstr *CallI, CallSiteInfo &&CallInfo) {
-    assert(CallI->isCandidateForCallSiteEntry());
+    assert(CallI->isCandidateForAdditionalCallInfo());
     bool Inserted =
         CallSitesInfo.try_emplace(CallI, std::move(CallInfo)).second;
     (void)Inserted;
@@ -1399,18 +1402,16 @@ class LLVM_ABI MachineFunction {

   /// Erase the call site info for \p MI. It is used to remove a call
   /// instruction from the instruction stream.
-  void eraseCallSiteInfo(const MachineInstr *MI);
+  void eraseAdditionalCallInfo(const MachineInstr *MI);

   /// Copy the call site info from \p Old to \p New. Its usage is when we are
   /// making a copy of the instruction that will be inserted at different point
   /// of the instruction stream.
-  void copyCallSiteInfo(const MachineInstr *Old,
-                        const MachineInstr *New);
+  void copyAdditionalCallInfo(const MachineInstr *Old, const MachineInstr *New);

   /// Move the call site info from \p Old to \p New call site info. This function
   /// is used when we are replacing one call instruction with another one to
   /// the same callee.
-  void moveCallSiteInfo(const MachineInstr *Old,
-                        const MachineInstr *New);
+  void moveAdditionalCallInfo(const MachineInstr *Old, const MachineInstr *New);

   unsigned getNewDebugInstrNum() {
     return ++DebugInstrNumberingCount;
diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h
index 1932bb9bd3dab..efac83d9e1c92 100644
--- a/llvm/include/llvm/CodeGen/MachineInstr.h
+++ b/llvm/include/llvm/CodeGen/MachineInstr.h
@@ -957,13 +957,14 @@ class MachineInstr
     return hasProperty(MCID::Call, Type);
   }

-  /// Return true if this is a call instruction that may have an associated
-  /// call site entry in the debug info.
-  bool isCandidateForCallSiteEntry(QueryType Type = IgnoreBundle) const;
+  /// Return true if this is a call instruction that may have additional
+  /// information associated with it.
+  bool isCandidateForAdditionalCallInfo(QueryType Type = IgnoreBundle) const;
+
   /// Return true if copying, moving, or erasing this instruction requires
-  /// updating Call Site Info (see \ref copyCallSiteInfo, \ref moveCallSiteInfo,
-  /// \ref eraseCallSiteInfo).
-  bool shouldUpdateCallSiteInfo() const;
+  /// updating additional call info (see \ref copyCallInfo, \ref moveCallInfo,
+  /// \ref eraseCallInfo).
+  bool shouldUpdateAdditionalCallInfo() const;

   /// Returns true if the specified instruction stops control flow
   /// from executing the instruction immediately following it.
Examples include
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index b31ad11c3ee0e..ba0538f7084ee 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -287,13 +287,14 @@ class SelectionDAG {
   SDDbgInfo *DbgInfo;

   using CallSiteInfo = MachineFunction::CallSiteInfo;
+  using CalledGlobalInfo = MachineFunction::CalledGlobalInfo;

   struct NodeExtraInfo {
     CallSiteInfo CSInfo;
     MDNode *HeapAllocSite = nullptr;
     MDNode *PCSections = nullptr;
     MDNode *MMRA = nullptr;
-    std::pair<const GlobalValue *, unsigned> CalledGlobal{};
+    CalledGlobalInfo CalledGlobal{};
     bool NoMerge = false;
   };
   /// Out-of-line extra information for SDNodes.
@@ -2380,8 +2381,7 @@ class SelectionDAG {
     SDEI[Node].CalledGlobal = {GV, OpFlags};
   }
   /// Return CalledGlobal associated with Node, or a nullopt if none exists.
-  std::optional<std::pair<const GlobalValue *, unsigned>>
-  getCalledGlobal(const SDNode *Node) {
+  std::optional<CalledGlobalInfo> getCalledGlobal(const SDNode *Node) {
     auto I = SDEI.find(Node);
     return I != SDEI.end()
                ? std::make_optional(std::move(I->second).CalledGlobal)
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 11de4b61797bd..60d911d0383ed 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -918,7 +918,7 @@ void DwarfDebug::constructCallSiteEntryDIEs(const DISubprogram &SP,

     // Skip instructions which aren't calls. Both calls and tail-calling jump
     // instructions (e.g TAILJMPd64) are classified correctly here.
-    if (!MI.isCandidateForCallSiteEntry())
+    if (!MI.isCandidateForAdditionalCallInfo())
       continue;

     // Skip instructions marked as frame setup, as they are not interesting to
@@ -2019,7 +2019,7 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) {
   // When describing calls, we need a label for the call instruction.
   if (!NoDebug && SP->areAllCallsDescribed() &&
-      MI->isCandidateForCallSiteEntry(MachineInstr::AnyInBundle) &&
+      MI->isCandidateForAdditionalCallInfo(MachineInstr::AnyInBundle) &&
       (!MI->hasDelaySlot() || delaySlotSupported(*MI))) {
     const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
     bool IsTail = TII->isTailCall(*MI);
diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp
index 1dd7cccd90119..bc1a65064a8c5 100644
--- a/llvm/lib/CodeGen/BranchFolding.cpp
+++ b/llvm/lib/CodeGen/BranchFolding.cpp
@@ -165,10 +165,10 @@ void BranchFolder::RemoveDeadBlock(MachineBasicBlock *MBB) {
   // Avoid matching if this pointer gets reused.
   TriedMerging.erase(MBB);

-  // Update call site info.
+  // Update call info.
   for (const MachineInstr &MI : *MBB)
-    if (MI.shouldUpdateCallSiteInfo())
-      MF->eraseCallSiteInfo(&MI);
+    if (MI.shouldUpdateAdditionalCallInfo())
+      MF->eraseAdditionalCallInfo(&MI);

   // Remove the block.
   MF->erase(MBB);
diff --git a/llvm/lib/CodeGen/IfConversion.cpp b/llvm/lib/CodeGen/IfConversion.cpp
index 7b6d1465651d5..fa817097029aa 100644
--- a/llvm/lib/CodeGen/IfConversion.cpp
+++ b/llvm/lib/CodeGen/IfConversion.cpp
@@ -1834,9 +1834,9 @@ bool IfConverter::IfConvertDiamondCommon(
     }
     while (NumDups1 != 0) {
       // Since this instruction is going to be deleted, update call
-      // site info state if the instruction is call instruction.
-      if (DI2->shouldUpdateCallSiteInfo())
-        MBB2.getParent()->eraseCallSiteInfo(&*DI2);
+      // info state if the instruction is call instruction.
+      if (DI2->shouldUpdateAdditionalCallInfo())
+        MBB2.getParent()->eraseAdditionalCallInfo(&*DI2);

       ++DI2;
       if (DI2 == MBB2.end())
@@ -1883,9 +1883,9 @@ bool IfConverter::IfConvertDiamondCommon(
       --DI1;
       // Since this instruction is going to be deleted, update call
-      // site info state if the instruction is call instruction.
-      if (DI1->shouldUpdateCallSiteInfo())
-        MBB1.getParent()->eraseCallSiteInfo(&*DI1);
+      // info state if the instruction is call instruction.
+      if (DI1->shouldUpdateAdditionalCallInfo())
+        MBB1.getParent()->eraseAdditionalCallInfo(&*DI1);

       // skip dbg_value instructions
       if (!DI1->isDebugInstr())
@@ -2169,9 +2169,9 @@ void IfConverter::CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI,
       break;

     MachineInstr *MI = MF.CloneMachineInstr(&I);
-    // Make a copy of the call site info.
-    if (I.isCandidateForCallSiteEntry())
-      MF.copyCallSiteInfo(&I, MI);
+    // Make a copy of the call info.
+    if (I.isCandidateForAdditionalCallInfo())
+      MF.copyAdditionalCallInfo(&I, MI);

     ToBBI.BB->insert(ToBBI.BB->end(), MI);
     ToBBI.NonPredSize++;
diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp
index f6681540e2286..d98254650a001 100644
--- a/llvm/lib/CodeGen/InlineSpiller.cpp
+++ b/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -997,9 +997,9 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr *, unsigned>> Ops,
       HSpiller.rmFromMergeableSpills(*MI, FI))
     --NumSpills;
   LIS.ReplaceMachineInstrInMaps(*MI, *FoldMI);
-  // Update the call site info.
-  if (MI->isCandidateForCallSiteEntry())
-    MI->getMF()->moveCallSiteInfo(MI, FoldMI);
+  // Update the call info.
+  if (MI->isCandidateForAdditionalCallInfo())
+    MI->getMF()->moveAdditionalCallInfo(MI, FoldMI);

   // If we've folded a store into an instruction labelled with debug-info,
   // record a substitution from the old operand to the memory operand. Handle
diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp
index 7b630e88b2a60..0b637bf8a9c56 100644
--- a/llvm/lib/CodeGen/LiveRangeEdit.cpp
+++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp
@@ -253,9 +253,9 @@ bool LiveRangeEdit::foldAsLoad(LiveInterval *LI,
     return false;
   LLVM_DEBUG(dbgs() << "                folded: " << *FoldMI);
   LIS.ReplaceMachineInstrInMaps(*UseMI, *FoldMI);
-  // Update the call site info.
-  if (UseMI->shouldUpdateCallSiteInfo())
-    UseMI->getMF()->moveCallSiteInfo(UseMI, FoldMI);
+  // Update the call info.
+  if (UseMI->shouldUpdateAdditionalCallInfo())
+    UseMI->getMF()->moveAdditionalCallInfo(UseMI, FoldMI);
   UseMI->eraseFromParent();
   DefMI->addRegisterDead(LI->reg(), nullptr);
   Dead.push_back(DefMI);
diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp
index b6da495590fe1..0b41c90442a5d 100644
--- a/llvm/lib/CodeGen/MIRPrinter.cpp
+++ b/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -605,17 +605,14 @@ void MIRPrinter::convertCalledGlobals(yaml::MachineFunction &YMF,
                                       const MachineFunction &MF,
                                       MachineModuleSlotTracker &MST) {
   for (const auto &[CallInst, CG] : MF.getCalledGlobals()) {
-    // If the call instruction was dropped, then we don't need to print it.
- auto BB = CallInst->getParent(); - if (BB) { - yaml::MachineInstrLoc CallSite; - CallSite.BlockNum = CallInst->getParent()->getNumber(); - CallSite.Offset = std::distance(CallInst->getParent()->instr_begin(), - CallInst->getIterator()); - - yaml::CalledGlobal YamlCG{CallSite, CG.first->getName().str(), CG.second}; - YMF.CalledGlobals.push_back(YamlCG); - } + yaml::MachineInstrLoc CallSite; + CallSite.BlockNum = CallInst->getParent()->getNumber(); + CallSite.Offset = std::distance(CallInst->getParent()->instr_begin(), + CallInst->getIterator()); + + yaml::CalledGlobal YamlCG{CallSite, CG.Callee->getName().str(), + CG.TargetFlags}; + YMF.CalledGlobals.push_back(YamlCG); } // Sort by position of call instructions. diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp index e6b9538fe9a02..b8dbe834a4d51 100644 --- a/llvm/lib/CodeGen/MachineFunction.cpp +++ b/llvm/lib/CodeGen/MachineFunction.cpp @@ -459,11 +459,11 @@ MachineInstr &MachineFunction::cloneMachineInstrBundle( break; ++I; } - // Copy over call site info to the cloned instruction if needed. If Orig is in - // a bundle, copyCallSiteInfo takes care of finding the call instruction in - // the bundle. - if (Orig.shouldUpdateCallSiteInfo()) - copyCallSiteInfo(&Orig, FirstClone); + // Copy over call info to the cloned instruction if needed. If Orig is in + // a bundle, copyAdditionalCallInfo takes care of finding the call instruction + // in the bundle. + if (Orig.shouldUpdateAdditionalCallInfo()) + copyAdditionalCallInfo(&Orig, FirstClone); return *FirstClone; } @@ -476,8 +476,13 @@ void MachineFunction::deleteMachineInstr(MachineInstr *MI) { // be triggered during the implementation of support for the // call site info of a new architecture. If the assertion is triggered, // back trace will tell where to insert a call to updateCallSiteInfo(). - assert((!MI->isCandidateForCallSiteEntry() || !CallSitesInfo.contains(MI)) && + assert((!MI->isCandidateForAdditionalCallInfo() || + !CallSitesInfo.contains(MI)) && "Call site info was not updated!"); + // Verify that the "called globals" info is in a valid state. + assert((!MI->isCandidateForAdditionalCallInfo() || + !CalledGlobalsInfo.contains(MI)) && + "Called globals info was not updated!"); // Strip it for parts. The operand array and the MI object itself are // independently recyclable. 
if (MI->Operands) @@ -911,7 +916,7 @@ try_next:; MachineFunction::CallSiteInfoMap::iterator MachineFunction::getCallSiteInfo(const MachineInstr *MI) { - assert(MI->isCandidateForCallSiteEntry() && + assert(MI->isCandidateForAdditionalCallInfo() && "Call site info refers only to call (MI) candidates"); if (!Target.Options.EmitCallSiteInfo) @@ -926,59 +931,74 @@ static const MachineInstr *getCallInstr(const MachineInstr *MI) { for (const auto &BMI : make_range(getBundleStart(MI->getIterator()), getBundleEnd(MI->getIterator()))) - if (BMI.isCandidateForCallSiteEntry()) + if (BMI.isCandidateForAdditionalCallInfo()) return &BMI; llvm_unreachable("Unexpected bundle without a call site candidate"); } -void MachineFunction::eraseCallSiteInfo(const MachineInstr *MI) { - assert(MI->shouldUpdateCallSiteInfo() && - "Call site info refers only to call (MI) candidates or " +void MachineFunction::eraseAdditionalCallInfo(const MachineInstr *MI) { + assert(MI->shouldUpdateAdditionalCallInfo() && + "Call info refers only to call (MI) candidates or " "candidates inside bundles"); const MachineInstr *CallMI = getCallInstr(MI); + CallSiteInfoMap::iterator CSIt = getCallSiteInfo(CallMI); - if (CSIt == CallSitesInfo.end()) - return; - CallSitesInfo.erase(CSIt); + if (CSIt != CallSitesInfo.end()) + CallSitesInfo.erase(CSIt); + + CalledGlobalsMap::iterator CGIt = CalledGlobalsInfo.find(CallMI); + if (CGIt != CalledGlobalsInfo.end()) + CalledGlobalsInfo.erase(CGIt); } -void MachineFunction::copyCallSiteInfo(const MachineInstr *Old, - const MachineInstr *New) { - assert(Old->shouldUpdateCallSiteInfo() && - "Call site info refers only to call (MI) candidates or " +void MachineFunction::copyAdditionalCallInfo(const MachineInstr *Old, + const MachineInstr *New) { + assert(Old->shouldUpdateAdditionalCallInfo() && + "Call info refers only to call (MI) candidates or " "candidates inside bundles"); - if (!New->isCandidateForCallSiteEntry()) - return eraseCallSiteInfo(Old); + if (!New->isCandidateForAdditionalCallInfo()) + return eraseAdditionalCallInfo(Old); const MachineInstr *OldCallMI = getCallInstr(Old); CallSiteInfoMap::iterator CSIt = getCallSiteInfo(OldCallMI); - if (CSIt == CallSitesInfo.end()) - return; + if (CSIt != CallSitesInfo.end()) { + CallSiteInfo CSInfo = CSIt->second; + CallSitesInfo[New] = CSInfo; + } - CallSiteInfo CSInfo = CSIt->second; - CallSitesInfo[New] = CSInfo; + CalledGlobalsMap::iterator CGIt = CalledGlobalsInfo.find(OldCallMI); + if (CGIt != CalledGlobalsInfo.end()) { + CalledGlobalInfo CGInfo = CGIt->second; + CalledGlobalsInfo[New] = CGInfo; + } } -void MachineFunction::moveCallSiteInfo(const MachineInstr *Old, - const MachineInstr *New) { - assert(Old->shouldUpdateCallSiteInfo() && - "Call site info refers only to call (MI) candidates or " +void MachineFunction::moveAdditionalCallInfo(const MachineInstr *Old, + const MachineInstr *New) { + assert(Old->shouldUpdateAdditionalCallInfo() && + "Call info refers only to call (MI) candidates or " "candidates inside bundles"); - if (!New->isCandidateForCallSiteEntry()) - return eraseCallSiteInfo(Old); + if (!New->isCandidateForAdditionalCallInfo()) + return eraseAdditionalCallInfo(Old); const MachineInstr *OldCallMI = getCallInstr(Old); CallSiteInfoMap::iterator CSIt = getCallSiteInfo(OldCallMI); - if (CSIt == CallSitesInfo.end()) - return; + if (CSIt != CallSitesInfo.end()) { + CallSiteInfo CSInfo = std::move(CSIt->second); + CallSitesInfo.erase(CSIt); + CallSitesInfo[New] = CSInfo; + } - CallSiteInfo CSInfo = std::move(CSIt->second); - 
CallSitesInfo.erase(CSIt); - CallSitesInfo[New] = CSInfo; + CalledGlobalsMap::iterator CGIt = CalledGlobalsInfo.find(OldCallMI); + if (CGIt != CalledGlobalsInfo.end()) { + CalledGlobalInfo CGInfo = std::move(CGIt->second); + CalledGlobalsInfo.erase(CGIt); + CalledGlobalsInfo[New] = CGInfo; + } } void MachineFunction::setDebugInstrNumberingCount(unsigned Num) { diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp index 958efa79d7e9d..ef36dfc472197 100644 --- a/llvm/lib/CodeGen/MachineInstr.cpp +++ b/llvm/lib/CodeGen/MachineInstr.cpp @@ -773,7 +773,7 @@ void MachineInstr::eraseFromBundle() { getParent()->erase_instr(this); } -bool MachineInstr::isCandidateForCallSiteEntry(QueryType Type) const { +bool MachineInstr::isCandidateForAdditionalCallInfo(QueryType Type) const { if (!isCall(Type)) return false; switch (getOpcode()) { @@ -786,10 +786,10 @@ bool MachineInstr::isCandidateForCallSiteEntry(QueryType Type) const { return true; } -bool MachineInstr::shouldUpdateCallSiteInfo() const { +bool MachineInstr::shouldUpdateAdditionalCallInfo() const { if (isBundle()) - return isCandidateForCallSiteEntry(MachineInstr::AnyInBundle); - return isCandidateForCallSiteEntry(); + return isCandidateForAdditionalCallInfo(MachineInstr::AnyInBundle); + return isCandidateForAdditionalCallInfo(); } unsigned MachineInstr::getNumExplicitOperands() const { diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp index d1d5509dc482a..1f6de0d6b2416 100644 --- a/llvm/lib/CodeGen/MachineLICM.cpp +++ b/llvm/lib/CodeGen/MachineLICM.cpp @@ -1459,9 +1459,9 @@ MachineInstr *MachineLICMImpl::ExtractHoistableLoad(MachineInstr *MI, // Otherwise we successfully unfolded a load that we can hoist. - // Update the call site info. - if (MI->shouldUpdateCallSiteInfo()) - MF.eraseCallSiteInfo(MI); + // Update the call info. + if (MI->shouldUpdateAdditionalCallInfo()) + MF.eraseAdditionalCallInfo(MI); MI->eraseFromParent(); return NewMIs[0]; diff --git a/llvm/lib/CodeGen/MachineOutliner.cpp b/llvm/lib/CodeGen/MachineOutliner.cpp index 4c5489434c69b..e3fe9494acef8 100644 --- a/llvm/lib/CodeGen/MachineOutliner.cpp +++ b/llvm/lib/CodeGen/MachineOutliner.cpp @@ -1149,8 +1149,8 @@ bool MachineOutliner::outline( InstrUseRegs.insert(MOP.getReg()); } } - if (MI->isCandidateForCallSiteEntry()) - MI->getMF()->eraseCallSiteInfo(MI); + if (MI->isCandidateForAdditionalCallInfo()) + MI->getMF()->eraseAdditionalCallInfo(MI); } for (const Register &I : DefRegs) diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp index ad5796aa7f8c2..5d76d3688dfef 100644 --- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp +++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp @@ -1868,9 +1868,9 @@ bool PeepholeOptimizer::run(MachineFunction &MF) { LocalMIs.erase(MI); LocalMIs.erase(DefMI); LocalMIs.insert(FoldMI); - // Update the call site info. - if (MI->shouldUpdateCallSiteInfo()) - MI->getMF()->moveCallSiteInfo(MI, FoldMI); + // Update the call info. 
+ if (MI->shouldUpdateAdditionalCallInfo()) + MI->getMF()->moveAdditionalCallInfo(MI, FoldMI); MI->eraseFromParent(); DefMI->eraseFromParent(); MRI->markUsesInDebugValueAsUndef(FoldedReg); diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index bafe26ff7d6b7..ac6c44ec63545 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -888,9 +888,13 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) { MI = &*std::next(Before); } - if (MI->isCandidateForCallSiteEntry() && - DAG->getTarget().Options.EmitCallSiteInfo) { - MF.addCallSiteInfo(MI, DAG->getCallSiteInfo(Node)); + if (MI->isCandidateForAdditionalCallInfo()) { + if (DAG->getTarget().Options.EmitCallSiteInfo) + MF.addCallSiteInfo(MI, DAG->getCallSiteInfo(Node)); + + if (auto CalledGlobal = DAG->getCalledGlobal(Node)) + if (CalledGlobal->Callee) + MF.addCalledGlobal(MI, *CalledGlobal); } if (DAG->getNoMergeSiteInfo(Node)) { @@ -908,10 +912,6 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) { It->setMMRAMetadata(MF, MMRA); } - if (auto CalledGlobal = DAG->getCalledGlobal(Node)) - if (CalledGlobal->first) - MF.addCalledGlobal(MI, *CalledGlobal); - return MI; }; diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp index d0013923838b3..f5346c8805733 100644 --- a/llvm/lib/CodeGen/TailDuplicator.cpp +++ b/llvm/lib/CodeGen/TailDuplicator.cpp @@ -1068,10 +1068,10 @@ void TailDuplicator::removeDeadBlock( LLVM_DEBUG(dbgs() << "\nRemoving MBB: " << *MBB); MachineFunction *MF = MBB->getParent(); - // Update the call site info. + // Update the call info. for (const MachineInstr &MI : *MBB) - if (MI.shouldUpdateCallSiteInfo()) - MF->eraseCallSiteInfo(&MI); + if (MI.shouldUpdateAdditionalCallInfo()) + MF->eraseAdditionalCallInfo(&MI); if (RemovalCallback) (*RemovalCallback)(MBB); diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp index 770b851f3607a..7a905b65f26e5 100644 --- a/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -150,12 +150,12 @@ TargetInstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail, // Save off the debug loc before erasing the instruction. DebugLoc DL = Tail->getDebugLoc(); - // Update call site info and remove all the dead instructions + // Update call info and remove all the dead instructions // from the end of MBB. while (Tail != MBB->end()) { auto MI = Tail++; - if (MI->shouldUpdateCallSiteInfo()) - MBB->getParent()->eraseCallSiteInfo(&*MI); + if (MI->shouldUpdateAdditionalCallInfo()) + MBB->getParent()->eraseAdditionalCallInfo(&*MI); MBB->erase(MI); } diff --git a/llvm/lib/CodeGen/UnreachableBlockElim.cpp b/llvm/lib/CodeGen/UnreachableBlockElim.cpp index 6e3b69b4b9611..aa4ccec3338a9 100644 --- a/llvm/lib/CodeGen/UnreachableBlockElim.cpp +++ b/llvm/lib/CodeGen/UnreachableBlockElim.cpp @@ -141,10 +141,10 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) { // Actually remove the blocks now. for (MachineBasicBlock *BB : DeadBlocks) { - // Remove any call site information for calls in the block. + // Remove any call information for calls in the block. 
for (auto &I : BB->instrs()) - if (I.shouldUpdateCallSiteInfo()) - BB->getParent()->eraseCallSiteInfo(&I); + if (I.shouldUpdateAdditionalCallInfo()) + BB->getParent()->eraseAdditionalCallInfo(&I); BB->eraseFromParent(); } diff --git a/llvm/lib/CodeGen/XRayInstrumentation.cpp b/llvm/lib/CodeGen/XRayInstrumentation.cpp index 9035e10716c3a..0873d9956356e 100644 --- a/llvm/lib/CodeGen/XRayInstrumentation.cpp +++ b/llvm/lib/CodeGen/XRayInstrumentation.cpp @@ -112,8 +112,8 @@ void XRayInstrumentation::replaceRetWithPatchableRet( for (auto &MO : T.operands()) MIB.add(MO); Terminators.push_back(&T); - if (T.shouldUpdateCallSiteInfo()) - MF.eraseCallSiteInfo(&T); + if (T.shouldUpdateAdditionalCallInfo()) + MF.eraseAdditionalCallInfo(&T); } } } diff --git a/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp index 10661b6414612..a85c7b2c72536 100644 --- a/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp +++ b/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp @@ -105,9 +105,9 @@ struct LDTLSCleanup : public MachineFunctionPass { TII->get(TargetOpcode::COPY), AArch64::X0) .addReg(TLSBaseAddrReg); - // Update the call site info. - if (I.shouldUpdateCallSiteInfo()) - I.getMF()->eraseCallSiteInfo(&I); + // Update the call info. + if (I.shouldUpdateAdditionalCallInfo()) + I.getMF()->eraseAdditionalCallInfo(&I); // Erase the TLS_base_addr instruction. I.eraseFromParent(); diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index bd542f62dccb4..b44c48afe705b 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -879,8 +879,8 @@ bool AArch64ExpandPseudo::expandCALL_RVMARKER( .add(RVTarget) .getInstr(); - if (MI.shouldUpdateCallSiteInfo()) - MBB.getParent()->moveCallSiteInfo(&MI, OriginalCall); + if (MI.shouldUpdateAdditionalCallInfo()) + MBB.getParent()->moveAdditionalCallInfo(&MI, OriginalCall); MI.eraseFromParent(); finalizeBundle(MBB, OriginalCall->getIterator(), @@ -908,8 +908,8 @@ bool AArch64ExpandPseudo::expandCALL_BTI(MachineBasicBlock &MBB, .addImm(36) .getInstr(); - if (MI.shouldUpdateCallSiteInfo()) - MBB.getParent()->moveCallSiteInfo(&MI, Call); + if (MI.shouldUpdateAdditionalCallInfo()) + MBB.getParent()->moveAdditionalCallInfo(&MI, Call); MI.eraseFromParent(); finalizeBundle(MBB, Call->getIterator(), std::next(BTI->getIterator())); diff --git a/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp b/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp index a8bd899a09b85..91410a5af3dc3 100644 --- a/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp +++ b/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp @@ -475,7 +475,7 @@ void SLSHardeningInserter::convertBLRToBL( BL->removeOperand(SecondOpIdxToRemove); // Now copy over the implicit operands from the original BLR BL->copyImplicitOps(MF, BLR); - MF.moveCallSiteInfo(&BLR, BL); + MF.moveAdditionalCallInfo(&BLR, BL); // Also add the register operands of the original BLR* instruction // as being used in the called thunk. 
 for (unsigned OpIdx = 0; OpIdx < NumRegOperands; ++OpIdx) {
diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 3fda15a429017..2e5dc09c00ce6 100644
--- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -2300,10 +2300,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,

     for (unsigned i = 2, e = MBBI->getNumOperands(); i != e; ++i)
       NewMI->addOperand(MBBI->getOperand(i));
-
-    // Update call site info and delete the pseudo instruction TCRETURN.
-    if (MI.isCandidateForCallSiteEntry())
-      MI.getMF()->moveCallSiteInfo(&MI, &*NewMI);
+    // Update call info and delete the pseudo instruction TCRETURN.
+    if (MI.isCandidateForAdditionalCallInfo())
+      MI.getMF()->moveAdditionalCallInfo(&MI, &*NewMI);
     // Copy nomerge flag over to new instruction.
     if (MI.getFlag(MachineInstr::NoMerge))
       NewMI->setFlag(MachineInstr::NoMerge);
@@ -2414,8 +2413,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
     for (const MachineOperand &MO : llvm::drop_begin(MI.operands()))
       NewCall->addOperand(MO);

-    if (MI.isCandidateForCallSiteEntry())
-      MI.getMF()->moveCallSiteInfo(&MI, NewCall.getInstr());
+    if (MI.isCandidateForAdditionalCallInfo())
+      MI.getMF()->moveAdditionalCallInfo(&MI, NewCall.getInstr());

     CMSERestoreFPRegs(MBB, MBBI, DL, OriginalClearRegs); // restore FP registers

@@ -2652,9 +2651,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
       MIB.cloneMemRefs(MI);
       MIB.copyImplicitOps(MI);

-      // Update the call site info.
-      if (MI.isCandidateForCallSiteEntry())
-        MF->moveCallSiteInfo(&MI, &*MIB);
+      // Update the call info.
+      if (MI.isCandidateForAdditionalCallInfo())
+        MF->moveAdditionalCallInfo(&MI, &*MIB);
       MI.eraseFromParent();
       return true;
     }
@@ -3254,8 +3253,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
     MIB.cloneMemRefs(MI);
     for (unsigned i = 0; i < MI.getNumOperands(); ++i)
       MIB.add(MI.getOperand(i));
-    if (MI.isCandidateForCallSiteEntry())
-      MF.moveCallSiteInfo(&MI, MIB.getInstr());
+    if (MI.isCandidateForAdditionalCallInfo())
+      MF.moveAdditionalCallInfo(&MI, MIB.getInstr());
     MIBundleBuilder Bundler(MBB, MI);
     Bundler.append(MIB);
     Bundler.append(BuildMI(MF, MI.getDebugLoc(), TII->get(ARM::t2BTI)));
diff --git a/llvm/lib/Target/ARM/ARMSLSHardening.cpp b/llvm/lib/Target/ARM/ARMSLSHardening.cpp
index d77db17090feb..23acc3cfba68e 100644
--- a/llvm/lib/Target/ARM/ARMSLSHardening.cpp
+++ b/llvm/lib/Target/ARM/ARMSLSHardening.cpp
@@ -348,7 +348,7 @@ MachineBasicBlock &ARMSLSHardening::ConvertIndirectCallToIndirectJump(
     BL->removeOperand(SecondOpIdxToRemove);
   // Now copy over the implicit operands from the original IndirectCall
   BL->copyImplicitOps(MF, IndirectCall);
-  MF.moveCallSiteInfo(&IndirectCall, BL);
+  MF.moveAdditionalCallInfo(&IndirectCall, BL);
   // Also add the register called in the IndirectCall as being used in the
   // called thunk.
   BL->addOperand(MachineOperand::CreateReg(Reg, false /*isDef*/, true /*isImp*/,
diff --git a/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp b/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp
index 99d7e5c57a9bc..3a9421fae0f60 100644
--- a/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp
+++ b/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -563,9 +563,10 @@ Iter MipsDelaySlotFiller::replaceWithCompactBranch(MachineBasicBlock &MBB,
   Branch = TII->genInstrWithNewOpc(NewOpcode, Branch);

   auto *ToErase = cast<MachineInstr>(&*std::next(Branch));
-  // Update call site info for the Branch.
-  if (ToErase->shouldUpdateCallSiteInfo())
-    ToErase->getMF()->moveCallSiteInfo(ToErase, cast<MachineInstr>(&*Branch));
+  // Update call info for the Branch.
+  if (ToErase->shouldUpdateAdditionalCallInfo())
+    ToErase->getMF()->moveAdditionalCallInfo(ToErase,
+                                             cast<MachineInstr>(&*Branch));
   ToErase->eraseFromParent();
   return Branch;
 }
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index dc4fe07826191..fc8a0eaed140d 100644
--- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -230,8 +230,8 @@ void X86ExpandPseudo::expandCALL_RVMARKER(MachineBasicBlock &MBB,
           .addReg(TargetReg, RegState::Define)
           .addReg(X86::RAX)
           .getInstr();
-  if (MI.shouldUpdateCallSiteInfo())
-    MBB.getParent()->moveCallSiteInfo(&MI, Marker);
+  if (MI.shouldUpdateAdditionalCallInfo())
+    MBB.getParent()->moveAdditionalCallInfo(&MI, Marker);

   // Emit call to ObjC runtime.
   const uint32_t *RegMask =
@@ -360,9 +360,9 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
     NewMI.copyImplicitOps(*MBBI->getParent()->getParent(), *MBBI);
     NewMI.setCFIType(*MBB.getParent(), MI.getCFIType());

-    // Update the call site info.
-    if (MBBI->isCandidateForCallSiteEntry())
-      MBB.getParent()->moveCallSiteInfo(&*MBBI, &NewMI);
+    // Update the call info.
+    if (MBBI->isCandidateForAdditionalCallInfo())
+      MBB.getParent()->moveAdditionalCallInfo(&*MBBI, &NewMI);

     // Delete the pseudo instruction TCRETURN.
     MBB.erase(MBBI);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index add51fac4b9e6..434d88db04163 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -60845,8 +60845,8 @@ X86TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
     MBBI = MBB.insert(OrigCall, NewMI);
     assert(MBBI->isCall() &&
            "Unexpected instruction after memory operand unfolding");
-    if (OrigCall->shouldUpdateCallSiteInfo())
-      MF.moveCallSiteInfo(&*OrigCall, &*MBBI);
+    if (OrigCall->shouldUpdateAdditionalCallInfo())
+      MF.moveAdditionalCallInfo(&*OrigCall, &*MBBI);
     MBBI->setCFIType(MF, OrigCall->getCFIType());
     OrigCall->eraseFromParent();
     break;
diff --git a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
index ce5a1bce5b107..0ad17cc877b9f 100644
--- a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
+++ b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -918,9 +918,9 @@ void X86SpeculativeLoadHardeningPass::unfoldCallAndJumpLoads(
         for (auto *NewMI : NewMIs)
           MBB.insert(MI.getIterator(), NewMI);

-        // Update the call site info.
-        if (MI.isCandidateForCallSiteEntry())
-          MF.eraseCallSiteInfo(&MI);
+        // Update the call info.
+        if (MI.isCandidateForAdditionalCallInfo())
+          MF.eraseAdditionalCallInfo(&MI);
         MI.eraseFromParent();

         LLVM_DEBUG({

From 61e2841d8b83319e90cd44cf77770d2e41080cc2 Mon Sep 17 00:00:00 2001
From: Vladislav Dzhidzhoev
Date: Mon, 13 Jan 2025 23:04:53 +0100
Subject: [PATCH 340/408] [lldb][test] Fix some 'import-std-module' tests
 (#122358)

Some tests from 'import-std-module' used to fail on the builder
https://lab.llvm.org/staging/#/builders/195/builds/4470, since libcxx is set
up to be linked statically with test binaries on it. Thus, they were
temporarily disabled in #112530. Here, this commit is reverted.

Jitted expressions from the tests try to call the __libcpp_verbose_abort
function, which is not present in the process image; this is what causes the
failure. Here, this symbol is explicitly referenced from the test source
files.
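
In outline, each affected main.cpp gains a single anchor of the following
shape (a sketch of the mechanism; the exact per-test hunks are below):

  #include <__verbose_abort>

  // Taking the address of __libcpp_verbose_abort forces the statically
  // linked libc++ to keep its definition in the test binary, so jitted
  // expressions can resolve the symbol at runtime.
  void *libcpp_verbose_abort_ptr = (void *)&std::__libcpp_verbose_abort;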
---
 .../import-std-module/array/TestArrayFromStdModule.py        | 1 -
 .../API/commands/expression/import-std-module/array/main.cpp | 5 +++++
 .../TestDbgInfoContentVectorFromStdModule.py                 | 1 -
 .../import-std-module/vector-dbg-info-content/main.cpp       | 5 +++++
 .../vector-of-vectors/TestVectorOfVectorsFromStdModule.py    | 1 -
 .../expression/import-std-module/vector-of-vectors/main.cpp  | 5 +++++
 6 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/lldb/test/API/commands/expression/import-std-module/array/TestArrayFromStdModule.py b/lldb/test/API/commands/expression/import-std-module/array/TestArrayFromStdModule.py
index bafc762829621..13ab6b0c9ac1f 100644
--- a/lldb/test/API/commands/expression/import-std-module/array/TestArrayFromStdModule.py
+++ b/lldb/test/API/commands/expression/import-std-module/array/TestArrayFromStdModule.py
@@ -10,7 +10,6 @@ class TestCase(TestBase):

     @add_test_categories(["libc++"])
     @skipIf(compiler=no_match("clang"))
-    @skipIfLinux # https://discourse.llvm.org/t/lldb-test-failures-on-linux/80095
     def test(self):
         self.build()

diff --git a/lldb/test/API/commands/expression/import-std-module/array/main.cpp b/lldb/test/API/commands/expression/import-std-module/array/main.cpp
index 9bcd0b574042a..b71e4f876769d 100644
--- a/lldb/test/API/commands/expression/import-std-module/array/main.cpp
+++ b/lldb/test/API/commands/expression/import-std-module/array/main.cpp
@@ -1,5 +1,10 @@
+#include <__verbose_abort>
 #include <array>

+// Some expressions from the test need this symbol to be compiled when libcxx is
+// built statically.
+void *libcpp_verbose_abort_ptr = (void *)&std::__libcpp_verbose_abort;
+
 struct DbgInfo {
   int v = 4;
 };
diff --git a/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py b/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py
index 71eaeef20e792..1c32222e64f14 100644
--- a/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py
+++ b/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py
@@ -14,7 +14,6 @@ class TestDbgInfoContentVector(TestBase):
     @skipIf(compiler="clang", compiler_version=["<", "12.0"])
     @skipIf(macos_version=["<", "14.0"])
     @skipIfDarwin # https://github.com/llvm/llvm-project/issues/106475
-    @skipIfLinux # https://discourse.llvm.org/t/lldb-test-failures-on-linux/80095
     def test(self):
         self.build()

diff --git a/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/main.cpp b/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/main.cpp
index 24c3fec75d2f5..30f74acdd5e36 100644
--- a/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/main.cpp
+++ b/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/main.cpp
@@ -1,5 +1,10 @@
+#include <__verbose_abort>
 #include <vector>

+// Some expressions from the test need this symbol to be compiled when libcxx is
+// built statically.
+void *libcpp_verbose_abort_ptr = (void *)&std::__libcpp_verbose_abort;
+
 struct Foo {
   int a;
 };
diff --git a/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py b/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py
index e9415fd53651f..a1f33271f39d2 100644
--- a/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py
+++ b/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py
@@ -10,7 +10,6 @@ class TestVectorOfVectors(TestBase):

     @add_test_categories(["libc++"])
     @skipIf(compiler=no_match("clang"))
-    @skipIfLinux # https://discourse.llvm.org/t/lldb-test-failures-on-linux/80095
     def test(self):
         self.build()

diff --git a/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/main.cpp b/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/main.cpp
index b5ada909e4397..158dcf7a5f680 100644
--- a/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/main.cpp
+++ b/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/main.cpp
@@ -1,5 +1,10 @@
+#include <__verbose_abort>
 #include <vector>

+// Some expressions from the test need this symbol to be compiled when libcxx is
+// built statically.
+void *libcpp_verbose_abort_ptr = (void *)&std::__libcpp_verbose_abort;
+
 int main(int argc, char **argv) {
   std::vector<std::vector<int>> a = {{1, 2, 3}, {3, 2, 1}};
   return 0; // Set break point at this line.

From bab7920fd7ea822543b8f1aa8037d489eea2cb73 Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Mon, 13 Jan 2025 17:06:25 -0500
Subject: [PATCH 341/408] [RISCV][CG]Use processShuffleMasks for per-register
 shuffles

Patch adds usage of processShuffleMasks in codegen in
lowerShuffleViaVRegSplitting. This function is already used for X86 shuffle
cost estimation and in DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE; reusing
it here unifies the code.

Reviewers: topperc, wangpc-pp, lukel97, preames

Reviewed By: preames

Pull Request: https://github.com/llvm/llvm-project/pull/121765
---
 llvm/include/llvm/Analysis/VectorUtils.h          |   3 +-
 llvm/lib/Analysis/VectorUtils.cpp                 |  10 +-
 .../SelectionDAG/LegalizeVectorTypes.cpp          |   4 +-
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp       | 121 ++++++++++------
 .../lib/Target/X86/X86TargetTransformInfo.cpp     |   6 +-
 .../rvv/fixed-vectors-shuffle-exact-vlen.ll       | 102 +++++++--------
 6 files changed, 144 insertions(+), 102 deletions(-)

diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
index a903eaa6cbe54..5d41d1cd14ef4 100644
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -270,7 +270,8 @@ void processShuffleMasks(
     ArrayRef<int> Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs,
     unsigned NumOfUsedRegs, function_ref<void()> NoInputAction,
     function_ref<void(ArrayRef<int>, unsigned, unsigned)> SingleInputAction,
-    function_ref<void(ArrayRef<int>, unsigned, unsigned)> ManyInputsAction);
+    function_ref<void(ArrayRef<int>, unsigned, unsigned, bool)>
+        ManyInputsAction);

 /// Compute the demanded elements mask of horizontal binary operations. A
 /// horizontal operation combines two adjacent elements in a vector operand.
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index b4b311cb727a1..ad80e458ab57d 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -557,7 +557,8 @@ void llvm::processShuffleMasks( ArrayRef<int> Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, unsigned NumOfUsedRegs, function_ref<void()> NoInputAction, function_ref<void(ArrayRef<int>, unsigned, unsigned)> SingleInputAction, - function_ref<void(ArrayRef<int>, unsigned, unsigned)> ManyInputsAction) { + function_ref<void(ArrayRef<int>, unsigned, unsigned, bool)> + ManyInputsAction) { SmallVector<SmallVector<SmallVector<int>>> Res(NumOfDestRegs); // Try to perform better estimation of the permutation. // 1. Split the source/destination vectors into real registers. @@ -628,6 +629,7 @@ void llvm::processShuffleMasks( } }; int SecondIdx; + bool NewReg = true; do { int FirstIdx = -1; SecondIdx = -1; SecondIdx = I; SecondMask = RegMask; CombineMasks(FirstMask, SecondMask); - ManyInputsAction(FirstMask, FirstIdx, SecondIdx); + ManyInputsAction(FirstMask, FirstIdx, SecondIdx, NewReg); + NewReg = false; NormalizeMask(FirstMask); RegMask.clear(); SecondMask = FirstMask; } if (FirstIdx != SecondIdx && SecondIdx >= 0) { CombineMasks(SecondMask, FirstMask); - ManyInputsAction(SecondMask, SecondIdx, FirstIdx); + ManyInputsAction(SecondMask, SecondIdx, FirstIdx, NewReg); + NewReg = false; Dest[FirstIdx].clear(); NormalizeMask(SecondMask); } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 5117eb8d91dfb..f39d9ca15496a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -3059,8 +3059,8 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, Inputs[Idx] = Output; }, [&AccumulateResults, &Output, &DAG = DAG, NewVT, &DL, &Inputs, - &TmpInputs, - &BuildVector](ArrayRef<int> Mask, unsigned Idx1, unsigned Idx2) { + &TmpInputs, &BuildVector](ArrayRef<int> Mask, unsigned Idx1, + unsigned Idx2, bool /*Unused*/) { if (AccumulateResults(Idx1)) { if (Inputs[Idx1]->getOpcode() == ISD::BUILD_VECTOR && Inputs[Idx2]->getOpcode() == ISD::BUILD_VECTOR) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 95f1deed8b6c0..b25cb128bce9f 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -5104,7 +5104,6 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN, SDValue V1 = SVN->getOperand(0); SDValue V2 = SVN->getOperand(1); ArrayRef<int> Mask = SVN->getMask(); - unsigned NumElts = VT.getVectorNumElements(); // If we don't know exact data layout, not much we can do. If this // is already m1 or smaller, no point in splitting further. @@ -5121,58 +5120,102 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN, MVT ElemVT = VT.getVectorElementType(); unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits(); - unsigned VRegsPerSrc = NumElts / ElemsPerVReg; - - SmallVector<std::pair<int, SmallVector<int>>> - OutMasks(VRegsPerSrc, {-1, {}}); - - // Check if our mask can be done as a 1-to-1 mapping from source - // to destination registers in the group without needing to - // write each destination more than once. 
- for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx++) { - int DstVecIdx = DstIdx / ElemsPerVReg; - int DstSubIdx = DstIdx % ElemsPerVReg; - int SrcIdx = Mask[DstIdx]; - if (SrcIdx < 0 || (unsigned)SrcIdx >= 2 * NumElts) - continue; - int SrcVecIdx = SrcIdx / ElemsPerVReg; - int SrcSubIdx = SrcIdx % ElemsPerVReg; - if (OutMasks[DstVecIdx].first == -1) - OutMasks[DstVecIdx].first = SrcVecIdx; - if (OutMasks[DstVecIdx].first != SrcVecIdx) - // Note: This case could easily be handled by keeping track of a chain - // of source values and generating two element shuffles below. This is - // less an implementation question, and more a profitability one. - return SDValue(); - - OutMasks[DstVecIdx].second.resize(ElemsPerVReg, -1); - OutMasks[DstVecIdx].second[DstSubIdx] = SrcSubIdx; - } EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget); MVT OneRegVT = MVT::getVectorVT(ElemVT, ElemsPerVReg); MVT M1VT = getContainerForFixedLengthVector(DAG, OneRegVT, Subtarget); assert(M1VT == getLMUL1VT(M1VT)); unsigned NumOpElts = M1VT.getVectorMinNumElements(); - SDValue Vec = DAG.getUNDEF(ContainerVT); + unsigned NumElts = ContainerVT.getVectorMinNumElements(); + unsigned NumOfSrcRegs = NumElts / NumOpElts; + unsigned NumOfDestRegs = NumElts / NumOpElts; // The following semantically builds up a fixed length concat_vector // of the component shuffle_vectors. We eagerly lower to scalable here // to avoid DAG combining it back to a large shuffle_vector again. V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget); V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget); - for (unsigned DstVecIdx = 0 ; DstVecIdx < OutMasks.size(); DstVecIdx++) { - auto &[SrcVecIdx, SrcSubMask] = OutMasks[DstVecIdx]; - if (SrcVecIdx == -1) - continue; - unsigned ExtractIdx = (SrcVecIdx % VRegsPerSrc) * NumOpElts; - SDValue SrcVec = (unsigned)SrcVecIdx >= VRegsPerSrc ? V2 : V1; + SmallVector<SmallVector<std::tuple<unsigned, unsigned, SmallVector<int>>>> + Operands; + processShuffleMasks( + Mask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, + [&]() { Operands.emplace_back(); }, + [&](ArrayRef<int> SrcSubMask, unsigned SrcVecIdx, unsigned DstVecIdx) { + Operands.emplace_back().emplace_back( + SrcVecIdx, UINT_MAX, + SmallVector<int>(SrcSubMask.begin(), SrcSubMask.end())); + }, + [&](ArrayRef<int> SrcSubMask, unsigned Idx1, unsigned Idx2, bool NewReg) { + if (NewReg) + Operands.emplace_back(); + Operands.back().emplace_back( + Idx1, Idx2, SmallVector<int>(SrcSubMask.begin(), SrcSubMask.end())); + }); + assert(Operands.size() == NumOfDestRegs && "Whole vector must be processed"); + // Note: check that we do not emit too many shuffles here to prevent code + // size explosion. + // TODO: investigate if it can be improved by extra analysis of the masks to + // check if the code is more profitable. 
+ unsigned NumShuffles = std::accumulate( + Operands.begin(), Operands.end(), 0u, + [&](unsigned N, + ArrayRef<std::tuple<unsigned, unsigned, SmallVector<int>>> Data) { + if (Data.empty()) + return N; + N += Data.size(); + for (const auto &P : Data) { + unsigned Idx2 = std::get<1>(P); + ArrayRef<int> Mask = std::get<2>(P); + if (Idx2 != UINT_MAX) + ++N; + else if (ShuffleVectorInst::isIdentityMask(Mask, Mask.size())) + --N; + } + return N; + }); + if ((NumOfDestRegs > 2 && NumShuffles > NumOfDestRegs) || + (NumOfDestRegs <= 2 && NumShuffles >= 4)) + return SDValue(); + auto ExtractValue = [&, &DAG = DAG](SDValue SrcVec, unsigned ExtractIdx) { SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, SrcVec, DAG.getVectorIdxConstant(ExtractIdx, DL)); SubVec = convertFromScalableVector(OneRegVT, SubVec, DAG, Subtarget); - SubVec = DAG.getVectorShuffle(OneRegVT, DL, SubVec, SubVec, SrcSubMask); - SubVec = convertToScalableVector(M1VT, SubVec, DAG, Subtarget); - unsigned InsertIdx = DstVecIdx * NumOpElts; - Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, Vec, SubVec, + return SubVec; + }; + auto PerformShuffle = [&, &DAG = DAG](SDValue SubVec1, SDValue SubVec2, + ArrayRef<int> Mask) { + SDValue SubVec = DAG.getVectorShuffle(OneRegVT, DL, SubVec1, SubVec2, Mask); + return SubVec; + }; + SDValue Vec = DAG.getUNDEF(ContainerVT); + for (auto [I, Data] : enumerate(Operands)) { + if (Data.empty()) + continue; + SmallDenseMap<unsigned, SDValue> Values; + for (unsigned I : seq<unsigned>(Data.size())) { + const auto &[Idx1, Idx2, _] = Data[I]; + if (Values.contains(Idx1)) { + assert(Idx2 != UINT_MAX && Values.contains(Idx2) && + "Expected both indices to be extracted already."); + break; + } + SDValue V = ExtractValue(Idx1 >= NumOfSrcRegs ? V2 : V1, + (Idx1 % NumOfSrcRegs) * NumOpElts); + Values[Idx1] = V; + if (Idx2 != UINT_MAX) + Values[Idx2] = ExtractValue(Idx2 >= NumOfSrcRegs ? V2 : V1, + (Idx2 % NumOfSrcRegs) * NumOpElts); + } + SDValue V; + for (const auto &[Idx1, Idx2, Mask] : Data) { + SDValue V1 = Values.at(Idx1); + SDValue V2 = Idx2 == UINT_MAX ? 
V1 : Values.at(Idx2); + V = PerformShuffle(V1, V2, Mask); + Values[Idx1] = V; + } + + unsigned InsertIdx = I * NumOpElts; + V = convertToScalableVector(M1VT, V, DAG, Subtarget); + Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, Vec, V, DAG.getVectorIdxConstant(InsertIdx, DL)); } return convertFromScalableVector(VT, Vec, DAG, Subtarget); diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index c19bcfc5524cc..413b54343ef0e 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1774,9 +1774,9 @@ InstructionCost X86TTIImpl::getShuffleCost( PrevSrcReg = SrcReg; PrevRegMask = RegMask; }, - [this, SingleOpTy, CostKind, &Cost](ArrayRef RegMask, - unsigned /*Unused*/, - unsigned /*Unused*/) { + [this, SingleOpTy, CostKind, + &Cost](ArrayRef RegMask, unsigned /*Unused*/, + unsigned /*Unused*/, bool /*Unused*/) { Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask, CostKind, 0, nullptr); }); diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll index 4603c0d24f5d7..54d0acc3ba8b5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll @@ -168,12 +168,11 @@ define <4 x i64> @m2_splat_into_slide_two_source_v2_lo(<4 x i64> %v1, <4 x i64> define <4 x i64> @m2_splat_into_slide_two_source(<4 x i64> %v1, <4 x i64> %v2) vscale_range(2,2) { ; CHECK-LABEL: m2_splat_into_slide_two_source: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vmv.v.i v0, 12 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v13, v10, 1 +; CHECK-NEXT: vslideup.vi v13, v11, 1 ; CHECK-NEXT: vrgather.vi v12, v8, 0 -; CHECK-NEXT: vslideup.vi v12, v10, 1, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %res = shufflevector <4 x i64> %v1, <4 x i64> %v2, <4 x i32> ret <4 x i64> %res @@ -183,18 +182,17 @@ define void @shuffle1(ptr %explicit_0, ptr %explicit_1) vscale_range(2,2) { ; CHECK-LABEL: shuffle1: ; CHECK: # %bb.0: ; CHECK-NEXT: addi a0, a0, 252 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: vsetivli zero, 3, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: li a0, 175 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vsrl.vi v8, v8, 1 -; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vadd.vi v8, v8, 1 -; CHECK-NEXT: vrgather.vv v11, v9, v8 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vmerge.vim v8, v10, 0, v0 +; CHECK-NEXT: vle32.v v11, (a0) +; CHECK-NEXT: vmv.v.i v0, 5 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vsrl.vi v10, v10, 1 +; CHECK-NEXT: vadd.vi v10, v10, 1 +; CHECK-NEXT: vrgather.vv v9, v11, v10, v0.t ; CHECK-NEXT: addi a0, a1, 672 ; CHECK-NEXT: vs2r.v v8, (a0) ; CHECK-NEXT: ret @@ -211,15 +209,15 @@ define void @shuffle1(ptr %explicit_0, ptr %explicit_1) vscale_range(2,2) { define <16 x float> @shuffle2(<4 x float> %a) vscale_range(2,2) { ; CHECK-LABEL: shuffle2: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vid.v v9 -; CHECK-NEXT: li a0, -97 -; CHECK-NEXT: vadd.vv v9, v9, v9 -; CHECK-NEXT: vrsub.vi v9, v9, 4 -; CHECK-NEXT: 
vmv.s.x v0, a0 -; CHECK-NEXT: vrgather.vv v13, v8, v9 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vmerge.vim v8, v12, 0, v0 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vid.v v13 +; CHECK-NEXT: vadd.vv v13, v13, v13 +; CHECK-NEXT: vmv.v.i v0, 6 +; CHECK-NEXT: vrsub.vi v13, v13, 4 +; CHECK-NEXT: vrgather.vv v9, v12, v13, v0.t ; CHECK-NEXT: ret %b = extractelement <4 x float> %a, i32 2 %c = insertelement <16 x float> , float %b, i32 5 @@ -231,16 +229,15 @@ define <16 x float> @shuffle2(<4 x float> %a) vscale_range(2,2) { define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) vscale_range(2,2) { ; RV32-LABEL: extract_any_extend_vector_inreg_v16i64: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vmv.v.i v16, 0 -; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 1 ; RV32-NEXT: li a1, 32 -; RV32-NEXT: vrgather.vi v16, v8, 15, v0.t -; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32-NEXT: vrgather.vi v18, v15, 1, v0.t +; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma ; RV32-NEXT: vslidedown.vx v8, v16, a0 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret @@ -258,13 +255,14 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) vsca ; RV64-NEXT: addi s0, sp, 256 ; RV64-NEXT: .cfi_def_cfa s0, 0 ; RV64-NEXT: andi sp, sp, -128 -; RV64-NEXT: li a1, -17 +; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV64-NEXT: vmv.v.i v0, 1 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vmv.s.x v0, a1 -; RV64-NEXT: vrgather.vi v16, v8, 15 -; RV64-NEXT: vmerge.vim v8, v16, 0, v0 +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; RV64-NEXT: vrgather.vi v18, v15, 1, v0.t ; RV64-NEXT: mv s2, sp -; RV64-NEXT: vs8r.v v8, (s2) +; RV64-NEXT: vs8r.v v16, (s2) ; RV64-NEXT: andi a0, a0, 15 ; RV64-NEXT: li a1, 8 ; RV64-NEXT: call __muldi3 @@ -290,21 +288,16 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) vsca define <4 x double> @shuffles_add(<4 x double> %0, <4 x double> %1) vscale_range(2,2) { ; CHECK-LABEL: shuffles_add: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: vmv1r.v v13, v10 +; CHECK-NEXT: vslideup.vi v13, v11, 1 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vmv.v.i v0, 1 +; CHECK-NEXT: vrgather.vi v12, v9, 0 +; CHECK-NEXT: vmv1r.v v9, v11 +; CHECK-NEXT: vrgather.vi v9, v10, 1, v0.t ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vrgather.vi v12, v8, 2 -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vid.v v14 -; CHECK-NEXT: vmv.v.i v0, 12 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vrgather.vi v16, v8, 3 -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vadd.vv v8, v14, v14 -; CHECK-NEXT: vadd.vi v9, v8, -4 -; CHECK-NEXT: vadd.vi v8, v8, -3 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vrgatherei16.vv v12, v10, v9, v0.t -; CHECK-NEXT: vrgatherei16.vv v16, v10, v8, v0.t -; CHECK-NEXT: vfadd.vv v8, v12, v16 +; CHECK-NEXT: vfadd.vv v8, v12, v8 ; CHECK-NEXT: ret %3 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> %4 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> @@ -332,12 +325,13 @@ entry: define 
<16 x i32> @m4_linear_num_of_shuffles_in_chunks(<16 x i32> %0) vscale_range(2,2) { ; CHECK-LABEL: m4_linear_num_of_shuffles_in_chunks: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a0, %hi(.LCPI18_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI18_0) -; CHECK-NEXT: vl2re16.v v16, (a0) -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vrgatherei16.vv v12, v8, v16 -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 8 +; CHECK-NEXT: vrgather.vi v12, v10, 0 +; CHECK-NEXT: vrgather.vi v12, v11, 0, v0.t +; CHECK-NEXT: vrgather.vi v14, v8, 2 +; CHECK-NEXT: vrgather.vi v15, v10, 3 +; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret entry: %1 = shufflevector <16 x i32> %0, <16 x i32> poison, <16 x i32> From ad56f6267f6b208c46074d9f58464f171418d619 Mon Sep 17 00:00:00 2001 From: Congcong Cai Date: Tue, 14 Jan 2025 06:12:04 +0800 Subject: [PATCH 342/408] [APFloat][NFC] extract `fltSemantics::isRepresentableBy` to header (#122636) isRepresentableBy is useful for checking floating-point type compatibility. --- llvm/include/llvm/ADT/APFloat.h | 5 +++++ llvm/lib/Support/APFloat.cpp | 20 +++++++++----------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h index bf80fa5a06580..9792749230cbf 100644 --- a/llvm/include/llvm/ADT/APFloat.h +++ b/llvm/include/llvm/ADT/APFloat.h @@ -281,6 +281,11 @@ struct APFloatBase { /// anything real. static const fltSemantics &Bogus() LLVM_READNONE; + // Returns true if any number described by this semantics can be precisely + // represented by the specified semantics. Does not take into account + // the value of fltNonfiniteBehavior, hasZero, hasSignedRepr. + static bool isRepresentableBy(const fltSemantics &A, const fltSemantics &B); + /// @} /// IEEE-754R 5.11: Floating Point Comparison Relations. diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp index c9adfca8b3b76..b0d92ae37fe8f 100644 --- a/llvm/lib/Support/APFloat.cpp +++ b/llvm/lib/Support/APFloat.cpp @@ -125,14 +125,6 @@ struct fltSemantics { /* Whether this semantics can represent signed values */ bool hasSignedRepr = true; - - // Returns true if any number described by this semantics can be precisely - // represented by the specified semantics. Does not take into account - // the value of fltNonfiniteBehavior. 
- bool isRepresentableBy(const fltSemantics &S) const { - return maxExponent <= S.maxExponent && minExponent >= S.minExponent && - precision <= S.precision; - } }; static constexpr fltSemantics semIEEEhalf = {15, -14, 11, 16}; @@ -290,6 +282,12 @@ const fltSemantics &APFloatBase::x87DoubleExtended() { } const fltSemantics &APFloatBase::Bogus() { return semBogus; } +bool APFloatBase::isRepresentableBy(const fltSemantics &A, + const fltSemantics &B) { + return A.maxExponent <= B.maxExponent && A.minExponent >= B.minExponent && + A.precision <= B.precision; +} + constexpr RoundingMode APFloatBase::rmNearestTiesToEven; constexpr RoundingMode APFloatBase::rmTowardPositive; constexpr RoundingMode APFloatBase::rmTowardNegative; @@ -5527,7 +5525,7 @@ APFloat::opStatus APFloat::convertToInteger(APSInt &result, double APFloat::convertToDouble() const { if (&getSemantics() == (const llvm::fltSemantics *)&semIEEEdouble) return getIEEE().convertToDouble(); - assert(getSemantics().isRepresentableBy(semIEEEdouble) && + assert(isRepresentableBy(getSemantics(), semIEEEdouble) && "Float semantics is not representable by IEEEdouble"); APFloat Temp = *this; bool LosesInfo; @@ -5541,7 +5539,7 @@ double APFloat::convertToDouble() const { float128 APFloat::convertToQuad() const { if (&getSemantics() == (const llvm::fltSemantics *)&semIEEEquad) return getIEEE().convertToQuad(); - assert(getSemantics().isRepresentableBy(semIEEEquad) && + assert(isRepresentableBy(getSemantics(), semIEEEquad) && "Float semantics is not representable by IEEEquad"); APFloat Temp = *this; bool LosesInfo; @@ -5555,7 +5553,7 @@ float128 APFloat::convertToQuad() const { float APFloat::convertToFloat() const { if (&getSemantics() == (const llvm::fltSemantics *)&semIEEEsingle) return getIEEE().convertToFloat(); - assert(getSemantics().isRepresentableBy(semIEEEsingle) && + assert(isRepresentableBy(getSemantics(), semIEEEsingle) && "Float semantics is not representable by IEEEsingle"); APFloat Temp = *this; bool LosesInfo; From ab023199d595187d248abe67aa2fd8635be51fdb Mon Sep 17 00:00:00 2001 From: Congcong Cai Date: Tue, 14 Jan 2025 06:15:50 +0800 Subject: [PATCH 343/408] [clang-tidy] fix wrong float to float conversion check when floating point type is not standard type (#122637) Comparing type kinds is the wrong way to check floating-point type compatibility; a more generic compatibility check is needed. 
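To see why the semantics-based test is the right one, here is a small sketch (assuming clang's `ASTContext::getFloatTypeSemantics` and the `APFloatBase::isRepresentableBy` helper extracted above; `isNarrowingFloatCast` is a hypothetical name invented for this note):

```cpp
#include "clang/AST/ASTContext.h"
#include "llvm/ADT/APFloat.h"

// Sketch: decide whether a From -> To cast narrows by comparing the actual
// float semantics instead of BuiltinType kind order, which carries no
// meaning for non-standard types such as _Float16 or __bf16.
bool isNarrowingFloatCast(const clang::ASTContext &Ctx, clang::QualType To,
                          clang::QualType From) {
  const llvm::fltSemantics &ToSem = Ctx.getFloatTypeSemantics(To);
  const llvm::fltSemantics &FromSem = Ctx.getFloatTypeSemantics(From);
  // Narrowing iff some value of From is not exactly representable in To.
  return !llvm::APFloatBase::isRepresentableBy(FromSem, ToSem);
}
```

With this, `float` to `_Float16` is flagged (a 24-bit significand cannot fit in 11 bits) while `_Float16` to `float` is not, matching the test update below.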
--- .../clang-tidy/bugprone/NarrowingConversionsCheck.cpp | 4 +++- clang-tools-extra/docs/ReleaseNotes.rst | 5 +++++ ...rrowing-conversions-narrowingfloatingpoint-option.cpp | 9 +++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.cpp index 408390ebc70b6..bafcd402ca851 100644 --- a/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.cpp @@ -513,7 +513,9 @@ void NarrowingConversionsCheck::handleFloatingCast(const ASTContext &Context, return; } const BuiltinType *FromType = getBuiltinType(Rhs); - if (ToType->getKind() < FromType->getKind()) + if (!llvm::APFloatBase::isRepresentableBy( + Context.getFloatTypeSemantics(FromType->desugar()), + Context.getFloatTypeSemantics(ToType->desugar()))) diagNarrowType(SourceLoc, Lhs, Rhs); } } diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 375de831f0e11..3fe2f0ce01bcc 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -210,6 +210,11 @@ Changes in existing checks <clang-tidy/checks/bugprone/forwarding-reference-overload>` check by fixing a crash when determining if an ``enable_if[_t]`` was found. +- Improved :doc:`bugprone-narrowing-conversions + <clang-tidy/checks/bugprone/narrowing-conversions>` to avoid incorrect check + results when the floating-point type is not ``float``, ``double``, or + ``long double``. + - Improved :doc:`bugprone-optional-value-conversion <clang-tidy/checks/bugprone/optional-value-conversion>` to support detecting conversion directly by ``std::make_unique`` and ``std::make_shared``. diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-narrowingfloatingpoint-option.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-narrowingfloatingpoint-option.cpp index 9ded2f0923f4e..180b789e45bb3 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-narrowingfloatingpoint-option.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-narrowingfloatingpoint-option.cpp @@ -36,6 +36,15 @@ void narrow_double_to_float_not_ok(double d) { f = narrow_double_to_float_return(); } +float narrow_float16_to_float_return(_Float16 f) { + return f; +} + +_Float16 narrow_float_to_float16_return(float f) { + return f; + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: narrowing conversion from 'float' to '_Float16' [bugprone-narrowing-conversions] +} + void narrow_fp_constants() { float f; f = 0.5; // [dcl.init.list] 7.2 : in-range fp constant to narrower float is not a narrowing. From 251ef3f5037d3e89fa457d146fb2521215e13feb Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Mon, 13 Jan 2025 23:16:57 +0100 Subject: [PATCH 344/408] [LLD][COFF] Use appropriate symbol table for -include argument on ARM64X (#122554) Move `LinkerDriver::addUndefined` to `SymbolTable` to allow its use with both symbol tables on ARM64X and rename it to `addGCRoot` to clarify its distinct role compared to the existing `SymbolTable::addUndefined`. Command-line `-include` arguments now apply to the EC symbol table, with `mainSymtab` introduced in `linkerMain`. There will be more similar cases. For `.drectve` sections, the corresponding symbol table is used based on the context. 
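The resulting routing can be condensed into the following sketch (taken from the diff below, simplified):

```cpp
// A .drectve /include travels with the file that carried it, so it lands
// in that file's own symbol table (native object -> native table,
// EC object -> EC table):
for (StringRef inc : directives.includes)
  file->symtab.addGCRoot(inc);

// A command-line /include goes through mainSymtab, which on ARM64X
// resolves to the EC symbol table:
SymbolTable &mainSymtab = ctx.hybridSymtab ? *ctx.hybridSymtab : ctx.symtab;
for (auto *arg : args.filtered(OPT_incl))
  mainSymtab.addGCRoot(arg->getValue());
```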
--- lld/COFF/Driver.cpp | 64 +++++++++++------------------------ lld/COFF/Driver.h | 2 -- lld/COFF/SymbolTable.cpp | 29 ++++++++++++++++ lld/COFF/SymbolTable.h | 3 ++ lld/test/COFF/arm64x-incl.s | 66 +++++++++++++++++++++++++++++++++++++ 5 files changed, 117 insertions(+), 47 deletions(-) create mode 100644 lld/test/COFF/arm64x-incl.s diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index 791382fd9bdd4..0d89457046a50 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -479,7 +479,7 @@ void LinkerDriver::parseDirectives(InputFile *file) { // Handle /include: in bulk. for (StringRef inc : directives.includes) - addUndefined(inc); + file->symtab.addGCRoot(inc); // Handle /exclude-symbols: in bulk. for (StringRef e : directives.excludes) { @@ -505,13 +505,13 @@ void LinkerDriver::parseDirectives(InputFile *file) { case OPT_entry: if (!arg->getValue()[0]) Fatal(ctx) << "missing entry point symbol name"; - ctx.config.entry = addUndefined(mangle(arg->getValue()), true); + ctx.config.entry = file->symtab.addGCRoot(mangle(arg->getValue()), true); break; case OPT_failifmismatch: checkFailIfMismatch(arg->getValue(), file); break; case OPT_incl: - addUndefined(arg->getValue()); + file->symtab.addGCRoot(arg->getValue()); break; case OPT_manifestdependency: ctx.config.manifestDependencies.insert(arg->getValue()); @@ -805,35 +805,6 @@ void LinkerDriver::addLibSearchPaths() { } } -Symbol *LinkerDriver::addUndefined(StringRef name, bool aliasEC) { - Symbol *b = ctx.symtab.addUndefined(name); - if (!b->isGCRoot) { - b->isGCRoot = true; - ctx.config.gcroot.push_back(b); - } - - // On ARM64EC, a symbol may be defined in either its mangled or demangled form - // (or both). Define an anti-dependency symbol that binds both forms, similar - // to how compiler-generated code references external functions. 
- if (aliasEC && isArm64EC(ctx.config.machine)) { - if (std::optional<std::string> mangledName = - getArm64ECMangledFunctionName(name)) { - auto u = dyn_cast<Undefined>(b); - if (u && !u->weakAlias) { - Symbol *t = ctx.symtab.addUndefined(saver().save(*mangledName)); - u->setWeakAlias(t, true); - } - } else if (std::optional<std::string> demangledName = - getArm64ECDemangledFunctionName(name)) { - Symbol *us = ctx.symtab.addUndefined(saver().save(*demangledName)); - auto u = dyn_cast<Undefined>(us); - if (u && !u->weakAlias) - u->setWeakAlias(b, true); - } - } - return b; -} - void LinkerDriver::addUndefinedGlob(StringRef arg) { Expected<GlobPattern> pat = GlobPattern::create(arg); if (!pat) { @@ -849,7 +820,7 @@ void LinkerDriver::addUndefinedGlob(StringRef arg) { }); for (Symbol *sym : syms) - addUndefined(sym->getName()); + ctx.symtab.addGCRoot(sym->getName()); } StringRef LinkerDriver::mangleMaybe(Symbol *s) { @@ -1487,7 +1458,7 @@ void LinkerDriver::maybeCreateECExportThunk(StringRef name, Symbol *&sym) { expName = saver().save("EXP+" + *mangledName); else expName = saver().save("EXP+" + name); - sym = addUndefined(expName); + sym = ctx.symtabEC->addGCRoot(expName); if (auto undef = dyn_cast<Undefined>(sym)) { if (!undef->getWeakAlias()) { auto thunk = make<ECExportThunkChunk>(def); @@ -1537,7 +1508,8 @@ void LinkerDriver::createECExportThunks() { void LinkerDriver::pullArm64ECIcallHelper() { if (!ctx.config.arm64ECIcallHelper) - ctx.config.arm64ECIcallHelper = addUndefined("__icall_helper_arm64ec"); + ctx.config.arm64ECIcallHelper = + ctx.symtabEC->addGCRoot("__icall_helper_arm64ec"); } // In MinGW, if no symbols are chosen to be exported, then all symbols are @@ -1976,6 +1948,7 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) { setMachine(machine); } } + SymbolTable &mainSymtab = ctx.hybridSymtab ? *ctx.hybridSymtab : ctx.symtab; // Handle /nodefaultlib: { @@ -2062,7 +2035,7 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) { // Handle /include for (auto *arg : args.filtered(OPT_incl)) - addUndefined(arg->getValue()); + mainSymtab.addGCRoot(arg->getValue()); // Handle /implib if (auto *arg = args.getLastArg(OPT_implib)) @@ -2493,22 +2466,22 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) { if (auto *arg = args.getLastArg(OPT_entry)) { if (!arg->getValue()[0]) Fatal(ctx) << "missing entry point symbol name"; - config->entry = addUndefined(mangle(arg->getValue()), true); + config->entry = ctx.symtab.addGCRoot(mangle(arg->getValue()), true); } else if (!config->entry && !config->noEntry) { if (args.hasArg(OPT_dll)) { StringRef s = (config->machine == I386) ? "__DllMainCRTStartup@12" : "_DllMainCRTStartup"; - config->entry = addUndefined(s, true); + config->entry = ctx.symtab.addGCRoot(s, true); } else if (config->driverWdm) { // /driver:wdm implies /entry:_NtProcessStartup - config->entry = addUndefined(mangle("_NtProcessStartup"), true); + config->entry = ctx.symtab.addGCRoot(mangle("_NtProcessStartup"), true); } else { // Windows specific -- If entry point name is not given, we need to // infer that from user-defined entry name. 
StringRef s = findDefaultEntry(); if (s.empty()) Fatal(ctx) << "entry point must be defined"; - config->entry = addUndefined(s, true); + config->entry = ctx.symtab.addGCRoot(s, true); Log(ctx) << "Entry name inferred: " << s; } } @@ -2520,9 +2493,10 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) { for (auto *arg : args.filtered(OPT_delayload)) { config->delayLoads.insert(StringRef(arg->getValue()).lower()); if (config->machine == I386) { - config->delayLoadHelper = addUndefined("___delayLoadHelper2@8"); + config->delayLoadHelper = ctx.symtab.addGCRoot("___delayLoadHelper2@8"); } else { - config->delayLoadHelper = addUndefined("__delayLoadHelper2", true); + config->delayLoadHelper = + ctx.symtab.addGCRoot("__delayLoadHelper2", true); } } } @@ -2659,7 +2633,7 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) { for (Export &e : config->exports) { if (!e.forwardTo.empty()) continue; - e.sym = addUndefined(e.name, !e.data); + e.sym = ctx.symtab.addGCRoot(e.name, !e.data); if (e.source != ExportSource::Directives) e.symbolName = mangleMaybe(e.sym); } @@ -2701,13 +2675,13 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) { // Windows specific -- if __load_config_used can be resolved, resolve it. if (ctx.symtab.findUnderscore("_load_config_used")) - addUndefined(mangle("_load_config_used")); + ctx.symtab.addGCRoot(mangle("_load_config_used")); if (args.hasArg(OPT_include_optional)) { // Handle /includeoptional for (auto *arg : args.filtered(OPT_include_optional)) if (isa_and_nonnull<Undefined>(ctx.symtab.find(arg->getValue()))) - addUndefined(arg->getValue()); + ctx.symtab.addGCRoot(arg->getValue()); } } while (run()); } diff --git a/lld/COFF/Driver.h b/lld/COFF/Driver.h index 5132568904298..9d4f1cbfcb584 100644 --- a/lld/COFF/Driver.h +++ b/lld/COFF/Driver.h @@ -173,8 +173,6 @@ class LinkerDriver { std::set<std::string> visitedLibs; - Symbol *addUndefined(StringRef sym, bool aliasEC = false); - void addUndefinedGlob(StringRef arg); StringRef mangleMaybe(Symbol *s); diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp index ae88675ab93a1..b2f3ffe780e5d 100644 --- a/lld/COFF/SymbolTable.cpp +++ b/lld/COFF/SymbolTable.cpp @@ -651,6 +651,35 @@ Symbol *SymbolTable::addUndefined(StringRef name, InputFile *f, return s; } +Symbol *SymbolTable::addGCRoot(StringRef name, bool aliasEC) { + Symbol *b = addUndefined(name); + if (!b->isGCRoot) { + b->isGCRoot = true; + ctx.config.gcroot.push_back(b); + } + + // On ARM64EC, a symbol may be defined in either its mangled or demangled form + // (or both). Define an anti-dependency symbol that binds both forms, similar + // to how compiler-generated code references external functions. + if (aliasEC && isEC()) { + if (std::optional<std::string> mangledName = + getArm64ECMangledFunctionName(name)) { + auto u = dyn_cast<Undefined>(b); + if (u && !u->weakAlias) { + Symbol *t = addUndefined(saver().save(*mangledName)); + u->setWeakAlias(t, true); + } + } else if (std::optional<std::string> demangledName = + getArm64ECDemangledFunctionName(name)) { + Symbol *us = addUndefined(saver().save(*demangledName)); + auto u = dyn_cast<Undefined>(us); + if (u && !u->weakAlias) + u->setWeakAlias(b, true); + } + } + return b; +} + // On ARM64EC, a function symbol may appear in both mangled and demangled forms: // - ARM64EC archives contain only the mangled name, while the demangled symbol // is defined by the object file as an alias. 
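To make the EC aliasing concrete, this is roughly what a successful `addGCRoot(name, /*aliasEC=*/true)` establishes on an EC symbol table (a simplified sketch; `#func` is the usual ARM64EC mangled spelling, and the mangled-name lookup and error handling are omitted):

```cpp
// Tie the demangled and mangled forms together with a weak anti-dependency
// alias, so a definition of either form satisfies references to both.
Symbol *demangled = symtab.addUndefined("func");
Symbol *mangled = symtab.addUndefined("#func");
if (auto *u = dyn_cast<Undefined>(demangled))
  if (!u->weakAlias)
    u->setWeakAlias(mangled, /*antiDep=*/true);
```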
diff --git a/lld/COFF/SymbolTable.h b/lld/COFF/SymbolTable.h index 5443815172dfd..4c749ae059d27 100644 --- a/lld/COFF/SymbolTable.h +++ b/lld/COFF/SymbolTable.h @@ -85,6 +85,9 @@ class SymbolTable { // added and before the writer writes results to a file. void compileBitcodeFiles(); + // Creates an Undefined symbol and marks it as live. + Symbol *addGCRoot(StringRef sym, bool aliasEC = false); + // Creates an Undefined symbol for a given name. Symbol *addUndefined(StringRef name); diff --git a/lld/test/COFF/arm64x-incl.s b/lld/test/COFF/arm64x-incl.s new file mode 100644 index 0000000000000..7ddfce1ebe693 --- /dev/null +++ b/lld/test/COFF/arm64x-incl.s @@ -0,0 +1,66 @@ +// REQUIRES: aarch64 +// RUN: split-file %s %t.dir && cd %t.dir + +// RUN: llvm-mc -filetype=obj -triple=arm64ec-windows sym-arm64ec.s -o sym-arm64ec.obj +// RUN: llvm-mc -filetype=obj -triple=aarch64-windows sym-aarch64.s -o sym-aarch64.obj +// RUN: llvm-mc -filetype=obj -triple=arm64ec-windows drectve.s -o drectve-arm64ec.obj +// RUN: llvm-mc -filetype=obj -triple=aarch64-windows drectve.s -o drectve-aarch64.obj +// RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64ec.obj +// RUN: llvm-mc -filetype=obj -triple=aarch64-windows %S/Inputs/loadconfig-arm64.s -o loadconfig-arm64.obj +// RUN: llvm-lib -machine:arm64x -out:sym.lib sym-arm64ec.obj sym-aarch64.obj + +// Check that the command-line -include argument ensures the EC symbol is included. + +// RUN: lld-link -machine:arm64x -out:out-arg.dll -dll -noentry loadconfig-arm64.obj loadconfig-arm64ec.obj sym.lib -include:sym +// RUN: llvm-readobj --hex-dump=.test out-arg.dll | FileCheck --check-prefix=EC %s +// EC: 0x180004000 02000000 .... + +// Check that the native .drectve -include argument ensures the native symbol is included. + +// RUN: lld-link -machine:arm64x -out:out-native.dll -dll -noentry loadconfig-arm64.obj loadconfig-arm64ec.obj sym.lib drectve-aarch64.obj +// RUN: llvm-readobj --hex-dump=.test out-native.dll | FileCheck --check-prefix=NATIVE %s +// NATIVE: 0x180004000 01000000 .... + +// Check that the EC .drectve -include argument ensures the EC symbol is included. + +// RUN: lld-link -machine:arm64x -out:out-ec.dll -dll -noentry loadconfig-arm64.obj loadconfig-arm64ec.obj sym.lib drectve-arm64ec.obj +// RUN: llvm-readobj --hex-dump=.test out-ec.dll | FileCheck --check-prefix=EC %s + +// Check that both native and EC .drectve -include arguments ensure both symbols are included. + +// RUN: lld-link -machine:arm64x -out:out-arg-native.dll -dll -noentry loadconfig-arm64.obj loadconfig-arm64ec.obj sym.lib \ +// RUN: -include:sym drectve-aarch64.obj +// RUN: llvm-readobj --hex-dump=.test out-arg-native.dll | FileCheck --check-prefix=BOTH %s +// BOTH: 0x180004000 02000000 01000000 ........ + +// RUN: lld-link -machine:arm64x -out:out-both.dll -dll -noentry loadconfig-arm64.obj loadconfig-arm64ec.obj sym.lib \ +// RUN: drectve-arm64ec.obj drectve-aarch64.obj +// RUN: llvm-readobj --hex-dump=.test out-both.dll | FileCheck --check-prefix=BOTH %s + +// Check that including a missing symbol results in an error. 
+ +// RUN: not lld-link -machine:arm64x -out:err.dll -dll -noentry loadconfig-arm64.obj loadconfig-arm64ec.obj -include:sym sym-aarch64.obj \ +// RUN: 2>&1 | FileCheck --check-prefix=ERR %s +// ERR: lld-link: error: <root>: undefined symbol: sym + +// RUN: not lld-link -machine:arm64x -out:err.dll -dll -noentry loadconfig-arm64.obj loadconfig-arm64ec.obj drectve-arm64ec.obj sym-aarch64.obj \ +// RUN: 2>&1 | FileCheck --check-prefix=ERR %s + +// RUN: not lld-link -machine:arm64x -out:err.dll -dll -noentry loadconfig-arm64.obj loadconfig-arm64ec.obj drectve-aarch64.obj sym-arm64ec.obj \ +// RUN: 2>&1 | FileCheck --check-prefix=ERR %s + +#--- sym-aarch64.s + .section ".test","dr" + .globl sym +sym: + .word 1 + +#--- sym-arm64ec.s + .section ".test","dr" + .globl sym +sym: + .word 2 + +#--- drectve.s + .section .drectve, "yn" + .ascii " -include:sym" From a8d2aeec8732c8d158f5b194a5191c0805bcd961 Mon Sep 17 00:00:00 2001 From: ChiaHungDuan Date: Mon, 13 Jan 2025 14:17:48 -0800 Subject: [PATCH 345/408] [scudo] Fix the format of getStats() (#121608) This is a quick fix for b71c44b9be17dc6295eb733d685b38e797f3c846: "last released" was removed by accident in primary64.h, and the update of "NumReleasesAttempted" was missing. --- compiler-rt/lib/scudo/standalone/primary64.h | 28 +++++++++++--------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h index 2b520ceb33148..e7da849c339bf 100644 --- a/compiler-rt/lib/scudo/standalone/primary64.h +++ b/compiler-rt/lib/scudo/standalone/primary64.h @@ -1141,18 +1141,18 @@ template <typename Config> class SizeClassAllocator64 { BytesInFreeList - Region->ReleaseInfo.BytesInFreeListAtLastCheckpoint; } const uptr TotalChunks = Region->MemMapInfo.AllocatedUser / BlockSize; - Str->append( - "%s %02zu (%6zu): mapped: %6zuK popped: %7zu pushed: %7zu " - "inuse: %6zu total: %6zu releases: %6zu last " - "releases attempted: %6zuK latest pushed bytes: %6zuK region: 0x%zx " - "(0x%zx)\n", - Region->Exhausted ? "E" : " ", ClassId, getSizeByClassId(ClassId), - Region->MemMapInfo.MappedUser >> 10, Region->FreeListInfo.PoppedBlocks, - Region->FreeListInfo.PushedBlocks, InUseBlocks, TotalChunks, - Region->ReleaseInfo.NumReleasesAttempted, - Region->ReleaseInfo.LastReleasedBytes >> 10, - RegionPushedBytesDelta >> 10, Region->RegionBeg, - getRegionBaseByClassId(ClassId)); + Str->append("%s %02zu (%6zu): mapped: %6zuK popped: %7zu pushed: %7zu " + "inuse: %6zu total: %6zu releases attempted: %6zu last " + "released: %6zuK latest pushed bytes: %6zuK region: 0x%zx " + "(0x%zx)\n", + Region->Exhausted ? "E" : " ", ClassId, + getSizeByClassId(ClassId), Region->MemMapInfo.MappedUser >> 10, + Region->FreeListInfo.PoppedBlocks, + Region->FreeListInfo.PushedBlocks, InUseBlocks, TotalChunks, + Region->ReleaseInfo.NumReleasesAttempted, + Region->ReleaseInfo.LastReleasedBytes >> 10, + RegionPushedBytesDelta >> 10, Region->RegionBeg, + getRegionBaseByClassId(ClassId)); } void getRegionFragmentationInfo(RegionInfo *Region, uptr ClassId, @@ -1297,6 +1297,10 @@ template <typename Config> class SizeClassAllocator64 { return 0; } + // The following steps contribute to the majority of the time spent in page + // releasing, thus we increment the counter here. + ++Region->ReleaseInfo.NumReleasesAttempted; + // Note that we have extracted the `GroupsToRelease` from region freelist. // It's safe to let pushBlocks()/popBlocks() access the remaining region // freelist. 
In steps 3 and 4, we will temporarily release the FLLock From 386dec2be9ff979a9ca522debd9387d2d3c83e76 Mon Sep 17 00:00:00 2001 From: jimingham Date: Mon, 13 Jan 2025 14:43:19 -0800 Subject: [PATCH 346/408] Update ReleaseNotes.md Mentioned native command definitions and support for breaking on inlined call-sites. --- llvm/docs/ReleaseNotes.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index a9d9e5fc7ace4..c24dc1976b4d0 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -415,6 +415,13 @@ Changes to LLDB * LLDB now parses shared libraries in parallel, resulting in an average 2x speedup when attaching (only available on Darwin platforms) and launching (available on all platforms). +* It is now possible to implement lldb commands in Python that use lldb's native command-line parser. In particular, that allows per-option/argument completion, + with all the basic completers automatically supported and auto-generated help. + The command template file in lldb/examples/python/cmdtemplate.py has been updated to show how to use this. + +* Breakpoints on "inlined call sites" are now supported. Prior to this fix, breakpoints on source lines that only contained inlined call sites would be + moved to the next source line, causing you to miss the inlined executions. + * On the command line, LLDB now limits tab completions to your terminal width to avoid wrapping. Old: From cd264f09a4d2f25d75436abdeeb757c412c3a75c Mon Sep 17 00:00:00 2001 From: Andreas Jonson Date: Mon, 13 Jan 2025 23:51:51 +0100 Subject: [PATCH 347/408] [InstSimplify] Test select bit test with trunc to i1 (NFC) --- llvm/test/Transforms/InstSimplify/select.ll | 28 +++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/llvm/test/Transforms/InstSimplify/select.ll b/llvm/test/Transforms/InstSimplify/select.ll index 5f160aa165861..40539b8ade388 100644 --- a/llvm/test/Transforms/InstSimplify/select.ll +++ b/llvm/test/Transforms/InstSimplify/select.ll @@ -1749,3 +1749,31 @@ define <4 x i32> @select_vector_cmp_with_bitcasts(<2 x i64> %x, <4 x i32> %y) { %sel = select <4 x i1> %cmp, <4 x i32> %sub.bc, <4 x i32> zeroinitializer ret <4 x i32> %sel } + +define i8 @bittest_trunc_or(i8 %x) { +; CHECK-LABEL: @bittest_trunc_or( +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i8 [[X1:%.*]] to i1 +; CHECK-NEXT: [[OR:%.*]] = or i8 [[X1]], 1 +; CHECK-NEXT: [[X:%.*]] = select i1 [[TRUNC]], i8 [[OR]], i8 [[X1]] +; CHECK-NEXT: ret i8 [[X]] +; + %trunc = trunc i8 %x to i1 + %or = or i8 %x, 1 + %cond = select i1 %trunc, i8 %or, i8 %x + ret i8 %cond +} + +define i8 @bittest_trunc_not_or(i8 %x) { +; CHECK-LABEL: @bittest_trunc_not_or( +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i8 [[X:%.*]] to i1 +; CHECK-NEXT: [[NOT:%.*]] = xor i1 [[TRUNC]], true +; CHECK-NEXT: [[OR:%.*]] = or i8 [[X]], 1 +; CHECK-NEXT: [[COND:%.*]] = select i1 [[NOT]], i8 [[OR]], i8 [[X]] +; CHECK-NEXT: ret i8 [[COND]] +; + %trunc = trunc i8 %x to i1 + %not = xor i1 %trunc, true + %or = or i8 %x, 1 + %cond = select i1 %not, i8 %or, i8 %x + ret i8 %cond +} From 8ce81f17a16b8b689895c7c093d0401a75c09882 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 13 Jan 2025 15:18:41 -0800 Subject: [PATCH 348/408] [LegalizeVectorOps][RISCV] Use VP_FP_EXTEND/ROUND when promoting VP_FP* operations. (#122784) This preserves the original VL, leading to more reuse of VL for vsetvli. The VLOptimizer can also clean up a lot of this, but I'm not sure if it gets all of it. 
There are some regressions in here from propagating the mask too, but I'm not sure if that's a concern. --- .../SelectionDAG/LegalizeVectorOps.cpp | 23 +- llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll | 544 +- .../RISCV/rvv/fixed-vectors-ceil-vp.ll | 136 +- .../RISCV/rvv/fixed-vectors-floor-vp.ll | 136 +- .../RISCV/rvv/fixed-vectors-fmaximum-vp.ll | 136 +- .../RISCV/rvv/fixed-vectors-fminimum-vp.ll | 136 +- .../RISCV/rvv/fixed-vectors-round-vp.ll | 136 +- .../RISCV/rvv/fixed-vectors-roundeven-vp.ll | 136 +- .../RISCV/rvv/fixed-vectors-roundtozero-vp.ll | 136 +- .../RISCV/rvv/fixed-vectors-vfadd-vp.ll | 148 +- .../RISCV/rvv/fixed-vectors-vfdiv-vp.ll | 148 +- .../RISCV/rvv/fixed-vectors-vfma-vp.ll | 168 +- .../RISCV/rvv/fixed-vectors-vfmax-vp.ll | 72 +- .../RISCV/rvv/fixed-vectors-vfmin-vp.ll | 72 +- .../RISCV/rvv/fixed-vectors-vfmul-vp.ll | 148 +- .../RISCV/rvv/fixed-vectors-vfsqrt-vp.ll | 64 +- .../RISCV/rvv/fixed-vectors-vfsub-vp.ll | 148 +- llvm/test/CodeGen/RISCV/rvv/floor-vp.ll | 544 +- llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll | 808 +- llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll | 808 +- llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll | 560 +- llvm/test/CodeGen/RISCV/rvv/rint-vp.ll | 542 +- llvm/test/CodeGen/RISCV/rvv/round-vp.ll | 544 +- llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll | 544 +- llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll | 544 +- llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll | 612 +- llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll | 592 +- llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll | 9950 ++++++++--------- llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll | 252 +- llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll | 252 +- llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll | 296 +- llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll | 140 +- llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll | 592 +- 33 files changed, 10284 insertions(+), 9783 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 89a00c5a4f043..a6d1b1cb7b104 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -737,7 +737,17 @@ void VectorLegalizer::Promote(SDNode *Node, SmallVectorImpl &Results) { .getVectorElementType() .isFloatingPoint() && NVT.isVector() && NVT.getVectorElementType().isFloatingPoint()) - Operands[j] = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(j)); + if (ISD::isVPOpcode(Node->getOpcode())) { + unsigned EVLIdx = + *ISD::getVPExplicitVectorLengthIdx(Node->getOpcode()); + unsigned MaskIdx = *ISD::getVPMaskIdx(Node->getOpcode()); + Operands[j] = + DAG.getNode(ISD::VP_FP_EXTEND, dl, NVT, Node->getOperand(j), + Node->getOperand(MaskIdx), Node->getOperand(EVLIdx)); + } else { + Operands[j] = + DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(j)); + } else Operands[j] = DAG.getNode(ISD::BITCAST, dl, NVT, Node->getOperand(j)); else @@ -750,8 +760,15 @@ void VectorLegalizer::Promote(SDNode *Node, SmallVectorImpl &Results) { if ((VT.isFloatingPoint() && NVT.isFloatingPoint()) || (VT.isVector() && VT.getVectorElementType().isFloatingPoint() && NVT.isVector() && NVT.getVectorElementType().isFloatingPoint())) - Res = DAG.getNode(ISD::FP_ROUND, dl, VT, Res, - DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)); + if (ISD::isVPOpcode(Node->getOpcode())) { + unsigned EVLIdx = *ISD::getVPExplicitVectorLengthIdx(Node->getOpcode()); + unsigned MaskIdx = *ISD::getVPMaskIdx(Node->getOpcode()); + Res = DAG.getNode(ISD::VP_FP_ROUND, dl, VT, Res, + Node->getOperand(MaskIdx), Node->getOperand(EVLIdx)); + } else { + Res = 
DAG.getNode(ISD::FP_ROUND, dl, VT, Res, + DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)); + } else Res = DAG.getNode(ISD::BITCAST, dl, VT, Res); diff --git a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll index a81f0ac982f56..1b9c78a20ec3b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll @@ -17,23 +17,27 @@ declare @llvm.vp.ceil.nxv1bf16(, @vp_ceil_vv_nxv1bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_vv_nxv1bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vfabs.v v8, v9, v0.t -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfabs.v v11, v10, v0.t +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t +; CHECK-NEXT: vmflt.vf v8, v11, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: vfcvt.x.f.v v11, v10, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v11, v11, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: vfsgnj.vv v10, v11, v10, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t ; CHECK-NEXT: ret %v = call @llvm.vp.ceil.nxv1bf16( %va, %m, i32 %evl) ret %v @@ -42,12 +46,12 @@ define @vp_ceil_vv_nxv1bf16( %va, @vp_ceil_vv_nxv1bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_vv_nxv1bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -55,7 +59,7 @@ define @vp_ceil_vv_nxv1bf16_unmasked( ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: ret %v = call @llvm.vp.ceil.nxv1bf16( %va, splat (i1 true), i32 %evl) @@ -67,23 +71,27 @@ declare @llvm.vp.ceil.nxv2bf16(, @vp_ceil_vv_nxv2bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_vv_nxv2bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vfabs.v v8, v9, v0.t -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t +; CHECK-NEXT: lui a0, 
307200 +; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfabs.v v11, v10, v0.t +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t +; CHECK-NEXT: vmflt.vf v8, v11, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmv.v.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: vfcvt.x.f.v v11, v10, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v11, v11, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: vfsgnj.vv v10, v11, v10, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t ; CHECK-NEXT: ret %v = call @llvm.vp.ceil.nxv2bf16( %va, %m, i32 %evl) ret %v @@ -92,12 +100,12 @@ define @vp_ceil_vv_nxv2bf16( %va, @vp_ceil_vv_nxv2bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_vv_nxv2bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -105,7 +113,7 @@ define @vp_ceil_vv_nxv2bf16_unmasked( ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: ret %v = call @llvm.vp.ceil.nxv2bf16( %va, splat (i1 true), i32 %evl) @@ -117,25 +125,27 @@ declare @llvm.vp.ceil.nxv4bf16(, @vp_ceil_vv_nxv4bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_vv_nxv4bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vmv1r.v v9, v0 -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v12, v10, v0.t -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; CHECK-NEXT: vmflt.vf v9, v12, fa5, v0.t +; CHECK-NEXT: vmflt.vf v8, v12, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v10, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v10, v12, v10, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t ; CHECK-NEXT: ret %v = call @llvm.vp.ceil.nxv4bf16( %va, %m, i32 %evl) ret %v @@ -144,12 +154,12 @@ define 
@vp_ceil_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <v
 define <vscale x 4 x bfloat> @vp_ceil_vv_nxv4bf16_unmasked(<vscale x 4 x bfloat> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_vv_nxv4bf16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    lui a1, 307200
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    fmv.w.x fa5, a1
+; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -157,7 +167,7 @@ define <vscale x 4 x bfloat> @vp_ceil_vv_nxv4bf16_unmasked(
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
-; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x bfloat> @llvm.vp.ceil.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> splat (i1 true), i32 %evl)
@@ -169,25 +179,27 @@ declare <vscale x 8 x bfloat> @llvm.vp.ceil.nxv8bf16(<vscale x 8 x bfloat>, <vscale
 define <vscale x 8 x bfloat> @vp_ceil_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_vv_nxv8bf16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v10, v0
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    lui a1, 307200
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8, v0.t
+; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v12, v0.t
-; CHECK-NEXT:    fmv.w.x fa5, a1
+; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT:    vmflt.vf v10, v16, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v8, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 3
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v12, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v12, v16, v12, v0.t
-; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 8 x bfloat> @llvm.vp.ceil.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x bfloat> %v
@@ -196,12 +208,12 @@ define <vscale x 8 x bfloat> @vp_ceil_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <v
 define <vscale x 8 x bfloat> @vp_ceil_vv_nxv8bf16_unmasked(<vscale x 8 x bfloat> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_vv_nxv8bf16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    lui a1, 307200
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v12
-; CHECK-NEXT:    fmv.w.x fa5, a1
+; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v12, v0.t
@@ -209,7 +221,7 @@ define <vscale x 8 x bfloat> @vp_ceil_vv_nxv8bf16_unmasked(
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v12, v8, v12, v0.t
-; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
 ; CHECK-NEXT:    ret
   %v = call <vscale x 8 x bfloat> @llvm.vp.ceil.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> splat (i1 true), i32 %evl)
@@ -221,25 +233,27 @@ declare <vscale x 16 x bfloat> @llvm.vp.ceil.nxv16bf16(<vscale x 16 x bfloat>, <
 define <vscale x 16 x bfloat> @vp_ceil_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_vv_nxv16bf16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v12, v0
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT:    lui a1, 307200
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8, v0.t
+; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16, v0.t
-; CHECK-NEXT:    fmv.w.x fa5, a1
+; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v12, v24, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v8, v24, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 3
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x bfloat> @llvm.vp.ceil.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x bfloat> %v
@@ -248,12 +262,12 @@ define <vscale x 16 x bfloat> @vp_ceil_vv_nxv16bf16(<vscale x 16 x bfloat> %va,
 define <vscale x 16 x bfloat> @vp_ceil_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_vv_nxv16bf16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT:    lui a1, 307200
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v16
-; CHECK-NEXT:    fmv.w.x fa5, a1
+; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
@@ -261,7 +275,7 @@ define <vscale x 16 x bfloat> @vp_ceil_vv_nxv16bf16_unmasked(
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
-; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x bfloat> @llvm.vp.ceil.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> splat (i1 true), i32 %evl)
@@ -279,59 +293,64 @@ define <vscale x 32 x bfloat> @vp_ceil_vv_nxv32bf16(<vscale x 32 x bfloat> %va,
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT:    lui a3, 307200
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v12, v0, a2
+; CHECK-NEXT:    vslidedown.vx v17, v0, a2
 ; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    vmv1r.v v18, v17
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v24, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    addi a3, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vfabs.v v8, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v18, v8, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a2, 3
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmv1r.v v0, v18
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfcvt.x.f.v v8, v24, v0.t
 ; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v24
+; CHECK-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v24, v0.t
 ; CHECK-NEXT:    bltu a0, a1, .LBB10_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB10_2:
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v8, v7
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v7, v16, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v8, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 3
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
-; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
@@ -346,42 +365,55 @@ define <vscale x 32 x bfloat> @vp_ceil_vv_nxv32bf16(<vscale x 32 x bfloat> %va,
 define <vscale x 32 x bfloat> @vp_ceil_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_vv_nxv32bf16_unmasked:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmset.m v24
-; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    vmset.m v16
 ; CHECK-NEXT:    lui a3, 307200
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v12, v24, a2
+; CHECK-NEXT:    vslidedown.vx v16, v16, a2
 ; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    vmv1r.v v17, v16
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    addi a3, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vfabs.v v8, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v12, v24, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v17, v8, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a2, 3
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v8, v24, v0.t
 ; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v24, v0.t
 ; CHECK-NEXT:    bltu a0, a1, .LBB11_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB11_2:
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
@@ -390,8 +422,14 @@ define <vscale x 32 x bfloat> @vp_ceil_vv_nxv32bf16_unmasked(
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x bfloat> @llvm.vp.ceil.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 32 x bfloat> %v
@@ -418,23 +456,27 @@ define <vscale x 1 x half> @vp_ceil_vv_nxv1f16(<vscale x 1 x half> %va, <vscale
   %v = call <vscale x 1 x half> @llvm.vp.ceil.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x half> %v
@@ -458,12 +500,12 @@ define <vscale x 1 x half> @vp_ceil_vv_nxv1f16_unmasked(<vscale x 1 x half> %va,
 ;
 ; ZVFHMIN-LABEL: vp_ceil_vv_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -471,7 +513,7 @@ define <vscale x 1 x half> @vp_ceil_vv_nxv1f16_unmasked(<vscale x 1 x half> %va,
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v9, v8, v9, v0.t
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 1 x half> @llvm.vp.ceil.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl)
@@ -500,23 +542,27 @@ define <vscale x 2 x half> @vp_ceil_vv_nxv2f16(<vscale x 2 x half> %va, <vscale
   %v = call <vscale x 2 x half> @llvm.vp.ceil.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x half> %v
@@ -540,12 +586,12 @@ define <vscale x 2 x half> @vp_ceil_vv_nxv2f16_unmasked(<vscale x 2 x half> %va,
 ;
 ; ZVFHMIN-LABEL: vp_ceil_vv_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -553,7 +599,7 @@ define <vscale x 2 x half> @vp_ceil_vv_nxv2f16_unmasked(<vscale x 2 x half> %va,
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v9, v8, v9, v0.t
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 2 x half> @llvm.vp.ceil.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -582,25 +628,27 @@ define <vscale x 4 x half> @vp_ceil_vv_nxv4f16(<vscale x 4 x half> %va, <vscale
   %v = call <vscale x 4 x half> @llvm.vp.ceil.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x half> %v
@@ -624,12 +672,12 @@ define <vscale x 4 x half> @vp_ceil_vv_nxv4f16_unmasked(<vscale x 4 x half> %va,
 ;
 ; ZVFHMIN-LABEL: vp_ceil_vv_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -637,7 +685,7 @@ define <vscale x 4 x half> @vp_ceil_vv_nxv4f16_unmasked(<vscale x 4 x half> %va,
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 4 x half> @llvm.vp.ceil.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x i1> splat (i1 true), i32 %evl)
@@ -668,25 +716,27 @@ define <vscale x 8 x half> @vp_ceil_vv_nxv8f16(<vscale x 8 x half> %va, <vscale
   %v = call <vscale x 8 x half> @llvm.vp.ceil.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x half> %v
@@ -710,12 +760,12 @@ define <vscale x 8 x half> @vp_ceil_vv_nxv8f16_unmasked(<vscale x 8 x half> %va,
 ;
 ; ZVFHMIN-LABEL: vp_ceil_vv_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
@@ -723,7 +773,7 @@ define <vscale x 8 x half> @vp_ceil_vv_nxv8f16_unmasked(<vscale x 8 x half> %va,
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v12, v8, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 8 x half> @llvm.vp.ceil.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> splat (i1 true), i32 %evl)
@@ -754,25 +804,27 @@ define <vscale x 16 x half> @vp_ceil_vv_nxv16f16(<vscale x 16 x half> %va, <vsc
   %v = call <vscale x 16 x half> @llvm.vp.ceil.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x half> %v
@@ -796,12 +848,12 @@ define <vscale x 16 x half> @vp_ceil_vv_nxv16f16_unmasked(<vscale x 16 x half> %
 ;
 ; ZVFHMIN-LABEL: vp_ceil_vv_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v16, v0.t
@@ -809,7 +861,7 @@ define <vscale x 16 x half> @vp_ceil_vv_nxv16f16_unmasked(<vscale x 16 x half> %
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 16 x half> @llvm.vp.ceil.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> splat (i1 true), i32 %evl)
@@ -846,59 +898,64 @@ define <vscale x 32 x half> @vp_ceil_vv_nxv32f16(<vscale x 32 x half> %va, <vsc
define <vscale x 32 x half> @vp_ceil_vv_nxv32f16_unmasked(<vscale x 32 x half> %
 ;
 ; ZVFHMIN-LABEL: vp_ceil_vv_nxv32f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    addi sp, sp, -16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    sub sp, sp, a1
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v24
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vmset.m v16
 ; ZVFHMIN-NEXT:    lui a3, 307200
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v12, v24, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v16, v16, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    vmv1r.v v17, v16
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v16
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v12, v24, fa5, v0.t
+; ZVFHMIN-NEXT:    vmflt.vf v17, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a2, 3
-; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vmv1r.v v0, v17
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    fsrm a2
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v16
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24, v0.t
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB23_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB23_2:
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
@@ -972,8 +1042,14 @@ define <vscale x 32 x half> @vp_ceil_vv_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT:    addi sp, sp, 16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 32 x half> @llvm.vp.ceil.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 32 x half> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll
index 5fe55583f314c..a9b255bb62aeb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll
@@ -30,23 +30,27 @@ define <2 x half> @vp_ceil_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl)
 ;
 ; ZVFHMIN-LABEL: vp_ceil_v2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vmv1r.v v9, v0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
+; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfabs.v v11, v10, v0.t
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmflt.vf v8, v11, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
+; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v11, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v11, v11, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v9, v8, v9, v0.t
-; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT:    vfsgnj.vv v10, v11, v10, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v9
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <2 x half> @llvm.vp.ceil.v2f16(<2 x half> %va, <2 x i1> %m, i32 %evl)
   ret <2 x half> %v
@@ -70,12 +74,12 @@ define <2 x half> @vp_ceil_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) {
 ;
 ; ZVFHMIN-LABEL: vp_ceil_v2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -83,7 +87,7 @@ define <2 x half> @vp_ceil_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v9, v8, v9, v0.t
-; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
   %v = call <2 x half> @llvm.vp.ceil.v2f16(<2 x half> %va, <2 x i1> splat (i1 true), i32 %evl)
@@ -112,23 +116,27 @@ define <4 x half> @vp_ceil_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl)
 ;
 ; ZVFHMIN-LABEL: vp_ceil_v4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv1r.v v9, v0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
+; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vfabs.v v11, v10, v0.t
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmflt.vf v8, v11, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
+; ZVFHMIN-NEXT:    vmv.v.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v11, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v11, v11, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v9, v8, v9, v0.t
-; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT:    vfsgnj.vv v10, v11, v10, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v9
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <4 x half> @llvm.vp.ceil.v4f16(<4 x half> %va, <4 x i1> %m, i32 %evl)
   ret <4 x half> %v
@@ -152,12 +160,12 @@ define <4 x half> @vp_ceil_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) {
 ;
 ; ZVFHMIN-LABEL: vp_ceil_v4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -165,7 +173,7 @@ define <4 x half> @vp_ceil_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v9, v8, v9, v0.t
-; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
   %v = call <4 x half> @llvm.vp.ceil.v4f16(<4 x half> %va, <4 x i1> splat (i1 true), i32 %evl)
@@ -194,25 +202,27 @@ define <8 x half> @vp_ceil_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl)
 ;
 ; ZVFHMIN-LABEL: vp_ceil_v8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v9, v0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
+; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
+; ZVFHMIN-NEXT:    vmflt.vf v8, v12, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
-; ZVFHMIN-NEXT:    vmv1r.v v0, v9
+; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v12, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v10, v12, v10, v0.t
-; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
+; ZVFHMIN-NEXT:    vmv1r.v v0, v9
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <8 x half> @llvm.vp.ceil.v8f16(<8 x half> %va, <8 x i1> %m, i32 %evl)
   ret <8 x half> %v
@@ -236,12 +246,12 @@ define <8 x half> @vp_ceil_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) {
 ;
 ; ZVFHMIN-LABEL: vp_ceil_v8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -249,7 +259,7 @@ define <8 x half> @vp_ceil_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
-; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
   %v = call <8 x half> @llvm.vp.ceil.v8f16(<8 x half> %va, <8 x i1> splat (i1 true), i32 %evl)
@@ -280,25 +290,27 @@ define <16 x half> @vp_ceil_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %e
 ;
 ; ZVFHMIN-LABEL: vp_ceil_v16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
+; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
+; ZVFHMIN-NEXT:    vmflt.vf v8, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
-; ZVFHMIN-NEXT:    vmv1r.v v0, v10
+; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v12, v16, v12, v0.t
-; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
+; ZVFHMIN-NEXT:    vmv1r.v v0, v10
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <16 x half> @llvm.vp.ceil.v16f16(<16 x half> %va, <16 x i1> %m, i32 %evl)
   ret <16 x half> %v
@@ -322,12 +334,12 @@ define <16 x half> @vp_ceil_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) {
 ;
 ; ZVFHMIN-LABEL: vp_ceil_v16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
@@ -335,7 +347,7 @@ define <16 x half> @vp_ceil_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v12, v8, v12, v0.t
-; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
   %v = call <16 x half> @llvm.vp.ceil.v16f16(<16 x half> %va, <16 x i1> splat (i1 true), i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll
index 49255320c40a6..d500469003aea 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll
@@ -30,23 +30,27 @@ define <2 x half> @vp_floor_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl)
 ;
 ; ZVFHMIN-LABEL: vp_floor_v2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vmv1r.v v9, v0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
+; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfabs.v v11, v10, v0.t
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmflt.vf v8, v11, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
+; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v11, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v11, v11, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v9, v8, v9, v0.t
-; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT:    vfsgnj.vv v10, v11, v10, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v9
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <2 x half> @llvm.vp.floor.v2f16(<2 x half> %va, <2 x i1> %m, i32 %evl)
   ret <2 x half> %v
@@ -70,12 +74,12 @@ define <2 x half> @vp_floor_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) {
 ;
 ; ZVFHMIN-LABEL: vp_floor_v2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -83,7 +87,7 @@ define <2 x half> @vp_floor_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v9, v8, v9, v0.t
-; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
   %v = call <2 x half> @llvm.vp.floor.v2f16(<2 x half> %va, <2 x i1> splat (i1 true), i32 %evl)
@@ -112,23 +116,27 @@ define <4 x half> @vp_floor_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl)
 ;
 ; ZVFHMIN-LABEL: vp_floor_v4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv1r.v v9, v0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
+; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vfabs.v v11, v10, v0.t
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmflt.vf v8, v11, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
+; ZVFHMIN-NEXT:    vmv.v.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v11, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v11, v11, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v9, v8, v9, v0.t
-; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT:    vfsgnj.vv v10, v11, v10, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v9
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <4 x half> @llvm.vp.floor.v4f16(<4 x half> %va, <4 x i1> %m, i32 %evl)
   ret <4 x half> %v
@@ -152,12 +160,12 @@ define <4 x half> @vp_floor_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) {
 ;
 ; ZVFHMIN-LABEL: vp_floor_v4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -165,7 +173,7 @@ define <4 x half> @vp_floor_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v9, v8, v9, v0.t
-; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
   %v = call <4 x half> @llvm.vp.floor.v4f16(<4 x half> %va, <4 x i1> splat (i1 true), i32 %evl)
@@ -194,25 +202,27 @@ define <8 x half> @vp_floor_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl)
 ;
 ; ZVFHMIN-LABEL: vp_floor_v8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v9, v0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
+; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
+; ZVFHMIN-NEXT:    vmflt.vf v8, v12, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
-; ZVFHMIN-NEXT:    vmv1r.v v0, v9
+; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v12, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v10, v12, v10, v0.t
-; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
+; ZVFHMIN-NEXT:    vmv1r.v v0, v9
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <8 x half> @llvm.vp.floor.v8f16(<8 x half> %va, <8 x i1> %m, i32 %evl)
   ret <8 x half> %v
@@ -236,12 +246,12 @@ define <8 x half> @vp_floor_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) {
 ;
 ; ZVFHMIN-LABEL: vp_floor_v8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -249,7 +259,7 @@ define <8 x half> @vp_floor_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
-; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
   %v = call <8 x half> @llvm.vp.floor.v8f16(<8 x half> %va, <8 x i1> splat (i1 true), i32 %evl)
@@ -280,25 +290,27 @@ define <16 x half> @vp_floor_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %
 ;
 ; ZVFHMIN-LABEL: vp_floor_v16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
+; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
+; ZVFHMIN-NEXT:    vmflt.vf v8, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
-; ZVFHMIN-NEXT:    vmv1r.v v0, v10
+; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v12, v16, v12, v0.t
-; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
+; ZVFHMIN-NEXT:    vmv1r.v v0, v10
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <16 x half> @llvm.vp.floor.v16f16(<16 x half> %va, <16 x i1> %m, i32 %evl)
   ret <16 x half> %v
@@ -322,12 +334,12 @@ define <16 x half> @vp_floor_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl)
 ;
 ; ZVFHMIN-LABEL: vp_floor_v16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
@@ -335,7 +347,7 @@ define <16 x half> @vp_floor_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl)
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v12, v8, v12, v0.t
-; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
   %v = call <16 x half> @llvm.vp.floor.v16f16(<16 x half> %va, <16 x i1> splat (i1 true), i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
index 11f92555f56cd..4f11e6c3c386a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
@@ -26,22 +26,20 @@ define <2 x half> @vfmax_vv_v2f16(<2 x half> %va, <2 x half> %vb, <2 x i1> %m, i
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_v2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11, v0.t
-; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v9, v11, v8, v0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8, v0.t
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9, v0.t
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v11, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8, v0.t
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v11, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11, v0.t
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v11, v9, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
-; ZVFHMIN-NEXT:    vfmax.vv v9, v8, v9, v0.t
-; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT:    vfmax.vv v9, v9, v8, v0.t
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <2 x half> @llvm.vp.maximum.v2f16(<2 x half> %va, <2 x half> %vb, <2 x i1> %m, i32 %evl)
   ret <2 x half> %v
@@ -60,18 +58,18 @@ define <2 x half> @vfmax_vv_v2f16_unmasked(<2 x half> %va, <2 x half> %vb, i32 z
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_v2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v8, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v10, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v9, v8, v9
-; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
   %v = call <2 x half> @llvm.vp.maximum.v2f16(<2 x half> %va, <2 x half> %vb, <2 x i1> splat (i1 true), i32 %evl)
@@ -96,22 +94,20 @@ define <4 x half> @vfmax_vv_v4f16(<4 x half> %va, <4 x half> %vb, <4 x i1> %m, i
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_v4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11, v0.t
-; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v9, v11, v8, v0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8, v0.t
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9, v0.t
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v11, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8, v0.t
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v11, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11, v0.t
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v11, v9, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
-; ZVFHMIN-NEXT:    vfmax.vv v9, v8, v9, v0.t
-; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT:    vfmax.vv v9, v9, v8, v0.t
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <4 x half> @llvm.vp.maximum.v4f16(<4 x half> %va, <4 x half> %vb, <4 x i1> %m, i32 %evl)
   ret <4 x half> %v
@@ -130,18 +126,18 @@ define <4 x half> @vfmax_vv_v4f16_unmasked(<4 x half> %va, <4 x half> %vb, i32 z
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_v4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v8, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v10, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v9, v8, v9
-; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
   %v = call <4 x half> @llvm.vp.maximum.v4f16(<4 x half> %va, <4 x half> %vb, <4 x i1> splat (i1 true), i32 %evl)
@@ -166,24 +162,22 @@ define <8 x half> @vfmax_vv_v8f16(<8 x half> %va, <8 x half> %vb, <8 x i1> %m, i
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_v8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12, v0.t
-; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8, v0.t
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v14, v14, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v16, v12, v14, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v16, v14, v12, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v14, v14, v0.t
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v14, v12, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v14, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
-; ZVFHMIN-NEXT:    vfmax.vv v10, v8, v16, v0.t
-; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
+; ZVFHMIN-NEXT:    vfmax.vv v12, v8, v16, v0.t
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <8 x half> @llvm.vp.maximum.v8f16(<8 x half> %va, <8 x half> %vb, <8 x i1> %m, i32 %evl)
   ret <8 x half> %v
@@ -202,18 +196,18 @@ define <8 x half> @vfmax_vv_v8f16_unmasked(<8 x half> %va, <8 x half> %vb, i32 z
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_v8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v12, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
 ; ZVFHMIN-NEXT:    vmerge.vvm v10, v12, v10, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v10, v10, v8
-; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
   %v = call <8 x half> @llvm.vp.maximum.v8f16(<8 x half> %va, <8 x half> %vb, <8 x i1> splat (i1 true), i32 %evl)
@@ -240,24 +234,22 @@ define <16 x half> @vfmax_vv_v16f16(<16 x half> %va, <16 x half> %vb, <16 x i1>
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_v16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v12, v0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v16, v0.t
-; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8, v0.t
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v20, v20, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v24, v16, v20, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v24, v20, v16, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v20, v20, v0.t
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v20, v16, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v20, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vfmax.vv v12, v8, v24, v0.t
-; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
+; ZVFHMIN-NEXT:    vfmax.vv v16, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <16 x half> @llvm.vp.maximum.v16f16(<16 x half> %va, <16 x half> %vb, <16 x i1> %m, i32 %evl)
   ret <16 x half> %v
@@ -276,18 +268,18 @@ define <16 x half> @vfmax_vv_v16f16_unmasked(<16 x half> %va, <16 x half> %vb, i
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_v16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v16, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v12, v16, v12, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v12, v12, v8
-; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
   %v = call <16 x half> @llvm.vp.maximum.v16f16(<16 x half> %va, <16 x half> %vb, <16 x i1> splat (i1 true), i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
index 3fb586b67a21b..2e2103ad5e06d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
@@ -26,22 +26,20 @@ define <2 x half> @vfmin_vv_v2f16(<2 x half> %va, <2 x half> %vb, <2 x i1> %m, i
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_v2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11, v0.t
-; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v9, v11, v8, v0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8, v0.t
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9, v0.t
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v11, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8, v0.t
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v11, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11, v0.t
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v11, v9, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
-; ZVFHMIN-NEXT:    vfmin.vv v9, v8, v9, v0.t
-; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT:    vfmin.vv v9, v9, v8, v0.t
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <2 x half> @llvm.vp.minimum.v2f16(<2 x half> %va, <2 x half> %vb, <2 x i1> %m, i32 %evl)
   ret <2 x half> %v
@@ -60,18 +58,18 @@ define <2 x half> @vfmin_vv_v2f16_unmasked(<2 x half> %va, <2 x half> %vb, i32 z
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_v2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v8, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v10, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v9, v8, v9
-; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
   %v = call <2 x half> @llvm.vp.minimum.v2f16(<2 x half> %va, <2 x half> %vb, <2 x i1> splat (i1 true), i32 %evl)
@@ -96,22 +94,20 @@ define <4 x half> @vfmin_vv_v4f16(<4 x half> %va, <4 x half> %vb, <4 x i1> %m, i
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_v4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11, v0.t
-; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v9, v11, v8, v0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8, v0.t
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9, v0.t
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v11, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8, v0.t
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v11, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11, v0.t
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v11, v9, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
-; ZVFHMIN-NEXT:    vfmin.vv v9, v8, v9, v0.t
-; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT:    vfmin.vv v9, v9, v8, v0.t
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <4 x half> @llvm.vp.minimum.v4f16(<4 x half> %va, <4 x half> %vb, <4 x i1> %m, i32 %evl)
   ret <4 x half> %v
@@ -130,18 +126,18 @@ define <4 x half> @vfmin_vv_v4f16_unmasked(<4 x half> %va, <4 x half> %vb, i32 z
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_v4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v8, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v10, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v9, v8, v9
-; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
   %v = call <4 x half> @llvm.vp.minimum.v4f16(<4 x half> %va, <4 x half> %vb, <4 x i1> splat (i1 true), i32 %evl)
@@ -166,24 +162,22 @@ define <8 x half> @vfmin_vv_v8f16(<8 x half> %va, <8 x half> %vb, <8 x i1> %m, i
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_v8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12, v0.t
-; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8, v0.t
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v14, v14, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v16, v12, v14, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v16, v14, v12, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v14, v14, v0.t
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v14, v12, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v14, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
-; ZVFHMIN-NEXT:    vfmin.vv v10, v8, v16, v0.t
-; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
+; ZVFHMIN-NEXT:    vfmin.vv v12, v8, v16, v0.t
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <8 x half> @llvm.vp.minimum.v8f16(<8 x half> %va, <8 x half> %vb, <8 x i1> %m, i32 %evl)
   ret <8 x half> %v
@@ -202,18 +196,18 @@ define <8 x half> @vfmin_vv_v8f16_unmasked(<8 x half> %va, <8 x half> %vb, i32 z
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_v8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v12, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
 ; ZVFHMIN-NEXT:    vmerge.vvm v10, v12, v10, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v10, v10, v8
-; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
   %v = call <8 x half> @llvm.vp.minimum.v8f16(<8 x half> %va, <8 x half> %vb, <8 x i1> splat (i1 true), i32 %evl)
@@ -240,24 +234,22 @@ define <16 x half> @vfmin_vv_v16f16(<16 x half> %va, <16 x half> %vb, <16 x i1>
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_v16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v12, v0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v16, v0.t
-; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8, v0.t
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v20, v20, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v24, v16, v20, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v24, v20, v16, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v20, v20, v0.t
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v20, v16, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v20, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vfmin.vv v12, v8, v24, v0.t
-; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
+; ZVFHMIN-NEXT:    vfmin.vv v16, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <16 x half> @llvm.vp.minimum.v16f16(<16 x half> %va, <16 x half> %vb, <16 x i1> %m, i32 %evl)
   ret <16 x half> %v
@@ -276,18 +268,18 @@ define <16 x half> @vfmin_vv_v16f16_unmasked(<16 x half> %va, <16 x half> %vb, i
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_v16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v16, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v12, v16, v12, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v12, v12, v8
-; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
   %v = call <16 x half> @llvm.vp.minimum.v16f16(<16 x half> %va, <16 x half> %vb, <16 x i1> splat (i1 true), i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll
index 232a8a4827cb1..a4ff079846fd8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll
@@ -30,23 +30,27 @@ define <2 x half> @vp_round_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl)
 ;
 ; ZVFHMIN-LABEL: vp_round_v2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vmv1r.v v9, v0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
+; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfabs.v v11, v10, v0.t
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmflt.vf v8, v11, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
+; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v11, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v11, v11, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v9, v8, v9, v0.t
-; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT:    vfsgnj.vv v10, v11, v10, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v9
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <2 x half> @llvm.vp.round.v2f16(<2 x half> %va, <2 x i1> %m, i32 %evl)
   ret <2 x half> %v
@@ -70,12 +74,12 @@ define <2 x half> @vp_round_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) {
 ;
 ; ZVFHMIN-LABEL: vp_round_v2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -83,7 +87,7 @@ define <2 x half> @vp_round_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v9, v8, v9, v0.t
-; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
   %v = call <2 x half> @llvm.vp.round.v2f16(<2 x half> %va, <2 x i1> splat (i1 true), i32 %evl)
@@ -112,23 +116,27 @@ define <4 x half> @vp_round_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl)
 ;
 ; ZVFHMIN-LABEL: vp_round_v4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv1r.v v9, v0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
+; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vfabs.v v11, v10, v0.t
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmflt.vf v8, v11, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
+; ZVFHMIN-NEXT:    vmv.v.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v11, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v11, v11, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v9, v8, v9, v0.t
-; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT:    vfsgnj.vv v10, v11, v10, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v9
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <4 x half> @llvm.vp.round.v4f16(<4 x half> %va, <4 x i1> %m, i32 %evl)
   ret <4 x half> %v
@@ -152,12
+160,12 @@ define <4 x half> @vp_round_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) { ; ; ZVFHMIN-LABEL: vp_round_v4f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -165,7 +173,7 @@ define <4 x half> @vp_round_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) { ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call <4 x half> @llvm.vp.round.v4f16(<4 x half> %va, <4 x i1> splat (i1 true), i32 %evl) @@ -194,25 +202,27 @@ define <8 x half> @vp_round_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) ; ; ZVFHMIN-LABEL: vp_round_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vmv1r.v v9, v0 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vmv1r.v v8, v0 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v12, v10, v0.t -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; ZVFHMIN-NEXT: vmflt.vf v9, v12, fa5, v0.t +; ZVFHMIN-NEXT: vmflt.vf v8, v12, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a0, 4 -; ZVFHMIN-NEXT: vmv1r.v v0, v9 +; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v12, v10, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v10, v12, v10, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: vmv1r.v v0, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10, v0.t ; ZVFHMIN-NEXT: ret %v = call <8 x half> @llvm.vp.round.v8f16(<8 x half> %va, <8 x i1> %m, i32 %evl) ret <8 x half> %v @@ -236,12 +246,12 @@ define <8 x half> @vp_round_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) { ; ; ZVFHMIN-LABEL: vp_round_v8f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ -249,7 +259,7 @@ define <8 x half> @vp_round_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) { ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; ZVFHMIN-NEXT: 
vfsgnj.vv v10, v8, v10, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret %v = call <8 x half> @llvm.vp.round.v8f16(<8 x half> %va, <8 x i1> splat (i1 true), i32 %evl) @@ -280,25 +290,27 @@ define <16 x half> @vp_round_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext % ; ; ZVFHMIN-LABEL: vp_round_v16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vmv1r.v v10, v0 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8, v0.t +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vmv1r.v v8, v0 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v16, v12, v0.t -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; ZVFHMIN-NEXT: vmflt.vf v10, v16, fa5, v0.t +; ZVFHMIN-NEXT: vmflt.vf v8, v16, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a0, 4 -; ZVFHMIN-NEXT: vmv1r.v v0, v10 +; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v16, v12, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v12, v16, v12, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: vmv1r.v v0, v10 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t ; ZVFHMIN-NEXT: ret %v = call <16 x half> @llvm.vp.round.v16f16(<16 x half> %va, <16 x i1> %m, i32 %evl) ret <16 x half> %v @@ -322,12 +334,12 @@ define <16 x half> @vp_round_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) ; ; ZVFHMIN-LABEL: vp_round_v16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t @@ -335,7 +347,7 @@ define <16 x half> @vp_round_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v12, v8, v12, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %v = call <16 x half> @llvm.vp.round.v16f16(<16 x half> %va, <16 x i1> splat (i1 true), i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll index 7c80c037403c2..c28d5fb1a8193 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll @@ -30,23 +30,27 @@ define <2 x half> @vp_roundeven_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext % ; ; ZVFHMIN-LABEL: vp_roundeven_v2f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: lui 
a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfabs.v v8, v9, v0.t -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vmv1r.v v9, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vmv1r.v v8, v0 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vfabs.v v11, v10, v0.t +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5, v0.t +; ZVFHMIN-NEXT: vmflt.vf v8, v11, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a0, 0 +; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: vfcvt.x.f.v v11, v10, v0.t ; ZVFHMIN-NEXT: fsrm a0 -; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v11, v11, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vfsgnj.vv v10, v11, v10, v0.t +; ZVFHMIN-NEXT: vmv1r.v v0, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10, v0.t ; ZVFHMIN-NEXT: ret %v = call <2 x half> @llvm.vp.roundeven.v2f16(<2 x half> %va, <2 x i1> %m, i32 %evl) ret <2 x half> %v @@ -70,12 +74,12 @@ define <2 x half> @vp_roundeven_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) ; ; ZVFHMIN-LABEL: vp_roundeven_v2f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -83,7 +87,7 @@ define <2 x half> @vp_roundeven_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call <2 x half> @llvm.vp.roundeven.v2f16(<2 x half> %va, <2 x i1> splat (i1 true), i32 %evl) @@ -112,23 +116,27 @@ define <4 x half> @vp_roundeven_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext % ; ; ZVFHMIN-LABEL: vp_roundeven_v4f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfabs.v v8, v9, v0.t -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv1r.v v9, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vmv1r.v v8, v0 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfabs.v v11, v10, v0.t +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5, v0.t +; ZVFHMIN-NEXT: vmflt.vf v8, v11, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a0, 0 +; ZVFHMIN-NEXT: vmv.v.v v0, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; 
ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: vfcvt.x.f.v v11, v10, v0.t ; ZVFHMIN-NEXT: fsrm a0 -; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v11, v11, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vfsgnj.vv v10, v11, v10, v0.t +; ZVFHMIN-NEXT: vmv1r.v v0, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10, v0.t ; ZVFHMIN-NEXT: ret %v = call <4 x half> @llvm.vp.roundeven.v4f16(<4 x half> %va, <4 x i1> %m, i32 %evl) ret <4 x half> %v @@ -152,12 +160,12 @@ define <4 x half> @vp_roundeven_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) ; ; ZVFHMIN-LABEL: vp_roundeven_v4f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -165,7 +173,7 @@ define <4 x half> @vp_roundeven_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call <4 x half> @llvm.vp.roundeven.v4f16(<4 x half> %va, <4 x i1> splat (i1 true), i32 %evl) @@ -194,25 +202,27 @@ define <8 x half> @vp_roundeven_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext % ; ; ZVFHMIN-LABEL: vp_roundeven_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vmv1r.v v9, v0 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vmv1r.v v8, v0 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v12, v10, v0.t -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; ZVFHMIN-NEXT: vmflt.vf v9, v12, fa5, v0.t +; ZVFHMIN-NEXT: vmflt.vf v8, v12, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a0, 0 -; ZVFHMIN-NEXT: vmv1r.v v0, v9 +; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v12, v10, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v10, v12, v10, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: vmv1r.v v0, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10, v0.t ; ZVFHMIN-NEXT: ret %v = call <8 x half> @llvm.vp.roundeven.v8f16(<8 x half> %va, <8 x i1> %m, i32 %evl) ret <8 x half> %v @@ -236,12 +246,12 @@ define <8 x half> @vp_roundeven_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) ; ; ZVFHMIN-LABEL: vp_roundeven_v8f16_unmasked: ; ZVFHMIN: # %bb.0: -; 
ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ -249,7 +259,7 @@ define <8 x half> @vp_roundeven_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v10, v8, v10, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret %v = call <8 x half> @llvm.vp.roundeven.v8f16(<8 x half> %va, <8 x i1> splat (i1 true), i32 %evl) @@ -280,25 +290,27 @@ define <16 x half> @vp_roundeven_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroe ; ; ZVFHMIN-LABEL: vp_roundeven_v16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vmv1r.v v10, v0 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8, v0.t +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vmv1r.v v8, v0 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v16, v12, v0.t -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; ZVFHMIN-NEXT: vmflt.vf v10, v16, fa5, v0.t +; ZVFHMIN-NEXT: vmflt.vf v8, v16, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a0, 0 -; ZVFHMIN-NEXT: vmv1r.v v0, v10 +; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v16, v12, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v12, v16, v12, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: vmv1r.v v0, v10 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t ; ZVFHMIN-NEXT: ret %v = call <16 x half> @llvm.vp.roundeven.v16f16(<16 x half> %va, <16 x i1> %m, i32 %evl) ret <16 x half> %v @@ -322,12 +334,12 @@ define <16 x half> @vp_roundeven_v16f16_unmasked(<16 x half> %va, i32 zeroext %e ; ; ZVFHMIN-LABEL: vp_roundeven_v16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t @@ -335,7 +347,7 @@ define <16 x half> @vp_roundeven_v16f16_unmasked(<16 x half> %va, i32 zeroext %e ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v12, v8, v12, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, 
m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %v = call <16 x half> @llvm.vp.roundeven.v16f16(<16 x half> %va, <16 x i1> splat (i1 true), i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll index 65a4725267cd3..64d3664a4c372 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll @@ -30,23 +30,27 @@ define <2 x half> @vp_roundtozero_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext ; ; ZVFHMIN-LABEL: vp_roundtozero_v2f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfabs.v v8, v9, v0.t -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vmv1r.v v9, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vmv1r.v v8, v0 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vfabs.v v11, v10, v0.t +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5, v0.t +; ZVFHMIN-NEXT: vmflt.vf v8, v11, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a0, 1 +; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: vfcvt.x.f.v v11, v10, v0.t ; ZVFHMIN-NEXT: fsrm a0 -; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v11, v11, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vfsgnj.vv v10, v11, v10, v0.t +; ZVFHMIN-NEXT: vmv1r.v v0, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10, v0.t ; ZVFHMIN-NEXT: ret %v = call <2 x half> @llvm.vp.roundtozero.v2f16(<2 x half> %va, <2 x i1> %m, i32 %evl) ret <2 x half> %v @@ -70,12 +74,12 @@ define <2 x half> @vp_roundtozero_v2f16_unmasked(<2 x half> %va, i32 zeroext %ev ; ; ZVFHMIN-LABEL: vp_roundtozero_v2f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -83,7 +87,7 @@ define <2 x half> @vp_roundtozero_v2f16_unmasked(<2 x half> %va, i32 zeroext %ev ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call <2 x half> @llvm.vp.roundtozero.v2f16(<2 x half> %va, <2 x i1> splat (i1 true), i32 %evl) @@ -112,23 +116,27 @@ define <4 x half> @vp_roundtozero_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext ; ; ZVFHMIN-LABEL: vp_roundtozero_v4f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; 
ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfabs.v v8, v9, v0.t -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv1r.v v9, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vmv1r.v v8, v0 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfabs.v v11, v10, v0.t +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5, v0.t +; ZVFHMIN-NEXT: vmflt.vf v8, v11, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a0, 1 +; ZVFHMIN-NEXT: vmv.v.v v0, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: vfcvt.x.f.v v11, v10, v0.t ; ZVFHMIN-NEXT: fsrm a0 -; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v11, v11, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vfsgnj.vv v10, v11, v10, v0.t +; ZVFHMIN-NEXT: vmv1r.v v0, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10, v0.t ; ZVFHMIN-NEXT: ret %v = call <4 x half> @llvm.vp.roundtozero.v4f16(<4 x half> %va, <4 x i1> %m, i32 %evl) ret <4 x half> %v @@ -152,12 +160,12 @@ define <4 x half> @vp_roundtozero_v4f16_unmasked(<4 x half> %va, i32 zeroext %ev ; ; ZVFHMIN-LABEL: vp_roundtozero_v4f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -165,7 +173,7 @@ define <4 x half> @vp_roundtozero_v4f16_unmasked(<4 x half> %va, i32 zeroext %ev ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call <4 x half> @llvm.vp.roundtozero.v4f16(<4 x half> %va, <4 x i1> splat (i1 true), i32 %evl) @@ -194,25 +202,27 @@ define <8 x half> @vp_roundtozero_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext ; ; ZVFHMIN-LABEL: vp_roundtozero_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vmv1r.v v9, v0 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vmv1r.v v8, v0 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v12, v10, v0.t -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; ZVFHMIN-NEXT: vmflt.vf v9, v12, fa5, v0.t +; ZVFHMIN-NEXT: vmflt.vf v8, v12, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a0, 1 -; ZVFHMIN-NEXT: vmv1r.v v0, v9 +; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; 
ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v12, v10, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v10, v12, v10, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: vmv1r.v v0, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10, v0.t ; ZVFHMIN-NEXT: ret %v = call <8 x half> @llvm.vp.roundtozero.v8f16(<8 x half> %va, <8 x i1> %m, i32 %evl) ret <8 x half> %v @@ -236,12 +246,12 @@ define <8 x half> @vp_roundtozero_v8f16_unmasked(<8 x half> %va, i32 zeroext %ev ; ; ZVFHMIN-LABEL: vp_roundtozero_v8f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ -249,7 +259,7 @@ define <8 x half> @vp_roundtozero_v8f16_unmasked(<8 x half> %va, i32 zeroext %ev ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v10, v8, v10, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret %v = call <8 x half> @llvm.vp.roundtozero.v8f16(<8 x half> %va, <8 x i1> splat (i1 true), i32 %evl) @@ -280,25 +290,27 @@ define <16 x half> @vp_roundtozero_v16f16(<16 x half> %va, <16 x i1> %m, i32 zer ; ; ZVFHMIN-LABEL: vp_roundtozero_v16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vmv1r.v v10, v0 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8, v0.t +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vmv1r.v v8, v0 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v16, v12, v0.t -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; ZVFHMIN-NEXT: vmflt.vf v10, v16, fa5, v0.t +; ZVFHMIN-NEXT: vmflt.vf v8, v16, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a0, 1 -; ZVFHMIN-NEXT: vmv1r.v v0, v10 +; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v16, v12, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v12, v16, v12, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: vmv1r.v v0, v10 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t ; ZVFHMIN-NEXT: ret %v = call <16 x half> @llvm.vp.roundtozero.v16f16(<16 x half> %va, <16 x i1> %m, i32 %evl) ret <16 x half> %v @@ -322,12 +334,12 @@ define <16 x half> @vp_roundtozero_v16f16_unmasked(<16 x half> %va, i32 zeroext ; ; ZVFHMIN-LABEL: vp_roundtozero_v16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; 
ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t @@ -335,7 +347,7 @@ define <16 x half> @vp_roundtozero_v16f16_unmasked(<16 x half> %va, i32 zeroext ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v12, v8, v12, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %v = call <16 x half> @llvm.vp.roundtozero.v16f16(<16 x half> %va, <16 x i1> splat (i1 true), i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfadd-vp.ll index 7a7236235d120..f80c158324684 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfadd-vp.ll @@ -19,13 +19,13 @@ define <2 x half> @vfadd_vv_v2f16(<2 x half> %va, <2 x half> %b, <2 x i1> %m, i3 ; ; ZVFHMIN-LABEL: vfadd_vv_v2f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v9, v9, v10, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %v = call <2 x half> @llvm.vp.fadd.v2f16(<2 x half> %va, <2 x half> %b, <2 x i1> %m, i32 %evl) ret <2 x half> %v @@ -40,12 +40,12 @@ define <2 x half> @vfadd_vv_v2f16_unmasked(<2 x half> %va, <2 x half> %b, i32 ze ; ; ZVFHMIN-LABEL: vfadd_vv_v2f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v9, v9, v10 -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call <2 x half> @llvm.vp.fadd.v2f16(<2 x half> %va, <2 x half> %b, <2 x i1> splat (i1 true), i32 %evl) @@ -64,12 +64,13 @@ define <2 x half> @vfadd_vf_v2f16(<2 x half> %va, half %b, <2 x i1> %m, i32 zero ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v9, v10, v8, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w 
v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement <2 x half> poison, half %b, i32 0 %vb = shufflevector <2 x half> %elt.head, <2 x half> poison, <2 x i32> zeroinitializer @@ -89,11 +90,12 @@ define <2 x half> @vfadd_vf_v2f16_unmasked(<2 x half> %va, half %b, i32 zeroext ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v9, v10, v8 -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <2 x half> poison, half %b, i32 0 @@ -113,13 +115,13 @@ define <3 x half> @vfadd_vv_v3f16(<3 x half> %va, <3 x half> %b, <3 x i1> %m, i3 ; ; ZVFHMIN-LABEL: vfadd_vv_v3f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v9, v9, v10, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %v = call <3 x half> @llvm.vp.fadd.v3f16(<3 x half> %va, <3 x half> %b, <3 x i1> %m, i32 %evl) ret <3 x half> %v @@ -136,13 +138,13 @@ define <4 x half> @vfadd_vv_v4f16(<4 x half> %va, <4 x half> %b, <4 x i1> %m, i3 ; ; ZVFHMIN-LABEL: vfadd_vv_v4f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v9, v9, v10, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %v = call <4 x half> @llvm.vp.fadd.v4f16(<4 x half> %va, <4 x half> %b, <4 x i1> %m, i32 %evl) ret <4 x half> %v @@ -157,12 +159,12 @@ define <4 x half> @vfadd_vv_v4f16_unmasked(<4 x half> %va, <4 x half> %b, i32 ze ; ; ZVFHMIN-LABEL: vfadd_vv_v4f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v9, v9, v10 -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call <4 x half> @llvm.vp.fadd.v4f16(<4 x half> %va, <4 x half> %b, <4 x i1> splat (i1 true), i32 %evl) @@ -181,12 +183,13 @@ define <4 x half> 
@vfadd_vf_v4f16(<4 x half> %va, half %b, <4 x i1> %m, i32 zero ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v9, v10, v8, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement <4 x half> poison, half %b, i32 0 %vb = shufflevector <4 x half> %elt.head, <4 x half> poison, <4 x i32> zeroinitializer @@ -206,11 +209,12 @@ define <4 x half> @vfadd_vf_v4f16_unmasked(<4 x half> %va, half %b, i32 zeroext ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v9, v10, v8 -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <4 x half> poison, half %b, i32 0 @@ -230,13 +234,13 @@ define <8 x half> @vfadd_vv_v8f16(<8 x half> %va, <8 x half> %b, <8 x i1> %m, i3 ; ; ZVFHMIN-LABEL: vfadd_vv_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v10, v12, v10, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10, v0.t ; ZVFHMIN-NEXT: ret %v = call <8 x half> @llvm.vp.fadd.v8f16(<8 x half> %va, <8 x half> %b, <8 x i1> %m, i32 %evl) ret <8 x half> %v @@ -251,12 +255,12 @@ define <8 x half> @vfadd_vv_v8f16_unmasked(<8 x half> %va, <8 x half> %b, i32 ze ; ; ZVFHMIN-LABEL: vfadd_vv_v8f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v10, v12, v10 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret %v = call <8 x half> @llvm.vp.fadd.v8f16(<8 x half> %va, <8 x half> %b, <8 x i1> splat (i1 true), i32 %evl) @@ -275,12 +279,13 @@ define <8 x half> @vfadd_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zero ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: 
vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v10, v10, v12, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -300,11 +305,12 @@ define <8 x half> @vfadd_vf_v8f16_unmasked(<8 x half> %va, half %b, i32 zeroext ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v10, v10, v12 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 @@ -324,13 +330,13 @@ define <16 x half> @vfadd_vv_v16f16(<16 x half> %va, <16 x half> %b, <16 x i1> % ; ; ZVFHMIN-LABEL: vfadd_vv_v16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v12, v16, v12, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t ; ZVFHMIN-NEXT: ret %v = call <16 x half> @llvm.vp.fadd.v16f16(<16 x half> %va, <16 x half> %b, <16 x i1> %m, i32 %evl) ret <16 x half> %v @@ -345,12 +351,12 @@ define <16 x half> @vfadd_vv_v16f16_unmasked(<16 x half> %va, <16 x half> %b, i3 ; ; ZVFHMIN-LABEL: vfadd_vv_v16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v12, v16, v12 -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %v = call <16 x half> @llvm.vp.fadd.v16f16(<16 x half> %va, <16 x half> %b, <16 x i1> splat (i1 true), i32 %evl) @@ -369,12 +375,13 @@ define <16 x half> @vfadd_vf_v16f16(<16 x half> %va, half %b, <16 x i1> %m, i32 ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: 
vfadd.vv v12, v12, v16, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement <16 x half> poison, half %b, i32 0 %vb = shufflevector <16 x half> %elt.head, <16 x half> poison, <16 x i32> zeroinitializer @@ -394,11 +401,12 @@ define <16 x half> @vfadd_vf_v16f16_unmasked(<16 x half> %va, half %b, i32 zeroe ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v12, v12, v16 -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <16 x half> poison, half %b, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfdiv-vp.ll index cb83e5ff4f2b3..23baa60de1532 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfdiv-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfdiv-vp.ll @@ -19,13 +19,13 @@ define <2 x half> @vfdiv_vv_v2f16(<2 x half> %va, <2 x half> %b, <2 x i1> %m, i3 ; ; ZVFHMIN-LABEL: vfdiv_vv_v2f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v9, v9, v10, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %v = call <2 x half> @llvm.vp.fdiv.v2f16(<2 x half> %va, <2 x half> %b, <2 x i1> %m, i32 %evl) ret <2 x half> %v @@ -40,12 +40,12 @@ define <2 x half> @vfdiv_vv_v2f16_unmasked(<2 x half> %va, <2 x half> %b, i32 ze ; ; ZVFHMIN-LABEL: vfdiv_vv_v2f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v9, v9, v10 -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call <2 x half> @llvm.vp.fdiv.v2f16(<2 x half> %va, <2 x half> %b, <2 x i1> splat (i1 true), i32 %evl) @@ -64,12 +64,13 @@ define <2 x half> @vfdiv_vf_v2f16(<2 x half> %va, half %b, <2 x i1> %m, i32 zero ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, 
ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v9, v10, v8, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement <2 x half> poison, half %b, i32 0 %vb = shufflevector <2 x half> %elt.head, <2 x half> poison, <2 x i32> zeroinitializer @@ -89,11 +90,12 @@ define <2 x half> @vfdiv_vf_v2f16_unmasked(<2 x half> %va, half %b, i32 zeroext ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v9, v10, v8 -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <2 x half> poison, half %b, i32 0 @@ -113,13 +115,13 @@ define <3 x half> @vfdiv_vv_v3f16(<3 x half> %va, <3 x half> %b, <3 x i1> %m, i3 ; ; ZVFHMIN-LABEL: vfdiv_vv_v3f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v9, v9, v10, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %v = call <3 x half> @llvm.vp.fdiv.v3f16(<3 x half> %va, <3 x half> %b, <3 x i1> %m, i32 %evl) ret <3 x half> %v @@ -136,13 +138,13 @@ define <4 x half> @vfdiv_vv_v4f16(<4 x half> %va, <4 x half> %b, <4 x i1> %m, i3 ; ; ZVFHMIN-LABEL: vfdiv_vv_v4f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v9, v9, v10, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %v = call <4 x half> @llvm.vp.fdiv.v4f16(<4 x half> %va, <4 x half> %b, <4 x i1> %m, i32 %evl) ret <4 x half> %v @@ -157,12 +159,12 @@ define <4 x half> @vfdiv_vv_v4f16_unmasked(<4 x half> %va, <4 x half> %b, i32 ze ; ; ZVFHMIN-LABEL: vfdiv_vv_v4f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v9, v9, v10 -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call <4 x half> 
@llvm.vp.fdiv.v4f16(<4 x half> %va, <4 x half> %b, <4 x i1> splat (i1 true), i32 %evl) @@ -181,12 +183,13 @@ define <4 x half> @vfdiv_vf_v4f16(<4 x half> %va, half %b, <4 x i1> %m, i32 zero ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v9, v10, v8, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement <4 x half> poison, half %b, i32 0 %vb = shufflevector <4 x half> %elt.head, <4 x half> poison, <4 x i32> zeroinitializer @@ -206,11 +209,12 @@ define <4 x half> @vfdiv_vf_v4f16_unmasked(<4 x half> %va, half %b, i32 zeroext ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v9, v10, v8 -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <4 x half> poison, half %b, i32 0 @@ -230,13 +234,13 @@ define <8 x half> @vfdiv_vv_v8f16(<8 x half> %va, <8 x half> %b, <8 x i1> %m, i3 ; ; ZVFHMIN-LABEL: vfdiv_vv_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v10, v12, v10, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10, v0.t ; ZVFHMIN-NEXT: ret %v = call <8 x half> @llvm.vp.fdiv.v8f16(<8 x half> %va, <8 x half> %b, <8 x i1> %m, i32 %evl) ret <8 x half> %v @@ -251,12 +255,12 @@ define <8 x half> @vfdiv_vv_v8f16_unmasked(<8 x half> %va, <8 x half> %b, i32 ze ; ; ZVFHMIN-LABEL: vfdiv_vv_v8f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v10, v12, v10 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret %v = call <8 x half> @llvm.vp.fdiv.v8f16(<8 x half> %va, <8 x half> %b, <8 x i1> splat (i1 true), i32 %evl) @@ -275,12 +279,13 @@ define <8 x half> @vfdiv_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zero ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma 
; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v10, v10, v12, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -300,11 +305,12 @@ define <8 x half> @vfdiv_vf_v8f16_unmasked(<8 x half> %va, half %b, i32 zeroext ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v10, v10, v12 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 @@ -324,13 +330,13 @@ define <16 x half> @vfdiv_vv_v16f16(<16 x half> %va, <16 x half> %b, <16 x i1> % ; ; ZVFHMIN-LABEL: vfdiv_vv_v16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v12, v16, v12, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t ; ZVFHMIN-NEXT: ret %v = call <16 x half> @llvm.vp.fdiv.v16f16(<16 x half> %va, <16 x half> %b, <16 x i1> %m, i32 %evl) ret <16 x half> %v @@ -345,12 +351,12 @@ define <16 x half> @vfdiv_vv_v16f16_unmasked(<16 x half> %va, <16 x half> %b, i3 ; ; ZVFHMIN-LABEL: vfdiv_vv_v16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v12, v16, v12 -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %v = call <16 x half> @llvm.vp.fdiv.v16f16(<16 x half> %va, <16 x half> %b, <16 x i1> splat (i1 true), i32 %evl) @@ -369,12 +375,13 @@ define <16 x half> @vfdiv_vf_v16f16(<16 x half> %va, half %b, <16 x i1> %m, i32 ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8, 
v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v12, v12, v16, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement <16 x half> poison, half %b, i32 0 %vb = shufflevector <16 x half> %elt.head, <16 x half> poison, <16 x i32> zeroinitializer @@ -394,11 +401,12 @@ define <16 x half> @vfdiv_vf_v16f16_unmasked(<16 x half> %va, half %b, i32 zeroe ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v12, v12, v16 -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <16 x half> poison, half %b, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll index c61f9cd9b5bd7..bde842dcc7600 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll @@ -20,14 +20,14 @@ define <2 x half> @vfma_vv_v2f16(<2 x half> %va, <2 x half> %b, <2 x half> %c, < ; ; ZVFHMIN-LABEL: vfma_vv_v2f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v12, v10, v11, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t ; ZVFHMIN-NEXT: ret %v = call <2 x half> @llvm.vp.fma.v2f16(<2 x half> %va, <2 x half> %b, <2 x half> %c, <2 x i1> %m, i32 %evl) ret <2 x half> %v @@ -42,13 +42,13 @@ define <2 x half> @vfma_vv_v2f16_unmasked(<2 x half> %va, <2 x half> %b, <2 x ha ; ; ZVFHMIN-LABEL: vfma_vv_v2f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v12, v10, v11 -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %v = call <2 x half> @llvm.vp.fma.v2f16(<2 x half> %va, <2 x half> %b, <2 x half> %c, <2 x i1> splat (i1 true), i32 %evl) @@ -65,15 +65,17 @@ define <2 x half> @vfma_vf_v2f16(<2 x half> %va, half %b, <2 x half> %vc, <2 x i ; ZVFHMIN-LABEL: vfma_vf_v2f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; 
ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v12, v11, v10, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement <2 x half> poison, half %b, i32 0 %vb = shufflevector <2 x half> %elt.head, <2 x half> poison, <2 x i32> zeroinitializer @@ -91,14 +93,16 @@ define <2 x half> @vfma_vf_v2f16_unmasked(<2 x half> %va, half %b, <2 x half> %v ; ZVFHMIN-LABEL: vfma_vf_v2f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v12, v11, v10 -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <2 x half> poison, half %b, i32 0 @@ -119,14 +123,14 @@ define <4 x half> @vfma_vv_v4f16(<4 x half> %va, <4 x half> %b, <4 x half> %c, < ; ; ZVFHMIN-LABEL: vfma_vv_v4f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v12, v10, v11, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t ; ZVFHMIN-NEXT: ret %v = call <4 x half> @llvm.vp.fma.v4f16(<4 x half> %va, <4 x half> %b, <4 x half> %c, <4 x i1> %m, i32 %evl) ret <4 x half> %v @@ -141,13 +145,13 @@ define <4 x half> @vfma_vv_v4f16_unmasked(<4 x half> %va, <4 x half> %b, <4 x ha ; ; ZVFHMIN-LABEL: vfma_vv_v4f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v12, v10, v11 -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %v = call <4 x half> @llvm.vp.fma.v4f16(<4 x half> %va, <4 x half> %b, <4 
x half> %c, <4 x i1> splat (i1 true), i32 %evl) @@ -164,15 +168,17 @@ define <4 x half> @vfma_vf_v4f16(<4 x half> %va, half %b, <4 x half> %vc, <4 x i ; ZVFHMIN-LABEL: vfma_vf_v4f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v12, v11, v10, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement <4 x half> poison, half %b, i32 0 %vb = shufflevector <4 x half> %elt.head, <4 x half> poison, <4 x i32> zeroinitializer @@ -190,14 +196,16 @@ define <4 x half> @vfma_vf_v4f16_unmasked(<4 x half> %va, half %b, <4 x half> %v ; ZVFHMIN-LABEL: vfma_vf_v4f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v12, v11, v10 -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <4 x half> poison, half %b, i32 0 @@ -218,14 +226,14 @@ define <8 x half> @vfma_vv_v8f16(<8 x half> %va, <8 x half> %b, <8 x half> %c, < ; ; ZVFHMIN-LABEL: vfma_vv_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v14, v10, v12, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v14 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v14, v0.t ; ZVFHMIN-NEXT: ret %v = call <8 x half> @llvm.vp.fma.v8f16(<8 x half> %va, <8 x half> %b, <8 x half> %c, <8 x i1> %m, i32 %evl) ret <8 x half> %v @@ -240,13 +248,13 @@ define <8 x half> @vfma_vv_v8f16_unmasked(<8 x half> %va, <8 x half> %b, <8 x ha ; ; ZVFHMIN-LABEL: vfma_vv_v8f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, 
ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v14, v10, v12 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v14 ; ZVFHMIN-NEXT: ret %v = call <8 x half> @llvm.vp.fma.v8f16(<8 x half> %va, <8 x half> %b, <8 x half> %c, <8 x i1> splat (i1 true), i32 %evl) @@ -263,15 +271,17 @@ define <8 x half> @vfma_vf_v8f16(<8 x half> %va, half %b, <8 x half> %vc, <8 x i ; ZVFHMIN-LABEL: vfma_vf_v8f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v14, v12, v10, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v14 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v14, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -289,14 +299,16 @@ define <8 x half> @vfma_vf_v8f16_unmasked(<8 x half> %va, half %b, <8 x half> %v ; ZVFHMIN-LABEL: vfma_vf_v8f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v14, v12, v10 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v14 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 @@ -317,14 +329,14 @@ define <16 x half> @vfma_vv_v16f16(<16 x half> %va, <16 x half> %b, <16 x half> ; ; ZVFHMIN-LABEL: vfma_vv_v16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v20, v12, v16, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v20 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v20, v0.t ; ZVFHMIN-NEXT: ret %v = call <16 x half> @llvm.vp.fma.v16f16(<16 x half> %va, <16 x half> %b, <16 x half> %c, <16 x i1> %m, i32 %evl) ret <16 x half> %v @@ -339,13 +351,13 @@ define <16 x half> @vfma_vv_v16f16_unmasked(<16 x half> %va, <16 x half> %b, <16 ; ; ZVFHMIN-LABEL: vfma_vv_v16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli 
zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v20, v12, v16 -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v20 ; ZVFHMIN-NEXT: ret %v = call <16 x half> @llvm.vp.fma.v16f16(<16 x half> %va, <16 x half> %b, <16 x half> %c, <16 x i1> splat (i1 true), i32 %evl) @@ -362,15 +374,17 @@ define <16 x half> @vfma_vf_v16f16(<16 x half> %va, half %b, <16 x half> %vc, <1 ; ZVFHMIN-LABEL: vfma_vf_v16f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 ; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v20, v16, v12, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v20 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v20, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement <16 x half> poison, half %b, i32 0 %vb = shufflevector <16 x half> %elt.head, <16 x half> poison, <16 x i32> zeroinitializer @@ -388,14 +402,16 @@ define <16 x half> @vfma_vf_v16f16_unmasked(<16 x half> %va, half %b, <16 x half ; ZVFHMIN-LABEL: vfma_vf_v16f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v20, v16, v12 -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v20 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <16 x half> poison, half %b, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll index cad7adbc19f3c..b37c47a32ba21 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll @@ -19,13 +19,13 @@ define <2 x half> @vfmax_vv_v2f16(<2 x half> %va, <2 x half> %vb, <2 x i1> %m, i ; ; ZVFHMIN-LABEL: vfmax_vv_v2f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfmax.vv v9, v9, v10, v0.t -; ZVFHMIN-NEXT: vsetivli 
zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %v = call <2 x half> @llvm.vp.maxnum.v2f16(<2 x half> %va, <2 x half> %vb, <2 x i1> %m, i32 %evl) ret <2 x half> %v @@ -40,12 +40,12 @@ define <2 x half> @vfmax_vv_v2f16_unmasked(<2 x half> %va, <2 x half> %vb, i32 z ; ; ZVFHMIN-LABEL: vfmax_vv_v2f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfmax.vv v9, v9, v10 -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call <2 x half> @llvm.vp.maxnum.v2f16(<2 x half> %va, <2 x half> %vb, <2 x i1> splat (i1 true), i32 %evl) @@ -63,13 +63,13 @@ define <4 x half> @vfmax_vv_v4f16(<4 x half> %va, <4 x half> %vb, <4 x i1> %m, i ; ; ZVFHMIN-LABEL: vfmax_vv_v4f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfmax.vv v9, v9, v10, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %v = call <4 x half> @llvm.vp.maxnum.v4f16(<4 x half> %va, <4 x half> %vb, <4 x i1> %m, i32 %evl) ret <4 x half> %v @@ -84,12 +84,12 @@ define <4 x half> @vfmax_vv_v4f16_unmasked(<4 x half> %va, <4 x half> %vb, i32 z ; ; ZVFHMIN-LABEL: vfmax_vv_v4f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfmax.vv v9, v9, v10 -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call <4 x half> @llvm.vp.maxnum.v4f16(<4 x half> %va, <4 x half> %vb, <4 x i1> splat (i1 true), i32 %evl) @@ -107,13 +107,13 @@ define <8 x half> @vfmax_vv_v8f16(<8 x half> %va, <8 x half> %vb, <8 x i1> %m, i ; ; ZVFHMIN-LABEL: vfmax_vv_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmax.vv v10, v12, v10, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10, v0.t ; ZVFHMIN-NEXT: ret %v = call <8 x half> @llvm.vp.maxnum.v8f16(<8 x half> %va, <8 x half> %vb, 
<8 x i1> %m, i32 %evl) ret <8 x half> %v @@ -128,12 +128,12 @@ define <8 x half> @vfmax_vv_v8f16_unmasked(<8 x half> %va, <8 x half> %vb, i32 z ; ; ZVFHMIN-LABEL: vfmax_vv_v8f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmax.vv v10, v12, v10 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret %v = call <8 x half> @llvm.vp.maxnum.v8f16(<8 x half> %va, <8 x half> %vb, <8 x i1> splat (i1 true), i32 %evl) @@ -151,13 +151,13 @@ define <16 x half> @vfmax_vv_v16f16(<16 x half> %va, <16 x half> %vb, <16 x i1> ; ; ZVFHMIN-LABEL: vfmax_vv_v16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfmax.vv v12, v16, v12, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t ; ZVFHMIN-NEXT: ret %v = call <16 x half> @llvm.vp.maxnum.v16f16(<16 x half> %va, <16 x half> %vb, <16 x i1> %m, i32 %evl) ret <16 x half> %v @@ -172,12 +172,12 @@ define <16 x half> @vfmax_vv_v16f16_unmasked(<16 x half> %va, <16 x half> %vb, i ; ; ZVFHMIN-LABEL: vfmax_vv_v16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfmax.vv v12, v16, v12 -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %v = call <16 x half> @llvm.vp.maxnum.v16f16(<16 x half> %va, <16 x half> %vb, <16 x i1> splat (i1 true), i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll index d8ee7a7044b49..261523e8ace50 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll @@ -19,13 +19,13 @@ define <2 x half> @vfmin_vv_v2f16(<2 x half> %va, <2 x half> %vb, <2 x i1> %m, i ; ; ZVFHMIN-LABEL: vfmin_vv_v2f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfmin.vv v9, v9, v10, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %v = call <2 x half> 
@llvm.vp.minnum.v2f16(<2 x half> %va, <2 x half> %vb, <2 x i1> %m, i32 %evl) ret <2 x half> %v @@ -40,12 +40,12 @@ define <2 x half> @vfmin_vv_v2f16_unmasked(<2 x half> %va, <2 x half> %vb, i32 z ; ; ZVFHMIN-LABEL: vfmin_vv_v2f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfmin.vv v9, v9, v10 -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call <2 x half> @llvm.vp.minnum.v2f16(<2 x half> %va, <2 x half> %vb, <2 x i1> splat (i1 true), i32 %evl) @@ -63,13 +63,13 @@ define <4 x half> @vfmin_vv_v4f16(<4 x half> %va, <4 x half> %vb, <4 x i1> %m, i ; ; ZVFHMIN-LABEL: vfmin_vv_v4f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfmin.vv v9, v9, v10, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %v = call <4 x half> @llvm.vp.minnum.v4f16(<4 x half> %va, <4 x half> %vb, <4 x i1> %m, i32 %evl) ret <4 x half> %v @@ -84,12 +84,12 @@ define <4 x half> @vfmin_vv_v4f16_unmasked(<4 x half> %va, <4 x half> %vb, i32 z ; ; ZVFHMIN-LABEL: vfmin_vv_v4f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfmin.vv v9, v9, v10 -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call <4 x half> @llvm.vp.minnum.v4f16(<4 x half> %va, <4 x half> %vb, <4 x i1> splat (i1 true), i32 %evl) @@ -107,13 +107,13 @@ define <8 x half> @vfmin_vv_v8f16(<8 x half> %va, <8 x half> %vb, <8 x i1> %m, i ; ; ZVFHMIN-LABEL: vfmin_vv_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmin.vv v10, v12, v10, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10, v0.t ; ZVFHMIN-NEXT: ret %v = call <8 x half> @llvm.vp.minnum.v8f16(<8 x half> %va, <8 x half> %vb, <8 x i1> %m, i32 %evl) ret <8 x half> %v @@ -128,12 +128,12 @@ define <8 x half> @vfmin_vv_v8f16_unmasked(<8 x half> %va, <8 x half> %vb, i32 z ; ; ZVFHMIN-LABEL: vfmin_vv_v8f16_unmasked: ; ZVFHMIN: # 
%bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmin.vv v10, v12, v10 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret %v = call <8 x half> @llvm.vp.minnum.v8f16(<8 x half> %va, <8 x half> %vb, <8 x i1> splat (i1 true), i32 %evl) @@ -151,13 +151,13 @@ define <16 x half> @vfmin_vv_v16f16(<16 x half> %va, <16 x half> %vb, <16 x i1> ; ; ZVFHMIN-LABEL: vfmin_vv_v16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfmin.vv v12, v16, v12, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t ; ZVFHMIN-NEXT: ret %v = call <16 x half> @llvm.vp.minnum.v16f16(<16 x half> %va, <16 x half> %vb, <16 x i1> %m, i32 %evl) ret <16 x half> %v @@ -172,12 +172,12 @@ define <16 x half> @vfmin_vv_v16f16_unmasked(<16 x half> %va, <16 x half> %vb, i ; ; ZVFHMIN-LABEL: vfmin_vv_v16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfmin.vv v12, v16, v12 -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %v = call <16 x half> @llvm.vp.minnum.v16f16(<16 x half> %va, <16 x half> %vb, <16 x i1> splat (i1 true), i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmul-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmul-vp.ll index 86f140723d7f8..7e03a3cf95577 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmul-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmul-vp.ll @@ -19,13 +19,13 @@ define <2 x half> @vfmul_vv_v2f16(<2 x half> %va, <2 x half> %b, <2 x i1> %m, i3 ; ; ZVFHMIN-LABEL: vfmul_vv_v2f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v9, v9, v10, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %v = call <2 x half> @llvm.vp.fmul.v2f16(<2 x half> %va, <2 x half> %b, <2 x i1> %m, i32 %evl) ret <2 x half> %v @@ -40,12 +40,12 @@ define <2 x half> @vfmul_vv_v2f16_unmasked(<2 x half> %va, <2 x half> %b, i32 ze ; ; 
ZVFHMIN-LABEL: vfmul_vv_v2f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v9, v9, v10 -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call <2 x half> @llvm.vp.fmul.v2f16(<2 x half> %va, <2 x half> %b, <2 x i1> splat (i1 true), i32 %evl) @@ -64,12 +64,13 @@ define <2 x half> @vfmul_vf_v2f16(<2 x half> %va, half %b, <2 x i1> %m, i32 zero ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v9, v10, v8, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement <2 x half> poison, half %b, i32 0 %vb = shufflevector <2 x half> %elt.head, <2 x half> poison, <2 x i32> zeroinitializer @@ -89,11 +90,12 @@ define <2 x half> @vfmul_vf_v2f16_unmasked(<2 x half> %va, half %b, i32 zeroext ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v9, v10, v8 -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <2 x half> poison, half %b, i32 0 @@ -113,13 +115,13 @@ define <3 x half> @vfmul_vv_v3f16(<3 x half> %va, <3 x half> %b, <3 x i1> %m, i3 ; ; ZVFHMIN-LABEL: vfmul_vv_v3f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v9, v9, v10, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %v = call <3 x half> @llvm.vp.fmul.v3f16(<3 x half> %va, <3 x half> %b, <3 x i1> %m, i32 %evl) ret <3 x half> %v @@ -136,13 +138,13 @@ define <4 x half> @vfmul_vv_v4f16(<4 x half> %va, <4 x half> %b, <4 x i1> %m, i3 ; ; ZVFHMIN-LABEL: vfmul_vv_v4f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, 
mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v9, v9, v10, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %v = call <4 x half> @llvm.vp.fmul.v4f16(<4 x half> %va, <4 x half> %b, <4 x i1> %m, i32 %evl) ret <4 x half> %v @@ -157,12 +159,12 @@ define <4 x half> @vfmul_vv_v4f16_unmasked(<4 x half> %va, <4 x half> %b, i32 ze ; ; ZVFHMIN-LABEL: vfmul_vv_v4f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v9, v9, v10 -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call <4 x half> @llvm.vp.fmul.v4f16(<4 x half> %va, <4 x half> %b, <4 x i1> splat (i1 true), i32 %evl) @@ -181,12 +183,13 @@ define <4 x half> @vfmul_vf_v4f16(<4 x half> %va, half %b, <4 x i1> %m, i32 zero ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v9, v10, v8, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement <4 x half> poison, half %b, i32 0 %vb = shufflevector <4 x half> %elt.head, <4 x half> poison, <4 x i32> zeroinitializer @@ -206,11 +209,12 @@ define <4 x half> @vfmul_vf_v4f16_unmasked(<4 x half> %va, half %b, i32 zeroext ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v9, v10, v8 -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <4 x half> poison, half %b, i32 0 @@ -230,13 +234,13 @@ define <8 x half> @vfmul_vv_v8f16(<8 x half> %va, <8 x half> %b, <8 x i1> %m, i3 ; ; ZVFHMIN-LABEL: vfmul_vv_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v10, v12, v10, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: 
vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10, v0.t ; ZVFHMIN-NEXT: ret %v = call <8 x half> @llvm.vp.fmul.v8f16(<8 x half> %va, <8 x half> %b, <8 x i1> %m, i32 %evl) ret <8 x half> %v @@ -251,12 +255,12 @@ define <8 x half> @vfmul_vv_v8f16_unmasked(<8 x half> %va, <8 x half> %b, i32 ze ; ; ZVFHMIN-LABEL: vfmul_vv_v8f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v10, v12, v10 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret %v = call <8 x half> @llvm.vp.fmul.v8f16(<8 x half> %va, <8 x half> %b, <8 x i1> splat (i1 true), i32 %evl) @@ -275,12 +279,13 @@ define <8 x half> @vfmul_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zero ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v10, v10, v12, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -300,11 +305,12 @@ define <8 x half> @vfmul_vf_v8f16_unmasked(<8 x half> %va, half %b, i32 zeroext ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v10, v10, v12 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 @@ -324,13 +330,13 @@ define <16 x half> @vfmul_vv_v16f16(<16 x half> %va, <16 x half> %b, <16 x i1> % ; ; ZVFHMIN-LABEL: vfmul_vv_v16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v12, v16, v12, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t ; ZVFHMIN-NEXT: ret %v = call <16 x half> @llvm.vp.fmul.v16f16(<16 x half> %va, <16 x half> %b, <16 x i1> %m, i32 %evl) ret <16 x half> %v @@ 
-345,12 +351,12 @@ define <16 x half> @vfmul_vv_v16f16_unmasked(<16 x half> %va, <16 x half> %b, i3 ; ; ZVFHMIN-LABEL: vfmul_vv_v16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v12, v16, v12 -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %v = call <16 x half> @llvm.vp.fmul.v16f16(<16 x half> %va, <16 x half> %b, <16 x i1> splat (i1 true), i32 %evl) @@ -369,12 +375,13 @@ define <16 x half> @vfmul_vf_v16f16(<16 x half> %va, half %b, <16 x i1> %m, i32 ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v12, v12, v16, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement <16 x half> poison, half %b, i32 0 %vb = shufflevector <16 x half> %elt.head, <16 x half> poison, <16 x i32> zeroinitializer @@ -394,11 +401,12 @@ define <16 x half> @vfmul_vf_v16f16_unmasked(<16 x half> %va, half %b, i32 zeroe ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v12, v12, v16 -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <16 x half> poison, half %b, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-vp.ll index c1e63cbf0b138..6244419de65b1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-vp.ll @@ -19,12 +19,12 @@ define <2 x half> @vfsqrt_vv_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl ; ; ZVFHMIN-LABEL: vfsqrt_vv_v2f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfsqrt.v v9, v9, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %v = call <2 x half> @llvm.vp.sqrt.v2f16(<2 x half> %va, <2 x i1> %m, i32 %evl) ret <2 x half> %v @@ -39,11 +39,11 @@ define <2 x half> 
@vfsqrt_vv_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) { ; ; ZVFHMIN-LABEL: vfsqrt_vv_v2f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfsqrt.v v9, v9 -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call <2 x half> @llvm.vp.sqrt.v2f16(<2 x half> %va, <2 x i1> splat (i1 true), i32 %evl) @@ -61,12 +61,12 @@ define <4 x half> @vfsqrt_vv_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl ; ; ZVFHMIN-LABEL: vfsqrt_vv_v4f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfsqrt.v v9, v9, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %v = call <4 x half> @llvm.vp.sqrt.v4f16(<4 x half> %va, <4 x i1> %m, i32 %evl) ret <4 x half> %v @@ -81,11 +81,11 @@ define <4 x half> @vfsqrt_vv_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) { ; ; ZVFHMIN-LABEL: vfsqrt_vv_v4f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfsqrt.v v9, v9 -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call <4 x half> @llvm.vp.sqrt.v4f16(<4 x half> %va, <4 x i1> splat (i1 true), i32 %evl) @@ -103,12 +103,12 @@ define <8 x half> @vfsqrt_vv_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl ; ; ZVFHMIN-LABEL: vfsqrt_vv_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfsqrt.v v10, v10, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10, v0.t ; ZVFHMIN-NEXT: ret %v = call <8 x half> @llvm.vp.sqrt.v8f16(<8 x half> %va, <8 x i1> %m, i32 %evl) ret <8 x half> %v @@ -123,11 +123,11 @@ define <8 x half> @vfsqrt_vv_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) { ; ; ZVFHMIN-LABEL: vfsqrt_vv_v8f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfsqrt.v v10, v10 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret %v = call <8 x 
half> @llvm.vp.sqrt.v8f16(<8 x half> %va, <8 x i1> splat (i1 true), i32 %evl) @@ -145,12 +145,12 @@ define <16 x half> @vfsqrt_vv_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext ; ; ZVFHMIN-LABEL: vfsqrt_vv_v16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfsqrt.v v12, v12, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t ; ZVFHMIN-NEXT: ret %v = call <16 x half> @llvm.vp.sqrt.v16f16(<16 x half> %va, <16 x i1> %m, i32 %evl) ret <16 x half> %v @@ -165,11 +165,11 @@ define <16 x half> @vfsqrt_vv_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) ; ; ZVFHMIN-LABEL: vfsqrt_vv_v16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfsqrt.v v12, v12 -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %v = call <16 x half> @llvm.vp.sqrt.v16f16(<16 x half> %va, <16 x i1> splat (i1 true), i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsub-vp.ll index d0a0bf516d355..58a510047d625 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsub-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsub-vp.ll @@ -19,13 +19,13 @@ define <2 x half> @vfsub_vv_v2f16(<2 x half> %va, <2 x half> %b, <2 x i1> %m, i3 ; ; ZVFHMIN-LABEL: vfsub_vv_v2f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v9, v9, v10, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %v = call <2 x half> @llvm.vp.fsub.v2f16(<2 x half> %va, <2 x half> %b, <2 x i1> %m, i32 %evl) ret <2 x half> %v @@ -40,12 +40,12 @@ define <2 x half> @vfsub_vv_v2f16_unmasked(<2 x half> %va, <2 x half> %b, i32 ze ; ; ZVFHMIN-LABEL: vfsub_vv_v2f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v9, v9, v10 -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call <2 x half> @llvm.vp.fsub.v2f16(<2 x half> %va, <2 x half> %b, <2 x i1> splat (i1 true), i32 %evl) @@ -64,12 +64,13 @@ define <2 x half> 
@vfsub_vf_v2f16(<2 x half> %va, half %b, <2 x i1> %m, i32 zero ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v9, v10, v8, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement <2 x half> poison, half %b, i32 0 %vb = shufflevector <2 x half> %elt.head, <2 x half> poison, <2 x i32> zeroinitializer @@ -89,11 +90,12 @@ define <2 x half> @vfsub_vf_v2f16_unmasked(<2 x half> %va, half %b, i32 zeroext ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v9, v10, v8 -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <2 x half> poison, half %b, i32 0 @@ -113,13 +115,13 @@ define <3 x half> @vfsub_vv_v3f16(<3 x half> %va, <3 x half> %b, <3 x i1> %m, i3 ; ; ZVFHMIN-LABEL: vfsub_vv_v3f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v9, v9, v10, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %v = call <3 x half> @llvm.vp.fsub.v3f16(<3 x half> %va, <3 x half> %b, <3 x i1> %m, i32 %evl) ret <3 x half> %v @@ -136,13 +138,13 @@ define <4 x half> @vfsub_vv_v4f16(<4 x half> %va, <4 x half> %b, <4 x i1> %m, i3 ; ; ZVFHMIN-LABEL: vfsub_vv_v4f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v9, v9, v10, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %v = call <4 x half> @llvm.vp.fsub.v4f16(<4 x half> %va, <4 x half> %b, <4 x i1> %m, i32 %evl) ret <4 x half> %v @@ -157,12 +159,12 @@ define <4 x half> @vfsub_vv_v4f16_unmasked(<4 x half> %va, <4 x half> %b, i32 ze ; ; ZVFHMIN-LABEL: vfsub_vv_v4f16_unmasked: ; ZVFHMIN: # %bb.0: -; 
ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v9, v9, v10 -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call <4 x half> @llvm.vp.fsub.v4f16(<4 x half> %va, <4 x half> %b, <4 x i1> splat (i1 true), i32 %evl) @@ -181,12 +183,13 @@ define <4 x half> @vfsub_vf_v4f16(<4 x half> %va, half %b, <4 x i1> %m, i32 zero ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v9, v10, v8, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement <4 x half> poison, half %b, i32 0 %vb = shufflevector <4 x half> %elt.head, <4 x half> poison, <4 x i32> zeroinitializer @@ -206,11 +209,12 @@ define <4 x half> @vfsub_vf_v4f16_unmasked(<4 x half> %va, half %b, i32 zeroext ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v9, v10, v8 -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <4 x half> poison, half %b, i32 0 @@ -230,13 +234,13 @@ define <8 x half> @vfsub_vv_v8f16(<8 x half> %va, <8 x half> %b, <8 x i1> %m, i3 ; ; ZVFHMIN-LABEL: vfsub_vv_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v10, v12, v10, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10, v0.t ; ZVFHMIN-NEXT: ret %v = call <8 x half> @llvm.vp.fsub.v8f16(<8 x half> %va, <8 x half> %b, <8 x i1> %m, i32 %evl) ret <8 x half> %v @@ -251,12 +255,12 @@ define <8 x half> @vfsub_vv_v8f16_unmasked(<8 x half> %va, <8 x half> %b, i32 ze ; ; ZVFHMIN-LABEL: vfsub_vv_v8f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, 
ta, ma ; ZVFHMIN-NEXT: vfsub.vv v10, v12, v10 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret %v = call <8 x half> @llvm.vp.fsub.v8f16(<8 x half> %va, <8 x half> %b, <8 x i1> splat (i1 true), i32 %evl) @@ -275,12 +279,13 @@ define <8 x half> @vfsub_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zero ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v10, v10, v12, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -300,11 +305,12 @@ define <8 x half> @vfsub_vf_v8f16_unmasked(<8 x half> %va, half %b, i32 zeroext ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v10, v10, v12 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 @@ -324,13 +330,13 @@ define <16 x half> @vfsub_vv_v16f16(<16 x half> %va, <16 x half> %b, <16 x i1> % ; ; ZVFHMIN-LABEL: vfsub_vv_v16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v12, v16, v12, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t ; ZVFHMIN-NEXT: ret %v = call <16 x half> @llvm.vp.fsub.v16f16(<16 x half> %va, <16 x half> %b, <16 x i1> %m, i32 %evl) ret <16 x half> %v @@ -345,12 +351,12 @@ define <16 x half> @vfsub_vv_v16f16_unmasked(<16 x half> %va, <16 x half> %b, i3 ; ; ZVFHMIN-LABEL: vfsub_vv_v16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v12, v16, v12 -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %v = call <16 x half> @llvm.vp.fsub.v16f16(<16 x 
half> %va, <16 x half> %b, <16 x i1> splat (i1 true), i32 %evl) @@ -369,12 +375,13 @@ define <16 x half> @vfsub_vf_v16f16(<16 x half> %va, half %b, <16 x i1> %m, i32 ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v12, v12, v16, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement <16 x half> poison, half %b, i32 0 %vb = shufflevector <16 x half> %elt.head, <16 x half> poison, <16 x i32> zeroinitializer @@ -394,11 +401,12 @@ define <16 x half> @vfsub_vf_v16f16_unmasked(<16 x half> %va, half %b, i32 zeroe ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v12, v12, v16 -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <16 x half> poison, half %b, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll index 74a00c655d526..f9b5095c9af1d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll @@ -17,23 +17,27 @@ declare @llvm.vp.floor.nxv1bf16(, @vp_floor_nxv1bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_nxv1bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vfabs.v v8, v9, v0.t -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfabs.v v11, v10, v0.t +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t +; CHECK-NEXT: vmflt.vf v8, v11, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: vfcvt.x.f.v v11, v10, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v11, v11, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: vfsgnj.vv v10, v11, v10, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t ; CHECK-NEXT: ret %v = call @llvm.vp.floor.nxv1bf16( %va, %m, i32 %evl) ret %v @@ -42,12 +46,12 @@ define @vp_floor_nxv1bf16( 
%va, @vp_floor_nxv1bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_nxv1bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -55,7 +59,7 @@ define @vp_floor_nxv1bf16_unmasked( % ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: ret %v = call @llvm.vp.floor.nxv1bf16( %va, splat (i1 true), i32 %evl) @@ -67,23 +71,27 @@ declare @llvm.vp.floor.nxv2bf16(, @vp_floor_nxv2bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_nxv2bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vfabs.v v8, v9, v0.t -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfabs.v v11, v10, v0.t +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t +; CHECK-NEXT: vmflt.vf v8, v11, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vmv.v.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: vfcvt.x.f.v v11, v10, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v11, v11, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: vfsgnj.vv v10, v11, v10, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t ; CHECK-NEXT: ret %v = call @llvm.vp.floor.nxv2bf16( %va, %m, i32 %evl) ret %v @@ -92,12 +100,12 @@ define @vp_floor_nxv2bf16( %va, @vp_floor_nxv2bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_nxv2bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -105,7 +113,7 @@ define @vp_floor_nxv2bf16_unmasked( % ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: 
vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: ret %v = call @llvm.vp.floor.nxv2bf16( %va, splat (i1 true), i32 %evl) @@ -117,25 +125,27 @@ declare @llvm.vp.floor.nxv4bf16(, @vp_floor_nxv4bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_nxv4bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vmv1r.v v9, v0 -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v12, v10, v0.t -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; CHECK-NEXT: vmflt.vf v9, v12, fa5, v0.t +; CHECK-NEXT: vmflt.vf v8, v12, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v10, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v10, v12, v10, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t ; CHECK-NEXT: ret %v = call @llvm.vp.floor.nxv4bf16( %va, %m, i32 %evl) ret %v @@ -144,12 +154,12 @@ define @vp_floor_nxv4bf16( %va, @vp_floor_nxv4bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_nxv4bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ -157,7 +167,7 @@ define @vp_floor_nxv4bf16_unmasked( % ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.floor.nxv4bf16( %va, splat (i1 true), i32 %evl) @@ -169,25 +179,27 @@ declare @llvm.vp.floor.nxv8bf16(, @vp_floor_nxv8bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_nxv8bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv1r.v v10, v0 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfabs.v v16, v12, v0.t -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; CHECK-NEXT: vmflt.vf v10, v16, fa5, v0.t +; CHECK-NEXT: vmflt.vf v8, v16, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: 
vfcvt.x.f.v v16, v12, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v12, v16, v12, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t ; CHECK-NEXT: ret %v = call @llvm.vp.floor.nxv8bf16( %va, %m, i32 %evl) ret %v @@ -196,12 +208,12 @@ define @vp_floor_nxv8bf16( %va, @vp_floor_nxv8bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_nxv8bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfabs.v v8, v12 -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t @@ -209,7 +221,7 @@ define @vp_floor_nxv8bf16_unmasked( % ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 ; CHECK-NEXT: ret %v = call @llvm.vp.floor.nxv8bf16( %va, splat (i1 true), i32 %evl) @@ -221,25 +233,27 @@ declare @llvm.vp.floor.nxv16bf16(, define @vp_floor_nxv16bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_nxv16bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vmv1r.v v12, v0 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v8, v24, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t ; CHECK-NEXT: ret %v = call @llvm.vp.floor.nxv16bf16( %va, %m, i32 %evl) ret %v @@ -248,12 +262,12 @@ define @vp_floor_nxv16bf16( %va, @vp_floor_nxv16bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_nxv16bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v8, v16 -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fmv.w.x fa5, a0 ; 
CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t @@ -261,7 +275,7 @@ define @vp_floor_nxv16bf16_unmasked( @llvm.vp.floor.nxv16bf16( %va, splat (i1 true), i32 %evl) @@ -279,59 +293,64 @@ define @vp_floor_nxv32bf16( %va, @vp_floor_nxv32bf16( %va, @vp_floor_nxv32bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_nxv32bf16_unmasked: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma -; CHECK-NEXT: vmset.m v24 -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vmset.m v16 ; CHECK-NEXT: lui a3, 307200 ; CHECK-NEXT: slli a1, a2, 1 ; CHECK-NEXT: srli a2, a2, 2 ; CHECK-NEXT: fmv.w.x fa5, a3 ; CHECK-NEXT: sub a3, a0, a1 ; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v12, v24, a2 +; CHECK-NEXT: vslidedown.vx v16, v16, a2 ; CHECK-NEXT: sltu a2, a0, a3 +; CHECK-NEXT: vmv1r.v v17, v16 ; CHECK-NEXT: addi a2, a2, -1 ; CHECK-NEXT: and a2, a2, a3 -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: addi a3, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v8, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v17, v8, fa5, v0.t ; CHECK-NEXT: fsrmi a2, 2 -; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vmv1r.v v0, v17 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t ; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24, v0.t ; CHECK-NEXT: bltu a0, a1, .LBB11_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB11_2: -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16 ; CHECK-NEXT: vmflt.vf v0, v24, fa5 ; CHECK-NEXT: fsrmi a0, 2 @@ -390,8 +422,14 @@ define @vp_floor_nxv32bf16_unmasked( @llvm.vp.floor.nxv32bf16( %va, splat (i1 true), i32 %evl) ret %v @@ -418,23 +456,27 @@ define @vp_floor_nxv1f16( %va, @llvm.vp.floor.nxv1f16( %va, %m, i32 %evl) ret %v @@ -458,12 +500,12 @@ define @vp_floor_nxv1f16_unmasked( %va, i ; ; ZVFHMIN-LABEL: vp_floor_nxv1f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma 
; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -471,7 +513,7 @@ define @vp_floor_nxv1f16_unmasked( %va, i ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.floor.nxv1f16( %va, splat (i1 true), i32 %evl) @@ -500,23 +542,27 @@ define @vp_floor_nxv2f16( %va, @llvm.vp.floor.nxv2f16( %va, %m, i32 %evl) ret %v @@ -540,12 +586,12 @@ define @vp_floor_nxv2f16_unmasked( %va, i ; ; ZVFHMIN-LABEL: vp_floor_nxv2f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -553,7 +599,7 @@ define @vp_floor_nxv2f16_unmasked( %va, i ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.floor.nxv2f16( %va, splat (i1 true), i32 %evl) @@ -582,25 +628,27 @@ define @vp_floor_nxv4f16( %va, @llvm.vp.floor.nxv4f16( %va, %m, i32 %evl) ret %v @@ -624,12 +672,12 @@ define @vp_floor_nxv4f16_unmasked( %va, i ; ; ZVFHMIN-LABEL: vp_floor_nxv4f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ -637,7 +685,7 @@ define @vp_floor_nxv4f16_unmasked( %va, i ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v10, v8, v10, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.floor.nxv4f16( %va, splat (i1 true), i32 %evl) @@ -668,25 +716,27 @@ define @vp_floor_nxv8f16( %va, @llvm.vp.floor.nxv8f16( %va, %m, i32 %evl) ret %v @@ -710,12 +760,12 @@ define @vp_floor_nxv8f16_unmasked( %va, i ; ; ZVFHMIN-LABEL: vp_floor_nxv8f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; 
ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t @@ -723,7 +773,7 @@ define @vp_floor_nxv8f16_unmasked( %va, i ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v12, v8, v12, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.floor.nxv8f16( %va, splat (i1 true), i32 %evl) @@ -754,25 +804,27 @@ define @vp_floor_nxv16f16( %va, @llvm.vp.floor.nxv16f16( %va, %m, i32 %evl) ret %v @@ -796,12 +848,12 @@ define @vp_floor_nxv16f16_unmasked( %va ; ; ZVFHMIN-LABEL: vp_floor_nxv16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v16 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t @@ -809,7 +861,7 @@ define @vp_floor_nxv16f16_unmasked( %va ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v16, v8, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.floor.nxv16f16( %va, splat (i1 true), i32 %evl) @@ -846,59 +898,64 @@ define @vp_floor_nxv32f16( %va, @vp_floor_nxv32f16_unmasked( %va ; ; ZVFHMIN-LABEL: vp_floor_nxv32f16_unmasked: ; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: sub sp, sp, a1 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v24 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vmset.m v16 ; ZVFHMIN-NEXT: lui a3, 307200 ; ZVFHMIN-NEXT: slli a1, a2, 1 ; ZVFHMIN-NEXT: srli a2, a2, 2 ; ZVFHMIN-NEXT: fmv.w.x fa5, a3 ; ZVFHMIN-NEXT: sub a3, a0, a1 ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v12, v24, a2 +; ZVFHMIN-NEXT: vslidedown.vx v16, v16, a2 ; ZVFHMIN-NEXT: sltu a2, a0, a3 +; ZVFHMIN-NEXT: vmv1r.v v17, v16 ; ZVFHMIN-NEXT: addi a2, a2, -1 ; ZVFHMIN-NEXT: and a2, a2, a3 -; ZVFHMIN-NEXT: vmv1r.v v0, v12 -; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfabs.v v24, v16, v0.t +; ZVFHMIN-NEXT: vmv1r.v v0, v16 +; ZVFHMIN-NEXT: addi a3, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v24, v0.t ; ZVFHMIN-NEXT: vsetvli zero, 
zero, e32, m8, ta, mu -; ZVFHMIN-NEXT: vmflt.vf v12, v24, fa5, v0.t +; ZVFHMIN-NEXT: vmflt.vf v17, v8, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a2, 2 -; ZVFHMIN-NEXT: vmv1r.v v0, v12 +; ZVFHMIN-NEXT: vmv1r.v v0, v17 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v24, v0.t ; ZVFHMIN-NEXT: fsrm a2 -; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; ZVFHMIN-NEXT: vmv1r.v v0, v16 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB23_2: -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v24, v16 ; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 2 @@ -972,8 +1042,14 @@ define @vp_floor_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add sp, sp, a0 +; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 +; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.floor.nxv32f16( %va, splat (i1 true), i32 %evl) ret %v diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll index 7e0c3f45de463..d56e46f7db3ab 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll @@ -21,20 +21,18 @@ define @vfmax_vv_nxv1bf16( %va, @llvm.vp.maximum.nxv1bf16( %va, %vb, %m, i32 %evl) ret %v @@ -54,7 +52,7 @@ define @vfmax_vv_nxv1bf16_unmasked( % ; CHECK-NEXT: vmfeq.vv v0, v8, v8 ; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 ; CHECK-NEXT: vfmax.vv v9, v8, v9 -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: ret %v = call @llvm.vp.maximum.nxv1bf16( %va, %vb, splat (i1 true), i32 %evl) @@ -68,20 +66,18 @@ define @vfmax_vv_nxv2bf16( %va, @llvm.vp.maximum.nxv2bf16( %va, %vb, %m, i32 %evl) ret %v @@ -101,7 +97,7 @@ define @vfmax_vv_nxv2bf16_unmasked( % ; CHECK-NEXT: vmfeq.vv v0, v8, v8 ; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 ; CHECK-NEXT: vfmax.vv v9, v8, v9 -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: ret %v = call @llvm.vp.maximum.nxv2bf16( %va, %vb, splat (i1 true), i32 %evl) @@ -115,22 +111,20 @@ define @vfmax_vv_nxv4bf16( %va, @llvm.vp.maximum.nxv4bf16( %va, %vb, %m, i32 %evl) ret %v @@ -150,7 +144,7 @@ define @vfmax_vv_nxv4bf16_unmasked( % ; CHECK-NEXT: vmfeq.vv v0, v12, v12 ; CHECK-NEXT: vmerge.vvm v10, v12, v10, v0 ; CHECK-NEXT: vfmax.vv v10, v10, v8 -; 
CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.maximum.nxv4bf16( %va, %vb, splat (i1 true), i32 %evl) @@ -164,22 +158,20 @@ define @vfmax_vv_nxv8bf16( %va, @llvm.vp.maximum.nxv8bf16( %va, %vb, %m, i32 %evl) ret %v @@ -199,7 +191,7 @@ define @vfmax_vv_nxv8bf16_unmasked( % ; CHECK-NEXT: vmfeq.vv v0, v16, v16 ; CHECK-NEXT: vmerge.vvm v12, v16, v12, v0 ; CHECK-NEXT: vfmax.vv v12, v12, v8 -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 ; CHECK-NEXT: ret %v = call @llvm.vp.maximum.nxv8bf16( %va, %vb, splat (i1 true), i32 %evl) @@ -214,32 +206,58 @@ define @vfmax_vv_nxv16bf16( %va, @vfmax_vv_nxv16bf16_unmasked( @vfmax_vv_nxv32bf16( %va, @vfmax_vv_nxv32bf16_unmasked( @vfmax_vv_nxv32bf16_unmasked( @vfmax_vv_nxv1f16( %va, @llvm.vp.maximum.nxv1f16( %va, %vb, %m, i32 %evl) ret %v @@ -633,7 +643,7 @@ define @vfmax_vv_nxv1f16_unmasked( %va, < ; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8 ; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0 ; ZVFHMIN-NEXT: vfmax.vv v9, v8, v9 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.maximum.nxv1f16( %va, %vb, splat (i1 true), i32 %evl) @@ -660,20 +670,18 @@ define @vfmax_vv_nxv2f16( %va, @llvm.vp.maximum.nxv2f16( %va, %vb, %m, i32 %evl) ret %v @@ -703,7 +711,7 @@ define @vfmax_vv_nxv2f16_unmasked( %va, < ; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8 ; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0 ; ZVFHMIN-NEXT: vfmax.vv v9, v8, v9 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.maximum.nxv2f16( %va, %vb, splat (i1 true), i32 %evl) @@ -730,22 +738,20 @@ define @vfmax_vv_nxv4f16( %va, @llvm.vp.maximum.nxv4f16( %va, %vb, %m, i32 %evl) ret %v @@ -775,7 +781,7 @@ define @vfmax_vv_nxv4f16_unmasked( %va, < ; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12 ; ZVFHMIN-NEXT: vmerge.vvm v10, v12, v10, v0 ; ZVFHMIN-NEXT: vfmax.vv v10, v10, v8 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.maximum.nxv4f16( %va, %vb, splat (i1 true), i32 %evl) @@ -804,22 +810,20 @@ define @vfmax_vv_nxv8f16( %va, @llvm.vp.maximum.nxv8f16( %va, %vb, %m, i32 %evl) ret %v @@ -849,7 +853,7 @@ define @vfmax_vv_nxv8f16_unmasked( %va, < ; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16 ; ZVFHMIN-NEXT: vmerge.vvm v12, v16, v12, v0 ; ZVFHMIN-NEXT: vfmax.vv v12, v12, v8 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.maximum.nxv8f16( %va, %vb, splat (i1 true), i32 %evl) @@ -879,32 +883,58 @@ define @vfmax_vv_nxv16f16( %va, @vfmax_vv_nxv16f16_unmasked( %va ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16 -; ZVFHMIN-NEXT: 
vmfeq.vv v7, v24, v24 -; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v24, v0 +; ZVFHMIN-NEXT: vmfeq.vv v0, v24, v24 +; ZVFHMIN-NEXT: vmfeq.vv v7, v16, v16 +; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v16, v0 ; ZVFHMIN-NEXT: addi a0, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vmv1r.v v0, v7 -; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v16, v0 +; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v24, v0 ; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfmax.vv v16, v8, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 @@ -997,147 +1027,136 @@ define @vfmax_vv_nxv32f16( %va, @vfmax_vv_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 25 -; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: slli a1, a1, 5 ; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x19, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 25 * vlenb -; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmv8r.v v0, v8 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: li a2, 24 +; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: vmset.m v24 +; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmset.m v16 ; ZVFHMIN-NEXT: slli a1, a2, 1 ; ZVFHMIN-NEXT: srli a2, a2, 2 ; ZVFHMIN-NEXT: sub a3, a0, a1 ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v8, v24, a2 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vslidedown.vx v24, v16, a2 ; ZVFHMIN-NEXT: sltu a2, a0, a3 ; ZVFHMIN-NEXT: addi a2, a2, -1 ; ZVFHMIN-NEXT: and a2, a2, a3 +; ZVFHMIN-NEXT: vmv1r.v v0, v24 ; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 3 ; ZVFHMIN-NEXT: add a3, sp, a3 ; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v0, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v4 -; ZVFHMIN-NEXT: vmv1r.v v0, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v12, v24, v24, v0.t -; ZVFHMIN-NEXT: vmv8r.v v0, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a3, a2, 4 -; ZVFHMIN-NEXT: add a2, a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v4 -; ZVFHMIN-NEXT: vmv1r.v v0, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v16, v0 +; ZVFHMIN-NEXT: vmfeq.vv v25, v16, v16, v0.t ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a3, a2, 3 -; ZVFHMIN-NEXT: add a2, a3, a2 +; ZVFHMIN-NEXT: li a3, 24 +; ZVFHMIN-NEXT: mul a2, a2, a3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: 
vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t +; ZVFHMIN-NEXT: vmv1r.v v0, v25 +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 4 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v16, v0 ; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl1r.v v13, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vmv1r.v v0, v13 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vmv1r.v v0, v24 ; ZVFHMIN-NEXT: vmfeq.vv v12, v16, v16, v0.t ; ZVFHMIN-NEXT: vmv1r.v v0, v12 -; ZVFHMIN-NEXT: vmerge.vvm v16, v16, v24, v0 -; ZVFHMIN-NEXT: vmv1r.v v0, v13 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a3, a2, 3 -; ZVFHMIN-NEXT: add a2, a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfmax.vv v16, v16, v24, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vmerge.vvm v16, v16, v8, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v24 +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfmax.vv v16, v16, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a3, a2, 3 -; ZVFHMIN-NEXT: add a2, a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill @@ -1236,48 +1261,47 @@ define @vfmax_vv_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB23_2: ; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a1, a0, 4 -; ZVFHMIN-NEXT: add a0, a1, a0 +; ZVFHMIN-NEXT: li a1, 24 +; ZVFHMIN-NEXT: mul a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8 -; ZVFHMIN-NEXT: vmfeq.vv v7, v16, v16 -; ZVFHMIN-NEXT: vmerge.vvm v24, v8, v16, v0 +; ZVFHMIN-NEXT: vmfeq.vv v7, v24, v24 +; ZVFHMIN-NEXT: vmerge.vvm v16, v8, v24, v0 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a1, a0, 4 -; ZVFHMIN-NEXT: add a0, a1, a0 +; ZVFHMIN-NEXT: li a1, 24 +; ZVFHMIN-NEXT: mul a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vmv1r.v v0, v7 -; ZVFHMIN-NEXT: vmerge.vvm v16, v16, v8, v0 +; ZVFHMIN-NEXT: vmerge.vvm v16, v24, v8, v0 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a1, a0, 4 -; ZVFHMIN-NEXT: add a0, a1, a0 +; ZVFHMIN-NEXT: li a1, 24 +; 
ZVFHMIN-NEXT: mul a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfmax.vv v16, v16, v24 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a1, a0, 3 -; ZVFHMIN-NEXT: add a0, a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: li a1, 25 -; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: slli a0, a0, 5 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll index d877073c3487d..81e4a548f560e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll @@ -21,20 +21,18 @@ define @vfmin_vv_nxv1bf16( %va, @llvm.vp.minimum.nxv1bf16( %va, %vb, %m, i32 %evl) ret %v @@ -54,7 +52,7 @@ define @vfmin_vv_nxv1bf16_unmasked( % ; CHECK-NEXT: vmfeq.vv v0, v8, v8 ; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 ; CHECK-NEXT: vfmin.vv v9, v8, v9 -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: ret %v = call @llvm.vp.minimum.nxv1bf16( %va, %vb, splat (i1 true), i32 %evl) @@ -68,20 +66,18 @@ define @vfmin_vv_nxv2bf16( %va, @llvm.vp.minimum.nxv2bf16( %va, %vb, %m, i32 %evl) ret %v @@ -101,7 +97,7 @@ define @vfmin_vv_nxv2bf16_unmasked( % ; CHECK-NEXT: vmfeq.vv v0, v8, v8 ; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 ; CHECK-NEXT: vfmin.vv v9, v8, v9 -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: ret %v = call @llvm.vp.minimum.nxv2bf16( %va, %vb, splat (i1 true), i32 %evl) @@ -115,22 +111,20 @@ define @vfmin_vv_nxv4bf16( %va, @llvm.vp.minimum.nxv4bf16( %va, %vb, %m, i32 %evl) ret %v @@ -150,7 +144,7 @@ define @vfmin_vv_nxv4bf16_unmasked( % ; CHECK-NEXT: vmfeq.vv v0, v12, v12 ; CHECK-NEXT: vmerge.vvm v10, v12, v10, v0 ; CHECK-NEXT: vfmin.vv v10, v10, v8 -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.minimum.nxv4bf16( %va, %vb, splat (i1 true), i32 %evl) @@ -164,22 +158,20 @@ define @vfmin_vv_nxv8bf16( %va, @llvm.vp.minimum.nxv8bf16( %va, %vb, %m, i32 %evl) ret %v @@ -199,7 +191,7 @@ define @vfmin_vv_nxv8bf16_unmasked( % ; CHECK-NEXT: vmfeq.vv v0, v16, v16 ; CHECK-NEXT: vmerge.vvm v12, v16, v12, v0 ; CHECK-NEXT: vfmin.vv v12, v12, v8 -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 ; CHECK-NEXT: ret %v = call @llvm.vp.minimum.nxv8bf16( %va, %vb, splat (i1 true), i32 %evl) @@ -214,32 +206,58 @@ define @vfmin_vv_nxv16bf16( %va, @vfmin_vv_nxv16bf16_unmasked( @vfmin_vv_nxv32bf16( %va, @vfmin_vv_nxv32bf16_unmasked( @vfmin_vv_nxv32bf16_unmasked( @vfmin_vv_nxv1f16( %va, @llvm.vp.minimum.nxv1f16( %va, %vb, %m, i32 %evl) ret %v @@ -633,7 +643,7 @@ define @vfmin_vv_nxv1f16_unmasked( %va, < ; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8 ; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0 ; ZVFHMIN-NEXT: vfmin.vv v9, v8, v9 -; ZVFHMIN-NEXT: vsetvli a0, 
zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.minimum.nxv1f16( %va, %vb, splat (i1 true), i32 %evl) @@ -660,20 +670,18 @@ define @vfmin_vv_nxv2f16( %va, @llvm.vp.minimum.nxv2f16( %va, %vb, %m, i32 %evl) ret %v @@ -703,7 +711,7 @@ define @vfmin_vv_nxv2f16_unmasked( %va, < ; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8 ; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0 ; ZVFHMIN-NEXT: vfmin.vv v9, v8, v9 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.minimum.nxv2f16( %va, %vb, splat (i1 true), i32 %evl) @@ -730,22 +738,20 @@ define @vfmin_vv_nxv4f16( %va, @llvm.vp.minimum.nxv4f16( %va, %vb, %m, i32 %evl) ret %v @@ -775,7 +781,7 @@ define @vfmin_vv_nxv4f16_unmasked( %va, < ; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12 ; ZVFHMIN-NEXT: vmerge.vvm v10, v12, v10, v0 ; ZVFHMIN-NEXT: vfmin.vv v10, v10, v8 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.minimum.nxv4f16( %va, %vb, splat (i1 true), i32 %evl) @@ -804,22 +810,20 @@ define @vfmin_vv_nxv8f16( %va, @llvm.vp.minimum.nxv8f16( %va, %vb, %m, i32 %evl) ret %v @@ -849,7 +853,7 @@ define @vfmin_vv_nxv8f16_unmasked( %va, < ; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16 ; ZVFHMIN-NEXT: vmerge.vvm v12, v16, v12, v0 ; ZVFHMIN-NEXT: vfmin.vv v12, v12, v8 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.minimum.nxv8f16( %va, %vb, splat (i1 true), i32 %evl) @@ -879,32 +883,58 @@ define @vfmin_vv_nxv16f16( %va, @vfmin_vv_nxv16f16_unmasked( %va ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16 -; ZVFHMIN-NEXT: vmfeq.vv v7, v24, v24 -; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v24, v0 +; ZVFHMIN-NEXT: vmfeq.vv v0, v24, v24 +; ZVFHMIN-NEXT: vmfeq.vv v7, v16, v16 +; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v16, v0 ; ZVFHMIN-NEXT: addi a0, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vmv1r.v v0, v7 -; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v16, v0 +; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v24, v0 ; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfmin.vv v16, v8, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 @@ -997,147 +1027,136 @@ define @vfmin_vv_nxv32f16( %va, @vfmin_vv_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 25 -; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: slli a1, a1, 5 ; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x19, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 25 * vlenb -; ZVFHMIN-NEXT: vsetvli a1, zero, e8, 
m4, ta, ma -; ZVFHMIN-NEXT: vmv8r.v v0, v8 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: li a2, 24 +; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: vmset.m v24 +; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmset.m v16 ; ZVFHMIN-NEXT: slli a1, a2, 1 ; ZVFHMIN-NEXT: srli a2, a2, 2 ; ZVFHMIN-NEXT: sub a3, a0, a1 ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v8, v24, a2 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vslidedown.vx v24, v16, a2 ; ZVFHMIN-NEXT: sltu a2, a0, a3 ; ZVFHMIN-NEXT: addi a2, a2, -1 ; ZVFHMIN-NEXT: and a2, a2, a3 +; ZVFHMIN-NEXT: vmv1r.v v0, v24 ; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 3 ; ZVFHMIN-NEXT: add a3, sp, a3 ; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v0, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v4 -; ZVFHMIN-NEXT: vmv1r.v v0, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v12, v24, v24, v0.t -; ZVFHMIN-NEXT: vmv8r.v v0, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a3, a2, 4 -; ZVFHMIN-NEXT: add a2, a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v4 -; ZVFHMIN-NEXT: vmv1r.v v0, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v16, v0 +; ZVFHMIN-NEXT: vmfeq.vv v25, v16, v16, v0.t ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a3, a2, 3 -; ZVFHMIN-NEXT: add a2, a3, a2 +; ZVFHMIN-NEXT: li a3, 24 +; ZVFHMIN-NEXT: mul a2, a2, a3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t +; ZVFHMIN-NEXT: vmv1r.v v0, v25 +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 4 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v16, v0 ; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl1r.v v13, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vmv1r.v v0, v13 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vmv1r.v v0, v24 ; ZVFHMIN-NEXT: vmfeq.vv v12, v16, v16, v0.t ; ZVFHMIN-NEXT: vmv1r.v v0, v12 -; ZVFHMIN-NEXT: vmerge.vvm v16, v16, v24, v0 -; ZVFHMIN-NEXT: vmv1r.v v0, v13 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a3, a2, 3 -; ZVFHMIN-NEXT: add a2, a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfmin.vv v16, v16, v24, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; 
ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vmerge.vvm v16, v16, v8, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v24 +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfmin.vv v16, v16, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a3, a2, 3 -; ZVFHMIN-NEXT: add a2, a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill @@ -1236,48 +1261,47 @@ define @vfmin_vv_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB23_2: ; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a1, a0, 4 -; ZVFHMIN-NEXT: add a0, a1, a0 +; ZVFHMIN-NEXT: li a1, 24 +; ZVFHMIN-NEXT: mul a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8 -; ZVFHMIN-NEXT: vmfeq.vv v7, v16, v16 -; ZVFHMIN-NEXT: vmerge.vvm v24, v8, v16, v0 +; ZVFHMIN-NEXT: vmfeq.vv v7, v24, v24 +; ZVFHMIN-NEXT: vmerge.vvm v16, v8, v24, v0 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a1, a0, 4 -; ZVFHMIN-NEXT: add a0, a1, a0 +; ZVFHMIN-NEXT: li a1, 24 +; ZVFHMIN-NEXT: mul a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vmv1r.v v0, v7 -; ZVFHMIN-NEXT: vmerge.vvm v16, v16, v8, v0 +; ZVFHMIN-NEXT: vmerge.vvm v16, v24, v8, v0 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a1, a0, 4 -; ZVFHMIN-NEXT: add a0, a1, a0 +; ZVFHMIN-NEXT: li a1, 24 +; ZVFHMIN-NEXT: mul a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfmin.vv v16, v16, v24 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a1, a0, 3 -; ZVFHMIN-NEXT: add a0, a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: li a1, 25 -; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: slli a0, a0, 5 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll index 9aa26e59c6a03..7d3700492ea7b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll @@ -17,22 +17,26 @@ declare @llvm.vp.nearbyint.nxv1bf16(, define @vp_nearbyint_nxv1bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: 
vp_nearbyint_nxv1bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vfabs.v v8, v9, v0.t -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfabs.v v11, v10, v0.t +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t +; CHECK-NEXT: vmflt.vf v8, v11, fa5, v0.t ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t -; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v11, v10, v0.t +; CHECK-NEXT: vfcvt.f.x.v v11, v11, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: vfsgnj.vv v10, v11, v10, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t ; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv1bf16( %va, %m, i32 %evl) @@ -42,19 +46,19 @@ define @vp_nearbyint_nxv1bf16( %va, < define @vp_nearbyint_nxv1bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv1bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret @@ -67,22 +71,26 @@ declare @llvm.vp.nearbyint.nxv2bf16(, define @vp_nearbyint_nxv2bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv2bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vfabs.v v8, v9, v0.t -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfabs.v v11, v10, v0.t +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t +; CHECK-NEXT: vmflt.vf v8, v11, fa5, v0.t ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmv.v.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t -; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v11, v10, v0.t +; CHECK-NEXT: vfcvt.f.x.v v11, v11, 
v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: vfsgnj.vv v10, v11, v10, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t ; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv2bf16( %va, %m, i32 %evl) @@ -92,19 +100,19 @@ define @vp_nearbyint_nxv2bf16( %va, < define @vp_nearbyint_nxv2bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv2bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret @@ -117,24 +125,26 @@ declare @llvm.vp.nearbyint.nxv4bf16(, define @vp_nearbyint_nxv4bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv4bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vmv1r.v v9, v0 -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v12, v10, v0.t -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; CHECK-NEXT: vmflt.vf v9, v12, fa5, v0.t +; CHECK-NEXT: vmflt.vf v8, v12, fa5, v0.t ; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v10, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v10, v12, v10, v0.t -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t ; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv4bf16( %va, %m, i32 %evl) @@ -144,19 +154,19 @@ define @vp_nearbyint_nxv4bf16( %va, < define @vp_nearbyint_nxv4bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv4bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: 
vfcvt.x.f.v v8, v10, v0.t ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 ; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret @@ -169,24 +179,26 @@ declare @llvm.vp.nearbyint.nxv8bf16(, define @vp_nearbyint_nxv8bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv8bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv1r.v v10, v0 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfabs.v v16, v12, v0.t -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; CHECK-NEXT: vmflt.vf v10, v16, fa5, v0.t +; CHECK-NEXT: vmflt.vf v8, v16, fa5, v0.t ; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v12, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v12, v16, v12, v0.t -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t ; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv8bf16( %va, %m, i32 %evl) @@ -196,19 +208,19 @@ define @vp_nearbyint_nxv8bf16( %va, < define @vp_nearbyint_nxv8bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv8bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfabs.v v8, v12 -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 ; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret @@ -221,24 +233,26 @@ declare @llvm.vp.nearbyint.nxv16bf16( @vp_nearbyint_nxv16bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv16bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vmv1r.v v12, v0 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t +; 
CHECK-NEXT: vmflt.vf v8, v24, fa5, v0.t ; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t ; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv16bf16( %va, %m, i32 %evl) @@ -248,19 +262,19 @@ define @vp_nearbyint_nxv16bf16( %va define @vp_nearbyint_nxv16bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv16bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v8, v16 -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 ; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret @@ -273,55 +287,76 @@ declare @llvm.vp.nearbyint.nxv32bf16( @vp_nearbyint_nxv32bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv32bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; CHECK-NEXT: vmv1r.v v7, v0 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 ; CHECK-NEXT: lui a3, 307200 ; CHECK-NEXT: slli a1, a2, 1 ; CHECK-NEXT: srli a2, a2, 2 ; CHECK-NEXT: fmv.w.x fa5, a3 ; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v12, v0, a2 +; CHECK-NEXT: vslidedown.vx v17, v0, a2 ; CHECK-NEXT: sltu a2, a0, a3 +; CHECK-NEXT: vmv1r.v v18, v17 ; CHECK-NEXT: addi a2, a2, -1 ; CHECK-NEXT: and a2, a2, a3 -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v24, v0.t +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: addi a3, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v8, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t +; CHECK-NEXT: vmflt.vf v18, v8, fa5, v0.t ; CHECK-NEXT: frflags a2 -; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vmv1r.v v0, v18 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t +; 
CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: fsflags a2 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 +; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24, v0.t ; CHECK-NEXT: bltu a0, a1, .LBB10_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB10_2: -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16, v0.t +; CHECK-NEXT: vmv1r.v v8, v7 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vmflt.vf v7, v16, fa5, v0.t +; CHECK-NEXT: vmflt.vf v8, v16, fa5, v0.t ; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24, v0.t ; CHECK-NEXT: fsflags a0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv32bf16( %va, %m, i32 %evl) ret %v @@ -330,42 +365,55 @@ define @vp_nearbyint_nxv32bf16( %va define @vp_nearbyint_nxv32bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv32bf16_unmasked: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma -; CHECK-NEXT: vmset.m v24 -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vmset.m v16 ; CHECK-NEXT: lui a3, 307200 ; CHECK-NEXT: slli a1, a2, 1 ; CHECK-NEXT: srli a2, a2, 2 ; CHECK-NEXT: fmv.w.x fa5, a3 ; CHECK-NEXT: sub a3, a0, a1 ; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v12, v24, a2 +; CHECK-NEXT: vslidedown.vx v16, v16, a2 ; CHECK-NEXT: sltu a2, a0, a3 +; CHECK-NEXT: vmv1r.v v17, v16 ; CHECK-NEXT: addi a2, a2, -1 ; CHECK-NEXT: and a2, a2, a3 -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: addi a3, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v8, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t +; CHECK-NEXT: 
vmflt.vf v17, v8, fa5, v0.t ; CHECK-NEXT: frflags a2 -; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vmv1r.v v0, v17 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: fsflags a2 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24, v0.t ; CHECK-NEXT: bltu a0, a1, .LBB11_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB11_2: -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16 ; CHECK-NEXT: vmflt.vf v0, v24, fa5 ; CHECK-NEXT: frflags a0 @@ -373,9 +421,15 @@ define @vp_nearbyint_nxv32bf16_unmasked( @llvm.vp.nearbyint.nxv32bf16( %va, splat (i1 true), i32 %evl) ret %v @@ -402,22 +456,26 @@ define @vp_nearbyint_nxv1f16( %va, @llvm.vp.nearbyint.nxv1f16( %va, %m, i32 %evl) @@ -442,19 +500,19 @@ define @vp_nearbyint_nxv1f16_unmasked( %v ; ; ZVFHMIN-LABEL: vp_nearbyint_nxv1f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: ret @@ -484,22 +542,26 @@ define @vp_nearbyint_nxv2f16( %va, @llvm.vp.nearbyint.nxv2f16( %va, %m, i32 %evl) @@ -524,19 +586,19 @@ define @vp_nearbyint_nxv2f16_unmasked( %v ; ; ZVFHMIN-LABEL: vp_nearbyint_nxv2f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: ret @@ -566,24 +628,26 @@ define 
@vp_nearbyint_nxv4f16( %va, @llvm.vp.nearbyint.nxv4f16( %va, %m, i32 %evl) @@ -608,19 +672,19 @@ define @vp_nearbyint_nxv4f16_unmasked( %v ; ; ZVFHMIN-LABEL: vp_nearbyint_nxv4f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v10, v8, v10, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: ret @@ -652,24 +716,26 @@ define @vp_nearbyint_nxv8f16( %va, @llvm.vp.nearbyint.nxv8f16( %va, %m, i32 %evl) @@ -694,19 +760,19 @@ define @vp_nearbyint_nxv8f16_unmasked( %v ; ; ZVFHMIN-LABEL: vp_nearbyint_nxv8f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v12, v8, v12, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: ret @@ -738,24 +804,26 @@ define @vp_nearbyint_nxv16f16( %va, @llvm.vp.nearbyint.nxv16f16( %va, %m, i32 %evl) @@ -780,19 +848,19 @@ define @vp_nearbyint_nxv16f16_unmasked( ; ; ZVFHMIN-LABEL: vp_nearbyint_nxv16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v16 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v16, v8, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: ret @@ -824,55 +892,76 @@ define @vp_nearbyint_nxv32f16( %va, @llvm.vp.nearbyint.nxv32f16( %va, %m, i32 %evl) ret %v @@ -896,42 +985,55 @@ define @vp_nearbyint_nxv32f16_unmasked( ; ; ZVFHMIN-LABEL: vp_nearbyint_nxv32f16_unmasked: ; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: csrr a1, vlenb +; 
ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: sub sp, sp, a1 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v24 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vmset.m v16 ; ZVFHMIN-NEXT: lui a3, 307200 ; ZVFHMIN-NEXT: slli a1, a2, 1 ; ZVFHMIN-NEXT: srli a2, a2, 2 ; ZVFHMIN-NEXT: fmv.w.x fa5, a3 ; ZVFHMIN-NEXT: sub a3, a0, a1 ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v12, v24, a2 +; ZVFHMIN-NEXT: vslidedown.vx v16, v16, a2 ; ZVFHMIN-NEXT: sltu a2, a0, a3 +; ZVFHMIN-NEXT: vmv1r.v v17, v16 ; ZVFHMIN-NEXT: addi a2, a2, -1 ; ZVFHMIN-NEXT: and a2, a2, a3 -; ZVFHMIN-NEXT: vmv1r.v v0, v12 -; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfabs.v v24, v16, v0.t +; ZVFHMIN-NEXT: vmv1r.v v0, v16 +; ZVFHMIN-NEXT: addi a3, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v24, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; ZVFHMIN-NEXT: vmflt.vf v12, v24, fa5, v0.t +; ZVFHMIN-NEXT: vmflt.vf v17, v8, fa5, v0.t ; ZVFHMIN-NEXT: frflags a2 -; ZVFHMIN-NEXT: vmv1r.v v0, v12 +; ZVFHMIN-NEXT: vmv1r.v v0, v17 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t -; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v24, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: fsflags a2 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; ZVFHMIN-NEXT: vmv1r.v v0, v16 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB23_2: -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v24, v16 ; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 ; ZVFHMIN-NEXT: frflags a0 @@ -939,9 +1041,15 @@ define @vp_nearbyint_nxv32f16_unmasked( ; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: fsflags a0 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add sp, sp, a0 +; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 +; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.nearbyint.nxv32f16( %va, splat (i1 true), i32 %evl) ret %v diff --git a/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll index 70ea1bc78d2e5..f044eceb9f930 100644 --- a/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll @@ -17,21 +17,25 @@ declare @llvm.vp.rint.nxv1bf16(, @vp_rint_nxv1bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_nxv1bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vfabs.v v8, v9, v0.t -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfabs.v v11, v10, v0.t +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t +; CHECK-NEXT: vmflt.vf v8, v11, fa5, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t -; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v11, v10, v0.t +; CHECK-NEXT: vfcvt.f.x.v v11, v11, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: vfsgnj.vv v10, v11, v10, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t ; CHECK-NEXT: ret %v = call @llvm.vp.rint.nxv1bf16( %va, %m, i32 %evl) ret %v @@ -40,18 +44,18 @@ define @vp_rint_nxv1bf16( %va, @vp_rint_nxv1bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_nxv1bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: ret %v = call @llvm.vp.rint.nxv1bf16( %va, splat (i1 true), i32 %evl) @@ -63,21 +67,25 @@ declare @llvm.vp.rint.nxv2bf16(, @vp_rint_nxv2bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_nxv2bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vfabs.v v8, v9, v0.t -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfabs.v v11, v10, v0.t +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t +; CHECK-NEXT: vmflt.vf v8, v11, fa5, v0.t +; CHECK-NEXT: vmv.v.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t -; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v11, 
v10, v0.t +; CHECK-NEXT: vfcvt.f.x.v v11, v11, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: vfsgnj.vv v10, v11, v10, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t ; CHECK-NEXT: ret %v = call @llvm.vp.rint.nxv2bf16( %va, %m, i32 %evl) ret %v @@ -86,18 +94,18 @@ define @vp_rint_nxv2bf16( %va, @vp_rint_nxv2bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_nxv2bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: ret %v = call @llvm.vp.rint.nxv2bf16( %va, splat (i1 true), i32 %evl) @@ -109,23 +117,25 @@ declare @llvm.vp.rint.nxv4bf16(, @vp_rint_nxv4bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_nxv4bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vmv1r.v v9, v0 -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v12, v10, v0.t -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; CHECK-NEXT: vmflt.vf v9, v12, fa5, v0.t -; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmflt.vf v8, v12, fa5, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v10, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v10, v12, v10, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t ; CHECK-NEXT: ret %v = call @llvm.vp.rint.nxv4bf16( %va, %m, i32 %evl) ret %v @@ -134,18 +144,18 @@ define @vp_rint_nxv4bf16( %va, @vp_rint_nxv4bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_nxv4bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, 
mu ; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.rint.nxv4bf16( %va, splat (i1 true), i32 %evl) @@ -157,23 +167,25 @@ declare @llvm.vp.rint.nxv8bf16(, @vp_rint_nxv8bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_nxv8bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv1r.v v10, v0 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfabs.v v16, v12, v0.t -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; CHECK-NEXT: vmflt.vf v10, v16, fa5, v0.t -; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmflt.vf v8, v16, fa5, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v12, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v12, v16, v12, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t ; CHECK-NEXT: ret %v = call @llvm.vp.rint.nxv8bf16( %va, %m, i32 %evl) ret %v @@ -182,18 +194,18 @@ define @vp_rint_nxv8bf16( %va, @vp_rint_nxv8bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_nxv8bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfabs.v v8, v12 -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 ; CHECK-NEXT: ret %v = call @llvm.vp.rint.nxv8bf16( %va, splat (i1 true), i32 %evl) @@ -205,23 +217,25 @@ declare @llvm.vp.rint.nxv16bf16(, < define @vp_rint_nxv16bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_nxv16bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vmv1r.v v12, v0 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t -; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vmflt.vf v8, v24, fa5, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; 
CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t ; CHECK-NEXT: ret %v = call @llvm.vp.rint.nxv16bf16( %va, %m, i32 %evl) ret %v @@ -230,18 +244,18 @@ define @vp_rint_nxv16bf16( %va, @vp_rint_nxv16bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_nxv16bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v8, v16 -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 ; CHECK-NEXT: ret %v = call @llvm.vp.rint.nxv16bf16( %va, splat (i1 true), i32 %evl) @@ -259,54 +273,60 @@ define @vp_rint_nxv32bf16( %va, @vp_rint_nxv32bf16( %va, @vp_rint_nxv32bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_nxv32bf16_unmasked: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma -; CHECK-NEXT: vmset.m v24 -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vmset.m v16 ; CHECK-NEXT: lui a3, 307200 ; CHECK-NEXT: slli a1, a2, 1 ; CHECK-NEXT: srli a2, a2, 2 ; CHECK-NEXT: fmv.w.x fa5, a3 ; CHECK-NEXT: sub a3, a0, a1 ; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v12, v24, a2 +; CHECK-NEXT: vslidedown.vx v16, v16, a2 ; CHECK-NEXT: sltu a2, a0, a3 +; CHECK-NEXT: vmv1r.v v17, v16 ; CHECK-NEXT: addi a2, a2, -1 ; CHECK-NEXT: and a2, a2, a3 -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: addi a3, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v8, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t -; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vmflt.vf v17, v8, fa5, v0.t +; CHECK-NEXT: vmv1r.v v0, v17 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; 
CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24, v0.t ; CHECK-NEXT: bltu a0, a1, .LBB11_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB11_2: -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16 ; CHECK-NEXT: vmflt.vf v0, v24, fa5 ; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %v = call @llvm.vp.rint.nxv32bf16( %va, splat (i1 true), i32 %evl) ret %v @@ -387,21 +426,25 @@ define @vp_rint_nxv1f16( %va, @llvm.vp.rint.nxv1f16( %va, %m, i32 %evl) ret %v @@ -423,18 +466,18 @@ define @vp_rint_nxv1f16_unmasked( %va, i3 ; ; ZVFHMIN-LABEL: vp_rint_nxv1f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.rint.nxv1f16( %va, splat (i1 true), i32 %evl) @@ -461,21 +504,25 @@ define @vp_rint_nxv2f16( %va, @llvm.vp.rint.nxv2f16( %va, %m, i32 %evl) ret %v @@ -497,18 +544,18 @@ define @vp_rint_nxv2f16_unmasked( %va, i3 ; ; ZVFHMIN-LABEL: vp_rint_nxv2f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.rint.nxv2f16( %va, splat (i1 true), i32 %evl) @@ -535,23 +582,25 @@ define @vp_rint_nxv4f16( %va, @llvm.vp.rint.nxv4f16( %va, %m, i32 
%evl) ret %v @@ -573,18 +622,18 @@ define @vp_rint_nxv4f16_unmasked( %va, i3 ; ; ZVFHMIN-LABEL: vp_rint_nxv4f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v10, v8, v10, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.rint.nxv4f16( %va, splat (i1 true), i32 %evl) @@ -613,23 +662,25 @@ define @vp_rint_nxv8f16( %va, @llvm.vp.rint.nxv8f16( %va, %m, i32 %evl) ret %v @@ -651,18 +702,18 @@ define @vp_rint_nxv8f16_unmasked( %va, i3 ; ; ZVFHMIN-LABEL: vp_rint_nxv8f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v12, v8, v12, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.rint.nxv8f16( %va, splat (i1 true), i32 %evl) @@ -691,23 +742,25 @@ define @vp_rint_nxv16f16( %va, @llvm.vp.rint.nxv16f16( %va, %m, i32 %evl) ret %v @@ -729,18 +782,18 @@ define @vp_rint_nxv16f16_unmasked( %va, ; ; ZVFHMIN-LABEL: vp_rint_nxv16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v16 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v16, v8, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.rint.nxv16f16( %va, splat (i1 true), i32 %evl) @@ -775,54 +828,60 @@ define @vp_rint_nxv32f16( %va, @vp_rint_nxv32f16_unmasked( %va, ; ; ZVFHMIN-LABEL: vp_rint_nxv32f16_unmasked: ; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: sub sp, sp, a1 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 
0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v24 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vmset.m v16 ; ZVFHMIN-NEXT: lui a3, 307200 ; ZVFHMIN-NEXT: slli a1, a2, 1 ; ZVFHMIN-NEXT: srli a2, a2, 2 ; ZVFHMIN-NEXT: fmv.w.x fa5, a3 ; ZVFHMIN-NEXT: sub a3, a0, a1 ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v12, v24, a2 +; ZVFHMIN-NEXT: vslidedown.vx v16, v16, a2 ; ZVFHMIN-NEXT: sltu a2, a0, a3 +; ZVFHMIN-NEXT: vmv1r.v v17, v16 ; ZVFHMIN-NEXT: addi a2, a2, -1 ; ZVFHMIN-NEXT: and a2, a2, a3 -; ZVFHMIN-NEXT: vmv1r.v v0, v12 -; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfabs.v v24, v16, v0.t +; ZVFHMIN-NEXT: vmv1r.v v0, v16 +; ZVFHMIN-NEXT: addi a3, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v24, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; ZVFHMIN-NEXT: vmflt.vf v12, v24, fa5, v0.t -; ZVFHMIN-NEXT: vmv1r.v v0, v12 +; ZVFHMIN-NEXT: vmflt.vf v17, v8, fa5, v0.t +; ZVFHMIN-NEXT: vmv1r.v v0, v17 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t -; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v24, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; ZVFHMIN-NEXT: vmv1r.v v0, v16 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB23_2: -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v24, v16 ; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 ; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add sp, sp, a0 +; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 +; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.rint.nxv32f16( %va, splat (i1 true), i32 %evl) ret %v diff --git a/llvm/test/CodeGen/RISCV/rvv/round-vp.ll b/llvm/test/CodeGen/RISCV/rvv/round-vp.ll index 4e15f13570583..39744dcecd718 100644 --- a/llvm/test/CodeGen/RISCV/rvv/round-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/round-vp.ll @@ -17,23 +17,27 @@ declare @llvm.vp.round.nxv1bf16(, @vp_round_nxv1bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_nxv1bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v9, 
v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vfabs.v v8, v9, v0.t -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfabs.v v11, v10, v0.t +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t +; CHECK-NEXT: vmflt.vf v8, v11, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: vfcvt.x.f.v v11, v10, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v11, v11, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: vfsgnj.vv v10, v11, v10, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t ; CHECK-NEXT: ret %v = call @llvm.vp.round.nxv1bf16( %va, %m, i32 %evl) ret %v @@ -42,12 +46,12 @@ define @vp_round_nxv1bf16( %va, @vp_round_nxv1bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_nxv1bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -55,7 +59,7 @@ define @vp_round_nxv1bf16_unmasked( % ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: ret %v = call @llvm.vp.round.nxv1bf16( %va, splat (i1 true), i32 %evl) @@ -67,23 +71,27 @@ declare @llvm.vp.round.nxv2bf16(, @vp_round_nxv2bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_nxv2bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vfabs.v v8, v9, v0.t -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfabs.v v11, v10, v0.t +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t +; CHECK-NEXT: vmflt.vf v8, v11, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmv.v.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: vfcvt.x.f.v v11, v10, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v11, v11, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, 
m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: vfsgnj.vv v10, v11, v10, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t ; CHECK-NEXT: ret %v = call @llvm.vp.round.nxv2bf16( %va, %m, i32 %evl) ret %v @@ -92,12 +100,12 @@ define @vp_round_nxv2bf16( %va, @vp_round_nxv2bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_nxv2bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -105,7 +113,7 @@ define @vp_round_nxv2bf16_unmasked( % ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: ret %v = call @llvm.vp.round.nxv2bf16( %va, splat (i1 true), i32 %evl) @@ -117,25 +125,27 @@ declare @llvm.vp.round.nxv4bf16(, @vp_round_nxv4bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_nxv4bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vmv1r.v v9, v0 -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v12, v10, v0.t -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; CHECK-NEXT: vmflt.vf v9, v12, fa5, v0.t +; CHECK-NEXT: vmflt.vf v8, v12, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v10, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v10, v12, v10, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t ; CHECK-NEXT: ret %v = call @llvm.vp.round.nxv4bf16( %va, %m, i32 %evl) ret %v @@ -144,12 +154,12 @@ define @vp_round_nxv4bf16( %va, @vp_round_nxv4bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_nxv4bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 -; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ 
-157,7 +167,7 @@ define @vp_round_nxv4bf16_unmasked( %
 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
 ; CHECK-NEXT: ret
 %v = call @llvm.vp.round.nxv4bf16( %va, splat (i1 true), i32 %evl)
@@ -169,25 +179,27 @@ declare @llvm.vp.round.nxv8bf16(,
 @vp_round_nxv8bf16( %va, %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_nxv8bf16:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT: vmv1r.v v10, v0
-; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT: lui a1, 307200
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT: vfabs.v v16, v12, v0.t
-; CHECK-NEXT: fmv.w.x fa5, a1
+; CHECK-NEXT: fmv.w.x fa5, a0
 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT: vmflt.vf v10, v16, fa5, v0.t
+; CHECK-NEXT: vmflt.vf v8, v16, fa5, v0.t
 ; CHECK-NEXT: fsrmi a0, 4
-; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmv1r.v v0, v8
 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT: vfcvt.x.f.v v16, v12, v0.t
 ; CHECK-NEXT: fsrm a0
 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT: vfsgnj.vv v12, v16, v12, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t
 ; CHECK-NEXT: ret
 %v = call @llvm.vp.round.nxv8bf16( %va, %m, i32 %evl)
 ret %v
@@ -196,12 +208,12 @@ define @vp_round_nxv8bf16( %va,
 @vp_round_nxv8bf16_unmasked( %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_nxv8bf16_unmasked:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT: lui a1, 307200
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT: vfabs.v v8, v12
-; CHECK-NEXT: fmv.w.x fa5, a1
+; CHECK-NEXT: fmv.w.x fa5, a0
 ; CHECK-NEXT: vmflt.vf v0, v8, fa5
 ; CHECK-NEXT: fsrmi a0, 4
 ; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t
@@ -209,7 +221,7 @@ define @vp_round_nxv8bf16_unmasked( %
 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
 ; CHECK-NEXT: ret
 %v = call @llvm.vp.round.nxv8bf16( %va, splat (i1 true), i32 %evl)
@@ -221,25 +233,27 @@ declare @llvm.vp.round.nxv16bf16(,
 define @vp_round_nxv16bf16( %va, %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_nxv16bf16:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT: vmv1r.v v12, v0
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT: lui a1, 307200
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT: vfabs.v v24, v16, v0.t
-; CHECK-NEXT: fmv.w.x fa5, a1
+; CHECK-NEXT: fmv.w.x fa5, a0
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t
+; CHECK-NEXT: vmflt.vf v8, v24, fa5, v0.t
 ; CHECK-NEXT: fsrmi a0, 4
-; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmv1r.v v0, v8
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT: fsrm a0
 ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
 ; CHECK-NEXT: ret
 %v = call @llvm.vp.round.nxv16bf16( %va, %m, i32 %evl)
 ret %v
@@ -248,12 +262,12 @@ define @vp_round_nxv16bf16( %va,
 @vp_round_nxv16bf16_unmasked( %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_nxv16bf16_unmasked:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT: lui a1, 307200
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT: vfabs.v v8, v16
-; CHECK-NEXT: fmv.w.x fa5, a1
+; CHECK-NEXT: fmv.w.x fa5, a0
 ; CHECK-NEXT: vmflt.vf v0, v8, fa5
 ; CHECK-NEXT: fsrmi a0, 4
 ; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t
@@ -261,7 +275,7 @@ define @vp_round_nxv16bf16_unmasked(
 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
 ; CHECK-NEXT: ret
 %v = call @llvm.vp.round.nxv16bf16( %va, splat (i1 true), i32 %evl)
@@ -279,59 +293,64 @@ define @vp_round_nxv32bf16( %va,
@@ -346,42 +365,55 @@ define @vp_round_nxv32bf16( %va,
 define @vp_round_nxv32bf16_unmasked( %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_nxv32bf16_unmasked:
 ; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT: csrr a2, vlenb
 ; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma
-; CHECK-NEXT: vmset.m v24
-; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vmset.m v16
 ; CHECK-NEXT: lui a3, 307200
 ; CHECK-NEXT: slli a1, a2, 1
 ; CHECK-NEXT: srli a2, a2, 2
 ; CHECK-NEXT: fmv.w.x fa5, a3
 ; CHECK-NEXT: sub a3, a0, a1
 ; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vx v12, v24, a2
+; CHECK-NEXT: vslidedown.vx v16, v16, a2
 ; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: vmv1r.v v17, v16
 ; CHECK-NEXT: addi a2, a2, -1
 ; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: vmv1r.v v0, v12
-; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT: vfabs.v v24, v16, v0.t
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: addi a3, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v24, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t
+; CHECK-NEXT: vmflt.vf v17, v8, fa5, v0.t
 ; CHECK-NEXT: fsrmi a2, 4
-; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmv1r.v v0, v17
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t
 ; CHECK-NEXT: fsrm a2
-; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24, v0.t
 ; CHECK-NEXT: bltu a0, a1, .LBB11_2
 ; CHECK-NEXT: # %bb.1:
 ; CHECK-NEXT: mv a0, a1
 ; CHECK-NEXT: .LBB11_2:
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT: vfabs.v v24, v16
 ; CHECK-NEXT: vmflt.vf v0, v24, fa5
 ; CHECK-NEXT: fsrmi a0, 4
@@ -390,8 +422,14 @@ define @vp_round_nxv32bf16_unmasked(
 ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
 ; CHECK-NEXT: ret
 %v = call @llvm.vp.round.nxv32bf16( %va, splat (i1 true), i32 %evl)
 ret %v
@@ -418,23 +456,27 @@ define @vp_round_nxv1f16( %va,
 @llvm.vp.round.nxv1f16( %va, %m, i32 %evl)
 ret %v
@@ -458,12 +500,12 @@ define @vp_round_nxv1f16_unmasked( %va, i
 ;
 ; ZVFHMIN-LABEL: vp_round_nxv1f16_unmasked:
 ; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT: lui a1, 307200
-; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT: vfabs.v v8, v9
-; ZVFHMIN-NEXT: fmv.w.x fa5, a1
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT: fsrmi a0, 4
 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t
@@ -471,7 +513,7 @@ define @vp_round_nxv1f16_unmasked( %va, i
 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t
-; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT: ret
 %v = call @llvm.vp.round.nxv1f16( %va, splat (i1 true), i32 %evl)
@@ -500,23 +542,27 @@ define @vp_round_nxv2f16( %va,
 @llvm.vp.round.nxv2f16( %va, %m, i32 %evl)
 ret %v
@@ -540,12 +586,12 @@ define @vp_round_nxv2f16_unmasked( %va, i
 ;
 ; ZVFHMIN-LABEL: vp_round_nxv2f16_unmasked:
 ; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT: lui a1, 307200
-; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT: vfabs.v v8, v9
-; ZVFHMIN-NEXT: fmv.w.x fa5, a1
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT: fsrmi a0, 4
 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t
@@ -553,7 +599,7 @@ define @vp_round_nxv2f16_unmasked( %va, i
 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t
-; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT: ret
 %v = call @llvm.vp.round.nxv2f16( %va, splat (i1 true), i32 %evl)
@@ -582,25 +628,27 @@ define @vp_round_nxv4f16( %va,
 @llvm.vp.round.nxv4f16( %va, %m, i32 %evl)
 ret %v
@@ -624,12 +672,12 @@ define @vp_round_nxv4f16_unmasked( %va, i
 ;
 ; ZVFHMIN-LABEL: vp_round_nxv4f16_unmasked:
 ; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT: lui a1, 307200
-; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT: vfabs.v v8, v10
-; ZVFHMIN-NEXT: fmv.w.x fa5, a1
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT: fsrmi a0, 4
 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t
@@ -637,7 +685,7 @@ define @vp_round_nxv4f16_unmasked( %va, i
 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT: vfsgnj.vv v10, v8, v10, v0.t
-; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT: ret
 %v = call @llvm.vp.round.nxv4f16( %va, splat (i1 true), i32 %evl)
@@ -668,25 +716,27 @@ define @vp_round_nxv8f16( %va,
 @llvm.vp.round.nxv8f16( %va, %m, i32 %evl)
 ret %v
@@ -710,12 +760,12 @@ define @vp_round_nxv8f16_unmasked( %va, i
 ;
 ; ZVFHMIN-LABEL: vp_round_nxv8f16_unmasked:
 ; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT: lui a1, 307200
-; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT: vfabs.v v8, v12
-; ZVFHMIN-NEXT: fmv.w.x fa5, a1
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT: fsrmi a0, 4
 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t
@@ -723,7 +773,7 @@ define @vp_round_nxv8f16_unmasked( %va, i
 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT: vfsgnj.vv v12, v8, v12, v0.t
-; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT: ret
 %v = call @llvm.vp.round.nxv8f16( %va, splat (i1 true), i32 %evl)
@@ -754,25 +804,27 @@ define @vp_round_nxv16f16( %va,
 @llvm.vp.round.nxv16f16( %va, %m, i32 %evl)
 ret %v
@@ -796,12 +848,12 @@ define @vp_round_nxv16f16_unmasked( %va
 ;
 ; ZVFHMIN-LABEL: vp_round_nxv16f16_unmasked:
 ; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT: lui a1, 307200
-; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT: vfabs.v v8, v16
-; ZVFHMIN-NEXT: fmv.w.x fa5, a1
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT: fsrmi a0, 4
 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t
@@ -809,7 +861,7 @@ define @vp_round_nxv16f16_unmasked( %va
 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT: vfsgnj.vv v16, v8, v16, v0.t
-; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT: ret
 %v = call @llvm.vp.round.nxv16f16( %va, splat (i1 true), i32 %evl)
@@ -846,59 +898,64 @@ define @vp_round_nxv32f16( %va,
@@ -932,42 +984,55 @@ define @vp_round_nxv32f16_unmasked( %va
 ;
 ; ZVFHMIN-LABEL: vp_round_nxv32f16_unmasked:
 ; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: addi sp, sp, -16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: sub sp, sp, a1
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT: csrr a2, vlenb
 ; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT: vmset.m v24
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT: vmset.m v16
 ; ZVFHMIN-NEXT: lui a3, 307200
 ; ZVFHMIN-NEXT: slli a1, a2, 1
 ; ZVFHMIN-NEXT: srli a2, a2, 2
 ; ZVFHMIN-NEXT: fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT: sub a3, a0, a1
 ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v12, v24, a2
+; ZVFHMIN-NEXT: vslidedown.vx v16, v16, a2
 ; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: vmv1r.v v17, v16
 ; ZVFHMIN-NEXT: addi a2, a2, -1
 ; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: vmv1r.v v0, v12
-; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfabs.v v24, v16, v0.t
+; ZVFHMIN-NEXT: vmv1r.v v0, v16
+; ZVFHMIN-NEXT: addi a3, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v24, v0.t
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT: vmflt.vf v12, v24, fa5, v0.t
+; ZVFHMIN-NEXT: vmflt.vf v17, v8, fa5, v0.t
 ; ZVFHMIN-NEXT: fsrmi a2, 4
-; ZVFHMIN-NEXT: vmv1r.v v0, v12
+; ZVFHMIN-NEXT: vmv1r.v v0, v17
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v24, v0.t
 ; ZVFHMIN-NEXT: fsrm a2
-; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t
-; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t
+; ZVFHMIN-NEXT: vmv1r.v v0, v16
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t
 ; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2
 ; ZVFHMIN-NEXT: # %bb.1:
 ; ZVFHMIN-NEXT: mv a0, a1
 ; ZVFHMIN-NEXT: .LBB23_2:
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT: addi a1, sp, 16
+; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT: vfabs.v v24, v16
 ; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5
 ; ZVFHMIN-NEXT: fsrmi a0, 4
@@ -972,8 +1042,14 @@ define @vp_round_nxv32f16_unmasked( %va
 ; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t
-; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add sp, sp, a0
+; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT: addi sp, sp, 16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT: ret
 %v = call @llvm.vp.round.nxv32f16( %va, splat (i1 true), i32 %evl)
 ret %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll b/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll
index f1510a8c53181..df5844277c997 100644
--- a/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll
@@ -17,23 +17,27 @@ declare @llvm.vp.roundeven.nxv1bf16(,
 define @vp_roundeven_nxv1bf16( %va, %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_nxv1bf16:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT: lui a1, 307200
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT: vfabs.v v8, v9, v0.t
-; CHECK-NEXT: fmv.w.x fa5, a1
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfabs.v v11, v10, v0.t
+; CHECK-NEXT: fmv.w.x fa5, a0
 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
-; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t
+; CHECK-NEXT: vmflt.vf v8, v11, fa5, v0.t
 ; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vmv1r.v v0, v8
 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: vfcvt.x.f.v v11, v10, v0.t
 ; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v11, v11, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
-; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: vfsgnj.vv v10, v11, v10, v0.t
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t
 ; CHECK-NEXT: ret
 %v = call @llvm.vp.roundeven.nxv1bf16( %va, %m, i32 %evl)
 ret %v
@@ -42,12 +46,12 @@ define @vp_roundeven_nxv1bf16( %va, <
 define @vp_roundeven_nxv1bf16_unmasked( %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_nxv1bf16_unmasked:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT: lui a1, 307200
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT: vfabs.v v8, v9
-; CHECK-NEXT: fmv.w.x fa5, a1
+; CHECK-NEXT: fmv.w.x fa5, a0
 ; CHECK-NEXT: vmflt.vf v0, v8, fa5
 ; CHECK-NEXT: fsrmi a0, 0
 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
@@ -55,7 +59,7 @@ define @vp_roundeven_nxv1bf16_unmasked(
 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
 ; CHECK-NEXT: ret
 %v = call @llvm.vp.roundeven.nxv1bf16( %va, splat (i1 true), i32 %evl)
@@ -67,23 +71,27 @@ declare @llvm.vp.roundeven.nxv2bf16(,
 define @vp_roundeven_nxv2bf16( %va, %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_nxv2bf16:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT: lui a1, 307200
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT: vfabs.v v8, v9, v0.t
-; CHECK-NEXT: fmv.w.x fa5, a1
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfabs.v v11, v10, v0.t
+; CHECK-NEXT: fmv.w.x fa5, a0
 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
-; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t
+; CHECK-NEXT: vmflt.vf v8, v11, fa5, v0.t
 ; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vmv.v.v v0, v8
 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: vfcvt.x.f.v v11, v10, v0.t
 ; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v11, v11, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
-; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: vfsgnj.vv v10, v11, v10, v0.t
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t
 ; CHECK-NEXT: ret
 %v = call @llvm.vp.roundeven.nxv2bf16( %va, %m, i32 %evl)
 ret %v
@@ -92,12 +100,12 @@ define @vp_roundeven_nxv2bf16( %va, <
 define @vp_roundeven_nxv2bf16_unmasked( %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_nxv2bf16_unmasked:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT: lui a1, 307200
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT: vfabs.v v8, v9
-; CHECK-NEXT: fmv.w.x fa5, a1
+; CHECK-NEXT: fmv.w.x fa5, a0
 ; CHECK-NEXT: vmflt.vf v0, v8, fa5
 ; CHECK-NEXT: fsrmi a0, 0
 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
@@ -105,7 +113,7 @@ define @vp_roundeven_nxv2bf16_unmasked(
 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
 ; CHECK-NEXT: ret
 %v = call @llvm.vp.roundeven.nxv2bf16( %va, splat (i1 true), i32 %evl)
@@ -117,25 +125,27 @@ declare @llvm.vp.roundeven.nxv4bf16(,
 define @vp_roundeven_nxv4bf16( %va, %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_nxv4bf16:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT: vmv1r.v v9, v0
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT: lui a1, 307200
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT: vfabs.v v12, v10, v0.t
-; CHECK-NEXT: fmv.w.x fa5, a1
+; CHECK-NEXT: fmv.w.x fa5, a0
 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT: vmflt.vf v9, v12, fa5, v0.t
+; CHECK-NEXT: vmflt.vf v8, v12, fa5, v0.t
 ; CHECK-NEXT: fsrmi a0, 0
-; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmv1r.v v0, v8
 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT: vfcvt.x.f.v v12, v10, v0.t
 ; CHECK-NEXT: fsrm a0
 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT: vfsgnj.vv v10, v12, v10, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t
 ; CHECK-NEXT: ret
 %v = call @llvm.vp.roundeven.nxv4bf16( %va, %m, i32 %evl)
 ret %v
@@ -144,12 +154,12 @@ define @vp_roundeven_nxv4bf16( %va, <
 define @vp_roundeven_nxv4bf16_unmasked( %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_nxv4bf16_unmasked:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT: lui a1, 307200
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT: vfabs.v v8, v10
-; CHECK-NEXT: fmv.w.x fa5, a1
+; CHECK-NEXT: fmv.w.x fa5, a0
 ; CHECK-NEXT: vmflt.vf v0, v8, fa5
 ; CHECK-NEXT: fsrmi a0, 0
 ; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t
@@ -157,7 +167,7 @@ define @vp_roundeven_nxv4bf16_unmasked(
 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
 ; CHECK-NEXT: ret
 %v = call @llvm.vp.roundeven.nxv4bf16( %va, splat (i1 true), i32 %evl)
@@ -169,25 +179,27 @@ declare @llvm.vp.roundeven.nxv8bf16(,
 define @vp_roundeven_nxv8bf16( %va, %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_nxv8bf16:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT: vmv1r.v v10, v0
-; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT: lui a1, 307200
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT: vfabs.v v16, v12, v0.t
-; CHECK-NEXT: fmv.w.x fa5, a1
+; CHECK-NEXT: fmv.w.x fa5, a0
 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT: vmflt.vf v10, v16, fa5, v0.t
+; CHECK-NEXT: vmflt.vf v8, v16, fa5, v0.t
 ; CHECK-NEXT: fsrmi a0, 0
-; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmv1r.v v0, v8
 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT: vfcvt.x.f.v v16, v12, v0.t
 ; CHECK-NEXT: fsrm a0
 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT: vfsgnj.vv v12, v16, v12, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t
 ; CHECK-NEXT: ret
 %v = call @llvm.vp.roundeven.nxv8bf16( %va, %m, i32 %evl)
 ret %v
@@ -196,12 +208,12 @@ define @vp_roundeven_nxv8bf16( %va, <
 define @vp_roundeven_nxv8bf16_unmasked( %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_nxv8bf16_unmasked:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT: lui a1, 307200
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT: vfabs.v v8, v12
-; CHECK-NEXT: fmv.w.x fa5, a1
+; CHECK-NEXT: fmv.w.x fa5, a0
 ; CHECK-NEXT: vmflt.vf v0, v8, fa5
 ; CHECK-NEXT: fsrmi a0, 0
 ; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t
@@ -209,7 +221,7 @@ define @vp_roundeven_nxv8bf16_unmasked(
 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
 ; CHECK-NEXT: ret
 %v = call @llvm.vp.roundeven.nxv8bf16( %va, splat (i1 true), i32 %evl)
@@ -221,25 +233,27 @@ declare @llvm.vp.roundeven.nxv16bf16(
 define @vp_roundeven_nxv16bf16( %va, %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_nxv16bf16:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT: vmv1r.v v12, v0
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT: lui a1, 307200
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT: vfabs.v v24, v16, v0.t
-; CHECK-NEXT: fmv.w.x fa5, a1
+; CHECK-NEXT: fmv.w.x fa5, a0
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t
+; CHECK-NEXT: vmflt.vf v8, v24, fa5, v0.t
 ; CHECK-NEXT: fsrmi a0, 0
-; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmv1r.v v0, v8
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT: fsrm a0
 ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
 ; CHECK-NEXT: ret
 %v = call @llvm.vp.roundeven.nxv16bf16( %va, %m, i32 %evl)
 ret %v
@@ -248,12 +262,12 @@ define @vp_roundeven_nxv16bf16( %va
 define @vp_roundeven_nxv16bf16_unmasked( %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_nxv16bf16_unmasked:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT: lui a1, 307200
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT: vfabs.v v8, v16
-; CHECK-NEXT: fmv.w.x fa5, a1
+; CHECK-NEXT: fmv.w.x fa5, a0
 ; CHECK-NEXT: vmflt.vf v0, v8, fa5
 ; CHECK-NEXT: fsrmi a0, 0
 ; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t
@@ -261,7 +275,7 @@ define @vp_roundeven_nxv16bf16_unmasked(
 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
 ; CHECK-NEXT: ret
 %v = call @llvm.vp.roundeven.nxv16bf16( %va, splat (i1 true), i32 %evl)
@@ -279,59 +293,64 @@ define @vp_roundeven_nxv32bf16( %va
 ; CHECK-NEXT: slli a1, a1, 3
 ; CHECK-NEXT: sub sp, sp, a1
 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
 ; CHECK-NEXT: vmv1r.v v7, v0
 ; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT: lui a3, 307200
 ; CHECK-NEXT: slli a1, a2, 1
 ; CHECK-NEXT: srli a2, a2, 2
 ; CHECK-NEXT: fmv.w.x fa5, a3
 ; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vx v12, v0, a2
+; CHECK-NEXT: vslidedown.vx v17, v0, a2
 ; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: vmv1r.v v18, v17
 ; CHECK-NEXT: addi a2, a2, -1
 ; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: vmv1r.v v0, v12
-; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT: vfabs.v v16, v24, v0.t
+; CHECK-NEXT: vmv1r.v v0, v17
+; CHECK-NEXT: addi a3, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v24, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t
+; CHECK-NEXT: vmflt.vf v18, v8, fa5, v0.t
 ; CHECK-NEXT: fsrmi a2, 0
-; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmv1r.v v0, v18
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t
 ; CHECK-NEXT: fsrm a2
-; CHECK-NEXT: addi a2, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t
-; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24
+; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT: vmv1r.v v0, v17
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24, v0.t
 ; CHECK-NEXT: bltu a0, a1, .LBB10_2
 ; CHECK-NEXT: # %bb.1:
 ; CHECK-NEXT: mv a0, a1
 ; CHECK-NEXT: .LBB10_2:
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
 ; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16, v0.t
+; CHECK-NEXT: vmv1r.v v8, v7
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT: vfabs.v v16, v24, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT: vmflt.vf v7, v16, fa5, v0.t
+; CHECK-NEXT: vmflt.vf v8, v16, fa5, v0.t
 ; CHECK-NEXT: fsrmi a0, 0
-; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmv1r.v v0, v8
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t
 ; CHECK-NEXT: fsrm a0
 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24, v0.t
 ; CHECK-NEXT: csrr a0, vlenb
 ; CHECK-NEXT: slli a0, a0, 3
 ; CHECK-NEXT: add sp, sp, a0
@@ -346,42 +365,55 @@ define @vp_roundeven_nxv32bf16( %va
 define @vp_roundeven_nxv32bf16_unmasked( %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_nxv32bf16_unmasked:
 ; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT: csrr a2, vlenb
 ; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma
-; CHECK-NEXT: vmset.m v24
-; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vmset.m v16
 ; CHECK-NEXT: lui a3, 307200
 ; CHECK-NEXT: slli a1, a2, 1
 ; CHECK-NEXT: srli a2, a2, 2
 ; CHECK-NEXT: fmv.w.x fa5, a3
 ; CHECK-NEXT: sub a3, a0, a1
 ; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vx v12, v24, a2
+; CHECK-NEXT: vslidedown.vx v16, v16, a2
 ; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: vmv1r.v v17, v16
 ; CHECK-NEXT: addi a2, a2, -1
 ; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: vmv1r.v v0, v12
-; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT: vfabs.v v24, v16, v0.t
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: addi a3, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v24, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t
+; CHECK-NEXT: vmflt.vf v17, v8, fa5, v0.t
 ; CHECK-NEXT: fsrmi a2, 0
-; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmv1r.v v0, v17
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t
 ; CHECK-NEXT: fsrm a2
-; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24, v0.t
 ; CHECK-NEXT: bltu a0, a1, .LBB11_2
 ; CHECK-NEXT: # %bb.1:
 ; CHECK-NEXT: mv a0, a1
 ; CHECK-NEXT: .LBB11_2:
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT: vfabs.v v24, v16
 ; CHECK-NEXT: vmflt.vf v0, v24, fa5
 ; CHECK-NEXT: fsrmi a0, 0
@@ -390,8 +422,14 @@ define @vp_roundeven_nxv32bf16_unmasked(
 ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
 ; CHECK-NEXT: ret
 %v = call @llvm.vp.roundeven.nxv32bf16( %va, splat (i1 true), i32 %evl)
 ret %v
@@ -418,23 +456,27 @@ define @vp_roundeven_nxv1f16( %va,
 @llvm.vp.roundeven.nxv1f16( %va, %m, i32 %evl)
 ret %v
@@ -458,12 +500,12 @@ define @vp_roundeven_nxv1f16_unmasked( %v
 ;
 ; ZVFHMIN-LABEL: vp_roundeven_nxv1f16_unmasked:
 ; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT: lui a1, 307200
-; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT: vfabs.v v8, v9
-; ZVFHMIN-NEXT: fmv.w.x fa5, a1
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT: fsrmi a0, 0
 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t
@@ -471,7 +513,7 @@ define @vp_roundeven_nxv1f16_unmasked( %v
 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t
-; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT: ret
 %v = call @llvm.vp.roundeven.nxv1f16( %va, splat (i1 true), i32 %evl)
@@ -500,23 +542,27 @@ define @vp_roundeven_nxv2f16( %va,
 @llvm.vp.roundeven.nxv2f16( %va, %m, i32 %evl)
 ret %v
@@ -540,12 +586,12 @@ define @vp_roundeven_nxv2f16_unmasked( %v
 ;
 ; ZVFHMIN-LABEL: vp_roundeven_nxv2f16_unmasked:
 ; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT: lui a1, 307200
-; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT: vfabs.v v8, v9
-; ZVFHMIN-NEXT: fmv.w.x fa5, a1
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT: fsrmi a0, 0
 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t
@@ -553,7 +599,7 @@ define @vp_roundeven_nxv2f16_unmasked( %v
 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t
-; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT: ret
 %v = call @llvm.vp.roundeven.nxv2f16( %va, splat (i1 true), i32 %evl)
@@ -582,25 +628,27 @@ define @vp_roundeven_nxv4f16( %va,
 @llvm.vp.roundeven.nxv4f16( %va, %m, i32 %evl)
 ret %v
@@ -624,12 +672,12 @@ define @vp_roundeven_nxv4f16_unmasked( %v
 ;
 ; ZVFHMIN-LABEL: vp_roundeven_nxv4f16_unmasked:
 ; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT: lui a1, 307200
-; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT: vfabs.v v8, v10
-; ZVFHMIN-NEXT: fmv.w.x fa5, a1
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT: fsrmi a0, 0
 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t
@@ -637,7 +685,7 @@ define @vp_roundeven_nxv4f16_unmasked( %v
 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT: vfsgnj.vv v10, v8, v10, v0.t
-; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT: ret
 %v = call @llvm.vp.roundeven.nxv4f16( %va, splat (i1 true), i32 %evl)
@@ -668,25 +716,27 @@ define @vp_roundeven_nxv8f16( %va,
 @llvm.vp.roundeven.nxv8f16( %va, %m, i32 %evl)
 ret %v
@@ -710,12 +760,12 @@ define @vp_roundeven_nxv8f16_unmasked( %v
 ;
 ; ZVFHMIN-LABEL: vp_roundeven_nxv8f16_unmasked:
 ; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT: lui a1, 307200
-; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT: vfabs.v v8, v12
-; ZVFHMIN-NEXT: fmv.w.x fa5, a1
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT: fsrmi a0, 0
 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t
@@ -723,7 +773,7 @@ define @vp_roundeven_nxv8f16_unmasked( %v
 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT: vfsgnj.vv v12, v8, v12, v0.t
-; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT: ret
 %v = call @llvm.vp.roundeven.nxv8f16( %va, splat (i1 true), i32 %evl)
@@ -754,25 +804,27 @@ define @vp_roundeven_nxv16f16( %va,
 @llvm.vp.roundeven.nxv16f16( %va, %m, i32 %evl)
 ret %v
@@ -796,12 +848,12 @@ define @vp_roundeven_nxv16f16_unmasked(
 ;
 ; ZVFHMIN-LABEL: vp_roundeven_nxv16f16_unmasked:
 ; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT: lui a1, 307200
-; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT: vfabs.v v8, v16
-; ZVFHMIN-NEXT: fmv.w.x fa5, a1
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT: fsrmi a0, 0
 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t
@@ -809,7 +861,7 @@ define @vp_roundeven_nxv16f16_unmasked(
 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT: vfsgnj.vv v16, v8, v16, v0.t
-; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT: ret
 %v = call @llvm.vp.roundeven.nxv16f16( %va, splat (i1 true), i32 %evl)
@@ -846,59 +898,64 @@ define @vp_roundeven_nxv32f16( %va,
@@ -932,42 +984,55 @@ define @vp_roundeven_nxv32f16_unmasked(
 ;
 ; ZVFHMIN-LABEL: vp_roundeven_nxv32f16_unmasked:
 ; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: addi sp, sp, -16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: sub sp, sp, a1
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT: csrr a2, vlenb
 ; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT: vmset.m v24
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT: vmset.m v16
 ; ZVFHMIN-NEXT: lui a3, 307200
 ; ZVFHMIN-NEXT: slli a1, a2, 1
 ; ZVFHMIN-NEXT: srli a2, a2, 2
 ; ZVFHMIN-NEXT: fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT: sub a3, a0, a1
 ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v12, v24, a2
+; ZVFHMIN-NEXT: vslidedown.vx v16, v16, a2
 ; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: vmv1r.v v17, v16
 ; ZVFHMIN-NEXT: addi a2, a2, -1
 ; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: vmv1r.v v0, v12
-; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfabs.v v24, v16, v0.t
+; ZVFHMIN-NEXT: vmv1r.v v0, v16
+; ZVFHMIN-NEXT: addi a3, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v24, v0.t
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT: vmflt.vf v12, v24, fa5, v0.t
+; ZVFHMIN-NEXT: vmflt.vf v17, v8, fa5, v0.t
 ; ZVFHMIN-NEXT: fsrmi a2, 0
-; ZVFHMIN-NEXT: vmv1r.v v0, v12
+; ZVFHMIN-NEXT: vmv1r.v v0, v17
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v24, v0.t
 ; ZVFHMIN-NEXT: fsrm a2
-; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t
-; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t
+; ZVFHMIN-NEXT: vmv1r.v v0, v16
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t
 ; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2
 ; ZVFHMIN-NEXT: # %bb.1:
 ; ZVFHMIN-NEXT: mv a0, a1
 ; ZVFHMIN-NEXT: .LBB23_2:
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT: addi a1, sp, 16
+; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT: vfabs.v v24, v16
 ; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5
 ; ZVFHMIN-NEXT: fsrmi a0, 0
@@ -972,8 +1042,14 @@ define @vp_roundeven_nxv32f16_unmasked(
 ; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t
-; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add sp, sp, a0
+; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT: addi sp, sp, 16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT: ret
 %v = call @llvm.vp.roundeven.nxv32f16( %va, splat (i1 true), i32 %evl)
 ret %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll b/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll
index dd7db58ccdf34..1300d8cd64ebb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll
@@ -17,23 +17,27 @@ declare @llvm.vp.roundtozero.nxv1bf16(
 define @vp_roundtozero_nxv1bf16( %va, %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_nxv1bf16:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT: lui a1, 307200
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT: vfabs.v v8, v9, v0.t
-; CHECK-NEXT: fmv.w.x fa5, a1
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfabs.v v11, v10, v0.t
+; CHECK-NEXT: fmv.w.x fa5, a0
 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
-; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t
+; CHECK-NEXT: vmflt.vf v8, v11, fa5, v0.t
 ; CHECK-NEXT: fsrmi a0, 1
+; CHECK-NEXT: vmv1r.v v0, v8
 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: vfcvt.x.f.v v11, v10, v0.t
 ; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v11, v11, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
-; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: vfsgnj.vv v10, v11, v10, v0.t
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t
 ; CHECK-NEXT: ret
 %v = call @llvm.vp.roundtozero.nxv1bf16( %va, %m, i32 %evl)
 ret %v
@@ -42,12 +46,12 @@ define @vp_roundtozero_nxv1bf16( %va,
 define @vp_roundtozero_nxv1bf16_unmasked( %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_nxv1bf16_unmasked:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT: lui a1, 307200
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT: vfabs.v v8, v9
-; CHECK-NEXT: fmv.w.x fa5, a1
+; CHECK-NEXT: fmv.w.x fa5, a0
 ; CHECK-NEXT: vmflt.vf v0, v8, fa5
 ; CHECK-NEXT: fsrmi a0, 1
 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
@@ -55,7 +59,7 @@ define @vp_roundtozero_nxv1bf16_unmasked(
 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
 ; CHECK-NEXT: ret
 %v = call @llvm.vp.roundtozero.nxv1bf16( %va, splat (i1 true), i32 %evl)
@@ -67,23 +71,27 @@ declare @llvm.vp.roundtozero.nxv2bf16(
 define @vp_roundtozero_nxv2bf16( %va, %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_nxv2bf16:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT: lui a1, 307200
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT: vfabs.v v8, v9, v0.t
-; CHECK-NEXT: fmv.w.x fa5, a1
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfabs.v v11, v10, v0.t
+; CHECK-NEXT: fmv.w.x fa5, a0
 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
-; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t
+; CHECK-NEXT: vmflt.vf v8, v11, fa5, v0.t
 ; CHECK-NEXT: fsrmi a0, 1
+; CHECK-NEXT: vmv.v.v v0, v8
 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: vfcvt.x.f.v v11, v10, v0.t
 ; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v11, v11, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
-; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: vfsgnj.vv v10, v11, v10, v0.t
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t
 ; CHECK-NEXT: ret
 %v = call @llvm.vp.roundtozero.nxv2bf16( %va, %m, i32 %evl)
 ret %v
@@ -92,12 +100,12 @@ define @vp_roundtozero_nxv2bf16( %va,
 define @vp_roundtozero_nxv2bf16_unmasked( %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_nxv2bf16_unmasked:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT: lui a1, 307200
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT: vfabs.v v8, v9
-; CHECK-NEXT: fmv.w.x fa5, a1
+; CHECK-NEXT: fmv.w.x fa5, a0
 ; CHECK-NEXT: vmflt.vf v0, v8, fa5
 ; CHECK-NEXT: fsrmi a0, 1
 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
@@ -105,7 +113,7 @@ define @vp_roundtozero_nxv2bf16_unmasked(
 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
 ; CHECK-NEXT: ret
 %v = call @llvm.vp.roundtozero.nxv2bf16( %va, splat (i1 true), i32 %evl)
@@ -117,25 +125,27 @@ declare @llvm.vp.roundtozero.nxv4bf16(
 define @vp_roundtozero_nxv4bf16( %va, %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_nxv4bf16:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT: vmv1r.v v9, v0
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT: lui a1, 307200
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT: vfabs.v v12, v10, v0.t
-; CHECK-NEXT: fmv.w.x fa5, a1
+; CHECK-NEXT: fmv.w.x fa5, a0
 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT: vmflt.vf v9, v12, fa5, v0.t
+; CHECK-NEXT: vmflt.vf v8, v12, fa5, v0.t
 ; CHECK-NEXT: fsrmi a0, 1
-; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmv1r.v v0, v8
 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT: vfcvt.x.f.v v12, v10, v0.t
 ; CHECK-NEXT: fsrm a0
 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT: vfsgnj.vv v10, v12, v10, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t
 ; CHECK-NEXT: ret
 %v = call @llvm.vp.roundtozero.nxv4bf16( %va, %m, i32 %evl)
 ret %v
@@ -144,12 +154,12 @@ define @vp_roundtozero_nxv4bf16( %va,
 define @vp_roundtozero_nxv4bf16_unmasked( %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_nxv4bf16_unmasked:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT: lui a1, 307200
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT: vfabs.v v8, v10
-; CHECK-NEXT: fmv.w.x fa5, a1
+; CHECK-NEXT: fmv.w.x fa5, a0
 ; CHECK-NEXT: vmflt.vf v0, v8, fa5
 ; CHECK-NEXT: fsrmi a0, 1
 ; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t
@@ -157,7 +167,7 @@ define @vp_roundtozero_nxv4bf16_unmasked(
 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
 ; CHECK-NEXT: ret
 %v = call @llvm.vp.roundtozero.nxv4bf16( %va, splat (i1 true), i32 %evl)
@@ -169,25 +179,27 @@ declare @llvm.vp.roundtozero.nxv8bf16(
 define @vp_roundtozero_nxv8bf16( %va, %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_nxv8bf16:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT: vmv1r.v v10, v0
-; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT: lui a1, 307200
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT: vfabs.v v16, v12, v0.t
-; CHECK-NEXT: fmv.w.x fa5, a1
+; CHECK-NEXT: fmv.w.x fa5, a0
 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT: vmflt.vf v10, v16, fa5, v0.t
+; CHECK-NEXT: vmflt.vf v8, v16, fa5, v0.t
 ; CHECK-NEXT: fsrmi a0, 1
-; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmv1r.v v0, v8
 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT: vfcvt.x.f.v v16, v12, v0.t
 ; CHECK-NEXT: fsrm a0
 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT: vfsgnj.vv v12, v16, v12, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t
 ; CHECK-NEXT: ret
 %v = call @llvm.vp.roundtozero.nxv8bf16( %va, %m, i32 %evl)
 ret %v
@@ -196,12 +208,12 @@ define @vp_roundtozero_nxv8bf16( %va,
 define @vp_roundtozero_nxv8bf16_unmasked( %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_nxv8bf16_unmasked:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT: lui a1, 307200
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT: vfabs.v v8, v12
-; CHECK-NEXT: fmv.w.x fa5, a1
+; CHECK-NEXT: fmv.w.x fa5, a0
 ; CHECK-NEXT: vmflt.vf v0, v8, fa5
 ; CHECK-NEXT: fsrmi a0, 1
 ; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t
@@ -209,7 +221,7 @@ define @vp_roundtozero_nxv8bf16_unmasked(
 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
 ; CHECK-NEXT: ret
 %v = call @llvm.vp.roundtozero.nxv8bf16( %va, splat (i1 true), i32 %evl)
@@ -221,25 +233,27 @@ declare @llvm.vp.roundtozero.nxv16bf16(
 define @vp_roundtozero_nxv16bf16( %va, %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_nxv16bf16:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT: vmv1r.v v12, v0
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT: lui a1, 307200
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT: vfabs.v v24, v16, v0.t
-; CHECK-NEXT: fmv.w.x fa5, a1
+; CHECK-NEXT: fmv.w.x fa5, a0
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t
+; CHECK-NEXT: vmflt.vf v8, v24, fa5, v0.t
 ; CHECK-NEXT: fsrmi a0, 1
-; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmv1r.v v0, v8
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT: fsrm a0
 ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
 ; CHECK-NEXT: ret
 %v = call @llvm.vp.roundtozero.nxv16bf16( %va, %m, i32 %evl)
 ret %v
@@ -248,12 +262,12 @@ define @vp_roundtozero_nxv16bf16( %
 define @vp_roundtozero_nxv16bf16_unmasked( %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_nxv16bf16_unmasked:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT: lui a1, 307200
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT: vfabs.v v8, v16
-; CHECK-NEXT: fmv.w.x fa5, a1
+; CHECK-NEXT: fmv.w.x fa5, a0
 ; CHECK-NEXT: vmflt.vf v0, v8, fa5
 ; CHECK-NEXT: fsrmi a0, 1
 ; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t
@@ -261,7 +275,7 @@ define @vp_roundtozero_nxv16bf16_unmasked(
 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
 ; CHECK-NEXT: ret
 %v = call @llvm.vp.roundtozero.nxv16bf16( %va, splat (i1 true), i32 %evl)
@@ -279,59 +293,64 @@ define @vp_roundtozero_nxv32bf16( %
 ; CHECK-NEXT: slli a1, a1, 3
 ; CHECK-NEXT: sub sp, sp, a1
 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
 ; CHECK-NEXT: vmv1r.v v7, v0
 ; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT: lui a3, 307200
 ; CHECK-NEXT: slli a1, a2, 1
 ; CHECK-NEXT: srli a2, a2, 2
 ; CHECK-NEXT: fmv.w.x fa5, a3
 ; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vx v12, v0, a2
+; CHECK-NEXT: vslidedown.vx v17, v0, a2
 ; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: vmv1r.v v18, v17
 ; CHECK-NEXT: addi a2, a2, -1
 ; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: vmv1r.v v0, v12
-; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT: vfabs.v v16, v24, v0.t
+; CHECK-NEXT: vmv1r.v v0, v17
+; CHECK-NEXT: addi a3, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v24, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t
+; CHECK-NEXT: vmflt.vf v18, v8, fa5, v0.t
 ; CHECK-NEXT: fsrmi a2, 1
-; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmv1r.v v0, v18
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t
 ; CHECK-NEXT: fsrm a2
-; CHECK-NEXT: addi a2, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t
-; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24
+; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT: vmv1r.v v0, v17
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24, v0.t
 ; CHECK-NEXT: bltu a0, a1, .LBB10_2
 ; CHECK-NEXT: # %bb.1:
 ; CHECK-NEXT: mv a0, a1
 ; CHECK-NEXT: .LBB10_2:
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
 ; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16, v0.t
+; CHECK-NEXT: vmv1r.v v8, v7
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT: vfabs.v v16, v24, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT: vmflt.vf v7, v16, fa5, v0.t
+; CHECK-NEXT: vmflt.vf v8, v16, fa5, v0.t
 ; CHECK-NEXT: fsrmi a0, 1
-; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmv1r.v v0, v8
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t
 ; CHECK-NEXT: fsrm a0
 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24, v0.t
 ; CHECK-NEXT: csrr a0, vlenb
 ; CHECK-NEXT: slli a0, a0, 3
 ; CHECK-NEXT: add sp, sp, a0
@@ -346,42 +365,55 @@ define @vp_roundtozero_nxv32bf16( %
 define @vp_roundtozero_nxv32bf16_unmasked( %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_nxv32bf16_unmasked:
 ; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT: csrr a2, vlenb
 ; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma
-; CHECK-NEXT: vmset.m v24
-; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vmset.m v16
 ; CHECK-NEXT: lui a3, 307200
 ; CHECK-NEXT: slli a1, a2, 1
 ; CHECK-NEXT: srli a2, a2, 2
 ; CHECK-NEXT: fmv.w.x fa5, a3
 ; CHECK-NEXT: sub a3, a0, a1
 ; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vx v12, v24, a2
+; CHECK-NEXT: vslidedown.vx v16, v16, a2
 ; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: vmv1r.v v17, v16
 ; CHECK-NEXT: addi a2, a2, -1
 ; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: vmv1r.v v0, v12
-; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT: vfabs.v v24, v16, v0.t
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: addi a3, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v24, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t
+; CHECK-NEXT: vmflt.vf v17, v8, fa5, v0.t
 ; CHECK-NEXT: fsrmi a2, 1
-; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmv1r.v v0, v17
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t
 ; CHECK-NEXT: fsrm a2
-; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24, v0.t
 ; CHECK-NEXT: bltu a0, a1, .LBB11_2
 ; CHECK-NEXT: # %bb.1:
 ; CHECK-NEXT: mv a0, a1
 ; CHECK-NEXT: .LBB11_2:
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT: vfabs.v v24, v16
 ; CHECK-NEXT: vmflt.vf v0, v24, fa5
 ; CHECK-NEXT: fsrmi a0, 1
@@ -390,8 +422,14 @@ define @vp_roundtozero_nxv32bf16_unmasked(
 %v = call @llvm.vp.roundtozero.nxv32bf16( %va, splat (i1 true), i32 %evl)
 ret %v
@@ -418,23 +456,27 @@ define @vp_roundtozero_nxv1f16( %va,
 @llvm.vp.roundtozero.nxv1f16( %va, %m, i32 %evl)
 ret %v
@@ -458,12 +500,12 @@ define @vp_roundtozero_nxv1f16_unmasked(
 ;
 ; ZVFHMIN-LABEL: vp_roundtozero_nxv1f16_unmasked:
 ; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT: lui a1, 307200
-; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT: vfabs.v v8, v9
-; ZVFHMIN-NEXT: fmv.w.x fa5, a1
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT: fsrmi a0, 1
 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t
@@ -471,7 +513,7 @@ define @vp_roundtozero_nxv1f16_unmasked(
 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t
-; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT: ret
 %v = call @llvm.vp.roundtozero.nxv1f16( %va, splat (i1 true), i32 %evl)
@@ -500,23 +542,27 @@ define @vp_roundtozero_nxv2f16( %va,
 @llvm.vp.roundtozero.nxv2f16( %va, %m, i32 %evl)
 ret %v
@@ -540,12 +586,12 @@ define @vp_roundtozero_nxv2f16_unmasked(
 ;
 ; ZVFHMIN-LABEL: vp_roundtozero_nxv2f16_unmasked:
 ; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT: lui a1, 307200
-; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT: vfabs.v v8, v9
-; ZVFHMIN-NEXT: fmv.w.x fa5, a1
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT: fsrmi a0, 1
 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t
@@ -553,7 +599,7 @@ define @vp_roundtozero_nxv2f16_unmasked(
 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t
-; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT: ret
 %v = call @llvm.vp.roundtozero.nxv2f16( %va, splat (i1 true), i32 %evl)
@@ -582,25 +628,27 @@ define @vp_roundtozero_nxv4f16( %va,
 @llvm.vp.roundtozero.nxv4f16( %va, %m, i32 %evl)
 ret %v
@@ -624,12 +672,12 @@ define @vp_roundtozero_nxv4f16_unmasked(
 ;
 ; ZVFHMIN-LABEL: vp_roundtozero_nxv4f16_unmasked:
 ; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT: lui a1, 307200
-; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT: vfabs.v v8, v10
-; ZVFHMIN-NEXT: fmv.w.x fa5, a1
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT: fsrmi a0, 1
 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t
@@ -637,7 +685,7 @@ define @vp_roundtozero_nxv4f16_unmasked(
 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT: vfsgnj.vv v10, v8, v10, v0.t
-; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT: ret
 %v = call @llvm.vp.roundtozero.nxv4f16( %va, splat (i1 true), i32 %evl)
@@ -668,25 +716,27 @@ define @vp_roundtozero_nxv8f16( %va,
 @llvm.vp.roundtozero.nxv8f16( %va, %m, i32 %evl)
 ret %v
@@ -710,12 +760,12 @@ define @vp_roundtozero_nxv8f16_unmasked(
 ;
 ; ZVFHMIN-LABEL: vp_roundtozero_nxv8f16_unmasked:
 ; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT: lui a1, 307200
-; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT: vfabs.v v8, v12
-; ZVFHMIN-NEXT: fmv.w.x fa5, a1
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT: fsrmi a0, 1
 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t
@@ -723,7 +773,7 @@ define @vp_roundtozero_nxv8f16_unmasked(
 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT: vfsgnj.vv v12, v8, v12, v0.t
-; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT: ret
 %v = call @llvm.vp.roundtozero.nxv8f16( %va, splat (i1 true), i32 %evl)
@@ -754,25 +804,27 @@ define @vp_roundtozero_nxv16f16( %va, <
 ;
 ; ZVFHMIN-LABEL: vp_roundtozero_nxv16f16:
 ; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT: vmv1r.v v12, v0
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT: lui a1, 307200
-; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: vmv1r.v v8, v0
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT: vfabs.v v24, v16, v0.t
-; ZVFHMIN-NEXT: fmv.w.x fa5, a1
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT: vmflt.vf v12, v24, fa5, v0.t
+; ZVFHMIN-NEXT: vmflt.vf v8, v24, fa5, v0.t
 ; ZVFHMIN-NEXT: fsrmi a0, 1
-; ZVFHMIN-NEXT: vmv1r.v v0, v12
+; ZVFHMIN-NEXT: vmv1r.v v0, v8
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT: fsrm a0
 ; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t
-; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT: vmv1r.v v0, v12
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t
 ; ZVFHMIN-NEXT: ret
 %v = call @llvm.vp.roundtozero.nxv16f16( %va, %m, i32 %evl)
 ret %v
@@ -796,12 +848,12 @@ define @vp_roundtozero_nxv16f16_unmasked(
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT: lui a1, 307200
-; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT: vfabs.v v8, v16
-; ZVFHMIN-NEXT: fmv.w.x fa5, a1
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT: fsrmi a0, 1
 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t
@@ -809,7 +861,7 @@ define @vp_roundtozero_nxv16f16_unmasked(
 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT: vfsgnj.vv v16, v8, v16, v0.t
-; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT: ret
 %v = call @llvm.vp.roundtozero.nxv16f16( %va, splat (i1 true), i32 %evl)
@@ -846,59 +898,64 @@ define @vp_roundtozero_nxv32f16( %va, <
 ; ZVFHMIN-NEXT: slli a1, a1, 3
 ; ZVFHMIN-NEXT: sub sp, sp, a1
 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT: vmv1r.v v7, v0
 ; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT: lui a3, 307200
 ; ZVFHMIN-NEXT: slli a1, a2, 1
 ; ZVFHMIN-NEXT: srli a2, a2, 2
 ; ZVFHMIN-NEXT: fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v12, v0, a2
+; ZVFHMIN-NEXT: vslidedown.vx v17, v0, a2
 ;
ZVFHMIN-NEXT: sltu a2, a0, a3 +; ZVFHMIN-NEXT: vmv1r.v v18, v17 ; ZVFHMIN-NEXT: addi a2, a2, -1 ; ZVFHMIN-NEXT: and a2, a2, a3 -; ZVFHMIN-NEXT: vmv1r.v v0, v12 -; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfabs.v v16, v24, v0.t +; ZVFHMIN-NEXT: vmv1r.v v0, v17 +; ZVFHMIN-NEXT: addi a3, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v24, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; ZVFHMIN-NEXT: vmflt.vf v12, v16, fa5, v0.t +; ZVFHMIN-NEXT: vmflt.vf v18, v8, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a2, 1 -; ZVFHMIN-NEXT: vmv1r.v v0, v12 +; ZVFHMIN-NEXT: vmv1r.v v0, v18 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfcvt.x.f.v v16, v24, v0.t -; ZVFHMIN-NEXT: addi a3, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v24, v0.t ; ZVFHMIN-NEXT: fsrm a2 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; ZVFHMIN-NEXT: vfsgnj.vv v24, v16, v24, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 +; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; ZVFHMIN-NEXT: vmv1r.v v0, v17 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB22_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB22_2: -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 ; ZVFHMIN-NEXT: vmv1r.v v0, v7 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t +; ZVFHMIN-NEXT: vmv1r.v v8, v7 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v16, v24, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; ZVFHMIN-NEXT: vmflt.vf v7, v16, fa5, v0.t +; ZVFHMIN-NEXT: vmflt.vf v8, v16, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a0, 1 -; ZVFHMIN-NEXT: vmv1r.v v0, v7 +; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v16, v24, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v24, v16, v24, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 +; ZVFHMIN-NEXT: vmv1r.v v0, v7 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24, v0.t ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add sp, sp, a0 @@ -928,42 +985,55 @@ define @vp_roundtozero_nxv32f16_unmasked( @vp_roundtozero_nxv32f16_unmasked( @llvm.vp.roundtozero.nxv32f16( %va, splat (i1 true), i32 %evl) ret %v diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll index 4a8ee54d87eba..9e78bbdc4f441 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll @@ -18,12 +18,12 @@ define @vfadd_vv_nxv1bf16( %va, @llvm.vp.fadd.nxv1bf16( %va, %b, %m, i32 %evl) ret %v @@ -37,7 +37,7 @@ define @vfadd_vv_nxv1bf16_unmasked( % ; CHECK-NEXT: 
vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfadd.vv v9, v9, v10 -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: ret %v = call @llvm.vp.fadd.nxv1bf16( %va, %b, splat (i1 true), i32 %evl) @@ -50,12 +50,12 @@ define @vfadd_vf_nxv1bf16( %va, bfloa ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfadd.vv v9, v10, v8, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -69,12 +69,12 @@ define @vfadd_vf_nxv1bf16_commute( %v ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfadd.vv v9, v8, v10, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -92,7 +92,7 @@ define @vfadd_vf_nxv1bf16_unmasked( % ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfadd.vv v9, v10, v8 -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 @@ -111,7 +111,7 @@ define @vfadd_vf_nxv1bf16_unmasked_commute( poison, bfloat %b, i32 0 @@ -126,12 +126,12 @@ define @vfadd_vv_nxv2bf16( %va, @llvm.vp.fadd.nxv2bf16( %va, %b, %m, i32 %evl) ret %v @@ -145,7 +145,7 @@ define @vfadd_vv_nxv2bf16_unmasked( % ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfadd.vv v9, v9, v10 -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: ret %v = call @llvm.vp.fadd.nxv2bf16( %va, %b, splat (i1 true), i32 %evl) @@ -158,12 +158,12 @@ define @vfadd_vf_nxv2bf16( %va, bfloa ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfadd.vv v9, v10, v8, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ 
-181,7 +181,7 @@ define @vfadd_vf_nxv2bf16_unmasked( % ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfadd.vv v9, v10, v8 -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 @@ -196,12 +196,12 @@ define @vfadd_vv_nxv4bf16( %va, @llvm.vp.fadd.nxv4bf16( %va, %b, %m, i32 %evl) ret %v @@ -215,7 +215,7 @@ define @vfadd_vv_nxv4bf16_unmasked( % ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfadd.vv v10, v12, v10 -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.fadd.nxv4bf16( %va, %b, splat (i1 true), i32 %evl) @@ -228,12 +228,12 @@ define @vfadd_vf_nxv4bf16( %va, bfloa ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfadd.vv v10, v10, v12, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -251,7 +251,7 @@ define @vfadd_vf_nxv4bf16_unmasked( % ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfadd.vv v10, v10, v12 -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 @@ -266,12 +266,12 @@ define @vfadd_vv_nxv8bf16( %va, @llvm.vp.fadd.nxv8bf16( %va, %b, %m, i32 %evl) ret %v @@ -285,7 +285,7 @@ define @vfadd_vv_nxv8bf16_unmasked( % ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfadd.vv v12, v16, v12 -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 ; CHECK-NEXT: ret %v = call @llvm.vp.fadd.nxv8bf16( %va, %b, splat (i1 true), i32 %evl) @@ -298,12 +298,12 @@ define @vfadd_vf_nxv8bf16( %va, bfloa ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfadd.vv v12, v12, v16, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -321,7 +321,7 @@ define @vfadd_vf_nxv8bf16_unmasked( % ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfadd.vv v12, v12, v16 -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: 
vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 @@ -336,12 +336,12 @@ define @vfadd_vv_nxv16bf16( %va, @llvm.vp.fadd.nxv16bf16( %va, %b, %m, i32 %evl) ret %v @@ -355,7 +355,7 @@ define @vfadd_vv_nxv16bf16_unmasked( @llvm.vp.fadd.nxv16bf16( %va, %b, splat (i1 true), i32 %evl) @@ -368,12 +368,12 @@ define @vfadd_vf_nxv16bf16( %va, bf ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vmv.v.x v12, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfadd.vv v16, v16, v24, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -391,7 +391,7 @@ define @vfadd_vf_nxv16bf16_unmasked( poison, bfloat %b, i32 0 @@ -409,10 +409,18 @@ define @vfadd_vv_nxv32bf16( %va, @vfadd_vv_nxv32bf16( %va, @vfadd_vv_nxv32bf16_unmasked( @vfadd_vv_nxv32bf16_unmasked( @vfadd_vf_nxv32bf16( %va, bf ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a2, a1, 4 -; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; CHECK-NEXT: vmv8r.v v24, v8 +; CHECK-NEXT: vmv1r.v v7, v0 +; CHECK-NEXT: vmv8r.v v16, v8 ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: vmv.v.x v16, a1 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a3, a1, 3 -; CHECK-NEXT: add a1, a3, a1 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: slli a1, a2, 1 ; CHECK-NEXT: srli a2, a2, 2 ; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: csrr a4, vlenb -; CHECK-NEXT: slli a4, a4, 3 -; CHECK-NEXT: add a4, sp, a4 -; CHECK-NEXT: addi a4, a4, 16 -; CHECK-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v0, a2 ; CHECK-NEXT: sltu a2, a0, a3 ; CHECK-NEXT: addi a2, a2, -1 ; CHECK-NEXT: and a2, a2, a3 -; CHECK-NEXT: addi a3, sp, 16 +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: slli a3, a3, 3 +; CHECK-NEXT: add a3, sp, a3 +; CHECK-NEXT: addi a3, a3, 16 ; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t +; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a3, a2, 3 -; CHECK-NEXT: add a2, a3, a2 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28 +; CHECK-NEXT: vs8r.v v8, (a2) # 
Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfadd.vv v16, v8, v16, v0.t -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: vfadd.vv v24, v8, v24, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24, v0.t ; CHECK-NEXT: bltu a0, a1, .LBB24_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB24_2: -; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 3 -; CHECK-NEXT: add a0, a1, a0 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl1r.v v0, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24, v0.t +; CHECK-NEXT: vmv8r.v v24, v16 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfadd.vv v16, v16, v24, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vfadd.vv v24, v16, v24, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24, v0.t ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 4 -; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -601,19 +637,14 @@ define @vfadd_vf_nxv32bf16_unmasked( @vfadd_vf_nxv32bf16_unmasked( @vfadd_vv_nxv1f16( %va, @llvm.vp.fadd.nxv1f16( %va, %b, %m, i32 %evl) ret %v @@ -704,7 +724,7 @@ define @vfadd_vv_nxv1f16_unmasked( %va, < ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v9, v9, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.fadd.nxv1f16( %va, %b, splat (i1 true), i32 %evl) @@ -723,12 +743,12 @@ define @vfadd_vf_nxv1f16( %va, half %b, < ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v9, v10, v8, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector 
%elt.head, poison, zeroinitializer @@ -748,12 +768,12 @@ define @vfadd_vf_nxv1f16_commute( %va, ha ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v9, v8, v10, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -777,7 +797,7 @@ define @vfadd_vf_nxv1f16_unmasked( %va, h ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v9, v10, v8 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -802,7 +822,7 @@ define @vfadd_vf_nxv1f16_unmasked_commute( poison, half %b, i32 0 @@ -823,12 +843,12 @@ define @vfadd_vv_nxv2f16( %va, @llvm.vp.fadd.nxv2f16( %va, %b, %m, i32 %evl) ret %v @@ -848,7 +868,7 @@ define @vfadd_vv_nxv2f16_unmasked( %va, < ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v9, v9, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.fadd.nxv2f16( %va, %b, splat (i1 true), i32 %evl) @@ -867,12 +887,12 @@ define @vfadd_vf_nxv2f16( %va, half %b, < ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v9, v10, v8, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -896,7 +916,7 @@ define @vfadd_vf_nxv2f16_unmasked( %va, h ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v9, v10, v8 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -917,12 +937,12 @@ define @vfadd_vv_nxv4f16( %va, @llvm.vp.fadd.nxv4f16( %va, %b, %m, i32 %evl) ret %v @@ -942,7 +962,7 @@ define @vfadd_vv_nxv4f16_unmasked( %va, < ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v10, v12, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.fadd.nxv4f16( %va, %b, splat (i1 true), i32 %evl) @@ -961,12 +981,12 @@ define @vfadd_vf_nxv4f16( %va, half %b, < ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 
; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v10, v10, v12, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -990,7 +1010,7 @@ define @vfadd_vf_nxv4f16_unmasked( %va, h ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v10, v10, v12 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -1011,12 +1031,12 @@ define @vfadd_vv_nxv8f16( %va, @llvm.vp.fadd.nxv8f16( %va, %b, %m, i32 %evl) ret %v @@ -1036,7 +1056,7 @@ define @vfadd_vv_nxv8f16_unmasked( %va, < ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v12, v16, v12 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.fadd.nxv8f16( %va, %b, splat (i1 true), i32 %evl) @@ -1055,12 +1075,12 @@ define @vfadd_vf_nxv8f16( %va, half %b, < ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v12, v12, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1084,7 +1104,7 @@ define @vfadd_vf_nxv8f16_unmasked( %va, h ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v12, v12, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -1105,12 +1125,12 @@ define @vfadd_vv_nxv16f16( %va, @llvm.vp.fadd.nxv16f16( %va, %b, %m, i32 %evl) ret %v @@ -1130,7 +1150,7 @@ define @vfadd_vv_nxv16f16_unmasked( %va ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.fadd.nxv16f16( %va, %b, splat (i1 true), i32 %evl) @@ -1149,12 +1169,12 @@ define @vfadd_vf_nxv16f16( %va, half %b ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v12, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v 
v16, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1178,7 +1198,7 @@ define @vfadd_vf_nxv16f16_unmasked( %va ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -1202,10 +1222,18 @@ define @vfadd_vv_nxv32f16( %va, @vfadd_vv_nxv32f16( %va, @vfadd_vv_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: addi a3, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB49_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 @@ -1294,7 +1350,7 @@ define @vfadd_vv_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 @@ -1319,76 +1375,76 @@ define @vfadd_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a2, a1, 4 -; ZVFHMIN-NEXT: add a1, a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv8r.v v24, v8 +; ZVFHMIN-NEXT: vmv1r.v v7, v0 +; ZVFHMIN-NEXT: vmv8r.v v16, v8 ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: vmv.v.x v16, a1 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a3, a1, 3 -; ZVFHMIN-NEXT: add a1, a3, a1 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vmv.v.x v8, a1 ; ZVFHMIN-NEXT: slli a1, a2, 1 ; ZVFHMIN-NEXT: srli a2, a2, 2 ; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, 
ma ; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2 ; ZVFHMIN-NEXT: sltu a2, a0, a3 ; ZVFHMIN-NEXT: addi a2, a2, -1 ; ZVFHMIN-NEXT: and a2, a2, a3 -; ZVFHMIN-NEXT: addi a3, sp, 16 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 3 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: vmv4r.v v8, v16 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a3, a2, 3 -; ZVFHMIN-NEXT: add a2, a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfadd.vv v16, v8, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vfadd.vv v24, v8, v24, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB50_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB50_2: -; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vmv1r.v v0, v7 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 4 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a1, a0, 3 -; ZVFHMIN-NEXT: add a0, a1, a0 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24, v0.t +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl1r.v v0, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24, v0.t +; ZVFHMIN-NEXT: vmv8r.v v24, v16 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vfadd.vv v24, v16, v24, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24, v0.t ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a1, a0, 4 -; ZVFHMIN-NEXT: add a0, a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 @@ -1412,19 +1468,14 @@ define @vfadd_vf_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 4 +; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 
0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmset.m v24 ; ZVFHMIN-NEXT: vmv.v.x v16, a1 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: slli a1, a2, 1 ; ZVFHMIN-NEXT: srli a2, a2, 2 ; ZVFHMIN-NEXT: sub a3, a0, a1 @@ -1433,41 +1484,30 @@ define @vfadd_vf_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: sltu a2, a0, a3 ; ZVFHMIN-NEXT: addi a2, a2, -1 ; ZVFHMIN-NEXT: and a2, a2, a3 -; ZVFHMIN-NEXT: vmv8r.v v16, v8 ; ZVFHMIN-NEXT: addi a3, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfadd.vv v16, v8, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB51_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB51_2: -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: addi a0, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll index 2997fad370e46..532629ef7a8a8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll @@ -18,12 +18,12 @@ define @vfdiv_vv_nxv1bf16( %va, @llvm.vp.fdiv.nxv1bf16( %va, %b, %m, i32 %evl) ret %v @@ -37,7 +37,7 @@ define @vfdiv_vv_nxv1bf16_unmasked( % ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfdiv.vv v9, v9, v10 -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: ret %v = call @llvm.vp.fdiv.nxv1bf16( %va, %b, splat (i1 true), i32 %evl) @@ -50,12 +50,12 @@ define @vfdiv_vf_nxv1bf16( %va, bfloa ; 
CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfdiv.vv v9, v10, v8, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -73,7 +73,7 @@ define @vfdiv_vf_nxv1bf16_unmasked( % ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfdiv.vv v9, v10, v8 -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 @@ -88,12 +88,12 @@ define @vfdiv_vv_nxv2bf16( %va, @llvm.vp.fdiv.nxv2bf16( %va, %b, %m, i32 %evl) ret %v @@ -107,7 +107,7 @@ define @vfdiv_vv_nxv2bf16_unmasked( % ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfdiv.vv v9, v9, v10 -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: ret %v = call @llvm.vp.fdiv.nxv2bf16( %va, %b, splat (i1 true), i32 %evl) @@ -120,12 +120,12 @@ define @vfdiv_vf_nxv2bf16( %va, bfloa ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfdiv.vv v9, v10, v8, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -143,7 +143,7 @@ define @vfdiv_vf_nxv2bf16_unmasked( % ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfdiv.vv v9, v10, v8 -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 @@ -158,12 +158,12 @@ define @vfdiv_vv_nxv4bf16( %va, @llvm.vp.fdiv.nxv4bf16( %va, %b, %m, i32 %evl) ret %v @@ -177,7 +177,7 @@ define @vfdiv_vv_nxv4bf16_unmasked( % ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfdiv.vv v10, v12, v10 -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.fdiv.nxv4bf16( %va, %b, splat (i1 true), i32 %evl) @@ -190,12 +190,12 @@ define @vfdiv_vf_nxv4bf16( %va, bfloa ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v 
v12, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfdiv.vv v10, v10, v12, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -213,7 +213,7 @@ define @vfdiv_vf_nxv4bf16_unmasked( % ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfdiv.vv v10, v10, v12 -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 @@ -228,12 +228,12 @@ define @vfdiv_vv_nxv8bf16( %va, @llvm.vp.fdiv.nxv8bf16( %va, %b, %m, i32 %evl) ret %v @@ -247,7 +247,7 @@ define @vfdiv_vv_nxv8bf16_unmasked( % ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfdiv.vv v12, v16, v12 -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 ; CHECK-NEXT: ret %v = call @llvm.vp.fdiv.nxv8bf16( %va, %b, splat (i1 true), i32 %evl) @@ -260,12 +260,12 @@ define @vfdiv_vf_nxv8bf16( %va, bfloa ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfdiv.vv v12, v12, v16, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -283,7 +283,7 @@ define @vfdiv_vf_nxv8bf16_unmasked( % ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfdiv.vv v12, v12, v16 -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 @@ -298,12 +298,12 @@ define @vfdiv_vv_nxv16bf16( %va, @llvm.vp.fdiv.nxv16bf16( %va, %b, %m, i32 %evl) ret %v @@ -317,7 +317,7 @@ define @vfdiv_vv_nxv16bf16_unmasked( @llvm.vp.fdiv.nxv16bf16( %va, %b, splat (i1 true), i32 %evl) @@ -330,12 +330,12 @@ define @vfdiv_vf_nxv16bf16( %va, bf ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vmv.v.x v12, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfdiv.vv v16, v16, v24, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -353,7 +353,7 @@ define @vfdiv_vf_nxv16bf16_unmasked( poison, bfloat %b, i32 0 @@ -371,10 +371,18 @@ define 
@vfdiv_vv_nxv32bf16( %va, @vfdiv_vv_nxv32bf16( %va, @vfdiv_vv_nxv32bf16_unmasked( @vfdiv_vv_nxv32bf16_unmasked( @vfdiv_vf_nxv32bf16( %va, bf ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a2, a1, 4 -; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; CHECK-NEXT: vmv8r.v v24, v8 +; CHECK-NEXT: vmv1r.v v7, v0 +; CHECK-NEXT: vmv8r.v v16, v8 ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: vmv.v.x v16, a1 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a3, a1, 3 -; CHECK-NEXT: add a1, a3, a1 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: slli a1, a2, 1 ; CHECK-NEXT: srli a2, a2, 2 ; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: csrr a4, vlenb -; CHECK-NEXT: slli a4, a4, 3 -; CHECK-NEXT: add a4, sp, a4 -; CHECK-NEXT: addi a4, a4, 16 -; CHECK-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v0, a2 ; CHECK-NEXT: sltu a2, a0, a3 ; CHECK-NEXT: addi a2, a2, -1 ; CHECK-NEXT: and a2, a2, a3 -; CHECK-NEXT: addi a3, sp, 16 +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: slli a3, a3, 3 +; CHECK-NEXT: add a3, sp, a3 +; CHECK-NEXT: addi a3, a3, 16 ; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t +; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a3, a2, 3 -; CHECK-NEXT: add a2, a3, a2 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfdiv.vv v16, v8, v16, v0.t -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: vfdiv.vv v24, v8, v24, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24, v0.t ; CHECK-NEXT: bltu a0, a1, .LBB22_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB22_2: -; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 3 -; CHECK-NEXT: add a0, a1, a0 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; 
CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl1r.v v0, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24, v0.t +; CHECK-NEXT: vmv8r.v v24, v16 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfdiv.vv v16, v16, v24, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vfdiv.vv v24, v16, v24, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24, v0.t ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 4 -; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -563,19 +599,14 @@ define @vfdiv_vf_nxv32bf16_unmasked( @vfdiv_vf_nxv32bf16_unmasked( @vfdiv_vv_nxv1f16( %va, @llvm.vp.fdiv.nxv1f16( %va, %b, %m, i32 %evl) ret %v @@ -666,7 +686,7 @@ define @vfdiv_vv_nxv1f16_unmasked( %va, < ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v9, v9, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.fdiv.nxv1f16( %va, %b, splat (i1 true), i32 %evl) @@ -685,12 +705,12 @@ define @vfdiv_vf_nxv1f16( %va, half %b, < ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v9, v10, v8, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -714,7 +734,7 @@ define @vfdiv_vf_nxv1f16_unmasked( %va, h ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v9, v10, v8 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -735,12 +755,12 @@ define @vfdiv_vv_nxv2f16( %va, @llvm.vp.fdiv.nxv2f16( %va, %b, %m, i32 %evl) ret %v @@ -760,7 +780,7 @@ define @vfdiv_vv_nxv2f16_unmasked( %va, < ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v9, v9, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.fdiv.nxv2f16( %va, %b, splat (i1 true), i32 %evl) @@ -779,12 +799,12 @@ define @vfdiv_vf_nxv2f16( %va, half %b, < ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: 
vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v9, v10, v8, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -808,7 +828,7 @@ define @vfdiv_vf_nxv2f16_unmasked( %va, h ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v9, v10, v8 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -829,12 +849,12 @@ define @vfdiv_vv_nxv4f16( %va, @llvm.vp.fdiv.nxv4f16( %va, %b, %m, i32 %evl) ret %v @@ -854,7 +874,7 @@ define @vfdiv_vv_nxv4f16_unmasked( %va, < ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v10, v12, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.fdiv.nxv4f16( %va, %b, splat (i1 true), i32 %evl) @@ -873,12 +893,12 @@ define @vfdiv_vf_nxv4f16( %va, half %b, < ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v10, v10, v12, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -902,7 +922,7 @@ define @vfdiv_vf_nxv4f16_unmasked( %va, h ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v10, v10, v12 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -923,12 +943,12 @@ define @vfdiv_vv_nxv8f16( %va, @llvm.vp.fdiv.nxv8f16( %va, %b, %m, i32 %evl) ret %v @@ -948,7 +968,7 @@ define @vfdiv_vv_nxv8f16_unmasked( %va, < ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v12, v16, v12 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.fdiv.nxv8f16( %va, %b, splat (i1 true), i32 %evl) @@ -967,12 +987,12 @@ define @vfdiv_vf_nxv8f16( %va, half %b, < ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v12, v12, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, 
m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -996,7 +1016,7 @@ define @vfdiv_vf_nxv8f16_unmasked( %va, h ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v12, v12, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -1017,12 +1037,12 @@ define @vfdiv_vv_nxv16f16( %va, @llvm.vp.fdiv.nxv16f16( %va, %b, %m, i32 %evl) ret %v @@ -1042,7 +1062,7 @@ define @vfdiv_vv_nxv16f16_unmasked( %va ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v16, v24, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.fdiv.nxv16f16( %va, %b, splat (i1 true), i32 %evl) @@ -1061,12 +1081,12 @@ define @vfdiv_vf_nxv16f16( %va, half %b ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v12, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v16, v16, v24, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1090,7 +1110,7 @@ define @vfdiv_vf_nxv16f16_unmasked( %va ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v16, v16, v24 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -1114,10 +1134,18 @@ define @vfdiv_vv_nxv32f16( %va, @vfdiv_vv_nxv32f16( %va, @vfdiv_vv_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: addi a3, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v16, v16, v24, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB45_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 @@ -1206,7 +1262,7 @@ define @vfdiv_vv_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v16, v24, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 @@ 
-1231,76 +1287,76 @@ define @vfdiv_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a2, a1, 4 -; ZVFHMIN-NEXT: add a1, a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv8r.v v24, v8 +; ZVFHMIN-NEXT: vmv1r.v v7, v0 +; ZVFHMIN-NEXT: vmv8r.v v16, v8 ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: vmv.v.x v16, a1 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a3, a1, 3 -; ZVFHMIN-NEXT: add a1, a3, a1 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vmv.v.x v8, a1 ; ZVFHMIN-NEXT: slli a1, a2, 1 ; ZVFHMIN-NEXT: srli a2, a2, 2 ; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2 ; ZVFHMIN-NEXT: sltu a2, a0, a3 ; ZVFHMIN-NEXT: addi a2, a2, -1 ; ZVFHMIN-NEXT: and a2, a2, a3 -; ZVFHMIN-NEXT: addi a3, sp, 16 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 3 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: vmv4r.v v8, v16 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a3, a2, 3 -; ZVFHMIN-NEXT: add a2, a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v16, v8, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vfdiv.vv v24, v8, v24, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB46_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB46_2: -; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vmv1r.v v0, v7 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 4 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a1, a0, 3 -; ZVFHMIN-NEXT: add a0, a1, a0 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24, v0.t +; ZVFHMIN-NEXT: addi a0, sp, 16 +; 
ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl1r.v v0, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24, v0.t +; ZVFHMIN-NEXT: vmv8r.v v24, v16 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v16, v16, v24, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vfdiv.vv v24, v16, v24, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24, v0.t ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a1, a0, 4 -; ZVFHMIN-NEXT: add a0, a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 @@ -1324,19 +1380,14 @@ define @vfdiv_vf_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 4 +; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmset.m v24 ; ZVFHMIN-NEXT: vmv.v.x v16, a1 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: slli a1, a2, 1 ; ZVFHMIN-NEXT: srli a2, a2, 2 ; ZVFHMIN-NEXT: sub a3, a0, a1 @@ -1345,41 +1396,30 @@ define @vfdiv_vf_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: sltu a2, a0, a3 ; ZVFHMIN-NEXT: addi a2, a2, -1 ; ZVFHMIN-NEXT: and a2, a2, a3 -; ZVFHMIN-NEXT: vmv8r.v v16, v8 ; ZVFHMIN-NEXT: addi a3, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v16, v8, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vfdiv.vv v16, v16, v24, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB47_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB47_2: -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 -; ZVFHMIN-NEXT: csrr a0, vlenb -; 
ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: addi a0, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v16, v16, v24 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll index ab67e9833c78a..5ee5d40d8313d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll @@ -17,14 +17,14 @@ declare @llvm.vp.fma.nxv1bf16(, @vfma_vv_nxv1bf16( %va, %b, %c, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfma_vv_nxv1bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v10 -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v10, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfmadd.vv v12, v10, v11, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t ; CHECK-NEXT: ret %v = call @llvm.vp.fma.nxv1bf16( %va, %b, %c, %m, i32 %evl) ret %v @@ -33,13 +33,13 @@ define @vfma_vv_nxv1bf16( %va, @vfma_vv_nxv1bf16_unmasked( %va, %b, %c, i32 zeroext %evl) { ; CHECK-LABEL: vfma_vv_nxv1bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v10 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfmadd.vv v12, v10, v11 -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 ; CHECK-NEXT: ret %v = call @llvm.vp.fma.nxv1bf16( %va, %b, %c, splat (i1 true), i32 %evl) @@ -50,15 +50,15 @@ define @vfma_vf_nxv1bf16( %va, bfloat ; CHECK-LABEL: vfma_vf_nxv1bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9, v0.t ; CHECK-NEXT: vmv.v.x v9, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v8, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfmadd.vv v12, v11, v10, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = 
shufflevector %elt.head, poison, zeroinitializer @@ -70,15 +70,15 @@ define @vfma_vf_nxv1bf16_commute( %va ; CHECK-LABEL: vfma_vf_nxv1bf16_commute: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9, v0.t ; CHECK-NEXT: vmv.v.x v9, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v8, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfmadd.vv v11, v8, v10, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v11 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v11, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -90,14 +90,14 @@ define @vfma_vf_nxv1bf16_unmasked( %v ; CHECK-LABEL: vfma_vf_nxv1bf16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfmadd.vv v12, v11, v10 -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 @@ -110,14 +110,14 @@ define @vfma_vf_nxv1bf16_unmasked_commute( poison, bfloat %b, i32 0 @@ -131,14 +131,14 @@ declare @llvm.vp.fma.nxv2bf16(, @vfma_vv_nxv2bf16( %va, %b, %c, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfma_vv_nxv2bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v10 -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v10, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfmadd.vv v12, v10, v11, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t ; CHECK-NEXT: ret %v = call @llvm.vp.fma.nxv2bf16( %va, %b, %c, %m, i32 %evl) ret %v @@ -147,13 +147,13 @@ define @vfma_vv_nxv2bf16( %va, @vfma_vv_nxv2bf16_unmasked( %va, %b, %c, i32 zeroext %evl) { ; CHECK-LABEL: vfma_vv_nxv2bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v10 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfmadd.vv v12, v10, v11 -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 ; CHECK-NEXT: ret %v = call @llvm.vp.fma.nxv2bf16( %va, %b, %c, splat (i1 
true), i32 %evl) @@ -164,15 +164,15 @@ define @vfma_vf_nxv2bf16( %va, bfloat ; CHECK-LABEL: vfma_vf_nxv2bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9, v0.t ; CHECK-NEXT: vmv.v.x v9, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v8, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfmadd.vv v12, v11, v10, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -184,15 +184,15 @@ define @vfma_vf_nxv2bf16_commute( %va ; CHECK-LABEL: vfma_vf_nxv2bf16_commute: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9, v0.t ; CHECK-NEXT: vmv.v.x v9, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v8, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfmadd.vv v11, v8, v10, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v11 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v11, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -204,14 +204,14 @@ define @vfma_vf_nxv2bf16_unmasked( %v ; CHECK-LABEL: vfma_vf_nxv2bf16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfmadd.vv v12, v11, v10 -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 @@ -224,14 +224,14 @@ define @vfma_vf_nxv2bf16_unmasked_commute( poison, bfloat %b, i32 0 @@ -245,14 +245,14 @@ declare @llvm.vp.fma.nxv4bf16(, @vfma_vv_nxv4bf16( %va, %b, %c, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfma_vv_nxv4bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfmadd.vv v14, v10, v12, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: 
vfncvtbf16.f.f.w v8, v14 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v14, v0.t ; CHECK-NEXT: ret %v = call @llvm.vp.fma.nxv4bf16( %va, %b, %c, %m, i32 %evl) ret %v @@ -261,13 +261,13 @@ define @vfma_vv_nxv4bf16( %va, @vfma_vv_nxv4bf16_unmasked( %va, %b, %c, i32 zeroext %evl) { ; CHECK-LABEL: vfma_vv_nxv4bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfmadd.vv v14, v10, v12 -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v14 ; CHECK-NEXT: ret %v = call @llvm.vp.fma.nxv4bf16( %va, %b, %c, splat (i1 true), i32 %evl) @@ -278,15 +278,15 @@ define @vfma_vf_nxv4bf16( %va, bfloat ; CHECK-LABEL: vfma_vf_nxv4bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9, v0.t ; CHECK-NEXT: vmv.v.x v9, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfmadd.vv v14, v12, v10, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v14 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v14, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -298,15 +298,15 @@ define @vfma_vf_nxv4bf16_commute( %va ; CHECK-LABEL: vfma_vf_nxv4bf16_commute: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9, v0.t ; CHECK-NEXT: vmv.v.x v9, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfmadd.vv v12, v14, v10, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -318,14 +318,14 @@ define @vfma_vf_nxv4bf16_unmasked( %v ; CHECK-LABEL: vfma_vf_nxv4bf16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vmv.v.x v9, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfmadd.vv v14, v12, v10 -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, 
m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v14 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 @@ -338,14 +338,14 @@ define @vfma_vf_nxv4bf16_unmasked_commute( poison, bfloat %b, i32 0 @@ -359,14 +359,14 @@ declare @llvm.vp.fma.nxv8bf16(, @vfma_vv_nxv8bf16( %va, %b, %c, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfma_vv_nxv8bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v10, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfmadd.vv v20, v12, v16, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v20 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v20, v0.t ; CHECK-NEXT: ret %v = call @llvm.vp.fma.nxv8bf16( %va, %b, %c, %m, i32 %evl) ret %v @@ -375,13 +375,13 @@ define @vfma_vv_nxv8bf16( %va, @vfma_vv_nxv8bf16_unmasked( %va, %b, %c, i32 zeroext %evl) { ; CHECK-LABEL: vfma_vv_nxv8bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfmadd.vv v20, v12, v16 -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v20 ; CHECK-NEXT: ret %v = call @llvm.vp.fma.nxv8bf16( %va, %b, %c, splat (i1 true), i32 %evl) @@ -392,15 +392,15 @@ define @vfma_vf_nxv8bf16( %va, bfloat ; CHECK-LABEL: vfma_vf_nxv8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10, v0.t ; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v10, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfmadd.vv v20, v16, v12, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v20 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v20, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -412,15 +412,15 @@ define @vfma_vf_nxv8bf16_commute( %va ; CHECK-LABEL: vfma_vf_nxv8bf16_commute: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10, v0.t ; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v10, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfmadd.vv 
v16, v20, v12, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -432,14 +432,14 @@ define @vfma_vf_nxv8bf16_unmasked( %v ; CHECK-LABEL: vfma_vf_nxv8bf16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 ; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v10 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfmadd.vv v20, v16, v12 -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v20 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 @@ -452,14 +452,14 @@ define @vfma_vf_nxv8bf16_unmasked_commute( poison, bfloat %b, i32 0 @@ -479,17 +479,17 @@ define @vfma_vv_nxv16bf16( %va, @vfma_vv_nxv16bf16( %va, @vfma_vv_nxv16bf16_unmasked( %va, %b, %c, i32 zeroext %evl) { ; CHECK-LABEL: vfma_vv_nxv16bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v12 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfmadd.vv v0, v16, v24 -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 ; CHECK-NEXT: ret %v = call @llvm.vp.fma.nxv16bf16( %va, %b, %c, splat (i1 true), i32 %evl) @@ -520,32 +520,18 @@ define @vfma_vv_nxv16bf16_unmasked( define @vfma_vf_nxv16bf16( %va, bfloat %b, %vc, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfma_vf_nxv16bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vmv.v.x v12, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t +; CHECK-NEXT: vmv8r.v v8, v24 +; CHECK-NEXT: vmv.v.x v4, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v4, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfmadd.vv v16, v24, v8, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; 
CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -557,15 +543,15 @@ define @vfma_vf_nxv16bf16_commute( ; CHECK-LABEL: vfma_vf_nxv16bf16_commute: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t ; CHECK-NEXT: vmv.v.x v4, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v4 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v4, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfmadd.vv v24, v8, v16, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -577,14 +563,14 @@ define @vfma_vf_nxv16bf16_unmasked( ; CHECK-LABEL: vfma_vf_nxv16bf16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 ; CHECK-NEXT: vmv.v.x v12, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v12 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfmadd.vv v0, v24, v16 -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 @@ -597,14 +583,14 @@ define @vfma_vf_nxv16bf16_unmasked_commute( poison, bfloat %b, i32 0 @@ -621,142 +607,124 @@ define @vfma_vv_nxv32bf16( %va, @vfma_vv_nxv32bf16_unmasked( ; CHECK-NEXT: slli a2, a2, 5 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; CHECK-NEXT: vl8re16.v v24, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: mv a2, a0 -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: add a0, a0, a2 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: mv a3, a2 +; CHECK-NEXT: slli a2, a2, 1 +; CHECK-NEXT: add a2, a2, a3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vl8re16.v v16, (a0) ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; CHECK-NEXT: vmset.m v7 +; CHECK-NEXT: vmset.m v24 ; CHECK-NEXT: slli a0, a2, 1 ; CHECK-NEXT: srli a2, a2, 2 ; CHECK-NEXT: sub a3, a1, a0 ; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v0, v7, a2 +; CHECK-NEXT: vslidedown.vx v0, v24, a2 ; CHECK-NEXT: sltu a2, a1, a3 ; CHECK-NEXT: addi a2, a2, -1 ; CHECK-NEXT: and a2, a2, a3 @@ -801,34 +769,32 @@ define @vfma_vv_nxv32bf16_unmasked( ; CHECK-NEXT: slli a3, a3, 3 ; CHECK-NEXT: add a3, sp, a3 ; CHECK-NEXT: addi a3, a3, 16 -; CHECK-NEXT: vs8r.v 
v8, (a3) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a3, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 -; CHECK-NEXT: addi a3, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; CHECK-NEXT: vmv4r.v v8, v16 -; CHECK-NEXT: vmv8r.v v24, v16 -; CHECK-NEXT: csrr a3, vlenb -; CHECK-NEXT: slli a3, a3, 4 -; CHECK-NEXT: add a3, sp, a3 -; CHECK-NEXT: addi a3, a3, 16 -; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28 -; CHECK-NEXT: csrr a3, vlenb -; CHECK-NEXT: slli a3, a3, 3 -; CHECK-NEXT: mv a4, a3 -; CHECK-NEXT: slli a3, a3, 1 -; CHECK-NEXT: add a3, a3, a4 -; CHECK-NEXT: add a3, sp, a3 -; CHECK-NEXT: addi a3, a3, 16 -; CHECK-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28 -; CHECK-NEXT: addi a3, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v16, v24, v8, v0.t -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: mv a3, a2 +; CHECK-NEXT: slli a2, a2, 1 +; CHECK-NEXT: add a2, a2, a3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20, v0.t +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v24, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v20, v8, v0.t ; CHECK-NEXT: bltu a1, a0, .LBB31_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a1, a0 @@ -837,31 +803,33 @@ define @vfma_vv_nxv32bf16_unmasked( ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: mv a2, a0 +; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, 
(a0) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vfmacc.vv v0, v16, v24 -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmadd.vv v0, v24, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v16, v0 +; CHECK-NEXT: vmv8r.v v8, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 @@ -879,140 +847,127 @@ define @vfma_vf_nxv32bf16( %va, bfl ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: mv a2, a1 ; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a2, a2, a1 +; CHECK-NEXT: mv a2, a1 ; CHECK-NEXT: slli a1, a1, 2 ; CHECK-NEXT: add a1, a1, a2 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; CHECK-NEXT: vmv1r.v v7, v0 +; CHECK-NEXT: vmv8r.v v24, v8 ; CHECK-NEXT: fmv.x.h a2, fa0 ; CHECK-NEXT: csrr a3, vlenb -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a4, a1, 5 -; CHECK-NEXT: add a1, a4, a1 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: slli a1, a3, 1 ; CHECK-NEXT: srli a3, a3, 2 ; CHECK-NEXT: sub a4, a0, a1 -; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: slli a5, a5, 4 -; CHECK-NEXT: add a5, sp, a5 -; CHECK-NEXT: addi a5, a5, 16 -; CHECK-NEXT: vs1r.v v0, (a5) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v0, a3 ; CHECK-NEXT: sltu a3, a0, a4 ; CHECK-NEXT: addi a3, a3, -1 ; CHECK-NEXT: and a3, a3, a4 ; CHECK-NEXT: csrr a4, vlenb -; CHECK-NEXT: mv a5, a4 +; CHECK-NEXT: slli a4, a4, 4 +; CHECK-NEXT: add a4, sp, a4 +; CHECK-NEXT: addi a4, a4, 16 +; CHECK-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20, v0.t +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: slli a4, a4, 3 +; CHECK-NEXT: add a4, sp, a4 +; CHECK-NEXT: addi a4, a4, 16 +; CHECK-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a4, vlenb ; CHECK-NEXT: slli a4, a4, 3 -; CHECK-NEXT: add a5, a5, a4 +; CHECK-NEXT: mv a5, a4 ; CHECK-NEXT: slli a4, a4, 1 ; CHECK-NEXT: add a4, a4, a5 ; CHECK-NEXT: add a4, sp, a4 ; CHECK-NEXT: addi a4, a4, 16 -; CHECK-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a4, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28, v0.t ; CHECK-NEXT: vsetvli a4, zero, e16, m8, ta, ma ; CHECK-NEXT: vmv.v.x v24, a2 -; CHECK-NEXT: vmv4r.v v8, v24 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a4, a2, 4 -; CHECK-NEXT: add a2, a4, a2 +; CHECK-NEXT: slli a2, a2, 5 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 -; 
CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28 +; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 5 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v20, v8 +; CHECK-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmadd.vv v24, v16, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24, v0.t ; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: bltu a0, a1, .LBB32_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB32_2: -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a2, a1, 5 -; CHECK-NEXT: add a1, a2, a1 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a2, a2, a1 -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: add a1, a1, a2 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a2, a1, 5 -; CHECK-NEXT: add a1, a2, a1 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a2, a1, 4 -; CHECK-NEXT: add a1, a2, a1 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v0 +; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a2, a1, 5 -; CHECK-NEXT: add a1, a2, a1 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8, v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size 
Folded Spill +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v24, v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t ; CHECK-NEXT: vmv.v.v v16, v8 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a1, a1, a0 +; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: slli a0, a0, 2 ; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 @@ -1032,140 +987,132 @@ define @vfma_vf_nxv32bf16_commute( ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: mv a2, a1 ; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a2, a2, a1 +; CHECK-NEXT: mv a2, a1 ; CHECK-NEXT: slli a1, a1, 2 ; CHECK-NEXT: add a1, a1, a2 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb -; CHECK-NEXT: fmv.x.h a2, fa0 -; CHECK-NEXT: csrr a3, vlenb -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a4, a1, 5 -; CHECK-NEXT: add a1, a4, a1 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; CHECK-NEXT: vmv1r.v v7, v0 +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: vmv.v.x v24, a1 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: slli a1, a1, 5 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: slli a1, a3, 1 -; CHECK-NEXT: srli a3, a3, 2 -; CHECK-NEXT: sub a4, a0, a1 -; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: slli a5, a5, 4 -; CHECK-NEXT: add a5, sp, a5 -; CHECK-NEXT: addi a5, a5, 16 -; CHECK-NEXT: vs1r.v v0, (a5) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v0, v0, a3 -; CHECK-NEXT: sltu a3, a0, a4 -; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a3, a3, a4 -; 
CHECK-NEXT: csrr a4, vlenb -; CHECK-NEXT: mv a5, a4 -; CHECK-NEXT: slli a4, a4, 3 -; CHECK-NEXT: add a5, a5, a4 -; CHECK-NEXT: slli a4, a4, 1 -; CHECK-NEXT: add a4, a4, a5 -; CHECK-NEXT: add a4, sp, a4 -; CHECK-NEXT: addi a4, a4, 16 -; CHECK-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a4, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 -; CHECK-NEXT: vsetvli a4, zero, e16, m8, ta, ma -; CHECK-NEXT: vmv.v.x v24, a2 -; CHECK-NEXT: vmv4r.v v8, v24 +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a2 +; CHECK-NEXT: sltu a2, a0, a3 +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: and a2, a2, a3 +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: slli a3, a3, 4 +; CHECK-NEXT: add a3, sp, a3 +; CHECK-NEXT: addi a3, a3, 16 +; CHECK-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a4, a2, 4 -; CHECK-NEXT: add a2, a4, a2 +; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28 +; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 5 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 ; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v16, v8, v24, v0.t -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28, v0.t +; CHECK-NEXT: vmv4r.v v24, v8 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: mv a3, a2 +; CHECK-NEXT: slli a2, a2, 1 +; CHECK-NEXT: add a2, a2, a3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmadd.vv v24, v16, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24, v0.t ; CHECK-NEXT: addi a2, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: bltu a0, a1, .LBB33_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB33_2: -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a2, a1, 5 -; CHECK-NEXT: add a1, a2, a1 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a2, a2, a1 -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: add a1, a1, a2 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v16 -; 
CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a2, a1, 4 -; CHECK-NEXT: add a1, a2, a1 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v0 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a2, a1, 5 -; CHECK-NEXT: add a1, a2, a1 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a2, a1, 5 -; CHECK-NEXT: add a1, a2, a1 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16, v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v16, v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16, v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfmadd.vv v8, v24, v16, v0.t ; CHECK-NEXT: vmv.v.v v16, v8 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a1, a1, a0 +; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: slli a0, a0, 2 ; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 @@ -1191,24 +1138,12 @@ define @vfma_vf_nxv32bf16_unmasked( ; CHECK-NEXT: fmv.x.h a2, fa0 ; CHECK-NEXT: csrr a3, vlenb ; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma -; CHECK-NEXT: vmset.m v7 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: mv a4, a1 -; 
CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: add a1, a1, a4 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vmset.m v24 ; CHECK-NEXT: slli a1, a3, 1 ; CHECK-NEXT: srli a3, a3, 2 ; CHECK-NEXT: sub a4, a0, a1 ; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v0, v7, a3 +; CHECK-NEXT: vslidedown.vx v0, v24, a3 ; CHECK-NEXT: sltu a3, a0, a4 ; CHECK-NEXT: addi a3, a3, -1 ; CHECK-NEXT: and a3, a3, a4 @@ -1216,57 +1151,76 @@ define @vfma_vf_nxv32bf16_unmasked( ; CHECK-NEXT: slli a4, a4, 4 ; CHECK-NEXT: add a4, sp, a4 ; CHECK-NEXT: addi a4, a4, 16 -; CHECK-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a4, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t +; CHECK-NEXT: addi a4, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: slli a4, a4, 3 +; CHECK-NEXT: add a4, sp, a4 +; CHECK-NEXT: addi a4, a4, 16 +; CHECK-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t ; CHECK-NEXT: vsetvli a4, zero, e16, m8, ta, ma -; CHECK-NEXT: vmv.v.x v24, a2 -; CHECK-NEXT: vmv4r.v v8, v24 +; CHECK-NEXT: vmv.v.x v8, a2 ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: mv a4, a2 +; CHECK-NEXT: slli a2, a2, 1 +; CHECK-NEXT: add a2, a2, a4 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 ; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: mv a4, a2 +; CHECK-NEXT: slli a2, a2, 1 +; CHECK-NEXT: add a2, a2, a4 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28, v0.t ; CHECK-NEXT: addi a2, sp, 16 ; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v20, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v20, v8, v0.t ; CHECK-NEXT: bltu a0, a1, .LBB34_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB34_2: ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v24 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # 
Unknown-size Folded Spill +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfmadd.vv v0, v24, v8 -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v16, v0 ; CHECK-NEXT: vmv8r.v v8, v16 ; CHECK-NEXT: csrr a0, vlenb @@ -1291,86 +1245,92 @@ define @vfma_vf_nxv32bf16_unmasked_commute( @vfma_vv_nxv1f16( %va, @llvm.vp.fma.nxv1f16( %va, %b, %c, %m, i32 %evl) ret %v @@ -1418,13 +1378,13 @@ define @vfma_vv_nxv1f16_unmasked( %va, @llvm.vp.fma.nxv1f16( %va, %b, %c, splat (i1 true), i32 %evl) @@ -1441,15 +1401,15 @@ define @vfma_vf_nxv1f16( %va, half %b, poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1467,15 +1427,15 @@ define @vfma_vf_nxv1f16_commute( %va, hal ; ZVFHMIN-LABEL: vfma_vf_nxv1f16_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t ; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v11, v8, v10, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v11 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v11, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1493,14 +1453,14 @@ define @vfma_vf_nxv1f16_unmasked( %va, ha ; ZVFHMIN-LABEL: vfma_vf_nxv1f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v12, v11, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -1519,14 +1479,14 @@ define @vfma_vf_nxv1f16_unmasked_commute( ; ZVFHMIN-LABEL: vfma_vf_nxv1f16_unmasked_commute: ; ZVFHMIN: 
# %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v12, v11, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -1547,14 +1507,14 @@ define @vfma_vv_nxv2f16( %va, @llvm.vp.fma.nxv2f16( %va, %b, %c, %m, i32 %evl) ret %v @@ -1569,13 +1529,13 @@ define @vfma_vv_nxv2f16_unmasked( %va, @llvm.vp.fma.nxv2f16( %va, %b, %c, splat (i1 true), i32 %evl) @@ -1592,15 +1552,15 @@ define @vfma_vf_nxv2f16( %va, half %b, poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1618,15 +1578,15 @@ define @vfma_vf_nxv2f16_commute( %va, hal ; ZVFHMIN-LABEL: vfma_vf_nxv2f16_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t ; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v11, v8, v10, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v11 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v11, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1644,14 +1604,14 @@ define @vfma_vf_nxv2f16_unmasked( %va, ha ; ZVFHMIN-LABEL: vfma_vf_nxv2f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v12, v11, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -1670,14 +1630,14 @@ define @vfma_vf_nxv2f16_unmasked_commute( ; ZVFHMIN-LABEL: vfma_vf_nxv2f16_unmasked_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v12, v11, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 
@@ -1698,14 +1658,14 @@ define @vfma_vv_nxv4f16( %va, @llvm.vp.fma.nxv4f16( %va, %b, %c, %m, i32 %evl) ret %v @@ -1720,13 +1680,13 @@ define @vfma_vv_nxv4f16_unmasked( %va, @llvm.vp.fma.nxv4f16( %va, %b, %c, splat (i1 true), i32 %evl) @@ -1743,15 +1703,15 @@ define @vfma_vf_nxv4f16( %va, half %b, poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1769,15 +1729,15 @@ define @vfma_vf_nxv4f16_commute( %va, hal ; ZVFHMIN-LABEL: vfma_vf_nxv4f16_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t ; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v12, v14, v10, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1795,14 +1755,14 @@ define @vfma_vf_nxv4f16_unmasked( %va, ha ; ZVFHMIN-LABEL: vfma_vf_nxv4f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v14, v12, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v14 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -1821,14 +1781,14 @@ define @vfma_vf_nxv4f16_unmasked_commute( ; ZVFHMIN-LABEL: vfma_vf_nxv4f16_unmasked_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v14, v12, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v14 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -1849,14 +1809,14 @@ define @vfma_vv_nxv8f16( %va, @llvm.vp.fma.nxv8f16( %va, %b, %c, %m, i32 %evl) ret %v @@ -1871,13 +1831,13 @@ define @vfma_vv_nxv8f16_unmasked( %va, @llvm.vp.fma.nxv8f16( %va, %b, %c, splat (i1 true), i32 %evl) @@ -1894,15 +1854,15 @@ define @vfma_vf_nxv8f16( %va, half %b, poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1920,15 +1880,15 @@ define @vfma_vf_nxv8f16_commute( %va, hal ; ZVFHMIN-LABEL: vfma_vf_nxv8f16_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vsetvli 
zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10, v0.t ; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v16, v20, v12, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1946,14 +1906,14 @@ define @vfma_vf_nxv8f16_unmasked( %va, ha ; ZVFHMIN-LABEL: vfma_vf_nxv8f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 ; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v20, v16, v12 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v20 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -1972,14 +1932,14 @@ define @vfma_vf_nxv8f16_unmasked_commute( ; ZVFHMIN-LABEL: vfma_vf_nxv8f16_unmasked_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 ; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v20, v16, v12 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v20 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -2006,17 +1966,17 @@ define @vfma_vv_nxv16f16( %va, @vfma_vv_nxv16f16_unmasked( %va, ; ; ZVFHMIN-LABEL: vfma_vv_nxv16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v12 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v0, v16, v24 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.fma.nxv16f16( %va, %b, %c, splat (i1 true), i32 %evl) @@ -2059,32 +2019,18 @@ define @vfma_vf_nxv16f16( %va, half %b, ; ; ZVFHMIN-LABEL: vfma_vf_nxv16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: addi sp, sp, -16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, 
v12 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vmv.v.x v12, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vmv4r.v v16, v8 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: vmv8r.v v8, v24 +; ZVFHMIN-NEXT: vmv.v.x v4, a0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v4, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add sp, sp, a0 -; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 -; ZVFHMIN-NEXT: addi sp, sp, 16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2102,15 +2048,15 @@ define @vfma_vf_nxv16f16_commute( %va, ; ZVFHMIN-LABEL: vfma_vf_nxv16f16_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t ; ZVFHMIN-NEXT: vmv.v.x v4, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2128,14 +2074,14 @@ define @vfma_vf_nxv16f16_unmasked( %va, ; ZVFHMIN-LABEL: vfma_vf_nxv16f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 ; ZVFHMIN-NEXT: vmv.v.x v12, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v12 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v0, v24, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -2154,14 +2100,14 @@ define @vfma_vf_nxv16f16_unmasked_commute( poison, half %b, i32 0 @@ -2186,142 +2132,124 @@ define @vfma_vv_nxv32f16( %va, @vfma_vv_nxv32f16_unmasked( %va, ; ZVFHMIN-NEXT: slli a2, a2, 5 ; ZVFHMIN-NEXT: sub sp, sp, a2 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; ZVFHMIN-NEXT: vl8re16.v v24, (a0) -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: mv a2, a0 -; 
ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a2 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; ZVFHMIN-NEXT: add a2, a2, a3 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vl8re16.v v16, (a0) ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v7 +; ZVFHMIN-NEXT: vmset.m v24 ; ZVFHMIN-NEXT: slli a0, a2, 1 ; ZVFHMIN-NEXT: srli a2, a2, 2 ; ZVFHMIN-NEXT: sub a3, a1, a0 ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a2 +; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2 ; ZVFHMIN-NEXT: sltu a2, a1, a3 ; ZVFHMIN-NEXT: addi a2, a2, -1 ; ZVFHMIN-NEXT: and a2, a2, a3 @@ -2373,34 +2301,32 @@ define @vfma_vv_nxv32f16_unmasked( %va, ; ZVFHMIN-NEXT: slli a3, a3, 3 ; ZVFHMIN-NEXT: add a3, sp, a3 ; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: addi a3, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vmv4r.v v8, v16 -; ZVFHMIN-NEXT: vmv8r.v v24, v16 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 4 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 -; ZVFHMIN-NEXT: mv a4, a3 -; ZVFHMIN-NEXT: slli a3, a3, 1 -; ZVFHMIN-NEXT: add a3, a3, a4 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28 -; ZVFHMIN-NEXT: addi a3, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 4 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; ZVFHMIN-NEXT: add a2, a2, a3 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v8, v24, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v20, v8, v0.t ; ZVFHMIN-NEXT: bltu a1, a0, .LBB67_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a1, a0 @@ -2409,31 +2335,33 @@ define @vfma_vv_nxv32f16_unmasked( %va, ; ZVFHMIN-NEXT: slli 
a0, a0, 3 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 ; ZVFHMIN-NEXT: addi a0, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: mv a2, a0 +; ZVFHMIN-NEXT: mv a1, a0 ; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a2 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 ; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmacc.vv v0, v16, v24 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v0, v24, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v16, v0 +; ZVFHMIN-NEXT: vmv8r.v v8, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 5 ; ZVFHMIN-NEXT: add sp, sp, a0 @@ -2457,151 +2385,138 @@ define @vfma_vf_nxv32f16( %va, half %b, ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: mv a2, a1 ; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a2, a2, a1 +; ZVFHMIN-NEXT: mv a2, a1 ; ZVFHMIN-NEXT: slli a1, a1, 2 ; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; ZVFHMIN-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vmv1r.v v7, v0 +; ZVFHMIN-NEXT: vmv8r.v v24, v8 ; ZVFHMIN-NEXT: fmv.x.h a2, fa0 ; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a4, a1, 5 -; ZVFHMIN-NEXT: add a1, a4, a1 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: slli a1, a3, 1 ; ZVFHMIN-NEXT: srli a3, a3, 2 ; ZVFHMIN-NEXT: sub a4, a0, a1 -; ZVFHMIN-NEXT: csrr a5, vlenb -; ZVFHMIN-NEXT: slli a5, a5, 4 -; ZVFHMIN-NEXT: add a5, sp, a5 -; ZVFHMIN-NEXT: addi a5, a5, 16 -; ZVFHMIN-NEXT: vs1r.v v0, (a5) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma ; 
ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3 ; ZVFHMIN-NEXT: sltu a3, a0, a4 ; ZVFHMIN-NEXT: addi a3, a3, -1 ; ZVFHMIN-NEXT: and a3, a3, a4 ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 4 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 3 -; ZVFHMIN-NEXT: add a5, a5, a4 +; ZVFHMIN-NEXT: mv a5, a4 ; ZVFHMIN-NEXT: slli a4, a4, 1 ; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t ; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v24, a2 -; ZVFHMIN-NEXT: vmv4r.v v8, v24 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a4, a2, 4 -; ZVFHMIN-NEXT: add a2, a4, a2 +; ZVFHMIN-NEXT: slli a2, a2, 5 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28 +; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 5 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v20, v8 +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t ; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: bltu a0, a1, .LBB68_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB68_2: -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a2, a1, 5 -; ZVFHMIN-NEXT: add a1, a2, a1 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a2, a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 1 -; ZVFHMIN-NEXT: add a1, a1, a2 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; 
ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a2, a1, 5 -; ZVFHMIN-NEXT: add a1, a2, a1 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a2, a1, 4 -; ZVFHMIN-NEXT: add a1, a2, a1 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v7 ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 4 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a2, a1, 5 -; ZVFHMIN-NEXT: add a1, a2, a1 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t -; ZVFHMIN-NEXT: vmv.v.v v16, v8 -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: mv a1, a0 ; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a1, a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 ; ZVFHMIN-NEXT: add a0, a0, a1 -; ZVFHMIN-NEXT: add sp, sp, a0 -; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 -; ZVFHMIN-NEXT: addi sp, sp, 16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 -; ZVFHMIN-NEXT: ret - %elt.head = insertelement poison, half %b, i32 0 - %vb = shufflevector %elt.head, poison, zeroinitializer - %v = call @llvm.vp.fma.nxv32f16( %va, %vb, %vc, %m, i32 %evl) - ret %v +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t +; 
ZVFHMIN-NEXT: vmv.v.v v16, v8 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: add sp, sp, a0 +; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 +; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 +; ZVFHMIN-NEXT: ret + %elt.head = insertelement poison, half %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fma.nxv32f16( %va, %vb, %vc, %m, i32 %evl) + ret %v } define @vfma_vf_nxv32f16_commute( %va, half %b, %vc, %m, i32 zeroext %evl) { @@ -2616,140 +2531,132 @@ define @vfma_vf_nxv32f16_commute( %va, ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: mv a2, a1 ; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a2, a2, a1 +; ZVFHMIN-NEXT: mv a2, a1 ; ZVFHMIN-NEXT: slli a1, a1, 2 ; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb -; ZVFHMIN-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a4, a1, 5 -; ZVFHMIN-NEXT: add a1, a4, a1 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmv1r.v v7, v0 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: vmv.v.x v24, a1 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: slli a1, a1, 5 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: slli a1, a3, 1 -; ZVFHMIN-NEXT: srli a3, a3, 2 -; ZVFHMIN-NEXT: sub a4, a0, a1 -; ZVFHMIN-NEXT: csrr a5, vlenb -; ZVFHMIN-NEXT: slli a5, a5, 4 -; ZVFHMIN-NEXT: add a5, sp, a5 -; ZVFHMIN-NEXT: addi a5, a5, 16 -; ZVFHMIN-NEXT: vs1r.v v0, (a5) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3 -; ZVFHMIN-NEXT: sltu a3, a0, a4 -; ZVFHMIN-NEXT: addi a3, a3, -1 -; ZVFHMIN-NEXT: and a3, a3, a4 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: mv a5, a4 -; ZVFHMIN-NEXT: slli a4, a4, 3 -; ZVFHMIN-NEXT: add a5, a5, a4 -; ZVFHMIN-NEXT: slli a4, a4, 1 -; ZVFHMIN-NEXT: add a4, a4, a5 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v24, a2 -; ZVFHMIN-NEXT: vmv4r.v v8, v24 +; ZVFHMIN-NEXT: slli a1, a2, 1 +; ZVFHMIN-NEXT: srli a2, a2, 2 +; ZVFHMIN-NEXT: sub a3, a0, a1 +; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2 +; ZVFHMIN-NEXT: sltu a2, a0, a3 +; ZVFHMIN-NEXT: addi a2, a2, -1 +; ZVFHMIN-NEXT: and a2, a2, a3 +; ZVFHMIN-NEXT: csrr a3, vlenb 
+; ZVFHMIN-NEXT: slli a3, a3, 4 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a4, a2, 4 -; ZVFHMIN-NEXT: add a2, a4, a2 +; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28 +; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: slli a2, a2, 5 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t +; ZVFHMIN-NEXT: vmv4r.v v24, v8 +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; ZVFHMIN-NEXT: add a2, a2, a3 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t ; ZVFHMIN-NEXT: addi a2, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: bltu a0, a1, .LBB69_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB69_2: -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a2, a1, 5 -; ZVFHMIN-NEXT: add a1, a2, a1 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a2, a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 1 -; ZVFHMIN-NEXT: add a1, a1, a2 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a2, a1, 4 -; ZVFHMIN-NEXT: add a1, a2, a1 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v0 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a2, a1, 5 -; ZVFHMIN-NEXT: add a1, a2, a1 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vmv1r.v v0, v7 ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 4 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl1r.v v0, (a1) # 
Unknown-size Folded Reload -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a2, a1, 5 -; ZVFHMIN-NEXT: add a1, a2, a1 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v8, v24, v16, v0.t ; ZVFHMIN-NEXT: vmv.v.v v16, v8 ; ZVFHMIN-NEXT: addi a0, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: mv a1, a0 ; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a1, a1, a0 +; ZVFHMIN-NEXT: mv a1, a0 ; ZVFHMIN-NEXT: slli a0, a0, 2 ; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add sp, sp, a0 @@ -2781,24 +2688,12 @@ define @vfma_vf_nxv32f16_unmasked( %va, ; ZVFHMIN-NEXT: fmv.x.h a2, fa0 ; ZVFHMIN-NEXT: csrr a3, vlenb ; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v7 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: mv a4, a1 -; ZVFHMIN-NEXT: slli a1, a1, 1 -; ZVFHMIN-NEXT: add a1, a1, a4 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vmset.m v24 ; ZVFHMIN-NEXT: slli a1, a3, 1 ; ZVFHMIN-NEXT: srli a3, a3, 2 ; ZVFHMIN-NEXT: sub a4, a0, a1 ; 
ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a3 +; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a3 ; ZVFHMIN-NEXT: sltu a3, a0, a4 ; ZVFHMIN-NEXT: addi a3, a3, -1 ; ZVFHMIN-NEXT: and a3, a3, a4 @@ -2806,57 +2701,76 @@ define @vfma_vf_nxv32f16_unmasked( %va, ; ZVFHMIN-NEXT: slli a4, a4, 4 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t +; ZVFHMIN-NEXT: addi a4, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t ; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v24, a2 -; ZVFHMIN-NEXT: vmv4r.v v8, v24 +; ZVFHMIN-NEXT: vmv.v.x v8, a2 ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a4, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; ZVFHMIN-NEXT: add a2, a2, a4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28 +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a4, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; ZVFHMIN-NEXT: add a2, a2, a4 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t ; ZVFHMIN-NEXT: addi a2, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v20, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v20, v8, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB70_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB70_2: ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 1 -; ZVFHMIN-NEXT: add a1, a1, a2 +; ZVFHMIN-NEXT: slli a1, a1, 4 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 4 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: 
vl8r.v v8, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v0, v24, v8 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v16, v0 ; ZVFHMIN-NEXT: vmv8r.v v8, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb @@ -2887,86 +2801,92 @@ define @vfma_vf_nxv32f16_unmasked_commute( @vfmsub_vv_nxv1f16( %va, @llvm.vp.fneg.nxv1f16( %c, %m, i32 %evl) %v = call @llvm.vp.fma.nxv1f16( %va, %b, %negc, %m, i32 %evl) @@ -3905,16 +3823,14 @@ define @vfmsub_vv_nxv1f16_unmasked( %va, ; ZVFHMIN-LABEL: vfmsub_vv_nxv1f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 ; ZVFHMIN-NEXT: vxor.vx v8, v10, a1 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v12, v11, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %negc = call @llvm.vp.fneg.nxv1f16( %c, splat (i1 true), i32 %evl) @@ -3932,19 +3848,17 @@ define @vfmsub_vf_nxv1f16( %va, half %b, ; ZVFHMIN-LABEL: vfmsub_vf_nxv1f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v9, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v12, v11, v9, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v9, v9, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v11, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -3963,19 +3877,17 @@ define @vfmsub_vf_nxv1f16_commute( %va, h ; ZVFHMIN-LABEL: vfmsub_vf_nxv1f16_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 ; 
ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v9, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v11, v8, v9, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v11 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v9, v9, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v9, v8, v11, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -3994,18 +3906,16 @@ define @vfmsub_vf_nxv1f16_unmasked( %va, ; ZVFHMIN-LABEL: vfmsub_vf_nxv1f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: lui a0, 8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v9, a1 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v9, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v12, v11, v9 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -4025,18 +3935,16 @@ define @vfmsub_vf_nxv1f16_unmasked_commute( poison, half %b, i32 0 @@ -4060,14 +3968,13 @@ define @vfnmadd_vv_nxv1f16( %va, @llvm.vp.fneg.nxv1f16( %b, %m, i32 %evl) %negc = call @llvm.vp.fneg.nxv1f16( %c, %m, i32 %evl) @@ -4088,14 +3995,13 @@ define @vfnmadd_vv_nxv1f16_commuted( %va, ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vxor.vx v9, v9, a1, v0.t ; ZVFHMIN-NEXT: vxor.vx v10, v10, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v9, v10, v11, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %negb = call @llvm.vp.fneg.nxv1f16( %b, %m, i32 %evl) %negc = call @llvm.vp.fneg.nxv1f16( %c, %m, i32 %evl) @@ -4116,13 +4022,12 @@ define @vfnmadd_vv_nxv1f16_unmasked( %va, ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vxor.vx v9, v9, a1 ; ZVFHMIN-NEXT: vxor.vx v10, v10, a1 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, 
mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v9, v10, v11 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %negb = call @llvm.vp.fneg.nxv1f16( %b, splat (i1 true), i32 %evl) @@ -4144,13 +4049,12 @@ define @vfnmadd_vv_nxv1f16_unmasked_commuted( @llvm.vp.fneg.nxv1f16( %b, splat (i1 true), i32 %evl) @@ -4169,20 +4073,18 @@ define @vfnmadd_vf_nxv1f16( %va, half %b, ; ZVFHMIN-LABEL: vfnmadd_vf_nxv1f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t -; ZVFHMIN-NEXT: vxor.vx v9, v9, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0, v0.t +; ZVFHMIN-NEXT: vxor.vx v9, v9, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v11, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -4202,20 +4104,18 @@ define @vfnmadd_vf_nxv1f16_commute( %va, ; ZVFHMIN-LABEL: vfnmadd_vf_nxv1f16_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t -; ZVFHMIN-NEXT: vxor.vx v9, v9, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0, v0.t +; ZVFHMIN-NEXT: vxor.vx v9, v9, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v9, v8, v11, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -4235,19 +4135,17 @@ define @vfnmadd_vf_nxv1f16_unmasked( %va, ; ZVFHMIN-LABEL: vfnmadd_vf_nxv1f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-NEXT: vxor.vx v9, v9, a1 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, 
a1 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 +; ZVFHMIN-NEXT: vxor.vx v9, v9, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v11 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -4268,19 +4166,17 @@ define @vfnmadd_vf_nxv1f16_unmasked_commute( poison, half %b, i32 0 @@ -4303,17 +4199,16 @@ define @vfnmadd_vf_nxv1f16_neg_splat( %va ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vxor.vx v10, v10, a1, v0.t -; ZVFHMIN-NEXT: vxor.vx v9, v9, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v10, v10, a0, v0.t +; ZVFHMIN-NEXT: vxor.vx v9, v9, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v9, v10, v11, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -4335,17 +4230,16 @@ define @vfnmadd_vf_nxv1f16_neg_splat_commute( poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -4367,16 +4261,15 @@ define @vfnmadd_vf_nxv1f16_neg_splat_unmasked( poison, half %b, i32 0 @@ -4399,16 +4292,15 @@ define @vfnmadd_vf_nxv1f16_neg_splat_unmasked_commute( poison, half %b, i32 0 @@ -4433,14 +4325,13 @@ define @vfnmsub_vv_nxv1f16( %va, @llvm.vp.fneg.nxv1f16( %b, %m, i32 %evl) %negc = call @llvm.vp.fneg.nxv1f16( %c, %m, i32 %evl) @@ -4461,14 +4352,13 @@ define @vfnmsub_vv_nxv1f16_commuted( %va, ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vxor.vx v9, v9, a1, v0.t ; ZVFHMIN-NEXT: vxor.vx v10, v10, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v9, v10, v11, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %negb = call @llvm.vp.fneg.nxv1f16( %b, %m, i32 %evl) %negc = call @llvm.vp.fneg.nxv1f16( %c, %m, i32 %evl) @@ -4489,13 +4379,12 @@ define @vfnmsub_vv_nxv1f16_unmasked( %va, ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vxor.vx v9, v9, a1 ; ZVFHMIN-NEXT: vxor.vx v10, v10, a1 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: 
vfwcvt.f.f.v v11, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v9, v10, v11 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %negb = call @llvm.vp.fneg.nxv1f16( %b, splat (i1 true), i32 %evl) @@ -4517,13 +4406,12 @@ define @vfnmsub_vv_nxv1f16_unmasked_commuted( @llvm.vp.fneg.nxv1f16( %b, splat (i1 true), i32 %evl) @@ -4542,19 +4430,17 @@ define @vfnmsub_vf_nxv1f16( %va, half %b, ; ZVFHMIN-LABEL: vfnmsub_vf_nxv1f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v11, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v9, v11, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -4573,19 +4459,17 @@ define @vfnmsub_vf_nxv1f16_commute( %va, ; ZVFHMIN-LABEL: vfnmsub_vf_nxv1f16_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v9, v8, v11, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v11, v9, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v11, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -4604,18 +4488,16 @@ define @vfnmsub_vf_nxv1f16_unmasked( %va, ; ZVFHMIN-LABEL: vfnmsub_vf_nxv1f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: lui a0, 8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 -; ZVFHMIN-NEXT: vsetvli zero, 
a0, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v11 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -4635,18 +4517,16 @@ define @vfnmsub_vf_nxv1f16_unmasked_commute( poison, half %b, i32 0 @@ -4668,18 +4548,15 @@ define @vfnmsub_vf_nxv1f16_neg_splat( %va ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vxor.vx v9, v10, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v10, v9, v11, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v10, v10, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v11, v9, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v11, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -4700,18 +4577,15 @@ define @vfnmsub_vf_nxv1f16_neg_splat_commute( poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -4732,17 +4606,14 @@ define @vfnmsub_vf_nxv1f16_neg_splat_unmasked( poison, half %b, i32 0 @@ -4764,17 +4635,14 @@ define @vfnmsub_vf_nxv1f16_neg_splat_unmasked_commute( poison, half %b, i32 0 @@ -4797,17 +4665,15 @@ define @vfmsub_vv_nxv2f16( %va, @llvm.vp.fneg.nxv2f16( %c, %m, i32 %evl) %v = call @llvm.vp.fma.nxv2f16( %va, %b, %negc, %m, i32 %evl) @@ -4824,16 +4690,14 @@ define @vfmsub_vv_nxv2f16_unmasked( %va, ; ZVFHMIN-LABEL: vfmsub_vv_nxv2f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 ; ZVFHMIN-NEXT: vxor.vx v8, v10, a1 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v12, v11, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %negc = call @llvm.vp.fneg.nxv2f16( %c, splat (i1 true), i32 %evl) @@ -4851,19 +4715,17 @@ define @vfmsub_vf_nxv2f16( %va, half %b, ; ZVFHMIN-LABEL: vfmsub_vf_nxv2f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: lui a1, 8 -; 
ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v9, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v12, v11, v9, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v9, v9, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v11, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -4882,19 +4744,17 @@ define @vfmsub_vf_nxv2f16_commute( %va, h ; ZVFHMIN-LABEL: vfmsub_vf_nxv2f16_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v9, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v11, v8, v9, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v11 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v9, v9, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v9, v8, v11, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -4913,18 +4773,16 @@ define @vfmsub_vf_nxv2f16_unmasked( %va, ; ZVFHMIN-LABEL: vfmsub_vf_nxv2f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: lui a0, 8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v9, a1 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v9, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v12, v11, v9 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -4944,18 +4802,16 @@ define @vfmsub_vf_nxv2f16_unmasked_commute( poison, half %b, i32 0 @@ -4979,14 +4835,13 @@ define @vfnmadd_vv_nxv2f16( %va, @llvm.vp.fneg.nxv2f16( %b, %m, i32 %evl) %negc = call @llvm.vp.fneg.nxv2f16( %c, %m, i32 %evl) @@ -5007,14 +4862,13 @@ define 
@vfnmadd_vv_nxv2f16_commuted( %va, ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vxor.vx v9, v9, a1, v0.t ; ZVFHMIN-NEXT: vxor.vx v10, v10, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v9, v10, v11, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %negb = call @llvm.vp.fneg.nxv2f16( %b, %m, i32 %evl) %negc = call @llvm.vp.fneg.nxv2f16( %c, %m, i32 %evl) @@ -5035,13 +4889,12 @@ define @vfnmadd_vv_nxv2f16_unmasked( %va, ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vxor.vx v9, v9, a1 ; ZVFHMIN-NEXT: vxor.vx v10, v10, a1 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v9, v10, v11 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %negb = call @llvm.vp.fneg.nxv2f16( %b, splat (i1 true), i32 %evl) @@ -5063,13 +4916,12 @@ define @vfnmadd_vv_nxv2f16_unmasked_commuted( @llvm.vp.fneg.nxv2f16( %b, splat (i1 true), i32 %evl) @@ -5088,20 +4940,18 @@ define @vfnmadd_vf_nxv2f16( %va, half %b, ; ZVFHMIN-LABEL: vfnmadd_vf_nxv2f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t -; ZVFHMIN-NEXT: vxor.vx v9, v9, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0, v0.t +; ZVFHMIN-NEXT: vxor.vx v9, v9, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v11, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -5121,20 +4971,18 @@ define @vfnmadd_vf_nxv2f16_commute( %va, ; ZVFHMIN-LABEL: vfnmadd_vf_nxv2f16_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t -; ZVFHMIN-NEXT: vxor.vx v9, v9, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; 
ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0, v0.t +; ZVFHMIN-NEXT: vxor.vx v9, v9, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v9, v8, v11, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -5154,19 +5002,17 @@ define @vfnmadd_vf_nxv2f16_unmasked( %va, ; ZVFHMIN-LABEL: vfnmadd_vf_nxv2f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-NEXT: vxor.vx v9, v9, a1 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 +; ZVFHMIN-NEXT: vxor.vx v9, v9, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v11 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -5187,19 +5033,17 @@ define @vfnmadd_vf_nxv2f16_unmasked_commute( poison, half %b, i32 0 @@ -5222,17 +5066,16 @@ define @vfnmadd_vf_nxv2f16_neg_splat( %va ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vxor.vx v10, v10, a1, v0.t -; ZVFHMIN-NEXT: vxor.vx v9, v9, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v10, v10, a0, v0.t +; ZVFHMIN-NEXT: vxor.vx v9, v9, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v9, v10, v11, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -5254,17 +5097,16 @@ define @vfnmadd_vf_nxv2f16_neg_splat_commute( poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -5286,16 +5128,15 @@ define @vfnmadd_vf_nxv2f16_neg_splat_unmasked( poison, half %b, i32 0 @@ -5318,16 +5159,15 @@ define @vfnmadd_vf_nxv2f16_neg_splat_unmasked_commute( poison, half %b, i32 0 @@ -5352,14 +5192,13 @@ define @vfnmsub_vv_nxv2f16( %va, 
@llvm.vp.fneg.nxv2f16( %b, %m, i32 %evl) %negc = call @llvm.vp.fneg.nxv2f16( %c, %m, i32 %evl) @@ -5380,14 +5219,13 @@ define @vfnmsub_vv_nxv2f16_commuted( %va, ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vxor.vx v9, v9, a1, v0.t ; ZVFHMIN-NEXT: vxor.vx v10, v10, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v9, v10, v11, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %negb = call @llvm.vp.fneg.nxv2f16( %b, %m, i32 %evl) %negc = call @llvm.vp.fneg.nxv2f16( %c, %m, i32 %evl) @@ -5408,13 +5246,12 @@ define @vfnmsub_vv_nxv2f16_unmasked( %va, ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vxor.vx v9, v9, a1 ; ZVFHMIN-NEXT: vxor.vx v10, v10, a1 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v9, v10, v11 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %negb = call @llvm.vp.fneg.nxv2f16( %b, splat (i1 true), i32 %evl) @@ -5436,13 +5273,12 @@ define @vfnmsub_vv_nxv2f16_unmasked_commuted( @llvm.vp.fneg.nxv2f16( %b, splat (i1 true), i32 %evl) @@ -5461,19 +5297,17 @@ define @vfnmsub_vf_nxv2f16( %va, half %b, ; ZVFHMIN-LABEL: vfnmsub_vf_nxv2f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v11, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v9, v11, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -5492,19 +5326,17 @@ define @vfnmsub_vf_nxv2f16_commute( %va, ; ZVFHMIN-LABEL: vfnmsub_vf_nxv2f16_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; ZVFHMIN-NEXT: 
vxor.vx v8, v8, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v9, v8, v11, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v11, v9, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v11, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -5523,18 +5355,16 @@ define @vfnmsub_vf_nxv2f16_unmasked( %va, ; ZVFHMIN-LABEL: vfnmsub_vf_nxv2f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: lui a0, 8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v11 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -5554,18 +5384,16 @@ define @vfnmsub_vf_nxv2f16_unmasked_commute( poison, half %b, i32 0 @@ -5587,18 +5415,15 @@ define @vfnmsub_vf_nxv2f16_neg_splat( %va ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vxor.vx v9, v10, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v10, v9, v11, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v10, v10, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v11, v9, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v11, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -5619,18 +5444,15 @@ define @vfnmsub_vf_nxv2f16_neg_splat_commute( poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -5651,17 +5473,14 @@ define @vfnmsub_vf_nxv2f16_neg_splat_unmasked( poison, half %b, i32 0 @@ -5683,17 +5502,14 @@ define @vfnmsub_vf_nxv2f16_neg_splat_unmasked_commute( poison, half %b, i32 0 
@@ -5716,17 +5532,15 @@ define @vfmsub_vv_nxv4f16( %va, @llvm.vp.fneg.nxv4f16( %c, %m, i32 %evl) %v = call @llvm.vp.fma.nxv4f16( %va, %b, %negc, %m, i32 %evl) @@ -5743,16 +5557,14 @@ define @vfmsub_vv_nxv4f16_unmasked( %va, ; ZVFHMIN-LABEL: vfmsub_vv_nxv4f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vxor.vx v8, v10, a1 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v14, v12, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v14 ; ZVFHMIN-NEXT: ret %negc = call @llvm.vp.fneg.nxv4f16( %c, splat (i1 true), i32 %evl) @@ -5770,19 +5582,17 @@ define @vfmsub_vf_nxv4f16( %va, half %b, ; ZVFHMIN-LABEL: vfmsub_vf_nxv4f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v9, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v12, v14, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v9, v9, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v16, v14, v12, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -5801,19 +5611,17 @@ define @vfmsub_vf_nxv4f16_commute( %va, h ; ZVFHMIN-LABEL: vfmsub_vf_nxv4f16_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v9, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v12, v8, v14, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v9, v9, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v14, v8, v12, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v14, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, 
zeroinitializer @@ -5832,18 +5640,16 @@ define @vfmsub_vf_nxv4f16_unmasked( %va, ; ZVFHMIN-LABEL: vfmsub_vf_nxv4f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: lui a0, 8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v9, a1 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v9, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v16, v12, v14 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -5863,18 +5669,16 @@ define @vfmsub_vf_nxv4f16_unmasked_commute( poison, half %b, i32 0 @@ -5898,14 +5702,13 @@ define @vfnmadd_vv_nxv4f16( %va, @llvm.vp.fneg.nxv4f16( %b, %m, i32 %evl) %negc = call @llvm.vp.fneg.nxv4f16( %c, %m, i32 %evl) @@ -5926,14 +5729,13 @@ define @vfnmadd_vv_nxv4f16_commuted( %va, ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vxor.vx v9, v9, a1, v0.t ; ZVFHMIN-NEXT: vxor.vx v10, v10, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v14, v10, v12, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v14 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v14, v0.t ; ZVFHMIN-NEXT: ret %negb = call @llvm.vp.fneg.nxv4f16( %b, %m, i32 %evl) %negc = call @llvm.vp.fneg.nxv4f16( %c, %m, i32 %evl) @@ -5954,13 +5756,12 @@ define @vfnmadd_vv_nxv4f16_unmasked( %va, ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vxor.vx v9, v9, a1 ; ZVFHMIN-NEXT: vxor.vx v10, v10, a1 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v14, v10, v12 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v14 ; ZVFHMIN-NEXT: ret %negb = call @llvm.vp.fneg.nxv4f16( %b, splat (i1 true), i32 %evl) @@ -5982,13 +5783,12 @@ define @vfnmadd_vv_nxv4f16_unmasked_commuted( @llvm.vp.fneg.nxv4f16( %b, splat (i1 true), i32 %evl) @@ -6007,20 +5807,18 @@ define @vfnmadd_vf_nxv4f16( %va, half %b, ; ZVFHMIN-LABEL: vfnmadd_vf_nxv4f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t -; ZVFHMIN-NEXT: vxor.vx v9, v9, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; 
ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0, v0.t +; ZVFHMIN-NEXT: vxor.vx v9, v9, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v16, v14, v12, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -6040,20 +5838,18 @@ define @vfnmadd_vf_nxv4f16_commute( %va, ; ZVFHMIN-LABEL: vfnmadd_vf_nxv4f16_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t -; ZVFHMIN-NEXT: vxor.vx v9, v9, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0, v0.t +; ZVFHMIN-NEXT: vxor.vx v9, v9, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v14, v8, v12, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v14 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v14, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -6073,19 +5869,17 @@ define @vfnmadd_vf_nxv4f16_unmasked( %va, ; ZVFHMIN-LABEL: vfnmadd_vf_nxv4f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-NEXT: vxor.vx v9, v9, a1 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 +; ZVFHMIN-NEXT: vxor.vx v9, v9, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v16, v14, v12 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -6106,19 +5900,17 @@ define @vfnmadd_vf_nxv4f16_unmasked_commute( poison, half %b, i32 0 @@ -6141,17 +5933,16 @@ define @vfnmadd_vf_nxv4f16_neg_splat( %va ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vxor.vx v10, v10, a1, v0.t -; ZVFHMIN-NEXT: vxor.vx v9, v9, a1, v0.t -; 
ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v10, v10, a0, v0.t +; ZVFHMIN-NEXT: vxor.vx v9, v9, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v10, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v14, v10, v12, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v14 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v14, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -6173,17 +5964,16 @@ define @vfnmadd_vf_nxv4f16_neg_splat_commute( poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -6205,16 +5995,15 @@ define @vfnmadd_vf_nxv4f16_neg_splat_unmasked( poison, half %b, i32 0 @@ -6237,16 +6026,15 @@ define @vfnmadd_vf_nxv4f16_neg_splat_unmasked_commute( poison, half %b, i32 0 @@ -6271,14 +6059,13 @@ define @vfnmsub_vv_nxv4f16( %va, @llvm.vp.fneg.nxv4f16( %b, %m, i32 %evl) %negc = call @llvm.vp.fneg.nxv4f16( %c, %m, i32 %evl) @@ -6299,14 +6086,13 @@ define @vfnmsub_vv_nxv4f16_commuted( %va, ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vxor.vx v9, v9, a1, v0.t ; ZVFHMIN-NEXT: vxor.vx v10, v10, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v14, v10, v12, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v14 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v14, v0.t ; ZVFHMIN-NEXT: ret %negb = call @llvm.vp.fneg.nxv4f16( %b, %m, i32 %evl) %negc = call @llvm.vp.fneg.nxv4f16( %c, %m, i32 %evl) @@ -6327,13 +6113,12 @@ define @vfnmsub_vv_nxv4f16_unmasked( %va, ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vxor.vx v9, v9, a1 ; ZVFHMIN-NEXT: vxor.vx v10, v10, a1 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v14, v10, v12 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v14 ; ZVFHMIN-NEXT: ret %negb = call @llvm.vp.fneg.nxv4f16( %b, splat (i1 true), i32 %evl) @@ -6355,13 +6140,12 @@ define @vfnmsub_vv_nxv4f16_unmasked_commuted( @llvm.vp.fneg.nxv4f16( %b, splat (i1 true), i32 %evl) @@ -6380,19 +6164,17 @@ define @vfnmsub_vf_nxv4f16( %va, half %b, ; ZVFHMIN-LABEL: vfnmsub_vf_nxv4f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; ZVFHMIN-NEXT: vxor.vx 
v8, v8, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v14, v12, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v16, v12, v14, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -6411,19 +6193,17 @@ define @vfnmsub_vf_nxv4f16_commute( %va, ; ZVFHMIN-LABEL: vfnmsub_vf_nxv4f16_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v14, v8, v12, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v14 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v12, v8, v14, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -6442,18 +6222,16 @@ define @vfnmsub_vf_nxv4f16_unmasked( %va, ; ZVFHMIN-LABEL: vfnmsub_vf_nxv4f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: lui a0, 8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v16, v14, v12 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -6473,18 +6251,16 @@ define @vfnmsub_vf_nxv4f16_unmasked_commute( poison, half %b, i32 0 @@ -6506,18 +6282,15 @@ define @vfnmsub_vf_nxv4f16_neg_splat( %va ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: 
vsetvli zero, a0, e16, m1, ta, ma -; ZVFHMIN-NEXT: vxor.vx v9, v10, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v10, v14, v12, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v10, v10, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v12, v14, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -6538,18 +6311,15 @@ define @vfnmsub_vf_nxv4f16_neg_splat_commute( poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -6570,17 +6340,14 @@ define @vfnmsub_vf_nxv4f16_neg_splat_unmasked( poison, half %b, i32 0 @@ -6602,17 +6369,14 @@ define @vfnmsub_vf_nxv4f16_neg_splat_unmasked_commute( poison, half %b, i32 0 @@ -6635,17 +6399,15 @@ define @vfmsub_vv_nxv8f16( %va, @llvm.vp.fneg.nxv8f16( %c, %m, i32 %evl) %v = call @llvm.vp.fma.nxv8f16( %va, %b, %negc, %m, i32 %evl) @@ -6662,16 +6424,14 @@ define @vfmsub_vv_nxv8f16_unmasked( %va, ; ZVFHMIN-LABEL: vfmsub_vv_nxv8f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vxor.vx v8, v12, a1 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v20, v16, v12 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v20 ; ZVFHMIN-NEXT: ret %negc = call @llvm.vp.fneg.nxv8f16( %c, splat (i1 true), i32 %evl) @@ -6689,19 +6449,17 @@ define @vfmsub_vf_nxv8f16( %va, half %b, ; ZVFHMIN-LABEL: vfmsub_vf_nxv8f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v12, a1 -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v10, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v20, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 +; ZVFHMIN-NEXT: vmv.v.x v12, a1 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v10, v10, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v24, v20, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer 
@@ -6720,19 +6478,17 @@ define @vfmsub_vf_nxv8f16_commute( %va, h ; ZVFHMIN-LABEL: vfmsub_vf_nxv8f16_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v12, a1 -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v10, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v12 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v20, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vmv.v.x v12, a1 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v10, v10, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v12, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v20, v8, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v20, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -6751,18 +6507,16 @@ define @vfmsub_vf_nxv8f16_unmasked( %va, ; ZVFHMIN-LABEL: vfmsub_vf_nxv8f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v12, a1 -; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: lui a0, 8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v10, a1 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v10, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v20 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -6782,18 +6536,16 @@ define @vfmsub_vf_nxv8f16_unmasked_commute( poison, half %b, i32 0 @@ -6817,14 +6569,13 @@ define @vfnmadd_vv_nxv8f16( %va, @llvm.vp.fneg.nxv8f16( %b, %m, i32 %evl) %negc = call @llvm.vp.fneg.nxv8f16( %c, %m, i32 %evl) @@ -6845,14 +6596,13 @@ define @vfnmadd_vv_nxv8f16_commuted( %va, ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vxor.vx v10, v10, a1, v0.t ; ZVFHMIN-NEXT: vxor.vx v12, v12, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v20, v12, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v20 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v20, v0.t ; ZVFHMIN-NEXT: ret %negb = call @llvm.vp.fneg.nxv8f16( %b, %m, i32 %evl) %negc = call @llvm.vp.fneg.nxv8f16( %c, %m, i32 %evl) @@ -6873,13 +6623,12 @@ define @vfnmadd_vv_nxv8f16_unmasked( %va, ; 
ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vxor.vx v10, v10, a1 ; ZVFHMIN-NEXT: vxor.vx v12, v12, a1 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v20, v12, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v20 ; ZVFHMIN-NEXT: ret %negb = call @llvm.vp.fneg.nxv8f16( %b, splat (i1 true), i32 %evl) @@ -6901,13 +6650,12 @@ define @vfnmadd_vv_nxv8f16_unmasked_commuted( @llvm.vp.fneg.nxv8f16( %b, splat (i1 true), i32 %evl) @@ -6926,20 +6674,18 @@ define @vfnmadd_vf_nxv8f16( %va, half %b, ; ZVFHMIN-LABEL: vfnmadd_vf_nxv8f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v12, a1 -; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t -; ZVFHMIN-NEXT: vxor.vx v10, v10, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v12, a1 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0, v0.t +; ZVFHMIN-NEXT: vxor.vx v10, v10, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v24, v20, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -6959,20 +6705,18 @@ define @vfnmadd_vf_nxv8f16_commute( %va, ; ZVFHMIN-LABEL: vfnmadd_vf_nxv8f16_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v12, a1 -; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t -; ZVFHMIN-NEXT: vxor.vx v10, v10, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v12 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v12, a1 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0, v0.t +; ZVFHMIN-NEXT: vxor.vx v10, v10, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v12, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v20, v8, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v20 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v20, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -6992,19 +6736,17 @@ define @vfnmadd_vf_nxv8f16_unmasked( %va, ; ZVFHMIN-LABEL: vfnmadd_vf_nxv8f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: 
fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v12, a1 -; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-NEXT: vxor.vx v10, v10, a1 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v12, a1 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 +; ZVFHMIN-NEXT: vxor.vx v10, v10, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v24, v20, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -7025,19 +6767,17 @@ define @vfnmadd_vf_nxv8f16_unmasked_commute( poison, half %b, i32 0 @@ -7060,17 +6800,16 @@ define @vfnmadd_vf_nxv8f16_neg_splat( %va ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v12, a1 -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vxor.vx v12, v12, a1, v0.t -; ZVFHMIN-NEXT: vxor.vx v10, v10, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v12 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v12, v12, a0, v0.t +; ZVFHMIN-NEXT: vxor.vx v10, v10, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v12, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v20, v12, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v20 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v20, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -7092,17 +6831,16 @@ define @vfnmadd_vf_nxv8f16_neg_splat_commute( poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -7124,16 +6862,15 @@ define @vfnmadd_vf_nxv8f16_neg_splat_unmasked( poison, half %b, i32 0 @@ -7156,16 +6893,15 @@ define @vfnmadd_vf_nxv8f16_neg_splat_unmasked_commute( poison, half %b, i32 0 @@ -7190,14 +6926,13 @@ define @vfnmsub_vv_nxv8f16( %va, @llvm.vp.fneg.nxv8f16( %b, %m, i32 %evl) %negc = call @llvm.vp.fneg.nxv8f16( %c, %m, i32 %evl) @@ -7218,14 +6953,13 @@ define @vfnmsub_vv_nxv8f16_commuted( %va, ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vxor.vx v10, v10, a1, v0.t ; ZVFHMIN-NEXT: vxor.vx v12, v12, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v20, v12, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v20 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v20, v0.t ; ZVFHMIN-NEXT: ret %negb = call 
@llvm.vp.fneg.nxv8f16( %b, %m, i32 %evl) %negc = call @llvm.vp.fneg.nxv8f16( %c, %m, i32 %evl) @@ -7246,13 +6980,12 @@ define @vfnmsub_vv_nxv8f16_unmasked( %va, ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vxor.vx v10, v10, a1 ; ZVFHMIN-NEXT: vxor.vx v12, v12, a1 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v20, v12, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v20 ; ZVFHMIN-NEXT: ret %negb = call @llvm.vp.fneg.nxv8f16( %b, splat (i1 true), i32 %evl) @@ -7274,13 +7007,12 @@ define @vfnmsub_vv_nxv8f16_unmasked_commuted( @llvm.vp.fneg.nxv8f16( %b, splat (i1 true), i32 %evl) @@ -7299,19 +7031,17 @@ define @vfnmsub_vf_nxv8f16( %va, half %b, ; ZVFHMIN-LABEL: vfnmsub_vf_nxv8f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v12, a1 -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v20, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 +; ZVFHMIN-NEXT: vmv.v.x v12, a1 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v10, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v20, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -7330,19 +7060,17 @@ define @vfnmsub_vf_nxv8f16_commute( %va, ; ZVFHMIN-LABEL: vfnmsub_vf_nxv8f16_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v12, a1 -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v12 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v20, v8, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v20 +; ZVFHMIN-NEXT: vmv.v.x v12, a1 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v10, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v12, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v20, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -7361,18 +7089,16 @@ define @vfnmsub_vf_nxv8f16_unmasked( %va, ; 
ZVFHMIN-LABEL: vfnmsub_vf_nxv8f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v12, a1 -; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: lui a0, 8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v24, v20, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -7392,18 +7118,16 @@ define @vfnmsub_vf_nxv8f16_unmasked_commute( poison, half %b, i32 0 @@ -7425,18 +7149,15 @@ define @vfnmsub_vf_nxv8f16_neg_splat( %va ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v12, a1 -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; ZVFHMIN-NEXT: vxor.vx v10, v12, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v12, v20, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v12, v12, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v16, v20, v12, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -7457,18 +7178,15 @@ define @vfnmsub_vf_nxv8f16_neg_splat_commute( poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -7489,17 +7207,14 @@ define @vfnmsub_vf_nxv8f16_neg_splat_unmasked( poison, half %b, i32 0 @@ -7521,17 +7236,14 @@ define @vfnmsub_vf_nxv8f16_neg_splat_unmasked_commute( poison, half %b, i32 0 @@ -7553,34 +7265,18 @@ define @vfmsub_vv_nxv16f16( %va, @llvm.vp.fneg.nxv16f16( %c, %m, i32 %evl) %v = call @llvm.vp.fma.nxv16f16( %va, %b, %negc, %m, i32 %evl) @@ -7597,16 +7293,14 @@ define @vfmsub_vv_nxv16f16_unmasked( %v ; ZVFHMIN-LABEL: vfmsub_vv_nxv16f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 ; ZVFHMIN-NEXT: vxor.vx v8, v16, a1 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v12 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v0, v24, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 ; ZVFHMIN-NEXT: ret 
%negc = call @llvm.vp.fneg.nxv16f16( %c, splat (i1 true), i32 %evl) @@ -7623,36 +7317,20 @@ define @vfmsub_vf_nxv16f16( %va, half % ; ; ZVFHMIN-LABEL: vfmsub_vf_nxv16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: addi sp, sp, -16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v4, a1 -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v12, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v4 -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add sp, sp, a0 -; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 -; ZVFHMIN-NEXT: addi sp, sp, 16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 +; ZVFHMIN-NEXT: vmv4r.v v16, v8 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vmv.v.x v4, a0 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v12, v12, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: vmv8r.v v8, v24 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v4, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -7671,19 +7349,17 @@ define @vfmsub_vf_nxv16f16_commute( %va ; ZVFHMIN-LABEL: vfmsub_vf_nxv16f16_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v4, a1 -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v12, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vmv.v.x v4, a1 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v12, v12, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -7708,22 +7384,20 @@ define @vfmsub_vf_nxv16f16_unmasked( %v ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 
0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v16, a1 -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vs4r.v v16, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: lui a0, 8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v12, a1 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v12, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 2 @@ -7755,22 +7429,20 @@ define @vfmsub_vf_nxv16f16_unmasked_commute( @vfnmadd_vv_nxv16f16( %va, @llvm.vp.fneg.nxv16f16( %b, %m, i32 %evl) %negc = call @llvm.vp.fneg.nxv16f16( %c, %m, i32 %evl) @@ -7834,18 +7505,17 @@ define @vfnmadd_vv_nxv16f16_commuted( % ; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vxor.vx v12, v12, a1, v0.t -; ZVFHMIN-NEXT: vxor.vx v16, v16, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZVFHMIN-NEXT: vxor.vx v24, v16, a1, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24, v0.t +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add sp, sp, a0 @@ -7872,13 +7542,12 @@ define @vfnmadd_vv_nxv16f16_unmasked( % ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vxor.vx v12, v12, a1 ; ZVFHMIN-NEXT: vxor.vx v16, v16, a1 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v0, v16, v24 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 ; ZVFHMIN-NEXT: ret %negb = call @llvm.vp.fneg.nxv16f16( %b, splat (i1 true), i32 %evl) @@ 
-7900,13 +7569,12 @@ define @vfnmadd_vv_nxv16f16_unmasked_commuted( @llvm.vp.fneg.nxv16f16( %b, splat (i1 true), i32 %evl) @@ -7924,36 +7592,19 @@ define @vfnmadd_vf_nxv16f16( %va, half ; ; ZVFHMIN-LABEL: vfnmadd_vf_nxv16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: addi sp, sp, -16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v4, a1 -; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t -; ZVFHMIN-NEXT: vxor.vx v12, v12, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v4 -; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v4, a1 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v20, v8, a0, v0.t +; ZVFHMIN-NEXT: vxor.vx v24, v12, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v4, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add sp, sp, a0 -; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 -; ZVFHMIN-NEXT: addi sp, sp, 16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -7973,20 +7624,18 @@ define @vfnmadd_vf_nxv16f16_commute( %v ; ZVFHMIN-LABEL: vfnmadd_vf_nxv16f16_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v4, a1 -; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t -; ZVFHMIN-NEXT: vxor.vx v12, v12, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vmv.v.x v4, a1 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v28, v8, a0, v0.t +; ZVFHMIN-NEXT: vxor.vx v16, v12, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v4, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -8012,23 +7661,21 @@ define @vfnmadd_vf_nxv16f16_unmasked( % ; ZVFHMIN-NEXT: sub sp, sp, a1 ; 
ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v16, a1 -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-NEXT: vxor.vx v12, v12, a1 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v16, a1 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vs4r.v v16, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 +; ZVFHMIN-NEXT: vxor.vx v12, v12, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v16, v0, v24 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 2 @@ -8061,23 +7708,21 @@ define @vfnmadd_vf_nxv16f16_unmasked_commute( @vfnmadd_vf_nxv16f16_neg_splat( ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v16, a1 -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vxor.vx v4, v16, a1, v0.t -; ZVFHMIN-NEXT: vxor.vx v12, v12, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v4 -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v4, v16, a0, v0.t +; ZVFHMIN-NEXT: vxor.vx v12, v12, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v4, v0.t +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl4r.v v4, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24, v0.t ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 2 ; ZVFHMIN-NEXT: add sp, sp, a0 @@ -8151,21 +7795,36 @@ define @vfnmadd_vf_nxv16f16_neg_splat_commute( poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -8187,16 +7846,15 @@ define @vfnmadd_vf_nxv16f16_neg_splat_unmasked( poison, half %b, i32 0 @@ -8219,16 +7877,15 @@ define @vfnmadd_vf_nxv16f16_neg_splat_unmasked_commute( poison, half %b, i32 0 @@ -8251,17 +7908,16 @@ define @vfnmsub_vv_nxv16f16( %va, @llvm.vp.fneg.nxv16f16( %b, %m, i32 %evl) %negc = call @llvm.vp.fneg.nxv16f16( %c, %m, i32 %evl) @@ -8287,18 +7943,17 @@ define @vfnmsub_vv_nxv16f16_commuted( % ; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vxor.vx v12, v12, a1, v0.t -; ZVFHMIN-NEXT: vxor.vx v16, v16, 
a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZVFHMIN-NEXT: vxor.vx v24, v16, a1, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24, v0.t +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add sp, sp, a0 @@ -8325,13 +7980,12 @@ define @vfnmsub_vv_nxv16f16_unmasked( % ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vxor.vx v12, v12, a1 ; ZVFHMIN-NEXT: vxor.vx v16, v16, a1 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v0, v16, v24 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 ; ZVFHMIN-NEXT: ret %negb = call @llvm.vp.fneg.nxv16f16( %b, splat (i1 true), i32 %evl) @@ -8353,13 +8007,12 @@ define @vfnmsub_vv_nxv16f16_unmasked_commuted( @llvm.vp.fneg.nxv16f16( %b, splat (i1 true), i32 %evl) @@ -8377,36 +8030,20 @@ define @vfnmsub_vf_nxv16f16( %va, half ; ; ZVFHMIN-LABEL: vfnmsub_vf_nxv16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: addi sp, sp, -16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v4, a1 -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v4 -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add sp, sp, a0 -; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 -; ZVFHMIN-NEXT: addi sp, sp, 16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 +; ZVFHMIN-NEXT: vmv4r.v v16, v12 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vmv.v.x v4, a0 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0, v0.t +; ZVFHMIN-NEXT: 
vfwcvt.f.f.v v24, v8, v0.t +; ZVFHMIN-NEXT: vmv8r.v v8, v24 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v4, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -8425,19 +8062,17 @@ define @vfnmsub_vf_nxv16f16_commute( %v ; ZVFHMIN-LABEL: vfnmsub_vf_nxv16f16_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v4, a1 -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v4, a1 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -8462,22 +8097,20 @@ define @vfnmsub_vf_nxv16f16_unmasked( % ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v16, a1 -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vs4r.v v16, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: lui a0, 8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v16, v0, v24 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 2 @@ -8509,22 +8142,20 @@ define @vfnmsub_vf_nxv16f16_unmasked_commute( @vfnmsub_vf_nxv16f16_neg_splat( ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vmv4r.v v4, v8 -; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vmv.v.x v16, a1 -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; 
ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vxor.vx v12, v16, a1, v0.t -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vmv.v.x v16, a0 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v24, v16, a0, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -8582,37 +8210,20 @@ define @vfnmsub_vf_nxv16f16_neg_splat_commute( poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -8633,17 +8244,14 @@ define @vfnmsub_vf_nxv16f16_neg_splat_unmasked( poison, half %b, i32 0 @@ -8665,17 +8273,14 @@ define @vfnmsub_vf_nxv16f16_neg_splat_unmasked_commute( poison, half %b, i32 0 @@ -8701,105 +8306,103 @@ define @vfmsub_vv_nxv32f16( %va, @vfmsub_vv_nxv32f16( %va, @vfmsub_vv_nxv32f16_unmasked( %v ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 5 +; ZVFHMIN-NEXT: sub sp, sp, a2 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: slli a2, a2, 2 +; ZVFHMIN-NEXT: slli a2, a2, 1 ; ZVFHMIN-NEXT: add a2, a2, a3 -; ZVFHMIN-NEXT: sub sp, sp, a2 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 5 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vl8re16.v v24, (a0) +; ZVFHMIN-NEXT: vsetvli a2, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmv8r.v v24, v8 +; ZVFHMIN-NEXT: vl8re16.v v8, (a0) ; ZVFHMIN-NEXT: lui a2, 8 -; ZVFHMIN-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v7 +; ZVFHMIN-NEXT: vmset.m v16 ; ZVFHMIN-NEXT: csrr a3, vlenb ; ZVFHMIN-NEXT: slli a0, a3, 1 ; ZVFHMIN-NEXT: srli a3, a3, 2 ; ZVFHMIN-NEXT: sub a4, a1, a0 ; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v20, v7, a3 +; ZVFHMIN-NEXT: vslidedown.vx v0, v16, a3 ; ZVFHMIN-NEXT: sltu a3, a1, a4 ; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v0, v24, a2 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v0, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vxor.vx v16, v8, a2 ; ZVFHMIN-NEXT: addi a3, a3, -1 ; ZVFHMIN-NEXT: and a3, a3, a4 +; ZVFHMIN-NEXT: vmv4r.v v8, v16 +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi 
a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 5 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v4 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; ZVFHMIN-NEXT: add a2, a2, a3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vmv1r.v v0, v20 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v20, v8 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t ; ZVFHMIN-NEXT: bltu a1, a0, .LBB281_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a1, a0 ; ZVFHMIN-NEXT: .LBB281_2: +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: mv a2, a0 -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a2 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v16, v24 -; ZVFHMIN-NEXT: vmv8r.v v8, v16 +; ZVFHMIN-NEXT: 
vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: slli a0, a0, 1 ; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v24 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v0, v16, v24 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 5 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 @@ -8991,50 +8620,33 @@ define @vfmsub_vf_nxv32f16( %va, half % ; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb -; ZVFHMIN-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-NEXT: lui a3, 8 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v24, a2 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 5 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v24, v16, a3, v0.t -; ZVFHMIN-NEXT: slli a2, a1, 1 -; ZVFHMIN-NEXT: mv a3, a0 -; ZVFHMIN-NEXT: vmv4r.v v4, v28 -; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 -; ZVFHMIN-NEXT: mv a5, a4 -; ZVFHMIN-NEXT: slli a4, a4, 1 -; ZVFHMIN-NEXT: add a4, a4, a5 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: bltu a0, a2, .LBB282_2 -; ZVFHMIN-NEXT: # %bb.1: -; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: .LBB282_2: -; ZVFHMIN-NEXT: vmv8r.v v16, v8 -; ZVFHMIN-NEXT: addi a4, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 +; ZVFHMIN-NEXT: vmv1r.v v7, v0 +; ZVFHMIN-NEXT: vmv8r.v v24, v8 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: vxor.vx v8, v16, a1, v0.t +; ZVFHMIN-NEXT: slli a1, a3, 1 +; ZVFHMIN-NEXT: srli a3, a3, 2 +; ZVFHMIN-NEXT: sub a4, a0, a1 +; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3 +; ZVFHMIN-NEXT: sltu a3, a0, a4 +; ZVFHMIN-NEXT: addi a3, a3, -1 +; ZVFHMIN-NEXT: and a3, a3, a4 ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 4 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 5 +; ZVFHMIN-NEXT: slli a4, a4, 3 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 +; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: 
csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 3 ; ZVFHMIN-NEXT: mv a5, a4 @@ -9042,74 +8654,88 @@ define @vfmsub_vf_nxv32f16( %va, half % ; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 4 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v4 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 -; ZVFHMIN-NEXT: mv a4, a3 -; ZVFHMIN-NEXT: slli a3, a3, 1 -; ZVFHMIN-NEXT: add a3, a3, a4 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: sub a2, a0, a2 -; ZVFHMIN-NEXT: srli a1, a1, 2 -; ZVFHMIN-NEXT: addi a3, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 4 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 5 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: sltu a0, a0, a2 -; ZVFHMIN-NEXT: vsetvli a3, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a1 -; ZVFHMIN-NEXT: addi a0, a0, -1 -; ZVFHMIN-NEXT: and a0, a0, a2 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 1 -; ZVFHMIN-NEXT: add a1, a1, a2 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 4 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 -; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t +; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v24, a2 +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 5 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 5 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t +; 
ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: bltu a0, a1, .LBB282_2 +; ZVFHMIN-NEXT: # %bb.1: +; ZVFHMIN-NEXT: mv a0, a1 +; ZVFHMIN-NEXT: .LBB282_2: +; ZVFHMIN-NEXT: vmv1r.v v0, v7 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 4 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t +; ZVFHMIN-NEXT: vmv.v.v v16, v8 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: mv a1, a0 ; ZVFHMIN-NEXT: slli a0, a0, 2 @@ -9138,115 +8764,129 @@ define @vfmsub_vf_nxv32f16_commute( %va ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 5 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 2 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; ZVFHMIN-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-NEXT: lui a3, 8 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmv1r.v v7, v0 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma 
-; ZVFHMIN-NEXT: vmv.v.x v24, a2 +; ZVFHMIN-NEXT: slli a1, a1, 5 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: lui a2, 8 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: vmv.v.x v8, a1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t +; ZVFHMIN-NEXT: slli a1, a3, 1 +; ZVFHMIN-NEXT: srli a3, a3, 2 +; ZVFHMIN-NEXT: sub a2, a0, a1 +; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3 +; ZVFHMIN-NEXT: sltu a3, a0, a2 +; ZVFHMIN-NEXT: addi a3, a3, -1 +; ZVFHMIN-NEXT: and a2, a3, a2 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 4 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v16, v16, a3, v0.t -; ZVFHMIN-NEXT: slli a2, a1, 1 -; ZVFHMIN-NEXT: mv a3, a0 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 4 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: bltu a0, a2, .LBB283_2 -; ZVFHMIN-NEXT: # %bb.1: +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; ZVFHMIN-NEXT: add a2, a2, a3 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 5 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v8, v24, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v20, v8, v0.t +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: bltu a0, a1, .LBB283_2 +; ZVFHMIN-NEXT: # %bb.1: +; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB283_2: -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vmv4r.v v4, v12 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 -; ZVFHMIN-NEXT: mv a5, a4 -; ZVFHMIN-NEXT: slli a4, a4, 1 -; ZVFHMIN-NEXT: add a4, a4, a5 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 -; ZVFHMIN-NEXT: csrr a4, vlenb -; 
ZVFHMIN-NEXT: slli a4, a4, 4 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t -; ZVFHMIN-NEXT: addi a3, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 4 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: sub a2, a0, a2 -; ZVFHMIN-NEXT: srli a1, a1, 2 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v4 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 -; ZVFHMIN-NEXT: mv a4, a3 -; ZVFHMIN-NEXT: slli a3, a3, 1 -; ZVFHMIN-NEXT: add a3, a3, a4 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: sltu a0, a0, a2 -; ZVFHMIN-NEXT: vsetvli a3, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a1 -; ZVFHMIN-NEXT: addi a0, a0, -1 -; ZVFHMIN-NEXT: and a0, a0, a2 +; ZVFHMIN-NEXT: vmv1r.v v0, v7 ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 4 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, 
ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t ; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24, v0.t ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 @@ -9271,53 +8911,42 @@ define @vfmsub_vf_nxv32f16_unmasked( %v ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 2 -; ZVFHMIN-NEXT: add a1, a1, a2 +; ZVFHMIN-NEXT: slli a1, a1, 5 ; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb -; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmv8r.v v24, v16 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a2, fa0 ; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vmset.m v7 +; ZVFHMIN-NEXT: vsetvli a3, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmset.m v24 ; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 5 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 4 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v16, v24, a1 -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vxor.vx v16, v16, a1 ; ZVFHMIN-NEXT: slli a1, a3, 1 ; ZVFHMIN-NEXT: srli a3, a3, 2 ; ZVFHMIN-NEXT: sub a4, a0, a1 ; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a3 +; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a3 ; ZVFHMIN-NEXT: sltu a3, a0, a4 ; ZVFHMIN-NEXT: addi a3, a3, -1 ; ZVFHMIN-NEXT: and a3, a3, a4 -; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v24, a2 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vmv4r.v v8, v24 -; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t +; ZVFHMIN-NEXT: addi a4, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 4 +; ZVFHMIN-NEXT: add a4, sp, a4 
+; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t +; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: mv a4, a2 ; ZVFHMIN-NEXT: slli a2, a2, 1 @@ -9325,63 +8954,59 @@ define @vfmsub_vf_nxv32f16_unmasked( %v ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a4, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; ZVFHMIN-NEXT: add a2, a2, a4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v28, v8 +; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t ; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v20, v8, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB284_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB284_2: ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 5 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v0 -; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 1 -; ZVFHMIN-NEXT: add a1, a1, a2 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 4 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v0, v8, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v24, v0 -; ZVFHMIN-NEXT: vmv8r.v v8, v24 +; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, 
a0, 2 +; ZVFHMIN-NEXT: slli a0, a0, 1 ; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v0, v24, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v16, v0 +; ZVFHMIN-NEXT: vmv8r.v v8, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 5 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 @@ -9406,119 +9031,100 @@ define @vfmsub_vf_nxv32f16_unmasked_commute( @vfnmadd_vv_nxv32f16( %va, @llvm.vp.fneg.nxv32f16( %b, %m, i32 %evl) - %negc = call @llvm.vp.fneg.nxv32f16( %c, %m, i32 %evl) - %v = call @llvm.vp.fma.nxv32f16( %va, %negb, %negc, %m, i32 %evl) - ret %v -} - -define @vfnmadd_vv_nxv32f16_commuted( %va, %b, %c, %m, i32 zeroext %evl) { -; ZVFH-LABEL: vfnmadd_vv_nxv32f16_commuted: -; ZVFH: # %bb.0: -; ZVFH-NEXT: vl8re16.v v24, (a0) -; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFH-NEXT: vfnmadd.vv v8, v16, v24, v0.t -; ZVFH-NEXT: ret -; -; ZVFHMIN-LABEL: vfnmadd_vv_nxv32f16_commuted: -; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: addi sp, sp, -16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: slli a0, a3, 1 +; ZVFHMIN-NEXT: srli a3, a3, 2 +; ZVFHMIN-NEXT: sub a4, a1, a0 +; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3 +; ZVFHMIN-NEXT: sltu a3, a1, a4 +; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZVFHMIN-NEXT: vxor.vx v24, v24, a2, v0.t +; ZVFHMIN-NEXT: addi a3, a3, -1 +; ZVFHMIN-NEXT: and a3, a3, a4 +; ZVFHMIN-NEXT: vmv1r.v v0, v6 +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a4, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; ZVFHMIN-NEXT: add a2, a2, a4 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 4 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 5 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: mv a3, a2 ; ZVFHMIN-NEXT: slli a2, a2, 2 ; ZVFHMIN-NEXT: add a2, a2, a3 -; ZVFHMIN-NEXT: sub sp, sp, a2 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 5 +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: 
slli a2, a2, 4 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vl8re16.v v24, (a0) -; ZVFHMIN-NEXT: lui a2, 8 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: mv a3, a1 -; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t -; ZVFHMIN-NEXT: vxor.vx v24, v24, a2, v0.t -; ZVFHMIN-NEXT: slli a2, a0, 1 -; ZVFHMIN-NEXT: vmv4r.v v4, v20 -; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 -; ZVFHMIN-NEXT: mv a5, a4 -; ZVFHMIN-NEXT: slli a4, a4, 1 -; ZVFHMIN-NEXT: add a4, a4, a5 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: addi a4, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 4 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: bltu a1, a2, .LBB287_2 +; ZVFHMIN-NEXT: bltu a1, a0, .LBB286_2 ; ZVFHMIN-NEXT: # %bb.1: -; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: .LBB287_2: -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 5 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 -; ZVFHMIN-NEXT: mv a5, a4 -; ZVFHMIN-NEXT: slli a4, a4, 1 -; ZVFHMIN-NEXT: add a4, a4, a5 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 4 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v4 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 -; ZVFHMIN-NEXT: mv a4, a3 -; ZVFHMIN-NEXT: slli a3, a3, 1 -; ZVFHMIN-NEXT: add a3, a3, a4 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: addi a3, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 
4 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: sub a2, a1, a2 -; ZVFHMIN-NEXT: srli a0, a0, 2 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 5 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: sltu a1, a1, a2 -; ZVFHMIN-NEXT: vsetvli a3, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a0 -; ZVFHMIN-NEXT: addi a1, a1, -1 -; ZVFHMIN-NEXT: and a1, a1, a2 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: .LBB286_2: +; ZVFHMIN-NEXT: vmv1r.v v0, v7 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24, v0.t ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: mv a2, a0 -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a2 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 5 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: slli a0, a0, 1 ; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 @@ -9802,334 +9316,420 @@ define @vfnmadd_vv_nxv32f16_commuted( % ; ZVFHMIN-NEXT: ret %negb = call 
@llvm.vp.fneg.nxv32f16( %b, %m, i32 %evl) %negc = call @llvm.vp.fneg.nxv32f16( %c, %m, i32 %evl) - %v = call @llvm.vp.fma.nxv32f16( %negb, %va, %negc, %m, i32 %evl) + %v = call @llvm.vp.fma.nxv32f16( %va, %negb, %negc, %m, i32 %evl) ret %v } -define @vfnmadd_vv_nxv32f16_unmasked( %va, %b, %c, i32 zeroext %evl) { -; ZVFH-LABEL: vfnmadd_vv_nxv32f16_unmasked: +define @vfnmadd_vv_nxv32f16_commuted( %va, %b, %c, %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfnmadd_vv_nxv32f16_commuted: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vl8re16.v v24, (a0) ; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFH-NEXT: vfnmadd.vv v8, v16, v24 +; ZVFH-NEXT: vfnmadd.vv v8, v16, v24, v0.t ; ZVFH-NEXT: ret ; -; ZVFHMIN-LABEL: vfnmadd_vv_nxv32f16_unmasked: +; ZVFHMIN-LABEL: vfnmadd_vv_nxv32f16_commuted: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; ZVFHMIN-NEXT: add a2, a2, a3 +; ZVFHMIN-NEXT: sub sp, sp, a2 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb +; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmv1r.v v7, v0 +; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: add a3, a3, a2 +; ZVFHMIN-NEXT: mv a3, a2 ; ZVFHMIN-NEXT: slli a2, a2, 2 ; ZVFHMIN-NEXT: add a2, a2, a3 -; ZVFHMIN-NEXT: sub sp, sp, a2 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vl8re16.v v24, (a0) -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: mv a2, a0 -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a2, a2, a0 -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a2 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: lui a2, 8 -; ZVFHMIN-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v7 ; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v16, v16, a2 +; ZVFHMIN-NEXT: vxor.vx v8, v16, a2, v0.t ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a4, a0, 5 -; ZVFHMIN-NEXT: add a0, a4, a0 +; ZVFHMIN-NEXT: slli a0, a0, 5 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: slli a0, a3, 1 ; ZVFHMIN-NEXT: srli a3, a3, 2 ; ZVFHMIN-NEXT: sub a4, a1, a0 ; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v16, v7, a3 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 4 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs1r.v v16, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3 ; ZVFHMIN-NEXT: sltu a3, a1, a4 -; ZVFHMIN-NEXT: csrr a5, vlenb -; ZVFHMIN-NEXT: mv a6, a5 -; ZVFHMIN-NEXT: slli a5, a5, 3 -; ZVFHMIN-NEXT: add a6, a6, a5 -; ZVFHMIN-NEXT: slli a5, a5, 1 -; ZVFHMIN-NEXT: add a5, a5, a6 -; ZVFHMIN-NEXT: add a5, sp, a5 -; ZVFHMIN-NEXT: addi a5, a5, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v0, v16, a2 -; 
ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v0, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vxor.vx v24, v24, a2, v0.t ; ZVFHMIN-NEXT: addi a3, a3, -1 ; ZVFHMIN-NEXT: and a3, a3, a4 +; ZVFHMIN-NEXT: vmv1r.v v0, v6 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: mv a4, a2 ; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: add a4, a4, a2 +; ZVFHMIN-NEXT: mv a4, a2 ; ZVFHMIN-NEXT: slli a2, a2, 1 ; ZVFHMIN-NEXT: add a2, a2, a4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a4, a2, 4 -; ZVFHMIN-NEXT: add a2, a4, a2 +; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a4, a2, 5 -; ZVFHMIN-NEXT: add a2, a4, a2 +; ZVFHMIN-NEXT: slli a2, a2, 5 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 4 +; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl1r.v v0, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a4, a2, 4 -; ZVFHMIN-NEXT: add a2, a4, a2 +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 2 +; ZVFHMIN-NEXT: add a2, a2, a3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: addi a2, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a3, a2, 5 -; ZVFHMIN-NEXT: add a2, a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 +; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t +; ZVFHMIN-NEXT: vmv.v.v v8, v16 +; 
ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v20, v8, v0.t ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a3, a2, 5 -; ZVFHMIN-NEXT: add a2, a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: bltu a1, a0, .LBB288_2 +; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: bltu a1, a0, .LBB287_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: .LBB288_2: +; ZVFHMIN-NEXT: .LBB287_2: +; ZVFHMIN-NEXT: vmv1r.v v0, v7 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: mv a2, a0 ; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a2, a2, a0 +; ZVFHMIN-NEXT: mv a1, a0 ; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a2 +; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24, v0.t ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a2, a0, 5 -; ZVFHMIN-NEXT: add a0, a2, a0 +; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v0, v16, v24 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v8, v24, v16, v0.t +; ZVFHMIN-NEXT: vmv.v.v v16, v8 +; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a1, a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 ; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 ; ZVFHMIN-NEXT: ret - %negb = call @llvm.vp.fneg.nxv32f16( 
%b, splat (i1 true), i32 %evl) - %negc = call @llvm.vp.fneg.nxv32f16( %c, splat (i1 true), i32 %evl) - %v = call @llvm.vp.fma.nxv32f16( %va, %negb, %negc, splat (i1 true), i32 %evl) + %negb = call @llvm.vp.fneg.nxv32f16( %b, %m, i32 %evl) + %negc = call @llvm.vp.fneg.nxv32f16( %c, %m, i32 %evl) + %v = call @llvm.vp.fma.nxv32f16( %negb, %va, %negc, %m, i32 %evl) ret %v } -define @vfnmadd_vv_nxv32f16_unmasked_commuted( %va, %b, %c, i32 zeroext %evl) { -; ZVFH-LABEL: vfnmadd_vv_nxv32f16_unmasked_commuted: +define @vfnmadd_vv_nxv32f16_unmasked( %va, %b, %c, i32 zeroext %evl) { +; ZVFH-LABEL: vfnmadd_vv_nxv32f16_unmasked: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vl8re16.v v24, (a0) ; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; ZVFH-NEXT: vfnmadd.vv v8, v16, v24 ; ZVFH-NEXT: ret ; -; ZVFHMIN-LABEL: vfnmadd_vv_nxv32f16_unmasked_commuted: +; ZVFHMIN-LABEL: vfnmadd_vv_nxv32f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 5 +; ZVFHMIN-NEXT: sub sp, sp, a2 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: add a3, a3, a2 -; ZVFHMIN-NEXT: slli a2, a2, 2 +; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 ; ZVFHMIN-NEXT: add a2, a2, a3 -; ZVFHMIN-NEXT: sub sp, sp, a2 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb -; ZVFHMIN-NEXT: vsetvli a2, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmv8r.v v0, v16 -; ZVFHMIN-NEXT: vmv8r.v v16, v8 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vl8re16.v v24, (a0) ; ZVFHMIN-NEXT: lui a2, 8 +; ZVFHMIN-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; ZVFHMIN-NEXT: vmset.m v8 ; ZVFHMIN-NEXT: csrr a3, vlenb ; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v0, v0, a2 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a4, a0, 5 -; ZVFHMIN-NEXT: add a0, a4, a0 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vxor.vx v16, v16, a2 ; ZVFHMIN-NEXT: slli a0, a3, 1 ; ZVFHMIN-NEXT: srli a3, a3, 2 ; ZVFHMIN-NEXT: sub a4, a1, a0 ; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v8, v8, a3 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 4 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs1r.v v8, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vslidedown.vx v0, v8, a3 ; ZVFHMIN-NEXT: sltu a3, a1, a4 ; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v0, v24, a2 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v0, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vxor.vx v8, v24, a2 ; ZVFHMIN-NEXT: addi a3, a3, -1 ; ZVFHMIN-NEXT: and a3, a3, a4 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: mv a4, a2 ; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: add a4, a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, 
zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a4, a2, 5 -; ZVFHMIN-NEXT: add a2, a4, a2 +; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 +; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a4, a2, 4 -; ZVFHMIN-NEXT: add a2, a4, a2 +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; ZVFHMIN-NEXT: add a2, a2, a3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v4 +; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t ; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t +; ZVFHMIN-NEXT: bltu a1, a0, .LBB288_2 +; ZVFHMIN-NEXT: # %bb.1: +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: .LBB288_2: +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v0 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: add sp, sp, a0 +; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 +; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 +; ZVFHMIN-NEXT: ret + %negb = call @llvm.vp.fneg.nxv32f16( %b, splat (i1 true), i32 %evl) + %negc = call @llvm.vp.fneg.nxv32f16( %c, splat (i1 true), i32 %evl) + %v = call @llvm.vp.fma.nxv32f16( %va, %negb, %negc, splat (i1 true), i32 %evl) + ret %v +} + +define @vfnmadd_vv_nxv32f16_unmasked_commuted( %va, %b, %c, i32 zeroext %evl) { +; ZVFH-LABEL: vfnmadd_vv_nxv32f16_unmasked_commuted: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vl8re16.v v24, (a0) +; ZVFH-NEXT: vsetvli 
zero, a1, e16, m8, ta, ma +; ZVFH-NEXT: vfnmadd.vv v8, v16, v24 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfnmadd_vv_nxv32f16_unmasked_commuted: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl1r.v v0, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: slli a2, a2, 5 +; ZVFHMIN-NEXT: sub sp, sp, a2 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a4, a2, 4 -; ZVFHMIN-NEXT: add a2, a4, a2 +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; ZVFHMIN-NEXT: add a2, a2, a3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v24, v16, v0.t +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vl8re16.v v24, (a0) +; ZVFHMIN-NEXT: lui a2, 8 +; ZVFHMIN-NEXT: vsetvli a0, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmset.m v7 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v16, a2 +; ZVFHMIN-NEXT: slli a0, a3, 1 +; ZVFHMIN-NEXT: srli a3, a3, 2 +; ZVFHMIN-NEXT: sub a4, a1, a0 +; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a3 +; ZVFHMIN-NEXT: sltu a3, a1, a4 +; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZVFHMIN-NEXT: vxor.vx v24, v24, a2 +; ZVFHMIN-NEXT: addi a3, a3, -1 +; ZVFHMIN-NEXT: and a3, a3, a4 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a3, a2, 5 -; ZVFHMIN-NEXT: add a2, a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 +; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vmv4r.v v16, v8 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a3, a2, 4 -; ZVFHMIN-NEXT: add a2, a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v0, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfncvt.f.f.w v28, v8 +; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; ZVFHMIN-NEXT: add a2, a2, a3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t ; 
ZVFHMIN-NEXT: bltu a1, a0, .LBB289_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a1, a0 ; ZVFHMIN-NEXT: .LBB289_2: ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: mv a2, a0 -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a2, a2, a0 -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a2 +; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v0 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a2, a0, 4 -; ZVFHMIN-NEXT: add a0, a2, a0 +; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v0, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v24, v8 -; ZVFHMIN-NEXT: vmv8r.v v8, v24 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: mv a1, a0 ; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a1, a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 ; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v0, v16, v24 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 5 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 @@ -10159,124 +9759,120 @@ define @vfnmadd_vf_nxv32f16( %va, half ; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb -; ZVFHMIN-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-NEXT: lui a4, 8 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmv1r.v v7, v0 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: lui a2, 8 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: vmv.v.x v24, a1 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v24, a2 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 5 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: mv a3, a0 +; ZVFHMIN-NEXT: slli a1, a1, 5 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a4, v0.t -; ZVFHMIN-NEXT: vxor.vx v16, v16, a4, v0.t -; ZVFHMIN-NEXT: slli a2, a1, 1 -; ZVFHMIN-NEXT: addi a4, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 -; 
ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 -; ZVFHMIN-NEXT: mv a5, a4 -; ZVFHMIN-NEXT: slli a4, a4, 1 -; ZVFHMIN-NEXT: add a4, a4, a5 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vmv4r.v v4, v12 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 4 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: bltu a0, a2, .LBB290_2 -; ZVFHMIN-NEXT: # %bb.1: -; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: .LBB290_2: -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 5 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 -; ZVFHMIN-NEXT: mv a5, a4 -; ZVFHMIN-NEXT: slli a4, a4, 1 -; ZVFHMIN-NEXT: add a4, a4, a5 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 4 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: addi a3, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 -; ZVFHMIN-NEXT: mv a4, a3 -; ZVFHMIN-NEXT: slli a3, a3, 1 -; ZVFHMIN-NEXT: add a3, a3, a4 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4 +; ZVFHMIN-NEXT: vxor.vx v24, v8, a2, v0.t +; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t +; ZVFHMIN-NEXT: slli a1, a3, 1 +; ZVFHMIN-NEXT: srli a3, a3, 2 +; ZVFHMIN-NEXT: sub a2, a0, a1 +; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3 +; ZVFHMIN-NEXT: sltu a3, a0, a2 +; ZVFHMIN-NEXT: addi a3, a3, -1 +; ZVFHMIN-NEXT: and a2, a3, a2 ; ZVFHMIN-NEXT: csrr a3, vlenb ; ZVFHMIN-NEXT: slli a3, a3, 4 ; ZVFHMIN-NEXT: add a3, sp, a3 ; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: sub a2, a0, a2 -; ZVFHMIN-NEXT: srli a1, a1, 2 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 5 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: sltu a0, a0, a2 -; ZVFHMIN-NEXT: vsetvli a3, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a1 -; ZVFHMIN-NEXT: addi a0, a0, -1 -; ZVFHMIN-NEXT: and a0, a0, a2 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 1 -; ZVFHMIN-NEXT: add a1, a1, a2 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; 
ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; ZVFHMIN-NEXT: add a2, a2, a3 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 5 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: bltu a0, a1, .LBB290_2 +; ZVFHMIN-NEXT: # %bb.1: +; ZVFHMIN-NEXT: mv a0, a1 +; ZVFHMIN-NEXT: .LBB290_2: +; ZVFHMIN-NEXT: vmv1r.v v0, v7 ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 4 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t +; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24, v0.t ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t +; ZVFHMIN-NEXT: vmv.v.v v16, v8 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: 
vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: mv a1, a0 @@ -10307,111 +9903,89 @@ define @vfnmadd_vf_nxv32f16_commute( %v ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 5 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 2 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; ZVFHMIN-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-NEXT: lui a4, 8 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmv1r.v v7, v0 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: lui a2, 8 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: vmv.v.x v24, a1 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v24, a2 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: mv a3, a0 +; ZVFHMIN-NEXT: slli a1, a1, 5 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a4, v0.t -; ZVFHMIN-NEXT: vxor.vx v24, v16, a4, v0.t -; ZVFHMIN-NEXT: slli a2, a1, 1 -; ZVFHMIN-NEXT: vmv4r.v v20, v28 -; ZVFHMIN-NEXT: addi a4, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vmv4r.v v4, v12 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 -; ZVFHMIN-NEXT: mv a5, a4 -; ZVFHMIN-NEXT: slli a4, a4, 1 -; ZVFHMIN-NEXT: add a4, a4, a5 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: bltu a0, a2, .LBB291_2 -; ZVFHMIN-NEXT: # %bb.1: -; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: .LBB291_2: -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 4 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 -; ZVFHMIN-NEXT: mv a5, a4 -; ZVFHMIN-NEXT: slli a4, a4, 1 -; ZVFHMIN-NEXT: add a4, a4, a5 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 -; 
ZVFHMIN-NEXT: mv a4, a3 -; ZVFHMIN-NEXT: slli a3, a3, 1 -; ZVFHMIN-NEXT: add a3, a3, a4 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: addi a3, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v4 -; ZVFHMIN-NEXT: sub a2, a0, a2 -; ZVFHMIN-NEXT: srli a1, a1, 2 +; ZVFHMIN-NEXT: vxor.vx v24, v8, a2, v0.t +; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t +; ZVFHMIN-NEXT: slli a1, a3, 1 +; ZVFHMIN-NEXT: srli a3, a3, 2 +; ZVFHMIN-NEXT: sub a2, a0, a1 +; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3 +; ZVFHMIN-NEXT: sltu a3, a0, a2 +; ZVFHMIN-NEXT: addi a3, a3, -1 +; ZVFHMIN-NEXT: and a2, a3, a2 ; ZVFHMIN-NEXT: csrr a3, vlenb ; ZVFHMIN-NEXT: slli a3, a3, 4 ; ZVFHMIN-NEXT: add a3, sp, a3 ; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: addi a3, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: sltu a0, a0, a2 -; ZVFHMIN-NEXT: vsetvli a3, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a1 -; ZVFHMIN-NEXT: addi a0, a0, -1 -; ZVFHMIN-NEXT: and a0, a0, a2 +; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; ZVFHMIN-NEXT: add a2, a2, a3 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 5 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: bltu a0, a1, .LBB291_2 +; ZVFHMIN-NEXT: # %bb.1: +; ZVFHMIN-NEXT: mv a0, a1 +; ZVFHMIN-NEXT: .LBB291_2: +; ZVFHMIN-NEXT: vmv1r.v v0, v7 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: slli a1, a1, 4 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v 
v24, v16, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: mv a1, a0 @@ -10420,11 +9994,46 @@ define @vfnmadd_vf_nxv32f16_commute( %v ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v8, v24, v16, v0.t +; ZVFHMIN-NEXT: vmv.v.v v16, v8 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 @@ -10453,101 +10062,95 @@ define @vfnmadd_vf_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: slli a1, a1, 5 ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: lui a2, 8 -; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmset.m v7 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli a3, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmset.m v24 ; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: vmv.v.x v24, a1 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: mv a4, a1 -; ZVFHMIN-NEXT: slli a1, a1, 1 -; ZVFHMIN-NEXT: add a1, a1, a4 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a2 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vxor.vx v16, v16, a2 -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vxor.vx 
v8, v8, a1 +; ZVFHMIN-NEXT: vxor.vx v16, v16, a1 ; ZVFHMIN-NEXT: slli a1, a3, 1 ; ZVFHMIN-NEXT: srli a3, a3, 2 -; ZVFHMIN-NEXT: sub a2, a0, a1 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a3 -; ZVFHMIN-NEXT: sltu a3, a0, a2 +; ZVFHMIN-NEXT: sub a4, a0, a1 +; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a3 +; ZVFHMIN-NEXT: sltu a3, a0, a4 ; ZVFHMIN-NEXT: addi a3, a3, -1 -; ZVFHMIN-NEXT: and a2, a3, a2 -; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 4 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 -; ZVFHMIN-NEXT: mv a4, a3 -; ZVFHMIN-NEXT: slli a3, a3, 1 -; ZVFHMIN-NEXT: add a3, a3, a4 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 4 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t +; ZVFHMIN-NEXT: and a3, a3, a4 +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 4 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v8, a2 ; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfncvt.f.f.w v28, v8 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB292_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB292_2: ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: mv a2, a1 -; 
ZVFHMIN-NEXT: slli a1, a1, 1 -; ZVFHMIN-NEXT: add a1, a1, a2 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v0 -; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 4 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v24, v8 -; ZVFHMIN-NEXT: vmv8r.v v8, v24 +; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v0, v24, v16 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 5 ; ZVFHMIN-NEXT: add sp, sp, a0 @@ -10578,100 +10181,97 @@ define @vfnmadd_vf_nxv32f16_unmasked_commute( @vfnmadd_vf_nxv32f16_neg_splat( ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 5 -; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: slli a1, a1, 2 ; ZVFHMIN-NEXT: add a1, a1, a2 +; ZVFHMIN-NEXT: sub sp, sp, a1 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmv1r.v v7, v0 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 5 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-NEXT: lui a4, 8 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-NEXT: mv a3, a0 -; ZVFHMIN-NEXT: vxor.vx v8, v8, a4, v0.t -; ZVFHMIN-NEXT: vxor.vx v16, v16, a4, v0.t -; ZVFHMIN-NEXT: slli a2, a1, 1 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 -; ZVFHMIN-NEXT: csrr 
a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 4 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vmv4r.v v4, v12 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: bltu a0, a2, .LBB294_2 -; ZVFHMIN-NEXT: # %bb.1: -; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: .LBB294_2: -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 -; ZVFHMIN-NEXT: mv a5, a4 -; ZVFHMIN-NEXT: slli a4, a4, 1 -; ZVFHMIN-NEXT: add a4, a4, a5 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 4 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t -; ZVFHMIN-NEXT: addi a3, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: lui a2, 8 ; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28 +; ZVFHMIN-NEXT: vmv.v.x v24, a1 +; ZVFHMIN-NEXT: slli a1, a3, 1 +; ZVFHMIN-NEXT: srli a3, a3, 2 +; ZVFHMIN-NEXT: vxor.vx v8, v24, a2, v0.t +; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t +; ZVFHMIN-NEXT: sub a2, a0, a1 +; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3 +; ZVFHMIN-NEXT: sltu a3, a0, a2 +; ZVFHMIN-NEXT: addi a3, a3, -1 +; ZVFHMIN-NEXT: and a2, a3, a2 ; ZVFHMIN-NEXT: csrr a3, vlenb ; ZVFHMIN-NEXT: slli a3, a3, 4 ; ZVFHMIN-NEXT: add a3, sp, a3 ; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v4 -; ZVFHMIN-NEXT: sub a2, a0, a2 -; ZVFHMIN-NEXT: srli a1, a1, 2 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 -; ZVFHMIN-NEXT: mv a4, a3 -; ZVFHMIN-NEXT: slli a3, a3, 1 -; ZVFHMIN-NEXT: add a3, a3, a4 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: sltu a0, a0, a2 -; ZVFHMIN-NEXT: vsetvli a3, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a1 -; ZVFHMIN-NEXT: addi a0, a0, -1 -; ZVFHMIN-NEXT: and a0, a0, a2 +; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; ZVFHMIN-NEXT: add a2, a2, a3 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 5 +; 
ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: bltu a0, a1, .LBB294_2 +; ZVFHMIN-NEXT: # %bb.1: +; ZVFHMIN-NEXT: mv a0, a1 +; ZVFHMIN-NEXT: .LBB294_2: +; ZVFHMIN-NEXT: vmv1r.v v0, v7 ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 4 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v8, v24, v16, v0.t +; ZVFHMIN-NEXT: vmv.v.v v16, v8 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli 
a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 @@ -10839,49 +10460,28 @@ define @vfnmadd_vf_nxv32f16_neg_splat_commute( @vfnmadd_vf_nxv32f16_neg_splat_commute( @vfnmadd_vf_nxv32f16_neg_splat_unmasked( @vfnmadd_vf_nxv32f16_neg_splat_unmasked( @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute( @vfnmsub_vv_nxv32f16( %va, @vfnmsub_vv_nxv32f16_commuted( % ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 4 +; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; ZVFHMIN-NEXT: add a2, a2, a3 +; ZVFHMIN-NEXT: sub sp, sp, a2 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb +; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmv1r.v v7, v0 +; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: mv a3, a2 ; ZVFHMIN-NEXT: slli a2, a2, 2 ; ZVFHMIN-NEXT: add a2, a2, a3 -; ZVFHMIN-NEXT: sub sp, sp, a2 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vl8re16.v v24, (a0) +; ZVFHMIN-NEXT: lui a2, 8 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: vxor.vx v8, v16, a2, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: slli a0, a3, 1 +; ZVFHMIN-NEXT: srli a3, a3, 2 +; ZVFHMIN-NEXT: sub a4, a1, a0 +; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3 +; ZVFHMIN-NEXT: sltu a3, a1, a4 +; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZVFHMIN-NEXT: vxor.vx v24, v24, a2, v0.t +; ZVFHMIN-NEXT: addi a3, a3, -1 +; ZVFHMIN-NEXT: and a3, a3, a4 +; ZVFHMIN-NEXT: vmv1r.v v0, v6 +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a4, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; ZVFHMIN-NEXT: add a2, a2, a4 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 4 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 5 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 2 +; ZVFHMIN-NEXT: add a2, a2, a3 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: 
vs8r.v v16, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 4 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t +; ZVFHMIN-NEXT: vmv.v.v v8, v16 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v20, v8, v0.t ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 5 +; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vl8re16.v v24, (a0) -; ZVFHMIN-NEXT: lui a2, 8 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: mv a3, a1 -; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t -; ZVFHMIN-NEXT: vxor.vx v24, v24, a2, v0.t -; ZVFHMIN-NEXT: slli a2, a0, 1 -; ZVFHMIN-NEXT: vmv4r.v v4, v20 -; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 -; ZVFHMIN-NEXT: mv a5, a4 -; ZVFHMIN-NEXT: slli a4, a4, 1 -; ZVFHMIN-NEXT: add a4, a4, a5 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: addi a4, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 4 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: bltu a1, a2, .LBB299_2 +; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: bltu a1, a0, .LBB299_2 ; ZVFHMIN-NEXT: # %bb.1: -; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: mv a1, a0 ; ZVFHMIN-NEXT: .LBB299_2: -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 5 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 -; ZVFHMIN-NEXT: mv a5, a4 -; ZVFHMIN-NEXT: slli a4, a4, 1 -; ZVFHMIN-NEXT: add a4, a4, a5 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 4 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v4 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 -; ZVFHMIN-NEXT: mv a4, a3 -; ZVFHMIN-NEXT: slli a3, a3, 1 -; ZVFHMIN-NEXT: add a3, a3, a4 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; 
ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: addi a3, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 4 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: sub a2, a1, a2 -; ZVFHMIN-NEXT: srli a0, a0, 2 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 5 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: sltu a1, a1, a2 -; ZVFHMIN-NEXT: vsetvli a3, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a0 -; ZVFHMIN-NEXT: addi a1, a1, -1 -; ZVFHMIN-NEXT: and a1, a1, a2 +; ZVFHMIN-NEXT: vmv1r.v v0, v7 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: mv a2, a0 +; ZVFHMIN-NEXT: mv a1, a0 ; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a2 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: slli a0, a0, 5 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v8, v24, v16, v0.t +; ZVFHMIN-NEXT: vmv.v.v v16, v8 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: slli a0, a0, 1 ; 
ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 @@ -11582,157 +11227,100 @@ define @vfnmsub_vv_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 5 +; ZVFHMIN-NEXT: sub sp, sp, a2 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: add a3, a3, a2 -; ZVFHMIN-NEXT: slli a2, a2, 2 +; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 ; ZVFHMIN-NEXT: add a2, a2, a3 -; ZVFHMIN-NEXT: sub sp, sp, a2 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vl8re16.v v24, (a0) -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: mv a2, a0 -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a2, a2, a0 -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a2 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: lui a2, 8 ; ZVFHMIN-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v7 +; ZVFHMIN-NEXT: vmset.m v8 ; ZVFHMIN-NEXT: csrr a3, vlenb ; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; ZVFHMIN-NEXT: vxor.vx v16, v16, a2 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a4, a0, 5 -; ZVFHMIN-NEXT: add a0, a4, a0 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: slli a0, a3, 1 ; ZVFHMIN-NEXT: srli a3, a3, 2 ; ZVFHMIN-NEXT: sub a4, a1, a0 ; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v16, v7, a3 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 4 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs1r.v v16, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vslidedown.vx v0, v8, a3 ; ZVFHMIN-NEXT: sltu a3, a1, a4 -; ZVFHMIN-NEXT: csrr a5, vlenb -; ZVFHMIN-NEXT: mv a6, a5 -; ZVFHMIN-NEXT: slli a5, a5, 3 -; ZVFHMIN-NEXT: add a6, a6, a5 -; ZVFHMIN-NEXT: slli a5, a5, 1 -; ZVFHMIN-NEXT: add a5, a5, a6 -; ZVFHMIN-NEXT: add a5, sp, a5 -; ZVFHMIN-NEXT: addi a5, a5, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v0, v16, a2 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v0, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vxor.vx v8, v24, a2 ; ZVFHMIN-NEXT: addi a3, a3, -1 ; ZVFHMIN-NEXT: and a3, a3, a4 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: mv a4, a2 ; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: add a4, a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a4, a2, 4 -; ZVFHMIN-NEXT: add a2, a4, a2 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; 
ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a4, a2, 5 -; ZVFHMIN-NEXT: add a2, a4, a2 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4 +; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t ; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl1r.v v0, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a4, a2, 4 -; ZVFHMIN-NEXT: add a2, a4, a2 +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; ZVFHMIN-NEXT: add a2, a2, a3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t ; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a3, a2, 5 -; ZVFHMIN-NEXT: add a2, a3, a2 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a3, a2, 5 -; ZVFHMIN-NEXT: add a2, a3, a2 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t ; ZVFHMIN-NEXT: bltu a1, a0, .LBB300_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a1, a0 ; ZVFHMIN-NEXT: .LBB300_2: ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: mv a2, a0 -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a2, a2, a0 -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a2 +; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a2, a0, 5 -; ZVFHMIN-NEXT: add a0, a2, a0 +; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; ZVFHMIN-NEXT: 
vfmadd.vv v0, v16, v24 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: mv a1, a0 ; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a1, a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 ; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v0 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 5 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 @@ -11757,142 +11345,101 @@ define @vfnmsub_vv_nxv32f16_unmasked_commuted( @vfnmsub_vf_nxv32f16( %va, half ; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmv1r.v v7, v0 ; ZVFHMIN-NEXT: vmv8r.v v24, v16 ; ZVFHMIN-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-NEXT: lui a3, 8 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: vmv.v.x v16, a2 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 5 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v16, v8, a3, v0.t -; ZVFHMIN-NEXT: slli a2, a1, 1 -; ZVFHMIN-NEXT: mv a3, a0 -; ZVFHMIN-NEXT: vmv4r.v v12, v20 -; ZVFHMIN-NEXT: addi a4, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 -; ZVFHMIN-NEXT: mv a5, a4 -; ZVFHMIN-NEXT: slli a4, a4, 1 -; ZVFHMIN-NEXT: add a4, a4, a5 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: bltu a0, a2, .LBB302_2 -; ZVFHMIN-NEXT: # %bb.1: -; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: .LBB302_2: -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: vxor.vx v16, v8, a1, v0.t +; ZVFHMIN-NEXT: slli a1, a3, 1 +; ZVFHMIN-NEXT: srli a3, a3, 2 +; ZVFHMIN-NEXT: sub a4, a0, a1 +; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3 +; ZVFHMIN-NEXT: sltu a3, a0, a4 +; ZVFHMIN-NEXT: addi a3, a3, -1 +; ZVFHMIN-NEXT: and a3, a3, a4 ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 4 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vmv4r.v v4, v28 +; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 5 +; ZVFHMIN-NEXT: slli a4, a4, 3 ; ZVFHMIN-NEXT: add a4, sp, 
a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 +; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 3 ; ZVFHMIN-NEXT: mv a5, a4 @@ -11974,73 +11503,87 @@ define @vfnmsub_vf_nxv32f16( %va, half ; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 4 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; ZVFHMIN-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t +; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v24, a2 +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 5 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 5 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: addi a3, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 -; ZVFHMIN-NEXT: mv a4, a3 -; ZVFHMIN-NEXT: slli a3, a3, 1 -; ZVFHMIN-NEXT: add a3, a3, a4 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: sub a2, a0, a2 -; ZVFHMIN-NEXT: srli a1, a1, 2 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 4 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 5 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: sltu a0, a0, a2 -; ZVFHMIN-NEXT: vsetvli a3, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a1 -; ZVFHMIN-NEXT: addi a0, a0, -1 -; ZVFHMIN-NEXT: and a0, a0, a2 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 1 -; ZVFHMIN-NEXT: add a1, a1, a2 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; 
ZVFHMIN-NEXT: bltu a0, a1, .LBB302_2 +; ZVFHMIN-NEXT: # %bb.1: +; ZVFHMIN-NEXT: mv a0, a1 +; ZVFHMIN-NEXT: .LBB302_2: +; ZVFHMIN-NEXT: vmv1r.v v0, v7 ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 4 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t +; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 5 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v8, v24, v16, v0.t +; ZVFHMIN-NEXT: vmv.v.v v16, v8 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: mv a1, a0 @@ -12070,116 +11613,130 @@ define @vfnmsub_vf_nxv32f16_commute( %v ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 5 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 2 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv8r.v v24, v16 -; ZVFHMIN-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-NEXT: lui a3, 8 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: vmv.v.x v16, a2 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 
0x1e, 0x22 # sp + 16 + 40 * vlenb ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a3, v0.t -; ZVFHMIN-NEXT: slli a2, a1, 1 -; ZVFHMIN-NEXT: mv a3, a0 +; ZVFHMIN-NEXT: vmv1r.v v7, v0 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: slli a1, a3, 1 +; ZVFHMIN-NEXT: srli a3, a3, 2 +; ZVFHMIN-NEXT: sub a4, a0, a1 +; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3 +; ZVFHMIN-NEXT: sltu a3, a0, a4 +; ZVFHMIN-NEXT: addi a3, a3, -1 +; ZVFHMIN-NEXT: and a3, a3, a4 ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 4 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: bltu a0, a2, .LBB303_2 +; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v16, a2 +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 5 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 5 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: bltu a0, a1, .LBB303_2 ; ZVFHMIN-NEXT: # %bb.1: -; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB303_2: -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vmv4r.v v4, v28 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 -; ZVFHMIN-NEXT: mv a5, a4 -; ZVFHMIN-NEXT: slli a4, a4, 1 -; ZVFHMIN-NEXT: add a4, a4, a5 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, 
a4, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t -; ZVFHMIN-NEXT: addi a3, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 4 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: sub a2, a0, a2 -; ZVFHMIN-NEXT: srli a1, a1, 2 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 4 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 -; ZVFHMIN-NEXT: mv a4, a3 -; ZVFHMIN-NEXT: slli a3, a3, 1 -; ZVFHMIN-NEXT: add a3, a3, a4 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: sltu a0, a0, a2 -; ZVFHMIN-NEXT: vsetvli a3, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a1 -; ZVFHMIN-NEXT: addi a0, a0, -1 -; ZVFHMIN-NEXT: and a0, a0, a2 +; ZVFHMIN-NEXT: vmv1r.v v0, v7 ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 4 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t +; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t ; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli 
zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 @@ -12204,51 +11761,41 @@ define @vfnmsub_vf_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 2 -; ZVFHMIN-NEXT: add a1, a1, a2 +; ZVFHMIN-NEXT: slli a1, a1, 5 ; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a2, fa0 ; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: vsetvli a3, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v7 +; ZVFHMIN-NEXT: vmset.m v24 ; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 5 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 4 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v16, v8, a1 -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: slli a1, a3, 1 ; ZVFHMIN-NEXT: srli a3, a3, 2 ; ZVFHMIN-NEXT: sub a4, a0, a1 ; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a3 +; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a3 ; ZVFHMIN-NEXT: sltu a3, a0, a4 ; ZVFHMIN-NEXT: addi a3, a3, -1 ; ZVFHMIN-NEXT: and a3, a3, a4 +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: addi a4, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 4 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t ; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v24, a2 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vmv4r.v v8, v24 +; ZVFHMIN-NEXT: vmv.v.x v16, a2 ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: 
slli a2, a2, 3 ; ZVFHMIN-NEXT: mv a4, a2 @@ -12256,64 +11803,59 @@ define @vfnmsub_vf_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: add a2, a2, a4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a4, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; ZVFHMIN-NEXT: add a2, a2, a4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v24, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v28, v8 +; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t ; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB304_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB304_2: ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 5 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v0 -; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 1 -; ZVFHMIN-NEXT: add a1, a1, a2 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 4 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v0, v16, v8 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v24, v0 -; ZVFHMIN-NEXT: vmv8r.v v8, v24 +; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: slli a0, a0, 1 ; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; 
ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v0, v16, v24 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 5 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 @@ -12338,46 +11880,41 @@ define @vfnmsub_vf_nxv32f16_unmasked_commute( @vfnmsub_vf_nxv32f16_unmasked_commute( @vfnmsub_vf_nxv32f16_neg_splat( ; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmv1r.v v7, v0 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 4 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill @@ -12488,113 +12020,110 @@ define @vfnmsub_vf_nxv32f16_neg_splat( ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-NEXT: lui a3, 8 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v16, a2 -; ZVFHMIN-NEXT: vxor.vx v24, v16, a3, v0.t -; ZVFHMIN-NEXT: slli a2, a1, 1 -; ZVFHMIN-NEXT: mv a3, a0 -; ZVFHMIN-NEXT: vmv4r.v v20, v28 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 -; ZVFHMIN-NEXT: mv a5, a4 -; ZVFHMIN-NEXT: slli a4, a4, 1 -; ZVFHMIN-NEXT: add a4, a4, a5 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 -; ZVFHMIN-NEXT: bltu a0, a2, .LBB306_2 -; ZVFHMIN-NEXT: # %bb.1: -; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: .LBB306_2: -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 4 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vmv4r.v v4, v12 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 5 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 -; ZVFHMIN-NEXT: mv a4, a3 -; ZVFHMIN-NEXT: slli a3, a3, 1 -; ZVFHMIN-NEXT: add a3, a3, a4 -; ZVFHMIN-NEXT: add a3, 
sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28 -; ZVFHMIN-NEXT: sub a2, a0, a2 -; ZVFHMIN-NEXT: srli a1, a1, 2 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v4 -; ZVFHMIN-NEXT: addi a3, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: lui a2, 8 ; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 5 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: vmv.v.x v16, a1 +; ZVFHMIN-NEXT: slli a1, a3, 1 +; ZVFHMIN-NEXT: srli a3, a3, 2 +; ZVFHMIN-NEXT: vxor.vx v8, v16, a2, v0.t +; ZVFHMIN-NEXT: sub a2, a0, a1 +; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3 +; ZVFHMIN-NEXT: sltu a3, a0, a2 +; ZVFHMIN-NEXT: addi a3, a3, -1 +; ZVFHMIN-NEXT: and a2, a3, a2 ; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 -; ZVFHMIN-NEXT: mv a4, a3 -; ZVFHMIN-NEXT: slli a3, a3, 1 -; ZVFHMIN-NEXT: add a3, a3, a4 +; ZVFHMIN-NEXT: slli a3, a3, 4 ; ZVFHMIN-NEXT: add a3, sp, a3 ; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: sltu a0, a0, a2 -; ZVFHMIN-NEXT: vsetvli a3, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a1 -; ZVFHMIN-NEXT: addi a0, a0, -1 -; ZVFHMIN-NEXT: and a0, a0, a2 +; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; ZVFHMIN-NEXT: add a2, a2, a3 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 5 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: bltu a0, a1, .LBB306_2 +; ZVFHMIN-NEXT: # %bb.1: +; ZVFHMIN-NEXT: mv a0, a1 +; ZVFHMIN-NEXT: .LBB306_2: +; ZVFHMIN-NEXT: vmv1r.v v0, v7 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 1 -; ZVFHMIN-NEXT: add a1, a1, a2 +; ZVFHMIN-NEXT: slli a1, a1, 4 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: vl8r.v 
v24, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24, v0.t +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: mv a1, a0 @@ -12630,106 +12159,93 @@ define @vfnmsub_vf_nxv32f16_neg_splat_commute( @vfnmsub_vf_nxv32f16_neg_splat_commute( @vfnmsub_vf_nxv32f16_neg_splat_unmasked( @vfnmsub_vf_nxv32f16_neg_splat_unmasked_commute( @vfma_vv_nxv1f16_double_neg( %a, < ; ; ZVFHMIN-LABEL: vfma_vv_nxv1f16_double_neg: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v12, v10, v11, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t ; ZVFHMIN-NEXT: ret %nega = call @llvm.vp.fneg.nxv1f16( %a, %m, i32 %evl) %negb = call @llvm.vp.fneg.nxv1f16( %b, %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll index 74d289951530a..4523b43274eff 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll @@ -18,12 +18,12 @@ define @vfmax_vv_nxv1bf16( %va, @llvm.vp.maxnum.nxv1bf16( %va, %vb, %m, i32 %evl) ret %v @@ -37,7 +37,7 @@ define @vfmax_vv_nxv1bf16_unmasked( % ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, 
zero, e32, mf2, ta, ma ; CHECK-NEXT: vfmax.vv v9, v9, v10 -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: ret %v = call @llvm.vp.maxnum.nxv1bf16( %va, %vb, splat (i1 true), i32 %evl) @@ -50,12 +50,12 @@ define @vfmax_vv_nxv2bf16( %va, @llvm.vp.maxnum.nxv2bf16( %va, %vb, %m, i32 %evl) ret %v @@ -69,7 +69,7 @@ define @vfmax_vv_nxv2bf16_unmasked( % ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfmax.vv v9, v9, v10 -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: ret %v = call @llvm.vp.maxnum.nxv2bf16( %va, %vb, splat (i1 true), i32 %evl) @@ -82,12 +82,12 @@ define @vfmax_vv_nxv4bf16( %va, @llvm.vp.maxnum.nxv4bf16( %va, %vb, %m, i32 %evl) ret %v @@ -101,7 +101,7 @@ define @vfmax_vv_nxv4bf16_unmasked( % ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfmax.vv v10, v12, v10 -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.maxnum.nxv4bf16( %va, %vb, splat (i1 true), i32 %evl) @@ -114,12 +114,12 @@ define @vfmax_vv_nxv8bf16( %va, @llvm.vp.maxnum.nxv8bf16( %va, %vb, %m, i32 %evl) ret %v @@ -133,7 +133,7 @@ define @vfmax_vv_nxv8bf16_unmasked( % ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfmax.vv v12, v16, v12 -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 ; CHECK-NEXT: ret %v = call @llvm.vp.maxnum.nxv8bf16( %va, %vb, splat (i1 true), i32 %evl) @@ -146,12 +146,12 @@ define @vfmax_vv_nxv16bf16( %va, @llvm.vp.maxnum.nxv16bf16( %va, %vb, %m, i32 %evl) ret %v @@ -165,7 +165,7 @@ define @vfmax_vv_nxv16bf16_unmasked( @llvm.vp.maxnum.nxv16bf16( %va, %vb, splat (i1 true), i32 %evl) @@ -181,10 +181,18 @@ define @vfmax_vv_nxv32bf16( %va, @vfmax_vv_nxv32bf16( %va, @vfmax_vv_nxv32bf16_unmasked( @vfmax_vv_nxv32bf16_unmasked( @vfmax_vv_nxv1f16( %va, @llvm.vp.maxnum.nxv1f16( %va, %vb, %m, i32 %evl) ret %v @@ -316,7 +352,7 @@ define @vfmax_vv_nxv1f16_unmasked( %va, < ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfmax.vv v9, v9, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.maxnum.nxv1f16( %va, %vb, splat (i1 true), i32 %evl) @@ -335,12 +371,12 @@ define @vfmax_vv_nxv2f16( %va, @llvm.vp.maxnum.nxv2f16( %va, %vb, %m, i32 %evl) ret %v @@ -360,7 +396,7 @@ define @vfmax_vv_nxv2f16_unmasked( %va, < ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfmax.vv v9, v9, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.maxnum.nxv2f16( %va, %vb, splat (i1 true), i32 %evl) @@ -379,12 +415,12 @@ define @vfmax_vv_nxv4f16( %va, @llvm.vp.maxnum.nxv4f16( %va, %vb, %m, i32 %evl) ret %v @@ -404,7 +440,7 @@ define @vfmax_vv_nxv4f16_unmasked( %va, < ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmax.vv v10, 
v12, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.maxnum.nxv4f16( %va, %vb, splat (i1 true), i32 %evl) @@ -423,12 +459,12 @@ define @vfmax_vv_nxv8f16( %va, @llvm.vp.maxnum.nxv8f16( %va, %vb, %m, i32 %evl) ret %v @@ -448,7 +484,7 @@ define @vfmax_vv_nxv8f16_unmasked( %va, < ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfmax.vv v12, v16, v12 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.maxnum.nxv8f16( %va, %vb, splat (i1 true), i32 %evl) @@ -467,12 +503,12 @@ define @vfmax_vv_nxv16f16( %va, @llvm.vp.maxnum.nxv16f16( %va, %vb, %m, i32 %evl) ret %v @@ -492,7 +528,7 @@ define @vfmax_vv_nxv16f16_unmasked( %va ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmax.vv v16, v24, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.maxnum.nxv16f16( %va, %vb, splat (i1 true), i32 %evl) @@ -514,10 +550,18 @@ define @vfmax_vv_nxv32f16( %va, @vfmax_vv_nxv32f16( %va, @vfmax_vv_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: addi a3, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmax.vv v16, v16, v24, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 @@ -606,7 +678,7 @@ define @vfmax_vv_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmax.vv v16, v24, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll index 1aaddef39bc1c..a621dc282beb3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll @@ -18,12 +18,12 @@ define @vfmin_vv_nxv1bf16( %va, @llvm.vp.minnum.nxv1bf16( %va, %vb, %m, i32 %evl) ret %v @@ -37,7 +37,7 @@ define @vfmin_vv_nxv1bf16_unmasked( % ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfmin.vv v9, v9, v10 -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: ret %v = call @llvm.vp.minnum.nxv1bf16( %va, %vb, splat (i1 true), i32 %evl) @@ -50,12 +50,12 @@ define @vfmin_vv_nxv2bf16( %va, @llvm.vp.minnum.nxv2bf16( %va, %vb, %m, i32 %evl) ret %v @@ -69,7 +69,7 @@ define @vfmin_vv_nxv2bf16_unmasked( % ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfmin.vv v9, v9, v10 -; CHECK-NEXT: vsetvli 
a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: ret %v = call @llvm.vp.minnum.nxv2bf16( %va, %vb, splat (i1 true), i32 %evl) @@ -82,12 +82,12 @@ define @vfmin_vv_nxv4bf16( %va, @llvm.vp.minnum.nxv4bf16( %va, %vb, %m, i32 %evl) ret %v @@ -101,7 +101,7 @@ define @vfmin_vv_nxv4bf16_unmasked( % ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfmin.vv v10, v12, v10 -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.minnum.nxv4bf16( %va, %vb, splat (i1 true), i32 %evl) @@ -114,12 +114,12 @@ define @vfmin_vv_nxv8bf16( %va, @llvm.vp.minnum.nxv8bf16( %va, %vb, %m, i32 %evl) ret %v @@ -133,7 +133,7 @@ define @vfmin_vv_nxv8bf16_unmasked( % ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfmin.vv v12, v16, v12 -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 ; CHECK-NEXT: ret %v = call @llvm.vp.minnum.nxv8bf16( %va, %vb, splat (i1 true), i32 %evl) @@ -146,12 +146,12 @@ define @vfmin_vv_nxv16bf16( %va, @llvm.vp.minnum.nxv16bf16( %va, %vb, %m, i32 %evl) ret %v @@ -165,7 +165,7 @@ define @vfmin_vv_nxv16bf16_unmasked( @llvm.vp.minnum.nxv16bf16( %va, %vb, splat (i1 true), i32 %evl) @@ -181,10 +181,18 @@ define @vfmin_vv_nxv32bf16( %va, @vfmin_vv_nxv32bf16( %va, @vfmin_vv_nxv32bf16_unmasked( @vfmin_vv_nxv32bf16_unmasked( @vfmin_vv_nxv1f16( %va, @llvm.vp.minnum.nxv1f16( %va, %vb, %m, i32 %evl) ret %v @@ -316,7 +352,7 @@ define @vfmin_vv_nxv1f16_unmasked( %va, < ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfmin.vv v9, v9, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.minnum.nxv1f16( %va, %vb, splat (i1 true), i32 %evl) @@ -335,12 +371,12 @@ define @vfmin_vv_nxv2f16( %va, @llvm.vp.minnum.nxv2f16( %va, %vb, %m, i32 %evl) ret %v @@ -360,7 +396,7 @@ define @vfmin_vv_nxv2f16_unmasked( %va, < ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfmin.vv v9, v9, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.minnum.nxv2f16( %va, %vb, splat (i1 true), i32 %evl) @@ -379,12 +415,12 @@ define @vfmin_vv_nxv4f16( %va, @llvm.vp.minnum.nxv4f16( %va, %vb, %m, i32 %evl) ret %v @@ -404,7 +440,7 @@ define @vfmin_vv_nxv4f16_unmasked( %va, < ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmin.vv v10, v12, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.minnum.nxv4f16( %va, %vb, splat (i1 true), i32 %evl) @@ -423,12 +459,12 @@ define @vfmin_vv_nxv8f16( %va, @llvm.vp.minnum.nxv8f16( %va, %vb, %m, i32 %evl) ret %v @@ -448,7 +484,7 @@ define @vfmin_vv_nxv8f16_unmasked( %va, < ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfmin.vv v12, v16, v12 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; 
ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.minnum.nxv8f16( %va, %vb, splat (i1 true), i32 %evl) @@ -467,12 +503,12 @@ define @vfmin_vv_nxv16f16( %va, @llvm.vp.minnum.nxv16f16( %va, %vb, %m, i32 %evl) ret %v @@ -492,7 +528,7 @@ define @vfmin_vv_nxv16f16_unmasked( %va ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmin.vv v16, v24, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.minnum.nxv16f16( %va, %vb, splat (i1 true), i32 %evl) @@ -514,10 +550,18 @@ define @vfmin_vv_nxv32f16( %va, @vfmin_vv_nxv32f16( %va, @vfmin_vv_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: addi a3, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmin.vv v16, v16, v24, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 @@ -606,7 +678,7 @@ define @vfmin_vv_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmin.vv v16, v24, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll index 06f74dd995748..c1617cd365216 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll @@ -20,12 +20,12 @@ define @vfmul_vv_nxv1f16( %va, @llvm.vp.fmul.nxv1f16( %va, %b, %m, i32 %evl) ret %v @@ -45,7 +45,7 @@ define @vfmul_vv_nxv1f16_unmasked( %va, < ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v9, v9, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.fmul.nxv1f16( %va, %b, splat (i1 true), i32 %evl) @@ -64,12 +64,12 @@ define @vfmul_vf_nxv1f16( %va, half %b, < ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v9, v10, v8, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -93,7 +93,7 @@ define @vfmul_vf_nxv1f16_unmasked( %va, h ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; 
ZVFHMIN-NEXT: vfmul.vv v9, v10, v8 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -114,12 +114,12 @@ define @vfmul_vv_nxv2f16( %va, @llvm.vp.fmul.nxv2f16( %va, %b, %m, i32 %evl) ret %v @@ -139,7 +139,7 @@ define @vfmul_vv_nxv2f16_unmasked( %va, < ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v9, v9, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.fmul.nxv2f16( %va, %b, splat (i1 true), i32 %evl) @@ -158,12 +158,12 @@ define @vfmul_vf_nxv2f16( %va, half %b, < ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v9, v10, v8, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -187,7 +187,7 @@ define @vfmul_vf_nxv2f16_unmasked( %va, h ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v9, v10, v8 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -208,12 +208,12 @@ define @vfmul_vv_nxv4f16( %va, @llvm.vp.fmul.nxv4f16( %va, %b, %m, i32 %evl) ret %v @@ -233,7 +233,7 @@ define @vfmul_vv_nxv4f16_unmasked( %va, < ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v10, v12, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.fmul.nxv4f16( %va, %b, splat (i1 true), i32 %evl) @@ -252,12 +252,12 @@ define @vfmul_vf_nxv4f16( %va, half %b, < ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v10, v10, v12, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -281,7 +281,7 @@ define @vfmul_vf_nxv4f16_unmasked( %va, h ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v10, v10, v12 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret %elt.head = 
insertelement poison, half %b, i32 0 @@ -302,12 +302,12 @@ define @vfmul_vv_nxv8f16( %va, @llvm.vp.fmul.nxv8f16( %va, %b, %m, i32 %evl) ret %v @@ -327,7 +327,7 @@ define @vfmul_vv_nxv8f16_unmasked( %va, < ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v12, v16, v12 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.fmul.nxv8f16( %va, %b, splat (i1 true), i32 %evl) @@ -346,12 +346,12 @@ define @vfmul_vf_nxv8f16( %va, half %b, < ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v12, v12, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -375,7 +375,7 @@ define @vfmul_vf_nxv8f16_unmasked( %va, h ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v12, v12, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -396,12 +396,12 @@ define @vfmul_vv_nxv16f16( %va, @llvm.vp.fmul.nxv16f16( %va, %b, %m, i32 %evl) ret %v @@ -421,7 +421,7 @@ define @vfmul_vv_nxv16f16_unmasked( %va ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v16, v24, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.fmul.nxv16f16( %va, %b, splat (i1 true), i32 %evl) @@ -440,12 +440,12 @@ define @vfmul_vf_nxv16f16( %va, half %b ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v12, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v16, v16, v24, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -469,7 +469,7 @@ define @vfmul_vf_nxv16f16_unmasked( %va ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v16, v16, v24 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -493,10 +493,18 @@ define @vfmul_vv_nxv32f16( %va, @vfmul_vv_nxv32f16( %va, @vfmul_vv_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: addi a3, sp, 16 ; ZVFHMIN-NEXT: 
vs8r.v v16, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v16, v16, v24, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB21_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 @@ -585,7 +621,7 @@ define @vfmul_vv_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v16, v24, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 @@ -610,76 +646,76 @@ define @vfmul_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a2, a1, 4 -; ZVFHMIN-NEXT: add a1, a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv8r.v v24, v8 +; ZVFHMIN-NEXT: vmv1r.v v7, v0 +; ZVFHMIN-NEXT: vmv8r.v v16, v8 ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: vmv.v.x v16, a1 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a3, a1, 3 -; ZVFHMIN-NEXT: add a1, a3, a1 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vmv.v.x v8, a1 ; ZVFHMIN-NEXT: slli a1, a2, 1 ; ZVFHMIN-NEXT: srli a2, a2, 2 ; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2 ; ZVFHMIN-NEXT: sltu a2, a0, a3 ; ZVFHMIN-NEXT: addi a2, a2, -1 ; ZVFHMIN-NEXT: and a2, a2, a3 -; ZVFHMIN-NEXT: addi a3, sp, 16 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 3 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: vmv4r.v v8, v16 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a3, a2, 3 -; ZVFHMIN-NEXT: add a2, a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmul.vv v16, v8, v16, v0.t -; 
ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vfmul.vv v24, v8, v24, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB22_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB22_2: -; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vmv1r.v v0, v7 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 4 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a1, a0, 3 -; ZVFHMIN-NEXT: add a0, a1, a0 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24, v0.t +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl1r.v v0, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24, v0.t +; ZVFHMIN-NEXT: vmv8r.v v24, v16 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmul.vv v16, v16, v24, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vfmul.vv v24, v16, v24, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24, v0.t ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a1, a0, 4 -; ZVFHMIN-NEXT: add a0, a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 @@ -703,19 +739,14 @@ define @vfmul_vf_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 4 +; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmset.m v24 ; ZVFHMIN-NEXT: vmv.v.x v16, a1 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: slli a1, a2, 1 ; ZVFHMIN-NEXT: srli a2, a2, 2 ; ZVFHMIN-NEXT: sub a3, a0, a1 @@ -724,41 +755,30 @@ define @vfmul_vf_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: sltu a2, a0, a3 ; ZVFHMIN-NEXT: addi a2, a2, -1 ; ZVFHMIN-NEXT: and a2, a2, a3 -; ZVFHMIN-NEXT: vmv8r.v v16, v8 ; ZVFHMIN-NEXT: addi a3, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, 
ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmul.vv v16, v8, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vfmul.vv v16, v16, v24, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB23_2: -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: addi a0, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v16, v16, v24 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll index 6193fdd38a642..4336b27eb134a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll @@ -18,11 +18,11 @@ define @vfsqrt_vv_nxv1bf16( %va, @llvm.vp.sqrt.nxv1bf16( %va, %m, i32 %evl) ret %v @@ -35,7 +35,7 @@ define @vfsqrt_vv_nxv1bf16_unmasked( ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfsqrt.v v9, v9 -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: ret %v = call @llvm.vp.sqrt.nxv1bf16( %va, splat (i1 true), i32 %evl) @@ -48,11 +48,11 @@ define @vfsqrt_vv_nxv2bf16( %va, @llvm.vp.sqrt.nxv2bf16( %va, %m, i32 %evl) ret %v @@ -65,7 +65,7 @@ define @vfsqrt_vv_nxv2bf16_unmasked( ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfsqrt.v v9, v9 -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: ret %v = call @llvm.vp.sqrt.nxv2bf16( %va, splat (i1 true), i32 %evl) @@ -78,11 +78,11 @@ define @vfsqrt_vv_nxv4bf16( %va, @llvm.vp.sqrt.nxv4bf16( %va, %m, i32 %evl) ret %v @@ -95,7 +95,7 @@ define @vfsqrt_vv_nxv4bf16_unmasked( ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfsqrt.v v10, v10 -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.sqrt.nxv4bf16( %va, splat (i1 true), i32 %evl) @@ -108,11 +108,11 @@ define @vfsqrt_vv_nxv8bf16( %va, @llvm.vp.sqrt.nxv8bf16( %va, %m, i32 %evl) ret 
%v @@ -125,7 +125,7 @@ define @vfsqrt_vv_nxv8bf16_unmasked( ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfsqrt.v v12, v12 -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 ; CHECK-NEXT: ret %v = call @llvm.vp.sqrt.nxv8bf16( %va, splat (i1 true), i32 %evl) @@ -138,11 +138,11 @@ define @vfsqrt_vv_nxv16bf16( %va, < ; CHECK-LABEL: vfsqrt_vv_nxv16bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfsqrt.v v16, v16, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t ; CHECK-NEXT: ret %v = call @llvm.vp.sqrt.nxv16bf16( %va, %m, i32 %evl) ret %v @@ -155,7 +155,7 @@ define @vfsqrt_vv_nxv16bf16_unmasked( @llvm.vp.sqrt.nxv16bf16( %va, splat (i1 true), i32 %evl) @@ -175,25 +175,25 @@ define @vfsqrt_vv_nxv32bf16( %va, < ; CHECK-NEXT: sub a3, a0, a1 ; CHECK-NEXT: sltu a4, a0, a3 ; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: and a3, a4, a3 ; CHECK-NEXT: vslidedown.vx v0, v0, a2 +; CHECK-NEXT: and a3, a4, a3 ; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfsqrt.v v24, v24, v0.t -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24, v0.t ; CHECK-NEXT: bltu a0, a1, .LBB10_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB10_2: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 ; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfsqrt.v v16, v24, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vfsqrt.v v24, v24, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24, v0.t ; CHECK-NEXT: ret %v = call @llvm.vp.sqrt.nxv32bf16( %va, %m, i32 %evl) ret %v @@ -210,15 +210,15 @@ define @vfsqrt_vv_nxv32bf16_unmasked( @vfsqrt_vv_nxv32bf16_unmasked( @llvm.vp.sqrt.nxv32bf16( %va, splat (i1 true), i32 %evl) @@ -245,11 +245,11 @@ define @vfsqrt_vv_nxv1f16( %va, @llvm.vp.sqrt.nxv1f16( %va, %m, i32 %evl) ret %v @@ -268,7 +268,7 @@ define @vfsqrt_vv_nxv1f16_unmasked( %va, ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfsqrt.v v9, v9 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.sqrt.nxv1f16( %va, splat (i1 true), i32 %evl) @@ -287,11 +287,11 @@ define @vfsqrt_vv_nxv2f16( %va, @llvm.vp.sqrt.nxv2f16( %va, %m, i32 %evl) ret %v @@ -310,7 +310,7 @@ define @vfsqrt_vv_nxv2f16_unmasked( %va, ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfsqrt.v v9, v9 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; 
ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.sqrt.nxv2f16( %va, splat (i1 true), i32 %evl) @@ -329,11 +329,11 @@ define @vfsqrt_vv_nxv4f16( %va, @llvm.vp.sqrt.nxv4f16( %va, %m, i32 %evl) ret %v @@ -352,7 +352,7 @@ define @vfsqrt_vv_nxv4f16_unmasked( %va, ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfsqrt.v v10, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.sqrt.nxv4f16( %va, splat (i1 true), i32 %evl) @@ -371,11 +371,11 @@ define @vfsqrt_vv_nxv8f16( %va, @llvm.vp.sqrt.nxv8f16( %va, %m, i32 %evl) ret %v @@ -394,7 +394,7 @@ define @vfsqrt_vv_nxv8f16_unmasked( %va, ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfsqrt.v v12, v12 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.sqrt.nxv8f16( %va, splat (i1 true), i32 %evl) @@ -413,11 +413,11 @@ define @vfsqrt_vv_nxv16f16( %va, @llvm.vp.sqrt.nxv16f16( %va, %m, i32 %evl) ret %v @@ -436,7 +436,7 @@ define @vfsqrt_vv_nxv16f16_unmasked( %v ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfsqrt.v v16, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.sqrt.nxv16f16( %va, splat (i1 true), i32 %evl) @@ -462,25 +462,25 @@ define @vfsqrt_vv_nxv32f16( %va, @llvm.vp.sqrt.nxv32f16( %va, %m, i32 %evl) ret %v @@ -503,15 +503,15 @@ define @vfsqrt_vv_nxv32f16_unmasked( %v ; ZVFHMIN-NEXT: sub a3, a0, a1 ; ZVFHMIN-NEXT: sltu a4, a0, a3 ; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vslidedown.vx v0, v16, a2 +; ZVFHMIN-NEXT: and a3, a4, a3 ; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfsqrt.v v16, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 @@ -520,7 +520,7 @@ define @vfsqrt_vv_nxv32f16_unmasked( %v ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfsqrt.v v16, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.sqrt.nxv32f16( %va, splat (i1 true), i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll index 08664fd1c4819..059408a1c9c3f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll @@ -18,12 +18,12 @@ define @vfsub_vv_nxv1bf16( %va, @llvm.vp.fsub.nxv1bf16( %va, %b, %m, i32 %evl) ret %v @@ -37,7 +37,7 @@ define @vfsub_vv_nxv1bf16_unmasked( % ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfsub.vv v9, v9, 
v10 -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: ret %v = call @llvm.vp.fsub.nxv1bf16( %va, %b, splat (i1 true), i32 %evl) @@ -50,12 +50,12 @@ define @vfsub_vf_nxv1bf16( %va, bfloa ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfsub.vv v9, v10, v8, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -73,7 +73,7 @@ define @vfsub_vf_nxv1bf16_unmasked( % ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfsub.vv v9, v10, v8 -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 @@ -88,12 +88,12 @@ define @vfsub_vv_nxv2bf16( %va, @llvm.vp.fsub.nxv2bf16( %va, %b, %m, i32 %evl) ret %v @@ -107,7 +107,7 @@ define @vfsub_vv_nxv2bf16_unmasked( % ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfsub.vv v9, v9, v10 -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: ret %v = call @llvm.vp.fsub.nxv2bf16( %va, %b, splat (i1 true), i32 %evl) @@ -120,12 +120,12 @@ define @vfsub_vf_nxv2bf16( %va, bfloa ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfsub.vv v9, v10, v8, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -143,7 +143,7 @@ define @vfsub_vf_nxv2bf16_unmasked( % ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfsub.vv v9, v10, v8 -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 @@ -158,12 +158,12 @@ define @vfsub_vv_nxv4bf16( %va, @llvm.vp.fsub.nxv4bf16( %va, %b, %m, i32 %evl) ret %v @@ -177,7 +177,7 @@ define @vfsub_vv_nxv4bf16_unmasked( % ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfsub.vv v10, v12, v10 -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.fsub.nxv4bf16( %va, %b, splat (i1 true), i32 %evl) @@ -190,12 +190,12 @@ define 
@vfsub_vf_nxv4bf16( %va, bfloa ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfsub.vv v10, v10, v12, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -213,7 +213,7 @@ define @vfsub_vf_nxv4bf16_unmasked( % ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfsub.vv v10, v10, v12 -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 @@ -228,12 +228,12 @@ define @vfsub_vv_nxv8bf16( %va, @llvm.vp.fsub.nxv8bf16( %va, %b, %m, i32 %evl) ret %v @@ -247,7 +247,7 @@ define @vfsub_vv_nxv8bf16_unmasked( % ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfsub.vv v12, v16, v12 -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 ; CHECK-NEXT: ret %v = call @llvm.vp.fsub.nxv8bf16( %va, %b, splat (i1 true), i32 %evl) @@ -260,12 +260,12 @@ define @vfsub_vf_nxv8bf16( %va, bfloa ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfsub.vv v12, v12, v16, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -283,7 +283,7 @@ define @vfsub_vf_nxv8bf16_unmasked( % ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfsub.vv v12, v12, v16 -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 @@ -298,12 +298,12 @@ define @vfsub_vv_nxv16bf16( %va, @llvm.vp.fsub.nxv16bf16( %va, %b, %m, i32 %evl) ret %v @@ -317,7 +317,7 @@ define @vfsub_vv_nxv16bf16_unmasked( @llvm.vp.fsub.nxv16bf16( %va, %b, splat (i1 true), i32 %evl) @@ -330,12 +330,12 @@ define @vfsub_vf_nxv16bf16( %va, bf ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vmv.v.x v12, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfsub.vv v16, v16, v24, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, 
m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -353,7 +353,7 @@ define @vfsub_vf_nxv16bf16_unmasked( poison, bfloat %b, i32 0 @@ -371,10 +371,18 @@ define @vfsub_vv_nxv32bf16( %va, @vfsub_vv_nxv32bf16( %va, @vfsub_vv_nxv32bf16_unmasked( @vfsub_vv_nxv32bf16_unmasked( @vfsub_vf_nxv32bf16( %va, bf ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a2, a1, 4 -; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; CHECK-NEXT: vmv8r.v v24, v8 +; CHECK-NEXT: vmv1r.v v7, v0 +; CHECK-NEXT: vmv8r.v v16, v8 ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: vmv.v.x v16, a1 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a3, a1, 3 -; CHECK-NEXT: add a1, a3, a1 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: slli a1, a2, 1 ; CHECK-NEXT: srli a2, a2, 2 ; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: csrr a4, vlenb -; CHECK-NEXT: slli a4, a4, 3 -; CHECK-NEXT: add a4, sp, a4 -; CHECK-NEXT: addi a4, a4, 16 -; CHECK-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v0, a2 ; CHECK-NEXT: sltu a2, a0, a3 ; CHECK-NEXT: addi a2, a2, -1 ; CHECK-NEXT: and a2, a2, a3 -; CHECK-NEXT: addi a3, sp, 16 +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: slli a3, a3, 3 +; CHECK-NEXT: add a3, sp, a3 +; CHECK-NEXT: addi a3, a3, 16 ; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t +; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a3, a2, 3 -; CHECK-NEXT: add a2, a3, a2 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfsub.vv v16, v8, v16, v0.t -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: vfsub.vv v24, v8, v24, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24, v0.t ; CHECK-NEXT: bltu a0, a1, .LBB22_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB22_2: -; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 3 -; CHECK-NEXT: add a0, a1, a0 -; CHECK-NEXT: 
add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl1r.v v0, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24, v0.t +; CHECK-NEXT: vmv8r.v v24, v16 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfsub.vv v16, v16, v24, v0.t -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vfsub.vv v24, v16, v24, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24, v0.t ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 4 -; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -563,19 +599,14 @@ define @vfsub_vf_nxv32bf16_unmasked( @vfsub_vf_nxv32bf16_unmasked( @vfsub_vv_nxv1f16( %va, @llvm.vp.fsub.nxv1f16( %va, %b, %m, i32 %evl) ret %v @@ -666,7 +686,7 @@ define @vfsub_vv_nxv1f16_unmasked( %va, < ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v9, v9, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.fsub.nxv1f16( %va, %b, splat (i1 true), i32 %evl) @@ -685,12 +705,12 @@ define @vfsub_vf_nxv1f16( %va, half %b, < ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v9, v10, v8, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -714,7 +734,7 @@ define @vfsub_vf_nxv1f16_unmasked( %va, h ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v9, v10, v8 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -735,12 +755,12 @@ define @vfsub_vv_nxv2f16( %va, @llvm.vp.fsub.nxv2f16( %va, %b, %m, i32 %evl) ret %v @@ -760,7 +780,7 @@ define @vfsub_vv_nxv2f16_unmasked( %va, < ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v9, v9, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.fsub.nxv2f16( %va, %b, splat (i1 
true), i32 %evl) @@ -779,12 +799,12 @@ define @vfsub_vf_nxv2f16( %va, half %b, < ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v9, v10, v8, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -808,7 +828,7 @@ define @vfsub_vf_nxv2f16_unmasked( %va, h ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v9, v10, v8 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -829,12 +849,12 @@ define @vfsub_vv_nxv4f16( %va, @llvm.vp.fsub.nxv4f16( %va, %b, %m, i32 %evl) ret %v @@ -854,7 +874,7 @@ define @vfsub_vv_nxv4f16_unmasked( %va, < ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v10, v12, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.fsub.nxv4f16( %va, %b, splat (i1 true), i32 %evl) @@ -873,12 +893,12 @@ define @vfsub_vf_nxv4f16( %va, half %b, < ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v10, v10, v12, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -902,7 +922,7 @@ define @vfsub_vf_nxv4f16_unmasked( %va, h ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v10, v10, v12 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -923,12 +943,12 @@ define @vfsub_vv_nxv8f16( %va, @llvm.vp.fsub.nxv8f16( %va, %b, %m, i32 %evl) ret %v @@ -948,7 +968,7 @@ define @vfsub_vv_nxv8f16_unmasked( %va, < ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v12, v16, v12 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.fsub.nxv8f16( %va, %b, splat (i1 true), i32 %evl) @@ -967,12 +987,12 @@ define @vfsub_vf_nxv8f16( %va, half %b, < ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: 
vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v12, v12, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -996,7 +1016,7 @@ define @vfsub_vf_nxv8f16_unmasked( %va, h ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v12, v12, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -1017,12 +1037,12 @@ define @vfsub_vv_nxv16f16( %va, @llvm.vp.fsub.nxv16f16( %va, %b, %m, i32 %evl) ret %v @@ -1042,7 +1062,7 @@ define @vfsub_vv_nxv16f16_unmasked( %va ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v16, v24, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.fsub.nxv16f16( %va, %b, splat (i1 true), i32 %evl) @@ -1061,12 +1081,12 @@ define @vfsub_vf_nxv16f16( %va, half %b ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v12, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v16, v16, v24, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1090,7 +1110,7 @@ define @vfsub_vf_nxv16f16_unmasked( %va ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v16, v16, v24 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -1114,10 +1134,18 @@ define @vfsub_vv_nxv32f16( %va, @vfsub_vv_nxv32f16( %va, @vfsub_vv_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: addi a3, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v16, v16, v24, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB45_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 @@ -1206,7 +1262,7 @@ define @vfsub_vv_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 ; ZVFHMIN-NEXT: 
vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v16, v24, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 @@ -1231,76 +1287,76 @@ define @vfsub_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a2, a1, 4 -; ZVFHMIN-NEXT: add a1, a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv8r.v v24, v8 +; ZVFHMIN-NEXT: vmv1r.v v7, v0 +; ZVFHMIN-NEXT: vmv8r.v v16, v8 ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: vmv.v.x v16, a1 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a3, a1, 3 -; ZVFHMIN-NEXT: add a1, a3, a1 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vmv.v.x v8, a1 ; ZVFHMIN-NEXT: slli a1, a2, 1 ; ZVFHMIN-NEXT: srli a2, a2, 2 ; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2 ; ZVFHMIN-NEXT: sltu a2, a0, a3 ; ZVFHMIN-NEXT: addi a2, a2, -1 ; ZVFHMIN-NEXT: and a2, a2, a3 -; ZVFHMIN-NEXT: addi a3, sp, 16 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 3 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: vmv4r.v v8, v16 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a3, a2, 3 -; ZVFHMIN-NEXT: add a2, a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsub.vv v16, v8, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vfsub.vv v24, v8, v24, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB46_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB46_2: -; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vmv1r.v v0, v7 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 4 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a1, a0, 3 -; 
ZVFHMIN-NEXT: add a0, a1, a0 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24, v0.t +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl1r.v v0, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24, v0.t +; ZVFHMIN-NEXT: vmv8r.v v24, v16 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsub.vv v16, v16, v24, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vfsub.vv v24, v16, v24, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24, v0.t ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a1, a0, 4 -; ZVFHMIN-NEXT: add a0, a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 @@ -1324,19 +1380,14 @@ define @vfsub_vf_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 4 +; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmset.m v24 ; ZVFHMIN-NEXT: vmv.v.x v16, a1 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: slli a1, a2, 1 ; ZVFHMIN-NEXT: srli a2, a2, 2 ; ZVFHMIN-NEXT: sub a3, a0, a1 @@ -1345,41 +1396,30 @@ define @vfsub_vf_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: sltu a2, a0, a3 ; ZVFHMIN-NEXT: addi a2, a2, -1 ; ZVFHMIN-NEXT: and a2, a2, a3 -; ZVFHMIN-NEXT: vmv8r.v v16, v8 ; ZVFHMIN-NEXT: addi a3, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsub.vv v16, v8, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vfsub.vv v16, v16, v24, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB47_2 ; ZVFHMIN-NEXT: # 
%bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB47_2: -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: addi a0, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v16, v16, v24 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 From 051612c0180e4e5a9ba750a994a91d2c1b05b00c Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak Date: Mon, 13 Jan 2025 15:48:03 -0800 Subject: [PATCH 349/408] [mlir][ValueBounds] memref.dim and tensor.dim are always positive (#122804) Add the constraint that the length of a memref or tensor dimension is always non-negative (at least 0) even if we don't know which dimension we're querying the length of. --- .../Dialect/MemRef/IR/ValueBoundsOpInterfaceImpl.cpp | 1 + .../Dialect/Tensor/IR/ValueBoundsOpInterfaceImpl.cpp | 1 + .../MemRef/value-bounds-op-interface-impl.mlir | 11 +++++++++++ .../Tensor/value-bounds-op-interface-impl.mlir | 11 +++++++++++ 4 files changed, 24 insertions(+) diff --git a/mlir/lib/Dialect/MemRef/IR/ValueBoundsOpInterfaceImpl.cpp b/mlir/lib/Dialect/MemRef/IR/ValueBoundsOpInterfaceImpl.cpp index daec22cf6ebdc..11400de35e430 100644 --- a/mlir/lib/Dialect/MemRef/IR/ValueBoundsOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/MemRef/IR/ValueBoundsOpInterfaceImpl.cpp @@ -51,6 +51,7 @@ struct DimOpInterface auto dimOp = cast(op); assert(value == dimOp.getResult() && "invalid value"); + cstr.bound(value) >= 0; auto constIndex = dimOp.getConstantIndex(); if (!constIndex.has_value()) return; diff --git a/mlir/lib/Dialect/Tensor/IR/ValueBoundsOpInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/IR/ValueBoundsOpInterfaceImpl.cpp index 06f2c16406d3c..5bb6259dd543d 100644 --- a/mlir/lib/Dialect/Tensor/IR/ValueBoundsOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Tensor/IR/ValueBoundsOpInterfaceImpl.cpp @@ -38,6 +38,7 @@ struct DimOpInterface auto dimOp = cast(op); assert(value == dimOp.getResult() && "invalid value"); + cstr.bound(value) >= 0; auto constIndex = dimOp.getConstantIndex(); if (!constIndex.has_value()) return; diff --git a/mlir/test/Dialect/MemRef/value-bounds-op-interface-impl.mlir b/mlir/test/Dialect/MemRef/value-bounds-op-interface-impl.mlir index dc311c6b59ea4..8bd7ae8df9049 100644 --- a/mlir/test/Dialect/MemRef/value-bounds-op-interface-impl.mlir +++ b/mlir/test/Dialect/MemRef/value-bounds-op-interface-impl.mlir @@ -52,6 +52,17 @@ func.func @memref_dim(%m: memref) -> index { // ----- +// CHECK-LABEL: func @memref_dim_all_positive( +func.func @memref_dim_all_positive(%m: memref, %x: index) { + %c0 = arith.constant 0 : index + %0 = memref.dim %m, %x : memref + // expected-remark @below{{true}} + "test.compare"(%0, %c0) {cmp = "GE"} : (index, index) -> () + return +} + +// ----- + // CHECK-LABEL: func @memref_get_global( // CHECK: %[[c4:.*]] = arith.constant 4 : index // CHECK: return %[[c4]] diff --git 
a/mlir/test/Dialect/Tensor/value-bounds-op-interface-impl.mlir b/mlir/test/Dialect/Tensor/value-bounds-op-interface-impl.mlir index c0f64d3c84361..6610d3180cf02 100644 --- a/mlir/test/Dialect/Tensor/value-bounds-op-interface-impl.mlir +++ b/mlir/test/Dialect/Tensor/value-bounds-op-interface-impl.mlir @@ -44,6 +44,17 @@ func.func @dim(%t: tensor) -> index { // ----- +// CHECK-LABEL: func @dim_all_positive( +func.func @dim_all_positive(%t: tensor, %x: index) { + %c0 = arith.constant 0 : index + %0 = tensor.dim %t, %x : tensor + // expected-remark @below{{true}} + "test.compare"(%0, %c0) {cmp = "GE" } : (index, index) -> () + return +} + +// ----- + // CHECK-LABEL: func @empty( // CHECK-SAME: %[[sz:.*]]: index // CHECK: %[[c6:.*]] = arith.constant 6 : index From 7d8b4eb0ead277f41ff69525ed807f9f6e227f37 Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Mon, 13 Jan 2025 16:15:47 -0800 Subject: [PATCH 350/408] [sanitizer][NFCI] Add Options parameter to LowerAllowCheckPass (#122765) This is glue code to convert LowerAllowCheckPass from a FUNCTION_PASS to FUNCTION_PASS_WITH_PARAMS. The parameters are currently unused. Future work will plumb `-fsanitize-skip-hot-cutoff` (introduced in https://github.com/llvm/llvm-project/pull/121619) to LowerAllowCheckOptions. --- clang/lib/CodeGen/BackendUtil.cpp | 9 +++++---- .../Instrumentation/LowerAllowCheckPass.h | 9 +++++++++ llvm/lib/Passes/PassBuilder.cpp | 15 +++++++++++++++ llvm/lib/Passes/PassRegistry.def | 5 ++++- 4 files changed, 33 insertions(+), 5 deletions(-) diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 79e6bf3d24dff..62c3696741a5a 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -795,11 +795,12 @@ static void addSanitizers(const Triple &TargetTriple, } if (LowerAllowCheckPass::IsRequested()) { + LowerAllowCheckPass::Options Opts; // We want to call it after inline, which is about OptimizerEarlyEPCallback. - PB.registerOptimizerEarlyEPCallback([](ModulePassManager &MPM, - OptimizationLevel Level, - ThinOrFullLTOPhase Phase) { - MPM.addPass(createModuleToFunctionPassAdaptor(LowerAllowCheckPass())); + PB.registerOptimizerEarlyEPCallback([&Opts](ModulePassManager &MPM, + OptimizationLevel Level, + ThinOrFullLTOPhase Phase) { + MPM.addPass(createModuleToFunctionPassAdaptor(LowerAllowCheckPass(Opts))); }); } } diff --git a/llvm/include/llvm/Transforms/Instrumentation/LowerAllowCheckPass.h b/llvm/include/llvm/Transforms/Instrumentation/LowerAllowCheckPass.h index af974818fec5f..edf9102151a42 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/LowerAllowCheckPass.h +++ b/llvm/include/llvm/Transforms/Instrumentation/LowerAllowCheckPass.h @@ -24,9 +24,18 @@ namespace llvm { // from the hot code. 
class LowerAllowCheckPass : public PassInfoMixin { public: + struct Options { + std::vector placeholder; // TODO: cutoffs + }; + + explicit LowerAllowCheckPass(LowerAllowCheckPass::Options Opts) + : Opts(Opts) {}; PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); static bool IsRequested(); + +private: + LowerAllowCheckPass::Options Opts; }; } // namespace llvm diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index f923d5aabe0a0..0a75153de4810 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -821,6 +821,21 @@ Expected parseEmbedBitcodePassOptions(StringRef Params) { return Result; } +Expected +parseLowerAllowCheckPassOptions(StringRef Params) { + LowerAllowCheckPass::Options Result; + while (!Params.empty()) { + StringRef ParamName; + std::tie(ParamName, Params) = Params.split(';'); + + return make_error( + formatv("invalid LowerAllowCheck pass parameter '{0}' ", ParamName) + .str(), + inconvertibleErrorCode()); + } + return Result; +} + Expected parseMSanPassOptions(StringRef Params) { MemorySanitizerOptions Result; while (!Params.empty()) { diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 1021d7fcd9247..284f15e4ad664 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -402,7 +402,6 @@ FUNCTION_PASS("loop-load-elim", LoopLoadEliminationPass()) FUNCTION_PASS("loop-simplify", LoopSimplifyPass()) FUNCTION_PASS("loop-sink", LoopSinkPass()) FUNCTION_PASS("loop-versioning", LoopVersioningPass()) -FUNCTION_PASS("lower-allow-check", LowerAllowCheckPass()) FUNCTION_PASS("lower-atomic", LowerAtomicPass()) FUNCTION_PASS("lower-constant-intrinsics", LowerConstantIntrinsicsPass()) FUNCTION_PASS("lower-expect", LowerExpectIntrinsicPass()) @@ -553,6 +552,10 @@ FUNCTION_PASS_WITH_PARAMS( parseLoopVectorizeOptions, "no-interleave-forced-only;interleave-forced-only;no-vectorize-forced-only;" "vectorize-forced-only") +FUNCTION_PASS_WITH_PARAMS( + "lower-allow-check", "LowerAllowCheckPass", + [](LowerAllowCheckPass::Options Opts) { return LowerAllowCheckPass(Opts); }, + parseLowerAllowCheckPassOptions, "") FUNCTION_PASS_WITH_PARAMS( "lower-matrix-intrinsics", "LowerMatrixIntrinsicsPass", [](bool Minimal) { return LowerMatrixIntrinsicsPass(Minimal); }, From 1515caf7a59dc20cb932b724b2ef5c1d1a593427 Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Mon, 13 Jan 2025 16:42:33 -0800 Subject: [PATCH 351/408] Revert "[sanitizer][NFCI] Add Options parameter to LowerAllowCheckPass" (#122833) Reverts llvm/llvm-project#122765 Reason: buildbot breakage (https://lab.llvm.org/buildbot/#/builders/46/builds/10393) ``` z:\b\llvm-clang-x86_64-sie-win\build\bin\clang.exe -cc1 -internal-isystem Z:\b\llvm-clang-x86_64-sie-win\build\lib\clang\20\include -nostdsysteminc -triple x86_64-pc-linux-gnu -emit-llvm -o - Z:\b\llvm-clang-x86_64-sie-win\llvm-project\clang\test\CodeGen\allow-ubsan-check-inline.c -fsanitize=signed-integer-overflow -mllvm -ubsan-guard-checks -O3 -mllvm -lower-allow-check-random-rate=1 -Rpass=lower-allow-check -Rpass-missed=lower-allow-check -fno-inline 2>&1 | z:\b\llvm-clang-x86_64-sie-win\build\bin\filecheck.exe Z:\b\llvm-clang-x86_64-sie-win\llvm-project\clang\test\CodeGen\allow-ubsan-check-inline.c --check-prefixes=NOINL --implicit-check-not="remark:" # executed command: 'z:\b\llvm-clang-x86_64-sie-win\build\bin\clang.exe' -cc1 -internal-isystem 'Z:\b\llvm-clang-x86_64-sie-win\build\lib\clang\20\include' -nostdsysteminc -triple x86_64-pc-linux-gnu 
-emit-llvm -o - 'Z:\b\llvm-clang-x86_64-sie-win\llvm-project\clang\test\CodeGen\allow-ubsan-check-inline.c' -fsanitize=signed-integer-overflow -mllvm -ubsan-guard-checks -O3 -mllvm -lower-allow-check-random-rate=1 -Rpass=lower-allow-check -Rpass-missed=lower-allow-check -fno-inline # note: command had no output on stdout or stderr # error: command failed with exit status: 0xc0000409 # executed command: 'z:\b\llvm-clang-x86_64-sie-win\build\bin\filecheck.exe' 'Z:\b\llvm-clang-x86_64-sie-win\llvm-project\clang\test\CodeGen\allow-ubsan-check-inline.c' --check-prefixes=NOINL --implicit-check-not=remark: # .---command stderr------------ # | FileCheck error: '' is empty. # | FileCheck command line: z:\b\llvm-clang-x86_64-sie-win\build\bin\filecheck.exe Z:\b\llvm-clang-x86_64-sie-win\llvm-project\clang\test\CodeGen\allow-ubsan-check-inline.c --check-prefixes=NOINL --implicit-check-not=remark: # `----------------------------- # error: command failed with exit status: 2 ``` --- clang/lib/CodeGen/BackendUtil.cpp | 9 ++++----- .../Instrumentation/LowerAllowCheckPass.h | 9 --------- llvm/lib/Passes/PassBuilder.cpp | 15 --------------- llvm/lib/Passes/PassRegistry.def | 5 +---- 4 files changed, 5 insertions(+), 33 deletions(-) diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 62c3696741a5a..79e6bf3d24dff 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -795,12 +795,11 @@ static void addSanitizers(const Triple &TargetTriple, } if (LowerAllowCheckPass::IsRequested()) { - LowerAllowCheckPass::Options Opts; // We want to call it after inline, which is about OptimizerEarlyEPCallback. - PB.registerOptimizerEarlyEPCallback([&Opts](ModulePassManager &MPM, - OptimizationLevel Level, - ThinOrFullLTOPhase Phase) { - MPM.addPass(createModuleToFunctionPassAdaptor(LowerAllowCheckPass(Opts))); + PB.registerOptimizerEarlyEPCallback([](ModulePassManager &MPM, + OptimizationLevel Level, + ThinOrFullLTOPhase Phase) { + MPM.addPass(createModuleToFunctionPassAdaptor(LowerAllowCheckPass())); }); } } diff --git a/llvm/include/llvm/Transforms/Instrumentation/LowerAllowCheckPass.h b/llvm/include/llvm/Transforms/Instrumentation/LowerAllowCheckPass.h index edf9102151a42..af974818fec5f 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/LowerAllowCheckPass.h +++ b/llvm/include/llvm/Transforms/Instrumentation/LowerAllowCheckPass.h @@ -24,18 +24,9 @@ namespace llvm { // from the hot code. 
class LowerAllowCheckPass : public PassInfoMixin { public: - struct Options { - std::vector placeholder; // TODO: cutoffs - }; - - explicit LowerAllowCheckPass(LowerAllowCheckPass::Options Opts) - : Opts(Opts) {}; PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); static bool IsRequested(); - -private: - LowerAllowCheckPass::Options Opts; }; } // namespace llvm diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 0a75153de4810..f923d5aabe0a0 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -821,21 +821,6 @@ Expected parseEmbedBitcodePassOptions(StringRef Params) { return Result; } -Expected -parseLowerAllowCheckPassOptions(StringRef Params) { - LowerAllowCheckPass::Options Result; - while (!Params.empty()) { - StringRef ParamName; - std::tie(ParamName, Params) = Params.split(';'); - - return make_error( - formatv("invalid LowerAllowCheck pass parameter '{0}' ", ParamName) - .str(), - inconvertibleErrorCode()); - } - return Result; -} - Expected parseMSanPassOptions(StringRef Params) { MemorySanitizerOptions Result; while (!Params.empty()) { diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 284f15e4ad664..1021d7fcd9247 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -402,6 +402,7 @@ FUNCTION_PASS("loop-load-elim", LoopLoadEliminationPass()) FUNCTION_PASS("loop-simplify", LoopSimplifyPass()) FUNCTION_PASS("loop-sink", LoopSinkPass()) FUNCTION_PASS("loop-versioning", LoopVersioningPass()) +FUNCTION_PASS("lower-allow-check", LowerAllowCheckPass()) FUNCTION_PASS("lower-atomic", LowerAtomicPass()) FUNCTION_PASS("lower-constant-intrinsics", LowerConstantIntrinsicsPass()) FUNCTION_PASS("lower-expect", LowerExpectIntrinsicPass()) @@ -552,10 +553,6 @@ FUNCTION_PASS_WITH_PARAMS( parseLoopVectorizeOptions, "no-interleave-forced-only;interleave-forced-only;no-vectorize-forced-only;" "vectorize-forced-only") -FUNCTION_PASS_WITH_PARAMS( - "lower-allow-check", "LowerAllowCheckPass", - [](LowerAllowCheckPass::Options Opts) { return LowerAllowCheckPass(Opts); }, - parseLowerAllowCheckPassOptions, "") FUNCTION_PASS_WITH_PARAMS( "lower-matrix-intrinsics", "LowerMatrixIntrinsicsPass", [](bool Minimal) { return LowerMatrixIntrinsicsPass(Minimal); }, From 2201164477982c2bd20fa2e925f567585c390805 Mon Sep 17 00:00:00 2001 From: Daniel Paoliello Date: Mon, 13 Jan 2025 16:53:42 -0800 Subject: [PATCH 352/408] [llvm] Win x64 Unwind V2 2/n: Support dumping UOP_Epilog (#110338) Adds support to objdump and readobj for reading the `UOP_Epilog` entries of Windows x64 unwind v2. `UOP_Epilog` has a weird format: The first `UOP_Epilog` in the unwind data is the "header": * The least-significant bit of `OpInfo` is the "At End" flag, which signifies that there is an epilog at the very end of the associated function. * `CodeOffset` is the length each epilog described by the current unwind information (all epilogs have the same length). Any subsequent `UOP_Epilog` represents another epilog for the current function, where `OpInfo` and `CodeOffset` are combined to a 12-bit value which is the offset of the beginning of the epilog from the end of the current function. If the offset is 0, then this entry is actually padding and can be ignored. 
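To make the encoding concrete, here is a minimal, self-contained C++ sketch
of the decoding rule above. The struct is a simplified stand-in for
`llvm::Win64EH::UnwindCode` (the real type wraps a union), and it walks the
same two epilog codes encoded in the test's `.xdata` (`01 16` and `0B 06`),
printing them the way `llvm-objdump --unwind-info` now does:

```
// Simplified stand-in for llvm::Win64EH::UnwindCode, just enough to show
// the UOP_Epilog decoding described above (opcode 6 in the unwind op enum).
#include <cstdint>
#include <cstdio>
#include <vector>

static const uint8_t UOP_Epilog = 6;

struct UnwindCode {
  uint8_t CodeOffset;        // header code: epilog length; others: low 8 offset bits
  uint8_t UnwindOpAndOpInfo; // opcode in the low nibble, OpInfo in the high nibble
  uint8_t getUnwindOp() const { return UnwindOpAndOpInfo & 0x0F; }
  uint8_t getOpInfo() const { return (UnwindOpAndOpInfo >> 4) & 0x0F; }
  // Non-header epilog codes: OpInfo and CodeOffset combine into a 12-bit
  // offset, measured back from the end of the function.
  uint32_t getEpilogOffset() const {
    return (uint32_t(getOpInfo()) << 8) | CodeOffset;
  }
};

int main() {
  // The two epilog codes from the test: a header (at end, length 1), then a
  // second epilog beginning 0xB bytes before the end of the function.
  std::vector<UnwindCode> Codes = {{0x01, 0x16}, {0x0B, 0x06}};
  bool SeenFirstEpilog = false;
  for (const UnwindCode &UC : Codes) {
    if (UC.getUnwindOp() != UOP_Epilog)
      continue;
    if (!SeenFirstEpilog) {
      SeenFirstEpilog = true;
      bool AtEnd = (UC.getOpInfo() & 0x1) != 0;
      std::printf("UOP_Epilog atend=%s, length=0x%X\n", AtEnd ? "yes" : "no",
                  unsigned(UC.CodeOffset));
    } else if (uint32_t Offset = UC.getEpilogOffset()) {
      std::printf("UOP_Epilog offset=0x%X\n", unsigned(Offset));
    } else {
      std::printf("UOP_Epilog padding\n"); // a zero offset is alignment padding
    }
  }
}
```

This prints `UOP_Epilog atend=yes, length=0x1` followed by
`UOP_Epilog offset=0xB`, matching the FileCheck expectations in the new test.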
--- llvm/include/llvm/Support/Win64EH.h | 5 + .../llvm-objdump/COFF/win64-unwindv2.yaml | 175 ++++++++++++++++++ .../COFF/unwind-x86_64-image.yaml | 71 ++++--- llvm/tools/llvm-objdump/COFFDump.cpp | 24 ++- llvm/tools/llvm-readobj/Win64EHDumper.cpp | 26 ++- llvm/tools/llvm-readobj/Win64EHDumper.h | 3 +- 6 files changed, 273 insertions(+), 31 deletions(-) create mode 100644 llvm/test/tools/llvm-objdump/COFF/win64-unwindv2.yaml diff --git a/llvm/include/llvm/Support/Win64EH.h b/llvm/include/llvm/Support/Win64EH.h index cf54f49286830..8a9be15373917 100644 --- a/llvm/include/llvm/Support/Win64EH.h +++ b/llvm/include/llvm/Support/Win64EH.h @@ -124,6 +124,11 @@ union UnwindCode { uint8_t getOpInfo() const { return (u.UnwindOpAndOpInfo >> 4) & 0x0F; } + /// Gets the offset for an UOP_Epilog unwind code. + uint32_t getEpilogOffset() const { + assert(getUnwindOp() == UOP_Epilog); + return (getOpInfo() << 8) | static_cast(u.CodeOffset); + } }; enum { diff --git a/llvm/test/tools/llvm-objdump/COFF/win64-unwindv2.yaml b/llvm/test/tools/llvm-objdump/COFF/win64-unwindv2.yaml new file mode 100644 index 0000000000000..20abcb959120d --- /dev/null +++ b/llvm/test/tools/llvm-objdump/COFF/win64-unwindv2.yaml @@ -0,0 +1,175 @@ +# RUN: yaml2obj %s -o %t.exe +# RUN: llvm-objdump --unwind-info %t.exe | FileCheck %s + +# CHECK-LABEL: Unwind info: +# CHECK-EMPTY: +# CHECK-NEXT: Function Table: +# CHECK-NEXT: Start Address: 0x1010 +# CHECK-NEXT: End Address: 0x1017 +# CHECK-NEXT: Unwind Info Address: 0x2000 +# CHECK-NEXT: Version: 2 +# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Size of prolog: 4 +# CHECK-NEXT: Number of Codes: 3 +# CHECK-NEXT: No frame pointer used +# CHECK-NEXT: Unwind Codes: +# CHECK-NEXT: 0x01: UOP_Epilog atend=yes, length=0x1 +# CHECK-NEXT: 0x0b: UOP_Epilog offset=0xB +# CHECK-NEXT: 0x04: UOP_AllocSmall 72 +# CHECK-EMPTY: +# CHECK-NEXT: Function Table: +# CHECK-NEXT: Start Address: 0x1020 +# CHECK-NEXT: End Address: 0x105c +# CHECK-NEXT: Unwind Info Address: 0x200c +# CHECK-NEXT: Version: 1 +# CHECK-NEXT: Flags: 3 UNW_ExceptionHandler UNW_TerminateHandler +# CHECK-NEXT: Size of prolog: 4 +# CHECK-NEXT: Number of Codes: 1 +# CHECK-NEXT: No frame pointer used +# CHECK-NEXT: Unwind Codes: +# CHECK-NEXT: 0x04: UOP_AllocSmall 56 + +--- !COFF +OptionalHeader: + AddressOfEntryPoint: 4128 + ImageBase: 5368709120 + SectionAlignment: 4096 + FileAlignment: 512 + MajorOperatingSystemVersion: 6 + MinorOperatingSystemVersion: 0 + MajorImageVersion: 0 + MinorImageVersion: 0 + MajorSubsystemVersion: 6 + MinorSubsystemVersion: 0 + Subsystem: IMAGE_SUBSYSTEM_WINDOWS_CUI + DLLCharacteristics: [ IMAGE_DLL_CHARACTERISTICS_HIGH_ENTROPY_VA, IMAGE_DLL_CHARACTERISTICS_DYNAMIC_BASE, IMAGE_DLL_CHARACTERISTICS_NX_COMPAT, IMAGE_DLL_CHARACTERISTICS_TERMINAL_SERVER_AWARE ] + SizeOfStackReserve: 1048576 + SizeOfStackCommit: 4096 + SizeOfHeapReserve: 1048576 + SizeOfHeapCommit: 4096 + ExportTable: + RelativeVirtualAddress: 0 + Size: 0 + ImportTable: + RelativeVirtualAddress: 0 + Size: 0 + ResourceTable: + RelativeVirtualAddress: 0 + Size: 0 + ExceptionTable: + RelativeVirtualAddress: 12288 + Size: 24 + CertificateTable: + RelativeVirtualAddress: 0 + Size: 0 + BaseRelocationTable: + RelativeVirtualAddress: 0 + Size: 0 + Debug: + RelativeVirtualAddress: 0 + Size: 0 + Architecture: + RelativeVirtualAddress: 0 + Size: 0 + GlobalPtr: + RelativeVirtualAddress: 0 + Size: 0 + TlsTable: + RelativeVirtualAddress: 0 + Size: 0 + LoadConfigTable: + RelativeVirtualAddress: 0 + Size: 0 + BoundImport: + RelativeVirtualAddress: 0 + Size: 0 + IAT: + 
RelativeVirtualAddress: 0 + Size: 0 + DelayImportDescriptor: + RelativeVirtualAddress: 0 + Size: 0 + ClrRuntimeHeader: + RelativeVirtualAddress: 0 + Size: 0 +header: + Machine: IMAGE_FILE_MACHINE_AMD64 + Characteristics: [ IMAGE_FILE_EXECUTABLE_IMAGE, IMAGE_FILE_LARGE_ADDRESS_AWARE ] +sections: + - Name: .text + Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ] + VirtualAddress: 4096 + VirtualSize: 8 + SectionData: 00000000 + - Name: .xdata + Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ] + VirtualAddress: 8192 + VirtualSize: 40 + SectionData: 0204030001160B0604820000190401000462000070100000FFFF010804051E0009330000 + - Name: .pdata + Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ] + VirtualAddress: 12288 + VirtualSize: 24 + SectionData: 101000001710000000200000201000005C1000000C200000 +symbols: + - Name: .text + Value: 0 + SectionNumber: 1 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_NULL + StorageClass: IMAGE_SYM_CLASS_STATIC + - Name: .xdata + Value: 0 + SectionNumber: 2 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_NULL + StorageClass: IMAGE_SYM_CLASS_STATIC + - Name: .pdata + Value: 0 + SectionNumber: 3 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_NULL + StorageClass: IMAGE_SYM_CLASS_STATIC + - Name: other + Value: 0 + SectionNumber: 1 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_FUNCTION + StorageClass: IMAGE_SYM_CLASS_EXTERNAL + - Name: _ZN4RAIID2Ev + Value: 16 + SectionNumber: 1 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_FUNCTION + StorageClass: IMAGE_SYM_CLASS_EXTERNAL + - Name: entry + Value: 32 + SectionNumber: 1 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_FUNCTION + StorageClass: IMAGE_SYM_CLASS_EXTERNAL + - Name: _ZN4RAIID1Ev + Value: 16 + SectionNumber: 1 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_NULL + StorageClass: IMAGE_SYM_CLASS_EXTERNAL + - Name: _Unwind_Resume + Value: 96 + SectionNumber: 1 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_FUNCTION + StorageClass: IMAGE_SYM_CLASS_EXTERNAL + - Name: __gxx_personality_seh0 + Value: 112 + SectionNumber: 1 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_FUNCTION + StorageClass: IMAGE_SYM_CLASS_EXTERNAL + - Name: GCC_except_table2 + Value: 20 + SectionNumber: 2 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_NULL + StorageClass: IMAGE_SYM_CLASS_STATIC +... 
diff --git a/llvm/test/tools/llvm-readobj/COFF/unwind-x86_64-image.yaml b/llvm/test/tools/llvm-readobj/COFF/unwind-x86_64-image.yaml index 5780cf7a0467b..b9d6822a77f4a 100644 --- a/llvm/test/tools/llvm-readobj/COFF/unwind-x86_64-image.yaml +++ b/llvm/test/tools/llvm-readobj/COFF/unwind-x86_64-image.yaml @@ -1,26 +1,47 @@ # RUN: yaml2obj %s -o %t.exe # RUN: llvm-readobj --unwind %t.exe | FileCheck %s -# CHECK: RuntimeFunction { -# CHECK: StartAddress: entry (0x140001020) -# CHECK-NEXT: EndAddress: (0x14000105C) -# CHECK-NEXT: UnwindInfoAddress: (0x140002008) -# CHECK-NEXT: UnwindInfo { -# CHECK-NEXT: Version: 1 -# CHECK-NEXT: Flags [ (0x3) -# CHECK-NEXT: ExceptionHandler (0x1) -# CHECK-NEXT: TerminateHandler (0x2) -# CHECK-NEXT: ] -# CHECK-NEXT: PrologSize: 4 -# CHECK-NEXT: FrameRegister: - -# CHECK-NEXT: FrameOffset: - -# CHECK-NEXT: UnwindCodeCount: 1 -# CHECK-NEXT: UnwindCodes [ -# CHECK-NEXT: 0x04: ALLOC_SMALL size=56 -# CHECK-NEXT: ] -# CHECK-NEXT: Handler: __gxx_personality_seh0 (0x140001070) +# CHECK-LABEL: UnwindInformation [ +# CHECK-NEXT: RuntimeFunction { +# CHECK-NEXT: StartAddress: _ZN4RAIID2Ev (0x140001010) +# CHECK-NEXT: EndAddress: (0x140001017) +# CHECK-NEXT: UnwindInfoAddress: .xdata (0x140002000) +# CHECK-NEXT: UnwindInfo { +# CHECK-NEXT: Version: 2 +# CHECK-NEXT: Flags [ (0x0) +# CHECK-NEXT: ] +# CHECK-NEXT: PrologSize: 4 +# CHECK-NEXT: FrameRegister: - +# CHECK-NEXT: FrameOffset: - +# CHECK-NEXT: UnwindCodeCount: 3 +# CHECK-NEXT: UnwindCodes [ +# CHECK-NEXT: 0x01: EPILOG atend=yes, length=0x1 +# CHECK-NEXT: 0x0B: EPILOG offset=0xB +# CHECK-NEXT: 0x04: ALLOC_SMALL size=72 +# CHECK-NEXT: ] +# CHECK-NEXT: } # CHECK-NEXT: } -# CHECK-NEXT: } +# CHECK-NEXT: RuntimeFunction { +# CHECK-NEXT: StartAddress: entry (0x140001020) +# CHECK-NEXT: EndAddress: (0x14000105C) +# CHECK-NEXT: UnwindInfoAddress: (0x14000200C) +# CHECK-NEXT: UnwindInfo { +# CHECK-NEXT: Version: 1 +# CHECK-NEXT: Flags [ (0x3) +# CHECK-NEXT: ExceptionHandler (0x1) +# CHECK-NEXT: TerminateHandler (0x2) +# CHECK-NEXT: ] +# CHECK-NEXT: PrologSize: 4 +# CHECK-NEXT: FrameRegister: - +# CHECK-NEXT: FrameOffset: - +# CHECK-NEXT: UnwindCodeCount: 1 +# CHECK-NEXT: UnwindCodes [ +# CHECK-NEXT: 0x04: ALLOC_SMALL size=56 +# CHECK-NEXT: ] +# CHECK-NEXT: Handler: __gxx_personality_seh0 (0x140001070) +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: ] --- !COFF OptionalHeader: @@ -92,18 +113,18 @@ sections: - Name: .text Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ] VirtualAddress: 4096 - VirtualSize: 113 - SectionData: C3662E0F1F8400000000000F1F4400005048890C2458C3660F1F8400000000004883EC38E8D7FFFFFFE900000000488D4C2430E8D8FFFFFF904883C438C3488944242889542424488D4C2430E8BFFFFFFF488B4C2428E805000000CC0F1F4000C3662E0F1F8400000000000F1F440000C3 - - Name: .rdata + VirtualSize: 8 + SectionData: 00000000 + - Name: .xdata Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ] VirtualAddress: 8192 - VirtualSize: 32 - SectionData: 0101010001020000190401000462000070100000FFFF010804051E0009330000 + VirtualSize: 40 + SectionData: 0204030001160B0604820000190401000462000070100000FFFF010804051E0009330000 - Name: .pdata Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ] VirtualAddress: 12288 VirtualSize: 24 - SectionData: 101000001710000000200000201000005C10000008200000 + SectionData: 101000001710000000200000201000005C1000000C200000 symbols: - Name: .text Value: 0 diff --git a/llvm/tools/llvm-objdump/COFFDump.cpp b/llvm/tools/llvm-objdump/COFFDump.cpp index 
71697fa01e627..b22c9a4127fc3 100644 --- a/llvm/tools/llvm-objdump/COFFDump.cpp +++ b/llvm/tools/llvm-objdump/COFFDump.cpp @@ -240,10 +240,10 @@ static unsigned getNumUsedSlots(const UnwindCode &UnwindCode) { case UOP_AllocSmall: case UOP_SetFPReg: case UOP_PushMachFrame: + case UOP_Epilog: return 1; case UOP_SaveNonVol: case UOP_SaveXMM128: - case UOP_Epilog: return 2; case UOP_SaveNonVolBig: case UOP_SaveXMM128Big: @@ -257,7 +257,7 @@ static unsigned getNumUsedSlots(const UnwindCode &UnwindCode) { // Prints one unwind code. Because an unwind code can occupy up to 3 slots in // the unwind codes array, this function requires that the correct number of // slots is provided. -static void printUnwindCode(ArrayRef UCs) { +static void printUnwindCode(ArrayRef UCs, bool &SeenFirstEpilog) { assert(UCs.size() >= getNumUsedSlots(UCs[0])); outs() << format(" 0x%02x: ", unsigned(UCs[0].u.CodeOffset)) << getUnwindCodeTypeName(UCs[0].getUnwindOp()); @@ -301,11 +301,29 @@ static void printUnwindCode(ArrayRef UCs) { outs() << " " << (UCs[0].getOpInfo() ? "w/o" : "w") << " error code"; break; + + case UOP_Epilog: + if (SeenFirstEpilog) { + uint32_t Offset = UCs[0].getEpilogOffset(); + if (Offset == 0) { + outs() << " padding"; + } else { + outs() << " offset=" << format("0x%X", Offset); + } + } else { + SeenFirstEpilog = true; + bool AtEnd = (UCs[0].getOpInfo() & 0x1) != 0; + uint32_t Length = UCs[0].u.CodeOffset; + outs() << " atend=" << (AtEnd ? "yes" : "no") + << ", length=" << format("0x%X", Length); + } + break; } outs() << "\n"; } static void printAllUnwindCodes(ArrayRef UCs) { + bool SeenFirstEpilog = false; for (const UnwindCode *I = UCs.begin(), *E = UCs.end(); I < E; ) { unsigned UsedSlots = getNumUsedSlots(*I); if (UsedSlots > UCs.size()) { @@ -316,7 +334,7 @@ static void printAllUnwindCodes(ArrayRef UCs) { << " remaining in buffer"; return ; } - printUnwindCode(ArrayRef(I, E)); + printUnwindCode(ArrayRef(I, E), SeenFirstEpilog); I += UsedSlots; } } diff --git a/llvm/tools/llvm-readobj/Win64EHDumper.cpp b/llvm/tools/llvm-readobj/Win64EHDumper.cpp index e4bd772191514..e17a035f1d371 100644 --- a/llvm/tools/llvm-readobj/Win64EHDumper.cpp +++ b/llvm/tools/llvm-readobj/Win64EHDumper.cpp @@ -65,6 +65,8 @@ static StringRef getUnwindCodeTypeName(uint8_t Code) { case UOP_SaveXMM128: return "SAVE_XMM128"; case UOP_SaveXMM128Big: return "SAVE_XMM128_FAR"; case UOP_PushMachFrame: return "PUSH_MACHFRAME"; + case UOP_Epilog: + return "EPILOG"; } } @@ -99,6 +101,7 @@ static unsigned getNumUsedSlots(const UnwindCode &UnwindCode) { case UOP_AllocSmall: case UOP_SetFPReg: case UOP_PushMachFrame: + case UOP_Epilog: return 1; case UOP_SaveNonVol: case UOP_SaveXMM128: @@ -254,7 +257,8 @@ void Dumper::printRuntimeFunctionEntry(const Context &Ctx, // Prints one unwind code. Because an unwind code can occupy up to 3 slots in // the unwind codes array, this function requires that the correct number of // slots is provided. -void Dumper::printUnwindCode(const UnwindInfo& UI, ArrayRef UC) { +void Dumper::printUnwindCode(const UnwindInfo &UI, ArrayRef UC, + bool &SeenFirstEpilog) { assert(UC.size() >= getNumUsedSlots(UC[0])); SW.startLine() << format("0x%02X: ", unsigned(UC[0].u.CodeOffset)) @@ -306,6 +310,23 @@ void Dumper::printUnwindCode(const UnwindInfo& UI, ArrayRef UC) { case UOP_PushMachFrame: OS << " errcode=" << (UC[0].getOpInfo() == 0 ? 
"no" : "yes"); break; + + case UOP_Epilog: + if (SeenFirstEpilog) { + uint32_t Offset = UC[0].getEpilogOffset(); + if (Offset == 0) { + OS << " padding"; + } else { + OS << " offset=" << format("0x%X", Offset); + } + } else { + SeenFirstEpilog = true; + bool AtEnd = (UC[0].getOpInfo() & 0x1) != 0; + uint32_t Length = UC[0].u.CodeOffset; + OS << " atend=" << (AtEnd ? "yes" : "no") + << ", length=" << format("0x%X", Length); + } + break; } OS << "\n"; @@ -330,6 +351,7 @@ void Dumper::printUnwindInfo(const Context &Ctx, const coff_section *Section, { ListScope UCS(SW, "UnwindCodes"); ArrayRef UC(&UI.UnwindCodes[0], UI.NumCodes); + bool SeenFirstEpilog = false; for (const UnwindCode *UCI = UC.begin(), *UCE = UC.end(); UCI < UCE; ++UCI) { unsigned UsedSlots = getNumUsedSlots(*UCI); if (UsedSlots > UC.size()) { @@ -337,7 +359,7 @@ void Dumper::printUnwindInfo(const Context &Ctx, const coff_section *Section, return; } - printUnwindCode(UI, ArrayRef(UCI, UCE)); + printUnwindCode(UI, ArrayRef(UCI, UCE), SeenFirstEpilog); UCI = UCI + UsedSlots - 1; } } diff --git a/llvm/tools/llvm-readobj/Win64EHDumper.h b/llvm/tools/llvm-readobj/Win64EHDumper.h index 97458c916bec6..a23d30be7a113 100644 --- a/llvm/tools/llvm-readobj/Win64EHDumper.h +++ b/llvm/tools/llvm-readobj/Win64EHDumper.h @@ -44,7 +44,8 @@ class Dumper { const object::coff_section *Section, uint64_t SectionOffset, const RuntimeFunction &RF); - void printUnwindCode(const UnwindInfo& UI, ArrayRef UC); + void printUnwindCode(const UnwindInfo &UI, ArrayRef UC, + bool &SeenFirstEpilog); void printUnwindInfo(const Context &Ctx, const object::coff_section *Section, off_t Offset, const UnwindInfo &UI); void printRuntimeFunction(const Context &Ctx, From 1908c41259dbd43567bb8fd32ee69862411305ef Mon Sep 17 00:00:00 2001 From: GeorgeHuyubo <113479859+GeorgeHuyubo@users.noreply.github.com> Date: Mon, 13 Jan 2025 17:07:39 -0800 Subject: [PATCH 353/408] Revert "[lldb] Switch debuginfod cache to use lldb index cache settings" (#122816) This reverts commit 7b808e73aa0193c8a42eae8f2420a803f424bee1. Previous commit which change default debuginfod cache path and pruning policy settings is problematic. It broke multiple tests across lldb and llvm. Reverting for now. Co-authored-by: George Hu --- .../Debuginfod/SymbolLocatorDebuginfod.cpp | 17 ++++------- llvm/include/llvm/Debuginfod/Debuginfod.h | 4 +-- llvm/lib/Debuginfod/Debuginfod.cpp | 29 ++++++------------- llvm/unittests/Debuginfod/DebuginfodTests.cpp | 3 +- 4 files changed, 17 insertions(+), 36 deletions(-) diff --git a/lldb/source/Plugins/SymbolLocator/Debuginfod/SymbolLocatorDebuginfod.cpp b/lldb/source/Plugins/SymbolLocator/Debuginfod/SymbolLocatorDebuginfod.cpp index 905f4d783ac95..f9aa6b1a98765 100644 --- a/lldb/source/Plugins/SymbolLocator/Debuginfod/SymbolLocatorDebuginfod.cpp +++ b/lldb/source/Plugins/SymbolLocator/Debuginfod/SymbolLocatorDebuginfod.cpp @@ -8,7 +8,6 @@ #include "SymbolLocatorDebuginfod.h" -#include "lldb/Core/DataFileCache.h" #include "lldb/Core/PluginManager.h" #include "lldb/Interpreter/OptionValueString.h" #include "lldb/Utility/Args.h" @@ -173,14 +172,11 @@ GetFileForModule(const ModuleSpec &module_spec, // Grab LLDB's Debuginfod overrides from the // plugin.symbol-locator.debuginfod.* settings. PluginProperties &plugin_props = GetGlobalPluginProperties(); - // Grab the lldb index cache settings from the global module list properties. 
- ModuleListProperties &properties = - ModuleList::GetGlobalModuleListProperties(); - std::string cache_path = properties.GetLLDBIndexCachePath().GetPath(); - - llvm::CachePruningPolicy pruning_policy = - DataFileCache::GetLLDBIndexCachePolicy(); - + llvm::Expected cache_path_or_err = plugin_props.GetCachePath(); + // A cache location is *required*. + if (!cache_path_or_err) + return {}; + std::string cache_path = *cache_path_or_err; llvm::SmallVector debuginfod_urls = llvm::getDefaultDebuginfodUrls(); std::chrono::milliseconds timeout = plugin_props.GetTimeout(); @@ -193,8 +189,7 @@ GetFileForModule(const ModuleSpec &module_spec, if (!file_name.empty()) cache_file_name += "-" + file_name.str(); llvm::Expected result = llvm::getCachedOrDownloadArtifact( - cache_file_name, url_path, cache_path, debuginfod_urls, timeout, - pruning_policy); + cache_file_name, url_path, cache_path, debuginfod_urls, timeout); if (result) return FileSpec(*result); diff --git a/llvm/include/llvm/Debuginfod/Debuginfod.h b/llvm/include/llvm/Debuginfod/Debuginfod.h index aebcf31cd4822..99fe15ad85979 100644 --- a/llvm/include/llvm/Debuginfod/Debuginfod.h +++ b/llvm/include/llvm/Debuginfod/Debuginfod.h @@ -25,7 +25,6 @@ #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/Object/BuildID.h" -#include "llvm/Support/CachePruning.h" #include "llvm/Support/Error.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Mutex.h" @@ -96,8 +95,7 @@ Expected getCachedOrDownloadArtifact(StringRef UniqueKey, /// found, uses the UniqueKey for the local cache file. Expected getCachedOrDownloadArtifact( StringRef UniqueKey, StringRef UrlPath, StringRef CacheDirectoryPath, - ArrayRef DebuginfodUrls, std::chrono::milliseconds Timeout, - llvm::CachePruningPolicy policy); + ArrayRef DebuginfodUrls, std::chrono::milliseconds Timeout); class ThreadPoolInterface; diff --git a/llvm/lib/Debuginfod/Debuginfod.cpp b/llvm/lib/Debuginfod/Debuginfod.cpp index 17efaea892c66..4c785117ae8ef 100644 --- a/llvm/lib/Debuginfod/Debuginfod.cpp +++ b/llvm/lib/Debuginfod/Debuginfod.cpp @@ -106,14 +106,6 @@ Expected getDefaultDebuginfodCacheDirectory() { return std::string(CacheDirectory); } -Expected getDefaultDebuginfodCachePruningPolicy() { - Expected PruningPolicyOrErr = - parseCachePruningPolicy(std::getenv("DEBUGINFOD_CACHE_POLICY")); - if (!PruningPolicyOrErr) - return PruningPolicyOrErr.takeError(); - return *PruningPolicyOrErr; -} - std::chrono::milliseconds getDefaultDebuginfodTimeout() { long Timeout; const char *DebuginfodTimeoutEnv = std::getenv("DEBUGINFOD_TIMEOUT"); @@ -177,15 +169,9 @@ Expected getCachedOrDownloadArtifact(StringRef UniqueKey, return CacheDirOrErr.takeError(); CacheDir = *CacheDirOrErr; - Expected PruningPolicyOrErr = - getDefaultDebuginfodCachePruningPolicy(); - if (!PruningPolicyOrErr) - return PruningPolicyOrErr.takeError(); - llvm::CachePruningPolicy PruningPolicy = *PruningPolicyOrErr; - - return getCachedOrDownloadArtifact( - UniqueKey, UrlPath, CacheDir, getDefaultDebuginfodUrls(), - getDefaultDebuginfodTimeout(), PruningPolicy); + return getCachedOrDownloadArtifact(UniqueKey, UrlPath, CacheDir, + getDefaultDebuginfodUrls(), + getDefaultDebuginfodTimeout()); } namespace { @@ -264,8 +250,7 @@ static SmallVector getHeaders() { Expected getCachedOrDownloadArtifact( StringRef UniqueKey, StringRef UrlPath, StringRef CacheDirectoryPath, - ArrayRef DebuginfodUrls, std::chrono::milliseconds Timeout, - llvm::CachePruningPolicy policy) { + ArrayRef DebuginfodUrls, std::chrono::milliseconds 
Timeout) { SmallString<64> AbsCachedArtifactPath; sys::path::append(AbsCachedArtifactPath, CacheDirectoryPath, "llvmcache-" + UniqueKey); @@ -319,7 +304,11 @@ Expected getCachedOrDownloadArtifact( continue; } - pruneCache(CacheDirectoryPath, policy); + Expected PruningPolicyOrErr = + parseCachePruningPolicy(std::getenv("DEBUGINFOD_CACHE_POLICY")); + if (!PruningPolicyOrErr) + return PruningPolicyOrErr.takeError(); + pruneCache(CacheDirectoryPath, *PruningPolicyOrErr); // Return the path to the artifact on disk. return std::string(AbsCachedArtifactPath); diff --git a/llvm/unittests/Debuginfod/DebuginfodTests.cpp b/llvm/unittests/Debuginfod/DebuginfodTests.cpp index 8dacf2ae5b3f8..5312912599e93 100644 --- a/llvm/unittests/Debuginfod/DebuginfodTests.cpp +++ b/llvm/unittests/Debuginfod/DebuginfodTests.cpp @@ -37,7 +37,6 @@ TEST(DebuginfodClient, CacheHit) { sys::fs::createTemporaryFile("llvmcache-key", "temp", FD, CachedFilePath); StringRef CacheDir = sys::path::parent_path(CachedFilePath); StringRef UniqueKey = sys::path::filename(CachedFilePath); - llvm::CachePruningPolicy policy; EXPECT_TRUE(UniqueKey.consume_front("llvmcache-")); raw_fd_ostream OF(FD, true, /*unbuffered=*/true); OF << "contents\n"; @@ -45,7 +44,7 @@ TEST(DebuginfodClient, CacheHit) { OF.close(); Expected PathOrErr = getCachedOrDownloadArtifact( UniqueKey, /*UrlPath=*/"/null", CacheDir, - /*DebuginfodUrls=*/{}, /*Timeout=*/std::chrono::milliseconds(1), policy); + /*DebuginfodUrls=*/{}, /*Timeout=*/std::chrono::milliseconds(1)); EXPECT_THAT_EXPECTED(PathOrErr, HasValue(CachedFilePath)); } From 5ea1c873647c02c80556594b9738de6768d98bf1 Mon Sep 17 00:00:00 2001 From: wanglei Date: Tue, 14 Jan 2025 09:16:11 +0800 Subject: [PATCH 354/408] [LLDB][LoongArch] Add LSX and LASX register definitions and operations With this patch, vector registers can be read and written when debugging a live process. Note: We currently assume that all LoongArch64 processors include the LSX and LASX extensions. 
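Under the hood this is a plain Linux regset transfer. Below is a minimal,
illustrative sketch (a hypothetical helper, not code from this patch) of how
the LSX state is fetched from a stopped, ptrace-attached thread using the
`NT_LOONGARCH_LSX` note type (0xa02) defined here; LASX works the same way
with `NT_LOONGARCH_LASX` (0xa03) and a 32 x 256-bit buffer:

```
// Hypothetical helper: fetch the LSX vector state of a traced thread.
// Mirrors the iovec/regset mechanism used by ReadLSX() in this patch.
#include <cstdint>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/uio.h>

#ifndef NT_LOONGARCH_LSX
#define NT_LOONGARCH_LSX 0xa02 /* LoongArch SIMD eXtension registers */
#endif

struct LSXState {
  uint64_t vr[32 * 2]; // 32 vector registers, 128 bits each
};

// `tid` must identify a thread already stopped under ptrace; returns true
// on success, false (with errno set by ptrace) on failure.
bool readLSX(pid_t tid, LSXState &out) {
  struct iovec iov;
  iov.iov_base = &out;
  iov.iov_len = sizeof(out);
  return ptrace(PTRACE_GETREGSET, tid, NT_LOONGARCH_LSX, &iov) == 0;
}
```

The write path is symmetric with PTRACE_SETREGSET, which is why the
WriteLSX()/WriteLASX() implementations below read the current state first
and then push the whole buffer back.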
To add test cases, the following modifications were also made: lldb/packages/Python/lldbsuite/test/lldbtest.py lldb/packages/Python/lldbsuite/test/make/Makefile.rules Reviewed By: DavidSpickett, SixWeining Pull Request: https://github.com/llvm/llvm-project/pull/120664 --- .../Python/lldbsuite/test/lldbtest.py | 11 ++ .../Python/lldbsuite/test/make/Makefile.rules | 4 + ...NativeRegisterContextLinux_loongarch64.cpp | 176 +++++++++++++++++- .../NativeRegisterContextLinux_loongarch64.h | 21 ++- .../RegisterContextPOSIX_loongarch64.cpp | 10 + .../RegisterContextPOSIX_loongarch64.h | 8 + .../Utility/RegisterInfoPOSIX_loongarch64.cpp | 63 ++++++- .../Utility/RegisterInfoPOSIX_loongarch64.h | 12 ++ .../Utility/RegisterInfos_loongarch64.h | 89 +++++++++ .../Utility/lldb-loongarch-register-enums.h | 70 +++++++ .../Utility/LoongArch_DWARF_Registers.h | 66 +++++++ .../linux/loongarch64/simd_registers/Makefile | 3 + .../TestLoongArch64LinuxSIMDRegisters.py | 93 +++++++++ .../linux/loongarch64/simd_registers/main.c | 108 +++++++++++ 14 files changed, 728 insertions(+), 6 deletions(-) create mode 100644 lldb/test/API/linux/loongarch64/simd_registers/Makefile create mode 100644 lldb/test/API/linux/loongarch64/simd_registers/TestLoongArch64LinuxSIMDRegisters.py create mode 100644 lldb/test/API/linux/loongarch64/simd_registers/main.c diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py index 1338d16a9171e..6ff8e17030b10 100644 --- a/lldb/packages/Python/lldbsuite/test/lldbtest.py +++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py @@ -1379,6 +1379,17 @@ def isAArch64Windows(self): return arch in ["aarch64", "arm64", "arm64e"] return False + def isLoongArch(self): + """Returns true if the architecture is LoongArch.""" + arch = self.getArchitecture().lower() + return arch in ["loongarch64", "loongarch32"] + + def isLoongArchLSX(self): + return self.isLoongArch() and "lsx" in self.getCPUInfo() + + def isLoongArchLASX(self): + return self.isLoongArch() and "lasx" in self.getCPUInfo() + def getArchitecture(self): """Returns the architecture in effect the test suite is running with.""" return lldbplatformutil.getArchitecture() diff --git a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules index d0045ac9f91a7..2da6ff226b615 100644 --- a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules +++ b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules @@ -210,6 +210,10 @@ else ifeq "$(findstring mips,$(ARCH))" "mips" override ARCHFLAG := - endif + ifeq "$(findstring loongarch,$(ARCH))" "loongarch" + override ARCH := + override ARCHFLAG := + endif ifeq "$(SPLIT_DEBUG_SYMBOLS)" "YES" DSYM = $(EXE).debug diff --git a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_loongarch64.cpp b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_loongarch64.cpp index 9ffc8ada920cb..b04018ee243fd 100644 --- a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_loongarch64.cpp +++ b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_loongarch64.cpp @@ -27,7 +27,17 @@ // struct iovec definition #include -#define REG_CONTEXT_SIZE (GetGPRSize() + GetFPRSize()) +#ifndef NT_LOONGARCH_LSX +#define NT_LOONGARCH_LSX 0xa02 /* LoongArch SIMD eXtension registers */ +#endif + +#ifndef NT_LOONGARCH_LASX +#define NT_LOONGARCH_LASX \ + 0xa03 /* LoongArch Advanced SIMD eXtension registers */ +#endif + +#define REG_CONTEXT_SIZE \ + (GetGPRSize() + GetFPRSize() + sizeof(m_lsx) + 
sizeof(m_lasx)) using namespace lldb; using namespace lldb_private; @@ -62,6 +72,8 @@ NativeRegisterContextLinux_loongarch64::NativeRegisterContextLinux_loongarch64( NativeRegisterContextLinux(native_thread) { ::memset(&m_fpr, 0, sizeof(m_fpr)); ::memset(&m_gpr, 0, sizeof(m_gpr)); + ::memset(&m_lsx, 0, sizeof(m_lsx)); + ::memset(&m_lasx, 0, sizeof(m_lasx)); ::memset(&m_hwp_regs, 0, sizeof(m_hwp_regs)); ::memset(&m_hbp_regs, 0, sizeof(m_hbp_regs)); @@ -75,6 +87,8 @@ NativeRegisterContextLinux_loongarch64::NativeRegisterContextLinux_loongarch64( m_gpr_is_valid = false; m_fpu_is_valid = false; + m_lsx_is_valid = false; + m_lasx_is_valid = false; } const RegisterInfoPOSIX_loongarch64 & @@ -135,6 +149,22 @@ Status NativeRegisterContextLinux_loongarch64::ReadRegister( offset = CalculateFprOffset(reg_info); assert(offset < GetFPRSize()); src = (uint8_t *)GetFPRBuffer() + offset; + } else if (IsLSX(reg)) { + error = ReadLSX(); + if (error.Fail()) + return error; + + offset = CalculateLsxOffset(reg_info); + assert(offset < sizeof(m_lsx)); + src = (uint8_t *)&m_lsx + offset; + } else if (IsLASX(reg)) { + error = ReadLASX(); + if (error.Fail()) + return error; + + offset = CalculateLasxOffset(reg_info); + assert(offset < sizeof(m_lasx)); + src = (uint8_t *)&m_lasx + offset; } else return Status::FromErrorString( "failed - register wasn't recognized to be a GPR or an FPR, " @@ -184,6 +214,28 @@ Status NativeRegisterContextLinux_loongarch64::WriteRegister( ::memcpy(dst, reg_value.GetBytes(), reg_info->byte_size); return WriteFPR(); + } else if (IsLSX(reg)) { + error = ReadLSX(); + if (error.Fail()) + return error; + + offset = CalculateLsxOffset(reg_info); + assert(offset < sizeof(m_lsx)); + dst = (uint8_t *)&m_lsx + offset; + ::memcpy(dst, reg_value.GetBytes(), reg_info->byte_size); + + return WriteLSX(); + } else if (IsLASX(reg)) { + error = ReadLASX(); + if (error.Fail()) + return error; + + offset = CalculateLasxOffset(reg_info); + assert(offset < sizeof(m_lasx)); + dst = (uint8_t *)&m_lasx + offset; + ::memcpy(dst, reg_value.GetBytes(), reg_info->byte_size); + + return WriteLASX(); } return Status::FromErrorString("Failed to write register value"); @@ -203,10 +255,22 @@ Status NativeRegisterContextLinux_loongarch64::ReadAllRegisterValues( if (error.Fail()) return error; + error = ReadLSX(); + if (error.Fail()) + return error; + + error = ReadLASX(); + if (error.Fail()) + return error; + uint8_t *dst = data_sp->GetBytes(); ::memcpy(dst, GetGPRBuffer(), GetGPRSize()); dst += GetGPRSize(); ::memcpy(dst, GetFPRBuffer(), GetFPRSize()); + dst += GetFPRSize(); + ::memcpy(dst, &m_lsx, sizeof(m_lsx)); + dst += sizeof(m_lsx); + ::memcpy(dst, &m_lasx, sizeof(m_lasx)); return error; } @@ -247,11 +311,27 @@ Status NativeRegisterContextLinux_loongarch64::WriteAllRegisterValues( src += GetRegisterInfoInterface().GetGPRSize(); ::memcpy(GetFPRBuffer(), src, GetFPRSize()); - + m_fpu_is_valid = true; error = WriteFPR(); if (error.Fail()) return error; + // Currently, we assume that LoongArch always support LASX. + // TODO: check whether LSX/LASX exists. 
+ src += GetFPRSize(); + ::memcpy(&m_lsx, src, sizeof(m_lsx)); + m_lsx_is_valid = true; + error = WriteLSX(); + if (error.Fail()) + return error; + + src += sizeof(m_lsx); + ::memcpy(&m_lasx, src, sizeof(m_lasx)); + m_lasx_is_valid = true; + error = WriteLASX(); + if (error.Fail()) + return error; + return error; } @@ -265,6 +345,16 @@ bool NativeRegisterContextLinux_loongarch64::IsFPR(unsigned reg) const { RegisterInfoPOSIX_loongarch64::FPRegSet; } +bool NativeRegisterContextLinux_loongarch64::IsLSX(unsigned reg) const { + return GetRegisterInfo().GetRegisterSetFromRegisterIndex(reg) == + RegisterInfoPOSIX_loongarch64::LSXRegSet; +} + +bool NativeRegisterContextLinux_loongarch64::IsLASX(unsigned reg) const { + return GetRegisterInfo().GetRegisterSetFromRegisterIndex(reg) == + RegisterInfoPOSIX_loongarch64::LASXRegSet; +} + Status NativeRegisterContextLinux_loongarch64::ReadGPR() { Status error; @@ -325,13 +415,85 @@ Status NativeRegisterContextLinux_loongarch64::WriteFPR() { ioVec.iov_len = GetFPRSize(); m_fpu_is_valid = false; + m_lsx_is_valid = false; + m_lasx_is_valid = false; return WriteRegisterSet(&ioVec, GetFPRSize(), NT_FPREGSET); } +Status NativeRegisterContextLinux_loongarch64::ReadLSX() { + Status error; + + if (m_lsx_is_valid) + return error; + + struct iovec ioVec; + ioVec.iov_base = &m_lsx; + ioVec.iov_len = sizeof(m_lsx); + + error = ReadRegisterSet(&ioVec, sizeof(m_lsx), NT_LOONGARCH_LSX); + + if (error.Success()) + m_lsx_is_valid = true; + + return error; +} + +Status NativeRegisterContextLinux_loongarch64::WriteLSX() { + Status error = ReadLSX(); + if (error.Fail()) + return error; + + struct iovec ioVec; + ioVec.iov_base = &m_lsx; + ioVec.iov_len = sizeof(m_lsx); + + m_fpu_is_valid = false; + m_lsx_is_valid = false; + m_lasx_is_valid = false; + + return WriteRegisterSet(&ioVec, sizeof(m_lsx), NT_LOONGARCH_LSX); +} + +Status NativeRegisterContextLinux_loongarch64::ReadLASX() { + Status error; + + if (m_lasx_is_valid) + return error; + + struct iovec ioVec; + ioVec.iov_base = &m_lasx; + ioVec.iov_len = sizeof(m_lasx); + + error = ReadRegisterSet(&ioVec, sizeof(m_lasx), NT_LOONGARCH_LASX); + + if (error.Success()) + m_lasx_is_valid = true; + + return error; +} + +Status NativeRegisterContextLinux_loongarch64::WriteLASX() { + Status error = ReadLASX(); + if (error.Fail()) + return error; + + struct iovec ioVec; + ioVec.iov_base = &m_lasx; + ioVec.iov_len = sizeof(m_lasx); + + m_fpu_is_valid = false; + m_lsx_is_valid = false; + m_lasx_is_valid = false; + + return WriteRegisterSet(&ioVec, sizeof(m_lasx), NT_LOONGARCH_LASX); +} + void NativeRegisterContextLinux_loongarch64::InvalidateAllRegisters() { m_gpr_is_valid = false; m_fpu_is_valid = false; + m_lsx_is_valid = false; + m_lasx_is_valid = false; } uint32_t NativeRegisterContextLinux_loongarch64::CalculateFprOffset( @@ -339,6 +501,16 @@ uint32_t NativeRegisterContextLinux_loongarch64::CalculateFprOffset( return reg_info->byte_offset - GetGPRSize(); } +uint32_t NativeRegisterContextLinux_loongarch64::CalculateLsxOffset( + const RegisterInfo *reg_info) const { + return reg_info->byte_offset - GetGPRSize() - sizeof(m_fpr); +} + +uint32_t NativeRegisterContextLinux_loongarch64::CalculateLasxOffset( + const RegisterInfo *reg_info) const { + return reg_info->byte_offset - GetGPRSize() - sizeof(m_fpr) - sizeof(m_lsx); +} + std::vector NativeRegisterContextLinux_loongarch64::GetExpeditedRegisters( ExpeditedRegs expType) const { diff --git a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_loongarch64.h 
b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_loongarch64.h index 633b26fa970de..2b2bb7d29d82f 100644 --- a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_loongarch64.h +++ b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_loongarch64.h @@ -62,6 +62,14 @@ class NativeRegisterContextLinux_loongarch64 Status WriteFPR() override; + Status ReadLSX(); + + Status WriteLSX(); + + Status ReadLASX(); + + Status WriteLASX(); + void *GetGPRBuffer() override { return &m_gpr; } void *GetFPRBuffer() override { return &m_fpr; } @@ -73,18 +81,29 @@ class NativeRegisterContextLinux_loongarch64 private: bool m_gpr_is_valid; bool m_fpu_is_valid; + bool m_lsx_is_valid; + bool m_lasx_is_valid; bool m_refresh_hwdebug_info; RegisterInfoPOSIX_loongarch64::GPR m_gpr; - RegisterInfoPOSIX_loongarch64::FPR m_fpr; + RegisterInfoPOSIX_loongarch64::LSX m_lsx; + RegisterInfoPOSIX_loongarch64::LASX m_lasx; bool IsGPR(unsigned reg) const; bool IsFPR(unsigned reg) const; + bool IsLSX(unsigned reg) const; + + bool IsLASX(unsigned reg) const; + uint32_t CalculateFprOffset(const RegisterInfo *reg_info) const; + uint32_t CalculateLsxOffset(const RegisterInfo *reg_info) const; + + uint32_t CalculateLasxOffset(const RegisterInfo *reg_info) const; + const RegisterInfoPOSIX_loongarch64 &GetRegisterInfo() const; llvm::Error ReadHardwareDebugInfo() override; diff --git a/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_loongarch64.cpp b/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_loongarch64.cpp index 49f371fb949b7..3306fb20dae9b 100644 --- a/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_loongarch64.cpp +++ b/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_loongarch64.cpp @@ -80,3 +80,13 @@ bool RegisterContextPOSIX_loongarch64::IsFPR(unsigned int reg) { return m_register_info_up->GetRegisterSetFromRegisterIndex(reg) == RegisterInfoPOSIX_loongarch64::FPRegSet; } + +bool RegisterContextPOSIX_loongarch64::IsLSX(unsigned int reg) { + return m_register_info_up->GetRegisterSetFromRegisterIndex(reg) == + RegisterInfoPOSIX_loongarch64::LSXRegSet; +} + +bool RegisterContextPOSIX_loongarch64::IsLASX(unsigned int reg) { + return m_register_info_up->GetRegisterSetFromRegisterIndex(reg) == + RegisterInfoPOSIX_loongarch64::LASXRegSet; +} diff --git a/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_loongarch64.h b/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_loongarch64.h index 95f93bb41f015..dca24e4b585bf 100644 --- a/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_loongarch64.h +++ b/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_loongarch64.h @@ -50,14 +50,22 @@ class RegisterContextPOSIX_loongarch64 : public lldb_private::RegisterContext { bool IsFPR(unsigned reg); + bool IsLSX(unsigned reg); + + bool IsLASX(unsigned reg); + size_t GetFPRSize() { return sizeof(RegisterInfoPOSIX_loongarch64::FPR); } uint32_t GetRegNumFCSR() const { return fpr_fcsr_loongarch; } virtual bool ReadGPR() = 0; virtual bool ReadFPR() = 0; + virtual bool ReadLSX() { return false; } + virtual bool ReadLASX() { return false; } virtual bool WriteGPR() = 0; virtual bool WriteFPR() = 0; + virtual bool WriteLSX() { return false; } + virtual bool WriteLASX() { return false; } }; #endif // LLDB_SOURCE_PLUGINS_PROCESS_UTILITY_REGISTERCONTEXTPOSIX_LOONGARCH64_H diff --git a/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_loongarch64.cpp b/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_loongarch64.cpp index 6c723afe4b694..61cd40ddcfc84 100644 
--- a/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_loongarch64.cpp +++ b/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_loongarch64.cpp @@ -19,10 +19,19 @@ #define FPR_OFFSET(idx) ((idx)*8 + sizeof(RegisterInfoPOSIX_loongarch64::GPR)) #define FCC_OFFSET(idx) ((idx)*1 + 32 * 8 + sizeof(RegisterInfoPOSIX_loongarch64::GPR)) #define FCSR_OFFSET (8 * 1 + 32 * 8 + sizeof(RegisterInfoPOSIX_loongarch64::GPR)) +#define LSX_OFFSET(idx) \ + ((idx) * 16 + sizeof(RegisterInfoPOSIX_loongarch64::GPR) + \ + sizeof(RegisterInfoPOSIX_loongarch64::FPR)) +#define LASX_OFFSET(idx) \ + ((idx) * 32 + sizeof(RegisterInfoPOSIX_loongarch64::GPR) + \ + sizeof(RegisterInfoPOSIX_loongarch64::FPR) + \ + sizeof(RegisterInfoPOSIX_loongarch64::LSX)) #define REG_CONTEXT_SIZE \ (sizeof(RegisterInfoPOSIX_loongarch64::GPR) + \ - sizeof(RegisterInfoPOSIX_loongarch64::FPR)) + sizeof(RegisterInfoPOSIX_loongarch64::FPR) + \ + sizeof(RegisterInfoPOSIX_loongarch64::LSX) + \ + sizeof(RegisterInfoPOSIX_loongarch64::LASX)) #define DECLARE_REGISTER_INFOS_LOONGARCH64_STRUCT #include "RegisterInfos_loongarch64.h" @@ -56,7 +65,9 @@ uint32_t RegisterInfoPOSIX_loongarch64::GetRegisterInfoCount( enum { k_num_gpr_registers = gpr_last_loongarch - gpr_first_loongarch + 1, k_num_fpr_registers = fpr_last_loongarch - fpr_first_loongarch + 1, - k_num_register_sets = 2 + k_num_lsx_registers = lsx_last_loongarch - lsx_first_loongarch + 1, + k_num_lasx_registers = lasx_last_loongarch - lasx_first_loongarch + 1, + k_num_register_sets = 4 }; // LoongArch64 general purpose registers. @@ -105,13 +116,55 @@ static_assert(((sizeof g_fpr_regnums_loongarch64 / 1) == k_num_fpr_registers, "g_fpr_regnums_loongarch64 has wrong number of register infos"); +// LoongArch64 lsx vector registers. +static const uint32_t g_lsx_regnums_loongarch64[] = { + lsx_vr0_loongarch, lsx_vr1_loongarch, lsx_vr2_loongarch, + lsx_vr3_loongarch, lsx_vr4_loongarch, lsx_vr5_loongarch, + lsx_vr6_loongarch, lsx_vr7_loongarch, lsx_vr8_loongarch, + lsx_vr9_loongarch, lsx_vr10_loongarch, lsx_vr11_loongarch, + lsx_vr12_loongarch, lsx_vr13_loongarch, lsx_vr14_loongarch, + lsx_vr15_loongarch, lsx_vr16_loongarch, lsx_vr17_loongarch, + lsx_vr18_loongarch, lsx_vr19_loongarch, lsx_vr20_loongarch, + lsx_vr21_loongarch, lsx_vr22_loongarch, lsx_vr23_loongarch, + lsx_vr24_loongarch, lsx_vr25_loongarch, lsx_vr26_loongarch, + lsx_vr27_loongarch, lsx_vr28_loongarch, lsx_vr29_loongarch, + lsx_vr30_loongarch, lsx_vr31_loongarch, LLDB_INVALID_REGNUM}; + +static_assert(((sizeof g_lsx_regnums_loongarch64 / + sizeof g_lsx_regnums_loongarch64[0]) - + 1) == k_num_lsx_registers, + "g_lsx_regnums_loongarch64 has wrong number of register infos"); + +// LoongArch64 lasx vector registers. 
+static const uint32_t g_lasx_regnums_loongarch64[] = { + lasx_xr0_loongarch, lasx_xr1_loongarch, lasx_xr2_loongarch, + lasx_xr3_loongarch, lasx_xr4_loongarch, lasx_xr5_loongarch, + lasx_xr6_loongarch, lasx_xr7_loongarch, lasx_xr8_loongarch, + lasx_xr9_loongarch, lasx_xr10_loongarch, lasx_xr11_loongarch, + lasx_xr12_loongarch, lasx_xr13_loongarch, lasx_xr14_loongarch, + lasx_xr15_loongarch, lasx_xr16_loongarch, lasx_xr17_loongarch, + lasx_xr18_loongarch, lasx_xr19_loongarch, lasx_xr20_loongarch, + lasx_xr21_loongarch, lasx_xr22_loongarch, lasx_xr23_loongarch, + lasx_xr24_loongarch, lasx_xr25_loongarch, lasx_xr26_loongarch, + lasx_xr27_loongarch, lasx_xr28_loongarch, lasx_xr29_loongarch, + lasx_xr30_loongarch, lasx_xr31_loongarch, LLDB_INVALID_REGNUM}; + +static_assert(((sizeof g_lasx_regnums_loongarch64 / + sizeof g_lasx_regnums_loongarch64[0]) - + 1) == k_num_lasx_registers, + "g_lasx_regnums_loongarch64 has wrong number of register infos"); + // Register sets for LoongArch64. static const lldb_private::RegisterSet g_reg_sets_loongarch64[k_num_register_sets] = { {"General Purpose Registers", "gpr", k_num_gpr_registers, g_gpr_regnums_loongarch64}, {"Floating Point Registers", "fpr", k_num_fpr_registers, - g_fpr_regnums_loongarch64}}; + g_fpr_regnums_loongarch64}, + {"LSX Vector Registers", "lsx", k_num_lsx_registers, + g_lsx_regnums_loongarch64}, + {"LASX Vector Registers", "lasx", k_num_lasx_registers, + g_lasx_regnums_loongarch64}}; RegisterInfoPOSIX_loongarch64::RegisterInfoPOSIX_loongarch64( const lldb_private::ArchSpec &target_arch, lldb_private::Flags flags) @@ -147,6 +200,10 @@ size_t RegisterInfoPOSIX_loongarch64::GetRegisterSetFromRegisterIndex( return GPRegSet; if (reg_index >= fpr_first_loongarch && reg_index <= fpr_last_loongarch) return FPRegSet; + if (reg_index >= lsx_first_loongarch && reg_index <= lsx_last_loongarch) + return LSXRegSet; + if (reg_index >= lasx_first_loongarch && reg_index <= lasx_last_loongarch) + return LASXRegSet; return LLDB_INVALID_REGNUM; } diff --git a/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_loongarch64.h b/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_loongarch64.h index a3338acbbc97b..0ff08bb8c0e92 100644 --- a/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_loongarch64.h +++ b/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_loongarch64.h @@ -26,6 +26,8 @@ class RegisterInfoPOSIX_loongarch64 enum RegSetKind { GPRegSet, FPRegSet, + LSXRegSet, + LASXRegSet, }; struct GPR { @@ -43,6 +45,16 @@ class RegisterInfoPOSIX_loongarch64 uint32_t fcsr; }; + /* 32 registers, 128 bits width per register. */ + struct LSX { + uint64_t vr[32 * 2]; + }; + + /* 32 registers, 256 bits width per register. 
*/ + struct LASX { + uint64_t xr[32 * 4]; + }; + RegisterInfoPOSIX_loongarch64(const lldb_private::ArchSpec &target_arch, lldb_private::Flags flags); diff --git a/lldb/source/Plugins/Process/Utility/RegisterInfos_loongarch64.h b/lldb/source/Plugins/Process/Utility/RegisterInfos_loongarch64.h index 3fb1e6a5fbef2..ff8fe5990ce11 100644 --- a/lldb/source/Plugins/Process/Utility/RegisterInfos_loongarch64.h +++ b/lldb/source/Plugins/Process/Utility/RegisterInfos_loongarch64.h @@ -25,6 +25,14 @@ #error FPR_OFFSET must be defined before including this header file #endif +#ifndef LSX_OFFSET +#error LSX_OFFSET must be defined before including this header file +#endif + +#ifndef LASX_OFFSET +#error LASX_OFFSET must be defined before including this header file +#endif + using namespace loongarch_dwarf; // clang-format off @@ -74,6 +82,21 @@ using namespace loongarch_dwarf; FPR64_KIND(fpr_##reg, generic_kind), nullptr, nullptr, nullptr, \ } +#define DEFINE_LSX(reg, generic_kind) \ + { \ + #reg, nullptr, 16, LSX_OFFSET(lsx_##reg##_loongarch - lsx_first_loongarch),\ + lldb::eEncodingVector, lldb::eFormatVectorOfUInt8, \ + KIND_HELPER(lsx_##reg, generic_kind), nullptr, nullptr, nullptr, \ + } + +#define DEFINE_LASX(reg, generic_kind) \ + { \ + #reg, nullptr, 32, \ + LASX_OFFSET(lasx_##reg##_loongarch - lasx_first_loongarch), \ + lldb::eEncodingVector, lldb::eFormatVectorOfUInt8, \ + KIND_HELPER(lasx_##reg, generic_kind), nullptr, nullptr, nullptr, \ + } + // clang-format on static lldb_private::RegisterInfo g_register_infos_loongarch64[] = { @@ -166,6 +189,72 @@ static lldb_private::RegisterInfo g_register_infos_loongarch64[] = { DEFINE_FCC(fcc6, LLDB_INVALID_REGNUM), DEFINE_FCC(fcc7, LLDB_INVALID_REGNUM), DEFINE_FCSR(fcsr, LLDB_INVALID_REGNUM), + + DEFINE_LSX(vr0, LLDB_INVALID_REGNUM), + DEFINE_LSX(vr1, LLDB_INVALID_REGNUM), + DEFINE_LSX(vr2, LLDB_INVALID_REGNUM), + DEFINE_LSX(vr3, LLDB_INVALID_REGNUM), + DEFINE_LSX(vr4, LLDB_INVALID_REGNUM), + DEFINE_LSX(vr5, LLDB_INVALID_REGNUM), + DEFINE_LSX(vr6, LLDB_INVALID_REGNUM), + DEFINE_LSX(vr7, LLDB_INVALID_REGNUM), + DEFINE_LSX(vr8, LLDB_INVALID_REGNUM), + DEFINE_LSX(vr9, LLDB_INVALID_REGNUM), + DEFINE_LSX(vr10, LLDB_INVALID_REGNUM), + DEFINE_LSX(vr11, LLDB_INVALID_REGNUM), + DEFINE_LSX(vr12, LLDB_INVALID_REGNUM), + DEFINE_LSX(vr13, LLDB_INVALID_REGNUM), + DEFINE_LSX(vr14, LLDB_INVALID_REGNUM), + DEFINE_LSX(vr15, LLDB_INVALID_REGNUM), + DEFINE_LSX(vr16, LLDB_INVALID_REGNUM), + DEFINE_LSX(vr17, LLDB_INVALID_REGNUM), + DEFINE_LSX(vr18, LLDB_INVALID_REGNUM), + DEFINE_LSX(vr19, LLDB_INVALID_REGNUM), + DEFINE_LSX(vr20, LLDB_INVALID_REGNUM), + DEFINE_LSX(vr21, LLDB_INVALID_REGNUM), + DEFINE_LSX(vr22, LLDB_INVALID_REGNUM), + DEFINE_LSX(vr23, LLDB_INVALID_REGNUM), + DEFINE_LSX(vr24, LLDB_INVALID_REGNUM), + DEFINE_LSX(vr25, LLDB_INVALID_REGNUM), + DEFINE_LSX(vr26, LLDB_INVALID_REGNUM), + DEFINE_LSX(vr27, LLDB_INVALID_REGNUM), + DEFINE_LSX(vr28, LLDB_INVALID_REGNUM), + DEFINE_LSX(vr29, LLDB_INVALID_REGNUM), + DEFINE_LSX(vr30, LLDB_INVALID_REGNUM), + DEFINE_LSX(vr31, LLDB_INVALID_REGNUM), + + DEFINE_LASX(xr0, LLDB_INVALID_REGNUM), + DEFINE_LASX(xr1, LLDB_INVALID_REGNUM), + DEFINE_LASX(xr2, LLDB_INVALID_REGNUM), + DEFINE_LASX(xr3, LLDB_INVALID_REGNUM), + DEFINE_LASX(xr4, LLDB_INVALID_REGNUM), + DEFINE_LASX(xr5, LLDB_INVALID_REGNUM), + DEFINE_LASX(xr6, LLDB_INVALID_REGNUM), + DEFINE_LASX(xr7, LLDB_INVALID_REGNUM), + DEFINE_LASX(xr8, LLDB_INVALID_REGNUM), + DEFINE_LASX(xr9, LLDB_INVALID_REGNUM), + DEFINE_LASX(xr10, LLDB_INVALID_REGNUM), + DEFINE_LASX(xr11, 
LLDB_INVALID_REGNUM), + DEFINE_LASX(xr12, LLDB_INVALID_REGNUM), + DEFINE_LASX(xr13, LLDB_INVALID_REGNUM), + DEFINE_LASX(xr14, LLDB_INVALID_REGNUM), + DEFINE_LASX(xr15, LLDB_INVALID_REGNUM), + DEFINE_LASX(xr16, LLDB_INVALID_REGNUM), + DEFINE_LASX(xr17, LLDB_INVALID_REGNUM), + DEFINE_LASX(xr18, LLDB_INVALID_REGNUM), + DEFINE_LASX(xr19, LLDB_INVALID_REGNUM), + DEFINE_LASX(xr20, LLDB_INVALID_REGNUM), + DEFINE_LASX(xr21, LLDB_INVALID_REGNUM), + DEFINE_LASX(xr22, LLDB_INVALID_REGNUM), + DEFINE_LASX(xr23, LLDB_INVALID_REGNUM), + DEFINE_LASX(xr24, LLDB_INVALID_REGNUM), + DEFINE_LASX(xr25, LLDB_INVALID_REGNUM), + DEFINE_LASX(xr26, LLDB_INVALID_REGNUM), + DEFINE_LASX(xr27, LLDB_INVALID_REGNUM), + DEFINE_LASX(xr28, LLDB_INVALID_REGNUM), + DEFINE_LASX(xr29, LLDB_INVALID_REGNUM), + DEFINE_LASX(xr30, LLDB_INVALID_REGNUM), + DEFINE_LASX(xr31, LLDB_INVALID_REGNUM), }; #endif // DECLARE_REGISTER_INFOS_LOONGARCH64_STRUCT diff --git a/lldb/source/Plugins/Process/Utility/lldb-loongarch-register-enums.h b/lldb/source/Plugins/Process/Utility/lldb-loongarch-register-enums.h index f55c807f86c00..accd53048f93e 100644 --- a/lldb/source/Plugins/Process/Utility/lldb-loongarch-register-enums.h +++ b/lldb/source/Plugins/Process/Utility/lldb-loongarch-register-enums.h @@ -172,6 +172,76 @@ enum { fpr_fs6_loongarch = fpr_f30_loongarch, fpr_fs7_loongarch = fpr_f31_loongarch, + lsx_first_loongarch = fpr_last_loongarch + 1, + lsx_vr0_loongarch = lsx_first_loongarch, + lsx_vr1_loongarch, + lsx_vr2_loongarch, + lsx_vr3_loongarch, + lsx_vr4_loongarch, + lsx_vr5_loongarch, + lsx_vr6_loongarch, + lsx_vr7_loongarch, + lsx_vr8_loongarch, + lsx_vr9_loongarch, + lsx_vr10_loongarch, + lsx_vr11_loongarch, + lsx_vr12_loongarch, + lsx_vr13_loongarch, + lsx_vr14_loongarch, + lsx_vr15_loongarch, + lsx_vr16_loongarch, + lsx_vr17_loongarch, + lsx_vr18_loongarch, + lsx_vr19_loongarch, + lsx_vr20_loongarch, + lsx_vr21_loongarch, + lsx_vr22_loongarch, + lsx_vr23_loongarch, + lsx_vr24_loongarch, + lsx_vr25_loongarch, + lsx_vr26_loongarch, + lsx_vr27_loongarch, + lsx_vr28_loongarch, + lsx_vr29_loongarch, + lsx_vr30_loongarch, + lsx_vr31_loongarch, + lsx_last_loongarch = lsx_vr31_loongarch, + + lasx_first_loongarch = lsx_last_loongarch + 1, + lasx_xr0_loongarch = lasx_first_loongarch, + lasx_xr1_loongarch, + lasx_xr2_loongarch, + lasx_xr3_loongarch, + lasx_xr4_loongarch, + lasx_xr5_loongarch, + lasx_xr6_loongarch, + lasx_xr7_loongarch, + lasx_xr8_loongarch, + lasx_xr9_loongarch, + lasx_xr10_loongarch, + lasx_xr11_loongarch, + lasx_xr12_loongarch, + lasx_xr13_loongarch, + lasx_xr14_loongarch, + lasx_xr15_loongarch, + lasx_xr16_loongarch, + lasx_xr17_loongarch, + lasx_xr18_loongarch, + lasx_xr19_loongarch, + lasx_xr20_loongarch, + lasx_xr21_loongarch, + lasx_xr22_loongarch, + lasx_xr23_loongarch, + lasx_xr24_loongarch, + lasx_xr25_loongarch, + lasx_xr26_loongarch, + lasx_xr27_loongarch, + lasx_xr28_loongarch, + lasx_xr29_loongarch, + lasx_xr30_loongarch, + lasx_xr31_loongarch, + lasx_last_loongarch = lasx_xr31_loongarch, + k_num_registers_loongarch }; diff --git a/lldb/source/Utility/LoongArch_DWARF_Registers.h b/lldb/source/Utility/LoongArch_DWARF_Registers.h index 596806348ee24..264f329d71e07 100644 --- a/lldb/source/Utility/LoongArch_DWARF_Registers.h +++ b/lldb/source/Utility/LoongArch_DWARF_Registers.h @@ -90,6 +90,72 @@ enum { dwarf_fpr_fcc7, dwarf_fpr_fcsr, + dwarf_lsx_vr0, + dwarf_lsx_vr1, + dwarf_lsx_vr2, + dwarf_lsx_vr3, + dwarf_lsx_vr4, + dwarf_lsx_vr5, + dwarf_lsx_vr6, + dwarf_lsx_vr7, + dwarf_lsx_vr8, + dwarf_lsx_vr9, + dwarf_lsx_vr10, 
+ dwarf_lsx_vr11, + dwarf_lsx_vr12, + dwarf_lsx_vr13, + dwarf_lsx_vr14, + dwarf_lsx_vr15, + dwarf_lsx_vr16, + dwarf_lsx_vr17, + dwarf_lsx_vr18, + dwarf_lsx_vr19, + dwarf_lsx_vr20, + dwarf_lsx_vr21, + dwarf_lsx_vr22, + dwarf_lsx_vr23, + dwarf_lsx_vr24, + dwarf_lsx_vr25, + dwarf_lsx_vr26, + dwarf_lsx_vr27, + dwarf_lsx_vr28, + dwarf_lsx_vr29, + dwarf_lsx_vr30, + dwarf_lsx_vr31, + + dwarf_lasx_xr0, + dwarf_lasx_xr1, + dwarf_lasx_xr2, + dwarf_lasx_xr3, + dwarf_lasx_xr4, + dwarf_lasx_xr5, + dwarf_lasx_xr6, + dwarf_lasx_xr7, + dwarf_lasx_xr8, + dwarf_lasx_xr9, + dwarf_lasx_xr10, + dwarf_lasx_xr11, + dwarf_lasx_xr12, + dwarf_lasx_xr13, + dwarf_lasx_xr14, + dwarf_lasx_xr15, + dwarf_lasx_xr16, + dwarf_lasx_xr17, + dwarf_lasx_xr18, + dwarf_lasx_xr19, + dwarf_lasx_xr20, + dwarf_lasx_xr21, + dwarf_lasx_xr22, + dwarf_lasx_xr23, + dwarf_lasx_xr24, + dwarf_lasx_xr25, + dwarf_lasx_xr26, + dwarf_lasx_xr27, + dwarf_lasx_xr28, + dwarf_lasx_xr29, + dwarf_lasx_xr30, + dwarf_lasx_xr31, + // register name alias dwarf_gpr_zero = dwarf_gpr_r0, dwarf_gpr_ra = dwarf_gpr_r1, diff --git a/lldb/test/API/linux/loongarch64/simd_registers/Makefile b/lldb/test/API/linux/loongarch64/simd_registers/Makefile new file mode 100644 index 0000000000000..10495940055b6 --- /dev/null +++ b/lldb/test/API/linux/loongarch64/simd_registers/Makefile @@ -0,0 +1,3 @@ +C_SOURCES := main.c + +include Makefile.rules diff --git a/lldb/test/API/linux/loongarch64/simd_registers/TestLoongArch64LinuxSIMDRegisters.py b/lldb/test/API/linux/loongarch64/simd_registers/TestLoongArch64LinuxSIMDRegisters.py new file mode 100644 index 0000000000000..03e1cc734b708 --- /dev/null +++ b/lldb/test/API/linux/loongarch64/simd_registers/TestLoongArch64LinuxSIMDRegisters.py @@ -0,0 +1,93 @@ +""" +Test lldb's ability to read and write the LoongArch SIMD registers. 
+""" + +from enum import Enum +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class Mode(Enum): + LSX = 0 + LASX = 1 + + +class LoongArch64LinuxRegisters(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + def get_build_flags(self, mode): + cflags = "-march=la464" + if mode == Mode.LASX: + cflags += " -DLASX" + + return {"CFLAGS_EXTRAS": cflags} + + def make_simd_value(self, n, mode): + count = 32 if mode == Mode.LASX else 16 + return "{" + " ".join(["0x{:02x}".format(n)] * count) + "}" + + def check_simd_values(self, value_offset, mode): + reg_prefix = "xr" if mode == Mode.LASX else "vr" + for i in range(32): + self.expect( + "register read {}{}".format(reg_prefix, i), + substrs=[self.make_simd_value(i + value_offset, mode)], + ) + + def simd_registers_impl(self, mode): + self.build(dictionary=self.get_build_flags(mode)) + self.runCmd("file " + self.getBuildArtifact("a.out"), CURRENT_EXECUTABLE_SET) + + lldbutil.run_break_set_by_file_and_line( + self, + "main.c", + line_number("main.c", "// Set break point at this line."), + num_expected_locations=1, + ) + + self.runCmd("run", RUN_SUCCEEDED) + + if self.process().GetState() == lldb.eStateExited: + self.fail("Test program failed to run.") + + self.expect( + "thread list", + STOPPED_DUE_TO_BREAKPOINT, + substrs=["stopped", "stop reason = breakpoint"], + ) + + self.check_simd_values(0, mode) + self.runCmd("expression write_simd_regs(1)") + self.check_simd_values(0, mode) + + reg_prefix = "xr" if mode == Mode.LASX else "vr" + for i in range(32): + self.runCmd( + 'register write {}{} "{}"'.format( + reg_prefix, i, self.make_simd_value(i + 1, mode) + ) + ) + + # Should be visible within lldb. + self.check_simd_values(1, mode) + + # The program should agree with lldb. + self.expect("continue", substrs=["exited with status = 0"]) + + @skipUnlessArch("loongarch64") + @skipUnlessPlatform(["linux"]) + def test_lsx(self): + """Test read/write of LSX registers.""" + if not self.isLoongArchLSX(): + self.skipTest("LSX must be present.") + self.simd_registers_impl(Mode.LSX) + + @skipUnlessArch("loongarch64") + @skipUnlessPlatform(["linux"]) + def test_lasx(self): + """Test read/write of LASX registers.""" + if not self.isLoongArchLASX(): + self.skipTest("LASX must be present.") + self.simd_registers_impl(Mode.LASX) diff --git a/lldb/test/API/linux/loongarch64/simd_registers/main.c b/lldb/test/API/linux/loongarch64/simd_registers/main.c new file mode 100644 index 0000000000000..a19a13f1d3082 --- /dev/null +++ b/lldb/test/API/linux/loongarch64/simd_registers/main.c @@ -0,0 +1,108 @@ +#include + +#ifdef LASX +#define ELEM_COUNT 32 +#define REPLGR2VR_B "xvreplgr2vr.b $xr" +#define ST "xvst $xr" +#else +#define ELEM_COUNT 16 +#define REPLGR2VR_B "vreplgr2vr.b $vr" +#define ST "vst $vr" +#endif + +// base is added to each value. If base = 2, then +// assume the vector element type is char: +// $reg0 = { 0x02 * $ELEM_COUNT } +// $reg1 = { 0x03 * $ELEM_COUNT } etc. 
+void write_simd_regs(unsigned base) { +#define WRITE_SIMD(NUM) \ + asm volatile(REPLGR2VR_B #NUM ", %0\n\t" ::"r"(base + NUM)) + WRITE_SIMD(0); + WRITE_SIMD(1); + WRITE_SIMD(2); + WRITE_SIMD(3); + WRITE_SIMD(4); + WRITE_SIMD(5); + WRITE_SIMD(6); + WRITE_SIMD(7); + WRITE_SIMD(8); + WRITE_SIMD(9); + WRITE_SIMD(10); + WRITE_SIMD(11); + WRITE_SIMD(12); + WRITE_SIMD(13); + WRITE_SIMD(14); + WRITE_SIMD(15); + WRITE_SIMD(16); + WRITE_SIMD(17); + WRITE_SIMD(18); + WRITE_SIMD(19); + WRITE_SIMD(20); + WRITE_SIMD(21); + WRITE_SIMD(22); + WRITE_SIMD(23); + WRITE_SIMD(24); + WRITE_SIMD(25); + WRITE_SIMD(26); + WRITE_SIMD(27); + WRITE_SIMD(28); + WRITE_SIMD(29); + WRITE_SIMD(30); + WRITE_SIMD(31); +} + +unsigned verify_simd_regs() { + uint8_t simd_reg[ELEM_COUNT]; + uint8_t target = 0; + +#define VERIFY_SIMD(NUM) \ + do { \ + for (int i = 0; i < ELEM_COUNT; ++i) \ + simd_reg[i] = 0; \ + asm volatile(ST #NUM ", %0\n\t" ::"m"(simd_reg)); \ + target = NUM + 1; \ + for (int i = 0; i < ELEM_COUNT; ++i) \ + if (simd_reg[i] != target) \ + return 1; \ + } while (0) + + VERIFY_SIMD(0); + VERIFY_SIMD(1); + VERIFY_SIMD(2); + VERIFY_SIMD(3); + VERIFY_SIMD(4); + VERIFY_SIMD(5); + VERIFY_SIMD(6); + VERIFY_SIMD(7); + VERIFY_SIMD(8); + VERIFY_SIMD(9); + VERIFY_SIMD(10); + VERIFY_SIMD(11); + VERIFY_SIMD(12); + VERIFY_SIMD(13); + VERIFY_SIMD(14); + VERIFY_SIMD(15); + VERIFY_SIMD(16); + VERIFY_SIMD(17); + VERIFY_SIMD(18); + VERIFY_SIMD(19); + VERIFY_SIMD(20); + VERIFY_SIMD(21); + VERIFY_SIMD(22); + VERIFY_SIMD(23); + VERIFY_SIMD(24); + VERIFY_SIMD(25); + VERIFY_SIMD(26); + VERIFY_SIMD(27); + VERIFY_SIMD(28); + VERIFY_SIMD(29); + VERIFY_SIMD(30); + VERIFY_SIMD(31); + + return 0; +} +int main(int argc, char *argv[]) { + write_simd_regs(0); + + return verify_simd_regs(); // Set break point at this line. 
+} From ba4dc5a0d6ecb772befe418d42f9c7089563d690 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Mon, 13 Jan 2025 17:23:12 -0800 Subject: [PATCH 355/408] [flang][cuda] Pass the device address for global descriptor (#122802) --- .../Optimizer/Transforms/CUFOpConversion.cpp | 65 +++++++++++++------ flang/test/Fir/CUDA/cuda-launch.fir | 42 ++++++++++++ 2 files changed, 86 insertions(+), 21 deletions(-) diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp index 8c525fc6daff5..d61d9f63cb294 100644 --- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp @@ -366,6 +366,23 @@ struct CUFAllocOpConversion : public mlir::OpRewritePattern { const fir::LLVMTypeConverter *typeConverter; }; +static mlir::Value genGetDeviceAddress(mlir::PatternRewriter &rewriter, + mlir::ModuleOp mod, mlir::Location loc, + mlir::Value inputArg) { + fir::FirOpBuilder builder(rewriter, mod); + mlir::func::FuncOp callee = + fir::runtime::getRuntimeFunc(loc, builder); + auto fTy = callee.getFunctionType(); + mlir::Value conv = createConvertOp(rewriter, loc, fTy.getInput(0), inputArg); + mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc); + mlir::Value sourceLine = + fir::factory::locationToLineNo(builder, loc, fTy.getInput(2)); + llvm::SmallVector args{fir::runtime::createArguments( + builder, loc, fTy, conv, sourceFile, sourceLine)}; + auto call = rewriter.create(loc, callee, args); + return createConvertOp(rewriter, loc, inputArg.getType(), call->getResult(0)); +} + struct DeclareOpConversion : public mlir::OpRewritePattern { using OpRewritePattern::OpRewritePattern; @@ -382,26 +399,10 @@ struct DeclareOpConversion : public mlir::OpRewritePattern { if (cuf::isRegisteredDeviceGlobal(global)) { rewriter.setInsertionPointAfter(addrOfOp); auto mod = op->getParentOfType(); - fir::FirOpBuilder builder(rewriter, mod); - mlir::Location loc = op.getLoc(); - mlir::func::FuncOp callee = - fir::runtime::getRuntimeFunc( - loc, builder); - auto fTy = callee.getFunctionType(); - mlir::Type toTy = fTy.getInput(0); - mlir::Value inputArg = - createConvertOp(rewriter, loc, toTy, addrOfOp.getResult()); - mlir::Value sourceFile = - fir::factory::locationToFilename(builder, loc); - mlir::Value sourceLine = - fir::factory::locationToLineNo(builder, loc, fTy.getInput(2)); - llvm::SmallVector args{fir::runtime::createArguments( - builder, loc, fTy, inputArg, sourceFile, sourceLine)}; - auto call = rewriter.create(loc, callee, args); - mlir::Value cast = createConvertOp( - rewriter, loc, op.getMemref().getType(), call->getResult(0)); + mlir::Value devAddr = genGetDeviceAddress(rewriter, mod, op.getLoc(), + addrOfOp.getResult()); rewriter.startOpModification(op); - op.getMemrefMutable().assign(cast); + op.getMemrefMutable().assign(devAddr); rewriter.finalizeOpModification(op); return success(); } @@ -771,10 +772,32 @@ struct CUFLaunchOpConversion loc, clusterDimsAttr.getZ().getInt()); } } + llvm::SmallVector args; + auto mod = op->getParentOfType(); + for (mlir::Value arg : op.getArgs()) { + // If the argument is a global descriptor, make sure we pass the device + // copy of this descriptor and not the host one. 
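+      // The host-side address is not dereferenceable from device code, so
+      // launching the kernel with it would fault at the first access.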
+ if (mlir::isa(fir::unwrapRefType(arg.getType()))) { + if (auto declareOp = + mlir::dyn_cast_or_null(arg.getDefiningOp())) { + if (auto addrOfOp = mlir::dyn_cast_or_null( + declareOp.getMemref().getDefiningOp())) { + if (auto global = symTab.lookup( + addrOfOp.getSymbol().getRootReference().getValue())) { + if (cuf::isRegisteredDeviceGlobal(global)) { + arg = genGetDeviceAddress(rewriter, mod, op.getLoc(), + declareOp.getResult()); + } + } + } + } + } + args.push_back(arg); + } + auto gpuLaunchOp = rewriter.create( loc, kernelName, mlir::gpu::KernelDim3{gridSizeX, gridSizeY, gridSizeZ}, - mlir::gpu::KernelDim3{blockSizeX, blockSizeY, blockSizeZ}, zero, - op.getArgs()); + mlir::gpu::KernelDim3{blockSizeX, blockSizeY, blockSizeZ}, zero, args); if (clusterDimX && clusterDimY && clusterDimZ) { gpuLaunchOp.getClusterSizeXMutable().assign(clusterDimX); gpuLaunchOp.getClusterSizeYMutable().assign(clusterDimY); diff --git a/flang/test/Fir/CUDA/cuda-launch.fir b/flang/test/Fir/CUDA/cuda-launch.fir index f11bcbdb7fce5..1e19b3bea1296 100644 --- a/flang/test/Fir/CUDA/cuda-launch.fir +++ b/flang/test/Fir/CUDA/cuda-launch.fir @@ -62,3 +62,45 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e // CHECK-LABEL: func.func @_QMmod1Phost_sub() // CHECK: gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 clusters in (%c2{{.*}}, %c2{{.*}}, %c1{{.*}}) +// ----- + +module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry, dense<64> : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} { + gpu.module @cuda_device_mod { + gpu.func @_QMdevptrPtest(%arg0: !fir.ref>>>) kernel { + gpu.return + } + } + fir.global @_QMdevptrEdev_ptr {data_attr = #cuf.cuda} : !fir.box>> { + %c0 = arith.constant 0 : index + %0 = fir.zero_bits !fir.ptr> + %1 = fir.shape %c0 : (index) -> !fir.shape<1> + %2 = fir.embox %0(%1) {allocator_idx = 2 : i32} : (!fir.ptr>, !fir.shape<1>) -> !fir.box>> + fir.has_value %2 : !fir.box>> + } + func.func @_QMdevptrPtest(%arg0: !fir.ref>>> {cuf.data_attr = #cuf.cuda, fir.bindc_name = "dp"}) attributes {cuf.proc_attr = #cuf.cuda_proc} { + return + } + func.func @_QQmain() { + %c1_i32 = arith.constant 1 : i32 + %c4 = arith.constant 4 : index + %0 = cuf.alloc !fir.array<4xf32> {bindc_name = "a_dev", data_attr = #cuf.cuda, uniq_name = "_QFEa_dev"} -> !fir.ref> + %1 = fir.shape %c4 : (index) -> !fir.shape<1> + %2 = fir.declare %0(%1) {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFEa_dev"} : (!fir.ref>, !fir.shape<1>) -> !fir.ref> + %3 = fir.address_of(@_QMdevptrEdev_ptr) : !fir.ref>>> + %4 = fir.declare %3 {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QMdevptrEdev_ptr"} : (!fir.ref>>>) -> !fir.ref>>> + %5 = fir.embox %2(%1) : (!fir.ref>, !fir.shape<1>) -> !fir.box>> + fir.store %5 to %4 : !fir.ref>>> + cuf.sync_descriptor @_QMdevptrEdev_ptr + cuf.kernel_launch @_QMdevptrPtest<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>(%4) : (!fir.ref>>>) + cuf.free %2 : !fir.ref> {data_attr = #cuf.cuda} + 
return + } +} + +// CHECK-LABEL: func.func @_QQmain() +// CHECK: %[[ADDROF:.*]] = fir.address_of(@_QMdevptrEdev_ptr) : !fir.ref>>> +// CHECK: %[[DECL:.*]] = fir.declare %[[ADDROF]] {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QMdevptrEdev_ptr"} : (!fir.ref>>>) -> !fir.ref>>> +// CHECK: %[[CONV_DECL:.*]] = fir.convert %[[DECL]] : (!fir.ref>>>) -> !fir.llvm_ptr +// CHECK: %[[DEVADDR:.*]] = fir.call @_FortranACUFGetDeviceAddress(%[[CONV_DECL]], %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.ref, i32) -> !fir.llvm_ptr +// CHECK: %[[CONV_DEVADDR:.*]] = fir.convert %[[DEVADDR]] : (!fir.llvm_ptr) -> !fir.ref>>> +// CHECK: gpu.launch_func @cuda_device_mod::@_QMdevptrPtest blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %{{.*}} args(%[[CONV_DEVADDR]] : !fir.ref>>>) From c701c18bed0c6c1bfd4a1dcfa9f207ddbb74cdfc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Mon, 13 Jan 2025 17:28:34 -0800 Subject: [PATCH 356/408] [flang][cuda] Move interface to __cuda_device (#122771) --- flang/lib/Semantics/resolve-names.cpp | 4 +++- flang/module/__cuda_device.f90 | 32 +++++++++++++++++++++++++++ flang/module/cudadevice.f90 | 17 +------------- flang/tools/f18/CMakeLists.txt | 6 ++++- 4 files changed, 41 insertions(+), 18 deletions(-) create mode 100644 flang/module/__cuda_device.f90 diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 51e7c5960dc2e..c1663082f86d4 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -4015,7 +4015,9 @@ bool SubprogramVisitor::Pre(const parser::PrefixSpec::Attributes &attrs) { *attrs == common::CUDASubprogramAttrs::Device) { const Scope &scope{currScope()}; const Scope *mod{FindModuleContaining(scope)}; - if (mod && mod->GetName().value() == "cudadevice") { + if (mod && + (mod->GetName().value() == "cudadevice" || + mod->GetName().value() == "__cuda_device")) { return false; } // Implicitly USE the cudadevice module by copying its symbols in the diff --git a/flang/module/__cuda_device.f90 b/flang/module/__cuda_device.f90 new file mode 100644 index 0000000000000..81b1f5aa334bb --- /dev/null +++ b/flang/module/__cuda_device.f90 @@ -0,0 +1,32 @@ +!===-- module/__cuda_device.f90 --------------------------------------------===! +! +! Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +! See https://llvm.org/LICENSE.txt for license information. +! SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +! +!===------------------------------------------------------------------------===! + +! This module contains CUDA Fortran interfaces used in cudadevice.f90. + +module __cuda_device +implicit none + + ! Set PRIVATE by default to explicitly only export what is meant + ! to be exported by this MODULE. 
+ + interface + attributes(device) function __fadd_rd(x, y) bind(c, name='__nv_fadd_rd') + real, intent(in), value :: x, y + real :: __fadd_rd + end function + end interface + public :: __fadd_rd + + interface + attributes(device) function __fadd_ru(x, y) bind(c, name='__nv_fadd_ru') + real, intent(in), value :: x, y + real :: __fadd_ru + end function + end interface + public :: __fadd_ru +end module diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90 index e06c706538fe6..b07f82be6a724 100644 --- a/flang/module/cudadevice.f90 +++ b/flang/module/cudadevice.f90 @@ -9,6 +9,7 @@ ! CUDA Fortran procedures available in device subprogram module cudadevice + use __cuda_device, only: __fadd_rd, __fadd_ru implicit none ! Set PRIVATE by default to explicitly only export what is meant @@ -71,20 +72,4 @@ attributes(device) subroutine threadfence_system() end interface public :: threadfence_system - interface - attributes(device) function __fadd_rd(x, y) bind(c, name='__nv_fadd_rd') - real, intent(in) :: x, y - real :: __fadd_rd - end function - end interface - public :: __fadd_rd - - interface - attributes(device) function __fadd_ru(x, y) bind(c, name='__nv_fadd_ru') - real, intent(in) :: x, y - real :: __fadd_ru - end function - end interface - public :: __fadd_ru - end module diff --git a/flang/tools/f18/CMakeLists.txt b/flang/tools/f18/CMakeLists.txt index 4362fcf053761..cc2bc5b8eb5ce 100644 --- a/flang/tools/f18/CMakeLists.txt +++ b/flang/tools/f18/CMakeLists.txt @@ -21,6 +21,7 @@ set(MODULES_WITHOUT_IMPLEMENTATION "__ppc_intrinsics" "mma" "__cuda_builtins" + "__cuda_device" "cudadevice" "ieee_arithmetic" "ieee_exceptions" @@ -67,9 +68,12 @@ if (NOT CMAKE_CROSSCOMPILING) elseif(${filename} STREQUAL "__ppc_intrinsics" OR ${filename} STREQUAL "mma") set(depends ${FLANG_INTRINSIC_MODULES_DIR}/__ppc_types.mod) - elseif(${filename} STREQUAL "cudadevice") + elseif(${filename} STREQUAL "__cuda_device") set(opts -fc1 -xcuda) set(depends ${FLANG_INTRINSIC_MODULES_DIR}/__cuda_builtins.mod) + elseif(${filename} STREQUAL "cudadevice") + set(opts -fc1 -xcuda) + set(depends ${FLANG_INTRINSIC_MODULES_DIR}/__cuda_device.mod) else() set(depends ${FLANG_INTRINSIC_MODULES_DIR}/__fortran_builtins.mod) if(${filename} STREQUAL "iso_fortran_env") From 3a9977efaa24089c1cbd987d8fafa9831cbb780d Mon Sep 17 00:00:00 2001 From: Longsheng Mou Date: Tue, 14 Jan 2025 09:43:00 +0800 Subject: [PATCH 357/408] [mlir][linalg] Fix no-null pointer check (#122727) This PR fixes [Bug19](https://pvs-studio.com/en/blog/posts/cpp/1188/#ID6222F3BAF3) mentioned in https://pvs-studio.com/en/blog/posts/cpp/1188/. 
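The guard in question could never fail: `target` comes straight out of the payload iteration and is non-null by construction, so `if (!target)` left the result of the `dyn_cast` unchecked and a payload op lacking the interface was later dereferenced as null. The corrected idiom, shown here as a minimal sketch that mirrors the hunk below (surrounding transform plumbing elided), tests the value the cast actually produced:

    // `dyn_cast` returns a null TilingInterface when *target does not
    // implement it; that result, not the input pointer, must be checked.
    auto tilingOp = dyn_cast<TilingInterface>(*target);
    if (!tilingOp) {
      DiagnosedSilenceableFailure diag =
          emitSilenceableError()
          << "expected the payload to implement TilingInterface";
      return diag;
    }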
--- mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index a1d619c8cd19d..67dd21aafe4fe 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -2244,7 +2244,7 @@ transform::ConvertToLoopsOp::apply(transform::TransformRewriter &rewriter, SmallVector loops; for (Operation *target : state.getPayloadOps(getTarget())) { auto tilingOp = dyn_cast(*target); - if (!target) { + if (!tilingOp) { DiagnosedSilenceableFailure diag = emitSilenceableError() << "expected the payload to implement TilingInterface"; From 4bd9edc15a323f09116c356404b0c926a02b69a9 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 13 Jan 2025 17:54:51 -0800 Subject: [PATCH 358/408] [RISCV] Remove loads from fixed-vectors-extract.ll. NFC (#122796) These test cases weren't trying to test load+extract. I believe they only used loads because fixed vector arguments weren't supported when they were written or they weren't copied from the structure of other tests that pre-date fixed vector argument support. Reduces diff from #122671. --- .../RISCV/rvv/fixed-vectors-extract.ll | 425 ++++++------------ 1 file changed, 148 insertions(+), 277 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll index a193d4e4e689f..7e45136372b6c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll @@ -7,50 +7,42 @@ ; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+f,+d,+m -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV32,RV32M ; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+f,+d,+m -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV64,RV64M -define i8 @extractelt_v16i8(ptr %x) nounwind { +define i8 @extractelt_v16i8(<16 x i8> %a) nounwind { ; CHECK-LABEL: extractelt_v16i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v8, 7 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %a = load <16 x i8>, ptr %x %b = extractelement <16 x i8> %a, i32 7 ret i8 %b } -define i16 @extractelt_v8i16(ptr %x) nounwind { +define i16 @extractelt_v8i16(<8 x i16> %a) nounwind { ; CHECK-LABEL: extractelt_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v8, 7 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %a = load <8 x i16>, ptr %x %b = extractelement <8 x i16> %a, i32 7 ret i16 %b } -define i32 @extractelt_v4i32(ptr %x) nounwind { +define i32 @extractelt_v4i32(<4 x i32> %a) nounwind { ; CHECK-LABEL: extractelt_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v8, 2 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %a = load <4 x i32>, ptr %x %b = extractelement <4 x i32> %a, i32 2 ret i32 %b } -define i64 @extractelt_v2i64(ptr %x) nounwind { +define i64 @extractelt_v2i64(<2 x i64> %a) nounwind { ; RV32-LABEL: 
extractelt_v2i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: li a0, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v9, v8, a0 @@ -60,126 +52,104 @@ define i64 @extractelt_v2i64(ptr %x) nounwind { ; ; RV64-LABEL: extractelt_v2i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret - %a = load <2 x i64>, ptr %x %b = extractelement <2 x i64> %a, i32 0 ret i64 %b } -define bfloat @extractelt_v8bf16(ptr %x) nounwind { +define bfloat @extractelt_v8bf16(<8 x bfloat> %a) nounwind { ; CHECK-LABEL: extractelt_v8bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v8, 7 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: fmv.h.x fa0, a0 ; CHECK-NEXT: ret - %a = load <8 x bfloat>, ptr %x %b = extractelement <8 x bfloat> %a, i32 7 ret bfloat %b } -define half @extractelt_v8f16(ptr %x) nounwind { +define half @extractelt_v8f16(<8 x half> %a) nounwind { ; ZVFH-LABEL: extractelt_v8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFH-NEXT: vle16.v v8, (a0) +; ZVFH-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; ZVFH-NEXT: vslidedown.vi v8, v8, 7 ; ZVFH-NEXT: vfmv.f.s fa0, v8 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: extractelt_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; ZVFHMIN-NEXT: vslidedown.vi v8, v8, 7 ; ZVFHMIN-NEXT: vmv.x.s a0, v8 ; ZVFHMIN-NEXT: fmv.h.x fa0, a0 ; ZVFHMIN-NEXT: ret - %a = load <8 x half>, ptr %x %b = extractelement <8 x half> %a, i32 7 ret half %b } -define float @extractelt_v4f32(ptr %x) nounwind { +define float @extractelt_v4f32(<4 x float> %a) nounwind { ; CHECK-LABEL: extractelt_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v8, 2 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret - %a = load <4 x float>, ptr %x %b = extractelement <4 x float> %a, i32 2 ret float %b } -define double @extractelt_v2f64(ptr %x) nounwind { +define double @extractelt_v2f64(<2 x double> %a) nounwind { ; CHECK-LABEL: extractelt_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret - %a = load <2 x double>, ptr %x %b = extractelement <2 x double> %a, i32 0 ret double %b } -define i8 @extractelt_v32i8(ptr %x) nounwind { +define i8 @extractelt_v32i8(<32 x i8> %a) nounwind { ; CHECK-LABEL: extractelt_v32i8: ; CHECK: # %bb.0: -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v8, 7 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %a = load <32 x i8>, ptr %x %b = extractelement <32 x i8> %a, i32 7 ret i8 %b } -define i16 @extractelt_v16i16(ptr %x) nounwind { +define i16 @extractelt_v16i16(<16 x i16> %a) nounwind { ; CHECK-LABEL: extractelt_v16i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; CHECK-NEXT: vslidedown.vi 
v8, v8, 7 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %a = load <16 x i16>, ptr %x %b = extractelement <16 x i16> %a, i32 7 ret i16 %b } -define i32 @extractelt_v8i32(ptr %x) nounwind { +define i32 @extractelt_v8i32(<8 x i32> %a) nounwind { ; CHECK-LABEL: extractelt_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v8, 6 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %a = load <8 x i32>, ptr %x %b = extractelement <8 x i32> %a, i32 6 ret i32 %b } -define i64 @extractelt_v4i64(ptr %x) nounwind { +define i64 @extractelt_v4i64(<4 x i64> %a) nounwind { ; RV32-LABEL: extractelt_v4i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vslidedown.vi v8, v8, 3 ; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma +; RV32-NEXT: vslidedown.vi v8, v8, 3 +; RV32-NEXT: li a0, 32 ; RV32-NEXT: vsrl.vx v10, v8, a0 ; RV32-NEXT: vmv.x.s a1, v10 ; RV32-NEXT: vmv.x.s a0, v8 @@ -187,36 +157,29 @@ define i64 @extractelt_v4i64(ptr %x) nounwind { ; ; RV64-LABEL: extractelt_v4i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma ; RV64-NEXT: vslidedown.vi v8, v8, 3 ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret - %a = load <4 x i64>, ptr %x %b = extractelement <4 x i64> %a, i32 3 ret i64 %b } -define bfloat @extractelt_v16bf16(ptr %x) nounwind { +define bfloat @extractelt_v16bf16(<16 x bfloat> %a) nounwind { ; CHECK-LABEL: extractelt_v16bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v8, 7 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: fmv.h.x fa0, a0 ; CHECK-NEXT: ret - %a = load <16 x bfloat>, ptr %x %b = extractelement <16 x bfloat> %a, i32 7 ret bfloat %b } -define half @extractelt_v16f16(ptr %x) nounwind { +define half @extractelt_v16f16(<16 x half> %a) nounwind { ; ZVFH-LABEL: extractelt_v16f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFH-NEXT: vle16.v v8, (a0) ; ZVFH-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; ZVFH-NEXT: vslidedown.vi v8, v8, 7 ; ZVFH-NEXT: vfmv.f.s fa0, v8 @@ -224,40 +187,32 @@ define half @extractelt_v16f16(ptr %x) nounwind { ; ; ZVFHMIN-LABEL: extractelt_v16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a0) ; ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; ZVFHMIN-NEXT: vslidedown.vi v8, v8, 7 ; ZVFHMIN-NEXT: vmv.x.s a0, v8 ; ZVFHMIN-NEXT: fmv.h.x fa0, a0 ; ZVFHMIN-NEXT: ret - %a = load <16 x half>, ptr %x %b = extractelement <16 x half> %a, i32 7 ret half %b } -define float @extractelt_v8f32(ptr %x) nounwind { +define float @extractelt_v8f32(<8 x float> %a) nounwind { ; CHECK-LABEL: extractelt_v8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v8, 2 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret - %a = load <8 x float>, ptr %x %b = extractelement <8 x float> %a, i32 2 ret float %b } -define double @extractelt_v4f64(ptr %x) nounwind { +define double @extractelt_v4f64(<4 x double> %a) nounwind { ; CHECK-LABEL: extractelt_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) 
+; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret - %a = load <4 x double>, ptr %x %b = extractelement <4 x double> %a, i32 0 ret double %b } @@ -266,11 +221,9 @@ define double @extractelt_v4f64(ptr %x) nounwind { ; incorrect use of getSimpleValueType(). ; NOTE: Type legalization is bitcasting to vXi32 and doing 2 independent ; slidedowns and extracts. -define i64 @extractelt_v3i64(ptr %x) nounwind { +define i64 @extractelt_v3i64(<3 x i64> %a) nounwind { ; RV32-LABEL: extractelt_v3i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 3, e64, m2, ta, ma -; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32-NEXT: vslidedown.vi v10, v8, 4 ; RV32-NEXT: vslidedown.vi v8, v8, 5 @@ -280,18 +233,16 @@ define i64 @extractelt_v3i64(ptr %x) nounwind { ; ; RV64-LABEL: extractelt_v3i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 3, e64, m2, ta, ma -; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma ; RV64-NEXT: vslidedown.vi v8, v8, 2 ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret - %a = load <3 x i64>, ptr %x %b = extractelement <3 x i64> %a, i32 2 ret i64 %b } ; A LMUL8 type -define i32 @extractelt_v32i32(ptr %x) nounwind { +define i32 @extractelt_v32i32(<32 x i32> %a) nounwind { ; RV32-LABEL: extractelt_v32i32: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -256 @@ -299,11 +250,10 @@ define i32 @extractelt_v32i32(ptr %x) nounwind { ; RV32-NEXT: sw s0, 248(sp) # 4-byte Folded Spill ; RV32-NEXT: addi s0, sp, 256 ; RV32-NEXT: andi sp, sp, -128 -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: mv a0, sp -; RV32-NEXT: vse32.v v8, (a0) +; RV32-NEXT: li a0, 32 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: lw a0, 124(sp) ; RV32-NEXT: addi sp, s0, -256 ; RV32-NEXT: lw ra, 252(sp) # 4-byte Folded Reload @@ -318,24 +268,22 @@ define i32 @extractelt_v32i32(ptr %x) nounwind { ; RV64-NEXT: sd s0, 240(sp) # 8-byte Folded Spill ; RV64-NEXT: addi s0, sp, 256 ; RV64-NEXT: andi sp, sp, -128 -; RV64-NEXT: li a1, 32 -; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: mv a0, sp -; RV64-NEXT: vse32.v v8, (a0) +; RV64-NEXT: li a0, 32 +; RV64-NEXT: mv a1, sp +; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV64-NEXT: vse32.v v8, (a1) ; RV64-NEXT: lw a0, 124(sp) ; RV64-NEXT: addi sp, s0, -256 ; RV64-NEXT: ld ra, 248(sp) # 8-byte Folded Reload ; RV64-NEXT: ld s0, 240(sp) # 8-byte Folded Reload ; RV64-NEXT: addi sp, sp, 256 ; RV64-NEXT: ret - %a = load <32 x i32>, ptr %x %b = extractelement <32 x i32> %a, i32 31 ret i32 %b } ; Exercise type legalization for type beyond LMUL8 -define i32 @extractelt_v64i32(ptr %x) nounwind { +define i32 @extractelt_v64i32(<64 x i32> %a) nounwind { ; RV32-LABEL: extractelt_v64i32: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -256 @@ -343,12 +291,10 @@ define i32 @extractelt_v64i32(ptr %x) nounwind { ; RV32-NEXT: sw s0, 248(sp) # 4-byte Folded Spill ; RV32-NEXT: addi s0, sp, 256 ; RV32-NEXT: andi sp, sp, -128 -; RV32-NEXT: addi a0, a0, 128 -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: mv a0, sp -; RV32-NEXT: vse32.v v8, (a0) +; RV32-NEXT: li a0, 32 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vse32.v v16, (a1) ; RV32-NEXT: lw a0, 124(sp) ; RV32-NEXT: addi sp, s0, -256 ; RV32-NEXT: lw ra, 252(sp) # 4-byte Folded 
Reload @@ -363,315 +309,275 @@ define i32 @extractelt_v64i32(ptr %x) nounwind { ; RV64-NEXT: sd s0, 240(sp) # 8-byte Folded Spill ; RV64-NEXT: addi s0, sp, 256 ; RV64-NEXT: andi sp, sp, -128 -; RV64-NEXT: addi a0, a0, 128 -; RV64-NEXT: li a1, 32 -; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: mv a0, sp -; RV64-NEXT: vse32.v v8, (a0) +; RV64-NEXT: li a0, 32 +; RV64-NEXT: mv a1, sp +; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV64-NEXT: vse32.v v16, (a1) ; RV64-NEXT: lw a0, 124(sp) ; RV64-NEXT: addi sp, s0, -256 ; RV64-NEXT: ld ra, 248(sp) # 8-byte Folded Reload ; RV64-NEXT: ld s0, 240(sp) # 8-byte Folded Reload ; RV64-NEXT: addi sp, sp, 256 ; RV64-NEXT: ret - %a = load <64 x i32>, ptr %x %b = extractelement <64 x i32> %a, i32 63 ret i32 %b } -define i8 @extractelt_v16i8_idx(ptr %x, i32 zeroext %idx) nounwind { +define i8 @extractelt_v16i8_idx(<16 x i8> %a, i32 zeroext %idx) nounwind { ; CHECK-LABEL: extractelt_v16i8_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vslidedown.vx v8, v8, a1 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %a = load <16 x i8>, ptr %x %b = extractelement <16 x i8> %a, i32 %idx ret i8 %b } -define i16 @extractelt_v8i16_idx(ptr %x, i32 zeroext %idx) nounwind { +define i16 @extractelt_v8i16_idx(<8 x i16> %a, i32 zeroext %idx) nounwind { ; CHECK-LABEL: extractelt_v8i16_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vslidedown.vx v8, v8, a1 +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %a = load <8 x i16>, ptr %x %b = extractelement <8 x i16> %a, i32 %idx ret i16 %b } -define i32 @extractelt_v4i32_idx(ptr %x, i32 zeroext %idx) nounwind { +define i32 @extractelt_v4i32_idx(<4 x i32> %a, i32 zeroext %idx) nounwind { ; CHECK-LABEL: extractelt_v4i32_idx: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vslidedown.vx v8, v8, a1 +; CHECK-NEXT: vslidedown.vx v8, v8, a0 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %a = load <4 x i32>, ptr %x %b = add <4 x i32> %a, %a %c = extractelement <4 x i32> %b, i32 %idx ret i32 %c } -define i64 @extractelt_v2i64_idx(ptr %x, i32 zeroext %idx) nounwind { +define i64 @extractelt_v2i64_idx(<2 x i64> %a, i32 zeroext %idx) nounwind { ; RV32-LABEL: extractelt_v2i64_idx: ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: li a2, 32 ; RV32-NEXT: vadd.vv v8, v8, v8 -; RV32-NEXT: vslidedown.vx v8, v8, a1 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vslidedown.vx v8, v8, a0 ; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vsrl.vx v8, v8, a2 +; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: extractelt_v2i64_idx: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: vadd.vv v8, v8, v8 -; RV64-NEXT: vslidedown.vx v8, v8, a1 +; RV64-NEXT: vslidedown.vx v8, v8, a0 ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret - %a = load <2 x i64>, ptr %x %b = add <2 x i64> %a, %a %c = extractelement <2 x i64> %b, i32 %idx ret i64 %c } -define bfloat @extractelt_v8bf16_idx(ptr %x, i32 zeroext %idx) nounwind { +define 
bfloat @extractelt_v8bf16_idx(<8 x bfloat> %a, i32 zeroext %idx) nounwind { ; CHECK-LABEL: extractelt_v8bf16_idx: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfadd.vv v8, v10, v10 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 -; CHECK-NEXT: vslidedown.vx v8, v10, a1 +; CHECK-NEXT: vslidedown.vx v8, v10, a0 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: fmv.h.x fa0, a0 ; CHECK-NEXT: ret - %a = load <8 x bfloat>, ptr %x %b = fadd <8 x bfloat> %a, %a %c = extractelement <8 x bfloat> %b, i32 %idx ret bfloat %c } -define half @extractelt_v8f16_idx(ptr %x, i32 zeroext %idx) nounwind { +define half @extractelt_v8f16_idx(<8 x half> %a, i32 zeroext %idx) nounwind { ; ZVFH-LABEL: extractelt_v8f16_idx: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFH-NEXT: vle16.v v8, (a0) ; ZVFH-NEXT: vfadd.vv v8, v8, v8 -; ZVFH-NEXT: vslidedown.vx v8, v8, a1 +; ZVFH-NEXT: vslidedown.vx v8, v8, a0 ; ZVFH-NEXT: vfmv.f.s fa0, v8 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: extractelt_v8f16_idx: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a0) ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v8, v10, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v8 -; ZVFHMIN-NEXT: vslidedown.vx v8, v10, a1 +; ZVFHMIN-NEXT: vslidedown.vx v8, v10, a0 ; ZVFHMIN-NEXT: vmv.x.s a0, v8 ; ZVFHMIN-NEXT: fmv.h.x fa0, a0 ; ZVFHMIN-NEXT: ret - %a = load <8 x half>, ptr %x %b = fadd <8 x half> %a, %a %c = extractelement <8 x half> %b, i32 %idx ret half %c } -define float @extractelt_v4f32_idx(ptr %x, i32 zeroext %idx) nounwind { +define float @extractelt_v4f32_idx(<4 x float> %a, i32 zeroext %idx) nounwind { ; CHECK-LABEL: extractelt_v4f32_idx: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8 -; CHECK-NEXT: vslidedown.vx v8, v8, a1 +; CHECK-NEXT: vslidedown.vx v8, v8, a0 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret - %a = load <4 x float>, ptr %x %b = fadd <4 x float> %a, %a %c = extractelement <4 x float> %b, i32 %idx ret float %c } -define double @extractelt_v2f64_idx(ptr %x, i32 zeroext %idx) nounwind { +define double @extractelt_v2f64_idx(<2 x double> %a, i32 zeroext %idx) nounwind { ; CHECK-LABEL: extractelt_v2f64_idx: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8 -; CHECK-NEXT: vslidedown.vx v8, v8, a1 +; CHECK-NEXT: vslidedown.vx v8, v8, a0 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret - %a = load <2 x double>, ptr %x %b = fadd <2 x double> %a, %a %c = extractelement <2 x double> %b, i32 %idx ret double %c } -define i8 @extractelt_v32i8_idx(ptr %x, i32 zeroext %idx) nounwind { +define i8 @extractelt_v32i8_idx(<32 x i8> %a, i32 zeroext %idx) nounwind { ; CHECK-LABEL: extractelt_v32i8_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vslidedown.vx v8, v8, a1 +; CHECK-NEXT: vsetivli zero, 1, e8, m2, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %a = load <32 x i8>, ptr %x %b = extractelement <32 x i8> %a, i32 %idx ret i8 %b } -define i16 @extractelt_v16i16_idx(ptr 
%x, i32 zeroext %idx) nounwind { +define i16 @extractelt_v16i16_idx(<16 x i16> %a, i32 zeroext %idx) nounwind { ; CHECK-LABEL: extractelt_v16i16_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vslidedown.vx v8, v8, a1 +; CHECK-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %a = load <16 x i16>, ptr %x %b = extractelement <16 x i16> %a, i32 %idx ret i16 %b } -define i32 @extractelt_v8i32_idx(ptr %x, i32 zeroext %idx) nounwind { +define i32 @extractelt_v8i32_idx(<8 x i32> %a, i32 zeroext %idx) nounwind { ; CHECK-LABEL: extractelt_v8i32_idx: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vslidedown.vx v8, v8, a1 +; CHECK-NEXT: vslidedown.vx v8, v8, a0 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %a = load <8 x i32>, ptr %x %b = add <8 x i32> %a, %a %c = extractelement <8 x i32> %b, i32 %idx ret i32 %c } -define i64 @extractelt_v4i64_idx(ptr %x, i32 zeroext %idx) nounwind { +define i64 @extractelt_v4i64_idx(<4 x i64> %a, i32 zeroext %idx) nounwind { ; RV32-LABEL: extractelt_v4i64_idx: ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: li a2, 32 ; RV32-NEXT: vadd.vv v8, v8, v8 -; RV32-NEXT: vslidedown.vx v8, v8, a1 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vslidedown.vx v8, v8, a0 ; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma -; RV32-NEXT: vsrl.vx v8, v8, a2 +; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: extractelt_v4i64_idx: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: vadd.vv v8, v8, v8 -; RV64-NEXT: vslidedown.vx v8, v8, a1 +; RV64-NEXT: vslidedown.vx v8, v8, a0 ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret - %a = load <4 x i64>, ptr %x %b = add <4 x i64> %a, %a %c = extractelement <4 x i64> %b, i32 %idx ret i64 %c } -define bfloat @extractelt_v16bf16_idx(ptr %x, i32 zeroext %idx) nounwind { +define bfloat @extractelt_v16bf16_idx(<16 x bfloat> %a, i32 zeroext %idx) nounwind { ; CHECK-LABEL: extractelt_v16bf16_idx: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfadd.vv v8, v12, v12 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v8 -; CHECK-NEXT: vslidedown.vx v8, v12, a1 +; CHECK-NEXT: vslidedown.vx v8, v12, a0 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: fmv.h.x fa0, a0 ; CHECK-NEXT: ret - %a = load <16 x bfloat>, ptr %x %b = fadd <16 x bfloat> %a, %a %c = extractelement <16 x bfloat> %b, i32 %idx ret bfloat %c } -define half @extractelt_v16f16_idx(ptr %x, i32 zeroext %idx) nounwind { +define half @extractelt_v16f16_idx(<16 x half> %a, i32 zeroext %idx) nounwind { ; ZVFH-LABEL: extractelt_v16f16_idx: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFH-NEXT: vle16.v v8, (a0) ; ZVFH-NEXT: vfadd.vv v8, v8, v8 -; ZVFH-NEXT: vslidedown.vx v8, v8, a1 +; ZVFH-NEXT: vslidedown.vx v8, v8, a0 ; ZVFH-NEXT: vfmv.f.s fa0, v8 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: extractelt_v16f16_idx: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a0) ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, 
zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v8, v12, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v8 -; ZVFHMIN-NEXT: vslidedown.vx v8, v12, a1 +; ZVFHMIN-NEXT: vslidedown.vx v8, v12, a0 ; ZVFHMIN-NEXT: vmv.x.s a0, v8 ; ZVFHMIN-NEXT: fmv.h.x fa0, a0 ; ZVFHMIN-NEXT: ret - %a = load <16 x half>, ptr %x %b = fadd <16 x half> %a, %a %c = extractelement <16 x half> %b, i32 %idx ret half %c } -define float @extractelt_v8f32_idx(ptr %x, i32 zeroext %idx) nounwind { +define float @extractelt_v8f32_idx(<8 x float> %a, i32 zeroext %idx) nounwind { ; CHECK-LABEL: extractelt_v8f32_idx: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8 -; CHECK-NEXT: vslidedown.vx v8, v8, a1 +; CHECK-NEXT: vslidedown.vx v8, v8, a0 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret - %a = load <8 x float>, ptr %x %b = fadd <8 x float> %a, %a %c = extractelement <8 x float> %b, i32 %idx ret float %c } -define double @extractelt_v4f64_idx(ptr %x, i32 zeroext %idx) nounwind { +define double @extractelt_v4f64_idx(<4 x double> %a, i32 zeroext %idx) nounwind { ; CHECK-LABEL: extractelt_v4f64_idx: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8 -; CHECK-NEXT: vslidedown.vx v8, v8, a1 +; CHECK-NEXT: vslidedown.vx v8, v8, a0 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret - %a = load <4 x double>, ptr %x %b = fadd <4 x double> %a, %a %c = extractelement <4 x double> %b, i32 %idx ret double %c @@ -681,32 +587,27 @@ define double @extractelt_v4f64_idx(ptr %x, i32 zeroext %idx) nounwind { ; incorrect use of getSimpleValueType_idx(, i32 zeroext %idx). ; NOTE: Type legalization is bitcasting to vXi32 and doing 2 independent ; slidedowns and extracts. 
-define i64 @extractelt_v3i64_idx(ptr %x, i32 zeroext %idx) nounwind { +define i64 @extractelt_v3i64_idx(<3 x i64> %a, i32 zeroext %idx) nounwind { ; RV32-LABEL: extractelt_v3i64_idx: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 3, e64, m2, ta, ma -; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: add a1, a1, a1 -; RV32-NEXT: addi a0, a1, 1 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-NEXT: vadd.vv v8, v8, v8 +; RV32-NEXT: add a0, a0, a0 ; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32-NEXT: vslidedown.vx v10, v8, a1 -; RV32-NEXT: vslidedown.vx v8, v8, a0 +; RV32-NEXT: vslidedown.vx v10, v8, a0 +; RV32-NEXT: addi a1, a0, 1 ; RV32-NEXT: vmv.x.s a0, v10 +; RV32-NEXT: vslidedown.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: extractelt_v3i64_idx: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 3, e64, m2, ta, ma -; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV64-NEXT: vadd.vv v8, v8, v8 -; RV64-NEXT: vslidedown.vx v8, v8, a1 +; RV64-NEXT: vslidedown.vx v8, v8, a0 ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret - %a = load <3 x i64>, ptr %x %b = add <3 x i64> %a, %a %c = extractelement <3 x i64> %b, i32 %idx ret i64 %c @@ -818,7 +719,7 @@ define i32 @extractelt_v32i32_idx(ptr %x, i32 zeroext %idx) nounwind { ret i32 %c } -define i32 @extractelt_v64i32_idx(ptr %x, i32 zeroext %idx) nounwind { +define i32 @extractelt_v64i32_idx(<64 x i32> %a, i32 zeroext %idx) nounwind { ; RV32-LABEL: extractelt_v64i32_idx: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -384 @@ -826,21 +727,18 @@ define i32 @extractelt_v64i32_idx(ptr %x, i32 zeroext %idx) nounwind { ; RV32-NEXT: sw s0, 376(sp) # 4-byte Folded Spill ; RV32-NEXT: addi s0, sp, 384 ; RV32-NEXT: andi sp, sp, -128 -; RV32-NEXT: andi a1, a1, 63 -; RV32-NEXT: mv a2, sp -; RV32-NEXT: li a3, 32 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: addi a0, a0, 128 -; RV32-NEXT: vle32.v v16, (a0) -; RV32-NEXT: addi a0, sp, 128 -; RV32-NEXT: slli a1, a1, 2 -; RV32-NEXT: add a1, a2, a1 -; RV32-NEXT: vadd.vv v16, v16, v16 +; RV32-NEXT: andi a0, a0, 63 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: li a2, 32 +; RV32-NEXT: addi a3, sp, 128 +; RV32-NEXT: slli a0, a0, 2 +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vadd.vv v8, v8, v8 -; RV32-NEXT: vse32.v v8, (a2) -; RV32-NEXT: vse32.v v16, (a0) -; RV32-NEXT: lw a0, 0(a1) +; RV32-NEXT: vadd.vv v16, v16, v16 +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: vse32.v v16, (a3) +; RV32-NEXT: vse32.v v8, (a1) +; RV32-NEXT: lw a0, 0(a0) ; RV32-NEXT: addi sp, s0, -384 ; RV32-NEXT: lw ra, 380(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 376(sp) # 4-byte Folded Reload @@ -854,117 +752,96 @@ define i32 @extractelt_v64i32_idx(ptr %x, i32 zeroext %idx) nounwind { ; RV64-NEXT: sd s0, 368(sp) # 8-byte Folded Spill ; RV64-NEXT: addi s0, sp, 384 ; RV64-NEXT: andi sp, sp, -128 -; RV64-NEXT: andi a1, a1, 63 -; RV64-NEXT: mv a2, sp -; RV64-NEXT: li a3, 32 -; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: addi a0, a0, 128 -; RV64-NEXT: vle32.v v16, (a0) -; RV64-NEXT: addi a0, sp, 128 -; RV64-NEXT: slli a1, a1, 2 -; RV64-NEXT: add a1, a2, a1 -; RV64-NEXT: vadd.vv v16, v16, v16 +; RV64-NEXT: andi a0, a0, 63 +; RV64-NEXT: mv a1, sp +; RV64-NEXT: li a2, 32 +; RV64-NEXT: addi a3, sp, 128 +; RV64-NEXT: slli a0, a0, 2 +; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV64-NEXT: vadd.vv v8, v8, v8 -; RV64-NEXT: vse32.v v8, (a2) -; RV64-NEXT: vse32.v v16, (a0) -; RV64-NEXT: lw a0, 0(a1) +; 
RV64-NEXT: vadd.vv v16, v16, v16 +; RV64-NEXT: add a0, a1, a0 +; RV64-NEXT: vse32.v v16, (a3) +; RV64-NEXT: vse32.v v8, (a1) +; RV64-NEXT: lw a0, 0(a0) ; RV64-NEXT: addi sp, s0, -384 ; RV64-NEXT: ld ra, 376(sp) # 8-byte Folded Reload ; RV64-NEXT: ld s0, 368(sp) # 8-byte Folded Reload ; RV64-NEXT: addi sp, sp, 384 ; RV64-NEXT: ret - %a = load <64 x i32>, ptr %x %b = add <64 x i32> %a, %a %c = extractelement <64 x i32> %b, i32 %idx ret i32 %c } -define void @store_extractelt_v16i8(ptr %x, ptr %p) nounwind { +define void @store_extractelt_v16i8(<16 x i8> %a, ptr %p) nounwind { ; CHECK-LABEL: store_extractelt_v16i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vslidedown.vi v8, v8, 7 ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; CHECK-NEXT: vse8.v v8, (a1) +; CHECK-NEXT: vslidedown.vi v8, v8, 7 +; CHECK-NEXT: vse8.v v8, (a0) ; CHECK-NEXT: ret - %a = load <16 x i8>, ptr %x %b = extractelement <16 x i8> %a, i32 7 store i8 %b, ptr %p ret void } -define void @store_extractelt_v8i16(ptr %x, ptr %p) nounwind { +define void @store_extractelt_v8i16(<8 x i16> %a, ptr %p) nounwind { ; CHECK-LABEL: store_extractelt_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vslidedown.vi v8, v8, 7 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vse16.v v8, (a1) +; CHECK-NEXT: vslidedown.vi v8, v8, 7 +; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret - %a = load <8 x i16>, ptr %x %b = extractelement <8 x i16> %a, i32 7 store i16 %b, ptr %p ret void } -define void @store_extractelt_v4i32(ptr %x, ptr %p) nounwind { +define void @store_extractelt_v4i32(<4 x i32> %a, ptr %p) nounwind { ; CHECK-LABEL: store_extractelt_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vslidedown.vi v8, v8, 2 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vse32.v v8, (a1) +; CHECK-NEXT: vslidedown.vi v8, v8, 2 +; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: ret - %a = load <4 x i32>, ptr %x %b = extractelement <4 x i32> %a, i32 2 store i32 %b, ptr %p ret void } ; FIXME: Use vse64.v on RV32 to avoid two scalar extracts and two scalar stores. 
-define void @store_extractelt_v2i64(ptr %x, ptr %p) nounwind {
+define void @store_extractelt_v2i64(<2 x i64> %a, ptr %p) nounwind {
 ; RV32-LABEL: store_extractelt_v2i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    li a0, 32
-; RV32-NEXT:    vslidedown.vi v8, v8, 1
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vx v9, v8, a0
-; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vslidedown.vi v8, v8, 1
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsrl.vx v9, v8, a1
+; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    vmv.x.s a2, v9
-; RV32-NEXT:    sw a0, 0(a1)
-; RV32-NEXT:    sw a2, 4(a1)
+; RV32-NEXT:    sw a1, 0(a0)
+; RV32-NEXT:    sw a2, 4(a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: store_extractelt_v2i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT:    vle64.v v8, (a0)
-; RV64-NEXT:    vslidedown.vi v8, v8, 1
 ; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT:    vse64.v v8, (a1)
+; RV64-NEXT:    vslidedown.vi v8, v8, 1
+; RV64-NEXT:    vse64.v v8, (a0)
 ; RV64-NEXT:    ret
-  %a = load <2 x i64>, ptr %x
   %b = extractelement <2 x i64> %a, i64 1
   store i64 %b, ptr %p
   ret void
 }

-define void @store_extractelt_v2f64(ptr %x, ptr %p) nounwind {
+define void @store_extractelt_v2f64(<2 x double> %a, ptr %p) nounwind {
 ; CHECK-LABEL: store_extractelt_v2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    vslidedown.vi v8, v8, 1
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vse64.v v8, (a1)
+; CHECK-NEXT:    vslidedown.vi v8, v8, 1
+; CHECK-NEXT:    vse64.v v8, (a0)
 ; CHECK-NEXT:    ret
-  %a = load <2 x double>, ptr %x
   %b = extractelement <2 x double> %a, i64 1
   store double %b, ptr %p
   ret void
 }
@@ -1246,30 +1123,24 @@ define float @extractelt_fdiv_v4f32(<4 x float> %x) {
   ret float %ext
 }

-define i32 @extractelt_v16i32_idx7_exact_vlen(ptr %x) nounwind vscale_range(2,2) {
+define i32 @extractelt_v16i32_idx7_exact_vlen(<16 x i32> %a) nounwind vscale_range(2,2) {
 ; CHECK-LABEL: extractelt_v16i32_idx7_exact_vlen:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl1re32.v v8, (a0)
 ; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v8, v8, 3
+; CHECK-NEXT:    vslidedown.vi v8, v9, 3
 ; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
-  %a = load <16 x i32>, ptr %x
   %b = extractelement <16 x i32> %a, i32 7
   ret i32 %b
 }

-define i32 @extractelt_v16i32_idx15_exact_vlen(ptr %x) nounwind vscale_range(2,2) {
+define i32 @extractelt_v16i32_idx15_exact_vlen(<16 x i32> %a) nounwind vscale_range(2,2) {
 ; CHECK-LABEL: extractelt_v16i32_idx15_exact_vlen:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi a0, a0, 48
-; CHECK-NEXT:    vl1re32.v v8, (a0)
 ; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v8, v8, 3
+; CHECK-NEXT:    vslidedown.vi v8, v11, 3
 ; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
-  %a = load <16 x i32>, ptr %x
   %b = extractelement <16 x i32> %a, i32 15
   ret i32 %b
 }

From 58708151ecaab8786c58b20eefc548dbdb23c8cc Mon Sep 17 00:00:00 2001
From: Hongren Zheng
Date: Tue, 14 Jan 2025 10:19:15 +0800
Subject: [PATCH 359/408] [mlir][docs] Guide on generating alias for
 type/attribute (#121698)

This is part of
https://discourse.llvm.org/t/rfc-introduce-opasm-type-attr-interface-for-pretty-print-in-asmprinter/83792.

Verbose printing of commonly used types/attributes that are long can
severely reduce the readability of the resulting assembly, and it has
been asked several times on the LLVM Discourse how to generate aliases.
Cc @ftynse

### Discussion

* I am not sure where to put the markdown, so I put it in `mlir/docs/`.
* Documentation on `OpAsmOpInterface` (controlling `AsmResultName`/`BlockArgName`/etc)
  could also be added in this markdown, so I used the title
  `Customizing AsmPrinter Behavior` and left further PRs to update the content.

---
 mlir/docs/DefiningDialects/Assembly.md | 51 ++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 mlir/docs/DefiningDialects/Assembly.md

diff --git a/mlir/docs/DefiningDialects/Assembly.md b/mlir/docs/DefiningDialects/Assembly.md
new file mode 100644
index 0000000000000..d69349390ee3e
--- /dev/null
+++ b/mlir/docs/DefiningDialects/Assembly.md
@@ -0,0 +1,51 @@
+# Customizing Assembly Behavior
+
+[TOC]
+
+## Generating Aliases
+
+To reduce verbosity in the resulting assembly, `AsmPrinter` can generate aliases for frequently used types and attributes.
+
+For example, `!my_dialect.type` and `#my_dialect.attr` can be aliased to `!my_dialect_type` and `#my_dialect_attr`, simplifying further references.
+
+To enable this, the owning dialect of these types/attributes can define an interface to hook into the `AsmPrinter`. This is effective only when the assembly is not printed in generic form.
+
+```cpp
+// OpAsmDialectInterface is defined in
+// https://github.com/llvm/llvm-project/blob/91ab10e8d6c256d841da1a1a1b47c334e08d95b9/mlir/include/mlir/IR/OpImplementation.h#L1738
+struct MyDialectOpAsmDialectInterface : public OpAsmDialectInterface {
+ public:
+  using OpAsmDialectInterface::OpAsmDialectInterface;
+
+  AliasResult getAlias(Type type, raw_ostream& os) const override {
+    if (mlir::isa<MyType>(type)) {
+      os << "my_dialect_type";
+      // Could return OverridableAlias when
+      // allowing other dialect to override the alias.
+      //
+      // Other dialects are allowed to provide alias for
+      // type/attribute not owned by them
+      // but the final result would depend on the registration order
+      // of these dialects in the MLIRContext
+      return AliasResult::FinalAlias;
+    }
+    return AliasResult::NoAlias;
+  }
+
+  AliasResult getAlias(Attribute attr, raw_ostream& os) const override {
+    if (mlir::isa<MyAttribute>(attr)) {
+      os << "my_dialect_attr";
+      return AliasResult::FinalAlias;
+    }
+    return AliasResult::NoAlias;
+  }
+};
+
+void MyDialect::initialize() {
+  // register the interface to the dialect
+  addInterface<MyDialectOpAsmDialectInterface>();
+}
+```
+
+* If `getAlias` provides an alias with a trailing digit, `AsmPrinter` appends an underscore to avoid conflicts with autogenerated IDs.
+* If multiple types/attributes have the same alias from `getAlias`, a number is appended to the alias to avoid conflicts.
\ No newline at end of file

From 7c51c310ad9a50e721e5f17f2f27f066a0d77b80 Mon Sep 17 00:00:00 2001
From: vporpo
Date: Mon, 13 Jan 2025 18:25:25 -0800
Subject: [PATCH 360/408] [SandboxVec][BottomUpVec] Clean up dead address
 instrs (#122536)

When we vectorize loads or stores, we only keep the address of the
first lane; the rest may become dead. This patch adds the address
operands of vectorized loads or stores to the dead candidates set, so
that they get erased if dead.
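As an editorial illustration (a minimal sketch mirroring the `store_load` test updated below, not additional patch content), before vectorization there is one address computation per lane; afterwards only the first lane's address is used:

```llvm
; Two adjacent scalar float loads/stores rooted at %ptr.
define void @store_load(ptr %ptr) {
  %ptr0 = getelementptr float, ptr %ptr, i32 0
  %ptr1 = getelementptr float, ptr %ptr, i32 1
  %ld0 = load float, ptr %ptr0, align 4
  %ld1 = load float, ptr %ptr1, align 4
  store float %ld0, ptr %ptr0, align 4
  store float %ld1, ptr %ptr1, align 4
  ret void
}
; After vectorization the accesses become a single <2 x float> load and
; store through %ptr0. %ptr1 then has no users left, so with this patch
; it is added to DeadInstrCandidates and erased.
```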
---
 .../SandboxVectorizer/Passes/BottomUpVec.h     |  3 +-
 .../SandboxVectorizer/Passes/BottomUpVec.cpp   | 40 +++++++++++++++----
 .../SandboxVectorizer/bottomup_basic.ll        |  9 -----
 .../SandboxVectorizer/bottomup_seed_slice.ll   |  1 -
 .../bottomup_seed_slice_pow2.ll                |  3 --
 5 files changed, 35 insertions(+), 21 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h
index bd45634814b07..1a53ca6e06f5f 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h
@@ -25,7 +25,7 @@ namespace llvm::sandboxir {
 class BottomUpVec final : public FunctionPass {
   bool Change = false;
   std::unique_ptr<LegalityAnalysis> Legality;
-  SmallVector<Instruction *> DeadInstrCandidates;
+  DenseSet<Instruction *> DeadInstrCandidates;

   /// Creates and returns a vector instruction that replaces the instructions in
   /// \p Bndl. \p Operands are the already vectorized operands.
@@ -35,6 +35,7 @@ class BottomUpVec final : public FunctionPass {
   void tryEraseDeadInstrs();
   /// Packs all elements of \p ToPack into a vector and returns that vector.
   Value *createPack(ArrayRef<Value *> ToPack);
+  void collectPotentiallyDeadInstrs(ArrayRef<Value *> Bndl);
   /// Recursively try to vectorize \p Bndl and its operands.
   Value *vectorizeRec(ArrayRef<Value *> Bndl, unsigned Depth);
   /// Entry point for vectorization starting from \p Seeds.
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp
index 18e072c17d202..d44199609838d 100644
--- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp
+++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp
@@ -157,9 +157,11 @@ Value *BottomUpVec::createVectorInstr(ArrayRef<Value *> Bndl,

 void BottomUpVec::tryEraseDeadInstrs() {
   // Visiting the dead instructions bottom-to-top.
-  sort(DeadInstrCandidates,
+  SmallVector<Instruction *> SortedDeadInstrCandidates(
+      DeadInstrCandidates.begin(), DeadInstrCandidates.end());
+  sort(SortedDeadInstrCandidates,
       [](Instruction *I1, Instruction *I2) { return I1->comesBefore(I2); });
-  for (Instruction *I : reverse(DeadInstrCandidates)) {
+  for (Instruction *I : reverse(SortedDeadInstrCandidates)) {
     if (I->hasNUses(0))
       I->eraseFromParent();
   }
@@ -218,6 +220,31 @@ Value *BottomUpVec::createPack(ArrayRef<Value *> ToPack) {
   return LastInsert;
 }

+void BottomUpVec::collectPotentiallyDeadInstrs(ArrayRef<Value *> Bndl) {
+  for (Value *V : Bndl)
+    DeadInstrCandidates.insert(cast<Instruction>(V));
+  // Also collect the GEPs of vectorized loads and stores.
+  auto Opcode = cast<Instruction>(Bndl[0])->getOpcode();
+  switch (Opcode) {
+  case Instruction::Opcode::Load: {
+    for (Value *V : drop_begin(Bndl))
+      if (auto *Ptr =
+              dyn_cast<Instruction>(cast<LoadInst>(V)->getPointerOperand()))
+        DeadInstrCandidates.insert(Ptr);
+    break;
+  }
+  case Instruction::Opcode::Store: {
+    for (Value *V : drop_begin(Bndl))
+      if (auto *Ptr =
+              dyn_cast<Instruction>(cast<StoreInst>(V)->getPointerOperand()))
+        DeadInstrCandidates.insert(Ptr);
+    break;
+  }
+  default:
+    break;
+  }
+}
+
 Value *BottomUpVec::vectorizeRec(ArrayRef<Value *> Bndl, unsigned Depth) {
   Value *NewVec = nullptr;
   const auto &LegalityRes = Legality->canVectorize(Bndl);
@@ -247,11 +274,10 @@ Value *BottomUpVec::vectorizeRec(ArrayRef<Value *> Bndl, unsigned Depth) {
     }

     NewVec = createVectorInstr(Bndl, VecOperands);

-    // Collect the original scalar instructions as they may be dead.
-    if (NewVec != nullptr) {
-      for (Value *V : Bndl)
-        DeadInstrCandidates.push_back(cast<Instruction>(V));
-    }
+    // Collect any potentially dead scalar instructions, including the original
+    // scalars and pointer operands of loads/stores.
+    if (NewVec != nullptr)
+      collectPotentiallyDeadInstrs(Bndl);
     break;
   }
   case LegalityResultID::Pack: {
diff --git a/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll b/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll
index 785d1f4ef666f..d34c8f88e4b3c 100644
--- a/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll
+++ b/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll
@@ -5,7 +5,6 @@ define void @store_load(ptr %ptr) {
 ; CHECK-LABEL: define void @store_load(
 ; CHECK-SAME: ptr [[PTR:%.*]]) {
 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr float, ptr [[PTR]], i32 0
-; CHECK-NEXT: [[PTR1:%.*]] = getelementptr float, ptr [[PTR]], i32 1
 ; CHECK-NEXT: [[VECL:%.*]] = load <2 x float>, ptr [[PTR0]], align 4
 ; CHECK-NEXT: store <2 x float> [[VECL]], ptr [[PTR0]], align 4
 ; CHECK-NEXT: ret void
@@ -24,9 +23,7 @@ define void @store_fpext_load(ptr %ptr) {
 ; CHECK-LABEL: define void @store_fpext_load(
 ; CHECK-SAME: ptr [[PTR:%.*]]) {
 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr float, ptr [[PTR]], i32 0
-; CHECK-NEXT: [[PTR1:%.*]] = getelementptr float, ptr [[PTR]], i32 1
 ; CHECK-NEXT: [[PTRD0:%.*]] = getelementptr double, ptr [[PTR]], i32 0
-; CHECK-NEXT: [[PTRD1:%.*]] = getelementptr double, ptr [[PTR]], i32 1
 ; CHECK-NEXT: [[VECL:%.*]] = load <2 x float>, ptr [[PTR0]], align 4
 ; CHECK-NEXT: [[VCAST:%.*]] = fpext <2 x float> [[VECL]] to <2 x double>
 ; CHECK-NEXT: store <2 x double> [[VCAST]], ptr [[PTRD0]], align 8
@@ -49,9 +46,7 @@ define void @store_fcmp_zext_load(ptr %ptr) {
 ; CHECK-LABEL: define void @store_fcmp_zext_load(
 ; CHECK-SAME: ptr [[PTR:%.*]]) {
 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr float, ptr [[PTR]], i32 0
-; CHECK-NEXT: [[PTR1:%.*]] = getelementptr float, ptr [[PTR]], i32 1
 ; CHECK-NEXT: [[PTRB0:%.*]] = getelementptr i32, ptr [[PTR]], i32 0
-; CHECK-NEXT: [[PTRB1:%.*]] = getelementptr i32, ptr [[PTR]], i32 1
 ; CHECK-NEXT: [[VECL1:%.*]] = load <2 x float>, ptr [[PTR0]], align 4
 ; CHECK-NEXT: [[VECL:%.*]] = load <2 x float>, ptr [[PTR0]], align 4
 ; CHECK-NEXT: [[VCMP:%.*]] = fcmp ogt <2 x float> [[VECL]], [[VECL1]]
@@ -80,7 +75,6 @@ define void @store_fadd_load(ptr %ptr) {
 ; CHECK-LABEL: define void @store_fadd_load(
 ; CHECK-SAME: ptr [[PTR:%.*]]) {
 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr float, ptr [[PTR]], i32 0
-; CHECK-NEXT: [[PTR1:%.*]] = getelementptr float, ptr [[PTR]], i32 1
 ; CHECK-NEXT: [[VECL:%.*]] = load <2 x float>, ptr [[PTR0]], align 4
 ; CHECK-NEXT: [[VECL1:%.*]] = load <2 x float>, ptr [[PTR0]], align 4
 ; CHECK-NEXT: [[VEC:%.*]] = fadd <2 x float> [[VECL]], [[VECL1]]
@@ -104,7 +98,6 @@ define void @store_fneg_load(ptr %ptr) {
 ; CHECK-LABEL: define void @store_fneg_load(
 ; CHECK-SAME: ptr [[PTR:%.*]]) {
 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr float, ptr [[PTR]], i32 0
-; CHECK-NEXT: [[PTR1:%.*]] = getelementptr float, ptr [[PTR]], i32 1
 ; CHECK-NEXT: [[VECL:%.*]] = load <2 x float>, ptr [[PTR0]], align 4
 ; CHECK-NEXT: [[VEC:%.*]] = fneg <2 x float> [[VECL]]
 ; CHECK-NEXT: store <2 x float> [[VEC]], ptr [[PTR0]], align 4
@@ -147,7 +140,6 @@ define void @pack_scalars(ptr %ptr, ptr %ptr2) {
 ; CHECK-LABEL: define void @pack_scalars(
 ; CHECK-SAME: ptr [[PTR:%.*]], ptr [[PTR2:%.*]]) {
 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr float, ptr [[PTR]], i32 0
-; CHECK-NEXT: [[PTR1:%.*]] = getelementptr float, ptr [[PTR]], i32 1
 ; CHECK-NEXT: [[LD0:%.*]] = load float, ptr [[PTR0]], align 4
 ; CHECK-NEXT: [[LD1:%.*]] = load float, ptr [[PTR2]], align 4
 ; CHECK-NEXT: [[PACK:%.*]] = insertelement <2 x float> poison, float [[LD0]], i32 0
@@ -191,7 +183,6 @@ define void @pack_vectors(ptr %ptr, ptr %ptr2) {
 ; CHECK-LABEL: define void @pack_vectors(
 ; CHECK-SAME: ptr [[PTR:%.*]], ptr [[PTR2:%.*]]) {
 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr <2 x float>, ptr [[PTR]], i32 0
-; CHECK-NEXT: [[PTR1:%.*]] = getelementptr float, ptr [[PTR]], i32 2
 ; CHECK-NEXT: [[LD0:%.*]] = load <2 x float>, ptr [[PTR0]], align 8
 ; CHECK-NEXT: [[LD1:%.*]] = load float, ptr [[PTR2]], align 4
 ; CHECK-NEXT: [[VPACK:%.*]] = extractelement <2 x float> [[LD0]], i32 0
diff --git a/llvm/test/Transforms/SandboxVectorizer/bottomup_seed_slice.ll b/llvm/test/Transforms/SandboxVectorizer/bottomup_seed_slice.ll
index 46cda3c80aaa3..8459c3addaa83 100644
--- a/llvm/test/Transforms/SandboxVectorizer/bottomup_seed_slice.ll
+++ b/llvm/test/Transforms/SandboxVectorizer/bottomup_seed_slice.ll
@@ -7,7 +7,6 @@ define void @slice_seeds(ptr %ptr, float %val) {
 ; CHECK-LABEL: define void @slice_seeds(
 ; CHECK-SAME: ptr [[PTR:%.*]], float [[VAL:%.*]]) {
 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr float, ptr [[PTR]], i32 0
-; CHECK-NEXT: [[PTR1:%.*]] = getelementptr float, ptr [[PTR]], i32 1
 ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr float, ptr [[PTR]], i32 2
 ; CHECK-NEXT: [[LD2:%.*]] = load float, ptr [[PTR2]], align 4
 ; CHECK-NEXT: store float [[LD2]], ptr [[PTR2]], align 4
diff --git a/llvm/test/Transforms/SandboxVectorizer/bottomup_seed_slice_pow2.ll b/llvm/test/Transforms/SandboxVectorizer/bottomup_seed_slice_pow2.ll
index 22119c4491b92..e186d5fa86e4a 100644
--- a/llvm/test/Transforms/SandboxVectorizer/bottomup_seed_slice_pow2.ll
+++ b/llvm/test/Transforms/SandboxVectorizer/bottomup_seed_slice_pow2.ll
@@ -6,7 +6,6 @@ define void @pow2(ptr %ptr, float %val) {
 ; POW2-LABEL: define void @pow2(
 ; POW2-SAME: ptr [[PTR:%.*]], float [[VAL:%.*]]) {
 ; POW2-NEXT: [[PTR0:%.*]] = getelementptr float, ptr [[PTR]], i32 0
-; POW2-NEXT: [[PTR1:%.*]] = getelementptr float, ptr [[PTR]], i32 1
 ; POW2-NEXT: [[PTR2:%.*]] = getelementptr float, ptr [[PTR]], i32 2
 ; POW2-NEXT: [[VECL:%.*]] = load <2 x float>, ptr [[PTR0]], align 4
 ; POW2-NEXT: [[LD2:%.*]] = load float, ptr [[PTR2]], align 4
@@ -17,8 +16,6 @@ define void @pow2(ptr %ptr, float %val) {
 ; NON-POW2-LABEL: define void @pow2(
 ; NON-POW2-SAME: ptr [[PTR:%.*]], float [[VAL:%.*]]) {
 ; NON-POW2-NEXT: [[PTR0:%.*]] = getelementptr float, ptr [[PTR]], i32 0
-; NON-POW2-NEXT: [[PTR1:%.*]] = getelementptr float, ptr [[PTR]], i32 1
-; NON-POW2-NEXT: [[PTR2:%.*]] = getelementptr float, ptr [[PTR]], i32 2
 ; NON-POW2-NEXT: [[PACK2:%.*]] = load <3 x float>, ptr [[PTR0]], align 4
 ; NON-POW2-NEXT: store <3 x float> [[PACK2]], ptr [[PTR0]], align 4
 ; NON-POW2-NEXT: ret void

From 64c2156d8802b0d7724f65ce854844670e4ec457 Mon Sep 17 00:00:00 2001
From: Malavika Samak
Date: Tue, 14 Jan 2025 08:15:15 +0530
Subject: [PATCH 361/408] [Wunsafe-buffer-usage] Fix false positive when const
 sized array is indexed by const evaluatable expressions (#119340)

Do not warn when a constant-sized array is indexed by expressions that
evaluate to a const value. For instance, a sizeof(T) expression's value
can be evaluated at compile time, and if an array is indexed by such an
expression, its bounds can be validated.
(rdar://140320289)

Co-authored-by: MalavikaSamak
---
 clang/lib/Analysis/UnsafeBufferUsage.cpp       |  7 ++--
 .../warn-unsafe-buffer-usage-array.cpp         | 32 +++++++++++++++++++
 2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Analysis/UnsafeBufferUsage.cpp b/clang/lib/Analysis/UnsafeBufferUsage.cpp
index a9aff39df6474..bef5fa8624ce4 100644
--- a/clang/lib/Analysis/UnsafeBufferUsage.cpp
+++ b/clang/lib/Analysis/UnsafeBufferUsage.cpp
@@ -453,8 +453,11 @@ AST_MATCHER(ArraySubscriptExpr, isSafeArraySubscript) {
     return false;
   }

-  if (const auto *IdxLit = dyn_cast<IntegerLiteral>(Node.getIdx())) {
-    const APInt ArrIdx = IdxLit->getValue();
+  Expr::EvalResult EVResult;
+  if (Node.getIdx()->EvaluateAsInt(EVResult, Finder->getASTContext())) {
+    llvm::APSInt ArrIdx = EVResult.Val.getInt();
+    // FIXME: ArrIdx.isNegative() we could immediately emit an error as that's
+    // a bug
     if (ArrIdx.isNonNegative() && ArrIdx.getLimitedValue() < limit)
       return true;
   }
diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-array.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-array.cpp
index 7dd6c83dbba2a..e80b54b7c6967 100644
--- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-array.cpp
+++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-array.cpp
@@ -92,3 +92,35 @@ char access_strings() {
   c = array_string[5];
   return c;
 }
+
+struct T {
+  int array[10];
+};
+
+const int index = 1;
+
+constexpr int get_const(int x) {
+  if(x < 3)
+    return ++x;
+  else
+    return x + 5;
+};
+
+void array_indexed_const_expr(unsigned idx) {
+  // expected-note@+2 {{change type of 'arr' to 'std::array' to label it for hardening}}
+  // expected-warning@+1{{'arr' is an unsafe buffer that does not perform bounds checks}}
+  int arr[10];
+  arr[sizeof(int)] = 5;
+
+  int array[sizeof(T)];
+  array[sizeof(int)] = 5;
+  array[sizeof(T) -1 ] = 3;
+
+  int k = arr[6 & 5];
+  k = arr[2 << index];
+  k = arr[8 << index]; // expected-note {{used in buffer access here}}
+  k = arr[16 >> 1];
+  k = arr[get_const(index)];
+  k = arr[get_const(5)]; // expected-note {{used in buffer access here}}
+  k = arr[get_const(4)];
+}

From 717230c959bcb01343ca9e43b053fb62e736b4ec Mon Sep 17 00:00:00 2001
From: Lang Hames
Date: Tue, 14 Jan 2025 14:02:09 +1100
Subject: [PATCH 362/408] [JITLink] Fix empty comment on LinkGraph::intern
 method.

---
 llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h
index df347049e85d8..4ff8d92d8de93 100644
--- a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h
+++ b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h
@@ -1269,10 +1269,11 @@ class LinkGraph {
     return splitBlockImpl(std::move(Blocks), Cache);
   }

-  //
+  /// Intern the given string in the LinkGraph's SymbolStringPool.
   orc::SymbolStringPtr intern(StringRef SymbolName) {
     return SSP->intern(SymbolName);
   }
+
   /// Add an external symbol.
   /// Some formats (e.g. ELF) allow Symbols to have sizes. For Symbols whose
   /// size is not known, you should substitute '0'.

From d90a42751f9bfa73ed3555c702e70cf34d97bb39 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Mon, 13 Jan 2025 19:37:31 -0800
Subject: [PATCH 363/408] [RISCV] Remove loads from single element fixed
 vector reduction tests. NFC (#122808)

These tests weren't interested in the loads. Removing them reduces the
diffs from #122671.
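A hedged before/after sketch of the mechanical change (one representative case; the function and intrinsic names match the diff below, while the `_before` suffix is only ours, to keep the two versions distinct):

```llvm
; Before: the operand was loaded from memory inside the test, so every
; CHECK block also had to match the vle* instruction for the load.
define i8 @vreduce_add_v1i8_before(ptr %x) {
  %v = load <1 x i8>, ptr %x
  %red = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> %v)
  ret i8 %red
}

; After: the vector is passed directly as an argument, so the checked
; output contains only the reduction sequence itself.
define i8 @vreduce_add_v1i8(<1 x i8> %v) {
  %red = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> %v)
  ret i8 %red
}

declare i8 @llvm.vector.reduce.add.v1i8(<1 x i8>)
```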
--- .../RISCV/rvv/fixed-vectors-reduction-fp.ll | 48 +--- .../RISCV/rvv/fixed-vectors-reduction-int.ll | 255 ++++++------------ 2 files changed, 94 insertions(+), 209 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll index 8bf30f8f0d072..2b279389253b0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll @@ -4,29 +4,25 @@ declare half @llvm.vector.reduce.fadd.v1f16(half, <1 x half>) -define half @vreduce_fadd_v1f16(ptr %x, half %s) { +define half @vreduce_fadd_v1f16(<1 x half> %v, half %s) { ; CHECK-LABEL: vreduce_fadd_v1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; CHECK-NEXT: vfmv.f.s fa5, v8 ; CHECK-NEXT: fadd.h fa0, fa0, fa5 ; CHECK-NEXT: ret - %v = load <1 x half>, ptr %x %red = call reassoc half @llvm.vector.reduce.fadd.v1f16(half %s, <1 x half> %v) ret half %red } -define half @vreduce_ord_fadd_v1f16(ptr %x, half %s) { +define half @vreduce_ord_fadd_v1f16(<1 x half> %v, half %s) { ; CHECK-LABEL: vreduce_ord_fadd_v1f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vfredosum.vs v8, v8, v9 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret - %v = load <1 x half>, ptr %x %red = call half @llvm.vector.reduce.fadd.v1f16(half %s, <1 x half> %v) ret half %red } @@ -271,61 +267,53 @@ define half @vreduce_ord_fadd_v128f16(ptr %x, half %s) { declare float @llvm.vector.reduce.fadd.v1f32(float, <1 x float>) -define float @vreduce_fadd_v1f32(ptr %x, float %s) { +define float @vreduce_fadd_v1f32(<1 x float> %v, float %s) { ; CHECK-LABEL: vreduce_fadd_v1f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-NEXT: vfmv.f.s fa5, v8 ; CHECK-NEXT: fadd.s fa0, fa0, fa5 ; CHECK-NEXT: ret - %v = load <1 x float>, ptr %x %red = call reassoc float @llvm.vector.reduce.fadd.v1f32(float %s, <1 x float> %v) ret float %red } -define float @vreduce_ord_fadd_v1f32(ptr %x, float %s) { +define float @vreduce_ord_fadd_v1f32(<1 x float> %v, float %s) { ; CHECK-LABEL: vreduce_ord_fadd_v1f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vfredosum.vs v8, v8, v9 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret - %v = load <1 x float>, ptr %x %red = call float @llvm.vector.reduce.fadd.v1f32(float %s, <1 x float> %v) ret float %red } -define float @vreduce_fwadd_v1f32(ptr %x, float %s) { +define float @vreduce_fwadd_v1f32(<1 x half> %v, float %s) { ; CHECK-LABEL: vreduce_fwadd_v1f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vfwcvt.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfmv.f.s fa5, v9 ; CHECK-NEXT: fadd.s fa0, fa0, fa5 ; CHECK-NEXT: ret - %v = load <1 x half>, ptr %x %e = fpext <1 x half> %v to <1 x float> %red = call reassoc float @llvm.vector.reduce.fadd.v1f32(float %s, <1 x float> %e) ret float %red } -define float @vreduce_ord_fwadd_v1f32(ptr %x, float %s) { +define float @vreduce_ord_fwadd_v1f32(<1 x half> %v, float %s) { ; CHECK-LABEL: vreduce_ord_fwadd_v1f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; 
CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; CHECK-NEXT: vfwredosum.vs v8, v8, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret - %v = load <1 x half>, ptr %x %e = fpext <1 x half> %v to <1 x float> %red = call float @llvm.vector.reduce.fadd.v1f32(float %s, <1 x float> %e) ret float %red @@ -815,61 +803,53 @@ define float @vreduce_ord_fwadd_v64f32(ptr %x, float %s) { declare double @llvm.vector.reduce.fadd.v1f64(double, <1 x double>) -define double @vreduce_fadd_v1f64(ptr %x, double %s) { +define double @vreduce_fadd_v1f64(<1 x double> %v, double %s) { ; CHECK-LABEL: vreduce_fadd_v1f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: vfmv.f.s fa5, v8 ; CHECK-NEXT: fadd.d fa0, fa0, fa5 ; CHECK-NEXT: ret - %v = load <1 x double>, ptr %x %red = call reassoc double @llvm.vector.reduce.fadd.v1f64(double %s, <1 x double> %v) ret double %red } -define double @vreduce_ord_fadd_v1f64(ptr %x, double %s) { +define double @vreduce_ord_fadd_v1f64(<1 x double> %v, double %s) { ; CHECK-LABEL: vreduce_ord_fadd_v1f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vfredosum.vs v8, v8, v9 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret - %v = load <1 x double>, ptr %x %red = call double @llvm.vector.reduce.fadd.v1f64(double %s, <1 x double> %v) ret double %red } -define double @vreduce_fwadd_v1f64(ptr %x, double %s) { +define double @vreduce_fwadd_v1f64(<1 x float> %v, double %s) { ; CHECK-LABEL: vreduce_fwadd_v1f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vfwcvt.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfmv.f.s fa5, v9 ; CHECK-NEXT: fadd.d fa0, fa0, fa5 ; CHECK-NEXT: ret - %v = load <1 x float>, ptr %x %e = fpext <1 x float> %v to <1 x double> %red = call reassoc double @llvm.vector.reduce.fadd.v1f64(double %s, <1 x double> %e) ret double %red } -define double @vreduce_ord_fwadd_v1f64(ptr %x, double %s) { +define double @vreduce_ord_fwadd_v1f64(<1 x float> %v, double %s) { ; CHECK-LABEL: vreduce_ord_fwadd_v1f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfwredosum.vs v8, v8, v9 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret - %v = load <1 x float>, ptr %x %e = fpext <1 x float> %v to <1 x double> %red = call double @llvm.vector.reduce.fadd.v1f64(double %s, <1 x double> %e) ret double %red diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll index 2ea618bf8a226..707d1202aca0f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll @@ -4,14 +4,12 @@ declare i8 @llvm.vector.reduce.add.v1i8(<1 x i8>) -define i8 @vreduce_add_v1i8(ptr %x) { +define i8 @vreduce_add_v1i8(<1 x i8> %v) { ; CHECK-LABEL: vreduce_add_v1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; 
CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %v = load <1 x i8>, ptr %x %red = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> %v) ret i8 %red } @@ -169,41 +167,35 @@ define i8 @vreduce_add_v256i8(ptr %x) { declare i16 @llvm.vector.reduce.add.v1i16(<1 x i16>) -define i16 @vreduce_add_v1i16(ptr %x) { +define i16 @vreduce_add_v1i16(<1 x i16> %v) { ; CHECK-LABEL: vreduce_add_v1i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %v = load <1 x i16>, ptr %x %red = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %v) ret i16 %red } -define i16 @vwreduce_add_v1i16(ptr %x) { +define i16 @vwreduce_add_v1i16(<1 x i8> %v) { ; CHECK-LABEL: vwreduce_add_v1i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vsext.vf2 v9, v8 ; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret - %v = load <1 x i8>, ptr %x %e = sext <1 x i8> %v to <1 x i16> %red = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %e) ret i16 %red } -define i16 @vwreduce_uadd_v1i16(ptr %x) { +define i16 @vwreduce_uadd_v1i16(<1 x i8> %v) { ; CHECK-LABEL: vwreduce_uadd_v1i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vzext.vf2 v9, v8 ; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret - %v = load <1 x i8>, ptr %x %e = zext <1 x i8> %v to <1 x i16> %red = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %e) ret i16 %red @@ -581,41 +573,35 @@ define i16 @vwreduce_uadd_v128i16(ptr %x) { declare i32 @llvm.vector.reduce.add.v1i32(<1 x i32>) -define i32 @vreduce_add_v1i32(ptr %x) { +define i32 @vreduce_add_v1i32(<1 x i32> %v) { ; CHECK-LABEL: vreduce_add_v1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %v = load <1 x i32>, ptr %x %red = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %v) ret i32 %red } -define i32 @vwreduce_add_v1i32(ptr %x) { +define i32 @vwreduce_add_v1i32(<1 x i16> %v) { ; CHECK-LABEL: vwreduce_add_v1i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vsext.vf2 v9, v8 ; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret - %v = load <1 x i16>, ptr %x %e = sext <1 x i16> %v to <1 x i32> %red = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %e) ret i32 %red } -define i32 @vwreduce_uadd_v1i32(ptr %x) { +define i32 @vwreduce_uadd_v1i32(<1 x i16> %v) { ; CHECK-LABEL: vwreduce_uadd_v1i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vzext.vf2 v9, v8 ; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret - %v = load <1 x i16>, ptr %x %e = zext <1 x i16> %v to <1 x i32> %red = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %e) ret i32 %red @@ -940,12 +926,11 @@ define i32 @vwreduce_uadd_v64i32(ptr %x) { declare i64 @llvm.vector.reduce.add.v1i64(<1 x i64>) -define i64 @vreduce_add_v1i64(ptr %x) { +define i64 @vreduce_add_v1i64(<1 x i64> %v) { ; RV32-LABEL: vreduce_add_v1i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v9, v8, a0 ; RV32-NEXT: vmv.x.s a1, v9 ; RV32-NEXT: vmv.x.s a0, v8 @@ -954,21 +939,18 @@ define i64 @vreduce_add_v1i64(ptr %x) { ; 
RV64-LABEL: vreduce_add_v1i64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret - %v = load <1 x i64>, ptr %x %red = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %v) ret i64 %red } -define i64 @vwreduce_add_v1i64(ptr %x) { +define i64 @vwreduce_add_v1i64(<1 x i32> %v) { ; RV32-LABEL: vwreduce_add_v1i64: ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: li a0, 32 ; RV32-NEXT: vsext.vf2 v9, v8 +; RV32-NEXT: li a0, 32 ; RV32-NEXT: vsrl.vx v8, v9, a0 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: vmv.x.s a0, v9 @@ -977,23 +959,20 @@ define i64 @vwreduce_add_v1i64(ptr %x) { ; RV64-LABEL: vwreduce_add_v1i64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vle32.v v8, (a0) ; RV64-NEXT: vsext.vf2 v9, v8 ; RV64-NEXT: vmv.x.s a0, v9 ; RV64-NEXT: ret - %v = load <1 x i32>, ptr %x %e = sext <1 x i32> %v to <1 x i64> %red = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %e) ret i64 %red } -define i64 @vwreduce_uadd_v1i64(ptr %x) { +define i64 @vwreduce_uadd_v1i64(<1 x i32> %v) { ; RV32-LABEL: vwreduce_uadd_v1i64: ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: li a0, 32 ; RV32-NEXT: vzext.vf2 v9, v8 +; RV32-NEXT: li a0, 32 ; RV32-NEXT: vsrl.vx v8, v9, a0 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: vmv.x.s a0, v9 @@ -1002,11 +981,9 @@ define i64 @vwreduce_uadd_v1i64(ptr %x) { ; RV64-LABEL: vwreduce_uadd_v1i64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vle32.v v8, (a0) ; RV64-NEXT: vzext.vf2 v9, v8 ; RV64-NEXT: vmv.x.s a0, v9 ; RV64-NEXT: ret - %v = load <1 x i32>, ptr %x %e = zext <1 x i32> %v to <1 x i64> %red = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %e) ret i64 %red @@ -1670,14 +1647,12 @@ define i64 @vwreduce_uadd_v64i64(ptr %x) { declare i8 @llvm.vector.reduce.and.v1i8(<1 x i8>) -define i8 @vreduce_and_v1i8(ptr %x) { +define i8 @vreduce_and_v1i8(<1 x i8> %v) { ; CHECK-LABEL: vreduce_and_v1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %v = load <1 x i8>, ptr %x %red = call i8 @llvm.vector.reduce.and.v1i8(<1 x i8> %v) ret i8 %red } @@ -1829,14 +1804,12 @@ define i8 @vreduce_and_v256i8(ptr %x) { declare i16 @llvm.vector.reduce.and.v1i16(<1 x i16>) -define i16 @vreduce_and_v1i16(ptr %x) { +define i16 @vreduce_and_v1i16(<1 x i16> %v) { ; CHECK-LABEL: vreduce_and_v1i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %v = load <1 x i16>, ptr %x %red = call i16 @llvm.vector.reduce.and.v1i16(<1 x i16> %v) ret i16 %red } @@ -1954,14 +1927,12 @@ define i16 @vreduce_and_v128i16(ptr %x) { declare i32 @llvm.vector.reduce.and.v1i32(<1 x i32>) -define i32 @vreduce_and_v1i32(ptr %x) { +define i32 @vreduce_and_v1i32(<1 x i32> %v) { ; CHECK-LABEL: vreduce_and_v1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %v = load <1 x i32>, ptr %x %red = call i32 @llvm.vector.reduce.and.v1i32(<1 x i32> %v) ret i32 %red } @@ -2063,12 +2034,11 @@ define i32 @vreduce_and_v64i32(ptr %x) { declare i64 
@llvm.vector.reduce.and.v1i64(<1 x i64>) -define i64 @vreduce_and_v1i64(ptr %x) { +define i64 @vreduce_and_v1i64(<1 x i64> %v) { ; RV32-LABEL: vreduce_and_v1i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v9, v8, a0 ; RV32-NEXT: vmv.x.s a1, v9 ; RV32-NEXT: vmv.x.s a0, v8 @@ -2077,10 +2047,8 @@ define i64 @vreduce_and_v1i64(ptr %x) { ; RV64-LABEL: vreduce_and_v1i64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret - %v = load <1 x i64>, ptr %x %red = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> %v) ret i64 %red } @@ -2273,14 +2241,12 @@ define i64 @vreduce_and_v64i64(ptr %x) nounwind { declare i8 @llvm.vector.reduce.or.v1i8(<1 x i8>) -define i8 @vreduce_or_v1i8(ptr %x) { +define i8 @vreduce_or_v1i8(<1 x i8> %v) { ; CHECK-LABEL: vreduce_or_v1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %v = load <1 x i8>, ptr %x %red = call i8 @llvm.vector.reduce.or.v1i8(<1 x i8> %v) ret i8 %red } @@ -2430,14 +2396,12 @@ define i8 @vreduce_or_v256i8(ptr %x) { declare i16 @llvm.vector.reduce.or.v1i16(<1 x i16>) -define i16 @vreduce_or_v1i16(ptr %x) { +define i16 @vreduce_or_v1i16(<1 x i16> %v) { ; CHECK-LABEL: vreduce_or_v1i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %v = load <1 x i16>, ptr %x %red = call i16 @llvm.vector.reduce.or.v1i16(<1 x i16> %v) ret i16 %red } @@ -2555,14 +2519,12 @@ define i16 @vreduce_or_v128i16(ptr %x) { declare i32 @llvm.vector.reduce.or.v1i32(<1 x i32>) -define i32 @vreduce_or_v1i32(ptr %x) { +define i32 @vreduce_or_v1i32(<1 x i32> %v) { ; CHECK-LABEL: vreduce_or_v1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %v = load <1 x i32>, ptr %x %red = call i32 @llvm.vector.reduce.or.v1i32(<1 x i32> %v) ret i32 %red } @@ -2664,12 +2626,11 @@ define i32 @vreduce_or_v64i32(ptr %x) { declare i64 @llvm.vector.reduce.or.v1i64(<1 x i64>) -define i64 @vreduce_or_v1i64(ptr %x) { +define i64 @vreduce_or_v1i64(<1 x i64> %v) { ; RV32-LABEL: vreduce_or_v1i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v9, v8, a0 ; RV32-NEXT: vmv.x.s a1, v9 ; RV32-NEXT: vmv.x.s a0, v8 @@ -2678,10 +2639,8 @@ define i64 @vreduce_or_v1i64(ptr %x) { ; RV64-LABEL: vreduce_or_v1i64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret - %v = load <1 x i64>, ptr %x %red = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> %v) ret i64 %red } @@ -2874,14 +2833,12 @@ define i64 @vreduce_or_v64i64(ptr %x) nounwind { declare i8 @llvm.vector.reduce.xor.v1i8(<1 x i8>) -define i8 @vreduce_xor_v1i8(ptr %x) { +define i8 @vreduce_xor_v1i8(<1 x i8> %v) { ; CHECK-LABEL: vreduce_xor_v1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, 
e8, m1, ta, ma ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %v = load <1 x i8>, ptr %x %red = call i8 @llvm.vector.reduce.xor.v1i8(<1 x i8> %v) ret i8 %red } @@ -3039,14 +2996,12 @@ define i8 @vreduce_xor_v256i8(ptr %x) { declare i16 @llvm.vector.reduce.xor.v1i16(<1 x i16>) -define i16 @vreduce_xor_v1i16(ptr %x) { +define i16 @vreduce_xor_v1i16(<1 x i16> %v) { ; CHECK-LABEL: vreduce_xor_v1i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %v = load <1 x i16>, ptr %x %red = call i16 @llvm.vector.reduce.xor.v1i16(<1 x i16> %v) ret i16 %red } @@ -3171,14 +3126,12 @@ define i16 @vreduce_xor_v128i16(ptr %x) { declare i32 @llvm.vector.reduce.xor.v1i32(<1 x i32>) -define i32 @vreduce_xor_v1i32(ptr %x) { +define i32 @vreduce_xor_v1i32(<1 x i32> %v) { ; CHECK-LABEL: vreduce_xor_v1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %v = load <1 x i32>, ptr %x %red = call i32 @llvm.vector.reduce.xor.v1i32(<1 x i32> %v) ret i32 %red } @@ -3286,12 +3239,11 @@ define i32 @vreduce_xor_v64i32(ptr %x) { declare i64 @llvm.vector.reduce.xor.v1i64(<1 x i64>) -define i64 @vreduce_xor_v1i64(ptr %x) { +define i64 @vreduce_xor_v1i64(<1 x i64> %v) { ; RV32-LABEL: vreduce_xor_v1i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v9, v8, a0 ; RV32-NEXT: vmv.x.s a1, v9 ; RV32-NEXT: vmv.x.s a0, v8 @@ -3300,10 +3252,8 @@ define i64 @vreduce_xor_v1i64(ptr %x) { ; RV64-LABEL: vreduce_xor_v1i64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret - %v = load <1 x i64>, ptr %x %red = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> %v) ret i64 %red } @@ -3508,14 +3458,12 @@ define i64 @vreduce_xor_v64i64(ptr %x) nounwind { declare i8 @llvm.vector.reduce.smin.v1i8(<1 x i8>) -define i8 @vreduce_smin_v1i8(ptr %x) { +define i8 @vreduce_smin_v1i8(<1 x i8> %v) { ; CHECK-LABEL: vreduce_smin_v1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %v = load <1 x i8>, ptr %x %red = call i8 @llvm.vector.reduce.smin.v1i8(<1 x i8> %v) ret i8 %red } @@ -3666,14 +3614,12 @@ define i8 @vreduce_smin_v256i8(ptr %x) { declare i16 @llvm.vector.reduce.smin.v1i16(<1 x i16>) -define i16 @vreduce_smin_v1i16(ptr %x) { +define i16 @vreduce_smin_v1i16(<1 x i16> %v) { ; CHECK-LABEL: vreduce_smin_v1i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %v = load <1 x i16>, ptr %x %red = call i16 @llvm.vector.reduce.smin.v1i16(<1 x i16> %v) ret i16 %red } @@ -3791,14 +3737,12 @@ define i16 @vreduce_smin_v128i16(ptr %x) { declare i32 @llvm.vector.reduce.smin.v1i32(<1 x i32>) -define i32 @vreduce_smin_v1i32(ptr %x) { +define i32 @vreduce_smin_v1i32(<1 x i32> %v) { ; CHECK-LABEL: vreduce_smin_v1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, 
ma ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %v = load <1 x i32>, ptr %x %red = call i32 @llvm.vector.reduce.smin.v1i32(<1 x i32> %v) ret i32 %red } @@ -3900,12 +3844,11 @@ define i32 @vreduce_smin_v64i32(ptr %x) { declare i64 @llvm.vector.reduce.smin.v1i64(<1 x i64>) -define i64 @vreduce_smin_v1i64(ptr %x) { +define i64 @vreduce_smin_v1i64(<1 x i64> %v) { ; RV32-LABEL: vreduce_smin_v1i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v9, v8, a0 ; RV32-NEXT: vmv.x.s a1, v9 ; RV32-NEXT: vmv.x.s a0, v8 @@ -3914,10 +3857,8 @@ define i64 @vreduce_smin_v1i64(ptr %x) { ; RV64-LABEL: vreduce_smin_v1i64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret - %v = load <1 x i64>, ptr %x %red = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> %v) ret i64 %red } @@ -4110,14 +4051,12 @@ define i64 @vreduce_smin_v64i64(ptr %x) nounwind { declare i8 @llvm.vector.reduce.smax.v1i8(<1 x i8>) -define i8 @vreduce_smax_v1i8(ptr %x) { +define i8 @vreduce_smax_v1i8(<1 x i8> %v) { ; CHECK-LABEL: vreduce_smax_v1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %v = load <1 x i8>, ptr %x %red = call i8 @llvm.vector.reduce.smax.v1i8(<1 x i8> %v) ret i8 %red } @@ -4268,14 +4207,12 @@ define i8 @vreduce_smax_v256i8(ptr %x) { declare i16 @llvm.vector.reduce.smax.v1i16(<1 x i16>) -define i16 @vreduce_smax_v1i16(ptr %x) { +define i16 @vreduce_smax_v1i16(<1 x i16> %v) { ; CHECK-LABEL: vreduce_smax_v1i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %v = load <1 x i16>, ptr %x %red = call i16 @llvm.vector.reduce.smax.v1i16(<1 x i16> %v) ret i16 %red } @@ -4393,14 +4330,12 @@ define i16 @vreduce_smax_v128i16(ptr %x) { declare i32 @llvm.vector.reduce.smax.v1i32(<1 x i32>) -define i32 @vreduce_smax_v1i32(ptr %x) { +define i32 @vreduce_smax_v1i32(<1 x i32> %v) { ; CHECK-LABEL: vreduce_smax_v1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %v = load <1 x i32>, ptr %x %red = call i32 @llvm.vector.reduce.smax.v1i32(<1 x i32> %v) ret i32 %red } @@ -4502,12 +4437,11 @@ define i32 @vreduce_smax_v64i32(ptr %x) { declare i64 @llvm.vector.reduce.smax.v1i64(<1 x i64>) -define i64 @vreduce_smax_v1i64(ptr %x) { +define i64 @vreduce_smax_v1i64(<1 x i64> %v) { ; RV32-LABEL: vreduce_smax_v1i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v9, v8, a0 ; RV32-NEXT: vmv.x.s a1, v9 ; RV32-NEXT: vmv.x.s a0, v8 @@ -4516,10 +4450,8 @@ define i64 @vreduce_smax_v1i64(ptr %x) { ; RV64-LABEL: vreduce_smax_v1i64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret - %v = load <1 x i64>, ptr %x %red = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> %v) ret i64 %red } @@ -4712,14 +4644,12 @@ define i64 @vreduce_smax_v64i64(ptr %x) nounwind { 
declare i8 @llvm.vector.reduce.umin.v1i8(<1 x i8>) -define i8 @vreduce_umin_v1i8(ptr %x) { +define i8 @vreduce_umin_v1i8(<1 x i8> %v) { ; CHECK-LABEL: vreduce_umin_v1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %v = load <1 x i8>, ptr %x %red = call i8 @llvm.vector.reduce.umin.v1i8(<1 x i8> %v) ret i8 %red } @@ -4870,14 +4800,12 @@ define i8 @vreduce_umin_v256i8(ptr %x) { declare i16 @llvm.vector.reduce.umin.v1i16(<1 x i16>) -define i16 @vreduce_umin_v1i16(ptr %x) { +define i16 @vreduce_umin_v1i16(<1 x i16> %v) { ; CHECK-LABEL: vreduce_umin_v1i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %v = load <1 x i16>, ptr %x %red = call i16 @llvm.vector.reduce.umin.v1i16(<1 x i16> %v) ret i16 %red } @@ -4995,14 +4923,12 @@ define i16 @vreduce_umin_v128i16(ptr %x) { declare i32 @llvm.vector.reduce.umin.v1i32(<1 x i32>) -define i32 @vreduce_umin_v1i32(ptr %x) { +define i32 @vreduce_umin_v1i32(<1 x i32> %v) { ; CHECK-LABEL: vreduce_umin_v1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %v = load <1 x i32>, ptr %x %red = call i32 @llvm.vector.reduce.umin.v1i32(<1 x i32> %v) ret i32 %red } @@ -5104,12 +5030,11 @@ define i32 @vreduce_umin_v64i32(ptr %x) { declare i64 @llvm.vector.reduce.umin.v1i64(<1 x i64>) -define i64 @vreduce_umin_v1i64(ptr %x) { +define i64 @vreduce_umin_v1i64(<1 x i64> %v) { ; RV32-LABEL: vreduce_umin_v1i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v9, v8, a0 ; RV32-NEXT: vmv.x.s a1, v9 ; RV32-NEXT: vmv.x.s a0, v8 @@ -5118,10 +5043,8 @@ define i64 @vreduce_umin_v1i64(ptr %x) { ; RV64-LABEL: vreduce_umin_v1i64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret - %v = load <1 x i64>, ptr %x %red = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> %v) ret i64 %red } @@ -5314,14 +5237,12 @@ define i64 @vreduce_umin_v64i64(ptr %x) nounwind { declare i8 @llvm.vector.reduce.umax.v1i8(<1 x i8>) -define i8 @vreduce_umax_v1i8(ptr %x) { +define i8 @vreduce_umax_v1i8(<1 x i8> %v) { ; CHECK-LABEL: vreduce_umax_v1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %v = load <1 x i8>, ptr %x %red = call i8 @llvm.vector.reduce.umax.v1i8(<1 x i8> %v) ret i8 %red } @@ -5471,14 +5392,12 @@ define i8 @vreduce_umax_v256i8(ptr %x) { declare i16 @llvm.vector.reduce.umax.v1i16(<1 x i16>) -define i16 @vreduce_umax_v1i16(ptr %x) { +define i16 @vreduce_umax_v1i16(<1 x i16> %v) { ; CHECK-LABEL: vreduce_umax_v1i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %v = load <1 x i16>, ptr %x %red = call i16 @llvm.vector.reduce.umax.v1i16(<1 x i16> %v) ret i16 %red } @@ -5596,14 +5515,12 @@ define i16 @vreduce_umax_v128i16(ptr %x) { 
declare i32 @llvm.vector.reduce.umax.v1i32(<1 x i32>) -define i32 @vreduce_umax_v1i32(ptr %x) { +define i32 @vreduce_umax_v1i32(<1 x i32> %v) { ; CHECK-LABEL: vreduce_umax_v1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %v = load <1 x i32>, ptr %x %red = call i32 @llvm.vector.reduce.umax.v1i32(<1 x i32> %v) ret i32 %red } @@ -5705,12 +5622,11 @@ define i32 @vreduce_umax_v64i32(ptr %x) { declare i64 @llvm.vector.reduce.umax.v1i64(<1 x i64>) -define i64 @vreduce_umax_v1i64(ptr %x) { +define i64 @vreduce_umax_v1i64(<1 x i64> %v) { ; RV32-LABEL: vreduce_umax_v1i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v9, v8, a0 ; RV32-NEXT: vmv.x.s a1, v9 ; RV32-NEXT: vmv.x.s a0, v8 @@ -5719,10 +5635,8 @@ define i64 @vreduce_umax_v1i64(ptr %x) { ; RV64-LABEL: vreduce_umax_v1i64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret - %v = load <1 x i64>, ptr %x %red = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> %v) ret i64 %red } @@ -5915,14 +5829,12 @@ define i64 @vreduce_umax_v64i64(ptr %x) nounwind { declare i8 @llvm.vector.reduce.mul.v1i8(<1 x i8>) -define i8 @vreduce_mul_v1i8(ptr %x) { +define i8 @vreduce_mul_v1i8(<1 x i8> %v) { ; CHECK-LABEL: vreduce_mul_v1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %v = load <1 x i8>, ptr %x %red = call i8 @llvm.vector.reduce.mul.v1i8(<1 x i8> %v) ret i8 %red } @@ -6147,14 +6059,12 @@ define i8 @vreduce_mul_v256i8(ptr %x) { declare i16 @llvm.vector.reduce.mul.v1i16(<1 x i16>) -define i16 @vreduce_mul_v1i16(ptr %x) { +define i16 @vreduce_mul_v1i16(<1 x i16> %v) { ; CHECK-LABEL: vreduce_mul_v1i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %v = load <1 x i16>, ptr %x %red = call i16 @llvm.vector.reduce.mul.v1i16(<1 x i16> %v) ret i16 %red } @@ -6321,14 +6231,12 @@ define i16 @vreduce_mul_v128i16(ptr %x) { declare i32 @llvm.vector.reduce.mul.v1i32(<1 x i32>) -define i32 @vreduce_mul_v1i32(ptr %x) { +define i32 @vreduce_mul_v1i32(<1 x i32> %v) { ; CHECK-LABEL: vreduce_mul_v1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret - %v = load <1 x i32>, ptr %x %red = call i32 @llvm.vector.reduce.mul.v1i32(<1 x i32> %v) ret i32 %red } @@ -6464,12 +6372,11 @@ define i32 @vreduce_mul_v64i32(ptr %x) { declare i64 @llvm.vector.reduce.mul.v1i64(<1 x i64>) -define i64 @vreduce_mul_v1i64(ptr %x) { +define i64 @vreduce_mul_v1i64(<1 x i64> %v) { ; RV32-LABEL: vreduce_mul_v1i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v9, v8, a0 ; RV32-NEXT: vmv.x.s a1, v9 ; RV32-NEXT: vmv.x.s a0, v8 @@ -6478,10 +6385,8 @@ define i64 @vreduce_mul_v1i64(ptr %x) { ; RV64-LABEL: vreduce_mul_v1i64: ; RV64: # %bb.0: ; 
RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT:    vle64.v v8, (a0)
 ; RV64-NEXT:    vmv.x.s a0, v8
 ; RV64-NEXT:    ret
-  %v = load <1 x i64>, ptr %x
   %red = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> %v)
   ret i64 %red
 }

From 9f114afe092483983a82a73c82704f11bb28bf8c Mon Sep 17 00:00:00 2001
From: lialan
Date: Tue, 14 Jan 2025 12:31:25 +0800
Subject: [PATCH 364/408] [MLIR][ROCDL] Convert `math::fpowi` to ROCDL call
 (#122640)

* Have to relax the static assert to allow reuse of existing template
  patterns for conversion.

---
 .../GPUCommon/OpToFuncCallLowering.h           | 10 +++++++---
 .../lib/Conversion/MathToROCDL/MathToROCDL.cpp |  3 ++-
 .../Conversion/MathToROCDL/math-to-rocdl.mlir  | 18 ++++++++++++++++++
 3 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h b/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h
index 3b94abd88f9ed..46fd182346b3b 100644
--- a/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h
@@ -57,9 +57,13 @@ struct OpToFuncCallLowering : public ConvertOpToLLVMPattern<SourceOp> {
                   std::is_base_of<OpTrait::OneResult<SourceOp>, SourceOp>::value,
                   "expected single result op");

-    static_assert(std::is_base_of<OpTrait::SameOperandsAndResultType<SourceOp>,
-                                  SourceOp>::value,
-                  "expected op with same operand and result types");
+    if constexpr (!std::is_base_of<OpTrait::SameOperandsAndResultType<SourceOp>,
+                                   SourceOp>::value) {
+      assert(op->getNumOperands() > 0 &&
+             "expected op to take at least one operand");
+      assert(op->getResultTypes().front() == op->getOperand(0).getType() &&
+             "expected op with same operand and result types");
+    }

     if (!op->template getParentOfType<FunctionOpInterface>()) {
       return rewriter.notifyMatchFailure(
diff --git a/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp b/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp
index c17bfe4f71a98..838eef30a938f 100644
--- a/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp
+++ b/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp
@@ -57,7 +57,6 @@ void mlir::populateMathToROCDLConversionPatterns(
   // Handled by mathToLLVM: math::FmaOp
   // Handled by mathToLLVM: math::LogOp (32-bit only)
   // FIXME: math::IPowIOp
-  // FIXME: math::FPowIOp
   // Handled by mathToLLVM: math::RoundEvenOp
   // Handled by mathToLLVM: math::RoundOp
   // Handled by mathToLLVM: math::SqrtOp
@@ -114,6 +113,8 @@ void mlir::populateMathToROCDLConversionPatterns(
                                   "__ocml_tan_f64", "__ocml_tan_f16");
   populateOpPatterns<math::ErfOp>(converter, patterns, "__ocml_erf_f32",
                                   "__ocml_erf_f64", "__ocml_erf_f16");
+  populateOpPatterns<math::FPowIOp>(converter, patterns, "__ocml_pown_f32",
+                                    "__ocml_pown_f64", "__ocml_pown_f16");
   // Single arith pattern that needs a ROCDL call, probably not
   // worth creating a separate pass for it.
populateOpPatterns(converter, patterns, "__ocml_fmod_f32", diff --git a/mlir/test/Conversion/MathToROCDL/math-to-rocdl.mlir b/mlir/test/Conversion/MathToROCDL/math-to-rocdl.mlir index e0ea18d41f66d..e4b2f01d6544a 100644 --- a/mlir/test/Conversion/MathToROCDL/math-to-rocdl.mlir +++ b/mlir/test/Conversion/MathToROCDL/math-to-rocdl.mlir @@ -484,6 +484,24 @@ module @test_module { // ----- +module @test_module { + // CHECK: llvm.func @__ocml_pown_f16(f16, i32) -> f16 + // CHECK: llvm.func @__ocml_pown_f32(f32, i32) -> f32 + // CHECK: llvm.func @__ocml_pown_f64(f64, i32) -> f64 + // CHECK-LABEL: func @math_fpowi + func.func @math_fpowi(%arg0: f16, %arg1: f32, %arg2: f64, %arg3: i32) -> (f16, f32, f64) { + // CHECK: llvm.call @__ocml_pown_f16(%{{.*}}) : (f16, i32) -> f16 + %0 = math.fpowi %arg0, %arg3 : f16, i32 + // CHECK: llvm.call @__ocml_pown_f32(%{{.*}}) : (f32, i32) -> f32 + %1 = math.fpowi %arg1, %arg3 : f32, i32 + // CHECK: llvm.call @__ocml_pown_f64(%{{.*}}) : (f64, i32) -> f64 + %2 = math.fpowi %arg2, %arg3 : f64, i32 + return %0, %1, %2 : f16, f32, f64 + } +} + +// ----- + // Math operation not inside function // Ensure it not crash From cb2560d33b029b50c10bbc4348bbb944382fb659 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Tue, 14 Jan 2025 12:44:24 +0800 Subject: [PATCH 365/408] [VPlan] Verify plan before optimizations. NFC (#122678) I've been exploring verifying the VPlan before and after the EVL transformation steps, and noticed that the VPlan comes out in an invalid state between construction and optimisation. In adjustRecipesForReductions, we leave behind some dead recipes which are invalid: 1) When we replace a link with a reduction recipe, the old link ends up becoming a use-before-def: WIDEN ir<%l7> = add ir<%sum.02>, ir<%indvars.iv>.1 WIDEN ir<%l8> = add ir<%l7>.1, ir<%l3> WIDEN ir<%l9> = add ir<%l8>.1, ir<%l5> ... REDUCE ir<%l7>.1 = ir<%sum.02> + reduce.add (ir<%indvars.iv>.1) REDUCE ir<%l8>.1 = ir<%l7>.1 + reduce.add (ir<%l3>) REDUCE ir<%l9>.1 = ir<%l8>.1 + reduce.add (ir<%l5>) 2) When transforming an AnyOf reduction phi to a boolean, we leave behind a select with mismatching operand types, which will trigger the assertions in VTypeAnalysis after #122679 This adds an extra verification step and deletes the dead recipes eagerly to keep the plan valid. 
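In sketch form, the fix is the standard rewrite-then-erase idiom. As a self-contained illustration, the toy program below uses plain C++ stand-ins (Node, Block and Repl are hypothetical; they are not the real VPRecipeBase or VPlan types) to show why the erase must be deferred until every link has been rewritten:

  #include <algorithm>
  #include <cstdio>
  #include <vector>

  // Each node's Operands are its uses of other nodes.
  struct Node {
    int Value;
    std::vector<Node *> Operands;
  };

  int main() {
    Node A{1, {}}, B{2, {&A}}, C{3, {&B}};
    Node Repl{20, {&A}}; // plays the role of the new reduction recipe
    std::vector<Node *> Block{&A, &B, &C};

    std::vector<Node *> ToDelete;
    for (Node *N : Block) {
      if (N != &B)
        continue;
      // replaceAllUsesWith(Repl): every use of B now points at Repl.
      for (Node *U : Block)
        std::replace(U->Operands.begin(), U->Operands.end(), N, &Repl);
      // B is dead, but erasing it here would invalidate the iteration
      // over Block, so queue it instead.
      ToDelete.push_back(N);
    }
    // Deferred stand-in for eraseFromParent().
    for (Node *Dead : ToDelete)
      Block.erase(std::find(Block.begin(), Block.end(), Dead));

    std::printf("C now reads %d\n", C.Operands[0]->Value); // prints 20
    return 0;
  }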
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index ee352c0b12302..744faef192438 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9624,6 +9624,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow, WithoutRuntimeCheck); } + + assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); return Plan; } @@ -9698,6 +9700,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion(); VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock(); VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock(); + SmallVector ToDelete; + for (VPRecipeBase &R : Header->phis()) { auto *PhiR = dyn_cast(&R); if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered())) @@ -9817,10 +9821,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( CM.useOrderedReductions(RdxDesc), CurrentLinkI->getDebugLoc()); // Append the recipe to the end of the VPBasicBlock because we need to // ensure that it comes after all of it's inputs, including CondOp. - // Note that this transformation may leave over dead recipes (including - // CurrentLink), which will be cleaned by a later VPlan transform. + // Delete CurrentLink as it will be invalid if its operand is replaced + // with a reduction defined at the bottom of the block in the next link. LinkVPBB->appendRecipe(RedRecipe); CurrentLink->replaceAllUsesWith(RedRecipe); + ToDelete.push_back(CurrentLink); PreviousLink = RedRecipe; } } @@ -9935,6 +9940,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( Cmp = Builder.createNot(Cmp); VPValue *Or = Builder.createOr(PhiR, Cmp); Select->getVPSingleValue()->replaceAllUsesWith(Or); + // Delete Select now that it has invalid types. + ToDelete.push_back(Select); // Convert the reduction phi to operate on bools. PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse( @@ -9952,6 +9959,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( } VPlanTransforms::clearReductionWrapFlags(*Plan); + for (VPRecipeBase *R : ToDelete) + R->eraseFromParent(); } void VPDerivedIVRecipe::execute(VPTransformState &State) { From ffe5cddb68ab84348866b3a3ac727d263b2a44c2 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Tue, 14 Jan 2025 12:51:01 +0800 Subject: [PATCH 366/408] [RISCV] Support vp.{gather,scatter} in RISCVGatherScatterLowering (#122232) This adds support for lowering llvm.vp.{gather,scatter}s to experimental.vp.strided.{load,store}. This will help us handle strided accesses with EVL tail folding that are emitted from the loop vectorizer, but note that it's still not enough. We will also need to handle the vector step not being loop-invariant (i.e. produced by @llvm.experimental.vector.length) in a future patch. 
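Once a base pointer and stride have been recovered, the heart of the new lowering is a single IRBuilder call. Condensed from the RISCVGatherScatterLowering.cpp hunk below (a fragment reusing the patch's own variables, not a standalone program):

  // vp.gather/vp.scatter already carry an EVL operand; for masked.gather
  // and masked.scatter it is synthesized from the type's element count.
  if (!EVL)
    EVL = Builder.CreateElementCount(
        Builder.getInt32Ty(), cast<VectorType>(DataType)->getElementCount());

  // %v = call @llvm.experimental.vp.strided.load(ptr %base, iN %stride,
  //                                              %mask, i32 %evl)
  CallInst *Call = Builder.CreateIntrinsic(
      Intrinsic::experimental_vp_strided_load,
      {DataType, BasePtr->getType(), Stride->getType()},
      {BasePtr, Stride, Mask, EVL});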
--- .../RISCV/RISCVGatherScatterLowering.cpp | 94 +++++++++---- .../rvv/fixed-vectors-strided-load-store.ll | 111 ++++++++++++++++ .../CodeGen/RISCV/rvv/strided-load-store.ll | 123 ++++++++++++++++++ 3 files changed, 300 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp index f1e974f973cbe..a71e6bbb93638 100644 --- a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp @@ -63,8 +63,7 @@ class RISCVGatherScatterLowering : public FunctionPass { } private: - bool tryCreateStridedLoadStore(IntrinsicInst *II, Type *DataType, Value *Ptr, - Value *AlignOp); + bool tryCreateStridedLoadStore(IntrinsicInst *II); std::pair determineBaseAndStride(Instruction *Ptr, IRBuilderBase &Builder); @@ -483,12 +482,46 @@ RISCVGatherScatterLowering::determineBaseAndStride(Instruction *Ptr, return P; } -bool RISCVGatherScatterLowering::tryCreateStridedLoadStore(IntrinsicInst *II, - Type *DataType, - Value *Ptr, - Value *AlignOp) { +bool RISCVGatherScatterLowering::tryCreateStridedLoadStore(IntrinsicInst *II) { + VectorType *DataType; + Value *StoreVal = nullptr, *Ptr, *Mask, *EVL = nullptr; + MaybeAlign MA; + switch (II->getIntrinsicID()) { + case Intrinsic::masked_gather: + DataType = cast(II->getType()); + Ptr = II->getArgOperand(0); + MA = cast(II->getArgOperand(1))->getMaybeAlignValue(); + Mask = II->getArgOperand(2); + break; + case Intrinsic::vp_gather: + DataType = cast(II->getType()); + Ptr = II->getArgOperand(0); + MA = II->getParamAlign(0).value_or( + DL->getABITypeAlign(DataType->getElementType())); + Mask = II->getArgOperand(1); + EVL = II->getArgOperand(2); + break; + case Intrinsic::masked_scatter: + DataType = cast(II->getArgOperand(0)->getType()); + StoreVal = II->getArgOperand(0); + Ptr = II->getArgOperand(1); + MA = cast(II->getArgOperand(2))->getMaybeAlignValue(); + Mask = II->getArgOperand(3); + break; + case Intrinsic::vp_scatter: + DataType = cast(II->getArgOperand(0)->getType()); + StoreVal = II->getArgOperand(0); + Ptr = II->getArgOperand(1); + MA = II->getParamAlign(1).value_or( + DL->getABITypeAlign(DataType->getElementType())); + Mask = II->getArgOperand(2); + EVL = II->getArgOperand(3); + break; + default: + llvm_unreachable("Unexpected intrinsic"); + } + // Make sure the operation will be supported by the backend. 
- MaybeAlign MA = cast(AlignOp)->getMaybeAlignValue(); EVT DataTypeVT = TLI->getValueType(*DL, DataType); if (!MA || !TLI->isLegalStridedLoadStore(DataTypeVT, *MA)) return false; @@ -514,23 +547,27 @@ bool RISCVGatherScatterLowering::tryCreateStridedLoadStore(IntrinsicInst *II, Builder.SetInsertPoint(II); - Value *EVL = Builder.CreateElementCount( - IntegerType::get(Ctx, 32), cast(DataType)->getElementCount()); + if (!EVL) + EVL = Builder.CreateElementCount( + Builder.getInt32Ty(), cast(DataType)->getElementCount()); CallInst *Call; - if (II->getIntrinsicID() == Intrinsic::masked_gather) { + + if (!StoreVal) { Call = Builder.CreateIntrinsic( Intrinsic::experimental_vp_strided_load, {DataType, BasePtr->getType(), Stride->getType()}, - {BasePtr, Stride, II->getArgOperand(2), EVL}); - Call = Builder.CreateIntrinsic( - Intrinsic::vp_select, {DataType}, - {II->getOperand(2), Call, II->getArgOperand(3), EVL}); + {BasePtr, Stride, Mask, EVL}); + + // Merge llvm.masked.gather's passthru + if (II->getIntrinsicID() == Intrinsic::masked_gather) + Call = Builder.CreateIntrinsic(Intrinsic::vp_select, {DataType}, + {Mask, Call, II->getArgOperand(3), EVL}); } else Call = Builder.CreateIntrinsic( Intrinsic::experimental_vp_strided_store, {DataType, BasePtr->getType(), Stride->getType()}, - {II->getArgOperand(0), BasePtr, Stride, II->getArgOperand(3), EVL}); + {StoreVal, BasePtr, Stride, Mask, EVL}); Call->takeName(II); II->replaceAllUsesWith(Call); @@ -558,30 +595,31 @@ bool RISCVGatherScatterLowering::runOnFunction(Function &F) { StridedAddrs.clear(); - SmallVector Gathers; - SmallVector Scatters; + SmallVector Worklist; bool Changed = false; for (BasicBlock &BB : F) { for (Instruction &I : BB) { IntrinsicInst *II = dyn_cast(&I); - if (II && II->getIntrinsicID() == Intrinsic::masked_gather) { - Gathers.push_back(II); - } else if (II && II->getIntrinsicID() == Intrinsic::masked_scatter) { - Scatters.push_back(II); + if (!II) + continue; + switch (II->getIntrinsicID()) { + case Intrinsic::masked_gather: + case Intrinsic::masked_scatter: + case Intrinsic::vp_gather: + case Intrinsic::vp_scatter: + Worklist.push_back(II); + break; + default: + break; } } } // Rewrite gather/scatter to form strided load/store if possible. - for (auto *II : Gathers) - Changed |= tryCreateStridedLoadStore( - II, II->getType(), II->getArgOperand(0), II->getArgOperand(1)); - for (auto *II : Scatters) - Changed |= - tryCreateStridedLoadStore(II, II->getArgOperand(0)->getType(), - II->getArgOperand(1), II->getArgOperand(2)); + for (auto *II : Worklist) + Changed |= tryCreateStridedLoadStore(II); // Remove any dead phis. 
while (!MaybeDeadPHIs.empty()) { diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll index 2cbbfc019ab4d..83a9b23a387d2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll @@ -1030,3 +1030,114 @@ vector.body: ; preds = %vector.body, %entry for.cond.cleanup: ; preds = %vector.body ret void } + +define void @vp_gather(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) { +; CHECK-LABEL: @vp_gather( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR1:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <32 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[VEC_IND_SCALAR1]] +; CHECK-NEXT: [[ELEMS:%.*]] = sub i64 1024, [[VEC_IND_SCALAR]] +; CHECK-NEXT: [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[ELEMS]], i32 32, i1 false) +; CHECK-NEXT: [[ODD:%.*]] = and <32 x i64> [[VEC_IND]], splat (i64 1) +; CHECK-NEXT: [[MASK:%.*]] = icmp ne <32 x i64> [[ODD]], zeroinitializer +; CHECK-NEXT: [[WIDE_VP_GATHER:%.*]] = call <32 x i8> @llvm.experimental.vp.strided.load.v32i8.p0.i64(ptr [[TMP0]], i64 5, <32 x i1> [[MASK]], i32 [[EVL]]) +; CHECK-NEXT: [[I2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[VEC_IND_SCALAR]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[I2]], align 1 +; CHECK-NEXT: [[I4:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_VP_GATHER]] +; CHECK-NEXT: store <32 x i8> [[I4]], ptr [[I2]], align 1 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], 32 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR1]] = add i64 [[VEC_IND_SCALAR1]], 160 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <32 x i64> [[VEC_IND]], splat (i64 32) +; CHECK-NEXT: [[I6:%.*]] = icmp eq i64 [[VEC_IND_NEXT_SCALAR]], 1024 +; CHECK-NEXT: br i1 [[I6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.ind = phi <32 x i64> [ , %entry ], [ %vec.ind.next, %vector.body ] + %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5) + %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i + + %elems = sub i64 1024, %index + %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %elems, i32 32, i1 false) + + %odd = and <32 x i64> %vec.ind, splat (i64 1) + %mask = icmp ne <32 x i64> %odd, splat (i64 0) + + %wide.vp.gather = call <32 x i8> @llvm.vp.gather(<32 x ptr> %i1, <32 x i1> %mask, i32 %evl) + %i2 = getelementptr inbounds i8, ptr %A, i64 %index + %wide.load = load <32 x i8>, ptr %i2, align 1 + %i4 = add <32 x i8> %wide.load, %wide.vp.gather + store <32 x i8> %i4, ptr %i2, align 1 + %index.next = add nuw i64 %index, 32 + %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32) + %i6 = icmp eq i64 %index.next, 1024 + br i1 %i6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @vp_scatter(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) { +; CHECK-LABEL: @vp_scatter( +; CHECK-NEXT: 
entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR1:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <32 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[I:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[VEC_IND_SCALAR]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[I]], align 1 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[VEC_IND_SCALAR1]] +; CHECK-NEXT: [[ELEMS:%.*]] = sub i64 1024, [[VEC_IND_SCALAR]] +; CHECK-NEXT: [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[ELEMS]], i32 32, i1 false) +; CHECK-NEXT: [[ODD:%.*]] = and <32 x i64> [[VEC_IND]], splat (i64 1) +; CHECK-NEXT: [[MASK:%.*]] = icmp ne <32 x i64> [[ODD]], zeroinitializer +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.experimental.vp.strided.load.v32i8.p0.i64(ptr [[TMP0]], i64 5, <32 x i1> [[MASK]], i32 [[EVL]]) +; CHECK-NEXT: [[I4:%.*]] = add <32 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_LOAD]] +; CHECK-NEXT: call void @llvm.experimental.vp.strided.store.v32i8.p0.i64(<32 x i8> [[I4]], ptr [[TMP0]], i64 5, <32 x i1> [[MASK]], i32 [[EVL]]) +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], 32 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR1]] = add i64 [[VEC_IND_SCALAR1]], 160 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <32 x i64> [[VEC_IND]], splat (i64 32) +; CHECK-NEXT: [[I5:%.*]] = icmp eq i64 [[VEC_IND_NEXT_SCALAR]], 1024 +; CHECK-NEXT: br i1 [[I5]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.ind = phi <32 x i64> [ , %entry ], [ %vec.ind.next, %vector.body ] + %i = getelementptr inbounds i8, ptr %B, i64 %index + %wide.load = load <32 x i8>, ptr %i, align 1 + %i2 = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5) + %i3 = getelementptr inbounds i8, ptr %A, <32 x i64> %i2 + + + %elems = sub i64 1024, %index + %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %elems, i32 32, i1 false) + + %odd = and <32 x i64> %vec.ind, splat (i64 1) + %mask = icmp ne <32 x i64> %odd, splat (i64 0) + + %wide.masked.gather = call <32 x i8> @llvm.vp.gather(<32 x ptr> %i3, <32 x i1> %mask, i32 %evl) + %i4 = add <32 x i8> %wide.masked.gather, %wide.load + call void @llvm.vp.scatter(<32 x i8> %i4, <32 x ptr> %i3, <32 x i1> %mask, i32 %evl) + %index.next = add nuw i64 %index, 32 + %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32) + %i5 = icmp eq i64 %index.next, 1024 + br i1 %i5, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll index b1ece9fa8272d..7c1fab9bfe91a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll @@ -398,3 +398,126 @@ define @vector_base_vector_offset(ptr %p, declare i64 @llvm.vscale.i64() declare void @llvm.masked.scatter.nxv1i64.nxv1p0(, , i32, ) declare @llvm.masked.gather.nxv1i64.nxv1p0(, i32, , ) + + +; TODO: Make the step loop variant to reflect what the loop vectorizer will emit +; in an EVL tail folding 
configuration. + +define @vp_gather(ptr %a, i32 %len) { +; CHECK-LABEL: @vp_gather( +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN:%.*]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.stepvector.nxv1i64() +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP0]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_SCALAR1:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP1]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACCUM:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ELEMS:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[VEC_IND_SCALAR]] +; CHECK-NEXT: [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[ELEMS]], i32 1, i1 true) +; CHECK-NEXT: [[ODD:%.*]] = and [[VEC_IND]], splat (i64 1) +; CHECK-NEXT: [[MASK:%.*]] = icmp ne [[ODD]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr [[A:%.*]], i64 [[VEC_IND_SCALAR1]], i32 3 +; CHECK-NEXT: [[GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv1i64.p0.i64(ptr [[TMP2]], i64 16, [[MASK]], i32 [[EVL]]) +; CHECK-NEXT: [[ACCUM_NEXT]] = add [[ACCUM]], [[GATHER]] +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], [[TMP0]] +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR1]] = add i64 [[VEC_IND_SCALAR1]], [[TMP0]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[VEC_IND_NEXT_SCALAR]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret [[ACCUM_NEXT]] +; +vector.ph: + %wide.trip.count = zext i32 %len to i64 + %0 = tail call i64 @llvm.vscale.i64() + %1 = tail call @llvm.stepvector.nxv1i64() + %.splatinsert = insertelement poison, i64 %0, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.ind = phi [ %1, %vector.ph ], [ %vec.ind.next, %vector.body ] + %accum = phi [ zeroinitializer, %vector.ph ], [ %accum.next, %vector.body ] + + %elems = sub i64 %wide.trip.count, %index + %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %elems, i32 1, i1 true) + + %odd = and %vec.ind, splat (i64 1) + %mask = icmp ne %odd, splat (i64 0) + + %2 = getelementptr inbounds %struct.foo, ptr %a, %vec.ind, i32 3 + %gather = call @llvm.vp.gather( %2, %mask, i32 %evl) + %accum.next = add %accum, %gather + %index.next = add nuw i64 %index, %0 + %vec.ind.next = add %vec.ind, %.splat + %3 = icmp ne i64 %index.next, %wide.trip.count + br i1 %3, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret %accum.next +} + +; TODO: Make the step loop variant to reflect what the loop vectorizer will emit +; in an EVL tail folding configuration. 
+ +define void @vp_scatter(ptr %a, i32 %len) { +; CHECK-LABEL: @vp_scatter( +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN:%.*]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.stepvector.nxv1i64() +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP0]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_SCALAR1:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP1]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ELEMS:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[VEC_IND_SCALAR]] +; CHECK-NEXT: [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[ELEMS]], i32 1, i1 true) +; CHECK-NEXT: [[ODD:%.*]] = and [[VEC_IND]], splat (i64 1) +; CHECK-NEXT: [[MASK:%.*]] = icmp ne [[ODD]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr [[A:%.*]], i64 [[VEC_IND_SCALAR1]], i32 3 +; CHECK-NEXT: call void @llvm.experimental.vp.strided.store.nxv1i64.p0.i64( zeroinitializer, ptr [[TMP2]], i64 16, [[MASK]], i32 [[EVL]]) +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], [[TMP0]] +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR1]] = add i64 [[VEC_IND_SCALAR1]], [[TMP0]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[VEC_IND_NEXT_SCALAR]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +vector.ph: + %wide.trip.count = zext i32 %len to i64 + %0 = tail call i64 @llvm.vscale.i64() + %1 = tail call @llvm.stepvector.nxv1i64() + %.splatinsert = insertelement poison, i64 %0, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.ind = phi [ %1, %vector.ph ], [ %vec.ind.next, %vector.body ] + + %elems = sub i64 %wide.trip.count, %index + %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %elems, i32 1, i1 true) + + %odd = and %vec.ind, splat (i64 1) + %mask = icmp ne %odd, splat (i64 0) + + %2 = getelementptr inbounds %struct.foo, ptr %a, %vec.ind, i32 3 + tail call void @llvm.vp.scatter( zeroinitializer, %2, %mask, i32 %evl) + %index.next = add nuw i64 %index, %0 + %vec.ind.next = add %vec.ind, %.splat + %3 = icmp ne i64 %index.next, %wide.trip.count + br i1 %3, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} From 35f9d2ac49eb3344f528c5b0c3b75330ade93982 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 13 Jan 2025 20:53:13 -0800 Subject: [PATCH 367/408] [CodeGen] Migrate away from PointerUnion::dyn_cast (NFC) (#122778) Note that PointerUnion::dyn_cast has been soft deprecated in PointerUnion.h: // FIXME: Replace the uses of is(), get() and dyn_cast() with // isa, cast and the llvm::dyn_cast Literal migration would result in dyn_cast_if_present (see the definition of PointerUnion::dyn_cast), but this patch uses dyn_cast because we expect Prototype.P to be nonnull. 
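A minimal sketch of the two forms, assuming only llvm/ADT/PointerUnion.h (the int*/float* union is illustrative; it is not the CGCall.cpp union):

  #include "llvm/ADT/PointerUnion.h"
  using namespace llvm;

  int read(PointerUnion<int *, float *> P) {
    // Soft-deprecated member form:
    //   int *I = P.dyn_cast<int *>();
    // Free-function form. dyn_cast<> expects P to be non-null (as
    // Prototype.P is at this call site); a literal migration of a
    // possibly-null union would use dyn_cast_if_present<> instead.
    if (auto *I = dyn_cast<int *>(P))
      return *I;
    return 0;
  }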
--- clang/lib/CodeGen/CGCall.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 0fde4d8ee296b..e0cf6ca69f0df 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -4507,7 +4507,7 @@ void CodeGenFunction::EmitCallArgs( // First, if a prototype was provided, use those argument types. bool IsVariadic = false; if (Prototype.P) { - const auto *MD = Prototype.P.dyn_cast<const ObjCMethodDecl *>(); + const auto *MD = dyn_cast<const ObjCMethodDecl *>(Prototype.P); if (MD) { IsVariadic = MD->isVariadic(); ExplicitCC = getCallingConventionForDecl( From 1ac52ec40acfcc039ab830404e7f7ac72dfe8480 Mon Sep 17 00:00:00 2001 From: Mingming Liu Date: Mon, 13 Jan 2025 21:35:52 -0800 Subject: [PATCH 368/408] [NFC] Update -fsplit-machine-functions now that aarch64 function splitting is supported (#122860) With https://reviews.llvm.org/D157157, machine function splitting (MFS) is supported on aarch64. --- clang/include/clang/Driver/Options.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index bbf5c0e7e7fd1..7c41d38e00a49 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4364,7 +4364,7 @@ defm split_machine_functions: BoolFOption<"split-machine-functions", CodeGenOpts<"SplitMachineFunctions">, DefaultFalse, PosFlag, NegFlag, - BothFlags<[], [ClangOption], " late function splitting using profile information (x86 ELF)">>; + BothFlags<[], [ClangOption], " late function splitting using profile information (x86 and aarch64 ELF)">>; defm strict_return : BoolFOption<"strict-return", CodeGenOpts<"StrictReturn">, DefaultTrue, From 9b5857a68381652dbea2a0c9efa734b6c4cf38c9 Mon Sep 17 00:00:00 2001 From: soumyaGhoshh Date: Tue, 14 Jan 2025 12:19:40 +0530 Subject: [PATCH 369/408] [X86] Fix parentheses for clarity in X86ISelLowering.cpp (#121954) Fixes #121908. This pull request adds proper parentheses in X86ISelLowering.cpp so that the expression follows the intended operator precedence. This change improves code clarity and avoids ambiguity. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 434d88db04163..00a0aba0515f0 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -42431,7 +42431,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, // Canonicalize to VPERMV if both sources are the same. if (V1 == V2) { for (int &M : Mask) - M = (M < 0 ? M : M & Mask.size() - 1); + M = (M < 0 ? M : M & (Mask.size() - 1)); SDValue NewMask = getConstVector(Mask, MaskVT, DAG, DL, /*IsMask=*/true); return DAG.getNode(X86ISD::VPERMV, DL, VT, NewMask, N.getOperand(0)); From 4eaff6c58ae2f130ac8d63cf2c87bbb483114876 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Tue, 14 Jan 2025 17:28:07 +1100 Subject: [PATCH 370/408] [JITLink] Use target triple for LinkGraph pointer size and endianness. Removes LinkGraph's PointerSize and Endianness members and uses the triple to find these values instead. Also removes some redundant Triple copies.
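With the members gone, both properties reduce to one-liners over the triple. A sketch mirroring the new JITLink.h accessors (assuming llvm::endianness lives in llvm/ADT/bit.h, as in current LLVM):

  #include "llvm/ADT/bit.h"
  #include "llvm/TargetParser/Triple.h"

  unsigned pointerSize(const llvm::Triple &TT) {
    return llvm::Triple::getArchPointerBitWidth(TT.getArch()) / 8;
  }

  llvm::endianness graphEndianness(const llvm::Triple &TT) {
    return TT.isLittleEndian() ? llvm::endianness::little
                               : llvm::endianness::big;
  }

  // e.g. Triple("x86_64-apple-darwin") -> 8 bytes, little-endian.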
--- .../llvm/ExecutionEngine/JITLink/JITLink.h | 35 ++++-------- .../JITLink/JITLinkMemoryManager.h | 7 +-- .../JITLink/COFFLinkGraphBuilder.cpp | 16 +----- .../JITLink/ELFLinkGraphBuilder.h | 5 +- llvm/lib/ExecutionEngine/JITLink/JITLink.cpp | 25 ++------- .../JITLink/JITLinkMemoryManager.cpp | 17 +++--- .../JITLink/MachOLinkGraphBuilder.cpp | 19 ++----- llvm/lib/ExecutionEngine/Orc/COFFPlatform.cpp | 18 ++----- .../Orc/DebugObjectManagerPlugin.cpp | 6 +-- .../ExecutionEngine/Orc/ELFNixPlatform.cpp | 54 ++++--------------- .../Orc/EPCIndirectionUtils.cpp | 16 +++--- .../ExecutionEngine/Orc/ExecutionUtils.cpp | 38 ++----------- .../Orc/JITLinkRedirectableSymbolManager.cpp | 5 +- .../Orc/JITLinkReentryTrampolines.cpp | 7 +-- .../lib/ExecutionEngine/Orc/MachOPlatform.cpp | 19 ++----- llvm/lib/ExecutionEngine/Orc/SectCreate.cpp | 2 +- .../JITLink/AArch32ErrorTests.cpp | 12 ++--- .../JITLink/JITLinkTestUtils.cpp | 2 +- .../JITLink/LinkGraphTests.cpp | 32 +++++------ .../JITLink/MachOLinkGraphTests.cpp | 2 +- .../JITLink/MemoryManagerErrorTests.cpp | 2 +- .../ExecutionEngine/JITLink/StubsTests.cpp | 11 ++-- .../EPCGenericJITLinkMemoryManagerTest.cpp | 3 +- .../Orc/MapperJITLinkMemoryManagerTest.cpp | 15 ++++-- .../Orc/ObjectLinkingLayerTest.cpp | 29 +++++----- 25 files changed, 128 insertions(+), 269 deletions(-) diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h index 4ff8d92d8de93..2af9119670141 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h @@ -1007,26 +1007,12 @@ class LinkGraph { using GetEdgeKindNameFunction = const char *(*)(Edge::Kind); LinkGraph(std::string Name, std::shared_ptr SSP, - const Triple &TT, SubtargetFeatures Features, unsigned PointerSize, - llvm::endianness Endianness, + Triple TT, SubtargetFeatures Features, GetEdgeKindNameFunction GetEdgeKindName) - : Name(std::move(Name)), SSP(std::move(SSP)), TT(TT), - Features(std::move(Features)), PointerSize(PointerSize), - Endianness(Endianness), GetEdgeKindName(std::move(GetEdgeKindName)) {} - - LinkGraph(std::string Name, std::shared_ptr SSP, - const Triple &TT, unsigned PointerSize, llvm::endianness Endianness, - GetEdgeKindNameFunction GetEdgeKindName) - : LinkGraph(std::move(Name), std::move(SSP), TT, SubtargetFeatures(), - PointerSize, Endianness, GetEdgeKindName) {} - - LinkGraph(std::string Name, std::shared_ptr SSP, - const Triple &TT, GetEdgeKindNameFunction GetEdgeKindName) - : LinkGraph(std::move(Name), std::move(SSP), TT, SubtargetFeatures(), - Triple::getArchPointerBitWidth(TT.getArch()) / 8, - TT.isLittleEndian() ? endianness::little : endianness::big, - GetEdgeKindName) { - assert(!(Triple::getArchPointerBitWidth(TT.getArch()) % 8) && + : Name(std::move(Name)), SSP(std::move(SSP)), TT(std::move(TT)), + Features(std::move(Features)), + GetEdgeKindName(std::move(GetEdgeKindName)) { + assert(!(Triple::getArchPointerBitWidth(this->TT.getArch()) % 8) && "Arch bitwidth is not a multiple of 8"); } @@ -1047,10 +1033,12 @@ class LinkGraph { const SubtargetFeatures &getFeatures() const { return Features; } /// Returns the pointer size for use in this graph. - unsigned getPointerSize() const { return PointerSize; } + unsigned getPointerSize() const { return TT.getArchPointerBitWidth() / 8; } /// Returns the endianness of content in this graph. - llvm::endianness getEndianness() const { return Endianness; } + llvm::endianness getEndianness() const { + return TT.isLittleEndian() ? 
endianness::little : endianness::big; + } const char *getEdgeKindName(Edge::Kind K) const { return GetEdgeKindName(K); } @@ -1640,8 +1628,6 @@ class LinkGraph { std::shared_ptr SSP; Triple TT; SubtargetFeatures Features; - unsigned PointerSize; - llvm::endianness Endianness; GetEdgeKindNameFunction GetEdgeKindName = nullptr; DenseMap> Sections; // FIXME(jared): these should become dense maps @@ -2039,8 +2025,7 @@ createLinkGraphFromObject(MemoryBufferRef ObjectBuffer, /// Create a \c LinkGraph defining the given absolute symbols. std::unique_ptr -absoluteSymbolsLinkGraph(const Triple &TT, - std::shared_ptr SSP, +absoluteSymbolsLinkGraph(Triple TT, std::shared_ptr SSP, orc::SymbolMap Symbols); /// Link the given graph. diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h b/llvm/include/llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h index 1f8bab24c4827..c88469ba8a2c3 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h @@ -25,6 +25,7 @@ #include "llvm/Support/MSVCErrorWorkarounds.h" #include "llvm/Support/Memory.h" #include "llvm/Support/RecyclingAllocator.h" +#include "llvm/TargetParser/Triple.h" #include #include @@ -322,14 +323,14 @@ class SimpleSegmentAlloc { JITLinkMemoryManager::InFlightAlloc::OnFinalizedFunction; static void Create(JITLinkMemoryManager &MemMgr, - std::shared_ptr SSP, + std::shared_ptr SSP, Triple TT, const JITLinkDylib *JD, SegmentMap Segments, OnCreatedFunction OnCreated); static Expected Create(JITLinkMemoryManager &MemMgr, - std::shared_ptr SSP, const JITLinkDylib *JD, - SegmentMap Segments); + std::shared_ptr SSP, Triple TT, + const JITLinkDylib *JD, SegmentMap Segments); SimpleSegmentAlloc(SimpleSegmentAlloc &&); SimpleSegmentAlloc &operator=(SimpleSegmentAlloc &&); diff --git a/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp b/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp index 5d43f7b03bc2a..d3315aad126cb 100644 --- a/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp @@ -32,9 +32,8 @@ COFFLinkGraphBuilder::COFFLinkGraphBuilder( LinkGraph::GetEdgeKindNameFunction GetEdgeKindName) : Obj(Obj), G(std::make_unique(Obj.getFileName().str(), std::move(SSP), - createTripleWithCOFFFormat(TT), - std::move(Features), getPointerSize(Obj), - getEndianness(Obj), + createTripleWithCOFFFormat(std::move(TT)), + std::move(Features), std::move(GetEdgeKindName))) { LLVM_DEBUG({ dbgs() << "Created COFFLinkGraphBuilder for \"" << Obj.getFileName() @@ -44,17 +43,6 @@ COFFLinkGraphBuilder::COFFLinkGraphBuilder( COFFLinkGraphBuilder::~COFFLinkGraphBuilder() = default; -unsigned -COFFLinkGraphBuilder::getPointerSize(const object::COFFObjectFile &Obj) { - return Obj.getBytesInAddress(); -} - -llvm::endianness -COFFLinkGraphBuilder::getEndianness(const object::COFFObjectFile &Obj) { - return Obj.isLittleEndian() ? llvm::endianness::little - : llvm::endianness::big; -} - uint64_t COFFLinkGraphBuilder::getSectionSize(const object::COFFObjectFile &Obj, const object::coff_section *Sec) { // Consider the difference between executable form and object form. 
diff --git a/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h b/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h index fbe40fe5f7571..8dd176cd07f92 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h +++ b/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h @@ -194,9 +194,8 @@ ELFLinkGraphBuilder::ELFLinkGraphBuilder( SubtargetFeatures Features, StringRef FileName, LinkGraph::GetEdgeKindNameFunction GetEdgeKindName) : ELFLinkGraphBuilderBase(std::make_unique( - FileName.str(), std::move(SSP), Triple(std::move(TT)), - std::move(Features), ELFT::Is64Bits ? 8 : 4, - llvm::endianness(ELFT::Endianness), std::move(GetEdgeKindName))), + FileName.str(), std::move(SSP), std::move(TT), std::move(Features), + std::move(GetEdgeKindName))), Obj(Obj) { LLVM_DEBUG( { dbgs() << "Created ELFLinkGraphBuilder for \"" << FileName << "\""; }); diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp index 5edfbdb05f7b3..6b77330bb764b 100644 --- a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp @@ -507,32 +507,13 @@ createLinkGraphFromObject(MemoryBufferRef ObjectBuffer, } std::unique_ptr -absoluteSymbolsLinkGraph(const Triple &TT, - std::shared_ptr SSP, +absoluteSymbolsLinkGraph(Triple TT, std::shared_ptr SSP, orc::SymbolMap Symbols) { - unsigned PointerSize; - endianness Endianness = - TT.isLittleEndian() ? endianness::little : endianness::big; - switch (TT.getArch()) { - case Triple::aarch64: - case llvm::Triple::riscv64: - case Triple::x86_64: - PointerSize = 8; - break; - case llvm::Triple::arm: - case llvm::Triple::riscv32: - case llvm::Triple::x86: - PointerSize = 4; - break; - default: - llvm::report_fatal_error("unhandled target architecture"); - } - static std::atomic Counter = {0}; auto Index = Counter.fetch_add(1, std::memory_order_relaxed); auto G = std::make_unique( - "", std::move(SSP), TT, - PointerSize, Endianness, /*GetEdgeKindName=*/nullptr); + "", std::move(SSP), + std::move(TT), SubtargetFeatures(), getGenericEdgeKindName); for (auto &[Name, Def] : Symbols) { auto &Sym = G->addAbsoluteSymbol(*Name, Def.getAddress(), /*Size=*/0, diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp index 2e3323af98e94..6c7e27e429849 100644 --- a/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp @@ -145,7 +145,8 @@ orc::shared::AllocActions &BasicLayout::graphAllocActions() { void SimpleSegmentAlloc::Create(JITLinkMemoryManager &MemMgr, std::shared_ptr SSP, - const JITLinkDylib *JD, SegmentMap Segments, + Triple TT, const JITLinkDylib *JD, + SegmentMap Segments, OnCreatedFunction OnCreated) { static_assert(orc::AllocGroup::NumGroups == 32, @@ -156,8 +157,9 @@ void SimpleSegmentAlloc::Create(JITLinkMemoryManager &MemMgr, "__---.finalize", "__R--.finalize", "__-W-.finalize", "__RW-.finalize", "__--X.finalize", "__R-X.finalize", "__-WX.finalize", "__RWX.finalize"}; - auto G = std::make_unique("", std::move(SSP), Triple(), 0, - llvm::endianness::native, nullptr); + auto G = + std::make_unique("", std::move(SSP), std::move(TT), + SubtargetFeatures(), getGenericEdgeKindName); orc::AllocGroupSmallMap ContentBlocks; orc::ExecutorAddr NextAddr(0x100000); @@ -201,13 +203,12 @@ void SimpleSegmentAlloc::Create(JITLinkMemoryManager &MemMgr, }); } -Expected -SimpleSegmentAlloc::Create(JITLinkMemoryManager &MemMgr, - std::shared_ptr SSP, - const 
JITLinkDylib *JD, SegmentMap Segments) { +Expected SimpleSegmentAlloc::Create( + JITLinkMemoryManager &MemMgr, std::shared_ptr SSP, + Triple TT, const JITLinkDylib *JD, SegmentMap Segments) { std::promise> AllocP; auto AllocF = AllocP.get_future(); - Create(MemMgr, std::move(SSP), JD, std::move(Segments), + Create(MemMgr, std::move(SSP), std::move(TT), JD, std::move(Segments), [&](Expected Result) { AllocP.set_value(std::move(Result)); }); diff --git a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp index 0e7ecf6cc1383..3e757f780b550 100644 --- a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp @@ -53,10 +53,10 @@ MachOLinkGraphBuilder::MachOLinkGraphBuilder( std::shared_ptr SSP, Triple TT, SubtargetFeatures Features, LinkGraph::GetEdgeKindNameFunction GetEdgeKindName) - : Obj(Obj), G(std::make_unique( - std::string(Obj.getFileName()), std::move(SSP), - std::move(TT), std::move(Features), getPointerSize(Obj), - getEndianness(Obj), std::move(GetEdgeKindName))) { + : Obj(Obj), + G(std::make_unique( + std::string(Obj.getFileName()), std::move(SSP), std::move(TT), + std::move(Features), std::move(GetEdgeKindName))) { auto &MachHeader = Obj.getHeader64(); SubsectionsViaSymbols = MachHeader.flags & MachO::MH_SUBSECTIONS_VIA_SYMBOLS; } @@ -104,17 +104,6 @@ bool MachOLinkGraphBuilder::isZeroFillSection(const NormalizedSection &NSec) { } } -unsigned -MachOLinkGraphBuilder::getPointerSize(const object::MachOObjectFile &Obj) { - return Obj.is64Bit() ? 8 : 4; -} - -llvm::endianness -MachOLinkGraphBuilder::getEndianness(const object::MachOObjectFile &Obj) { - return Obj.isLittleEndian() ? llvm::endianness::little - : llvm::endianness::big; -} - Section &MachOLinkGraphBuilder::getCommonSection() { if (!CommonSection) CommonSection = &G->createSection(CommonSectionName, diff --git a/llvm/lib/ExecutionEngine/Orc/COFFPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/COFFPlatform.cpp index a435997478151..17f4c3c741141 100644 --- a/llvm/lib/ExecutionEngine/Orc/COFFPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/COFFPlatform.cpp @@ -54,22 +54,10 @@ class COFFHeaderMaterializationUnit : public MaterializationUnit { StringRef getName() const override { return "COFFHeaderMU"; } void materialize(std::unique_ptr R) override { - unsigned PointerSize; - llvm::endianness Endianness; - const auto &TT = CP.getExecutionSession().getTargetTriple(); - - switch (TT.getArch()) { - case Triple::x86_64: - PointerSize = 8; - Endianness = llvm::endianness::little; - break; - default: - llvm_unreachable("Unrecognized architecture"); - } - auto G = std::make_unique( - "", CP.getExecutionSession().getSymbolStringPool(), TT, - PointerSize, Endianness, jitlink::getGenericEdgeKindName); + "", CP.getExecutionSession().getSymbolStringPool(), + CP.getExecutionSession().getTargetTriple(), SubtargetFeatures(), + jitlink::getGenericEdgeKindName); auto &HeaderSection = G->createSection("__header", MemProt::Read); auto &HeaderBlock = createHeaderBlock(*G, HeaderSection); diff --git a/llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp index 180905346d561..80b7452a0b226 100644 --- a/llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp +++ b/llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp @@ -331,9 +331,9 @@ Expected ELFDebugObject::finalizeWorkingMemory() { size_t Size = Buffer->getBufferSize(); // Allocate working memory 
for debug object in read-only segment. - auto Alloc = - SimpleSegmentAlloc::Create(MemMgr, ES.getSymbolStringPool(), JD, - {{MemProt::Read, {Size, Align(PageSize)}}}); + auto Alloc = SimpleSegmentAlloc::Create( + MemMgr, ES.getSymbolStringPool(), ES.getTargetTriple(), JD, + {{MemProt::Read, {Size, Align(PageSize)}}}); if (!Alloc) return Alloc; diff --git a/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp index 5f017d9979c3c..8aa517a27d997 100644 --- a/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp @@ -40,34 +40,10 @@ getArgDataBufferType(const ArgTs &...Args) { std::unique_ptr createPlatformGraph(ELFNixPlatform &MOP, std::string Name) { - unsigned PointerSize; - llvm::endianness Endianness; - const auto &TT = MOP.getExecutionSession().getTargetTriple(); - - switch (TT.getArch()) { - case Triple::x86_64: - PointerSize = 8; - Endianness = llvm::endianness::little; - break; - case Triple::aarch64: - PointerSize = 8; - Endianness = llvm::endianness::little; - break; - case Triple::ppc64: - PointerSize = 8; - Endianness = llvm::endianness::big; - break; - case Triple::ppc64le: - PointerSize = 8; - Endianness = llvm::endianness::little; - break; - default: - llvm_unreachable("Unrecognized architecture"); - } - + auto &ES = MOP.getExecutionSession(); return std::make_unique( - std::move(Name), MOP.getExecutionSession().getSymbolStringPool(), TT, - PointerSize, Endianness, jitlink::getGenericEdgeKindName); + std::move(Name), ES.getSymbolStringPool(), ES.getTargetTriple(), + SubtargetFeatures(), jitlink::getGenericEdgeKindName); } // Creates a Bootstrap-Complete LinkGraph to run deferred actions. @@ -156,30 +132,22 @@ class DSOHandleMaterializationUnit : public MaterializationUnit { StringRef getName() const override { return "DSOHandleMU"; } void materialize(std::unique_ptr R) override { - unsigned PointerSize; - llvm::endianness Endianness; + + auto &ES = ENP.getExecutionSession(); + jitlink::Edge::Kind EdgeKind; - const auto &TT = ENP.getExecutionSession().getTargetTriple(); - switch (TT.getArch()) { + switch (ES.getTargetTriple().getArch()) { case Triple::x86_64: - PointerSize = 8; - Endianness = llvm::endianness::little; EdgeKind = jitlink::x86_64::Pointer64; break; case Triple::aarch64: - PointerSize = 8; - Endianness = llvm::endianness::little; EdgeKind = jitlink::aarch64::Pointer64; break; case Triple::ppc64: - PointerSize = 8; - Endianness = llvm::endianness::big; EdgeKind = jitlink::ppc64::Pointer64; break; case Triple::ppc64le: - PointerSize = 8; - Endianness = llvm::endianness::little; EdgeKind = jitlink::ppc64::Pointer64; break; default: @@ -188,13 +156,13 @@ class DSOHandleMaterializationUnit : public MaterializationUnit { // void *__dso_handle = &__dso_handle; auto G = std::make_unique( - "", ENP.getExecutionSession().getSymbolStringPool(), TT, - PointerSize, Endianness, jitlink::getGenericEdgeKindName); + "", ES.getSymbolStringPool(), ES.getTargetTriple(), + SubtargetFeatures(), jitlink::getGenericEdgeKindName); auto &DSOHandleSection = G->createSection(".data.__dso_handle", MemProt::Read); auto &DSOHandleBlock = G->createContentBlock( - DSOHandleSection, getDSOHandleContent(PointerSize), orc::ExecutorAddr(), - 8, 0); + DSOHandleSection, getDSOHandleContent(G->getPointerSize()), + orc::ExecutorAddr(), 8, 0); auto &DSOHandleSymbol = G->addDefinedSymbol( DSOHandleBlock, 0, *R->getInitializerSymbol(), DSOHandleBlock.getSize(), jitlink::Linkage::Strong, jitlink::Scope::Default, false, 
true); diff --git a/llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp index a44da22cb3512..8e4937d405620 100644 --- a/llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp +++ b/llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp @@ -110,8 +110,8 @@ Error EPCTrampolinePool::grow() { auto &EPC = EPCIU.getExecutorProcessControl(); auto PageSize = EPC.getPageSize(); auto Alloc = SimpleSegmentAlloc::Create( - EPC.getMemMgr(), EPC.getSymbolStringPool(), nullptr, - {{MemProt::Read | MemProt::Exec, {PageSize, Align(PageSize)}}}); + EPC.getMemMgr(), EPC.getSymbolStringPool(), EPC.getTargetTriple(), + nullptr, {{MemProt::Read | MemProt::Exec, {PageSize, Align(PageSize)}}}); if (!Alloc) return Alloc.takeError(); @@ -294,10 +294,11 @@ EPCIndirectionUtils::writeResolverBlock(ExecutorAddr ReentryFnAddr, assert(ABI && "ABI can not be null"); auto ResolverSize = ABI->getResolverCodeSize(); - auto Alloc = SimpleSegmentAlloc::Create( - EPC.getMemMgr(), EPC.getSymbolStringPool(), nullptr, - {{MemProt::Read | MemProt::Exec, - {ResolverSize, Align(EPC.getPageSize())}}}); + auto Alloc = + SimpleSegmentAlloc::Create(EPC.getMemMgr(), EPC.getSymbolStringPool(), + EPC.getTargetTriple(), nullptr, + {{MemProt::Read | MemProt::Exec, + {ResolverSize, Align(EPC.getPageSize())}}}); if (!Alloc) return Alloc.takeError(); @@ -363,7 +364,8 @@ EPCIndirectionUtils::getIndirectStubs(unsigned NumStubs) { auto PtrProt = MemProt::Read | MemProt::Write; auto Alloc = SimpleSegmentAlloc::Create( - EPC.getMemMgr(), EPC.getSymbolStringPool(), nullptr, + EPC.getMemMgr(), EPC.getSymbolStringPool(), EPC.getTargetTriple(), + nullptr, {{StubProt, {static_cast(StubBytes), Align(PageSize)}}, {PtrProt, {static_cast(PtrBytes), Align(PageSize)}}}); diff --git a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp index 58cd05cd88ba1..89a7a4e86f1bf 100644 --- a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp @@ -525,49 +525,17 @@ Error DLLImportDefinitionGenerator::tryToGenerate( return L.add(JD, std::move(*G)); } -Expected -DLLImportDefinitionGenerator::getTargetPointerSize(const Triple &TT) { - switch (TT.getArch()) { - case Triple::x86_64: - return 8; - default: - return make_error( - "architecture unsupported by DLLImportDefinitionGenerator", - inconvertibleErrorCode()); - } -} - -Expected -DLLImportDefinitionGenerator::getEndianness(const Triple &TT) { - switch (TT.getArch()) { - case Triple::x86_64: - return llvm::endianness::little; - default: - return make_error( - "architecture unsupported by DLLImportDefinitionGenerator", - inconvertibleErrorCode()); - } -} - Expected> DLLImportDefinitionGenerator::createStubsGraph(const SymbolMap &Resolved) { - Triple TT = ES.getTargetTriple(); - auto PointerSize = getTargetPointerSize(TT); - if (!PointerSize) - return PointerSize.takeError(); - auto Endianness = getEndianness(TT); - if (!Endianness) - return Endianness.takeError(); - auto G = std::make_unique( - "", ES.getSymbolStringPool(), TT, *PointerSize, - *Endianness, jitlink::getGenericEdgeKindName); + "", ES.getSymbolStringPool(), ES.getTargetTriple(), + SubtargetFeatures(), jitlink::getGenericEdgeKindName); jitlink::Section &Sec = G->createSection(getSectionName(), MemProt::Read | MemProt::Exec); for (auto &KV : Resolved) { jitlink::Symbol &Target = G->addAbsoluteSymbol( - *KV.first, KV.second.getAddress(), *PointerSize, + *KV.first, KV.second.getAddress(), G->getPointerSize(), 
jitlink::Linkage::Strong, jitlink::Scope::Local, false); // Create __imp_ symbol diff --git a/llvm/lib/ExecutionEngine/Orc/JITLinkRedirectableSymbolManager.cpp b/llvm/lib/ExecutionEngine/Orc/JITLinkRedirectableSymbolManager.cpp index 8dd479a64f1b5..06c545d62d76a 100644 --- a/llvm/lib/ExecutionEngine/Orc/JITLinkRedirectableSymbolManager.cpp +++ b/llvm/lib/ExecutionEngine/Orc/JITLinkRedirectableSymbolManager.cpp @@ -24,12 +24,9 @@ void JITLinkRedirectableSymbolManager::emitRedirectableSymbols( std::unique_ptr R, SymbolMap InitialDests) { auto &ES = ObjLinkingLayer.getExecutionSession(); - Triple TT = ES.getTargetTriple(); - auto G = std::make_unique( ("").str(), - ES.getSymbolStringPool(), TT, TT.isArch64Bit() ? 8 : 4, - TT.isLittleEndian() ? endianness::little : endianness::big, + ES.getSymbolStringPool(), ES.getTargetTriple(), SubtargetFeatures(), jitlink::getGenericEdgeKindName); auto &PointerSection = G->createSection(StubPtrSectionName, MemProt::Write | MemProt::Read); diff --git a/llvm/lib/ExecutionEngine/Orc/JITLinkReentryTrampolines.cpp b/llvm/lib/ExecutionEngine/Orc/JITLinkReentryTrampolines.cpp index d9f859654d41e..d305342e6fed8 100644 --- a/llvm/lib/ExecutionEngine/Orc/JITLinkReentryTrampolines.cpp +++ b/llvm/lib/ExecutionEngine/Orc/JITLinkReentryTrampolines.cpp @@ -123,16 +123,13 @@ void JITLinkReentryTrampolines::emit(ResourceTrackerSP RT, JITDylibSP JD(&RT->getJITDylib()); auto &ES = ObjLinkingLayer.getExecutionSession(); - Triple TT = ES.getTargetTriple(); auto ReentryGraphSym = ES.intern(("__orc_reentry_graph_#" + Twine(++ReentryGraphIdx)).str()); auto G = std::make_unique( - (*ReentryGraphSym).str(), ES.getSymbolStringPool(), TT, - TT.isArch64Bit() ? 8 : 4, - TT.isLittleEndian() ? endianness::little : endianness::big, - jitlink::getGenericEdgeKindName); + (*ReentryGraphSym).str(), ES.getSymbolStringPool(), ES.getTargetTriple(), + SubtargetFeatures(), jitlink::getGenericEdgeKindName); auto &ReentryFnSym = G->addExternalSymbol(ReentryFnName, 0, false); diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp index 0013eddb1f2c9..e0d40cf2de5aa 100644 --- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp @@ -94,23 +94,10 @@ using SPSRegisterSymbolsArgs = std::unique_ptr createPlatformGraph(MachOPlatform &MOP, std::string Name) { - unsigned PointerSize; - llvm::endianness Endianness; - const auto &TT = MOP.getExecutionSession().getTargetTriple(); - - switch (TT.getArch()) { - case Triple::aarch64: - case Triple::x86_64: - PointerSize = 8; - Endianness = llvm::endianness::little; - break; - default: - llvm_unreachable("Unrecognized architecture"); - } - + auto &ES = MOP.getExecutionSession(); return std::make_unique( - std::move(Name), MOP.getExecutionSession().getSymbolStringPool(), TT, - PointerSize, Endianness, jitlink::getGenericEdgeKindName); + std::move(Name), ES.getSymbolStringPool(), ES.getTargetTriple(), + SubtargetFeatures(), jitlink::getGenericEdgeKindName); } // Creates a Bootstrap-Complete LinkGraph to run deferred actions. 
diff --git a/llvm/lib/ExecutionEngine/Orc/SectCreate.cpp b/llvm/lib/ExecutionEngine/Orc/SectCreate.cpp index 4ed05ef3573f8..c50654cf47a2f 100644 --- a/llvm/lib/ExecutionEngine/Orc/SectCreate.cpp +++ b/llvm/lib/ExecutionEngine/Orc/SectCreate.cpp @@ -20,7 +20,7 @@ void SectCreateMaterializationUnit::materialize( "orc_sectcreate_" + SectName, ObjLinkingLayer.getExecutionSession().getSymbolStringPool(), ObjLinkingLayer.getExecutionSession().getTargetTriple(), - getGenericEdgeKindName); + SubtargetFeatures(), getGenericEdgeKindName); auto &Sect = G->createSection(SectName, MP); auto Content = G->allocateContent( diff --git a/llvm/unittests/ExecutionEngine/JITLink/AArch32ErrorTests.cpp b/llvm/unittests/ExecutionEngine/JITLink/AArch32ErrorTests.cpp index b646f88b6fbcf..6e9957f53a357 100644 --- a/llvm/unittests/ExecutionEngine/JITLink/AArch32ErrorTests.cpp +++ b/llvm/unittests/ExecutionEngine/JITLink/AArch32ErrorTests.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/ExecutionEngine/Orc/SymbolStringPool.h" +#include "llvm/TargetParser/Triple.h" #include "llvm/Testing/Support/Error.h" #include "gtest/gtest.h" #include @@ -17,10 +18,9 @@ using namespace llvm::jitlink::aarch32; using namespace llvm::support; using namespace llvm::support::endian; -constexpr unsigned PointerSize = 4; auto G = std::make_unique( "foo", std::make_shared(), - Triple("armv7-linux-gnueabi"), PointerSize, endianness::little, + Triple("armv7-linux-gnueabi"), SubtargetFeatures(), aarch32::getEdgeKindName); auto &Sec = G->createSection("__data", orc::MemProt::Read | orc::MemProt::Write); @@ -49,7 +49,7 @@ class AArch32Errors : public testing::Test { void SetUp() override { G = std::make_unique( "foo", std::make_shared(), - Triple("armv7-linux-gnueabi"), PointerSize, endianness::little, + Triple("armv7-linux-gnueabi"), SubtargetFeatures(), aarch32::getEdgeKindName); S = &G->createSection("__data", orc::MemProt::Read | orc::MemProt::Write); } @@ -78,7 +78,7 @@ class AArch32Errors : public testing::Test { Symbol &createSymbolWithDistance(Block &Origin, uint64_t Dist) { uint64_t TargetAddr = Origin.getAddress().getValue() + Dist; return G->addAnonymousSymbol(createBlock(Zeros, TargetAddr), 0 /*Offset*/, - PointerSize, false, false); + G->getPointerSize(), false, false); }; template void write(uint8_t *Mem, HalfWords Data) { @@ -149,8 +149,8 @@ TEST_F(AArch32Errors, applyFixupDataGeneric) { Block &TargetBlock = createBlock(Zeros, 0x2000); constexpr uint64_t OffsetInTarget = 0; - Symbol &TargetSymbol = G->addAnonymousSymbol(TargetBlock, OffsetInTarget, - PointerSize, false, false); + Symbol &TargetSymbol = G->addAnonymousSymbol( + TargetBlock, OffsetInTarget, G->getPointerSize(), false, false); constexpr uint64_t OffsetInOrigin = 0; Edge::Kind Invalid = Edge::GenericEdgeKind::Invalid; diff --git a/llvm/unittests/ExecutionEngine/JITLink/JITLinkTestUtils.cpp b/llvm/unittests/ExecutionEngine/JITLink/JITLinkTestUtils.cpp index 24e540b97c0cf..2d0801fb6ac67 100644 --- a/llvm/unittests/ExecutionEngine/JITLink/JITLinkTestUtils.cpp +++ b/llvm/unittests/ExecutionEngine/JITLink/JITLinkTestUtils.cpp @@ -94,7 +94,7 @@ TEST(JITLinkMocks, SmokeTest) { // successfully. 
auto G = std::make_unique( "foo", std::make_shared(), - Triple("x86_64-apple-darwin"), 8, llvm::endianness::little, + Triple("x86_64-apple-darwin"), SubtargetFeatures(), getGenericEdgeKindName); ArrayRef Content = "hello, world!"; diff --git a/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp b/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp index 3b96019766c84..11a379c7e5024 100644 --- a/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp +++ b/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp @@ -22,7 +22,7 @@ using namespace llvm::jitlink; TEST(LinkGraphTest, Construction) { // Check that LinkGraph construction works as expected. LinkGraph G("foo", std::make_shared(), - Triple("x86_64-apple-darwin"), 8, llvm::endianness::little, + Triple("x86_64-apple-darwin"), SubtargetFeatures(), getGenericEdgeKindName); EXPECT_EQ(G.getName(), "foo"); EXPECT_EQ(G.getTargetTriple().str(), "x86_64-apple-darwin"); @@ -37,7 +37,7 @@ TEST(LinkGraphTest, Construction) { TEST(LinkGraphTest, AddressAccess) { // Check that we can get addresses for blocks, symbols, and edges. LinkGraph G("foo", std::make_shared(), - Triple("x86_64-apple-darwin"), 8, llvm::endianness::little, + Triple("x86_64-apple-darwin"), SubtargetFeatures(), getGenericEdgeKindName); auto &Sec1 = @@ -57,7 +57,7 @@ TEST(LinkGraphTest, AddressAccess) { TEST(LinkGraphTest, SectionEmpty) { // Check that Section::empty behaves as expected. LinkGraph G("foo", std::make_shared(), - Triple("x86_64-apple-darwin"), 8, llvm::endianness::little, + Triple("x86_64-apple-darwin"), SubtargetFeatures(), getGenericEdgeKindName); auto &Sec1 = G.createSection("__data.1", orc::MemProt::Read | orc::MemProt::Write); @@ -76,7 +76,7 @@ TEST(LinkGraphTest, SectionEmpty) { TEST(LinkGraphTest, BlockAndSymbolIteration) { // Check that we can iterate over blocks within Sections and across sections. LinkGraph G("foo", std::make_shared(), - Triple("x86_64-apple-darwin"), 8, llvm::endianness::little, + Triple("x86_64-apple-darwin"), SubtargetFeatures(), getGenericEdgeKindName); auto &Sec1 = G.createSection("__data.1", orc::MemProt::Read | orc::MemProt::Write); @@ -130,7 +130,7 @@ TEST(LinkGraphTest, BlockAndSymbolIteration) { TEST(LinkGraphTest, ContentAccessAndUpdate) { // Check that we can make a defined symbol external. LinkGraph G("foo", std::make_shared(), - Triple("x86_64-apple-darwin"), 8, llvm::endianness::little, + Triple("x86_64-apple-darwin"), SubtargetFeatures(), getGenericEdgeKindName); auto &Sec = G.createSection("__data", orc::MemProt::Read | orc::MemProt::Write); @@ -220,7 +220,7 @@ TEST(LinkGraphTest, ContentAccessAndUpdate) { TEST(LinkGraphTest, MakeExternal) { // Check that we can make defined and absolute symbols external. LinkGraph G("foo", std::make_shared(), - Triple("x86_64-apple-darwin"), 8, llvm::endianness::little, + Triple("x86_64-apple-darwin"), SubtargetFeatures(), getGenericEdgeKindName); auto &Sec = G.createSection("__data", orc::MemProt::Read | orc::MemProt::Write); @@ -291,7 +291,7 @@ TEST(LinkGraphTest, MakeExternal) { TEST(LinkGraphTest, MakeAbsolute) { // Check that we can make defined and external symbols absolute. 
LinkGraph G("foo", std::make_shared(), - Triple("x86_64-apple-darwin"), 8, llvm::endianness::little, + Triple("x86_64-apple-darwin"), SubtargetFeatures(), getGenericEdgeKindName); auto &Sec = G.createSection("__data", orc::MemProt::Read | orc::MemProt::Write); @@ -361,7 +361,7 @@ TEST(LinkGraphTest, MakeAbsolute) { TEST(LinkGraphTest, MakeDefined) { // Check that we can make an external symbol defined. LinkGraph G("foo", std::make_shared(), - Triple("x86_64-apple-darwin"), 8, llvm::endianness::little, + Triple("x86_64-apple-darwin"), SubtargetFeatures(), getGenericEdgeKindName); auto &Sec = G.createSection("__data", orc::MemProt::Read | orc::MemProt::Write); @@ -410,7 +410,7 @@ TEST(LinkGraphTest, MakeDefined) { TEST(LinkGraphTest, TransferDefinedSymbol) { // Check that we can transfer a defined symbol from one block to another. LinkGraph G("foo", std::make_shared(), - Triple("x86_64-apple-darwin"), 8, llvm::endianness::little, + Triple("x86_64-apple-darwin"), SubtargetFeatures(), getGenericEdgeKindName); auto &Sec = G.createSection("__data", orc::MemProt::Read | orc::MemProt::Write); @@ -446,7 +446,7 @@ TEST(LinkGraphTest, TransferDefinedSymbolAcrossSections) { // Check that we can transfer a defined symbol from an existing block in one // section to another. LinkGraph G("foo", std::make_shared(), - Triple("x86_64-apple-darwin"), 8, llvm::endianness::little, + Triple("x86_64-apple-darwin"), SubtargetFeatures(), getGenericEdgeKindName); auto &Sec1 = G.createSection("__data.1", orc::MemProt::Read | orc::MemProt::Write); @@ -481,7 +481,7 @@ TEST(LinkGraphTest, TransferBlock) { // Check that we can transfer a block (and all associated symbols) from one // section to another. LinkGraph G("foo", std::make_shared(), - Triple("x86_64-apple-darwin"), 8, llvm::endianness::little, + Triple("x86_64-apple-darwin"), SubtargetFeatures(), getGenericEdgeKindName); auto &Sec1 = G.createSection("__data.1", orc::MemProt::Read | orc::MemProt::Write); @@ -530,7 +530,7 @@ TEST(LinkGraphTest, MergeSections) { // Check that we can transfer a block (and all associated symbols) from one // section to another. LinkGraph G("foo", std::make_shared(), - Triple("x86_64-apple-darwin"), 8, llvm::endianness::little, + Triple("x86_64-apple-darwin"), SubtargetFeatures(), getGenericEdgeKindName); auto &Sec1 = G.createSection("__data.1", orc::MemProt::Read | orc::MemProt::Write); @@ -617,7 +617,7 @@ TEST(LinkGraphTest, MergeSections) { TEST(LinkGraphTest, SplitBlock) { // Check that the LinkGraph::splitBlock test works as expected. LinkGraph G("foo", std::make_shared(), - Triple("x86_64-apple-darwin"), 8, llvm::endianness::little, + Triple("x86_64-apple-darwin"), SubtargetFeatures(), getGenericEdgeKindName); auto &Sec = G.createSection("__data", orc::MemProt::Read | orc::MemProt::Write); @@ -730,7 +730,7 @@ TEST(LinkGraphTest, SplitBlock) { TEST(LinkGraphTest, GraphAllocationMethods) { LinkGraph G("foo", std::make_shared(), - Triple("x86_64-apple-darwin"), 8, llvm::endianness::little, + Triple("x86_64-apple-darwin"), SubtargetFeatures(), getGenericEdgeKindName); // Test allocation of sized, uninitialized buffer. @@ -752,7 +752,7 @@ TEST(LinkGraphTest, GraphAllocationMethods) { TEST(LinkGraphTest, IsCStringBlockTest) { // Check that the LinkGraph::splitBlock test works as expected. 
LinkGraph G("foo", std::make_shared(), - Triple("x86_64-apple-darwin"), 8, llvm::endianness::little, + Triple("x86_64-apple-darwin"), SubtargetFeatures(), getGenericEdgeKindName); auto &Sec = G.createSection("__data", orc::MemProt::Read | orc::MemProt::Write); @@ -777,7 +777,7 @@ TEST(LinkGraphTest, IsCStringBlockTest) { TEST(LinkGraphTest, BasicLayoutHonorsNoAlloc) { LinkGraph G("foo", std::make_shared(), - Triple("x86_64-apple-darwin"), 8, llvm::endianness::little, + Triple("x86_64-apple-darwin"), SubtargetFeatures(), getGenericEdgeKindName); // Create a regular section and block. diff --git a/llvm/unittests/ExecutionEngine/JITLink/MachOLinkGraphTests.cpp b/llvm/unittests/ExecutionEngine/JITLink/MachOLinkGraphTests.cpp index b68310aaf3990..213849c076cfe 100644 --- a/llvm/unittests/ExecutionEngine/JITLink/MachOLinkGraphTests.cpp +++ b/llvm/unittests/ExecutionEngine/JITLink/MachOLinkGraphTests.cpp @@ -22,7 +22,7 @@ using namespace llvm::jitlink; TEST(MachOLinkGraphTest, GetStandardSections) { // Check that LinkGraph construction works as expected. LinkGraph G("foo", std::make_shared(), - Triple("arm64-apple-darwin"), 8, llvm::endianness::little, + Triple("arm64-apple-darwin"), SubtargetFeatures(), getGenericEdgeKindName); auto &Data = getMachODefaultRWDataSection(G); diff --git a/llvm/unittests/ExecutionEngine/JITLink/MemoryManagerErrorTests.cpp b/llvm/unittests/ExecutionEngine/JITLink/MemoryManagerErrorTests.cpp index d1284ffb2ecf2..48a38aefe4e6d 100644 --- a/llvm/unittests/ExecutionEngine/JITLink/MemoryManagerErrorTests.cpp +++ b/llvm/unittests/ExecutionEngine/JITLink/MemoryManagerErrorTests.cpp @@ -20,7 +20,7 @@ TEST(MemoryManagerErrorTest, ErrorOnFirstAllocate) { // Check that we can get addresses for blocks, symbols, and edges. auto G = std::make_unique( "foo", std::make_shared(), - Triple("x86_64-apple-darwin"), 8, llvm::endianness::little, + Triple("x86_64-apple-darwin"), SubtargetFeatures(), getGenericEdgeKindName); ArrayRef Content = "hello, world!"; diff --git a/llvm/unittests/ExecutionEngine/JITLink/StubsTests.cpp b/llvm/unittests/ExecutionEngine/JITLink/StubsTests.cpp index 8efbb54f59e84..8a53d0a560ba3 100644 --- a/llvm/unittests/ExecutionEngine/JITLink/StubsTests.cpp +++ b/llvm/unittests/ExecutionEngine/JITLink/StubsTests.cpp @@ -58,7 +58,7 @@ TEST(StubsTest, StubsGeneration_x86_64) { const char PointerJumpStubContent[6] = { static_cast(0xFFu), 0x25, 0x00, 0x00, 0x00, 0x00}; LinkGraph G("foo", std::make_shared(), - Triple("x86_64-apple-darwin"), 8, llvm::endianness::little, + Triple("x86_64-apple-darwin"), SubtargetFeatures(), getGenericEdgeKindName); auto [PointerSym, StubSym] = GenerateStub(G, 8U, x86_64::Pointer64); @@ -79,7 +79,7 @@ TEST(StubsTest, StubsGeneration_aarch64) { 0x00, 0x02, 0x1f, (char)0xd6u // BR x16 }; LinkGraph G("foo", std::make_shared(), - Triple("aarch64-linux-gnu"), 8, llvm::endianness::little, + Triple("aarch64-linux-gnu"), SubtargetFeatures(), getGenericEdgeKindName); auto [PointerSym, StubSym] = GenerateStub(G, 8U, aarch64::Pointer64); @@ -100,7 +100,7 @@ TEST(StubsTest, StubsGeneration_i386) { const char PointerJumpStubContent[6] = { static_cast(0xFFu), 0x25, 0x00, 0x00, 0x00, 0x00}; LinkGraph G("foo", std::make_shared(), - Triple("i386-unknown-linux-gnu"), 8, llvm::endianness::little, + Triple("i386-unknown-linux-gnu"), SubtargetFeatures(), getGenericEdgeKindName); auto [PointerSym, StubSym] = GenerateStub(G, 4U, i386::Pointer32); @@ -130,7 +130,7 @@ TEST(StubsTest, StubsGeneration_loongarch32) { 0x4c // jr $t8 }; LinkGraph G("foo", 
std::make_shared(), - Triple("loongarch32"), 4, llvm::endianness::little, + Triple("loongarch32"), SubtargetFeatures(), getGenericEdgeKindName); auto [PointerSym, StubSym] = GenerateStub(G, 4U, loongarch::Pointer32); @@ -162,8 +162,9 @@ TEST(StubsTest, StubsGeneration_loongarch64) { 0x00, 0x4c // jr $t8 }; + LinkGraph G("foo", std::make_shared(), - Triple("loongarch64"), 8, llvm::endianness::little, + Triple("loongarch64"), SubtargetFeatures(), getGenericEdgeKindName); auto [PointerSym, StubSym] = GenerateStub(G, 8U, loongarch::Pointer64); diff --git a/llvm/unittests/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManagerTest.cpp b/llvm/unittests/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManagerTest.cpp index 1b96b988068f2..93707bc52a5a9 100644 --- a/llvm/unittests/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManagerTest.cpp +++ b/llvm/unittests/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManagerTest.cpp @@ -116,7 +116,8 @@ TEST(EPCGenericJITLinkMemoryManagerTest, AllocFinalizeFree) { auto MemMgr = std::make_unique(*SelfEPC, SAs); StringRef Hello = "hello"; auto SSA = jitlink::SimpleSegmentAlloc::Create( - *MemMgr, std::make_shared(), nullptr, + *MemMgr, std::make_shared(), + Triple("x86_64-apple-darwin"), nullptr, {{MemProt::Read, {Hello.size(), Align(1)}}}); EXPECT_THAT_EXPECTED(SSA, Succeeded()); auto SegInfo = SSA->getSegInfo(MemProt::Read); diff --git a/llvm/unittests/ExecutionEngine/Orc/MapperJITLinkMemoryManagerTest.cpp b/llvm/unittests/ExecutionEngine/Orc/MapperJITLinkMemoryManagerTest.cpp index 61dd1a4234d54..d6ce80821641c 100644 --- a/llvm/unittests/ExecutionEngine/Orc/MapperJITLinkMemoryManagerTest.cpp +++ b/llvm/unittests/ExecutionEngine/Orc/MapperJITLinkMemoryManagerTest.cpp @@ -76,7 +76,8 @@ TEST(MapperJITLinkMemoryManagerTest, InProcess) { StringRef Hello = "hello"; auto SSA1 = jitlink::SimpleSegmentAlloc::Create( - *MemMgr, std::make_shared(), nullptr, + *MemMgr, std::make_shared(), + Triple("x86_64-apple-darwin"), nullptr, {{MemProt::Read, {Hello.size(), Align(1)}}}); EXPECT_THAT_EXPECTED(SSA1, Succeeded()); @@ -93,7 +94,8 @@ TEST(MapperJITLinkMemoryManagerTest, InProcess) { EXPECT_EQ(Counter->InitCount, 1); auto SSA2 = jitlink::SimpleSegmentAlloc::Create( - *MemMgr, std::make_shared(), nullptr, + *MemMgr, std::make_shared(), + Triple("x86_64-apple-darwin"), nullptr, {{MemProt::Read, {Hello.size(), Align(1)}}}); EXPECT_THAT_EXPECTED(SSA2, Succeeded()); @@ -140,7 +142,8 @@ TEST(MapperJITLinkMemoryManagerTest, Coalescing) { auto SSP = std::make_shared(); auto SSA1 = jitlink::SimpleSegmentAlloc::Create( - *MemMgr, SSP, nullptr, {{MemProt::Read, {1024, Align(1)}}}); + *MemMgr, SSP, Triple("x86_64-apple-darwin"), nullptr, + {{MemProt::Read, {1024, Align(1)}}}); EXPECT_THAT_EXPECTED(SSA1, Succeeded()); auto SegInfo1 = SSA1->getSegInfo(MemProt::Read); ExecutorAddr TargetAddr1(SegInfo1.Addr); @@ -148,7 +151,8 @@ TEST(MapperJITLinkMemoryManagerTest, Coalescing) { EXPECT_THAT_EXPECTED(FA1, Succeeded()); auto SSA2 = jitlink::SimpleSegmentAlloc::Create( - *MemMgr, SSP, nullptr, {{MemProt::Read, {1024, Align(1)}}}); + *MemMgr, SSP, Triple("x86_64-apple-darwin"), nullptr, + {{MemProt::Read, {1024, Align(1)}}}); EXPECT_THAT_EXPECTED(SSA2, Succeeded()); auto FA2 = SSA2->finalize(); EXPECT_THAT_EXPECTED(FA2, Succeeded()); @@ -160,7 +164,8 @@ TEST(MapperJITLinkMemoryManagerTest, Coalescing) { EXPECT_THAT_ERROR(std::move(Err3), Succeeded()); auto SSA3 = jitlink::SimpleSegmentAlloc::Create( - *MemMgr, SSP, nullptr, {{MemProt::Read, {2048, Align(1)}}}); + *MemMgr, SSP, Triple("x86_64-apple-darwin"), 
nullptr, + {{MemProt::Read, {2048, Align(1)}}}); EXPECT_THAT_EXPECTED(SSA3, Succeeded()); auto SegInfo3 = SSA3->getSegInfo(MemProt::Read); diff --git a/llvm/unittests/ExecutionEngine/Orc/ObjectLinkingLayerTest.cpp b/llvm/unittests/ExecutionEngine/Orc/ObjectLinkingLayerTest.cpp index 31a48e86539fd..415ad0ee80577 100644 --- a/llvm/unittests/ExecutionEngine/Orc/ObjectLinkingLayerTest.cpp +++ b/llvm/unittests/ExecutionEngine/Orc/ObjectLinkingLayerTest.cpp @@ -46,8 +46,8 @@ class ObjectLinkingLayerTest : public testing::Test { TEST_F(ObjectLinkingLayerTest, AddLinkGraph) { auto G = std::make_unique( - "foo", ES.getSymbolStringPool(), Triple("x86_64-apple-darwin"), 8, - llvm::endianness::little, x86_64::getEdgeKindName); + "foo", ES.getSymbolStringPool(), Triple("x86_64-apple-darwin"), + SubtargetFeatures(), x86_64::getEdgeKindName); auto &Sec1 = G->createSection("__data", MemProt::Read | MemProt::Write); auto &B1 = G->createContentBlock(Sec1, BlockContent, @@ -73,8 +73,8 @@ TEST_F(ObjectLinkingLayerTest, ResourceTracker) { std::vector Trackers; for (unsigned I = 0; I < 64; I++) { auto G = std::make_unique( - "foo", ES.getSymbolStringPool(), Triple("x86_64-apple-darwin"), 8, - llvm::endianness::little, x86_64::getEdgeKindName); + "foo", ES.getSymbolStringPool(), Triple("x86_64-apple-darwin"), + SubtargetFeatures(), x86_64::getEdgeKindName); auto &Sec1 = G->createSection("__data", MemProt::Read | MemProt::Write); auto &B1 = G->createContentBlock(Sec1, BlockContent, @@ -140,8 +140,8 @@ TEST_F(ObjectLinkingLayerTest, ClaimLateDefinedWeakSymbols) { ObjLinkingLayer.addPlugin(std::make_unique()); auto G = std::make_unique( - "foo", ES.getSymbolStringPool(), Triple("x86_64-apple-darwin"), 8, - llvm::endianness::little, getGenericEdgeKindName); + "foo", ES.getSymbolStringPool(), Triple("x86_64-apple-darwin"), + SubtargetFeatures(), getGenericEdgeKindName); auto &DataSec = G->createSection("__data", MemProt::Read | MemProt::Write); auto &DataBlock = G->createContentBlock(DataSec, BlockContent, @@ -193,8 +193,8 @@ TEST_F(ObjectLinkingLayerTest, HandleErrorDuringPostAllocationPass) { ObjLinkingLayer.addPlugin(std::make_unique()); auto G = std::make_unique( - "foo", ES.getSymbolStringPool(), Triple("x86_64-apple-darwin"), 8, - llvm::endianness::little, getGenericEdgeKindName); + "foo", ES.getSymbolStringPool(), Triple("x86_64-apple-darwin"), + SubtargetFeatures(), getGenericEdgeKindName); auto &DataSec = G->createSection("__data", MemProt::Read | MemProt::Write); auto &DataBlock = G->createContentBlock(DataSec, BlockContent, @@ -247,8 +247,8 @@ TEST_F(ObjectLinkingLayerTest, AddAndRemovePlugins) { { auto G1 = std::make_unique( - "G1", ES.getSymbolStringPool(), Triple("x86_64-apple-darwin"), 8, - llvm::endianness::little, x86_64::getEdgeKindName); + "G1", ES.getSymbolStringPool(), Triple("x86_64-apple-darwin"), + SubtargetFeatures(), x86_64::getEdgeKindName); auto &DataSec = G1->createSection("__data", MemProt::Read | MemProt::Write); auto &DataBlock = G1->createContentBlock(DataSec, BlockContent, @@ -265,8 +265,8 @@ TEST_F(ObjectLinkingLayerTest, AddAndRemovePlugins) { { auto G2 = std::make_unique( - "G2", ES.getSymbolStringPool(), Triple("x86_64-apple-darwin"), 8, - llvm::endianness::little, x86_64::getEdgeKindName); + "G2", ES.getSymbolStringPool(), Triple("x86_64-apple-darwin"), + SubtargetFeatures(), x86_64::getEdgeKindName); auto &DataSec = G2->createSection("__data", MemProt::Read | MemProt::Write); auto &DataBlock = G2->createContentBlock(DataSec, BlockContent, @@ -324,8 +324,9 @@ 
TEST(ObjectLinkingLayerSearchGeneratorTest, AbsoluteSymbolsObjectLayer) { auto G = EPCDynamicLibrarySearchGenerator::GetForTargetProcess( ES, {}, [&](JITDylib &JD, SymbolMap Syms) { - auto G = absoluteSymbolsLinkGraph( - ES.getTargetTriple(), ES.getSymbolStringPool(), std::move(Syms)); + auto G = + absoluteSymbolsLinkGraph(Triple("x86_64-apple-darwin"), + ES.getSymbolStringPool(), std::move(Syms)); return ObjLinkingLayer.add(JD, std::move(G)); }); ASSERT_THAT_EXPECTED(G, Succeeded()); From 87d7aebdd43102160d93d8e5859228d860ea072d Mon Sep 17 00:00:00 2001 From: Piotr Fusik Date: Mon, 13 Jan 2025 12:42:39 +0100 Subject: [PATCH 371/408] [RISCV][test] Add more 64-bit tests in zbb-logic-neg-imm.ll --- llvm/test/CodeGen/RISCV/zbb-logic-neg-imm.ll | 79 ++++++++++++++++++-- 1 file changed, 74 insertions(+), 5 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/zbb-logic-neg-imm.ll b/llvm/test/CodeGen/RISCV/zbb-logic-neg-imm.ll index f1e4bd09fcb92..393302c7bb5ab 100644 --- a/llvm/test/CodeGen/RISCV/zbb-logic-neg-imm.ll +++ b/llvm/test/CodeGen/RISCV/zbb-logic-neg-imm.ll @@ -4,9 +4,9 @@ ; RUN: llc -mtriple=riscv64 -mattr=+zbb -verify-machineinstrs < %s \ ; RUN: | FileCheck %s --check-prefixes=CHECK,RV64,NOZBS64 ; RUN: llc -mtriple=riscv32 -mattr=+zbb,+zbs -verify-machineinstrs < %s \ -; RUN: | FileCheck %s --check-prefixes=CHECK,RV32,ZBS +; RUN: | FileCheck %s --check-prefixes=CHECK,RV32,ZBS,ZBS32 ; RUN: llc -mtriple=riscv64 -mattr=+zbb,+zbs -verify-machineinstrs < %s \ -; RUN: | FileCheck %s --check-prefixes=CHECK,RV64,ZBS +; RUN: | FileCheck %s --check-prefixes=CHECK,RV64,ZBS,ZBS64 define i32 @and0xabcdefff(i32 %x) { ; CHECK-LABEL: and0xabcdefff: @@ -301,8 +301,8 @@ define i64 @andimm64(i64 %x) { ret i64 %and } -define i64 @andimm64srli(i64 %x) { -; RV32-LABEL: andimm64srli: +define i64 @orimm64srli(i64 %x) { +; RV32-LABEL: orimm64srli: ; RV32: # %bb.0: ; RV32-NEXT: lui a2, 1040384 ; RV32-NEXT: orn a0, a0, a2 @@ -310,7 +310,7 @@ define i64 @andimm64srli(i64 %x) { ; RV32-NEXT: or a1, a1, a2 ; RV32-NEXT: ret ; -; RV64-LABEL: andimm64srli: +; RV64-LABEL: orimm64srli: ; RV64: # %bb.0: ; RV64-NEXT: lui a1, 983040 ; RV64-NEXT: srli a1, a1, 3 @@ -319,3 +319,72 @@ define i64 @andimm64srli(i64 %x) { %or = or i64 %x, -2305843009180139521 ret i64 %or } + +define i64 @andnofff(i64 %x) { +; RV32-LABEL: andnofff: +; RV32: # %bb.0: +; RV32-NEXT: lui a2, 1044480 +; RV32-NEXT: and a1, a1, a2 +; RV32-NEXT: andi a0, a0, 255 +; RV32-NEXT: ret +; +; RV64-LABEL: andnofff: +; RV64: # %bb.0: +; RV64-NEXT: li a1, -1 +; RV64-NEXT: slli a1, a1, 56 +; RV64-NEXT: addi a1, a1, 255 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: ret + %and = and i64 %x, -72057594037927681 + ret i64 %and +} + +define i64 @ornofff(i64 %x) { +; NOZBS32-LABEL: ornofff: +; NOZBS32: # %bb.0: +; NOZBS32-NEXT: lui a2, 524288 +; NOZBS32-NEXT: or a1, a1, a2 +; NOZBS32-NEXT: ori a0, a0, 2047 +; NOZBS32-NEXT: ret +; +; NOZBS64-LABEL: ornofff: +; NOZBS64: # %bb.0: +; NOZBS64-NEXT: li a1, -1 +; NOZBS64-NEXT: slli a1, a1, 63 +; NOZBS64-NEXT: addi a1, a1, 2047 +; NOZBS64-NEXT: or a0, a0, a1 +; NOZBS64-NEXT: ret +; +; ZBS32-LABEL: ornofff: +; ZBS32: # %bb.0: +; ZBS32-NEXT: ori a0, a0, 2047 +; ZBS32-NEXT: bseti a1, a1, 31 +; ZBS32-NEXT: ret +; +; ZBS64-LABEL: ornofff: +; ZBS64: # %bb.0: +; ZBS64-NEXT: ori a0, a0, 2047 +; ZBS64-NEXT: bseti a0, a0, 63 +; ZBS64-NEXT: ret + %or = or i64 %x, -9223372036854773761 + ret i64 %or +} + +define i64 @xornofff(i64 %x) { +; RV32-LABEL: xornofff: +; RV32: # %bb.0: +; RV32-NEXT: lui a2, 983040 +; RV32-NEXT: xor a1, a1, a2 +; RV32-NEXT: 
xori a0, a0, 255 +; RV32-NEXT: ret +; +; RV64-LABEL: xornofff: +; RV64: # %bb.0: +; RV64-NEXT: li a1, -1 +; RV64-NEXT: slli a1, a1, 60 +; RV64-NEXT: addi a1, a1, 255 +; RV64-NEXT: xor a0, a0, a1 +; RV64-NEXT: ret + %xor = xor i64 %x, -1152921504606846721 + ret i64 %xor +} From 726cfc67b69633119279a6369263491421861b1d Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 13 Jan 2025 23:36:09 -0800 Subject: [PATCH 372/408] [RISCV] Don't convert virtual register Register to MCRegister in isCompressibleInst. (#122843) Calling MCRegisterClass::contains with a Register does an implicit conversion from Register to MCRegister. I think MCRegister is only intended to be used for physical registers. We should protect this implicit conversion by checking for physical registers first. While I was here I removed some unnecessary parentheses from the output. --- .../TableGen/AsmPredicateCombiningRISCV.td | 51 +++++++++++++++---- llvm/utils/TableGen/CompressInstEmitter.cpp | 18 ++++--- 2 files changed, 52 insertions(+), 17 deletions(-) diff --git a/llvm/test/TableGen/AsmPredicateCombiningRISCV.td b/llvm/test/TableGen/AsmPredicateCombiningRISCV.td index 57ed00583db14..d93d3e767735f 100644 --- a/llvm/test/TableGen/AsmPredicateCombiningRISCV.td +++ b/llvm/test/TableGen/AsmPredicateCombiningRISCV.td @@ -60,23 +60,23 @@ def BigInst : RVInst<1, [AsmPred1]>; def SmallInst1 : RVInst16<1, []>; def : CompressPat<(BigInst Regs:$r), (SmallInst1 Regs:$r), [AsmPred1]>; // COMPRESS: if (STI.getFeatureBits()[arch::AsmCond1] && -// COMPRESS-NEXT: (MI.getOperand(0).isReg()) && -// COMPRESS-NEXT: (archMCRegisterClasses[arch::RegsRegClassID].contains(MI.getOperand(0).getReg()))) { +// COMPRESS-NEXT: MI.getOperand(0).isReg() && +// COMPRESS-NEXT: archMCRegisterClasses[arch::RegsRegClassID].contains(MI.getOperand(0).getReg())) { // COMPRESS-NEXT: // SmallInst1 $r def SmallInst2 : RVInst16<2, []>; def : CompressPat<(BigInst Regs:$r), (SmallInst2 Regs:$r), [AsmPred2]>; // COMPRESS: if (STI.getFeatureBits()[arch::AsmCond2a] && // COMPRESS-NEXT: STI.getFeatureBits()[arch::AsmCond2b] && -// COMPRESS-NEXT: (MI.getOperand(0).isReg()) && -// COMPRESS-NEXT: (archMCRegisterClasses[arch::RegsRegClassID].contains(MI.getOperand(0).getReg()))) { +// COMPRESS-NEXT: MI.getOperand(0).isReg() && +// COMPRESS-NEXT: archMCRegisterClasses[arch::RegsRegClassID].contains(MI.getOperand(0).getReg())) { // COMPRESS-NEXT: // SmallInst2 $r def SmallInst3 : RVInst16<2, []>; def : CompressPat<(BigInst Regs:$r), (SmallInst3 Regs:$r), [AsmPred3]>; // COMPRESS: if ((STI.getFeatureBits()[arch::AsmCond3a] || STI.getFeatureBits()[arch::AsmCond3b]) && -// COMPRESS-NEXT: (MI.getOperand(0).isReg()) && -// COMPRESS-NEXT: (archMCRegisterClasses[arch::RegsRegClassID].contains(MI.getOperand(0).getReg()))) { +// COMPRESS-NEXT: MI.getOperand(0).isReg() && +// COMPRESS-NEXT: archMCRegisterClasses[arch::RegsRegClassID].contains(MI.getOperand(0).getReg())) { // COMPRESS-NEXT: // SmallInst3 $r def SmallInst4 : RVInst16<2, []>; @@ -84,16 +84,47 @@ def : CompressPat<(BigInst Regs:$r), (SmallInst4 Regs:$r), [AsmPred1, AsmPred2]> // COMPRESS: if (STI.getFeatureBits()[arch::AsmCond1] && // COMPRESS-NEXT: STI.getFeatureBits()[arch::AsmCond2a] && // COMPRESS-NEXT: STI.getFeatureBits()[arch::AsmCond2b] && -// COMPRESS-NEXT: (MI.getOperand(0).isReg()) && -// COMPRESS-NEXT: (archMCRegisterClasses[arch::RegsRegClassID].contains(MI.getOperand(0).getReg()))) { +// COMPRESS-NEXT: MI.getOperand(0).isReg() && +// COMPRESS-NEXT: 
archMCRegisterClasses[arch::RegsRegClassID].contains(MI.getOperand(0).getReg())) { // COMPRESS-NEXT: // SmallInst4 $r def SmallInst5 : RVInst16<2, []>; def : CompressPat<(BigInst Regs:$r), (SmallInst5 Regs:$r), [AsmPred1, AsmPred3]>; // COMPRESS: if (STI.getFeatureBits()[arch::AsmCond1] && // COMPRESS-NEXT: (STI.getFeatureBits()[arch::AsmCond3a] || STI.getFeatureBits()[arch::AsmCond3b]) && -// COMPRESS-NEXT: (MI.getOperand(0).isReg()) && -// COMPRESS-NEXT: (archMCRegisterClasses[arch::RegsRegClassID].contains(MI.getOperand(0).getReg()))) { +// COMPRESS-NEXT: MI.getOperand(0).isReg() && +// COMPRESS-NEXT: archMCRegisterClasses[arch::RegsRegClassID].contains(MI.getOperand(0).getReg())) { // COMPRESS-NEXT: // SmallInst5 $r // COMPRESS-LABEL: static bool uncompressInst + +// COMPRESS-LABEL: static bool isCompressibleInst + +// COMPRESS: if (STI.getFeatureBits()[arch::AsmCond1] && +// COMPRESS-NEXT: MI.getOperand(0).isReg() && MI.getOperand(0).getReg().isPhysical() && +// COMPRESS-NEXT: archMCRegisterClasses[arch::RegsRegClassID].contains(MI.getOperand(0).getReg())) { +// COMPRESS-NEXT: // SmallInst1 $r + +// COMPRESS: if (STI.getFeatureBits()[arch::AsmCond2a] && +// COMPRESS-NEXT: STI.getFeatureBits()[arch::AsmCond2b] && +// COMPRESS-NEXT: MI.getOperand(0).isReg() && MI.getOperand(0).getReg().isPhysical() && +// COMPRESS-NEXT: archMCRegisterClasses[arch::RegsRegClassID].contains(MI.getOperand(0).getReg())) { +// COMPRESS-NEXT: // SmallInst2 $r + +// COMPRESS: if ((STI.getFeatureBits()[arch::AsmCond3a] || STI.getFeatureBits()[arch::AsmCond3b]) && +// COMPRESS-NEXT: MI.getOperand(0).isReg() && MI.getOperand(0).getReg().isPhysical() && +// COMPRESS-NEXT: archMCRegisterClasses[arch::RegsRegClassID].contains(MI.getOperand(0).getReg())) { +// COMPRESS-NEXT: // SmallInst3 $r + +// COMPRESS: if (STI.getFeatureBits()[arch::AsmCond1] && +// COMPRESS-NEXT: STI.getFeatureBits()[arch::AsmCond2a] && +// COMPRESS-NEXT: STI.getFeatureBits()[arch::AsmCond2b] && +// COMPRESS-NEXT: MI.getOperand(0).isReg() && MI.getOperand(0).getReg().isPhysical() && +// COMPRESS-NEXT: archMCRegisterClasses[arch::RegsRegClassID].contains(MI.getOperand(0).getReg())) { +// COMPRESS-NEXT: // SmallInst4 $r + +// COMPRESS: if (STI.getFeatureBits()[arch::AsmCond1] && +// COMPRESS-NEXT: (STI.getFeatureBits()[arch::AsmCond3a] || STI.getFeatureBits()[arch::AsmCond3b]) && +// COMPRESS-NEXT: MI.getOperand(0).isReg() && MI.getOperand(0).getReg().isPhysical() && +// COMPRESS-NEXT: archMCRegisterClasses[arch::RegsRegClassID].contains(MI.getOperand(0).getReg())) { +// COMPRESS-NEXT: // SmallInst5 $r diff --git a/llvm/utils/TableGen/CompressInstEmitter.cpp b/llvm/utils/TableGen/CompressInstEmitter.cpp index 7ebfe50a86d0f..1def8b1ab2290 100644 --- a/llvm/utils/TableGen/CompressInstEmitter.cpp +++ b/llvm/utils/TableGen/CompressInstEmitter.cpp @@ -773,13 +773,17 @@ void CompressInstEmitter::emitCompressInstEmitter(raw_ostream &OS, // This is a register operand. Check the register class. // Don't check register class if this is a tied operand, it was done // for the operand its tied to. 
- if (DestOperand.getTiedRegister() == -1) - CondStream.indent(6) - << "(MI.getOperand(" << OpIdx << ").isReg()) &&\n" - << " (" << TargetName << "MCRegisterClasses[" << TargetName - << "::" << ClassRec->getName() - << "RegClassID].contains(MI.getOperand(" << OpIdx - << ").getReg())) &&\n"; + if (DestOperand.getTiedRegister() == -1) { + CondStream.indent(6) << "MI.getOperand(" << OpIdx << ").isReg()"; + if (EType == EmitterType::CheckCompress) + CondStream << " && MI.getOperand(" << OpIdx + << ").getReg().isPhysical()"; + CondStream << " &&\n" + << indent(6) << TargetName << "MCRegisterClasses[" + << TargetName << "::" << ClassRec->getName() + << "RegClassID].contains(MI.getOperand(" << OpIdx + << ").getReg()) &&\n"; + } if (CompressOrUncompress) CodeStream.indent(6) From c9bc0fffa74d374bec57a1c1a320ec99b49f4e29 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Tue, 14 Jan 2025 18:38:49 +1100 Subject: [PATCH 373/408] [JITLink] Fix incorrect file name in unit test file comment. --- llvm/unittests/ExecutionEngine/JITLink/MachOLinkGraphTests.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/unittests/ExecutionEngine/JITLink/MachOLinkGraphTests.cpp b/llvm/unittests/ExecutionEngine/JITLink/MachOLinkGraphTests.cpp index 213849c076cfe..173ec14fe260f 100644 --- a/llvm/unittests/ExecutionEngine/JITLink/MachOLinkGraphTests.cpp +++ b/llvm/unittests/ExecutionEngine/JITLink/MachOLinkGraphTests.cpp @@ -1,4 +1,4 @@ -//===------ LinkGraphTests.cpp - Unit tests for core JITLink classes ------===// +//===------ MachOLinkGraphTests.cpp - Unit tests for MachO LinkGraphs -----===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. From 99fc649c5f9a0592f4aaed7945a0ffa79a191c33 Mon Sep 17 00:00:00 2001 From: Ben Shi <2283975856@qq.com> Date: Tue, 14 Jan 2025 16:13:41 +0800 Subject: [PATCH 374/408] [AVR][NFC] Improve format of target description files (#122845) --- llvm/lib/Target/AVR/AVRInstrInfo.td | 415 +++++++--------------------- 1 file changed, 104 insertions(+), 311 deletions(-) diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.td b/llvm/lib/Target/AVR/AVRInstrInfo.td index 5474a42e58848..792aff828c031 100644 --- a/llvm/lib/Target/AVR/AVRInstrInfo.td +++ b/llvm/lib/Target/AVR/AVRInstrInfo.td @@ -1147,28 +1147,16 @@ let canFoldAsLoad = 1, isReMaterializable = 1, mayLoad = 1, Requires<[HasLPM]>; } - def LPMRdZ : FLPMX<0, 0, - (outs GPR8 - : $rd), - (ins ZREG - : $z), - "lpm\t$rd, $z", []>, + def LPMRdZ : FLPMX<0, 0, (outs GPR8:$rd), (ins ZREG:$z), "lpm\t$rd, $z", []>, Requires<[HasLPMX]>; // Load program memory, while postincrementing the Z register. let Defs = [R31R30] in { - def LPMRdZPi : FLPMX<0, 1, - (outs GPR8 - : $rd), - (ins ZREG - : $z), + def LPMRdZPi : FLPMX<0, 1, (outs GPR8:$rd), (ins ZREG:$z), "lpm\t$rd, $z+", []>, Requires<[HasLPMX]>; - def LPMWRdZPi : Pseudo<(outs DREGS - : $dst), - (ins ZREG - : $z), + def LPMWRdZPi : Pseudo<(outs DREGS:$dst), (ins ZREG:$z), "lpmw\t$dst, $z+", []>, Requires<[HasLPMX]>; } @@ -1176,20 +1164,18 @@ let canFoldAsLoad = 1, isReMaterializable = 1, mayLoad = 1, // Extended load program memory operations. 
let mayLoad = 1, hasSideEffects = 0 in { - let Defs = [R0], - Uses = [R31R30] in def ELPM - : F16<0b1001010111011000, (outs), (ins), "elpm", []>, - Requires<[HasELPM]>; + let Defs = [R0], Uses = [R31R30] in + def ELPM : F16<0b1001010111011000, (outs), (ins), "elpm", []>, + Requires<[HasELPM]>; - def ELPMRdZ : FLPMX<1, 0, (outs GPR8:$rd), (ins ZREG:$z), - "elpm\t$rd, $z", []>, + def ELPMRdZ : FLPMX<1, 0, (outs GPR8:$rd), (ins ZREG:$z), "elpm\t$rd, $z", + []>, Requires<[HasELPMX]>; - let Defs = [R31R30] in { - def ELPMRdZPi : FLPMX<1, 1, (outs GPR8:$rd), (ins ZREG:$z), - "elpm\t$rd, $z+", []>, - Requires<[HasELPMX]>; - } + let Defs = [R31R30] in + def ELPMRdZPi : FLPMX<1, 1, (outs GPR8:$rd), (ins ZREG:$z), "elpm\t$rd, $z+", + []>, + Requires<[HasELPMX]>; // These pseudo instructions are combination of the OUT and ELPM instructions. let Defs = [R0] in { @@ -1217,116 +1203,64 @@ let mayLoad = 1, hasSideEffects = 0 in { // Store program memory operations. let Uses = [R1, R0] in { - let Uses = [R31R30, R1, R0] in def SPM - : F16<0b1001010111101000, (outs), (ins), "spm", []>, - Requires<[HasSPM]>; - - let Defs = [R31R30] in def SPMZPi : F16<0b1001010111111000, (outs), - (ins ZREG - : $z), - "spm $z+", []>, - Requires<[HasSPMX]>; + let Uses = [R31R30, R1, R0] in + def SPM : F16<0b1001010111101000, (outs), (ins), "spm", []>, + Requires<[HasSPM]>; + + let Defs = [R31R30] in + def SPMZPi : F16<0b1001010111111000, (outs), (ins ZREG:$z), "spm $z+", []>, + Requires<[HasSPMX]>; } // Read data from IO location operations. let canFoldAsLoad = 1, isReMaterializable = 1 in { - def INRdA : FIORdA<(outs GPR8 - : $rd), - (ins imm_port6 - : $A), - "in\t$rd, $A", [(set i8 - : $rd, (load ioaddr8 - : $A))]>; - - def INWRdA : Pseudo<(outs DREGS - : $dst), - (ins imm_port6 - : $src), - "inw\t$dst, $src", [(set i16 - : $dst, (load ioaddr16 - : $src))]>; + def INRdA : FIORdA<(outs GPR8:$rd), (ins imm_port6:$A), "in\t$rd, $A", + [(set i8:$rd, (load ioaddr8:$A))]>; + + def INWRdA : Pseudo<(outs DREGS:$d), (ins imm_port6:$s), "inw\t$d, $s", + [(set i16:$d, (load ioaddr16:$s))]>; } // Write data to IO location operations. -def OUTARr : FIOARr<(outs), - (ins imm_port6 - : $A, GPR8 - : $rr), - "out\t$A, $rr", [(store i8 - : $rr, ioaddr8 - : $A)]>; - -def OUTWARr : Pseudo<(outs), - (ins imm_port6 - : $dst, DREGS - : $src), - "outw\t$dst, $src", [(store i16 - : $src, ioaddr16 - : $dst)]>; +def OUTARr : FIOARr<(outs), (ins imm_port6:$A, GPR8:$rr), "out\t$A, $rr", + [(store i8:$rr, ioaddr8:$A)]>; + +def OUTWARr : Pseudo<(outs), (ins imm_port6:$dst, DREGS:$src), + "outw\t$dst, $src", [(store i16:$src, ioaddr16:$dst)]>; // Stack push/pop operations. let Defs = [SP], Uses = [SP], hasSideEffects = 0 in { // Stack push operations. let mayStore = 1 in { - def PUSHRr : FRd<0b1001, 0b0011111, (outs), - (ins GPR8 - : $rd), - "push\t$rd", []>, + def PUSHRr : FRd<0b1001, 0b0011111, (outs), (ins GPR8:$rd), "push\t$rd", + []>, Requires<[HasSRAM]>; - def PUSHWRr : Pseudo<(outs), - (ins DREGS - : $reg), - "pushw\t$reg", []>, + def PUSHWRr : Pseudo<(outs), (ins DREGS:$reg), "pushw\t$reg", []>, Requires<[HasSRAM]>; } // Stack pop operations. let mayLoad = 1 in { - def POPRd : FRd<0b1001, 0b0001111, - (outs GPR8 - : $rd), - (ins), "pop\t$rd", []>, + def POPRd : FRd<0b1001, 0b0001111, (outs GPR8:$rd), (ins), "pop\t$rd", []>, Requires<[HasSRAM]>; - def POPWRd : Pseudo<(outs DREGS - : $reg), - (ins), "popw\t$reg", []>, + def POPWRd : Pseudo<(outs DREGS:$reg), (ins), "popw\t$reg", []>, Requires<[HasSRAM]>; } } // Read-Write-Modify (RMW) instructions. 
-def XCHZRd : FZRd<0b100, - (outs GPR8 - : $rd), - (ins ZREG - : $z), - "xch\t$z, $rd", []>, +def XCHZRd : FZRd<0b100, (outs GPR8:$rd), (ins ZREG:$z), "xch\t$z, $rd", []>, Requires<[SupportsRMW]>; -def LASZRd : FZRd<0b101, - (outs GPR8 - : $rd), - (ins ZREG - : $z), - "las\t$z, $rd", []>, +def LASZRd : FZRd<0b101, (outs GPR8:$rd), (ins ZREG:$z), "las\t$z, $rd", []>, Requires<[SupportsRMW]>; -def LACZRd : FZRd<0b110, - (outs GPR8 - : $rd), - (ins ZREG - : $z), - "lac\t$z, $rd", []>, +def LACZRd : FZRd<0b110, (outs GPR8:$rd), (ins ZREG:$z), "lac\t$z, $rd", []>, Requires<[SupportsRMW]>; -def LATZRd : FZRd<0b111, - (outs GPR8 - : $rd), - (ins ZREG - : $z), - "lat\t$z, $rd", []>, +def LATZRd : FZRd<0b111, (outs GPR8:$rd), (ins ZREG:$z), "lat\t$z, $rd", []>, Requires<[SupportsRMW]>; //===----------------------------------------------------------------------===// @@ -1337,240 +1271,106 @@ def LATZRd : FZRd<0b111, let Constraints = "$src = $rd", Defs = [SREG] in { // 8-bit LSL is an alias of ADD Rd, Rd - def LSLWRd : Pseudo<(outs DREGS - : $rd), - (ins DREGS - : $src), - "lslw\t$rd", - [(set i16 - : $rd, (AVRlsl i16 - : $src))]>; + def LSLWRd : Pseudo<(outs DREGS:$rd), (ins DREGS:$src), "lslw\t$rd", + [(set i16:$rd, (AVRlsl i16:$src))]>; def LSLWHiRd : Pseudo<(outs DREGS:$rd), (ins DREGS:$src), "lslwhi\t$rd", [(set i16:$rd, (AVRlslhi i16:$src))]>; - def LSLWNRd : Pseudo<(outs DLDREGS - : $rd), - (ins DREGS - : $src, imm16 - : $bits), - "lslwn\t$rd, $bits", [ - (set i16 - : $rd, (AVRlslwn i16 - : $src, imm - : $bits)) - ]>; - - def LSLBNRd : Pseudo<(outs LD8 - : $rd), - (ins GPR8 - : $src, imm_ldi8 - : $bits), - "lslbn\t$rd, $bits", [ - (set i8 - : $rd, (AVRlslbn i8 - : $src, imm - : $bits)) - ]>; - - def LSRRd - : FRd<0b1001, 0b0100110, - (outs GPR8 - : $rd), - (ins GPR8 - : $src), - "lsr\t$rd", [(set i8 - : $rd, (AVRlsr i8 - : $src))]>; - - def LSRWRd : Pseudo<(outs DREGS - : $rd), - (ins DREGS - : $src), - "lsrw\t$rd", - [(set i16 - : $rd, (AVRlsr i16 - : $src))]>; + def LSLWNRd : Pseudo<(outs DLDREGS:$rd), (ins DREGS:$src, imm16:$bits), + "lslwn\t$rd, $bits", + [(set i16:$rd, (AVRlslwn i16:$src, imm:$bits))]>; + + def LSLBNRd : Pseudo<(outs LD8:$rd), (ins GPR8:$src, imm_ldi8:$bits), + "lslbn\t$rd, $bits", + [(set i8:$rd, (AVRlslbn i8:$src, imm:$bits))]>; + + def LSRRd : FRd<0b1001, 0b0100110, (outs GPR8:$rd), (ins GPR8:$src), "lsr\t$rd", + [(set i8:$rd, (AVRlsr i8:$src))]>; + + def LSRWRd : Pseudo<(outs DREGS:$rd), (ins DREGS:$src), "lsrw\t$rd", + [(set i16:$rd, (AVRlsr i16:$src))]>; def LSRWLoRd : Pseudo<(outs DREGS:$rd), (ins DREGS:$src), "lsrwlo\t$rd", [(set i16:$rd, (AVRlsrlo i16:$src))]>; - def LSRWNRd : Pseudo<(outs DLDREGS - : $rd), - (ins DREGS - : $src, imm16 - : $bits), - "lsrwn\t$rd, $bits", [ - (set i16 - : $rd, (AVRlsrwn i16 - : $src, imm - : $bits)) - ]>; - - def LSRBNRd : Pseudo<(outs LD8 - : $rd), - (ins GPR8 - : $src, imm_ldi8 - : $bits), - "lsrbn\t$rd, $bits", [ - (set i8 - : $rd, (AVRlsrbn i8 - : $src, imm - : $bits)) - ]>; - - def ASRRd - : FRd<0b1001, 0b0100101, - (outs GPR8 - : $rd), - (ins GPR8 - : $src), - "asr\t$rd", [(set i8 - : $rd, (AVRasr i8 - : $src))]>; - - def ASRWNRd : Pseudo<(outs DREGS - : $rd), - (ins DREGS - : $src, imm16 - : $bits), - "asrwn\t$rd, $bits", [ - (set i16 - : $rd, (AVRasrwn i16 - : $src, imm - : $bits)) - ]>; - - def ASRBNRd : Pseudo<(outs LD8 - : $rd), - (ins GPR8 - : $src, imm_ldi8 - : $bits), - "asrbn\t$rd, $bits", [ - (set i8 - : $rd, (AVRasrbn i8 - : $src, imm - : $bits)) - ]>; - - def ASRWRd : Pseudo<(outs DREGS - : $rd), - (ins DREGS - : 
$src), - "asrw\t$rd", - [(set i16 - : $rd, (AVRasr i16 - : $src))]>; + def LSRWNRd : Pseudo<(outs DLDREGS:$rd), (ins DREGS:$src, imm16:$bits), + "lsrwn\t$rd, $bits", + [(set i16:$rd, (AVRlsrwn i16:$src, imm:$bits))]>; + + def LSRBNRd : Pseudo<(outs LD8:$rd), (ins GPR8:$src, imm_ldi8:$bits), + "lsrbn\t$rd, $bits", + [(set i8:$rd, (AVRlsrbn i8:$src, imm:$bits))]>; + + def ASRRd : FRd<0b1001, 0b0100101, (outs GPR8:$rd), (ins GPR8:$src), "asr\t$rd", + [(set i8:$rd, (AVRasr i8:$src))]>; + + def ASRWNRd : Pseudo<(outs DREGS:$rd), (ins DREGS:$src, imm16:$bits), + "asrwn\t$rd, $bits", + [(set i16:$rd, (AVRasrwn i16:$src, imm:$bits))]>; + + def ASRBNRd : Pseudo<(outs LD8:$rd), (ins GPR8:$src, imm_ldi8:$bits), + "asrbn\t$rd, $bits", + [(set i8:$rd, (AVRasrbn i8:$src, imm:$bits))]>; + + def ASRWRd : Pseudo<(outs DREGS:$rd), (ins DREGS:$src), "asrw\t$rd", + [(set i16:$rd, (AVRasr i16:$src))]>; def ASRWLoRd : Pseudo<(outs DREGS:$rd), (ins DREGS:$src), "asrwlo\t$rd", [(set i16:$rd, (AVRasrlo i16:$src))]>; + let Uses = [R1] in - def ROLBRdR1 : Pseudo<(outs GPR8:$rd), - (ins GPR8:$src), - "rolb\t$rd", + def ROLBRdR1 : Pseudo<(outs GPR8:$rd), (ins GPR8:$src), "rolb\t$rd", [(set i8:$rd, (AVRrol i8:$src))]>, Requires<[HasNonTinyEncoding]>; let Uses = [R17] in - def ROLBRdR17 : Pseudo<(outs GPR8:$rd), - (ins GPR8:$src), - "rolb\t$rd", + def ROLBRdR17 : Pseudo<(outs GPR8:$rd), (ins GPR8:$src), "rolb\t$rd", [(set i8:$rd, (AVRrol i8:$src))]>, Requires<[HasTinyEncoding]>; - def RORBRd : Pseudo<(outs GPR8 - : $rd), - (ins GPR8 - : $src), - "rorb\t$rd", - [(set i8 - : $rd, (AVRror i8 - : $src))]>; + def RORBRd : Pseudo<(outs GPR8:$rd), (ins GPR8:$src), "rorb\t$rd", + [(set i8:$rd, (AVRror i8:$src))]>; // Bit rotate operations. let Uses = [SREG] in { + def ROLWRd : Pseudo<(outs DREGS:$rd), (ins DREGS:$src), "rolw\t$rd", + [(set i16:$rd, (AVRrol i16:$src))]>; - def ROLWRd - : Pseudo<(outs DREGS - : $rd), - (ins DREGS - : $src), - "rolw\t$rd", - [(set i16 - : $rd, (AVRrol i16 - : $src))]>; - - def RORRd : FRd<0b1001, 0b0100111, - (outs GPR8 - : $rd), - (ins GPR8 - : $src), + def RORRd : FRd<0b1001, 0b0100111, (outs GPR8:$rd), (ins GPR8:$src), "ror\t$rd", []>; - def RORWRd - : Pseudo<(outs DREGS - : $rd), - (ins DREGS - : $src), - "rorw\t$rd", - [(set i16 - : $rd, (AVRror i16 - : $src))]>; + def RORWRd : Pseudo<(outs DREGS:$rd), (ins DREGS:$src), "rorw\t$rd", + [(set i16:$rd, (AVRror i16:$src))]>; } } // SWAP Rd // Swaps the high and low nibbles in a register. -let Constraints = - "$src = $rd" in def SWAPRd : FRd<0b1001, 0b0100010, - (outs GPR8 - : $rd), - (ins GPR8 - : $src), - "swap\t$rd", [(set i8 - : $rd, (AVRSwap i8 - : $src))]>; +let Constraints = "$src = $rd" in +def SWAPRd : FRd<0b1001, 0b0100010, (outs GPR8:$rd), (ins GPR8:$src), + "swap\t$rd", [(set i8:$rd, (AVRSwap i8:$src))]>; // IO register bit set/clear operations. //: TODO: add patterns when popcount(imm)==2 to be expanded with 2 sbi/cbi // instead of in+ori+out which requires one more instr. 
-def SBIAb : FIOBIT<0b10, (outs), - (ins imm_port5 - : $addr, i8imm - : $b), - "sbi\t$addr, $b", [(store(or(i8(load lowioaddr8 - : $addr)), - iobitpos8 - : $b), - lowioaddr8 - : $addr)]>; - -def CBIAb : FIOBIT<0b00, (outs), - (ins imm_port5 - : $addr, i8imm - : $b), - "cbi\t$addr, $b", [(store(and(i8(load lowioaddr8 - : $addr)), - iobitposn8 - : $b), - lowioaddr8 - : $addr)]>; +def SBIAb : FIOBIT<0b10, (outs), (ins imm_port5:$addr, i8imm:$b), + "sbi\t$addr, $b", + [(store(or(i8(load lowioaddr8:$addr)), iobitpos8:$b), + lowioaddr8:$addr)]>; + +def CBIAb : FIOBIT<0b00, (outs), (ins imm_port5:$addr, i8imm :$b), + "cbi\t$addr, $b", + [(store(and(i8(load lowioaddr8:$addr)), iobitposn8:$b), + lowioaddr8:$addr)]>; // Status register bit load/store operations. -let Defs = [SREG] in def BST : FRdB<0b01, (outs), - (ins GPR8 - : $rd, i8imm - : $b), - "bst\t$rd, $b", []>; - -let Constraints = "$src = $rd", - Uses = [SREG] in def BLD : FRdB<0b00, - (outs GPR8 - : $rd), - (ins GPR8 - : $src, i8imm - : $b), - "bld\t$rd, $b", []>; +let Defs = [SREG] in +def BST : FRdB<0b01, (outs), (ins GPR8:$rd, i8imm:$b), "bst\t$rd, $b", []>; + +let Constraints = "$src = $rd", Uses = [SREG] in +def BLD : FRdB<0b00, (outs GPR8:$rd), (ins GPR8:$src, i8imm:$b), "bld\t$rd, $b", + []>; def CBR : InstAlias<"cbr\t$rd, $k", (ANDIRdK LD8 : $rd, imm_com8 : $k), 0>; @@ -1595,15 +1395,8 @@ def ROL : InstAlias<"rol\t$rd", (ADCRdRr GPR8 : $rd, GPR8 : $rd)>; def : InstAlias<"ser\t$rd", (LDIRdK LD8 : $rd, 0xff), 0>; let hasSideEffects=1 in { - let Defs = [SREG] in def BSETs : FS<0, - (outs), - (ins i8imm:$s), - "bset\t$s", []>; - - let Defs = [SREG] in def BCLRs : FS<1, - (outs), - (ins i8imm:$s), - "bclr\t$s", []>; + let Defs = [SREG] in def BSETs : FS<0, (outs), (ins i8imm:$s), "bset\t$s", []>; + let Defs = [SREG] in def BCLRs : FS<1, (outs), (ins i8imm:$s), "bclr\t$s", []>; } // Set/clear aliases for the carry (C) status flag (bit 0). From cfe5a0847a42d7e67942d70f938d2d664a95990c Mon Sep 17 00:00:00 2001 From: Piotr Fusik Date: Tue, 14 Jan 2025 09:15:14 +0100 Subject: [PATCH 375/408] [RISCV] Enable Zbb ANDN/ORN/XNOR for more 64-bit constants (#122698) This extends PR #120221 to 64-bit constants that don't match the 12-low-bits-set pattern. --- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 7 ++++--- llvm/test/CodeGen/RISCV/zbb-logic-neg-imm.ll | 21 +++++++++----------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 0070fd4520429..9ccf95970e5b5 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -3216,17 +3216,18 @@ bool RISCVDAGToDAGISel::selectSHXADD_UWOp(SDValue N, unsigned ShAmt, bool RISCVDAGToDAGISel::selectInvLogicImm(SDValue N, SDValue &Val) { if (!isa(N)) return false; - int64_t Imm = cast(N)->getSExtValue(); - if ((Imm & 0xfff) != 0xfff || Imm == -1) + + // For 32-bit signed constants, we can only substitute LUI+ADDI with LUI. + if (isInt<32>(Imm) && ((Imm & 0xfff) != 0xfff || Imm == -1)) return false; + // Abandon this transform if the constant is needed elsewhere. for (const SDNode *U : N->users()) { if (!ISD::isBitwiseLogicOp(U->getOpcode())) return false; } - // For 32-bit signed constants we already know it's a win: LUI+ADDI vs LUI. // For 64-bit constants, the instruction sequences get complex, // so we select inverted only if it's cheaper. 
if (!isInt<32>(Imm)) { diff --git a/llvm/test/CodeGen/RISCV/zbb-logic-neg-imm.ll b/llvm/test/CodeGen/RISCV/zbb-logic-neg-imm.ll index 393302c7bb5ab..d953d34e2d7b9 100644 --- a/llvm/test/CodeGen/RISCV/zbb-logic-neg-imm.ll +++ b/llvm/test/CodeGen/RISCV/zbb-logic-neg-imm.ll @@ -330,10 +330,9 @@ define i64 @andnofff(i64 %x) { ; ; RV64-LABEL: andnofff: ; RV64: # %bb.0: -; RV64-NEXT: li a1, -1 -; RV64-NEXT: slli a1, a1, 56 -; RV64-NEXT: addi a1, a1, 255 -; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: lui a1, 1048560 +; RV64-NEXT: srli a1, a1, 8 +; RV64-NEXT: andn a0, a0, a1 ; RV64-NEXT: ret %and = and i64 %x, -72057594037927681 ret i64 %and @@ -349,10 +348,9 @@ define i64 @ornofff(i64 %x) { ; ; NOZBS64-LABEL: ornofff: ; NOZBS64: # %bb.0: -; NOZBS64-NEXT: li a1, -1 -; NOZBS64-NEXT: slli a1, a1, 63 -; NOZBS64-NEXT: addi a1, a1, 2047 -; NOZBS64-NEXT: or a0, a0, a1 +; NOZBS64-NEXT: lui a1, 1048575 +; NOZBS64-NEXT: srli a1, a1, 1 +; NOZBS64-NEXT: orn a0, a0, a1 ; NOZBS64-NEXT: ret ; ; ZBS32-LABEL: ornofff: @@ -380,10 +378,9 @@ define i64 @xornofff(i64 %x) { ; ; RV64-LABEL: xornofff: ; RV64: # %bb.0: -; RV64-NEXT: li a1, -1 -; RV64-NEXT: slli a1, a1, 60 -; RV64-NEXT: addi a1, a1, 255 -; RV64-NEXT: xor a0, a0, a1 +; RV64-NEXT: lui a1, 1048575 +; RV64-NEXT: srli a1, a1, 4 +; RV64-NEXT: xnor a0, a0, a1 ; RV64-NEXT: ret %xor = xor i64 %x, -1152921504606846721 ret i64 %xor From 1a935d7a17519e9b75d12c3caf9a54a3405a0af3 Mon Sep 17 00:00:00 2001 From: Guy David <49722543+guy-david@users.noreply.github.com> Date: Tue, 14 Jan 2025 10:18:31 +0200 Subject: [PATCH 376/408] [llvm] Mark scavenging spill-slots as *spilled* stack objects. (#122673) This seems like an oversight when copying code from other backends. --- llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 2 +- llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 2 +- llvm/lib/Target/ARC/ARCFrameLowering.cpp | 4 ++-- llvm/lib/Target/ARM/ARMFrameLowering.cpp | 2 +- llvm/lib/Target/CSKY/CSKYFrameLowering.cpp | 2 +- llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp | 4 ++-- llvm/lib/Target/Mips/MipsSEFrameLowering.cpp | 8 ++++---- llvm/lib/Target/PowerPC/PPCFrameLowering.cpp | 5 ++--- llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 4 ++-- llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp | 8 ++++---- llvm/lib/Target/XCore/XCoreFrameLowering.cpp | 4 ++-- llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp | 2 +- llvm/test/CodeGen/PowerPC/alloca-crspill.ll | 4 ++-- llvm/test/CodeGen/RISCV/rvv/addi-rvv-stack-object.mir | 2 +- 14 files changed, 26 insertions(+), 27 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 64dfb1e39485f..206e410047db5 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -3902,7 +3902,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, const TargetRegisterClass &RC = AArch64::GPR64RegClass; unsigned Size = TRI->getSpillSize(RC); Align Alignment = TRI->getSpillAlign(RC); - int FI = MFI.CreateStackObject(Size, Alignment, false); + int FI = MFI.CreateSpillStackObject(Size, Alignment); RS->addScavengingFrameIndex(FI); LLVM_DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI << " as the emergency spill slot.\n"); diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index dcd4f0f65e8ef..2e2523312840a 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -1438,7 +1438,7 @@ void 
SIFrameLowering::processFunctionBeforeFrameFinalized( // second VGPR emergency frame index. if (HaveSGPRToVMemSpill && allocateScavengingFrameIndexesNearIncomingSP(MF)) { - RS->addScavengingFrameIndex(MFI.CreateStackObject(4, Align(4), false)); + RS->addScavengingFrameIndex(MFI.CreateSpillStackObject(4, Align(4))); } } } diff --git a/llvm/lib/Target/ARC/ARCFrameLowering.cpp b/llvm/lib/Target/ARC/ARCFrameLowering.cpp index 472f1c13f362e..95054eac8c4fa 100644 --- a/llvm/lib/Target/ARC/ARCFrameLowering.cpp +++ b/llvm/lib/Target/ARC/ARCFrameLowering.cpp @@ -438,8 +438,8 @@ void ARCFrameLowering::processFunctionBeforeFrameFinalized( LLVM_DEBUG(dbgs() << "Current stack size: " << MFI.getStackSize() << "\n"); const TargetRegisterClass *RC = &ARC::GPR32RegClass; if (MFI.hasStackObjects()) { - int RegScavFI = MFI.CreateStackObject(RegInfo->getSpillSize(*RC), - RegInfo->getSpillAlign(*RC), false); + int RegScavFI = MFI.CreateSpillStackObject(RegInfo->getSpillSize(*RC), + RegInfo->getSpillAlign(*RC)); RS->addScavengingFrameIndex(RegScavFI); LLVM_DEBUG(dbgs() << "Created scavenging index RegScavFI=" << RegScavFI << "\n"); diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp index f495aa701e875..8b94bfac9b0c0 100644 --- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -2925,7 +2925,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, unsigned Size = TRI->getSpillSize(RC); Align Alignment = TRI->getSpillAlign(RC); RS->addScavengingFrameIndex( - MFI.CreateStackObject(Size, Alignment, false)); + MFI.CreateSpillStackObject(Size, Alignment)); --RegsNeeded; } } diff --git a/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp b/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp index c023b5a0de5ad..f29caafe39526 100644 --- a/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp +++ b/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp @@ -441,7 +441,7 @@ void CSKYFrameLowering::determineCalleeSaves(MachineFunction &MF, unsigned size = TRI->getSpillSize(*RC); Align align = TRI->getSpillAlign(*RC); - RS->addScavengingFrameIndex(MFI.CreateStackObject(size, align, false)); + RS->addScavengingFrameIndex(MFI.CreateSpillStackObject(size, align)); } unsigned FnSize = EstimateFunctionSizeInBytes(MF, *TII); diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp index 1a787c63c6241..4b7b5483f5b81 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp @@ -167,8 +167,8 @@ void LoongArchFrameLowering::processFunctionBeforeFrameFinalized( // Create emergency spill slots. for (unsigned i = 0; i < ScavSlotsNum; ++i) { - int FI = MFI.CreateStackObject(RI->getSpillSize(RC), RI->getSpillAlign(RC), - false); + int FI = + MFI.CreateSpillStackObject(RI->getSpillSize(RC), RI->getSpillAlign(RC)); RS->addScavengingFrameIndex(FI); if (IsLargeFunction && LAFI->getBranchRelaxationSpillFrameIndex() == -1) LAFI->setBranchRelaxationSpillFrameIndex(FI); diff --git a/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp b/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp index 165e389158119..04f3a974f5408 100644 --- a/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp +++ b/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp @@ -892,8 +892,8 @@ void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF, // it should be 32-bit. const TargetRegisterClass &RC = STI.isGP64bit() ? 
Mips::GPR64RegClass : Mips::GPR32RegClass; - int FI = MF.getFrameInfo().CreateStackObject(TRI->getSpillSize(RC), - TRI->getSpillAlign(RC), false); + int FI = MF.getFrameInfo().CreateSpillStackObject(TRI->getSpillSize(RC), + TRI->getSpillAlign(RC)); RS->addScavengingFrameIndex(FI); } @@ -908,8 +908,8 @@ void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF, const TargetRegisterClass &RC = ABI.ArePtrs64bit() ? Mips::GPR64RegClass : Mips::GPR32RegClass; - int FI = MF.getFrameInfo().CreateStackObject(TRI->getSpillSize(RC), - TRI->getSpillAlign(RC), false); + int FI = MF.getFrameInfo().CreateSpillStackObject(TRI->getSpillSize(RC), + TRI->getSpillAlign(RC)); RS->addScavengingFrameIndex(FI); } diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index b118976b4731c..39ebd7f8d0df2 100644 --- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -2307,7 +2307,7 @@ PPCFrameLowering::addScavengingSpillSlot(MachineFunction &MF, const TargetRegisterInfo &TRI = *Subtarget.getRegisterInfo(); unsigned Size = TRI.getSpillSize(RC); Align Alignment = TRI.getSpillAlign(RC); - RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Alignment, false)); + RS->addScavengingFrameIndex(MFI.CreateSpillStackObject(Size, Alignment)); // Might we have over-aligned allocas? bool HasAlVars = @@ -2315,8 +2315,7 @@ PPCFrameLowering::addScavengingSpillSlot(MachineFunction &MF, // These kinds of spills might need two registers. if (spillsCR(MF) || HasAlVars) - RS->addScavengingFrameIndex( - MFI.CreateStackObject(Size, Alignment, false)); + RS->addScavengingFrameIndex(MFI.CreateSpillStackObject(Size, Alignment)); } } diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index ed3ec31028067..911cea27a48ac 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -1595,8 +1595,8 @@ void RISCVFrameLowering::processFunctionBeforeFrameFinalized( ScavSlotsNum = std::max(ScavSlotsNum, getScavSlotsNumForRVV(MF)); for (unsigned I = 0; I < ScavSlotsNum; I++) { - int FI = MFI.CreateStackObject(RegInfo->getSpillSize(*RC), - RegInfo->getSpillAlign(*RC), false); + int FI = MFI.CreateSpillStackObject(RegInfo->getSpillSize(*RC), + RegInfo->getSpillAlign(*RC)); RS->addScavengingFrameIndex(FI); if (IsLargeFunction && RVFI->getBranchRelaxationScratchFrameIndex() == -1) diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp index ec3ebcbd86d79..54a950ee213f4 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -457,9 +457,9 @@ void SystemZELFFrameLowering::processFunctionBeforeFrameFinalized( // Create 2 for the case where both addresses in an MVC are // out of range. RS->addScavengingFrameIndex( - MFFrame.CreateStackObject(getPointerSize(), Align(8), false)); + MFFrame.CreateSpillStackObject(getPointerSize(), Align(8))); RS->addScavengingFrameIndex( - MFFrame.CreateStackObject(getPointerSize(), Align(8), false)); + MFFrame.CreateSpillStackObject(getPointerSize(), Align(8))); } // If R6 is used as an argument register it is still callee saved. If it in @@ -1491,8 +1491,8 @@ void SystemZXPLINKFrameLowering::processFunctionBeforeFrameFinalized( if (!isUInt<12>(MaxReach)) { // We may need register scavenging slots if some parts of the frame // are outside the reach of an unsigned 12-bit displacement. 
- RS->addScavengingFrameIndex(MFFrame.CreateStackObject(8, Align(8), false)); - RS->addScavengingFrameIndex(MFFrame.CreateStackObject(8, Align(8), false)); + RS->addScavengingFrameIndex(MFFrame.CreateSpillStackObject(8, Align(8))); + RS->addScavengingFrameIndex(MFFrame.CreateSpillStackObject(8, Align(8))); } } diff --git a/llvm/lib/Target/XCore/XCoreFrameLowering.cpp b/llvm/lib/Target/XCore/XCoreFrameLowering.cpp index 2bf91dc37999d..01896bf98cc1f 100644 --- a/llvm/lib/Target/XCore/XCoreFrameLowering.cpp +++ b/llvm/lib/Target/XCore/XCoreFrameLowering.cpp @@ -576,7 +576,7 @@ processFunctionBeforeFrameFinalized(MachineFunction &MF, unsigned Size = TRI.getSpillSize(RC); Align Alignment = TRI.getSpillAlign(RC); if (XFI->isLargeFrame(MF) || hasFP(MF)) - RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Alignment, false)); + RS->addScavengingFrameIndex(MFI.CreateSpillStackObject(Size, Alignment)); if (XFI->isLargeFrame(MF) && !hasFP(MF)) - RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Alignment, false)); + RS->addScavengingFrameIndex(MFI.CreateSpillStackObject(Size, Alignment)); } diff --git a/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp b/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp index 005ba10b81313..f8ed5dd5e757a 100644 --- a/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp +++ b/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp @@ -276,7 +276,7 @@ void XtensaFrameLowering::processFunctionBeforeFrameFinalized( unsigned Size = TRI->getSpillSize(RC); Align Alignment = TRI->getSpillAlign(RC); for (unsigned I = 0; I < ScavSlotsNum; I++) { - int FI = MFI.CreateStackObject(Size, Alignment, false); + int FI = MFI.CreateSpillStackObject(Size, Alignment); RS->addScavengingFrameIndex(FI); if (IsLargeFunction && diff --git a/llvm/test/CodeGen/PowerPC/alloca-crspill.ll b/llvm/test/CodeGen/PowerPC/alloca-crspill.ll index da6a206ff9401..cbcfd9a6a0dab 100644 --- a/llvm/test/CodeGen/PowerPC/alloca-crspill.ll +++ b/llvm/test/CodeGen/PowerPC/alloca-crspill.ll @@ -53,7 +53,7 @@ declare signext i32 @do_something(ptr) ; CHECK64-NEXT: stack-id: default, callee-saved-register: '', callee-saved-restored: true, ; CHECK64-NEXT: local-offset: 0, debug-info-variable: '', debug-info-expression: '', ; CHECK64-NEXT: debug-info-location: '' } -; CHECK64-NEXT: - { id: 1, name: '', type: default, offset: -16, size: 8, alignment: 8, +; CHECK64-NEXT: - { id: 1, name: '', type: spill-slot, offset: -16, size: 8, alignment: 8, ; CHECK64-NEXT: stack-id: default, callee-saved-register: '', callee-saved-restored: true, ; CHECK64-NEXT: debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } @@ -72,7 +72,7 @@ declare signext i32 @do_something(ptr) ; CHECK32-NEXT: stack-id: default, callee-saved-register: '', callee-saved-restored: true, ; CHECK32-NEXT: local-offset: 0, debug-info-variable: '', debug-info-expression: '', ; CHECK32-NEXT: debug-info-location: '' } -; CHECK32-NEXT: - { id: 1, name: '', type: default, offset: -8, size: 4, alignment: 4, +; CHECK32-NEXT: - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, ; CHECK32-NEXT: stack-id: default, callee-saved-register: '', callee-saved-restored: true, ; CHECK32-NEXT: debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } diff --git a/llvm/test/CodeGen/RISCV/rvv/addi-rvv-stack-object.mir b/llvm/test/CodeGen/RISCV/rvv/addi-rvv-stack-object.mir index 080a89e41f0d5..812a29b9f0f4e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/addi-rvv-stack-object.mir +++ b/llvm/test/CodeGen/RISCV/rvv/addi-rvv-stack-object.mir 
@@ -40,7 +40,7 @@ stack: - { id: 0, name: local0, type: default, offset: 0, size: 16, alignment: 16, stack-id: scalable-vector, callee-saved-register: '', callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } -# CHECK: - { id: 2, name: '', type: default, offset: -16, size: 8, alignment: 8, +# CHECK: - { id: 2, name: '', type: spill-slot, offset: -16, size: 8, alignment: 8, # CHECK: stack-id: default, callee-saved-register: '', callee-saved-restored: true, # CHECK: debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } callSites: [] From 95f7c2f88dc5b2fd851c3181b03300538151133e Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Tue, 14 Jan 2025 09:26:02 +0100 Subject: [PATCH 377/408] [lldb] Reduce duplication in two of DWARFDIE context functions (#122712) This doesn't make much of a difference now, but it makes it easier to add -gsimple-template-names support to these functions (the idea is to add an argument to say whether you want the name as spelled in the debug info, or the one embellished with template arguments -- we have use cases for both). --- .../Plugins/SymbolFile/DWARF/DWARFDIE.cpp | 67 +++++++++---------- 1 file changed, 33 insertions(+), 34 deletions(-) diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp index 4b864b549f8ce..6857878b354a0 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp @@ -376,6 +376,36 @@ lldb_private::Type *DWARFDIE::ResolveTypeUID(const DWARFDIE &die) const { return nullptr; } +static CompilerContext GetContextEntry(DWARFDIE die) { + auto ctx = [die](CompilerContextKind kind) { + return CompilerContext(kind, ConstString(die.GetName())); + }; + + switch (die.Tag()) { + case DW_TAG_module: + return ctx(CompilerContextKind::Module); + case DW_TAG_namespace: + return ctx(CompilerContextKind::Namespace); + case DW_TAG_class_type: + case DW_TAG_structure_type: + return ctx(CompilerContextKind::ClassOrStruct); + case DW_TAG_union_type: + return ctx(CompilerContextKind::Union); + case DW_TAG_enumeration_type: + return ctx(CompilerContextKind::Enum); + case DW_TAG_subprogram: + return ctx(CompilerContextKind::Function); + case DW_TAG_variable: + return ctx(CompilerContextKind::Variable); + case DW_TAG_typedef: + return ctx(CompilerContextKind::Typedef); + case DW_TAG_base_type: + return ctx(CompilerContextKind::Builtin); + default: + llvm_unreachable("Check tag type in the caller!"); + } +} + static void GetDeclContextImpl(DWARFDIE die, llvm::SmallSet &seen, std::vector &context) { @@ -388,34 +418,17 @@ static void GetDeclContextImpl(DWARFDIE die, } // Add this DIE's contribution at the end of the chain. 
- auto push_ctx = [&](CompilerContextKind kind, llvm::StringRef name) { - context.push_back({kind, ConstString(name)}); - }; switch (die.Tag()) { case DW_TAG_module: - push_ctx(CompilerContextKind::Module, die.GetName()); - break; case DW_TAG_namespace: - push_ctx(CompilerContextKind::Namespace, die.GetName()); - break; case DW_TAG_class_type: case DW_TAG_structure_type: - push_ctx(CompilerContextKind::ClassOrStruct, die.GetName()); - break; case DW_TAG_union_type: - push_ctx(CompilerContextKind::Union, die.GetName()); - break; case DW_TAG_enumeration_type: - push_ctx(CompilerContextKind::Enum, die.GetName()); - break; case DW_TAG_subprogram: - push_ctx(CompilerContextKind::Function, die.GetName()); - break; case DW_TAG_variable: - push_ctx(CompilerContextKind::Variable, die.GetName()); - break; case DW_TAG_typedef: - push_ctx(CompilerContextKind::Typedef, die.GetName()); + context.push_back(GetContextEntry(die)); break; default: break; @@ -439,32 +452,18 @@ static void GetTypeLookupContextImpl(DWARFDIE die, // Stop if we hit a cycle. while (die && seen.insert(die.GetID()).second) { // Add this DIE's contribution at the end of the chain. - auto push_ctx = [&](CompilerContextKind kind, llvm::StringRef name) { - context.push_back({kind, ConstString(name)}); - }; switch (die.Tag()) { case DW_TAG_namespace: - push_ctx(CompilerContextKind::Namespace, die.GetName()); - break; case DW_TAG_class_type: case DW_TAG_structure_type: - push_ctx(CompilerContextKind::ClassOrStruct, die.GetName()); - break; case DW_TAG_union_type: - push_ctx(CompilerContextKind::Union, die.GetName()); - break; case DW_TAG_enumeration_type: - push_ctx(CompilerContextKind::Enum, die.GetName()); - break; case DW_TAG_variable: - push_ctx(CompilerContextKind::Variable, die.GetName()); - break; case DW_TAG_typedef: - push_ctx(CompilerContextKind::Typedef, die.GetName()); - break; case DW_TAG_base_type: - push_ctx(CompilerContextKind::Builtin, die.GetName()); + context.push_back(GetContextEntry(die)); break; + // If any of the tags below appear in the parent chain, stop the decl // context and return. Prior to these being in here, if a type existed in a // namespace "a" like "a::my_struct", but we also have a function in that From 53c7fe50d869386459226aeac5ec72ee918737c9 Mon Sep 17 00:00:00 2001 From: lonely eagle <2020382038@qq.com> Date: Tue, 14 Jan 2025 16:27:44 +0800 Subject: [PATCH 378/408] [mlir][nvgpu]add dim check test to nvgpu.mma op. (#122864) Add shape checks for matrixA, matrixB, and matrixC to the verifier of nvgpu.mma.sync.
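For contrast with the invalid cases exercised below, a well-formed op passes the new checks because all three matrix operands are two-dimensional vectors. A hedged sketch (the function name is hypothetical; the operand shapes follow the 16x8x16 f16 fragment layout used by the existing in-tree tests):

  func.func @valid_mma(%arg0: vector<4x2xf16>, %arg1: vector<2x2xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> {
    // matrixA, matrixB, and matrixC are all 2-D vectors, so the new
    // dimensionality checks in the verifier are satisfied.
    %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
    return %d : vector<2x2xf16>
  }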
--- mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp | 12 +++++++++++ mlir/test/Dialect/NVGPU/invalid.mlir | 24 ++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp index de9bbcbace692..a027350e8a5f7 100644 --- a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp +++ b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp @@ -203,6 +203,18 @@ static LogicalResult verifyMmaSyncOp(Operation *op, // Basic verification // + if (aShape.size() != 2) { + return op->emitError() << "matrixA must be 2 dimensional vector"; + } + + if (bShape.size() != 2) { + return op->emitError() << "matrixB must be 2 dimensional vector"; + } + + if (cShape.size() != 2) { + return op->emitError() << "matrixC must be 2 dimensional vector"; + } + auto [m, n, k] = mmaShape; // verify warp-wide size for vector a diff --git a/mlir/test/Dialect/NVGPU/invalid.mlir b/mlir/test/Dialect/NVGPU/invalid.mlir index f7db1140794e5..b5bfbe9ff27b7 100644 --- a/mlir/test/Dialect/NVGPU/invalid.mlir +++ b/mlir/test/Dialect/NVGPU/invalid.mlir @@ -354,3 +354,27 @@ func.func @rcp_unsupported_ftz(%in : vector<16xf32>) { // expected-error @+1 {{'nvgpu.rcp' op has a limitation. #nvgpu or non-ftz is not supported yet.}} %out = nvgpu.rcp %in {rounding = approx} : vector<16xf32> } + +// ----- + +func.func @check_matrixA_dim(%arg0: vector<16xf16>, %arg1: vector<2x2xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> { + // expected-error @+1 {{matrixA must be 2 dimensional vector}} + %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<16xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16> + return %d : vector<2x2xf16> +} + +// ----- + +func.func @check_matrixB_dim(%arg0: vector<4x4xf16>, %arg1: vector<4xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> { + // expected-error @+1 {{matrixB must be 2 dimensional vector}} + %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<4x4xf16>, vector<4xf16>, vector<2x2xf16>) -> vector<2x2xf16> + return %d : vector<2x2xf16> +} + +// ----- + +func.func @check_matrixC_dim(%arg0: vector<4x4xf16>, %arg1: vector<2x2xf16>, %arg2: vector<4xf16>) -> vector<2x2xf16> { + // expected-error @+1 {{matrixC must be 2 dimensional vector}} + %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<4x4xf16>, vector<2x2xf16>, vector<4xf16>) -> vector<2x2xf16> + return %d : vector<2x2xf16> +} From e4e85e04c33bbb9ab298ab18d56e2d6de89f80c2 Mon Sep 17 00:00:00 2001 From: Jonathan Thackray Date: Tue, 14 Jan 2025 08:34:39 +0000 Subject: [PATCH 379/408] [NFC][AArch64] Add relnote saying SVE2.1 and SME2.1 now fully implemented by ACLE (#122705) --- clang/docs/ReleaseNotes.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 9eeb872aa57d7..794943b24a003 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -1068,6 +1068,9 @@ X86 Support Arm and AArch64 Support ^^^^^^^^^^^^^^^^^^^^^^^ +- Implementation of SVE2.1 and SME2.1 in accordance with the Arm C Language + Extensions (ACLE) is now available. + - In the ARM Target, the frame pointer (FP) of a leaf function can be retained by using the ``-fno-omit-frame-pointer`` option. 
If you want to eliminate the FP in leaf functions after enabling ``-fno-omit-frame-pointer``, you can do so by adding From 19c516c8d5716c3ab7ceb5c01705c9dc9a7e6c0a Mon Sep 17 00:00:00 2001 From: Jonathan Thackray Date: Tue, 14 Jan 2025 08:35:36 +0000 Subject: [PATCH 380/408] [AArch64] Add DC CIGDPAPA and DC CIPAPA instructions (#122718) Add `DC CIGDPAPA` and `DC CIPAPA` instructions, for the RME extension, which was added as part of Armv9.1-A, but these instructions were missed. --- llvm/lib/Target/AArch64/AArch64SystemOperands.td | 5 +++++ llvm/test/MC/AArch64/armv9.1a-rme.s | 5 +++++ llvm/test/MC/Disassembler/AArch64/armv9a-rme.txt | 7 +++++++ 3 files changed, 17 insertions(+) diff --git a/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/llvm/lib/Target/AArch64/AArch64SystemOperands.td index df5db8fa514a1..8f6c593d3e681 100644 --- a/llvm/lib/Target/AArch64/AArch64SystemOperands.td +++ b/llvm/lib/Target/AArch64/AArch64SystemOperands.td @@ -246,6 +246,11 @@ def : DC<"CIPAE", 0b100, 0b0111, 0b1110, 0b000>; def : DC<"CIGDPAE", 0b100, 0b0111, 0b1110, 0b111>; } +let Requires = [{ {AArch64::FeatureRME} }] in { +def : DC<"CIGDPAPA", 0b110, 0b0111, 0b1110, 0b101>; +def : DC<"CIPAPA", 0b110, 0b0111, 0b1110, 0b001>; +} + let Requires = [{ {AArch64::FeatureOCCMO} }] in { // Outer cacheable CMO (FEAT_OCCMO) def : DC<"CIVAOC", 0b011, 0b0111, 0b1111, 0b000>; diff --git a/llvm/test/MC/AArch64/armv9.1a-rme.s b/llvm/test/MC/AArch64/armv9.1a-rme.s index 80b1d7d57a463..5f455fbb569c2 100644 --- a/llvm/test/MC/AArch64/armv9.1a-rme.s +++ b/llvm/test/MC/AArch64/armv9.1a-rme.s @@ -68,3 +68,8 @@ sys #6, c8, c7, #4 // CHECK-NO-RME: sys #6, c8, c4, #7 // CHECK-NO-RME: sys #6, c8, c1, #4 // CHECK-NO-RME: sys #6, c8, c7, #4 + +dc cigdpapa, x0 +dc cipapa, x0 +// CHECK: dc cigdpapa, x0 // encoding: [0xa0,0x7e,0x0e,0xd5] +// CHECK: dc cipapa, x0 // encoding: [0x20,0x7e,0x0e,0xd5] diff --git a/llvm/test/MC/Disassembler/AArch64/armv9a-rme.txt b/llvm/test/MC/Disassembler/AArch64/armv9a-rme.txt index 25812004be2c0..00572be6f91a0 100644 --- a/llvm/test/MC/Disassembler/AArch64/armv9a-rme.txt +++ b/llvm/test/MC/Disassembler/AArch64/armv9a-rme.txt @@ -23,3 +23,10 @@ # CHECK-NO-RME: sys #6, c8, c4, #7 # CHECK-NO-RME: sys #6, c8, c1, #4 # CHECK-NO-RME: sys #6, c8, c7, #4 + +[0xa0,0x7e,0x0e,0xd5] +[0x20,0x7e,0x0e,0xd5] +# CHECK: dc cigdpapa, x0 +# CHECK: dc cipapa, x0 +# CHECK-NO-RME: sys #6, c7, c14, #5, x0 +# CHECK-NO-RME: sys #6, c7, c14, #1, x0 From 42595bdaefb6b066896c20b69ab66ff2a7fe8477 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Tue, 14 Jan 2025 19:41:38 +1100 Subject: [PATCH 381/408] [JITLink] Teach aarch64 GOT & PLT table managers to discover existing entries. aarch64::GOTTableManager and aarch64::PLTTableManager will now look for existing GOT and PLT sections and re-use existing entries if they're present. This will be used for an upcoming MachO patch to enable compact unwind support. 
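A condensed sketch of the behaviour this enables, mirroring the new unit test added below (it uses only the JITLink aarch64 API shown in this patch and is illustrative, not an additional test):

  // Building the tables twice over the same graph is now idempotent: the
  // second pair of managers finds the existing "$__GOT"/"$__STUBS" sections
  // and registers their entries instead of creating duplicates.
  LinkGraph G("g", std::make_shared<orc::SymbolStringPool>(),
              Triple("arm64-apple-darwin"), SubtargetFeatures(),
              aarch64::getEdgeKindName);
  auto &Ext = G.addExternalSymbol("ext", 0, false);
  {
    aarch64::GOTTableManager GOT(G);
    aarch64::PLTTableManager PLT(G, GOT);
    PLT.getEntryForTarget(G, Ext); // Creates one PLT stub and one GOT entry.
  }
  {
    aarch64::GOTTableManager GOT(G);      // Re-discovers the existing GOT.
    aarch64::PLTTableManager PLT(G, GOT); // Re-discovers the existing stubs.
    PLT.getEntryForTarget(G, Ext);        // Returns the existing stub.
  }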
--- .../llvm/ExecutionEngine/JITLink/aarch64.h | 16 +++- .../ExecutionEngine/JITLink/ELF_aarch64.cpp | 4 +- .../ExecutionEngine/JITLink/MachO_arm64.cpp | 4 +- llvm/lib/ExecutionEngine/JITLink/aarch64.cpp | 20 +++++ .../ExecutionEngine/JITLink/AArch64Tests.cpp | 90 +++++++++++++++++++ .../ExecutionEngine/JITLink/CMakeLists.txt | 1 + 6 files changed, 129 insertions(+), 6 deletions(-) create mode 100644 llvm/unittests/ExecutionEngine/JITLink/AArch64Tests.cpp diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/aarch64.h b/llvm/include/llvm/ExecutionEngine/JITLink/aarch64.h index 62221caa71c9e..e5e8ef1bab18b 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/aarch64.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/aarch64.h @@ -786,6 +786,11 @@ class GOTTableManager : public TableManager<GOTTableManager> { public: static StringRef getSectionName() { return "$__GOT"; } + GOTTableManager(LinkGraph &G) { + if ((GOTSection = G.findSectionByName(getSectionName()))) + registerExistingEntries(); + } + bool visitEdge(LinkGraph &G, Block *B, Edge &E) { Edge::Kind KindToSet = Edge::Invalid; const char *BlockWorkingMem = B->getContent().data(); @@ -848,16 +853,21 @@ class GOTTableManager : public TableManager<GOTTableManager> { return *GOTSection; } + void registerExistingEntries(); + Section *GOTSection = nullptr; }; /// Procedure Linkage Table Builder. class PLTTableManager : public TableManager<PLTTableManager> { public: - PLTTableManager(GOTTableManager &GOT) : GOT(GOT) {} - static StringRef getSectionName() { return "$__STUBS"; } + PLTTableManager(LinkGraph &G, GOTTableManager &GOT) : GOT(GOT) { + if ((StubsSection = G.findSectionByName(getSectionName()))) + registerExistingEntries(); + } + bool visitEdge(LinkGraph &G, Block *B, Edge &E) { if (E.getKind() == aarch64::Branch26PCRel && !E.getTarget().isDefined()) { DEBUG_WITH_TYPE("jitlink", { @@ -884,6 +894,8 @@ class PLTTableManager : public TableManager<PLTTableManager> { return *StubsSection; } + void registerExistingEntries(); + GOTTableManager &GOT; Section *StubsSection = nullptr; }; diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp index 260a493f0ffd2..b617fe222df00 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp @@ -659,8 +659,8 @@ const uint8_t TLSDescTableManager_ELF_aarch64::TLSDescEntryContent[16] = { Error buildTables_ELF_aarch64(LinkGraph &G) { LLVM_DEBUG(dbgs() << "Visiting edges in graph:\n"); - aarch64::GOTTableManager GOT; - aarch64::PLTTableManager PLT(GOT); + aarch64::GOTTableManager GOT(G); + aarch64::PLTTableManager PLT(G, GOT); TLSInfoTableManager_ELF_aarch64 TLSInfo; TLSDescTableManager_ELF_aarch64 TLSDesc(TLSInfo); visitExistingEdges(G, GOT, PLT, TLSDesc, TLSInfo); diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp index 8b43dc97e49a8..29061fff9c2ae 100644 --- a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp @@ -558,8 +558,8 @@ namespace jitlink { Error buildTables_MachO_arm64(LinkGraph &G) { LLVM_DEBUG(dbgs() << "Visiting edges in graph:\n"); - aarch64::GOTTableManager GOT; - aarch64::PLTTableManager PLT(GOT); + aarch64::GOTTableManager GOT(G); + aarch64::PLTTableManager PLT(G, GOT); visitExistingEdges(G, GOT, PLT); return Error::success(); } diff --git a/llvm/lib/ExecutionEngine/JITLink/aarch64.cpp b/llvm/lib/ExecutionEngine/JITLink/aarch64.cpp index 968ed217d8a96..e2364ad786a42 100644 --- a/llvm/lib/ExecutionEngine/JITLink/aarch64.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/aarch64.cpp @@ -202,6 +202,26 @@ static Error writeStoreRegSeq(AppendFtor &Append, unsigned DstLocReg, return Append(Instr); } +void GOTTableManager::registerExistingEntries() { + for (auto *EntrySym : GOTSection->symbols()) { + assert(EntrySym->getBlock().edges_size() == 1 && + "GOT block edge count != 1"); + registerPreExistingEntry(EntrySym->getBlock().edges().begin()->getTarget(), + *EntrySym); + } +} + +void PLTTableManager::registerExistingEntries() { + for (auto *EntrySym : StubsSection->symbols()) { + assert(EntrySym->getBlock().edges_size() == 2 && + "PLT block edge count != 2"); + auto &GOTSym = EntrySym->getBlock().edges().begin()->getTarget(); + assert(GOTSym.getBlock().edges_size() == 1 && "GOT block edge count != 1"); + registerPreExistingEntry(GOTSym.getBlock().edges().begin()->getTarget(), + *EntrySym); + } +} + const char *getPointerSigningFunctionSectionName() { return "$__ptrauth_sign"; } /// Creates a pointer signing function section, block, and symbol to reserve diff --git a/llvm/unittests/ExecutionEngine/JITLink/AArch64Tests.cpp b/llvm/unittests/ExecutionEngine/JITLink/AArch64Tests.cpp new file mode 100644 index 0000000000000..34918ead5b49b --- /dev/null +++ b/llvm/unittests/ExecutionEngine/JITLink/AArch64Tests.cpp @@ -0,0 +1,90 @@ +//===------- AArch64Tests.cpp - Unit tests for the AArch64 backend --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include <llvm/ExecutionEngine/JITLink/aarch64.h> +#include <llvm/ExecutionEngine/Orc/SymbolStringPool.h> + +#include "gtest/gtest.h" + +using namespace llvm; +using namespace llvm::jitlink; +using namespace llvm::jitlink::aarch64; + +TEST(AArch64, EmptyLinkGraph) { + LinkGraph G("foo", std::make_shared<orc::SymbolStringPool>(), + Triple("arm64-apple-darwin"), SubtargetFeatures(), + getEdgeKindName); + EXPECT_EQ(G.getName(), "foo"); + EXPECT_EQ(G.getTargetTriple().str(), "arm64-apple-darwin"); + EXPECT_EQ(G.getPointerSize(), 8U); + EXPECT_EQ(G.getEndianness(), llvm::endianness::little); + EXPECT_TRUE(G.external_symbols().empty()); + EXPECT_TRUE(G.absolute_symbols().empty()); + EXPECT_TRUE(G.defined_symbols().empty()); + EXPECT_TRUE(G.blocks().empty()); +} + +TEST(AArch64, GOTAndStubs) { + LinkGraph G("foo", std::make_shared<orc::SymbolStringPool>(), + Triple("arm64-apple-darwin"), SubtargetFeatures(), + getEdgeKindName); + + auto &External = G.addExternalSymbol("external", 0, false); + + // First table accesses. We expect the graph to be empty: + EXPECT_EQ(G.findSectionByName(GOTTableManager::getSectionName()), nullptr); + EXPECT_EQ(G.findSectionByName(PLTTableManager::getSectionName()), nullptr); + + { + // Create first GOT and PLT table managers and request a PLT stub. This + // should force creation of both a PLT stub and GOT entry. + GOTTableManager GOT(G); + PLTTableManager PLT(G, GOT); + + PLT.getEntryForTarget(G, External); + } + + auto *GOTSec = G.findSectionByName(GOTTableManager::getSectionName()); + EXPECT_NE(GOTSec, nullptr); + if (GOTSec) { + // Expect one entry in the GOT now. + EXPECT_EQ(GOTSec->symbols_size(), 1U); + EXPECT_EQ(GOTSec->blocks_size(), 1U); + } + + auto *PLTSec = G.findSectionByName(PLTTableManager::getSectionName()); + EXPECT_NE(PLTSec, nullptr); + if (PLTSec) { + // Expect one entry in the PLT.
+ EXPECT_EQ(PLTSec->symbols_size(), 1U); + EXPECT_EQ(PLTSec->blocks_size(), 1U); + } + + { + // Create second GOT and PLT table managers and request a PLT stub. This + // should force creation of both a PLT stub and GOT entry. + GOTTableManager GOT(G); + PLTTableManager PLT(G, GOT); + + PLT.getEntryForTarget(G, External); + } + + EXPECT_EQ(G.findSectionByName(GOTTableManager::getSectionName()), GOTSec); + if (GOTSec) { + // Expect the same one entry in the GOT. + EXPECT_EQ(GOTSec->symbols_size(), 1U); + EXPECT_EQ(GOTSec->blocks_size(), 1U); + } + + EXPECT_EQ(G.findSectionByName(PLTTableManager::getSectionName()), PLTSec); + if (PLTSec) { + // Expect the same one entry in the GOT. + EXPECT_EQ(PLTSec->symbols_size(), 1U); + EXPECT_EQ(PLTSec->blocks_size(), 1U); + } +} diff --git a/llvm/unittests/ExecutionEngine/JITLink/CMakeLists.txt b/llvm/unittests/ExecutionEngine/JITLink/CMakeLists.txt index d1c7b799880a3..bbf6b1bf1e0ed 100644 --- a/llvm/unittests/ExecutionEngine/JITLink/CMakeLists.txt +++ b/llvm/unittests/ExecutionEngine/JITLink/CMakeLists.txt @@ -10,6 +10,7 @@ set(LLVM_LINK_COMPONENTS add_llvm_unittest(JITLinkTests AArch32Tests.cpp AArch32ErrorTests.cpp + AArch64Tests.cpp EHFrameSupportTests.cpp JITLinkTestUtils.cpp LinkGraphTests.cpp From 40fa7f5e8b315159d45aa280c771af5998bdc75e Mon Sep 17 00:00:00 2001 From: Piotr Sobczak Date: Tue, 14 Jan 2025 10:00:40 +0100 Subject: [PATCH 382/408] [AMDGPU] Fix computed kill mask (#122736) Replace S_XOR with S_ANDN2 when computing the kill mask in demote/kill lowering. This has the effect of AND'ing demote/kill condition with exec which is needed for proper live mask update. The S_XOR is inadequate because it may return true for lane with exec=0. This patch fixes an image corruption in game. I think the issue went unnoticed because demote/kill condition is often naturally dependent on exec, so AND'ing with exec is usually not required. --- llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 2 +- .../AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll | 16 ++++++++-------- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll | 4 ++-- .../CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll | 16 ++++++++-------- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll | 4 ++-- llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll | 4 ++-- llvm/test/CodeGen/AMDGPU/skip-if-dead.ll | 8 ++++---- llvm/test/CodeGen/AMDGPU/wave32.ll | 8 ++++---- llvm/test/CodeGen/AMDGPU/wqm.ll | 4 ++-- 9 files changed, 33 insertions(+), 33 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 9fbb847da2af1..263f6497b9a7a 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -960,7 +960,7 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB, // so exec mask needs to be factored in. 
TmpReg = MRI->createVirtualRegister(TRI->getBoolRC()); ComputeKilledMaskMI = - BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec); + BuildMI(MBB, MI, DL, TII->get(AndN2Opc), TmpReg).addReg(Exec).add(Op); MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg) .addReg(LiveMaskReg) .addReg(TmpReg); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll index 0f60f40bd337b..e79177c5df5eb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll @@ -78,7 +78,7 @@ define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) { ; SI: ; %bb.0: ; %.entry ; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 ; SI-NEXT: s_mov_b64 s[2:3], exec -; SI-NEXT: s_xor_b64 s[0:1], s[0:1], exec +; SI-NEXT: s_andn2_b64 s[0:1], exec, s[0:1] ; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] ; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 ; SI-NEXT: s_cbranch_scc0 .LBB1_2 @@ -96,7 +96,7 @@ define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) { ; GFX9: ; %bb.0: ; %.entry ; GFX9-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 ; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 ; GFX9-NEXT: s_cbranch_scc0 .LBB1_2 @@ -115,7 +115,7 @@ define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) { ; GFX10-32-NEXT: v_cmp_le_f32_e64 s0, 0, v1 ; GFX10-32-NEXT: s_mov_b32 s1, exec_lo ; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 -; GFX10-32-NEXT: s_xor_b32 s0, s0, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s0, exec_lo, s0 ; GFX10-32-NEXT: s_andn2_b32 s1, s1, s0 ; GFX10-32-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX10-32-NEXT: ; %bb.1: ; %.entry @@ -133,7 +133,7 @@ define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) { ; GFX10-64-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 ; GFX10-64-NEXT: s_mov_b64 s[2:3], exec ; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 -; GFX10-64-NEXT: s_xor_b64 s[0:1], s[0:1], exec +; GFX10-64-NEXT: s_andn2_b64 s[0:1], exec, s[0:1] ; GFX10-64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] ; GFX10-64-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX10-64-NEXT: ; %bb.1: ; %.entry @@ -556,7 +556,7 @@ define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32 ; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 -; SI-NEXT: s_xor_b64 s[14:15], vcc, exec +; SI-NEXT: s_andn2_b64 s[14:15], exec, vcc ; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] ; SI-NEXT: s_cbranch_scc0 .LBB5_2 ; SI-NEXT: ; %bb.1: ; %.entry @@ -580,7 +580,7 @@ define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32 ; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 -; GFX9-NEXT: s_xor_b64 s[14:15], vcc, exec +; GFX9-NEXT: s_andn2_b64 s[14:15], exec, vcc ; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] ; GFX9-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX9-NEXT: ; %bb.1: ; %.entry @@ -604,7 +604,7 @@ define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32 ; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-32-NEXT: s_waitcnt vmcnt(0) ; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 -; GFX10-32-NEXT: s_xor_b32 s13, vcc_lo, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s13, 
exec_lo, vcc_lo ; GFX10-32-NEXT: s_andn2_b32 s12, s12, s13 ; GFX10-32-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX10-32-NEXT: ; %bb.1: ; %.entry @@ -628,7 +628,7 @@ define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32 ; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-64-NEXT: s_waitcnt vmcnt(0) ; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 -; GFX10-64-NEXT: s_xor_b64 s[14:15], vcc, exec +; GFX10-64-NEXT: s_andn2_b64 s[14:15], exec, vcc ; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] ; GFX10-64-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX10-64-NEXT: ; %bb.1: ; %.entry diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll index e0dacd47e51c8..94aad397284ff 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll @@ -55,7 +55,7 @@ define amdgpu_gs void @false() { ; GCN: v_cmp_lt_i32 ; GCN: v_cmp_lt_i32 ; GCN: s_or_b64 s[0:1] -; GCN: s_xor_b64 s[0:1], s[0:1], exec +; GCN: s_and{{n2|_not1}}_b64 s[0:1], exec, s[0:1] ; GCN: s_and{{n2|_not1}}_b64 s[2:3], s[2:3], s[0:1] ; GCN: s_and_b64 exec, exec, s[2:3] define amdgpu_gs void @and(i32 %a, i32 %b, i32 %c, i32 %d) { @@ -238,7 +238,7 @@ define amdgpu_ps void @fcmp_x2(float %a) #0 { ; GCN: v_cmp_neq_f32_e32 vcc, 0 ; GCN-DAG: s_wqm_b64 s[2:3], vcc ; GCN-DAG: s_mov_b64 s[0:1], exec -; GCN: s_xor_b64 s[2:3], s[2:3], exec +; GCN: s_and{{n2|_not1}}_b64 s[2:3], exec, s[2:3] ; GCN: s_and{{n2|_not1}}_b64 s[0:1], s[0:1], s[2:3] ; GCN: s_and_b64 exec, exec, s[0:1] define amdgpu_ps float @wqm(float %a) { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll index 004a720b9ab48..13ce979a954c2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll @@ -78,7 +78,7 @@ define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) { ; SI: ; %bb.0: ; %.entry ; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 ; SI-NEXT: s_mov_b64 s[2:3], exec -; SI-NEXT: s_xor_b64 s[0:1], s[0:1], exec +; SI-NEXT: s_andn2_b64 s[0:1], exec, s[0:1] ; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] ; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 ; SI-NEXT: s_cbranch_scc0 .LBB1_2 @@ -96,7 +96,7 @@ define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) { ; GFX9: ; %bb.0: ; %.entry ; GFX9-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 ; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 ; GFX9-NEXT: s_cbranch_scc0 .LBB1_2 @@ -115,7 +115,7 @@ define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) { ; GFX10-32-NEXT: v_cmp_le_f32_e64 s0, 0, v1 ; GFX10-32-NEXT: s_mov_b32 s1, exec_lo ; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 -; GFX10-32-NEXT: s_xor_b32 s0, s0, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s0, exec_lo, s0 ; GFX10-32-NEXT: s_andn2_b32 s1, s1, s0 ; GFX10-32-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX10-32-NEXT: ; %bb.1: ; %.entry @@ -133,7 +133,7 @@ define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) { ; GFX10-64-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 ; GFX10-64-NEXT: s_mov_b64 s[2:3], exec ; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 -; GFX10-64-NEXT: s_xor_b64 s[0:1], s[0:1], exec +; GFX10-64-NEXT: s_andn2_b64 s[0:1], exec, s[0:1] ; GFX10-64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] ; GFX10-64-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX10-64-NEXT: ; %bb.1: ; 
%.entry @@ -557,7 +557,7 @@ define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32 ; SI-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 -; SI-NEXT: s_xor_b64 s[14:15], vcc, exec +; SI-NEXT: s_andn2_b64 s[14:15], exec, vcc ; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] ; SI-NEXT: s_cbranch_scc0 .LBB5_2 ; SI-NEXT: ; %bb.1: ; %.entry @@ -581,7 +581,7 @@ define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32 ; GFX9-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 -; GFX9-NEXT: s_xor_b64 s[14:15], vcc, exec +; GFX9-NEXT: s_andn2_b64 s[14:15], exec, vcc ; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] ; GFX9-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX9-NEXT: ; %bb.1: ; %.entry @@ -605,7 +605,7 @@ define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32 ; GFX10-32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-32-NEXT: s_waitcnt vmcnt(0) ; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 -; GFX10-32-NEXT: s_xor_b32 s13, vcc_lo, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s13, exec_lo, vcc_lo ; GFX10-32-NEXT: s_andn2_b32 s12, s12, s13 ; GFX10-32-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX10-32-NEXT: ; %bb.1: ; %.entry @@ -629,7 +629,7 @@ define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32 ; GFX10-64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-64-NEXT: s_waitcnt vmcnt(0) ; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 -; GFX10-64-NEXT: s_xor_b64 s[14:15], vcc, exec +; GFX10-64-NEXT: s_andn2_b64 s[14:15], exec, vcc ; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] ; GFX10-64-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX10-64-NEXT: ; %bb.1: ; %.entry diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll index a4da690b2af1f..34c6149fe92f9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll @@ -43,12 +43,12 @@ main_body: ;CHECK: v_cmp_eq_u32_e32 [[CMP:[^,]+]], v0, v1 ;WAVE64: s_wqm_b64 [[WQM:[^,]+]], [[CMP]] -;WAVE64: s_xor_b64 [[KILL:[^,]+]], [[WQM]], exec +;WAVE64: s_andn2_b64 [[KILL:[^,]+]], exec, [[WQM]] ;WAVE64: s_andn2_b64 [[MASK:[^,]+]], [[EXEC:[^,]+]], [[KILL]] ;WAVE64: s_and_b64 exec, exec, [[MASK]] ;WAVE32: s_wqm_b32 [[WQM:[^,]+]], [[CMP]] -;WAVE32: s_xor_b32 [[KILL:[^,]+]], [[WQM]], exec +;WAVE32: s_and{{n2|_not1}}_b32 [[KILL:[^,]+]], exec_lo, [[WQM]] ;WAVE32: s_and{{n2|_not1}}_b32 [[MASK:[^,]+]], [[EXEC:[^,]+]], [[KILL]] ;WAVE32: s_and_b32 exec_lo, exec_lo, [[MASK]] diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll index cef959f45437d..53698ff71de62 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll @@ -17,7 +17,7 @@ define amdgpu_ps float @uniform_kill(float %a, i32 %b, float %c) { ; SI-NEXT: ; %bb.2: ; %endif1 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_wqm_b64 s[4:5], s[2:3] -; SI-NEXT: s_xor_b64 s[4:5], s[4:5], exec +; SI-NEXT: s_andn2_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; SI-NEXT: s_cbranch_scc0 .LBB0_6 ; SI-NEXT: ; %bb.3: ; %endif1 @@ -59,7 +59,7 @@ define amdgpu_ps float @uniform_kill(float %a, i32 %b, float %c) { ; FLAT-NEXT: ; %bb.2: ; %endif1 ; FLAT-NEXT: s_or_b64 exec, exec, s[4:5] ; FLAT-NEXT: 
s_wqm_b64 s[4:5], s[2:3] -; FLAT-NEXT: s_xor_b64 s[4:5], s[4:5], exec +; FLAT-NEXT: s_andn2_b64 s[4:5], exec, s[4:5] ; FLAT-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; FLAT-NEXT: s_cbranch_scc0 .LBB0_6 ; FLAT-NEXT: ; %bb.3: ; %endif1 diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll index fdcb6c941e16e..715ea57d473f5 100644 --- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -761,7 +761,7 @@ define amdgpu_ps float @test_kill_control_flow_return(i32 inreg %arg) #0 { ; SI-NEXT: s_cmp_eq_u32 s0, 1 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; SI-NEXT: s_mov_b64 s[2:3], exec -; SI-NEXT: s_xor_b64 s[4:5], s[4:5], exec +; SI-NEXT: s_andn2_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] ; SI-NEXT: s_cbranch_scc0 .LBB9_4 ; SI-NEXT: ; %bb.1: ; %entry @@ -798,7 +798,7 @@ define amdgpu_ps float @test_kill_control_flow_return(i32 inreg %arg) #0 { ; GFX10-WAVE64-NEXT: s_cmp_eq_u32 s0, 1 ; GFX10-WAVE64-NEXT: s_mov_b64 s[2:3], exec ; GFX10-WAVE64-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX10-WAVE64-NEXT: s_xor_b64 s[4:5], s[4:5], exec +; GFX10-WAVE64-NEXT: s_andn2_b64 s[4:5], exec, s[4:5] ; GFX10-WAVE64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] ; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB9_4 ; GFX10-WAVE64-NEXT: ; %bb.1: ; %entry @@ -835,7 +835,7 @@ define amdgpu_ps float @test_kill_control_flow_return(i32 inreg %arg) #0 { ; GFX10-WAVE32-NEXT: s_cmp_eq_u32 s0, 1 ; GFX10-WAVE32-NEXT: s_mov_b32 s1, exec_lo ; GFX10-WAVE32-NEXT: s_cselect_b32 s2, -1, 0 -; GFX10-WAVE32-NEXT: s_xor_b32 s2, s2, exec_lo +; GFX10-WAVE32-NEXT: s_andn2_b32 s2, exec_lo, s2 ; GFX10-WAVE32-NEXT: s_andn2_b32 s1, s1, s2 ; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB9_4 ; GFX10-WAVE32-NEXT: ; %bb.1: ; %entry @@ -873,7 +873,7 @@ define amdgpu_ps float @test_kill_control_flow_return(i32 inreg %arg) #0 { ; GFX11-NEXT: s_mov_b64 s[2:3], exec ; GFX11-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b64 s[4:5], s[4:5], exec +; GFX11-NEXT: s_and_not1_b64 s[4:5], exec, s[4:5] ; GFX11-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] ; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 ; GFX11-NEXT: ; %bb.1: ; %entry diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 9b13ce6ab69cc..4e17be1ebb312 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -1767,7 +1767,7 @@ define amdgpu_gs void @test_kill_i1_terminator_i1(i32 %a, i32 %b, i32 %c, i32 %d ; GFX1032-NEXT: v_cmp_lt_i32_e64 s0, v2, v3 ; GFX1032-NEXT: s_mov_b32 s1, exec_lo ; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_xor_b32 s0, s0, exec_lo +; GFX1032-NEXT: s_andn2_b32 s0, exec_lo, s0 ; GFX1032-NEXT: s_andn2_b32 s1, s1, s0 ; GFX1032-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 @@ -1783,7 +1783,7 @@ define amdgpu_gs void @test_kill_i1_terminator_i1(i32 %a, i32 %b, i32 %c, i32 %d ; GFX1064-NEXT: v_cmp_lt_i32_e64 s[0:1], v2, v3 ; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_xor_b64 s[0:1], s[0:1], exec +; GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] ; GFX1064-NEXT: s_and_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -2256,7 +2256,7 @@ define amdgpu_ps void @test_wqm_vote(float %a) { ; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_wqm_b32 s1, 
vcc_lo -; GFX1032-NEXT: s_xor_b32 s1, s1, exec_lo +; GFX1032-NEXT: s_andn2_b32 s1, exec_lo, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: s_cbranch_scc0 .LBB44_2 ; GFX1032-NEXT: ; %bb.1: @@ -2274,7 +2274,7 @@ define amdgpu_ps void @test_wqm_vote(float %a) { ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_wqm_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], s[2:3], exec +; GFX1064-NEXT: s_andn2_b64 s[2:3], exec, s[2:3] ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX1064-NEXT: s_cbranch_scc0 .LBB44_2 ; GFX1064-NEXT: ; %bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index d18a2288ef244..14395648de3db 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -2938,7 +2938,7 @@ define amdgpu_ps float @test_strict_wqm_within_wqm_with_kill(<8 x i32> inreg %rs ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 -; GFX9-W64-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-W64-NEXT: s_andn2_b64 s[0:1], exec, vcc ; GFX9-W64-NEXT: s_andn2_b64 s[12:13], s[12:13], s[0:1] ; GFX9-W64-NEXT: s_cbranch_scc0 .LBB51_2 ; GFX9-W64-NEXT: ; %bb.1: ; %main_body @@ -2973,7 +2973,7 @@ define amdgpu_ps float @test_strict_wqm_within_wqm_with_kill(<8 x i32> inreg %rs ; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D -; GFX10-W32-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX10-W32-NEXT: s_andn2_b32 s0, exec_lo, vcc_lo ; GFX10-W32-NEXT: s_andn2_b32 s12, s12, s0 ; GFX10-W32-NEXT: s_cbranch_scc0 .LBB51_2 ; GFX10-W32-NEXT: ; %bb.1: ; %main_body From a94f08174c0312bca0ff6405640eb8a3ff986084 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Tue, 14 Jan 2025 09:02:33 +0000 Subject: [PATCH 383/408] LAA: regen a test with UTC (NFC) (#122748) --- .../wrapping-pointer-versioning.ll | 176 +++++++++++------- 1 file changed, 112 insertions(+), 64 deletions(-) diff --git a/llvm/test/Analysis/LoopAccessAnalysis/wrapping-pointer-versioning.ll b/llvm/test/Analysis/LoopAccessAnalysis/wrapping-pointer-versioning.ll index 1ebe91a044b78..71c20bc2b2a82 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/wrapping-pointer-versioning.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/wrapping-pointer-versioning.ll
@@ -1,6 +1,6 @@ -; RUN: opt -passes='print<access-info>' -aa-pipeline='basic-aa' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=LAA - -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes='print<access-info>' -aa-pipeline='basic-aa' \ +; RUN: -disable-output %s 2>&1 | FileCheck %s ; For this loop: ; unsigned index = 0; ; to check that the pointers don't wrap since the GEPs are not ; inbound. -; LAA-LABEL: f1 -; LAA: Memory dependences are safe{{$}} -; LAA: SCEV assumptions: -; LAA: {0,+,2}<%for.body> Added Flags: <nusw> -; LAA-NEXT: {%a,+,4}<%for.body> Added Flags: <nusw> - ; The expression for %mul_ext as analyzed by SCEV is ; (zext i32 {0,+,2}<%for.body> to i64) ; We have added the nusw flag to turn this expression into the SCEV expression: ; i64 {0,+,2}<%for.body> -; LAA: [PSE] %arrayidxA = getelementptr i16, ptr %a, i64 %mul_ext: -; LAA-NEXT: ((2 * (zext i32 {0,+,2}<%for.body> to i64)) + %a) -; LAA-NEXT: --> {%a,+,4}<%for.body> - - -define void @f1(ptr noalias %a, - ptr noalias %b, i64 %N) { +define void @f1(ptr noalias %a, ptr noalias %b, i64 %N) { +; CHECK-LABEL: 'f1' +; CHECK-NEXT: for.body: +; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Forward: +; CHECK-NEXT: %loadA = load i16, ptr %arrayidxA, align 2 -> +; CHECK-NEXT: store i16 %add, ptr %arrayidxA, align 2 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: {0,+,2}<%for.body> Added Flags: <nusw> +; CHECK-NEXT: {%a,+,4}<%for.body> Added Flags: <nusw> +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; CHECK-NEXT: [PSE] %arrayidxA = getelementptr i16, ptr %a, i64 %mul_ext: +; CHECK-NEXT: ((2 * (zext i32 {0,+,2}<%for.body> to i64)) + %a) +; CHECK-NEXT: --> {%a,+,4}<%for.body> +; entry: br label %for.body @@ -86,23 +95,33 @@ for.end: ; preds = %for.body ; This loop has a negative stride for A, and the nusw flag is required in ; order to properly extend the increment from i32 -4 to i64 -4. -; LAA-LABEL: f2 -; LAA: Memory dependences are safe{{$}} -; LAA: SCEV assumptions: -; LAA-NEXT: {(2 * (trunc i64 %N to i32)),+,-2}<%for.body> Added Flags: <nusw> -; LAA-NEXT: {((4 * (zext i31 (trunc i64 %N to i31) to i64)) + %a),+,-4}<%for.body> Added Flags: <nusw> - ; The expression for %mul_ext as analyzed by SCEV is ; (zext i32 {(2 * (trunc i64 %N to i32)),+,-2}<%for.body> to i64) ; We have added the nusw flag to turn this expression into the following SCEV: ; i64 {zext i32 (2 * (trunc i64 %N to i32)) to i64,+,-2}<%for.body> -; LAA: [PSE] %arrayidxA = getelementptr i16, ptr %a, i64 %mul_ext: -; LAA-NEXT: ((2 * (zext i32 {(2 * (trunc i64 %N to i32)),+,-2}<%for.body> to i64)) + %a) -; LAA-NEXT: --> {((4 * (zext i31 (trunc i64 %N to i31) to i64)) + %a),+,-4}<%for.body> -define void @f2(ptr noalias %a, - ptr noalias %b, i64 %N) { +define void @f2(ptr noalias %a, ptr noalias %b, i64 %N) { +; CHECK-LABEL: 'f2' +; CHECK-NEXT: for.body: +; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Forward: +; CHECK-NEXT: %loadA = load i16, ptr %arrayidxA, align 2 -> +; CHECK-NEXT: store i16 %add, ptr %arrayidxA, align 2 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: {(2 * (trunc i64 %N to i32)),+,-2}<%for.body> Added Flags: <nusw> +; CHECK-NEXT: {((4 * (zext i31 (trunc i64 %N to i31) to i64)) + %a),+,-4}<%for.body> Added Flags: <nusw> +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; CHECK-NEXT: [PSE] %arrayidxA = getelementptr i16, ptr %a, i64 %mul_ext: +; CHECK-NEXT: ((2 * (zext i32 {(2 * (trunc i64 %N to i32)),+,-2}<%for.body> to i64)) + %a) +; CHECK-NEXT: --> {((4 * (zext i31 (trunc i64 %N to i31) to i64)) + %a),+,-4}<%for.body> +; entry: %TruncN = trunc i64 %N to i32 br label %for.body @@ -137,23 +156,33 @@ for.end: ; preds = %for.body ; We replicate the tests above, but this time sign extend 2 * index instead ; of zero extending it. -; LAA-LABEL: f3 -; LAA: Memory dependences are safe{{$}} -; LAA: SCEV assumptions: -; LAA-NEXT: {0,+,2}<%for.body> Added Flags: <nssw> -; LAA-NEXT: {%a,+,4}<%for.body> Added Flags: <nusw> - ; The expression for %mul_ext as analyzed by SCEV is ; i64 (sext i32 {0,+,2}<%for.body> to i64) ; We have added the nssw flag to turn this expression into the following SCEV: ; i64 {0,+,2}<%for.body> -; LAA: [PSE] %arrayidxA = getelementptr i16, ptr %a, i64 %mul_ext: -; LAA-NEXT: ((2 * (sext i32 {0,+,2}<%for.body> to i64)) + %a) -; LAA-NEXT: --> {%a,+,4}<%for.body> -define void @f3(ptr noalias %a, - ptr noalias %b, i64 %N) { +define void @f3(ptr noalias %a, ptr noalias %b, i64 %N) { +; CHECK-LABEL: 'f3' +; CHECK-NEXT: for.body: +; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Forward: +; CHECK-NEXT: %loadA = load i16, ptr %arrayidxA, align 2 -> +; CHECK-NEXT: store i16 %add, ptr %arrayidxA, align 2 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: {0,+,2}<%for.body> Added Flags: <nssw> +; CHECK-NEXT: {%a,+,4}<%for.body> Added Flags: <nusw> +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; CHECK-NEXT: [PSE] %arrayidxA = getelementptr i16, ptr %a, i64 %mul_ext: +; CHECK-NEXT: ((2 * (sext i32 {0,+,2}<%for.body> to i64)) + %a) +; CHECK-NEXT: --> {%a,+,4}<%for.body> +; entry: br label %for.body @@ -184,23 +213,33 @@ for.end: ; preds = %for.body ret void } -; LAA-LABEL: f4 -; LAA: Memory dependences are safe{{$}} -; LAA: SCEV assumptions: -; LAA-NEXT: {(2 * (trunc i64 %N to i32)),+,-2}<%for.body> Added Flags: <nssw> -; LAA-NEXT: {((2 * (sext i32 (2 * (trunc i64 %N to i32)) to i64)) + %a),+,-4}<%for.body> Added Flags: <nusw> - ; The expression for %mul_ext as analyzed by SCEV is ; i64 (sext i32 {(2 * (trunc i64 %N to i32)),+,-2}<%for.body> to i64) ; We have added the nssw flag to turn this expression into the following SCEV: ; i64 {sext i32 (2 * (trunc i64 %N to i32)) to i64,+,-2}<%for.body> -; LAA: [PSE] %arrayidxA = getelementptr i16, ptr %a, i64 %mul_ext: -; LAA-NEXT: ((2 * (sext i32 {(2 * (trunc i64 %N to i32)),+,-2}<%for.body> to i64)) + %a) -; LAA-NEXT: --> {((2 * (sext i32 (2 * (trunc i64 %N to i32)) to i64)) + %a),+,-4}<%for.body> -define void @f4(ptr noalias %a, - ptr noalias %b, i64 %N) { +define void @f4(ptr noalias %a, ptr noalias %b, i64 %N) { +; CHECK-LABEL: 'f4' +; CHECK-NEXT: for.body: +; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Forward: +; CHECK-NEXT: %loadA = load i16, ptr %arrayidxA, align 2 -> +; CHECK-NEXT: store i16 %add, ptr %arrayidxA, align 2 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: {(2 * (trunc i64 %N to i32)),+,-2}<%for.body> Added Flags: <nssw> +; CHECK-NEXT: {((2 * (sext i32 (2 * (trunc i64 %N to i32)) to i64)) + %a),+,-4}<%for.body> Added Flags: <nusw> +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; CHECK-NEXT: [PSE] %arrayidxA = getelementptr i16, ptr %a, i64 %mul_ext: +; CHECK-NEXT: ((2 * (sext i32 {(2 * (trunc i64 %N to i32)),+,-2}<%for.body> to i64)) + %a) +; CHECK-NEXT: --> {((2 * (sext i32 (2 * (trunc i64 %N to i32)) to i64)) + %a),+,-4}<%for.body> +; entry: %TruncN = trunc i64 %N to i32 br label %for.body @@ -239,18 +278,27 @@ for.end: ; preds = %for.body ; ; We can still analyze this by adding the required no wrap SCEV predicates. -; LAA-LABEL: f5 -; LAA: Memory dependences are safe{{$}} -; LAA: SCEV assumptions: -; LAA-NEXT: {(2 * (trunc i64 %N to i32)),+,-2}<%for.body> Added Flags: <nssw> -; LAA-EMPTY: - -; LAA: [PSE] %arrayidxA = getelementptr inbounds i16, ptr %a, i32 %mul: -; LAA-NEXT: ((2 * (sext i32 {(2 * (trunc i64 %N to i32)),+,-2}<%for.body> to i64)) + %a) -; LAA-NEXT: --> {((2 * (sext i32 (2 * (trunc i64 %N to i32)) to i64)) + %a),+,-4}<%for.body> -define void @f5(ptr noalias %a, - ptr noalias %b, i64 %N) { +define void @f5(ptr noalias %a, ptr noalias %b, i64 %N) { +; CHECK-LABEL: 'f5' +; CHECK-NEXT: for.body: +; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Forward: +; CHECK-NEXT: %loadA = load i16, ptr %arrayidxA, align 2 -> +; CHECK-NEXT: store i16 %add, ptr %arrayidxA, align 2 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: {(2 * (trunc i64 %N to i32)),+,-2}<%for.body> Added Flags: <nssw> +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; CHECK-NEXT: [PSE] %arrayidxA = getelementptr inbounds i16, ptr %a, i32 %mul: +; CHECK-NEXT: ((2 * (sext i32 {(2 * (trunc i64 %N to i32)),+,-2}<%for.body> to i64)) + %a) +; CHECK-NEXT: --> {((2 * (sext i32 (2 * (trunc i64 %N to i32)) to i64)) + %a),+,-4}<%for.body> +; entry: %TruncN = trunc i64 %N to i32 br label %for.body From 7a0f75c7385e971b84f05da2e48c138dc40f2b3b Mon Sep 17 00:00:00 2001 From: Antonio Frighetto Date: Tue, 14 Jan 2025 10:02:17 +0100 Subject: [PATCH 384/408] Reapply "[GVN] MemorySSA for GVN: add optional `AllowMemorySSA`" Original commit: eb63cd62a4a1907dbd58f12660efd8244e7d81e9 Previously reverted due to non-negligible compile-time impact in stage1-ReleaseLTO-g scenario. The issue has been addressed by always reusing previously computed MemorySSA results, and request new ones only when `isMemorySSAEnabled` is set. Co-authored-by: Momchil Velikov --- llvm/include/llvm/Transforms/Scalar/GVN.h | 13 ++++++--- llvm/lib/Passes/PassBuilder.cpp | 2 ++ llvm/lib/Passes/PassRegistry.def | 2 +- llvm/lib/Transforms/Scalar/GVN.cpp | 32 ++++++++++++++++++----- llvm/test/Other/new-pm-print-pipeline.ll | 4 +-- 5 files changed, 41 insertions(+), 12 deletions(-) diff --git a/llvm/include/llvm/Transforms/Scalar/GVN.h b/llvm/include/llvm/Transforms/Scalar/GVN.h index be6c0ec5edab0..c8be390799836 100644 --- a/llvm/include/llvm/Transforms/Scalar/GVN.h +++ b/llvm/include/llvm/Transforms/Scalar/GVN.h @@ -77,6 +77,7 @@ struct GVNOptions { std::optional<bool> AllowLoadInLoopPRE; std::optional<bool> AllowLoadPRESplitBackedge; std::optional<bool> AllowMemDep; + std::optional<bool> AllowMemorySSA; GVNOptions() = default; @@ -108,6 +109,12 @@ struct GVNOptions { AllowMemDep = MemDep; return *this; } + + /// Enables or disables use of MemorySSA. + GVNOptions &setMemorySSA(bool MemSSA) { + AllowMemorySSA = MemSSA; + return *this; + } }; /// The core GVN pass object. @@ -144,6 +151,7 @@ class GVNPass : public PassInfoMixin<GVNPass> { bool isLoadInLoopPREEnabled() const; bool isLoadPRESplitBackedgeEnabled() const; bool isMemDepEnabled() const; + bool isMemorySSAEnabled() const; /// This class holds the mapping between values and value numbers. It is used /// as an efficient mechanism to determine the expression-wise equivalence of @@ -383,9 +391,8 @@ class GVNPass : public PassInfoMixin<GVNPass> { void assignBlockRPONumber(Function &F); }; -/// Create a legacy GVN pass. This also allows parameterizing whether or not -/// MemDep is enabled. -FunctionPass *createGVNPass(bool NoMemDepAnalysis = false); +/// Create a legacy GVN pass. +FunctionPass *createGVNPass(); /// A simple and fast domtree-based GVN pass to hoist common expressions /// from sibling branches.
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index f923d5aabe0a0..94782547325ed 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -1042,6 +1042,8 @@ Expected<GVNOptions> parseGVNOptions(StringRef Params) { Result.setLoadPRESplitBackedge(Enable); } else if (ParamName == "memdep") { Result.setMemDep(Enable); + } else if (ParamName == "memoryssa") { + Result.setMemorySSA(Enable); } else { return make_error<StringError>( formatv("invalid GVN pass parameter '{0}' ", ParamName).str(), diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 1021d7fcd9247..a93a995655a14 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -526,7 +526,7 @@ FUNCTION_PASS_WITH_PARAMS( "gvn", "GVNPass", [](GVNOptions Opts) { return GVNPass(Opts); }, parseGVNOptions, "no-pre;pre;no-load-pre;load-pre;no-split-backedge-load-pre;" "split-backedge-load-pre;no-memdep;memdep;no-memoryssa;memoryssa") FUNCTION_PASS_WITH_PARAMS( "hardware-loops", "HardwareLoopsPass", [](HardwareLoopOptions Opts) { return HardwareLoopsPass(Opts); }, diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index 229fffe92b99c..31af2d8a617b6 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -113,6 +113,8 @@ static cl::opt<bool> GVNEnableSplitBackedgeInLoadPRE("enable-split-backedge-in-load-pre", cl::init(false)); static cl::opt<bool> GVNEnableMemDep("enable-gvn-memdep", cl::init(true)); +static cl::opt<bool> GVNEnableMemorySSA("enable-gvn-memoryssa", + cl::init(false)); static cl::opt<uint32_t> MaxNumDeps( "gvn-max-num-deps", cl::Hidden, cl::init(100), @@ -820,6 +822,10 @@ bool GVNPass::isMemDepEnabled() const { return Options.AllowMemDep.value_or(GVNEnableMemDep); } +bool GVNPass::isMemorySSAEnabled() const { + return Options.AllowMemorySSA.value_or(GVNEnableMemorySSA); +} + PreservedAnalyses GVNPass::run(Function &F, FunctionAnalysisManager &AM) { // FIXME: The order of evaluation of these 'getResult' calls is very // significant! Re-ordering these variables will cause GVN when run alone to @@ -833,6 +839,11 @@ PreservedAnalyses GVNPass::run(Function &F, FunctionAnalysisManager &AM) { isMemDepEnabled() ? &AM.getResult<MemoryDependenceAnalysis>(F) : nullptr; auto &LI = AM.getResult<LoopAnalysis>(F); auto *MSSA = AM.getCachedResult<MemorySSAAnalysis>(F); + if (isMemorySSAEnabled() && !MSSA) { + assert(!MemDep && + "On-demand computation of MemSSA implies that MemDep is disabled!"); + MSSA = &AM.getResult<MemorySSAAnalysis>(F); + } auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); bool Changed = runImpl(F, AC, DT, TLI, AA, MemDep, LI, &ORE, MSSA ? &MSSA->getMSSA() : nullptr); @@ -861,7 +872,9 @@ void GVNPass::printPipeline( OS << (*Options.AllowLoadPRESplitBackedge ? "" : "no-") << "split-backedge-load-pre;"; if (Options.AllowMemDep != std::nullopt) - OS << (*Options.AllowMemDep ? "" : "no-") << "memdep"; + OS << (*Options.AllowMemDep ? "" : "no-") << "memdep;"; + if (Options.AllowMemorySSA != std::nullopt) + OS << (*Options.AllowMemorySSA ?
"" : "no-") << "memoryssa"; OS << '>'; } @@ -3293,8 +3306,11 @@ class llvm::gvn::GVNLegacyPass : public FunctionPass { public: static char ID; // Pass identification, replacement for typeid - explicit GVNLegacyPass(bool NoMemDepAnalysis = !GVNEnableMemDep) - : FunctionPass(ID), Impl(GVNOptions().setMemDep(!NoMemDepAnalysis)) { + explicit GVNLegacyPass(bool MemDepAnalysis = GVNEnableMemDep, + bool MemSSAAnalysis = GVNEnableMemorySSA) + : FunctionPass(ID), Impl(GVNOptions() + .setMemDep(MemDepAnalysis) + .setMemorySSA(MemSSAAnalysis)) { initializeGVNLegacyPassPass(*PassRegistry::getPassRegistry()); } @@ -3303,6 +3319,9 @@ class llvm::gvn::GVNLegacyPass : public FunctionPass { return false; auto *MSSAWP = getAnalysisIfAvailable<MemorySSAWrapperPass>(); + if (Impl.isMemorySSAEnabled() && !MSSAWP) + MSSAWP = &getAnalysis<MemorySSAWrapperPass>(); + return Impl.runImpl( F, getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F), getAnalysis<DominatorTreeWrapperPass>().getDomTree(), @@ -3330,6 +3349,8 @@ class llvm::gvn::GVNLegacyPass : public FunctionPass { AU.addPreserved<TargetLibraryInfoWrapperPass>(); AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); AU.addPreserved<MemorySSAWrapperPass>(); + if (Impl.isMemorySSAEnabled()) + AU.addRequired<MemorySSAWrapperPass>(); } private: @@ -3341,6 +3362,7 @@ char GVNLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(GVNLegacyPass, "gvn", "Global Value Numbering", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) @@ -3349,6 +3371,4 @@ INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) INITIALIZE_PASS_END(GVNLegacyPass, "gvn", "Global Value Numbering", false, false) // The public interface to this file... -FunctionPass *llvm::createGVNPass(bool NoMemDepAnalysis) { - return new GVNLegacyPass(NoMemDepAnalysis); -} +FunctionPass *llvm::createGVNPass() { return new GVNLegacyPass(); } diff --git a/llvm/test/Other/new-pm-print-pipeline.ll b/llvm/test/Other/new-pm-print-pipeline.ll index 9016473b36ba4..eb3ffe3a098dd 100644 --- a/llvm/test/Other/new-pm-print-pipeline.ll +++ b/llvm/test/Other/new-pm-print-pipeline.ll @@ -31,8 +31,8 @@ ; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(loop-unroll<>,loop-unroll,loop-unroll)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-10 ; CHECK-10: function(loop-unroll,loop-unroll,loop-unroll) -; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(gvn<>,gvn<pre;load-pre;split-backedge-load-pre;memdep>,gvn<no-pre;no-load-pre;no-split-backedge-load-pre;no-memdep>)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-11 -; CHECK-11: function(gvn<>,gvn<pre;load-pre;split-backedge-load-pre;memdep>,gvn<no-pre;no-load-pre;no-split-backedge-load-pre;no-memdep>) +; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(gvn<>,gvn<pre;load-pre;split-backedge-load-pre;memdep;memoryssa>,gvn<no-pre;no-load-pre;no-split-backedge-load-pre;no-memdep;no-memoryssa>)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-11 +; CHECK-11: function(gvn<>,gvn<pre;load-pre;split-backedge-load-pre;memdep;memoryssa>,gvn<no-pre;no-load-pre;no-split-backedge-load-pre;no-memdep;no-memoryssa>) ; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(early-cse<>,early-cse)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-12 ; CHECK-12: function(early-cse<>,early-cse) From 6d2534546582721b8d7f10963c329de0a04f0bfe Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Tue, 14 Jan 2025 01:10:42 -0800 Subject: [PATCH 385/408] Remove the `CustomEntry` escape hatch from builtin TableGen (#120861) This was an especially challenging escape hatch because it directly forced the use of a specific X-macro structure and prevented any other form of TableGen emission.
The problematic feature that motivated this is a case where a builtin's prototype can't be represented in the mini-language used by TableGen. Instead of adding a complete custom entry for this, this PR just teaches the prototype handling to do the same thing the X-macros did in this case: emit an empty string and let the Clang builtin handling respond appropriately. This should produce identical results while preserving all the rest of the structured representation in the builtin TableGen code. --- clang/include/clang/Basic/Builtins.td | 8 ++++--- clang/include/clang/Basic/BuiltinsBase.td | 13 +++++++---- clang/utils/TableGen/ClangBuiltinsEmitter.cpp | 22 ++++++++++++++----- 3 files changed, 31 insertions(+), 12 deletions(-) diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index f4216bd01a074..ea22690ce4f5c 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -3347,10 +3347,12 @@ def VFork : LibBuiltin<"unistd.h"> { } // POSIX pthread.h -// FIXME: This should be a GNULibBuiltin, but it's currently missing the prototype. -def PthreadCreate : CustomEntry { - let Entry = "LIBBUILTIN(pthread_create, \"\", \"fC<2,3>\", PTHREAD_H, ALL_GNU_LANGUAGES)"; +def PthreadCreate : GNULibBuiltin<"pthread.h"> { + let Spellings = ["pthread_create"]; + let Attributes = [FunctionWithoutBuiltinPrefix, Callback<[2, 3]>]; + // Note that we don't have an expressible prototype so we leave it empty. + let Prototype = ""; } def SigSetJmp : LibBuiltin<"setjmp.h"> { diff --git a/clang/include/clang/Basic/BuiltinsBase.td b/clang/include/clang/Basic/BuiltinsBase.td index 1a1096d41da40..6180a94aa4b5c 100644 --- a/clang/include/clang/Basic/BuiltinsBase.td +++ b/clang/include/clang/Basic/BuiltinsBase.td @@ -17,6 +17,11 @@ class IndexedAttribute<string baseMangling, int I> : Attribute<baseMangling> { int Index = I; } +class MultiIndexAttribute<string baseMangling, list<int> Is> + : Attribute<baseMangling> { + list<int> Indices = Is; +} + // Standard Attributes // ------------------- def NoReturn : Attribute<"r">; @@ -77,6 +82,10 @@ def Constexpr : Attribute<"E">; // Builtin is immediate and must be constant evaluated. Implies Constexpr, and will only be supported in C++20 mode. def Consteval : Attribute<"EG">; +// Callback behavior: the first index argument is called with the arguments +// indicated by the remaining indices. +class Callback<list<int> ArgIndices> : MultiIndexAttribute<"C", ArgIndices>; + // Builtin kinds // ============= @@ -92,10 +101,6 @@ class Builtin { bit EnableOpenCLLong = 0; } -class CustomEntry { - string Entry; -} - class AtomicBuiltin : Builtin; class LibBuiltin : Builtin { diff --git a/clang/utils/TableGen/ClangBuiltinsEmitter.cpp b/clang/utils/TableGen/ClangBuiltinsEmitter.cpp index 94cc218376002..6aca4edfdfb88 100644 --- a/clang/utils/TableGen/ClangBuiltinsEmitter.cpp +++ b/clang/utils/TableGen/ClangBuiltinsEmitter.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "TableGenBackends.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" @@ -39,6 +40,14 @@ class PrototypeParser { private: void ParsePrototype(StringRef Prototype) { Prototype = Prototype.trim(); + + // Some builtins don't have an expressible prototype, simply emit an empty + // string for them.
+ if (Prototype.empty()) { + Type = ""; + return; + } + ParseTypes(Prototype); } @@ -246,8 +255,15 @@ void PrintAttributes(const Record *Builtin, BuiltinType BT, raw_ostream &OS) { for (const auto *Attr : Builtin->getValueAsListOfDefs("Attributes")) { OS << Attr->getValueAsString("Mangling"); - if (Attr->isSubClassOf("IndexedAttribute")) + if (Attr->isSubClassOf("IndexedAttribute")) { OS << ':' << Attr->getValueAsInt("Index") << ':'; + } else if (Attr->isSubClassOf("MultiIndexAttribute")) { + OS << '<'; + llvm::ListSeparator Sep(","); + for (int64_t Index : Attr->getValueAsListOfInts("Indices")) + OS << Sep << Index; + OS << '>'; + } } OS << '\"'; } @@ -405,10 +421,6 @@ void clang::EmitClangBuiltins(const RecordKeeper &Records, raw_ostream &OS) { EmitBuiltin(OS, Builtin); } - for (const auto *Entry : Records.getAllDerivedDefinitions("CustomEntry")) { - OS << Entry->getValueAsString("Entry") << '\n'; - } - OS << R"c++( #undef ATOMIC_BUILTIN #undef BUILTIN From 4cc9bf149f07edec5ea910af8b3ead17ae8b29b7 Mon Sep 17 00:00:00 2001 From: higher-performance Date: Tue, 14 Jan 2025 04:22:20 -0500 Subject: [PATCH 386/408] Propagate lifetimebound from formal parameters to those in the canonical declaration and use that for analysis (#107627) This partially fixes #62072 by making sure that re-declarations of a function do not have the effect of removing lifetimebound from the canonical declaration. It doesn't handle the implicit 'this' parameter, but that can be addressed in a separate fix. --- clang/lib/Sema/CheckExprLifetime.cpp | 37 ++++++++---- clang/lib/Sema/SemaDecl.cpp | 68 ++++++++++++++++------- clang/test/SemaCXX/attr-lifetimebound.cpp | 5 ++ 3 files changed, 79 insertions(+), 31 deletions(-) diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp index 837414c4840d7..27e6b5b2cb393 100644 --- a/clang/lib/Sema/CheckExprLifetime.cpp +++ b/clang/lib/Sema/CheckExprLifetime.cpp @@ -525,7 +525,20 @@ static bool isNormalAssignmentOperator(const FunctionDecl *FD) { return false; } +static const FunctionDecl * +getDeclWithMergedLifetimeBoundAttrs(const FunctionDecl *FD) { + return FD != nullptr ? 
FD->getMostRecentDecl() : nullptr; +} + +static const CXXMethodDecl * +getDeclWithMergedLifetimeBoundAttrs(const CXXMethodDecl *CMD) { + const FunctionDecl *FD = CMD; + return cast_if_present( + getDeclWithMergedLifetimeBoundAttrs(FD)); +} + bool implicitObjectParamIsLifetimeBound(const FunctionDecl *FD) { + FD = getDeclWithMergedLifetimeBoundAttrs(FD); const TypeSourceInfo *TSI = FD->getTypeSourceInfo(); if (!TSI) return false; @@ -647,9 +660,9 @@ static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, } } - for (unsigned I = 0, - N = std::min(Callee->getNumParams(), Args.size()); - I != N; ++I) { + const FunctionDecl *CanonCallee = getDeclWithMergedLifetimeBoundAttrs(Callee); + unsigned NP = std::min(Callee->getNumParams(), CanonCallee->getNumParams()); + for (unsigned I = 0, N = std::min(NP, Args.size()); I != N; ++I) { Expr *Arg = Args[I]; RevertToOldSizeRAII RAII(Path); if (auto *DAE = dyn_cast(Arg)) { @@ -657,11 +670,12 @@ static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, {IndirectLocalPathEntry::DefaultArg, DAE, DAE->getParam()}); Arg = DAE->getExpr(); } - if (CheckCoroCall || Callee->getParamDecl(I)->hasAttr()) - VisitLifetimeBoundArg(Callee->getParamDecl(I), Arg); + if (CheckCoroCall || + CanonCallee->getParamDecl(I)->hasAttr()) + VisitLifetimeBoundArg(CanonCallee->getParamDecl(I), Arg); else if (const auto *CaptureAttr = - Callee->getParamDecl(I)->getAttr(); - CaptureAttr && isa(Callee) && + CanonCallee->getParamDecl(I)->getAttr(); + CaptureAttr && isa(CanonCallee) && llvm::any_of(CaptureAttr->params(), [](int ArgIdx) { return ArgIdx == LifetimeCaptureByAttr::THIS; })) @@ -678,11 +692,11 @@ static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, // `lifetimebound` and shares the same code path. This implies the emitted // diagnostics will be emitted under `-Wdangling`, not // `-Wdangling-capture`. - VisitLifetimeBoundArg(Callee->getParamDecl(I), Arg); + VisitLifetimeBoundArg(CanonCallee->getParamDecl(I), Arg); else if (EnableGSLAnalysis && I == 0) { // Perform GSL analysis for the first argument - if (shouldTrackFirstArgument(Callee)) { - VisitGSLPointerArg(Callee, Arg); + if (shouldTrackFirstArgument(CanonCallee)) { + VisitGSLPointerArg(CanonCallee, Arg); } else if (auto *Ctor = dyn_cast(Call); Ctor && shouldTrackFirstArgumentForConstructor(Ctor)) { VisitGSLPointerArg(Ctor->getConstructor(), Arg); @@ -1245,7 +1259,8 @@ static AnalysisResult analyzePathForGSLPointer(const IndirectLocalPath &Path, return Report; } -static bool isAssignmentOperatorLifetimeBound(CXXMethodDecl *CMD) { +static bool isAssignmentOperatorLifetimeBound(const CXXMethodDecl *CMD) { + CMD = getDeclWithMergedLifetimeBoundAttrs(CMD); return CMD && isNormalAssignmentOperator(CMD) && CMD->param_size() == 1 && CMD->getParamDecl(0)->hasAttr(); } diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 5b7275c316f74..f5e57988b7fa8 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -3239,6 +3239,42 @@ void Sema::mergeDeclAttributes(NamedDecl *New, Decl *Old, if (!foundAny) New->dropAttrs(); } +// Returns the number of added attributes. 
+template +static unsigned propagateAttribute(ParmVarDecl *To, const ParmVarDecl *From, + Sema &S) { + unsigned found = 0; + for (const auto *I : From->specific_attrs()) { + if (!DeclHasAttr(To, I)) { + T *newAttr = cast(I->clone(S.Context)); + newAttr->setInherited(true); + To->addAttr(newAttr); + ++found; + } + } + return found; +} + +template +static void propagateAttributes(ParmVarDecl *To, const ParmVarDecl *From, + F &&propagator) { + if (!From->hasAttrs()) { + return; + } + + bool foundAny = To->hasAttrs(); + + // Ensure that any moving of objects within the allocated map is + // done before we process them. + if (!foundAny) + To->setAttrs(AttrVec()); + + foundAny |= std::forward(propagator)(To, From) != 0; + + if (!foundAny) + To->dropAttrs(); +} + /// mergeParamDeclAttributes - Copy attributes from the old parameter /// to the new one. static void mergeParamDeclAttributes(ParmVarDecl *newDecl, @@ -3262,26 +3298,17 @@ static void mergeParamDeclAttributes(ParmVarDecl *newDecl, diag::note_carries_dependency_missing_first_decl) << 1/*Param*/; } - if (!oldDecl->hasAttrs()) - return; - - bool foundAny = newDecl->hasAttrs(); - - // Ensure that any moving of objects within the allocated map is - // done before we process them. - if (!foundAny) newDecl->setAttrs(AttrVec()); - - for (const auto *I : oldDecl->specific_attrs()) { - if (!DeclHasAttr(newDecl, I)) { - InheritableAttr *newAttr = - cast(I->clone(S.Context)); - newAttr->setInherited(true); - newDecl->addAttr(newAttr); - foundAny = true; - } - } - - if (!foundAny) newDecl->dropAttrs(); + propagateAttributes( + newDecl, oldDecl, [&S](ParmVarDecl *To, const ParmVarDecl *From) { + unsigned found = 0; + found += propagateAttribute(To, From, S); + // Propagate the lifetimebound attribute from parameters to the + // most recent declaration. Note that this doesn't include the implicit + // 'this' parameter, as the attribute is applied to the function type in + // that case. + found += propagateAttribute(To, From, S); + return found; + }); } static bool EquivalentArrayTypes(QualType Old, QualType New, @@ -6960,6 +6987,7 @@ static void checkInheritableAttr(Sema &S, NamedDecl &ND) { static void checkLifetimeBoundAttr(Sema &S, NamedDecl &ND) { // Check the attributes on the function type and function params, if any. if (const auto *FD = dyn_cast(&ND)) { + FD = FD->getMostRecentDecl(); // Don't declare this variable in the second operand of the for-statement; // GCC miscompiles that by ending its lifetime before evaluating the // third operand. See gcc.gnu.org/PR86769. 
diff --git a/clang/test/SemaCXX/attr-lifetimebound.cpp b/clang/test/SemaCXX/attr-lifetimebound.cpp
index 896793f996666..e7c8b35cb0c48 100644
--- a/clang/test/SemaCXX/attr-lifetimebound.cpp
+++ b/clang/test/SemaCXX/attr-lifetimebound.cpp
@@ -33,6 +33,10 @@ namespace usage_invalid {
 namespace usage_ok {
   struct IntRef { int *target; };

+  const int &crefparam(const int &param); // Omitted in first decl
+  const int &crefparam(const int &param); // Omitted in second decl
+  const int &crefparam(const int &param [[clang::lifetimebound]]); // Add LB
+  const int &crefparam(const int &param) { return param; } // Omit in impl
   int &refparam(int &param [[clang::lifetimebound]]);
   int &classparam(IntRef param [[clang::lifetimebound]]);

@@ -62,6 +66,7 @@ namespace usage_ok {
   int *p = A().class_member(); // expected-warning {{temporary whose address is used as value of local variable 'p' will be destroyed at the end of the full-expression}}
   int *q = A(); // expected-warning {{temporary whose address is used as value of local variable 'q' will be destroyed at the end of the full-expression}}
   int *r = A(1); // expected-warning {{temporary whose address is used as value of local variable 'r' will be destroyed at the end of the full-expression}}
+  const int& s = crefparam(2); // expected-warning {{temporary bound to local reference 's' will be destroyed at the end of the full-expression}}

   void test_assignment() {
     p = A().class_member(); // expected-warning {{object backing the pointer p will be destroyed at the end of the full-expression}}

From ea4a87957f7a30139680d5e4856b754a14de0c4b Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot
Date: Tue, 14 Jan 2025 09:26:18 +0000
Subject: [PATCH 387/408] [gn build] Port 42595bdaefb6

---
 .../gn/secondary/llvm/unittests/ExecutionEngine/JITLink/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/llvm/unittests/ExecutionEngine/JITLink/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/ExecutionEngine/JITLink/BUILD.gn
index 7f74b335e30ed..78802e5cc2368 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/ExecutionEngine/JITLink/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/ExecutionEngine/JITLink/BUILD.gn
@@ -16,6 +16,7 @@ unittest("JITLinkTests") {
   sources = [
     "AArch32ErrorTests.cpp",
     "AArch32Tests.cpp",
+    "AArch64Tests.cpp",
     "EHFrameSupportTests.cpp",
     "JITLinkTestUtils.cpp",
     "LinkGraphTests.cpp",

From 0209739597b42f3f617db89043a9c1efe7825c0d Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi
Date: Tue, 14 Jan 2025 09:34:27 +0000
Subject: [PATCH 388/408] [InterleavedAccessPass]: Ensure that dead nodes get
 erased only once (#122643)

Use SmallSetVector instead of SmallVector to avoid duplication, so that
dead nodes get erased/deleted only once.
---
 llvm/lib/CodeGen/InterleavedAccessPass.cpp | 41 ++++++++++---------
 .../AArch64/sve-interleave4.ll             | 14 +++++++
 2 files changed, 35 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 8b6e3180986c3..c6d5533fd2bae 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -100,21 +100,21 @@ class InterleavedAccessImpl {

   /// Transform an interleaved load into target specific intrinsics.
   bool lowerInterleavedLoad(LoadInst *LI,
-                            SmallVectorImpl &DeadInsts);
+                            SmallSetVector &DeadInsts);

   /// Transform an interleaved store into target specific intrinsics.
bool lowerInterleavedStore(StoreInst *SI, - SmallVectorImpl &DeadInsts); + SmallSetVector &DeadInsts); /// Transform a load and a deinterleave intrinsic into target specific /// instructions. bool lowerDeinterleaveIntrinsic(IntrinsicInst *II, - SmallVectorImpl &DeadInsts); + SmallSetVector &DeadInsts); /// Transform an interleave intrinsic and a store into target specific /// instructions. bool lowerInterleaveIntrinsic(IntrinsicInst *II, - SmallVectorImpl &DeadInsts); + SmallSetVector &DeadInsts); /// Returns true if the uses of an interleaved load by the /// extractelement instructions in \p Extracts can be replaced by uses of the @@ -249,7 +249,7 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor, } bool InterleavedAccessImpl::lowerInterleavedLoad( - LoadInst *LI, SmallVectorImpl &DeadInsts) { + LoadInst *LI, SmallSetVector &DeadInsts) { if (!LI->isSimple() || isa(LI->getType())) return false; @@ -348,9 +348,9 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( return !Extracts.empty() || BinOpShuffleChanged; } - append_range(DeadInsts, Shuffles); + DeadInsts.insert(Shuffles.begin(), Shuffles.end()); - DeadInsts.push_back(LI); + DeadInsts.insert(LI); return true; } @@ -453,7 +453,7 @@ bool InterleavedAccessImpl::tryReplaceExtracts( } bool InterleavedAccessImpl::lowerInterleavedStore( - StoreInst *SI, SmallVectorImpl &DeadInsts) { + StoreInst *SI, SmallSetVector &DeadInsts) { if (!SI->isSimple()) return false; @@ -473,13 +473,13 @@ bool InterleavedAccessImpl::lowerInterleavedStore( return false; // Already have a new target specific interleaved store. Erase the old store. - DeadInsts.push_back(SI); - DeadInsts.push_back(SVI); + DeadInsts.insert(SI); + DeadInsts.insert(SVI); return true; } bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( - IntrinsicInst *DI, SmallVectorImpl &DeadInsts) { + IntrinsicInst *DI, SmallSetVector &DeadInsts) { LoadInst *LI = dyn_cast(DI->getOperand(0)); if (!LI || !LI->hasOneUse() || !LI->isSimple()) @@ -488,17 +488,19 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI << "\n"); // Try and match this with target specific intrinsics. - if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LI, DeadInsts)) + SmallVector DeinterleaveDeadInsts; + if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LI, DeinterleaveDeadInsts)) return false; + DeadInsts.insert(DeinterleaveDeadInsts.begin(), DeinterleaveDeadInsts.end()); // We now have a target-specific load, so delete the old one. - DeadInsts.push_back(DI); - DeadInsts.push_back(LI); + DeadInsts.insert(DI); + DeadInsts.insert(LI); return true; } bool InterleavedAccessImpl::lowerInterleaveIntrinsic( - IntrinsicInst *II, SmallVectorImpl &DeadInsts) { + IntrinsicInst *II, SmallSetVector &DeadInsts) { if (!II->hasOneUse()) return false; @@ -515,16 +517,15 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( return false; // We now have a target-specific store, so delete the old one. - DeadInsts.push_back(SI); - DeadInsts.push_back(II); - DeadInsts.insert(DeadInsts.end(), InterleaveDeadInsts.begin(), - InterleaveDeadInsts.end()); + DeadInsts.insert(SI); + DeadInsts.insert(II); + DeadInsts.insert(InterleaveDeadInsts.begin(), InterleaveDeadInsts.end()); return true; } bool InterleavedAccessImpl::runOnFunction(Function &F) { // Holds dead instructions that will be erased later. 
- SmallVector DeadInsts; + SmallSetVector DeadInsts; bool Changed = false; for (auto &I : instructions(F)) { diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll index e8d113ae3763d..085089978d8f5 100644 --- a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll +++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll @@ -55,3 +55,17 @@ define void @mix_interleave4_interleave2(ptr %dst1, ptr %dst2, %interleaved, ptr %dst2, align 4 ret void } + +; This case tests when the interleave is using same parameter twice, +; the dead parameter will not get deleted twice. +define void @duplicate_by_interleave( %A, %B, ptr writeonly %AB_duplicate) { +; CHECK-LABEL: define void @duplicate_by_interleave +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]], ptr writeonly [[AB_DUPLICATE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32( [[A]], [[A]], [[B]], [[B]], splat (i1 true), ptr [[AB_DUPLICATE]]) +; CHECK-NEXT: ret void +; + %interleave = tail call @llvm.vector.interleave2.nxv8i32( %A, %B) + %duplicate_by_interleave = tail call @llvm.vector.interleave2.nxv16i32( %interleave, %interleave) + store %duplicate_by_interleave, ptr %AB_duplicate, align 4 + ret void +} From df1a84d2ed6565ea2a5ff8111eb984499ba9e571 Mon Sep 17 00:00:00 2001 From: Jacob Lalonde Date: Tue, 14 Jan 2025 01:35:41 -0800 Subject: [PATCH 389/408] [llvm][Docs] Add Minidump related LLDB release notes (#122759) Add some release notes for the Minidump work I did over the last few months. --- llvm/docs/ReleaseNotes.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index c24dc1976b4d0..be98725700178 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -450,6 +450,13 @@ Changes to LLDB _regexp-display -- Evaluate an expression at every stop (see 'h... ``` + * Minidumps generated by LLDB now support: + * 64 bit memory (due to 64b support, Minidumps are now paged to disk while being written). + * Capturing of TLS variables. + * Multiple signals or exceptions, including breakpoints. + + * [New Core File API](https://lldb.llvm.org/python_api/lldb.SBSaveCoreOptions.html). This gives greater control on the data captured into the core file, relative to the existing `process save-core` styles. + Changes to BOLT --------------------------------- From ac857f9bdd500d274d7996e0fa14aaf8b765d745 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Tue, 14 Jan 2025 10:57:36 +0100 Subject: [PATCH 390/408] [clang][bytecode] Change the way we do init chains (#122871) See the comment in Compiler<>::VisitCXXThisExpr. We need to mark the InitList explicitly, so we later know what to refer to when the init chain is active. 
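For a concrete picture, here is a minimal reproducer; it mirrors the
`ExplicitThisInTemporary` test added to `records.cpp` below, with the
comments added as annotation rather than being part of the test:

```c++
// B's default member initializer captures `this`. When `g({})` builds its
// argument from an init list, the CXXThisExpr must resolve to the temporary
// being initialized rather than to the current function frame, which is what
// the explicit InitList marker in the init chain makes possible.
struct B { B *p = this; };
constexpr bool g(B b) { return &b == b.p; }
static_assert(g({}), "");
```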
--- clang/lib/AST/ByteCode/Compiler.cpp | 44 +++++++++++++++++++++++++++-- clang/lib/AST/ByteCode/Compiler.h | 2 ++ clang/test/AST/ByteCode/records.cpp | 6 ++++ clang/test/AST/ByteCode/unions.cpp | 20 +++++++++++++ 4 files changed, 69 insertions(+), 3 deletions(-) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 036f9608bf3ca..2326480fe2eaf 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -90,6 +90,8 @@ bool InitLink::emit(Compiler *Ctx, const Expr *E) const { if (!Ctx->emitConstUint32(Offset, E)) return false; return Ctx->emitArrayElemPtrPopUint32(E); + case K_InitList: + return true; default: llvm_unreachable("Unhandled InitLink kind"); } @@ -1717,6 +1719,8 @@ bool Compiler::VisitArraySubscriptExpr(const ArraySubscriptExpr *E) { template bool Compiler::visitInitList(ArrayRef Inits, const Expr *ArrayFiller, const Expr *E) { + InitLinkScope ILS(this, InitLink::InitList()); + QualType QT = E->getType(); if (const auto *AT = QT->getAs()) QT = AT->getValueType(); @@ -1754,6 +1758,7 @@ bool Compiler::visitInitList(ArrayRef Inits, auto initPrimitiveField = [=](const Record::Field *FieldToInit, const Expr *Init, PrimType T) -> bool { InitStackScope ISS(this, isa(Init)); + InitLinkScope ILS(this, InitLink::Field(FieldToInit->Offset)); if (!this->visit(Init)) return false; @@ -1766,6 +1771,7 @@ bool Compiler::visitInitList(ArrayRef Inits, const Expr *Init) -> bool { InitStackScope ISS(this, isa(Init)); InitLinkScope ILS(this, InitLink::Field(FieldToInit->Offset)); + // Non-primitive case. Get a pointer to the field-to-initialize // on the stack and recurse into visitInitializer(). if (!this->emitGetPtrField(FieldToInit->Offset, Init)) @@ -3812,6 +3818,7 @@ template bool Compiler::visit(const Expr *E) { if (!this->emitGetPtrLocal(*LocalIndex, E)) return false; + InitLinkScope ILS(this, InitLink::Temp(*LocalIndex)); return this->visitInitializer(E); } @@ -4848,18 +4855,49 @@ bool Compiler::VisitCXXThisExpr(const CXXThisExpr *E) { // instance pointer of the current function frame, but e.g. to the declaration // currently being initialized. Here we emit the necessary instruction(s) for // this scenario. - if (!InitStackActive || !E->isImplicit()) + if (!InitStackActive) return this->emitThis(E); - if (InitStackActive && !InitStack.empty()) { + if (!InitStack.empty()) { + // If our init stack is, for example: + // 0 Stack: 3 (decl) + // 1 Stack: 6 (init list) + // 2 Stack: 1 (field) + // 3 Stack: 6 (init list) + // 4 Stack: 1 (field) + // + // We want to find the LAST element in it that's an init list, + // which is marked with the K_InitList marker. The index right + // before that points to an init list. We need to find the + // elements before the K_InitList element that point to a base + // (e.g. a decl or This), optionally followed by field, elem, etc. + // In the example above, we want to emit elements [0..2]. unsigned StartIndex = 0; + unsigned EndIndex = 0; + // Find the init list. for (StartIndex = InitStack.size() - 1; StartIndex > 0; --StartIndex) { + if (InitStack[StartIndex].Kind == InitLink::K_InitList || + InitStack[StartIndex].Kind == InitLink::K_This) { + EndIndex = StartIndex; + --StartIndex; + break; + } + } + + // Walk backwards to find the base. 
+ for (; StartIndex > 0; --StartIndex) { + if (InitStack[StartIndex].Kind == InitLink::K_InitList) + continue; + if (InitStack[StartIndex].Kind != InitLink::K_Field && InitStack[StartIndex].Kind != InitLink::K_Elem) break; } - for (unsigned I = StartIndex, N = InitStack.size(); I != N; ++I) { + // Emit the instructions. + for (unsigned I = StartIndex; I != EndIndex; ++I) { + if (InitStack[I].Kind == InitLink::K_InitList) + continue; if (!InitStack[I].template emit(this, E)) return false; } diff --git a/clang/lib/AST/ByteCode/Compiler.h b/clang/lib/AST/ByteCode/Compiler.h index 71765b18cb1a9..2d5b76f789543 100644 --- a/clang/lib/AST/ByteCode/Compiler.h +++ b/clang/lib/AST/ByteCode/Compiler.h @@ -51,9 +51,11 @@ struct InitLink { K_Temp = 2, K_Decl = 3, K_Elem = 5, + K_InitList = 6 }; static InitLink This() { return InitLink{K_This}; } + static InitLink InitList() { return InitLink{K_InitList}; } static InitLink Field(unsigned Offset) { InitLink IL{K_Field}; IL.Offset = Offset; diff --git a/clang/test/AST/ByteCode/records.cpp b/clang/test/AST/ByteCode/records.cpp index 4601aface135e..d329219264d89 100644 --- a/clang/test/AST/ByteCode/records.cpp +++ b/clang/test/AST/ByteCode/records.cpp @@ -1678,3 +1678,9 @@ namespace NonConst { static_assert(s.getSize() == 10, ""); } } + +namespace ExplicitThisInTemporary { + struct B { B *p = this; }; + constexpr bool g(B b) { return &b == b.p; } + static_assert(g({}), ""); +} diff --git a/clang/test/AST/ByteCode/unions.cpp b/clang/test/AST/ByteCode/unions.cpp index 7b39bb1bb9316..e90b123c90de0 100644 --- a/clang/test/AST/ByteCode/unions.cpp +++ b/clang/test/AST/ByteCode/unions.cpp @@ -401,4 +401,24 @@ namespace UnionInBase { // both-note {{subobject 'y' is not initialized}} static_assert(return_uninit().a.x == 2); } + +/// FIXME: Our diagnostic here is a little off. +namespace One { + struct A { long x; }; + + union U; + constexpr A foo(U *up); + union U { + A a = foo(this); // both-note {{in call to 'foo(&u)'}} + int y; + }; + + constexpr A foo(U *up) { + return {up->y}; // both-note {{read of member 'y' of union}} + } + + constinit U u = {}; // both-error {{constant init}} \ + // both-note {{constinit}} +} + #endif From 04733fac1009fdf7cd89fb24997b1f8866a518fc Mon Sep 17 00:00:00 2001 From: David Spickett Date: Tue, 14 Jan 2025 09:52:27 +0000 Subject: [PATCH 391/408] [llvm][Docs] Formatting changes to LLDB release notes --- llvm/docs/ReleaseNotes.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index be98725700178..f2a706cd3324d 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -417,7 +417,7 @@ Changes to LLDB * It is now possible to implement lldb commands in Python that use lldb's native command-line parser. In particular, that allows per-option/argument completion, with all the basic completers automatically supported and auto-generated help. - The command template file in the lldb/examples/python/cmdtemplate.py has been updated to show how to use this. + The command template file in `lldb/examples/python/cmdtemplate.py` has been updated to show how to use this. * Breakpoints on "inlined call sites" are now supported. Previous to this fix, breakpoints on source lines that only contained inlined call sites would be moved to the next source line, causing you to miss the inlined executions. @@ -450,12 +450,12 @@ Changes to LLDB _regexp-display -- Evaluate an expression at every stop (see 'h... 
```

-  * Minidumps generated by LLDB now support:
-    * 64 bit memory (due to 64b support, Minidumps are now paged to disk while being written).
-    * Capturing of TLS variables.
-    * Multiple signals or exceptions, including breakpoints.
+* Minidumps generated by LLDB now support:
+  * 64 bit memory (due to 64b support, Minidumps are now paged to disk while being written).
+  * Capturing of TLS variables.
+  * Multiple signals or exceptions, including breakpoints.

-  * [New Core File API](https://lldb.llvm.org/python_api/lldb.SBSaveCoreOptions.html). This gives greater control on the data captured into the core file, relative to the existing `process save-core` styles.
+* [New Core File API](https://lldb.llvm.org/python_api/lldb.SBSaveCoreOptions.html). This gives greater control on the data captured into the core file, relative to the existing `process save-core` styles.

 Changes to BOLT
 ---------------------------------

From cfd7e024c6a97b0083f2e25a9d03d7dd516a0452 Mon Sep 17 00:00:00 2001
From: David Spickett
Date: Tue, 14 Jan 2025 10:00:15 +0000
Subject: [PATCH 392/408] [llvm][Docs] Add release note for lldb-server port
 mapping changes

---
 llvm/docs/ReleaseNotes.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index f2a706cd3324d..bf753ccee4c59 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -457,6 +457,11 @@ Changes to LLDB

 * [New Core File API](https://lldb.llvm.org/python_api/lldb.SBSaveCoreOptions.html). This gives greater control on the data captured into the core file, relative to the existing `process save-core` styles.

+* `lldb-server` now listens to a single port for gdbserver connections and provides
+  that port to the connection handler processes. This means that only 2 ports need
+  to be opened in the firewall (one for the `lldb-server` platform, one for gdbserver connections).
+  In addition, due to this work, `lldb-server` now works on Windows in server mode.
+
 Changes to BOLT
 ---------------------------------

From 05f9cdd58de0a11819c392f6b09beddb809bf395 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nathan=20Gau=C3=ABr?=
Date: Tue, 14 Jan 2025 11:09:48 +0100
Subject: [PATCH 393/408] [CI] Remove Check Clang Format from watched
 workflows (#122740)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This was useful to test metrics before we had an actual workflow; now it
generates noise.

Signed-off-by: Nathan Gauër
---
 .ci/metrics/metrics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.ci/metrics/metrics.py b/.ci/metrics/metrics.py
index cbff478b9ba15..8edc00bc6bd37 100644
--- a/.ci/metrics/metrics.py
+++ b/.ci/metrics/metrics.py
@@ -12,7 +12,7 @@
     "https://influx-prod-13-prod-us-east-0.grafana.net/api/v1/push/influx/write"
 )
 GITHUB_PROJECT = "llvm/llvm-project"
-WORKFLOWS_TO_TRACK = ["Check code formatting", "LLVM Premerge Checks"]
+WORKFLOWS_TO_TRACK = ["LLVM Premerge Checks"]
 SCRAPE_INTERVAL_SECONDS = 5 * 60

From 89063433792699c5913ba116cab09b534c549e56 Mon Sep 17 00:00:00 2001
From: Sergio Afonso
Date: Tue, 14 Jan 2025 10:19:45 +0000
Subject: [PATCH 394/408] [MLIR][OpenMP] Add the host_eval clause (#116048)

This patch adds the definition of a new entry block argument-defining
`host_eval` clause.
This is intended to implement the passthrough approach discussed in [this RFC](https://discourse.llvm.org/t/rfc-openmp-dialect-representation-of-num-teams-thread-limit-and-target-spmd/81106), for supporting host-evaluated clauses that apply to operations nested inside of `omp.target`. --- mlir/docs/Dialects/OpenMPDialect/_index.md | 3 +- .../mlir/Dialect/OpenMP/OpenMPClauses.td | 38 +++++++++++++++++++ .../Dialect/OpenMP/OpenMPOpsInterfaces.td | 31 ++++++++++++--- mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 9 +++++ 4 files changed, 74 insertions(+), 7 deletions(-) diff --git a/mlir/docs/Dialects/OpenMPDialect/_index.md b/mlir/docs/Dialects/OpenMPDialect/_index.md index 3d28fe7819129..03d5b95217cce 100644 --- a/mlir/docs/Dialects/OpenMPDialect/_index.md +++ b/mlir/docs/Dialects/OpenMPDialect/_index.md @@ -297,7 +297,8 @@ arguments for the region of that MLIR operation. This enables, for example, the introduction of private copies of the same underlying variable defined outside the MLIR operation the clause is attached to. Currently, clauses with this property can be classified into three main categories: - - Map-like clauses: `map`, `use_device_addr` and `use_device_ptr`. + - Map-like clauses: `host_eval` (compiler internal, not defined by the OpenMP + specification), `map`, `use_device_addr` and `use_device_ptr`. - Reduction-like clauses: `in_reduction`, `reduction` and `task_reduction`. - Privatization clauses: `private`. diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td index 98d2e80ed2d81..8af054be322a5 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td @@ -470,6 +470,44 @@ class OpenMP_HintClauseSkip< def OpenMP_HintClause : OpenMP_HintClauseSkip<>; +//===----------------------------------------------------------------------===// +// Not in the spec: Clause-like structure to hold host-evaluated values. +//===----------------------------------------------------------------------===// + +class OpenMP_HostEvalClauseSkip< + bit traits = false, bit arguments = false, bit assemblyFormat = false, + bit description = false, bit extraClassDeclaration = false + > : OpenMP_Clause { + let traits = [ + BlockArgOpenMPOpInterface, IsolatedFromAbove + ]; + + let arguments = (ins + Variadic:$host_eval_vars + ); + + let extraClassDeclaration = [{ + unsigned numHostEvalBlockArgs() { + return getHostEvalVars().size(); + } + }]; + + let description = [{ + The optional `host_eval_vars` holds values defined outside of the region of + the `IsolatedFromAbove` operation for which a corresponding entry block + argument is defined. The only legal uses for these captured values are the + following: + - `num_teams` or `thread_limit` clause of an immediately nested + `omp.teams` operation. + - If the operation is the top-level `omp.target` of a target SPMD kernel: + - `num_threads` clause of the nested `omp.parallel` operation. + - Bounds and steps of the nested `omp.loop_nest` operation. 
+ }]; +} + +def OpenMP_HostEvalClause : OpenMP_HostEvalClauseSkip<>; + //===----------------------------------------------------------------------===// // V5.2: [3.4] `if` clause //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td index c4cf0f7afb3a3..c863e5772032c 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td @@ -25,6 +25,10 @@ def BlockArgOpenMPOpInterface : OpInterface<"BlockArgOpenMPOpInterface"> { let methods = [ // Default-implemented methods to be overriden by the corresponding clauses. + InterfaceMethod<"Get number of block arguments defined by `host_eval`.", + "unsigned", "numHostEvalBlockArgs", (ins), [{}], [{ + return 0; + }]>, InterfaceMethod<"Get number of block arguments defined by `in_reduction`.", "unsigned", "numInReductionBlockArgs", (ins), [{}], [{ return 0; @@ -54,10 +58,16 @@ def BlockArgOpenMPOpInterface : OpInterface<"BlockArgOpenMPOpInterface"> { return 0; }]>, - // Unified access methods for clause-associated entry block arguments. + // Unified access methods for start indices of clause-associated entry block + // arguments. + InterfaceMethod<"Get start index of block arguments defined by `host_eval`.", + "unsigned", "getHostEvalBlockArgsStart", (ins), [{ + return 0; + }]>, InterfaceMethod<"Get start index of block arguments defined by `in_reduction`.", "unsigned", "getInReductionBlockArgsStart", (ins), [{ - return 0; + auto iface = ::llvm::cast(*$_op); + return iface.getHostEvalBlockArgsStart() + $_op.numHostEvalBlockArgs(); }]>, InterfaceMethod<"Get start index of block arguments defined by `map`.", "unsigned", "getMapBlockArgsStart", (ins), [{ @@ -91,6 +101,14 @@ def BlockArgOpenMPOpInterface : OpInterface<"BlockArgOpenMPOpInterface"> { return iface.getUseDeviceAddrBlockArgsStart() + $_op.numUseDeviceAddrBlockArgs(); }]>, + // Unified access methods for clause-associated entry block arguments. 
+ InterfaceMethod<"Get block arguments defined by `host_eval`.", + "::llvm::MutableArrayRef<::mlir::BlockArgument>", + "getHostEvalBlockArgs", (ins), [{ + auto iface = ::llvm::cast(*$_op); + return $_op->getRegion(0).getArguments().slice( + iface.getHostEvalBlockArgsStart(), $_op.numHostEvalBlockArgs()); + }]>, InterfaceMethod<"Get block arguments defined by `in_reduction`.", "::llvm::MutableArrayRef<::mlir::BlockArgument>", "getInReductionBlockArgs", (ins), [{ @@ -147,10 +165,11 @@ def BlockArgOpenMPOpInterface : OpInterface<"BlockArgOpenMPOpInterface"> { let verify = [{ auto iface = ::llvm::cast($_op); - unsigned expectedArgs = iface.numInReductionBlockArgs() + - iface.numMapBlockArgs() + iface.numPrivateBlockArgs() + - iface.numReductionBlockArgs() + iface.numTaskReductionBlockArgs() + - iface.numUseDeviceAddrBlockArgs() + iface.numUseDevicePtrBlockArgs(); + unsigned expectedArgs = iface.numHostEvalBlockArgs() + + iface.numInReductionBlockArgs() + iface.numMapBlockArgs() + + iface.numPrivateBlockArgs() + iface.numReductionBlockArgs() + + iface.numTaskReductionBlockArgs() + iface.numUseDeviceAddrBlockArgs() + + iface.numUseDevicePtrBlockArgs(); if ($_op->getRegion(0).getNumArguments() < expectedArgs) return $_op->emitOpError() << "expected at least " << expectedArgs << " entry block argument(s)"; diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index ca7e08e9f18b5..2235fe2ee668d 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -504,6 +504,7 @@ struct ReductionParseArgs { : vars(vars), types(types), byref(byref), syms(syms) {} }; struct AllRegionParseArgs { + std::optional hostEvalArgs; std::optional inReductionArgs; std::optional mapArgs; std::optional privateArgs; @@ -647,6 +648,11 @@ static ParseResult parseBlockArgRegion(OpAsmParser &parser, Region ®ion, AllRegionParseArgs args) { llvm::SmallVector entryBlockArgs; + if (failed(parseBlockArgClause(parser, entryBlockArgs, "host_eval", + args.hostEvalArgs))) + return parser.emitError(parser.getCurrentLocation()) + << "invalid `host_eval` format"; + if (failed(parseBlockArgClause(parser, entryBlockArgs, "in_reduction", args.inReductionArgs))) return parser.emitError(parser.getCurrentLocation()) @@ -812,6 +818,7 @@ struct ReductionPrintArgs { : vars(vars), types(types), byref(byref), syms(syms) {} }; struct AllRegionPrintArgs { + std::optional hostEvalArgs; std::optional inReductionArgs; std::optional mapArgs; std::optional privateArgs; @@ -902,6 +909,8 @@ static void printBlockArgRegion(OpAsmPrinter &p, Operation *op, Region ®ion, auto iface = llvm::cast(op); MLIRContext *ctx = op->getContext(); + printBlockArgClause(p, ctx, "host_eval", iface.getHostEvalBlockArgs(), + args.hostEvalArgs); printBlockArgClause(p, ctx, "in_reduction", iface.getInReductionBlockArgs(), args.inReductionArgs); printBlockArgClause(p, ctx, "map_entries", iface.getMapBlockArgs(), From 9d7d8d2c87b3503681b362f6391d97227c62c2e8 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Tue, 14 Jan 2025 10:21:46 +0000 Subject: [PATCH 395/408] [MLIR][OpenMP] Add host_eval clause to omp.target (#116049) This patch adds the `host_eval` clause to the `omp.target` operation. Additionally, it updates its op verifier to make sure all uses of block arguments defined by this clause fall within one of the few cases where they are allowed. MLIR to LLVM IR translation fails on translation of this clause with a not-yet-implemented error. 
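For reference, a minimal sketch of the new clause in use, mirroring the
`ops.mlir` test added below (`%x` stands for any host-side `i32` value):

```mlir
// %x is evaluated on the host and forwarded through host_eval; inside the
// region, its only legal uses here are as 'num_teams' and 'thread_limit' on
// the immediately nested omp.teams.
omp.target host_eval(%x -> %nt : i32) {
  omp.teams num_teams(to %nt : i32) thread_limit(%nt : i32) {
    omp.terminator
  }
  omp.terminator
}
```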
--- mlir/docs/Dialects/OpenMPDialect/_index.md | 58 ++++- .../mlir/Dialect/OpenMP/OpenMPDialect.h | 1 + mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 33 ++- mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 206 +++++++++++++++++- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 5 + mlir/test/Dialect/OpenMP/invalid.mlir | 94 +++++++- mlir/test/Dialect/OpenMP/ops.mlir | 54 ++++- mlir/test/Target/LLVMIR/openmp-todo.mlir | 14 ++ 8 files changed, 446 insertions(+), 19 deletions(-) diff --git a/mlir/docs/Dialects/OpenMPDialect/_index.md b/mlir/docs/Dialects/OpenMPDialect/_index.md index 03d5b95217cce..b651b3c06485c 100644 --- a/mlir/docs/Dialects/OpenMPDialect/_index.md +++ b/mlir/docs/Dialects/OpenMPDialect/_index.md @@ -298,7 +298,8 @@ introduction of private copies of the same underlying variable defined outside the MLIR operation the clause is attached to. Currently, clauses with this property can be classified into three main categories: - Map-like clauses: `host_eval` (compiler internal, not defined by the OpenMP - specification), `map`, `use_device_addr` and `use_device_ptr`. + specification: [see more](#host-evaluated-clauses-in-target-regions)), `map`, + `use_device_addr` and `use_device_ptr`. - Reduction-like clauses: `in_reduction`, `reduction` and `task_reduction`. - Privatization clauses: `private`. @@ -523,3 +524,58 @@ omp.parallel ... { omp.terminator } {omp.composite} ``` + +## Host-Evaluated Clauses in Target Regions + +The `omp.target` operation, which represents the OpenMP `target` construct, is +marked with the `IsolatedFromAbove` trait. This means that, inside of its +region, no MLIR values defined outside of the op itself can be used. This is +consistent with the OpenMP specification of the `target` construct, which +mandates that all host device values used inside of the `target` region must +either be privatized (data-sharing) or mapped (data-mapping). + +Normally, clauses applied to a construct are evaluated before entering that +construct. Further, in some cases, the OpenMP specification stipulates that +clauses be evaluated _on the host device_ on entry to a parent `target` +construct. In particular, the `num_teams` and `thread_limit` clauses of the +`teams` construct must be evaluated on the host device if it's nested inside or +combined with a `target` construct. + +Additionally, the runtime library targeted by the MLIR to LLVM IR translation of +the OpenMP dialect supports the optimized launch of SPMD kernels (i.e. +`target teams distribute parallel {do,for}` in OpenMP), which requires +specifying in advance what the total trip count of the loop is. Consequently, it +is also beneficial to evaluate the trip count on the host device prior to the +kernel launch. + +These host-evaluated values in MLIR would need to be placed outside of the +`omp.target` region and also attached to the corresponding nested operations, +which is not possible because of the `IsolatedFromAbove` trait. The solution +implemented to address this problem has been to introduce the `host_eval` +argument to the `omp.target` operation. It works similarly to a `map` clause, +but its only intended use is to forward host-evaluated values to their +corresponding operation inside of the region. Any uses outside of the previously +described result in a verifier error. + +```mlir +// Initialize %0, %1, %2, %3... 
+omp.target host_eval(%0 -> %nt, %1 -> %lb, %2 -> %ub, %3 -> %step : i32, i32, i32, i32) { + omp.teams num_teams(to %nt : i32) { + omp.parallel { + omp.distribute { + omp.wsloop { + omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) { + // ... + omp.yield + } + omp.terminator + } {omp.composite} + omp.terminator + } {omp.composite} + omp.terminator + } {omp.composite} + omp.terminator + } + omp.terminator +} +``` diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPDialect.h b/mlir/include/mlir/Dialect/OpenMP/OpenMPDialect.h index bee21432196e4..248ac2eb72c61 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPDialect.h +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPDialect.h @@ -22,6 +22,7 @@ #include "mlir/IR/SymbolTable.h" #include "mlir/Interfaces/ControlFlowInterfaces.h" #include "mlir/Interfaces/SideEffectInterfaces.h" +#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h" #define GET_TYPEDEF_CLASSES #include "mlir/Dialect/OpenMP/OpenMPOpsTypes.h.inc" diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index 65aa260a80cc0..c5b8890436708 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -1224,10 +1224,10 @@ def TargetOp : OpenMP_Op<"target", traits = [ ], clauses = [ // TODO: Complete clause list (defaultmap, uses_allocators). OpenMP_AllocateClause, OpenMP_BareClause, OpenMP_DependClause, - OpenMP_DeviceClause, OpenMP_HasDeviceAddrClause, OpenMP_IfClause, - OpenMP_InReductionClause, OpenMP_IsDevicePtrClause, + OpenMP_DeviceClause, OpenMP_HasDeviceAddrClause, OpenMP_HostEvalClause, + OpenMP_IfClause, OpenMP_InReductionClause, OpenMP_IsDevicePtrClause, OpenMP_MapClauseSkip, OpenMP_NowaitClause, - OpenMP_PrivateClause, OpenMP_ThreadLimitClause, + OpenMP_PrivateClause, OpenMP_ThreadLimitClause ], singleRegion = true> { let summary = "target construct"; let description = [{ @@ -1269,17 +1269,34 @@ def TargetOp : OpenMP_Op<"target", traits = [ return getMapVars()[mapInfoOpIdx]; } + + /// Returns the innermost OpenMP dialect operation captured by this target + /// construct. For an operation to be detected as captured, it must be + /// inside a (possibly multi-level) nest of OpenMP dialect operation's + /// regions where none of these levels contain other operations considered + /// not-allowed for these purposes (i.e. only terminator operations are + /// allowed from the OpenMP dialect, and other dialect's operations are + /// allowed as long as they don't have a memory write effect). + /// + /// If there are omp.loop_nest operations in the sequence of nested + /// operations, the top level one will be the one captured. + Operation *getInnermostCapturedOmpOp(); + + /// Infers the kernel type (Generic, SPMD or Generic-SPMD) based on the + /// contents of the target region. 
+ llvm::omp::OMPTgtExecModeFlags getKernelExecFlags(); }] # clausesExtraClassDeclaration; let assemblyFormat = clausesAssemblyFormat # [{ - custom( - $region, $in_reduction_vars, type($in_reduction_vars), - $in_reduction_byref, $in_reduction_syms, $map_vars, type($map_vars), - $private_vars, type($private_vars), $private_syms, $private_maps) - attr-dict + custom( + $region, $host_eval_vars, type($host_eval_vars), $in_reduction_vars, + type($in_reduction_vars), $in_reduction_byref, $in_reduction_syms, + $map_vars, type($map_vars), $private_vars, type($private_vars), + $private_syms, $private_maps) attr-dict }]; let hasVerifier = 1; + let hasRegionVerifier = 1; } diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 2235fe2ee668d..5a619254a5ee1 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -31,6 +31,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" +#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h" #include #include #include @@ -691,8 +692,10 @@ static ParseResult parseBlockArgRegion(OpAsmParser &parser, Region ®ion, return parser.parseRegion(region, entryBlockArgs); } -static ParseResult parseInReductionMapPrivateRegion( +static ParseResult parseHostEvalInReductionMapPrivateRegion( OpAsmParser &parser, Region ®ion, + SmallVectorImpl &hostEvalVars, + SmallVectorImpl &hostEvalTypes, SmallVectorImpl &inReductionVars, SmallVectorImpl &inReductionTypes, DenseBoolArrayAttr &inReductionByref, ArrayAttr &inReductionSyms, @@ -702,6 +705,7 @@ static ParseResult parseInReductionMapPrivateRegion( llvm::SmallVectorImpl &privateTypes, ArrayAttr &privateSyms, DenseI64ArrayAttr &privateMaps) { AllRegionParseArgs args; + args.hostEvalArgs.emplace(hostEvalVars, hostEvalTypes); args.inReductionArgs.emplace(inReductionVars, inReductionTypes, inReductionByref, inReductionSyms); args.mapArgs.emplace(mapVars, mapTypes); @@ -931,13 +935,15 @@ static void printBlockArgRegion(OpAsmPrinter &p, Operation *op, Region ®ion, p.printRegion(region, /*printEntryBlockArgs=*/false); } -static void printInReductionMapPrivateRegion( - OpAsmPrinter &p, Operation *op, Region ®ion, ValueRange inReductionVars, +static void printHostEvalInReductionMapPrivateRegion( + OpAsmPrinter &p, Operation *op, Region ®ion, ValueRange hostEvalVars, + TypeRange hostEvalTypes, ValueRange inReductionVars, TypeRange inReductionTypes, DenseBoolArrayAttr inReductionByref, ArrayAttr inReductionSyms, ValueRange mapVars, TypeRange mapTypes, ValueRange privateVars, TypeRange privateTypes, ArrayAttr privateSyms, DenseI64ArrayAttr privateMaps) { AllRegionPrintArgs args; + args.hostEvalArgs.emplace(hostEvalVars, hostEvalTypes); args.inReductionArgs.emplace(inReductionVars, inReductionTypes, inReductionByref, inReductionSyms); args.mapArgs.emplace(mapVars, mapTypes); @@ -1720,11 +1726,12 @@ void TargetOp::build(OpBuilder &builder, OperationState &state, TargetOp::build(builder, state, /*allocate_vars=*/{}, /*allocator_vars=*/{}, clauses.bare, makeArrayAttr(ctx, clauses.dependKinds), clauses.dependVars, clauses.device, clauses.hasDeviceAddrVars, - clauses.ifExpr, /*in_reduction_vars=*/{}, - /*in_reduction_byref=*/nullptr, /*in_reduction_syms=*/nullptr, - clauses.isDevicePtrVars, clauses.mapVars, clauses.nowait, - clauses.privateVars, makeArrayAttr(ctx, clauses.privateSyms), - clauses.threadLimit, /*private_maps=*/nullptr); + clauses.hostEvalVars, clauses.ifExpr, + 
/*in_reduction_vars=*/{}, /*in_reduction_byref=*/nullptr, + /*in_reduction_syms=*/nullptr, clauses.isDevicePtrVars, + clauses.mapVars, clauses.nowait, clauses.privateVars, + makeArrayAttr(ctx, clauses.privateSyms), clauses.threadLimit, + /*private_maps=*/nullptr); } LogicalResult TargetOp::verify() { @@ -1742,6 +1749,189 @@ LogicalResult TargetOp::verify() { return verifyPrivateVarsMapping(*this); } +LogicalResult TargetOp::verifyRegions() { + auto teamsOps = getOps(); + if (std::distance(teamsOps.begin(), teamsOps.end()) > 1) + return emitError("target containing multiple 'omp.teams' nested ops"); + + // Check that host_eval values are only used in legal ways. + llvm::omp::OMPTgtExecModeFlags execFlags = getKernelExecFlags(); + for (Value hostEvalArg : + cast(getOperation()).getHostEvalBlockArgs()) { + for (Operation *user : hostEvalArg.getUsers()) { + if (auto teamsOp = dyn_cast(user)) { + if (llvm::is_contained({teamsOp.getNumTeamsLower(), + teamsOp.getNumTeamsUpper(), + teamsOp.getThreadLimit()}, + hostEvalArg)) + continue; + + return emitOpError() << "host_eval argument only legal as 'num_teams' " + "and 'thread_limit' in 'omp.teams'"; + } + if (auto parallelOp = dyn_cast(user)) { + if (execFlags == llvm::omp::OMP_TGT_EXEC_MODE_SPMD && + hostEvalArg == parallelOp.getNumThreads()) + continue; + + return emitOpError() + << "host_eval argument only legal as 'num_threads' in " + "'omp.parallel' when representing target SPMD"; + } + if (auto loopNestOp = dyn_cast(user)) { + if (execFlags != llvm::omp::OMP_TGT_EXEC_MODE_GENERIC && + (llvm::is_contained(loopNestOp.getLoopLowerBounds(), hostEvalArg) || + llvm::is_contained(loopNestOp.getLoopUpperBounds(), hostEvalArg) || + llvm::is_contained(loopNestOp.getLoopSteps(), hostEvalArg))) + continue; + + return emitOpError() << "host_eval argument only legal as loop bounds " + "and steps in 'omp.loop_nest' when " + "representing target SPMD or Generic-SPMD"; + } + + return emitOpError() << "host_eval argument illegal use in '" + << user->getName() << "' operation"; + } + } + return success(); +} + +/// Only allow OpenMP terminators and non-OpenMP ops that have known memory +/// effects, but don't include a memory write effect. +static bool siblingAllowedInCapture(Operation *op) { + if (!op) + return false; + + bool isOmpDialect = + op->getContext()->getLoadedDialect() == + op->getDialect(); + + if (isOmpDialect) + return op->hasTrait(); + + if (auto memOp = dyn_cast(op)) { + SmallVector, 4> effects; + memOp.getEffects(effects); + return !llvm::any_of(effects, [&](MemoryEffects::EffectInstance &effect) { + return isa(effect.getEffect()) && + isa( + effect.getResource()); + }); + } + return true; +} + +Operation *TargetOp::getInnermostCapturedOmpOp() { + Dialect *ompDialect = (*this)->getDialect(); + Operation *capturedOp = nullptr; + DominanceInfo domInfo; + + // Process in pre-order to check operations from outermost to innermost, + // ensuring we only enter the region of an operation if it meets the criteria + // for being captured. We stop the exploration of nested operations as soon as + // we process a region holding no operations to be captured. + walk([&](Operation *op) { + if (op == *this) + return WalkResult::advance(); + + // Ignore operations of other dialects or omp operations with no regions, + // because these will only be checked if they are siblings of an omp + // operation that can potentially be captured. 
+ bool isOmpDialect = op->getDialect() == ompDialect; + bool hasRegions = op->getNumRegions() > 0; + if (!isOmpDialect || !hasRegions) + return WalkResult::skip(); + + // This operation cannot be captured if it can be executed more than once + // (i.e. its block's successors can reach it) or if it's not guaranteed to + // be executed before all exits of the region (i.e. it doesn't dominate all + // blocks with no successors reachable from the entry block). + Region *parentRegion = op->getParentRegion(); + Block *parentBlock = op->getBlock(); + + for (Block *successor : parentBlock->getSuccessors()) + if (successor->isReachable(parentBlock)) + return WalkResult::interrupt(); + + for (Block &block : *parentRegion) + if (domInfo.isReachableFromEntry(&block) && block.hasNoSuccessors() && + !domInfo.dominates(parentBlock, &block)) + return WalkResult::interrupt(); + + // Don't capture this op if it has a not-allowed sibling, and stop recursing + // into nested operations. + for (Operation &sibling : op->getParentRegion()->getOps()) + if (&sibling != op && !siblingAllowedInCapture(&sibling)) + return WalkResult::interrupt(); + + // Don't continue capturing nested operations if we reach an omp.loop_nest. + // Otherwise, process the contents of this operation. + capturedOp = op; + return llvm::isa(op) ? WalkResult::interrupt() + : WalkResult::advance(); + }); + + return capturedOp; +} + +llvm::omp::OMPTgtExecModeFlags TargetOp::getKernelExecFlags() { + using namespace llvm::omp; + + // Make sure this region is capturing a loop. Otherwise, it's a generic + // kernel. + Operation *capturedOp = getInnermostCapturedOmpOp(); + if (!isa_and_present(capturedOp)) + return OMP_TGT_EXEC_MODE_GENERIC; + + SmallVector wrappers; + cast(capturedOp).gatherWrappers(wrappers); + assert(!wrappers.empty()); + + // Ignore optional SIMD leaf construct. + auto *innermostWrapper = wrappers.begin(); + if (isa(innermostWrapper)) + innermostWrapper = std::next(innermostWrapper); + + long numWrappers = std::distance(innermostWrapper, wrappers.end()); + + // Detect Generic-SPMD: target-teams-distribute[-simd]. + if (numWrappers == 1) { + if (!isa(innermostWrapper)) + return OMP_TGT_EXEC_MODE_GENERIC; + + Operation *teamsOp = (*innermostWrapper)->getParentOp(); + if (!isa_and_present(teamsOp)) + return OMP_TGT_EXEC_MODE_GENERIC; + + if (teamsOp->getParentOp() == *this) + return OMP_TGT_EXEC_MODE_GENERIC_SPMD; + } + + // Detect SPMD: target-teams-distribute-parallel-wsloop[-simd]. 
+  if (numWrappers == 2) {
+    if (!isa<WsloopOp>(innermostWrapper))
+      return OMP_TGT_EXEC_MODE_GENERIC;
+
+    innermostWrapper = std::next(innermostWrapper);
+    if (!isa<DistributeOp>(innermostWrapper))
+      return OMP_TGT_EXEC_MODE_GENERIC;
+
+    Operation *parallelOp = (*innermostWrapper)->getParentOp();
+    if (!isa_and_present<ParallelOp>(parallelOp))
+      return OMP_TGT_EXEC_MODE_GENERIC;
+
+    Operation *teamsOp = parallelOp->getParentOp();
+    if (!isa_and_present<TeamsOp>(teamsOp))
+      return OMP_TGT_EXEC_MODE_GENERIC;
+
+    if (teamsOp->getParentOp() == *this)
+      return OMP_TGT_EXEC_MODE_SPMD;
+  }
+
+  return OMP_TGT_EXEC_MODE_GENERIC;
+}
+
 //===----------------------------------------------------------------------===//
 // ParallelOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index d6112fa9af118..35a3750e02a66 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -174,6 +174,10 @@ static LogicalResult checkImplementationStatus(Operation &op) {
     if (op.getHint())
       op.emitWarning("hint clause discarded");
   };
+  auto checkHostEval = [&todo](auto op, LogicalResult &result) {
+    if (!op.getHostEvalVars().empty())
+      result = todo("host_eval");
+  };
   auto checkIf = [&todo](auto op, LogicalResult &result) {
     if (op.getIfExpr())
       result = todo("if");
@@ -284,6 +288,7 @@ static LogicalResult checkImplementationStatus(Operation &op) {
         checkBare(op, result);
         checkDevice(op, result);
         checkHasDeviceAddr(op, result);
+        checkHostEval(op, result);
         checkIf(op, result);
         checkInReduction(op, result);
         checkIsDevicePtr(op, result);
diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir
index 1fbb4c93e855b..c611614265592 100644
--- a/mlir/test/Dialect/OpenMP/invalid.mlir
+++ b/mlir/test/Dialect/OpenMP/invalid.mlir
@@ -2138,11 +2138,103 @@ func.func @omp_target_update_data_depend(%a: memref<?xi32>) {

 // -----

+func.func @omp_target_multiple_teams() {
+  // expected-error @below {{target containing multiple 'omp.teams' nested ops}}
+  omp.target {
+    omp.teams {
+      omp.terminator
+    }
+    omp.teams {
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
+
+// -----
+
+func.func @omp_target_host_eval(%x : !llvm.ptr) {
+  // expected-error @below {{op host_eval argument illegal use in 'llvm.load' operation}}
+  omp.target host_eval(%x -> %arg0 : !llvm.ptr) {
+    %0 = llvm.load %arg0 : !llvm.ptr -> f32
+    omp.terminator
+  }
+  return
+}
+
+// -----
+
+func.func @omp_target_host_eval_teams(%x : i1) {
+  // expected-error @below {{op host_eval argument only legal as 'num_teams' and 'thread_limit' in 'omp.teams'}}
+  omp.target host_eval(%x -> %arg0 : i1) {
+    omp.teams if(%arg0) {
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
+
+// -----
+
+func.func @omp_target_host_eval_parallel(%x : i32) {
+  // expected-error @below {{op host_eval argument only legal as 'num_threads' in 'omp.parallel' when representing target SPMD}}
+  omp.target host_eval(%x -> %arg0 : i32) {
+    omp.parallel num_threads(%arg0 : i32) {
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
+
+// -----
+
+func.func @omp_target_host_eval_loop1(%x : i32) {
+  // expected-error @below {{op host_eval argument only legal as loop bounds and steps in 'omp.loop_nest' when representing target SPMD or Generic-SPMD}}
+  omp.target host_eval(%x -> %arg0 : i32) {
+    omp.wsloop {
+      omp.loop_nest (%iv) : i32 = (%arg0) to (%arg0) step (%arg0) {
+        omp.yield
+      }
+    }
+    omp.terminator
+  }
+  return
+}
+
+// -----
+
+func.func @omp_target_host_eval_loop2(%x : i32) {
+  // expected-error @below {{op host_eval argument only legal as loop bounds and steps in 'omp.loop_nest' when representing target SPMD or Generic-SPMD}}
+  omp.target host_eval(%x -> %arg0 : i32) {
+    omp.teams {
+    ^bb0:
+      %0 = arith.constant 0 : i1
+      llvm.cond_br %0, ^bb1, ^bb2
+    ^bb1:
+      omp.distribute {
+        omp.loop_nest (%iv) : i32 = (%arg0) to (%arg0) step (%arg0) {
+          omp.yield
+        }
+      }
+      llvm.br ^bb2
+    ^bb2:
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
+
+// -----
+
 func.func @omp_target_depend(%data_var: memref<i32>) {
   // expected-error @below {{op expected as many depend values as depend variables}}
   "omp.target"(%data_var) ({
       "omp.terminator"() : () -> ()
-    }) {depend_kinds = [], operandSegmentSizes = array} : (memref<i32>) -> ()
+    }) {depend_kinds = [], operandSegmentSizes = array} : (memref<i32>) -> ()
   "func.return"() : () -> ()
 }
diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir
index 26943068ed95a..b1901c333ade8 100644
--- a/mlir/test/Dialect/OpenMP/ops.mlir
+++ b/mlir/test/Dialect/OpenMP/ops.mlir
@@ -770,7 +770,7 @@ func.func @omp_target(%if_cond : i1, %device : si32, %num_threads : i32, %devic
     "omp.target"(%device, %if_cond, %num_threads) ({
     // CHECK: omp.terminator
       omp.terminator
-    }) {nowait, operandSegmentSizes = array} : ( si32, i1, i32 ) -> ()
+    }) {nowait, operandSegmentSizes = array} : ( si32, i1, i32 ) -> ()

     // Test with optional map clause.
     // CHECK: %[[MAP_A:.*]] = omp.map.info var_ptr(%[[VAL_1:.*]] : memref<?xi32>, tensor<?xi32>) map_clauses(tofrom) capture(ByRef) -> memref<?xi32> {name = ""}
@@ -2778,6 +2778,58 @@ func.func @omp_target_private_with_map_idx(%map1: memref, %map2: memref

+// CHECK-LABEL: omp_target_host_eval
+func.func @omp_target_host_eval(%x : i32) {
+  // CHECK: omp.target host_eval(%{{.*}} -> %[[HOST_ARG:.*]] : i32) {
+  // CHECK: omp.teams num_teams( to %[[HOST_ARG]] : i32)
+  // CHECK-SAME: thread_limit(%[[HOST_ARG]] : i32)
+  omp.target host_eval(%x -> %arg0 : i32) {
+    omp.teams num_teams(to %arg0 : i32) thread_limit(%arg0 : i32) {
+      omp.terminator
+    }
+    omp.terminator
+  }
+
+  // CHECK: omp.target host_eval(%{{.*}} -> %[[HOST_ARG:.*]] : i32) {
+  // CHECK: omp.teams {
+  // CHECK: omp.parallel num_threads(%[[HOST_ARG]] : i32) {
+  // CHECK: omp.distribute {
+  // CHECK: omp.wsloop {
+  // CHECK: omp.loop_nest (%{{.*}}) : i32 = (%[[HOST_ARG]]) to (%[[HOST_ARG]]) step (%[[HOST_ARG]]) {
+  omp.target host_eval(%x -> %arg0 : i32) {
+    omp.teams {
+      omp.parallel num_threads(%arg0 : i32) {
+        omp.distribute {
+          omp.wsloop {
+            omp.loop_nest (%iv) : i32 = (%arg0) to (%arg0) step (%arg0) {
+              omp.yield
+            }
+          } {omp.composite}
+        } {omp.composite}
+        omp.terminator
+      } {omp.composite}
+      omp.terminator
+    }
+    omp.terminator
+  }
+
+  // CHECK: omp.target host_eval(%{{.*}} -> %[[HOST_ARG:.*]] : i32) {
+  // CHECK: omp.teams {
+  // CHECK: omp.distribute {
+  // CHECK: omp.loop_nest (%{{.*}}) : i32 = (%[[HOST_ARG]]) to (%[[HOST_ARG]]) step (%[[HOST_ARG]]) {
+  omp.target host_eval(%x -> %arg0 : i32) {
+    omp.teams {
+      omp.distribute {
+        omp.loop_nest (%iv) : i32 = (%arg0) to (%arg0) step (%arg0) {
+          omp.yield
+        }
+      }
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
+
 // CHECK-LABEL: omp_loop
 func.func @omp_loop(%lb : index, %ub : index, %step : index) {
   // CHECK: omp.loop {
diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir
index f8c9d911e3034..ffb865edff80d 100644
--- a/mlir/test/Target/LLVMIR/openmp-todo.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir
@@ -247,6 +247,20 @@ llvm.func @target_has_device_addr(%x : !llvm.ptr) {

 // -----

+llvm.func @target_host_eval(%x : i32) {
+  // expected-error@below {{not yet implemented: Unhandled clause host_eval in omp.target operation}}
+  // expected-error@below {{LLVM Translation failed for operation: omp.target}}
+  omp.target host_eval(%x -> %arg0 : i32) {
+    omp.teams num_teams(to %arg0 : i32) {
+      omp.terminator
+    }
+    omp.terminator
+  }
+  llvm.return
+}
+
+// -----
+
 llvm.func @target_if(%x : i1) {
   // expected-error@below {{not yet implemented: Unhandled clause if in omp.target operation}}
   // expected-error@below {{LLVM Translation failed for operation: omp.target}}

From cc3aab580b680e8566e9f7a1ff9feff895ecfc49 Mon Sep 17 00:00:00 2001
From: Acim Maravic
Date: Tue, 14 Jan 2025 11:22:20 +0100
Subject: [PATCH 396/408] [AMDGPU] Handle nontemporal and amdgpu.last.use metadata in amdgpu-lower-buffer-fat-pointers (#120139)

---
 .../AMDGPU/AMDGPULowerBufferFatPointers.cpp   |  12 -
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   3 +
 ...er-buffer-fat-pointers-lastuse-metadata.ll | 334 +++++++
 .../lower-buffer-fat-pointers-memops.ll       |  20 +-
 ...uffer-fat-pointers-nontemporal-metadata.ll | 886 ++++++++++++++++++
 5 files changed, 1233 insertions(+), 22 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index c7cdd7a37282c..8c6ea5f9bd0f8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -1074,18 +1074,6 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr,
     Args.push_back(IRB.getInt32(0));

   uint32_t Aux = 0;
-  bool IsInvariant =
-      (isa<LoadInst>(I) && I->getMetadata(LLVMContext::MD_invariant_load));
-  bool IsNonTemporal = I->getMetadata(LLVMContext::MD_nontemporal);
-  // Atomic loads and stores need glc, atomic read-modify-write doesn't.
-  bool IsOneWayAtomic =
-      !isa<AtomicRMWInst>(I) && Order != AtomicOrdering::NotAtomic;
-  if (IsOneWayAtomic)
-    Aux |= AMDGPU::CPol::GLC;
-  if (IsNonTemporal && !IsInvariant)
-    Aux |= AMDGPU::CPol::SLC;
-  if (isa<LoadInst>(I) && ST->getGeneration() == AMDGPUSubtarget::GFX10)
-    Aux |= (Aux & AMDGPU::CPol::GLC ?
AMDGPU::CPol::DLC : 0); if (IsVolatile) Aux |= AMDGPU::CPol::VOLATILE; Args.push_back(IRB.getInt32(Aux)); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 69dca988b2cad..21f1f20e5e69a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1202,6 +1202,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags = MachineMemOperand::MONone; if (CI.hasMetadata(LLVMContext::MD_invariant_load)) Info.flags |= MachineMemOperand::MOInvariant; + if (CI.hasMetadata(LLVMContext::MD_nontemporal)) + Info.flags |= MachineMemOperand::MONonTemporal; + Info.flags |= getTargetMMOFlags(CI); if (const AMDGPU::RsrcIntrinsic *RsrcIntr = AMDGPU::lookupRsrcIntrinsic(IntrID)) { diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll new file mode 100644 index 0000000000000..eaf8809d33fc3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll @@ -0,0 +1,334 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefix=GFX12 %s + + +define amdgpu_kernel void @buffer_last_use_load_0(ptr addrspace(7) %in, ptr addrspace(7) %out) { +; GFX12-LABEL: buffer_last_use_load_0: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x2 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x20 +; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x10 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9 +; GFX12-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:32 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: scratch_load_b64 v[5:6], off, off offset:40 +; GFX12-NEXT: scratch_load_b32 v4, off, off offset:36 +; GFX12-NEXT: s_load_b32 s1, s[4:5], 0x30 +; GFX12-NEXT: scratch_store_b128 off, v[7:10], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: scratch_load_b64 v[1:2], off, off offset:8 +; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 +; GFX12-NEXT: v_mov_b32_e32 v7, s6 +; GFX12-NEXT: v_mov_b32_e32 v9, s0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v3, s1 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x2 +; GFX12-NEXT: v_readfirstlane_b32 s4, v4 +; GFX12-NEXT: v_readfirstlane_b32 s5, v5 +; GFX12-NEXT: v_readfirstlane_b32 s6, v6 +; GFX12-NEXT: v_readfirstlane_b32 s7, v7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] +; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_LU +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GFX12-NEXT: ; implicit-def: $vgpr9 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 
+; GFX12-NEXT: s_cbranch_execnz .LBB0_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: v_mov_b32_e32 v4, s8 +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: .LBB0_3: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x1 +; GFX12-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_store_b32 v8, v4, s[4:7], null offen +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr8 +; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB0_3 +; GFX12-NEXT: ; %bb.4: +; GFX12-NEXT: s_endpgm +entry: + %val = load i32, ptr addrspace(7) %in, !amdgpu.last.use !{} + store i32 %val, ptr addrspace(7) %out + ret void +} + +define amdgpu_kernel void @buffer_last_use_load_1(ptr addrspace(7) %in, ptr addrspace(7) %out) { +; GFX12-LABEL: buffer_last_use_load_1: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x2 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x20 +; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x10 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 +; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX12-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX12-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX12-NEXT: scratch_store_b128 off, v[1:4], off offset:32 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: scratch_load_b64 v[6:7], off, off offset:40 +; GFX12-NEXT: scratch_load_b32 v5, off, off offset:36 +; GFX12-NEXT: s_load_b32 s1, s[4:5], 0x30 +; GFX12-NEXT: scratch_store_b128 off, v[8:11], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: scratch_load_b64 v[2:3], off, off offset:8 +; GFX12-NEXT: scratch_load_b32 v1, off, off offset:4 +; GFX12-NEXT: v_mov_b32_e32 v8, s6 +; GFX12-NEXT: v_lshl_add_u32 v9, v0, 2, s0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v4, s1 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x2 +; GFX12-NEXT: v_readfirstlane_b32 s4, v5 +; GFX12-NEXT: v_readfirstlane_b32 s5, v6 +; GFX12-NEXT: v_readfirstlane_b32 s6, v7 +; GFX12-NEXT: v_readfirstlane_b32 s7, v8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[5:6] +; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_b32 v0, v9, s[4:7], null offen th:TH_LOAD_LU +; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6_vgpr7_vgpr8 +; GFX12-NEXT: ; implicit-def: $vgpr9 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; 
GFX12-NEXT: s_cbranch_execnz .LBB1_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: v_mov_b32_e32 v5, s8 +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x1 +; GFX12-NEXT: v_readfirstlane_b32 s4, v1 +; GFX12-NEXT: v_readfirstlane_b32 s5, v2 +; GFX12-NEXT: v_readfirstlane_b32 s6, v3 +; GFX12-NEXT: v_readfirstlane_b32 s7, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] +; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_store_b32 v0, v5, s[4:7], null offen +; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 +; GFX12-NEXT: ; implicit-def: $vgpr0 +; GFX12-NEXT: ; implicit-def: $vgpr5 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB1_3 +; GFX12-NEXT: ; %bb.4: +; GFX12-NEXT: s_endpgm +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %val.gep = getelementptr inbounds i32, ptr addrspace(7) %in, i32 %tid + %val = load i32, ptr addrspace(7) %val.gep, align 4, !amdgpu.last.use !{} + store i32 %val, ptr addrspace(7) %out + ret void +} + +define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %in, ptr addrspace(7) %out) { +; GFX12-LABEL: buffer_last_use_and_volatile_load: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x2 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x20 +; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x10 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9 +; GFX12-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:32 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: scratch_load_b64 v[5:6], off, off offset:40 +; GFX12-NEXT: scratch_load_b32 v4, off, off offset:36 +; GFX12-NEXT: s_load_b32 s1, s[4:5], 0x30 +; GFX12-NEXT: scratch_store_b128 off, v[7:10], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: scratch_load_b64 v[1:2], off, off offset:8 +; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 +; GFX12-NEXT: v_mov_b32_e32 v7, s6 +; GFX12-NEXT: v_mov_b32_e32 v9, s0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v3, s1 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x2 +; GFX12-NEXT: v_readfirstlane_b32 s4, v4 +; GFX12-NEXT: v_readfirstlane_b32 s5, v5 +; GFX12-NEXT: v_readfirstlane_b32 s6, v6 +; GFX12-NEXT: v_readfirstlane_b32 s7, v7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] +; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 
+; GFX12-NEXT: ; implicit-def: $vgpr9 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB2_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: v_mov_b32_e32 v4, s8 +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: .LBB2_3: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x1 +; GFX12-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_store_b32 v8, v4, s[4:7], null offen +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr8 +; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB2_3 +; GFX12-NEXT: ; %bb.4: +; GFX12-NEXT: s_endpgm +entry: + %val = load volatile i32, ptr addrspace(7) %in, !amdgpu.last.use !{} + store i32 %val, ptr addrspace(7) %out + ret void +} + +define amdgpu_kernel void @buffer_last_use_and_nontemporal_load(ptr addrspace(7) %in, ptr addrspace(7) %out) { +; GFX12-LABEL: buffer_last_use_and_nontemporal_load: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x2 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x20 +; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x10 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9 +; GFX12-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:32 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: scratch_load_b64 v[5:6], off, off offset:40 +; GFX12-NEXT: scratch_load_b32 v4, off, off offset:36 +; GFX12-NEXT: s_load_b32 s1, s[4:5], 0x30 +; GFX12-NEXT: scratch_store_b128 off, v[7:10], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: scratch_load_b64 v[1:2], off, off offset:8 +; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 +; GFX12-NEXT: v_mov_b32_e32 v7, s6 +; GFX12-NEXT: v_mov_b32_e32 v9, s0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v3, s1 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x2 +; GFX12-NEXT: v_readfirstlane_b32 s4, v4 +; GFX12-NEXT: v_readfirstlane_b32 s5, v5 +; GFX12-NEXT: v_readfirstlane_b32 s6, v6 +; GFX12-NEXT: v_readfirstlane_b32 s7, v7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] +; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_LU +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GFX12-NEXT: ; 
implicit-def: $vgpr9 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB3_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: v_mov_b32_e32 v4, s8 +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: .LBB3_3: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x1 +; GFX12-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_store_b32 v8, v4, s[4:7], null offen +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr8 +; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB3_3 +; GFX12-NEXT: ; %bb.4: +; GFX12-NEXT: s_endpgm +entry: + %val = load i32, ptr addrspace(7) %in, !amdgpu.last.use !{}, !nontemporal !0 + store i32 %val, ptr addrspace(7) %out + ret void +} + +!0 = !{i32 1} +declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-memops.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-memops.ll index 57028a0f9b14f..75cf0615b2b53 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-memops.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-memops.ll @@ -11,16 +11,16 @@ define void @loads(ptr addrspace(8) %buf) { ; CHECK-NEXT: [[SCALAR:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 0) ; CHECK-NEXT: [[VEC2:%.*]] = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) align 8 [[BUF]], i32 16, i32 0, i32 0) ; CHECK-NEXT: [[VEC4:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0) -; CHECK-NEXT: [[NONTEMPORAL:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 2), !nontemporal [[META0:![0-9]+]] +; CHECK-NEXT: [[NONTEMPORAL:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 0), !nontemporal [[META0:![0-9]+]] ; CHECK-NEXT: [[INVARIANT:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 0), !invariant.load [[META1:![0-9]+]] ; CHECK-NEXT: [[NONTEMPORAL_INVARIANT:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 0), !invariant.load [[META1]], !nontemporal [[META0]] ; CHECK-NEXT: [[VOLATILE:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 -2147483648) -; CHECK-NEXT: [[VOLATILE_NONTEMPORAL:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 -2147483646), !nontemporal [[META0]] +; CHECK-NEXT: [[VOLATILE_NONTEMPORAL:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 -2147483648), !nontemporal [[META0]] ; 
CHECK-NEXT: fence syncscope("wavefront") release -; CHECK-NEXT: [[ATOMIC:%.*]] = call float @llvm.amdgcn.raw.ptr.atomic.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 -2147483647) +; CHECK-NEXT: [[ATOMIC:%.*]] = call float @llvm.amdgcn.raw.ptr.atomic.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 -2147483648) ; CHECK-NEXT: fence syncscope("wavefront") acquire -; CHECK-NEXT: [[ATOMIC_MONOTONIC:%.*]] = call float @llvm.amdgcn.raw.ptr.atomic.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 1) -; CHECK-NEXT: [[ATOMIC_ACQUIRE:%.*]] = call float @llvm.amdgcn.raw.ptr.atomic.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 1) +; CHECK-NEXT: [[ATOMIC_MONOTONIC:%.*]] = call float @llvm.amdgcn.raw.ptr.atomic.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 0) +; CHECK-NEXT: [[ATOMIC_ACQUIRE:%.*]] = call float @llvm.amdgcn.raw.ptr.atomic.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 0) ; CHECK-NEXT: fence acquire ; CHECK-NEXT: ret void ; @@ -50,15 +50,15 @@ define void @stores(ptr addrspace(8) %buf, float %f, <4 x float> %f4) { ; CHECK-SAME: (ptr addrspace(8) [[BUF:%.*]], float [[F:%.*]], <4 x float> [[F4:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[F]], ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 0) ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[F4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[F]], ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 2), !nontemporal [[META0]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[F]], ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 0), !nontemporal [[META0]] ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[F]], ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 -2147483648) -; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[F]], ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 -2147483646), !nontemporal [[META0]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[F]], ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 -2147483648), !nontemporal [[META0]] ; CHECK-NEXT: fence syncscope("wavefront") release -; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[F]], ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 -2147483647) +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[F]], ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 -2147483648) ; CHECK-NEXT: fence syncscope("wavefront") acquire -; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[F]], ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 1) +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[F]], ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 0) ; CHECK-NEXT: fence release -; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[F]], ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 1) +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[F]], ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 0) ; CHECK-NEXT: ret void ; %base = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll new file mode 100644 index 
0000000000000..6bd0498a2a4e4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll @@ -0,0 +1,886 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=0 -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9,GFX9-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=1 -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9,GFX9-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=0 -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX940,GFX940-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=1 -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX940,GFX940-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=0 -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10,GFX10-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=1 -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10,GFX10-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=0 -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=1 -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=0 -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12,GFX12-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=1 -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12,GFX12-GISEL %s + +define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, ptr addrspace(7) %out) { +; GFX9-SDAG-LABEL: buffer_nontemporal_load_store: +; GFX9-SDAG: ; %bb.0: ; %entry +; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 +; GFX9-SDAG-NEXT: s_load_dword s11, s[8:9], 0x10 +; GFX9-SDAG-NEXT: s_mov_b32 s10, 0 +; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s15 +; GFX9-SDAG-NEXT: s_mov_b32 s15, s10 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: s_mov_b32 s14, s7 +; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-SDAG-NEXT: s_mov_b32 s12, s5 +; GFX9-SDAG-NEXT: s_or_b64 s[14:15], s[14:15], s[10:11] +; GFX9-SDAG-NEXT: s_mov_b32 s13, s6 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-SDAG-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen glc slc +; GFX9-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x20 +; GFX9-SDAG-NEXT: s_mov_b32 s9, s10 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: s_mov_b32 s8, s7 +; GFX9-SDAG-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] +; GFX9-SDAG-NEXT: s_mov_b32 s8, s5 +; GFX9-SDAG-NEXT: s_mov_b32 s9, s6 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: buffer_nontemporal_load_store: +; GFX9-GISEL: ; %bb.0: ; %entry +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-GISEL-NEXT: s_load_dword s7, s[8:9], 0x10 +; GFX9-GISEL-NEXT: s_mov_b32 s11, 0 +; GFX9-GISEL-NEXT: s_mov_b32 s4, s11 +; GFX9-GISEL-NEXT: s_mov_b32 s6, s11 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: s_mov_b32 s10, s1 +; GFX9-GISEL-NEXT: s_mov_b32 s5, s2 +; GFX9-GISEL-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] +; GFX9-GISEL-NEXT: s_mov_b32 s10, s3 +; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] +; GFX9-GISEL-NEXT: v_mov_b32_e32 
v0, s0 +; GFX9-GISEL-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc slc +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 +; GFX9-GISEL-NEXT: s_load_dword s7, s[8:9], 0x30 +; GFX9-GISEL-NEXT: s_mov_b32 s4, s11 +; GFX9-GISEL-NEXT: s_mov_b32 s6, s11 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: s_mov_b32 s10, s1 +; GFX9-GISEL-NEXT: s_mov_b32 s5, s2 +; GFX9-GISEL-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] +; GFX9-GISEL-NEXT: s_mov_b32 s10, s3 +; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen glc slc +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX940-SDAG-LABEL: buffer_nontemporal_load_store: +; GFX940-SDAG: ; %bb.0: ; %entry +; GFX940-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX940-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX940-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x20 +; GFX940-SDAG-NEXT: s_load_dword s7, s[4:5], 0x30 +; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-SDAG-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32 sc0 sc1 +; GFX940-SDAG-NEXT: scratch_load_dwordx2 v[10:11], off, off offset:40 +; GFX940-SDAG-NEXT: scratch_load_dword v4, off, off offset:36 +; GFX940-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GFX940-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GFX940-SDAG-NEXT: scratch_store_dwordx4 off, v[0:3], off sc0 sc1 +; GFX940-SDAG-NEXT: scratch_load_dwordx2 v[12:13], off, off offset:8 +; GFX940-SDAG-NEXT: s_nop 0 +; GFX940-SDAG-NEXT: scratch_load_dword v0, off, off offset:4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v7, s6 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v3, s7 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v9, s0 +; GFX940-SDAG-NEXT: s_mov_b64 s[2:3], exec +; GFX940-SDAG-NEXT: s_waitcnt vmcnt(4) +; GFX940-SDAG-NEXT: v_mov_b32_e32 v6, v11 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v5, v10 +; GFX940-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, v13 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, v12 +; GFX940-SDAG-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-SDAG-NEXT: v_readfirstlane_b32 s4, v4 +; GFX940-SDAG-NEXT: v_readfirstlane_b32 s5, v5 +; GFX940-SDAG-NEXT: v_readfirstlane_b32 s6, v6 +; GFX940-SDAG-NEXT: v_readfirstlane_b32 s7, v7 +; GFX940-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5] +; GFX940-SDAG-NEXT: s_nop 0 +; GFX940-SDAG-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[6:7] +; GFX940-SDAG-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX940-SDAG-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX940-SDAG-NEXT: buffer_load_dword v8, v9, s[4:7], 0 offen nt +; GFX940-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GFX940-SDAG-NEXT: ; implicit-def: $vgpr9 +; GFX940-SDAG-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX940-SDAG-NEXT: s_cbranch_execnz .LBB0_1 +; GFX940-SDAG-NEXT: ; %bb.2: +; GFX940-SDAG-NEXT: s_mov_b64 exec, s[2:3] +; GFX940-SDAG-NEXT: v_mov_b32_e32 v4, s8 +; GFX940-SDAG-NEXT: s_mov_b64 s[0:1], exec +; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX940-SDAG-NEXT: .LBB0_3: ; =>This Inner Loop Header: Depth=1 +; GFX940-SDAG-NEXT: v_readfirstlane_b32 s4, v0 +; GFX940-SDAG-NEXT: v_readfirstlane_b32 s5, v1 +; GFX940-SDAG-NEXT: v_readfirstlane_b32 s6, v2 +; GFX940-SDAG-NEXT: v_readfirstlane_b32 s7, v3 +; GFX940-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX940-SDAG-NEXT: s_nop 0 +; GFX940-SDAG-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX940-SDAG-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; 
GFX940-SDAG-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX940-SDAG-NEXT: buffer_store_dword v8, v4, s[4:7], 0 offen sc0 nt sc1 +; GFX940-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX940-SDAG-NEXT: ; implicit-def: $vgpr8 +; GFX940-SDAG-NEXT: ; implicit-def: $vgpr4 +; GFX940-SDAG-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX940-SDAG-NEXT: s_cbranch_execnz .LBB0_3 +; GFX940-SDAG-NEXT: ; %bb.4: +; GFX940-SDAG-NEXT: s_endpgm +; +; GFX940-GISEL-LABEL: buffer_nontemporal_load_store: +; GFX940-GISEL: ; %bb.0: ; %entry +; GFX940-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX940-GISEL-NEXT: s_load_dword s11, s[4:5], 0x10 +; GFX940-GISEL-NEXT: s_mov_b32 s7, 0 +; GFX940-GISEL-NEXT: s_mov_b32 s8, s7 +; GFX940-GISEL-NEXT: s_mov_b32 s10, s7 +; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-GISEL-NEXT: s_mov_b32 s6, s1 +; GFX940-GISEL-NEXT: s_mov_b32 s9, s2 +; GFX940-GISEL-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] +; GFX940-GISEL-NEXT: s_mov_b32 s6, s3 +; GFX940-GISEL-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] +; GFX940-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-GISEL-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen nt +; GFX940-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 +; GFX940-GISEL-NEXT: s_load_dword s9, s[4:5], 0x30 +; GFX940-GISEL-NEXT: s_mov_b32 s4, s7 +; GFX940-GISEL-NEXT: s_mov_b32 s8, s7 +; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-GISEL-NEXT: s_mov_b32 s6, s1 +; GFX940-GISEL-NEXT: s_mov_b32 s5, s2 +; GFX940-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; GFX940-GISEL-NEXT: s_mov_b32 s6, s3 +; GFX940-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX940-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen sc0 nt sc1 +; GFX940-GISEL-NEXT: s_endpgm +; +; GFX10-SDAG-LABEL: buffer_nontemporal_load_store: +; GFX10-SDAG: ; %bb.0: ; %entry +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 +; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x10 +; GFX10-SDAG-NEXT: s_mov_b32 s10, 0 +; GFX10-SDAG-NEXT: s_add_u32 s0, s0, s15 +; GFX10-SDAG-NEXT: s_mov_b32 s13, s10 +; GFX10-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-SDAG-NEXT: s_mov_b32 s12, s7 +; GFX10-SDAG-NEXT: s_or_b64 s[14:15], s[12:13], s[10:11] +; GFX10-SDAG-NEXT: s_mov_b32 s12, s5 +; GFX10-SDAG-NEXT: s_mov_b32 s13, s6 +; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen slc +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x20 +; GFX10-SDAG-NEXT: s_mov_b32 s9, s10 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-SDAG-NEXT: s_mov_b32 s8, s7 +; GFX10-SDAG-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] +; GFX10-SDAG-NEXT: s_mov_b32 s8, s5 +; GFX10-SDAG-NEXT: s_mov_b32 s9, s6 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: buffer_nontemporal_load_store: +; GFX10-GISEL: ; %bb.0: ; %entry +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX10-GISEL-NEXT: s_load_dword s5, s[8:9], 0x10 +; GFX10-GISEL-NEXT: s_mov_b32 s7, 0 +; GFX10-GISEL-NEXT: s_mov_b32 s10, s7 +; GFX10-GISEL-NEXT: s_mov_b32 s4, s7 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: s_mov_b32 s6, s1 +; GFX10-GISEL-NEXT: s_mov_b32 s11, s2 +; GFX10-GISEL-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[6:7], s[10:11] +; GFX10-GISEL-NEXT: s_mov_b32 s6, s3 +; GFX10-GISEL-NEXT: s_or_b64 s[2:3], s[6:7], s[4:5] +; GFX10-GISEL-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen slc +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 +; GFX10-GISEL-NEXT: s_load_dword s11, s[8:9], 0x30 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: s_mov_b32 s6, s1 +; GFX10-GISEL-NEXT: s_mov_b32 s5, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; GFX10-GISEL-NEXT: s_mov_b32 s6, s3 +; GFX10-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen glc slc +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: buffer_nontemporal_load_store: +; GFX11-SDAG: ; %bb.0: ; %entry +; GFX11-SDAG-NEXT: s_clause 0x2 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-SDAG-NEXT: s_load_b128 s[8:11], s[4:5], 0x20 +; GFX11-SDAG-NEXT: s_load_b32 s6, s[4:5], 0x10 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11 +; GFX11-SDAG-NEXT: scratch_store_b128 off, v[0:3], off offset:32 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: scratch_load_b64 v[5:6], off, off offset:40 +; GFX11-SDAG-NEXT: scratch_load_b32 v4, off, off offset:36 +; GFX11-SDAG-NEXT: s_load_b32 s1, s[4:5], 0x30 +; GFX11-SDAG-NEXT: scratch_store_b128 off, v[7:10], off +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: scratch_load_b64 v[1:2], off, off offset:8 +; GFX11-SDAG-NEXT: scratch_load_b32 v0, off, off offset:4 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v7, s6 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v9, s0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s4, v4 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s5, v5 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s6, v6 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s7, v7 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] +; GFX11-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-SDAG-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-SDAG-NEXT: buffer_load_b32 v8, v9, s[4:7], 0 offen slc dlc +; GFX11-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-SDAG-NEXT: ; implicit-def: $vgpr9 +; GFX11-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB0_1 +; GFX11-SDAG-NEXT: ; %bb.2: +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, s8 +; GFX11-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX11-SDAG-NEXT: .LBB0_3: ; =>This Inner Loop Header: Depth=1 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-SDAG-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-SDAG-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: buffer_store_b32 v8, v4, s[4:7], 0 offen glc slc dlc +; GFX11-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX11-SDAG-NEXT: ; implicit-def: $vgpr8 +; GFX11-SDAG-NEXT: ; implicit-def: $vgpr4 +; GFX11-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB0_3 +; GFX11-SDAG-NEXT: ; %bb.4: +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: buffer_nontemporal_load_store: +; GFX11-GISEL: ; %bb.0: ; %entry +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x10 +; GFX11-GISEL-NEXT: s_mov_b32 s9, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_mov_b32 s10, s9 +; GFX11-GISEL-NEXT: s_mov_b32 s6, s9 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_mov_b32 s8, s1 +; GFX11-GISEL-NEXT: s_mov_b32 s11, s2 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-GISEL-NEXT: s_or_b64 s[0:1], s[8:9], s[10:11] +; GFX11-GISEL-NEXT: s_mov_b32 s8, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7] +; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen slc dlc +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 +; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 +; GFX11-GISEL-NEXT: s_mov_b32 s4, s9 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-GISEL-NEXT: s_mov_b32 s8, s1 +; GFX11-GISEL-NEXT: s_mov_b32 s5, s2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; GFX11-GISEL-NEXT: s_mov_b32 s8, s3 +; GFX11-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen glc slc dlc +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: buffer_nontemporal_load_store: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_clause 0x2 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-SDAG-NEXT: s_load_b128 s[8:11], s[4:5], 0x20 +; GFX12-SDAG-NEXT: s_load_b32 s6, s[4:5], 0x10 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11 +; GFX12-SDAG-NEXT: scratch_store_b128 off, v[0:3], off offset:32 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: scratch_load_b64 v[5:6], off, off offset:40 +; GFX12-SDAG-NEXT: scratch_load_b32 v4, off, off offset:36 +; GFX12-SDAG-NEXT: s_load_b32 s1, s[4:5], 0x30 +; GFX12-SDAG-NEXT: scratch_store_b128 off, v[7:10], off +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: scratch_load_b64 v[1:2], off, off offset:8 +; GFX12-SDAG-NEXT: scratch_load_b32 v0, off, off offset:4 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v7, s6 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v9, s0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; 
GFX12-SDAG-NEXT: s_mov_b32 s1, exec_lo +; GFX12-SDAG-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x2 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s4, v4 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v5 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s6, v6 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s7, v7 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] +; GFX12-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7] +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_NT +; GFX12-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GFX12-SDAG-NEXT: ; implicit-def: $vgpr9 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB0_1 +; GFX12-SDAG-NEXT: ; %bb.2: +; GFX12-SDAG-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, s8 +; GFX12-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX12-SDAG-NEXT: .LBB0_3: ; =>This Inner Loop Header: Depth=1 +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x1 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: buffer_store_b32 v8, v4, s[4:7], null offen th:TH_STORE_NT +; GFX12-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX12-SDAG-NEXT: ; implicit-def: $vgpr8 +; GFX12-SDAG-NEXT: ; implicit-def: $vgpr4 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB0_3 +; GFX12-SDAG-NEXT: ; %bb.4: +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: buffer_nontemporal_load_store: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x10 +; GFX12-GISEL-NEXT: s_mov_b32 s9, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_mov_b32 s10, s9 +; GFX12-GISEL-NEXT: s_mov_b32 s6, s9 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_mov_b32 s8, s1 +; GFX12-GISEL-NEXT: s_mov_b32 s11, s2 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: s_or_b64 s[0:1], s[8:9], s[10:11] +; GFX12-GISEL-NEXT: s_mov_b32 s8, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7] +; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen th:TH_LOAD_NT +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 +; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_mov_b32 s4, s9 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: 
v_mov_b32_e32 v1, s0 +; GFX12-GISEL-NEXT: s_mov_b32 s8, s1 +; GFX12-GISEL-NEXT: s_mov_b32 s5, s2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; GFX12-GISEL-NEXT: s_mov_b32 s8, s3 +; GFX12-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT +; GFX12-GISEL-NEXT: s_endpgm +entry: + %val = load i32, ptr addrspace(7) %in, !nontemporal !0 + store i32 %val, ptr addrspace(7) %out, !nontemporal !0 + ret void +} + +define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrspace(7) %in, ptr addrspace(7) %out) { +; GFX9-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store: +; GFX9-SDAG: ; %bb.0: ; %entry +; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 +; GFX9-SDAG-NEXT: s_load_dword s11, s[8:9], 0x10 +; GFX9-SDAG-NEXT: s_mov_b32 s10, 0 +; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s15 +; GFX9-SDAG-NEXT: s_mov_b32 s15, s10 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: s_mov_b32 s14, s7 +; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-SDAG-NEXT: s_mov_b32 s12, s5 +; GFX9-SDAG-NEXT: s_or_b64 s[14:15], s[14:15], s[10:11] +; GFX9-SDAG-NEXT: s_mov_b32 s13, s6 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-SDAG-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen glc +; GFX9-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x20 +; GFX9-SDAG-NEXT: s_mov_b32 s9, s10 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: s_mov_b32 s8, s7 +; GFX9-SDAG-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] +; GFX9-SDAG-NEXT: s_mov_b32 s8, s5 +; GFX9-SDAG-NEXT: s_mov_b32 s9, s6 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store: +; GFX9-GISEL: ; %bb.0: ; %entry +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-GISEL-NEXT: s_load_dword s7, s[8:9], 0x10 +; GFX9-GISEL-NEXT: s_mov_b32 s11, 0 +; GFX9-GISEL-NEXT: s_mov_b32 s4, s11 +; GFX9-GISEL-NEXT: s_mov_b32 s6, s11 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: s_mov_b32 s10, s1 +; GFX9-GISEL-NEXT: s_mov_b32 s5, s2 +; GFX9-GISEL-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] +; GFX9-GISEL-NEXT: s_mov_b32 s10, s3 +; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 +; GFX9-GISEL-NEXT: s_load_dword s7, s[8:9], 0x30 +; GFX9-GISEL-NEXT: s_mov_b32 s4, s11 +; GFX9-GISEL-NEXT: s_mov_b32 s6, s11 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: s_mov_b32 s10, s1 +; GFX9-GISEL-NEXT: s_mov_b32 s5, s2 +; GFX9-GISEL-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] +; GFX9-GISEL-NEXT: s_mov_b32 s10, s3 +; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX940-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store: +; GFX940-SDAG: ; %bb.0: ; %entry +; GFX940-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX940-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX940-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x20 +; GFX940-SDAG-NEXT: s_load_dword s7, s[4:5], 0x30 +; 
GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-SDAG-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32 sc0 sc1 +; GFX940-SDAG-NEXT: scratch_load_dwordx2 v[10:11], off, off offset:40 +; GFX940-SDAG-NEXT: scratch_load_dword v4, off, off offset:36 +; GFX940-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GFX940-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GFX940-SDAG-NEXT: scratch_store_dwordx4 off, v[0:3], off sc0 sc1 +; GFX940-SDAG-NEXT: scratch_load_dwordx2 v[12:13], off, off offset:8 +; GFX940-SDAG-NEXT: s_nop 0 +; GFX940-SDAG-NEXT: scratch_load_dword v0, off, off offset:4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v7, s6 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v3, s7 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v9, s0 +; GFX940-SDAG-NEXT: s_mov_b64 s[2:3], exec +; GFX940-SDAG-NEXT: s_waitcnt vmcnt(4) +; GFX940-SDAG-NEXT: v_mov_b32_e32 v6, v11 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v5, v10 +; GFX940-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, v13 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, v12 +; GFX940-SDAG-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-SDAG-NEXT: v_readfirstlane_b32 s4, v4 +; GFX940-SDAG-NEXT: v_readfirstlane_b32 s5, v5 +; GFX940-SDAG-NEXT: v_readfirstlane_b32 s6, v6 +; GFX940-SDAG-NEXT: v_readfirstlane_b32 s7, v7 +; GFX940-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5] +; GFX940-SDAG-NEXT: s_nop 0 +; GFX940-SDAG-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[6:7] +; GFX940-SDAG-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX940-SDAG-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX940-SDAG-NEXT: buffer_load_dword v8, v9, s[4:7], 0 offen sc0 sc1 +; GFX940-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GFX940-SDAG-NEXT: ; implicit-def: $vgpr9 +; GFX940-SDAG-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX940-SDAG-NEXT: s_cbranch_execnz .LBB1_1 +; GFX940-SDAG-NEXT: ; %bb.2: +; GFX940-SDAG-NEXT: s_mov_b64 exec, s[2:3] +; GFX940-SDAG-NEXT: v_mov_b32_e32 v4, s8 +; GFX940-SDAG-NEXT: s_mov_b64 s[0:1], exec +; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX940-SDAG-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 +; GFX940-SDAG-NEXT: v_readfirstlane_b32 s4, v0 +; GFX940-SDAG-NEXT: v_readfirstlane_b32 s5, v1 +; GFX940-SDAG-NEXT: v_readfirstlane_b32 s6, v2 +; GFX940-SDAG-NEXT: v_readfirstlane_b32 s7, v3 +; GFX940-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX940-SDAG-NEXT: s_nop 0 +; GFX940-SDAG-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX940-SDAG-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX940-SDAG-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX940-SDAG-NEXT: buffer_store_dword v8, v4, s[4:7], 0 offen sc0 sc1 +; GFX940-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX940-SDAG-NEXT: ; implicit-def: $vgpr8 +; GFX940-SDAG-NEXT: ; implicit-def: $vgpr4 +; GFX940-SDAG-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX940-SDAG-NEXT: s_cbranch_execnz .LBB1_3 +; GFX940-SDAG-NEXT: ; %bb.4: +; GFX940-SDAG-NEXT: s_endpgm +; +; GFX940-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store: +; GFX940-GISEL: ; %bb.0: ; %entry +; GFX940-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX940-GISEL-NEXT: s_load_dword s11, s[4:5], 0x10 +; GFX940-GISEL-NEXT: s_mov_b32 s7, 0 +; GFX940-GISEL-NEXT: s_mov_b32 s8, s7 +; GFX940-GISEL-NEXT: s_mov_b32 s10, s7 +; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-GISEL-NEXT: s_mov_b32 s6, s1 +; GFX940-GISEL-NEXT: s_mov_b32 s9, s2 +; GFX940-GISEL-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] +; GFX940-GISEL-NEXT: s_mov_b32 s6, s3 +; GFX940-GISEL-NEXT: 
s_or_b64 s[10:11], s[6:7], s[10:11] +; GFX940-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-GISEL-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen sc0 sc1 +; GFX940-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 +; GFX940-GISEL-NEXT: s_load_dword s9, s[4:5], 0x30 +; GFX940-GISEL-NEXT: s_mov_b32 s4, s7 +; GFX940-GISEL-NEXT: s_mov_b32 s8, s7 +; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-GISEL-NEXT: s_mov_b32 s6, s1 +; GFX940-GISEL-NEXT: s_mov_b32 s5, s2 +; GFX940-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; GFX940-GISEL-NEXT: s_mov_b32 s6, s3 +; GFX940-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX940-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen sc0 sc1 +; GFX940-GISEL-NEXT: s_endpgm +; +; GFX10-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store: +; GFX10-SDAG: ; %bb.0: ; %entry +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 +; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x10 +; GFX10-SDAG-NEXT: s_mov_b32 s10, 0 +; GFX10-SDAG-NEXT: s_add_u32 s0, s0, s15 +; GFX10-SDAG-NEXT: s_mov_b32 s13, s10 +; GFX10-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-SDAG-NEXT: s_mov_b32 s12, s7 +; GFX10-SDAG-NEXT: s_or_b64 s[14:15], s[12:13], s[10:11] +; GFX10-SDAG-NEXT: s_mov_b32 s12, s5 +; GFX10-SDAG-NEXT: s_mov_b32 s13, s6 +; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen glc dlc +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x20 +; GFX10-SDAG-NEXT: s_mov_b32 s9, s10 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-SDAG-NEXT: s_mov_b32 s8, s7 +; GFX10-SDAG-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] +; GFX10-SDAG-NEXT: s_mov_b32 s8, s5 +; GFX10-SDAG-NEXT: s_mov_b32 s9, s6 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store: +; GFX10-GISEL: ; %bb.0: ; %entry +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX10-GISEL-NEXT: s_load_dword s5, s[8:9], 0x10 +; GFX10-GISEL-NEXT: s_mov_b32 s7, 0 +; GFX10-GISEL-NEXT: s_mov_b32 s10, s7 +; GFX10-GISEL-NEXT: s_mov_b32 s4, s7 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: s_mov_b32 s6, s1 +; GFX10-GISEL-NEXT: s_mov_b32 s11, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[6:7], s[10:11] +; GFX10-GISEL-NEXT: s_mov_b32 s6, s3 +; GFX10-GISEL-NEXT: s_or_b64 s[2:3], s[6:7], s[4:5] +; GFX10-GISEL-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc dlc +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 +; GFX10-GISEL-NEXT: s_load_dword s11, s[8:9], 0x30 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: s_mov_b32 s6, s1 +; GFX10-GISEL-NEXT: s_mov_b32 s5, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; GFX10-GISEL-NEXT: s_mov_b32 s6, s3 +; GFX10-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store: +; GFX11-SDAG: ; %bb.0: ; %entry +; 
GFX11-SDAG-NEXT: s_clause 0x2 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-SDAG-NEXT: s_load_b128 s[8:11], s[4:5], 0x20 +; GFX11-SDAG-NEXT: s_load_b32 s6, s[4:5], 0x10 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11 +; GFX11-SDAG-NEXT: scratch_store_b128 off, v[0:3], off offset:32 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: scratch_load_b64 v[5:6], off, off offset:40 +; GFX11-SDAG-NEXT: scratch_load_b32 v4, off, off offset:36 +; GFX11-SDAG-NEXT: s_load_b32 s1, s[4:5], 0x30 +; GFX11-SDAG-NEXT: scratch_store_b128 off, v[7:10], off +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: scratch_load_b64 v[1:2], off, off offset:8 +; GFX11-SDAG-NEXT: scratch_load_b32 v0, off, off offset:4 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v7, s6 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v9, s0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s4, v4 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s5, v5 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s6, v6 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s7, v7 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] +; GFX11-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-SDAG-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-SDAG-NEXT: buffer_load_b32 v8, v9, s[4:7], 0 offen glc dlc +; GFX11-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-SDAG-NEXT: ; implicit-def: $vgpr9 +; GFX11-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB1_1 +; GFX11-SDAG-NEXT: ; %bb.2: +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, s8 +; GFX11-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX11-SDAG-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-SDAG-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: buffer_store_b32 v8, v4, s[4:7], 0 offen dlc +; GFX11-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX11-SDAG-NEXT: ; implicit-def: $vgpr8 +; GFX11-SDAG-NEXT: ; implicit-def: $vgpr4 +; GFX11-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB1_3 +; GFX11-SDAG-NEXT: ; %bb.4: +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store: +; GFX11-GISEL: ; %bb.0: ; %entry +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-GISEL-NEXT: 
s_load_b32 s7, s[4:5], 0x10 +; GFX11-GISEL-NEXT: s_mov_b32 s9, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_mov_b32 s10, s9 +; GFX11-GISEL-NEXT: s_mov_b32 s6, s9 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_mov_b32 s8, s1 +; GFX11-GISEL-NEXT: s_mov_b32 s11, s2 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-GISEL-NEXT: s_or_b64 s[0:1], s[8:9], s[10:11] +; GFX11-GISEL-NEXT: s_mov_b32 s8, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7] +; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen glc dlc +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 +; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 +; GFX11-GISEL-NEXT: s_mov_b32 s4, s9 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-GISEL-NEXT: s_mov_b32 s8, s1 +; GFX11-GISEL-NEXT: s_mov_b32 s5, s2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; GFX11-GISEL-NEXT: s_mov_b32 s8, s3 +; GFX11-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen dlc +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_clause 0x2 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-SDAG-NEXT: s_load_b128 s[8:11], s[4:5], 0x20 +; GFX12-SDAG-NEXT: s_load_b32 s6, s[4:5], 0x10 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11 +; GFX12-SDAG-NEXT: scratch_store_b128 off, v[0:3], off offset:32 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: scratch_load_b64 v[5:6], off, off offset:40 +; GFX12-SDAG-NEXT: scratch_load_b32 v4, off, off offset:36 +; GFX12-SDAG-NEXT: s_load_b32 s1, s[4:5], 0x30 +; GFX12-SDAG-NEXT: scratch_store_b128 off, v[7:10], off +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: scratch_load_b64 v[1:2], off, off offset:8 +; GFX12-SDAG-NEXT: scratch_load_b32 v0, off, off offset:4 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v7, s6 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v9, s0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX12-SDAG-NEXT: s_mov_b32 s1, exec_lo +; GFX12-SDAG-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x2 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s4, v4 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v5 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s6, v6 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s7, v7 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] +; GFX12-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7] +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_NT scope:SCOPE_SYS +; GFX12-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GFX12-SDAG-NEXT: ; 
implicit-def: $vgpr9 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB1_1 +; GFX12-SDAG-NEXT: ; %bb.2: +; GFX12-SDAG-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, s8 +; GFX12-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX12-SDAG-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x1 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: buffer_store_b32 v8, v4, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS +; GFX12-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX12-SDAG-NEXT: ; implicit-def: $vgpr8 +; GFX12-SDAG-NEXT: ; implicit-def: $vgpr4 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB1_3 +; GFX12-SDAG-NEXT: ; %bb.4: +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x10 +; GFX12-GISEL-NEXT: s_mov_b32 s9, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_mov_b32 s10, s9 +; GFX12-GISEL-NEXT: s_mov_b32 s6, s9 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_mov_b32 s8, s1 +; GFX12-GISEL-NEXT: s_mov_b32 s11, s2 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: s_or_b64 s[0:1], s[8:9], s[10:11] +; GFX12-GISEL-NEXT: s_mov_b32 s8, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7] +; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen th:TH_LOAD_NT scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 +; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_mov_b32 s4, s9 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-GISEL-NEXT: s_mov_b32 s8, s1 +; GFX12-GISEL-NEXT: s_mov_b32 s5, s2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; GFX12-GISEL-NEXT: s_mov_b32 s8, s3 +; GFX12-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_endpgm +entry: + %val = load volatile i32, ptr addrspace(7) %in, !nontemporal !0 + store volatile i32 %val, ptr addrspace(7) %out, !nontemporal !0 + ret void +} + +!0 = !{i32 1} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX10: {{.*}} +; GFX11: {{.*}} +; GFX12: {{.*}} +; GFX9: {{.*}} +; GFX940: {{.*}} From 9988309d5537e2954376005b07e9750cb62574a3 Mon Sep 17 00:00:00 2001 From: Dan Klishch <30951924+DanShaders@users.noreply.github.com> Date: Tue, 14 Jan 2025 11:27:51 +0100 Subject: [PATCH 397/408] [clang] Do not allow unorderable features in [[gnu::target{,_clones}]] (#98426) This partially addresses #98244. --- clang/lib/Sema/SemaDecl.cpp | 3 ++- clang/lib/Sema/SemaDeclAttr.cpp | 3 ++- clang/test/Sema/attr-target-clones.c | 3 +++ clang/test/Sema/attr-target-mv.c | 14 ++++++++++++++ clang/test/Sema/attr-target.c | 2 ++ 5 files changed, 23 insertions(+), 2 deletions(-) diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index f5e57988b7fa8..fd3a5ec49771d 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -11136,7 +11136,8 @@ static bool CheckMultiVersionValue(Sema &S, const FunctionDecl *FD) { } if (!TargetInfo.validateCpuSupports(BareFeat) || - !TargetInfo.isValidFeatureName(BareFeat)) { + !TargetInfo.isValidFeatureName(BareFeat) || + (BareFeat != "default" && TargetInfo.getFMVPriority(BareFeat) == 0)) { S.Diag(FD->getLocation(), diag::err_bad_multiversion_option) << Feature << BareFeat; return true; diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index c1663f2d15c88..c2d82b9aa9b32 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -3289,7 +3289,8 @@ bool Sema::checkTargetClonesAttrString( } else if (Cur == "default") { DefaultIsDupe = HasDefault; HasDefault = true; - } else if (!Context.getTargetInfo().isValidFeatureName(Cur)) + } else if (!Context.getTargetInfo().isValidFeatureName(Cur) || + Context.getTargetInfo().getFMVPriority(Cur) == 0) return Diag(CurLoc, diag::warn_unsupported_target_attribute) << Unsupported << None << Cur << TargetClones; if (llvm::is_contained(StringsBuffer, Cur) || DefaultIsDupe) diff --git a/clang/test/Sema/attr-target-clones.c b/clang/test/Sema/attr-target-clones.c index e287fce7699b7..4597ea54d02bf 100644 --- a/clang/test/Sema/attr-target-clones.c +++ b/clang/test/Sema/attr-target-clones.c @@ -122,3 +122,6 @@ void good_overload5(int) __attribute__((target_clones("mmx", "sse4.2", "default" void good_isa_level(int) __attribute__((target_clones("default", "arch=x86-64", "arch=x86-64-v2", "arch=x86-64-v3", "arch=x86-64-v4"))); // expected-warning@+1 {{unsupported CPU 'x86-64-v5' in the 'target_clones' attribute string; 'target_clones' attribute ignored}} void bad_isa_level(int) __attribute__((target_clones("default", "arch=x86-64-v5"))); + +// expected-warning@+1 {{unsupported 'sha' in the 'target_clones' attribute string; 'target_clones' attribute ignored}} +void bad_feature(void) __attribute__((target_clones("default", "sse4.2", "sha"))); diff --git a/clang/test/Sema/attr-target-mv.c b/clang/test/Sema/attr-target-mv.c index 8218771275e1b..ddb1d82b02f09 100644 --- a/clang/test/Sema/attr-target-mv.c +++ b/clang/test/Sema/attr-target-mv.c @@ -170,3 +170,17 @@ int __attribute__((__overloadable__)) __attribute__((target("arch=sandybridge")) int __attribute__((__overloadable__)) __attribute__((target("sse4.2"))) good_overload7(void); int __attribute__((target("arch=sandybridge"))) good_overload7(int); + +// expected-error@+2 {{function multiversioning doesn't support feature 'sha'}} +// expected-note@+2 {{function multiversioning caused by this declaration}} +int __attribute__((target("sha"))) no_priority1(void); +int 
__attribute__((target("default"))) no_priority1(void); + +int __attribute__((target("default"))) no_priority2(void); +// expected-error@+1 {{function multiversioning doesn't support feature 'sha'}} +int __attribute__((target("sha"))) no_priority2(void); + +int __attribute__((target("default"))) no_priority3(void); +int __attribute__((target("avx2"))) no_priority3(void); +// expected-error@+1 {{function multiversioning doesn't support feature 'sha'}} +int __attribute__((target("sha"))) no_priority3(void); diff --git a/clang/test/Sema/attr-target.c b/clang/test/Sema/attr-target.c index 5328f056507a7..65ece3c27d299 100644 --- a/clang/test/Sema/attr-target.c +++ b/clang/test/Sema/attr-target.c @@ -33,6 +33,8 @@ void __attribute__((target("x86-64"))) baseline(void) {} //expected-warning@+1 {{unsupported 'x86-64-v2' in the 'target' attribute string}} void __attribute__((target("x86-64-v2"))) v2(void) {} +int __attribute__((target("sha"))) good_target_but_not_for_fmv() { return 5; } + #elifdef __aarch64__ int __attribute__((target("sve,arch=armv8-a"))) foo(void) { return 4; } From 0bf1591d01a218dff236e94ca9e0880013129855 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 14 Jan 2025 10:43:22 +0000 Subject: [PATCH 398/408] [VectorCombine] foldPermuteOfBinops - fold "shuffle (binop (shuffle, other)), undef" --> "binop (shuffle), (shuffle)". (#122118) foldPermuteOfBinops currently requires both binop operands to be oneuse shuffles to fold the shuffles across the binop, but there will be cases where its still profitable to fold across the binop with only one foldable shuffle. --- .../Transforms/Vectorize/VectorCombine.cpp | 71 +++++++----- .../test/Transforms/PhaseOrdering/X86/hadd.ll | 106 +++++++++-------- .../test/Transforms/PhaseOrdering/X86/hsub.ll | 107 +++++++++--------- .../AArch64/shuffletoidentity.ll | 3 +- .../VectorCombine/X86/permute-of-binops.ll | 30 +++-- 5 files changed, 166 insertions(+), 151 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 1a669b5058e79..ae2af6d346879 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -1592,17 +1592,21 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) { if (BinOp->isIntDivRem() && llvm::is_contained(OuterMask, PoisonMaskElem)) return false; - Value *Op00, *Op01; - ArrayRef Mask0; - if (!match(BinOp->getOperand(0), - m_OneUse(m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0))))) + Value *Op00, *Op01, *Op10, *Op11; + ArrayRef Mask0, Mask1; + bool Match0 = + match(BinOp->getOperand(0), + m_OneUse(m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0)))); + bool Match1 = + match(BinOp->getOperand(1), + m_OneUse(m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1)))); + if (!Match0 && !Match1) return false; - Value *Op10, *Op11; - ArrayRef Mask1; - if (!match(BinOp->getOperand(1), - m_OneUse(m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1))))) - return false; + Op00 = Match0 ? Op00 : BinOp->getOperand(0); + Op01 = Match0 ? Op01 : BinOp->getOperand(0); + Op10 = Match1 ? Op10 : BinOp->getOperand(1); + Op11 = Match1 ? Op11 : BinOp->getOperand(1); Instruction::BinaryOps Opcode = BinOp->getOpcode(); auto *ShuffleDstTy = dyn_cast(I.getType()); @@ -1620,37 +1624,46 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) { any_of(OuterMask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; })) return false; - // Merge outer / inner shuffles. 
+ // Merge outer / inner (or identity if no match) shuffles. SmallVector NewMask0, NewMask1; for (int M : OuterMask) { if (M < 0 || M >= (int)NumSrcElts) { NewMask0.push_back(PoisonMaskElem); NewMask1.push_back(PoisonMaskElem); } else { - NewMask0.push_back(Mask0[M]); - NewMask1.push_back(Mask1[M]); + NewMask0.push_back(Match0 ? Mask0[M] : M); + NewMask1.push_back(Match1 ? Mask1[M] : M); } } + unsigned NumOpElts = Op0Ty->getNumElements(); + bool IsIdentity0 = ShuffleVectorInst::isIdentityMask(NewMask0, NumOpElts); + bool IsIdentity1 = ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts); + // Try to merge shuffles across the binop if the new shuffles are not costly. InstructionCost OldCost = TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind) + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, BinOpTy, - OuterMask, CostKind, 0, nullptr, {BinOp}, &I) + - TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op0Ty, Mask0, - CostKind, 0, nullptr, {Op00, Op01}, - cast(BinOp->getOperand(0))) + - TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op1Ty, Mask1, - CostKind, 0, nullptr, {Op10, Op11}, - cast(BinOp->getOperand(1))); + OuterMask, CostKind, 0, nullptr, {BinOp}, &I); + if (Match0) + OldCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op0Ty, + Mask0, CostKind, 0, nullptr, {Op00, Op01}, + cast(BinOp->getOperand(0))); + if (Match1) + OldCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op1Ty, + Mask1, CostKind, 0, nullptr, {Op10, Op11}, + cast(BinOp->getOperand(1))); InstructionCost NewCost = - TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op0Ty, NewMask0, - CostKind, 0, nullptr, {Op00, Op01}) + - TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op1Ty, NewMask1, - CostKind, 0, nullptr, {Op10, Op11}) + TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind); + if (!IsIdentity0) + NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op0Ty, + NewMask0, CostKind, 0, nullptr, {Op00, Op01}); + if (!IsIdentity1) + NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op1Ty, + NewMask1, CostKind, 0, nullptr, {Op10, Op11}); + LLVM_DEBUG(dbgs() << "Found a shuffle feeding a shuffled binop: " << I << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost << "\n"); @@ -1659,16 +1672,18 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) { if (NewCost > OldCost) return false; - Value *Shuf0 = Builder.CreateShuffleVector(Op00, Op01, NewMask0); - Value *Shuf1 = Builder.CreateShuffleVector(Op10, Op11, NewMask1); - Value *NewBO = Builder.CreateBinOp(Opcode, Shuf0, Shuf1); + Value *LHS = + IsIdentity0 ? Op00 : Builder.CreateShuffleVector(Op00, Op01, NewMask0); + Value *RHS = + IsIdentity1 ? Op10 : Builder.CreateShuffleVector(Op10, Op11, NewMask1); + Value *NewBO = Builder.CreateBinOp(Opcode, LHS, RHS); // Intersect flags from the old binops. 
if (auto *NewInst = dyn_cast(NewBO)) NewInst->copyIRFlags(BinOp); - Worklist.pushValue(Shuf0); - Worklist.pushValue(Shuf1); + Worklist.pushValue(LHS); + Worklist.pushValue(RHS); replaceValue(I, *NewBO); return true; } diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll index a4aea02a33511..4b1234fda0e18 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll @@ -394,13 +394,13 @@ define <8 x i32> @add_v8i32_01234u67(<8 x i32> %a, <8 x i32> %b) { ; SSE4-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 4 ; SSE4-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5 ; SSE4-NEXT: [[A45:%.*]] = add i32 [[A4]], [[A5]] -; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B:%.*]], <8 x i32> -; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> -; SSE4-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]] +; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B:%.*]], <8 x i32> +; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> +; SSE4-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[TMP5]] ; SSE4-NEXT: [[HADD4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A45]], i64 4 ; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> -; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> -; SSE4-NEXT: [[TMP6:%.*]] = add <8 x i32> [[TMP4]], [[TMP5]] +; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> +; SSE4-NEXT: [[TMP6:%.*]] = add <8 x i32> [[TMP4]], [[TMP7]] ; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <8 x i32> [[HADD4]], <8 x i32> [[TMP6]], <8 x i32> ; SSE4-NEXT: ret <8 x i32> [[RESULT]] ; @@ -605,10 +605,10 @@ define <4 x float> @add_v4f32_012u(<4 x float> %a, <4 x float> %b) { define <4 x float> @add_v4f32_uu23(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @add_v4f32_uu23( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[RESULT1:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <4 x float> [[RESULT1]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1:%.*]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[RESULT2:%.*]] = fadd <4 x float> [[TMP2]], [[RESULT1]] +; CHECK-NEXT: ret <4 x float> [[RESULT2]] ; %a0 = extractelement <4 x float> %a, i32 0 %a1 = extractelement <4 x float> %a, i32 1 @@ -632,10 +632,10 @@ define <4 x float> @add_v4f32_uu23(<4 x float> %a, <4 x float> %b) { define <4 x float> @add_v4f32_01uu(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @add_v4f32_01uu( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <4 x float> [[TMP3]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1:%.*]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]] +; CHECK-NEXT: ret <4 x float> [[TMP4]] ; %a0 = extractelement <4 x float> %a, 
i32 0 %a1 = extractelement <4 x float> %a, i32 1 @@ -712,9 +712,9 @@ define <8 x float> @add_v8f32_012u4567(<8 x float> %a, <8 x float> %b) { ; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <2 x i32> ; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> ; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]] -; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> -; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> -; SSE2-NEXT: [[TMP6:%.*]] = fadd <8 x float> [[TMP4]], [[TMP5]] +; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> +; SSE2-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> +; SSE2-NEXT: [[TMP6:%.*]] = fadd <8 x float> [[TMP5]], [[TMP8]] ; SSE2-NEXT: [[HADD5:%.*]] = insertelement <8 x float> [[TMP6]], float [[A67]], i64 5 ; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <8 x i32> ; SSE2-NEXT: [[RESULT:%.*]] = shufflevector <8 x float> [[HADD5]], <8 x float> [[TMP7]], <8 x i32> @@ -724,13 +724,13 @@ define <8 x float> @add_v8f32_012u4567(<8 x float> %a, <8 x float> %b) { ; SSE4-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6 ; SSE4-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7 ; SSE4-NEXT: [[A67:%.*]] = fadd float [[A6]], [[A7]] -; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B:%.*]], <8 x i32> -; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> -; SSE4-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]] +; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B:%.*]], <8 x i32> +; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> +; SSE4-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP2]], [[TMP5]] ; SSE4-NEXT: [[HADD5:%.*]] = insertelement <8 x float> [[TMP3]], float [[A67]], i64 5 ; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> -; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> -; SSE4-NEXT: [[TMP6:%.*]] = fadd <8 x float> [[TMP4]], [[TMP5]] +; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> +; SSE4-NEXT: [[TMP6:%.*]] = fadd <8 x float> [[TMP4]], [[TMP7]] ; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <8 x float> [[HADD5]], <8 x float> [[TMP6]], <8 x i32> ; SSE4-NEXT: ret <8 x float> [[RESULT]] ; @@ -801,10 +801,9 @@ define <2 x double> @add_v2f64_01(<2 x double> %a, <2 x double> %b) { define <2 x double> @add_v2f64_u1(<2 x double> %a, <2 x double> %b) { ; CHECK-LABEL: @add_v2f64_u1( -; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[B]], [[SHIFT]] -; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: ret <2 x double> [[RESULT]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <2 x i32> +; CHECK-NEXT: [[RESULT1:%.*]] = fadd <2 x double> [[TMP1]], [[B]] +; CHECK-NEXT: ret <2 x double> [[RESULT1]] ; %a0 = extractelement <2 x double> %a, i32 0 %a1 = extractelement <2 x double> %a, i32 1 @@ -820,10 +819,9 @@ define <2 x double> @add_v2f64_u1(<2 x double> %a, <2 x double> %b) { define <2 x double> @add_v2f64_0u(<2 x double> %a, <2 x double> %b) { ; 
CHECK-LABEL: @add_v2f64_0u( -; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[A]], [[SHIFT]] -; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: ret <2 x double> [[RESULT]] +; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1:%.*]], <2 x double> poison, <2 x i32> +; CHECK-NEXT: [[RESULT1:%.*]] = fadd <2 x double> [[TMP1]], [[RESULT]] +; CHECK-NEXT: ret <2 x double> [[RESULT1]] ; %a0 = extractelement <2 x double> %a, i32 0 %a1 = extractelement <2 x double> %a, i32 1 @@ -884,9 +882,9 @@ define <4 x double> @add_v4f64_u123(<4 x double> %a, <4 x double> %b) { ; SSE4-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2 ; SSE4-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3 ; SSE4-NEXT: [[B23:%.*]] = fadd double [[B2]], [[B3]] -; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <4 x i32> -; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> -; SSE4-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] +; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <4 x i32> +; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> +; SSE4-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP2]], [[TMP4]] ; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3 ; SSE4-NEXT: ret <4 x double> [[RESULT]] ; @@ -932,10 +930,10 @@ define <4 x double> @add_v4f64_0u23(<4 x double> %a, <4 x double> %b) { ; SSE4-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2 ; SSE4-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3 ; SSE4-NEXT: [[B23:%.*]] = fadd double [[B2]], [[B3]] -; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> poison, <4 x i32> -; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> -; SSE4-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] -; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3 +; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1:%.*]], <4 x double> poison, <4 x i32> +; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> +; SSE4-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]] +; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[B23]], i64 3 ; SSE4-NEXT: ret <4 x double> [[RESULT]] ; ; AVX-LABEL: @add_v4f64_0u23( @@ -980,9 +978,9 @@ define <4 x double> @add_v4f64_01u3(<4 x double> %a, <4 x double> %b) { ; SSE4-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2 ; SSE4-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3 ; SSE4-NEXT: [[B23:%.*]] = fadd double [[B2]], [[B3]] -; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <4 x i32> -; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> -; SSE4-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] +; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <4 x i32> +; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; SSE4-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP2]], [[TMP4]] ; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3 ; SSE4-NEXT: ret <4 x double> 
[[RESULT]] ; @@ -1028,9 +1026,9 @@ define <4 x double> @add_v4f64_012u(<4 x double> %a, <4 x double> %b) { ; SSE4-NEXT: [[A2:%.*]] = extractelement <4 x double> [[A:%.*]], i64 2 ; SSE4-NEXT: [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3 ; SSE4-NEXT: [[A23:%.*]] = fadd double [[A2]], [[A3]] -; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B:%.*]], <4 x i32> -; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> -; SSE4-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] +; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B:%.*]], <4 x i32> +; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; SSE4-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP2]], [[TMP4]] ; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[A23]], i64 2 ; SSE4-NEXT: ret <4 x double> [[RESULT]] ; @@ -1069,15 +1067,15 @@ define <4 x double> @add_v4f64_uu23(<4 x double> %a, <4 x double> %b) { ; SSE2-NEXT: ret <4 x double> [[RESULT1]] ; ; SSE4-LABEL: @add_v4f64_uu23( -; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> -; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> -; SSE4-NEXT: [[RESULT1:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] +; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; SSE4-NEXT: [[RESULT1:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]] ; SSE4-NEXT: ret <4 x double> [[RESULT1]] ; ; AVX-LABEL: @add_v4f64_uu23( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> -; AVX-NEXT: [[RESULT1:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; AVX-NEXT: [[RESULT1:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]] ; AVX-NEXT: ret <4 x double> [[RESULT1]] ; %a0 = extractelement <4 x double> %a, i32 0 @@ -1109,15 +1107,15 @@ define <4 x double> @add_v4f64_01uu(<4 x double> %a, <4 x double> %b) { ; SSE2-NEXT: ret <4 x double> [[TMP4]] ; ; SSE4-LABEL: @add_v4f64_01uu( -; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> -; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> -; SSE4-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] +; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; SSE4-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP2]], [[TMP4]] ; SSE4-NEXT: ret <4 x double> [[TMP3]] ; ; AVX-LABEL: @add_v4f64_01uu( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> -; AVX-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; AVX-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] 
= fadd <4 x double> [[TMP2]], [[TMP4]] ; AVX-NEXT: ret <4 x double> [[TMP3]] ; %a0 = extractelement <4 x double> %a, i32 0 diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll index bcb316a4a73ea..c9cba0a4cc0ff 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll @@ -394,13 +394,13 @@ define <8 x i32> @sub_v8i32_01234u67(<8 x i32> %a, <8 x i32> %b) { ; SSE4-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 4 ; SSE4-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5 ; SSE4-NEXT: [[A45:%.*]] = sub i32 [[A4]], [[A5]] -; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B:%.*]], <8 x i32> -; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> -; SSE4-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]] +; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B:%.*]], <8 x i32> +; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> +; SSE4-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP2]], [[TMP4]] ; SSE4-NEXT: [[HSUB4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A45]], i64 4 -; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> +; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> ; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> -; SSE4-NEXT: [[TMP6:%.*]] = sub <8 x i32> [[TMP4]], [[TMP5]] +; SSE4-NEXT: [[TMP6:%.*]] = sub <8 x i32> [[TMP7]], [[TMP5]] ; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <8 x i32> [[HSUB4]], <8 x i32> [[TMP6]], <8 x i32> ; SSE4-NEXT: ret <8 x i32> [[RESULT]] ; @@ -605,10 +605,10 @@ define <4 x float> @sub_v4f32_012u(<4 x float> %a, <4 x float> %b) { define <4 x float> @sub_v4f32_uu23(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @sub_v4f32_uu23( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[RESULT1:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <4 x float> [[RESULT1]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1:%.*]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[RESULT2:%.*]] = fsub <4 x float> [[TMP2]], [[RESULT1]] +; CHECK-NEXT: ret <4 x float> [[RESULT2]] ; %a0 = extractelement <4 x float> %a, i32 0 %a1 = extractelement <4 x float> %a, i32 1 @@ -632,10 +632,10 @@ define <4 x float> @sub_v4f32_uu23(<4 x float> %a, <4 x float> %b) { define <4 x float> @sub_v4f32_01uu(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @sub_v4f32_01uu( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <4 x float> [[TMP3]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1:%.*]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]] +; CHECK-NEXT: ret <4 x float> [[TMP4]] ; %a0 = extractelement <4 x float> %a, i32 0 %a1 = extractelement <4 x float> %a, i32 1 @@ -712,9 +712,9 @@ define <8 x float> 
@sub_v8f32_012u4567(<8 x float> %a, <8 x float> %b) { ; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <2 x i32> ; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> ; SSE2-NEXT: [[TMP3:%.*]] = fsub <2 x float> [[TMP1]], [[TMP2]] -; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> -; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> -; SSE2-NEXT: [[TMP6:%.*]] = fsub <8 x float> [[TMP4]], [[TMP5]] +; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> +; SSE2-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> +; SSE2-NEXT: [[TMP6:%.*]] = fsub <8 x float> [[TMP5]], [[TMP8]] ; SSE2-NEXT: [[HSUB5:%.*]] = insertelement <8 x float> [[TMP6]], float [[A67]], i64 5 ; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <8 x i32> ; SSE2-NEXT: [[RESULT:%.*]] = shufflevector <8 x float> [[HSUB5]], <8 x float> [[TMP7]], <8 x i32> @@ -724,13 +724,13 @@ define <8 x float> @sub_v8f32_012u4567(<8 x float> %a, <8 x float> %b) { ; SSE4-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6 ; SSE4-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7 ; SSE4-NEXT: [[A67:%.*]] = fsub float [[A6]], [[A7]] -; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B:%.*]], <8 x i32> -; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> -; SSE4-NEXT: [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]] +; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B:%.*]], <8 x i32> +; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> +; SSE4-NEXT: [[TMP3:%.*]] = fsub <8 x float> [[TMP2]], [[TMP5]] ; SSE4-NEXT: [[HSUB5:%.*]] = insertelement <8 x float> [[TMP3]], float [[A67]], i64 5 ; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> -; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> -; SSE4-NEXT: [[TMP6:%.*]] = fsub <8 x float> [[TMP4]], [[TMP5]] +; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> +; SSE4-NEXT: [[TMP6:%.*]] = fsub <8 x float> [[TMP4]], [[TMP7]] ; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <8 x float> [[HSUB5]], <8 x float> [[TMP6]], <8 x i32> ; SSE4-NEXT: ret <8 x float> [[RESULT]] ; @@ -801,10 +801,9 @@ define <2 x double> @sub_v2f64_01(<2 x double> %a, <2 x double> %b) { define <2 x double> @sub_v2f64_u1(<2 x double> %a, <2 x double> %b) { ; CHECK-LABEL: @sub_v2f64_u1( -; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> [[B]], [[SHIFT]] -; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: ret <2 x double> [[RESULT]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <2 x i32> +; CHECK-NEXT: [[RESULT1:%.*]] = fsub <2 x double> [[TMP1]], [[B]] +; CHECK-NEXT: ret <2 x double> [[RESULT1]] ; %a0 = extractelement <2 x double> %a, i32 0 %a1 = extractelement <2 x double> %a, i32 1 @@ -820,10 +819,9 @@ define <2 x double> @sub_v2f64_u1(<2 x double> %a, <2 x double> %b) { define <2 x double> @sub_v2f64_0u(<2 x double> %a, <2 x double> %b) { ; CHECK-LABEL: @sub_v2f64_0u( -; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[A:%.*]], 
<2 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> [[A]], [[SHIFT]] -; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: ret <2 x double> [[RESULT]] +; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1:%.*]], <2 x double> poison, <2 x i32> +; CHECK-NEXT: [[RESULT1:%.*]] = fsub <2 x double> [[TMP1]], [[RESULT]] +; CHECK-NEXT: ret <2 x double> [[RESULT1]] ; %a0 = extractelement <2 x double> %a, i32 0 %a1 = extractelement <2 x double> %a, i32 1 @@ -884,9 +882,9 @@ define <4 x double> @sub_v4f64_u123(<4 x double> %a, <4 x double> %b) { ; SSE4-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2 ; SSE4-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3 ; SSE4-NEXT: [[B23:%.*]] = fsub double [[B2]], [[B3]] -; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <4 x i32> -; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> -; SSE4-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]] +; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <4 x i32> +; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> +; SSE4-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP2]], [[TMP4]] ; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3 ; SSE4-NEXT: ret <4 x double> [[RESULT]] ; @@ -932,14 +930,13 @@ define <4 x double> @sub_v4f64_0u23(<4 x double> %a, <4 x double> %b) { ; SSE4-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2 ; SSE4-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3 ; SSE4-NEXT: [[B23:%.*]] = fsub double [[B2]], [[B3]] -; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> poison, <4 x i32> -; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> -; SSE4-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]] -; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3 +; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP1:%.*]], <4 x double> poison, <4 x i32> +; SSE4-NEXT: [[TMP2:%.*]] = fsub <4 x double> [[TMP1]], [[TMP3]] +; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP2]], double [[B23]], i64 3 ; SSE4-NEXT: ret <4 x double> [[RESULT]] ; ; AVX-LABEL: @sub_v4f64_0u23( -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> ; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> ; AVX-NEXT: [[TMP4:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]] ; AVX-NEXT: ret <4 x double> [[TMP4]] @@ -980,9 +977,9 @@ define <4 x double> @sub_v4f64_01u3(<4 x double> %a, <4 x double> %b) { ; SSE4-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2 ; SSE4-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3 ; SSE4-NEXT: [[B23:%.*]] = fsub double [[B2]], [[B3]] -; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <4 x i32> -; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> -; SSE4-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]] +; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <4 x i32> +; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], 
<4 x i32> +; SSE4-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP2]], [[TMP4]] ; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3 ; SSE4-NEXT: ret <4 x double> [[RESULT]] ; @@ -1028,9 +1025,9 @@ define <4 x double> @sub_v4f64_012u(<4 x double> %a, <4 x double> %b) { ; SSE4-NEXT: [[A2:%.*]] = extractelement <4 x double> [[A:%.*]], i64 2 ; SSE4-NEXT: [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3 ; SSE4-NEXT: [[A23:%.*]] = fsub double [[A2]], [[A3]] -; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B:%.*]], <4 x i32> -; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> -; SSE4-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]] +; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B:%.*]], <4 x i32> +; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; SSE4-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP2]], [[TMP4]] ; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[A23]], i64 2 ; SSE4-NEXT: ret <4 x double> [[RESULT]] ; @@ -1069,15 +1066,15 @@ define <4 x double> @sub_v4f64_uu23(<4 x double> %a, <4 x double> %b) { ; SSE2-NEXT: ret <4 x double> [[RESULT1]] ; ; SSE4-LABEL: @sub_v4f64_uu23( -; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> -; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> -; SSE4-NEXT: [[RESULT1:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]] +; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; SSE4-NEXT: [[RESULT1:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]] ; SSE4-NEXT: ret <4 x double> [[RESULT1]] ; ; AVX-LABEL: @sub_v4f64_uu23( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> -; AVX-NEXT: [[RESULT1:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]] +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; AVX-NEXT: [[RESULT1:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]] ; AVX-NEXT: ret <4 x double> [[RESULT1]] ; %a0 = extractelement <4 x double> %a, i32 0 @@ -1109,15 +1106,15 @@ define <4 x double> @sub_v4f64_01uu(<4 x double> %a, <4 x double> %b) { ; SSE2-NEXT: ret <4 x double> [[TMP4]] ; ; SSE4-LABEL: @sub_v4f64_01uu( -; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> -; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> -; SSE4-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]] +; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; SSE4-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP2]], [[TMP4]] ; SSE4-NEXT: ret <4 x double> [[TMP3]] ; ; AVX-LABEL: @sub_v4f64_01uu( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> -; AVX-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]] +; AVX-NEXT: 
[[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; AVX-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP2]], [[TMP4]] ; AVX-NEXT: ret <4 x double> [[TMP3]] ; %a0 = extractelement <4 x double> %a, i32 0 diff --git a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll index f4c27794d3930..09875c5e0af40 100644 --- a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll +++ b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll @@ -937,9 +937,8 @@ define <4 x i64> @cast_mismatched_types(<4 x i32> %x) { define <4 x float> @fadd_mismatched_types(<4 x float> %x, <4 x float> %y) { ; CHECK-LABEL: @fadd_mismatched_types( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[EXTSHUF:%.*]] = fadd fast <4 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[EXTSHUF:%.*]] = fadd fast <4 x float> [[TMP1:%.*]], [[TMP2]] ; CHECK-NEXT: ret <4 x float> [[EXTSHUF]] ; %shuf.x = shufflevector <4 x float> %x, <4 x float> poison, <2 x i32> diff --git a/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll b/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll index 8db1990dcbb5d..1dc324bbd63ff 100644 --- a/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll +++ b/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll @@ -70,16 +70,25 @@ define <4 x double> @fadd_v4f64_multiuse_op(<4 x double> %a, <4 x double> %b) { ret <4 x double> %post } -; Negative test - multiple use of inner shuffle +; Negative test - multiple use of inner shuffle (only fold if the moved shuffle is cheaper). 
define <4 x double> @fadd_v4f64_multiuse_shuffle(<4 x double> %a, <4 x double> %b) {
-; CHECK-LABEL: define <4 x double> @fadd_v4f64_multiuse_shuffle(
-; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[A1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32>
-; CHECK-NEXT: [[B1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32>
-; CHECK-NEXT: [[OP:%.*]] = fadd <4 x double> [[A1]], [[B1]]
-; CHECK-NEXT: [[POST:%.*]] = shufflevector <4 x double> [[OP]], <4 x double> poison, <4 x i32>
-; CHECK-NEXT: call void @use_v4f64(<4 x double> [[A1]])
-; CHECK-NEXT: ret <4 x double> [[POST]]
+; SSE-LABEL: define <4 x double> @fadd_v4f64_multiuse_shuffle(
+; SSE-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] {
+; SSE-NEXT: [[A1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32>
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32>
+; SSE-NEXT: [[POST:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; SSE-NEXT: call void @use_v4f64(<4 x double> [[A1]])
+; SSE-NEXT: ret <4 x double> [[POST]]
+;
+; AVX-LABEL: define <4 x double> @fadd_v4f64_multiuse_shuffle(
+; AVX-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] {
+; AVX-NEXT: [[A1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32>
+; AVX-NEXT: [[B1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32>
+; AVX-NEXT: [[OP:%.*]] = fadd <4 x double> [[A1]], [[B1]]
+; AVX-NEXT: [[POST:%.*]] = shufflevector <4 x double> [[OP]], <4 x double> poison, <4 x i32>
+; AVX-NEXT: call void @use_v4f64(<4 x double> [[A1]])
+; AVX-NEXT: ret <4 x double> [[POST]]
;
%a1 = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32>
%b1 = shufflevector <4 x double> %b, <4 x double> poison, <4 x i32>
@@ -137,6 +146,3 @@ define <4 x i32> @sdiv_v4i32_poison_idx(<4 x i32> %a, <4 x i32> %b) {
%post = shufflevector <4 x i32> %op, <4 x i32> poison, <4 x i32>
ret <4 x i32> %post
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX: {{.*}}
-; SSE: {{.*}}

From 27bc6bdaba1138d611e256e890023eefee677edc Mon Sep 17 00:00:00 2001
From: Sergio Afonso
Date: Tue, 14 Jan 2025 11:08:55 +0000
Subject: [PATCH 399/408] [OMPIRBuilder] Introduce struct to hold default kernel teams/threads (#116050)

This patch introduces the `OpenMPIRBuilder::TargetKernelDefaultAttrs` structure, used to simplify passing default and constant values for the number of teams and threads, and possibly other target kernel-related information in the future. Values passed to `createTarget` are now forwarded to `createTargetInit`, which previously used an unrelated set of default values.
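To make the new interface concrete, here is a minimal sketch of how a frontend can populate the structure and hand it to `createTargetInit`. The field names and the `createTargetInit` overload match the diff below; the concrete bounds and the surrounding `OMPBuilder`/`Builder` objects are illustrative assumptions, not part of the patch:

// Sketch only: fill in the default launch attributes for one target kernel.
// The 256-team / 128-thread bounds are hypothetical example values.
llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs Attrs;
Attrs.IsSPMD = true;       // SPMD rather than generic execution mode
Attrs.MinTeams = 1;
Attrs.MaxTeams = {256};    // < 0 means unset, == 0 means unknown at compile time
Attrs.MinThreads = 1;
Attrs.MaxThreads = {128};  // a single entry unless ompx_bare is used
Builder.restoreIP(OMPBuilder.createTargetInit(Builder, Attrs));

The same structure is what `createTarget` now threads through `emitTargetOutlinedFunction` into `createTargetInit`, replacing the separate `NumTeams`/`NumThreads` arrays that `createTarget` previously took.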
--- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 13 ++-- clang/lib/CodeGen/CGOpenMPRuntime.h | 9 +-- clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 10 +-- .../llvm/Frontend/OpenMP/OMPIRBuilder.h | 41 ++++++---- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 75 +++++++++++-------- .../Frontend/OpenMPIRBuilderTest.cpp | 22 ++++-- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 12 +-- .../LLVMIR/omptarget-region-device-llvm.mlir | 2 +- 8 files changed, 106 insertions(+), 78 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 1868b57cea903..244e3066f8fe4 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -5880,10 +5880,13 @@ void CGOpenMPRuntime::emitUsesAllocatorsFini(CodeGenFunction &CGF, void CGOpenMPRuntime::computeMinAndMaxThreadsAndTeams( const OMPExecutableDirective &D, CodeGenFunction &CGF, - int32_t &MinThreadsVal, int32_t &MaxThreadsVal, int32_t &MinTeamsVal, - int32_t &MaxTeamsVal) { + llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs) { + assert(Attrs.MaxTeams.size() == 1 && Attrs.MaxThreads.size() == 1 && + "invalid default attrs structure"); + int32_t &MaxTeamsVal = Attrs.MaxTeams.front(); + int32_t &MaxThreadsVal = Attrs.MaxThreads.front(); - getNumTeamsExprForTargetDirective(CGF, D, MinTeamsVal, MaxTeamsVal); + getNumTeamsExprForTargetDirective(CGF, D, Attrs.MinTeams, MaxTeamsVal); getNumThreadsExprForTargetDirective(CGF, D, MaxThreadsVal, /*UpperBoundOnly=*/true); @@ -5901,12 +5904,12 @@ void CGOpenMPRuntime::computeMinAndMaxThreadsAndTeams( else continue; - MinThreadsVal = std::max(MinThreadsVal, AttrMinThreadsVal); + Attrs.MinThreads = std::max(Attrs.MinThreads, AttrMinThreadsVal); if (AttrMaxThreadsVal > 0) MaxThreadsVal = MaxThreadsVal > 0 ? std::min(MaxThreadsVal, AttrMaxThreadsVal) : AttrMaxThreadsVal; - MinTeamsVal = std::max(MinTeamsVal, AttrMinBlocksVal); + Attrs.MinTeams = std::max(Attrs.MinTeams, AttrMinBlocksVal); if (AttrMaxBlocksVal > 0) MaxTeamsVal = MaxTeamsVal > 0 ? std::min(MaxTeamsVal, AttrMaxBlocksVal) : AttrMaxBlocksVal; diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h index 8ab5ee70a19fa..3791bb7159235 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.h +++ b/clang/lib/CodeGen/CGOpenMPRuntime.h @@ -313,12 +313,9 @@ class CGOpenMPRuntime { llvm::OpenMPIRBuilder OMPBuilder; /// Helper to determine the min/max number of threads/teams for \p D. - void computeMinAndMaxThreadsAndTeams(const OMPExecutableDirective &D, - CodeGenFunction &CGF, - int32_t &MinThreadsVal, - int32_t &MaxThreadsVal, - int32_t &MinTeamsVal, - int32_t &MaxTeamsVal); + void computeMinAndMaxThreadsAndTeams( + const OMPExecutableDirective &D, CodeGenFunction &CGF, + llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs); /// Helper to emit outlined function for 'target' directive. /// \param D Directive to emit. 
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index dbb19f2a8d825..81993dafae2b0 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -744,14 +744,12 @@ void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D, void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective &D, CodeGenFunction &CGF, EntryFunctionState &EST, bool IsSPMD) { - int32_t MinThreadsVal = 1, MaxThreadsVal = -1, MinTeamsVal = 1, - MaxTeamsVal = -1; - computeMinAndMaxThreadsAndTeams(D, CGF, MinThreadsVal, MaxThreadsVal, - MinTeamsVal, MaxTeamsVal); + llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs Attrs; + Attrs.IsSPMD = IsSPMD; + computeMinAndMaxThreadsAndTeams(D, CGF, Attrs); CGBuilderTy &Bld = CGF.Builder; - Bld.restoreIP(OMPBuilder.createTargetInit( - Bld, IsSPMD, MinThreadsVal, MaxThreadsVal, MinTeamsVal, MaxTeamsVal)); + Bld.restoreIP(OMPBuilder.createTargetInit(Bld, Attrs)); if (!IsSPMD) emitGenericVarsProlog(CGF, EST.Loc); } diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index 4ce47b1c05d9b..8ca3bc08b5ad4 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -2225,6 +2225,21 @@ class OpenMPIRBuilder { MapNamesArray(MapNamesArray) {} }; + /// Container to pass the default attributes with which a kernel must be + /// launched, used to set kernel attributes and populate associated static + /// structures. + /// + /// For max values, < 0 means unset, == 0 means set but unknown at compile + /// time. The number of max values will be 1 except for the case where + /// ompx_bare is set. + struct TargetKernelDefaultAttrs { + bool IsSPMD = false; + SmallVector MaxTeams = {-1}; + int32_t MinTeams = 1; + SmallVector MaxThreads = {-1}; + int32_t MinThreads = 1; + }; + /// Data structure that contains the needed information to construct the /// kernel args vector. struct TargetKernelArgs { @@ -2727,16 +2742,11 @@ class OpenMPIRBuilder { /// Create a runtime call for kmpc_target_init /// /// \param Loc The insert and source location description. - /// \param IsSPMD Flag to indicate if the kernel is an SPMD kernel or not. - /// \param MinThreads Minimal number of threads, or 0. - /// \param MaxThreads Maximal number of threads, or 0. - /// \param MinTeams Minimal number of teams, or 0. - /// \param MaxTeams Maximal number of teams, or 0. - InsertPointTy createTargetInit(const LocationDescription &Loc, bool IsSPMD, - int32_t MinThreadsVal = 0, - int32_t MaxThreadsVal = 0, - int32_t MinTeamsVal = 0, - int32_t MaxTeamsVal = 0); + /// \param Attrs Structure containing the default attributes, including + /// numbers of threads and teams to launch the kernel with. + InsertPointTy createTargetInit( + const LocationDescription &Loc, + const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs); /// Create a runtime call for kmpc_target_deinit /// @@ -2961,8 +2971,8 @@ class OpenMPIRBuilder { /// \param CodeGenIP The insertion point where the call to the outlined /// function should be emitted. /// \param EntryInfo The entry information about the function. - /// \param NumTeams Number of teams specified in the num_teams clause. - /// \param NumThreads Number of teams specified in the thread_limit clause. + /// \param DefaultAttrs Structure containing the default numbers of threads + /// and teams to launch the kernel with. 
/// \param Inputs The input values to the region that will be passed. /// as arguments to the outlined function. /// \param BodyGenCB Callback that will generate the region code. @@ -2975,9 +2985,10 @@ class OpenMPIRBuilder { const LocationDescription &Loc, bool IsOffloadEntry, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP, - TargetRegionEntryInfo &EntryInfo, ArrayRef NumTeams, - ArrayRef NumThreads, SmallVectorImpl &Inputs, - GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, + TargetRegionEntryInfo &EntryInfo, + const TargetKernelDefaultAttrs &DefaultAttrs, + SmallVectorImpl &Inputs, GenMapInfoCallbackTy GenMapInfoCB, + TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, SmallVector Dependencies = {}, bool HasNowait = false); diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 2b57a8dce3de5..df9b35ddd80ca 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -6119,10 +6119,12 @@ CallInst *OpenMPIRBuilder::createCachedThreadPrivate( return Builder.CreateCall(Fn, Args); } -OpenMPIRBuilder::InsertPointTy -OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD, - int32_t MinThreadsVal, int32_t MaxThreadsVal, - int32_t MinTeamsVal, int32_t MaxTeamsVal) { +OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetInit( + const LocationDescription &Loc, + const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs) { + assert(!Attrs.MaxThreads.empty() && !Attrs.MaxTeams.empty() && + "expected num_threads and num_teams to be specified"); + if (!updateToLocation(Loc)) return Loc.IP; @@ -6130,8 +6132,9 @@ OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD, Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); Constant *IsSPMDVal = ConstantInt::getSigned( - Int8, IsSPMD ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC); - Constant *UseGenericStateMachineVal = ConstantInt::getSigned(Int8, !IsSPMD); + Int8, Attrs.IsSPMD ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC); + Constant *UseGenericStateMachineVal = + ConstantInt::getSigned(Int8, !Attrs.IsSPMD); Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true); Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0); @@ -6149,21 +6152,23 @@ OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD, // Manifest the launch configuration in the metadata matching the kernel // environment. - if (MinTeamsVal > 1 || MaxTeamsVal > 0) - writeTeamsForKernel(T, *Kernel, MinTeamsVal, MaxTeamsVal); + if (Attrs.MinTeams > 1 || Attrs.MaxTeams.front() > 0) + writeTeamsForKernel(T, *Kernel, Attrs.MinTeams, Attrs.MaxTeams.front()); - // For max values, < 0 means unset, == 0 means set but unknown. + // If MaxThreads not set, select the maximum between the default workgroup + // size and the MinThreads value. 
+ int32_t MaxThreadsVal = Attrs.MaxThreads.front(); if (MaxThreadsVal < 0) MaxThreadsVal = std::max( - int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), MinThreadsVal); + int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), Attrs.MinThreads); if (MaxThreadsVal > 0) - writeThreadBoundsForKernel(T, *Kernel, MinThreadsVal, MaxThreadsVal); + writeThreadBoundsForKernel(T, *Kernel, Attrs.MinThreads, MaxThreadsVal); - Constant *MinThreads = ConstantInt::getSigned(Int32, MinThreadsVal); + Constant *MinThreads = ConstantInt::getSigned(Int32, Attrs.MinThreads); Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal); - Constant *MinTeams = ConstantInt::getSigned(Int32, MinTeamsVal); - Constant *MaxTeams = ConstantInt::getSigned(Int32, MaxTeamsVal); + Constant *MinTeams = ConstantInt::getSigned(Int32, Attrs.MinTeams); + Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front()); Constant *ReductionDataSize = ConstantInt::getSigned(Int32, 0); Constant *ReductionBufferLength = ConstantInt::getSigned(Int32, 0); @@ -6730,8 +6735,9 @@ FunctionCallee OpenMPIRBuilder::createDispatchDeinitFunction() { } static Expected createOutlinedFunction( - OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, StringRef FuncName, - SmallVectorImpl &Inputs, + OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, + const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, + StringRef FuncName, SmallVectorImpl &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) { SmallVector ParameterTypes; @@ -6798,7 +6804,7 @@ static Expected createOutlinedFunction( // Insert target init call in the device compilation pass. if (OMPBuilder.Config.isTargetDevice()) - Builder.restoreIP(OMPBuilder.createTargetInit(Builder, /*IsSPMD*/ false)); + Builder.restoreIP(OMPBuilder.createTargetInit(Builder, DefaultAttrs)); BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock(); @@ -6997,16 +7003,18 @@ static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, static Error emitTargetOutlinedFunction( OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, - TargetRegionEntryInfo &EntryInfo, Function *&OutlinedFn, - Constant *&OutlinedFnID, SmallVectorImpl &Inputs, + TargetRegionEntryInfo &EntryInfo, + const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, + Function *&OutlinedFn, Constant *&OutlinedFnID, + SmallVectorImpl &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) { OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction = - [&OMPBuilder, &Builder, &Inputs, &CBFunc, - &ArgAccessorFuncCB](StringRef EntryFnName) { - return createOutlinedFunction(OMPBuilder, Builder, EntryFnName, Inputs, - CBFunc, ArgAccessorFuncCB); + [&](StringRef EntryFnName) { + return createOutlinedFunction(OMPBuilder, Builder, DefaultAttrs, + EntryFnName, Inputs, CBFunc, + ArgAccessorFuncCB); }; return OMPBuilder.emitTargetRegionFunction( @@ -7302,9 +7310,10 @@ void OpenMPIRBuilder::emitOffloadingArraysAndArgs( static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, - OpenMPIRBuilder::InsertPointTy AllocaIP, Function *OutlinedFn, - Constant *OutlinedFnID, ArrayRef NumTeams, - ArrayRef NumThreads, SmallVectorImpl &Args, + OpenMPIRBuilder::InsertPointTy AllocaIP, + const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, + Function *OutlinedFn, Constant *OutlinedFnID, + SmallVectorImpl &Args, 
OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, SmallVector Dependencies = {}, bool HasNoWait = false) { @@ -7385,9 +7394,9 @@ emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, SmallVector NumTeamsC; SmallVector NumThreadsC; - for (auto V : NumTeams) + for (auto V : DefaultAttrs.MaxTeams) NumTeamsC.push_back(llvm::ConstantInt::get(Builder.getInt32Ty(), V)); - for (auto V : NumThreads) + for (auto V : DefaultAttrs.MaxThreads) NumThreadsC.push_back(llvm::ConstantInt::get(Builder.getInt32Ty(), V)); unsigned NumTargetItems = Info.NumberOfPtrs; @@ -7428,7 +7437,7 @@ emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget( const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, - ArrayRef NumTeams, ArrayRef NumThreads, + const TargetKernelDefaultAttrs &DefaultAttrs, SmallVectorImpl &Args, GenMapInfoCallbackTy GenMapInfoCB, OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, @@ -7445,16 +7454,16 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget( // the target region itself is generated using the callbacks CBFunc // and ArgAccessorFuncCB if (Error Err = emitTargetOutlinedFunction( - *this, Builder, IsOffloadEntry, EntryInfo, OutlinedFn, OutlinedFnID, - Args, CBFunc, ArgAccessorFuncCB)) + *this, Builder, IsOffloadEntry, EntryInfo, DefaultAttrs, OutlinedFn, + OutlinedFnID, Args, CBFunc, ArgAccessorFuncCB)) return Err; // If we are not on the target device, then we need to generate code // to make a remote call (offload) to the previously outlined function // that represents the target region. Do that now. 
if (!Config.isTargetDevice()) - emitTargetCall(*this, Builder, AllocaIP, OutlinedFn, OutlinedFnID, NumTeams, - NumThreads, Args, GenMapInfoCB, Dependencies, HasNowait); + emitTargetCall(*this, Builder, AllocaIP, DefaultAttrs, OutlinedFn, + OutlinedFnID, Args, GenMapInfoCB, Dependencies, HasNowait); return Builder.saveIP(); } diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp index cdca725b14743..04ecd7ef327d5 100644 --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -6229,10 +6229,14 @@ TEST_F(OpenMPIRBuilderTest, TargetRegion) { TargetRegionEntryInfo EntryInfo("func", 42, 4711, 17); OpenMPIRBuilder::LocationDescription OmpLoc({Builder.saveIP(), DL}); + OpenMPIRBuilder::TargetKernelDefaultAttrs DefaultAttrs = { + /*IsSPMD=*/false, /*MaxTeams=*/{-1}, /*MinTeams=*/0, /*MaxThreads=*/{0}, + /*MinThreads=*/0}; + ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTarget(OmpLoc, /*IsOffloadEntry=*/true, Builder.saveIP(), - Builder.saveIP(), EntryInfo, -1, 0, Inputs, + Builder.saveIP(), EntryInfo, DefaultAttrs, Inputs, GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB)); Builder.restoreIP(AfterIP); OMPBuilder.finalize(); @@ -6339,13 +6343,15 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) { F->getEntryBlock().getFirstInsertionPt()); TargetRegionEntryInfo EntryInfo("parent", /*DeviceID=*/1, /*FileID=*/2, /*Line=*/3, /*Count=*/0); + OpenMPIRBuilder::TargetKernelDefaultAttrs DefaultAttrs = { + /*IsSPMD=*/false, /*MaxTeams=*/{-1}, /*MinTeams=*/0, /*MaxThreads=*/{0}, + /*MinThreads=*/0}; ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTarget(Loc, /*IsOffloadEntry=*/true, EntryIP, EntryIP, - EntryInfo, /*NumTeams=*/-1, - /*NumThreads=*/0, CapturedArgs, GenMapInfoCB, - BodyGenCB, SimpleArgAccessorCB)); + EntryInfo, DefaultAttrs, CapturedArgs, + GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB)); Builder.restoreIP(AfterIP); Builder.CreateRetVoid(); @@ -6496,13 +6502,15 @@ TEST_F(OpenMPIRBuilderTest, ConstantAllocaRaise) { F->getEntryBlock().getFirstInsertionPt()); TargetRegionEntryInfo EntryInfo("parent", /*DeviceID=*/1, /*FileID=*/2, /*Line=*/3, /*Count=*/0); + OpenMPIRBuilder::TargetKernelDefaultAttrs DefaultAttrs = { + /*IsSPMD=*/false, /*MaxTeams=*/{-1}, /*MinTeams=*/0, /*MaxThreads=*/{0}, + /*MinThreads=*/0}; ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTarget(Loc, /*IsOffloadEntry=*/true, EntryIP, EntryIP, - EntryInfo, /*NumTeams=*/-1, - /*NumThreads=*/0, CapturedArgs, GenMapInfoCB, - BodyGenCB, SimpleArgAccessorCB)); + EntryInfo, DefaultAttrs, CapturedArgs, + GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB)); Builder.restoreIP(AfterIP); Builder.CreateRetVoid(); diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 35a3750e02a66..25b0ffe2ced6d 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -4084,9 +4084,6 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, if (!getTargetEntryUniqueInfo(entryInfo, targetOp, parentName)) return failure(); - int32_t defaultValTeams = -1; - int32_t defaultValThreads = 0; - MapInfoData mapData; collectMapDataFromMapOperands(mapData, mapVars, moduleTranslation, dl, builder); @@ -4118,6 +4115,11 @@ convertOmpTarget(Operation &opInst, 
llvm::IRBuilderBase &builder,
                                 allocaIP, codeGenIP);
   };
 
+  // TODO: Populate default attributes based on the construct and clauses.
+  llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs defaultAttrs = {
+      /*IsSPMD=*/false, /*MaxTeams=*/{-1}, /*MinTeams=*/0, /*MaxThreads=*/{0},
+      /*MinThreads=*/0};
+
   llvm::SmallVector<llvm::Value *> kernelInput;
   for (size_t i = 0; i < mapVars.size(); ++i) {
     // declare target arguments are not passed to kernels as arguments
@@ -4141,8 +4143,8 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
   llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP =
       moduleTranslation.getOpenMPBuilder()->createTarget(
           ompLoc, isOffloadEntry, allocaIP, builder.saveIP(), entryInfo,
-          defaultValTeams, defaultValThreads, kernelInput, genMapInfoCB, bodyCB,
-          argAccessorCB, dds, targetOp.getNowait());
+          defaultAttrs, kernelInput, genMapInfoCB, bodyCB, argAccessorCB, dds,
+          targetOp.getNowait());
 
   if (failed(handleError(afterIP, opInst)))
     return failure();
diff --git a/mlir/test/Target/LLVMIR/omptarget-region-device-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-region-device-llvm.mlir
index 8993c0e85c5de..fa32a3030108d 100644
--- a/mlir/test/Target/LLVMIR/omptarget-region-device-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-region-device-llvm.mlir
@@ -29,7 +29,7 @@ module attributes {omp.is_target_device = true} {
 // CHECK: @[[SRC_LOC:.*]] = private unnamed_addr constant [23 x i8] c"{{[^"]*}}", align 1
 // CHECK: @[[IDENT:.*]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[SRC_LOC]] }, align 8
 // CHECK: @[[DYNA_ENV:.*]] = weak_odr protected global %struct.DynamicEnvironmentTy zeroinitializer
-// CHECK: @[[KERNEL_ENV:.*]] = weak_odr protected constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[IDENT]], ptr @[[DYNA_ENV]] }
+// CHECK: @[[KERNEL_ENV:.*]] = weak_odr protected constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 0, i32 0, i32 0, i32 -1, i32 0, i32 0 }, ptr @[[IDENT]], ptr @[[DYNA_ENV]] }
 // CHECK: define weak_odr protected void @__omp_offloading_{{[^_]+}}_{{[^_]+}}_omp_target_region__l{{[0-9]+}}(ptr %[[DYN_PTR:.*]], ptr %[[ADDR_A:.*]], ptr %[[ADDR_B:.*]], ptr %[[ADDR_C:.*]])
 // CHECK: %[[TMP_A:.*]] = alloca ptr, align 8
 // CHECK: store ptr %[[ADDR_A]], ptr %[[TMP_A]], align 8

From ce7c8815a1b1220905d46a6daf377b03819fd1ce Mon Sep 17 00:00:00 2001
From: David Green
Date: Tue, 14 Jan 2025 11:14:45 +0000
Subject: [PATCH 400/408] [AArch64] Add mayStore to more store instructions

As in #121565 we need to mark all stores as mayStore; hasSideEffects is
not enough to prevent moving loads past the instructions. And marking
the instructions as mayStore is a sensible thing to do on its own.
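
As a rough illustration of the distinction (a hedged sketch, not code from
this patch: canHoistLoadAbove is a hypothetical helper, while mayLoad() and
mayStore() are the real MachineInstr predicates the reordering logic keys
on):

  #include "llvm/CodeGen/MachineInstr.h"
  #include <cassert>

  // Sketch of the kind of legality query affected: whether a load may be
  // moved above another instruction depends on mayStore(). An instruction
  // that only sets hasSideEffects still reports mayStore() == false, so
  // the load could be hoisted across a store-like instruction such as
  // stgm or st64b unless the instruction is also marked mayStore.
  static bool canHoistLoadAbove(const llvm::MachineInstr &Load,
                                const llvm::MachineInstr &MI) {
    assert(Load.mayLoad() && "expected a load");
    return !MI.mayStore(); // a potential memory write blocks the hoist
  }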
--- llvm/lib/Target/AArch64/AArch64InstrFormats.td | 2 ++ llvm/lib/Target/AArch64/AArch64InstrInfo.td | 5 +++++ .../AArch64/Ampere/Ampere1B/mte-instructions.s | 12 ++++++------ .../llvm-mca/AArch64/Neoverse/N2-mte-instructions.s | 12 ++++++------ .../llvm-mca/AArch64/Neoverse/N3-mte-instructions.s | 12 ++++++------ 5 files changed, 25 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index f527f7e4eafbc..1ff8b77f88e27 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -5028,6 +5028,7 @@ class BaseStoreUnprivilegedLSUI sz, dag oops, dag iops, string asm> let Inst{9-5} = Rn; let Inst{4-0} = Rt; let PostEncoderMethod = "fixLoadStoreExclusive<1,0>"; + let mayStore = 1; } multiclass StoreUnprivilegedLSUI sz, RegisterClass regtype, string asm> { @@ -12532,6 +12533,7 @@ class Store64BV opc, string asm_inst, list pat = []> (ins GPR64x8:$Rt, GPR64sp:$Rn), (outs GPR64:$Rs), pat> { bits<5> Rs; let Inst{20-16} = Rs; + let mayStore = 1; } class MOPSMemoryCopyMoveBase opcode, bits<2> op1, diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 948701f897855..c994ffd571fd4 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -2799,14 +2799,17 @@ def : Pat<(int_aarch64_ldg GPR64:$Rt, (am_indexeds9s128 GPR64sp:$Rn, simm9s16:$ def : InstAlias<"ldg $Rt, [$Rn]", (LDG GPR64:$Rt, GPR64sp:$Rn, 0), 1>; +let mayLoad = 1 in def LDGM : MemTagVector<1, "ldgm", "\t$Rt, [$Rn]", (outs GPR64:$Rt), (ins GPR64sp:$Rn)>; +let mayStore = 1 in { def STGM : MemTagVector<0, "stgm", "\t$Rt, [$Rn]", (outs), (ins GPR64:$Rt, GPR64sp:$Rn)>; def STZGM : MemTagVector<0, "stzgm", "\t$Rt, [$Rn]", (outs), (ins GPR64:$Rt, GPR64sp:$Rn)> { let Inst{23} = 0; } +} // mayStore = 1 defm STG : MemTagStore<0b00, "stg">; defm STZG : MemTagStore<0b01, "stzg">; @@ -10018,8 +10021,10 @@ foreach i = 0-7 in { } let Predicates = [HasLS64] in { + let mayLoad = 1 in def LD64B: LoadStore64B<0b101, "ld64b", (ins GPR64sp:$Rn), (outs GPR64x8:$Rt)>; + let mayStore = 1 in def ST64B: LoadStore64B<0b001, "st64b", (ins GPR64x8:$Rt, GPR64sp:$Rn), (outs)>; def ST64BV: Store64BV<0b011, "st64bv">; diff --git a/llvm/test/tools/llvm-mca/AArch64/Ampere/Ampere1B/mte-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Ampere/Ampere1B/mte-instructions.s index 5148522431edb..f78a9988aa0b8 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Ampere/Ampere1B/mte-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Ampere/Ampere1B/mte-instructions.s @@ -215,12 +215,12 @@ stzgm xzr, [x2] # CHECK-NEXT: 2 4 0.50 * U ldgm x0, [x1] # CHECK-NEXT: 2 4 0.50 * U ldgm x1, [sp] # CHECK-NEXT: 2 4 0.50 * U ldgm xzr, [x2] -# CHECK-NEXT: 1 1 0.50 U stgm x0, [x1] -# CHECK-NEXT: 1 1 0.50 U stgm x1, [sp] -# CHECK-NEXT: 1 1 0.50 U stgm xzr, [x2] -# CHECK-NEXT: 1 1 0.50 U stzgm x0, [x1] -# CHECK-NEXT: 1 1 0.50 U stzgm x1, [sp] -# CHECK-NEXT: 1 1 0.50 U stzgm xzr, [x2] +# CHECK-NEXT: 1 1 0.50 * U stgm x0, [x1] +# CHECK-NEXT: 1 1 0.50 * U stgm x1, [sp] +# CHECK-NEXT: 1 1 0.50 * U stgm xzr, [x2] +# CHECK-NEXT: 1 1 0.50 * U stzgm x0, [x1] +# CHECK-NEXT: 1 1 0.50 * U stzgm x1, [sp] +# CHECK-NEXT: 1 1 0.50 * U stzgm xzr, [x2] # CHECK: Resources: # CHECK-NEXT: [0.0] - Ampere1BUnitA diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-mte-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-mte-instructions.s index 
05b931e1444bb..c497eec223427 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-mte-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-mte-instructions.s @@ -215,12 +215,12 @@ stzgm xzr, [x2] # CHECK-NEXT: 1 4 0.33 * U ldgm x0, [x1] # CHECK-NEXT: 1 4 0.33 * U ldgm x1, [sp] # CHECK-NEXT: 1 4 0.33 * U ldgm xzr, [x2] -# CHECK-NEXT: 2 1 0.50 U stgm x0, [x1] -# CHECK-NEXT: 2 1 0.50 U stgm x1, [sp] -# CHECK-NEXT: 2 1 0.50 U stgm xzr, [x2] -# CHECK-NEXT: 2 1 0.50 U stzgm x0, [x1] -# CHECK-NEXT: 2 1 0.50 U stzgm x1, [sp] -# CHECK-NEXT: 2 1 0.50 U stzgm xzr, [x2] +# CHECK-NEXT: 2 1 0.50 * U stgm x0, [x1] +# CHECK-NEXT: 2 1 0.50 * U stgm x1, [sp] +# CHECK-NEXT: 2 1 0.50 * U stgm xzr, [x2] +# CHECK-NEXT: 2 1 0.50 * U stzgm x0, [x1] +# CHECK-NEXT: 2 1 0.50 * U stzgm x1, [sp] +# CHECK-NEXT: 2 1 0.50 * U stzgm xzr, [x2] # CHECK: Resources: # CHECK-NEXT: [0.0] - N2UnitB diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-mte-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-mte-instructions.s index 020d0429bd264..132dd8eb220d4 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-mte-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-mte-instructions.s @@ -215,12 +215,12 @@ stzgm xzr, [x2] # CHECK-NEXT: 1 4 0.33 * U ldgm x0, [x1] # CHECK-NEXT: 1 4 0.33 * U ldgm x1, [sp] # CHECK-NEXT: 1 4 0.33 * U ldgm xzr, [x2] -# CHECK-NEXT: 2 1 0.50 U stgm x0, [x1] -# CHECK-NEXT: 2 1 0.50 U stgm x1, [sp] -# CHECK-NEXT: 2 1 0.50 U stgm xzr, [x2] -# CHECK-NEXT: 2 1 0.50 U stzgm x0, [x1] -# CHECK-NEXT: 2 1 0.50 U stzgm x1, [sp] -# CHECK-NEXT: 2 1 0.50 U stzgm xzr, [x2] +# CHECK-NEXT: 2 1 0.50 * U stgm x0, [x1] +# CHECK-NEXT: 2 1 0.50 * U stgm x1, [sp] +# CHECK-NEXT: 2 1 0.50 * U stgm xzr, [x2] +# CHECK-NEXT: 2 1 0.50 * U stzgm x0, [x1] +# CHECK-NEXT: 2 1 0.50 * U stzgm x1, [sp] +# CHECK-NEXT: 2 1 0.50 * U stzgm xzr, [x2] # CHECK: Resources: # CHECK-NEXT: [0.0] - N3UnitB From 0fe8469e08cfe5bbd4cd7ee42a8b931560ca041c Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Tue, 14 Jan 2025 11:27:55 +0000 Subject: [PATCH 401/408] SLPVectorizer: strip bad FIXME (NFC) (#122888) Follow up on 4a0d53a (PatternMatch: migrate to CmpPredicate) to get rid of the FIXME it introduced in SLPVectorizer: the FIXME is bad, and we'd get no testable impact by using CmpPredicate::getMatching here. --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 2742c3777c1ed..b0b8f8249d657 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -11529,7 +11529,6 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, ? CmpInst::BAD_FCMP_PREDICATE : CmpInst::BAD_ICMP_PREDICATE; auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value()); - // FIXME: Use CmpPredicate::getMatching here. if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) && !match(VI, MatchCmp)) || (CurrentPred != static_cast(VecPred) && From b87fdd9ce612d53b0e0b73d7c062b39a042e8629 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 14 Jan 2025 13:43:10 +0200 Subject: [PATCH 402/408] [libcxx] Reindent a section of a CMake file. NFC. (#122800) This was missed in 43ba97e7079525a9686e15a6963508dfbd493f81 (#111821) when reindenting after 917ada35cd937ad4104dff89c48398bd796ba6b7 (#80007). 
--- libcxx/src/CMakeLists.txt | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt index 41ab8cad5b7da..4e9bf900af4c5 100644 --- a/libcxx/src/CMakeLists.txt +++ b/libcxx/src/CMakeLists.txt @@ -248,18 +248,18 @@ if (LIBCXX_ENABLE_SHARED) list(APPEND LIBCXX_BUILD_TARGETS "cxx_shared") endif() - if(WIN32 AND NOT MINGW AND NOT "${CMAKE_HOST_SYSTEM_NAME}" STREQUAL "Windows") - # Since we most likely do not have a mt.exe replacement, disable the - # manifest bundling. This allows a normal cmake invocation to pass which - # will attempt to use the manifest tool to generate the bundled manifest - if (${CMAKE_CXX_COMPILER_FRONTEND_VARIANT} STREQUAL "MSVC") - set_target_properties(cxx_shared PROPERTIES - APPEND_STRING PROPERTY LINK_FLAGS " /MANIFEST:NO") - else() - set_target_properties(cxx_shared PROPERTIES - APPEND_STRING PROPERTY LINK_FLAGS " -Xlinker /MANIFEST:NO") - endif() +if(WIN32 AND NOT MINGW AND NOT "${CMAKE_HOST_SYSTEM_NAME}" STREQUAL "Windows") + # Since we most likely do not have a mt.exe replacement, disable the + # manifest bundling. This allows a normal cmake invocation to pass which + # will attempt to use the manifest tool to generate the bundled manifest + if (${CMAKE_CXX_COMPILER_FRONTEND_VARIANT} STREQUAL "MSVC") + set_target_properties(cxx_shared PROPERTIES + APPEND_STRING PROPERTY LINK_FLAGS " /MANIFEST:NO") + else() + set_target_properties(cxx_shared PROPERTIES + APPEND_STRING PROPERTY LINK_FLAGS " -Xlinker /MANIFEST:NO") endif() +endif() set(CMAKE_STATIC_LIBRARY_PREFIX "lib") From e87f94a6a806a941242506680f88573d6a87a828 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 14 Jan 2025 11:59:41 +0000 Subject: [PATCH 403/408] [llvm-project] Fix typos mutli and mutliple. NFC. 
(#122880) --- .../clang-tidy/modernize/UseAutoCheck.cpp | 4 ++-- clang/lib/Basic/SourceManager.cpp | 2 +- flang/test/HLFIR/associate-codegen.fir | 2 +- libc/test/src/unistd/getopt_test.cpp | 2 +- lldb/source/Commands/CommandObjectMemory.cpp | 2 +- lldb/source/Target/StructuredDataPlugin.cpp | 2 +- lldb/unittests/Target/RegisterFlagsTest.cpp | 2 +- llvm/include/llvm/IR/DebugInfoMetadata.h | 2 +- llvm/lib/Target/X86/X86LowerAMXType.cpp | 2 +- llvm/test/CodeGen/AArch64/eon.ll | 2 +- llvm/test/DebugInfo/X86/multiple-at-const-val.ll | 2 +- llvm/test/Transforms/EarlyCSE/guards.ll | 2 +- .../InstCombine/matrix-multiplication-negation.ll | 12 ++++++------ .../RISCV/blend-any-of-reduction-cost.ll | 4 ++-- .../one-shot-bufferize-empty-tensor-elimination.mlir | 12 ++++++------ mlir/test/Transforms/mem2reg.mlir | 2 +- mlir/test/Transforms/sroa.mlir | 2 +- 17 files changed, 29 insertions(+), 29 deletions(-) diff --git a/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp index aec67808846b1..7a2d804e173ce 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp @@ -342,7 +342,7 @@ static void ignoreTypeLocClasses( Loc = Loc.getNextTypeLoc(); } -static bool isMutliLevelPointerToTypeLocClasses( +static bool isMultiLevelPointerToTypeLocClasses( TypeLoc Loc, std::initializer_list const &LocClasses) { ignoreTypeLocClasses(Loc, {TypeLoc::Paren, TypeLoc::Qualified}); @@ -424,7 +424,7 @@ void UseAutoCheck::replaceExpr( auto Diag = diag(Range.getBegin(), Message); - bool ShouldReplenishVariableName = isMutliLevelPointerToTypeLocClasses( + bool ShouldReplenishVariableName = isMultiLevelPointerToTypeLocClasses( TSI->getTypeLoc(), {TypeLoc::FunctionProto, TypeLoc::ConstantArray}); // Space after 'auto' to handle cases where the '*' in the pointer type is diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index 44e982d3ee67f..b1f2180c1d462 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -1222,7 +1222,7 @@ unsigned SourceManager::getPresumedColumnNumber(SourceLocation Loc, return PLoc.getColumn(); } -// Check if mutli-byte word x has bytes between m and n, included. This may also +// Check if multi-byte word x has bytes between m and n, included. This may also // catch bytes equal to n + 1. // The returned value holds a 0x80 at each byte position that holds a match. // see http://graphics.stanford.edu/~seander/bithacks.html#HasBetweenInWord diff --git a/flang/test/HLFIR/associate-codegen.fir b/flang/test/HLFIR/associate-codegen.fir index f5e015c4169f6..ad64959984a14 100644 --- a/flang/test/HLFIR/associate-codegen.fir +++ b/flang/test/HLFIR/associate-codegen.fir @@ -372,7 +372,7 @@ func.func @_QPtest_multiple_expr_uses_inside_elemental() { // CHECK: return // CHECK: } -// Verify that we properly recognize mutliple consequent hlfir.associate using +// Verify that we properly recognize multiple consequent hlfir.associate using // the same result of hlfir.elemental. 
func.func @_QPtest_multitple_associates_for_same_expr() { %c1 = arith.constant 1 : index diff --git a/libc/test/src/unistd/getopt_test.cpp b/libc/test/src/unistd/getopt_test.cpp index e6e87720cde48..8217f7bb6e731 100644 --- a/libc/test/src/unistd/getopt_test.cpp +++ b/libc/test/src/unistd/getopt_test.cpp @@ -155,7 +155,7 @@ TEST_F(LlvmLibcGetoptTest, ParseArgInNext) { EXPECT_EQ(test_globals::optind, 3); } -TEST_F(LlvmLibcGetoptTest, ParseMutliInOne) { +TEST_F(LlvmLibcGetoptTest, ParseMultiInOne) { array argv{"prog"_c, "-abc"_c, nullptr}; EXPECT_EQ(LIBC_NAMESPACE::getopt(2, argv.data(), "abc"), (int)'a'); diff --git a/lldb/source/Commands/CommandObjectMemory.cpp b/lldb/source/Commands/CommandObjectMemory.cpp index b5612f21f1156..164c61d172017 100644 --- a/lldb/source/Commands/CommandObjectMemory.cpp +++ b/lldb/source/Commands/CommandObjectMemory.cpp @@ -1737,7 +1737,7 @@ class CommandObjectMemoryRegion : public CommandObjectParsed { // It is important that we track the address used to request the region as // this will give the correct section name in the case that regions overlap. - // On Windows we get mutliple regions that start at the same place but are + // On Windows we get multiple regions that start at the same place but are // different sizes and refer to different sections. std::vector> region_list; diff --git a/lldb/source/Target/StructuredDataPlugin.cpp b/lldb/source/Target/StructuredDataPlugin.cpp index 1b5894b5df4b6..8e3ceb094b361 100644 --- a/lldb/source/Target/StructuredDataPlugin.cpp +++ b/lldb/source/Target/StructuredDataPlugin.cpp @@ -43,7 +43,7 @@ ProcessSP StructuredDataPlugin::GetProcess() const { } void StructuredDataPlugin::InitializeBasePluginForDebugger(Debugger &debugger) { - // Create our mutliword command anchor if it doesn't already exist. + // Create our multiword command anchor if it doesn't already exist. auto &interpreter = debugger.GetCommandInterpreter(); if (!interpreter.GetCommandObject("plugin structured-data")) { // Find the parent command. diff --git a/lldb/unittests/Target/RegisterFlagsTest.cpp b/lldb/unittests/Target/RegisterFlagsTest.cpp index f2c61c4988b03..ecffdd0fe44e6 100644 --- a/lldb/unittests/Target/RegisterFlagsTest.cpp +++ b/lldb/unittests/Target/RegisterFlagsTest.cpp @@ -274,7 +274,7 @@ TEST(RegisterFlagsTest, DumpEnums) { .DumpEnums(5), "A: 0 = an_enumerator"); - // Mutliple values can go on the same line, up to the width. + // Multiple values can go on the same line, up to the width. FieldEnum more_enum("long_enum", {{0, "an_enumerator"}, {1, "another_enumerator"}, diff --git a/llvm/include/llvm/IR/DebugInfoMetadata.h b/llvm/include/llvm/IR/DebugInfoMetadata.h index c770955be76da..5ea8c0d7b448d 100644 --- a/llvm/include/llvm/IR/DebugInfoMetadata.h +++ b/llvm/include/llvm/IR/DebugInfoMetadata.h @@ -2391,7 +2391,7 @@ DILocation::cloneWithDiscriminator(unsigned Discriminator) const { DIScope *Scope = getScope(); // Skip all parent DILexicalBlockFile that already have a discriminator // assigned. We do not want to have nested DILexicalBlockFiles that have - // mutliple discriminators because only the leaf DILexicalBlockFile's + // multiple discriminators because only the leaf DILexicalBlockFile's // dominator will be used. 
for (auto *LBF = dyn_cast(Scope); LBF && LBF->getDiscriminator() != 0; diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp index 53091e7bb4dbd..fe963dddaac1f 100644 --- a/llvm/lib/Target/X86/X86LowerAMXType.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp @@ -498,7 +498,7 @@ bool X86LowerAMXType::visit() { DeadInsts.push_back(Bitcast); continue; } - // If load has mutli-user, duplicate a vector load. + // If load has multi-user, duplicate a vector load. // %src = load <256 x i32>, <256 x i32>* %addr, align 64 // %2 = bitcast <256 x i32> %src to x86_amx // %add = add <256 x i32> %src, <256 x i32> %src2 diff --git a/llvm/test/CodeGen/AArch64/eon.ll b/llvm/test/CodeGen/AArch64/eon.ll index 29c4c8ffd2038..3468a0f63436e 100644 --- a/llvm/test/CodeGen/AArch64/eon.ll +++ b/llvm/test/CodeGen/AArch64/eon.ll @@ -15,7 +15,7 @@ entry: ret i64 %xor } -; Same check with mutliple uses of %neg +; Same check with multiple uses of %neg define i64 @test2(i64 %a, i64 %b, i64 %c) { ; CHECK-LABEL: test2: ; CHECK: eon diff --git a/llvm/test/DebugInfo/X86/multiple-at-const-val.ll b/llvm/test/DebugInfo/X86/multiple-at-const-val.ll index 8a2439d769b1f..ec28c91ea57e0 100644 --- a/llvm/test/DebugInfo/X86/multiple-at-const-val.ll +++ b/llvm/test/DebugInfo/X86/multiple-at-const-val.ll @@ -2,7 +2,7 @@ ; RUN: llvm-dwarfdump -v %t | FileCheck %s ; rdar://13071590 -; Check we are not emitting mutliple AT_const_value for a single member. +; Check we are not emitting multiple AT_const_value for a single member. ; CHECK: .debug_info contents: ; CHECK: DW_TAG_compile_unit ; CHECK: DW_TAG_class_type diff --git a/llvm/test/Transforms/EarlyCSE/guards.ll b/llvm/test/Transforms/EarlyCSE/guards.ll index e837b774e6165..50ac014c8ac68 100644 --- a/llvm/test/Transforms/EarlyCSE/guards.ll +++ b/llvm/test/Transforms/EarlyCSE/guards.ll @@ -139,7 +139,7 @@ right: } define i32 @test5(i32 %val, i1 %c) { -; Same as test4, but the %left block has mutliple predecessors. +; Same as test4, but the %left block has multiple predecessors. 
; CHECK-LABEL: @test5( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[COND0:%.*]] = icmp slt i32 [[VAL:%.*]], 40 diff --git a/llvm/test/Transforms/InstCombine/matrix-multiplication-negation.ll b/llvm/test/Transforms/InstCombine/matrix-multiplication-negation.ll index 74aa014970e97..331de12ddd339 100644 --- a/llvm/test/Transforms/InstCombine/matrix-multiplication-negation.ll +++ b/llvm/test/Transforms/InstCombine/matrix-multiplication-negation.ll @@ -265,8 +265,8 @@ define <12 x double> @fneg_with_multiple_uses_2(<15 x double> %a, <20 x double> ret <12 x double> %res } ; negation should be moved to the second operand given it has the smallest operand count -define <72 x double> @chain_of_matrix_mutliplies(<27 x double> %a, <3 x double> %b, <8 x double> %c) { -; CHECK-LABEL: @chain_of_matrix_mutliplies( +define <72 x double> @chain_of_matrix_multiplies(<27 x double> %a, <3 x double> %b, <8 x double> %c) { +; CHECK-LABEL: @chain_of_matrix_multiplies( ; CHECK-NEXT: [[TMP1:%.*]] = fneg <3 x double> [[B:%.*]] ; CHECK-NEXT: [[RES:%.*]] = tail call <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double> [[A:%.*]], <3 x double> [[TMP1]], i32 9, i32 3, i32 1) ; CHECK-NEXT: [[RES_2:%.*]] = tail call <72 x double> @llvm.matrix.multiply.v72f64.v9f64.v8f64(<9 x double> [[RES]], <8 x double> [[C:%.*]], i32 9, i32 1, i32 8) @@ -280,8 +280,8 @@ define <72 x double> @chain_of_matrix_mutliplies(<27 x double> %a, <3 x double> ; first negation should be moved to %a ; second negation should be moved to the result of the second multipication -define <6 x double> @chain_of_matrix_mutliplies_with_two_negations(<3 x double> %a, <5 x double> %b, <10 x double> %c) { -; CHECK-LABEL: @chain_of_matrix_mutliplies_with_two_negations( +define <6 x double> @chain_of_matrix_multiplies_with_two_negations(<3 x double> %a, <5 x double> %b, <10 x double> %c) { +; CHECK-LABEL: @chain_of_matrix_multiplies_with_two_negations( ; CHECK-NEXT: [[TMP1:%.*]] = fneg <3 x double> [[A:%.*]] ; CHECK-NEXT: [[RES:%.*]] = tail call <15 x double> @llvm.matrix.multiply.v15f64.v3f64.v5f64(<3 x double> [[TMP1]], <5 x double> [[B:%.*]], i32 3, i32 1, i32 5) ; CHECK-NEXT: [[TMP2:%.*]] = call <6 x double> @llvm.matrix.multiply.v6f64.v15f64.v10f64(<15 x double> [[RES]], <10 x double> [[C:%.*]], i32 3, i32 5, i32 2) @@ -296,8 +296,8 @@ define <6 x double> @chain_of_matrix_mutliplies_with_two_negations(<3 x double> } ; negation should be propagated to the result of the second matrix multiplication -define <6 x double> @chain_of_matrix_mutliplies_propagation(<15 x double> %a, <20 x double> %b, <8 x double> %c){ -; CHECK-LABEL: @chain_of_matrix_mutliplies_propagation( +define <6 x double> @chain_of_matrix_multiplies_propagation(<15 x double> %a, <20 x double> %b, <8 x double> %c){ +; CHECK-LABEL: @chain_of_matrix_multiplies_propagation( ; CHECK-NEXT: [[TMP1:%.*]] = call <12 x double> @llvm.matrix.multiply.v12f64.v15f64.v20f64(<15 x double> [[A:%.*]], <20 x double> [[B:%.*]], i32 3, i32 5, i32 4) ; CHECK-NEXT: [[TMP2:%.*]] = call <6 x double> @llvm.matrix.multiply.v6f64.v12f64.v8f64(<12 x double> [[TMP1]], <8 x double> [[C:%.*]], i32 3, i32 4, i32 2) ; CHECK-NEXT: [[RES_2:%.*]] = fneg <6 x double> [[TMP2]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll index 3d00c228baf51..87338e55c16c8 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll +++ 
b/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll @@ -58,8 +58,8 @@ exit: ret i32 %res } -define i32 @any_of_reduction_used_in_blend_with_mutliple_phis(ptr %src, i64 %N, i1 %c.0, i1 %c.1) #0 { -; CHECK-LABEL: define i32 @any_of_reduction_used_in_blend_with_mutliple_phis( +define i32 @any_of_reduction_used_in_blend_with_multiple_phis(ptr %src, i64 %N, i1 %c.0, i1 %c.1) #0 { +; CHECK-LABEL: define i32 @any_of_reduction_used_in_blend_with_multiple_phis( ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]], i1 [[C_0:%.*]], i1 [[C_1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir index 820fb3dfa5e5e..4d91249bbd714 100644 --- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir @@ -418,9 +418,9 @@ func.func @succeed_to_eliminate_one_empty_tensor() -> tensor<5x6x128xf32> { // empty with the new injected `SubsetExtraction`, i.e. the specific use // which has been tracked. -// CHECK-ELIM-LABEL: func.func @mutli_use_of_the_same_tensor_empty -// CHECK-LABEL: func.func @mutli_use_of_the_same_tensor_empty -func.func @mutli_use_of_the_same_tensor_empty() -> tensor<5x6x128xf32> { +// CHECK-ELIM-LABEL: func.func @multi_use_of_the_same_tensor_empty +// CHECK-LABEL: func.func @multi_use_of_the_same_tensor_empty +func.func @multi_use_of_the_same_tensor_empty() -> tensor<5x6x128xf32> { %cst_1 = arith.constant 1.0 : f32 %cst_2 = arith.constant 2.0 : f32 %cancatenated_empty = tensor.empty() : tensor<5x6x128xf32> @@ -441,9 +441,9 @@ func.func @mutli_use_of_the_same_tensor_empty() -> tensor<5x6x128xf32> { // ----- -// CHECK-LABEL: func.func @mutli_use_of_the_same_tensor_empty_creates_non_existent_read -// CHECK-ELIM-LABEL: func.func @mutli_use_of_the_same_tensor_empty_creates_non_existent_read -func.func @mutli_use_of_the_same_tensor_empty_creates_non_existent_read(%arg1: tensor<5x6x128xf32> , %arg2: tensor<5x6x64xf32>) +// CHECK-LABEL: func.func @multi_use_of_the_same_tensor_empty_creates_non_existent_read +// CHECK-ELIM-LABEL: func.func @multi_use_of_the_same_tensor_empty_creates_non_existent_read +func.func @multi_use_of_the_same_tensor_empty_creates_non_existent_read(%arg1: tensor<5x6x128xf32> , %arg2: tensor<5x6x64xf32>) -> (tensor<5x6x128xf32>, tensor<5x6x64xf32>) { %cst_1 = arith.constant 1.0 : f32 %empty_1 = tensor.empty() : tensor<5x6x64xf32> diff --git a/mlir/test/Transforms/mem2reg.mlir b/mlir/test/Transforms/mem2reg.mlir index 89472ac0ca284..4b27f3305e89d 100644 --- a/mlir/test/Transforms/mem2reg.mlir +++ b/mlir/test/Transforms/mem2reg.mlir @@ -1,6 +1,6 @@ // RUN: mlir-opt %s --pass-pipeline='builtin.module(any(mem2reg))' --split-input-file | FileCheck %s -// Verifies that allocators with mutliple slots are handled properly. +// Verifies that allocators with multiple slots are handled properly. 
// CHECK-LABEL: func.func @multi_slot_alloca func.func @multi_slot_alloca() -> (i32, i32) { diff --git a/mlir/test/Transforms/sroa.mlir b/mlir/test/Transforms/sroa.mlir index c9e80a6cf8dd1..f8dda68e2c8f4 100644 --- a/mlir/test/Transforms/sroa.mlir +++ b/mlir/test/Transforms/sroa.mlir @@ -1,6 +1,6 @@ // RUN: mlir-opt %s --pass-pipeline='builtin.module(func.func(sroa))' --split-input-file | FileCheck %s -// Verifies that allocators with mutliple slots are handled properly. +// Verifies that allocators with multiple slots are handled properly. // CHECK-LABEL: func.func @multi_slot_alloca func.func @multi_slot_alloca() -> (i32, i32) { From e409204a89c7fb1d1c040c18fac2fa8db9dfe893 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Tue, 14 Jan 2025 12:04:14 +0000 Subject: [PATCH 404/408] VectorCombine: teach foldExtractedCmps about samesign (#122883) Follow up on 4a0d53a (PatternMatch: migrate to CmpPredicate) to get rid of one of the FIXMEs it introduced by replacing a predicate comparison with CmpPredicate::getMatching. --- .../Transforms/Vectorize/VectorCombine.cpp | 10 +++-- .../VectorCombine/X86/extract-cmp-binop.ll | 37 +++++++++++++++++++ 2 files changed, 43 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index ae2af6d346879..d17be8e1ac79e 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -1097,10 +1097,12 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) { Instruction *I0, *I1; Constant *C0, *C1; CmpPredicate P0, P1; - // FIXME: Use CmpPredicate::getMatching here. if (!match(B0, m_Cmp(P0, m_Instruction(I0), m_Constant(C0))) || - !match(B1, m_Cmp(P1, m_Instruction(I1), m_Constant(C1))) || - P0 != static_cast(P1)) + !match(B1, m_Cmp(P1, m_Instruction(I1), m_Constant(C1)))) + return false; + + auto MatchingPred = CmpPredicate::getMatching(P0, P1); + if (!MatchingPred) return false; // The compare operands must be extracts of the same vector with constant @@ -1121,7 +1123,7 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) { // The original scalar pattern is: // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1) - CmpInst::Predicate Pred = P0; + CmpInst::Predicate Pred = *MatchingPred; unsigned CmpOpcode = CmpInst::isFPPredicate(Pred) ? Instruction::FCmp : Instruction::ICmp; auto *VecTy = dyn_cast(X->getType()); diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll index 775f2d2da5721..3346ebf0997f1 100644 --- a/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll +++ b/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll @@ -66,6 +66,22 @@ define i1 @icmp_xor_v4i32(<4 x i32> %a) { ret i1 %r } +define i1 @icmp_samesign_xor_v4i32(<4 x i32> %a) { +; CHECK-LABEL: @icmp_samesign_xor_v4i32( +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[A:%.*]], +; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[SHIFT]], [[TMP1]] +; CHECK-NEXT: [[R:%.*]] = extractelement <4 x i1> [[TMP2]], i64 1 +; CHECK-NEXT: ret i1 [[R]] +; + %e1 = extractelement <4 x i32> %a, i32 3 + %e2 = extractelement <4 x i32> %a, i32 1 + %cmp1 = icmp samesign ugt i32 %e1, 42 + %cmp2 = icmp sgt i32 %e2, -8 + %r = xor i1 %cmp1, %cmp2 + ret i1 %r +} + ; add is not canonical (should be xor), but that is ok. 
define i1 @icmp_add_v8i32(<8 x i32> %a) { @@ -146,6 +162,27 @@ define i1 @icmp_xor_v4i32_multiuse(<4 x i32> %a) { ret i1 %r } +define i1 @icmp_samesign_xor_v4i32_multiuse(<4 x i32> %a) { +; CHECK-LABEL: @icmp_samesign_xor_v4i32_multiuse( +; CHECK-NEXT: [[E2:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 1 +; CHECK-NEXT: call void @use(i32 [[E2]]) +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[A]], +; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[SHIFT]], [[TMP1]] +; CHECK-NEXT: [[R:%.*]] = extractelement <4 x i1> [[TMP2]], i64 1 +; CHECK-NEXT: call void @use(i1 [[R]]) +; CHECK-NEXT: ret i1 [[R]] +; + %e1 = extractelement <4 x i32> %a, i32 3 + %e2 = extractelement <4 x i32> %a, i32 1 + call void @use(i32 %e2) + %cmp1 = icmp sgt i32 %e1, 42 + %cmp2 = icmp samesign ugt i32 %e2, -8 + %r = xor i1 %cmp1, %cmp2 + call void @use(i1 %r) + ret i1 %r +} + ; Negative test - this could CSE/simplify. define i1 @same_extract_index(<4 x i32> %a) { From aae259208a2dae815112638eab52023a8526c338 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Tue, 14 Jan 2025 12:06:22 +0000 Subject: [PATCH 405/408] LoopVersioning: improve a test, regen with UTC (#122876) Improve a test by replacing undef with poison, and regenerate it using UpdateTestChecks. --- .../LoopVersioning/incorrect-phi.ll | 121 +++++++++++++++--- 1 file changed, 100 insertions(+), 21 deletions(-) diff --git a/llvm/test/Transforms/LoopVersioning/incorrect-phi.ll b/llvm/test/Transforms/LoopVersioning/incorrect-phi.ll index e5ef8df9fc9cd..b6896c467812b 100644 --- a/llvm/test/Transforms/LoopVersioning/incorrect-phi.ll +++ b/llvm/test/Transforms/LoopVersioning/incorrect-phi.ll @@ -1,62 +1,141 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -passes=loop-versioning -S < %s | FileCheck %s ; Make sure all PHIs are properly updated in the exit block. Based on ; PR28037. 
-target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - @x = external global [2 x [3 x [5 x i16]]] -; CHECK-LABEL: @phi_with_undef -define void @phi_with_undef() { +define void @phi_with_poison() { +; CHECK-LABEL: define void @phi_with_poison() { +; CHECK-NEXT: [[BB6_LVER_CHECK:.*:]] +; CHECK-NEXT: [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 10, i64 0) +; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i64, i1 } [[MUL]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i64, i1 } [[MUL]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 0, [[MUL_RESULT]] +; CHECK-NEXT: br i1 poison, label %[[BB6_PH_LVER_ORIG:.*]], label %[[BB6_PH:.*]] +; CHECK: [[BB6_PH_LVER_ORIG]]: +; CHECK-NEXT: br label %[[BB6_LVER_ORIG:.*]] +; CHECK: [[BB6_LVER_ORIG]]: +; CHECK-NEXT: [[_TMP1423_LVER_ORIG:%.*]] = phi i64 [ poison, %[[BB6_PH_LVER_ORIG]] ], [ [[_TMP142_LVER_ORIG:%.*]], %[[BB6_LVER_ORIG]] ] +; CHECK-NEXT: [[_TMP123_LVER_ORIG:%.*]] = getelementptr [2 x [3 x [5 x i16]]], ptr @x, i16 0, i64 poison +; CHECK-NEXT: [[_TMP126_LVER_ORIG:%.*]] = getelementptr [3 x [5 x i16]], ptr [[_TMP123_LVER_ORIG]], i16 0, i64 [[_TMP1423_LVER_ORIG]] +; CHECK-NEXT: [[_TMP129_LVER_ORIG:%.*]] = getelementptr [5 x i16], ptr [[_TMP126_LVER_ORIG]], i16 0, i64 poison +; CHECK-NEXT: [[_TMP130_LVER_ORIG:%.*]] = load i16, ptr [[_TMP129_LVER_ORIG]], align 2 +; CHECK-NEXT: store i16 poison, ptr @x, align 2 +; CHECK-NEXT: [[_TMP142_LVER_ORIG]] = add i64 [[_TMP1423_LVER_ORIG]], 1 +; CHECK-NEXT: br i1 false, label %[[BB6_LVER_ORIG]], label %[[LOOP_EXIT_LOOPEXIT:.*]] +; CHECK: [[BB6_PH]]: +; CHECK-NEXT: br label %[[BB6:.*]] +; CHECK: [[BB6]]: +; CHECK-NEXT: [[_TMP1423:%.*]] = phi i64 [ poison, %[[BB6_PH]] ], [ [[_TMP142:%.*]], %[[BB6]] ] +; CHECK-NEXT: [[_TMP123:%.*]] = getelementptr [2 x [3 x [5 x i16]]], ptr @x, i16 0, i64 poison +; CHECK-NEXT: [[_TMP126:%.*]] = getelementptr [3 x [5 x i16]], ptr [[_TMP123]], i16 0, i64 [[_TMP1423]] +; CHECK-NEXT: [[_TMP129:%.*]] = getelementptr [5 x i16], ptr [[_TMP126]], i16 0, i64 poison +; CHECK-NEXT: [[_TMP130:%.*]] = load i16, ptr [[_TMP129]], align 2 +; CHECK-NEXT: store i16 poison, ptr @x, align 2 +; CHECK-NEXT: [[_TMP142]] = add i64 [[_TMP1423]], 1 +; CHECK-NEXT: br i1 false, label %[[BB6]], label %[[LOOP_EXIT_LOOPEXIT1:.*]] +; CHECK: [[LOOP_EXIT_LOOPEXIT]]: +; CHECK-NEXT: [[_TMP142_LCSSA_PH:%.*]] = phi i64 [ [[_TMP142_LVER_ORIG]], %[[BB6_LVER_ORIG]] ] +; CHECK-NEXT: [[SPLIT_PH:%.*]] = phi i16 [ poison, %[[BB6_LVER_ORIG]] ] +; CHECK-NEXT: br label %[[LOOP_EXIT:.*]] +; CHECK: [[LOOP_EXIT_LOOPEXIT1]]: +; CHECK-NEXT: [[_TMP142_LCSSA_PH2:%.*]] = phi i64 [ [[_TMP142]], %[[BB6]] ] +; CHECK-NEXT: [[SPLIT_PH3:%.*]] = phi i16 [ poison, %[[BB6]] ] +; CHECK-NEXT: br label %[[LOOP_EXIT]] +; CHECK: [[LOOP_EXIT]]: +; CHECK-NEXT: [[_TMP142_LCSSA:%.*]] = phi i64 [ [[_TMP142_LCSSA_PH]], %[[LOOP_EXIT_LOOPEXIT]] ], [ [[_TMP142_LCSSA_PH2]], %[[LOOP_EXIT_LOOPEXIT1]] ] +; CHECK-NEXT: [[SPLIT:%.*]] = phi i16 [ [[SPLIT_PH]], %[[LOOP_EXIT_LOOPEXIT]] ], [ [[SPLIT_PH3]], %[[LOOP_EXIT_LOOPEXIT1]] ] +; CHECK-NEXT: br label %[[BB9:.*]] +; CHECK: [[BB9]]: +; CHECK-NEXT: ret void +; bb6.lr.ph: ; preds = %bb5.preheader br label %bb6 bb6: ; preds = %bb6.lr.ph, %bb6 - %_tmp1423 = phi i64 [ undef, %bb6.lr.ph ], [ %_tmp142, %bb6 ] - %_tmp123 = getelementptr [2 x [3 x [5 x i16]]], ptr @x, i16 0, i64 undef + %_tmp1423 = phi i64 [ poison, %bb6.lr.ph ], [ %_tmp142, %bb6 ] + %_tmp123 = getelementptr [2 x [3 x [5 x i16]]], ptr @x, i16 0, i64 poison %_tmp126 = getelementptr [3 
x [5 x i16]], ptr %_tmp123, i16 0, i64 %_tmp1423 - %_tmp129 = getelementptr [5 x i16], ptr %_tmp126, i16 0, i64 undef + %_tmp129 = getelementptr [5 x i16], ptr %_tmp126, i16 0, i64 poison %_tmp130 = load i16, ptr %_tmp129 - store i16 undef, ptr getelementptr ([2 x [3 x [5 x i16]]], ptr @x, i64 0, i64 undef, i64 undef, i64 undef) + store i16 poison, ptr getelementptr ([2 x [3 x [5 x i16]]], ptr @x, i64 0, i64 poison, i64 poison, i64 poison) %_tmp142 = add i64 %_tmp1423, 1 br i1 false, label %bb6, label %loop.exit loop.exit: ; preds = %bb6 %_tmp142.lcssa = phi i64 [ %_tmp142, %bb6 ] - %split = phi i16 [ undef, %bb6 ] -; CHECK: %split.ph = phi i16 [ undef, %bb6.lver.orig ] -; CHECK: %split.ph3 = phi i16 [ undef, %bb6 ] -; CHECK: %split = phi i16 [ %split.ph, %loop.exit.loopexit ], [ %split.ph3, %loop.exit.loopexit1 ] + %split = phi i16 [ poison, %bb6 ] br label %bb9 bb9: ; preds = %bb9.loopexit, %bb1 ret void } -; CHECK-LABEL: @phi_with_non_loop_defined_value define void @phi_with_non_loop_defined_value() { +; CHECK-LABEL: define void @phi_with_non_loop_defined_value() { +; CHECK-NEXT: [[BB6_LVER_CHECK:.*:]] +; CHECK-NEXT: [[T:%.*]] = add i16 1, 1 +; CHECK-NEXT: [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 10, i64 0) +; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i64, i1 } [[MUL]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i64, i1 } [[MUL]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 0, [[MUL_RESULT]] +; CHECK-NEXT: br i1 poison, label %[[BB6_PH_LVER_ORIG:.*]], label %[[BB6_PH:.*]] +; CHECK: [[BB6_PH_LVER_ORIG]]: +; CHECK-NEXT: br label %[[BB6_LVER_ORIG:.*]] +; CHECK: [[BB6_LVER_ORIG]]: +; CHECK-NEXT: [[_TMP1423_LVER_ORIG:%.*]] = phi i64 [ poison, %[[BB6_PH_LVER_ORIG]] ], [ [[_TMP142_LVER_ORIG:%.*]], %[[BB6_LVER_ORIG]] ] +; CHECK-NEXT: [[_TMP123_LVER_ORIG:%.*]] = getelementptr [2 x [3 x [5 x i16]]], ptr @x, i16 0, i64 poison +; CHECK-NEXT: [[_TMP126_LVER_ORIG:%.*]] = getelementptr [3 x [5 x i16]], ptr [[_TMP123_LVER_ORIG]], i16 0, i64 [[_TMP1423_LVER_ORIG]] +; CHECK-NEXT: [[_TMP129_LVER_ORIG:%.*]] = getelementptr [5 x i16], ptr [[_TMP126_LVER_ORIG]], i16 0, i64 poison +; CHECK-NEXT: [[_TMP130_LVER_ORIG:%.*]] = load i16, ptr [[_TMP129_LVER_ORIG]], align 2 +; CHECK-NEXT: store i16 poison, ptr @x, align 2 +; CHECK-NEXT: [[_TMP142_LVER_ORIG]] = add i64 [[_TMP1423_LVER_ORIG]], 1 +; CHECK-NEXT: br i1 false, label %[[BB6_LVER_ORIG]], label %[[LOOP_EXIT_LOOPEXIT:.*]] +; CHECK: [[BB6_PH]]: +; CHECK-NEXT: br label %[[BB6:.*]] +; CHECK: [[BB6]]: +; CHECK-NEXT: [[_TMP1423:%.*]] = phi i64 [ poison, %[[BB6_PH]] ], [ [[_TMP142:%.*]], %[[BB6]] ] +; CHECK-NEXT: [[_TMP123:%.*]] = getelementptr [2 x [3 x [5 x i16]]], ptr @x, i16 0, i64 poison +; CHECK-NEXT: [[_TMP126:%.*]] = getelementptr [3 x [5 x i16]], ptr [[_TMP123]], i16 0, i64 [[_TMP1423]] +; CHECK-NEXT: [[_TMP129:%.*]] = getelementptr [5 x i16], ptr [[_TMP126]], i16 0, i64 poison +; CHECK-NEXT: [[_TMP130:%.*]] = load i16, ptr [[_TMP129]], align 2 +; CHECK-NEXT: store i16 poison, ptr @x, align 2 +; CHECK-NEXT: [[_TMP142]] = add i64 [[_TMP1423]], 1 +; CHECK-NEXT: br i1 false, label %[[BB6]], label %[[LOOP_EXIT_LOOPEXIT1:.*]] +; CHECK: [[LOOP_EXIT_LOOPEXIT]]: +; CHECK-NEXT: [[_TMP142_LCSSA_PH:%.*]] = phi i64 [ [[_TMP142_LVER_ORIG]], %[[BB6_LVER_ORIG]] ] +; CHECK-NEXT: [[SPLIT_PH:%.*]] = phi i16 [ [[T]], %[[BB6_LVER_ORIG]] ] +; CHECK-NEXT: br label %[[LOOP_EXIT:.*]] +; CHECK: [[LOOP_EXIT_LOOPEXIT1]]: +; CHECK-NEXT: [[_TMP142_LCSSA_PH2:%.*]] = phi i64 [ [[_TMP142]], %[[BB6]] ] +; CHECK-NEXT: [[SPLIT_PH3:%.*]] = phi i16 [ 
[[T]], %[[BB6]] ] +; CHECK-NEXT: br label %[[LOOP_EXIT]] +; CHECK: [[LOOP_EXIT]]: +; CHECK-NEXT: [[_TMP142_LCSSA:%.*]] = phi i64 [ [[_TMP142_LCSSA_PH]], %[[LOOP_EXIT_LOOPEXIT]] ], [ [[_TMP142_LCSSA_PH2]], %[[LOOP_EXIT_LOOPEXIT1]] ] +; CHECK-NEXT: [[SPLIT:%.*]] = phi i16 [ [[SPLIT_PH]], %[[LOOP_EXIT_LOOPEXIT]] ], [ [[SPLIT_PH3]], %[[LOOP_EXIT_LOOPEXIT1]] ] +; CHECK-NEXT: br label %[[BB9:.*]] +; CHECK: [[BB9]]: +; CHECK-NEXT: ret void +; bb6.lr.ph: ; preds = %bb5.preheader %t = add i16 1, 1 br label %bb6 bb6: ; preds = %bb6.lr.ph, %bb6 - %_tmp1423 = phi i64 [ undef, %bb6.lr.ph ], [ %_tmp142, %bb6 ] - %_tmp123 = getelementptr [2 x [3 x [5 x i16]]], ptr @x, i16 0, i64 undef + %_tmp1423 = phi i64 [ poison, %bb6.lr.ph ], [ %_tmp142, %bb6 ] + %_tmp123 = getelementptr [2 x [3 x [5 x i16]]], ptr @x, i16 0, i64 poison %_tmp126 = getelementptr [3 x [5 x i16]], ptr %_tmp123, i16 0, i64 %_tmp1423 - %_tmp129 = getelementptr [5 x i16], ptr %_tmp126, i16 0, i64 undef + %_tmp129 = getelementptr [5 x i16], ptr %_tmp126, i16 0, i64 poison %_tmp130 = load i16, ptr %_tmp129 - store i16 undef, ptr getelementptr ([2 x [3 x [5 x i16]]], ptr @x, i64 0, i64 undef, i64 undef, i64 undef) + store i16 poison, ptr getelementptr ([2 x [3 x [5 x i16]]], ptr @x, i64 0, i64 poison, i64 poison, i64 poison) %_tmp142 = add i64 %_tmp1423, 1 br i1 false, label %bb6, label %loop.exit loop.exit: ; preds = %bb6 %_tmp142.lcssa = phi i64 [ %_tmp142, %bb6 ] %split = phi i16 [ %t, %bb6 ] -; CHECK: %split.ph = phi i16 [ %t, %bb6.lver.orig ] -; CHECK: %split.ph3 = phi i16 [ %t, %bb6 ] -; CHECK: %split = phi i16 [ %split.ph, %loop.exit.loopexit ], [ %split.ph3, %loop.exit.loopexit1 ] br label %bb9 bb9: ; preds = %bb9.loopexit, %bb1 From 6a9e9878a200a6e494beed8944d6d08347945727 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 14 Jan 2025 12:16:28 +0000 Subject: [PATCH 406/408] [VectorCombine] foldPermuteOfBinops - ensure potential identity mask isn't length changing. --- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index d17be8e1ac79e..88d7cf2013a6b 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -1639,8 +1639,12 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) { } unsigned NumOpElts = Op0Ty->getNumElements(); - bool IsIdentity0 = ShuffleVectorInst::isIdentityMask(NewMask0, NumOpElts); - bool IsIdentity1 = ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts); + bool IsIdentity0 = + all_of(NewMask0, [NumOpElts](int M) { return M < (int)NumOpElts; }) && + ShuffleVectorInst::isIdentityMask(NewMask0, NumOpElts); + bool IsIdentity1 = + all_of(NewMask1, [NumOpElts](int M) { return M < (int)NumOpElts; }) && + ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts); // Try to merge shuffles across the binop if the new shuffles are not costly. InstructionCost OldCost = From fabc443e9394e460d328984d75570d9f017fe709 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Tue, 14 Jan 2025 12:34:37 +0000 Subject: [PATCH 407/408] [OMPIRBuilder] Support runtime number of teams and threads, and SPMD mode (#116051) This patch introduces a `TargetKernelRuntimeAttrs` structure to hold host-evaluated `num_teams`, `thread_limit`, `num_threads` and trip count values passed to the runtime kernel offloading call. 
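
As a rough sketch of the intended use (illustrative only, not code from this
patch; makeRuntimeAttrs is a hypothetical helper and the Value* parameters
stand for host-evaluated clause operands, while the structure and its fields
are the ones added below):

  #include "llvm/Frontend/OpenMP/OMPIRBuilder.h"

  // A frontend fills in the structure with IR values it has already
  // emitted on the host, then passes it to createTarget() alongside the
  // existing TargetKernelDefaultAttrs.
  static llvm::OpenMPIRBuilder::TargetKernelRuntimeAttrs
  makeRuntimeAttrs(llvm::Value *NumTeams, llvm::Value *ThreadLimit,
                   llvm::Value *TripCount) {
    llvm::OpenMPIRBuilder::TargetKernelRuntimeAttrs Attrs;
    Attrs.MaxTeams = {NumTeams};            // 'num_teams', may be null
    Attrs.TeamsThreadLimit = {ThreadLimit}; // 'thread_limit', may be null
    Attrs.LoopTripCount = TripCount;        // (Generic-)SPMD kernels only
    return Attrs;
  }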
Additionally, kernel type information is used to influence target device
code generation and the `IsSPMD` flag is replaced by `ExecFlags`, which
provides more granularity.
---
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp      |   5 +-
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |  38 ++-
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     | 129 +++++---
 .../Frontend/OpenMPIRBuilderTest.cpp          | 281 ++++++++++++++++--
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      |  12 +-
 5 files changed, 398 insertions(+), 67 deletions(-)

diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 81993dafae2b0..87c3635ed3f70 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -20,6 +20,7 @@
 #include "clang/AST/StmtVisitor.h"
 #include "clang/Basic/Cuda.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
 
 using namespace clang;
@@ -745,7 +746,9 @@ void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective &D,
                                         CodeGenFunction &CGF,
                                         EntryFunctionState &EST, bool IsSPMD) {
   llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs Attrs;
-  Attrs.IsSPMD = IsSPMD;
+  Attrs.ExecFlags =
+      IsSPMD ? llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD
+             : llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC;
   computeMinAndMaxThreadsAndTeams(D, CGF, Attrs);
 
   CGBuilderTy &Bld = CGF.Builder;
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 8ca3bc08b5ad4..7eceec3d8cf8f 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1389,9 +1389,6 @@ class OpenMPIRBuilder {
 
   /// Supporting functions for Reductions CodeGen.
 private:
-  /// Emit the llvm.used metadata.
-  void emitUsed(StringRef Name, std::vector<WeakTrackingVH> &List);
-
   /// Get the id of the current thread on the GPU.
   Value *getGPUThreadID();
 
@@ -2013,6 +2010,13 @@ class OpenMPIRBuilder {
   /// Value.
   GlobalValue *createGlobalFlag(unsigned Value, StringRef Name);
 
+  /// Emit the llvm.used metadata.
+  void emitUsed(StringRef Name, ArrayRef<WeakTrackingVH> List);
+
+  /// Emit the kernel execution mode.
+  GlobalVariable *emitKernelExecutionMode(StringRef KernelName,
+                                          omp::OMPTgtExecModeFlags Mode);
+
   /// Generate control flow and cleanup for cancellation.
   ///
   /// \param CancelFlag Flag indicating if the cancellation is performed.
@@ -2233,13 +2237,34 @@ class OpenMPIRBuilder {
   /// time. The number of max values will be 1 except for the case where
   /// ompx_bare is set.
   struct TargetKernelDefaultAttrs {
-    bool IsSPMD = false;
+    omp::OMPTgtExecModeFlags ExecFlags =
+        omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC;
     SmallVector<int32_t, 3> MaxTeams = {-1};
     int32_t MinTeams = 1;
     SmallVector<int32_t, 3> MaxThreads = {-1};
     int32_t MinThreads = 1;
   };
 
+  /// Container to pass LLVM IR runtime values or constants related to the
+  /// number of teams and threads with which the kernel must be launched, as
+  /// well as the trip count of the loop, if it is an SPMD or Generic-SPMD
+  /// kernel. These must be defined in the host prior to the call to the kernel
+  /// launch OpenMP RTL function.
+  struct TargetKernelRuntimeAttrs {
+    SmallVector<Value *, 3> MaxTeams = {nullptr};
+    Value *MinTeams = nullptr;
+    SmallVector<Value *, 3> TargetThreadLimit = {nullptr};
+    SmallVector<Value *, 3> TeamsThreadLimit = {nullptr};
+
+    /// 'parallel' construct 'num_threads' clause value, if present and it is an
+    /// SPMD kernel.
+ Value *MaxThreads = nullptr; + + /// Total number of iterations of the SPMD or Generic-SPMD kernel or null if + /// it is a generic kernel. + Value *LoopTripCount = nullptr; + }; + /// Data structure that contains the needed information to construct the /// kernel args vector. struct TargetKernelArgs { @@ -2971,7 +2996,9 @@ class OpenMPIRBuilder { /// \param CodeGenIP The insertion point where the call to the outlined /// function should be emitted. /// \param EntryInfo The entry information about the function. - /// \param DefaultAttrs Structure containing the default numbers of threads + /// \param DefaultAttrs Structure containing the default attributes, including + /// numbers of threads and teams to launch the kernel with. + /// \param RuntimeAttrs Structure containing the runtime numbers of threads /// and teams to launch the kernel with. /// \param Inputs The input values to the region that will be passed. /// as arguments to the outlined function. @@ -2987,6 +3014,7 @@ class OpenMPIRBuilder { OpenMPIRBuilder::InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, const TargetKernelDefaultAttrs &DefaultAttrs, + const TargetKernelRuntimeAttrs &RuntimeAttrs, SmallVectorImpl &Inputs, GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index df9b35ddd80ca..3242b38502300 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -830,6 +830,38 @@ GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) { return GV; } +void OpenMPIRBuilder::emitUsed(StringRef Name, ArrayRef List) { + if (List.empty()) + return; + + // Convert List to what ConstantArray needs. + SmallVector UsedArray; + UsedArray.resize(List.size()); + for (unsigned I = 0, E = List.size(); I != E; ++I) + UsedArray[I] = ConstantExpr::getPointerBitCastOrAddrSpaceCast( + cast(&*List[I]), Builder.getPtrTy()); + + if (UsedArray.empty()) + return; + ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size()); + + auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage, + ConstantArray::get(ATy, UsedArray), Name); + + GV->setSection("llvm.metadata"); +} + +GlobalVariable * +OpenMPIRBuilder::emitKernelExecutionMode(StringRef KernelName, + OMPTgtExecModeFlags Mode) { + auto *Int8Ty = Builder.getInt8Ty(); + auto *GVMode = new GlobalVariable( + M, Int8Ty, /*isConstant=*/true, GlobalValue::WeakAnyLinkage, + ConstantInt::get(Int8Ty, Mode), Twine(KernelName, "_exec_mode")); + GVMode->setVisibility(GlobalVariable::ProtectedVisibility); + return GVMode; +} + Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, IdentFlag LocFlags, @@ -2260,28 +2292,6 @@ static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I) { return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT); } -void OpenMPIRBuilder::emitUsed(StringRef Name, - std::vector &List) { - if (List.empty()) - return; - - // Convert List to what ConstantArray needs. 
+
 Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
                                             uint32_t SrcLocStrSize,
                                             IdentFlag LocFlags,
@@ -2260,28 +2292,6 @@ static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I) {
   return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT);
 }
 
-void OpenMPIRBuilder::emitUsed(StringRef Name,
-                               std::vector<llvm::WeakTrackingVH> &List) {
-  if (List.empty())
-    return;
-
-  // Convert List to what ConstantArray needs.
-  SmallVector<Constant *, 8> UsedArray;
-  UsedArray.resize(List.size());
-  for (unsigned I = 0, E = List.size(); I != E; ++I)
-    UsedArray[I] = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
-        cast<Constant>(&*List[I]), Builder.getPtrTy());
-
-  if (UsedArray.empty())
-    return;
-  ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());
-
-  auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
-                                ConstantArray::get(ATy, UsedArray), Name);
-
-  GV->setSection("llvm.metadata");
-}
-
 Value *OpenMPIRBuilder::getGPUThreadID() {
   return Builder.CreateCall(
       getOrCreateRuntimeFunction(M,
@@ -6131,10 +6141,9 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetInit(
   uint32_t SrcLocStrSize;
   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
   Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
-  Constant *IsSPMDVal = ConstantInt::getSigned(
-      Int8, Attrs.IsSPMD ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC);
-  Constant *UseGenericStateMachineVal =
-      ConstantInt::getSigned(Int8, !Attrs.IsSPMD);
+  Constant *IsSPMDVal = ConstantInt::getSigned(Int8, Attrs.ExecFlags);
+  Constant *UseGenericStateMachineVal = ConstantInt::getSigned(
+      Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD);
   Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
   Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
 
@@ -6765,6 +6774,12 @@ static Expected<Function *> createOutlinedFunction(
   auto Func =
       Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);
 
+  if (OMPBuilder.Config.isTargetDevice()) {
+    Value *ExecMode =
+        OMPBuilder.emitKernelExecutionMode(FuncName, DefaultAttrs.ExecFlags);
+    OMPBuilder.emitUsed("llvm.compiler.used", {ExecMode});
+  }
+
   // Save insert point.
   IRBuilder<>::InsertPointGuard IPG(Builder);
   // If there's a DISubprogram associated with current function, then
@@ -7312,6 +7327,7 @@ static void
 emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
                OpenMPIRBuilder::InsertPointTy AllocaIP,
                const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
+               const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs,
                Function *OutlinedFn, Constant *OutlinedFnID,
                SmallVectorImpl<Value *> &Args,
                OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB,
@@ -7393,11 +7409,43 @@ emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
       /*ForEndCall=*/false);
 
   SmallVector<Value *, 3> NumTeamsC;
+  for (auto [DefaultVal, RuntimeVal] :
+       zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams))
+    NumTeamsC.push_back(RuntimeVal ? RuntimeVal : Builder.getInt32(DefaultVal));
+
+  // Calculate number of threads: 0 if no clauses specified, otherwise it is
+  // the minimum between optional THREAD_LIMIT and NUM_THREADS clauses.
+  auto InitMaxThreadsClause = [&Builder](Value *Clause) {
+    if (Clause)
+      Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(),
+                                     /*isSigned=*/false);
+    return Clause;
+  };
+  auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) {
+    if (Clause)
+      Result = Result
+                   ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause),
+                                          Result, Clause)
+                   : Clause;
+  };
+
+  // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so
+  // the NUM_THREADS clause is overridden by THREAD_LIMIT.
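+  // For illustration, with hypothetical clauses `thread_limit(20)` on the
+  // target construct, `thread_limit(30)` on teams and `num_threads(40)` on
+  // parallel, the selects built below fold each dimension to
+  // min(20, 30, 40) = 20.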
   SmallVector<Value *, 3> NumThreadsC;
-  for (auto V : DefaultAttrs.MaxTeams)
-    NumTeamsC.push_back(llvm::ConstantInt::get(Builder.getInt32Ty(), V));
-  for (auto V : DefaultAttrs.MaxThreads)
-    NumThreadsC.push_back(llvm::ConstantInt::get(Builder.getInt32Ty(), V));
+  Value *MaxThreadsClause = RuntimeAttrs.TeamsThreadLimit.size() == 1
+                                ? InitMaxThreadsClause(RuntimeAttrs.MaxThreads)
+                                : nullptr;
+
+  for (auto [TeamsVal, TargetVal] : zip_equal(RuntimeAttrs.TeamsThreadLimit,
+                                              RuntimeAttrs.TargetThreadLimit)) {
+    Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal);
+    Value *NumThreads = InitMaxThreadsClause(TargetVal);
+
+    CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads);
+    CombineMaxThreadsClauses(MaxThreadsClause, NumThreads);
+
+    NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0));
+  }
 
   unsigned NumTargetItems = Info.NumberOfPtrs;
   // TODO: Use correct device ID
@@ -7406,14 +7454,19 @@ emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
   Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
   Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
                                              llvm::omp::IdentFlag(0), 0);
-  // TODO: Use correct NumIterations
-  Value *NumIterations = Builder.getInt64(0);
+
+  Value *TripCount = RuntimeAttrs.LoopTripCount
+                         ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount,
+                                                 Builder.getInt64Ty(),
+                                                 /*isSigned=*/false)
+                         : Builder.getInt64(0);
+
   // TODO: Use correct DynCGGroupMem
   Value *DynCGGroupMem = Builder.getInt32(0);
 
-  KArgs = OpenMPIRBuilder::TargetKernelArgs(
-      NumTargetItems, RTArgs, NumIterations, NumTeamsC, NumThreadsC,
-      DynCGGroupMem, HasNoWait);
+  KArgs = OpenMPIRBuilder::TargetKernelArgs(NumTargetItems, RTArgs, TripCount,
+                                            NumTeamsC, NumThreadsC,
+                                            DynCGGroupMem, HasNoWait);
 
   // The presence of certain clauses on the target directive require the
   // explicit generation of the target task.
@@ -7438,6 +7491,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget(
     const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
     InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo,
     const TargetKernelDefaultAttrs &DefaultAttrs,
+    const TargetKernelRuntimeAttrs &RuntimeAttrs,
    SmallVectorImpl<Value *> &Args, GenMapInfoCallbackTy GenMapInfoCB,
    OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
    OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
@@ -7462,8 +7516,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget(
   // to make a remote call (offload) to the previously outlined function
   // that represents the target region. Do that now.
   if (!Config.isTargetDevice())
-    emitTargetCall(*this, Builder, AllocaIP, DefaultAttrs, OutlinedFn,
-                   OutlinedFnID, Args, GenMapInfoCB, Dependencies, HasNowait);
+    emitTargetCall(*this, Builder, AllocaIP, DefaultAttrs, RuntimeAttrs,
+                   OutlinedFn, OutlinedFnID, Args, GenMapInfoCB, Dependencies,
+                   HasNowait);
   return Builder.saveIP();
 }
 
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index 04ecd7ef327d5..11f13beb9865c 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -6170,7 +6170,7 @@ TEST_F(OpenMPIRBuilderTest, TargetRegion) {
   OMPBuilder.setConfig(Config);
   F->setName("func");
   IRBuilder<> Builder(BB);
-  auto Int32Ty = Builder.getInt32Ty();
+  auto *Int32Ty = Builder.getInt32Ty();
 
   AllocaInst *APtr = Builder.CreateAlloca(Int32Ty, nullptr, "a_ptr");
   AllocaInst *BPtr = Builder.CreateAlloca(Int32Ty, nullptr, "b_ptr");
@@ -6229,16 +6229,22 @@ TEST_F(OpenMPIRBuilderTest, TargetRegion) {
 
   TargetRegionEntryInfo EntryInfo("func", 42, 4711, 17);
   OpenMPIRBuilder::LocationDescription OmpLoc({Builder.saveIP(), DL});
+  OpenMPIRBuilder::TargetKernelRuntimeAttrs RuntimeAttrs;
   OpenMPIRBuilder::TargetKernelDefaultAttrs DefaultAttrs = {
-      /*IsSPMD=*/false, /*MaxTeams=*/{-1}, /*MinTeams=*/0, /*MaxThreads=*/{0},
-      /*MinThreads=*/0};
+      /*ExecFlags=*/omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC,
+      /*MaxTeams=*/{10}, /*MinTeams=*/0, /*MaxThreads=*/{0}, /*MinThreads=*/0};
+  RuntimeAttrs.TargetThreadLimit[0] = Builder.getInt32(20);
+  RuntimeAttrs.TeamsThreadLimit[0] = Builder.getInt32(30);
+  RuntimeAttrs.MaxThreads = Builder.getInt32(40);
 
   ASSERT_EXPECTED_INIT(
       OpenMPIRBuilder::InsertPointTy, AfterIP,
       OMPBuilder.createTarget(OmpLoc, /*IsOffloadEntry=*/true, Builder.saveIP(),
-                              Builder.saveIP(), EntryInfo, DefaultAttrs, Inputs,
-                              GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB));
+                              Builder.saveIP(), EntryInfo, DefaultAttrs,
+                              RuntimeAttrs, Inputs, GenMapInfoCB, BodyGenCB,
+                              SimpleArgAccessorCB));
   Builder.restoreIP(AfterIP);
+
   OMPBuilder.finalize();
   Builder.CreateRetVoid();
 
@@ -6256,6 +6262,43 @@ TEST_F(OpenMPIRBuilderTest, TargetRegion) {
   StringRef FunctionName = KernelLaunchFunc->getName();
   EXPECT_TRUE(FunctionName.starts_with("__tgt_target_kernel"));
 
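+  // For reference, the launch entry point has the shape
+  //   __tgt_target_kernel(loc, device_id, num_teams, thread_limit,
+  //                       host_ptr, kernel_args)
+  // which is why argument operands 2 and 3 are inspected below.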
+  // Check num_teams and num_threads in call arguments
+  EXPECT_TRUE(Call->arg_size() >= 4);
+  Value *NumTeamsArg = Call->getArgOperand(2);
+  EXPECT_TRUE(isa<ConstantInt>(NumTeamsArg));
+  EXPECT_EQ(10U, cast<ConstantInt>(NumTeamsArg)->getZExtValue());
+  Value *NumThreadsArg = Call->getArgOperand(3);
+  EXPECT_TRUE(isa<ConstantInt>(NumThreadsArg));
+  EXPECT_EQ(20U, cast<ConstantInt>(NumThreadsArg)->getZExtValue());
+
+  // Check num_teams and num_threads kernel arguments (use number 5 starting
+  // from the end and counting the call to __tgt_target_kernel as the first
+  // use)
+  Value *KernelArgs = Call->getArgOperand(Call->arg_size() - 1);
+  EXPECT_TRUE(KernelArgs->getNumUses() >= 4);
+  Value *NumTeamsGetElemPtr = *std::next(KernelArgs->user_begin(), 3);
+  EXPECT_TRUE(isa<GetElementPtrInst>(NumTeamsGetElemPtr));
+  Value *NumTeamsStore = NumTeamsGetElemPtr->getUniqueUndroppableUser();
+  EXPECT_TRUE(isa<StoreInst>(NumTeamsStore));
+  Value *NumTeamsStoreArg = cast<StoreInst>(NumTeamsStore)->getValueOperand();
+  EXPECT_TRUE(isa<ConstantDataArray>(NumTeamsStoreArg));
+  auto *NumTeamsStoreValue = cast<ConstantDataArray>(NumTeamsStoreArg);
+  EXPECT_EQ(3U, NumTeamsStoreValue->getNumElements());
+  EXPECT_EQ(10U, NumTeamsStoreValue->getElementAsInteger(0));
+  EXPECT_EQ(0U, NumTeamsStoreValue->getElementAsInteger(1));
+  EXPECT_EQ(0U, NumTeamsStoreValue->getElementAsInteger(2));
+  Value *NumThreadsGetElemPtr = *std::next(KernelArgs->user_begin(), 2);
+  EXPECT_TRUE(isa<GetElementPtrInst>(NumThreadsGetElemPtr));
+  Value *NumThreadsStore = NumThreadsGetElemPtr->getUniqueUndroppableUser();
+  EXPECT_TRUE(isa<StoreInst>(NumThreadsStore));
+  Value *NumThreadsStoreArg =
+      cast<StoreInst>(NumThreadsStore)->getValueOperand();
+  EXPECT_TRUE(isa<ConstantDataArray>(NumThreadsStoreArg));
+  auto *NumThreadsStoreValue = cast<ConstantDataArray>(NumThreadsStoreArg);
+  EXPECT_EQ(3U, NumThreadsStoreValue->getNumElements());
+  EXPECT_EQ(20U, NumThreadsStoreValue->getElementAsInteger(0));
+  EXPECT_EQ(0U, NumThreadsStoreValue->getElementAsInteger(1));
+  EXPECT_EQ(0U, NumThreadsStoreValue->getElementAsInteger(2));
+
   // Check the fallback call
   BasicBlock *FallbackBlock = Branch->getSuccessor(0);
   Iter = FallbackBlock->rbegin();
@@ -6343,15 +6386,16 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) {
                                    F->getEntryBlock().getFirstInsertionPt());
   TargetRegionEntryInfo EntryInfo("parent", /*DeviceID=*/1, /*FileID=*/2,
                                   /*Line=*/3, /*Count=*/0);
+  OpenMPIRBuilder::TargetKernelRuntimeAttrs RuntimeAttrs;
   OpenMPIRBuilder::TargetKernelDefaultAttrs DefaultAttrs = {
-      /*IsSPMD=*/false, /*MaxTeams=*/{-1}, /*MinTeams=*/0, /*MaxThreads=*/{0},
-      /*MinThreads=*/0};
+      /*ExecFlags=*/omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC,
+      /*MaxTeams=*/{-1}, /*MinTeams=*/0, /*MaxThreads=*/{0}, /*MinThreads=*/0};
 
-  ASSERT_EXPECTED_INIT(
-      OpenMPIRBuilder::InsertPointTy, AfterIP,
-      OMPBuilder.createTarget(Loc, /*IsOffloadEntry=*/true, EntryIP, EntryIP,
-                              EntryInfo, DefaultAttrs, CapturedArgs,
-                              GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB));
+  ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, AfterIP,
+                       OMPBuilder.createTarget(
+                           Loc, /*IsOffloadEntry=*/true, EntryIP, EntryIP,
+                           EntryInfo, DefaultAttrs, RuntimeAttrs, CapturedArgs,
+                           GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB));
   Builder.restoreIP(AfterIP);
 
   Builder.CreateRetVoid();
@@ -6435,6 +6479,204 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) {
   auto *ExitBlock = EntryBlockBranch->getSuccessor(1);
   EXPECT_EQ(ExitBlock->getName(), "worker.exit");
   EXPECT_TRUE(isa<ReturnInst>(ExitBlock->getFirstNonPHI()));
+
+  // Check global exec_mode.
+  GlobalVariable *Used = M->getGlobalVariable("llvm.compiler.used");
+  EXPECT_NE(Used, nullptr);
+  Constant *UsedInit = Used->getInitializer();
+  EXPECT_NE(UsedInit, nullptr);
+  EXPECT_TRUE(isa<ConstantArray>(UsedInit));
+  auto *UsedInitData = cast<ConstantArray>(UsedInit);
+  EXPECT_EQ(1U, UsedInitData->getNumOperands());
+  Constant *ExecMode = UsedInitData->getOperand(0);
+  EXPECT_TRUE(isa<GlobalVariable>(ExecMode));
+  Constant *ExecModeValue = cast<GlobalVariable>(ExecMode)->getInitializer();
+  EXPECT_NE(ExecModeValue, nullptr);
+  EXPECT_TRUE(isa<ConstantInt>(ExecModeValue));
+  EXPECT_EQ(OMP_TGT_EXEC_MODE_GENERIC,
+            cast<ConstantInt>(ExecModeValue)->getZExtValue());
+}
+
+TEST_F(OpenMPIRBuilderTest, TargetRegionSPMD) {
+  using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
+  OpenMPIRBuilder OMPBuilder(*M);
+  OMPBuilder.initialize();
+  OpenMPIRBuilderConfig Config(/*IsTargetDevice=*/false, /*IsGPU=*/false,
+                               /*OpenMPOffloadMandatory=*/false,
+                               /*HasRequiresReverseOffload=*/false,
+                               /*HasRequiresUnifiedAddress=*/false,
+                               /*HasRequiresUnifiedSharedMemory=*/false,
+                               /*HasRequiresDynamicAllocators=*/false);
+  OMPBuilder.setConfig(Config);
+  F->setName("func");
+  IRBuilder<> Builder(BB);
+
+  auto BodyGenCB = [&](InsertPointTy,
+                       InsertPointTy CodeGenIP) -> InsertPointTy {
+    Builder.restoreIP(CodeGenIP);
+    return Builder.saveIP();
+  };
+
+  auto SimpleArgAccessorCB = [&](Argument &, Value *, Value *&,
+                                 OpenMPIRBuilder::InsertPointTy,
+                                 OpenMPIRBuilder::InsertPointTy CodeGenIP) {
+    Builder.restoreIP(CodeGenIP);
+    return Builder.saveIP();
+  };
+
+  SmallVector<Value *> Inputs;
+  OpenMPIRBuilder::MapInfosTy CombinedInfos;
+  auto GenMapInfoCB =
+      [&](OpenMPIRBuilder::InsertPointTy) -> OpenMPIRBuilder::MapInfosTy & {
+    return CombinedInfos;
+  };
+
+  TargetRegionEntryInfo EntryInfo("func", 42, 4711, 17);
+  OpenMPIRBuilder::LocationDescription OmpLoc({Builder.saveIP(), DL});
+  OpenMPIRBuilder::TargetKernelRuntimeAttrs RuntimeAttrs;
+  OpenMPIRBuilder::TargetKernelDefaultAttrs DefaultAttrs = {
+      /*ExecFlags=*/omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD,
+      /*MaxTeams=*/{-1}, /*MinTeams=*/0, /*MaxThreads=*/{0}, /*MinThreads=*/0};
+  RuntimeAttrs.LoopTripCount = Builder.getInt64(1000);
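+  // A non-null LoopTripCount marks this as an SPMD kernel with a known trip
+  // count; emitTargetCall zero-extends it to i64 and forwards it to the
+  // runtime in place of the previous hard-coded NumIterations of 0 (checked
+  // below).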
+
+  ASSERT_EXPECTED_INIT(
+      OpenMPIRBuilder::InsertPointTy, AfterIP,
+      OMPBuilder.createTarget(OmpLoc, /*IsOffloadEntry=*/true, Builder.saveIP(),
+                              Builder.saveIP(), EntryInfo, DefaultAttrs,
+                              RuntimeAttrs, Inputs, GenMapInfoCB, BodyGenCB,
+                              SimpleArgAccessorCB));
+  Builder.restoreIP(AfterIP);
+
+  OMPBuilder.finalize();
+  Builder.CreateRetVoid();
+
+  // Check the kernel launch sequence
+  auto Iter = F->getEntryBlock().rbegin();
+  EXPECT_TRUE(isa<BranchInst>(&*(Iter)));
+  BranchInst *Branch = dyn_cast<BranchInst>(&*(Iter));
+  EXPECT_TRUE(isa<CmpInst>(&*(++Iter)));
+  EXPECT_TRUE(isa<CallInst>(&*(++Iter)));
+  CallInst *Call = dyn_cast<CallInst>(&*(Iter));
+
+  // Check that the kernel launch function is called
+  Function *KernelLaunchFunc = Call->getCalledFunction();
+  EXPECT_NE(KernelLaunchFunc, nullptr);
+  StringRef FunctionName = KernelLaunchFunc->getName();
+  EXPECT_TRUE(FunctionName.starts_with("__tgt_target_kernel"));
+
+  // Check the trip count kernel argument (use number 5 starting from the end
+  // and counting the call to __tgt_target_kernel as the first use)
+  Value *KernelArgs = Call->getArgOperand(Call->arg_size() - 1);
+  EXPECT_TRUE(KernelArgs->getNumUses() >= 6);
+  Value *TripCountGetElemPtr = *std::next(KernelArgs->user_begin(), 5);
+  EXPECT_TRUE(isa<GetElementPtrInst>(TripCountGetElemPtr));
+  Value *TripCountStore = TripCountGetElemPtr->getUniqueUndroppableUser();
+  EXPECT_TRUE(isa<StoreInst>(TripCountStore));
+  Value *TripCountStoreArg =
+      cast<StoreInst>(TripCountStore)->getValueOperand();
+  EXPECT_TRUE(isa<ConstantInt>(TripCountStoreArg));
+  EXPECT_EQ(1000U, cast<ConstantInt>(TripCountStoreArg)->getZExtValue());
+
+  // Check the fallback call
+  BasicBlock *FallbackBlock = Branch->getSuccessor(0);
+  Iter = FallbackBlock->rbegin();
+  CallInst *FCall = dyn_cast<CallInst>(&*(++Iter));
+  // 'F' has a dummy DISubprogram which causes OutlinedFunc to also
+  // have a DISubprogram. In this case, the call to OutlinedFunc needs
+  // to have a debug loc, otherwise the verifier will complain.
+  FCall->setDebugLoc(DL);
+  EXPECT_NE(FCall, nullptr);
+
+  // Check that the outlined function exists with the expected prefix
+  Function *OutlinedFunc = FCall->getCalledFunction();
+  EXPECT_NE(OutlinedFunc, nullptr);
+  StringRef FunctionName2 = OutlinedFunc->getName();
+  EXPECT_TRUE(FunctionName2.starts_with("__omp_offloading"));
+
+  EXPECT_FALSE(verifyModule(*M, &errs()));
+}
+
+TEST_F(OpenMPIRBuilderTest, TargetRegionDeviceSPMD) {
+  OpenMPIRBuilder OMPBuilder(*M);
+  OMPBuilder.setConfig(
+      OpenMPIRBuilderConfig(/*IsTargetDevice=*/true, /*IsGPU=*/false,
+                            /*OpenMPOffloadMandatory=*/false,
+                            /*HasRequiresReverseOffload=*/false,
+                            /*HasRequiresUnifiedAddress=*/false,
+                            /*HasRequiresUnifiedSharedMemory=*/false,
+                            /*HasRequiresDynamicAllocators=*/false));
+  OMPBuilder.initialize();
+  F->setName("func");
+  IRBuilder<> Builder(BB);
+  OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL});
+
+  Function *OutlinedFn = nullptr;
+  SmallVector<Value *> CapturedArgs;
+
+  auto SimpleArgAccessorCB = [&](Argument &, Value *, Value *&,
+                                 OpenMPIRBuilder::InsertPointTy,
+                                 OpenMPIRBuilder::InsertPointTy CodeGenIP) {
+    Builder.restoreIP(CodeGenIP);
+    return Builder.saveIP();
+  };
+
+  OpenMPIRBuilder::MapInfosTy CombinedInfos;
+  auto GenMapInfoCB =
+      [&](OpenMPIRBuilder::InsertPointTy) -> OpenMPIRBuilder::MapInfosTy & {
+    return CombinedInfos;
+  };
+
+  auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy,
+                       OpenMPIRBuilder::InsertPointTy CodeGenIP)
+      -> OpenMPIRBuilder::InsertPointTy {
+    Builder.restoreIP(CodeGenIP);
+    OutlinedFn = CodeGenIP.getBlock()->getParent();
+    return Builder.saveIP();
+  };
+
+  IRBuilder<>::InsertPoint EntryIP(&F->getEntryBlock(),
+                                   F->getEntryBlock().getFirstInsertionPt());
+  TargetRegionEntryInfo EntryInfo("parent", /*DeviceID=*/1, /*FileID=*/2,
+                                  /*Line=*/3, /*Count=*/0);
+  OpenMPIRBuilder::TargetKernelRuntimeAttrs RuntimeAttrs;
+  OpenMPIRBuilder::TargetKernelDefaultAttrs DefaultAttrs = {
+      /*ExecFlags=*/omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD,
+      /*MaxTeams=*/{-1}, /*MinTeams=*/0, /*MaxThreads=*/{0}, /*MinThreads=*/0};
+
+  ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, AfterIP,
+                       OMPBuilder.createTarget(
+                           Loc, /*IsOffloadEntry=*/true, EntryIP, EntryIP,
+                           EntryInfo, DefaultAttrs, RuntimeAttrs, CapturedArgs,
+                           GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB));
+  Builder.restoreIP(AfterIP);
+
+  Builder.CreateRetVoid();
+  OMPBuilder.finalize();
+
+  // Check outlined function
+  EXPECT_FALSE(verifyModule(*M, &errs()));
+  EXPECT_NE(OutlinedFn, nullptr);
+  EXPECT_NE(F, OutlinedFn);
+
+  EXPECT_TRUE(OutlinedFn->hasWeakODRLinkage());
+  // Account for the "implicit" first argument.
+  EXPECT_EQ(OutlinedFn->getName(), "__omp_offloading_1_2_parent_l3");
+  EXPECT_EQ(OutlinedFn->arg_size(), 1U);
+
+  // Check global exec_mode.
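+  // Same llvm.compiler.used layout as in TargetRegionDevice above, except
+  // the stored mode byte is expected to be OMP_TGT_EXEC_MODE_SPMD (i8 2).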
+  GlobalVariable *Used = M->getGlobalVariable("llvm.compiler.used");
+  EXPECT_NE(Used, nullptr);
+  Constant *UsedInit = Used->getInitializer();
+  EXPECT_NE(UsedInit, nullptr);
+  EXPECT_TRUE(isa<ConstantArray>(UsedInit));
+  auto *UsedInitData = cast<ConstantArray>(UsedInit);
+  EXPECT_EQ(1U, UsedInitData->getNumOperands());
+  Constant *ExecMode = UsedInitData->getOperand(0);
+  EXPECT_TRUE(isa<GlobalVariable>(ExecMode));
+  Constant *ExecModeValue = cast<GlobalVariable>(ExecMode)->getInitializer();
+  EXPECT_NE(ExecModeValue, nullptr);
+  EXPECT_TRUE(isa<ConstantInt>(ExecModeValue));
+  EXPECT_EQ(OMP_TGT_EXEC_MODE_SPMD,
+            cast<ConstantInt>(ExecModeValue)->getZExtValue());
 }
 
 TEST_F(OpenMPIRBuilderTest, ConstantAllocaRaise) {
@@ -6502,15 +6744,16 @@ TEST_F(OpenMPIRBuilderTest, ConstantAllocaRaise) {
                                    F->getEntryBlock().getFirstInsertionPt());
   TargetRegionEntryInfo EntryInfo("parent", /*DeviceID=*/1, /*FileID=*/2,
                                   /*Line=*/3, /*Count=*/0);
+  OpenMPIRBuilder::TargetKernelRuntimeAttrs RuntimeAttrs;
   OpenMPIRBuilder::TargetKernelDefaultAttrs DefaultAttrs = {
-      /*IsSPMD=*/false, /*MaxTeams=*/{-1}, /*MinTeams=*/0, /*MaxThreads=*/{0},
-      /*MinThreads=*/0};
+      /*ExecFlags=*/omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC,
+      /*MaxTeams=*/{-1}, /*MinTeams=*/0, /*MaxThreads=*/{0}, /*MinThreads=*/0};
 
-  ASSERT_EXPECTED_INIT(
-      OpenMPIRBuilder::InsertPointTy, AfterIP,
-      OMPBuilder.createTarget(Loc, /*IsOffloadEntry=*/true, EntryIP, EntryIP,
-                              EntryInfo, DefaultAttrs, CapturedArgs,
-                              GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB));
+  ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, AfterIP,
+                       OMPBuilder.createTarget(
+                           Loc, /*IsOffloadEntry=*/true, EntryIP, EntryIP,
+                           EntryInfo, DefaultAttrs, RuntimeAttrs, CapturedArgs,
+                           GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB));
   Builder.restoreIP(AfterIP);
 
   Builder.CreateRetVoid();
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 25b0ffe2ced6d..5c36187540690 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -4115,10 +4115,12 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
                                    allocaIP, codeGenIP);
   };
 
-  // TODO: Populate default attributes based on the construct and clauses.
+  // TODO: Populate default and runtime attributes based on the construct and
+  // clauses.
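+  // Until that TODO is addressed, both structs keep their generic defaults.
+  // A follow-up could fill them from the MLIR op, e.g. (hypothetical):
+  //   runtimeAttrs.TargetThreadLimit[0] =
+  //       moduleTranslation.lookupValue(targetOp.getThreadLimit());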
+  llvm::OpenMPIRBuilder::TargetKernelRuntimeAttrs runtimeAttrs;
   llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs defaultAttrs = {
-      /*IsSPMD=*/false, /*MaxTeams=*/{-1}, /*MinTeams=*/0, /*MaxThreads=*/{0},
-      /*MinThreads=*/0};
+      /*ExecFlags=*/llvm::omp::OMP_TGT_EXEC_MODE_GENERIC, /*MaxTeams=*/{-1},
+      /*MinTeams=*/0, /*MaxThreads=*/{0}, /*MinThreads=*/0};
 
   llvm::SmallVector<llvm::Value *> kernelInput;
   for (size_t i = 0; i < mapVars.size(); ++i) {
@@ -4143,8 +4145,8 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
   llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP =
       moduleTranslation.getOpenMPBuilder()->createTarget(
          ompLoc, isOffloadEntry, allocaIP, builder.saveIP(), entryInfo,
-          defaultAttrs, kernelInput, genMapInfoCB, bodyCB, argAccessorCB, dds,
-          targetOp.getNowait());
+          defaultAttrs, runtimeAttrs, kernelInput, genMapInfoCB, bodyCB,
+          argAccessorCB, dds, targetOp.getNowait());
 
   if (failed(handleError(afterIP, opInst)))
     return failure();

From d0b641b7e2a9b4120c11fc60b111a657b0420176 Mon Sep 17 00:00:00 2001
From: Sergio Afonso
Date: Tue, 14 Jan 2025 12:35:50 +0000
Subject: [PATCH 408/408] [OMPIRBuilder] Propagate attributes to outlined
 target regions (#117875)

This patch copies the target-cpu and target-features attributes of each
function containing a target region into the corresponding outlined function
holding that target region. This mirrors what is currently done for all other
outlined functions through the `CodeExtractor` in
`OpenMPIRBuilder::finalize()`.
---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     | 12 +++++++++
 .../Frontend/OpenMPIRBuilderTest.cpp          | 25 +++++++++++++++++++
 2 files changed, 37 insertions(+)

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 3242b38502300..c22eb4c7fe11e 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -6774,6 +6774,18 @@ static Expected<Function *> createOutlinedFunction(
   auto Func =
       Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);
 
+  // Forward target-cpu and target-features function attributes from the
+  // original function to the new outlined function.
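+  // For instance, a parent function annotated with "target-cpu"="gfx90a" and
+  // "target-features"="+gfx9-insts,+wavefrontsize64" has both attributes
+  // copied verbatim when present (exercised by the updated unit tests below).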
+  Function *ParentFn = Builder.GetInsertBlock()->getParent();
+
+  auto TargetCpuAttr = ParentFn->getFnAttribute("target-cpu");
+  if (TargetCpuAttr.isStringAttribute())
+    Func->addFnAttr(TargetCpuAttr);
+
+  auto TargetFeaturesAttr = ParentFn->getFnAttribute("target-features");
+  if (TargetFeaturesAttr.isStringAttribute())
+    Func->addFnAttr(TargetFeaturesAttr);
+
   if (OMPBuilder.Config.isTargetDevice()) {
     Value *ExecMode =
         OMPBuilder.emitKernelExecutionMode(FuncName, DefaultAttrs.ExecFlags);
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index 11f13beb9865c..3b571cce09a4f 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -6169,6 +6169,8 @@ TEST_F(OpenMPIRBuilderTest, TargetRegion) {
   OpenMPIRBuilderConfig Config(false, false, false, false, false, false,
                                false);
   OMPBuilder.setConfig(Config);
   F->setName("func");
+  F->addFnAttr("target-cpu", "x86-64");
+  F->addFnAttr("target-features", "+mmx,+sse");
   IRBuilder<> Builder(BB);
 
   auto *Int32Ty = Builder.getInt32Ty();
@@ -6320,6 +6322,13 @@ TEST_F(OpenMPIRBuilderTest, TargetRegion) {
   StringRef FunctionName2 = OutlinedFunc->getName();
   EXPECT_TRUE(FunctionName2.starts_with("__omp_offloading"));
 
+  // Check that target-cpu and target-features were propagated to the outlined
+  // function
+  EXPECT_EQ(OutlinedFunc->getFnAttribute("target-cpu"),
+            F->getFnAttribute("target-cpu"));
+  EXPECT_EQ(OutlinedFunc->getFnAttribute("target-features"),
+            F->getFnAttribute("target-features"));
+
   EXPECT_FALSE(verifyModule(*M, &errs()));
 }
 
@@ -6330,6 +6339,8 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) {
 
   OMPBuilder.initialize();
   F->setName("func");
+  F->addFnAttr("target-cpu", "gfx90a");
+  F->addFnAttr("target-features", "+gfx9-insts,+wavefrontsize64");
   IRBuilder<> Builder(BB);
   OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL});
 
@@ -6407,6 +6418,13 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) {
   Function *OutlinedFn = TargetStore->getFunction();
   EXPECT_NE(F, OutlinedFn);
 
+  // Check that target-cpu and target-features were propagated to the outlined
+  // function
+  EXPECT_EQ(OutlinedFn->getFnAttribute("target-cpu"),
+            F->getFnAttribute("target-cpu"));
+  EXPECT_EQ(OutlinedFn->getFnAttribute("target-features"),
+            F->getFnAttribute("target-features"));
+
   EXPECT_TRUE(OutlinedFn->hasWeakODRLinkage());
   // Account for the "implicit" first argument.
   EXPECT_EQ(OutlinedFn->getName(), "__omp_offloading_1_2_parent_l3");
@@ -6657,6 +6675,13 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDeviceSPMD) {
   EXPECT_NE(OutlinedFn, nullptr);
   EXPECT_NE(F, OutlinedFn);
 
+  // Check that target-cpu and target-features were propagated to the outlined
+  // function
+  EXPECT_EQ(OutlinedFn->getFnAttribute("target-cpu"),
+            F->getFnAttribute("target-cpu"));
+  EXPECT_EQ(OutlinedFn->getFnAttribute("target-features"),
+            F->getFnAttribute("target-features"));
+
   EXPECT_TRUE(OutlinedFn->hasWeakODRLinkage());
   // Account for the "implicit" first argument.
   EXPECT_EQ(OutlinedFn->getName(), "__omp_offloading_1_2_parent_l3");